def get_article(url, subject, tears=15, verbose=False):
    """Crawl a ddanzi board list page and return article metadata.

    Parameters
    ----------
    url : str
        Board list URL to scrape.
    subject : str
        Topic selector; 'touching' triggers per-article relevance
        filtering via ``touch_article``.
    tears : int, optional
        Threshold passed to ``touch_article``.
    verbose : bool, optional
        When True, emit progress via ``utils.print_log``.

    Returns
    -------
    pandas.DataFrame
        Columns: title, date_time, article_id, member_id, article_link,
        content, reply_num, view_num.  Empty frame when nothing matched.
    """
    base_url = 'http://www.ddanzi.com/index.php?m=1&document_srl='

    # Fetch the list page.
    s = utils.sess('http://www.ddanzi.com/')
    resp = s.get(url)
    soup = BeautifulSoup(resp.text, 'lxml')
    table = soup.select('table > tbody')[1]
    articles_total = table.select('tr')

    # Keep only regular posts: rows carrying a class attribute are notices
    # (or otherwise decorated) and are dropped; rows WITHOUT a class raise
    # KeyError on the lookup and those are the real articles.
    articles = []
    for article in articles_total:
        try:
            if article['class'][0] == 'notice':
                continue
        except (KeyError, IndexError):
            articles.append(article)
    utils.print_log(verbose, "articles cnt", len(articles))

    # No posts on the page: close the session (fix: the original leaked it
    # on this path) and return an empty frame.
    if len(articles) == 0:
        s.close()
        return pd.DataFrame()

    a_list = []
    for a in articles:
        l = []
        title = mod_char(a.select_one('td.title > a').text, 'for_title')
        article_id = re.search(r'(\d{9})', a.find('a')['href']).group()
        article_link = base_url + article_id
        try:
            # find() returns None when there are no replies -> AttributeError.
            reply_num = mod_reply(a.find('span', {'class': 'talk'}).text)
        except AttributeError:
            reply_num = '0'

        # Open the article itself to read its timestamp and author.
        cont = BeautifulSoup(s.get(article_link).text, 'html.parser')

        # For the 'touching' subject, skip articles that do not qualify.
        if subject == 'touching':
            if not touch_article(cont, tears):
                continue

        temp = cont.find('div', {'class': 'right'})
        user_id = mod_char(temp.find('a').text)
        view_num = cont.find('span', {'class': 'read'}).text
        date = cont.find('p', {'class': 'time'}).text
        if len(date) < 9:
            # A time-only stamp means the post is from today.
            date = str(datetime.now()).split(' ')[0] + ' ' + date
        else:
            date = date.replace('.', '-')
        content = ''

        l.append(title)
        l.append(date)
        l.append(article_id)
        l.append(user_id)
        l.append(article_link)
        l.append(content)
        l.append(reply_num)
        l.append(view_num)
        a_list.append(l)
        utils.print_log(verbose, "article line 1", l)
        # Throttle requests so we do not hammer the site.
        time.sleep(random.randint(2, 7) / 3)

    if len(a_list) == 0:
        # The 'touching' filter rejected every candidate.
        s.close()
        return pd.DataFrame()
    s.close()

    result = pd.DataFrame(a_list)
    result.columns = [
        'title', 'date_time', 'article_id', 'member_id', 'article_link',
        'content', 'reply_num', 'view_num'
    ]
    # Exclude posts by the reposter account '펌쟁이'.
    result = result[result['member_id'] != '펌쟁이']
    result.loc[:, 'date_time'] = pd.to_datetime(result['date_time'])
    # NOTE(review): the original called result.set_index('article_id')
    # without using the return value — a no-op; removed to keep the
    # documented column layout.
    return result
def get_article(url, subject, tears=15, verbose=False):
    """Search the starboard mirror for todayhumor posts and return metadata.

    Parameters
    ----------
    url : str
        Search text posted to the starboard search endpoint.
    subject : str
        Topic selector; 'touching' triggers per-article relevance
        filtering via ``touch_article``.
    tears : int, optional
        Threshold passed to ``touch_article``.
    verbose : bool, optional
        When True, emit progress via ``utils.print_log``.

    Returns
    -------
    pandas.DataFrame
        Columns: title, date_time, article_id, member_id, article_link,
        content, reply_num, view_num.  Empty frame when nothing matched.
    """
    base_url = 'http://starboard.kr/todayhumor'  # kept for reference; links come from the search results
    search_url = 'http://starboard.kr/conn/board/search'

    # Query the search endpoint with the requested text.
    s = utils.sess('http://starboard.kr/')
    payload = {'search_text': url}
    resp = s.post(search_url, data=payload)
    soup = BeautifulSoup(resp.text, 'lxml')
    articles = soup.find_all('div', attrs={'class': 'ItemContent Discussion'})
    utils.print_log(verbose, "articles cnt", len(articles))

    # Nothing found: return an empty frame.
    if len(articles) == 0:
        s.close()
        return pd.DataFrame()

    a_list = []
    for a in articles:
        l = []
        title = mod_title(
            a.find('div', attrs={
                'class': 'Title'
            }).find('a').text)
        utils.print_log(verbose, "1 article title", title)
        user_id = ''  # the search listing does not expose the author
        try:
            article_link = a.find('div', attrs={
                'class': 'Title'
            }).find('a')['href']
            utils.print_log(verbose, "2 article link", article_link)
        except (AttributeError, KeyError, TypeError):
            # Malformed result item: bail out with an empty frame, closing
            # the session first (fix: the original leaked it on this path).
            s.close()
            return pd.DataFrame()
        article_id = re.search(r's_no=(\d+)', article_link).group(1)
        utils.print_log(verbose, "3 article id", article_id)
        date = a.find('time')['datetime']
        utils.print_log(verbose, "4 article date", date)
        content = ''
        try:
            # The reply count is embedded in the title as "[N]".
            reply_num = re.search(r'\[([0-9]+)\]', title).group(1)
        except AttributeError:
            reply_num = '0'
        utils.print_log(verbose, "5 article reply cnt", reply_num)
        view_num = a.select_one(
            'div.Meta.Meta-Discussion > span.MItem.MCount.ViewCount').text
        utils.print_log(verbose, "6 article view cnt", view_num)

        # For the 'touching' subject, skip articles that do not qualify.
        # NOTE(review): this site's touch_article variant takes
        # (session, link, tears), unlike the soup-based variants elsewhere.
        if subject == 'touching':
            if not touch_article(s, article_link, tears):
                continue

        l.append(title)
        l.append(date)
        l.append(article_id)
        l.append(user_id)
        l.append(article_link)
        l.append(content)
        l.append(reply_num)
        l.append(view_num)
        a_list.append(l)
        utils.print_log(verbose, "article line 1", l)
        # Long randomized delay: this endpoint is sensitive to scraping.
        time.sleep(random.randint(17, 41))

    if len(a_list) == 0:
        # The 'touching' filter rejected every candidate.
        s.close()
        return pd.DataFrame()
    s.close()

    result = pd.DataFrame(a_list)
    result.columns = [
        'title', 'date_time', 'article_id', 'member_id', 'article_link',
        'content', 'reply_num', 'view_num'
    ]
    result.loc[:, 'date_time'] = pd.to_datetime(result['date_time'])
    # NOTE(review): the original called result.set_index('article_id')
    # without using the return value — a no-op; removed to keep the
    # documented column layout.
    return result
def get_article(url, subject, tears=15, verbose=False):
    """Crawl a dvdprime community list page and return article metadata.

    Parameters
    ----------
    url : str
        Board list URL to scrape.
    subject : str
        Topic selector; 'touching' triggers per-article relevance
        filtering via ``touch_article``.
    tears : int, optional
        Threshold passed to ``touch_article``.
    verbose : bool, optional
        When True, emit progress via ``utils.print_log``.

    Returns
    -------
    pandas.DataFrame
        Columns: title, date_time, article_id, member_id, article_link,
        content, reply_num, view_num.  Empty frame when nothing matched.
    """
    base_url = 'https://dvdprime.com/g2/bbs/board.php?bo_table=comm&wr_id='

    # Fetch the list page.
    s = utils.sess('https://dvdprime.com/')
    resp = s.get(url)
    soup = BeautifulSoup(resp.text, 'lxml')

    # Each article row is an anchor with the list_subject_a class.
    articles = soup.find_all('a', {'class': 'list_subject_a'})
    utils.print_log(verbose, "articles cnt", len(articles))
    if len(articles) == 0:
        s.close()
        return pd.DataFrame()

    a_list = []
    for a in articles:
        l = []
        try:
            title = a.find('span', {'class': 'list_subject_span_pc'}).text
        except AttributeError:
            # No title span -> not an article row.
            continue
        if title.find('알림]') != -1:
            # Skip notice posts.
            continue
        article_id = re.search(r'(\d{8})', a['href']).group()
        try:
            reply_num = a.select_one(
                'span > span > span > span.list_comment_num_a > em').text
        except AttributeError:
            # select_one() returns None when there are no comments.
            reply_num = '0'
        article_link = base_url + article_id

        # Open the article page for author / views / timestamp.
        content = BeautifulSoup(s.get(article_link).text, 'html.parser')

        # For the 'touching' subject, skip articles that do not qualify.
        if subject == 'touching':
            if not touch_article(content, tears):
                continue

        user_id = mod_user_id(content.find('span', {'class': 'member'}).text)
        view_num = re.search(r'[0-9]+', content.find('div', {
            'id': 'view_hit'
        }).text).group()
        date = mod_date(content.find('div', {'id': 'view_datetime'}).text)
        content = ''  # the body text itself is not collected

        l.append(title)
        l.append(date)
        l.append(article_id)
        l.append(user_id)
        l.append(article_link)
        l.append(content)
        l.append(reply_num)
        l.append(view_num)
        a_list.append(l)
        utils.print_log(verbose, "article line 1", l)
        # Throttle requests so we do not hammer the site.
        time.sleep(random.randint(2, 7) / 3)

    if len(a_list) == 0:
        # The 'touching' filter rejected every candidate.
        s.close()
        return pd.DataFrame()
    s.close()

    result = pd.DataFrame(a_list)
    result.columns = [
        'title', 'date_time', 'article_id', 'member_id', 'article_link',
        'content', 'reply_num', 'view_num'
    ]
    result.loc[:, 'date_time'] = pd.to_datetime(result['date_time'])
    return result
def get_article(url, subject, tears=15, verbose=False):
    """Crawl a ruliweb (mobile) board list page and return article metadata.

    Parameters
    ----------
    url : str
        Board list / search URL to scrape.
    subject : str
        Topic selector; 'touching' triggers per-article relevance
        filtering via ``touch_article``.
    tears : int, optional
        Threshold passed to ``touch_article``.
    verbose : bool, optional
        When True, emit progress via ``utils.print_log``.

    Returns
    -------
    pandas.DataFrame
        Columns: title, date_time, article_id, member_id, article_link,
        content, reply_num, view_num.  Empty frame when nothing matched.
    """
    base_url = 'http://m.ruliweb.com/community/board/300148/read/'

    # Fetch the list page.
    s = utils.sess(base_url)
    resp = s.get(url)
    soup = BeautifulSoup(resp.text, 'lxml')

    # Rows with more than one class are notices / best posts — drop them.
    articles = soup.select('tr.table_body')
    articles = [x for x in articles if len(x.get('class')) < 2]
    utils.print_log(verbose, "articles cnt", len(articles))

    # Fix: the original indexed articles[0] without an empty guard
    # (IndexError) and never closed the session on any path.
    if not articles:
        s.close()
        return pd.DataFrame()

    # A dynamic "no results" row may be returned for empty searches.
    marker = articles[0].find('strong')
    if marker is not None and marker.text == '결과없음':
        s.close()
        return pd.DataFrame()

    a_list = []
    for a in articles:
        l = []
        title = a.find_all('a', {'class': 'subject_link'})[0].text.strip()
        user_id = mod_user_id(a.find_all('span', {'class': 'writer'})[0].text)
        article_link = a.find_all('a', {'class': 'subject_link'})[0].get('href')
        article_id = re.search(r'(\d{8})', article_link).group()
        # Rebuild the link so the search term does not appear in it.
        article_link = base_url + article_id
        reply_num = mod_reply(a)
        view_num = re.search(r'[0-9]+', a.find('span', {
            'class': 'hit'
        }).text).group()

        # Open the article page for its timestamp.
        resp = s.get(article_link)
        con = BeautifulSoup(resp.text, 'html.parser')

        # For the 'touching' subject, skip articles that do not qualify.
        if subject == 'touching':
            if not touch_article(con, tears):
                continue

        date = mod_date(con.find('span', {'class': 'regdate'}).text)
        content = ''

        l.append(title)
        l.append(date)
        l.append(article_id)
        l.append(user_id)
        l.append(article_link)
        l.append(content)
        l.append(reply_num)
        l.append(view_num)
        a_list.append(l)
        utils.print_log(verbose, "article line 1", l)
        # Throttle requests so we do not hammer the site.
        time.sleep(random.randint(2, 7) / 3)

    if len(a_list) == 0:
        # The 'touching' filter rejected every candidate.
        s.close()
        return pd.DataFrame()
    s.close()

    result = pd.DataFrame(a_list)
    result.columns = [
        'title', 'date_time', 'article_id', 'member_id', 'article_link',
        'content', 'reply_num', 'view_num'
    ]
    result.loc[:, 'date_time'] = pd.to_datetime(result['date_time'])
    # Drop known spam/flood posts on ruliweb.
    result = result[~result['title'].str.contains('주식아') &
                    ~result['title'].str.contains('주식이')]
    # NOTE(review): the original called result.set_index('article_id')
    # without using the return value — a no-op; removed to keep the
    # documented column layout.
    return result
def get_article(url, subject, tears=15, verbose=False):
    """Search the starboard mirror for slrclub posts and return metadata.

    The listing comes from starboard's search endpoint; each article body
    is then fetched directly from slrclub with a second session.

    Parameters
    ----------
    url : str
        Search text posted to the starboard search endpoint.
    subject : str
        Topic selector; 'touching' triggers per-article relevance
        filtering via ``touch_article``.
    tears : int, optional
        Threshold passed to ``touch_article``.
    verbose : bool, optional
        When True, emit progress via ``utils.print_log``.

    Returns
    -------
    pandas.DataFrame
        Columns: title, date_time, article_id, member_id, article_link,
        content, reply_num, view_num.  Empty frame when nothing matched.
    """
    base_url = 'http://starboard.kr/slr'
    search_url = 'http://starboard.kr/conn/board/search'

    # Warm-up GET against the board, then POST the search text.
    s = utils.sess('http://starboard.kr/')
    base = s.get(base_url)
    payload = {'search_text': url}
    resp = s.post(search_url, data=payload)
    soup = BeautifulSoup(resp.text, 'lxml')
    articles = soup.find_all('div', attrs={'class': 'ItemContent Discussion'})
    utils.print_log(verbose, "articles cnt", len(articles))
    if len(articles) == 0:
        s.close()
        return pd.DataFrame()

    # Done with starboard; switch to a slrclub session for article bodies.
    s.close()
    s = utils.sess('http://www.slrclub.com/')

    a_list = []
    for a in articles:
        l = []
        title = mod_title(
            a.find('div', attrs={
                'class': 'Title'
            }).find('a').text)
        utils.print_log(verbose, "1 article title", title)
        try:
            article_link = a.find('div', attrs={
                'class': 'Title'
            }).find('a')['href']
            utils.print_log(verbose, "2 article link", article_link)
        except (AttributeError, KeyError, TypeError):
            # Malformed search item: skip it.
            continue
        article_id = re.search(r'(\d{8})', article_link).group()
        utils.print_log(verbose, "3 article id", article_id)
        date = a.find('time')['datetime']
        utils.print_log(verbose, "4 article date", date)
        try:
            # The reply count is embedded in the title as "[N]".
            reply_num = re.search(r'\[([0-9]+)\]', title).group(1)
        except AttributeError:
            reply_num = '0'
        utils.print_log(verbose, "5 article reply cnt", reply_num)
        view_num = a.select_one(
            'div.Meta.Meta-Discussion > span.MItem.MCount.ViewCount').text
        utils.print_log(verbose, "6 article view cnt", view_num)

        # Fetch the article itself from slrclub.
        con = BeautifulSoup(s.get(article_link).text, 'html.parser')

        # Skip articles that have since been deleted.
        if con.find('p', {'class': 'err_msg'}) is not None:
            continue

        # For the 'touching' subject, skip articles that do not qualify.
        if subject == 'touching':
            if not touch_article(con, tears):
                continue

        member_id = con.find('span', attrs={'class': 'lop'})
        if member_id is None:
            member_id = ''
        else:
            member_id = con.find('span', attrs={'class': 'lop'}).text
        content = ''

        l.append(title)
        l.append(date)
        l.append(article_id)
        l.append(member_id)
        l.append(article_link)
        l.append(content)
        l.append(reply_num)
        l.append(view_num)
        a_list.append(l)
        utils.print_log(verbose, "article line 1", l)
        # Throttle requests so we do not hammer the site.
        time.sleep(random.randint(2, 7) / 3)

    if len(a_list) == 0:
        # The 'touching' filter rejected every candidate.
        base.close()  # NOTE(review): `base` is a Response, not a Session
        s.close()
        return pd.DataFrame()
    base.close()
    s.close()

    result = pd.DataFrame(a_list)
    result.columns = [
        'title', 'date_time', 'article_id', 'member_id', 'article_link',
        'content', 'reply_num', 'view_num'
    ]
    result.loc[:, 'date_time'] = pd.to_datetime(result['date_time'])
    # Rows where the author could not be determined are dropped.
    result = result[result['member_id'] != '']
    # NOTE(review): the original called result.set_index('article_id')
    # without using the return value — a no-op; removed to keep the
    # documented column layout.
    return result
def get_article(url, subject, tears=15, verbose=False):
    """Crawl a clien park-board list page and return article metadata.

    Parameters
    ----------
    url : str
        Board list URL to scrape.
    subject : str
        Topic selector; 'touching' triggers per-article relevance
        filtering via ``touch_article``.
    tears : int, optional
        Threshold passed to ``touch_article``.
    verbose : bool, optional
        When True, emit progress via ``utils.print_log``.

    Returns
    -------
    pandas.DataFrame
        Columns: title, date_time, article_id, member_id, article_link,
        content, reply_num, view_num.  Empty frame when nothing matched.
    """
    base_url = 'https://www.clien.net/service/board/park/'

    # Fetch the list page.
    s = utils.sess('https://www.clien.net')
    resp = s.get(url)
    soup = BeautifulSoup(resp.text, 'lxml')
    articles = soup.select('div.list_item')
    utils.print_log(verbose, "articles cnt", len(articles))
    if len(articles) == 0:
        # Fix: the original only `del resp` here and never closed the session.
        s.close()
        return pd.DataFrame()

    a_list = []
    for a in articles:
        l = []
        try:
            title = mod_text(a.select_one('span.list_subject > a').text)
        except AttributeError:
            # Row without a subject link (e.g. ad slot) — skip.
            continue
        date_time = a.select_one('span.timestamp').text
        article_link = a.select_one('span.list_subject > a')['href']
        article_id = re.search(r'[0-9]{8}', article_link).group(0)
        article_link = base_url + article_id
        try:
            # Open the article page for author / replies / views.
            con = BeautifulSoup(s.get(article_link).text, 'lxml')

            # For the 'touching' subject, skip articles that do not qualify.
            if subject == 'touching':
                if not touch_article(con, tears):
                    continue

            # The nickname may be rendered as an image; use its alt text then.
            if con.select_one('span.nickname > img') is None:
                member_id = mod_text(con.select_one('span.nickname').text)
            else:
                member_id = con.select_one('span.nickname > img')['alt']
            content = ''
            reply_num = con.select_one('div#comment-point strong').text
            view_num = con.select_one('span.view_count > strong').text
        except Exception as e:
            # Best-effort: log the failure and move on to the next article.
            print(e)
            continue

        l.append(title)
        l.append(date_time)
        l.append(article_id)
        l.append(member_id)
        l.append(article_link)
        l.append(content)
        l.append(reply_num)
        l.append(view_num)
        a_list.append(l)
        utils.print_log(verbose, "article line 1", l)
        # Throttle requests so we do not hammer the site.
        time.sleep(random.randint(2, 7) / 3)

    if len(a_list) == 0:
        # The 'touching' filter rejected every candidate.
        s.close()
        return pd.DataFrame()
    s.close()

    result = pd.DataFrame(a_list)
    result.columns = [
        'title', 'date_time', 'article_id', 'member_id', 'article_link',
        'content', 'reply_num', 'view_num'
    ]
    result.loc[:, 'date_time'] = pd.to_datetime(result['date_time'].tolist())
    # NOTE(review): the original called result.set_index('article_id')
    # without using the return value — a no-op; removed to keep the
    # documented column layout.
    return result
def get_article(url, subject, tears=15, verbose=False):
    """Crawl an etoland (mobile) board list page and return article metadata.

    Parameters
    ----------
    url : str
        Board list URL to scrape.
    subject : str
        Topic selector; 'touching' triggers per-article relevance
        filtering via ``touch_article``.
    tears : int, optional
        Threshold passed to ``touch_article``.
    verbose : bool, optional
        When True, emit progress via ``utils.print_log``.

    Returns
    -------
    pandas.DataFrame
        Columns: title, date_time, article_id, member_id, article_link,
        content, reply_num, view_num.  Empty frame when nothing matched.
    """
    base_url = 'http://www.etoland.co.kr/plugin/mobile/board.php?bo_table=etoboard01&wr_id='

    # Fetch the list page.
    s = utils.sess('http://www.etoland.co.kr/')
    resp = s.get(url)
    soup = BeautifulSoup(resp.text, 'lxml')
    articles = soup.find_all('li', {'class': 'subject'})
    utils.print_log(verbose, "articles cnt", len(articles))

    # No posts on the page: return an empty frame.
    if len(articles) == 0:
        s.close()
        return pd.DataFrame()

    a_list = []
    for a in articles:
        l = []
        title = mod_title(a.find('div').text)
        try:
            user_id = mod_user_id(a.find('span', {'class': 'name'}).text)
        except AttributeError:
            # No author span -> not a regular article row.
            continue
        article_id = re.search(r'wr_id=\d+',
                               a.find('a')['href']).group().replace(
                                   'wr_id=', '')
        article_link = base_url + article_id
        reply_num = mod_reply(a.find('div').text)
        view_num = re.search(
            r'[0-9]+',
            a.find_all('span', {'class': 'datetime'})[1].text).group()

        # Open the article page for its timestamp.
        con = s.get(article_link)
        temp = BeautifulSoup(con.text, 'html.parser')

        # For the 'touching' subject, skip articles that do not qualify.
        if subject == 'touching':
            if not touch_article(temp, tears):
                continue

        try:
            content = ''  # the body text itself is not collected
            date = mod_date(temp.find('span', {'class': 'write_date'}).text)
            # NOTE(review): if mod_date returned the text unchanged, it
            # presumably failed to normalize the stamp — skip the article.
            if temp.find('span', {'class': 'write_date'}).text == date:
                continue
        except Exception:
            # Missing/odd date markup: best-effort skip of this article.
            continue

        l.append(title)
        l.append(date)
        l.append(article_id)
        l.append(user_id)
        l.append(article_link)
        l.append(content)
        l.append(reply_num)
        l.append(view_num)
        a_list.append(l)
        utils.print_log(verbose, "article line 1", l)
        # Throttle requests so we do not hammer the site.
        time.sleep(random.randint(2, 7) / 3)

    if len(a_list) == 0:
        # The 'touching' filter rejected every candidate.
        s.close()
        return pd.DataFrame()
    s.close()

    result = pd.DataFrame(a_list)
    result.columns = [
        'title', 'date_time', 'article_id', 'member_id', 'article_link',
        'content', 'reply_num', 'view_num'
    ]
    result.loc[:, 'date_time'] = pd.to_datetime(result['date_time'])
    # NOTE(review): the original called result.set_index('article_id')
    # without using the return value — a no-op; removed to keep the
    # documented column layout.
    return result