def search(self, query, start_date, end_date=None):
    """Crawl search-result news articles for `query`, one day at a time.

    Parameters
    ----------
    query : str
        Search keyword.
    start_date : str
        Date string such as '2017-05-01'.
    end_date : str or None
        Last date to crawl (inclusive). Defaults to `start_date`
        so a single-day crawl needs only one argument.

    Returns
    -------
    bool
        True when the crawl finished.
    """
    if end_date is None:
        end_date = start_date
    start_date = convert_str_date_to_datetime(start_date)
    end_date = convert_str_date_to_datetime(end_date)

    # Iterate every date in the inclusive [start_date, end_date] range.
    for day in range((end_date - start_date).days + 1):
        scrap_date = start_date + timedelta(days=day)
        scrap_date = convert_datetime_to_str(scrap_date)
        year, month, date = scrap_date.split('-')

        # Article URLs matching the query on this single date.
        urls = get_article_urls(query, scrap_date, scrap_date,
                                verbose=self.verbose, debug=self.debug)

        docs = []
        indexs = []
        comments = []
        for i, url in enumerate(urls):
            # Progress report every 50 articles when verbose.
            if self.verbose and i % 50 == 0:
                print('\r - scrapping {} / {} news'.format(i + 1, len(urls)), end='')
            try:
                json_dict = scrap(url)
                content = json_dict.get('content', '')
                if not content:
                    continue
                # index line: path \t sid1 category \t written time \t title
                index = '{}\t{}\t{}\t{}'.format(
                    get_path(json_dict['oid'], year, month, date, json_dict['aid']),
                    json_dict.get('sid1', ''),
                    json_dict.get('writtenTime', ''),
                    json_dict.get('title', '')
                )
                # Normalize newlines to spaces; '\r\n' must be replaced
                # BEFORE '\n' or a stray '\r' would be left behind.
                docs.append(content.replace('\r\n', ' ').replace('\n', ' ').strip())
                indexs.append(index)
                if self.comments:
                    comments.append(get_comments(url))
                # Be polite to the server between requests.
                time.sleep(self.sleep)
            except Exception as e:
                # Best-effort crawl: log the failing URL and move on.
                print('Exception: {}\n{}'.format(url, str(e)))
                continue

        # Nothing scraped for this date -> nothing to save.
        if docs:
            self._save_news_as_corpus(scrap_date, docs, indexs)
            if self.comments:
                self._save_comments(scrap_date, indexs, comments)

        if self.verbose:
            # Report scraped count out of discovered URL count.
            print('\r .. search crawler saved {} articles in {} on {}\n\n'
                  .format(len(docs), len(urls), year + month + date))

    if self.verbose:
        print('Search Crawling For Query [{}] Time Between [{}] ~ [{}] Finished'
              .format(query, start_date, end_date))
    return True
def search(self, query, start_date, end_date=None):
    """Crawl search-result news articles for `query`, one day at a time.

    Fixes a bad merge in the original: the outer date loop (and a stale
    `get_article_urls` call with a different signature) was duplicated;
    only one loop is kept here.

    Parameters
    ----------
    query : str
        Search keyword.
    start_date : str
        Date string such as '2017-05-01'.
    end_date : str or None
        Last date to crawl (inclusive). Defaults to `start_date`.

    Returns
    -------
    bool
        True when the crawl finished.
    """
    if end_date is None:
        end_date = start_date
    start_date = convert_str_date_to_datetime(start_date)
    end_date = convert_str_date_to_datetime(end_date)

    # Crawl each date in the inclusive [start_date, end_date] range.
    for day in range((end_date - start_date).days + 1):
        scrap_date = start_date + timedelta(days=day)   # advance one day at a time
        scrap_date = convert_datetime_to_str(scrap_date)  # back to 'YYYY-MM-DD'
        year, month, date = scrap_date.split('-')         # split into year / month / day

        # Article URLs matching the query on this date.
        urls = get_article_urls(query, scrap_date, self.verbose, self.debug)

        docs = []
        indexs = []
        comments = []
        for i, url in enumerate(urls):
            # Progress report every 50 articles when verbose.
            if self.verbose and i % 50 == 0:
                print('\r - scrapping {} / {} news'.format(
                    i + 1, len(urls)), end='')
            try:
                json_dict = scrap(url)
                content = json_dict.get('content', '')
                if not content:
                    continue
                # index line:
                # { press-id/year/month/day/article-id } \t { sid1 category } \t { written time } \t { title }
                index = '{}\t{}\t{}\t{}'.format(
                    get_path(json_dict['oid'], year, month, date,
                             json_dict['aid']),
                    json_dict.get('sid1', ''),
                    json_dict.get('writtenTime', ''),
                    json_dict.get('title', ''))
                # Normalize newlines to spaces; '\r\n' must be replaced
                # BEFORE '\n' or a stray '\r' would be left behind.
                docs.append(
                    content.replace('\r\n', ' ').replace('\n', ' ').strip())
                indexs.append(index)
                # Optionally collect the article's comments as well.
                if self.comments:
                    comments.append(get_comments(url))
                # Be polite to the server between requests.
                time.sleep(self.sleep)
            except Exception as e:
                # Best-effort crawl: log the failing URL and move on.
                print('Exception: {}\n{}'.format(url, str(e)))
                continue

        if self.verbose:
            # Report scraped count out of discovered URL count.
            print('\r .. search crawler saved {} articles in {} on {}'
                  .format(len(docs), len(urls), year + month + date))

        # Nothing scraped for this date -> nothing to save.
        if not docs:
            continue
        if self.dbsave:
            # MongoDB path stores docs, indexes, and comments together.
            self._save_mongodb(scrap_date, docs, indexs, comments, query)
        else:
            self._save_news_as_corpus(scrap_date, docs, indexs)
            if self.comments:
                self._save_comments(scrap_date, indexs, comments)

    if self.verbose:
        print(
            'Search Crawling For Query [{}] Time Between [{}] ~ [{}] Finished'
            .format(query, start_date, end_date))
    return True