def extract(self, url):
    try:
        html, after_url = self.extract_html(url)
        a = Article(url, language='zh')
        a.download()
        a.parse()
        try:
            row_t = str(a.publish_date)[0:16]
            create_time = self.extract_time_str(html, row_t)
        except Exception:
            create_time = 0
        if not a.title:
            a.title = ''
        # url = self.url_pattern.search(html).group(1)
        # split() uses the default separator (any whitespace)
        d_r = {
            'title': a.title,
            'article': a.text.split(),
            'html': html,
            'create_time': int(create_time),
            'url': after_url
        }
        return d_r
    except Exception:
        print('Extraction error!!!')
        print(url)
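# A minimal, standalone sketch of the same download-and-parse flow using
# newspaper3k directly. This is an assumption-labelled example, not the original
# class: extract_html() / extract_time_str() are not reproduced, the URL is a
# placeholder, and the raw publish date stands in for the timestamp extraction.
from newspaper import Article


def extract_basic(url):
    """Download and parse a Chinese-language article, returning the same fields as extract()."""
    a = Article(url, language='zh')
    a.download()
    a.parse()
    return {
        'title': a.title or '',
        'article': a.text.split(),            # split() on default whitespace
        'html': a.html,
        'create_time': str(a.publish_date),   # raw publish date; None if not detected
        'url': url,
    }

# Example call (placeholder URL):
# print(extract_basic('https://news.example.cn/some-article')['title'])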
def getArticleInfo():
    post_data = literal_eval(request.data.decode('utf8'))
    country = post_data["country"]
    articleInfo = {}
    urls = getNewsUrls(country)
    count = 0
    goodCount = 0
    while count < len(urls):
        article = Article(urls[count])
        try:
            article.download()
            article.parse()
            if isinstance(article.publish_date, datetime):
                date = article.publish_date.strftime('%m/%d/%Y')
            else:
                date = article.publish_date
            authors = []
            for x in article.authors:
                if len(x.split(" ")) == 2:
                    authors.append(x)
            if not authors:
                # append avoids the IndexError of assigning to authors[0] on an empty list
                authors.append("No Author")
            if date is None:
                date = "No Date"
            if article.top_image is None:
                article.top_image = "No imageURL"
            if article.title is None:
                article.title = "No title"
            # skip consecutive duplicates of the previously stored url
            if count != 0 and goodCount != 0 and urls[count] == articleInfo[goodCount - 1]["url"]:
                print("Inside if statement")
                raise Exception("duplicate article url")
            articleInfo[goodCount] = {
                "authors": authors,
                "date": date,
                "url": urls[count],
                "imageURL": article.top_image,
                "title": article.title
            }
            count = count + 1
            goodCount = goodCount + 1
        except Exception as e:
            print(e)
            count = count + 1
            print("bad article")
    return articleInfo
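# A hedged client-side sketch for the getArticleInfo handler above. The route
# path '/getArticleInfo' and the local host/port are assumptions; the handler
# reads the raw request body with literal_eval, so a dict literal with a
# "country" key is enough, and the returned mapping is assumed to be serialized
# to JSON by the web framework.
import requests


def fetch_article_info(country, base_url='http://localhost:5000'):
    """POST a country name and return the {index: article-metadata} mapping."""
    resp = requests.post(base_url + '/getArticleInfo',          # assumed route
                         data='{"country": "%s"}' % country)    # literal_eval-compatible body
    resp.raise_for_status()
    return resp.json()

# Example (assumes the service is running locally):
# for _, art in fetch_article_info('Kenya').items():
#     print(art['date'], art['title'], art['url'])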
def home(url):
    data = {}
    data['url'] = url

    # Validate url
    if urlparse.urlparse(url).scheme not in ('http', 'https'):
        data['error'] = 'Invalid URL'
        return json.dumps(data)

    a = Article(url)
    a.download()
    a.parse()

    data['title'] = a.title
    data['authors'] = a.authors
    data['text'] = a.text

    try:
        a.nlp()
    except UnicodeDecodeError:
        # Strip non-ascii characters and retry
        a.title = to_ascii(a.title)
        a.text = to_ascii(a.text)
        a.nlp()

    # NLP
    data['summary'] = a.summary
    data['keywords'] = a.keywords
    data['tags'] = list(a.tags)

    # Media
    data['top_image'] = a.top_image
    data['images'] = list(a.images)  # a.images is a set; convert for JSON serialization
    data['movies'] = a.movies

    # Meta
    data['source_url'] = a.source_url
    # the Article attribute is publish_date (a datetime or None); stringify for JSON
    data['published_date'] = str(a.publish_date) if a.publish_date else None
    data['meta_img'] = a.meta_img
    data['meta_keywords'] = a.meta_keywords
    data['meta_lang'] = a.meta_lang

    return json.dumps(data)
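# A small usage sketch for home() above. The URL is a placeholder; home()
# returns a JSON string, so the caller decodes it before reading fields. Note
# that Article.nlp() needs the NLTK 'punkt' tokenizer data to be available.
import json

# payload = json.loads(home('https://example.com/news/some-article'))
# if 'error' in payload:
#     print('bad url:', payload['error'])
# else:
#     print(payload['title'])
#     print(payload['summary'])        # filled in by a.nlp()
#     print(payload['keywords'][:10])  # extracted keywords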
def ProcessPage(keyword, vBrowser, vNews_name, vNews_url, language):
    """Process search result page,
    get articles and save them to a pandas dataframe (articles_page)
    (1) list results from page
    (2) loop over results, get article
    (3) return dataframe
    """
    # output: pandas dataframe with title, publishing date, article text and url
    articles_page = pd.DataFrame(
        columns=['title', 'publish_date', 'text', 'url'])

    # 1) list results
    search_result_page_source = vBrowser.page_source

    # make url regex-usable
    url_any = vNews_url
    url_any = re.sub(re.escape('?s=' + keyword), '', url_any)
    url_any = re.sub(re.escape('search?k=' + keyword), '', url_any)
    url_any = re.sub(r'\?m\=[0-9]{6}', '', url_any)
    url_any = re.escape(url_any) + r'(?=\S*[-])([0-9a-zA-Z-\/\.]+)'
    regex = re.compile(url_any)
    logger.info('searching for {}'.format(url_any))
    search_results = list(
        set([
            match[0] for match in regex.finditer(search_result_page_source)
            if keyword in match[0].lower()
        ]))

    if vNews_name in ['NewVision']:
        regex = re.compile(
            r'\/new\_vision\/news\/(?=\S*[-])([0-9a-zA-Z-\/\.]+)')
        search_results = list(
            set([
                match[0]
                for match in regex.finditer(search_result_page_source)
                if keyword in match[0].lower()
            ]))
        search_results = [
            'https://www.newvision.co.ug' + search_result
            for search_result in search_results
        ]

    if len(search_results) > 0:
        logger.info("found {0} article(s):".format(len(search_results)))
        for title in search_results:
            logger.info("url: {0}".format(title))
    else:
        logger.info('no articles found')

    # 2) for each result, get article and save it
    for idx, search_result in enumerate(search_results):

        logger.info('processing {}'.format(search_result))

        # download article, retrying up to 5 times
        article = Article(search_result, keep_article_html=True)
        article.download()
        attempts = 0
        while (article.download_state != 2) and (attempts < 5):
            # ArticleDownloadState.SUCCESS is 2
            attempts += 1
            time.sleep(1)
        if article.download_state != 2:
            logger.warning(
                'unable to download article: {}'.format(search_result))
            continue
        article.parse()
        article_html = str(article.html)

        # select articles with keyword
        regex = re.compile(keyword, re.IGNORECASE)
        if re.search(regex, article.html) is not None:
            logger.debug('{}'.format(article_html))

            # get date
            date = article.publish_date
            date_str = ""
            search_date = False
            if date is not None:
                # keep date found only if older than today
                # (compare the parsed date itself, not the still-empty date_str)
                if pd.to_datetime(date).date() < pd.to_datetime(
                        datetime.today()).date():
                    date_str = date.strftime(DATE_FORMAT)
                else:
                    search_date = True
            else:
                search_date = True

            if search_date:
                # fall back to scanning the raw html for date-like strings
                article_html = re.sub(r'\s+', ' ', article_html)
                dates_found = []
                res_date = [
                    re.compile(r'[a-zA-ZÀ-ÿ]\w+\s[0-9]+\,\s[0-9]{4}'),
                    re.compile(r'[a-zA-ZÀ-ÿ]\w+\s[0-9]+\s[0-9]{4}'),
                    re.compile(r'[0-9]\w+\s[a-zA-ZÀ-ÿ]+\,\s[0-9]{4}'),
                    re.compile(r'[0-9]\w+\s[a-zA-ZÀ-ÿ]+\s[0-9]{4}'),
                    re.compile(r'[0-9]+\s[a-zA-ZÀ-ÿ]+\,\s[0-9]{4}'),
                    re.compile(r'[0-9]+\s[a-zA-ZÀ-ÿ]+\s[0-9]{4}'),
                    re.compile(r'[0-9]{2}\/[0-9]{2}\/[0-9]{4}'),
                    re.compile(r'[0-9]{2}\-[0-9]{2}\-[0-9]{4}'),
                    re.compile(r'[0-9]{2}\.[0-9]{2}\.[0-9]{4}')
                ]
                for re_date in res_date:
                    for match in re_date.finditer(article_html):
                        if is_date(match.group(), language):
                            dates_found.append((match.start(), match.group()))
                if len(dates_found) > 0:
                    logger.info('{}'.format(dates_found))
                    # keep the earliest match in the document order
                    dates_found.sort(key=lambda tup: tup[0])
                    for res in dates_found:
                        try:
                            res_date = dateparser.parse(
                                res[1],
                                languages=[language],
                                settings={'DATE_ORDER': 'DMY'}).date()
                            if (res_date < pd.to_datetime(
                                    datetime.today()).date()
                                    and res_date > pd.to_datetime(
                                        '30/04/1993',
                                        format="%d/%m/%Y").date()):
                                date_str = res_date.strftime(DATE_FORMAT)
                                break
                        except Exception:
                            pass

            if date_str == "":
                logger.warning(
                    'Publication date not found or wrongly assigned, skipping article')
                continue
            else:
                logger.info('Publication date assigned: {}'.format(date_str))

            # Take newspaper name out of article title
            article.title = remove_newspaper_name_from_title(
                article.title, vNews_name)

            # if no text is present (e.g. only video), use title as text
            article_text = article.text
            if len(str(article.text)) == 0:
                article_text = article.title

            # add to dataframe
            logger.info('{0} : {1}'.format(article.title, date_str))
            articles_page.loc[idx] = [
                article.title, date_str, article_text, article.url
            ]

    # 3) return dataframe
    if len(search_results) > 0:
        logger.info('{}'.format(articles_page.head()))
    return articles_page
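# The date-scanning loop above relies on an is_date() helper that is not shown
# in this snippet. A minimal sketch of what it could look like, assuming it only
# checks whether dateparser can interpret the candidate string in the given
# language (an assumed implementation, not the original one):
import dateparser


def is_date(text, language):
    """Return True if dateparser can parse `text` as a date in `language`."""
    try:
        return dateparser.parse(text, languages=[language],
                                settings={'DATE_ORDER': 'DMY'}) is not None
    except Exception:
        return False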
def ProcessPage(vBrowser, vArticles_all):
    """Process search result page,
    get articles and save them to a pandas dataframe (articles_page)
    (1) list results from page
    (2) loop over results, get article and store it
    (3) return dataframe
    """
    # output: pandas dataframe with title, publishing date, article text and url
    articles_page = pd.DataFrame(
        columns=['title', 'publish_date', 'text', 'url'])

    # 1) list results
    search_result_page_source = vBrowser.page_source

    # for ZambianObserver *********************************
    regex = re.compile(
        r'https:\/\/www\.zambianobserver\.com\/(?=\S*[-])([0-9a-zA-Z-]+)\/')
    search_results = list(
        set([
            match[0] for match in regex.finditer(search_result_page_source)
            if "flood" in match[0].lower()
        ]))

    # for TimesOfZambia ***********************************
    # search_results_we = vBrowser.find_elements_by_class_name("readmore")
    # search_results = [search_result.get_attribute("href") for search_result in search_results_we]

    # for Lusakatimes ***********************************
    # search_results_we = vBrowser.find_elements_by_class_name("td-image-wrap")
    # regex_prefilter = re.compile(r'flood', re.IGNORECASE)
    # search_results = [search_result.get_attribute("href") for search_result in search_results_we
    #                   if re.search(regex_prefilter, search_result.get_attribute("title")) is not None]

    # for ZambiaDailyMail ***********************************
    # regex = re.compile(r'http:\/\/www\.daily-mail\.co\.zm\/(?=\S*[-])([0-9a-zA-Z-]+)\/')
    # search_results = list(set([match[0] for match in regex.finditer(search_result_page_source)
    #                            if "flood" in match[0].lower()]))

    # for ZambianWatchdog ***********************************
    # regex = re.compile(r'https:\/\/www\.zambiawatchdog\.com\/(?=\S*[-])([0-9a-zA-Z-]+)\/')
    # search_results = list(set([match[0] for match in regex.finditer(search_result_page_source)
    #                            if "flood" in match[0].lower()]))

    if len(search_results) > 0:
        print("found article(s):")
        for title in search_results:
            print("url: {0}".format(title))

    # 2) for each result, get article and save it
    for idx, search_result in enumerate(search_results):

        # download article
        article = Article(search_result)
        article.download()
        while article.download_state != 2:
            # ArticleDownloadState.SUCCESS is 2
            time.sleep(1)
        article.parse()

        # select articles with "flood"
        regex = re.compile(r'flood', re.IGNORECASE)
        if re.search(regex, article.title) is not None:

            # get date
            date = article.publish_date
            if date is not None:
                date_str = date.strftime('%m/%d/%Y')
            else:
                # for TimesOfZambia *******************************************
                # date_re = re.search(r'[a-zA-z]\w+\s[0-9][0-9]\,\s[0-9]{4}', article.html)
                # date_str = date_re[0]

                # for ZambiaDailyMail, LusakaTimes ****************************
                dates_all = [
                    m.group(0) for m in re.finditer(
                        r'[a-zA-z]\w+\s[0-9]+\,\s[0-9]{4}', article.html)
                ]
                if len(dates_all) > 1:
                    date_str = dates_all[1]
                else:
                    date_str = ""

                # for ZambianWatchdog *****************************************
                # dates_all = [m.group(0) for m in re.finditer(r'[a-zA-z]\w+\s[0-9]+\,\s[0-9]{4}', article.html)]
                # if len(dates_all) > 1:
                #     date_str = dates_all[0]
                # else:
                #     date_str = ""
                # *************************************************************

            # fix title, if necessary (only for LusakaTimes)
            article.title = re.sub('Zambia : ', '', article.title)

            # add to dataframe
            articles_page.loc[idx] = [
                article.title, date_str, article.text, article.url
            ]

            # print dataframe head, to check that things make sense
            if idx == 3:
                print(articles_page.head())

    # 3) return dataframe
    # (DataFrame.append was removed in pandas 2.0; concat is the equivalent)
    vArticles_all = pd.concat([vArticles_all, articles_page], ignore_index=True)
    return vArticles_all
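# A hedged driver sketch for the Zambia-specific ProcessPage() above. The search
# URL and the choice of Selenium's Chrome driver are assumptions; the original
# scraper may open the site and paginate differently.
import pandas as pd
from selenium import webdriver

# browser = webdriver.Chrome()
# browser.get('https://www.zambianobserver.com/?s=flood')   # assumed search URL
# articles_all = pd.DataFrame(columns=['title', 'publish_date', 'text', 'url'])
# articles_all = ProcessPage(browser, articles_all)
# browser.quit()
# articles_all.to_csv('flood_articles.csv', index=False)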