def parse(url, language):
    article = Article(url, language=language)
    article.build()
    if article.keywords:
        print('Article Keywords: ' + json.dumps(article.keywords) + '\n')
    if article.summary:
        print('Article Summary: ' + article.summary + '\n')
    print('Article Text: ' + article.text)
def test_article_pdf_fetching():
    article = Article(url='https://www.adobe.com/pdf/pdfs/ISO32000-1PublicPatentLicense.pdf')
    article.build()
    # the PDF should have been converted to text, not kept as raw PDF bytes
    assert not article.html.startswith('%PDF-')
    assert len(article.keywords)
    assert len(article.authors)
    assert article.publish_date
    assert article.summary
    assert len(article.text) > len(article.summary)
    assert article.text
    assert article.url
def test_thai_pdf_extract():
    article = Article(
        url="http://tpch-th.listedcompany.com/misc/ShareholderMTG/egm201701/20170914-tpch-egm201701-enc02-th.pdf",
        language='th')
    article.build()
    assert not article.html.startswith('%PDF-')
    assert len(article.keywords)
    assert len(article.authors)
    assert article.publish_date
    assert article.summary
    assert len(article.text) > len(article.summary)
    assert article.text
    assert article.url
def test_article_pdf_ignoring():
    config = Configuration()
    empty_pdf = "%PDF-"  # empty PDF constant
    config.ignored_content_types_defaults = {"application/pdf": empty_pdf,
                                             "application/x-pdf": empty_pdf,
                                             "application/x-bzpdf": empty_pdf,
                                             "application/x-gzpdf": empty_pdf}
    article = Article(
        url='https://www.adobe.com/pdf/pdfs/ISO32000-1PublicPatentLicense.pdf',
        config=config
    )
    article.download()
    assert empty_pdf == article.html
def test_article_custom_params():
    a = Article(url='http://www.cnn.com/2013/11/27/travel/weather-thanksgiving/index.html',
                language='zh',
                memoize_articles=False)
    assert 'zh' == a.config.language
    assert not a.config.memoize_articles
    assert not a.config.use_meta_language
def validate(url, language):
    config = Configuration()
    config.follow_meta_refresh = True
    # BUG was that the website reported its language as zh-Hant-TW when it
    # really was en, so ignore the meta language in favor of `language`
    config.use_meta_language = False
    config.set_language(language)
    config.http_success_only = False
    article = Article(url, config=config)
    article.download()
    article.parse()
    assert len(article.text)
    article.nlp()
    return article
def test_hindi_news():
    url = "https://www.indiatv.in/maharashtra/maharashtra-coronavirus-latest-update-news-721708"
    article = Article(url=url, language='hi')
    article.download()
    article.parse()
    article.nlp()
    assert len(article.keywords)
    assert len(article.authors)
    # assert article.publish_date
    assert article.summary
    assert article.text
    assert len(article.summary) <= len(article.text)
    assert article.url
def test_arabic_news():
    url = "https://www.bbc.com/arabic/live/53203730"
    article = Article(url=url, language='ar')
    article.download()
    article.parse()
    article.nlp()
    assert len(article.keywords)
    assert len(article.authors)
    # assert article.publish_date
    assert article.summary
    assert article.text
    # assert len(article.summary) <= len(article.text)
    assert article.url
def test_spanish_news():
    # the old 'embarcacion-nicaragueense-...-articulo-616181' URL was dead;
    # only the current one is kept
    url = "https://www.elespectador.com/noticias/nacional/embarcacion-nicaraguense-realizaba-pesca-ilegal-en-aguas-colombianas/"
    article = Article(url=url, language='es')
    article.download()
    article.parse()
    article.nlp()
    assert len(article.keywords)
    assert len(article.authors)
    # assert article.publish_date
    assert article.summary
    assert article.text
    # assert len(article.summary) <= len(article.text)
    assert article.url
def check_url(args):
    """Download and parse one (url, fixture filename) pair, returning a
    (pubdate_failed, fulltext_failed) tuple of booleans.

    :param args: tuple of (url, res_filename)
    """
    url, res_filename = args
    pubdate_failed, fulltext_failed = False, False
    html = mock_resource_with(res_filename, 'html')
    try:
        a = Article(url)
        a.download(html)
        a.parse()
        if a.publish_date is None:
            pubdate_failed = True
            print(f"BAD_PUBDATE={url}")
    except Exception:
        print('<< URL: %s parse ERROR >>' % url)
        traceback.print_exc()
        pubdate_failed, fulltext_failed = True, True
    else:
        correct_text = mock_resource_with(res_filename, 'txt')
        if not (a.text == correct_text):
            # print('Diff: ', simplediff.diff(correct_text, a.text))
            # `correct_text` holds the reason for the failure, if any
            print('%s -- %s -- %s' % ('Fulltext failed', res_filename,
                                      correct_text.strip()))
            fulltext_failed = True
            # TODO: assert statements are commented out for full-text
            # extraction tests because we are constantly tweaking the
            # algorithm and improving
            # assert a.text == correct_text
    return pubdate_failed, fulltext_failed
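
# Many tests here call `mock_resource_with` to read a pre-downloaded fixture
# instead of hitting the network. A minimal sketch of such a helper, assuming
# fixtures live under a TEST_DIR data directory (the TEST_DIR constant and
# directory layout are assumptions, not necessarily this project's real helper):
def mock_resource_with(filename, resource_type):
    """Return the contents of a pre-downloaded 'html' or 'txt' fixture."""
    if resource_type not in ('html', 'txt'):
        raise ValueError('resource_type must be "html" or "txt"')
    resource_path = os.path.join(
        TEST_DIR, 'data', resource_type,
        '{}.{}'.format(filename, resource_type))
    with codecs.open(resource_path, 'r', 'utf-8') as f:
        return f.read()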
def create_article(url, language):
    config = Configuration()  # initialize the runtime configuration
    config.follow_meta_refresh = True
    config.use_meta_language = False
    config.set_language(language)
    config.http_success_only = False
    config.ignored_content_types_defaults = {
        # "application/pdf": "%PDF-",
        # "application/x-pdf": "%PDF-",
        "application/x-bzpdf": "%PDF-",
        "application/x-gzpdf": "%PDF-"
    }
    return Article(url, config=config)
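
# Usage sketch for create_article; the URL below is hypothetical, and build()
# chains download, parse, and nlp in one call:
def example_build_article():
    article = create_article('https://example.com/report.pdf', 'en')
    article.build()
    print(article.title)
    print(article.summary)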
def test_thai_fulltext_extract():
    url = 'https://prachatai.com/journal/2019/01/80642'
    article = Article(url=url, language='th')
    html = mock_resource_with('thai_article', 'html')
    article.download(html)
    article.parse()
    text = mock_resource_with('thai', 'txt')
    assert text == article.text
    assert text == fulltext(article.html, 'th')
def test_japanese_fulltext_extract2():
    url = 'http://www.afpbb.com/articles/-/3178894'
    article = Article(url=url, language='ja')
    html = mock_resource_with('japanese_article2', 'html')
    article.download(html)
    article.parse()
    text = mock_resource_with('japanese2', 'txt')
    assert text == article.text
    assert text == fulltext(article.html, 'ja')
def test_japanese_fulltext_extract():
    url = 'https://www.nikkei.com/article/DGXMZO31897660Y8A610C1000000/?n_cid=DSTPCS001'
    article = Article(url=url, language='ja')
    html = mock_resource_with('japanese_article', 'html')
    article.download(html)
    article.parse()
    text = mock_resource_with('japanese', 'txt')
    assert text == article.text
    assert text == fulltext(article.html, 'ja')
def test_chinese_fulltext_extract():
    url = 'http://news.sohu.com/20050601/n225789219.shtml'
    article = Article(url=url, language='zh')
    html = mock_resource_with('chinese_article', 'html')
    article.download(html)
    article.parse()
    text = mock_resource_with('chinese', 'txt')
    assert text == article.text
    assert text == fulltext(article.html, 'zh')
def test_spanish_fulltext_extract():
    url = 'http://ultimahora.es/mallorca/noticia/noticias/local/fiscal' \
          'ia-anticorrupcion-estudia-recurre-imputacion-infanta.html'
    article = Article(url=url, language='es')
    html = mock_resource_with('spanish_article', 'html')
    article.download(html)
    article.parse()
    text = mock_resource_with('spanish', 'txt')
    assert text == article.text
    assert text == fulltext(article.html, 'es')
def test_arabic_fulltext_extract():
    url = 'http://arabic.cnn.com/2013/middle_east/8/3/syria.clashes/' \
          'index.html'
    article = Article(url=url)
    html = mock_resource_with('arabic_article', 'html')
    article.download(html)
    article.parse()
    assert 'ar' == article.meta_lang
    text = mock_resource_with('arabic', 'txt')
    assert text == article.text
    assert text == fulltext(article.html, 'ar')
def test_article_default_params():
    a = Article(url='http://www.cnn.com/2013/11/27/travel/weather-thanksgiving/index.html')
    assert 'en' == a.config.language
    assert a.config.memoize_articles
    assert a.config.use_meta_language
def article_thread_pool(urls, config):
    articles = [Article(url.replace("\n", ""), config=config) for url in urls]
    news_pool = NewsPool(config=config)
    news_pool.set(articles)
    news_pool.join()
    return articles
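
# Usage sketch for article_thread_pool; the urls.txt path is hypothetical.
# The pool only downloads the articles, so each one still needs parse():
def example_thread_pool_run(urls_path='urls.txt'):
    config = Configuration()
    config.memoize_articles = False
    with open(urls_path) as f:
        articles = article_thread_pool(f.readlines(), config)
    for article in articles:
        article.parse()
    return articles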
class ArticleTestCase(unittest.TestCase):
    def setup_stage(self, stage_name):
        stages = OrderedDict([
            ('initial', lambda: None),
            ('download', lambda: self.article.download(
                mock_resource_with('cnn_article', 'html'))),
            ('parse', lambda: self.article.parse()),
            ('meta', lambda: None),  # Alias for nlp
            ('nlp', lambda: self.article.nlp())
        ])
        assert stage_name in stages
        # run every stage that precedes the requested one
        for name, action in stages.items():
            if name == stage_name:
                break
            action()

    def setUp(self):
        """Called before the first test case of this unit begins
        """
        self.article = Article(
            url='http://www.cnn.com/2013/11/27/travel/weather-thanksgiving/index.html?iref=allsearch')

    @print_test
    def test_url(self):
        self.assertEqual(
            'http://www.cnn.com/2013/11/27/travel/weather-thanksgiving/index.html?iref=allsearch',
            self.article.url)

    @print_test
    def test_download_html(self):
        self.setup_stage('download')
        html = mock_resource_with('cnn_article', 'html')
        self.article.download(html)
        self.assertTrue(DOWNLOADED in self.article.workflow)
        self.assertEqual(self.article.download_exception_msg, None)
        self.assertEqual(75406, len(self.article.html))

    @print_test
    def test_meta_refresh_redirect(self):
        # TODO: We actually hit example.com in this unit test, which is bad.
        # Figure out how to mock an actual redirect.
        config = Configuration()
        config.follow_meta_refresh = True
        article = Article('', config=config)
        html = mock_resource_with('google_meta_refresh', 'html')
        article.download(input_html=html)
        article.parse()
        self.assertEqual(article.title, 'Example Domain')

    @print_test
    def test_meta_refresh_no_url_redirect(self):
        config = Configuration()
        config.follow_meta_refresh = True
        article = Article('', config=config)
        html = mock_resource_with('ap_meta_refresh', 'html')
        article.download(input_html=html)
        article.parse()
        self.assertEqual(article.title, 'News from The Associated Press')

    @print_test
    def test_pre_download_parse(self):
        """Calling `parse()` before `download()` should raise an error
        """
        article = Article(self.article.url)
        self.assertRaises(ArticleException, article.parse)

    @print_test
    def test_parse_html(self):
        self.setup_stage('parse')
        AUTHORS = ['Chien-Ming Wang', 'Dana A. Ford', 'James S.A. Corey',
                   'Tom Watkins']
        TITLE = 'After storm, forecasters see smooth sailing for Thanksgiving'
        LEN_IMGS = 46
        META_LANG = 'en'
        META_SITE_NAME = 'CNN'
        self.article.parse()
        self.article.nlp()
        text = mock_resource_with('cnn', 'txt')
        self.maxDiff = None
        self.assertEqual(text.strip(), self.article.text)
        self.assertEqual(text, fulltext(self.article.html))
        # NOTE: top_img extraction requires an internet connection,
        # unlike the rest of this test file
        TOP_IMG = ('http://i2.cdn.turner.com/cnn/dam/assets/131129200805-'
                   '01-weather-1128-story-top.jpg')
        self.assertEqual(TOP_IMG, self.article.top_img)
        self.assertCountEqual(AUTHORS, self.article.authors)
        self.assertEqual(TITLE, self.article.title)
        self.assertEqual(LEN_IMGS, len(self.article.imgs))
        self.assertEqual(META_LANG, self.article.meta_lang)
        self.assertEqual(META_SITE_NAME, self.article.meta_site_name)
        self.assertEqual('2013-11-27', str(self.article.publish_date))

    @print_test
    def test_meta_type_extraction(self):
        self.setup_stage('meta')
        meta_type = self.article.extractor.get_meta_type(
            self.article.clean_doc)
        self.assertEqual('article', meta_type)

    @print_test
    def test_meta_extraction(self):
        self.setup_stage('meta')
        meta = self.article.extractor.get_meta_data(self.article.clean_doc)
        META_DATA = defaultdict(dict, {
            'medium': 'news',
            'googlebot': 'noarchive',
            'pubdate': '2013-11-27T08:36:32Z',
            'title': 'After storm, forecasters see smooth sailing for Thanksgiving - CNN.com',
            'og': {'site_name': 'CNN',
                   'description': 'A strong storm struck much of the eastern United States on Wednesday, complicating holiday plans for many of the 43 million Americans expected to travel.',
                   'title': 'After storm, forecasters see smooth sailing for Thanksgiving',
                   'url': 'http://www.cnn.com/2013/11/27/travel/weather-thanksgiving/index.html',
                   'image': 'http://i2.cdn.turner.com/cnn/dam/assets/131129200805-01-weather-1128-story-top.jpg',
                   'type': 'article'},
            'section': 'travel',
            'author': 'Dana A. Ford, James S.A. Corey, Chien-Ming Wang, and Tom Watkins, CNN',
            'robots': 'index,follow',
            'vr': {'canonical': 'http://edition.cnn.com/2013/11/27/travel/weather-thanksgiving/index.html'},
            'source': 'CNN',
            'fb': {'page_id': 18793419640, 'app_id': 80401312489},
            'keywords': 'winter storm,holiday travel,Thanksgiving storm,Thanksgiving winter storm',
            'article': {'publisher': 'https://www.facebook.com/cnninternational'},
            'lastmod': '2013-11-28T02:03:23Z',
            'twitter': {'site': {'identifier': '@CNNI', 'id': 2097571},
                        'card': 'summary',
                        'creator': {'identifier': '@cnntravel', 'id': 174377718}},
            'viewport': 'width=1024',
            'news_keywords': 'winter storm,holiday travel,Thanksgiving storm,Thanksgiving winter storm'
        })
        self.assertDictEqual(META_DATA, meta)
        # if the value for a meta key is another dict, that dict ought to be
        # filled with keys and values
        dict_values = [v for v in list(meta.values()) if isinstance(v, dict)]
        self.assertTrue(all([len(d) > 0 for d in dict_values]))
        # there are exactly 5 top-level dict-valued keys (like "og")
        is_dict = lambda v: isinstance(v, dict)
        self.assertEqual(5, len([i for i in meta.values() if is_dict(i)]))
        # there are exactly 12 top-level string-valued keys (like "pubdate")
        is_string = lambda v: isinstance(v, str)
        self.assertEqual(12, len([i for i in meta.values() if is_string(i)]))

    @print_test
    def test_pre_download_nlp(self):
        """Test running NLP algorithms before even downloading the article
        """
        self.setup_stage('initial')
        new_article = Article(self.article.url)
        self.assertRaises(ArticleException, new_article.nlp)

    @print_test
    def test_pre_parse_nlp(self):
        """Test running NLP algorithms before parsing the article
        """
        self.setup_stage('parse')
        self.assertRaises(ArticleException, self.article.nlp)

    @print_test
    def test_nlp_body(self):
        self.setup_stage('nlp')
        self.article.nlp()
        KEYWORDS = ['storm', 'weather', 'new', 'york', 'flight', 'balloons',
                    'roads', 'delays', 'parade', 'people', 'winds', 'snow']
        self.assertCountEqual(KEYWORDS, self.article.keywords)
        SUMMARY = mock_resource_with('cnn_summary', 'txt')
        self.assertEqual(SUMMARY, self.article.summary)

    @print_test
    def test_download_file_success(self):
        url = "file://" + os.path.join(HTML_FN, "cnn_article.html")
        article = Article(url=url)
        article.download()
        self.assertTrue(DOWNLOADED in article.workflow)
        self.assertEqual(article.download_exception_msg, None)
        self.assertEqual(75406, len(article.html))

    @print_test
    def test_download_file_failure(self):
        url = "file://" + os.path.join(HTML_FN, "does_not_exist.html")
        article = Article(url=url)
        # assert on the article's state after the attempt, so the test also
        # fails if download() records the error instead of raising
        try:
            article.download()
        except ArticleException:
            pass
        self.assertEqual(0, len(article.html))
        self.assertTrue(DOWNLOADED not in article.workflow)
        self.assertEqual(article.download_exception_msg,
                         "No such file or directory")

    @print_test
    def test_wikipedia_tables(self):
        url = "https://en.wikipedia.org/wiki/International_Phonetic_Alphabet_chart_for_English_dialects"
        article = Article(url=url)
        article.build()
        self.assertTrue(DOWNLOADED in article.workflow)
        self.assertEqual(article.download_exception_msg, None)
        # write the extracted tables out in tab-separated format; the
        # with-statement closes each file, so no explicit close() is needed
        page = os.path.split(url)[1]
        for table in article.tables:
            fname = '../{}_t{}.tsv'.format(page, table['name'])
            with codecs.open(fname, 'w') as f:
                for row in table['rows']:
                    row_str = '\t'.join(row).replace('\n', '')
                    f.write(row_str + '\n')
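
# Conventional unittest entry point so the suite can be run directly
# (a minimal sketch; the project may use its own test runner instead):
if __name__ == '__main__':
    unittest.main()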