Beispiel #1
0
 def test_download_file_success(self):
     """Downloading a local file:// URL should succeed and fill article.html."""
     source = "file://" + os.path.join(HTML_FN, "cnn_article.html")
     local_article = Article(url=source)
     local_article.download()
     self.assertIn(DOWNLOADED, local_article.workflow)
     self.assertIsNone(local_article.download_exception_msg)
     self.assertEqual(len(local_article.html), 75406)
Beispiel #2
0
def parse(url, language):
    """Build an article from *url* in *language* and print its NLP results."""
    article = Article(url, language=language)
    article.build()
    keywords = article.keywords
    if keywords:
        print('Article Keywords: ' + json.dumps(keywords) + '\n')
    summary = article.summary
    if summary:
        print('Article Summary: ' + summary + '\n')
    print('Article Text: ' + article.text)
Beispiel #3
0
 def test_download_file_failure(self):
     """A missing file:// URL must fail to download and raise ArticleException.

     Fix: the original try/except passed silently when no exception was
     raised (every assertion lived inside the except branch); an explicit
     ``else: self.fail(...)`` now guards against that vacuous pass. The
     unused ``as ex`` binding was dropped.
     """
     url = "file://" + os.path.join(HTML_FN, "does_not_exist.html")
     article = Article(url=url)
     try:
         article.download()
     except ArticleException:
         # Failure path: nothing downloaded, reason recorded on the article.
         self.assertEqual(0, len(article.html))
         self.assertTrue(DOWNLOADED not in article.workflow)
         self.assertEqual(article.download_exception_msg, "No such file or directory")
     else:
         self.fail("ArticleException was not raised for a missing file")
def test_article_pdf_fetching():
    """A PDF URL should be fetched and converted, not kept as raw %PDF data."""
    article = Article(url='https://www.adobe.com/pdf/pdfs/ISO32000-1PublicPatentLicense.pdf')
    article.build()
    assert not article.html.startswith('%PDF-')
    for extracted in (article.keywords, article.authors):
        assert len(extracted)
    assert article.publish_date
    assert article.summary
    assert len(article.text) > len(article.summary)
    assert article.text
    assert article.url
Beispiel #5
0
def test_thai_pdf_extract():
    """A Thai-language PDF should be fetched and converted to text."""
    article = Article(
        url="http://tpch-th.listedcompany.com/misc/ShareholderMTG/egm201701/20170914-tpch-egm201701-enc02-th.pdf",
        language='th')
    article.build()
    assert not article.html.startswith('%PDF-')
    for extracted in (article.keywords, article.authors):
        assert len(extracted)
    assert article.publish_date
    assert article.summary
    assert len(article.text) > len(article.summary)
    assert article.text
    assert article.url
def test_article_pdf_ignoring():
    """With PDF content types ignored, download() keeps only the %PDF- marker."""
    empty_pdf = "%PDF-"  # empty PDF constant
    pdf_types = ("application/pdf", "application/x-pdf",
                 "application/x-bzpdf", "application/x-gzpdf")
    config = Configuration()
    config.ignored_content_types_defaults = dict.fromkeys(pdf_types, empty_pdf)
    article = Article(
        url='https://www.adobe.com/pdf/pdfs/ISO32000-1PublicPatentLicense.pdf',
        config=config
    )
    article.download()
    assert article.html == empty_pdf
def test_article_custom_params():
    """Keyword arguments passed to Article should override config defaults."""
    article = Article(
        url='http://www.cnn.com/2013/11/27/travel/weather-thanksgiving/index.html',
        language='zh',
        memoize_articles=False)
    cfg = article.config
    assert cfg.language == 'zh'
    assert not cfg.memoize_articles
    assert not cfg.use_meta_language
Beispiel #8
0
 def test_wikipedia_tables(self):
     """Build a Wikipedia article and dump each extracted table to a TSV file.

     Fixes: iterate rows directly instead of indexing via range(len(...)),
     drop the redundant f.close() inside the ``with`` block (the context
     manager already closes the file), and correct the 'seperated' typo.
     """
     url = "https://en.wikipedia.org/wiki/International_Phonetic_Alphabet_chart_for_English_dialects"
     article = Article(url=url)
     article.build()
     self.assertTrue(DOWNLOADED in article.workflow)
     self.assertEqual(article.download_exception_msg, None)
     # write data out to tab separated format
     page = os.path.split(url)[1]
     for table in article.tables:
         fname = '../{}_t{}.tsv'.format(page, table['name'])
         with codecs.open(fname, 'w') as f:
             for row in table['rows']:
                 # flatten embedded newlines so each table row stays one line
                 row_str = '\t'.join(row).replace('\n', '')
                 f.write(row_str + '\n')
def validate(url, language):
    """Download, parse, and run NLP on *url* with a forced language; return the article."""
    config = Configuration()
    config.follow_meta_refresh = True
    # BUG was that website reported language as zh-Hant-TW when it really was en!
    config.use_meta_language = False
    config.set_language(language)
    config.http_success_only = False
    article = Article(url, config=config)
    for step in (article.download, article.parse):
        step()
    assert len(article.text)
    article.nlp()
    return article
Beispiel #10
0
def test_hindi_news():
    """End-to-end extraction of a Hindi (hi) news article."""
    article = Article(
        url="https://www.indiatv.in/maharashtra/maharashtra-coronavirus-latest-update-news-721708",
        language='hi')
    article.download()
    article.parse()
    article.nlp()
    for extracted in (article.keywords, article.authors):
        assert len(extracted)
    # assert article.publish_date
    assert article.summary
    assert article.text
    assert len(article.summary) <= len(article.text)
    assert article.url
Beispiel #11
0
def test_arabic_news():
    """End-to-end extraction of an Arabic (ar) news article."""
    article = Article(url="https://www.bbc.com/arabic/live/53203730", language='ar')
    article.download()
    article.parse()
    article.nlp()
    for extracted in (article.keywords, article.authors):
        assert len(extracted)
    # assert article.publish_date
    assert article.summary
    assert article.text
    # assert len(article.summary) <= len(article.text)
    assert article.url
def test_spanish_news():
    """End-to-end extraction of a Spanish (es) news article.

    Fix: the first ``url = ...`` assignment was dead code, immediately
    overwritten by the next line; it is preserved below only as a comment
    documenting the superseded URL.
    """
    # superseded URL:
    # "https://www.elespectador.com/noticias/nacional/embarcacion-nicaragueense-realizaba-pesca-ilegal-aguas-articulo-616181"
    url = "https://www.elespectador.com/noticias/nacional/embarcacion-nicaraguense-realizaba-pesca-ilegal-en-aguas-colombianas/"
    article = Article(url=url, language='es')
    article.download()
    article.parse()
    article.nlp()
    assert len(article.keywords)
    assert len(article.authors)
    # assert article.publish_date
    assert article.summary
    assert article.text
    # assert len(article.summary) <= len(article.text)
    assert article.url
Beispiel #13
0
 def check_url(args):
     """Download and parse one mocked URL and report extraction health.

     :param args: a ``(url, res_filename)`` tuple; the HTML fixture named
         ``res_filename`` is fed to ``Article.download()`` so no network
         access happens here.
     :return: ``(pubdate_failed, fulltext_failed)`` booleans.
     """
     url, res_filename = args
     pubdate_failed, fulltext_failed = False, False
     html = mock_resource_with(res_filename, 'html')
     try:
         a = Article(url)
         a.download(html)
         a.parse()
         # A missing publish date is a soft failure, logged but not raised.
         if a.publish_date is None:
             pubdate_failed = True
             print(f"BAD_PUBDATE={url}")
     except Exception:
         print('<< URL: %s parse ERROR >>' % url)
         traceback.print_exc()
         # Any parse crash counts as failing both checks.
         pubdate_failed, fulltext_failed = True, True
     else:
         # Only compare fulltext when parsing succeeded.
         correct_text = mock_resource_with(res_filename, 'txt')
         if not (a.text == correct_text):
             # print('Diff: ', simplediff.diff(correct_text, a.text))
             # `correct_text` holds the reason of failure if failure
             print('%s -- %s -- %s' %
                   ('Fulltext failed', res_filename, correct_text.strip()))
             fulltext_failed = True
             # TODO: assert statements are commented out for full-text
             # extraction tests because we are constantly tweaking the
             # algorithm and improving
             # assert a.text == correct_text
     return pubdate_failed, fulltext_failed
 def create_article(url, language):
     """Return an Article configured for forced-language, lenient-HTTP use."""
     config = Configuration()
     # initialization runtime configuration
     config.follow_meta_refresh = True
     config.use_meta_language = False
     config.set_language(language)
     config.http_success_only = False
     pdf_marker = "%PDF-"
     config.ignored_content_types_defaults = {
         # "application/pdf": pdf_marker,
         # "application/x-pdf": pdf_marker,
         "application/x-bzpdf": pdf_marker,
         "application/x-gzpdf": pdf_marker,
     }
     return Article(url, config=config)
Beispiel #15
0
def test_thai_fulltext_extract():
    """The Thai fixture HTML should extract to the expected fulltext."""
    article = Article(url='https://prachatai.com/journal/2019/01/80642', language='th')
    article.download(mock_resource_with('thai_article', 'html'))
    article.parse()
    expected = mock_resource_with('thai', 'txt')
    assert article.text == expected
    assert fulltext(article.html, 'th') == expected
Beispiel #16
0
 def test_meta_refresh_no_url_redirect(self):
     """A meta-refresh page with no target URL keeps the original page title."""
     config = Configuration()
     config.follow_meta_refresh = True
     article = Article('', config=config)
     article.download(input_html=mock_resource_with('ap_meta_refresh', 'html'))
     article.parse()
     self.assertEqual('News from The Associated Press', article.title)
Beispiel #17
0
def test_japanese_fulltext_extract2():
    """Japanese fixture #2 should extract to the expected fulltext."""
    article = Article(url='http://www.afpbb.com/articles/-/3178894', language='ja')
    article.download(mock_resource_with('japanese_article2', 'html'))
    article.parse()
    expected = mock_resource_with('japanese2', 'txt')
    assert article.text == expected
    assert fulltext(article.html, 'ja') == expected
Beispiel #18
0
def test_japanese_fulltext_extract():
    """The Japanese fixture should extract to the expected fulltext."""
    article = Article(
        url='https://www.nikkei.com/article/DGXMZO31897660Y8A610C1000000/?n_cid=DSTPCS001',
        language='ja')
    article.download(mock_resource_with('japanese_article', 'html'))
    article.parse()
    expected = mock_resource_with('japanese', 'txt')
    assert article.text == expected
    assert fulltext(article.html, 'ja') == expected
Beispiel #19
0
def test_chinese_fulltext_extract():
    """The Chinese fixture should extract to the expected fulltext."""
    article = Article(url='http://news.sohu.com/20050601/n225789219.shtml', language='zh')
    article.download(mock_resource_with('chinese_article', 'html'))
    article.parse()
    expected = mock_resource_with('chinese', 'txt')
    assert article.text == expected
    assert fulltext(article.html, 'zh') == expected
Beispiel #20
0
def test_spanish_fulltext_extract():
    """The Spanish fixture should extract to the expected fulltext."""
    url = ('http://ultimahora.es/mallorca/noticia/noticias/local/'
           'fiscalia-anticorrupcion-estudia-recurre-imputacion-infanta.html')
    article = Article(url=url, language='es')
    article.download(mock_resource_with('spanish_article', 'html'))
    article.parse()
    expected = mock_resource_with('spanish', 'txt')
    assert article.text == expected
    assert fulltext(article.html, 'es') == expected
Beispiel #21
0
 def test_meta_refresh_redirect(self):
     """Following the google fixture's meta refresh lands on example.com."""
     # TODO: We actually hit example.com in this unit test ... which is bad
     # Figure out how to mock an actual redirect
     config = Configuration()
     config.follow_meta_refresh = True
     article = Article('', config=config)
     article.download(input_html=mock_resource_with('google_meta_refresh', 'html'))
     article.parse()
     self.assertEqual('Example Domain', article.title)
Beispiel #22
0
def test_arabic_fulltext_extract():
    """The Arabic fixture: meta-language detection plus fulltext extraction."""
    url = ('http://arabic.cnn.com/2013/middle_east/8/3/syria.clashes/'
           'index.html')
    article = Article(url=url)
    article.download(mock_resource_with('arabic_article', 'html'))
    article.parse()
    assert article.meta_lang == 'ar'
    expected = mock_resource_with('arabic', 'txt')
    assert article.text == expected
    assert fulltext(article.html, 'ar') == expected
Beispiel #23
0
 def test_pre_download_nlp(self):
     """Running nlp() on a never-downloaded article must raise ArticleException."""
     self.setup_stage('initial')
     fresh = Article(self.article.url)
     with self.assertRaises(ArticleException):
         fresh.nlp()
Beispiel #24
0
 def setUp(self):
     """Called before the first test case of this unit begins

     Creates the shared CNN Article fixture used by the tests below.
     """
     self.article = Article(
         url='http://www.cnn.com/2013/11/27/travel/weather-thanksgiving/index.html?iref=allsearch')
Beispiel #25
0
 def test_pre_download_parse(self):
     """Calling `parse()` before `download()` should yield an error"""
     fresh = Article(self.article.url)
     with self.assertRaises(ArticleException):
         fresh.parse()
def test_article_default_params():
    """A bare Article defaults to English with the stock config flags."""
    article = Article(url='http://www.cnn.com/2013/11/27/travel/weather-thanksgiving/index.html')
    cfg = article.config
    assert cfg.language == 'en'
    assert cfg.memoize_articles
    assert cfg.use_meta_language
def article_thread_pool(urls, config):
    """Download *urls* concurrently through NewsPool; return the Article list."""
    articles = []
    for raw_url in urls:
        articles.append(Article(raw_url.replace("\n", ""), config=config))
    pool = NewsPool(config=config)
    pool.set(articles)
    pool.join()
    return articles
Beispiel #28
0
class ArticleTestCase(unittest.TestCase):
    def setup_stage(self, stage_name):
        """Run every pipeline stage strictly before *stage_name* on the fixture."""
        pipeline = [
            ('initial', lambda: None),
            ('download', lambda: self.article.download(
                mock_resource_with('cnn_article', 'html'))),
            ('parse', lambda: self.article.parse()),
            ('meta', lambda: None),  # Alias for nlp
            ('nlp', lambda: self.article.nlp()),
        ]
        assert stage_name in [name for name, _ in pipeline]
        for name, action in pipeline:
            if name == stage_name:
                break
            action()

    def setUp(self):
        """Called before the first test case of this unit begins

        Creates the shared CNN Article fixture used by the tests below.
        """
        self.article = Article(
            url='http://www.cnn.com/2013/11/27/travel/weather-thanksgiving/index.html?iref=allsearch')

    @print_test
    def test_url(self):
        """The constructor should store the URL verbatim."""
        expected = ('http://www.cnn.com/2013/11/27/travel/'
                    'weather-thanksgiving/index.html?iref=allsearch')
        self.assertEqual(expected, self.article.url)

    @print_test
    def test_download_html(self):
        """Downloading mocked HTML marks the workflow and stores the payload."""
        self.setup_stage('download')
        self.article.download(mock_resource_with('cnn_article', 'html'))
        self.assertIn(DOWNLOADED, self.article.workflow)
        self.assertIsNone(self.article.download_exception_msg)
        self.assertEqual(len(self.article.html), 75406)

    @print_test
    def test_meta_refresh_redirect(self):
        """Following the google fixture's meta refresh lands on example.com."""
        # TODO: We actually hit example.com in this unit test ... which is bad
        # Figure out how to mock an actual redirect
        config = Configuration()
        config.follow_meta_refresh = True
        article = Article('', config=config)
        article.download(input_html=mock_resource_with('google_meta_refresh', 'html'))
        article.parse()
        self.assertEqual('Example Domain', article.title)

    @print_test
    def test_meta_refresh_no_url_redirect(self):
        """A meta-refresh page with no target URL keeps the original page title."""
        config = Configuration()
        config.follow_meta_refresh = True
        article = Article('', config=config)
        article.download(input_html=mock_resource_with('ap_meta_refresh', 'html'))
        article.parse()
        self.assertEqual('News from The Associated Press', article.title)

    @print_test
    def test_pre_download_parse(self):
        """Calling `parse()` before `download()` should yield an error"""
        fresh = Article(self.article.url)
        with self.assertRaises(ArticleException):
            fresh.parse()

    @print_test
    def test_parse_html(self):
        """Parse the CNN fixture and verify extracted fields against constants.

        Fix: ``self.maxDiff=None`` was missing spaces around ``=`` (PEP 8).
        """
        self.setup_stage('parse')

        AUTHORS = ['Chien-Ming Wang', 'Dana A. Ford', 'James S.A. Corey',
                   'Tom Watkins']
        TITLE = 'After storm, forecasters see smooth sailing for Thanksgiving'
        LEN_IMGS = 46
        META_LANG = 'en'
        META_SITE_NAME = 'CNN'

        self.article.parse()
        self.article.nlp()

        text = mock_resource_with('cnn', 'txt')
        # show full diffs on mismatch for easier debugging
        self.maxDiff = None
        self.assertEqual(text.strip(), self.article.text)
        self.assertEqual(text, fulltext(self.article.html))

        # NOTE: top_img extraction requires an internet connection
        # unlike the rest of this test file
        TOP_IMG = ('http://i2.cdn.turner.com/cnn/dam/assets/131129200805-'
                   '01-weather-1128-story-top.jpg')
        self.assertEqual(TOP_IMG, self.article.top_img)

        self.assertCountEqual(AUTHORS, self.article.authors)
        self.assertEqual(TITLE, self.article.title)
        self.assertEqual(LEN_IMGS, len(self.article.imgs))
        self.assertEqual(META_LANG, self.article.meta_lang)
        self.assertEqual(META_SITE_NAME, self.article.meta_site_name)
        self.assertEqual('2013-11-27', str(self.article.publish_date))

    @print_test
    def test_meta_type_extraction(self):
        """The og:type of the CNN fixture should be 'article'."""
        self.setup_stage('meta')
        doc = self.article.clean_doc
        self.assertEqual('article', self.article.extractor.get_meta_type(doc))

    @print_test
    def test_meta_extraction(self):
        """get_meta_data should return the fixture's full meta-tag tree.

        Fixes: named lambdas (``is_dict``, ``is_string``) violated PEP 8
        E731 and are replaced by inline ``isinstance`` checks; the
        redundant ``list(...)`` wrapper around ``meta.values()`` is gone.
        """
        self.setup_stage('meta')
        meta = self.article.extractor.get_meta_data(self.article.clean_doc)
        META_DATA = defaultdict(dict, {
            'medium': 'news',
            'googlebot': 'noarchive',
            'pubdate': '2013-11-27T08:36:32Z',
            'title': 'After storm, forecasters see smooth sailing for Thanksgiving - CNN.com',
            'og': {'site_name': 'CNN',
                   'description': 'A strong storm struck much of the eastern United States on Wednesday, complicating holiday plans for many of the 43 million Americans expected to travel.',
                   'title': 'After storm, forecasters see smooth sailing for Thanksgiving',
                   'url': 'http://www.cnn.com/2013/11/27/travel/weather-thanksgiving/index.html',
                   'image': 'http://i2.cdn.turner.com/cnn/dam/assets/131129200805-01-weather-1128-story-top.jpg',
                   'type': 'article'},
            'section': 'travel',
            'author': 'Dana A. Ford, James S.A. Corey, Chien-Ming Wang, and Tom Watkins, CNN',
            'robots': 'index,follow',
            'vr': {
                'canonical': 'http://edition.cnn.com/2013/11/27/travel/weather-thanksgiving/index.html'},
            'source': 'CNN',
            'fb': {'page_id': 18793419640, 'app_id': 80401312489},
            'keywords': 'winter storm,holiday travel,Thanksgiving storm,Thanksgiving winter storm',
            'article': {
                'publisher': 'https://www.facebook.com/cnninternational'},
            'lastmod': '2013-11-28T02:03:23Z',
            'twitter': {'site': {'identifier': '@CNNI', 'id': 2097571},
                        'card': 'summary',
                        'creator': {'identifier': '@cnntravel',
                                    'id': 174377718}},
            'viewport': 'width=1024',
            'news_keywords': 'winter storm,holiday travel,Thanksgiving storm,Thanksgiving winter storm'
        })

        self.assertDictEqual(META_DATA, meta)

        # if the value for a meta key is another dict, that dict ought to be
        # filled with keys and values
        dict_values = [v for v in meta.values() if isinstance(v, dict)]
        self.assertTrue(all(len(d) > 0 for d in dict_values))

        # there are exactly 5 top-level "og:type" type keys
        self.assertEqual(5, len(dict_values))

        # there are exactly 12 top-level "pubdate" type keys
        self.assertEqual(
            12, len([v for v in meta.values() if isinstance(v, str)]))

    @print_test
    def test_pre_download_nlp(self):
        """Running nlp() on a never-downloaded article must raise ArticleException."""
        self.setup_stage('initial')
        fresh = Article(self.article.url)
        with self.assertRaises(ArticleException):
            fresh.nlp()

    @print_test
    def test_pre_parse_nlp(self):
        """Running nlp() before parse() must raise ArticleException."""
        self.setup_stage('parse')
        with self.assertRaises(ArticleException):
            self.article.nlp()

    @print_test
    def test_nlp_body(self):
        """nlp() should produce the expected keywords and summary."""
        self.setup_stage('nlp')
        self.article.nlp()
        expected_keywords = ('storm weather new york flight balloons roads '
                             'delays parade people winds snow').split()
        self.assertCountEqual(expected_keywords, self.article.keywords)
        expected_summary = mock_resource_with('cnn_summary', 'txt')
        self.assertEqual(expected_summary, self.article.summary)

    @print_test
    def test_download_file_success(self):
        """Downloading a local file:// URL should succeed and fill article.html."""
        source = "file://" + os.path.join(HTML_FN, "cnn_article.html")
        local_article = Article(url=source)
        local_article.download()
        self.assertIn(DOWNLOADED, local_article.workflow)
        self.assertIsNone(local_article.download_exception_msg)
        self.assertEqual(len(local_article.html), 75406)

    @print_test
    def test_download_file_failure(self):
        """A missing file:// URL must fail to download and raise ArticleException.

        Fix: the original try/except passed silently when no exception was
        raised (every assertion lived inside the except branch); an explicit
        ``else: self.fail(...)`` now guards against that vacuous pass. The
        unused ``as ex`` binding was dropped.
        """
        url = "file://" + os.path.join(HTML_FN, "does_not_exist.html")
        article = Article(url=url)
        try:
            article.download()
        except ArticleException:
            # Failure path: nothing downloaded, reason recorded on the article.
            self.assertEqual(0, len(article.html))
            self.assertTrue(DOWNLOADED not in article.workflow)
            self.assertEqual(article.download_exception_msg,
                             "No such file or directory")
        else:
            self.fail("ArticleException was not raised for a missing file")

    @print_test
    def test_wikipedia_tables(self):
        """Build a Wikipedia article and dump each extracted table to a TSV file.

        Fixes: iterate rows directly instead of indexing via range(len(...)),
        drop the redundant f.close() inside the ``with`` block (the context
        manager already closes the file), and correct the 'seperated' typo.
        """
        url = "https://en.wikipedia.org/wiki/International_Phonetic_Alphabet_chart_for_English_dialects"
        article = Article(url=url)
        article.build()
        self.assertTrue(DOWNLOADED in article.workflow)
        self.assertEqual(article.download_exception_msg, None)
        # write data out to tab separated format
        page = os.path.split(url)[1]
        for table in article.tables:
            fname = '../{}_t{}.tsv'.format(page, table['name'])
            with codecs.open(fname, 'w') as f:
                for row in table['rows']:
                    # flatten embedded newlines so each table row stays one line
                    row_str = '\t'.join(row).replace('\n', '')
                    f.write(row_str + '\n')