Exemple #1
0
 def check_url(args):
     """
     Parse one mocked article and report which extraction checks failed.

     :param args: ``(url, res_filename)`` pair; ``res_filename`` is handed to
         ``mock_resource_with`` to load both the ``html`` input and the
         ``txt`` text the extracted body is compared against.
     :return: ``(pubdate_failed, fulltext_failed)`` tuple of booleans.
     """
     url, res_filename = args
     html = mock_resource_with(res_filename, 'html')
     try:
         article = Article(url)
         article.download(html)
         article.parse()
         pubdate_failed = article.publish_date is None
         if pubdate_failed:
             print(f"BAD_PUBDATE={url}")
     except Exception:
         # Any error while downloading/parsing counts as a failure of both
         # checks; the traceback is printed rather than re-raised.
         print('<< URL: %s parse ERROR >>' % url)
         traceback.print_exc()
         return True, True

     expected_text = mock_resource_with(res_filename, 'txt')
     fulltext_failed = article.text != expected_text
     if fulltext_failed:
         # print('Diff: ', simplediff.diff(expected_text, article.text))
         # `expected_text` holds the reason of failure if failure
         print('%s -- %s -- %s' %
               ('Fulltext failed', res_filename, expected_text.strip()))
         # TODO: assert statements are commented out for full-text
         # extraction tests because we are constantly tweaking the
         # algorithm and improving
         # assert article.text == expected_text
     return pubdate_failed, fulltext_failed
Exemple #2
0
def test_thai_fulltext_extract():
    """Check Thai ('th') body extraction against the stored reference text.

    Both ``Article.text`` and the standalone ``fulltext`` helper must
    reproduce the ``thai`` txt resource exactly.
    """
    article = Article(
        url='https://prachatai.com/journal/2019/01/80642',
        language='th',
    )
    article.download(mock_resource_with('thai_article', 'html'))
    article.parse()

    expected = mock_resource_with('thai', 'txt')
    assert expected == article.text
    assert expected == fulltext(article.html, 'th')
Exemple #3
0
def test_japanese_fulltext_extract2():
    """Check Japanese ('ja') body extraction on the second sample article.

    Both ``Article.text`` and the standalone ``fulltext`` helper must
    reproduce the ``japanese2`` txt resource exactly.
    """
    article = Article(
        url='http://www.afpbb.com/articles/-/3178894',
        language='ja',
    )
    article.download(mock_resource_with('japanese_article2', 'html'))
    article.parse()

    expected = mock_resource_with('japanese2', 'txt')
    assert expected == article.text
    assert expected == fulltext(article.html, 'ja')
Exemple #4
0
def test_japanese_fulltext_extract():
    """Check Japanese ('ja') body extraction on the first sample article.

    Both ``Article.text`` and the standalone ``fulltext`` helper must
    reproduce the ``japanese`` txt resource exactly.
    """
    source_url = ('https://www.nikkei.com/article/'
                  'DGXMZO31897660Y8A610C1000000/?n_cid=DSTPCS001')
    article = Article(url=source_url, language='ja')
    article.download(mock_resource_with('japanese_article', 'html'))
    article.parse()

    expected = mock_resource_with('japanese', 'txt')
    assert expected == article.text
    assert expected == fulltext(article.html, 'ja')
Exemple #5
0
def test_chinese_fulltext_extract():
    """Check Chinese ('zh') body extraction against the stored reference text.

    Both ``Article.text`` and the standalone ``fulltext`` helper must
    reproduce the ``chinese`` txt resource exactly.
    """
    article = Article(
        url='http://news.sohu.com/20050601/n225789219.shtml',
        language='zh',
    )
    article.download(mock_resource_with('chinese_article', 'html'))
    article.parse()

    expected = mock_resource_with('chinese', 'txt')
    assert expected == article.text
    assert expected == fulltext(article.html, 'zh')
Exemple #6
0
def test_spanish_fulltext_extract():
    """Check Spanish ('es') body extraction against the stored reference text.

    Both ``Article.text`` and the standalone ``fulltext`` helper must
    reproduce the ``spanish`` txt resource exactly.
    """
    source_url = (
        'http://ultimahora.es/mallorca/noticia/noticias/local/fiscal'
        'ia-anticorrupcion-estudia-recurre-imputacion-infanta.html'
    )
    article = Article(url=source_url, language='es')
    article.download(mock_resource_with('spanish_article', 'html'))
    article.parse()

    expected = mock_resource_with('spanish', 'txt')
    assert expected == article.text
    assert expected == fulltext(article.html, 'es')
Exemple #7
0
def test_arabic_fulltext_extract():
    """Check Arabic body extraction when no language is passed to Article.

    The article is built without a ``language`` argument; after parsing,
    ``meta_lang`` must be ``'ar'`` and both ``Article.text`` and
    ``fulltext(..., 'ar')`` must reproduce the ``arabic`` txt resource.
    """
    source_url = (
        'http://arabic.cnn.com/2013/middle_east/8/3/syria.clashes/'
        'index.html'
    )
    article = Article(url=source_url)
    article.download(mock_resource_with('arabic_article', 'html'))
    article.parse()
    assert 'ar' == article.meta_lang

    expected = mock_resource_with('arabic', 'txt')
    assert expected == article.text
    assert expected == fulltext(article.html, 'ar')
Exemple #8
0
    def test_parse_html(self):
        """Parse the prepared article and verify its text and metadata.

        ``setup_stage('parse')`` brings ``self.article`` up to (but not
        including) the parse stage; this test then runs ``parse()`` and
        ``nlp()`` itself before checking the extracted fields.
        """
        self.setup_stage('parse')
        self.article.parse()
        self.article.nlp()

        # Do not truncate the diff if the long text comparison fails
        self.maxDiff = None
        expected_text = mock_resource_with('cnn', 'txt')
        self.assertEqual(expected_text.strip(), self.article.text)
        self.assertEqual(expected_text, fulltext(self.article.html))

        # NOTE: top_img extraction requires an internet connection
        # unlike the rest of this test file
        self.assertEqual(
            'http://i2.cdn.turner.com/cnn/dam/assets/131129200805-'
            '01-weather-1128-story-top.jpg',
            self.article.top_img)

        self.assertCountEqual(
            ['Chien-Ming Wang', 'Dana A. Ford', 'James S.A. Corey',
             'Tom Watkins'],
            self.article.authors)
        self.assertEqual(
            'After storm, forecasters see smooth sailing for Thanksgiving',
            self.article.title)
        self.assertEqual(46, len(self.article.imgs))
        self.assertEqual('en', self.article.meta_lang)
        self.assertEqual('CNN', self.article.meta_site_name)
        self.assertEqual('2013-11-27', str(self.article.publish_date))
Exemple #9
0
 def test_download_html(self):
     """Feed the mocked CNN HTML to ``download`` and verify the result.

     Checks that the ``DOWNLOADED`` marker is recorded in the article's
     workflow, that no download exception message was set, and that the
     stored HTML has the expected length.
     """
     self.setup_stage('download')
     html = mock_resource_with('cnn_article', 'html')
     self.article.download(html)
     # assertIn / assertIsNone report the offending values on failure,
     # unlike assertTrue(x in y) and assertEqual(x, None).
     self.assertIn(DOWNLOADED, self.article.workflow)
     self.assertIsNone(self.article.download_exception_msg)
     self.assertEqual(75406, len(self.article.html))
Exemple #10
0
 def test_meta_refresh_no_url_redirect(self):
     """Parse the ``ap_meta_refresh`` page with meta-refresh following on.

     The article is created with an empty URL and
     ``follow_meta_refresh = True``; the parsed title must be the one
     expected for the supplied HTML.
     """
     cfg = Configuration()
     cfg.follow_meta_refresh = True
     page = Article('', config=cfg)
     page.download(input_html=mock_resource_with('ap_meta_refresh', 'html'))
     page.parse()
     self.assertEqual(page.title, 'News from The Associated Press')
Exemple #11
0
 def test_meta_refresh_redirect(self):
     """Parse the ``google_meta_refresh`` page with meta-refresh following on.

     After download and parse, the article title must be
     ``'Example Domain'``.
     """
     # TODO: We actually hit example.com in this unit test ... which is bad
     # Figure out how to mock an actual redirect
     cfg = Configuration()
     cfg.follow_meta_refresh = True
     page = Article('', config=cfg)
     page.download(
         input_html=mock_resource_with('google_meta_refresh', 'html'))
     page.parse()
     self.assertEqual(page.title, 'Example Domain')
Exemple #12
0
 def setup_stage(self, stage_name):
     """Run every stage that comes before ``stage_name`` on ``self.article``.

     Stages are ordered: initial, download, parse, meta, nlp. The action
     of the named stage itself is NOT run, only those preceding it.

     :param stage_name: one of the stage names listed above; any other
         value trips the assertion.
     """
     pipeline = OrderedDict()
     pipeline['initial'] = lambda: None
     pipeline['download'] = lambda: self.article.download(
         mock_resource_with('cnn_article', 'html'))
     pipeline['parse'] = lambda: self.article.parse()
     pipeline['meta'] = lambda: None  # Alias for nlp
     pipeline['nlp'] = lambda: self.article.nlp()

     assert stage_name in pipeline

     ordered_names = list(pipeline)
     # Slice stops just short of the requested stage
     for earlier in ordered_names[:ordered_names.index(stage_name)]:
         pipeline[earlier]()
Exemple #13
0
 def test_nlp_body(self):
     """Run ``nlp()`` on the prepared article and check keywords and summary.

     Keywords are compared without regard to order; the summary must equal
     the ``cnn_summary`` txt resource.
     """
     self.setup_stage('nlp')
     self.article.nlp()

     expected_keywords = [
         'storm', 'weather', 'new', 'york',
         'flight', 'balloons', 'roads', 'delays',
         'parade', 'people', 'winds', 'snow',
     ]
     self.assertCountEqual(expected_keywords, self.article.keywords)

     expected_summary = mock_resource_with('cnn_summary', 'txt')
     self.assertEqual(expected_summary, self.article.summary)