Beispiel #1
0
def parse(url, language):
    """Build an Article for *url* and print its keywords, summary and text.

    The keywords (as JSON) and the summary are printed only when they are
    truthy; the article text is always printed.
    """
    doc = Article(url, language=language)
    doc.build()

    if doc.keywords:
        keywords_json = json.dumps(doc.keywords)
        print('Article Keywords: ' + keywords_json + '\n')

    if doc.summary:
        summary_line = 'Article Summary: ' + doc.summary + '\n'
        print(summary_line)

    text_line = 'Article Text: ' + doc.text
    print(text_line)
def test_article_pdf_fetching():
    """Build an Article from a PDF URL and check that its fields are populated."""
    pdf_url = 'https://www.adobe.com/pdf/pdfs/ISO32000-1PublicPatentLicense.pdf'
    doc = Article(url=pdf_url)
    doc.build()

    # The html attribute must not begin with the '%PDF-' marker.
    assert not doc.html.startswith('%PDF-')

    # Metadata fields are expected to be non-empty after build().
    assert len(doc.keywords)
    assert len(doc.authors)
    assert doc.publish_date
    assert doc.summary

    # The full text must be longer than its summary, and both text and
    # url must be set.
    assert len(doc.text) > len(doc.summary)
    assert doc.text
    assert doc.url
Beispiel #3
0
def test_thai_pdf_extract():
    """Build an Article from a PDF URL with language 'th' and check its fields."""
    pdf_url = "http://tpch-th.listedcompany.com/misc/ShareholderMTG/egm201701/20170914-tpch-egm201701-enc02-th.pdf"
    doc = Article(url=pdf_url, language='th')
    doc.build()

    # The html attribute must not begin with the '%PDF-' marker.
    assert not doc.html.startswith('%PDF-')

    # Metadata fields are expected to be non-empty after build().
    assert len(doc.keywords)
    assert len(doc.authors)
    assert doc.publish_date
    assert doc.summary

    # The full text must be longer than its summary, and both text and
    # url must be set.
    assert len(doc.text) > len(doc.summary)
    assert doc.text
    assert doc.url
Beispiel #4
0
 def test_wikipedia_tables(self):
     """Build a Wikipedia article and dump each extracted table to a TSV file.

     Checks that DOWNLOADED is recorded in the article workflow and that no
     download exception message was set, then writes one
     ``../<page>_t<name>.tsv`` file per entry in ``article.tables``.
     """
     url = "https://en.wikipedia.org/wiki/International_Phonetic_Alphabet_chart_for_English_dialects"
     article = Article(url=url)
     article.build()
     self.assertIn(DOWNLOADED, article.workflow)
     self.assertIsNone(article.download_exception_msg)
     # Write the data out in tab-separated format, one file per table.
     page = os.path.split(url)[1]
     for table in article.tables:
         fname = '../{}_t{}.tsv'.format(page, table['name'])
         # Explicit UTF-8: the page is an IPA chart, so cells presumably hold
         # non-ASCII characters that a platform default encoding may not be
         # able to represent. The with-block closes the file on exit.
         with codecs.open(fname, 'w', encoding='utf-8') as f:
             for row in table['rows']:
                 # Strip embedded newlines so each table row stays on one line.
                 f.write('\t'.join(row).replace('\n', '') + '\n')