def parse(url, language):
    """Build the article at ``url`` and print its extracted content.

    The keywords (JSON-encoded) and the summary are printed only when they
    are truthy; the article text is always printed.

    Args:
        url: Address of the article, passed to ``Article`` positionally.
        language: Passed to ``Article`` as the ``language`` keyword.
    """
    article = Article(url, language=language)
    article.build()

    keywords = article.keywords
    if keywords:
        print('Article Keywords: ' + json.dumps(keywords) + '\n')

    summary = article.summary
    if summary:
        print('Article Summary: ' + summary + '\n')

    print('Article Text: ' + article.text)
def test_article_pdf_fetching():
    """Build an article from a PDF URL and check every extracted field."""
    pdf_url = 'https://www.adobe.com/pdf/pdfs/ISO32000-1PublicPatentLicense.pdf'
    article = Article(url=pdf_url)
    article.build()

    # The stored html must not begin with the '%PDF-' marker.
    assert not article.html.startswith('%PDF-')

    # Every extracted field is expected to be populated.
    assert len(article.keywords) > 0
    assert len(article.authors) > 0
    assert article.publish_date
    assert article.summary

    # The text must be longer than the summary.
    assert len(article.text) > len(article.summary)
    assert article.text
    assert article.url
def test_thai_pdf_extract():
    """Build an article from a PDF URL with ``language='th'`` and check its fields."""
    pdf_url = (
        "http://tpch-th.listedcompany.com/misc/ShareholderMTG/egm201701/"
        "20170914-tpch-egm201701-enc02-th.pdf"
    )
    article = Article(url=pdf_url, language='th')
    article.build()

    # The stored html must not begin with the '%PDF-' marker.
    assert not article.html.startswith('%PDF-')

    # Every extracted field is expected to be populated.
    assert len(article.keywords) > 0
    assert len(article.authors) > 0
    assert article.publish_date
    assert article.summary

    # The text must be longer than the summary.
    assert len(article.text) > len(article.summary)
    assert article.text
    assert article.url
def test_wikipedia_tables(self):
    """Build a Wikipedia article and dump each of its tables to a TSV file.

    Asserts that the build recorded ``DOWNLOADED`` in the article workflow
    and left no download exception message, then writes one file per entry
    of ``article.tables`` to ``../<page>_t<table name>.tsv``, one
    tab-joined row per line.
    """
    url = "https://en.wikipedia.org/wiki/International_Phonetic_Alphabet_chart_for_English_dialects"
    article = Article(url=url)
    article.build()

    self.assertIn(DOWNLOADED, article.workflow)
    self.assertIsNone(article.download_exception_msg)

    # Write the data out in tab-separated format, one file per table.
    page = os.path.split(url)[1]
    for table in article.tables:
        fname = '../{}_t{}.tsv'.format(page, table['name'])
        # An explicit UTF-8 encoding keeps non-ASCII cell text (this page
        # is an IPA chart) from depending on the platform default encoding.
        # The ``with`` block closes the file, so no explicit close() call.
        with codecs.open(fname, 'w', encoding='utf-8') as f:
            for row in table['rows']:
                # Strip embedded newlines so each row stays on one line.
                line = '\t'.join(row).replace('\n', '')
                f.write(line + '\n')