def process_single(source):
    """Extract an article from *source* and print a short summary of it.

    *source* is passed unchanged to ``extract_article_data``. If an article
    comes back, its title, its URL and its tagged links are printed,
    followed by a separator line. Otherwise a one-line notice is printed.
    """
    article, html_content = extract_article_data(source)

    # Guard clause: nothing to summarise when extraction found no article.
    if not article:
        print('page was not recognized as an article')
        return

    print(article.title)
    print(article.url)
    print_taggedURLs(article.links, 70)
    print("°" * 80)

    # Disabled developer aid, kept for reference: generates a unittest from
    # the extracted links and the raw HTML (note that it reads `source.name`).
    #
    # from helpers.unittest_generator import generate_unittest
    # import os
    # generate_unittest("links_new_thinglink", "lavenir", dict(urls=article.links), html_content, source.name, os.path.join(os.path.dirname(__file__), "../../tests/datasources/test_data/lavenir"), True)
def test_sample_data():
    """Run the extractor over a fixed set of saved pages and print their links.

    Each path below is opened and handed to ``extract_article_data``; the
    tagged links of the resulting article are printed with
    ``print_taggedURLs``.

    NOTE(review): the paths are absolute and point at a locally mounted
    volume, so this only runs on a machine where that data is available.

    Raises:
        AssertionError: if the extractor returns ``None`` for a page, with
            the offending path in the message.
    """
    fpaths = [
        "/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudpresse/2012-01-08/17.06.41/raw_data/0.html",
        "/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudpresse/2012-01-09/14.05.14/raw_data/3.html",
        "/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudpresse/2012-01-09/16.05.13/raw_data/3.html",
        "/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudpresse/2012-01-09/11.05.13/raw_data/0.html",
        "/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudpresse/2012-01-09/15.05.14/raw_data/3.html",
        "/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudpresse/2011-12-15/17.05.26/raw_data/1.html",
        "/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudpresse/2011-12-15/22.05.29/raw_data/3.html",
        "/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudpresse/2011-12-15/20.05.26/raw_data/9.html",
        "/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudpresse/2011-12-15/19.05.26/raw_data/9.html",
        "/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudpresse/2011-12-15/23.05.27/raw_data/4.html",
        "/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudpresse/2011-12-15/21.05.27/raw_data/0.html",
        "/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudpresse/2011-12-02/19.05.32/raw_data/5.html",
        "/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudpresse/2011-12-02/15.05.28/raw_data/8.html",
        "/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudpresse/2011-12-16/12.05.33/raw_data/10.html",
        "/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudpresse/2011-12-16/13.05.42/raw_data/0.html",
        "/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudpresse/2011-12-16/01.05.31/raw_data/1.html",
        "/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudpresse/2011-12-16/16.05.10/raw_data/16.html",
        "/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudpresse/2011-12-21/13.05.07/raw_data/8.html",
        "/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudpresse/2011-12-21/08.05.06/raw_data/5.html",
        "/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudpresse/2011-12-21/07.05.07/raw_data/17.html",
        "/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudpresse/2011-12-21/15.05.06/raw_data/11.html",
        "/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudpresse/2011-12-23/09.05.09/raw_data/3.html",
        "/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudpresse/2011-12-22/08.05.06/raw_data/5.html",
        "/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudpresse/2011-12-22/07.05.06/raw_data/7.html",
        "/Volumes/Curst/csxj/json_db_wtf_sudpresse/sudpresse/2011-12-22/07.05.06/raw_data/7.html",
        "/Volumes/Curst/csxj/json_db_wtf_sudpresse/sudpresse/2011-12-21/07.05.07/raw_data/17.html",
        "/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudpresse/2011-12-22/12.05.07/raw_data/1.html",
        "/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudpresse/2011-12-22/16.05.13/raw_data/15.html",
        "/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudpresse/2011-12-22/10.05.14/raw_data/1.html",
    ]

    for fpath in fpaths:
        with open(fpath, 'r') as f:
            # The raw HTML half of the result is not needed here.
            article_data, _raw_html = extract_article_data(f)

            # Fail with the file name instead of an opaque AttributeError on
            # None when the extractor does not recognise the page.
            assert article_data is not None, (
                "page was not recognized as an article: %s" % fpath
            )

            print_taggedURLs(article_data.links)