コード例 #1
0
ファイル: lavenir.py プロジェクト: sevas/csxj-crawler
    def process_single(source):
        """Run article extraction on `source` and print a short report.

        When extraction yields an article, its title and URL are printed,
        followed by its links (via `print_taggedURLs`) and a separator line.
        Otherwise a single notice is printed.
        """
        article, html_content = extract_article_data(source)

        # Nothing to report when the extractor did not produce an article.
        if not article:
            print('page was not recognized as an article')
            return

        print(article.title)
        print(article.url)
        print_taggedURLs(article.links, 70)
        print("°" * 80)

        # Disabled debugging aid: re-enable to generate a unit test from this
        # page (this is the only consumer of `html_content`).
        # from helpers.unittest_generator import generate_unittest
        # import os
        # generate_unittest("links_new_thinglink", "lavenir", dict(urls=article.links), html_content, source.name, os.path.join(os.path.dirname(__file__), "../../tests/datasources/test_data/lavenir"), True)
コード例 #2
0
ファイル: sudpresse.py プロジェクト: sevas/csxj-crawler
def test_sample_data():
    """Extract each hard-coded sample page and print its tagged links.

    Every path below is opened in text mode, handed to
    `extract_article_data`, and the `links` of the resulting article are
    passed to `print_taggedURLs`. The paths are absolute and
    machine-specific, so this only runs where those files exist.
    """
    sample_paths = [
        "/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudpresse/2012-01-08/17.06.41/raw_data/0.html",
        "/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudpresse/2012-01-09/14.05.14/raw_data/3.html",
        "/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudpresse/2012-01-09/16.05.13/raw_data/3.html",
        "/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudpresse/2012-01-09/11.05.13/raw_data/0.html",
        "/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudpresse/2012-01-09/15.05.14/raw_data/3.html",
        "/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudpresse/2011-12-15/17.05.26/raw_data/1.html",
        "/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudpresse/2011-12-15/22.05.29/raw_data/3.html",
        "/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudpresse/2011-12-15/20.05.26/raw_data/9.html",
        "/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudpresse/2011-12-15/19.05.26/raw_data/9.html",
        "/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudpresse/2011-12-15/23.05.27/raw_data/4.html",
        "/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudpresse/2011-12-15/21.05.27/raw_data/0.html",
        "/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudpresse/2011-12-02/19.05.32/raw_data/5.html",
        "/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudpresse/2011-12-02/15.05.28/raw_data/8.html",
        "/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudpresse/2011-12-16/12.05.33/raw_data/10.html",
        "/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudpresse/2011-12-16/13.05.42/raw_data/0.html",
        "/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudpresse/2011-12-16/01.05.31/raw_data/1.html",
        "/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudpresse/2011-12-16/16.05.10/raw_data/16.html",
        "/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudpresse/2011-12-21/13.05.07/raw_data/8.html",
        "/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudpresse/2011-12-21/08.05.06/raw_data/5.html",
        "/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudpresse/2011-12-21/07.05.07/raw_data/17.html",
        "/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudpresse/2011-12-21/15.05.06/raw_data/11.html",
        "/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudpresse/2011-12-23/09.05.09/raw_data/3.html",
        "/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudpresse/2011-12-22/08.05.06/raw_data/5.html",
        "/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudpresse/2011-12-22/07.05.06/raw_data/7.html",
        "/Volumes/Curst/csxj/json_db_wtf_sudpresse/sudpresse/2011-12-22/07.05.06/raw_data/7.html",
        "/Volumes/Curst/csxj/json_db_wtf_sudpresse/sudpresse/2011-12-21/07.05.07/raw_data/17.html",
        "/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudpresse/2011-12-22/12.05.07/raw_data/1.html",
        "/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudpresse/2011-12-22/16.05.13/raw_data/15.html",
        "/Volumes/Curst/csxj/tartiflette/json_db_0_5/sudpresse/2011-12-22/10.05.14/raw_data/1.html",
    ]

    for sample_path in sample_paths:
        with open(sample_path, 'r') as html_file:
            # The extractor returns a pair; only the article part is used here.
            extracted = extract_article_data(html_file)
            article_data, _raw_html = extracted
            # Links are printed while the file is still open, as before.
            print_taggedURLs(article_data.links)