Esempio n. 1
0
def test_extract_article_custom_publication_datetime_selector():
    """Check ``publication_datetime`` when the config supplies custom selectors.

    The stored ``addictinginfo.com-1_article.html`` page is run through
    ``extract_article`` with a config that sets explicit XPath selectors for
    both ``publication_datetime`` and ``content``. Only the
    ``publication_datetime`` field is compared against the expected JSON.
    """
    # Input page, loaded through the shared test helper
    page_path = os.path.join(UNIT_TEST_DATA_DIR,
                             "addictinginfo.com-1_article.html")
    response = response_from_html_file(page_path)

    # Expected extraction result for this configuration
    expected_path = os.path.join(
        UNIT_TEST_DATA_DIR,
        "addictinginfo.com-1_extracted_data_default_custom_publication_datetime_selector.json"
    )
    expected = article_from_json_file(expected_path)

    # Site config under test (parsed from inline YAML)
    yaml_source = """
        site_name: 'example.com'
        start_url: 'http://addictinginfo.com/category/news/'
        article:
            publication_datetime:
                select_method: 'xpath'
                select_expression: '//time[contains(concat(" ", normalize-space(@class), " "), " entry-date ")]/@datetime'
                match_rule: 'single'
            content:
                select_method: 'xpath'
                select_expression: '//div[@class="entry entry-content"]'
                match_rule: 'single'
    """
    site_config = yaml.load(yaml_source, Loader=yaml.FullLoader)

    # Only the datetime field is under test here
    extracted = extract_article(response, site_config)
    field = "publication_datetime"
    assert extracted[field] == expected[field]
Esempio n. 2
0
def test_extract_article_custom_content_selector():
    """Check ``content`` when the config supplies a custom content selector.

    The stored ``addictinginfo.com-1_article.html`` page is run through
    ``extract_article`` with a config whose ``article.content`` section sets
    an explicit XPath selector. Only the ``content`` field is compared
    against the expected JSON.
    """
    # Input page, loaded through the shared test helper
    page_path = os.path.join(UNIT_TEST_DATA_DIR,
                             "addictinginfo.com-1_article.html")
    response = response_from_html_file(page_path)

    # Expected extraction result for this configuration
    expected_path = os.path.join(
        UNIT_TEST_DATA_DIR,
        "addictinginfo.com-1_extracted_data_default_custom_content_selector.json"
    )
    expected = article_from_json_file(expected_path)

    # Site config under test (parsed from inline YAML)
    yaml_source = """
        site_name: 'example.com'
        article:
            content:
                select_method: 'xpath'
                select_expression: '//div[@class="entry entry-content"]'
                match_rule: 'single'

    """
    site_config = yaml.load(yaml_source, Loader=yaml.FullLoader)

    # Only the content field is under test here
    extracted = extract_article(response, site_config)
    field = "content"
    assert extracted[field] == expected[field]
Esempio n. 3
0
def test_extract_article_default_with_crawl_info():
    """Check the full extracted article when crawl info is passed along.

    Uses a config with only ``site_name`` and ``start_url`` and passes a
    ``MockDBEntry`` carrying ``crawl_id`` and ``crawl_datetime`` as the third
    argument to ``extract_article``. The whole result is compared against the
    expected JSON.
    """
    # Input page, loaded through the shared test helper
    page_path = os.path.join(UNIT_TEST_DATA_DIR,
                             "addictinginfo.com-1_article.html")
    response = response_from_html_file(page_path)

    # Expected extraction result, including crawl information
    expected_path = os.path.join(
        UNIT_TEST_DATA_DIR,
        "addictinginfo.com-1_extracted_data_default_with_crawl_info.json")
    expected = article_from_json_file(expected_path)

    # Minimal site config (no custom selectors)
    yaml_source = """
        site_name: 'example.com'
        start_url: 'http://addictinginfo.com/category/news/'
    """
    site_config = yaml.load(yaml_source, Loader=yaml.FullLoader)

    # Stand-in for the crawl record.
    # NOTE(review): the crawl_id contains a comma ("e4,1f"), which is unusual
    # for a UUID-like value; presumably the JSON fixture matches it - confirm
    # before changing.
    crawl_record = MockDBEntry(
        crawl_id="bdbcf1cf-e4,1f-4c10-9958-4ab1b07e46ae",
        crawl_datetime="2018-10-17T20:25:34.234567+0000",
    )

    # The complete article must match, not just selected fields
    extracted = extract_article(response, site_config, crawl_record)
    assert extracted == expected
Esempio n. 4
0
def validate_extract_article(response, config, expected):
    """Extract an article and compare selected fields with ``expected``.

    Calls ``extract_article(response, config)`` and asserts, in order, that
    the title, byline, publication datetime, plain content and plain text
    of the result equal the same keys in ``expected``. Other keys are not
    checked.
    """
    extracted = extract_article(response, config)
    # Checked one at a time, in this order, so the first mismatch is reported
    checked_fields = (
        'title',
        'byline',
        'publication_datetime',
        'plain_content',
        'plain_text',
    )
    for field in checked_fields:
        assert extracted[field] == expected[field]
Esempio n. 5
0
def test_extract_article_default():
    """Check the full extracted article with a config lacking custom selectors.

    The config only sets ``site_name`` and ``start_url``. The whole result of
    ``extract_article`` is compared against the expected JSON.
    """
    # Input page, loaded through the shared test helper
    page_path = os.path.join(UNIT_TEST_DATA_DIR,
                             "addictinginfo.com-1_article.html")
    response = response_from_html_file(page_path)

    # Expected extraction result
    expected_path = os.path.join(
        UNIT_TEST_DATA_DIR, "addictinginfo.com-1_extracted_data_default.json")
    expected = article_from_json_file(expected_path)

    # Minimal site config (no custom selectors)
    yaml_source = """
        site_name: 'example.com'
        start_url: 'http://addictinginfo.com/category/news/'
    """
    site_config = yaml.load(yaml_source, Loader=yaml.FullLoader)

    # The complete article must match, not just selected fields
    extracted = extract_article(response, site_config)
    assert extracted == expected
Esempio n. 6
0
def test_extract_article_with_no_data_has_all_fields_present_but_null():
    """Check the result for a page that contains no article markup.

    The expected article still has every key: ``title``, ``byline``,
    ``publication_datetime`` and ``metadata`` are ``None``, while the content
    fields hold the page's single ``<div>``.
    """
    # Inline page with a single div and no article structure
    page_source = """<html>
    <head></head>
    <body>
        <div>
            No article here.
        </div>
    </body>
    </html>"""
    response = TextResponse(url="http://example.com",
                            body=page_source,
                            encoding="utf-8")

    # Config with only the site name set
    yaml_source = """
    site_name: 'example.com'
    """
    site_config = yaml.load(yaml_source, Loader=yaml.FullLoader)

    # 'content' and 'plain_content' are expected to hold the same markup
    expected_markup = "<div>No article here.</div>"
    expected = {
        'site_name': "example.com",
        'article_url': "http://example.com",
        'title': None,
        'byline': None,
        'publication_datetime': None,
        'content': expected_markup,
        'plain_content': expected_markup,
        'plain_text': [{'text': 'No article here.'}],
        'metadata': None,
    }

    extracted = extract_article(response, site_config)
    assert extracted == expected
Esempio n. 7
0
def test_extract_datetime_works_with_multiple_dates():
    """Check datetime extraction when the selector matches several dates.

    The inline page has two date paragraphs around a text paragraph. The
    config selects every paragraph's text with the ``comma_join`` match rule
    and a single ``'MMMM D YYYY'`` datetime format. The expected
    ``publication_datetime`` is ``2006-05-15T00:00:00``.
    """
    # Inline page: two date-like paragraphs surrounding ordinary text
    page_source = """<html>
    <head></head>
    <body>
        <div class="subarticle">
            <p>October 22, 2018</p>
            <p>Article text here.</p>
            <p>May 15, 2006</p>
        </div>
    </body>
    </html>"""
    response = TextResponse(url="http://example.com",
                            body=page_source,
                            encoding="utf-8")

    # Config with custom datetime and content selectors
    yaml_source = """
    site_name: 'example.com'
    article:
        publication_datetime:
            select_method: 'xpath'
            select_expression: '//div[@class="subarticle"]/p/text()'
            match_rule: 'comma_join'
            datetime_formats:
              - 'MMMM D YYYY'
        content:
            select_method: 'xpath'
            select_expression: '//div[@class="subarticle"]'
            match_rule: 'single'
    """
    site_config = yaml.load(yaml_source, Loader=yaml.FullLoader)

    # 'content' and 'plain_content' are expected to hold the same markup
    expected_markup = '<div><p>October 22, 2018</p><p>Article text here.</p><p>May 15, 2006</p></div>'
    expected = {
        'site_name': 'example.com',
        'article_url': 'http://example.com',
        'title': None,
        'byline': None,
        'publication_datetime': "2006-05-15T00:00:00",
        'content': expected_markup,
        'plain_content': expected_markup,
        'plain_text': [
            {"text": "October 22, 2018"},
            {"text": "Article text here."},
            {"text": "May 15, 2006"},
        ],
        'metadata': None,
    }

    extracted = extract_article(response, site_config)
    assert extracted == expected