Esempio n. 1
0
def parse_page(page, url, dump=False):
    """
    Parse the page using all parsing methods available.

    The Twitter fields are parsed first, then the article content; both results are merged into the same
    report, in that order. When either parser reports errors, they are logged at debug level and, if
    requested, the page content is dumped to disk.

    :param str page: the content of the fatality page
    :param str url: detail page URL
    :param bool dump: when true and parsing errors occurred, write the page to a file in
        `constant.DUMP_DIR`, named after the last segment of the URL
    :return: the report updated with the parsed fields.
    :rtype: model.Report
    """
    # NOTE(review): the hard-coded case number looks like a placeholder that the parsed fields are
    # expected to overwrite -- confirm against `model.Report`.
    report = model.Report(case='19-123456')

    # Parse the twitter fields.
    twitter_report, twitter_err = twitter.parse(page)
    report.update(twitter_report)

    # Parse the page.
    article_report, article_err = article.parse_content(page)
    report.update(article_report)

    if twitter_err or article_err:  # pragma: no cover
        # Each section is rendered as a bullet list, and only when it has errors.
        twitter_err_str = '\nTwitter fields:\n\t * ' + '\n\t * '.join(twitter_err) if twitter_err else ''
        article_err_str = '\nArticle fields:\n\t * ' + '\n\t * '.join(article_err) if article_err else ''
        logger.debug(f'Errors while parsing {url}:{twitter_err_str}{article_err_str}')

        # Dump the file.
        if dump:
            dump_dir = Path(constant.DUMP_DIR)
            dump_dir.mkdir(parents=True, exist_ok=True)
            # Ignore trailing slashes so the last URL segment is not empty: an empty name would make the
            # target the dump directory itself and the write would fail.
            dump_file_name = url.rstrip('/').split('/')[-1]
            (dump_dir / dump_file_name).write_text(page)

    return report
Esempio n. 2
0
 def test_parse_twitter_00(self, page, expected, errors):
     """
     Check that the Twitter fields of a detail page are extracted correctly.

     When errors are expected or reported, only the number of reported errors is compared to `errors`;
     otherwise the parsed fields must equal `expected`.
     """
     parsed, parse_errors = twitter.parse(load_test_page(page))
     if not (errors or parse_errors):
         assert parsed == expected
         return
     assert errors == len(parse_errors)
Esempio n. 3
0
def test_dumped_page(page_dump):
    """
    Helper test to allow debugging offline.

    Load a previously dumped page and check that neither the Twitter parser nor the article parser
    reports an error for it.

    Run the following command: `pytest -s -n0 -x -vvv -m dump`

    :param page_dump: identifier of the dumped page, passed to `load_dumped_page`
    :raises FileNotFoundError: if the dump file cannot be found
    """
    try:
        page = load_dumped_page(page_dump)
    except FileNotFoundError as err:
        # Chain explicitly so the traceback shows the original error as the direct cause.
        raise FileNotFoundError(
            f'Dump file "{page_dump}" not found: run "scrapd --dump" first.') from err

    # Only the reported errors matter here; the parsed reports themselves are discarded.
    _, twitter_err = twitter.parse(page)
    assert not twitter_err
    _, article_err = article.parse_content(page)
    assert not article_err