# Example 1
def test_get_class_urls_fail(cazy_home_url, null_logger, monkeypatch):
    """Test get_cazy_class_urls home_page not returned"""
    def fake_get_page(*args, **kwargs):
        # Simulate a failed connection: no page, an error message
        return [None, "error"]

    monkeypatch.setattr(crawler, "get_page", fake_get_page)

    with pytest.raises(SystemExit) as wrapped_exit:
        crawler.get_cazy_class_urls(cazy_home_url, None, 1, null_logger)
    assert wrapped_exit.type == SystemExit
def get_class_urls(cazy_home, excluded_classes, max_tries, logger):
    """Retrieve CAZy class URLs and pair each with a scrape-attempt counter.

    :param cazy_home: str, URL to the CAZy home page.
    :param excluded_classes: list, list of CAZy classes not to be scraped
    :param max_tries: int, maximum number of times to try scrape if errors are encountered
    :param logger: logger object

    Return list of CAZy class URLs. Each item is a list of [URL, 0]
    - 0 is used to count number of attempted connections.
    """
    class_urls = crawler.get_cazy_class_urls(
        cazy_home, excluded_classes, max_tries, logger
    )

    # The crawler returns None on failure; `not class_urls` also covers an
    # empty list, so one check replaces the previous len()/TypeError pair.
    if not class_urls:
        logger.error(
            "Failed to retrieve URLs to CAZy class pages.\nTerminating program"
        )
        sys.exit(1)

    # List structure [class_url, number_of_tried_scrapes]: store how many
    # times an attempt to scrape each class page has been performed.
    return [[url, 0] for url in class_urls]
# Example 3
def test_get_class_urls_exclusions_given(
    cazy_home_url,
    cazy_home_page,
    null_logger,
    monkeypatch,
):
    """Test get_cazy_class_urls when excluded_classes is not None."""
    with open(cazy_home_page) as page_handle:
        home_soup = BeautifulSoup(page_handle, features="lxml")

    excluded = ["<strong>Glycoside Hydrolases (GHs)</strong>"]

    def fake_get_page(*args, **kwargs):
        # Return the parsed home page with no error
        return [home_soup, None]

    monkeypatch.setattr(crawler, "get_page", fake_get_page)

    retrieved_urls = crawler.get_cazy_class_urls(
        cazy_home_url,
        excluded,
        1,
        null_logger,
    )
    # The GH class is excluded, so every other class URL is expected
    assert retrieved_urls == [
        'http://www.cazy.org/GlycosylTransferases.html',
        'http://www.cazy.org/Polysaccharide-Lyases.html',
        'http://www.cazy.org/Carbohydrate-Esterases.html',
        'http://www.cazy.org/Auxiliary-Activities.html',
        'http://www.cazy.org/Carbohydrate-Binding-Modules.html',
    ]
# Example 4
def test_get_class_urls_attribute(
    cazy_home_url,
    cazy_home_no_spip,
    null_logger,
    monkeypatch,
):
    """Test get_cazy_class_urls when attribute error is raised."""
    with open(cazy_home_no_spip) as page_handle:
        raw_page = page_handle.read()

    excluded = ["<strong>Glycoside Hydrolases (GHs)</strong>"]

    def fake_get_page(*args, **kwargs):
        # Return a plain string instead of a soup object to trigger
        # the AttributeError path inside get_cazy_class_urls
        return [raw_page, None]

    monkeypatch.setattr(crawler, "get_page", fake_get_page)

    result = crawler.get_cazy_class_urls(
        cazy_home_url,
        excluded,
        1,
        null_logger,
    )
    assert result is None