def test_create_base_link(filename, deed_result, legalcode_result, rdf_result):
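    # filename, deed_result, legalcode_result, and rdf_result are assumed to
    # be supplied by a pytest fixture or parametrize decorator defined
    # elsewhere in this module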
    # deeds
    args = link_checker.parse_arguments(["deeds"])
    base_url = create_base_link(args, filename, for_deeds=True)
    assert base_url == deed_result
    # legalcode
    args = link_checker.parse_arguments(["legalcode"])
    base_url = create_base_link(args, filename)
    assert base_url == legalcode_result
    # rdf
    args = link_checker.parse_arguments(["rdf"])
    base_url = create_base_link(args, filename, for_rdfs=True)
    assert base_url == rdf_result


def test_parser_shared_rdf():
    subcmds = ["rdf", "index"]

    # Test defaults
    for subcmd in subcmds:
        args = link_checker.parse_arguments([subcmd])
        assert args.local_index is False

    # Test arguments
    for subcmd in subcmds:
        # Test --local-index
        args = link_checker.parse_arguments([subcmd, "--local-index"])
        assert args.local_index is True


def test_parser_shared_licenses():
    subcmds = ["deeds", "legalcode", "rdf", "combined", "canonical"]

    # Test defaults
    for subcmd in subcmds:
        args = link_checker.parse_arguments([subcmd])
        assert args.local is False

    # Test arguments
    for subcmd in subcmds:
        # Test --local
        args = link_checker.parse_arguments([subcmd, "--local"])
        assert args.local is True


def test_write_response(tmpdir):
    # Set config
    output_file = tmpdir.join("errorlog.txt")
    args = link_checker.parse_arguments(
        ["deeds", "--output-errors", output_file.strpath]
    )

    # HTML text from which to extract valid_anchors: one 200 link, one
    # invalid scheme link, and one 400 link
    text = (
        "<a href='http://httpbin.org/status/200'>Response 200</a>,"
        " <a href='file://link3'>Invalid Scheme</a>,"
        " <a href='http://httpbin.org/status/400'>Response 400</a>"
    )
    soup = BeautifulSoup(text, "lxml")
    valid_anchors = soup.find_all("a")

    # Setup function params
    all_links = [
        "http://httpbin.org/status/200",
        "file://link3",
        "http://httpbin.org/status/400",
    ]
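    # Issue the requests concurrently; grequests.map routes any request
    # exceptions to exception_handler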
    rs = (grequests.get(link) for link in all_links)
    responses = grequests.map(rs, exception_handler=exception_handler)
    base_url = "https://baseurl/goes/here"
    license_name = "by-cc-nd_2.0"

    # Set output to external file
    caught_errors = write_response(
        args,
        all_links,
        responses,
        base_url,
        license_name,
        valid_anchors,
        license_name,
        False,
    )
    assert caught_errors == 2
    args.output_errors.flush()
    lines = output_file.readlines()
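    # Check the error log contents line by line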
    i = 0
    assert lines[i] == "\n"
    i += 1
    assert lines[i] == "by-cc-nd_2.0\n"
    i += 1
    assert lines[i] == "URL: https://baseurl/goes/here\n"
    i += 1
    assert lines[i] == f'  {"Invalid Schema":<24}file://link3\n'
    i += 1
    assert lines[i] == f'{"":<26}<a href="file://link3">Invalid Scheme</a>\n'
    i += 1
    assert lines[i] == f'  {"400":<24}http://httpbin.org/status/400\n'
    i += 1
    assert lines[i] == (
        f'{"":<26}<a href="http://httpbin.org/status/400">Response 400</a>\n'
    )


def test_output_write(tmpdir):
    # output_errors is set and written to
    output_file = tmpdir.join("errorlog.txt")
    args = link_checker.parse_arguments(
        ["deeds", "--output-errors", output_file.strpath]
    )
    output_write(args, "Output enabled")
    args.output_errors.flush()
    assert output_file.read() == "Output enabled\n"


def test_parser_shared_reporting(tmpdir):
    subcmds = ["deeds", "legalcode", "rdf", "index", "combined"]

    # Test defaults
    for subcmd in subcmds:
        args = link_checker.parse_arguments([subcmd])
        assert bool(args.output_errors) is False

    # Test arguments
    for subcmd in subcmds:
        # Test --output-errors with default value
        args = link_checker.parse_arguments([subcmd, "--output-errors"])
        assert bool(args.output_errors) is True
        assert args.output_errors.name == "errorlog.txt"
        # Test --output-errors with custom value
        output_file = tmpdir.join("errorlog.txt")
        args = link_checker.parse_arguments(
            [subcmd, "--output-errors", output_file.strpath]
        )
        assert bool(args.output_errors) is True
        assert args.output_errors.name == output_file.strpath


def test_output_issues_summary(reset_global, tmpdir):
    # output_errors is set and written to
    output_file = tmpdir.join("errorlog.txt")
    args = link_checker.parse_arguments(
        ["deeds", "--output-errors", output_file.strpath]
    )
    utils.MAP_BROKEN_LINKS = {
        "https://link1.demo": [
            "https://file1.url/here",
            "https://file2.url/goes/here",
        ],
        "https://link2.demo": ["https://file4.url/here"],
    }
    all_links = ["some link"] * 5
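    # Summarize a run of five checked files with three error links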
    output_issues_summary(args, all_links, 3)
    args.output_errors.flush()
    lines = output_file.readlines()
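    # Verify the summary header, counts, and per-link details line by line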
    i = 0
    assert lines[i] == "\n"
    i += 1
    assert lines[i] == "\n"
    i += 1
    assert lines[i] == "***************************************\n"
    i += 1
    assert lines[i] == "                SUMMARY\n"
    i += 1
    assert lines[i] == "***************************************\n"
    i += 1
    assert lines[i] == "\n"
    i += 1
    assert lines[i].startswith("Timestamp:")
    i += 1
    assert lines[i] == "Total files checked: 5\n"
    i += 1
    assert lines[i] == "Number of error links: 3\n"
    i += 1
    assert lines[i] == "Number of unique broken links: 2\n"
    i += 1
    assert lines[i] == "\n"
    i += 1
    assert lines[i] == "\n"
    i += 1
    assert lines[i] == "Broken link - https://link1.demo found in:\n"
    i += 1
    assert lines[i] == "https://file1.url/here\n"
    i += 1
    assert lines[i] == "https://file2.url/goes/here\n"
    i += 1
    assert lines[i] == "\n"
    i += 1
    assert lines[i] == "Broken link - https://link2.demo found in:\n"
    i += 1
    assert lines[i] == "https://file4.url/here\n"


def test_get_scrapable_links():
    args = link_checker.parse_arguments(["deeds"])
    test_file = (
        "<a name='hello'>without href</a>,"
        " <a href='#hello'>internal link</a>,"
        " <a href='mailto:[email protected]'>mailto protocol</a>,"
        " <a href='https://creativecommons.ca'>Absolute link</a>,"
        " <a href='/index'>Relative Link</a>"
    )
    soup = BeautifulSoup(test_file, "lxml")
    test_case = soup.find_all("a")
    base_url = "https://www.demourl.com/dir1/dir2"
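    # Anchors without an href, fragment-only links, and mailto links should
    # be filtered out; relative links should be resolved against base_url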
    valid_anchors, valid_links, _ = get_scrapable_links(
        args, base_url, test_case, None, False
    )
    assert str(valid_anchors) == (
        '[<a href="https://creativecommons.ca">Absolute link</a>,'
        ' <a href="/index">Relative Link</a>]'
    )
    assert valid_links == [
        "https://creativecommons.ca",
        "https://www.demourl.com/index",
    ]
    # Test RDF link extraction
    args = link_checker.parse_arguments(["index", "--local-index"])
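    # Parse RDF objects from the local test fixture instead of the network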
    rdf_obj_list = get_index_rdf(
        args, local_path=constants.TEST_RDF_LOCAL_PATH
    )
    rdf_obj = rdf_obj_list[0]
    base_url = rdf_obj["rdf:about"]
    links_found = get_links_from_rdf(rdf_obj)
    valid_anchors, valid_links, _ = get_scrapable_links(
        args, base_url, links_found, None, False, rdf=True,
    )
    expected_anchors = (
        "[<cc:permits "
        'rdf:resource="http://creativecommons.org/ns#DerivativeWorks"/>, '
        "<cc:permits "
        'rdf:resource="http://creativecommons.org/ns#Reproduction"/>, '
        "<cc:permits "
        'rdf:resource="http://creativecommons.org/ns#Distribution"/>, '
        "<cc:jurisdiction "
        'rdf:resource="http://creativecommons.org/international/ch/"/>, '
        "<foaf:logo "
        'rdf:resource="https://i.creativecommons.org/'
        'l/by-nc-sa/2.5/ch/88x31.png"/>, '
        "<foaf:logo "
        'rdf:resource="https://i.creativecommons.org/'
        'l/by-nc-sa/2.5/ch/80x15.png"/>, '
        "<cc:legalcode "
        'rdf:resource="http://creativecommons.org/'
        'licenses/by-nc-sa/2.5/ch/legalcode.de"/>, '
        "<dc:source "
        'rdf:resource="http://creativecommons.org/licenses/by-nc-sa/2.5/"/>, '
        "<dc:creator "
        'rdf:resource="http://creativecommons.org"/>, '
        "<cc:prohibits "
        'rdf:resource="http://creativecommons.org/ns#CommercialUse"/>, '
        "<cc:licenseClass "
        'rdf:resource="http://creativecommons.org/license/"/>, '
        "<cc:requires "
        'rdf:resource="http://creativecommons.org/ns#ShareAlike"/>, '
        "<cc:requires "
        'rdf:resource="http://creativecommons.org/ns#Attribution"/>, '
        "<cc:requires "
        'rdf:resource="http://creativecommons.org/ns#Notice"/>]'
    )
    assert str(valid_anchors) == expected_anchors
    valid_links.sort()
    expected_links = [
        "http://creativecommons.org",
        "http://creativecommons.org/international/ch/",
        "http://creativecommons.org/license/",
        "http://creativecommons.org/licenses/by-nc-sa/2.5/",
        "http://creativecommons.org/licenses/by-nc-sa/2.5/ch/legalcode.de",
        "http://creativecommons.org/ns#Attribution",
        "http://creativecommons.org/ns#CommercialUse",
        "http://creativecommons.org/ns#DerivativeWorks",
        "http://creativecommons.org/ns#Distribution",
        "http://creativecommons.org/ns#Notice",
        "http://creativecommons.org/ns#Reproduction",
        "http://creativecommons.org/ns#ShareAlike",
        "https://i.creativecommons.org/l/by-nc-sa/2.5/ch/80x15.png",
        "https://i.creativecommons.org/l/by-nc-sa/2.5/ch/88x31.png",
    ]
    expected_links.sort()
    assert valid_links == expected_links


def test_parser_shared():
    subcmds = ["deeds", "legalcode", "rdf", "index", "combined", "canonical"]

    # Test defaults
    for subcmd in subcmds:
        args = link_checker.parse_arguments([subcmd])
        assert args.limit == 0
        assert args.log_level == 30  # logging.WARNING
        assert args.root_url == "https://creativecommons.org"

    # Test arguments
    for subcmd in subcmds:
        # Test --limit
        args = link_checker.parse_arguments([subcmd, "--limit", "10"])
        assert args.limit == 10
        args = link_checker.parse_arguments([subcmd, "--limit=100"])
        assert args.limit == 100
        # Test Logging Levels -q/--quiet
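        # Each -q raises the level by 10 from WARNING (30), clamped at
        # CRITICAL (50)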
        args = link_checker.parse_arguments([subcmd, "-q"])
        assert args.log_level == 40
        args = link_checker.parse_arguments([subcmd, "-qq"])
        assert args.log_level == 50
        args = link_checker.parse_arguments([subcmd, "-qqq"])
        assert args.log_level == 50
        args = link_checker.parse_arguments([subcmd, "-q", "--quiet"])
        assert args.log_level == 50
        # Test Logging Levels -v/--verbose
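        # Each -v lowers the level by 10 from WARNING (30), clamped at
        # DEBUG (10)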
        args = link_checker.parse_arguments([subcmd, "-v"])
        assert args.log_level == 20
        args = link_checker.parse_arguments([subcmd, "-vv"])
        assert args.log_level == 10
        args = link_checker.parse_arguments([subcmd, "-vvv"])
        assert args.log_level == 10
        args = link_checker.parse_arguments([subcmd, "-v", "--verbose"])
        assert args.log_level == 10
        # Test Logging Levels with both -v and -q
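        # -v and -q counts offset each other before the level is computed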
        args = link_checker.parse_arguments([subcmd, "-vq"])
        assert args.log_level == 30
        args = link_checker.parse_arguments([subcmd, "-vvq"])
        assert args.log_level == 20
        args = link_checker.parse_arguments([subcmd, "-vqq"])
        assert args.log_level == 40
        # Test --root-url
        args = link_checker.parse_arguments(
            [subcmd, "--root-url", "https://pytest.creativecommons.org"]
        )
        assert args.root_url == "https://pytest.creativecommons.org"