def test_create_base_link(
    filename, deed_result, legalcode_result, rdf_result
):
    # deeds
    args = link_checker.parse_arguments(["deeds"])
    baseURL = create_base_link(args, filename, for_deeds=True)
    assert baseURL == deed_result
    # legalcode
    args = link_checker.parse_arguments(["legalcode"])
    baseURL = create_base_link(args, filename)
    assert baseURL == legalcode_result
    # rdf
    args = link_checker.parse_arguments(["rdf"])
    baseURL = create_base_link(args, filename, for_rdfs=True)
    assert baseURL == rdf_result

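# The filename, deed_result, legalcode_result, and rdf_result arguments
# above are assumed to arrive via pytest parametrization defined elsewhere
# in the suite (e.g. in a conftest.py). A hypothetical sketch of what that
# parametrization could look like, with invented license values, purely for
# illustration:
#
# @pytest.mark.parametrize(
#     "filename,deed_result,legalcode_result,rdf_result",
#     [
#         (
#             "by-nc-nd_4.0",
#             "https://creativecommons.org/licenses/by-nc-nd/4.0/deed",
#             "https://creativecommons.org/licenses/by-nc-nd/4.0/legalcode",
#             "https://creativecommons.org/licenses/by-nc-nd/4.0/rdf",
#         ),
#     ],
# )
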
def test_parser_shared_rdf():
    subcmds = ["rdf", "index"]
    # Test defaults
    for subcmd in subcmds:
        args = link_checker.parse_arguments([subcmd])
        assert args.local_index is False
    # Test arguments
    for subcmd in subcmds:
        # Test --local-index
        args = link_checker.parse_arguments([subcmd, "--local-index"])
        assert args.local_index is True

def test_parser_shared_licenses():
    subcmds = ["deeds", "legalcode", "rdf", "combined", "canonical"]
    # Test defaults
    for subcmd in subcmds:
        args = link_checker.parse_arguments([subcmd])
        assert args.local is False
    # Test arguments
    for subcmd in subcmds:
        # Test --local
        args = link_checker.parse_arguments([subcmd, "--local"])
        assert args.local is True

def test_write_response(tmpdir):
    # Set config
    output_file = tmpdir.join("errorlog.txt")
    args = link_checker.parse_arguments(
        ["deeds", "--output-errors", output_file.strpath]
    )
    # Text to extract valid_anchors
    text = (
        "<a href='http://httpbin.org/status/200'>Response 200</a>,"
        " <a href='file://link3'>Invalid Scheme</a>,"
        " <a href='http://httpbin.org/status/400'>Response 400</a>"
    )
    soup = BeautifulSoup(text, "lxml")
    valid_anchors = soup.find_all("a")
    # Set up function params
    all_links = [
        "http://httpbin.org/status/200",
        "file://link3",
        "http://httpbin.org/status/400",
    ]
    rs = (grequests.get(link) for link in all_links)
    response = grequests.map(rs, exception_handler=exception_handler)
    base_url = "https://baseurl/goes/here"
    license_name = "by-cc-nd_2.0"
    # Set output to external file
    caught_errors = write_response(
        args,
        all_links,
        response,
        base_url,
        license_name,
        valid_anchors,
        license_name,
        False,
    )
    assert caught_errors == 2
    args.output_errors.flush()
    lines = output_file.readlines()
    i = 0
    assert lines[i] == "\n"
    i += 1
    assert lines[i] == "by-cc-nd_2.0\n"
    i += 1
    assert lines[i] == "URL: https://baseurl/goes/here\n"
    i += 1
    assert lines[i] == f'  {"Invalid Schema":<24}file://link3\n'
    i += 1
    assert lines[i] == f'{"":<26}<a href="file://link3">Invalid Scheme</a>\n'
    i += 1
    assert lines[i] == f'  {"400":<24}http://httpbin.org/status/400\n'
    i += 1
    assert lines[i] == (
        f'{"":<26}<a href="http://httpbin.org/status/400">Response 400</a>\n'
    )

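# grequests.map() invokes its exception_handler callback as
# handler(request, exception) for each request that raises instead of
# returning a response (here, the unsupported file:// link), and whatever
# the handler returns fills that request's slot in the result list. The
# exception_handler used above is imported from the module under test; the
# hypothetical handler below only illustrates the expected callback shape.
def _example_exception_handler(request, exception):
    # Returning None keeps the responses list index-aligned with all_links.
    return None
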
def test_output_write(tmpdir):
    # output_errors is set and written to
    output_file = tmpdir.join("errorlog.txt")
    args = link_checker.parse_arguments(
        ["deeds", "--output-errors", output_file.strpath]
    )
    output_write(args, "Output enabled")
    args.output_errors.flush()
    assert output_file.read() == "Output enabled\n"

def test_parser_shared_reporting(tmpdir):
    subcmds = ["deeds", "legalcode", "rdf", "index", "combined"]
    # Test defaults
    for subcmd in subcmds:
        args = link_checker.parse_arguments([subcmd])
        assert bool(args.output_errors) is False
    # Test arguments
    for subcmd in subcmds:
        # Test --output-errors with default value
        args = link_checker.parse_arguments([subcmd, "--output-errors"])
        assert bool(args.output_errors) is True
        assert args.output_errors.name == "errorlog.txt"
        # Test --output-errors with custom value
        output_file = tmpdir.join("errorlog.txt")
        args = link_checker.parse_arguments(
            [subcmd, "--output-errors", output_file.strpath]
        )
        assert bool(args.output_errors) is True
        assert args.output_errors.name == output_file.strpath

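def test_output_errors_argparse_pattern_sketch(tmpdir):
    # Hypothetical illustration, not part of the original suite: the
    # behavior asserted above matches argparse's optional-value pattern,
    # nargs="?" with a const fallback and type=argparse.FileType("w").
    # This sketch only demonstrates that pattern with a throwaway parser;
    # it does not claim link_checker is implemented this exact way.
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--output-errors",
        nargs="?",
        const="errorlog.txt",  # bare flag would open ./errorlog.txt
        default=None,  # flag absent: falsy, no file opened
        type=argparse.FileType("w"),
    )
    args = parser.parse_args([])
    assert args.output_errors is None
    output_file = tmpdir.join("custom-errorlog.txt")
    args = parser.parse_args(["--output-errors", output_file.strpath])
    assert args.output_errors.name == output_file.strpath
    args.output_errors.close()
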
def test_output_issues_summary(reset_global, tmpdir):
    # output_errors is set and written to
    output_file = tmpdir.join("errorlog.txt")
    args = link_checker.parse_arguments(
        ["deeds", "--output-errors", output_file.strpath]
    )
    utils.MAP_BROKEN_LINKS = {
        "https://link1.demo": [
            "https://file1.url/here",
            "https://file2.url/goes/here",
        ],
        "https://link2.demo": ["https://file4.url/here"],
    }
    all_links = ["some link"] * 5
    output_issues_summary(args, all_links, 3)
    args.output_errors.flush()
    lines = output_file.readlines()
    i = 0
    assert lines[i] == "\n"
    i += 1
    assert lines[i] == "\n"
    i += 1
    assert lines[i] == "***************************************\n"
    i += 1
    assert lines[i] == " SUMMARY\n"
    i += 1
    assert lines[i] == "***************************************\n"
    i += 1
    assert lines[i] == "\n"
    i += 1
    assert str(lines[i]).startswith("Timestamp:")
    i += 1
    assert lines[i] == "Total files checked: 5\n"
    i += 1
    assert lines[i] == "Number of error links: 3\n"
    i += 1
    assert lines[i] == "Number of unique broken links: 2\n"
    i += 1
    assert lines[i] == "\n"
    i += 1
    assert lines[i] == "\n"
    i += 1
    assert lines[i] == "Broken link - https://link1.demo found in:\n"
    i += 1
    assert lines[i] == "https://file1.url/here\n"
    i += 1
    assert lines[i] == "https://file2.url/goes/here\n"
    i += 1
    assert lines[i] == "\n"
    i += 1
    assert lines[i] == "Broken link - https://link2.demo found in:\n"
    i += 1
    assert lines[i] == "https://file4.url/here\n"

def test_get_scrapable_links():
    args = link_checker.parse_arguments(["deeds"])
    test_file = (
        "<a name='hello'>without href</a>,"
        " <a href='#hello'>internal link</a>,"
        " <a href='mailto:[email protected]'>mailto protocol</a>,"
        " <a href='https://creativecommons.ca'>Absolute link</a>,"
        " <a href='/index'>Relative Link</a>"
    )
    soup = BeautifulSoup(test_file, "lxml")
    test_case = soup.find_all("a")
    base_url = "https://www.demourl.com/dir1/dir2"
    valid_anchors, valid_links, _ = get_scrapable_links(
        args, base_url, test_case, None, False
    )
    assert str(valid_anchors) == (
        '[<a href="https://creativecommons.ca">Absolute link</a>,'
        ' <a href="/index">Relative Link</a>]'
    )
    assert (
        str(valid_links)
        == "['https://creativecommons.ca', 'https://www.demourl.com/index']"
    )

    # Testing RDF
    args = link_checker.parse_arguments(["index", "--local-index"])
    rdf_obj_list = get_index_rdf(
        args, local_path=constants.TEST_RDF_LOCAL_PATH
    )
    rdf_obj = rdf_obj_list[0]
    base_url = rdf_obj["rdf:about"]
    links_found = get_links_from_rdf(rdf_obj)
    valid_anchors, valid_links, _ = get_scrapable_links(
        args,
        base_url,
        links_found,
        None,
        False,
        rdf=True,
    )
    expected_anchors = (
        "[<cc:permits "
        'rdf:resource="http://creativecommons.org/ns#DerivativeWorks"/>, '
        "<cc:permits "
        'rdf:resource="http://creativecommons.org/ns#Reproduction"/>, '
        "<cc:permits "
        'rdf:resource="http://creativecommons.org/ns#Distribution"/>, '
        "<cc:jurisdiction "
        'rdf:resource="http://creativecommons.org/international/ch/"/>, '
        "<foaf:logo "
        'rdf:resource="https://i.creativecommons.org/'
        'l/by-nc-sa/2.5/ch/88x31.png"/>, '
        "<foaf:logo "
        'rdf:resource="https://i.creativecommons.org/'
        'l/by-nc-sa/2.5/ch/80x15.png"/>, '
        "<cc:legalcode "
        'rdf:resource="http://creativecommons.org/'
        'licenses/by-nc-sa/2.5/ch/legalcode.de"/>, '
        "<dc:source "
        'rdf:resource="http://creativecommons.org/licenses/by-nc-sa/2.5/"/>, '
        "<dc:creator "
        'rdf:resource="http://creativecommons.org"/>, '
        "<cc:prohibits "
        'rdf:resource="http://creativecommons.org/ns#CommercialUse"/>, '
        "<cc:licenseClass "
        'rdf:resource="http://creativecommons.org/license/"/>, '
        "<cc:requires "
        'rdf:resource="http://creativecommons.org/ns#ShareAlike"/>, '
        "<cc:requires "
        'rdf:resource="http://creativecommons.org/ns#Attribution"/>, '
        "<cc:requires "
        'rdf:resource="http://creativecommons.org/ns#Notice"/>]'
    )
    assert str(valid_anchors) == expected_anchors
    valid_links.sort()
    expected_links = [
        "http://creativecommons.org",
        "http://creativecommons.org/international/ch/",
        "http://creativecommons.org/license/",
        "http://creativecommons.org/licenses/by-nc-sa/2.5/",
        "http://creativecommons.org/licenses/by-nc-sa/2.5/ch/legalcode.de",
        "http://creativecommons.org/ns#Attribution",
        "http://creativecommons.org/ns#CommercialUse",
        "http://creativecommons.org/ns#DerivativeWorks",
        "http://creativecommons.org/ns#Distribution",
        "http://creativecommons.org/ns#Notice",
        "http://creativecommons.org/ns#Reproduction",
        "http://creativecommons.org/ns#ShareAlike",
        "https://i.creativecommons.org/l/by-nc-sa/2.5/ch/80x15.png",
        "https://i.creativecommons.org/l/by-nc-sa/2.5/ch/88x31.png",
    ]
    expected_links.sort()
    assert valid_links == expected_links

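def test_relative_link_resolution_sketch():
    # Companion check, not part of the original suite: the "Relative Link"
    # expectation above follows standard RFC 3986 reference resolution,
    # which urllib.parse.urljoin implements. A root-relative href such as
    # "/index" replaces the entire path of the base URL.
    from urllib.parse import urljoin

    base_url = "https://www.demourl.com/dir1/dir2"
    assert urljoin(base_url, "/index") == "https://www.demourl.com/index"
    # A path-relative href resolves against the base URL's directory instead.
    assert urljoin(base_url, "index") == "https://www.demourl.com/dir1/index"
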
def test_parser_shared():
    subcmds = ["deeds", "legalcode", "rdf", "index", "combined", "canonical"]
    # Test defaults
    for subcmd in subcmds:
        args = link_checker.parse_arguments([subcmd])
        assert args.limit == 0
        assert args.log_level == 30
        assert args.root_url == "https://creativecommons.org"
    # Test arguments
    for subcmd in subcmds:
        # Test --limit
        args = link_checker.parse_arguments([subcmd, "--limit", "10"])
        assert args.limit == 10
        args = link_checker.parse_arguments([subcmd, "--limit=100"])
        assert args.limit == 100
        # Test Logging Levels -q/--quiet
        args = link_checker.parse_arguments([subcmd, "-q"])
        assert args.log_level == 40
        args = link_checker.parse_arguments([subcmd, "-qq"])
        assert args.log_level == 50
        args = link_checker.parse_arguments([subcmd, "-qqq"])
        assert args.log_level == 50
        args = link_checker.parse_arguments([subcmd, "-q", "--quiet"])
        assert args.log_level == 50
        # Test Logging Levels -v/--verbose
        args = link_checker.parse_arguments([subcmd, "-v"])
        assert args.log_level == 20
        args = link_checker.parse_arguments([subcmd, "-vv"])
        assert args.log_level == 10
        args = link_checker.parse_arguments([subcmd, "-vvv"])
        assert args.log_level == 10
        args = link_checker.parse_arguments([subcmd, "-v", "--verbose"])
        assert args.log_level == 10
        # Test Logging Levels with both -v and -q
        args = link_checker.parse_arguments([subcmd, "-vq"])
        assert args.log_level == 30
        args = link_checker.parse_arguments([subcmd, "-vvq"])
        assert args.log_level == 20
        args = link_checker.parse_arguments([subcmd, "-vqq"])
        assert args.log_level == 40
        # Test --root-url
        args = link_checker.parse_arguments(
            [subcmd, "--root-url", "https://pytest.creativecommons.org"]
        )
        assert args.root_url == "https://pytest.creativecommons.org"

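def test_log_level_formula_sketch():
    # Cross-check, not part of the original suite: every log_level value
    # asserted in test_parser_shared is consistent with
    # clamp(logging.WARNING + 10 * (#quiet - #verbose), DEBUG, CRITICAL),
    # assuming parse_arguments counts repeated -q/-v flags. This sketch
    # verifies that formula against the same flag combinations.
    def expected_log_level(verbose, quiet):
        return min(max(30 + 10 * (quiet - verbose), 10), 50)

    cases = {
        (0, 0): 30,  # default: WARNING
        (0, 1): 40,  # -q: ERROR
        (0, 2): 50,  # -qq: CRITICAL
        (0, 3): 50,  # -qqq: clamped at CRITICAL
        (1, 0): 20,  # -v: INFO
        (2, 0): 10,  # -vv: DEBUG
        (3, 0): 10,  # -vvv: clamped at DEBUG
        (1, 1): 30,  # -vq: back to WARNING
        (2, 1): 20,  # -vvq: INFO
        (1, 2): 40,  # -vqq: ERROR
    }
    for (verbose, quiet), level in cases.items():
        assert expected_log_level(verbose, quiet) == level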