def check_legalcode(args):
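    """Check the links found in each legalcode page.

    Each page is read from LICENSE_LOCAL_PATH when args.local is set,
    otherwise fetched from LICENSE_GITHUB_BASE, then parsed for anchor
    tags and every scrapable link is validated.

    Returns:
        tuple: (license_names, errors_total, exit_status)
    """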
    print("\n\nChecking LegalCode License...\n\n")
    license_names = get_legalcode(args)
    if args.log_level <= INFO:
        print("Number of files to be checked:", len(license_names))
    errors_total = 0
    exit_status = 0
    for license_name in license_names:
        caught_errors = 0
        context_printed = False
        filename = license_name[:-len(".html")]
        base_url = create_base_link(args, filename)
        context = f"\n\nChecking: legalcode\nURL: {base_url}"
        if args.local:
            source_html = request_local_text(LICENSE_LOCAL_PATH, license_name)
        else:
            page_url = f"{LICENSE_GITHUB_BASE}{license_name}"
            source_html = request_text(page_url)
        license_soup = BeautifulSoup(source_html, "lxml")
        links_found = license_soup.find_all("a")
        link_count = len(links_found)
        if args.log_level <= INFO:
            print(f"{context}\nNumber of links found: {link_count}")
            context_printed = True
        valid_anchors, valid_links, context_printed = get_scrapable_links(
            args, base_url, links_found, context, context_printed)
        if valid_links:
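            # Split the links into those with memoized results from earlier
            # checks and those that still need to be requested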
            memoized_results = get_memoized_result(valid_links, valid_anchors)
            stored_links = memoized_results[0]
            stored_anchors = memoized_results[1]
            stored_result = memoized_results[2]
            check_links = memoized_results[3]
            check_anchors = memoized_results[4]
            if check_links:
                rs = (
                    # Since we're only checking for validity, we can retrieve
                    # only the headers/metadata
                    grequests.head(link, timeout=REQUESTS_TIMEOUT)
                    for link in check_links)
                responses = list()
                # Explicitly close connections to free up file handles and
                # avoid Connection Errors per:
                # https://stackoverflow.com/a/22839550
                for response in grequests.map(
                        rs, exception_handler=exception_handler):
                    try:
                        responses.append(response.status_code)
                        response.close()
                    except AttributeError:
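                        # response lacks status_code; it is whatever the
                        # exception handler returned for the failed request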
                        responses.append(response)
                memoize_result(check_links, responses)
                stored_anchors += check_anchors
                stored_result += responses
            stored_links += check_links
            caught_errors = write_response(
                args,
                stored_links,
                stored_result,
                base_url,
                license_name,
                stored_anchors,
                context,
                context_printed,
            )

        if caught_errors:
            errors_total += caught_errors
            exit_status = 1

    return license_names, errors_total, exit_status


def check_rdfs(args, index=False):
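    """Check the links found in the license RDF files.

    When index is True, the objects/sections of index.rdf are checked
    instead of the individual license RDF files.

    Returns:
        tuple: (rdf_obj_list, errors_total, exit_status)
    """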
    if index:
        print("\n\nChecking index.rdf...\n\n")
        rdf_obj_list = get_index_rdf(args)
    else:
        print("\n\nChecking RDFs...\n\n")
        rdf_obj_list = get_rdf(args)
    if args.log_level <= INFO:
        if not index:
            print("Number of RDF files to be checked:", len(rdf_obj_list))
        else:
            print(
                "Number of RDF objects/sections to be checked in index.rdf:",
                len(rdf_obj_list),
            )
    errors_total = 0
    exit_status = 0
    for rdf_obj in rdf_obj_list:
        caught_errors = 0
        context_printed = False
        rdf_url = (rdf_obj["rdf:about"]
                   if index else f"{rdf_obj['rdf:about']}rdf")
        links_found = get_links_from_rdf(rdf_obj)
        checking = "URL" if not index else "RDF_ABOUT"
        context = f"\n\nChecking: \n{checking}: {rdf_url}"
        link_count = len(links_found)
        if args.log_level <= INFO:
            print(f"{context}\nNumber of links found: {link_count}")
            context_printed = True
        base_url = rdf_url
        valid_anchors, valid_links, context_printed = get_scrapable_links(
            args,
            base_url,
            links_found,
            context,
            context_printed,
            rdf=True,
        )
        if valid_links:
            memoized_results = get_memoized_result(valid_links, valid_anchors)
            stored_links = memoized_results[0]
            stored_anchors = memoized_results[1]
            stored_result = memoized_results[2]
            check_links = memoized_results[3]
            check_anchors = memoized_results[4]
            if check_links:
                rs = (
                    # Since we're only checking for validity, we can retrieve
                    # only the headers/metadata
                    grequests.head(link, timeout=REQUESTS_TIMEOUT)
                    for link in check_links)
                responses = list()
                # Explicitly close connections to free up file handles and
                # avoid Connection Errors per:
                # https://stackoverflow.com/a/22839550
                for response in grequests.map(
                        rs, exception_handler=exception_handler):
                    try:
                        responses.append(response.status_code)
                        response.close()
                    except AttributeError:
                        responses.append(response)
                memoize_result(check_links, responses)
                stored_anchors += check_anchors
                stored_result += responses
            stored_links += check_links
            caught_errors = write_response(
                args,
                stored_links,
                stored_result,
                rdf_url,
                rdf_obj,
                stored_anchors,
                context,
                context_printed,
            )

        if caught_errors:
            errors_total += caught_errors
            exit_status = 1

    return rdf_obj_list, errors_total, exit_status


def check_deeds(args):
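    """Check the links found in each license deed page.

    Deed URLs are built with create_base_link(..., for_deeds=True) and the
    live pages are scraped for anchor tags before their links are
    validated.

    Returns:
        tuple: (license_names, errors_total, exit_status)
    """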
    print("\n\nChecking Deeds...\n\n")
    license_names = get_legalcode(args)
    if args.log_level <= INFO:
        print("Number of files to be checked:", len(license_names))
    errors_total = 0
    exit_status = 0
    for license_name in license_names:
        caught_errors = 0
        context_printed = False
        filename = license_name[:-len(".html")]
        deed_base_url = create_base_link(args, filename, for_deeds=True)
        # Deeds template:
        # https://github.com/creativecommons/cc.engine/blob/master/cc/engine/templates/licenses/standard_deed.html

        # Scraping the HTML found on the active site
        if deed_base_url:
            context = f"\n\nChecking: deed\nURL: {deed_base_url}"
            page_url = deed_base_url
            source_html = request_text(page_url)
            license_soup = BeautifulSoup(source_html, "lxml")
            links_found = license_soup.find_all("a")
            link_count = len(links_found)
            if args.log_level <= INFO:
                print(f"{context}\nNumber of links found: {link_count}")
                context_printed = True
            base_url = deed_base_url
            valid_anchors, valid_links, context_printed = get_scrapable_links(
                args, base_url, links_found, context, context_printed)
            if valid_links:
                memoized_results = get_memoized_result(valid_links,
                                                       valid_anchors)
                stored_links = memoized_results[0]
                stored_anchors = memoized_results[1]
                stored_result = memoized_results[2]
                check_links = memoized_results[3]
                check_anchors = memoized_results[4]
                if check_links:
                    rs = (
                        # Since we're only checking for validity, we can
                        # retrieve only the headers/metadata
                        grequests.head(link, timeout=REQUESTS_TIMEOUT)
                        for link in check_links)
                    responses = list()
                    # Explicitly close connections to free up file handles and
                    # avoid Connection Errors per:
                    # https://stackoverflow.com/a/22839550
                    for response in grequests.map(
                            rs, exception_handler=exception_handler):
                        try:
                            responses.append(response.status_code)
                            response.close()
                        except AttributeError:
                            responses.append(response)
                    memoize_result(check_links, responses)
                    stored_anchors += check_anchors
                    stored_result += responses
                stored_links += check_links
                caught_errors = write_response(
                    args,
                    stored_links,
                    stored_result,
                    base_url,
                    license_name,
                    stored_anchors,
                    context,
                    context_printed,
                )

            if caught_errors:
                errors_total += caught_errors
                exit_status = 1

    return license_names, errors_total, exit_status