Beispiel #1
0
def apply_domain_limit(entries, domain_limit):
    """ Splits sanitised URLs into the ones to keep and the domains that
        have reached the limit.

        The domains returned by subdomains(entries) are tallied.  Every
        domain that occurs domain_limit times or more is treated as
        exceeded, and any entry whose subdomain or root domain is an
        exceeded domain is left out of the remaining URLs.

        Returns an 'applied_domain_limit' namedtuple holding two sets:
        remaining_urls (the entries that were kept) and exceeded_domains.
    """

    domain_list = subdomains(entries)

    # Tally how many times each domain occurs.
    tally = {}
    for domain in domain_list:
        tally[domain] = tally.get(domain, 0) + 1

    # A domain is exceeded as soon as its tally reaches the limit.
    over_limit = {domain for domain in domain_list
                  if tally[domain] >= domain_limit}

    kept = set()
    for entry in entries:
        # Drop the entry when either form of its domain is over the limit.
        if subdomain(entry) in over_limit or rootdomain(entry) in over_limit:
            continue
        kept.add(entry)

    result_type = namedtuple('applied_domain_limit',
                             'remaining_urls exceeded_domains')
    return result_type(kept, over_limit)
def apply_domain_limit(entries, domain_limit):
    """ Splits sanitised URLs into the ones to keep and the domains that
        have reached the limit.

        The domains returned by subdomains(entries) are tallied.  Every
        domain that occurs domain_limit times or more is treated as
        exceeded, and any entry whose subdomain or root domain is an
        exceeded domain is left out of the remaining URLs.

        Returns an 'applied_domain_limit' namedtuple holding two sets:
        remaining_urls (the entries that were kept) and exceeded_domains.
    """

    domain_list = subdomains(entries)

    # Tally how many times each domain occurs.
    tally = {}
    for domain in domain_list:
        tally[domain] = tally.get(domain, 0) + 1

    # A domain is exceeded as soon as its tally reaches the limit.
    over_limit = {domain for domain in domain_list
                  if tally[domain] >= domain_limit}

    kept = set()
    for entry in entries:
        # Drop the entry when either form of its domain is over the limit.
        if subdomain(entry) in over_limit or rootdomain(entry) in over_limit:
            continue
        kept.add(entry)

    result_type = namedtuple('applied_domain_limit',
                             'remaining_urls exceeded_domains')
    return result_type(kept, over_limit)
Beispiel #3
0
def remove_redundant_domains(old_domains, new_domains):
    """ Drops the entries that clash between the existing domains and
    the newly created ones.

    An old domain is kept only when its root domain is not among the
    new domains; a new domain is kept only when it is not already among
    the old domains.  Both results pass through a set, so duplicates
    are collapsed and the order of the returned lists is not guaranteed.

    Returns a (kept_old_domains, kept_new_domains) tuple of lists.
    """

    kept_old = {domain for domain in old_domains
                if rootdomain(domain) not in new_domains}
    kept_new = {domain for domain in new_domains
                if domain not in old_domains}
    return (list(kept_old), list(kept_new))
def remove_redundant_domains(old_domains, new_domains):
    """ Drops the entries that clash between the existing domains and
    the newly created ones.

    An old domain is kept only when its root domain is not among the
    new domains; a new domain is kept only when it is not already among
    the old domains.  Both results pass through a set, so duplicates
    are collapsed and the order of the returned lists is not guaranteed.

    Returns a (kept_old_domains, kept_new_domains) tuple of lists.
    """

    kept_old = {domain for domain in old_domains
                if rootdomain(domain) not in new_domains}
    kept_new = {domain for domain in new_domains
                if domain not in old_domains}
    return (list(kept_old), list(kept_new))
Beispiel #5
0
def combine_with_original_disavow(file_contents, disavow_entries):
    """ Takes the text of the original disavow file and the processed
        disavow entries and combines them into the lines of a new
        disavow file, keeping the order and the comments of the original
        document.

        Each line of file_contents is handled as follows:
          * empty or whitespace-only lines are skipped, as are lines that
            are empty once a pair of wrapping double quotes is removed;
          * lines starting with '#' are copied through unchanged;
          * 'domain:' lines are cleaned and written once if the domain is
            in disavow_entries['domain_entries']; a repeat becomes a
            comment, and a domain that is not in that collection
            produces no output line;
          * any other line is treated as a link.  If its subdomain or
            root domain is in disavow_entries['domain_entries'], a
            'domain:' line is written the first time and a comment
            afterwards; otherwise the link is written once if it is in
            disavow_entries['url_entries'];
          * entries for which normalize() returns a falsy value are
            written back as '# invalid entry - ' comments.

        Args:
            file_contents: the original disavow file as a single string.
            disavow_entries: mapping with 'domain_entries' and
                'url_entries' collections supporting membership tests.

        Returns:
            A list of strings, one per line of the new disavow file.
     """

    output = []
    urls_encountered = set()
    domains_encountered = set()

    for raw_entry in file_contents.splitlines():

        # Empty and whitespace-only lines carry no entry.
        if not raw_entry or raw_entry.isspace():
            continue

        # Strip quotes for lines wrapped in quotes
        if raw_entry.startswith('"') and raw_entry.endswith('"'):
            raw_entry = raw_entry[1:-1]
            if not raw_entry:
                # The line was only '"' or '""': nothing is left to
                # classify, so skip it like a blank line instead of
                # failing on the first-character check below.
                continue

        if raw_entry.startswith('#'):
            # line is a comment, so we just keep it
            output.append(raw_entry)
            continue

        if raw_entry.startswith('domain:'):
            # line is a domain entry; clean the part after the prefix
            domain_normalized = normalize(raw_entry[7:])

            # if it is not valid, include it as a comment
            if not domain_normalized:
                output.append('# invalid entry - ' + raw_entry)
                continue

            clean_domain = subdomain(domain_normalized)
            # A domain that is not among the processed domain entries
            # produces no output line at all.
            if clean_domain in disavow_entries['domain_entries']:
                if clean_domain not in domains_encountered:
                    output.append('domain:' + clean_domain)
                    domains_encountered.add(clean_domain)
                else:
                    output.append('# domain entry already present - '
                                  + clean_domain)
            continue

        # line is a url entry
        url_normalized = normalize(raw_entry)

        # if the link is not valid, include it as a comment
        if not url_normalized:
            output.append('# invalid entry - ' + raw_entry)
            continue

        url_subdomain = subdomain(url_normalized)
        url_rootdomain = rootdomain(url_normalized)

        if url_subdomain in disavow_entries['domain_entries']:
            # The link is covered by a domain entry for its subdomain.
            if url_subdomain not in domains_encountered:
                domains_encountered.add(url_subdomain)
                output.append('domain:' + url_subdomain)
            else:
                output.append('# link now disavowed via new domain entry - '
                              + raw_entry)

        elif url_rootdomain in disavow_entries['domain_entries']:
            # The link is covered by a domain entry for its root domain.
            if url_rootdomain not in domains_encountered:
                domains_encountered.add(url_rootdomain)
                output.append('domain:' + url_rootdomain)
            else:
                output.append('# link now disavowed via new domain entry - '
                              + raw_entry)

        elif url_normalized in disavow_entries['url_entries']:
            # The link stays an individual entry; write it only once.
            if url_normalized not in urls_encountered:
                output.append(url_normalized)
                urls_encountered.add(url_normalized)
            else:
                output.append('# link entry already present')

        else:
            output.append('# error occurred, not sure what to do with this - '
                          + raw_entry)
    return output
def combine_with_original_disavow(file_contents, disavow_entries):
    """ Takes the text of the original disavow file and the processed
        disavow entries and combines them into the lines of a new
        disavow file, keeping the order and the comments of the original
        document.

        Each line of file_contents is handled as follows:
          * empty or whitespace-only lines are skipped, as are lines that
            are empty once a pair of wrapping double quotes is removed;
          * lines starting with '#' are copied through unchanged;
          * 'domain:' lines are cleaned and written once if the domain is
            in disavow_entries['domain_entries']; a repeat becomes a
            comment, and a domain that is not in that collection
            produces no output line;
          * any other line is treated as a link.  If its subdomain or
            root domain is in disavow_entries['domain_entries'], a
            'domain:' line is written the first time and a comment
            afterwards; otherwise the link is written once if it is in
            disavow_entries['url_entries'];
          * entries for which normalize() returns a falsy value are
            written back as '# invalid entry - ' comments.

        Args:
            file_contents: the original disavow file as a single string.
            disavow_entries: mapping with 'domain_entries' and
                'url_entries' collections supporting membership tests.

        Returns:
            A list of strings, one per line of the new disavow file.
     """

    output = []
    urls_encountered = set()
    domains_encountered = set()

    for raw_entry in file_contents.splitlines():

        # Empty and whitespace-only lines carry no entry.
        if not raw_entry or raw_entry.isspace():
            continue

        # Strip quotes for lines wrapped in quotes
        if raw_entry.startswith('"') and raw_entry.endswith('"'):
            raw_entry = raw_entry[1:-1]
            if not raw_entry:
                # The line was only '"' or '""': nothing is left to
                # classify, so skip it like a blank line instead of
                # failing on the first-character check below.
                continue

        if raw_entry.startswith('#'):
            # line is a comment, so we just keep it
            output.append(raw_entry)
            continue

        if raw_entry.startswith('domain:'):
            # line is a domain entry; clean the part after the prefix
            domain_normalized = normalize(raw_entry[7:])

            # if it is not valid, include it as a comment
            if not domain_normalized:
                output.append('# invalid entry - ' + raw_entry)
                continue

            clean_domain = subdomain(domain_normalized)
            # A domain that is not among the processed domain entries
            # produces no output line at all.
            if clean_domain in disavow_entries['domain_entries']:
                if clean_domain not in domains_encountered:
                    output.append('domain:' + clean_domain)
                    domains_encountered.add(clean_domain)
                else:
                    output.append('# domain entry already present - '
                                  + clean_domain)
            continue

        # line is a url entry
        url_normalized = normalize(raw_entry)

        # if the link is not valid, include it as a comment
        if not url_normalized:
            output.append('# invalid entry - ' + raw_entry)
            continue

        url_subdomain = subdomain(url_normalized)
        url_rootdomain = rootdomain(url_normalized)

        if url_subdomain in disavow_entries['domain_entries']:
            # The link is covered by a domain entry for its subdomain.
            if url_subdomain not in domains_encountered:
                domains_encountered.add(url_subdomain)
                output.append('domain:' + url_subdomain)
            else:
                output.append('# link now disavowed via new domain entry - '
                              + raw_entry)

        elif url_rootdomain in disavow_entries['domain_entries']:
            # The link is covered by a domain entry for its root domain.
            if url_rootdomain not in domains_encountered:
                domains_encountered.add(url_rootdomain)
                output.append('domain:' + url_rootdomain)
            else:
                output.append('# link now disavowed via new domain entry - '
                              + raw_entry)

        elif url_normalized in disavow_entries['url_entries']:
            # The link stays an individual entry; write it only once.
            if url_normalized not in urls_encountered:
                output.append(url_normalized)
                urls_encountered.add(url_normalized)
            else:
                output.append('# link entry already present')

        else:
            output.append('# error occurred, not sure what to do with this - '
                          + raw_entry)
    return output