Esempio n. 1
0
def apply_domain_limit(entries, domain_limit):
    """ Filters a list of sanitised URLs by how often their domain occurs.

        The domains of the entries are obtained through subdomains() and
        tallied.  Every domain that occurs at least `domain_limit` times
        is treated as exceeded, and each entry whose subdomain() or
        rootdomain() is an exceeded domain is dropped.

        Returns an 'applied_domain_limit' namedtuple holding two sets:
        `remaining_urls` (the entries that were kept) and
        `exceeded_domains` (the domains that reached the limit).
    """

    occurrences = {}
    for domain in subdomains(entries):
        occurrences[domain] = occurrences.get(domain, 0) + 1

    # A domain is flagged as soon as its tally reaches the limit (>=).
    over_limit = {
        domain for domain, tally in occurrences.items()
        if tally >= domain_limit
    }

    # Keep an entry only when neither of its domain forms was flagged.
    kept = {
        entry for entry in entries
        if not (subdomain(entry) in over_limit
                or rootdomain(entry) in over_limit)
    }

    result_type = namedtuple('applied_domain_limit', 'remaining_urls exceeded_domains')
    return result_type(kept, over_limit)
Esempio n. 2
0
def apply_domain_limit(entries, domain_limit):
    """ Drops the URLs whose domain shows up too often in `entries`.

        Domains are taken from subdomains(entries) and counted.  A domain
        whose count is greater than or equal to `domain_limit` is marked
        as exceeded; an entry is discarded when either subdomain(entry)
        or rootdomain(entry) is one of the exceeded domains.

        The result is an 'applied_domain_limit' namedtuple with the
        fields `remaining_urls` (set of surviving entries) and
        `exceeded_domains` (set of domains that hit the limit).
    """

    tally = {}
    for domain in subdomains(entries):
        tally.setdefault(domain, 0)
        tally[domain] += 1

    exceeded = set()
    for domain, seen in tally.items():
        if seen >= domain_limit:
            exceeded.add(domain)

    remaining = set()
    for entry in entries:
        if subdomain(entry) in exceeded:
            continue
        if rootdomain(entry) in exceeded:
            continue
        remaining.add(entry)

    outcome = namedtuple('applied_domain_limit',
                         'remaining_urls exceeded_domains')

    return outcome(remaining, exceeded)
Esempio n. 3
0
def apply_disavow(disavow_entries, urls_list):
    """ Using a disavow file, tests which of a list of urls would be
        disavowed and which wouldn't.

        `disavow_entries` is a dict that may hold a 'urls' key (individual
        links) and/or a 'domains' key (domain entries); either key may be
        missing.  `urls_list` holds the urls to test.

        A url counts as disavowed when it matches a disavow link exactly
        or when its subdomain() matches one of the disavow domains.

        Returns a dict with the 'disavowed' and 'non_disavowed' url
        lists, their totals, and the urls_entered / unique_urls_entered
        values reported by normalize_and_dedupe_with_counts() for the
        tested urls.  The 'disavow_links_entered' and
        'unique_disavow_links_entered' keys are only present when 'urls'
        was supplied; likewise 'domains_entered' and
        'unique_domains_entered' are only present when 'domains' was
        supplied.
    """

    # Sets give constant-time membership tests in the loop below.
    disavow_links = set()
    disavow_domains = set()
    # Stays None when no 'domains' key was given, so the domain figures
    # can be skipped instead of raising UnboundLocalError.
    disavow_domains_details = None
    output_dict = {}

    if 'urls' in disavow_entries:
        disavow_links_details = normalize_and_dedupe_with_counts(
            disavow_entries['urls'])
        disavow_links = set(disavow_links_details.clean_urls)
        output_dict[
            'disavow_links_entered'] = disavow_links_details.urls_entered
        output_dict[
            'unique_disavow_links_entered'] = disavow_links_details.unique_urls_entered
    if 'domains' in disavow_entries:
        disavow_domains_details = normalize_and_dedupe_with_counts(
            disavow_entries['domains'])
        disavow_domains = set(
            subdomains(disavow_domains_details.clean_urls))

    urls_to_test_details = normalize_and_dedupe_with_counts(urls_list)
    urls = urls_to_test_details.clean_urls

    disavowed_urls = []
    non_disavowed_urls = []

    for url in urls:
        if (url in disavow_links) or (subdomain(url) in disavow_domains):
            disavowed_urls.append(url)
        else:
            non_disavowed_urls.append(url)

    output_dict.update({
        'disavowed': disavowed_urls,
        'non_disavowed': non_disavowed_urls,
    })
    # Mirror the handling of the 'urls' keys: only report domain figures
    # when domain entries were actually supplied.
    if disavow_domains_details is not None:
        output_dict.update({
            'domains_entered': disavow_domains_details.urls_entered,
            'unique_domains_entered':
            disavow_domains_details.unique_urls_entered,
        })
    output_dict.update({
        'urls_entered_to_test': urls_to_test_details.urls_entered,
        'unique_urls_entered_to_test':
        urls_to_test_details.unique_urls_entered,
        'total_disavowed_links': len(disavowed_urls),
        'total_remaining_links': len(non_disavowed_urls),
    })

    return output_dict
Esempio n. 4
0
def apply_disavow(disavow_entries, urls_list):
    """ Using a disavow file, tests which of a list of urls would be
        disavowed and which wouldn't.

        `disavow_entries` is a dict with an optional 'urls' key
        (individual links) and an optional 'domains' key (domain
        entries).  `urls_list` holds the urls to test.

        A url is disavowed when it is one of the disavow links, or when
        subdomain(url) is one of the disavow domains.

        Returns a dict containing the 'disavowed' and 'non_disavowed'
        url lists, 'total_disavowed_links' and 'total_remaining_links',
        and the urls_entered / unique_urls_entered values that
        normalize_and_dedupe_with_counts() reports for the tested urls.
        The disavow link keys ('disavow_links_entered',
        'unique_disavow_links_entered') appear only when 'urls' was
        given, and the domain keys ('domains_entered',
        'unique_domains_entered') appear only when 'domains' was given.
    """

    # Sets keep the per-url membership checks below cheap.
    disavow_links = set()
    disavow_domains = set()
    # None marks "no domain entries supplied"; without this default the
    # later lookup raised UnboundLocalError when 'domains' was missing.
    disavow_domains_details = None
    output_dict = {}

    if 'urls' in disavow_entries:
        disavow_links_details = normalize_and_dedupe_with_counts(disavow_entries['urls'])
        disavow_links = set(disavow_links_details.clean_urls)
        output_dict['disavow_links_entered'] = disavow_links_details.urls_entered
        output_dict['unique_disavow_links_entered'] = disavow_links_details.unique_urls_entered
    if 'domains' in disavow_entries:
        disavow_domains_details = normalize_and_dedupe_with_counts(disavow_entries['domains'])
        disavow_domains = set(subdomains(disavow_domains_details.clean_urls))

    urls_to_test_details = normalize_and_dedupe_with_counts(urls_list)
    urls = urls_to_test_details.clean_urls

    disavowed_urls = []
    non_disavowed_urls = []

    for url in urls:
        if (url in disavow_links) or (subdomain(url) in disavow_domains):
            disavowed_urls.append(url)
        else:
            non_disavowed_urls.append(url)

    output_dict.update({
        'disavowed': disavowed_urls,
        'non_disavowed': non_disavowed_urls,
    })
    # Domain figures are reported only when domain entries were supplied,
    # matching how the disavow link figures are handled above.
    if disavow_domains_details is not None:
        output_dict.update({
            'domains_entered': disavow_domains_details.urls_entered,
            'unique_domains_entered': disavow_domains_details.unique_urls_entered,
        })
    output_dict.update({
        'urls_entered_to_test': urls_to_test_details.urls_entered,
        'unique_urls_entered_to_test': urls_to_test_details.unique_urls_entered,
        'total_disavowed_links': len(disavowed_urls),
        'total_remaining_links': len(non_disavowed_urls),
    })

    return output_dict
Esempio n. 5
0
def disavow_file_to_dict(file_contents, domain_limit=False):
    """ Parses the contents of a disavow file into a summary dictionary.

        The file is split into 'urls' and 'domains' entries by
        import_from_file_contents().  Links already covered by a domain
        entry are removed via apply_disavow(), and when `domain_limit`
        is truthy apply_domain_limit() moves over-represented domains
        out of the link list and into the domain entries.

        The returned dict holds the final 'domain_entries' (a list), the
        remaining 'url_entries', 'urls_disavowed_count' (the number of
        remaining url entries) and the entered/unique figures reported
        by normalize_and_dedupe_with_counts().
    """

    parsed = import_from_file_contents(file_contents)
    url_details = normalize_and_dedupe_with_counts(parsed['urls'])
    remaining_links = url_details.clean_urls
    domain_details = normalize_and_dedupe_with_counts(parsed['domains'])
    listed_domains = subdomains(domain_details.clean_urls)

    if listed_domains:
        # Links that a domain entry already covers need no entry of
        # their own.
        remaining_links = apply_disavow(
            {"domains": parsed['domains']},
            parsed['urls'])['non_disavowed']

    all_domains = set(listed_domains)

    if domain_limit:
        remaining_links, added_domains = apply_domain_limit(
            remaining_links, domain_limit)
        all_domains.update(added_domains)

        if listed_domains:
            # NOTE(review): the values returned here never reach the
            # result dict below - confirm whether 'domain_entries' was
            # meant to be built from them.
            _kept_domains, _kept_added = remove_redundant_domains(
                listed_domains, added_domains)

    return {
        'domain_entries': list(all_domains),
        'url_entries': remaining_links,
        'urls_entered_count': url_details.urls_entered,
        'urls_disavowed_count': len(remaining_links),
        'unique_urls_entered_count': url_details.unique_urls_entered,
        'domain_entries_entered_count': domain_details.urls_entered,
    }
Esempio n. 6
0
def disavow_file_to_dict(file_contents, domain_limit=False):
    """ Turns the contents of a disavow file into a dictionary of
        domain entries, individual link entries and related figures.

        Steps: import_from_file_contents() splits the file into 'urls'
        and 'domains'; both are passed through
        normalize_and_dedupe_with_counts(); if there are domain entries,
        apply_disavow() keeps only the links they do not cover; if
        `domain_limit` is truthy, apply_domain_limit() removes links of
        over-represented domains and those domains are added to the
        domain entries.

        Returns a dict with 'domain_entries' (list), 'url_entries',
        'urls_entered_count', 'urls_disavowed_count' (length of
        'url_entries'), 'unique_urls_entered_count' and
        'domain_entries_entered_count'.
    """

    contents = import_from_file_contents(file_contents)
    raw_urls = contents['urls']
    links_info = normalize_and_dedupe_with_counts(raw_urls)
    links = links_info.clean_urls
    raw_domains = contents['domains']
    domains_info = normalize_and_dedupe_with_counts(raw_domains)
    domains = subdomains(domains_info.clean_urls)

    if domains:
        disavow_result = apply_disavow({"domains": raw_domains}, raw_urls)
        links = disavow_result['non_disavowed']

    combined_domains = set(domains)

    if domain_limit:
        links, limit_domains = apply_domain_limit(links, domain_limit)
        combined_domains.update(limit_domains)

    if domains and domain_limit:
        # NOTE(review): these reassigned names are not read again, so
        # the call does not influence the returned dict - verify intent.
        domains, limit_domains = remove_redundant_domains(domains,
                                                          limit_domains)

    link_total = len(links)
    summary = {
        'domain_entries': list(combined_domains),
        'url_entries': links,

        'urls_entered_count': links_info.urls_entered,
        'urls_disavowed_count': link_total,
        'unique_urls_entered_count': links_info.unique_urls_entered,
        'domain_entries_entered_count': domains_info.urls_entered,
    }
    return summary