from collections import namedtuple

# Helper functions such as normalize(), subdomain(), subdomains(),
# rootdomain() and normalize_and_dedupe_with_counts() are assumed to be
# defined elsewhere in this module.


def apply_domain_limit(entries, domain_limit):
    """
    Takes a list of sanitised URLs and looks at their domains. If a domain's
    count meets or exceeds the limit, the URLs from that domain are removed
    and replaced with a single 'domain:' entry. Returns the remaining URLs
    and the set of domains that hit the limit.
    """
    stripped_urls = subdomains(entries)
    new_url_set = set()
    exceeded_domains = set()
    domains_count = dict()

    # Count how many of the entries resolve to each domain.
    for url in stripped_urls:
        domains_count[url] = domains_count.get(url, 0) + 1

    # Collect every domain whose count meets or exceeds the limit.
    for url, count in domains_count.items():
        if count >= domain_limit:
            exceeded_domains.add(url)

    # Keep only the entries whose subdomain and root domain are both
    # under the limit.
    for entry in entries:
        if (subdomain(entry) not in exceeded_domains
                and rootdomain(entry) not in exceeded_domains):
            new_url_set.add(entry)

    applied_domain_limit = namedtuple('applied_domain_limit',
                                      'remaining_urls exceeded_domains')
    return applied_domain_limit(new_url_set, exceeded_domains)

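# A minimal usage sketch for apply_domain_limit(), assuming subdomain() maps
# e.g. 'http://spam.example.com/page1' -> 'spam.example.com' and that the
# entries are already sanitised. With domain_limit=2, both spam.example.com
# URLs are dropped and the domain is reported in exceeded_domains:
#
#   entries = ['http://spam.example.com/page1',
#              'http://spam.example.com/page2',
#              'http://other.net/page']
#   result = apply_domain_limit(entries, 2)
#   # result.remaining_urls   -> {'http://other.net/page'}
#   # result.exceeded_domains -> {'spam.example.com'}
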
def import_from_file_contents(file_contents):
    """
    Takes a string, such as that from extract_file_contents(), splits it on
    new lines, and returns a dictionary of 'domain:' entries and standalone
    URL entries. Comment lines are ignored.
    """
    urls = []
    domains = []

    for lineraw in file_contents.splitlines():
        if lineraw.isspace() or lineraw == "":
            continue
        # Strip quotes from lines wrapped in quotes.
        if lineraw.startswith('"') and lineraw.endswith('"'):
            lineraw = lineraw[1:-1]
        if lineraw[0] == '#':
            # comment entry
            continue
        # Skip lines that do not normalize to a valid URL.
        if not normalize(lineraw):
            continue
        if lineraw[:7] == "domain:":
            # domain entry
            line = lineraw[7:]
            # check that the domain part is itself valid
            if not normalize(line):
                continue
            if line.startswith('"') and line.endswith('"'):
                line = line[1:-1]
            # We run the domain extract here, as sometimes people accidentally
            # put full URLs in domain entries. We assume they mean to exclude
            # the domain (which is often now recommended anyway - "no good
            # links from bad domains").
            domain = subdomain(line)
            domains.append(domain)
        else:
            # not a domain entry
            urls.append(lineraw)

    return {'urls': urls, 'domains': domains}

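# A quick illustration, assuming normalize() accepts bare domains and plain
# URLs and returns a falsy value for invalid lines:
#
#   contents = ('# my disavow file\n'
#               'domain:spam.example.com\n'
#               'http://other.net/bad-page\n')
#   import_from_file_contents(contents)
#   # -> {'urls': ['http://other.net/bad-page'],
#   #     'domains': ['spam.example.com']}
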
def apply_disavow(disavow_entries, urls_list):
    """
    Using a disavow file, tests which of a file of URLs would be disavowed
    and which wouldn't.
    """
    disavow_links = []
    disavow_domains = []
    output_dict = {}
    # Default to zero when no domain entries are supplied, so the counts
    # below are always defined.
    domains_entered = 0
    unique_domains_entered = 0

    if 'urls' in disavow_entries:
        disavow_links_details = normalize_and_dedupe_with_counts(
            disavow_entries['urls'])
        disavow_links = disavow_links_details.clean_urls
        output_dict['disavow_links_entered'] = (
            disavow_links_details.urls_entered)
        output_dict['unique_disavow_links_entered'] = (
            disavow_links_details.unique_urls_entered)

    if 'domains' in disavow_entries:
        disavow_domains_details = normalize_and_dedupe_with_counts(
            disavow_entries['domains'])
        disavow_domains = subdomains(disavow_domains_details.clean_urls)
        domains_entered = disavow_domains_details.urls_entered
        unique_domains_entered = disavow_domains_details.unique_urls_entered

    urls_to_test_details = normalize_and_dedupe_with_counts(urls_list)
    urls = urls_to_test_details.clean_urls

    # Use sets for O(1) membership tests.
    disavow_links = set(disavow_links)
    disavow_domains = set(disavow_domains)

    disavowed_urls = []
    non_disavowed_urls = []
    for url in urls:
        if (url in disavow_links) or (subdomain(url) in disavow_domains):
            disavowed_urls.append(url)
        else:
            non_disavowed_urls.append(url)

    output_dict.update({
        'disavowed': disavowed_urls,
        'non_disavowed': non_disavowed_urls,
        'domains_entered': domains_entered,
        'unique_domains_entered': unique_domains_entered,
        'urls_entered_to_test': urls_to_test_details.urls_entered,
        'unique_urls_entered_to_test': urls_to_test_details.unique_urls_entered,
        'total_disavowed_links': len(disavowed_urls),
        'total_remaining_links': len(non_disavowed_urls)
    })
    return output_dict

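# Sketch of a call, assuming the helpers behave as above. The entries dict
# is shaped like the output of import_from_file_contents():
#
#   entries = {'urls': ['http://other.net/bad-page'],
#              'domains': ['spam.example.com']}
#   report = apply_disavow(entries, ['http://spam.example.com/page1',
#                                    'http://other.net/bad-page',
#                                    'http://fine.org/'])
#   # report['disavowed']     -> the first two URLs
#   # report['non_disavowed'] -> ['http://fine.org/']
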
def combine_with_original_disavow(file_contents, disavow_entries):
    """
    Takes the disavow file passed to disavow_file_to_dict() and its resulting
    output and combines them to create a .txt file with the relevant
    'domain:' entries and individual links to be disavowed, while maintaining
    the order and the comments from the original document.
    """
    output = []
    urls_encountered = set()
    domains_encountered = set()

    for raw_entry in file_contents.splitlines():
        if raw_entry.isspace() or raw_entry == "":
            continue
        # Strip quotes from lines wrapped in quotes.
        if raw_entry.startswith('"') and raw_entry.endswith('"'):
            raw_entry = raw_entry[1:-1]
        if raw_entry[0] == '#':
            # line is a comment, so we just keep it
            output.append(raw_entry)
            continue
        if raw_entry[:7] == 'domain:':
            # line is a domain entry; clean it
            domain_normalized = normalize(raw_entry[7:])
            # check that it is valid; if not, include it as a comment
            if not domain_normalized:
                output.append('# invalid entry - ' + raw_entry)
            else:
                clean_domain = subdomain(domain_normalized)
                if clean_domain in disavow_entries['domain_entries']:
                    if clean_domain not in domains_encountered:
                        output.append('domain:' + clean_domain)
                        domains_encountered.add(clean_domain)
                    else:
                        output.append('# domain entry already present - '
                                      + clean_domain)
        else:
            # line is a URL entry; clean it
            url_normalized = normalize(raw_entry)
            # check that the link entry is valid
            if not url_normalized:
                output.append('# invalid entry - ' + raw_entry)
            else:
                url_subdomain = subdomain(url_normalized)
                url_rootdomain = rootdomain(url_normalized)
                if url_subdomain in disavow_entries['domain_entries']:
                    if url_subdomain not in domains_encountered:
                        domains_encountered.add(url_subdomain)
                        output.append('domain:' + url_subdomain)
                    else:
                        output.append(
                            '# link now disavowed via new domain entry - '
                            + raw_entry)
                elif url_rootdomain in disavow_entries['domain_entries']:
                    if url_rootdomain not in domains_encountered:
                        domains_encountered.add(url_rootdomain)
                        output.append('domain:' + url_rootdomain)
                    else:
                        output.append(
                            '# link now disavowed via new domain entry - '
                            + raw_entry)
                elif url_normalized in disavow_entries['url_entries']:
                    if url_normalized not in urls_encountered:
                        output.append(url_normalized)
                        urls_encountered.add(url_normalized)
                    else:
                        output.append('# link entry already present - '
                                      + url_normalized)
                else:
                    output.append(
                        '# error occurred, not sure what to do with this - '
                        + raw_entry)
    return output

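# A hedged end-to-end sketch: parse the original file, then merge an updated
# entry set back in while preserving comments and order. The 'domain_entries'
# and 'url_entries' keys are what this function expects; here they are built
# by hand for illustration, and normalize() is assumed to return these URLs
# unchanged:
#
#   original = ('# penalty cleanup\n'
#               'http://spam.example.com/page1\n'
#               'http://other.net/bad-page\n')
#   new_entries = {'domain_entries': {'spam.example.com'},
#                  'url_entries': {'http://other.net/bad-page'}}
#   combine_with_original_disavow(original, new_entries)
#   # -> ['# penalty cleanup',
#   #     'domain:spam.example.com',
#   #     'http://other.net/bad-page']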