def import_from_file_contents(file_contents):
    """
    Parse a disavow-style file body into its component entries.

    Takes a string, such as that from extract_file_contents(), splits it on
    new lines, and returns a dictionary with two lists:

    - 'domains': the subdomain extracted from each valid 'domain:' entry
    - 'urls': each valid standalone URL entry

    Comment lines (starting with '#') and blank lines are ignored, as are
    entries that fail URL normalization.
    """
    urls = []
    domains = []
    for lineraw in file_contents.splitlines():
        # Skip blank / whitespace-only lines.
        if lineraw.isspace() or lineraw == "":
            continue
        # Strip surrounding quotes from lines wrapped in them.
        if lineraw.startswith('"') and lineraw.endswith('"'):
            lineraw = lineraw[1:-1]
        # BUG FIX: a line consisting only of '""' (or a lone '"') becomes
        # empty after quote stripping, and lineraw[0] below would raise
        # IndexError. Skip such lines.
        if not lineraw:
            continue
        if lineraw[0] == '#':
            # comment entry
            continue
        # checking if url is valid
        if not normalize(lineraw):
            continue
        if lineraw[:7] == "domain:":
            # domain entry. Note: splitlines() never leaves a trailing
            # newline, so the original re.sub("\n", "", ...) was a no-op
            # and has been removed.
            line = lineraw[7:]
            # checking if domain url is valid
            if not normalize(line):
                continue
            if line.startswith('"') and line.endswith('"'):
                line = line[1:-1]
            # We run the domain extract here, as sometimes people accidentally
            # put full URLs in domain entries. We assume they mean to exclude
            # the domain (which is often now recommended anyway - "no good
            # links from bad domains").
            domains.append(subdomain(line))
        else:
            # not a domain entry
            urls.append(lineraw)
    return {'urls': urls, 'domains': domains}
def construct_url(self, url):
    """Build a single normalized URL from the client's configured parts.

    Joins `_baseurl`, `_namespace`, `_endpoint` and the specified `url`
    with '/' and normalizes the combined path.

    Parameters::

    - `url` (str) - The full URL, or URL segment, to send to specified
      request to

    Returns::

        str

    Example::

        >>> client = ripcord.Client(
        ...     baseurl='https://api.twitter.com',
        ...     namespace='1/',
        ...     endpoint='statuses/user_timeline/'
        ... )
        >>> client.construct_url('wilhelm.json')
        'https://api.twitter.com/1/statuses/user_timeline/wilhelm.json'
    """
    # Collect only the configured (truthy) components, in fixed order.
    parts = [
        segment
        for segment in (self.baseurl, self.namespace, self.endpoint)
        if segment
    ]
    parts.append(url)
    normalized_url = urls.normalize('/'.join(parts))
    # Unless configured to keep it, drop any trailing slash.
    return normalized_url if self.keep_trailing_slash else normalized_url.rstrip('/')
def combine_with_original_disavow(file_contents, disavow_entries):
    """
    Takes the disavow file passed to disavow_file_to_dict() and its
    resulting output and combines them to create a .txt file with the
    relevant 'domain:' entries and individual links to be disavowed, while
    maintaining the order and the comments from the original document.

    NOTE(review): an identical function with this name is defined again
    later in this file; at import time the later definition shadows this
    one — confirm which copy is intended and remove the other.
    """
    output = []
    file_contents = file_contents.splitlines()
    urls_encountered = set()
    domains_encountered = set()
    for raw_entry in file_contents:
        # Skip blank / whitespace-only lines.
        if raw_entry.isspace() or raw_entry == "":
            continue
        # Strip quotes for lines wrapped in quotes
        if raw_entry.startswith('"') and raw_entry.endswith('"'):
            raw_entry = raw_entry[1:-1]
        # BUG FIX: a line of just '""' becomes empty after quote stripping,
        # and raw_entry[0] below would raise IndexError. Skip such lines.
        if not raw_entry:
            continue
        if raw_entry[0] == '#':
            # line is a comment, so we just keep it
            output.append(raw_entry)
            continue
        if raw_entry[:7] == 'domain:':
            # line is a domain entry: clean the domain entry
            domain_normalized = normalize(raw_entry[7:])
            # check if it is valid; if not then include it as a comment
            if not domain_normalized:
                output.append('# invalid entry - ' + raw_entry)
            else:
                clean_domain = subdomain(domain_normalized)
                if clean_domain in disavow_entries['domain_entries']:
                    if clean_domain not in domains_encountered:
                        output.append('domain:' + clean_domain)
                        domains_encountered.add(clean_domain)
                    else:
                        output.append('# domain entry already present - ' + clean_domain)
                # NOTE(review): a valid domain entry absent from
                # disavow_entries['domain_entries'] is silently dropped here
                # (original behavior, preserved) — confirm this is intended.
        else:
            # line is a url entry: clean the url entry
            url_normalized = normalize(raw_entry)
            # check if link entry is valid
            if not url_normalized:
                output.append('# invalid entry - ' + raw_entry)
            else:
                url_subdomain = subdomain(url_normalized)
                url_rootdomain = rootdomain(url_normalized)
                if url_subdomain in disavow_entries['domain_entries']:
                    # URL's subdomain is now disavowed wholesale.
                    if url_subdomain not in domains_encountered:
                        domains_encountered.add(url_subdomain)
                        output.append('domain:' + url_subdomain)
                    else:
                        output.append('# link now disavowed via new domain entry - ' + raw_entry)
                elif url_rootdomain in disavow_entries['domain_entries']:
                    # URL's root domain is now disavowed wholesale.
                    if url_rootdomain not in domains_encountered:
                        domains_encountered.add(url_rootdomain)
                        output.append('domain:' + url_rootdomain)
                    else:
                        output.append('# link now disavowed via new domain entry - ' + raw_entry)
                elif url_normalized in disavow_entries['url_entries']:
                    # Individually disavowed URL; emit once.
                    if url_normalized not in urls_encountered:
                        output.append(url_normalized)
                        urls_encountered.add(url_normalized)
                    else:
                        output.append('# link entry already present')
                else:
                    output.append('# error occurred, not sure what to do with this - ' + raw_entry)
    return output
def combine_with_original_disavow(file_contents, disavow_entries):
    """
    Takes the disavow file passed to disavow_file_to_dict() and its
    resulting output and combines them to create a .txt file with the
    relevant 'domain:' entries and individual links to be disavowed, while
    maintaining the order and the comments from the original document.

    NOTE(review): this function is a duplicate of an identical definition
    earlier in this file; this later copy is the one bound at import time —
    confirm which copy is intended and remove the other.
    """
    output = []
    file_contents = file_contents.splitlines()
    urls_encountered = set()
    domains_encountered = set()
    for raw_entry in file_contents:
        # Skip blank / whitespace-only lines.
        if raw_entry.isspace() or raw_entry == "":
            continue
        # Strip quotes for lines wrapped in quotes
        if raw_entry.startswith('"') and raw_entry.endswith('"'):
            raw_entry = raw_entry[1:-1]
        # BUG FIX: a line of just '""' becomes empty after quote stripping,
        # and raw_entry[0] below would raise IndexError. Skip such lines.
        if not raw_entry:
            continue
        if raw_entry[0] == '#':
            # line is a comment, so we just keep it
            output.append(raw_entry)
            continue
        if raw_entry[:7] == 'domain:':
            # line is a domain entry: clean the domain entry
            domain_normalized = normalize(raw_entry[7:])
            # check if it is valid; if not then include it as a comment
            if not domain_normalized:
                output.append('# invalid entry - ' + raw_entry)
            else:
                clean_domain = subdomain(domain_normalized)
                if clean_domain in disavow_entries['domain_entries']:
                    if clean_domain not in domains_encountered:
                        output.append('domain:' + clean_domain)
                        domains_encountered.add(clean_domain)
                    else:
                        output.append('# domain entry already present - ' + clean_domain)
                # NOTE(review): a valid domain entry absent from
                # disavow_entries['domain_entries'] is silently dropped here
                # (original behavior, preserved) — confirm this is intended.
        else:
            # line is a url entry: clean the url entry
            url_normalized = normalize(raw_entry)
            # check if link entry is valid
            if not url_normalized:
                output.append('# invalid entry - ' + raw_entry)
            else:
                url_subdomain = subdomain(url_normalized)
                url_rootdomain = rootdomain(url_normalized)
                if url_subdomain in disavow_entries['domain_entries']:
                    # URL's subdomain is now disavowed wholesale.
                    if url_subdomain not in domains_encountered:
                        domains_encountered.add(url_subdomain)
                        output.append('domain:' + url_subdomain)
                    else:
                        output.append(
                            '# link now disavowed via new domain entry - ' + raw_entry)
                elif url_rootdomain in disavow_entries['domain_entries']:
                    # URL's root domain is now disavowed wholesale.
                    if url_rootdomain not in domains_encountered:
                        domains_encountered.add(url_rootdomain)
                        output.append('domain:' + url_rootdomain)
                    else:
                        output.append(
                            '# link now disavowed via new domain entry - ' + raw_entry)
                elif url_normalized in disavow_entries['url_entries']:
                    # Individually disavowed URL; emit once.
                    if url_normalized not in urls_encountered:
                        output.append(url_normalized)
                        urls_encountered.add(url_normalized)
                    else:
                        output.append('# link entry already present')
                else:
                    output.append(
                        '# error occurred, not sure what to do with this - ' + raw_entry)
    return output