Example #1
import re

# normalize() and subdomain() are assumed to be URL helpers defined elsewhere
# in this module (validate/normalize a URL, extract its host).

def import_from_file_contents(file_contents):
    """ Takes a string, such as that from extract_file_contents(), splits it
        on newlines, and returns a dictionary of 'domain:' entries and
        standalone URL entries. Comment lines are ignored.
    """
    urls = []
    domains = []

    for lineraw in file_contents.splitlines():

        # skip blank lines
        if lineraw.isspace() or lineraw == "":
            continue

        # strip quotes from lines wrapped in quotes
        if lineraw.startswith('"') and lineraw.endswith('"'):
            lineraw = lineraw[1:-1]

        if lineraw[0] == '#':
            # comment entry
            continue

        # skip lines that do not normalize to a valid URL (note: for
        # 'domain:' lines this validates the raw line, prefix included)
        if not normalize(lineraw):
            continue

        if lineraw[:7] == "domain:":
            # domain entry; splitlines() already removed the newline,
            # so the re.sub() here is purely defensive
            line = re.sub("\n", "", lineraw[7:])

            # skip domain entries that do not normalize to a valid URL
            if not normalize(line):
                continue

            if line.startswith('"') and line.endswith('"'):
                line = line[1:-1]

            # We run the domain extract here, as sometimes people accidentally
            # put full URLs in domain entries. We assume they mean to exclude
            # the domain (which is often now recommended anyway - "no good
            # links from bad domains").
            domains.append(subdomain(line))
        else:
            # not a domain entry; treat it as a URL
            urls.append(re.sub("\n", "", lineraw))

    return {'urls': urls, 'domains': domains}
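
A quick sketch of how this parser might be exercised. Since normalize() and
subdomain() are assumed helpers not shown on this page, the exact values in
the result depend on their implementations:

contents = '\n'.join([
    '# links to disavow',
    'domain:spam.example',
    'http://bad.example/page.html',
])

result = import_from_file_contents(contents)
# -> {'urls': [...], 'domains': [...]}; with typical URL helpers this would be
# {'urls': ['http://bad.example/page.html'], 'domains': ['spam.example']}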
Example #2
    # Method of ripcord.Client; `urls` here is assumed to be the package's
    # URL-helper module (imported at module level, not shown in this excerpt).
    def construct_url(self, url):
        """Using the specified URL components, this method attempts to combine
        `_baseurl`, `_namespace`, `_endpoint` and the specified `url` into a
        single, normalized path.

        Parameters::
            - `url` (str) - The full URL, or URL segment, to send the request to

        Returns::
            str

        Example::
            >>> client = ripcord.Client(
                    baseurl='https://api.twitter.com',
                    namespace='1/',
                    endpoint='statuses/user_timeline/'
                )
            >>> client.construct_url('wilhelm.json')
            'https://api.twitter.com/1/statuses/user_timeline/wilhelm.json'
        """
        components = []
        if self.baseurl:
            components.append(self.baseurl)

        if self.namespace:
            components.append(self.namespace)

        if self.endpoint:
            components.append(self.endpoint)

        components.append(url)

        normalized_url = urls.normalize('/'.join(components))

        if not self.keep_trailing_slash:
            return normalized_url.rstrip('/')

        return normalized_url
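
The '/'.join() above can produce doubled slashes when components carry their
own trailing slashes, and urls.normalize() is relied on to collapse them. A
minimal stand-in sketch of that behavior (a hypothetical normalize, not
ripcord's actual implementation):

import re

def normalize(url):
    # Collapse runs of slashes in the path while leaving the '://' of the
    # scheme intact - a simplified stand-in for a real URL normalizer.
    scheme, sep, rest = url.partition('://')
    if sep:
        return scheme + sep + re.sub(r'/{2,}', '/', rest)
    return re.sub(r'/{2,}', '/', url)

parts = ['https://api.twitter.com', '1/', 'statuses/user_timeline/', 'wilhelm.json']
print(normalize('/'.join(parts)))
# https://api.twitter.com/1/statuses/user_timeline/wilhelm.json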
Example #3
# normalize(), subdomain() and rootdomain() are assumed to be URL helpers
# defined elsewhere in this module.

def combine_with_original_disavow(file_contents, disavow_entries):
    """ Takes the disavow file passed to disavow_file_to_dict() and its
        resulting output, and combines them to produce the lines of a .txt
        file with the relevant 'domain:' entries and individual links to be
        disavowed, while maintaining the order and the comments from the
        original document.
    """

    output = []
    urls_encountered = set()
    domains_encountered = set()
    for raw_entry in file_contents.splitlines():

        # skip blank lines
        if raw_entry.isspace() or raw_entry == "":
            continue

        # strip quotes from lines wrapped in quotes
        if raw_entry.startswith('"') and raw_entry.endswith('"'):
            raw_entry = raw_entry[1:-1]

        if raw_entry[0] == '#':
            # line is a comment, so we just keep it
            output.append(raw_entry)
            continue

        if raw_entry[:7] == 'domain:':
            # line is a domain entry

            # clean the domain entry
            domain_normalized = normalize(raw_entry[7:])

            # check if it is valid; if not, include it as a comment
            if not domain_normalized:
                output.append('# invalid entry - ' + raw_entry)

            else:
                clean_domain = subdomain(domain_normalized)
                if clean_domain in disavow_entries['domain_entries']:
                    if clean_domain not in domains_encountered:
                        output.append('domain:' + clean_domain)
                        domains_encountered.add(clean_domain)
                    else:
                        output.append('# domain entry already present - ' + clean_domain)
        else:
            # line is a URL entry

            # clean the URL entry
            url_normalized = normalize(raw_entry)

            # check if the link entry is valid
            if not url_normalized:
                output.append('# invalid entry - ' + raw_entry)

            else:
                url_subdomain = subdomain(url_normalized)
                url_rootdomain = rootdomain(url_normalized)

                if url_subdomain in disavow_entries['domain_entries']:

                    if url_subdomain not in domains_encountered:
                        domains_encountered.add(url_subdomain)
                        output.append('domain:' + url_subdomain)

                    else:
                        output.append('# link now disavowed via new domain entry - ' + raw_entry)

                elif url_rootdomain in disavow_entries['domain_entries']:

                    if url_rootdomain not in domains_encountered:
                        domains_encountered.add(url_rootdomain)
                        output.append('domain:' + url_rootdomain)

                    else:
                        output.append('# link now disavowed via new domain entry - ' + raw_entry)

                elif url_normalized in disavow_entries['url_entries']:
                    if url_normalized not in urls_encountered:
                        output.append(url_normalized)
                        urls_encountered.add(url_normalized)
                    else:
                        output.append('# link entry already present')

                else:
                    output.append('# error occurred, not sure what to do with this - ' + raw_entry)
    return output
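
A hedged end-to-end sketch, assuming disavow_file_to_dict() (not shown on
this page) returns a dict with 'domain_entries' and 'url_entries' collections
of already-cleaned strings; the exact matching depends on what the
normalize()/subdomain()/rootdomain() helpers return:

original = '\n'.join([
    '# reviewed 2014-02-01',
    'domain:spammy-links.example',
    'http://bad.example/path',
])

# Hypothetical output of disavow_file_to_dict(original); the real shape
# depends on that helper and on the URL helpers above.
disavow_entries = {
    'domain_entries': {'spammy-links.example'},
    'url_entries': {'http://bad.example/path'},
}

new_file = '\n'.join(combine_with_original_disavow(original, disavow_entries))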