def __init__(self, pipes=None, observers=None):
    if not observers:
        observers = []
    if not pipes:
        pipes = []
    self._pipes = pipes
    self.lock = ReadWriteLock()
    self.plumbings = [plumbing(v) for v in pipes]
    self.refresh = MDUpdate(cherrypy.engine,
                            server=self,
                            frequency=config.update_frequency)
    self.refresh.subscribe()
    self.aliases = config.aliases
    self.psl = PublicSuffixList()
    self.md = MDRepository()
    self.ready = False
    if config.autoreload:
        for f in pipes:
            cherrypy.engine.autoreload.files.add(f)
def run(fname=None, iface=None, log_by_ip=False, launch_ff=False,
        sslstriplog=None, sslsplitdir=None):
    global psl
    global ip_logging
    psl = PublicSuffixList()
    ip_logging = log_by_ip
    if not fname and not iface and not launch_ff:
        # print_help() writes directly to stdout; printing its return value
        # would just emit "None".
        parser.print_help()
        exit(-1)
    if launch_ff:
        if sslsplitdir:
            parsesslsplit(sslsplitdir)
        launch_firefox()
    else:
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)
        print "MANA (FireLamb) : [+] Saving output to %s" % save_dir
        if iface:
            print "MANA (FireLamb) : [+] Listening for cookie traffic on interface %s" % iface
            sniff(iface=iface, prn=process)
        elif fname:
            print "MANA (FireLamb) : [+] Reading pcap file '%s'...." % fname
            packets = rdpcap(fname)
            print "MANA (FireLamb) : [+] Processing file contents..."
            for p in packets:
                process(p)
            print "MANA (FireLamb) : [+] Done."
def domain_split(server_domain):
    '''
    server_domain is the service name plus the domain used by the site.
    Split the domain into a prefix (service name), a host domain, and a
    suffix (top-level domain).
    Input www.baidu.com  -> 'www', 'baidu', 'com'
    Input 172.31.137.240 -> '', '172.31.137.240', ''
    '''
    PSL_FILE = codecs.open('public_suffix_list.dat', encoding='utf8')
    psl = PublicSuffixList(PSL_FILE)
    domain = psl.get_public_suffix(server_domain)
    # Take the first label of the registered domain as the host domain;
    # everything after the first '.' is the top-level domain, and whatever
    # precedes the registered domain in the input is the service prefix.
    if '.' in domain:
        server = server_domain[:-len(domain)]
        host = domain[:domain.index('.')]
        top = domain[domain.index('.'):]
        hostname = server + host + top
    else:
        # Suffix extraction failed (e.g. an IP such as 172.31.137.240);
        # treat the whole input as the host domain.
        server = ''
        host = server_domain
        top = ''
        hostname = server_domain
    return server, host, top, hostname
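A quick sanity check of domain_split, assuming public_suffix_list.dat is present in the working directory. Note that, unlike the docstring's shorthand, the returned prefix keeps its trailing dot and the suffix its leading dot:

# Expected: ('www.', 'baidu', '.com', 'www.baidu.com')
print(domain_split('www.baidu.com'))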
def alexa_malware_scan(url):
    domain = PublicSuffixList().get_public_suffix(
        urlparse(url).netloc)  # IRIs are going to be a pain here.
    pipe = redis_db["slave"].pipeline()
    pipe.hlen(domains_key)
    pipe.hmget(domains_key, domain)
    total, score = pipe.execute()
    score = score[0]

    def rank_to_ratio(score, total):
        """
        If the score is between 1 and 1 million, never return 1.
        If the score is None, return 1.
        """
        if score is not None:
            score = int(score) - 1
            return score / total
        else:
            return 1

    return [{"type": "generic", "confidence": rank_to_ratio(score, total)}]
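For intuition, a worked example of rank_to_ratio's arithmetic. The `demo` helper below is hypothetical and simply mirrors the function; the arithmetic assumes Python 3's true division (under Python 2, `score / total` with ints would floor to 0):

def demo(score, total=1000000):
    # Mirrors rank_to_ratio: None -> 1, otherwise (rank - 1) / total
    return 1 if score is None else (int(score) - 1) / total

assert demo(1) == 0.0        # most popular site: lowest confidence
assert demo(500001) == 0.5   # mid-list
assert demo(None) == 1       # unlisted domain: full confidence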
from goslate import Goslate
from publicsuffix import PublicSuffixList
from unidecode import unidecode
import keywords
import pickle  # needed for stopwords below
import re      # needed by cleanString below

###################
# initializations #
###################

g = Goslate()
stopwords = pickle.load(open("data/stopwords_dict", 'rb'))
psl = PublicSuffixList(open("data/public_suffix_list.dat", encoding="utf8"))
document_frequencies = {}
with open("data/count_1w.txt") as f:
    for line in f:
        key, value = line.strip().split()
        document_frequencies[key] = int(value)


#########
# UTILS #
#########


def cleanString(s):
    s = unidecode(s)
    s = re.sub('\n', ' ', s)
class URL(object):
    BLOCKEXT = [
        'a3c', 'ace', 'aif', 'aifc', 'aiff', 'arj', 'asf', 'asx', 'attach',
        'au', 'avi', 'bin', 'cab', 'cache', 'class', 'djv', 'djvu', 'dwg',
        'es', 'esl', 'exe', 'fif', 'fvi', 'gz', 'hqx', 'ice', 'ief', 'ifs',
        'iso', 'jar', 'kar', 'mid', 'midi', 'mov', 'movie', 'mp', 'mp2',
        'mp3', 'mp4', 'mpeg', '7z', 'mpeg2', 'mpg', 'mpg2', 'mpga', 'msi',
        'pac', 'pdf', 'ppt', 'pptx', 'psd', 'qt', 'ra', 'ram', 'rm', 'rpm',
        'snd', 'svf', 'tar', 'tgz', 'tif', 'gzip', 'tiff', 'tpl', 'uff',
        'wav', 'wma', 'wmv', 'doc', 'docx', 'db', 'jpg', 'png', 'bmp',
        'svg', 'gif', 'jpeg', 'css', 'js', 'cur', 'ico', 'zip', 'txt',
        'apk', 'dmg', 'xml', 'jar', 'class', 'torrent'
    ]
    BLOCKHOST = ['mirrors.aliyun.com', 'code.taobao.org']
    # PUBLIC_SUFFIX_LIST_URL = 'http://publicsuffix.org/list/public_suffix_list.dat'
    PSL = PublicSuffixList(codecs.open(PSL_FILE_PATH, encoding='utf8'))

    def __init__(self, url):
        self.valid = True
        self.urlstring = self.normalize_url(url)
        if not self.urlstring:
            self.valid = False
        self._p = urlparse.urlparse(self.urlstring)

    @staticmethod
    def normalize_url(url):
        """Normalize a raw URL string to an absolute http:// URL."""
        # only hostname
        if '/' not in url:
            return 'http://{}'.format(url)
        p = urlparse.urlparse(url)
        # www.test.com/index.php
        # exclude /xxxxx/index.php
        if not p.netloc:
            if url.startswith('/'):
                # /xxxxx/index.php
                return ''
            else:
                # www.test.com/index.php
                return 'http://{}'.format(url)
        # //www.test.com/index.php
        if not p.scheme:
            url = urlparse.urlunparse(
                ('http', p.netloc, p.path or '/', p.query, p.params,
                 p.fragment))
        return url

    @property
    def scheme(self):
        return self._p.scheme

    @property
    def netloc(self):
        return self._p.netloc

    @property
    def hostname(self):
        return self._p.hostname

    @property
    def domain(self):
        return self.PSL.get_public_suffix(self.hostname)

    @property
    def path(self):
        # http://www.test.com => self._p.path=''
        return self._p.path or '/'

    @property
    def path_without_file(self):
        return self.path[:self.path.rfind('/') + 1]

    @property
    def filename(self):
        return self.path[self.path.rfind('/') + 1:]

    @property
    def extension(self):
        fname = self.filename
        extension = fname[fname.rfind('.') + 1:]
        if extension == fname:
            return ''
        else:
            return extension

    @property
    def querystring(self):
        return self._p.query

    @property
    def querydict(self):
        # removed keep_blank_values=True, as URLs like the one below caused
        # duplicate scans:
        # /Common/common/captcha?0.610851539997384 => querydict = {'0.610851539997384': ''}
        return dict(urlparse.parse_qsl(self._p.query))

    @property
    def fragment(self):
        return self._p.fragment

    @property
    def index_page(self):
        return urlparse.urljoin(self.urlstring, '/', allow_fragments=False)

    @property
    def pattern(self):
        """Scheme + netloc + the normalized path/query pattern."""
        return urlparse.urlunsplit(
            (self.scheme, self.netloc, self.path_querystring_pattern, '',
             ''))

    @property
    def path_querystring_pattern(self):
        """Path with digit runs generalized, plus sorted query keys."""
        # TODO url pattern
        path_pattern = re.sub('\d+', '{digit}', self.path)
        query_params = '<>'.join(sorted(self.querydict.keys()))
        pattern = '{}?{}'.format(
            path_pattern, query_params) if query_params else path_pattern
        return pattern

    @property
    def blocked(self):
        return (self.extension.lower() in URL.BLOCKEXT
                or self.hostname.lower() in URL.BLOCKHOST)
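A brief sketch of how the class above is meant to be driven, assuming PSL_FILE_PATH points at a local copy of the public suffix list; the expected values in the comments follow from the properties shown:

u = URL('www.test.com/item/42?id=7&page=3')
print(u.urlstring)   # http://www.test.com/item/42?id=7&page=3
print(u.domain)      # test.com
print(u.pattern)     # http://www.test.com/item/{digit}?id<>page
print(u.blocked)     # False (no blocked extension or host)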
def open_public_suffix_list(file_dir=''):
    global PSL_FILE
    global PSL
    PSL_FILE = codecs.open(pjoin(file_dir, 'public_suffix_list.dat'),
                           encoding='utf8')
    PSL = PublicSuffixList(PSL_FILE)
def get_org_domain(domain):
    fn = get_suffix_list_file_name()
    with open(fn) as suffixList:
        psl = PublicSuffixList(suffixList)
        return psl.get_public_suffix(domain)
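A minimal usage sketch, assuming get_suffix_list_file_name() resolves to a standard copy of the public suffix list; get_public_suffix collapses a hostname to its registered (organizational) domain:

print(get_org_domain("mail.corp.example.co.uk"))  # example.co.uk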
import json
from code.domain import readDomains, clusterDomains, group2file
from code.secondDomain import saveDGA
import pandas as pd
from code.tools import oneHotFeature, checkip
from sklearn.cluster import DBSCAN
from publicsuffix import PublicSuffixList
from sklearn import preprocessing
import csv
import codecs

psl_file = codecs.open('suffix.dat', encoding='utf8')
psl = PublicSuffixList(psl_file)


def makeGraphFile():
    readDomains()
    clusterDomains(json.load(open('data/noRegisterBlackTwoLevel.txt')),
                   'data/featureCluster.txt')
    group2file()
    saveDGA()


def addToSetMap(data, key, val):
    if key not in data:
        data[key] = set()
    data[key].add(val)


def readDGADomains():
    domainMap = {}
    filenames = [
def get_suffix(self):
    suffix_list = fetch()
    psl = PublicSuffixList(suffix_list)
    return psl
"Too many simulataneous connections from your host", "Please try again later.", "You have been banned for abuse.", "has exceeded the established limit", "WHOIS LIMI", "Still in grace period, wait", "Permission denied.") _tld_to_whois = dict() with open("datasources/whois-servers.txt", "r") as whois_servers: for line in whois_servers: if line.startswith(';'): continue parts = line.split(' ') _tld_to_whois['.' + parts[0].strip()] = parts[1].strip() _psl = PublicSuffixList( input_file=codecs.open("datasources/effective_tld_names.dat", "r", "utf8")) def _whois_lookup(sServer, sDomain): """ Perform the network connection to the Whois Server and query for the given domain. @param sServer: The hostname of the whois server to query. @param sDomain: The domain to query for. @return: The whois result string. """ s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) s.settimeout(5) try: s.connect((sServer, 43))
from urlparse import urlsplit

from marshmallow import Schema, fields
from publicsuffix import PublicSuffixList

PSL = PublicSuffixList()


class ImageSchema(Schema):
    height = fields.Int(allow_none=True)
    url = fields.Url(allow_none=True)
    width = fields.Int(allow_none=True)


class EmbedlyURLSchema(Schema):
    description = fields.Str(allow_none=True)
    favicon_url = fields.Url(allow_none=True)
    images = fields.Nested(ImageSchema, many=True)
    original_url = fields.Url(allow_none=True)
    provider_name = fields.Str(allow_none=True)
    title = fields.Str(allow_none=True)
    url = fields.Url(allow_none=True)

    def __init__(self, blocked_domains, *args, **kwargs):
        self.blocked_domains = blocked_domains
        super(EmbedlyURLSchema, self).__init__(*args, **kwargs)

    def load(self, data):
        validated = super(EmbedlyURLSchema, self).load(data)

        def get_domain(url):
def dirnames(self):
    encoded = self.encode('utf-8')
    hexdigest = md5(encoded).hexdigest()
    names = [self.parsed.scheme, self.parsed.netloc]
    names.extend(filter(None, self.parsed.path.split('/')))
    if self.parsed.query:
        names.extend(self.parsed.query.split('&'))
    names.append(hexdigest)
    return [urlquote(name, safe='')[:PC_NAME_MAX] for name in names]


#
# URL related composable helpers
# ============================================================

public_suffix_list = PublicSuffixList()


@composable
@map_if_iter
def url(obj):
    return getattr(obj, 'url', obj)


parse_url = url | map_if_iter(urlparse)
url_query = parse_url | map_if_iter(attrgetter('query'))
url_path = parse_url | map_if_iter(attrgetter('path'))
url_hostname = parse_url | map_if_iter(attrgetter('hostname'))
url_query_dict = url_query | map_if_iter(parse_qs)
url_query_list = url_query | map_if_iter(parse_qsl)
def __init__(self, procs):
    BaseStreamifier.__init__(self, procs)
    self.psl = PublicSuffixList()
def initialize_external_data(init_preload_list=None,
                             init_preload_pending=None,
                             init_suffix_list=None):
    """
    This function serves to load all of the third party external data.

    This can be called explicitly by a library, as part of the setup needed
    before calling other library functions, or called as part of running
    inspect_domains() or CLI operation.

    If values are passed in to this function, they will be assigned to be
    the cached values. This allows a caller of the Python API to manage
    cached data in a customized way. It also potentially allows clients to
    pass in subsets of these lists, for testing or novel performance
    reasons.

    Otherwise, if the --cache-third-parties=[DIR] flag specifies a
    directory, all downloaded third party data will be cached in that
    directory and used from cache on the next pshtt run instead of hitting
    the network.

    If no values are passed in, and no --cache-third-parties flag is used,
    then no cached third party data will be created or used, and pshtt will
    download the latest data from those third party sources.
    """
    global preload_list, preload_pending, suffix_list

    # The preload list should be sent in as a list of domains.
    if init_preload_list is not None:
        preload_list = init_preload_list

    # The preload_pending list should be sent in as a list of domains.
    if init_preload_pending is not None:
        preload_pending = init_preload_pending

    # The public suffix list should be sent in as a list of file lines.
    if init_suffix_list is not None:
        suffix_list = PublicSuffixList(init_suffix_list)

    # If there's a specified cache dir, prepare paths.
    # Only used when no data has been set yet for a source.
    if THIRD_PARTIES_CACHE:
        cache_preload_list = os.path.join(THIRD_PARTIES_CACHE,
                                          cache_preload_list_default)
        cache_preload_pending = os.path.join(THIRD_PARTIES_CACHE,
                                             cache_preload_pending_default)
        cache_suffix_list = os.path.join(THIRD_PARTIES_CACHE,
                                         cache_suffix_list_default)
    else:
        cache_preload_list, cache_preload_pending, cache_suffix_list = None, None, None

    # Load Chrome's latest versioned HSTS preload list.
    if preload_list is None:
        if cache_preload_list and os.path.exists(cache_preload_list):
            utils.debug("Using cached Chrome preload list.", divider=True)
            preload_list = json.loads(open(cache_preload_list).read())
        else:
            preload_list = load_preload_list()
            if cache_preload_list:
                utils.debug("Caching preload list at %s" % cache_preload_list,
                            divider=True)
                utils.write(utils.json_for(preload_list), cache_preload_list)

    # Load Chrome's current HSTS pending preload list.
    if preload_pending is None:
        if cache_preload_pending and os.path.exists(cache_preload_pending):
            utils.debug("Using cached hstspreload.org pending list.",
                        divider=True)
            preload_pending = json.loads(open(cache_preload_pending).read())
        else:
            preload_pending = load_preload_pending()
            if cache_preload_pending:
                utils.debug(
                    "Caching preload pending list at %s" % cache_preload_pending,
                    divider=True)
                utils.write(utils.json_for(preload_pending),
                            cache_preload_pending)

    # Load Mozilla's current Public Suffix list.
    if suffix_list is None:
        if cache_suffix_list and os.path.exists(cache_suffix_list):
            utils.debug("Using cached suffix list.", divider=True)
            cache_file = codecs.open(cache_suffix_list, encoding='utf-8')
            suffix_list = PublicSuffixList(cache_file)
        else:
            suffix_list, raw_content = load_suffix_list()
            if cache_suffix_list:
                utils.debug("Caching suffix list at %s" % cache_suffix_list,
                            divider=True)
                utils.write(''.join(raw_content), cache_suffix_list)
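A minimal sketch of the API-managed path described in the docstring above, where the caller supplies all three datasets itself; the data/ file path and the tiny preload lists are hypothetical:

with open("data/public_suffix_list.dat") as f:
    suffix_lines = f.read().splitlines(True)

initialize_external_data(
    init_preload_list=["example.com"],     # plain list of domains
    init_preload_pending=["example.org"],  # plain list of domains
    init_suffix_list=suffix_lines,         # list of file lines
)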
from publicsuffix import PublicSuffixList

from trustymail import trustymail

public_list = PublicSuffixList()


class Domain:

    base_domains = {}

    def __init__(self, domain_name):
        self.domain_name = domain_name

        self.base_domain_name = public_list.get_public_suffix(domain_name)

        if self.base_domain_name != self.domain_name:
            if self.base_domain_name not in Domain.base_domains:
                domain = Domain(self.base_domain_name)
                # Populate DMARC for parent.
                trustymail.dmarc_scan(domain)
                Domain.base_domains[self.base_domain_name] = domain
            self.base_domain = Domain.base_domains[self.base_domain_name]
        else:
            self.base_domain = None

        # Start off assuming the host is live unless an error tells us
        # otherwise.
        self.is_live = True

        # Keep entire record for potential future use.
        self.mx_records = []
#!/usr/bin/python
#encoding:utf-8
from publicsuffix import PublicSuffixList

domainParser = PublicSuffixList()
# print domainParser.get_public_suffix("www.example.com.cn")
# print domainParser.get_public_suffix("www.example.com.uk")
# print domainParser.get_public_suffix("jaysonhwang.sinaapp.com")
# print domainParser.get_public_suffix("1.jaysonhwang.sinaapp.com")
# print domainParser.get_public_suffix("jaysonhwang.sinaapp.com/web/1")
print domainParser.get_domain("http://192.168.0.100:8080/web")
print domainParser.get_domain("http://www.qq.com")

allow = [
    "http://www.people.com.cn",
    "http://www.xinhuanet.com",
    "http://www.qq.com",
    "http://www.163.com",
    "http://www.cntv.cn",
    "http://www.ifeng.com",
    "http://www.hexun.com",
    "http://www.sina.com.cn",
    "http://www.sohu.com",
    "http://www.dbw.cn",
]
for a in allow:
    print domainParser.get_domain(a)[0]
"""Utils for analyzing Princeton Web Census data.""" from BlockListParser import BlockListParser from ipaddress import ip_address from publicsuffix import PublicSuffixList, fetch from urllib.parse import urlparse import codecs import json PSL_CACHE_LOC = 'public_suffix_list.dat' # Execute on module load psl_cache = codecs.open(PSL_CACHE_LOC, encoding='utf8') psl = PublicSuffixList(psl_cache) el_parser = BlockListParser('easylist.txt') ep_parser = BlockListParser('easyprivacy.txt') with open('org_domains.json', 'r') as f: org_domains = json.load(f) with open('alexa_cats.json', 'r') as f: alexa_cats = json.load(f) class CensusUtilsException(Exception): pass def get_domain(url): """Strip the URL down to just a hostname+publicsuffix.
class typogen(object):
    """generate typos"""
    psl = PublicSuffixList(input_file=codecs.open(
        "datasources/effective_tld_names.dat", "r", "utf8"))
    alexa_top = {}

    def __init__(self):
        # Load up the list of TLDs
        self.lstTlds = list()
        filename = "datasources/tlds-alpha-by-domain.txt"
        with open(filename) as f:
            for line in f:
                if not line.lstrip().startswith('#'):
                    self.lstTlds.append(line.rstrip().lower())
        print("Loading confusables...", end=" ", flush=True)
        self.loadconfusables()
        print("Loading Alexa data...", end=" ", flush=True)
        with open(r'datasources\top-1m.csv') as top1m:
            for line in top1m:
                parts = line.rstrip().split(',', 1)
                if len(parts) == 2:
                    self.alexa_top[parts[1]] = int(parts[0])
        print("Done.")

    @staticmethod
    def loadkeyb(strCountry):
        keyDict = dict()
        # obviously you can have other maps here
        # I've only included this one
        filename = "datasources/keyb" + strCountry + ".txt"
        with open(filename) as f:
            for line in f:
                split = line.rstrip().split(',')
                if split[0] in keyDict:
                    keyDict[split[0]].append(split[1])
                else:
                    keyDict[split[0]] = [split[1]]
        return keyDict

    @staticmethod
    def loadadditionalhomoglyphs():
        homoglyphs = dict()
        with open("datasources/homoglyphs.txt", "r", encoding="utf8") as f:
            for line in f:
                if not line.startswith("#"):
                    split = line.rstrip().split(',')
                    key = split[0]
                    # Filter out any glyphs which are the same as the key
                    # (case insensitive)
                    tempvalues = [
                        glyph for glyph in split[1].split(' ')
                        if glyph.lower() != key
                    ]
                    # Filter out glyphs which do not survive round trip
                    # conversion, e.g. ß -> ss -> ss
                    values = list()
                    for glyph in tempvalues:
                        try:
                            if 'a' + glyph + 'b' == codecs.decode(
                                    codecs.encode('a' + glyph + 'b', "idna"),
                                    "idna"):
                                values.append(glyph)
                        except UnicodeError:
                            # Some characters/combinations will fail the
                            # nameprep stage
                            pass
                    homoglyphs[key] = values
        return homoglyphs

    @staticmethod
    def loadconfusables():
        global _homoglyphs_confusables
        _homoglyphs_confusables = dict()
        rejected_sequences = set()
        # 'utf_8_sig' swallows the BOM at start of file
        with open("datasources/confusables.txt", "r",
                  encoding="utf_8_sig") as f:
            for line in f:
                # If line contains more than whitespace and isn't a comment
                if line.strip() and not line.startswith("#"):
                    split = line.split(';', maxsplit=2)
                    # Parse the left hand side of the pairing
                    unihex = split[0].split(' ')[0]
                    part0 = chr(int(unihex, 16))
                    if part0 in rejected_sequences:
                        continue
                    # Parse the right hand side of the pairing
                    part1 = ''
                    for unihex in split[1].strip().split(' '):
                        part1 += chr(int(unihex, 16))
                    if part1 in rejected_sequences:
                        continue
                    # Skip pairs already in the _homoglyphs dict
                    if part0 in _homoglyphs_confusables and \
                            part1 in _homoglyphs_confusables[part0]:
                        continue
                    try:
                        # Filter out glyphs which do not survive round trip
                        # conversion, e.g. ß -> ss -> ss
                        if 'a' + part0 + 'b' != codecs.decode(
                                codecs.encode('a' + part0 + 'b', "idna"),
                                "idna"):
                            rejected_sequences.add(part0)
                            continue
                    except UnicodeError:
                        # Some characters/combinations will fail the
                        # nameprep stage
                        rejected_sequences.add(part0)
                        continue
                    try:
                        # Filter out glyphs which do not survive round trip
                        # conversion, e.g. ß -> ss -> ss
                        if 'a' + part1 + 'b' != codecs.decode(
                                codecs.encode('a' + part1 + 'b', "idna"),
                                "idna"):
                            rejected_sequences.add(part1)
                            continue
                    except UnicodeError:
                        # Some characters/combinations will fail the
                        # nameprep stage
                        rejected_sequences.add(part1)
                        continue
                    # Include left to right pair mapping in the dict
                    if part0 not in _homoglyphs_confusables:
                        _homoglyphs_confusables[part0] = set()
                    _homoglyphs_confusables[part0].add(part1)
                    # Include right to left pair mapping in the dict
                    if part1 not in _homoglyphs_confusables:
                        _homoglyphs_confusables[part1] = set()
                    _homoglyphs_confusables[part1].add(part0)

    def is_domain_valid(self, domain):
        # Ensure it's in the correct character set
        if not re.match('^[a-z0-9.-]+$', domain):
            return False
        # Ensure the TLD is sane
        elif domain[domain.rfind(".") + 1:] not in self.lstTlds:
            return False
        # Hostnames can't start or end with a -
        elif ".-" in domain or "-." in domain or domain.startswith("-"):
            return False
        # Ensure the locations of dots are sane
        elif ".." in domain or domain.startswith("."):
            return False
        else:
            return True

    @staticmethod
    def bitflipbyte(inputbyte):
        """
        Flips the lowest 7 bits in the given input byte/int to build a list
        of mutated values.

        @param inputbyte: The byte/int to bit flip
        @return: A list of the mutated values.
        """
        result = list()
        mask = 1
        # As we know we're flipping ASCII, only do the lowest 7 bits
        for i in range(0, 7):
            result.append(inputbyte ^ mask)
            mask <<= 1
        return result

    @staticmethod
    def generate_country_code_doppelgangers(strHost):
        result = list()
        with open("datasources/countrynames.txt", 'r',
                  encoding="UTF-8") as countrynames:
            for line in countrynames:
                if not line.startswith('#'):
                    parts = line.split(';', maxsplit=2)
                    # 2 letter country code subdomain, but without the dot
                    result.append(parts[0].strip().lower() + strHost)
                    # 3 letter country code subdomain, but without the dot
                    result.append(parts[1].strip().lower() + strHost)
        return result

    @staticmethod
    def generate_subdomain_doppelgangers(strHost):
        result = list()
        with open("datasources/subdomains.txt", 'r') as subdomains:
            for subdomain in subdomains:
                result.append(subdomain.strip() + strHost)
        return result

    @staticmethod
    def generate_extra_dot_doppelgangers(strHost):
        result = list()
        for idx, char in enumerate(strHost):
            # A dot instead of a character
            result.append(strHost[:idx] + '.' + strHost[idx + 1:])
            # A dot inserted between characters
            result.append(strHost[:idx] + '.' + strHost[idx:])
        return result

    @staticmethod
    def bitflipstring(strInput):
        """
        Flips the lowest 7 bits in each character of the given string to
        build a list of mutated values.

        @param strInput: The string to bit flip
        @return: A list of the mutated values.
        """
        result = list()
        i = 0
        for character in strInput:
            flippedchars = typogen.bitflipbyte(character.encode("UTF-8")[0])
            for flippedchar in flippedchars:
                result.append(strInput[:i] + chr(flippedchar) +
                              strInput[i + 1:])
            i += 1
        return result

    @staticmethod
    def generate_missing_character_typos(strHost):
        # Missing characters
        result = list()
        idx = 0
        while idx < len(strHost):
            strTypo = strHost[0:idx] + strHost[idx + 1:]
            idx += 1
            result.append(strTypo)
        return result

    @staticmethod
    def generate_duplicate_character_typos(strHost):
        # Duplicate characters
        result = list()
        idx = 0
        while idx < len(strHost):
            strHostList = list(strHost)
            if strHostList[idx] != '.':
                strHostList.insert(idx, strHostList[idx])
                strTypo = "".join(strHostList)
                result.append(strTypo)
            idx += 1
        return result

    @staticmethod
    def generate_miskeyed_typos(strHost, strCountry):
        # Swap to a surrounding key for each character
        result = list()
        # Load keyboard mapping
        typoDict = typogen.loadkeyb(strCountry)
        for idx, char in enumerate(strHost):
            if char in typoDict:
                for replacement_char in typoDict[char]:
                    result.append(strHost[:idx] + replacement_char +
                                  strHost[idx + 1:])
        return result

    @staticmethod
    def generate_homoglyph_confusables_typos(strHost):
        # Swap characters to similar looking characters, based on Unicode's
        # confusables.txt
        results = list()
        global _homoglyphs_confusables
        # Replace each homoglyph subsequence in strHost with each
        # replacement subsequence associated with it
        for homoglyph_subsequence in _homoglyphs_confusables:
            idx = 0
            while 1:
                idx = strHost.find(homoglyph_subsequence, idx)
                if idx > -1:
                    for replacement_subsequence in _homoglyphs_confusables[
                            homoglyph_subsequence]:
                        # Add with just one change
                        newhostname = (
                            strHost[:idx] + replacement_subsequence +
                            strHost[idx + len(homoglyph_subsequence):])
                        try:
                            results.append(
                                str(codecs.encode(newhostname, "idna"),
                                    "ascii"))
                        except UnicodeError:
                            # This can be caused by domain parts which are
                            # too long for IDNA encoding, so just skip it
                            pass
                        # Add with all occurrences changed
                        newhostname = strHost.replace(
                            homoglyph_subsequence, replacement_subsequence)
                        try:
                            if newhostname not in results:
                                results.append(
                                    str(codecs.encode(newhostname, "idna"),
                                        "ascii"))
                        except UnicodeError:
                            # This can be caused by domain parts which are
                            # too long for IDNA encoding, so just skip it
                            pass
                    idx += len(homoglyph_subsequence)
                else:
                    break
        return results

    @staticmethod
    def generate_additional_homoglyph_typos(strHost):
        # Swap characters to similar looking characters, based on
        # homoglyphs.txt
        result = list()
        # Load homoglyph mapping
        homoglyphs = typogen.loadadditionalhomoglyphs()
        for idx, char in enumerate(strHost):
            if char in homoglyphs:
                for replacement_char in homoglyphs[char]:
                    newhostname = (strHost[:idx] + replacement_char +
                                   strHost[idx + 1:])
                    try:
                        result.append(
                            str(codecs.encode(newhostname, "idna"),
                                "ascii"))
                    except UnicodeError:
                        # This can be caused by domain parts which are too
                        # long for IDNA encoding, so just skip it
                        pass
        return result

    @staticmethod
    def generate_miskeyed_addition_typos(strHost, strCountry):
        # Add a surrounding key either side of each character
        result = list()
        # Load keyboard mapping
        typoDict = typogen.loadkeyb(strCountry)
        for idx, char in enumerate(strHost):
            if char in typoDict:
                for replacement_char in typoDict[char]:
                    result.append(strHost[:idx + 1] + replacement_char +
                                  strHost[idx + 1:])
                    result.append(strHost[:idx] + replacement_char +
                                  strHost[idx:])
        return result

    @staticmethod
    def generate_miskeyed_sequence_typos(strHost, strCountry):
        # Repeated surrounding keys for any character sequences in the
        # string
        result = list()
        # Load keyboard mapping
        typoDict = typogen.loadkeyb(strCountry)
        idx = 0
        while idx < len(strHost):
            char = strHost[idx]
            # Loop through sequences of the same character, counting the
            # sequence length
            sequence_len = 1
            while idx + 1 < len(strHost) and strHost[idx + 1] == char:
                sequence_len += 1
                idx += 1
            # Increment the index at this point to make the maths easier if
            # we found a sequence
            idx += 1
            # Replace the whole sequence
            if sequence_len > 1:
                if char in typoDict:
                    for replacement_char in typoDict[char]:
                        result.append(strHost[:idx - sequence_len] +
                                      (replacement_char * sequence_len) +
                                      strHost[idx:])
        return result

    @staticmethod
    def generate_transposed_character_typos(strHost):
        result = list()
        for idx in range(0, len(strHost) - 1):
            result.append(strHost[:idx] + strHost[idx + 1:idx + 2] +
                          strHost[idx:idx + 1] + strHost[idx + 2:])
        return result

    @staticmethod
    def is_valid_rfc3491(domainname):
        """
        Checks if the given domain would pass processing by nameprep
        unscathed.

        :param domainname: The unicode string of the domain name.
        :return: True if the unicode is valid (i.e. only uses Unicode 3.2
                 code points)
        """
        valid_rfc3491 = True
        for char in domainname:
            if stringprep.in_table_a1(char):
                valid_rfc3491 = False
                break
        return valid_rfc3491

    @staticmethod
    def is_ascii(domainname):
        return str(codecs.encode(domainname, "idna"), "ascii") == domainname

    @staticmethod
    def is_in_charset(domainname, icharsetamount):
        if icharsetamount == 100:
            return True
        elif icharsetamount == 50:
            return typogen.is_valid_rfc3491(domainname)
        elif icharsetamount == 0:
            return typogen.is_ascii(domainname)

    def generatetyposv2(self,
                        strHost,
                        strCountry="gb",
                        bTypos=True,
                        iTypoIntensity=100,
                        bTLDS=False,
                        bBitFlip=True,
                        bHomoglyphs=True,
                        bDoppelganger=True,
                        bOnlyAlexa=False,
                        bNeverAlexa=False,
                        icharsetamount=100):
        """
        Generate the typos.

        @param strHost The hostname to generate typos for
        @param strCountry The country code of the keyboard to use when
               generating miskeyed typos
        @param bTypos Flag to indicate that typos should be generated
        @param iTypoIntensity A percentage of how intense the typo
               generation should be
        @param bTLDS Flag to indicate that the TLDs should be swapped
        @param bBitFlip Flag to indicate that the hostname should be
               bitflipped
        @param bHomoglyphs Flag to indicate that homoglyphs should be
               generated
        @param bDoppelganger Flag to indicate that domain doppelgangers
               should be generated
        @param bOnlyAlexa Flag to indicate that only results which appear
               in the Alexa top 1m domains should be returned
        @param bNeverAlexa Flag to indicate that results which are in the
               Alexa top 1m domains should not be returned
        """
        # Result list of typos
        lstTypos = []
        if bBitFlip:
            lstTypos += self.bitflipstring(strHost)
        if bTypos:
            # Quick:
            lstTypos += self.generate_missing_character_typos(strHost)
            lstTypos += self.generate_duplicate_character_typos(strHost)
            # Balanced:
            if iTypoIntensity > 0:
                lstTypos += self.generate_miskeyed_typos(strHost, strCountry)
                lstTypos += self.generate_miskeyed_sequence_typos(
                    strHost, strCountry)
            # Rigorous:
            if iTypoIntensity > 50:
                lstTypos += self.generate_transposed_character_typos(strHost)
                lstTypos += self.generate_miskeyed_addition_typos(
                    strHost, strCountry)
        if bTLDS:
            public_suffix = self.psl.get_public_suffix(strHost)
            no_suffix = public_suffix[:public_suffix.find('.')] + '.'
            # Add each TLD
            for gtld in self.lstTlds:
                newHost = no_suffix + gtld
                lstTypos.append(newHost)
        if bHomoglyphs:
            lstTypos += self.generate_homoglyph_confusables_typos(strHost)
            lstTypos += self.generate_additional_homoglyph_typos(strHost)
        if bDoppelganger:
            # Commented out until a slider is put in - the following line
            # results in ssssllloooowwww searches
            #lstTypos += self.generate_country_code_doppelgangers(strHost)
            lstTypos += self.generate_subdomain_doppelgangers(strHost)
            lstTypos += self.generate_extra_dot_doppelgangers(strHost)

        uniqueTypos = set(lstTypos)

        # Remove any invalid typos
        for typo in copy.copy(uniqueTypos):
            if not self.is_domain_valid(typo):
                uniqueTypos.remove(typo)
            elif bOnlyAlexa and typo not in self.alexa_top:
                uniqueTypos.remove(typo)
            elif bNeverAlexa and typo in self.alexa_top:
                uniqueTypos.remove(typo)

        # Add the original domain for comparison purposes and to ensure we
        # have at least one result (set.add is idempotent, so no error
        # handling is needed here)
        uniqueTypos.add(strHost)

        unicode_typos = sorted([
            codecs.decode(asciiHost.encode(), "idna")
            for asciiHost in uniqueTypos
        ])
        for typo in copy.copy(unicode_typos):
            if not typogen.is_in_charset(typo, icharsetamount):
                unicode_typos.remove(typo)
        return unicode_typos
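A minimal usage sketch of the class above, assuming the datasources/ files (tlds-alpha-by-domain.txt, top-1m.csv, confusables.txt, keyboard maps, etc.) are present alongside the script:

tg = typogen()

# ASCII-only typos plus TLD swaps for example.com, at full intensity.
typos = tg.generatetyposv2("example.com",
                           strCountry="gb",
                           iTypoIntensity=100,
                           bTLDS=True,
                           icharsetamount=0)
for t in typos[:20]:
    print(t)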
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

'''This is a module for dealing with urls. In particular, sanitizing them.'''

import re
import codecs
import urllib
try:
    import urlparse
except ImportError:  # pragma: no cover
    # Python 3 support
    import urllib.parse as urlparse

# For publicsuffix utilities
from publicsuffix import PublicSuffixList

psl = PublicSuffixList()

# Some codecs that we'll need
IDNA = codecs.lookup('idna')
UTF8 = codecs.lookup('utf-8')
ASCII = codecs.lookup('ascii')
W1252 = codecs.lookup('windows-1252')

# The default ports associated with each scheme
PORTS = {'http': 80, 'https': 443}


def parse(url, encoding='utf-8'):
    '''Parse the provided url string and return an URL object'''
    return URL.parse(url, encoding)
class UrlUtil:
    """Wraps a handful of URL-related operations."""
    # psl_file = fetch()  # loads https://publicsuffix.org/list/public_suffix_list.dat
    psl_file = codecs.open(os.path.abspath(os.path.dirname(__file__)) +
                           os.path.sep + 'public_suffix_list.dat',
                           encoding='utf8')
    psl = PublicSuffixList(psl_file)

    @classmethod
    def get_protocol(cls, url):
        """Extract the URL's scheme."""
        parse_result = parse.urlparse(url=url)
        return parse_result[0].strip()  # strip just in case

    @classmethod
    def get_domain(cls, url):
        """Extract the URL's host."""
        parse_result = parse.urlparse(url=url)
        # Some links carry trailing whitespace after the host; Chrome still
        # parses them correctly, oddly enough.
        return parse_result[1].strip()

    @classmethod
    def get_top_domain(cls, url):
        """Extract the URL's registered (first-level) domain."""
        domain = UrlUtil.get_domain(url)
        domain = domain.split(':')[0]  # drop the port
        ip_pattern = r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}"
        if re.match(ip_pattern, domain):
            return domain
        return cls.psl.get_public_suffix(domain)

    @classmethod
    def get_path(cls, url):
        """Extract the directory part of the URL's path (file name removed)."""
        splites = url.split('/')
        if len(splites) == 3:
            return url
        elif len(splites) == 4 and splites[-1] == "":
            return url[:-1]
        return "/".join(url.split('/')[:-1])

    @classmethod
    def is_gov_or_edu(cls, url):
        """Check whether the URL belongs to a government or education domain."""
        domain = UrlUtil.get_domain(url)
        if len(domain) > 7 and domain[-7:] in (".gov.cn", ".edu.cn"):
            return True
        return False

    @classmethod
    def top_domain_is_gov_or_edu(cls, top_domain):
        """Check whether the registered domain is a government or education domain."""
        if top_domain in ("gov.cn", "edu.cn"):
            return True
        return False

    @classmethod
    def get_url_suffix(cls, url):
        """Get the page's file extension (e.g. html, js, css)."""
        path = urllib.parse.urlsplit(url)[2]
        if '.' not in path.split('/')[-1]:
            return ""
        return path.split('.')[-1]
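A quick sketch of the classmethods above, assuming public_suffix_list.dat sits next to this module; expected values follow from the parsing rules shown:

print(UrlUtil.get_protocol("https://news.example.com.cn:8080/a/b/index.html"))    # https
print(UrlUtil.get_top_domain("https://news.example.com.cn:8080/a/b/index.html"))  # example.com.cn
print(UrlUtil.get_url_suffix("https://news.example.com.cn/a/b/index.html"))       # html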
class IPMeta(models.Model):
    ip = models.GenericIPAddressField(db_index=True)
    created = models.DateTimeField(auto_now_add=True)
    invalidated = models.DateTimeField(blank=True, null=True, db_index=True)
    last_updated = models.DateTimeField(auto_now=True)
    dnsloc = models.CharField(max_length=256, blank=True, null=True)
    hostname = models.CharField(max_length=256, blank=True, null=True)
    ##is_anycast = models.NullBooleanField( blank=True, null=True )
    psl = PublicSuffixList()

    def save(self, **kwargs):
        '''
        IPMeta save method, does lookups if object isn't saved yet
        '''
        if not self.id:
            ## do dnsloc and hostname lookups
            try:
                host_resolve = dns.resolver.query(
                    dns.reversename.from_address(self.ip), 'PTR')
                h = str(host_resolve.response.answer[0].items[0])
                h = h.rstrip('.')
                self.hostname = h
            except:
                # it's perfectly fine for a reverse not to exist
                pass
            if self.hostname:
                try:
                    loc_resolve = dns.resolver.query(self.hostname, 'LOC')
                    self.dnsloc = str(loc_resolve[0])
                except:
                    # it's perfectly fine for a loc record not to exist
                    pass
        super(self.__class__, self).save(**kwargs)

    def info2json(self, **kwargs):
        '''
        convert all info about this IP into a json structure.
        optional arguments accepted are
            'lat': latitude, to georestrict by
            'lon': longitude, to georestrict by
            'min_rtt': rtt, to georestrict by
        '''
        do_rtt_constraint = False
        try:
            lat = kwargs['lat']
            lon = kwargs['lon']
            min_rtt = kwargs['min_rtt']
            do_rtt_constraint = True
        except:
            pass

        DNSLOC_WEIGHT = 0.95
        HOSTNAME_WEIGHT = 0.90

        # 0  1  2      3 4 5  6     7
        # 48 51 21.953 N 2 23 0.143 E 10.00m 1.00m 10000.00m 10.00m"
        def _dnsloc2ll(loc_str):
            out = {'str': loc_str}
            fields = loc_str.split()
            if len(fields) >= 7:
                lat = float(fields[0]) + float(fields[1]) / 60 + \
                    float(fields[2]) / (60 * 60)
                if fields[3] == 'S':
                    lat = -lat
                lon = float(fields[4]) + float(fields[5]) / 60 + \
                    float(fields[6]) / (60 * 60)
                if fields[7] == 'W':
                    lon = -lon
                out['lat'] = lat
                out['lon'] = lon
            return out

        info = {}
        name2loc = []
        crowdsourced = []
        info['ip'] = self.ip
        info['hostname'] = self.hostname
        info['domainname'] = None
        try:
            info['domainname'] = self.__class__.psl.get_public_suffix(
                self.hostname)
        except:
            pass
        if self.dnsloc:
            info['dnsloc'] = _dnsloc2ll(self.dnsloc)
        #gc = IPGeoConstraint.objects.filter(ipmeta = self)
        #if len( gc ) == 1:
        #    info['area'] = json.loads( gc[0].area.geojson )

        ## add a suggestions array that contains the ordered list of
        ## suggested lat/lon
        suggestions = []
        name2loc = self.name2loc(**kwargs)
        if 'dnsloc' in info:
            if not do_rtt_constraint or \
                    openipmap.geoutils.can_one_travel_distance_in_rtt(
                        lat, lon, info['dnsloc']['lat'],
                        info['dnsloc']['lon'], min_rtt):
                # only add this if this is possible RTT-wise
                suggestions.append({
                    'lat': info['dnsloc']['lat'],
                    'lon': info['dnsloc']['lon'],
                    'reason': 'dnsloc',
                    'weight': DNSLOC_WEIGHT,
                })
        total_pop = 0
        for n in name2loc:
            total_pop += n['pop']
        for n in name2loc:
            # lat/lon already there
            n['weight'] = HOSTNAME_WEIGHT * n['pop'] / total_pop
            n['reason'] = 'hostname'
            suggestions.append(n)
        info['suggestions'] = suggestions
        crowdsourced.extend(IPRule.get_crowdsourced(self.ip))
        if self.hostname:
            crowdsourced.extend(HostnameRule.get_crowdsourced(self.hostname))
        info['crowdsourced'] = crowdsourced
        return info

    def name2loc(self, poly_geoconstraint=None, **kwargs):
        '''
        try to figure out loc, based on name
        optional arguments accepted are
            'lat': latitude, to georestrict by
            'lon': longitude, to georestrict by
            'min_rtt': rtt, to georestrict by
        '''
        ## TODO: add polygon confinement?
        nr_results = 10  ## configurable?
        do_rtt_constraint = False
        try:
            lat = kwargs['lat']
            lon = kwargs['lon']
            min_rtt = kwargs['min_rtt']
            do_rtt_constraint = True
        except:
            pass
        # this should be configurable/tags and/or have low confidence value
        tag_blacklist = set([
            'rev', 'cloud', 'clients', 'demarc', 'ebr', 'pool', 'bras',
            'core', 'static', 'router', 'net', 'bgp', 'pos', 'out', 'link',
            'host', 'infra', 'ptr', 'isp', 'adsl', 'rdns', 'tengig',
            'tengige', 'tge', 'rtr', 'shared', 'red', 'access', 'tenge',
            'gin', 'dsl', 'cpe'
        ])
        if not self.hostname:
            return []
        name = self.hostname.rstrip('.')
        suf = self.__class__.psl.get_public_suffix(name)
        rest = ''
        tokens = []
        if suf != name:
            rest = name[0:len(name) - len(suf) - 1]
            rest = rest.lower()
            ## support for additional tokenization?
            tokens = re.split(r'[^a-zA-Z]+', rest)
            ## filter by token-length (for now), TODO make configurable?
            tokens = [t for t in tokens if len(t) >= 3]
            ## remove blacklisted tokens
            tokens = [t for t in tokens if not t in tag_blacklist]
        matches = {}

        def add_to_matches(g, token, is_abbrev, **kwargs):
            if not g.loc.id in matches:
                ## check geoconstraints
                if do_rtt_constraint and \
                        not openipmap.geoutils.can_one_travel_distance_in_rtt(
                            lat, lon, g.loc.lat, g.loc.lon, min_rtt):
                    return
                matches[g.loc.id] = {
                    'loc_id': g.loc.id,
                    'pop': g.loc.pop,
                    'count': g.loc.count,
                    'name': str(g.loc),
                    'lat': g.loc.lat,
                    'lon': g.loc.lon,
                    'token': set(),
                    'kind': set()
                }
                if poly_geoconstraint:
                    if poly_geoconstraint.contains(g.loc.point):
                        matches[g.loc.id] = {'in_constraint': True}
            matches[g.loc.id]['token'].add(token)
            ## this loses the link between the token and the geoalias-kind
            ## (for now)
            if is_abbrev:
                matches[g.loc.id]['kind'].add('abbrev-' + g.kind)
            else:
                matches[g.loc.id]['kind'].add(g.kind)

        for t in tokens:
            for ga in Geoalias.objects.filter(word=t):
                add_to_matches(ga, t, False, **kwargs)
        if len(matches) == 0:
            #print "little on strict match, trying like"
            for t in tokens:
                ## 't' can't be anything but a-zA-Z so no SQL injection
                ## should be possible
                sql_like_chars = '%%'.join(list(t))
                sql_like_chars += '%%'  # 'a%m%s%'
                sql = "SELECT id FROM openipmap_geoalias WHERE word LIKE '%s'" % (
                    sql_like_chars)
                for ga in Geoalias.objects.raw(sql):
                    add_to_matches(ga, t, True, **kwargs)
        ## this sorts, first by 'count' (=number of hostnames the DB
        ## already has for this location) then by 'population' of location
        mk = sorted(
            matches.keys(),
            reverse=True,
            key=lambda x: (matches[x]['count'], matches[x]['pop'])
        )[0:nr_results]  ## max 10
        result = []
        for m in mk:
            entry = matches[m]
            # flatten
            entry['token'] = list(entry['token'])
            entry['kind'] = list(entry['kind'])
            result.append(entry)
        return result

    @classmethod
    def gather_from_msm(cls, msm_id, interval=3600):
        #@@ todo make these configurable:
        limit = 10
        stop = int(time.time())
        start = stop - interval
        msm_url = "https://atlas.ripe.net/api/v1/measurement/%d/result/?start=%d&stop=%d&limit=%d&format=txt" % (
            msm_id, start, stop, limit)
        print msm_url
        url_fh = urllib2.urlopen(msm_url)
        ips = {}
        for line in url_fh:
            try:
                msm = json.loads(line)
                prb_id = msm['prb_id']
                for msm_res in msm['result']:
                    hop_nr = msm_res['hop']
                    for hop_res in msm_res['result']:
                        if 'from' in hop_res:
                            ip = hop_res['from']
                            rtt = hop_res['rtt']
                            if not ip in ips:
                                ips[ip] = 1
            except:
                print "oops on %s" % (line)
        timediff = datetime.now() - timedelta(days=30)
        for ip in ips:
            ## figure out if there was a recent Meta fetch
            try:
                ipm = cls.objects.filter(ip=ip).filter(
                    created__gte=timediff).order_by('-created')
                if len(ipm) > 0:
                    i = ipm[0]
                else:
                    ## insert it (does autolookups)
                    i = IPMeta()
                    i.ip = ip
                    i.save()
                print "%s %s %s" % (i.ip, i.hostname, i.dnsloc)
            except:
                pass
class URI(object):
    """
    Core URI class as specified in RFC 3986.
    """
    suffix_list = PublicSuffixList()

    def __init__(self, scheme=None, authority=None, path='', query=None,
                 fragment=None):
        """
        Constitute a URI from various constituent parts. Requires a path,
        but other arguments are optional. Existing URIs will be
        percent-decoded as they are read, but re-encoded when printed or
        when certain objects (such as query or authority strings) are
        retrieved.

        To create the appropriate object for the following URI:

        >>> demo_uri = 'https://www.google.com/search?q=setter+python&oq=setter+python&aqs=chrome..69i57j0l3.9438j0&sourceid=chrome&ie=UTF-8'

        Either use:

        >>> x = URI.parse_uri(demo_uri)

        or initialize individual components, e.g.:

        >>> x = URI(path='/search', scheme='https', authority='www.google.com', query='q=setter+python&oq=setter+python&aqs=chrome..69i57j0l3.9438j0&sourceid=chrome&ie=UTF-8')

        Additional query arguments can be easily added as follows:

        >>> x.set_query_arg('bananas', 'are_yummy!')
        """
        self.scheme = scheme
        self._userinfo, self._host, self._port = None, None, None
        self.authority = authority
        self.path = path
        self.query_dict = {}
        self.query = query
        self.fragment = fragment

    def __repr__(self, normalize=False):
        """
        Retrieves the string representation of the URI.

        Assembles the various URI components into a string representation,
        complete with percent-encoding. Can be normalized, which compresses
        dot-segments.

        Args:
            normalize: removes dot-segments
        """
        result = ""
        if self.scheme:
            if normalize:
                result += self.scheme.lower() + ":"
            else:
                result += self.scheme + ":"
        if self.authority:
            self._build_authority(normalize=normalize)
            result += '//' + self._authority
        if normalize:
            if not self.path:
                result += '/'
            else:
                result += self._remove_dot_segments(self.path)
        else:
            result += self.path
        if self.query:
            result += '?' + self.query
        if self.fragment:
            result += '#' + self.fragment
        # Go through and uppercase any percent-encodings
        if normalize:
            tmp = result
            while tmp.rfind('%') != -1:
                pos = tmp.rfind('%')
                tmp = tmp[:pos]
                result = result[:pos] + result[pos:pos + 3].upper() + \
                    result[pos + 3:]
        return result

    def __eq__(self, other):
        if self.__class__ == other.__class__:
            return (self.__repr__(normalize=True) ==
                    other.__repr__(normalize=True))
        # In case we're just comparing against a URI string
        elif type(other) == str:
            return self.__repr__() == other.lower()
        else:
            return False

    @property
    def authority(self):
        """
        Retrieves a percent-encoded authority string, if one exists.
        """
        self._build_authority()
        return self._authority

    @authority.setter
    def authority(self, authority):
        """
        Sets the authority string and parses the userinfo, host and port.
        """
        self._authority = authority
        self._parse_authority()

    @property
    def domain(self):
        """
        Returns the domain for the given URI.
        """
        if not (self._is_ipv4(self._host)
                or self._is_ipvliteral(self._host)):
            return self.suffix_list.get_public_suffix(self.host)
        else:
            return None

    @domain.setter
    def domain(self, domain):
        """
        Set the domain for the given URI.
        """
        if not (self._is_ipv4(self._host)
                or self._is_ipvliteral(self._host)):
            self.host = self.host.split(self.domain)[0] + domain
        else:
            raise Exception, "Host is an IP address, not a domain"

    @property
    def fragment(self):
        """
        Retrieves a percent-encoded fragment, if one exists.
        """
        if self._fragment:
            return self.percent_encode(self._fragment,
                                       regexes.FRAGMENT_REGEX)
        else:
            return self._fragment

    @fragment.setter
    def fragment(self, fragment):
        """
        Sets the fragment.
        """
        if fragment:
            self._fragment = self.percent_decode(fragment)
        else:
            self._fragment = None

    @property
    def host(self):
        """
        Retrieve the percent-encoded host, if one has been set.
        """
        if self._is_ipv4(self._host) or self._is_ipvliteral(self._host):
            return self._host
        else:
            return self.percent_encode(self._host,
                                       regexes.REG_NAME_ELIGIBLE_REGEX)

    @host.setter
    def host(self, host):
        """
        Set a new host for this URI.
        """
        if host == '':
            host = None
        self._host = self.percent_decode(host)

    @property
    def path(self):
        """
        Retrieve the path for this URI.
        """
        if self.scheme:
            return '/'.join([
                self.percent_encode(x, regexes.PATH_REGEX)
                for x in self._path
            ])
        else:
            return '/'.join([
                self.percent_encode(x, regexes.PATH_NOSCHEME_REGEX)
                for x in self._path
            ])

    @path.setter
    def path(self, path):
        """
        Set a new path for this URI.
        """
        if self.authority and path != '':
            if path[0] != '/':
                raise Exception, "Invalid path: when authority is present," + \
                    " path should begin with a '/' character"
        elif not self.authority:
            if path[0:2] == '//':
                raise Exception, "Invalid path: when no authority is" + \
                    " present, path cannot begin with '//'"
        self._path = [self.percent_decode(x) for x in path.split('/')]

    @property
    def port(self):
        """
        Retrieve the port, if one has been set.
        """
        return self._port

    @port.setter
    def port(self, port):
        """
        Set a new port for this URI. If a host has been defined, re-build
        the authority string, else pass (an authority string with no host
        is meaningless).

        Args:
            port: the target port
        """
        self._port = int(port)

    @property
    def query(self):
        """
        Retrieves a percent-encoded query string, if one has been set.
        """
        self._build_query()
        if self._query:
            return self.percent_encode(self._query, regexes.QUERY_REGEX)
        else:
            return None

    @query.setter
    def query(self, query):
        """
        Sets the query string.
        """
        if query:
            self._query = self.percent_decode(query)
            self._parse_query()
        else:
            self._query = None

    @property
    def tld(self):
        """
        Retrieves the top-level domain, if one has been set.
        """
        return '.'.join(self.domain.split('.')[1:])

    @tld.setter
    def tld(self, tld):
        """
        Sets the top-level domain.
        """
        self.domain = '.'.join(self.domain.split('.')[:1]) + '.' + tld

    @property
    def userinfo(self):
        """
        Retrieves the percent-encoded userinfo string, if one exists.
        """
        if self._userinfo:
            return self.percent_encode(self._userinfo,
                                       regexes.USERINFO_REGEX)
        else:
            return None

    @userinfo.setter
    def userinfo(self, userinfo):
        """
        Set a new userinfo for this URI.
        """
        if userinfo == "":
            userinfo = None
        self._userinfo = self.percent_decode(userinfo)

    def set_query_arg(self, key, value=None):
        """
        Sets a query argument.
        """
        self.query_dict[key] = value

    def get_query_arg(self, key):
        """
        Gets a query argument.
        """
        return self.query_dict[key]

    @staticmethod
    def _is_ipv4(host_string):
        """
        Checks to see if a given host string is a valid IPv4 address.
        """
        return regexes.IPV4_REGEX.search(host_string)

    @staticmethod
    def _is_ipvliteral(host_string):
        """
        Checks to see if a given host string is a valid IP-literal address.
        """
        return regexes.IPVLITERAL_REGEX.search(host_string)

    @staticmethod
    def percent_encode(string, regex):
        """
        Percent-encode a string w/ hex codes.

        Given a provided string and regex, encodes any characters that
        don't match the provided characters in the regex.

        Args:
            string: the string to be encoded
            regex: a regex listing any characters that don't need encoding
        """
        return ''.join([
            '%' + x.encode('hex') if not regex.search(x) else x
            for x in string
        ])

    @staticmethod
    def percent_decode(string):
        """
        Percent-decode a string.

        See also: percent_encode(string, regex)
        """
        return ''.join(_PercentDecoder(string))

    def _build_authority(self, normalize=False):
        """
        Build a percent-encoded authority string and set the authority
        attribute.

        Takes the userinfo, host, and port attributes and attempts to
        build a percent-encoded authority string. If the host is not set,
        returns None as a host is necessary for a valid authority string.
        """
        self._authority = ""
        if self._userinfo:
            self._authority += self.userinfo + '@'
        if self._host:
            if normalize:
                host = self.host.lower()
            else:
                host = self.host
        else:
            self.authority = None
            return
        self._authority += host
        if self.port:
            self._authority += ':' + str(self.port)

    def _build_query(self):
        """
        Build a percent-encoded query string from the query dict.
        """
        if len(self.query_dict.keys()) > 0:
            self._query = []
            for key, value in self.query_dict.iteritems():
                if value:
                    self._query.append(key + '=' + value)
                else:
                    self._query.append(key)
            self._query = '&'.join(self._query)
        else:
            self._query = None

    def _parse_authority(self):
        """
        Parses the authority attribute for userinfo, host, and port.

        Follows available regular expressions to identify userinfo, host,
        and port data. If identified, sets the corresponding attributes.
        If the host is of reg-name type (as opposed to IPv4 or an
        IP-literal), this function will also percent-decode the host.
        """
        if not self._authority:
            return
        auth_string = self._authority
        if auth_string.find('@') != -1:
            self.userinfo, auth_string = auth_string.split('@', 1)
        if self._is_ipv4(auth_string):
            search_result = self._is_ipv4(auth_string)
        elif self._is_ipvliteral(auth_string):
            search_result = self._is_ipvliteral(auth_string)
        else:
            search_result = regexes.REG_NAME_SEARCH_REGEX.search(auth_string)
        self.host = auth_string[search_result.start():search_result.end()]
        # Check for port info
        if len(auth_string) != len(self.host):
            self.port = auth_string[search_result.end() + 1:]

    def _parse_query(self):
        """
        Parse a query string into a query_dict attribute.
        """
        # str.find() returns -1 (truthy) when the separator is absent, so
        # test membership instead of the original truthiness check.
        if '&' in self._query:
            query_array = self._query.split('&')
        elif ';' in self._query:
            query_array = self._query.split(';')
        else:
            query_array = [self._query]
        self.query_dict = {}
        for element in query_array:
            try:
                key, value = element.split('=')
                value = self.percent_decode(value)
            except ValueError:
                key, value = element, None
            key = self.percent_decode(key)
            self.query_dict[key] = value

    @staticmethod
    def _remove_dot_segments(path):
        """
        Removes dot segments from a given path.
        """
        segments = path.split('/')
        compressed_path = []
        for segment in segments:
            if segment == '.':
                pass
            elif segment == '..':
                compressed_path.pop()
            else:
                compressed_path.append(segment)
        return '/'.join(compressed_path)

    def chdir(self, changepath):
        """
        Functions like the UNIX cd or chdir command.

        Args:
            changepath: the subdirectory to change to
        """
        if changepath[0] == '/':
            changepath = changepath[1:]
        if self._path[-1] == '':
            self._path.pop()
        self._path.extend(changepath.split('/'))

    @staticmethod
    def parse_uri(uri_string):
        """
        Parses a given URI using the regex provided in RFC 3986.
        """
        result = regexes.URI_REGEX.match(uri_string).groups()
        scheme, authority, path, query, fragment = \
            [result[i] for i in [1, 3, 4, 6, 8]]
        return URI(path=path, scheme=scheme, authority=authority,
                   query=query, fragment=fragment)
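A short usage sketch of the URI class above (Python 2, matching the class's syntax); the expected behavior in the comments follows from the parsing and normalization rules shown, assuming the companion regexes module is available:

u = URI.parse_uri('HTTP://www.Example.com/a/./b/../c?x=1&y#frag')
print u.host                      # www.Example.com
print u.get_query_arg('x')        # 1 (y maps to None, having no value)
print u.__repr__(normalize=True)  # scheme and host lowercased, path
                                  # compressed to /a/c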