def __init__(self, pipes=None, observers=None):
    if not observers:
        observers = []
    if not pipes:
        pipes = []
    self._pipes = pipes
    self.lock = ReadWriteLock()
    self.plumbings = [plumbing(v) for v in pipes]
    self.refresh = MDUpdate(cherrypy.engine,
                            server=self,
                            frequency=config.update_frequency)
    self.refresh.subscribe()
    self.aliases = config.aliases
    self.psl = PublicSuffixList()
    self.md = MDRepository()
    self.ready = False
    if config.autoreload:
        for f in pipes:
            cherrypy.engine.autoreload.files.add(f)
def run(fname=None, iface=None, log_by_ip=False, launch_ff=False,
        sslstriplog=None, sslsplitdir=None):
    global psl
    global ip_logging
    psl = PublicSuffixList()
    ip_logging = log_by_ip
    if not fname and not iface and not launch_ff:
        # print_help() writes directly to stdout; printing its return value
        # would just emit "None".
        parser.print_help()
        exit(-1)
    if launch_ff:
        if sslsplitdir:
            parsesslsplit(sslsplitdir)
        launch_firefox()
    else:
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)
        print "MANA (FireLamb) : [+] Saving output to %s" % save_dir
        if iface:
            print "MANA (FireLamb) : [+] Listening for cookie traffic on interface %s" % iface
            sniff(iface=iface, prn=process)
        elif fname:
            print "MANA (FireLamb) : [+] Reading pcap file '%s'...." % fname
            packets = rdpcap(fname)
            print "MANA (FireLamb) : [+] Processing file contents..."
            for p in packets:
                process(p)
            print "MANA (FireLamb) : [+] Done."
def domain_split(server_domain):
    '''
    server_domain is the service name plus the domain used by the site.
    Split the domain into a prefix (service name), a host domain, and a
    suffix (top-level domain).
    Input www.baidu.com  -> 'www', 'baidu', 'com'
    Input 172.31.137.240 -> '', '172.31.137.240', ''
    '''
    PSL_FILE = codecs.open('public_suffix_list.dat', encoding='utf8')
    psl = PublicSuffixList(PSL_FILE)
    domain = psl.get_public_suffix(server_domain)
    # Take the first label of the registered domain as the host domain;
    # everything after the first '.' is the top-level domain, and whatever
    # precedes the registered domain in the input is the service prefix.
    if '.' in domain:
        server = server_domain[:-len(domain)]
        host = domain[:domain.index('.')]
        top = domain[domain.index('.'):]
        hostname = server + host + top
    else:
        # Suffix extraction failed (e.g. an IP such as 172.31.137.240);
        # treat the whole input as the host domain.
        server = ''
        host = server_domain
        top = ''
        hostname = server_domain
    return server, host, top, hostname
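A quick sanity check of domain_split, assuming public_suffix_list.dat is present in the working directory. Note that, unlike the docstring's shorthand, the returned prefix keeps its trailing dot and the suffix its leading dot:

# Expected: ('www.', 'baidu', '.com', 'www.baidu.com')
print(domain_split('www.baidu.com'))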
def alexa_malware_scan(url):
    domain = PublicSuffixList().get_public_suffix(
        urlparse(url).netloc)  # IRIs are going to be a pain here.
    pipe = redis_db["slave"].pipeline()
    pipe.hlen(domains_key)
    pipe.hmget(domains_key, domain)
    total, score = pipe.execute()
    score = score[0]

    def rank_to_ratio(score, total):
        """
        If the score is between 1 and 1 million, never return 1.
        If the score is None, return 1.
        """
        if score is not None:
            score = int(score) - 1
            return score / total
        else:
            return 1

    return [{"type": "generic", "confidence": rank_to_ratio(score, total)}]
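For intuition, a worked example of rank_to_ratio's arithmetic. The `demo` helper below is hypothetical and simply mirrors the function; the arithmetic assumes Python 3's true division (under Python 2, `score / total` with ints would floor to 0):

def demo(score, total=1000000):
    # Mirrors rank_to_ratio: None -> 1, otherwise (rank - 1) / total
    return 1 if score is None else (int(score) - 1) / total

assert demo(1) == 0.0        # most popular site: lowest confidence
assert demo(500001) == 0.5   # mid-list
assert demo(None) == 1       # unlisted domain: full confidence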
from goslate import Goslate
from publicsuffix import PublicSuffixList
from unidecode import unidecode
import keywords
import pickle  # needed for stopwords below
import re      # needed by cleanString below

###################
# initializations #
###################

g = Goslate()
stopwords = pickle.load(open("data/stopwords_dict", 'rb'))
psl = PublicSuffixList(open("data/public_suffix_list.dat", encoding="utf8"))
document_frequencies = {}
with open("data/count_1w.txt") as f:
    for line in f:
        key, value = line.strip().split()
        document_frequencies[key] = int(value)


#########
# UTILS #
#########


def cleanString(s):
    s = unidecode(s)
    s = re.sub('\n', ' ', s)
class URL(object):
    BLOCKEXT = [
        'a3c', 'ace', 'aif', 'aifc', 'aiff', 'arj', 'asf', 'asx', 'attach',
        'au', 'avi', 'bin', 'cab', 'cache', 'class', 'djv', 'djvu', 'dwg',
        'es', 'esl', 'exe', 'fif', 'fvi', 'gz', 'hqx', 'ice', 'ief', 'ifs',
        'iso', 'jar', 'kar', 'mid', 'midi', 'mov', 'movie', 'mp', 'mp2',
        'mp3', 'mp4', 'mpeg', '7z', 'mpeg2', 'mpg', 'mpg2', 'mpga', 'msi',
        'pac', 'pdf', 'ppt', 'pptx', 'psd', 'qt', 'ra', 'ram', 'rm', 'rpm',
        'snd', 'svf', 'tar', 'tgz', 'tif', 'gzip', 'tiff', 'tpl', 'uff',
        'wav', 'wma', 'wmv', 'doc', 'docx', 'db', 'jpg', 'png', 'bmp',
        'svg', 'gif', 'jpeg', 'css', 'js', 'cur', 'ico', 'zip', 'txt',
        'apk', 'dmg', 'xml', 'jar', 'class', 'torrent'
    ]
    BLOCKHOST = ['mirrors.aliyun.com', 'code.taobao.org']
    # PUBLIC_SUFFIX_LIST_URL = 'http://publicsuffix.org/list/public_suffix_list.dat'
    PSL = PublicSuffixList(codecs.open(PSL_FILE_PATH, encoding='utf8'))

    def __init__(self, url):
        self.valid = True
        self.urlstring = self.normalize_url(url)
        if not self.urlstring:
            self.valid = False
        self._p = urlparse.urlparse(self.urlstring)

    @staticmethod
    def normalize_url(url):
        """Normalize a raw URL string to an absolute http:// URL."""
        # only hostname
        if '/' not in url:
            return 'http://{}'.format(url)
        p = urlparse.urlparse(url)
        # www.test.com/index.php
        # exclude /xxxxx/index.php
        if not p.netloc:
            if url.startswith('/'):
                # /xxxxx/index.php
                return ''
            else:
                # www.test.com/index.php
                return 'http://{}'.format(url)
        # //www.test.com/index.php
        if not p.scheme:
            url = urlparse.urlunparse(
                ('http', p.netloc, p.path or '/', p.query, p.params,
                 p.fragment))
        return url

    @property
    def scheme(self):
        return self._p.scheme

    @property
    def netloc(self):
        return self._p.netloc

    @property
    def hostname(self):
        return self._p.hostname

    @property
    def domain(self):
        return self.PSL.get_public_suffix(self.hostname)

    @property
    def path(self):
        # http://www.test.com => self._p.path=''
        return self._p.path or '/'

    @property
    def path_without_file(self):
        return self.path[:self.path.rfind('/') + 1]

    @property
    def filename(self):
        return self.path[self.path.rfind('/') + 1:]

    @property
    def extension(self):
        fname = self.filename
        extension = fname[fname.rfind('.') + 1:]
        if extension == fname:
            return ''
        else:
            return extension

    @property
    def querystring(self):
        return self._p.query

    @property
    def querydict(self):
        # removed keep_blank_values=True, as URLs like the one below caused
        # duplicate scans:
        # /Common/common/captcha?0.610851539997384 => querydict = {'0.610851539997384': ''}
        return dict(urlparse.parse_qsl(self._p.query))

    @property
    def fragment(self):
        return self._p.fragment

    @property
    def index_page(self):
        return urlparse.urljoin(self.urlstring, '/', allow_fragments=False)

    @property
    def pattern(self):
        """Scheme + netloc + the normalized path/query pattern."""
        return urlparse.urlunsplit(
            (self.scheme, self.netloc, self.path_querystring_pattern, '',
             ''))

    @property
    def path_querystring_pattern(self):
        """Path with digit runs generalized, plus sorted query keys."""
        # TODO url pattern
        path_pattern = re.sub('\d+', '{digit}', self.path)
        query_params = '<>'.join(sorted(self.querydict.keys()))
        pattern = '{}?{}'.format(
            path_pattern, query_params) if query_params else path_pattern
        return pattern

    @property
    def blocked(self):
        return (self.extension.lower() in URL.BLOCKEXT
                or self.hostname.lower() in URL.BLOCKHOST)
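A brief sketch of how the class above is meant to be driven, assuming PSL_FILE_PATH points at a local copy of the public suffix list; the expected values in the comments follow from the properties shown:

u = URL('www.test.com/item/42?id=7&page=3')
print(u.urlstring)   # http://www.test.com/item/42?id=7&page=3
print(u.domain)      # test.com
print(u.pattern)     # http://www.test.com/item/{digit}?id<>page
print(u.blocked)     # False (no blocked extension or host)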
def open_public_suffix_list(file_dir=''):
    global PSL_FILE
    global PSL
    PSL_FILE = codecs.open(pjoin(file_dir, 'public_suffix_list.dat'),
                           encoding='utf8')
    PSL = PublicSuffixList(PSL_FILE)
def get_org_domain(domain):
    fn = get_suffix_list_file_name()
    with open(fn) as suffixList:
        psl = PublicSuffixList(suffixList)
        return psl.get_public_suffix(domain)
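A minimal usage sketch, assuming get_suffix_list_file_name() resolves to a standard copy of the public suffix list; get_public_suffix collapses a hostname to its registered (organizational) domain:

print(get_org_domain("mail.corp.example.co.uk"))  # example.co.uk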
import json
from code.domain import readDomains, clusterDomains, group2file
from code.secondDomain import saveDGA
import pandas as pd
from code.tools import oneHotFeature, checkip
from sklearn.cluster import DBSCAN
from publicsuffix import PublicSuffixList
from sklearn import preprocessing
import csv
import codecs

psl_file = codecs.open('suffix.dat', encoding='utf8')
psl = PublicSuffixList(psl_file)


def makeGraphFile():
    readDomains()
    clusterDomains(json.load(open('data/noRegisterBlackTwoLevel.txt')),
                   'data/featureCluster.txt')
    group2file()
    saveDGA()


def addToSetMap(data, key, val):
    if key not in data:
        data[key] = set()
    data[key].add(val)


def readDGADomains():
    domainMap = {}
    filenames = [
def get_suffix(self):
    suffix_list = fetch()
    psl = PublicSuffixList(suffix_list)
    return psl
"Too many simulataneous connections from your host", "Please try again later.", "You have been banned for abuse.", "has exceeded the established limit", "WHOIS LIMI", "Still in grace period, wait", "Permission denied.") _tld_to_whois = dict() with open("datasources/whois-servers.txt", "r") as whois_servers: for line in whois_servers: if line.startswith(';'): continue parts = line.split(' ') _tld_to_whois['.' + parts[0].strip()] = parts[1].strip() _psl = PublicSuffixList( input_file=codecs.open("datasources/effective_tld_names.dat", "r", "utf8")) def _whois_lookup(sServer, sDomain): """ Perform the network connection to the Whois Server and query for the given domain. @param sServer: The hostname of the whois server to query. @param sDomain: The domain to query for. @return: The whois result string. """ s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) s.settimeout(5) try: s.connect((sServer, 43))
from urlparse import urlsplit

from marshmallow import Schema, fields
from publicsuffix import PublicSuffixList

PSL = PublicSuffixList()


class ImageSchema(Schema):
    height = fields.Int(allow_none=True)
    url = fields.Url(allow_none=True)
    width = fields.Int(allow_none=True)


class EmbedlyURLSchema(Schema):
    description = fields.Str(allow_none=True)
    favicon_url = fields.Url(allow_none=True)
    images = fields.Nested(ImageSchema, many=True)
    original_url = fields.Url(allow_none=True)
    provider_name = fields.Str(allow_none=True)
    title = fields.Str(allow_none=True)
    url = fields.Url(allow_none=True)

    def __init__(self, blocked_domains, *args, **kwargs):
        self.blocked_domains = blocked_domains
        super(EmbedlyURLSchema, self).__init__(*args, **kwargs)

    def load(self, data):
        validated = super(EmbedlyURLSchema, self).load(data)

        def get_domain(url):
def dirnames(self):
    encoded = self.encode('utf-8')
    hexdigest = md5(encoded).hexdigest()
    names = [self.parsed.scheme, self.parsed.netloc]
    names.extend(filter(None, self.parsed.path.split('/')))
    if self.parsed.query:
        names.extend(self.parsed.query.split('&'))
    names.append(hexdigest)
    return [urlquote(name, safe='')[:PC_NAME_MAX] for name in names]


#
# URL related composable helpers
# ============================================================

public_suffix_list = PublicSuffixList()


@composable
@map_if_iter
def url(obj):
    return getattr(obj, 'url', obj)


parse_url = url | map_if_iter(urlparse)
url_query = parse_url | map_if_iter(attrgetter('query'))
url_path = parse_url | map_if_iter(attrgetter('path'))
url_hostname = parse_url | map_if_iter(attrgetter('hostname'))
url_query_dict = url_query | map_if_iter(parse_qs)
url_query_list = url_query | map_if_iter(parse_qsl)
def __init__(self, procs):
    BaseStreamifier.__init__(self, procs)
    self.psl = PublicSuffixList()
def initialize_external_data(init_preload_list=None,
                             init_preload_pending=None,
                             init_suffix_list=None):
    """
    This function serves to load all of the third party external data.

    This can be called explicitly by a library, as part of the setup needed
    before calling other library functions, or called as part of running
    inspect_domains() or CLI operation.

    If values are passed in to this function, they will be assigned to be
    the cached values. This allows a caller of the Python API to manage
    cached data in a customized way. It also potentially allows clients to
    pass in subsets of these lists, for testing or novel performance
    reasons.

    Otherwise, if the --cache-third-parties=[DIR] flag specifies a
    directory, all downloaded third party data will be cached in that
    directory and used from cache on the next pshtt run instead of hitting
    the network.

    If no values are passed in, and no --cache-third-parties flag is used,
    then no cached third party data will be created or used, and pshtt will
    download the latest data from those third party sources.
    """
    global preload_list, preload_pending, suffix_list

    # The preload list should be sent in as a list of domains.
    if init_preload_list is not None:
        preload_list = init_preload_list

    # The preload_pending list should be sent in as a list of domains.
    if init_preload_pending is not None:
        preload_pending = init_preload_pending

    # The public suffix list should be sent in as a list of file lines.
    if init_suffix_list is not None:
        suffix_list = PublicSuffixList(init_suffix_list)

    # If there's a specified cache dir, prepare paths.
    # Only used when no data has been set yet for a source.
    if THIRD_PARTIES_CACHE:
        cache_preload_list = os.path.join(THIRD_PARTIES_CACHE,
                                          cache_preload_list_default)
        cache_preload_pending = os.path.join(THIRD_PARTIES_CACHE,
                                             cache_preload_pending_default)
        cache_suffix_list = os.path.join(THIRD_PARTIES_CACHE,
                                         cache_suffix_list_default)
    else:
        cache_preload_list, cache_preload_pending, cache_suffix_list = None, None, None

    # Load Chrome's latest versioned HSTS preload list.
    if preload_list is None:
        if cache_preload_list and os.path.exists(cache_preload_list):
            utils.debug("Using cached Chrome preload list.", divider=True)
            preload_list = json.loads(open(cache_preload_list).read())
        else:
            preload_list = load_preload_list()
            if cache_preload_list:
                utils.debug("Caching preload list at %s" % cache_preload_list,
                            divider=True)
                utils.write(utils.json_for(preload_list), cache_preload_list)

    # Load Chrome's current HSTS pending preload list.
    if preload_pending is None:
        if cache_preload_pending and os.path.exists(cache_preload_pending):
            utils.debug("Using cached hstspreload.org pending list.",
                        divider=True)
            preload_pending = json.loads(open(cache_preload_pending).read())
        else:
            preload_pending = load_preload_pending()
            if cache_preload_pending:
                utils.debug(
                    "Caching preload pending list at %s" % cache_preload_pending,
                    divider=True)
                utils.write(utils.json_for(preload_pending),
                            cache_preload_pending)

    # Load Mozilla's current Public Suffix list.
    if suffix_list is None:
        if cache_suffix_list and os.path.exists(cache_suffix_list):
            utils.debug("Using cached suffix list.", divider=True)
            cache_file = codecs.open(cache_suffix_list, encoding='utf-8')
            suffix_list = PublicSuffixList(cache_file)
        else:
            suffix_list, raw_content = load_suffix_list()
            if cache_suffix_list:
                utils.debug("Caching suffix list at %s" % cache_suffix_list,
                            divider=True)
                utils.write(''.join(raw_content), cache_suffix_list)
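A minimal sketch of the API-managed path described in the docstring above, where the caller supplies all three datasets itself; the data/ file path and the tiny preload lists are hypothetical:

with open("data/public_suffix_list.dat") as f:
    suffix_lines = f.read().splitlines(True)

initialize_external_data(
    init_preload_list=["example.com"],     # plain list of domains
    init_preload_pending=["example.org"],  # plain list of domains
    init_suffix_list=suffix_lines,         # list of file lines
)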
from publicsuffix import PublicSuffixList

from trustymail import trustymail

public_list = PublicSuffixList()


class Domain:

    base_domains = {}

    def __init__(self, domain_name):
        self.domain_name = domain_name

        self.base_domain_name = public_list.get_public_suffix(domain_name)

        if self.base_domain_name != self.domain_name:
            if self.base_domain_name not in Domain.base_domains:
                domain = Domain(self.base_domain_name)
                # Populate DMARC for parent.
                trustymail.dmarc_scan(domain)
                Domain.base_domains[self.base_domain_name] = domain
            self.base_domain = Domain.base_domains[self.base_domain_name]
        else:
            self.base_domain = None

        # Start off assuming the host is live unless an error tells us
        # otherwise.
        self.is_live = True

        # Keep entire record for potential future use.
        self.mx_records = []
#!/usr/bin/python
#encoding:utf-8
from publicsuffix import PublicSuffixList

domainParser = PublicSuffixList()
# print domainParser.get_public_suffix("www.example.com.cn")
# print domainParser.get_public_suffix("www.example.com.uk")
# print domainParser.get_public_suffix("jaysonhwang.sinaapp.com")
# print domainParser.get_public_suffix("1.jaysonhwang.sinaapp.com")
# print domainParser.get_public_suffix("jaysonhwang.sinaapp.com/web/1")
print domainParser.get_domain("http://192.168.0.100:8080/web")
print domainParser.get_domain("http://www.qq.com")

allow = [
    "http://www.people.com.cn",
    "http://www.xinhuanet.com",
    "http://www.qq.com",
    "http://www.163.com",
    "http://www.cntv.cn",
    "http://www.ifeng.com",
    "http://www.hexun.com",
    "http://www.sina.com.cn",
    "http://www.sohu.com",
    "http://www.dbw.cn",
]
for a in allow:
    print domainParser.get_domain(a)[0]
"""Utils for analyzing Princeton Web Census data.""" from BlockListParser import BlockListParser from ipaddress import ip_address from publicsuffix import PublicSuffixList, fetch from urllib.parse import urlparse import codecs import json PSL_CACHE_LOC = 'public_suffix_list.dat' # Execute on module load psl_cache = codecs.open(PSL_CACHE_LOC, encoding='utf8') psl = PublicSuffixList(psl_cache) el_parser = BlockListParser('easylist.txt') ep_parser = BlockListParser('easyprivacy.txt') with open('org_domains.json', 'r') as f: org_domains = json.load(f) with open('alexa_cats.json', 'r') as f: alexa_cats = json.load(f) class CensusUtilsException(Exception): pass def get_domain(url): """Strip the URL down to just a hostname+publicsuffix.
class typogen(object):
    """generate typos"""
    psl = PublicSuffixList(input_file=codecs.open(
        "datasources/effective_tld_names.dat", "r", "utf8"))
    alexa_top = {}

    def __init__(self):
        # Load up the list of TLDs
        self.lstTlds = list()
        filename = "datasources/tlds-alpha-by-domain.txt"
        with open(filename) as f:
            for line in f:
                if not line.lstrip().startswith('#'):
                    self.lstTlds.append(line.rstrip().lower())
        print("Loading confusables...", end=" ", flush=True)
        self.loadconfusables()
        print("Loading Alexa data...", end=" ", flush=True)
        with open(r'datasources\top-1m.csv') as top1m:
            for line in top1m:
                parts = line.rstrip().split(',', 1)
                if len(parts) == 2:
                    self.alexa_top[parts[1]] = int(parts[0])
        print("Done.")

    @staticmethod
    def loadkeyb(strCountry):
        keyDict = dict()
        # obviously you can have other maps here
        # I've only included this one
        filename = "datasources/keyb" + strCountry + ".txt"
        with open(filename) as f:
            for line in f:
                split = line.rstrip().split(',')
                if split[0] in keyDict:
                    keyDict[split[0]].append(split[1])
                else:
                    keyDict[split[0]] = [split[1]]
        return keyDict

    @staticmethod
    def loadadditionalhomoglyphs():
        homoglyphs = dict()
        with open("datasources/homoglyphs.txt", "r", encoding="utf8") as f:
            for line in f:
                if not line.startswith("#"):
                    split = line.rstrip().split(',')
                    key = split[0]
                    # Filter out any glyphs which are the same as the key
                    # (case insensitive)
                    tempvalues = [
                        glyph for glyph in split[1].split(' ')
                        if glyph.lower() != key
                    ]
                    # Filter out glyphs which do not survive round trip
                    # conversion, e.g. ß -> ss -> ss
                    values = list()
                    for glyph in tempvalues:
                        try:
                            if 'a' + glyph + 'b' == codecs.decode(
                                    codecs.encode('a' + glyph + 'b', "idna"),
                                    "idna"):
                                values.append(glyph)
                        except UnicodeError:
                            # Some characters/combinations will fail the
                            # nameprep stage
                            pass
                    homoglyphs[key] = values
        return homoglyphs

    @staticmethod
    def loadconfusables():
        global _homoglyphs_confusables
        _homoglyphs_confusables = dict()
        rejected_sequences = set()
        # 'utf_8_sig' swallows the BOM at start of file
        with open("datasources/confusables.txt", "r",
                  encoding="utf_8_sig") as f:
            for line in f:
                # If line contains more than whitespace and isn't a comment
                if line.strip() and not line.startswith("#"):
                    split = line.split(';', maxsplit=2)
                    # Parse the left hand side of the pairing
                    unihex = split[0].split(' ')[0]
                    part0 = chr(int(unihex, 16))
                    if part0 in rejected_sequences:
                        continue
                    # Parse the right hand side of the pairing
                    part1 = ''
                    for unihex in split[1].strip().split(' '):
                        part1 += chr(int(unihex, 16))
                    if part1 in rejected_sequences:
                        continue
                    # Skip pairs already in the _homoglyphs dict
                    if part0 in _homoglyphs_confusables and \
                            part1 in _homoglyphs_confusables[part0]:
                        continue
                    try:
                        # Filter out glyphs which do not survive round trip
                        # conversion, e.g. ß -> ss -> ss
                        if 'a' + part0 + 'b' != codecs.decode(
                                codecs.encode('a' + part0 + 'b', "idna"),
                                "idna"):
                            rejected_sequences.add(part0)
                            continue
                    except UnicodeError:
                        # Some characters/combinations will fail the
                        # nameprep stage
                        rejected_sequences.add(part0)
                        continue
                    try:
                        # Filter out glyphs which do not survive round trip
                        # conversion, e.g. ß -> ss -> ss
                        if 'a' + part1 + 'b' != codecs.decode(
                                codecs.encode('a' + part1 + 'b', "idna"),
                                "idna"):
                            rejected_sequences.add(part1)
                            continue
                    except UnicodeError:
                        # Some characters/combinations will fail the
                        # nameprep stage
                        rejected_sequences.add(part1)
                        continue
                    # Include left to right pair mapping in the dict
                    if part0 not in _homoglyphs_confusables:
                        _homoglyphs_confusables[part0] = set()
                    _homoglyphs_confusables[part0].add(part1)
                    # Include right to left pair mapping in the dict
                    if part1 not in _homoglyphs_confusables:
                        _homoglyphs_confusables[part1] = set()
                    _homoglyphs_confusables[part1].add(part0)

    def is_domain_valid(self, domain):
        # Ensure it's in the correct character set
        if not re.match('^[a-z0-9.-]+$', domain):
            return False
        # Ensure the TLD is sane
        elif domain[domain.rfind(".") + 1:] not in self.lstTlds:
            return False
        # Hostnames can't start or end with a -
        elif ".-" in domain or "-." in domain or domain.startswith("-"):
            return False
        # Ensure the locations of dots are sane
        elif ".." in domain or domain.startswith("."):
            return False
        else:
            return True

    @staticmethod
    def bitflipbyte(inputbyte):
        """
        Flips the lowest 7 bits in the given input byte/int to build a list
        of mutated values.

        @param inputbyte: The byte/int to bit flip
        @return: A list of the mutated values.
        """
        result = list()
        mask = 1
        # As we know we're flipping ASCII, only do the lowest 7 bits
        for i in range(0, 7):
            result.append(inputbyte ^ mask)
            mask <<= 1
        return result

    @staticmethod
    def generate_country_code_doppelgangers(strHost):
        result = list()
        with open("datasources/countrynames.txt", 'r',
                  encoding="UTF-8") as countrynames:
            for line in countrynames:
                if not line.startswith('#'):
                    parts = line.split(';', maxsplit=2)
                    # 2 letter country code subdomain, but without the dot
                    result.append(parts[0].strip().lower() + strHost)
                    # 3 letter country code subdomain, but without the dot
                    result.append(parts[1].strip().lower() + strHost)
        return result

    @staticmethod
    def generate_subdomain_doppelgangers(strHost):
        result = list()
        with open("datasources/subdomains.txt", 'r') as subdomains:
            for subdomain in subdomains:
                result.append(subdomain.strip() + strHost)
        return result

    @staticmethod
    def generate_extra_dot_doppelgangers(strHost):
        result = list()
        for idx, char in enumerate(strHost):
            # A dot instead of a character
            result.append(strHost[:idx] + '.' + strHost[idx + 1:])
            # A dot inserted between characters
            result.append(strHost[:idx] + '.' + strHost[idx:])
        return result

    @staticmethod
    def bitflipstring(strInput):
        """
        Flips the lowest 7 bits in each character of the given string to
        build a list of mutated values.

        @param strInput: The string to bit flip
        @return: A list of the mutated values.
        """
        result = list()
        i = 0
        for character in strInput:
            flippedchars = typogen.bitflipbyte(character.encode("UTF-8")[0])
            for flippedchar in flippedchars:
                result.append(strInput[:i] + chr(flippedchar) +
                              strInput[i + 1:])
            i += 1
        return result

    @staticmethod
    def generate_missing_character_typos(strHost):
        # Missing characters
        result = list()
        idx = 0
        while idx < len(strHost):
            strTypo = strHost[0:idx] + strHost[idx + 1:]
            idx += 1
            result.append(strTypo)
        return result

    @staticmethod
    def generate_duplicate_character_typos(strHost):
        # Duplicate characters
        result = list()
        idx = 0
        while idx < len(strHost):
            strHostList = list(strHost)
            if strHostList[idx] != '.':
                strHostList.insert(idx, strHostList[idx])
                strTypo = "".join(strHostList)
                result.append(strTypo)
            idx += 1
        return result

    @staticmethod
    def generate_miskeyed_typos(strHost, strCountry):
        # Swap to a surrounding key for each character
        result = list()
        # Load keyboard mapping
        typoDict = typogen.loadkeyb(strCountry)
        for idx, char in enumerate(strHost):
            if char in typoDict:
                for replacement_char in typoDict[char]:
                    result.append(strHost[:idx] + replacement_char +
                                  strHost[idx + 1:])
        return result

    @staticmethod
    def generate_homoglyph_confusables_typos(strHost):
        # Swap characters to similar looking characters, based on Unicode's
        # confusables.txt
        results = list()
        global _homoglyphs_confusables
        # Replace each homoglyph subsequence in strHost with each
        # replacement subsequence associated with it
        for homoglyph_subsequence in _homoglyphs_confusables:
            idx = 0
            while 1:
                idx = strHost.find(homoglyph_subsequence, idx)
                if idx > -1:
                    for replacement_subsequence in _homoglyphs_confusables[
                            homoglyph_subsequence]:
                        # Add with just one change
                        newhostname = (
                            strHost[:idx] + replacement_subsequence +
                            strHost[idx + len(homoglyph_subsequence):])
                        try:
                            results.append(
                                str(codecs.encode(newhostname, "idna"),
                                    "ascii"))
                        except UnicodeError:
                            # This can be caused by domain parts which are
                            # too long for IDNA encoding, so just skip it
                            pass
                        # Add with all occurrences changed
                        newhostname = strHost.replace(
                            homoglyph_subsequence, replacement_subsequence)
                        try:
                            if newhostname not in results:
                                results.append(
                                    str(codecs.encode(newhostname, "idna"),
                                        "ascii"))
                        except UnicodeError:
                            # This can be caused by domain parts which are
                            # too long for IDNA encoding, so just skip it
                            pass
                    idx += len(homoglyph_subsequence)
                else:
                    break
        return results

    @staticmethod
    def generate_additional_homoglyph_typos(strHost):
        # Swap characters to similar looking characters, based on
        # homoglyphs.txt
        result = list()
        # Load homoglyph mapping
        homoglyphs = typogen.loadadditionalhomoglyphs()
        for idx, char in enumerate(strHost):
            if char in homoglyphs:
                for replacement_char in homoglyphs[char]:
                    newhostname = (strHost[:idx] + replacement_char +
                                   strHost[idx + 1:])
                    try:
                        result.append(
                            str(codecs.encode(newhostname, "idna"),
                                "ascii"))
                    except UnicodeError:
                        # This can be caused by domain parts which are too
                        # long for IDNA encoding, so just skip it
                        pass
        return result

    @staticmethod
    def generate_miskeyed_addition_typos(strHost, strCountry):
        # Add a surrounding key either side of each character
        result = list()
        # Load keyboard mapping
        typoDict = typogen.loadkeyb(strCountry)
        for idx, char in enumerate(strHost):
            if char in typoDict:
                for replacement_char in typoDict[char]:
                    result.append(strHost[:idx + 1] + replacement_char +
                                  strHost[idx + 1:])
                    result.append(strHost[:idx] + replacement_char +
                                  strHost[idx:])
        return result

    @staticmethod
    def generate_miskeyed_sequence_typos(strHost, strCountry):
        # Repeated surrounding keys for any character sequences in the
        # string
        result = list()
        # Load keyboard mapping
        typoDict = typogen.loadkeyb(strCountry)
        idx = 0
        while idx < len(strHost):
            char = strHost[idx]
            # Loop through sequences of the same character, counting the
            # sequence length
            sequence_len = 1
            while idx + 1 < len(strHost) and strHost[idx + 1] == char:
                sequence_len += 1
                idx += 1
            # Increment the index at this point to make the maths easier if
            # we found a sequence
            idx += 1
            # Replace the whole sequence
            if sequence_len > 1:
                if char in typoDict:
                    for replacement_char in typoDict[char]:
                        result.append(strHost[:idx - sequence_len] +
                                      (replacement_char * sequence_len) +
                                      strHost[idx:])
        return result

    @staticmethod
    def generate_transposed_character_typos(strHost):
        result = list()
        for idx in range(0, len(strHost) - 1):
            result.append(strHost[:idx] + strHost[idx + 1:idx + 2] +
                          strHost[idx:idx + 1] + strHost[idx + 2:])
        return result

    @staticmethod
    def is_valid_rfc3491(domainname):
        """
        Checks if the given domain would pass processing by nameprep
        unscathed.

        :param domainname: The unicode string of the domain name.
        :return: True if the unicode is valid (i.e. only uses Unicode 3.2
                 code points)
        """
        valid_rfc3491 = True
        for char in domainname:
            if stringprep.in_table_a1(char):
                valid_rfc3491 = False
                break
        return valid_rfc3491

    @staticmethod
    def is_ascii(domainname):
        return str(codecs.encode(domainname, "idna"), "ascii") == domainname

    @staticmethod
    def is_in_charset(domainname, icharsetamount):
        if icharsetamount == 100:
            return True
        elif icharsetamount == 50:
            return typogen.is_valid_rfc3491(domainname)
        elif icharsetamount == 0:
            return typogen.is_ascii(domainname)

    def generatetyposv2(self,
                        strHost,
                        strCountry="gb",
                        bTypos=True,
                        iTypoIntensity=100,
                        bTLDS=False,
                        bBitFlip=True,
                        bHomoglyphs=True,
                        bDoppelganger=True,
                        bOnlyAlexa=False,
                        bNeverAlexa=False,
                        icharsetamount=100):
        """
        Generate the typos.

        @param strHost The hostname to generate typos for
        @param strCountry The country code of the keyboard to use when
               generating miskeyed typos
        @param bTypos Flag to indicate that typos should be generated
        @param iTypoIntensity A percentage of how intense the typo
               generation should be
        @param bTLDS Flag to indicate that the TLDs should be swapped
        @param bBitFlip Flag to indicate that the hostname should be
               bitflipped
        @param bHomoglyphs Flag to indicate that homoglyphs should be
               generated
        @param bDoppelganger Flag to indicate that domain doppelgangers
               should be generated
        @param bOnlyAlexa Flag to indicate that only results which appear
               in the Alexa top 1m domains should be returned
        @param bNeverAlexa Flag to indicate that results which are in the
               Alexa top 1m domains should not be returned
        """
        # Result list of typos
        lstTypos = []
        if bBitFlip:
            lstTypos += self.bitflipstring(strHost)
        if bTypos:
            # Quick:
            lstTypos += self.generate_missing_character_typos(strHost)
            lstTypos += self.generate_duplicate_character_typos(strHost)
            # Balanced:
            if iTypoIntensity > 0:
                lstTypos += self.generate_miskeyed_typos(strHost, strCountry)
                lstTypos += self.generate_miskeyed_sequence_typos(
                    strHost, strCountry)
            # Rigorous:
            if iTypoIntensity > 50:
                lstTypos += self.generate_transposed_character_typos(strHost)
                lstTypos += self.generate_miskeyed_addition_typos(
                    strHost, strCountry)
        if bTLDS:
            public_suffix = self.psl.get_public_suffix(strHost)
            no_suffix = public_suffix[:public_suffix.find('.')] + '.'
            # Add each TLD
            for gtld in self.lstTlds:
                newHost = no_suffix + gtld
                lstTypos.append(newHost)
        if bHomoglyphs:
            lstTypos += self.generate_homoglyph_confusables_typos(strHost)
            lstTypos += self.generate_additional_homoglyph_typos(strHost)
        if bDoppelganger:
            # Commented out until a slider is put in - the following line
            # results in ssssllloooowwww searches
            #lstTypos += self.generate_country_code_doppelgangers(strHost)
            lstTypos += self.generate_subdomain_doppelgangers(strHost)
            lstTypos += self.generate_extra_dot_doppelgangers(strHost)

        uniqueTypos = set(lstTypos)

        # Remove any invalid typos
        for typo in copy.copy(uniqueTypos):
            if not self.is_domain_valid(typo):
                uniqueTypos.remove(typo)
            elif bOnlyAlexa and typo not in self.alexa_top:
                uniqueTypos.remove(typo)
            elif bNeverAlexa and typo in self.alexa_top:
                uniqueTypos.remove(typo)

        # Add the original domain for comparison purposes and to ensure we
        # have at least one result (set.add is idempotent, so no error
        # handling is needed here)
        uniqueTypos.add(strHost)

        unicode_typos = sorted([
            codecs.decode(asciiHost.encode(), "idna")
            for asciiHost in uniqueTypos
        ])
        for typo in copy.copy(unicode_typos):
            if not typogen.is_in_charset(typo, icharsetamount):
                unicode_typos.remove(typo)
        return unicode_typos
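A minimal usage sketch of the class above, assuming the datasources/ files (tlds-alpha-by-domain.txt, top-1m.csv, confusables.txt, keyboard maps, etc.) are present alongside the script:

tg = typogen()

# ASCII-only typos plus TLD swaps for example.com, at full intensity.
typos = tg.generatetyposv2("example.com",
                           strCountry="gb",
                           iTypoIntensity=100,
                           bTLDS=True,
                           icharsetamount=0)
for t in typos[:20]:
    print(t)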
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

'''This is a module for dealing with urls. In particular, sanitizing them.'''

import re
import codecs
import urllib
try:
    import urlparse
except ImportError:  # pragma: no cover
    # Python 3 support
    import urllib.parse as urlparse

# For publicsuffix utilities
from publicsuffix import PublicSuffixList

psl = PublicSuffixList()

# Some codecs that we'll need
IDNA = codecs.lookup('idna')
UTF8 = codecs.lookup('utf-8')
ASCII = codecs.lookup('ascii')
W1252 = codecs.lookup('windows-1252')

# The default ports associated with each scheme
PORTS = {'http': 80, 'https': 443}


def parse(url, encoding='utf-8'):
    '''Parse the provided url string and return an URL object'''
    return URL.parse(url, encoding)
class UrlUtil:
    """Wraps a handful of URL-related operations."""
    # psl_file = fetch()  # loads https://publicsuffix.org/list/public_suffix_list.dat
    psl_file = codecs.open(os.path.abspath(os.path.dirname(__file__)) +
                           os.path.sep + 'public_suffix_list.dat',
                           encoding='utf8')
    psl = PublicSuffixList(psl_file)

    @classmethod
    def get_protocol(cls, url):
        """Extract the URL's scheme."""
        parse_result = parse.urlparse(url=url)
        return parse_result[0].strip()  # strip just in case

    @classmethod
    def get_domain(cls, url):
        """Extract the URL's host."""
        parse_result = parse.urlparse(url=url)
        # Some links carry trailing whitespace after the host; Chrome still
        # parses them correctly, oddly enough.
        return parse_result[1].strip()

    @classmethod
    def get_top_domain(cls, url):
        """Extract the URL's registered (first-level) domain."""
        domain = UrlUtil.get_domain(url)
        domain = domain.split(':')[0]  # drop the port
        ip_pattern = r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}"
        if re.match(ip_pattern, domain):
            return domain
        return cls.psl.get_public_suffix(domain)

    @classmethod
    def get_path(cls, url):
        """Extract the directory part of the URL's path (file name removed)."""
        splites = url.split('/')
        if len(splites) == 3:
            return url
        elif len(splites) == 4 and splites[-1] == "":
            return url[:-1]
        return "/".join(url.split('/')[:-1])

    @classmethod
    def is_gov_or_edu(cls, url):
        """Check whether the URL belongs to a government or education domain."""
        domain = UrlUtil.get_domain(url)
        if len(domain) > 7 and domain[-7:] in (".gov.cn", ".edu.cn"):
            return True
        return False

    @classmethod
    def top_domain_is_gov_or_edu(cls, top_domain):
        """Check whether the registered domain is a government or education domain."""
        if top_domain in ("gov.cn", "edu.cn"):
            return True
        return False

    @classmethod
    def get_url_suffix(cls, url):
        """Get the page's file extension (e.g. html, js, css)."""
        path = urllib.parse.urlsplit(url)[2]
        if '.' not in path.split('/')[-1]:
            return ""
        return path.split('.')[-1]
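A quick sketch of the classmethods above, assuming public_suffix_list.dat sits next to this module; expected values follow from the parsing rules shown:

print(UrlUtil.get_protocol("https://news.example.com.cn:8080/a/b/index.html"))    # https
print(UrlUtil.get_top_domain("https://news.example.com.cn:8080/a/b/index.html"))  # example.com.cn
print(UrlUtil.get_url_suffix("https://news.example.com.cn/a/b/index.html"))       # html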
class IPMeta(models.Model):
    ip = models.GenericIPAddressField(db_index=True)
    created = models.DateTimeField(auto_now_add=True)
    invalidated = models.DateTimeField(blank=True, null=True, db_index=True)
    last_updated = models.DateTimeField(auto_now=True)
    dnsloc = models.CharField(max_length=256, blank=True, null=True)
    hostname = models.CharField(max_length=256, blank=True, null=True)
    ##is_anycast = models.NullBooleanField( blank=True, null=True )
    psl = PublicSuffixList()

    def save(self, **kwargs):
        '''
        IPMeta save method, does lookups if object isn't saved yet
        '''
        if not self.id:
            ## do dnsloc and hostname lookups
            try:
                host_resolve = dns.resolver.query(
                    dns.reversename.from_address(self.ip), 'PTR')
                h = str(host_resolve.response.answer[0].items[0])
                h = h.rstrip('.')
                self.hostname = h
            except:
                # it's perfectly fine for a reverse not to exist
                pass
            if self.hostname:
                try:
                    loc_resolve = dns.resolver.query(self.hostname, 'LOC')
                    self.dnsloc = str(loc_resolve[0])
                except:
                    # it's perfectly fine for a loc record not to exist
                    pass
        super(self.__class__, self).save(**kwargs)

    def info2json(self, **kwargs):
        '''
        convert all info about this IP into a json structure.
        optional arguments accepted are
            'lat': latitude, to georestrict by
            'lon': longitude, to georestrict by
            'min_rtt': rtt, to georestrict by
        '''
        do_rtt_constraint = False
        try:
            lat = kwargs['lat']
            lon = kwargs['lon']
            min_rtt = kwargs['min_rtt']
            do_rtt_constraint = True
        except:
            pass

        DNSLOC_WEIGHT = 0.95
        HOSTNAME_WEIGHT = 0.90

        # 0  1  2      3 4 5  6     7
        # 48 51 21.953 N 2 23 0.143 E 10.00m 1.00m 10000.00m 10.00m"
        def _dnsloc2ll(loc_str):
            out = {'str': loc_str}
            fields = loc_str.split()
            if len(fields) >= 7:
                lat = float(fields[0]) + float(fields[1]) / 60 + \
                    float(fields[2]) / (60 * 60)
                if fields[3] == 'S':
                    lat = -lat
                lon = float(fields[4]) + float(fields[5]) / 60 + \
                    float(fields[6]) / (60 * 60)
                if fields[7] == 'W':
                    lon = -lon
                out['lat'] = lat
                out['lon'] = lon
            return out

        info = {}
        name2loc = []
        crowdsourced = []
        info['ip'] = self.ip
        info['hostname'] = self.hostname
        info['domainname'] = None
        try:
            info['domainname'] = self.__class__.psl.get_public_suffix(
                self.hostname)
        except:
            pass
        if self.dnsloc:
            info['dnsloc'] = _dnsloc2ll(self.dnsloc)
        #gc = IPGeoConstraint.objects.filter(ipmeta = self)
        #if len( gc ) == 1:
        #    info['area'] = json.loads( gc[0].area.geojson )

        ## add a suggestions array that contains the ordered list of
        ## suggested lat/lon
        suggestions = []
        name2loc = self.name2loc(**kwargs)
        if 'dnsloc' in info:
            if not do_rtt_constraint or \
                    openipmap.geoutils.can_one_travel_distance_in_rtt(
                        lat, lon, info['dnsloc']['lat'],
                        info['dnsloc']['lon'], min_rtt):
                # only add this if this is possible RTT-wise
                suggestions.append({
                    'lat': info['dnsloc']['lat'],
                    'lon': info['dnsloc']['lon'],
                    'reason': 'dnsloc',
                    'weight': DNSLOC_WEIGHT,
                })
        total_pop = 0
        for n in name2loc:
            total_pop += n['pop']
        for n in name2loc:
            # lat/lon already there
            n['weight'] = HOSTNAME_WEIGHT * n['pop'] / total_pop
            n['reason'] = 'hostname'
            suggestions.append(n)
        info['suggestions'] = suggestions
        crowdsourced.extend(IPRule.get_crowdsourced(self.ip))
        if self.hostname:
            crowdsourced.extend(HostnameRule.get_crowdsourced(self.hostname))
        info['crowdsourced'] = crowdsourced
        return info

    def name2loc(self, poly_geoconstraint=None, **kwargs):
        '''
        try to figure out loc, based on name
        optional arguments accepted are
            'lat': latitude, to georestrict by
            'lon': longitude, to georestrict by
            'min_rtt': rtt, to georestrict by
        '''
        ## TODO: add polygon confinement?
        nr_results = 10  ## configurable?
        do_rtt_constraint = False
        try:
            lat = kwargs['lat']
            lon = kwargs['lon']
            min_rtt = kwargs['min_rtt']
            do_rtt_constraint = True
        except:
            pass
        # this should be configurable/tags and/or have low confidence value
        tag_blacklist = set([
            'rev', 'cloud', 'clients', 'demarc', 'ebr', 'pool', 'bras',
            'core', 'static', 'router', 'net', 'bgp', 'pos', 'out', 'link',
            'host', 'infra', 'ptr', 'isp', 'adsl', 'rdns', 'tengig',
            'tengige', 'tge', 'rtr', 'shared', 'red', 'access', 'tenge',
            'gin', 'dsl', 'cpe'
        ])
        if not self.hostname:
            return []
        name = self.hostname.rstrip('.')
        suf = self.__class__.psl.get_public_suffix(name)
        rest = ''
        tokens = []
        if suf != name:
            rest = name[0:len(name) - len(suf) - 1]
            rest = rest.lower()
            ## support for additional tokenization?
            tokens = re.split(r'[^a-zA-Z]+', rest)
            ## filter by token-length (for now), TODO make configurable?
            tokens = [t for t in tokens if len(t) >= 3]
            ## remove blacklisted tokens
            tokens = [t for t in tokens if not t in tag_blacklist]
        matches = {}

        def add_to_matches(g, token, is_abbrev, **kwargs):
            if not g.loc.id in matches:
                ## check geoconstraints
                if do_rtt_constraint and \
                        not openipmap.geoutils.can_one_travel_distance_in_rtt(
                            lat, lon, g.loc.lat, g.loc.lon, min_rtt):
                    return
                matches[g.loc.id] = {
                    'loc_id': g.loc.id,
                    'pop': g.loc.pop,
                    'count': g.loc.count,
                    'name': str(g.loc),
                    'lat': g.loc.lat,
                    'lon': g.loc.lon,
                    'token': set(),
                    'kind': set()
                }
                if poly_geoconstraint:
                    if poly_geoconstraint.contains(g.loc.point):
                        matches[g.loc.id] = {'in_constraint': True}
            matches[g.loc.id]['token'].add(token)
            ## this loses the link between the token and the geoalias-kind
            ## (for now)
            if is_abbrev:
                matches[g.loc.id]['kind'].add('abbrev-' + g.kind)
            else:
                matches[g.loc.id]['kind'].add(g.kind)

        for t in tokens:
            for ga in Geoalias.objects.filter(word=t):
                add_to_matches(ga, t, False, **kwargs)
        if len(matches) == 0:
            #print "little on strict match, trying like"
            for t in tokens:
                ## 't' can't be anything but a-zA-Z so no SQL injection
                ## should be possible
                sql_like_chars = '%%'.join(list(t))
                sql_like_chars += '%%'  # 'a%m%s%'
                sql = "SELECT id FROM openipmap_geoalias WHERE word LIKE '%s'" % (
                    sql_like_chars)
                for ga in Geoalias.objects.raw(sql):
                    add_to_matches(ga, t, True, **kwargs)
        ## this sorts, first by 'count' (=number of hostnames the DB
        ## already has for this location) then by 'population' of location
        mk = sorted(
            matches.keys(),
            reverse=True,
            key=lambda x: (matches[x]['count'], matches[x]['pop'])
        )[0:nr_results]  ## max 10
        result = []
        for m in mk:
            entry = matches[m]
            # flatten
            entry['token'] = list(entry['token'])
            entry['kind'] = list(entry['kind'])
            result.append(entry)
        return result

    @classmethod
    def gather_from_msm(cls, msm_id, interval=3600):
        #@@ todo make these configurable:
        limit = 10
        stop = int(time.time())
        start = stop - interval
        msm_url = "https://atlas.ripe.net/api/v1/measurement/%d/result/?start=%d&stop=%d&limit=%d&format=txt" % (
            msm_id, start, stop, limit)
        print msm_url
        url_fh = urllib2.urlopen(msm_url)
        ips = {}
        for line in url_fh:
            try:
                msm = json.loads(line)
                prb_id = msm['prb_id']
                for msm_res in msm['result']:
                    hop_nr = msm_res['hop']
                    for hop_res in msm_res['result']:
                        if 'from' in hop_res:
                            ip = hop_res['from']
                            rtt = hop_res['rtt']
                            if not ip in ips:
                                ips[ip] = 1
            except:
                print "oops on %s" % (line)
        timediff = datetime.now() - timedelta(days=30)
        for ip in ips:
            ## figure out if there was a recent Meta fetch
            try:
                ipm = cls.objects.filter(ip=ip).filter(
                    created__gte=timediff).order_by('-created')
                if len(ipm) > 0:
                    i = ipm[0]
                else:
                    ## insert it (does autolookups)
                    i = IPMeta()
                    i.ip = ip
                    i.save()
                print "%s %s %s" % (i.ip, i.hostname, i.dnsloc)
            except:
                pass
class URI(object):
    """
    Core URI class as specified in RFC 3986.
    """
    suffix_list = PublicSuffixList()

    def __init__(self, scheme=None, authority=None, path='', query=None,
                 fragment=None):
        """
        Constitute a URI from various constituent parts. Requires a path,
        but other arguments are optional. Existing URIs will be
        percent-decoded as they are read, but re-encoded when printed or
        when certain objects (such as query or authority strings) are
        retrieved.

        To create the appropriate object for the following URI:

        >>> demo_uri = 'https://www.google.com/search?q=setter+python&oq=setter+python&aqs=chrome..69i57j0l3.9438j0&sourceid=chrome&ie=UTF-8'

        Either use:

        >>> x = URI.parse_uri(demo_uri)

        or initialize individual components, e.g.:

        >>> x = URI(path='/search', scheme='https', authority='www.google.com', query='q=setter+python&oq=setter+python&aqs=chrome..69i57j0l3.9438j0&sourceid=chrome&ie=UTF-8')

        Additional query arguments can be easily added as follows:

        >>> x.set_query_arg('bananas', 'are_yummy!')
        """
        self.scheme = scheme
        self._userinfo, self._host, self._port = None, None, None
        self.authority = authority
        self.path = path
        self.query_dict = {}
        self.query = query
        self.fragment = fragment

    def __repr__(self, normalize=False):
        """
        Retrieves the string representation of the URI.

        Assembles the various URI components into a string representation,
        complete with percent-encoding. Can be normalized, which compresses
        dot-segments.

        Args:
            normalize: removes dot-segments
        """
        result = ""
        if self.scheme:
            if normalize:
                result += self.scheme.lower() + ":"
            else:
                result += self.scheme + ":"
        if self.authority:
            self._build_authority(normalize=normalize)
            result += '//' + self._authority
        if normalize:
            if not self.path:
                result += '/'
            else:
                result += self._remove_dot_segments(self.path)
        else:
            result += self.path
        if self.query:
            result += '?' + self.query
        if self.fragment:
            result += '#' + self.fragment
        # Go through and uppercase any percent-encodings
        if normalize:
            tmp = result
            while tmp.rfind('%') != -1:
                pos = tmp.rfind('%')
                tmp = tmp[:pos]
                result = result[:pos] + result[pos:pos + 3].upper() + \
                    result[pos + 3:]
        return result

    def __eq__(self, other):
        if self.__class__ == other.__class__:
            return (self.__repr__(normalize=True) ==
                    other.__repr__(normalize=True))
        # In case we're just comparing against a URI string
        elif type(other) == str:
            return self.__repr__() == other.lower()
        else:
            return False

    @property
    def authority(self):
        """
        Retrieves a percent-encoded authority string, if one exists.
        """
        self._build_authority()
        return self._authority

    @authority.setter
    def authority(self, authority):
        """
        Sets the authority string and parses the userinfo, host and port.
        """
        self._authority = authority
        self._parse_authority()

    @property
    def domain(self):
        """
        Returns the domain for the given URI.
        """
        if not (self._is_ipv4(self._host)
                or self._is_ipvliteral(self._host)):
            return self.suffix_list.get_public_suffix(self.host)
        else:
            return None

    @domain.setter
    def domain(self, domain):
        """
        Set the domain for the given URI.
        """
        if not (self._is_ipv4(self._host)
                or self._is_ipvliteral(self._host)):
            self.host = self.host.split(self.domain)[0] + domain
        else:
            raise Exception, "Host is an IP address, not a domain"

    @property
    def fragment(self):
        """
        Retrieves a percent-encoded fragment, if one exists.
        """
        if self._fragment:
            return self.percent_encode(self._fragment,
                                       regexes.FRAGMENT_REGEX)
        else:
            return self._fragment

    @fragment.setter
    def fragment(self, fragment):
        """
        Sets the fragment.
        """
        if fragment:
            self._fragment = self.percent_decode(fragment)
        else:
            self._fragment = None

    @property
    def host(self):
        """
        Retrieve the percent-encoded host, if one has been set.
        """
        if self._is_ipv4(self._host) or self._is_ipvliteral(self._host):
            return self._host
        else:
            return self.percent_encode(self._host,
                                       regexes.REG_NAME_ELIGIBLE_REGEX)

    @host.setter
    def host(self, host):
        """
        Set a new host for this URI.
        """
        if host == '':
            host = None
        self._host = self.percent_decode(host)

    @property
    def path(self):
        """
        Retrieve the path for this URI.
        """
        if self.scheme:
            return '/'.join([
                self.percent_encode(x, regexes.PATH_REGEX)
                for x in self._path
            ])
        else:
            return '/'.join([
                self.percent_encode(x, regexes.PATH_NOSCHEME_REGEX)
                for x in self._path
            ])

    @path.setter
    def path(self, path):
        """
        Set a new path for this URI.
        """
        if self.authority and path != '':
            if path[0] != '/':
                raise Exception, "Invalid path: when authority is present," + \
                    " path should begin with a '/' character"
        elif not self.authority:
            if path[0:2] == '//':
                raise Exception, "Invalid path: when no authority is" + \
                    " present, path cannot begin with '//'"
        self._path = [self.percent_decode(x) for x in path.split('/')]

    @property
    def port(self):
        """
        Retrieve the port, if one has been set.
        """
        return self._port

    @port.setter
    def port(self, port):
        """
        Set a new port for this URI. If a host has been defined, re-build
        the authority string, else pass (an authority string with no host
        is meaningless).

        Args:
            port: the target port
        """
        self._port = int(port)

    @property
    def query(self):
        """
        Retrieves a percent-encoded query string, if one has been set.
        """
        self._build_query()
        if self._query:
            return self.percent_encode(self._query, regexes.QUERY_REGEX)
        else:
            return None

    @query.setter
    def query(self, query):
        """
        Sets the query string.
        """
        if query:
            self._query = self.percent_decode(query)
            self._parse_query()
        else:
            self._query = None

    @property
    def tld(self):
        """
        Retrieves the top-level domain, if one has been set.
        """
        return '.'.join(self.domain.split('.')[1:])

    @tld.setter
    def tld(self, tld):
        """
        Sets the top-level domain.
        """
        self.domain = '.'.join(self.domain.split('.')[:1]) + '.' + tld

    @property
    def userinfo(self):
        """
        Retrieves the percent-encoded userinfo string, if one exists.
        """
        if self._userinfo:
            return self.percent_encode(self._userinfo,
                                       regexes.USERINFO_REGEX)
        else:
            return None

    @userinfo.setter
    def userinfo(self, userinfo):
        """
        Set a new userinfo for this URI.
        """
        if userinfo == "":
            userinfo = None
        self._userinfo = self.percent_decode(userinfo)

    def set_query_arg(self, key, value=None):
        """
        Sets a query argument.
        """
        self.query_dict[key] = value

    def get_query_arg(self, key):
        """
        Gets a query argument.
        """
        return self.query_dict[key]

    @staticmethod
    def _is_ipv4(host_string):
        """
        Checks to see if a given host string is a valid IPv4 address.
        """
        return regexes.IPV4_REGEX.search(host_string)

    @staticmethod
    def _is_ipvliteral(host_string):
        """
        Checks to see if a given host string is a valid IP-literal address.
        """
        return regexes.IPVLITERAL_REGEX.search(host_string)

    @staticmethod
    def percent_encode(string, regex):
        """
        Percent-encode a string w/ hex codes.

        Given a provided string and regex, encodes any characters that
        don't match the provided characters in the regex.

        Args:
            string: the string to be encoded
            regex: a regex listing any characters that don't need encoding
        """
        return ''.join([
            '%' + x.encode('hex') if not regex.search(x) else x
            for x in string
        ])

    @staticmethod
    def percent_decode(string):
        """
        Percent-decode a string.

        See also: percent_encode(string, regex)
        """
        return ''.join(_PercentDecoder(string))

    def _build_authority(self, normalize=False):
        """
        Build a percent-encoded authority string and set the authority
        attribute.

        Takes the userinfo, host, and port attributes and attempts to
        build a percent-encoded authority string. If the host is not set,
        returns None as a host is necessary for a valid authority string.
        """
        self._authority = ""
        if self._userinfo:
            self._authority += self.userinfo + '@'
        if self._host:
            if normalize:
                host = self.host.lower()
            else:
                host = self.host
        else:
            self.authority = None
            return
        self._authority += host
        if self.port:
            self._authority += ':' + str(self.port)

    def _build_query(self):
        """
        Build a percent-encoded query string from the query dict.
        """
        if len(self.query_dict.keys()) > 0:
            self._query = []
            for key, value in self.query_dict.iteritems():
                if value:
                    self._query.append(key + '=' + value)
                else:
                    self._query.append(key)
            self._query = '&'.join(self._query)
        else:
            self._query = None

    def _parse_authority(self):
        """
        Parses the authority attribute for userinfo, host, and port.

        Follows available regular expressions to identify userinfo, host,
        and port data. If identified, sets the corresponding attributes.
        If the host is of reg-name type (as opposed to IPv4 or an
        IP-literal), this function will also percent-decode the host.
        """
        if not self._authority:
            return
        auth_string = self._authority
        if auth_string.find('@') != -1:
            self.userinfo, auth_string = auth_string.split('@', 1)
        if self._is_ipv4(auth_string):
            search_result = self._is_ipv4(auth_string)
        elif self._is_ipvliteral(auth_string):
            search_result = self._is_ipvliteral(auth_string)
        else:
            search_result = regexes.REG_NAME_SEARCH_REGEX.search(auth_string)
        self.host = auth_string[search_result.start():search_result.end()]
        # Check for port info
        if len(auth_string) != len(self.host):
            self.port = auth_string[search_result.end() + 1:]

    def _parse_query(self):
        """
        Parse a query string into a query_dict attribute.
        """
        # str.find() returns -1 (truthy) when the separator is absent, so
        # test membership instead of the original truthiness check.
        if '&' in self._query:
            query_array = self._query.split('&')
        elif ';' in self._query:
            query_array = self._query.split(';')
        else:
            query_array = [self._query]
        self.query_dict = {}
        for element in query_array:
            try:
                key, value = element.split('=')
                value = self.percent_decode(value)
            except ValueError:
                key, value = element, None
            key = self.percent_decode(key)
            self.query_dict[key] = value

    @staticmethod
    def _remove_dot_segments(path):
        """
        Removes dot segments from a given path.
        """
        segments = path.split('/')
        compressed_path = []
        for segment in segments:
            if segment == '.':
                pass
            elif segment == '..':
                compressed_path.pop()
            else:
                compressed_path.append(segment)
        return '/'.join(compressed_path)

    def chdir(self, changepath):
        """
        Functions like the UNIX cd or chdir command.

        Args:
            changepath: the subdirectory to change to
        """
        if changepath[0] == '/':
            changepath = changepath[1:]
        if self._path[-1] == '':
            self._path.pop()
        self._path.extend(changepath.split('/'))

    @staticmethod
    def parse_uri(uri_string):
        """
        Parses a given URI using the regex provided in RFC 3986.
        """
        result = regexes.URI_REGEX.match(uri_string).groups()
        scheme, authority, path, query, fragment = \
            [result[i] for i in [1, 3, 4, 6, 8]]
        return URI(path=path, scheme=scheme, authority=authority,
                   query=query, fragment=fragment)
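A short usage sketch of the URI class above (Python 2, matching the class's syntax); the expected behavior in the comments follows from the parsing and normalization rules shown, assuming the companion regexes module is available:

u = URI.parse_uri('HTTP://www.Example.com/a/./b/../c?x=1&y#frag')
print u.host                      # www.Example.com
print u.get_query_arg('x')        # 1 (y maps to None, having no value)
print u.__repr__(normalize=True)  # scheme and host lowercased, path
                                  # compressed to /a/c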