def extract_twold(url):
    """Return the last label of the registered part joined to the suffix.

    ``etld.etld().parse(url)`` is expected to return a two-item sequence,
    unpacked here as (registered part, suffix).  Only the right-most
    dot-separated label of the first item is kept and it is joined to the
    second item with a dot.

    NOTE(review): presumably this turns something like "www.bbc.co.uk"
    into "bbc.co.uk" -- confirm against the etld library in use.

    Any exception raised by the etld parser propagates to the caller.
    """
    parser = etld.etld()
    name_part, public_suffix = parser.parse(url)
    # Everything left of the final dot is sub-domain material; drop it.
    last_label = name_part.rsplit('.', 1)[-1]
    return '.'.join((last_label, public_suffix))
def __split_url(self, url):
    """Split a URL into several pieces and return them as a tuple.

    The user info (http://user:password@domain.com) and the port
    (http://domain.com:8080) are removed from the host before it is
    analysed.  The whole URL is lower-cased first, so every returned
    piece is lower case.  A URL without a "://" separator is treated as
    "http".

    Example: for the URL "http://www.google.com/test/extra/index.html"
    the function returns these pieces:

        protocol:   The protocol used by the URL (in the example, "http").
        domain:     The domain of the URL (in the example, "google.com").
        subdomain:  The subdomain of the URL (in the example, "www"), or
                    None when the parser reports an empty one.
        firstlevel: The first level of the path (in the example, "test"),
                    or None when the path has no first segment.
        extra:      The part of the path after the first level (in the
                    example, "extra/index.html"), or None.

    NOTE(review): ``subdomain`` and ``domain`` are exactly the two values
    returned by ``etld.etld().parse``; confirm that the library in use
    splits hosts the way the example above describes.

    Returns:
        A (protocol, domain, subdomain, firstlevel, extra) tuple, or a
        tuple of five None values when the protocol is neither "http"
        nor "https", when the URL has no host, or when the etld parser
        raises for the host.
    """
    no_result = (None, None, None, None, None)

    url = url.lower()

    # Cut at the FIRST "://" only.  Splitting on every occurrence and
    # keeping piece [1] silently dropped the tail of URLs that embed a
    # second "://" (e.g. "http://a.com/go/http://b.com/x").
    scheme, separator, remainder = url.partition("://")
    if separator:
        protocol = scheme
        url = remainder
    else:
        protocol = "http"

    if protocol not in ("http", "https"):
        return no_result

    parsed_url = urlparse("%s://%s" % (protocol, url))
    domain_string = parsed_url.netloc
    path_string = parsed_url.path
    if not domain_string:
        return no_result

    # The user info ends at the LAST "@": a password may itself contain
    # "@", in which case taking piece [1] returned part of the password.
    domain_string = domain_string.rsplit("@", 1)[-1]
    # Drop the port, if any.
    domain_string = domain_string.split(":", 1)[0]

    etld_object = etld.etld()
    try:
        subdomain, domain = etld_object.parse(
            "%s://%s" % (protocol, domain_string))
    except Exception:
        # Best effort: any parser failure means "not splittable".
        # KeyboardInterrupt / SystemExit are deliberately not swallowed.
        return no_result

    if subdomain == "":
        subdomain = None

    firstlevel = None
    extra = None
    if path_string:
        # A non-empty path starts with "/", so piece 0 is always "".
        path_pieces = path_string.split("/")
        if len(path_pieces) > 1 and path_pieces[1]:
            firstlevel = path_pieces[1]
        if len(path_pieces) > 2 and path_pieces[2]:
            extra = "/".join(path_pieces[2:])

    return (protocol, domain, subdomain, firstlevel, extra)
def extract_twold(hostname):
    """Return the last registered label plus suffix of *hostname*, or None.

    The hostname is stripped of surrounding whitespace and handed to
    ``etld.etld().parse``, whose result is unpacked as
    (registered part, suffix).  The right-most dot-separated label of the
    registered part is joined to the suffix with a dot.

    NOTE(review): presumably "www.bbc.co.uk" yields "bbc.co.uk" -- confirm
    against the etld library in use.

    Returns None when:
        * hostname is None or blank,
        * ``isIP4Address(hostname)`` is true,
        * computing the value raises (a diagnostic line is printed).

    A line reporting the hostname and the computed value is printed on
    success.
    """
    if hostname is None:
        return None
    hostname = hostname.strip()
    if not hostname:
        return None
    if isIP4Address(hostname):
        return None
    try:
        registered, suffix = etld.etld().parse(hostname)
        twold = '.'.join([registered.split('.')[-1], suffix])
        # Single parenthesised argument: prints the same text whether
        # print is a statement (Python 2) or a function.
        print("hostname: %s -- twold: %s" % (hostname, twold))
        return twold
    except Exception:
        # Best effort, but do not swallow KeyboardInterrupt / SystemExit
        # the way a bare "except:" did.
        print("Unable to compute twold: hostname: %s" % (hostname, ))
        return None
def __split_url(self, url):
    """Split a URL into several pieces and return them as a tuple.

    The user info (http://user:password@domain.com) and the port
    (http://domain.com:8080) are removed from the host before it is
    analysed.  The whole URL is lower-cased first, so every returned
    piece is lower case.  A URL without a "://" separator is treated as
    "http".

    Example: for the URL "http://www.google.com/test/extra/index.html"
    the function returns these pieces:

        protocol:   The protocol used by the URL (in the example, "http").
        domain:     The domain of the URL (in the example, "google.com").
        subdomain:  The subdomain of the URL (in the example, "www"), or
                    None when the parser reports an empty one.
        firstlevel: The first level of the path (in the example, "test"),
                    or None when the path has no first segment.
        extra:      The part of the path after the first level (in the
                    example, "extra/index.html"), or None.

    NOTE(review): ``subdomain`` and ``domain`` are exactly the two values
    returned by ``etld.etld().parse``; confirm that the library in use
    splits hosts the way the example above describes.

    Returns:
        A (protocol, domain, subdomain, firstlevel, extra) tuple, or a
        tuple of five None values when the protocol is neither "http"
        nor "https", when the URL has no host, or when the etld parser
        raises for the host.
    """
    no_result = (None, None, None, None, None)

    url = url.lower()

    # Cut at the FIRST "://" only.  Splitting on every occurrence and
    # keeping piece [1] silently dropped the tail of URLs that embed a
    # second "://" (e.g. "http://a.com/go/http://b.com/x").
    scheme, separator, remainder = url.partition("://")
    if separator:
        protocol = scheme
        url = remainder
    else:
        protocol = 'http'

    if protocol not in ("http", "https"):
        return no_result

    parsed_url = urlparse("%s://%s" % (protocol, url))
    domain_string = parsed_url.netloc
    path_string = parsed_url.path
    if not domain_string:
        return no_result

    # The user info ends at the LAST "@": a password may itself contain
    # "@", in which case taking piece [1] returned part of the password.
    domain_string = domain_string.rsplit("@", 1)[-1]
    # Drop the port, if any.
    domain_string = domain_string.split(":", 1)[0]

    etld_object = etld.etld()
    try:
        subdomain, domain = etld_object.parse(
            "%s://%s" % (protocol, domain_string))
    except Exception:
        # Best effort: any parser failure means "not splittable".
        # KeyboardInterrupt / SystemExit are deliberately not swallowed.
        return no_result

    if subdomain == "":
        subdomain = None

    firstlevel = None
    extra = None
    if path_string:
        # A non-empty path starts with "/", so piece 0 is always "".
        path_pieces = path_string.split("/")
        if len(path_pieces) > 1 and path_pieces[1]:
            firstlevel = path_pieces[1]
        if len(path_pieces) > 2 and path_pieces[2]:
            extra = "/".join(path_pieces[2:])

    return (protocol, domain, subdomain, firstlevel, extra)
# NOTE(review): the statement below is the tail of a definition whose `def`
# line lies above this chunk -- presumably the rank_by_count comparator that
# calculateDiffs hands to sort(); confirm against the full file.
    return -(a[1] - b[1])


def calculateDiffs(prevData, data):
    """For each domain in data, calculate the delta between it and the
    domain in prevData.

    Both arguments are mappings of domain -> count.  A domain that is
    missing from prevData is treated as having a previous count of 0.
    Domains that appear only in prevData are not reported.

    Returns a list of (domain, delta) tuples sorted with the
    rank_by_count comparator (presumably largest delta first -- the
    comparator is defined outside this chunk).
    """
    retval = []
    # iteritems() is the Python 2 dict iterator; this code targets Python 2.
    for domain, count in data.iteritems():
        prevValue = prevData.get(domain, 0)
        count -= prevValue
        retval.append((domain, count))
    # cmp-style sort: the comparator is passed positionally (Python 2 only).
    retval.sort(rank_by_count)
    return retval


# Module-level etld parser instance, created once and used by getSLD().
etldService = etld.etld()


def getSLD(domain):
    """Get the "second level domain", e.g. "mozilla.org" or "bbc.co.uk"

    The domain is parsed with the module-level etldService; the last
    dot-separated label of the first returned item is joined to the
    second returned item with a dot.

    If the parser raises etld.EtldException the input is returned
    unchanged.
    """
    try:
        sp = etldService.parse(domain)  # returns ("5.4.bbc", "co.uk")
        # Keep only the right-most label in front of the suffix.
        sld = sp[0].rsplit(".", 1)[-1]
        tld = sp[1]
        return sld + "." + tld
    except etld.EtldException:
        return domain


# Module-level counters, both starting at 0.
# NOTE(review): presumably MX lookup / cache-hit statistics; the code that
# updates them is outside this chunk.
mx_queries = 0
mx_cache_hit = 0
def rank_by_count(a, b):
    """cmp-style comparator that orders (domain, count) pairs by count,
    largest first.

    Returns a negative number when a's count is larger than b's, zero
    when they are equal and a positive number otherwise.  Kept for any
    caller that still sorts with a comparison function.
    """
    return -(a[1] - b[1])


def calculateDiffs(prevData, data):
    """For each domain in data, calculate the delta between it and the
    domain in prevData.

    Both arguments are mappings of domain -> count.  A domain that is
    missing from prevData is treated as having a previous count of 0.
    Domains that appear only in prevData are not reported.

    Returns a list of (domain, delta) tuples ordered by delta, largest
    first; pairs with equal deltas keep the iteration order of data.
    """
    retval = [(domain, count - prevData.get(domain, 0))
              for domain, count in data.items()]
    # Key-based sort gives the same ordering as sorting with
    # rank_by_count (both are stable), but also works where cmp-style
    # sort() is unavailable and when the deltas are floats or longs.
    retval.sort(key=lambda pair: pair[1], reverse=True)
    return retval


# Module-level etld parser instance, created once and used by getSLD().
etldService = etld.etld()


def getSLD(domain):
    """Get the "second level domain", e.g. "mozilla.org" or "bbc.co.uk"

    The domain is parsed with the module-level etldService; the last
    dot-separated label of the first returned item is joined to the
    second returned item with a dot.

    If the parser raises etld.EtldException the input is returned
    unchanged.
    """
    try:
        sp = etldService.parse(domain)  # returns ("5.4.bbc", "co.uk")
    except etld.EtldException:
        return domain
    # Keep only the right-most label in front of the suffix.
    sld = sp[0].rsplit(".", 1)[-1]
    tld = sp[1]
    return sld + "." + tld


# Module-level counters, both starting at 0.
# NOTE(review): presumably MX lookup / cache-hit statistics; the code that
# updates them is outside this chunk.
mx_queries = 0
mx_cache_hit = 0