Example #1
def analyze_url(url):
    """
    Look just at the URL to see if a suitable title text can be found.  This
    method is much faster than actually visiting the URL to find the title
    element in the downloaded file. We want to do this for special sites like
    Facebook, which doesn't allow anonymous downloading of certain pages, like
    group pages.

    Args:
        url: A string that is a URL

    Returns:
        A string that is the title text to be used. If no suitable title text
        can be produced, return the empty string, "".
    """
    try:
        tl = get_tld(url)
    except (tld.exceptions.TldBadUrl, tld.exceptions.TldDomainNotFound):
        logging.debug("bad TLD; trying with http:// prefix")
        try:
            tl = get_tld("http://" + url)
        except (tld.exceptions.TldBadUrl, tld.exceptions.TldDomainNotFound):
            logging.debug("still bad TLD; giving up")
            return ""
    if tl == "facebook.com" and "facebook.com/groups/" in url:
        return "Facebook group page post"
    return ""
Example #2
def current_site_domain(request=None):
    try:
        if request:
            from tld import get_tld
            domain = get_tld('http://' + request.get_host())
        else:
            domain = settings.SUBDOMAIN_BASE_DOMAIN
    except Exception:
        from django.contrib.sites.models import Site
        from tld import get_tld  # imported again here: the import above only runs when `request` is truthy
        try:
            if request:
                d = Site.objects.get_current(request=request).domain
            else:
                d = Site.objects.first().domain
            if d[0:4] != 'http':
                d = 'http://' + d
            domain = get_tld(d)
        except Exception:
            try:
                domain = settings.SUBDOMAIN_BASE_DOMAIN
            except Exception:
                domain = Site.objects.first().domain

    prefix = 'www.'
    if getattr(settings, 'REMOVE_WWW_FROM_DOMAIN', False) \
            and domain.startswith(prefix):
        domain = domain.replace(prefix, '', 1)

    return domain
Example #3
def get_sources_sites(html, sites):
    """ (str, list of str) -> list of [str, str]
    Searches and returns links redirected to sites within the html
    links will be storing the whole url and the domain name used for searching.
    Returns empty list if none found

    Keyword arguments:
    html                -- string of html
    sites               -- list of site urls to look for
    """
    result_urls_matched = []
    result_urls_unmatched = []
    # Format the site to assure only the domain name for searching
    formatted_sites = []

    for site in sites:
        formatted_sites.append(tld.get_tld(site))

    for url in re.findall(
            "href=[\"\'][^\"\']*?.*?[^\"\']*?[\"\']", html, re.IGNORECASE):
        try:
            domain = tld.get_tld(url[6:-1])
        except (KeyboardInterrupt, SystemExit):
            raise
        except:
            continue
        if domain in formatted_sites:
            # If it matches even once, append the site to the list
            result_urls_matched.append([url[6:-1], domain])
        else:
            result_urls_unmatched.append([url[6:-1], domain])

    # Return the list
    return [result_urls_matched,result_urls_unmatched]
Example #4
def get_source_sites(urls, sites):
    """ (status, list of str) -> list of str
    Searches and returns links redirected to sites within the urls
    of the tweet
    Returns empty list if none found

    Keyword arguments:
    tweet           -- Status structure to be searched through
    sites           -- List of site urls to look for
    """
    # store_all = configuration()['storage']['store_all_sources']

    result_urls_matched = []
    result_urls_unmatched = []
    formatted_sites = []

    for site in sites:
        formatted_sites.append(tld.get_tld(site))

    for url in urls:
        try:
            real_url = requests.get(url["expanded_url"], timeout=10).url
            domain = tld.get_tld(real_url)
        except:
            continue
        if domain in formatted_sites:
            # If it matches even once, append the site to the list
            result_urls_matched.append([real_url, domain])
        else:
            result_urls_unmatched.append([real_url, domain])

    # Return the list
    return [result_urls_matched, result_urls_unmatched]
Example #5
def get_sources_sites(article, sites):
    """ (str, list of str) -> list of [str, str]
    Searches and returns links redirected to sites within the html
    links will be storing the whole url and the domain name used for searching.
    Returns empty list if none found

    Keyword arguments:
    html                -- string of html
    sites               -- list of site urls to look for
    """
    result_urls_matched = []
    result_urls_unmatched = []
    # Format the site to assure only the domain name for searching
    formatted_sites = set()

    for site in sites:
        formatted_sites.add(tld.get_tld(site))

    for url in article.get_links(article_text_links_only=True):
        try:
            domain = tld.get_tld(url.href)
        #apparently they don't inherit a common class so I have to hard code it
        except (tld.exceptions.TldBadUrl, tld.exceptions.TldDomainNotFound, tld.exceptions.TldIOError):
            continue
        if domain in formatted_sites:
            # If it matches even once, append the site to the list
            result_urls_matched.append([url.href, domain, url.text])
        else:
            result_urls_unmatched.append([url.href, domain, url.text])

    # Return the list
    return [result_urls_matched,result_urls_unmatched]
Example #6
    def _is_valid_link(self, link):
        """
        Return True if given link is non document, Since this is not a perfect way to check
        but it avoids a call to server. 
        """

        # Check ONLY_ROOTDOMAIN

        scheme, netloc, path, params, query, fragment = urlparse(link)

        try:
            if get_tld(self.base_url) == get_tld(link) and not ONLY_ROOTDOMAIN:
            # if get_tld(self.base_url) == get_tld(link):
                return False
        except Exception as e:
            log.error(str(e), self.base_url, link)


        # Need to add more
        DOC_EXT = [".pdf", ".xmls", ".docx", ".odt"]

        try:

            urlPath = [i for i in (path.split('/')) if i]

            file_name = urlPath[-1]
            ext = file_name.split(".")[-1]
        except IndexError:
            # It's just a root URL
            return True
        return ext not in DOC_EXT
Example #7
 def parse(self, response):
     domain = get_tld(response.url)
     items = []
     url_result = urlparse(response.url)
     top_domain = url_result.scheme + '://'+url_result.netloc
     
     for sel in response.xpath('//a/@href'):
         item = LinkscrawlItem()
         link = sel.extract()
         if link.find("http://") == 0 or link.find("https://") == 0 or link.find("www.") == 0:
             try:
                 target_domain = get_tld(link)
                 #print domain +"==================="+target_domain +"==================" + link
                 if domain != target_domain:
                     item['link'] = link
                     item['source'] = top_domain
                     yield item
                     #items.append(item)
                 else:
                     yield scrapy.Request(link,callback=self.parse)
             except:
                 print "The url can't get the domain. Ignored..." + link
                 
         if link.startswith('/'):
             yield scrapy.Request(top_domain+link, callback=self.parse)         
Example #8
def get_source_sites(urls, sites):
    """ (list of urls, list of source site urls)
    Return a list of expanded urls found in source urls,
    and a list of expanded urls not found in srouce urls.
    """
    result_urls_matched = []
    result_urls_unmatched = []
    formatted_source_sites = []
    for site in sites:
        formatted_source_sites.append(tld.get_tld(site))
    for url in urls:
        try:
            # with eventlet, the request will time out in 10s
            with eventlet.Timeout(10):
                real_url = requests.get(url, timeout=10).url
            domain = tld.get_tld(real_url)
        except:
            continue
        if domain in formatted_source_sites:
            # If it matches even once, append the site to the list
            result_urls_matched.append([real_url, domain])
        else:
            result_urls_unmatched.append([real_url, domain])

    return [result_urls_matched,result_urls_unmatched]
Example #9
def valid_domain(domain):
    """This function return True if the passed domain is valid, false otherwise"""
    try:
        get_tld(domain,fix_protocol=True)
        return True
    except:
        return False
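A small hedged sketch of the fix_protocol flag used above (the candidate strings are illustrative):

from tld import get_tld
from tld.exceptions import TldBadUrl, TldDomainNotFound

for candidate in ("example.co.uk", "sub.example.com", "not a domain"):
    try:
        # fix_protocol=True prepends a scheme so bare host names parse.
        # On tld >= 0.9 this prints the public suffix ("co.uk", "com");
        # older releases returned the registered domain instead.
        print(candidate, "->", get_tld(candidate, fix_protocol=True))
    except (TldBadUrl, TldDomainNotFound):
        print(candidate, "-> invalid")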
Example #10
def crawl(n):
	i=0
	seed = []
	db = MySQLdb.connect(host='cspp53001.cs.uchicago.edu',db='jcbraunDB',user='******',passwd='3312crystal')
	cursor = db.cursor()
	outLinks = []
	if (n ==0):
		execString = ("SELECT URL, Domain FROM safeSeed WHERE crawled=0;") 
		cursor.execute(execString)
		seedx = cursor.fetchall()
		
	else:
		execString = ("SELECT URLTo FROM safeOutboundLinks WHERE lvl=%i;" % (n)) 
		cursor.execute(execString)
		seedx = cursor.fetchall()
		print seedx

	for row in seedx:
		i = i+1
		try:
			url = row[0]
			print url
			domain = get_tld(url, fail_silently=True)
			content = urllib2.urlopen(url, timeout=3).read(2000000)
			for k in re.findall('''href=["'](.[^"']+)["']''', content):			
				z = ((re.match('http://' , k) is not None) or (re.match('//' , k) is not None))
				y = re.match('/' , k)
				if (y):
					k = (("/").join((re.split("/", url)))+k)			
				if z or y:
					domainTo = (get_tld(k, fail_silently=True))
					print "domainTo is: %s" %domainTo
					reqURL = "https://sb-ssl.google.com/safebrowsing/api/lookup?client=f4p&key=AIzaSyCD0pNAG-6HVh_W6udGYZFz-2_p0yHDD5k&appver=31&pver=3.1&url=" + k
					response = urllib2.urlopen(reqURL).getcode()
					if (response==200):
						print ("Found dangerous site \n")
						bad = 1
						execString = ("INSERT INTO inboundLinks (Domain, domainTo, URL, URLto, Crawled) VALUES ('%s', '%s', '%s', '%s', 'false');" % (domain, domainTo, url, k))
						cursor.execute(execString)
					else:
						bad = 0
						execString = ("INSERT INTO safeOutboundLinks (Lvl, Domain, domainTo, URL, URLto, Crawled, toSpam) VALUES ('%i', '%s', '%s', '%s', '%s', '0', '%i');" % ((n+1), domain, domainTo, url, k, bad))
						cursor.execute(execString)
						print("adding %s" %k)
					db.commit()	
			bank = open('notspam/%d.txt' %i, 'w')
			bank.write (content)
			content=db.escape_string(content)
			execString = ("INSERT INTO safeContent (Lvl, Content, Domain, URL, CopySource) VALUES ('%i', '%s', '%s', '%s', 'crawl');" % ((n+1), content, domain, url)) 
			cursor.execute(execString)
			print url + " success! \n"
			bank.close()
			
		except Exception as e:
			print ("Broken link to %s" %url)	
			print (type(e))
			print (e.args)
	db.commit()
	db.close()
Example #11
def valid_a_href(a_elements, main_url=None):
    hrefs = [a.get('href') for a in a_elements]
    hrefs = [link for link in hrefs if link and link.startswith('http://')]
    if main_url:
        main_tld = get_tld(main_url, fail_silently=True)
        hrefs = [link for link in hrefs if get_tld(link, fail_silently=True) == main_tld]

    return hrefs
Example #12
 def _from_same_site(self, ads_host, ads_target):
     if ads_target is None:
         return True
     if not ads_target.startswith("http"):
         return True
     ads_host_domain = get_tld(ads_host, as_object=True).domain
     ads_target_domain = get_tld(ads_target, as_object=True).domain
     return ads_host_domain == ads_target_domain
Example #13
	def bulk_insert_urls(self, content):
		logger.debug("in bulk insert urls")

		for line in content:

			items = line.split()

			if len(items) < 9:
				logger.error("error parsing line")
				logger.error(line)
			else:
				if ("http" in items[8]  and "//" in items[8]):
					parts  = items[8].split("//")[1].split("/")

					domain = parts[0]
					res = get_tld(items[8], as_object=True, fail_silently=True)

					if res is not None:
						tld = "%s.%s" % (res.domain, res.suffix)
					else:
						tld = parts[0]
					path = ""
					if len(parts) > 0:
						path = "".join(parts[1:])

					url = {'ts':items[2].split(".")[0], 'host':items[4], 'tld':tld, 'domain':domain, 'path': path}
					try:
						logger.debug("inserting %s %s %s" % (url['ts'], url['host'], url['tld']))
						self.conn.execute("INSERT INTO URLS(ts, host, tld, domain, path, datasource) VALUES(?,?,?,?,?,?)", (url['ts'], url['host'],url['tld'], url['domain'], url['path'], 'squid'))
					except Exception, e:
						logger.error("error inserting url %s" % str(url))
Example #14
def getDomainTLD(ip="0.0.0.0"):
	if ip == "0.0.0.0" or ip == "" or ip == "\n":
		return

	global content
	global counter
	counter = counter +1
	print str(counter) + ":"
	try:
		t = threading.Thread(target=runGetDNSByIP, args=(ip,))
		#threads.append(t)	
		#domain = str(socket.gethostbyaddr(ip)[0])
		t.start()
		success = event.wait(5.0)
		print "{%s}  -  [%s]" % (ip, domain)
		if domain == "":
			content = content + "\n"
		else:
			content = content + get_tld("http://"+domain) + "\n"
		#lock.acquire()
		#writeFile(ip+","+get_tld("http://"+domain) + "\n", outputFile, True)
		#c.acquire()
		#print ip+","+get_tld("http://"+domain) + "\n"
		#content = content + ip+",,,,,,,,,,,," + get_tld("http://"+domain) + "\n"
		#lock.release()
		#return get_tld("http://"+domain)
	except Exception:
		#lock.acquire()
		#writeFile("\n", outputFile, True)
		#content = content + "\n"
		print ":::ERROR::: Thread {"+str(counter)+"} failed to create. IP:" + ip + "\n"
Example #15
    def __init__(self,
                 tbb_path="",
                 tor_cfg=cm.LAUNCH_NEW_TBB_TOR,
                 tbb_fx_binary_path="",
                 tbb_profile_path="",
                 tbb_logfile_path="",
                 pref_dict={},
                 socks_port=None,
                 virt_display=cm.DEFAULT_XVFB_WINDOW_SIZE,
                 canvas_exceptions=[]):

        self.check_tbb_paths(tbb_path, tbb_fx_binary_path, tbb_profile_path)
        self.tor_cfg = tor_cfg
        self.canvas_exceptions = [get_tld(url) for url in canvas_exceptions]
        self.setup_virtual_display(virt_display)
        self.profile = webdriver.FirefoxProfile(self.tbb_profile_path)
        add_canvas_permission(self.profile.path, self.canvas_exceptions)
        if socks_port is None:
            if tor_cfg == cm.USE_SYSTEM_TOR:
                socks_port = cm.DEFAULT_SOCKS_PORT  # 9050
            else:
                socks_port = cm.TBB_SOCKS_PORT  # 9150
        self.socks_port = socks_port
        self.update_prefs(pref_dict)
        self.setup_capabilities()
        self.export_lib_path()
        self.binary = self.get_tbb_binary(logfile=tbb_logfile_path)
        super(TorBrowserDriver, self).__init__(firefox_profile=self.profile,
                                               firefox_binary=self.binary,
                                               capabilities=self.capabilities,
                                               # default timeout is 30
                                               timeout=60)
        self.is_running = True
Example #16
    def get_tld_locations(self,json_data):
        url_location_dictionary = {}
    
        url_count = 0
        for article in json_data:
            for url in json_data[article]:
                url_count += 1
                #split url and get suffix
                try:
                    res = get_tld(url, as_object=True)
                except TldDomainNotFound:
                    print "tld not found: "+ str(url)
                    continue
                except TldBadUrl:
                    print "bad url: "+ str(url)
                    continue
                tld = res.suffix
                # if the suffix has multiple parts (e.g. 'co.uk'), keep only the last part; the dot is added back in front for the IANA lookup below
                if (tld.count(".") > 0):
                    results = re.split(r'\.', tld)
                    tld = results[len(results)-1]
                tld ="."+tld
                
                #print tld
                try:
                    if (self.iana[tld] == 'country-code'):
                        for w in self.wfb:
                            if w.tld == tld:
                                url_location_dictionary[url]= w.iso2c
                    else:
                        url_location_dictionary[url]= tld.replace(".",'').upper() 
                except KeyError:
                    print "no entry found for: "+ str(tld)

        return url_location_dictionary
Example #17
    def _parse(self, dataset):
        """
        Parse data to calculate and sort about news
        :param dataset: list, list of dict that query date as key, value is tuple, including url, cat and pub_date on news
        :return: sorted dataset, type like as: [{'20160420': {...}}, ...]
        """
        total_data = []
        dt_dist = 'dt_dist'  # Key to news publish datetime distribute

        for dict_to_date in dataset:
            for query_date, data_list in dict_to_date.iteritems():
                # `query_date` is date string, `data_list` is list
                result = defaultdict(lambda: defaultdict(int))
                for uri, cat, dt in data_list:
                    try:
                        domain = get_tld(uri, as_object=True).domain
                        result[domain][cat] += 1
                        result[domain]['count'] += 1
                        result[domain].setdefault(dt_dist, []).append(dt)
                    except Exception:
                        pass

                for site_domain in result.keys():
                    result[site_domain][dt_dist].sort()
                    result['total_count'] = result.get('total_count', 0) + len(result[site_domain][dt_dist])
                total_data.append({query_date: result})
        return total_data
Example #18
def auto_select_target(target, output=None):
    """Auto selection logic"""
    print "Target: %s" % target
    try:
        inp=IPAddress(target);
        if inp.is_private() or inp.is_loopback():
            print "Internal IP Detected : Skipping"
            sys.exit()
        else:
            print "Looks like an IP, running ipOsint...\n"
            ipOsint.run(target, output)
    except SystemExit:
        print "exiting"
    except AddrFormatError:
        if re.match('[^@]+@[^@]+\.[^@]+', target):
            print "Looks like an EMAIL, running emailOsint...\n"
            emailOsint.run(target, output)
        elif get_tld(target, fix_protocol=True,fail_silently=True) is not None:
            print "Looks like a DOMAIN, running domainOsint...\n"
            domainOsint.run(target, output)
        else:
            print "Nothing Matched assuming username, running usernameOsint...\n"
            usernameOsint.run(target, output)
    except:
        print "Unknown Error Occured"
Example #19
def get_link_text(url, mime_type, data=None):
    '''
    Take URL, MIME type, and optional data to produce the link text.
    '''
    tld = get_tld(url)
    result = "File on " + tld
    if mime_type.startswith("image"):
        result = "Image on " + tld
    elif mime_type == "application/pdf":
        result = "PDF on " + tld
    elif "text/html" in mime_type:
        try:
            soup = BeautifulSoup(data, 'html.parser')
            meta = soup.find_all("meta")
            possible = [i.get("content") for i in meta if i.get("property") == "og:title"]
            if possible:
                result = possible[0].strip()
            elif soup.title.string:
                result = messy_title_parse(soup.title.string)
            else:
                result = "Page on " + tld
        except AttributeError:
            # Probably just empty title when trying to get
            # soup.title.string
            result = "Page on " + tld
    if len(result) > 255:
        result = result[:253] + " …"

    return result
Example #20
def process_arguments():
    if len(sys.argv) != 2:
        print_error('Invalid number of arguments.')
    try:
        lines = []
        count = 0
        with open(sys.argv[1], 'r') as f:
            for line in f:
                if count == 0:
                    doc_pair = {}
                elif count == len(prefixes):
                    count = 0
                    lines.append(doc_pair)
                    continue
                output = match_line(prefixes[count] + ': ', line.rstrip('\n'))
                doc_pair[prefixes[count]] = output
                count += 1
    except IOError:
        print_error('Unable to read file \'{}\'.'.format(sys.argv[1]))
    for line in lines:
        line['FUZZY SRC'] = get_longest_ascii(line['SRC TXT'])
        line['FUZZY TGT'] = get_longest_ascii(line['TGT TXT'])
        try:
            domain = tld.get_tld(line['SRC URL'])
        except tld.exceptions.TldDomainNotFound:
            domain = None
        line['DOMAIN'] = domain
    return lines
Example #21
def get_monthly_archive_urls(links, page_url):
    '''Scans the provided links for blogspot style archives which are of
    the form website.com/yyyy_dd_01_archive.html'''
    domain = get_tld(page_url)
    monthly_archive_urls = []
    for link in links:
        # Try for drop down lists using <option value="url.com/...">
        try:
            url = link.attrs['value']
            match = re.search(re.escape(domain) + r"/\d{4}_\d{2}_01_archive\.html", url)
            if match:
                monthly_archive_urls.append(url)

        except KeyError:
            pass

        # Try for actual <a href="url.com/..." > links
        try:
            url = link.attrs['href']
            match = re.search(re.escape(domain) + r"/\d{4}_\d{2}_01_archive\.html", url)
            if match:
                monthly_archive_urls.append(url)

        except KeyError:
            pass

    return list(set(monthly_archive_urls))
Example #22
def monitor(message, context):
    """certstream events callback handler"""

    all_domains = ""
    if message['message_type'] == "heartbeat":
        return

    if message["message_type"] == "certificate_update":
        all_domains = message["data"]["leaf_cert"]["all_domains"]
        
    for domain in set(all_domains):
        PBAR.update(1)

        # all magic happens here
        try:
            if domain.count(".") > 1 and not domain.startswith("*.") and not re.search("\d$", domain) and "cloudflaressl" not in domain and "xn--" not in domain and not domain.endswith("local"):
                tld = get_tld(domain, as_object=True, fail_silently=True, fix_protocol=True)
                if tld is not None and tld.tld in BOUNTY_LIST and tld.tld != domain and tld.subdomain != "www":
                    if check_subdomain_not_known_in_db(domain):
                        update_subdomain(domain, "N")
                        MONITOR_QUEUE.put(domain)
        except Exception as e:
            logging.exception("message")
            print (domain)

    t.sleep(.1)
Example #23
 def get_domain_pubdate(file_path, yesterday):
     with open(file_path) as fp:
         url, pub_date = fp.readlines()[:2]
         if int(pub_date.strip()) and pub_date.startswith(yesterday):
             domain = tld.get_tld(url.strip(), as_object=True).domain
             return domain, pub_date.strip()
     return None, None
Example #24
    def run(self, target):

        domainname = get_tld("http://" + target)

        whoisdomaincmd = "whois " + domainname + " > whois-domain-" + domainname + ".txt"
        print "Whois DOMAIN lookup cmd: " + whoisdomaincmd
        print commands.getoutput(whoisdomaincmd)
Example #25
    def scrap(self, url):
        self.external_domain = "http://"+get_tld(url)
        response = requests.get(url)
        soup = bs4.BeautifulSoup(response.text)

        self._description(soup)
        self._get_episodes(soup)
Example #26
	def bulk_insert_urls(self, content):
		
		for line in content:

			items = line.split()
			
			
			if len(items) < 9:
				logger.error("error parsing line")
				logger.error(line)
			else:
				if ("http" in items[8]  and "//" in items[8]):
					parts  = items[8].split("//")[1].split("/")

					domain = parts[0]	
					res = get_tld(items[8], as_object=True, fail_silently=True)

					if res is not None:
						tld = "%s.%s" % (res.domain, res.suffix)
					else:
						tld = parts[0]
						
					
					path = ""
					if len(parts) > 0:
						path = "".join(parts[1:])
					
					#sometimes dest can just be a '-', need to set it to a valid host so postgres does not barf
					if items[11].split("/")[1].strip() == "-":
						dest = "0.0.0.0"
					else:
						dest = items[11].split("/")[1]
					
					#url = {'ts':items[2].split(".")[0], 'host':items[4], 'tld':tld, 'domain':domain, 'path': path}
					url = {'ts':items[2].replace(".",""), 
							'host':items[4], 
							'tld':tld, 
							'domain':domain, 
							'path': path, 
							'verb':items[7],
							'clength':items[6],
							'statuscode':items[5].split("/")[1],
							'dest':dest,
							'contenttype':items[12],
						}
					
					try:
						#print "inserting %s %s %s %s %s %s" % (url['ts'], url['host'],url['tld'], url['domain'], url['path'], 'squid')
						sql = "SELECT deviceid from vpnips WHERE ip=%s"
						data = (url['host'],)
						self.cur.execute(sql,data)
						deviceid =  self.cur.fetchone()
						
						
						sql = "INSERT INTO http3 (id, httpverb, httpverbparam, httpstatuscode, httphost, contenttype, contentlength, src, dest, timestamp) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
						data = (deviceid,url['verb'],url['path'], url['statuscode'],url['tld'], url['contenttype'], url['clength'], url['host'], url['dest'], url['ts'])			
						self.cur.execute(sql,data)
					except Exception, e:
						logger.error(e)
						logger.error("error inserting url %s" % str(url))
Example #27
    def _record_time(self, request):
        if hasattr(request, '_start_time'):
            ms = int((time.time() - request._start_time) * 1000)
            if request.is_ajax():
                is_ajax = True
            else:
                is_ajax = False

            is_authenticated = False
            is_staff = False
            is_superuser = False
            if is_user_authenticated(request.user):
                is_authenticated = True
                if request.user.is_staff:
                    is_staff = True
                if request.user.is_superuser:
                    is_superuser = True

            referer = request.META.get('HTTP_REFERER')
            referer_tld = None
            referer_tld_string = ''
            if referer:
                try:
                    referer_tld = get_tld(referer, as_object=True)
                except (TldBadUrl, TldDomainNotFound, TldIOError):
                    pass
            if referer_tld:
                referer_tld_string = referer_tld.tld

            url = request.get_full_path()
            url_query = parse.parse_qs(parse.urlparse(url).query)

            # This allows you to measure click rates for ad-campaigns, just
            # make sure that your ads have `?campaign=something` in the URL
            campaign_keyword = getattr(
                settings, 'INFLUXDB_METRICS_CAMPAIGN_KEYWORD', 'campaign')
            campaign = ''
            if campaign_keyword in url_query:
                campaign = url_query[campaign_keyword][0]

            data = [{
                'measurement': 'django_request',
                'tags': {
                    'host': settings.INFLUXDB_TAGS_HOST,
                    'is_ajax': is_ajax,
                    'is_authenticated': is_authenticated,
                    'is_staff': is_staff,
                    'is_superuser': is_superuser,
                    'method': request.method,
                    'module': request._view_module,
                    'view': request._view_name,
                    'referer': referer,
                    'referer_tld': referer_tld_string,
                    'full_path': url,
                    'path': request.path,
                    'campaign': campaign,
                },
                'fields': {'value': ms, },
            }]
            write_points(data)
Example #28
File: iRecon.py Project: xpn/iRecon
	def run(self, target):
		
		domainname = get_tld("http://"+target)
		
		whoisdomaincmd='whois ' + domainname + ' > whois-domain-' + domainname + '.txt'
		print "Whois DOMAIN lookup cmd: " + whoisdomaincmd
		print commands.getoutput(whoisdomaincmd)
Example #29
 def extract_tld(self, url):
     try:
         return get_tld(url)
     except:
         traceback.print_exc()
         print "\n\nInvalid url: %s" % url
         return url
Example #30
    def __init__(self, entry, site_hash=None ):
        self.request = entry['request']
        self.response = entry['response']

        self.result = { 'url': None, 
                'url_sha512': None,
                'ip': None, 
                'vhost': None, 
                'tld': None, 
                'ip_country': None, 
                'content_sha512': None, 
                'content_sha256': None, 
                'content_type': None , 
                'content_encoding': None,
                'content': None,
                'in_alexa': False,
                'http_status': None,
                'redirect_url': None
                }

        self.result['url'] = self.request['url']
        self.result['url_sha512'] = sha512(self.result['url']).hexdigest()

        try:
            self.result['tld'] = get_tld(self.result['url'])
        except TldDomainNotFound:
            pass

        if 'serverIPAddress' in entry: 
            self.result['ip'] = entry['serverIPAddress']

        for header in self.request['headers']:
            if header['name'] == 'Host':
                self.result['vhost'] = header['value']
Example #31
 def parse(self, response):
     print "[$$$]  " + response.url
     domain = response.url
     # print '-' * 50
     # print 'response:  ' + domain
     # print "-" * 50
     urls = []
     item = SpidersqliItem()
     item['host'] = domain
     soup = BeautifulSoup(response.body, 'lxml')
     links = soup.findAll('a')
     for link in links:
          # got the target url, but it still needs processing
         _url = link.get('href')
         # print "-" * 50
         # print type(_url)
         # print _url
         # print "-" * 50
          # next, evaluate and filter it
          # first check whether it is None or starts with meaningless characters
          # then check the URL suffix and skip file types that are not crawled
         if _url is None or re.match(
                 '^(javascript|:;|#)', _url
         ) or re.match(
                 '.(jpg|png|bmp|mp3|wma|wmv|gz|zip|rar|iso|pdf|txt|db)$',
                 _url):
             continue
          # then check whether it starts with http|https; for those, verify it belongs to this site, since we do not crawl beyond the site
         if '=' in _url and '?' in _url:
             if re.match('^(http|https)', _url):
                 if get_tld(_url) in domain:
                     urls.append(_url)
             else:
                 urls.append(domain + '/' + _url)
     # for url_ in urls:
     #     yield scrapy.Request(url_)
     # yield scrapy.Request(url_ for url_ in urls)
     item['url'] = urls
     # print "=" * 50
     # print item
     # print "=" * 50
     yield item
Example #32
        def fetch_cert(cert_hash, search_domain, result_dict):
            """Fetches a certificate by hash and adds it to `result_dict` if it hasn't been seen before."""
            cert_response = self._cert(cert_hash)

            for cert_domain in cert_response.result.domains:
                cert_domain = cert_domain.lower()
                parsed_domain = tld.get_tld(cert_domain,
                                            as_object=True,
                                            fail_silently=True,
                                            fix_protocol=True)
                if not parsed_domain:
                    continue

                if parsed_domain.tld.lower() != search_domain.lower():
                    continue

                if cert_domain not in result_dict:
                    self.logger.debug(
                        "Adding new subdomain: {}".format(cert_domain))
                    result_dict[cert_domain] = cert_response
Example #33
def extractLinks(url, docRoot):
    """ Extract all Links from beautifulSoup document """
    allLinks = []
    section = docRoot.find_all("a")
    rootTLDObj = get_tld(url, as_object=True)

    if len(section) > 0:
        for tag in section:
            if tag.name == "a" and "href" in tag.attrs.keys():
                linkValue = tag['href']
                if linkValue.startswith('/'):
                    allLinks.append(
                                    rootTLDObj.parsed_url.scheme
                                    + '://'
                                    + rootTLDObj.parsed_url.netloc
                                    + linkValue)
                elif linkValue.startswith("javascript:") is False:
                    allLinks.append(linkValue)

    return(allLinks)
Example #34
def parse_url(url):
    """parse url"""
    if not is_string(url):
        return _get_default_parsed_url(EMPTY_STRING)

    parsed = get_tld(url, as_object=True, fail_silently=True)
    if not parsed:
        return _get_default_parsed_url(url)

    path, query, fragment = [getattr(parsed.parsed_url, key, "") for key in ["path", "query", "fragment"]]

    return {
        "url": url,
        "path": path,
        "query": query,
        "fragment": fragment,
        "domain": parsed.domain,
        "subdomain": parsed.subdomain,
        "topLevelDomain": parsed.tld
    }
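For reference, a rough sketch of the result-object attributes used in this and other examples (the URL is illustrative; attribute names are those of tld >= 0.9, where older releases exposed the registered domain as .tld and the public suffix as .suffix):

from tld import get_tld

res = get_tld("https://blog.example.co.uk/post?id=1", as_object=True, fail_silently=True)
if res is not None:
    print(res.subdomain)        # "blog"
    print(res.domain)           # "example"
    print(res.tld)              # "co.uk"  (public suffix)
    print(res.fld)              # "example.co.uk"  (first-level / registered domain)
    print(res.parsed_url.path)  # "/post"
    print(res.parsed_url.query) # "id=1"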
Example #35
    def remove_letters(self):
        '''
            :return: list of candidate domains, each generated by removing one letter from the original domain
        '''

        url = get_tld(self.url, as_object=True, fix_protocol=True)
        domain = url.domain
        if len(domain) < 4:
            print(
                "With domains with less than 4 letters, this check is not accurate\n"
            )
        new_urls_without_letter = []
        n = 0
        m = len(domain)
        while n < m:
            new_domain = domain[0:n] + domain[n + 1:m]
            n = n + 1
            new_urls_without_letter.append(new_domain)
        new_urls_list = list(set(new_urls_without_letter))
        return new_urls_list
Example #36
    def __init__(self, root_url):
        self.root_url = root_url
        self.domain = get_tld(root_url)

        self.links_await = set()  # urls waiting to be crawled
        self.links_visited = set()  # urls already crawled
        self.attachment_saved = set()  # attachment urls already saved
        self.links_all_dict = {}  # all links stored as a dict {url: title}, attachments excluded

        self.s = requests.Session()
        retries = Retry(total=3,
                        backoff_factor=0.1,
                        status_forcelist=[404, 500, 502, 503, 504])
        self.s.mount('http://', HTTPAdapter(max_retries=retries))

        # initialize the database and the attachment save directory
        init_db()
        self.target_dir = os.path.join(TARGET_DIR, self.domain)
        if not os.path.exists(self.target_dir):
            os.mkdir(self.target_dir)
Example #37
def derive_account_campaign(tid, title, url):

    if title in _titles:
        return _titles[title]

    if url in _target_urls:
        return _target_urls[url]

    if tid in _ids:
        return _ids[tid]

    for pattern, rval in _target_url_in:
        if pattern in url:
            return rval

    tld = get_tld(url)

    global insane_identity_counter
    insane_identity_counter += 1
    return tld, "%s/%s" % (tld, insane_identity_counter)
Example #38
    def checkReputation(url):
        reputation = ''
        try:
            res = tld.get_tld(url, as_object=True)
            hostname = res.domain

            website_list = []
            with open(
                    'C:\\Users\\PRRAI\\PycharmProjects\\portfolio\\jobs\\names.csv',
                    'r') as f:
                website_list = f.readlines()
            r = re.compile('^.*' + hostname + '.*$')
            newList = list(filter(r.match, website_list))
            if len(newList) != 0:
                reputation = 'Malicious'
            else:
                reputation = 'Safe'
        except:
            reputation = 'Could not fetch'
        return reputation
Example #39
    def domain(self) -> str:
        if self._domain is not None:
            return self._domain

        if len(self) == 0:
            self._domain = ''
            return ''

        tld_result = tld.get_tld(self.lower(), fail_silently=True, as_object=True, fix_protocol=True)

        if tld_result is None:
            self._domain = ''
            return ''

        if tld_result.tld in {'in-addr.arpa', 'ip6.arpa'}:
            self._domain = tld_result.tld
            return tld_result.tld

        self._domain = tld_result.fld
        return tld_result.fld
Example #40
def get_primary_zones(logger, zones):
    """
    The whois_lookups script is not reliable for international zones
    Therefore, we want to trim the list to ones that we know work.
    This is the more traditional (.com, .net, .org, etc.)
    This list of trust will be expanded over time.
    """
    supported_tld_list = ["com", "net", "org"]
    new_zones = []
    for zone in zones:
        try:
            tld = get_tld(zone, fix_protocol=True)
        except:
            logger.warning(zone + " was not compatible with TLD")
            continue

        if tld in supported_tld_list:
            new_zones.append(zone)

    return (new_zones)
Example #41
def get_domain(url):
    url = url_normalize(url)
    if url is None:
        return None, None

    flag = True
    try:
        res = get_tld(url, fail_silently=True, as_object=True)
        domain = res.tld
        subdomain = res.subdomain
    except:
        return None, None

    if subdomain != "www" and subdomain != "m" and subdomain != "":
        flag = False
    else:
        s = urlsplit(url, allow_fragments=False)
        if s.query != '' or s.path != '':
            flag = False
    return flag, domain
Example #42
def lpDL(url):
	bu = tld.get_tld(url, as_object=True)
	baseurl = bu.fld
	page = requests.get(url)
	s = bs4.BeautifulSoup(page.content,'html.parser')
	tag = s.find_all('a')
	for t in tag:
		try:
			if t['href'].split("-")[-2] == 'Bukkit' and t['href'].split("-")[-1][-4:].lower() == '.jar' :
				req = urllib.request.Request(url+t['href'], headers={'User-Agent' : "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:83.0) Gecko/20100101 Firefox/83.0"})
				g = urllib.request.urlopen(req)
				with open('mods/'+t['href'].split("/")[-1].rstrip(), 'b+w') as f:
					f.write(g.read())
					modlocations.append('mods/'+t['href'].split("/")[-1].rstrip())
					f.close()
				break
		except KeyError:
			pass
		except IndexError:
			pass
Example #43
def passive_query(hostx, key):
    keywords = get_tld(hostx.primary_domain,
                       as_object=True,
                       fail_silently=True,
                       fix_protocol=True).domain
    # print(keywords)
    par = {'access_token': key, 'keywords': keywords}
    try:
        response = requests.get(
            "https://buckets.grayhatwarfare.com/api/v1/buckets",
            params=par,
            timeout=4)
        gwf_api = response.json()

        if gwf_api["buckets_count"] > 0:
            try:
                for bucket in gwf_api["buckets"]:
                    # print(bucket["bucket"])
                    hostx.buckets.append(bucket["bucket"])
            except:
                pass

    except:
        cprint("error", "[*] Error: connecting with GrayHatWarfare API", 1)

    par = {'access_token': key, 'keywords': hostx.orgName}
    try:
        response = requests.get(
            "https://buckets.grayhatwarfare.com/api/v1/buckets",
            params=par,
            timeout=4)
        gwf_api = response.json()
        if gwf_api["buckets_count"] > 0:
            try:
                for bucket in gwf_api["buckets"]:
                    hostx.buckets.append(bucket["bucket"])
            except:
                pass

    except:
        cprint("error", "[*] Error: connecting with GrayHatWarfare API", 1)
Example #44
def get_all_website_links(url):
    """
    Returns all URLs that is found on `url` in which it belongs to the same website
    """
    # all URLs of `url`
    urls = set()
    # domain name of the URL without the protocol
    domain_name = urlparse(url).netloc
    soup = BeautifulSoup(
        requests.get(url, verify=False, timeout=3.05).content, "html.parser")
    for a_tag in soup.findAll("a"):
        href = a_tag.attrs.get("href")
        if href == "" or href is None:
            # href empty tag
            continue
        # join the URL if it's relative (not absolute link)
        href = urljoin(url, href)
        parsed_href = urlparse(href)
        # remove URL GET parameters, URL fragments, etc.
        href = parsed_href.scheme + "://" + parsed_href.netloc + parsed_href.path
        if not is_valid(href):
            # not a valid URL
            continue
        #if href in internal_urls:
        # already in the set
        #continue
        if domain_name not in href:
            # external link
            try:
                topLevel = get_tld(href)
                if topLevel != "mil" and topLevel != "gov":
                    if href not in external_urls:
                        print(f"{GRAY}[!] External link: {href}{RESET}")
                        print("Found at: " + url)
                        external_urls.add(href)
                        locationOfLink.add(url)
            except:
                pass
            continue
        urls.add(href)
    return urls
Example #45
def main(harfile_path):
    """Reads a har file from the filesystem, converts to CSV, then dumps to
    stdout.
    """
    txt_file = 'easylist.txt'
    raw_rules = readfile(txt_file)

    harfile = open(harfile_path, encoding  = 'UTF-8')
    harfile_json = json.loads(harfile.read())
    i = 0

    first_party = harfile_path.split('.')[1]+'.'+harfile_path.split('.')[2]
    rules = AdblockRules(raw_rules)
    blocked = 0
    blocked_domains = set()
    opt = {'script': True,'image':True,'stylesheet':True,'object':True,'subdocument':True,'xmlhttprequest':True,'websocket':True,'webrtc':True,'popup':True,'generichide':True,'genericblock':True}
    
    for entry in harfile_json['log']['entries']:
        i = i + 1
        url = entry['request']['url']
        urlparts = urlparse(entry['request']['url'])
        size_bytes = entry['response']['bodySize']
        size_kilobytes = float(entry['response']['bodySize'])/1024
        mimetype = 'unknown'
        if 'mimeType' in entry['response']['content']:
            mimetype = entry['response']['content']['mimeType']
        
        option = ''
        res = get_tld(url, as_object=True)
        mime_opt = mimetype.split('/')[0]

        if mime_opt in opt:
            option = mime_opt

        if res.fld != first_party and option in opt and rules.should_block(url, {option: opt[option]}):
            blocked += 1
            blocked_domains.add(res.fld)
    
    blocked_domains = [dom for dom in blocked_domains] if blocked_domains else 'No domains blocked'

    print(f'\nSite: {first_party}\n# of total HTTP requests: {i}\n# of HTTP requests blocked: {blocked}\nBlocked domains: {blocked_domains}\n')
Example #46
 def parse(self, response):
     try:
         html = response.body.decode('utf-8')
     except UnicodeDecodeError:
         return
     emails = []
     phones = []
     print("parse")
     # Find mailto's
     mailtos = response.xpath(
         "//a[starts-with(@href, 'mailto')]/@href").getall()
     tels = response.xpath("//a[starts-with(@href, 'tel:')]/@href").getall()
     phones += [tel.replace("tel:", "") for tel in tels]
     emails = [mail.replace('mailto:', '') for mail in mailtos]
     body_emails = self.email_regex.findall(html)
     emails += [email for email in body_emails if \
                get_tld ('https://' + email.split ('@')[-1] , fail_silently=True)]
     yield {
         'emails': list(set(emails)),
         'phones': list(set(phones)),
         'page': response.request.url
     }
     if self.greedy:
         links = response.xpath("//a/@href").getall()
         # If there are external links, scrapy will block them
         # because of the allowed_domains setting
         for link in links:
             skip = False
             for key in self.forbidden_keys:
                 if key in link:
                     skip = True
                     break
             if skip:
                 continue
             try:
                 yield scrapy.Request(link, callback=self.parse)
             except ValueError:
                 try:
                     yield response.follow(link, callback=self.parse)
                 except:
                     pass
Example #47
def find_platform_names():
    old_dir = os.path.dirname(
        os.path.join(APP_ROOT, 'dataset_repo\\merged_files\\'))
    new_dir = os.path.dirname(os.path.join(APP_ROOT,
                                           'dataset_repo\\Outputs\\'))
    df1 = pd.read_csv(old_dir + "\\" + "merged_all_csv.csv",
                      sep=',',
                      encoding="ISO-8859-1",
                      error_bad_lines=False)

    df2 = df1['TweetText']
    urls_dict = {}
    urls_list = []

    df = pd.DataFrame()
    df['Platform name'] = ''
    df['Sum count of Tweets'] = ''
    q = 0

    for text in df2:
        urls = re.findall(
            'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
            str(text))
        for url in urls:
            urls_list.append(url)

    for url in urls_list:
        parsed_uri = urlparse(url)
        domain = '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri)
        tld = get_tld(domain)  # Top-Level domain
        plat_name = tld.split('.')
        if plat_name[0] not in urls_dict:
            urls_dict[plat_name[0]] = 1
        else:
            urls_dict[plat_name[0]] = urls_dict[plat_name[0]] + 1

    for k, v in urls_dict.items():
        df.set_value(q, 'Platform name', k)
        df.set_value(q, 'Sum count of Tweets', v)
        q = q + 1
    df.to_csv(new_dir + '\\platform_count.csv', sep=',', index=False)
Example #48
def run_monitor_one_shot(client, origin_file, ignore_patterns):
  global records
  global parse_start_time
  temp_dest="~/.gc_history"
  alert = False
  matched = None

  # Make User History and fetch minimal info
  remote_cmd  = f'sudo cp -p {origin_file} {temp_dest}'
  remote_cmd += f' && sudo chmod 777 {temp_dest}'
  remote_cmd += f' && sudo stat -f "%Sm %N" {temp_dest}'
  remote_cmd += f" && sqlite3 {temp_dest} \"SELECT last_visit_time, datetime(datetime(last_visit_time / 1000000 + (strftime('%s', '1601-01-01')), 'unixepoch'), 'localtime'), url FROM urls ORDER BY last_visit_time DESC LIMIT 15\""
  remote_cmd += f' && rm {temp_dest}'
  msg = NetHelpers.ssh_cmd_v2(client, remote_cmd)

  response = msg.split("\n");
  msg = f"{response[0]}\n"
  for record in reversed(response[1:]):
    data = record.split("|")
    if len(data) != 3:
      # Malformed. Ignore.
      continue

    data[0] = int(data[0])
    if data[0] <= parse_start_time:
      # We've already digested this record
      continue
    elif re.search(ignore_patterns, data[2]):
      msg += f"Ignoring: "
    elif any([re.search(pattern, data[2]) for pattern in Constants.BLACKLIST]):
      alert = True
      msg += f"ALERT!! "
      res = get_tld(data[2], as_object=True) #Get the root as an object
      matched = res.fld
    msg += f"[{data[1]}] {data[2]})\n"

    if data[0] > parse_start_time:
      parse_start_time = data[0]
      records[data[1]] = data[2]

  return(alert, msg, matched)
Example #49
def get_ba_dic():
    ba_data = xlrd.open_workbook('sjzx1.xls')
    ba_table = ba_data.sheets()[0]
    ba_dic = {} # 已备案ba_dic
    nrows = ba_table.nrows
    for i in range(5,nrows):# line range row (number -1,nrows)
        ips = ba_table.row_values(i)[25]
        ports = ba_table.row_values(i)[26]
        url = ba_table.row_values(i)[28]
        if ips == '' and url == '':
            continue
        else:
            if ips == '':
                if url[0:4]=='http':
                    try:
                        ips = socket.getaddrinfo(get_tld(url), 'http')[0][4][0] # get url ip
                    except Exception as e:
                        continue
                else:
                    continue
            for ip in ips.split(','):
                if not -1==ip.find('-'): # x.x.x.x-x
                    for j in range(int(ip[ip.rfind('.')+1:].split('-')[0]),int(ip[ip.rfind('.')+1:].split('-')[1])+1): # ip range
                        p = [] # port list
                        for port in ports.split(','):  # append port
                            if not -1 == port.find('-'): # x-x
                                for k in range(int(port[port.rfind('p') + 1:].split('-')[0]),int(port[port.rfind('p') + 1:].split('-')[1]) + 1): # port range
                                    p.append((port[:3] + str(k)).encode('utf-8')) # tcpk
                            else:
                                p.append(port.encode('utf-8'))
                        ba_dic[(ip[:ip.rfind('.')+1]+str(j)).encode('utf-8')] = p # add ip
                else: # x.x.x.x
                    p=[] # port list
                    for port in ports.split(','): # append port
                        if not -1 == port.find('-'): # x-x
                            for j in range(int(port[port.rfind('p') + 1:].split('-')[0]),int(port[port.rfind('p') + 1:].split('-')[1]) + 1):
                                p.append((port[:3]+str(j)).encode('utf-8'))
                        else:
                            p.append(port.encode('utf-8'))
                    ba_dic[ip.encode('utf-8')] = p # add ip
    return ba_dic
Example #50
    def save_screenshot_s3(self, site_url, country_code):
        conn = boto.connect_s3(
            aws_access_key_id=S3_ACCESS_KEY,
            aws_secret_access_key=S3_SECRET_KEY,
            #is_secure=False,               # uncommmnt if you are not using ssl
            calling_format=boto.s3.connection.OrdinaryCallingFormat(),
        )
        bucket = conn.get_bucket('synergetica')
        key = bucket.new_key(
            '%s/%s_%s_%s.png' %
            (self.screenshot_daily_folder, get_tld(site_url).replace(
                '.', '_'), country_code, int(time.time())))
        key.set_contents_from_string(
            self.driver.get_screenshot_as_base64().decode('base64'))
        key.set_acl('public-read')
        url = key.generate_url(expires_in=0, query_auth=False, force_http=True)

        time.sleep(4)

        print 'screen_url=', url
        return url
Example #51
def update_subdomain(subdomain, alive):
    """Subdomain database is maintained locally to keep track of identified live and known subdomains."""
    tld = get_tld(subdomain,
                  as_object=True,
                  fail_silently=True,
                  fix_protocol=True)
    try:
        #synchronize multithread DB_CURSOR.execute
        LOCK.acquire(True)
        if alive == "N":
            DB_CURSOR.execute(
                "insert into subdomains(subdomain, domain, first_found, alive, source) values(?, ?, ?, ?, ?)",
                (subdomain, tld.tld, datetime.now(), 0, "BountyMonitor"))
            CONNECTION.commit()
        elif alive == "Y":
            DB_CURSOR.execute(
                "update subdomains set alive=1 where subdomain = ?",
                (subdomain, ))
            CONNECTION.commit()
    finally:
        LOCK.release()
Example #52
def getDomainTLD(ip="0.0.0.0"):
    if ip == "0.0.0.0" or ip == "" or ip == "\n":
        return

    global content
    global counter
    counter = counter + 1
    print str(counter) + ":"

    try:
        t = threading.Thread(target=runGetDNSByIP, args=(ip, ))
        t.start()
        success = event.wait(timeout)
        print "{%s}  -  [%s]" % (ip, domain)
        if domain == "":
            content = content + "\n"
        else:
            content = content + get_tld("http://" + domain) + "\n"
    except Exception:
        print ":::ERROR::: Thread {" + str(
            counter) + "} failed to create. IP:" + ip + "\n"
Example #53
def monitor(message, context):
    """certstream events callback handler"""

    all_domains = ""
    if message['message_type'] == "heartbeat":
        return

    if message["message_type"] == "certificate_update":
        all_domains = message["data"]["leaf_cert"]["all_domains"]

    for domain in set(all_domains):
        try:
            tld = get_tld(domain, as_object=True, fail_silently=True, fix_protocol=True)
            if tld.fld in targets and not domain.startswith("*"):
                # if not utils.exists(domain):
                    logging.info("New domain found "+ domain)
                    MONITOR_QUEUE.put(domain)

        except Exception as e:
            logger.exception("Checking domain " + domain + " failed")
            logger.exception(e)
Example #54
def _choose_organization_name(force_install=False):
    esg_root_id = esg_property_manager.get_property("esg_root_id")
    if esg_root_id:
        logger.info("esg_root_id = [%s]", esg_root_id)
        return
    elif force_install:
        try:
            default_org_name = tld.get_tld("http://" + socket.gethostname(),
                                           as_object=True).domain
        except tld.exceptions.TldDomainNotFound, error:
            logger.exception("Could not find top level domain for %s.",
                             socket.gethostname())
            default_org_name = "llnl"
        while True:
            org_name_input = raw_input(
                "What is the name of your organization? [{default_org_name}]: "
                .format(default_org_name=default_org_name)) or default_org_name
            org_name_input = org_name_input.replace(" ", "_")
            esg_property_manager.write_as_property("esg_root_id",
                                                   org_name_input)
            break
Example #55
    def check_whitelist(self, uri):

        if not uri:
            return False

        if any(whitelist_str in uri
               for whitelist_str in self.custom_whitelist):
            print "in custom whitelist!"
            return True

        parsed_uri = urlparse(uri)
        parsed_domain = '{uri.netloc}'.format(uri=parsed_uri)

        try:
            domain = get_tld(uri)
            if domain in self.alexa_whitelist:
                return True
        except Exception as e:
            print "error: ", str(e)

        return False
Example #56
    def switch_all_letters(self):
        """
        The following function generates all the possible combinations using homoglyphs

        """
        url = get_tld(self.url, as_object=True, fix_protocol=True)
        domain = url.domain
        domains = hg.Homoglyphs().get_combinations(domain)
        a = []
        i = 0
        print("Generated " + str(len(domains)) + " domains\n")
        for domain in domains:
            idna_domain = domain.encode('idna').decode('idna')

            if idna_domain not in a:
                a.append(idna_domain)
            i = i + 1
            print(
                str(i) + ' out of ' + str(len(domains)) + ' domains: ' +
                str(len(a)))
        return a
Example #57
    def validate_domain(self, url) -> str:
        """
        Attempt to clean the provided url, and pull
        return the domain, or ip address
        """

        is_valid_tld = tld.is_tld(url)

        # looks like a domain
        if is_valid_tld:
            res = tld.get_tld(url, fix_protocol=True, as_object=True)
            return res.parsed_url.netloc

        # not a domain, try ip address:
        if not is_valid_tld:
            parsed_url = urllib.parse.urlparse(url)
            if not parsed_url.netloc:
                # add the //, so that our url reading code
                # parses it properly
                parsed_url = urllib.parse.urlparse(f"//{url}")
            return parsed_url.netloc
Example #58
    def for_url(cls, url):
        domain = get_tld(url, fail_silently=True)
        # fail silently
        
        if domain is None:
            domain = cls.is_tld_exception(url)
        
        if domain is None:
            return None

        parts = urlparse(url)

        # iol.co.za/isolezwe
        domain = domain + parts.path

        # find the medium with the longest matching domain
        for medium in sorted(Medium.query.all(), key=lambda m: len(m.domain or ''), reverse=True):
            if medium.domain and domain.startswith(medium.domain):
                return medium

        return None
Example #59
def get_url_info(base_url):
    url_split = urlparse.urlsplit(base_url)
    url_info = {}
    url_info['site'] = url_split.netloc
    url_info['site'] = url_info['site'].split(':')[0]
    url_info['site_id'] = get_md5_i64(url_info['site'])
    url_info['path'] = url_split.path
    url_info['query'] = url_split.query
    url_info['fragment'] = url_split.fragment
    try:
        url_info['domain'] = get_tld(base_url)
    except Exception as e:
        url_info['domain'] = url_info['site']
    if url_info.get('domain'):
        url_info['domain_id'] = get_md5_i64(url_info['domain'])
    else:
        url_info['domain_id'] = None

    url_info['url'] = base_url
    url_info['url_id'] = get_md5_i64(base_url)
    return url_info
Example #60
def classifier_urls(urls):
    oscwd = os.getcwd()
    r = get_redis(settings.CLASSIFIER_DB)
    keys = get_whitelist()
    ecommerce_spiders = []
    whitelist_spiders = []
    fuzzy_spiders = []
    for url in urls:
        spidername = get_tld(url, fail_silently=True)
        if spidername:
            filename = oscwd + settings.TEMP_PATH + '/spiderInitfiles_of_eCommerce' + '/spiderInit_' + spidername + '.py'
            if os.path.isfile(filename):
                r.rpush('Ecommerce', url)
                ecommerce_spiders.append(url)
            elif spidername in keys:
                r.rpush('Whitelist', url)
                whitelist_spiders.append(url)
            else:
                r.rpush('Fuzzy', url)
                fuzzy_spiders.append(url)
    return ecommerce_spiders, whitelist_spiders, fuzzy_spiders