def analyze_url(url):
    """
    Look just at the URL to see if a suitable title text can be found.

    This method is much faster than actually visiting the URL to find the
    title element in the downloaded file. We want to do this for special
    sites like Facebook, which doesn't allow anonymous downloading of
    certain pages, like group pages.

    Args:
        url: A string that is a URL

    Returns:
        A string that is the title text to be used. If no suitable title
        text can be produced, return the empty string, "".
    """
    try:
        tl = get_tld(url)
    except (tld.exceptions.TldBadUrl, tld.exceptions.TldDomainNotFound):
        logging.debug("bad TLD; trying with http:// prefix")
        try:
            tl = get_tld("http://" + url)
        except (tld.exceptions.TldBadUrl, tld.exceptions.TldDomainNotFound):
            logging.debug("still bad TLD; giving up")
            return ""

    if tl == "facebook.com" and "facebook.com/groups/" in url:
        return "Facebook group page post"

    return ""
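# Hypothetical usage sketch for analyze_url(); the URLs are illustrative and the
# module's own imports (tld, logging) are assumed. The Facebook match assumes a
# tld release in which get_tld() returns the registered domain ("facebook.com")
# rather than just the suffix.
print(analyze_url("https://www.facebook.com/groups/12345/permalink/678/"))
# expected: "Facebook group page post"
print(analyze_url("https://example.com/some-article"))
# expected: "" (caller falls back to fetching the page for a title)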
def current_site_domain(request=None):
    try:
        if request:
            from tld import get_tld
            domain = get_tld('http://' + request.get_host())
        else:
            domain = settings.SUBDOMAIN_BASE_DOMAIN
    except Exception:
        from django.contrib.sites.models import Site
        try:
            if request:
                d = Site.objects.get_current(request=request).domain
            else:
                d = Site.objects.first().domain
            if d[0:4] != 'http':
                d = 'http://' + d
            domain = get_tld(d)
        except Exception:
            try:
                domain = settings.SUBDOMAIN_BASE_DOMAIN
            except Exception:
                domain = Site.objects.first().domain

    prefix = 'www.'
    if getattr(settings, 'REMOVE_WWW_FROM_DOMAIN', False) \
            and domain.startswith(prefix):
        domain = domain.replace(prefix, '', 1)

    return domain
def get_sources_sites(html, sites):
    """ (str, list of str) -> list of [str, str]
    Searches the html for links that redirect to any of the given sites;
    each result stores the whole url and the domain name used for searching.
    Returns empty lists if none are found.

    Keyword arguments:
    html                -- string of html
    sites               -- list of site urls to look for
    """
    result_urls_matched = []
    result_urls_unmatched = []
    # Format the site to assure only the domain name for searching
    formatted_sites = []

    for site in sites:
        formatted_sites.append(tld.get_tld(site))

    for url in re.findall(
            "href=[\"\'][^\"\']*?.*?[^\"\']*?[\"\']", html, re.IGNORECASE):
        try:
            domain = tld.get_tld(url[6:-1])
        except (KeyboardInterrupt, SystemExit):
            raise
        except:
            continue
        if domain in formatted_sites:
            # If it matches even once, append the site to the list
            result_urls_matched.append([url[6:-1], domain])
        else:
            result_urls_unmatched.append([url[6:-1], domain])

    # Return the list
    return [result_urls_matched, result_urls_unmatched]
def get_source_sites(urls, sites):
    """ (status, list of str) -> list of str
    Searches the urls of the tweet and returns links that redirect to any
    of the given sites. Returns empty lists if none are found.

    Keyword arguments:
    urls                -- list of url entities from the tweet's Status structure
    sites               -- list of site urls to look for
    """
    # store_all = configuration()['storage']['store_all_sources']
    result_urls_matched = []
    result_urls_unmatched = []
    formatted_sites = []

    for site in sites:
        formatted_sites.append(tld.get_tld(site))

    for url in urls:
        try:
            real_url = requests.get(url["expanded_url"], timeout=10).url
            domain = tld.get_tld(real_url)
        except:
            continue
        if domain in formatted_sites:
            # If it matches even once, append the site to the list
            result_urls_matched.append([real_url, domain])
        else:
            result_urls_unmatched.append([real_url, domain])

    # Return the list
    return [result_urls_matched, result_urls_unmatched]
def get_sources_sites(article, sites):
    """ (article, list of str) -> list of [str, str, str]
    Searches the article's links for urls that redirect to any of the given
    sites; each result stores the whole url, the domain name used for
    searching, and the link text. Returns empty lists if none are found.

    Keyword arguments:
    article             -- parsed article whose links will be searched
    sites               -- list of site urls to look for
    """
    result_urls_matched = []
    result_urls_unmatched = []
    # Format the site to assure only the domain name for searching
    formatted_sites = set()

    for site in sites:
        formatted_sites.add(tld.get_tld(site))

    for url in article.get_links(article_text_links_only=True):
        try:
            domain = tld.get_tld(url.href)
        # The tld exceptions don't inherit a common base class, so they are listed explicitly
        except (tld.exceptions.TldBadUrl, tld.exceptions.TldDomainNotFound,
                tld.exceptions.TldIOError):
            continue
        if domain in formatted_sites:
            # If it matches even once, append the site to the list
            result_urls_matched.append([url.href, domain, url.text])
        else:
            result_urls_unmatched.append([url.href, domain, url.text])

    # Return the list
    return [result_urls_matched, result_urls_unmatched]
def _is_valid_link(self, link):
    """
    Return True if the given link does not point to a document.
    This is not a perfect check, but it avoids a call to the server.
    """
    # Check ONLY_ROOTDOMAIN
    scheme, netloc, path, params, query, fragment = urlparse(link)
    try:
        if get_tld(self.base_url) == get_tld(link) and not ONLY_ROOTDOMAIN:
            # if get_tld(self.base_url) == get_tld(link):
            return False
    except Exception as e:
        log.error(str(e), self.base_url, link)

    # Need to add more
    DOC_EXT = [".pdf", ".xmls", ".docx", ".odt"]
    try:
        urlPath = [i for i in (path.split('/')) if i]
        file_name = urlPath[-1]
        # keep the leading dot so the comparison against DOC_EXT works
        ext = "." + file_name.split(".")[-1]
    except IndexError:
        # It's just a root URL
        return True
    return ext not in DOC_EXT
def parse(self, response):
    domain = get_tld(response.url)
    items = []
    url_result = urlparse(response.url)
    top_domain = url_result.scheme + '://' + url_result.netloc
    for sel in response.xpath('//a/@href'):
        item = LinkscrawlItem()
        link = sel.extract()
        if link.find("http://") == 0 or link.find("https://") == 0 or link.find("www.") == 0:
            try:
                target_domain = get_tld(link)
                # print domain + "===" + target_domain + "===" + link
                if domain != target_domain:
                    item['link'] = link
                    item['source'] = top_domain
                    yield item
                    # items.append(item)
                else:
                    yield scrapy.Request(link, callback=self.parse)
            except:
                print "The url can't get the domain. Ignored..." + link
        if link.startswith('/'):
            yield scrapy.Request(top_domain + link, callback=self.parse)
def get_source_sites(urls, sites):
    """ (list of urls, list of source site urls)
    Return a list of expanded urls found in the source urls, and a list of
    expanded urls not found in the source urls.
    """
    result_urls_matched = []
    result_urls_unmatched = []
    formatted_source_sites = []
    for site in sites:
        formatted_source_sites.append(tld.get_tld(site))

    for url in urls:
        try:
            # with eventlet, the request will time out in 10s
            with eventlet.Timeout(10):
                real_url = requests.get(url, timeout=10).url
                domain = tld.get_tld(real_url)
        except:
            continue
        if domain in formatted_source_sites:
            # If it matches even once, append the site to the list
            result_urls_matched.append([real_url, domain])
        else:
            result_urls_unmatched.append([real_url, domain])

    return [result_urls_matched, result_urls_unmatched]
def valid_domain(domain):
    """Return True if the passed domain is valid, False otherwise."""
    try:
        get_tld(domain, fix_protocol=True)
        return True
    except:
        return False
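# Hypothetical usage sketch for valid_domain(); the inputs are illustrative.
# fix_protocol=True lets bare host names be checked without a scheme prefix.
print(valid_domain("example.co.uk"))        # expected: True
print(valid_domain("host.invalid-suffix"))  # expected: False (unknown suffix)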
def crawl(n):
    i = 0
    seed = []
    db = MySQLdb.connect(host='cspp53001.cs.uchicago.edu', db='jcbraunDB',
                         user='******', passwd='3312crystal')
    cursor = db.cursor()
    outLinks = []
    if (n == 0):
        execString = ("SELECT URL, Domain FROM safeSeed WHERE crawled=0;")
        cursor.execute(execString)
        seedx = cursor.fetchall()
    else:
        execString = ("SELECT URLTo FROM safeOutboundLinks WHERE lvl=%i;" % (n))
        cursor.execute(execString)
        seedx = cursor.fetchall()
    print seedx
    for row in seedx:
        i = i + 1
        try:
            url = row[0]
            print url
            domain = get_tld(url, fail_silently=True)
            content = urllib2.urlopen(url, timeout=3).read(2000000)
            for k in re.findall('''href=["'](.[^"']+)["']''', content):
                z = ((re.match('http://', k) is not None) or (re.match('//', k) is not None))
                y = re.match('/', k)
                if (y):
                    k = (("/").join((re.split("/", url))) + k)
                if z or y:
                    domainTo = (get_tld(k, fail_silently=True))
                    print "domainTo is: %s" % domainTo
                    reqURL = "https://sb-ssl.google.com/safebrowsing/api/lookup?client=f4p&key=AIzaSyCD0pNAG-6HVh_W6udGYZFz-2_p0yHDD5k&appver=31&pver=3.1&url=" + k
                    response = urllib2.urlopen(reqURL).getcode()
                    if (response == 200):
                        print ("Found dangerous site \n")
                        bad = 1
                        execString = ("INSERT INTO inboundLinks (Domain, domainTo, URL, URLto, Crawled) VALUES ('%s', '%s', '%s', '%s', 'false');" % (domain, domainTo, url, k))
                        cursor.execute(execString)
                    else:
                        bad = 0
                    execString = ("INSERT INTO safeOutboundLinks (Lvl, Domain, domainTo, URL, URLto, Crawled, toSpam) VALUES ('%i', '%s', '%s', '%s', '%s', '0', '%i');" % ((n + 1), domain, domainTo, url, k, bad))
                    cursor.execute(execString)
                    print("adding %s" % k)
                    db.commit()
            bank = open('notspam/%d.txt' % i, 'w')
            bank.write(content)
            content = db.escape_string(content)
            execString = ("INSERT INTO safeContent (Lvl, Content, Domain, URL, CopySource) VALUES ('%i', '%s', '%s', '%s', 'crawl');" % ((n + 1), content, domain, url))
            cursor.execute(execString)
            print url + " success! \n"
            bank.close()
        except Exception as e:
            print ("Broken link to %s" % url)
            print (type(e))
            print (e.args)
    db.commit()
    db.close()
def valid_a_href(a_elements, main_url=None):
    hrefs = [a.get('href') for a in a_elements]
    hrefs = [link for link in hrefs if link and link.startswith('http://')]
    if main_url:
        main_tld = get_tld(main_url, fail_silently=True)
        hrefs = [link for link in hrefs
                 if get_tld(link, fail_silently=True) == main_tld]
    return hrefs
def _from_same_site(self, ads_host, ads_target):
    if ads_target is None:
        return True
    if not ads_target.startswith("http"):
        return True
    ads_host_domain = get_tld(ads_host, as_object=True).domain
    ads_target_domain = get_tld(ads_target, as_object=True).domain
    return ads_host_domain == ads_target_domain
def bulk_insert_urls(self, content):
    logger.debug("in bulk insert urls")
    for line in content:
        items = line.split()
        if len(items) < 9:
            logger.error("error parsing line")
            logger.error(line)
        else:
            if ("http" in items[8] and "//" in items[8]):
                parts = items[8].split("//")[1].split("/")
                domain = parts[0]
                res = get_tld(items[8], as_object=True, fail_silently=True)
                if res is not None:
                    tld = "%s.%s" % (res.domain, res.suffix)
                else:
                    tld = parts[0]
                path = ""
                if len(parts) > 0:
                    path = "".join(parts[1:])
                url = {'ts': items[2].split(".")[0], 'host': items[4],
                       'tld': tld, 'domain': domain, 'path': path}
                try:
                    logger.debug("inserting %s %s %s" % (url['ts'], url['host'], url['tld']))
                    self.conn.execute(
                        "INSERT INTO URLS(ts, host, tld, domain, path, datasource) VALUES(?,?,?,?,?,?)",
                        (url['ts'], url['host'], url['tld'], url['domain'], url['path'], 'squid'))
                except Exception, e:
                    logger.error("error inserting url %s" % str(url))
def getDomainTLD(ip="0.0.0.0"): if ip == "0.0.0.0" or ip == "" or ip == "\n": return global content global counter counter = counter +1 print str(counter) + ":" try: t = threading.Thread(target=runGetDNSByIP, args=(ip,)) #threads.append(t) #domain = str(socket.gethostbyaddr(ip)[0]) t.start() success = event.wait(5.0) print "{%s} - [%s]" % (ip, domain) if domain == "": content = content + "\n" else: content = content + get_tld("http://"+domain) + "\n" #lock.acquire() #writeFile(ip+","+get_tld("http://"+domain) + "\n", outputFile, True) #c.acquire() #print ip+","+get_tld("http://"+domain) + "\n" #content = content + ip+",,,,,,,,,,,," + get_tld("http://"+domain) + "\n" #lock.release() #return get_tld("http://"+domain) except Exception: #lock.acquire() #writeFile("\n", outputFile, True) #content = content + "\n" print ":::ERROR::: Thread {"+str(counter)+"} failed to create. IP:" + ip + "\n"
def __init__(self, tbb_path="",
             tor_cfg=cm.LAUNCH_NEW_TBB_TOR,
             tbb_fx_binary_path="",
             tbb_profile_path="",
             tbb_logfile_path="",
             pref_dict={},
             socks_port=None,
             virt_display=cm.DEFAULT_XVFB_WINDOW_SIZE,
             canvas_exceptions=[]):
    self.check_tbb_paths(tbb_path, tbb_fx_binary_path, tbb_profile_path)
    self.tor_cfg = tor_cfg
    self.canvas_exceptions = [get_tld(url) for url in canvas_exceptions]
    self.setup_virtual_display(virt_display)
    self.profile = webdriver.FirefoxProfile(self.tbb_profile_path)
    add_canvas_permission(self.profile.path, self.canvas_exceptions)
    if socks_port is None:
        if tor_cfg == cm.USE_SYSTEM_TOR:
            socks_port = cm.DEFAULT_SOCKS_PORT  # 9050
        else:
            socks_port = cm.TBB_SOCKS_PORT  # 9150
    self.socks_port = socks_port
    self.update_prefs(pref_dict)
    self.setup_capabilities()
    self.export_lib_path()
    self.binary = self.get_tbb_binary(logfile=tbb_logfile_path)
    super(TorBrowserDriver, self).__init__(firefox_profile=self.profile,
                                           firefox_binary=self.binary,
                                           capabilities=self.capabilities,
                                           # default timeout is 30
                                           timeout=60)
    self.is_running = True
def get_tld_locations(self, json_data):
    url_location_dictionary = {}
    url_count = 0
    for article in json_data:
        for url in json_data[article]:
            url_count += 1
            # split url and get suffix
            try:
                res = get_tld(url, as_object=True)
            except TldDomainNotFound:
                print "tld not found: " + str(url)
                continue
            except TldBadUrl:
                print "bad url: " + str(url)
                continue
            tld = res.suffix
            # If the suffix is a combination (e.g. "co.uk"), keep only the last
            # part; the dot then needs to be added back in front for the lookup.
            if (tld.count(".") > 0):
                results = re.split(r'\.', tld)
                tld = results[len(results) - 1]
            tld = "." + tld
            #print tld
            try:
                if (self.iana[tld] == 'country-code'):
                    for w in self.wfb:
                        if w.tld == tld:
                            url_location_dictionary[url] = w.iso2c
                else:
                    url_location_dictionary[url] = tld.replace(".", '').upper()
            except KeyError:
                print "no entry found for: " + str(tld)
    return url_location_dictionary
def _parse(self, dataset):
    """
    Parse the data to calculate and sort news statistics.

    :param dataset: list of dicts keyed by query date; each value is a list
        of tuples (url, cat, pub_date) for a news item
    :return: sorted dataset, shaped like: [{'20160420': {...}}, ...]
    """
    total_data = []
    dt_dist = 'dt_dist'  # Key for the news publish-datetime distribution
    for dict_to_date in dataset:
        for query_date, data_list in dict_to_date.iteritems():
            # `query_date` is a date string, `data_list` is a list
            result = defaultdict(lambda: defaultdict(int))
            for uri, cat, dt in data_list:
                try:
                    domain = get_tld(uri, as_object=True).domain
                    result[domain][cat] += 1
                    result[domain]['count'] += 1
                    result[domain].setdefault(dt_dist, []).append(dt)
                except Exception:
                    pass
            for site_domain in result.keys():
                result[site_domain][dt_dist].sort()
                result['total_count'] = result.get('total_count', 0) + len(result[site_domain][dt_dist])
            total_data.append({query_date: result})
    return total_data
def auto_select_target(target, output=None):
    """Auto selection logic"""
    print "Target: %s" % target
    try:
        inp = IPAddress(target)
        if inp.is_private() or inp.is_loopback():
            print "Internal IP Detected : Skipping"
            sys.exit()
        else:
            print "Looks like an IP, running ipOsint...\n"
            ipOsint.run(target, output)
    except SystemExit:
        print "exiting"
    except AddrFormatError:
        if re.match('[^@]+@[^@]+\.[^@]+', target):
            print "Looks like an EMAIL, running emailOsint...\n"
            emailOsint.run(target, output)
        elif get_tld(target, fix_protocol=True, fail_silently=True) is not None:
            print "Looks like a DOMAIN, running domainOsint...\n"
            domainOsint.run(target, output)
        else:
            print "Nothing Matched assuming username, running usernameOsint...\n"
            usernameOsint.run(target, output)
    except:
        print "Unknown Error Occurred"
def get_link_text(url, mime_type, data=None):
    '''
    Take URL, MIME type, and optional data to produce the link text.
    '''
    tld = get_tld(url)
    result = "File on " + tld
    if mime_type.startswith("image"):
        result = "Image on " + tld
    elif mime_type == "application/pdf":
        result = "PDF on " + tld
    elif "text/html" in mime_type:
        try:
            soup = BeautifulSoup(data, 'html.parser')
            meta = soup.find_all("meta")
            possible = [i.get("content") for i in meta
                        if i.get("property") == "og:title"]
            if possible:
                result = possible[0].strip()
            elif soup.title.string:
                result = messy_title_parse(soup.title.string)
            else:
                result = "Page on " + tld
        except AttributeError:
            # Probably just an empty title when trying to get soup.title.string
            result = "Page on " + tld
    if len(result) > 255:
        result = result[:253] + " …"
    return result
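# Hypothetical usage sketch for get_link_text(); the URL and HTML are
# illustrative, the module's imports (tld, BeautifulSoup) and its
# messy_title_parse() helper are assumed, and the "... on <tld>" suffix depends
# on what get_tld() returns in the installed tld version.
html = b"<html><head><title>Hello World</title></head><body></body></html>"
print(get_link_text("http://example.com/page", "text/html", data=html))
# expected: the <title> text as cleaned up by messy_title_parse()
print(get_link_text("http://example.com/report.pdf", "application/pdf"))
# expected: "PDF on " followed by the tld/domain string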
def process_arguments():
    if len(sys.argv) != 2:
        print_error('Invalid number of arguments.')
    try:
        lines = []
        count = 0
        with open(sys.argv[1], 'r') as f:
            for line in f:
                if count == 0:
                    doc_pair = {}
                elif count == len(prefixes):
                    count = 0
                    lines.append(doc_pair)
                    continue
                output = match_line(prefixes[count] + ': ', line.rstrip('\n'))
                doc_pair[prefixes[count]] = output
                count += 1
    except IOError:
        print_error('Unable to read file \'{}\'.'.format(sys.argv[1]))

    for line in lines:
        line['FUZZY SRC'] = get_longest_ascii(line['SRC TXT'])
        line['FUZZY TGT'] = get_longest_ascii(line['TGT TXT'])
        try:
            domain = tld.get_tld(line['SRC URL'])
        except tld.exceptions.TldDomainNotFound:
            domain = None
        line['DOMAIN'] = domain

    return lines
def get_monthly_archive_urls(links, page_url):
    '''Scans the provided links for blogspot style archives which are of
    the form website.com/yyyy_mm_01_archive.html'''
    domain = get_tld(page_url)
    monthly_archive_urls = []
    for link in links:
        # Try for drop down lists using <option value="url.com/...">
        try:
            url = link.attrs['value']
            match = re.search(domain + "/\d{4}_\d{2}_01_archive.html", url)
            if match:
                monthly_archive_urls.append(url)
        except KeyError:
            pass
        # Try for actual <a href="url.com/..." > links
        try:
            url = link.attrs['href']
            match = re.search(domain + "/\d{4}_\d{2}_01_archive.html", url)
            if match:
                monthly_archive_urls.append(url)
        except KeyError:
            pass
    return list(set(monthly_archive_urls))
def monitor(message, context):
    """certstream events callback handler"""
    all_domains = ""

    if message['message_type'] == "heartbeat":
        return

    if message["message_type"] == "certificate_update":
        all_domains = message["data"]["leaf_cert"]["all_domains"]

    for domain in set(all_domains):
        PBAR.update(1)
        # all magic happens here
        try:
            if domain.count(".") > 1 and not domain.startswith("*.") and \
                    not re.search("\d$", domain) and \
                    "cloudflaressl" not in domain and \
                    "xn--" not in domain and not domain.endswith("local"):
                tld = get_tld(domain, as_object=True, fail_silently=True,
                              fix_protocol=True)
                if tld is not None and tld.tld in BOUNTY_LIST and \
                        tld.tld != domain and tld.subdomain != "www":
                    if check_subdomain_not_known_in_db(domain):
                        update_subdomain(domain, "N")
                        MONITOR_QUEUE.put(domain)
        except Exception as e:
            logging.exception("message")
            print (domain)
        t.sleep(.1)
def get_domain_pubdate(file_path, yesterday):
    with open(file_path) as fp:
        url, pub_date = fp.readlines()[:2]
        if int(pub_date.strip()) and pub_date.startswith(yesterday):
            domain = tld.get_tld(url.strip(), as_object=True).domain
            return domain, pub_date.strip()
        return None, None
def run(self, target):
    domainname = get_tld("http://" + target)
    whoisdomaincmd = "whois " + domainname + " > whois-domain-" + domainname + ".txt"
    print "Whois DOMAIN lookup cmd: " + whoisdomaincmd
    print commands.getoutput(whoisdomaincmd)
def scrap(self, url):
    self.external_domain = "http://" + get_tld(url)
    response = requests.get(url)
    soup = bs4.BeautifulSoup(response.text)
    self._description(soup)
    self._get_episodes(soup)
def bulk_insert_urls(self, content):
    for line in content:
        items = line.split()
        if len(items) < 9:
            logger.error("error parsing line")
            logger.error(line)
        else:
            if ("http" in items[8] and "//" in items[8]):
                parts = items[8].split("//")[1].split("/")
                domain = parts[0]
                res = get_tld(items[8], as_object=True, fail_silently=True)
                if res is not None:
                    tld = "%s.%s" % (res.domain, res.suffix)
                else:
                    tld = parts[0]
                path = ""
                if len(parts) > 0:
                    path = "".join(parts[1:])

                # sometimes dest can just be a '-'; set it to a valid host so postgres does not barf
                if items[11].split("/")[1].strip() == "-":
                    dest = "0.0.0.0"
                else:
                    dest = items[11].split("/")[1]

                #url = {'ts':items[2].split(".")[0], 'host':items[4], 'tld':tld, 'domain':domain, 'path': path}
                url = {'ts': items[2].replace(".", ""),
                       'host': items[4],
                       'tld': tld,
                       'domain': domain,
                       'path': path,
                       'verb': items[7],
                       'clength': items[6],
                       'statuscode': items[5].split("/")[1],
                       'dest': dest,
                       'contenttype': items[12],
                       }
                try:
                    #print "inserting %s %s %s %s %s %s" % (url['ts'], url['host'],url['tld'], url['domain'], url['path'], 'squid')
                    sql = "SELECT deviceid from vpnips WHERE ip=%s"
                    data = (url['host'],)
                    self.cur.execute(sql, data)
                    deviceid = self.cur.fetchone()
                    sql = "INSERT INTO http3 (id, httpverb, httpverbparam, httpstatuscode, httphost, contenttype, contentlength, src, dest, timestamp) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
                    data = (deviceid, url['verb'], url['path'], url['statuscode'], url['tld'],
                            url['contenttype'], url['clength'], url['host'], url['dest'], url['ts'])
                    self.cur.execute(sql, data)
                except Exception, e:
                    logger.error(e)
                    logger.error("error inserting url %s" % str(url))
def _record_time(self, request):
    if hasattr(request, '_start_time'):
        ms = int((time.time() - request._start_time) * 1000)
        if request.is_ajax():
            is_ajax = True
        else:
            is_ajax = False

        is_authenticated = False
        is_staff = False
        is_superuser = False
        if is_user_authenticated(request.user):
            is_authenticated = True
            if request.user.is_staff:
                is_staff = True
            if request.user.is_superuser:
                is_superuser = True

        referer = request.META.get('HTTP_REFERER')
        referer_tld = None
        referer_tld_string = ''
        if referer:
            try:
                referer_tld = get_tld(referer, as_object=True)
            except (TldBadUrl, TldDomainNotFound, TldIOError):
                pass
        if referer_tld:
            referer_tld_string = referer_tld.tld

        url = request.get_full_path()
        url_query = parse.parse_qs(parse.urlparse(url).query)

        # This allows you to measure click rates for ad-campaigns, just
        # make sure that your ads have `?campaign=something` in the URL
        campaign_keyword = getattr(
            settings, 'INFLUXDB_METRICS_CAMPAIGN_KEYWORD', 'campaign')
        campaign = ''
        if campaign_keyword in url_query:
            campaign = url_query[campaign_keyword][0]

        data = [{
            'measurement': 'django_request',
            'tags': {
                'host': settings.INFLUXDB_TAGS_HOST,
                'is_ajax': is_ajax,
                'is_authenticated': is_authenticated,
                'is_staff': is_staff,
                'is_superuser': is_superuser,
                'method': request.method,
                'module': request._view_module,
                'view': request._view_name,
                'referer': referer,
                'referer_tld': referer_tld_string,
                'full_path': url,
                'path': request.path,
                'campaign': campaign,
            },
            'fields': {'value': ms, },
        }]
        write_points(data)
def run(self, target):
    domainname = get_tld("http://" + target)
    whoisdomaincmd = 'whois ' + domainname + ' > whois-domain-' + domainname + '.txt'
    print "Whois DOMAIN lookup cmd: " + whoisdomaincmd
    print commands.getoutput(whoisdomaincmd)
def extract_tld(self, url):
    try:
        return get_tld(url)
    except:
        traceback.print_exc()
        print "\n\nInvalid url: %s" % url
        return url
def __init__(self, entry, site_hash=None):
    self.request = entry['request']
    self.response = entry['response']
    self.result = {
        'url': None,
        'url_sha512': None,
        'ip': None,
        'vhost': None,
        'tld': None,
        'ip_country': None,
        'content_sha512': None,
        'content_sha256': None,
        'content_type': None,
        'content_encoding': None,
        'content': None,
        'in_alexa': False,
        'http_status': None,
        'redirect_url': None
    }
    self.result['url'] = self.request['url']
    self.result['url_sha512'] = sha512(self.result['url']).hexdigest()
    try:
        self.result['tld'] = get_tld(self.result['url'])
    except TldDomainNotFound:
        pass
    if 'serverIPAddress' in entry:
        self.result['ip'] = entry['serverIPAddress']
    for header in self.request['headers']:
        if header['name'] == 'Host':
            self.result['vhost'] = header['value']
def parse(self, response):
    print "[$$$] " + response.url
    domain = response.url
    # print '-' * 50
    # print 'response: ' + domain
    # print "-" * 50
    urls = []
    item = SpidersqliItem()
    item['host'] = domain
    soup = BeautifulSoup(response.body, 'lxml')
    links = soup.findAll('a')
    for link in links:
        # Got the target url, but it still needs processing
        _url = link.get('href')
        # print "-" * 50
        # print type(_url)
        # print _url
        # print "-" * 50
        # Next, validate it:
        # first check whether it is None or starts with meaningless characters,
        # and check the URL suffix; anything not in the list is not crawled
        if _url is None or re.match(
                '^(javascript|:;|#)', _url
        ) or re.match(
                '.(jpg|png|bmp|mp3|wma|wmv|gz|zip|rar|iso|pdf|txt|db)$', _url):
            continue
        # Then check whether it starts with http|https; for those, make sure the
        # link belongs to this site; do not crawl beyond the site
        if '=' in _url and '?' in _url:
            if re.match('^(http|https)', _url):
                if get_tld(_url) in domain:
                    urls.append(_url)
            else:
                urls.append(domain + '/' + _url)
    # for url_ in urls:
    #     yield scrapy.Request(url_)
    # yield scrapy.Request(url_ for url_ in urls)
    item['url'] = urls
    # print "=" * 50
    # print item
    # print "=" * 50
    yield item
def fetch_cert(cert_hash, search_domain, result_dict):
    """Fetches a certificate by hash and adds it to `result_dict` if it
    hasn't been seen before."""
    cert_response = self._cert(cert_hash)
    for cert_domain in cert_response.result.domains:
        cert_domain = cert_domain.lower()
        parsed_domain = tld.get_tld(cert_domain, as_object=True,
                                    fail_silently=True, fix_protocol=True)
        if not parsed_domain:
            continue
        if parsed_domain.tld.lower() != search_domain.lower():
            continue
        if cert_domain not in result_dict:
            self.logger.debug(
                "Adding new subdomain: {}".format(cert_domain))
            result_dict[cert_domain] = cert_response
def extractLinks(url, docRoot):
    """ Extract all Links from beautifulSoup document """
    allLinks = []
    section = docRoot.find_all("a")
    rootTLDObj = get_tld(url, as_object=True)

    if len(section) > 0:
        for tag in section:
            if tag.name == "a" and "href" in tag.attrs.keys():
                linkValue = tag['href']
                if linkValue.startswith('/'):
                    allLinks.append(
                        rootTLDObj.parsed_url.scheme + '://' +
                        rootTLDObj.parsed_url.netloc + linkValue)
                elif linkValue.startswith("javascript:") is False:
                    allLinks.append(linkValue)
    return(allLinks)
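# Hypothetical usage sketch for extractLinks(); the HTML and URL are illustrative
# and get_tld/BeautifulSoup are assumed to be imported as in the module above.
# Relative links starting with "/" are resolved against the scheme and netloc
# exposed by tld's result object via parsed_url.
from bs4 import BeautifulSoup

doc = BeautifulSoup(
    '<a href="/about">About</a> <a href="https://other.example.org/">Other</a>',
    'html.parser')
print(extractLinks("https://www.example.com/index.html", doc))
# expected: ['https://www.example.com/about', 'https://other.example.org/']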
def parse_url(url):
    """parse url"""
    if not is_string(url):
        return _get_default_parsed_url(EMPTY_STRING)

    parsed = get_tld(url, as_object=True, fail_silently=True)
    if not parsed:
        return _get_default_parsed_url(url)

    path, query, fragment = [getattr(parsed.parsed_url, key, "")
                             for key in ["path", "query", "fragment"]]

    return {
        "url": url,
        "path": path,
        "query": query,
        "fragment": fragment,
        "domain": parsed.domain,
        "subdomain": parsed.subdomain,
        "topLevelDomain": parsed.tld
    }
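# Hypothetical usage sketch for parse_url(); the URL is illustrative, the module's
# is_string()/_get_default_parsed_url() helpers are assumed to exist, and the exact
# "domain"/"topLevelDomain" values depend on the installed tld version.
info = parse_url("https://blog.example.co.uk/posts/1?ref=home#top")
print(info["path"], info["query"], info["fragment"])  # /posts/1 ref=home top
print(info["subdomain"], info["domain"], info["topLevelDomain"])  # blog example co.uk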
def remove_letters(self):
    '''
    Generate candidate domains by removing one letter at a time.
    :return: list of unique domain strings with a single letter removed
    '''
    url = get_tld(self.url, as_object=True, fix_protocol=True)
    domain = url.domain
    if len(domain) < 4:
        print(
            "With domains with less than 4 letters, this check is not accurate\n"
        )
    new_urls_without_letter = []
    n = 0
    m = len(domain)
    while n < m:
        new_domain = domain[0:n] + domain[n + 1:m]
        n = n + 1
        new_urls_without_letter.append(new_domain)
    new_urls_list = list(set(new_urls_without_letter))
    return new_urls_list
def __init__(self, root_url):
    self.root_url = root_url
    self.domain = get_tld(root_url)
    self.links_await = set()        # URLs waiting to be crawled
    self.links_visited = set()      # URLs already crawled
    self.attachment_saved = set()   # attachment URLs already saved
    self.links_all_dict = {}        # all links as a dict {url: title}, attachments excluded
    self.s = requests.Session()
    retries = Retry(total=3, backoff_factor=0.1,
                    status_forcelist=[404, 500, 502, 503, 504])
    self.s.mount('http://', HTTPAdapter(max_retries=retries))
    # initialize the database and the attachment save directory
    init_db()
    self.target_dir = os.path.join(TARGET_DIR, self.domain)
    if not os.path.exists(self.target_dir):
        os.mkdir(self.target_dir)
def derive_account_campaign(tid, title, url):
    if title in _titles:
        return _titles[title]
    if url in _target_urls:
        return _target_urls[url]
    if tid in _ids:
        return _ids[tid]
    for pattern, rval in _target_url_in:
        if pattern in url:
            return rval

    tld = get_tld(url)
    global insane_identity_counter
    insane_identity_counter += 1
    return tld, "%s/%s" % (tld, insane_identity_counter)
def checkReputation(url):
    reputation = ''
    try:
        res = tld.get_tld(url, as_object=True)
        hostname = res.domain
        website_list = []
        with open(
                'C:\\Users\\PRRAI\\PycharmProjects\\portfolio\\jobs\\names.csv',
                'r') as f:
            website_list = f.readlines()
        r = re.compile('^.*' + hostname + '.*$')
        newList = list(filter(r.match, website_list))
        if len(newList) != 0:
            reputation = 'Malicious'
        else:
            reputation = 'Safe'
    except:
        reputation = 'Could not fetch'
    return reputation
def domain(self) -> str:
    if self._domain is not None:
        return self._domain
    if len(self) == 0:
        self._domain = ''
        return ''
    tld_result = tld.get_tld(self.lower(), fail_silently=True,
                             as_object=True, fix_protocol=True)
    if tld_result is None:
        self._domain = ''
        return ''
    if tld_result.tld in {'in-addr.arpa', 'ip6.arpa'}:
        self._domain = tld_result.tld
        return tld_result.tld
    self._domain = tld_result.fld
    return tld_result.fld
def get_primary_zones(logger, zones):
    """
    The whois_lookups script is not reliable for international zones.
    Therefore, we want to trim the list to the ones that we know work,
    i.e. the more traditional TLDs (.com, .net, .org, etc.).
    This list of trusted TLDs will be expanded over time.
    """
    supported_tld_list = ["com", "net", "org"]

    new_zones = []
    for zone in zones:
        try:
            tld = get_tld(zone, fix_protocol=True)
        except:
            logger.warning(zone + " was not compatible with TLD")
            continue
        if tld in supported_tld_list:
            new_zones.append(zone)

    return (new_zones)
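# Hypothetical usage sketch for get_primary_zones(); the zone names are illustrative
# and the filter assumes a tld release in which get_tld() returns the suffix itself
# (e.g. "com") rather than the registered domain.
import logging

logging.basicConfig()
zones = ["example.com", "example.org", "example.co.uk"]
print(get_primary_zones(logging.getLogger("zones"), zones))
# expected: ["example.com", "example.org"]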
def get_domain(url):
    url = url_normalize(url)
    if url is None:
        return None, None
    flag = True
    try:
        res = get_tld(url, fail_silently=True, as_object=True)
        domain = res.tld
        subdomain = res.subdomain
    except:
        return None, None
    if subdomain != "www" and subdomain != "m" and subdomain != "":
        flag = False
    else:
        s = urlsplit(url, allow_fragments=False)
        if s.query != '' or s.path != '':
            flag = False
    return flag, domain
def lpDL(url):
    bu = tld.get_tld(url, as_object=True)
    baseurl = bu.fld
    page = requests.get(url)
    s = bs4.BeautifulSoup(page.content, 'html.parser')
    tag = s.find_all('a')
    for t in tag:
        try:
            if t['href'].split("-")[-2] == 'Bukkit' and t['href'].split("-")[-1][-4:].lower() == '.jar':
                req = urllib.request.Request(
                    url + t['href'],
                    headers={'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:83.0) Gecko/20100101 Firefox/83.0"})
                g = urllib.request.urlopen(req)
                with open('mods/' + t['href'].split("/")[-1].rstrip(), 'b+w') as f:
                    f.write(g.read())
                modlocations.append('mods/' + t['href'].split("/")[-1].rstrip())
                f.close()
                break
        except KeyError:
            pass
        except IndexError:
            pass
def passive_query(hostx, key):
    keywords = get_tld(hostx.primary_domain, as_object=True,
                       fail_silently=True, fix_protocol=True).domain
    # print(keywords)
    par = {'access_token': key, 'keywords': keywords}
    try:
        response = requests.get(
            "https://buckets.grayhatwarfare.com/api/v1/buckets",
            params=par, timeout=4)
        gwf_api = response.json()
        if gwf_api["buckets_count"] > 0:
            try:
                for bucket in gwf_api["buckets"]:
                    # print(bucket["bucket"])
                    hostx.buckets.append(bucket["bucket"])
            except:
                pass
    except:
        cprint("error", "[*] Error: connecting with GrayHatWarfare API", 1)

    par = {'access_token': key, 'keywords': hostx.orgName}
    try:
        response = requests.get(
            "https://buckets.grayhatwarfare.com/api/v1/buckets",
            params=par, timeout=4)
        gwf_api = response.json()
        if gwf_api["buckets_count"] > 0:
            try:
                for bucket in gwf_api["buckets"]:
                    hostx.buckets.append(bucket["bucket"])
            except:
                pass
    except:
        cprint("error", "[*] Error: connecting with GrayHatWarfare API", 1)
def get_all_website_links(url):
    """
    Returns all URLs found on `url` that belong to the same website
    """
    # all URLs of `url`
    urls = set()
    # domain name of the URL without the protocol
    domain_name = urlparse(url).netloc
    soup = BeautifulSoup(
        requests.get(url, verify=False, timeout=3.05).content, "html.parser")

    for a_tag in soup.findAll("a"):
        href = a_tag.attrs.get("href")
        if href == "" or href is None:
            # href empty tag
            continue
        # join the URL if it's relative (not absolute link)
        href = urljoin(url, href)
        parsed_href = urlparse(href)
        # remove URL GET parameters, URL fragments, etc.
        href = parsed_href.scheme + "://" + parsed_href.netloc + parsed_href.path
        if not is_valid(href):
            # not a valid URL
            continue
        #if href in internal_urls:
        #    # already in the set
        #    continue
        if domain_name not in href:
            # external link
            try:
                topLevel = get_tld(href)
                if topLevel != "mil" and topLevel != "gov":
                    if href not in external_urls:
                        print(f"{GRAY}[!] External link: {href}{RESET}")
                        print("Found at: " + url)
                        external_urls.add(href)
                        locationOfLink.add(url)
            except:
                pass
            continue
        urls.add(href)
    return urls
def main(harfile_path):
    """Reads a har file from the filesystem, converts to CSV, then dumps to
    stdout.
    """
    txt_file = 'easylist.txt'
    raw_rules = readfile(txt_file)
    harfile = open(harfile_path, encoding='UTF-8')
    harfile_json = json.loads(harfile.read())
    i = 0
    first_party = harfile_path.split('.')[1] + '.' + harfile_path.split('.')[2]

    rules = AdblockRules(raw_rules)
    blocked = 0
    blocked_domains = set()
    opt = {'script': True, 'image': True, 'stylesheet': True, 'object': True,
           'subdocument': True, 'xmlhttprequest': True, 'websocket': True,
           'webrtc': True, 'popup': True, 'generichide': True,
           'genericblock': True}

    for entry in harfile_json['log']['entries']:
        i = i + 1
        url = entry['request']['url']
        urlparts = urlparse(entry['request']['url'])
        size_bytes = entry['response']['bodySize']
        size_kilobytes = float(entry['response']['bodySize']) / 1024
        mimetype = 'unknown'
        if 'mimeType' in entry['response']['content']:
            mimetype = entry['response']['content']['mimeType']

        option = ''
        res = get_tld(url, as_object=True)
        mime_opt = mimetype.split('/')[0]
        if mime_opt in opt:
            option = mime_opt
        if res.fld != first_party and option in opt and rules.should_block(url, {option: opt[option]}):
            blocked += 1
            blocked_domains.add(res.fld)

    blocked_domains = [dom for dom in blocked_domains] if blocked_domains else 'No domains blocked'
    print(f'\nSite: {first_party}\n# of total HTTP requests: {i}\n'
          f'# of HTTP requests blocked: {blocked}\nBlocked domains: {blocked_domains}\n')
def parse(self, response):
    try:
        html = response.body.decode('utf-8')
    except UnicodeDecodeError:
        return
    emails = []
    phones = []
    print("parse")
    # Find mailto's
    mailtos = response.xpath(
        "//a[starts-with(@href, 'mailto')]/@href").getall()
    tels = response.xpath("//a[starts-with(@href, 'tel:')]/@href").getall()
    phones += [tel.replace("tel:", "") for tel in tels]
    emails = [mail.replace('mailto:', '') for mail in mailtos]
    body_emails = self.email_regex.findall(html)
    emails += [email for email in body_emails if
               get_tld('https://' + email.split('@')[-1], fail_silently=True)]

    yield {
        'emails': list(set(emails)),
        'phones': list(set(phones)),
        'page': response.request.url
    }

    if self.greedy:
        links = response.xpath("//a/@href").getall()
        # If there are external links, scrapy will block them
        # because of the allowed_domains setting
        for link in links:
            skip = False
            for key in self.forbidden_keys:
                if key in link:
                    skip = True
                    break
            if skip:
                continue
            try:
                yield scrapy.Request(link, callback=self.parse)
            except ValueError:
                try:
                    yield response.follow(link, callback=self.parse)
                except:
                    pass
def find_platform_names():
    old_dir = os.path.dirname(
        os.path.join(APP_ROOT, 'dataset_repo\\merged_files\\'))
    new_dir = os.path.dirname(os.path.join(APP_ROOT, 'dataset_repo\\Outputs\\'))
    df1 = pd.read_csv(old_dir + "\\" + "merged_all_csv.csv", sep=',',
                      encoding="ISO-8859-1", error_bad_lines=False)
    df2 = df1['TweetText']
    urls_dict = {}
    urls_list = []
    df = pd.DataFrame()
    df['Platform name'] = ''
    df['Sum count of Tweets'] = ''
    q = 0
    for text in df2:
        urls = re.findall(
            'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
            str(text))
        for url in urls:
            urls_list.append(url)
    for url in urls_list:
        parsed_uri = urlparse(url)
        domain = '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri)
        tld = get_tld(domain)  # Top-Level domain
        plat_name = tld.split('.')
        if plat_name[0] not in urls_dict:
            urls_dict[plat_name[0]] = 1
        else:
            urls_dict[plat_name[0]] = urls_dict[plat_name[0]] + 1
    for k, v in urls_dict.items():
        df.set_value(q, 'Platform name', k)
        df.set_value(q, 'Sum count of Tweets', v)
        q = q + 1
    df.to_csv(new_dir + '\\platform_count.csv', sep=',', index=False)
def run_monitor_one_shot(client, origin_file, ignore_patterns):
    global records
    global parse_start_time
    temp_dest = "~/.gc_history"
    alert = False
    matched = None

    # Make User History and fetch minimal info
    remote_cmd = f'sudo cp -p {origin_file} {temp_dest}'
    remote_cmd += f' && sudo chmod 777 {temp_dest}'
    remote_cmd += f' && sudo stat -f "%Sm %N" {temp_dest}'
    remote_cmd += f" && sqlite3 {temp_dest} \"SELECT last_visit_time, datetime(datetime(last_visit_time / 1000000 + (strftime('%s', '1601-01-01')), 'unixepoch'), 'localtime'), url FROM urls ORDER BY last_visit_time DESC LIMIT 15\""
    remote_cmd += f' && rm {temp_dest}'
    msg = NetHelpers.ssh_cmd_v2(client, remote_cmd)

    response = msg.split("\n")
    msg = f"{response[0]}\n"
    for record in reversed(response[1:]):
        data = record.split("|")
        if len(data) != 3:
            # Malformed. Ignore.
            continue
        data[0] = int(data[0])
        if data[0] <= parse_start_time:
            # We've already digested this record
            continue
        elif re.search(ignore_patterns, data[2]):
            msg += f"Ignoring: "
        elif any([re.search(pattern, data[2]) for pattern in Constants.BLACKLIST]):
            alert = True
            msg += f"ALERT!! "
            res = get_tld(data[2], as_object=True)  # Get the root as an object
            matched = res.fld
        msg += f"[{data[1]}] {data[2]})\n"
        if data[0] > parse_start_time:
            parse_start_time = data[0]
        records[data[1]] = data[2]
    return (alert, msg, matched)
def get_ba_dic():
    ba_data = xlrd.open_workbook('sjzx1.xls')
    ba_table = ba_data.sheets()[0]
    ba_dic = {}  # dict of already-registered (ICP-filed) entries
    nrows = ba_table.nrows
    for i in range(5, nrows):  # line range row (number -1, nrows)
        ips = ba_table.row_values(i)[25]
        ports = ba_table.row_values(i)[26]
        url = ba_table.row_values(i)[28]
        if ips == '' and url == '':
            continue
        else:
            if ips == '':
                if url[0:4] == 'http':
                    try:
                        ips = socket.getaddrinfo(get_tld(url), 'http')[0][4][0]  # get url ip
                    except Exception as e:
                        continue
                else:
                    continue
            for ip in ips.split(','):
                if not -1 == ip.find('-'):  # x.x.x.x-x
                    for j in range(int(ip[ip.rfind('.') + 1:].split('-')[0]),
                                   int(ip[ip.rfind('.') + 1:].split('-')[1]) + 1):  # ip range
                        p = []  # port list
                        for port in ports.split(','):  # append port
                            if not -1 == port.find('-'):  # x-x
                                for k in range(int(port[port.rfind('p') + 1:].split('-')[0]),
                                               int(port[port.rfind('p') + 1:].split('-')[1]) + 1):  # port range
                                    p.append((port[:3] + str(k)).encode('utf-8'))  # tcpk
                            else:
                                p.append(port.encode('utf-8'))
                        ba_dic[(ip[:ip.rfind('.') + 1] + str(j)).encode('utf-8')] = p  # add ip
                else:  # x.x.x.x
                    p = []  # port list
                    for port in ports.split(','):  # append port
                        if not -1 == port.find('-'):  # x-x
                            for j in range(int(port[port.rfind('p') + 1:].split('-')[0]),
                                           int(port[port.rfind('p') + 1:].split('-')[1]) + 1):
                                p.append((port[:3] + str(j)).encode('utf-8'))
                        else:
                            p.append(port.encode('utf-8'))
                    ba_dic[ip.encode('utf-8')] = p  # add ip
    return ba_dic
def save_screenshot_s3(self, site_url, country_code):
    conn = boto.connect_s3(
        aws_access_key_id=S3_ACCESS_KEY,
        aws_secret_access_key=S3_SECRET_KEY,
        #is_secure=False,  # uncomment if you are not using ssl
        calling_format=boto.s3.connection.OrdinaryCallingFormat(),
    )
    bucket = conn.get_bucket('synergetica')
    key = bucket.new_key(
        '%s/%s_%s_%s.png' % (self.screenshot_daily_folder,
                             get_tld(site_url).replace('.', '_'),
                             country_code, int(time.time())))
    key.set_contents_from_string(
        self.driver.get_screenshot_as_base64().decode('base64'))
    key.set_acl('public-read')
    url = key.generate_url(expires_in=0, query_auth=False, force_http=True)
    time.sleep(4)
    print 'screen_url=', url
    return url
def update_subdomain(subdomain, alive):
    """Subdomain database is maintained locally to keep track of identified
    live and known subdomains."""
    tld = get_tld(subdomain, as_object=True, fail_silently=True,
                  fix_protocol=True)
    try:
        # synchronize multithreaded DB_CURSOR.execute
        LOCK.acquire(True)
        if alive == "N":
            DB_CURSOR.execute(
                "insert into subdomains(subdomain, domain, first_found, alive, source) values(?, ?, ?, ?, ?)",
                (subdomain, tld.tld, datetime.now(), 0, "BountyMonitor"))
            CONNECTION.commit()
        elif alive == "Y":
            DB_CURSOR.execute(
                "update subdomains set alive=1 where subdomain = ?",
                (subdomain, ))
            CONNECTION.commit()
    finally:
        LOCK.release()
def getDomainTLD(ip="0.0.0.0"): if ip == "0.0.0.0" or ip == "" or ip == "\n": return global content global counter counter = counter + 1 print str(counter) + ":" try: t = threading.Thread(target=runGetDNSByIP, args=(ip, )) t.start() success = event.wait(timeout) print "{%s} - [%s]" % (ip, domain) if domain == "": content = content + "\n" else: content = content + get_tld("http://" + domain) + "\n" except Exception: print ":::ERROR::: Thread {" + str( counter) + "} failed to create. IP:" + ip + "\n"
def monitor(message, context):
    """certstream events callback handler"""
    all_domains = ""

    if message['message_type'] == "heartbeat":
        return

    if message["message_type"] == "certificate_update":
        all_domains = message["data"]["leaf_cert"]["all_domains"]

    for domain in set(all_domains):
        try:
            tld = get_tld(domain, as_object=True, fail_silently=True,
                          fix_protocol=True)
            if tld.fld in targets and not domain.startswith("*"):
                # if not utils.exists(domain):
                logging.info("New domain found " + domain)
                MONITOR_QUEUE.put(domain)
        except Exception as e:
            logger.exception("Checking domain " + domain + " failed")
            logger.exception(e)
def _choose_organization_name(force_install=False):
    esg_root_id = esg_property_manager.get_property("esg_root_id")
    if esg_root_id:
        logger.info("esg_root_id = [%s]", esg_root_id)
        return
    elif force_install:
        try:
            default_org_name = tld.get_tld(
                "http://" + socket.gethostname(), as_object=True).domain
        except tld.exceptions.TldDomainNotFound, error:
            logger.exception("Could not find top level domain for %s.",
                             socket.gethostname())
            default_org_name = "llnl"
        while True:
            org_name_input = raw_input(
                "What is the name of your organization? [{default_org_name}]: "
                .format(default_org_name=default_org_name)) or default_org_name
            # replace spaces so the stored property has no whitespace
            org_name_input = org_name_input.replace(" ", "_")
            esg_property_manager.write_as_property("esg_root_id", org_name_input)
            break
def check_whitelist(self, uri):
    if not uri:
        return False
    if any(whitelist_str in uri for whitelist_str in self.custom_whitelist):
        print "in custom whitelist!"
        return True
    parsed_uri = urlparse(uri)
    parsed_domain = '{uri.netloc}'.format(uri=parsed_uri)
    try:
        domain = get_tld(uri)
        if domain in self.alexa_whitelist:
            return True
    except Exception as e:
        print "error: ", str(e)
    return False
def switch_all_letters(self):
    """
    The following function generates all the possible combinations
    using homoglyphs
    """
    url = get_tld(self.url, as_object=True, fix_protocol=True)
    domain = url.domain
    domains = hg.Homoglyphs().get_combinations(domain)
    a = []
    i = 0
    print("Generated " + str(len(domains)) + " domains\n")
    for domain in domains:
        idna_domain = domain.encode('idna').decode('idna')
        if not a.__contains__(idna_domain):
            a.append(domain.encode('idna').decode('idna'))
        i = i + 1
        print(
            str(i) + ' out of ' + str(len(domains)) + ' domains: ' + str(len(a)))
    return a
def validate_domain(self, url) -> str:
    """
    Attempt to clean the provided url, and return the domain
    or ip address
    """
    is_valid_tld = tld.is_tld(url)

    # looks like a domain
    if is_valid_tld:
        res = tld.get_tld(url, fix_protocol=True, as_object=True)
        return res.parsed_url.netloc

    # not a domain, try ip address:
    if not is_valid_tld:
        parsed_url = urllib.parse.urlparse(url)
        if not parsed_url.netloc:
            # add the //, so that our url reading code
            # parses it properly
            parsed_url = urllib.parse.urlparse(f"//{url}")
        return parsed_url.netloc
def for_url(cls, url):
    domain = get_tld(url, fail_silently=True)  # fail silently
    if domain is None:
        domain = cls.is_tld_exception(url)
    if domain is None:
        return None

    parts = urlparse(url)
    # iol.co.za/isolezwe
    domain = domain + parts.path

    # find the medium with the longest matching domain
    for medium in sorted(Medium.query.all(),
                         key=lambda m: len(m.domain or ''),
                         reverse=True):
        if medium.domain and domain.startswith(medium.domain):
            return medium

    return None
def get_url_info(base_url):
    url_split = urlparse.urlsplit(base_url)
    url_info = {}
    url_info['site'] = url_split.netloc
    url_info['site'] = url_info['site'].split(':')[0]
    url_info['site_id'] = get_md5_i64(url_info['site'])
    url_info['path'] = url_split.path
    url_info['query'] = url_split.query
    url_info['fragment'] = url_split.fragment
    try:
        url_info['domain'] = get_tld(base_url)
    except Exception as e:
        url_info['domain'] = url_info['site']
    if url_info.get('domain'):
        url_info['domain_id'] = get_md5_i64(url_info['domain'])
    else:
        url_info['domain_id'] = None
    url_info['url'] = base_url
    url_info['url_id'] = get_md5_i64(base_url)
    return url_info
def classifier_urls(urls):
    oscwd = os.getcwd()
    r = get_redis(settings.CLASSIFIER_DB)
    keys = get_whitelist()
    ecommerce_spiders = []
    whitelist_spiders = []
    fuzzy_spiders = []
    for url in urls:
        spidername = get_tld(url, fail_silently=True)
        if spidername:
            filename = oscwd + settings.TEMP_PATH + '/spiderInitfiles_of_eCommerce' + '/spiderInit_' + spidername + '.py'
            if os.path.isfile(filename):
                r.rpush('Ecommerce', url)
                ecommerce_spiders.append(url)
            elif spidername in keys:
                r.rpush('Whitelist', url)
                whitelist_spiders.append(url)
            else:
                r.rpush('Fuzzy', url)
                fuzzy_spiders.append(url)
    return ecommerce_spiders, whitelist_spiders, fuzzy_spiders