def compare_host(host1, host2):
    """True if the domain.suffix part of both hosts is the same."""
    (_, domain1, suffix1) = tldextract.extract(host1)
    (_, domain2, suffix2) = tldextract.extract(host2)
    return domain1 == domain2 and suffix1 == suffix2
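# Illustrative usage sketch for compare_host above (hypothetical hosts; assumes a
# tldextract version compatible with the three-way unpacking used in the function):
# subdomains are ignored, so only the registered domain and suffix need to agree.
assert compare_host('mail.example.co.uk', 'www.example.co.uk')      # same example.co.uk
assert not compare_host('files.example.com', 'files.example.org')   # suffixes differ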
def _html_to_dict(self, url): #r = requests.get(url).text r = Crawlera().get(url).text print url try: company_name = BeautifulSoup(r).find('h1',{'itemprop':'name'}) company_name = company_name.find('strong').text except: return {"handle": url} address = BeautifulSoup(r).find('h1',{'itemprop':'name'}).find('span').text city = BeautifulSoup(r).find('span',{'itemprop':'addressLocality'}).text state = BeautifulSoup(r).find('span',{'itemprop':'addressRegion'}).text postal_code = BeautifulSoup(r).find('span',{'itemprop':'postalCode'}).text description = BeautifulSoup(r).find('article',{'itemprop':'description'}).text.strip().replace('\nMore...','') logo = BeautifulSoup(r).find('figure').find('img')['src'] website = BeautifulSoup(r).find('li',{'class':'website'}).find('a')['href'].split('gourl?')[-1] domain = "{}.{}".format(tldextract.extract(website).domain, tldextract.extract(website).tld) ''' Phone ''' main = BeautifulSoup(r).find('li',{'class':'phone'}).find('strong',{'class':'primary'}).text numbers = BeautifulSoup(r).find('li',{'class':'phone'}).findAll('li') nums = [number.find('span').text for number in numbers] names = [number.text.split(number.find('span').text)[0] for number in numbers] numbers = dict(zip(names, nums)) numbers['main'] = main _vars = [company_name, address, city, state, postal_code, description, logo, website, domain] labels = ["name","address","city","state","postal_code", "description", "logo", "website", "domain"] company = dict(zip(labels, _vars)) company["numbers"] = numbers company["handle"] = url return company
def handle(self):
    SO_ORIGINAL_DST = 80
    # self.request is the client connection/socket
    # Get the original destination IP before the iptables redirect
    dst = self.request.getsockopt(socket.SOL_IP, SO_ORIGINAL_DST, 16)
    _, dst_port, ip1, ip2, ip3, ip4 = struct.unpack("!HHBBBB8x", dst)
    dst_ip = '%s.%s.%s.%s' % (ip1, ip2, ip3, ip4)
    peername = '%s:%s' % (self.request.getpeername()[0], self.request.getpeername()[1])
    print success('Client %s -> %s:443' % (peername, dst_ip))
    RemoteHostnames[dst_ip] = getCertHostnamesCached(dst_ip)
    #RemoteHostnames[dst_ip] = ['*.*.*.*','*.*.*','*.*','*']  # example fixed wildcard cert
    CN = RemoteHostnames[dst_ip][0]  # SSL_Certificate_CN2 module will return CN as first list element
    if add_extra_hostnames:
        import tldextract
        domain = tldextract.extract(CN).domain
        tld = tldextract.extract(CN).tld
        # kludge to work around lack of good support for SNI (server name indication) in python:
        # without this, requests to (e.g.) https://google.com fail, as the CN is
        # www.google.com and there is no subjectAltName 'google.com' in the cert.
        # However, adding extra hostnames as subjectAltNames makes other certs fail
        # to validate, so this is disabled by default.
        bonus_hostnames = []
        bonus_hostnames.append('www.%s.%s' % (domain, tld))
        bonus_hostnames.append('*.%s.%s' % (domain, tld))
        bonus_hostnames.append('%s.%s' % (domain, tld))
        for extra_name in bonus_hostnames:
            if extra_name not in RemoteHostnames[dst_ip]:
                RemoteHostnames[dst_ip].append(extra_name)
    PhoneConnected = False
    CreateSignedX509Certificate(ip=dst_ip, hostnames=RemoteHostnames[dst_ip], peername=peername)
    try:
        (certfile, keyfile) = GeneratedCert[dst_ip]
        #print 'Setting up SSL socket using %s' % certfile
        stream_phone = ssl.wrap_socket(self.request, server_side=True, certfile=certfile,
                                       keyfile=keyfile, ssl_version=ssl.PROTOCOL_TLSv1)
        PhoneConnected = True
    except (ssl.SSLError), e:
        print error('SSLError on connection to phone (%s)' % e)
        self.finish()
def start(self): for ext in file_extensions: if ext in url_file(self.url): db.collections.update_one({ 'structure': '#URLEntry', 'url': self.url }, {'$set': { 'last_scraped': time.strftime("%Y-%m-%d %H:%M:%S")}}) print('Skipping: {}'.format(self.url)) return None try: with self.sess as sess: html_doc = sess.get(self.url, timeout=3).text except (InvalidSchema, ConnectionError, Timeout, TooManyRedirects): db.collections.remove( { 'structure': '#URLEntry', 'url': self.url } ) return None soup = BeautifulSoup(html_doc, 'html.parser') urls = self.get_urls(soup) for url in urls: existing = db.collections.find_one({ 'structure': '#URLEntry', 'url': url }) if existing is None: try: tld = tldextract.extract(url).suffix except: tld = '*' entry = URLEntry(domain=self.get_domain(url), url=url, tld=tld) db.collections.insert_one(entry.export()) this_existing = db.collections.find_one({ 'structure': '#URLEntry', 'domain': self.get_domain(self.url), 'url': self.url }) if this_existing is not None: db.collections.update_one({ 'structure': '#URLEntry', 'domain': self.get_domain(self.url), 'url': self.url }, {'$set': { 'last_scraped': time.strftime("%Y-%m-%d %H:%M:%S")}}) else: try: tld = tldextract.extract(self.url).suffix except: tld = '*' entry = URLEntry(domain=self.get_domain(self.url), url=self.url, tld=tld) db.collections.insert_one(entry.export())
def test_tldextract():
    '''
    verify that tldextract parses just the netloc

    This is neither documented nor tested by tldextract (!)
    '''
    assert tldextract.extract('example.com').registered_domain == 'example.com'
    assert tldextract.extract('www.example.com').registered_domain == 'example.com'
def crawlList(list):
    main_dict = parsedDictionary.parsedDictionary()
    # iterate through domains
    for i in range(0, len(list)):
        print "Scripts present at " + list[i]
        scripts = getScripts(list[i])
        printList(scripts)
        # iterate through this domain's scripts:
        # this code checks whether the script is linked externally or is hosted
        # on the same domain (given by a relative URL)
        dict = parsedDictionary.parsedDictionary()
        for y in range(0, len(scripts)):
            full = ''
            if scripts[y].startswith("//") or scripts[y].startswith("http"):
                full = tldextract.extract(scripts[y])
                if len(full.domain) <= 1:
                    full = tldextract.extract(list[i])
            else:
                full = tldextract.extract(list[i])
            link = full.domain + '.' + full.suffix
            if not dict.exists(link):
                dict.addElement(link)
        main_dict.add(dict)
        print main_dict.Dict
        print "}}}}}"
        print dict.Dict
        print "\n -------------------------------"
    sortedlist = main_dict.sortByValue()
    print " \n Top scripts: "
    printList(sortedlist)
def _cache_html_to_df(self, html): company = BeautifulSoup(html) title = company.find('div',{'class':'companyTitle'}) description = company.find('div',{'class':'companyDescription'}) revenue = company.find('div',{'class':'companyRevenue'}) address = company.find('div',{'class':'companyAddress'}) employee_count = company.find('p',{'class':'companyEmployeeCountText'}) website = company.find('div',{'class':'website'}) phone = company.find('span',{'class':'hq'}) industries = company.find('p', {'class':'industry'}) industries = industries.find_all('span') if industries else [] industries = [industry.text for industry in industries] data = [title, description, revenue, address, employee_count, website, phone] columns = ["name", "description", "revenue", "address", "headcount","website","phone"] # add industries data = [val.text.strip() if val else "" for val in data] data = dict(zip(columns, data)) data["industry"] = industries print data data["domain"] = "{}.{}".format(tldextract.extract(data["website"]).domain, tldextract.extract(data["website"]).tld) try: data['logo'] = company.find('img',{'class':'companyLogo'})['src'] except: data['logo'] = "" data["source"] = "zoominfo" data['headcount'] = data['headcount'].split('Employees')[0] data['description'] = data['description'].split('Company Description')[-1] data['revenue'] = data['revenue'].split('in Revenue')[0] # add fullcontact address support print data return data
def process_item(self, item, spider):
    domain_name = tldextract.extract(item['url']).domain
    db = self.connection[domain_name]  # use the domain name as the database name
    self.collection = db[settings['MONGODB_COLLECTION']]
    valid = True
    for data in item:
        if not data:
            valid = False
            raise DropItem("Missing {0}!".format(data))
    if valid:
        if domain_name in spider.crawledPagesPerSite and \
                spider.crawledPagesPerSite[domain_name] > spider.maximumPagesPerSite:
            return None
        self.collection.insert(dict(item))
        if domain_name in spider.crawledPagesPerSite:
            spider.crawledPagesPerSite[domain_name] += 1
        else:
            spider.crawledPagesPerSite[domain_name] = 1
        print "crawledPagesPerSite", spider.crawledPagesPerSite[domain_name]
        print "spider.allowed_domains", spider.allowed_domains
        print "spider.maximumPagesPerSite", spider.maximumPagesPerSite
        print "domain_name", domain_name, item['url']
        if spider.crawledPagesPerSite[domain_name] > spider.maximumPagesPerSite:
            suffix = tldextract.extract(item['url']).suffix
            domain_and_suffix = domain_name + "." + suffix
            print domain_and_suffix
            if domain_and_suffix in spider.allowed_domains:
                spider.allowed_domains.remove(domain_and_suffix)
                spider.dynamic_deny_domain.append(domain_name)
                #spider.rules[0].link_extractor.allow_domains.remove(domain_and_suffix)
                spider.rules[0].link_extractor.deny_domains.add(domain_and_suffix)
            print "spider.allowed_domains", spider.allowed_domains
            return None
        log.msg("Item added to MongoDB database!", level=log.DEBUG, spider=spider)
        return item
def same_domain(url1, url2):
    url1_extract = tldextract.extract(url1)
    url2_extract = tldextract.extract(url2)
    if url1_extract.domain == url2_extract.domain:
        return True
    else:
        return False
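# Hedged usage note for same_domain above: only the registrable domain label is
# compared, so URLs that differ only in suffix or subdomain still match.
print(same_domain('https://www.example.com/a', 'http://example.com/b'))  # True
print(same_domain('https://example.com', 'https://example.org'))         # True: the suffix is not compared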
def loadLists(writer=sys.stdout): if isStale(suspect_file): print >> writer, "Updating ISC Suspicious Domains..." new_file = requests.get(isc_url) with open(suspect_file, 'w') as sf_buffer: sf_buffer.write(new_file.content) if safebrowsing_bootstrap: print("Initial download of SafeBrowsing DB... this will take a few minutes.") updateSafebrowsing() elif isStale(safebrowsing_db, maxTime=259200): print >> writer, "Updating Google Safebrowsing DB..." updateSafebrowsing() if isStale(topthousand_file, maxTime=2629743): print >> writer, "Updating Alexa Top 1000..." new_file = requests.get(topmillion_url) with zipfile.ZipFile(StringIO(new_file.content), 'r') as zipData: with zipData.open('top-1m.csv', 'r') as oneMil: with open(topthousand_file, 'w') as topThousand: for i in range(0,1000): topThousand.write(oneMil.readline()) for sf_read in open(suspect_file): badDomain = tldextract.extract(sf_read) ISC_LIST.append(badDomain) for topthousand_read in open(topthousand_file): cleaned_line = topthousand_read.split(",")[1].strip() valuableDomain = tldextract.extract(cleaned_line) ALEXA_LIST.append(valuableDomain)
def insert(data): if data.strip(): con = MySQLdb.connect(host="localhost", # your host, usually localhost user="******", # your username passwd="1234", # your password db="rabbitmq") # name of the data base cur = con.cursor() query="insert into rabbitmq (url,domain,ttl,class,type,ip,worker)values(%s,%s,%s,%s,%s,%s,%s)" tld="" try: tld=tldextract.extract(data).registered_domain except: traceback.format_exc() try: digs= os.popen("dig +tries=1 +timeout=1 +noall +answer "+tldextract.extract(tld).registered_domain).read() digs=str(digs).split('\n') for dig in digs: if(dig.strip()): try: dig=dig.replace("\t\t","\t") dig=dig.replace("\t\t","\t") temp=dig.split('\t') print "Data: "+temp[0] +"\t Data: "+ temp[1]+"\t Data: "+ temp[2]+"\t Data: "+ temp[3]+"\t Data: "+ temp[4] params=(data.strip(),tld.strip(),temp[1].strip(),temp[2].strip(),temp[3].strip(),temp[4].strip(),worker) cur.execute(query,params) except: params=(data.strip(),tld.strip(),"","","","",worker) cur.execute(query,params) except: params=(data.strip(),tld.strip(),"","","","",worker) cur.execute(query,params) con.commit() cur.close() con.close()
def email_pattern_research():
    website = request.args['domain']
    domain = "{}.{}".format(tldextract.extract(website).domain,
                            tldextract.extract(website).tld)
    api_key = "9a31a1defcdc87a618e12970435fd44741d7b88794f7396cbec486b8"
    name = request.args['name'] if "name" in request.args.keys() else ""
    q.enqueue(EmailGuess().search_sources, domain, name, api_key, timeout=6000)
    return {'email_research_started': True}
def is_same_domain(url1, url2):
    """Check whether the seed URL and another URL belong to the same domain.

    >>> is_same_domain("http://kracekumar.wordpress.com", "http://wordpress.com")
    True
    >>> is_same_domain("http://kracekumar.com", "http://tumblr.com")
    False
    """
    return tldextract.extract(url1).domain == tldextract.extract(url2).domain
def mxsniff(email_or_domain, ignore_errors=False, cache=None): """ Lookup MX records for a given email address, URL or domain name and identify the email service provider(s) from an internal list of known service providers. :param str email_or_domain: Email, domain or URL to lookup :return: Identified service provider, or a list if there's more than one (in unusual circumstances) >>> mxsniff('example.com')['match'] ['nomx'] >>> mxsniff('__invalid_domain_name__.com')['match'] ['nomx'] >>> mxsniff('*****@*****.**')['match'] ['google-gmail'] >>> sorted(mxsniff('https://google.com/').items()) [('domain', 'google.com'), ('match', ['google-apps']), ('mx', [(10, 'aspmx.l.google.com'), (20, 'alt1.aspmx.l.google.com'), (30, 'alt2.aspmx.l.google.com'), (40, 'alt3.aspmx.l.google.com'), (50, 'alt4.aspmx.l.google.com')]), ('mx_tld', ['google.com']), ('query', 'https://google.com/')] """ domain = get_domain(email_or_domain) if cache and domain in cache: return cache[domain] result = [] tld = [] try: answers = [] # Default value in case of verbose mode where an error occurs answers = sorted([(rdata.preference, rdata.exchange.to_text(omit_final_dot=True).lower()) for rdata in dns.resolver.query(domain, 'MX')]) for preference, exchange in answers: rdomain = tldextract.extract(exchange).registered_domain if rdomain not in tld: tld.append(rdomain) provider = provider_domains.get(exchange) if provider and provider not in result: result.append(provider) except (dns.resolver.NoAnswer, dns.resolver.NXDOMAIN, dns.resolver.NoNameservers): pass except dns.exception.DNSException as e: if ignore_errors: pass else: raise MXLookupException('{exc} {error} ({domain})'.format( exc=e.__class__.__name__, error=text_type(e), domain=domain)) if not result: # Check for self-hosted email servers; identify them with the label 'self' if tldextract.extract(domain).registered_domain in tld: result.append('self') if not result: if answers: result.append('unknown') # We don't know this one's provider else: result.append('nomx') # This domain has no mail servers result = {'query': email_or_domain, 'domain': domain, 'match': result, 'mx': answers, 'mx_tld': tld} if cache: cache[domain] = result return result
def check_domain_limit(self, url):
    for domain in self.limit_domain:
        ext = tldextract.extract(domain)
        # A "*" (or empty) subdomain matches every subdomain of the registered
        # domain; otherwise the URL must match the configured domain exactly.
        if ((ext[0] == "*" or ext[0] == "") and tldextract.extract(url)[1] == ext[1]) or \
                (".".join(tldextract.extract(url)) == domain):
            return True
    return False
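# Standalone sketch of the matching rule used by check_domain_limit above, with a
# hypothetical limit_domain list; like the method, it assumes a tldextract version
# whose ExtractResult still behaves as a plain tuple.
import tldextract

limit_domain = ["*.example.com", "blog.another.org"]  # hypothetical configuration

def is_allowed(url):
    for domain in limit_domain:
        ext = tldextract.extract(domain)
        # "*" (or an empty subdomain) matches any host with the same domain label;
        # otherwise the full subdomain.domain.suffix string must match exactly.
        if ((ext[0] == "*" or ext[0] == "") and tldextract.extract(url)[1] == ext[1]) or \
                ".".join(tldextract.extract(url)) == domain:
            return True
    return False

print(is_allowed("https://shop.example.com/item"))  # True  (wildcard subdomain)
print(is_allowed("https://blog.another.org/post"))  # True  (exact match)
print(is_allowed("https://other.org/"))             # False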
def check_match(_url, url):
    target_url = _url['target_url']
    allowed_domains = _url['allowed_domains']
    match = False
    url_domain = tldextract.extract(url).domain.lower()
    target_url_domain = tldextract.extract(target_url).domain.lower()
    if url_domain == target_url_domain or url_domain in allowed_domains:
        match = True
    return match
def _check_match(self, url):
    match = False
    url_domain = tldextract.extract(url).domain.lower()
    target_url_domain = tldextract.extract(
        self._data['url_data']['target_url']
    ).domain.lower()
    if url_domain == target_url_domain or \
            url_domain in self._data['url_data']['allowed_domains']:
        match = True
    return match
def validRedirect(history):
    first = history[0]
    last = history[-1]
    if "blogspot" in first and "blogspot" in last:
        return True
    first = tldextract.extract(first)
    last = tldextract.extract(last)
    if first.domain != last.domain:
        return False
    else:
        return True
def extract_HAR_features(harfile): """ Opens a HAR file (JSON), extracts features from it and store them in a dict. Returns the dict with the features. """ har_features = {} har = json.loads(open(harfile).read()) domain = har["log"]["pages"][0]["id"] # Extract domain ext = tldextract.extract(domain) domain = ext.domain + '.' + ext.suffix domainNoTLD = ext.domain # initialize variables domainStringSent, firstparty_data, thirdparty_data, firstparty_html, thirdparty_html, firstparty_requests, thirdparty_requests = 0, 0, 0, 0, 0, 0, 0 for entry in har["log"]["entries"]: requestUrl = str(entry["request"]["url"]) ext = tldextract.extract(requestUrl) requestDomain = ext.domain + '.' + ext.suffix # Check if the domainNoTLD is passed in the parameters of the request url_parameters = re.search('https?:\/\/.*\/(.*)', requestUrl) if url_parameters: if domainNoTLD in url_parameters.group(1): domainStringSent += 1 # Check if this is a first-party request (Request domain == site domain) result = re.search('https?:\/\/(.*)\/.*', requestUrl) if result: if domain in result.group(1): # print requestUrl, 'is FIRST party request of size', entry["response"]["bodySize"] firstparty_requests += 1 firstparty_data += int(entry["response"]["bodySize"]) if entry["response"]["content"]["mimeType"]: mimeType = entry["response"]["content"]["mimeType"] if 'text' in mimeType or 'javascript' in mimeType: firstparty_html += entry["response"]["bodySize"] else: # print requestUrl, 'is THIRD party request of size', entry["response"]["bodySize"] thirdparty_requests += 1 thirdparty_data += int(entry["response"]["bodySize"]) if entry["response"]["content"]["mimeType"]: mimeType = entry["response"]["content"]["mimeType"] if 'text' in mimeType or 'javascript' in mimeType: thirdparty_html += entry["response"]["bodySize"] har_features['TP_DataRatio'] = safe_division(thirdparty_data, firstparty_data + thirdparty_data) har_features['TP_HtmlRatio'] = safe_division(thirdparty_html, firstparty_html + thirdparty_html) har_features['TP_RequestRatio'] = safe_division(thirdparty_requests, firstparty_requests + thirdparty_requests) har_features['domainStringSent'] = domainStringSent har_features['initialResponseSize'] = har["log"]["entries"][0]["response"]["bodySize"] har_features['initialResponseRatio'] = safe_division(har_features['initialResponseSize'], firstparty_data + thirdparty_data) return har_features
def url_as_diff(new, old): if new == old: return '<same>' if new == '-': return new old_parse = urlparse.urlsplit(old) new_parse = urlparse.urlsplit(new) changed = set() for f in old_parse._fields: new_f = getattr(new_parse, f) if new_f and new_f == getattr(old_parse, f): new_parse = new_parse._replace(**{f: '<{}>'.format(f)}) elif new_f: changed.add(f) if tuple(changed) == ('scheme',): return '{}://<same>'.format(new_parse.scheme) if (not new_parse.netloc.startswith('<') and new_parse.port is None and old_parse.port is None): new_domain = tldextract.extract(new_parse.netloc) old_domain = tldextract.extract(old_parse.netloc) for f in old_domain._fields: new_f = getattr(new_domain, f) if new_f and new_f == getattr(old_domain, f): new_domain = new_domain._replace(**{f: '<{}>'.format(f)}) new_domain = '.'.join(new_domain).replace('<domain>.<suffix>', '<domain+>') new_parse = new_parse._replace(netloc=new_domain) if new_parse.path == old_parse.path + '/': new_parse = new_parse._replace(path='<path>/') if new_parse.path.startswith('/') and old_parse.path.startswith('/'): new_dirs = new_parse.path[1:].split('/') old_dirs = old_parse.path[1:].split('/') if new_dirs[-1] and new_dirs[-1] == old_dirs[-1]: new_dirs[-1] = '<basename>' old_dirs = {d: i for i, d in enumerate(old_dirs)} for i, new_dir in enumerate(new_dirs): if new_dir in old_dirs: new_dirs[i] = '<dir{}>'.format(old_dirs[new_dir] + 1) new_parse = new_parse._replace(path='/' + '/'.join(new_dirs)) if (old_parse.query and new_parse.query and not new_parse.query.startswith('<')): old_query = set(old_parse.query.split('&')) new_query = set(new_parse.query.split('&')) if new_query > old_query: new_params = '&'.join(sorted(map(urllib.quote, new_query - old_query))) new_parse = new_parse._replace(query='<query>' + '&' + new_params) out = new_parse.geturl() return out
def _company_cache_html_to_df(self, html): company_info = pd.DataFrame() c = BeautifulSoup(html) #print c.find('dd',{'class','basic-info-about'}).text if True: cols = [i.find('h4').text for i in c.find('dd',{'class','basic-info-about'}).findAll('li')] vals = [i.find('p').text.strip() for i in c.find('dd',{'class','basic-info-about'}).findAll('li')] company_info = company_info.append(dict(zip(cols,vals)),ignore_index=True) company_info.columns = [col.replace(' ','_').strip().lower() for col in company_info.columns] description = c.find('div', {'class':'description'}) description = description.text.strip() if description else None company_info['description'] = description # rename companies title columns img = c.find('div',{'class':'image-wrapper'}).find('img')['src'] company_info['logo'] = img # new code not in other methods in different file company_info['name'] = c.find('h1',{'class':'name'}).text.strip() employee_count = c.find('a',{'class':'employee-count'}) if employee_count: company_info['employee_count'] = int(employee_count.text.replace(',','')) url = None for i in c.find_all("h3"): if i.find("a"): url = i.find("a")["href"] url = url.split("?")[-1] args = dict([i.split("=") for i in url.split("&")]) if "f_CC" in args.keys(): url = "http://linkedin.com/company/{0}".format(args["f_CC"]) else: url = None company_info["linkedin_url"] = url if 'headquarters' in company_info.columns: company_info['address'] = company_info['headquarters'] company_info.drop('headquarters', axis=1, inplace=True) if 'industry' in company_info.columns: company_info['industry'] = [[company_info['industry'].ix[0]] for i in range(company_info.shape[0])] website = company_info['website'].ix[0] domain = "{}.{}".format(tldextract.extract(website).domain, tldextract.extract(website).tld) company_info['domain'] = domain company_info['source'] = "linkedin" company_info['headcount'] = company_info['company_size'] company_info['headcount'] = company_info['headcount'].ix[0].split(' ')[0] if 'company_size' in company_info.columns: company_info.drop('company_size', axis=1, inplace=True) return company_info '''
def _company_profile(self, company_name, api_key=""):
    g = Google().search(company_name)
    g = g[~g.link_text.str.contains("Map for")]
    #print g
    #print g.link.tolist()[0]
    domain = g.link.tolist()[0]
    domain = "{}.{}".format(tldextract.extract(domain).domain,
                            tldextract.extract(domain).tld)
    print domain
    company = clearbit.Company.find(domain=domain, stream=True)
    company = company if company else {}
    company["company_name"] = company_name
    del company["founders"]
def enhance_flow(flowDF, ftu): """ Add some useful columns to a http dataframe. Parameters ---------- flowDF : dataframe The enhanced HTTP log dataframe Returns ------- flowDF: the dataframe with some columns added """ #create some useful pre-features #stringify the port. probably no longer needed since we defensivley stringify things elsewhere. #flowDF['resp_p_str'] = flowDF['resp_p'].apply(str) #extract the browser string from the user agent. if 'browser_string' in ftu: flowDF['browser_string'] = flowDF['user_agent'].apply(lambda agent: httpagentparser.simple_detect(agent)[1]) def paramsSSV(uri): fullUri = 'http://bogus.com/'+uri parseResult = parse_qs(urlparse(fullUri).query) return ' '.join(parseResult.keys()) #create a SSV of the URI parameter keys if 'URIparams' in ftu: flowDF['URIparams'] = flowDF['uri'].apply(paramsSSV) def tokensSSV(uri): fullUri = 'http://bogus.com/'+uri parseResult = parse_qs(urlparse(fullUri).query) return ' '.join([" ".join(vals) for vals in parseResult.values()]) #create a SSV of the URI parameter values if 'URItokens' in ftu: flowDF['URItokens'] = flowDF['uri'].apply(tokensSSV) #extract the subdomain from the host if 'subdomain' in ftu: flowDF['subdomain'] = flowDF['host'].apply(lambda host: tldextract.extract(host)[0]) #extract the TLD from the host if 'tld' in ftu: flowDF['tld'] = flowDF['host'].apply(lambda host: tldextract.extract(host)[1]) return flowDF
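# Hedged usage sketch for enhance_flow above, with a hypothetical one-row HTTP log;
# it assumes the surrounding module already imports pandas, tldextract and
# urllib.parse's urlparse/parse_qs as the function requires. 'browser_string' is
# left out of ftu so httpagentparser is not needed here.
import pandas as pd

flowDF = pd.DataFrame({
    'host': ['ads.tracker.example.com'],
    'uri': ['/pixel?id=42&ref=homepage'],
    'user_agent': ['Mozilla/5.0'],
})
enhanced = enhance_flow(flowDF, ftu=['URIparams', 'URItokens', 'subdomain', 'tld'])
print(enhanced[['subdomain', 'tld', 'URIparams', 'URItokens']])
# subdomain -> 'ads.tracker'; tld -> 'example' (index 1 of the extract result)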
def ensure_add_cookie(self, cookie, override_domain=None): """Ensures a cookie gets added to the driver Selenium needs the driver to be currently at the domain of the cookie before allowing you to add it, so we need to get through this limitation. The cookie parameter is a dict which must contain the keys (name, value, domain) and may contain the keys (path, expiry). We first check that we aren't currently in the cookie's domain, if we aren't, we GET the cookie's domain and then add the cookies to the driver. We can override the cookie's domain using 'override_domain'. The use for this parameter is that sometimes GETting the cookie's domain redirects you to a different sub domain, and therefore adding the cookie fails. So sometimes the user may need to override the cookie's domain to a less strict one, Eg.: 'site.com' instead of 'home.site.com', in this way even if the site redirects us to a subdomain, the cookie will stick. If you set the domain to '', the cookie gets added with whatever domain the browser is currently at (at least in chrome it does), so this ensures the cookie gets added. It also retries adding the cookie with a more permissive domain if it fails in the first try, and raises an exception if that fails. The standard selenium behaviour in this case was to not do anything, which was very hard to debug. """ if override_domain: cookie['domain'] = override_domain cookie_domain = cookie['domain'] if cookie['domain'][0] != '.' else cookie['domain'][1:] try: browser_domain = tldextract.extract(self.current_url).fqdn except AttributeError: browser_domain = '' if cookie_domain not in browser_domain: # TODO Check if hardcoding 'http' causes trouble # TODO Consider using a new proxy for this next request to not cause an anomalous # request. This way their server sees our ip address as continuously having the # same cookies and not have a request mid-session with no cookies self.get('http://' + cookie_domain) # Fixes phantomjs bug, all domains must start with a period if self.name == "phantomjs": cookie['domain'] = '.' + cookie['domain'] self.add_cookie(cookie) # If we fail adding the cookie, retry with a more permissive domain if not self.is_cookie_in_driver(cookie): cookie['domain'] = tldextract.extract(cookie['domain']).registered_domain self.add_cookie(cookie) if not self.is_cookie_in_driver(cookie): raise WebDriverException( "Couldn't add the following cookie to the webdriver\n{}\n".format(cookie) )
def parse_ad(rad): d = dict() t = rad.find('h3').find('a') dest_url = t.get('href') if dest_url: d['dest_url'] = dest_url # d = dict(d,**{'dest_url':dest_url}) dest_url_parsed = parse_qs(dest_url) if dest_url_parsed: dest_url_parsed = {k:v[0] for k,v in dest_url_parsed.iteritems()} if dest_url_parsed: d['dest_url_parsed'] = dest_url_parsed if dest_url_parsed.has_key('adurl'): adurl = dest_url_parsed['adurl'] if adurl: d['adurl'] = adurl d['adurl_domain'] = tldextract.extract(adurl).domain title = t.getText() if title: d['title'] = title #d = dict(d,**{'title':title}) disp_url = rad.find('div','kv') if disp_url: d['disp_url'] = disp_url.getText() d['disp_url_domain'] = tldextract.extract(d['disp_url']).domain # ad_text_html = rad.find('span','ac') if ad_text_html: d['ad_text_html'] = ad_text_html.renderContents() ad_text_lines = [re.sub(r"</?b>","",x) for x in d['ad_text_html'].split('<br/>')] if len(ad_text_lines)>=1: d['ad_text_line_1'] = ad_text_lines[0] if len(ad_text_lines)>=2: d['ad_text_line_2'] = ad_text_lines[1] else: d['ad_text_line_2'] = '' else: d['ad_text_line_1'] = '' div_f_html = rad.find('div','f') if div_f_html: d['div_f_html'] = div_f_html.renderContents() d['div_f_text'] = div_f_html.get_text('|||') # ad_text = ttt.getText(separator='|||') return d
def parse_file(filename, output_file): input_file = open(filename,'r') domains_lookedup = [] excluded_domains = [] total_domain_count = 0 if output_file != 0: data = [csv_headers()] noutput_file = output_file.split('.',1)[0]+'.csv' print """ ****************** Writing output to %s ****************** """%noutput_file for domain in input_file.readlines(): ndomain = tldextract.extract(domain) tld_domain = ndomain[1]+'.'+ndomain[2] if tld_domain not in domains_lookedup: domains_lookedup.append(tld_domain) total_domain_count += 1 whois_data = get_whois_data(tld_domain,1) if whois_data != 0: data.append(whois_data) else: excluded_domains.append(tld_domain) time.sleep(2) print """ Attempted to retrieve whois information for %s domains Successful lookups: %s Unsuccessful lookups: %s """%(str(total_domain_count),str(total_domain_count-len(excluded_domains)),str(len(excluded_domains))) write_to_file(data,noutput_file) else: for domain in input_file.readlines(): ndomain = tldextract.extract(domain) tld_domain = ndomain[1]+'.'+ndomain[2] if tld_domain not in domains_lookedup: domains_lookedup.append(tld_domain) total_domain_count += 1 whois_info = get_whois_data(tld_domain,2) if whois_info != 0: print "\n****************** %s ******************"%tld_domain.strip() for key,value in whois_info.items(): print key+": "+value else: excluded_domains.append(domain) time.sleep(2) print """ Attempted to retrieve whois information for %s domains Successful lookups: %s Unsuccessful lookups: %s """%(str(total_domain_count),str(total_domain_count-len(excluded_domains)),str(len(excluded_domains))) print excluded_domains
def clean_row(row):
    """
    Clean a row to [0, 2, 27, 30, 31, 7, 17, 26, 32, 33, 34, 35, 37, 44, 58]
    """
    if len(row) < NUM_FIELDS:
        row.extend("")
    new_row = [row[0]]  # just id
    # format date
    unix_epoch = datetime(1970, 1, 1)
    sql_date = row[1]
    parsed_sql_date = datetime(int(sql_date[:4]), int(sql_date[4:6]), int(sql_date[6:8]))
    days_since_epoch = (parsed_sql_date - unix_epoch).days
    new_row.extend([days_since_epoch])
    # add other fields
    new_row.extend(
        [row[26], row[29], row[30], row[6], row[16], row[25], row[31],
         row[32], row[33], row[34], row[36], row[43]]
    )
    # format url
    domain = tldextract.extract(row[57]).domain
    new_row.extend([domain])
    return new_row
def add_to_org(sender, **kwargs):
    org = sender.objects.last()
    user = org.creator
    member, created = OrganisationMember.objects.get_or_create(
        org=org, member=user)
    if created:
        tld = tldextract.extract(org.url)
        client = Client()
        client.name = org.name
        client.organisation = org
        client.schema_name = org.slug
        client.paid_until = datetime.now() + timedelta(days=90)
        try:
            client.domain_url = tld.domain
            client.save()
        except KeyError:
            try:
                client.domain_url = tld.domain + '-' + tld.subdomain
                client.save()
            except KeyError:
                client.domain_url = org.slug
                client.save()
def _process_record(self, r): # user agent and browser os, browser = self.__process_user_agent(r['user_agent']) if os: self.data['os'].add(os) if browser: self.data['browser'].add(browser) # http basic auth usernames and passwords if r['username'] != '-': self.data['http-usernames'].add(r['username']) if r['password'] != '-': self.data['http-passwords'].add(r['password']) # ip address if r['id.orig_h'] != '-': self.data['device-ip'] = r['id.orig_h'] host = r['host'] uri = r['uri'] if host != "-" or uri != "-": data = (host, uri, r['ts']) self.data['http-queries'].add( data ) # Also get referrer data ref = r['referrer'] domain = tldextract.extract(ref) if ref: refhost = "%s.%s" % (domain.domain, domain.suffix) refdata = (refhost, ref, r['ts']) self.data['http-queries'].add(refdata)
def spider(url,lvl=1): tld_url = tldextract.extract(url) tld_url = '.'.join(tld_url[:3]) pos = url.rfind('/') outFile = url[pos+1:] print (outFile) response = requests.get(url) #storing all the information including headers in the variable source code if response.status_code == 200: plain_text = response.text #sort source code and store only the plaintext convert_data = BeautifulSoup(plain_text) #converting plain_text to Beautiful Soup object so the library can sort thru it for link in convert_data.findAll('a'): #sorting useful information if link.get('href').find('//') == 0: #address URLs that start with // href = 'https:' + link.get('href') elif validators.url(link.get('href')): #address absolute URLs href = link.get('href') else: #address relative URLs href = url + link.get('href') #Building a clickable url #insertSQL(href, convert_data) print(indent(lvl) +str(lvl) + '. ' +href) #displaying the result back to the user #outData = codecs.open(saveLocation +'\\' +outFile +'.html', 'w', 'utf-8') #outData.write(plain_text) #outData.close() if lvl < max_depth: spider(href, lvl+1)
def parse_source(url):
    """Return stripped url containing only domain and suffix."""
    return '{0.domain}.{0.suffix}'.format(extract(url))
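# Illustrative call to parse_source above (hypothetical URL): scheme, subdomain,
# path and query are all dropped.
print(parse_source('https://news.example.co.uk/politics?id=1'))  # -> example.co.uk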
from urllib.request import urlopen

import lxml.html
import tldextract

url = 'https://www.google.com'
connection = urlopen(url)
dom = lxml.html.fromstring(connection.read())
extracted = tldextract.extract(url)
main_domain = extracted.domain

yes = 0
no = 0
count = 0
for link in dom.xpath('//a/@href'):  # select the url in href for all a tags (links)
    link = str(link)
    count += 1
    if main_domain in link:
        yes += 1
    elif link.startswith('/'):
        yes += 1
    else:
        print(link)
        no += 1

print((yes / count) * 100)
print((no / count) * 100)
n_per = (no / count) * 100
if n_per > 70:
    score15 = -1
elif 60 <= n_per <= 70:
    score15 = 0
else:
def get_domain_from_host(validation_dns_record):
    """ Given an FQDN, return the domain portion of a host """
    domain_tld_info = tldextract.extract(validation_dns_record)
    return "%s.%s" % (domain_tld_info.domain, domain_tld_info.suffix)
def get_domain_from_url(url):
    url_object = tldextract.extract(url)
    domain_name = url_object.domain + "." + url_object.suffix
    return domain_name
def processing(data):
    bytesOut = data['bytesOut']
    bytesIn = data['bytesIn']
    pktsOut = data['pktsOut']
    pktsIn = data['pktsIn']
    tlsSubject = data['tlsSubject']
    tlsIssuerDn = data['tlsIssuerDn']
    tlsSni = data['tlsSni']
    tlsVersion = data['tlsVersion']
    outRatio = []  # bytes out / packets out
    inRatio = []   # bytes in / packets in
    orgName = []
    sni = []

    ''' Compute (bytes out / packets out) and (bytes in / packets in) '''
    for i in range(len(bytesIn)):
        outRatio.append(bytesOut[i] / pktsOut[i])
        inRatio.append(bytesIn[i] / pktsIn[i])
        # print('outRatio: {}, inRatio: {}'.format(bytesOut[i] / pktsOut[i], bytesIn[i] / pktsIn[i]))

    ''' Extract every organization (O=) into the orgName list '''
    pattern_O = 'O=.*?([,/]+|$)'
    for tmp in tlsSubject:
        # Read the tlsSubject value, cut out the O= field and store it in orgName;
        # missing values are recorded as the string 'NULL'
        if pd.isna(tmp):
            orgName.append('NULL')
        else:
            res = re.search(pattern_O, tmp)
            if res:
                res = res.group()
                if res.startswith('O='):
                    res = res[2:]
                if res.endswith(','):
                    res = res[:-1]
                if res.endswith('.'):
                    res = res[:-1]
                if res.endswith('./'):
                    res = res[:-2]
                orgName.append(res)
            else:
                orgName.append('null')  # distinguish "all fields missing" from "this field missing"

    ''' Extract the CN from the Subject '''
    pattern_CN = 'CN=.*?(/|$)'
    commonName = []
    for tmp in tlsSubject:
        if pd.isna(tmp):
            commonName.append('NULL')
        else:
            res = re.search(pattern_CN, tmp)
            if res:
                res = res.group()
                if res.startswith('CN='):
                    res = res[3:]
                if res.endswith('/'):
                    res = res[:-1]
                commonName.append(res)
            else:
                commonName.append('null')

    ''' Extract the CN from tlsIssuerDn '''
    pattern_CN = 'CN=.*?(/|$)'
    dn_commonName = []
    for tmp in tlsIssuerDn:
        if pd.isna(tmp):
            dn_commonName.append('NULL')
        else:
            res = re.search(pattern_CN, tmp)
            if res:
                res = res.group()
                if res.startswith('CN='):
                    res = res[3:]
                if res.endswith('/'):
                    res = res[:-1]
                dn_commonName.append(res)
            else:
                dn_commonName.append('null')

    ''' Take the registered domain from tlsSni '''
    for tmp in tlsSni:
        if pd.isna(tmp):
            sni.append('NULL')
        else:
            tld = tldextract.extract(tmp)
            sni.append(tld.domain)

    X = pd.DataFrame({
        'O': orgName,
        'CN': commonName,
        'Dn': dn_commonName,
        'Sni': sni,
        'Version': tlsVersion,
        'OutRatio': outRatio,
        'InRatio': inRatio
    })
    return X
def get_domain(url):
    """Get the domain of a URL using tldextract."""
    return tldextract.extract(url).domain
    '--verbose3',
    help='DEBUG level verbosity. Also displays headers and requests',
    action='store_true')
args = parser.parse_args()

if args.url.startswith(('http', 'https')):
    url = args.url
else:
    print 'ERROR: The URL must start with http or https'
    exit()

print('The URL being tested is ' + colorgrn.format(str(url)))

# extract base url string, will convert a string like google.com to googlecom
#domain = str(str(url.split("//")[1:]))[1:-1]
#burl = domain.replace("'","")
ext = tldextract.extract(url)
domain = '.'.join(ext[1:])

# Set a unique user agent
if args.useragent == ('google'):
    headers = {'User-Agent': 'Googlebot/2.1 (+http://www.google.com/bot.html)'}
elif args.useragent == ('bing'):
    headers = {
        'User-Agent': 'Mozilla/5.0 (compatible; bingbot/2.0 +http://www.bing.com/bingbot.htm)'
    }
elif args.useragent == ('ie6'):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows; U; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 2.0.50727)'
    }
elif args.useragent == ('ie10'):
def start_requests(self): urls = list() non_shopify_list = list() bots_list = list() # Get all urls to scrape with open(os.path.dirname(__file__) + self.url_file, "rt") as f: urls = [url.strip() for url in f.readlines()] # Supported non shopify sites list with open(os.path.dirname(__file__) + self.non_shopify_file, "rt") as f: non_shopify_list = [url.strip() for url in f.readlines()] # Supported bots sites list with open(os.path.dirname(__file__) + self.bots_file, "rt") as f: bots_list = [url.strip() for url in f.readlines()] for url in urls: t = tldextract.extract(url) root = t.domain + '.' + t.suffix proxy_enabled = self.settings.get('PROXY_ENABLED') adidas_proxy_enabled = self.settings.get('ADIDAS_PROXY_ENABLED') # Adidas site (uses scrapy-splash) if "adidas.com" in url: # With proxy if adidas_proxy_enabled: yield SplashRequest(url, self.adidas_parse, headers=self.adidas_headers(), args={ 'images_enabled': 'false', 'proxy': self.random_proxy() }) # Without proxy else: yield SplashRequest(url, self.adidas_parse, headers=self.adidas_headers(), args={'images_enabled': 'false'}) # Non shopify site elif any(root in s for s in non_shopify_list): # With proxy if proxy_enabled: yield scrapy.Request(url, self.non_shoify, meta={'proxy': self.random_proxy()}) # Without proxy else: yield scrapy.Request(url, self.non_shoify) # Bots elif any(root in s for s in bots_list): # With proxy if proxy_enabled: yield scrapy.Request(url, self.bots_parse, meta={'proxy': self.random_proxy()}) # Without proxy else: yield scrapy.Request(url, self.bots_parse) # Shopify sites else: # With proxy if proxy_enabled: yield scrapy.Request(url, self.shopify_parse, meta={'proxy': self.random_proxy()}) # Without proxy else: yield scrapy.Request(url, self.shopify_parse)
def non_shoify(self, response): t = tldextract.extract(response.url) root = t.domain + '.' + t.suffix if "footshop.com" in root: products = Selector(response).xpath( '//div[@class="col-xs-6 col-md-4 col-lg-3"]') for product in products: item = Sneaker() item['name'] = product.xpath('a/@title').extract()[0] item['url'] = product.xpath('a/@href').extract()[0] # item['image'] = product.xpath('a/div/img/@data-src').extract()[0] # item['size'] = '**NOT SUPPORTED YET**' yield item elif "caliroots.com" in root: products = Selector(response).xpath( '//ul[@class="product-list row"]//li[contains(@class,"product")]' ) for product in products: item = Sneaker() item['name'] = product.xpath('.//a/p[2]/text()').extract()[0] item['url'] = "https://caliroots.com" + \ product.xpath('.//a/@href').extract()[0] # item['image'] = product.xpath('.//a/div/img/@src').extract()[0] # item['size'] = '**NOT SUPPORTED YET**' yield item elif "size.co.uk" in root: products = Selector(response).xpath( '//ul[@class="listProducts productList"]//li[contains(@class,"productListItem")]' ) for product in products: item = Sneaker() item['name'] = product.xpath( './/span/span/span/a/text()').extract()[0] item['url'] = "https://www.size.co.uk" + \ product.xpath('.//span/span/span/a/@href').extract()[0] # item['image'] = product.xpath('.//span/a/img/@src').extract()[0] # item['size'] = '**NOT SUPPORTED YET**' yield item elif "jdsports.co.uk" in root: products = Selector(response).xpath( '//ul[@class="listProducts productList"]//li[contains(@class,"productListItem")]' ) for product in products: item = Sneaker() item['name'] = product.xpath( './/span/a/img/@title').extract()[0] item['url'] = "https://www.jdsports.co.uk" + \ product.xpath('.//span/a/@href').extract()[0] # item['image'] = product.xpath('.//span/a/img/@src').extract()[0] # item['size'] = '**NOT SUPPORTED YET**' yield item elif "5pointz.co.uk" in root: products = Selector(response).xpath( '//ol[@class="listing listing--grid"]//li[contains(@class,"listing-item")]//article//figure' ) for product in products: item = Sneaker() item['name'] = product.xpath('a/@title').extract()[0] item['url'] = product.xpath('a/@href').extract()[0] # item['image'] = product.xpath('a/img/@src').extract()[0] # item['size'] = '**NOT SUPPORTED YET**' yield item elif "footasylum.com" in root: products = Selector(response).xpath( '//div[@class="productDataOnPage_inner"]//ul[@class="main-list row"]//li[contains(@class,"left")]' ) for product in products: item = Sneaker() item['name'] = product.xpath( 'div/span[2]/img/@alt').extract()[0] item['url'] = product.xpath('div/span[1]/text()').extract()[0] # item['image'] = "https://www.footasylum.com" + product.xpath('div/span[2]/img/@data-original').extract()[0] # item['size'] = '**NOT SUPPORTED YET**' yield item elif "asphaltgold.de" in root: products = Selector(response).xpath( '//div[@class="product-grid"]//section[contains(@class,"item")]' ) for product in products: item = Sneaker() item['name'] = product.xpath('a/@title').extract()[0] item['url'] = product.xpath('a/@href').extract()[0] # item['image'] = product.xpath('a/img//@src').extract()[0] # item['size'] = '**NOT SUPPORTED YET**' yield item elif "wellgosh.com" in root: products = Selector(response).xpath( '//div[@class="category-products row grid-mode"]//article[contains(@class,"small-6")]' ) for product in products: item = Sneaker() item['name'] = product.xpath('.//figure/a/@title').extract()[0] item['url'] = product.xpath('.//figure/a/@href').extract()[0] # item['image'] = 
product.xpath('.//figure/a/img/@src').extract()[0] # item['size'] = '**NOT SUPPORTED YET**' yield item elif "hypedc.com" in root: products = Selector(response).xpath( '//div[@class="category-products row"]//div[contains(@class,"item")]' ) for product in products: item = Sneaker() item['name'] = product.xpath('.//a/@title').extract()[0] item['url'] = product.xpath('.//a/@href').extract()[0] # item['image'] = product.xpath('.//a/div/img/@data-src').extract()[0] # item['size'] = '**NOT SUPPORTED YET**' yield item elif "bstnstore.com" in root: products = Selector(response).xpath( '//ul[@class="block-grid four-up mobile-two-up productlist"]//li[contains(@class,"item")]//div[@class="itemWrapper pOverlay"]//div[@class="pImageContainer"]//a[@class="plink image"]' ) for product in products: item = Sneaker() item['name'] = product.xpath('div/@data-alt').extract()[0] item['url'] = "https://www.bstnstore.com" + \ product.xpath('@href').extract()[0] # item['image'] = "https://www.bstnstore.com" + product.xpath('div/div[2]/@data-src').extract()[0] # item['size'] = '**NOT SUPPORTED YET**' yield item elif "allikestore.com" in root: products = Selector(response).xpath( '//ul[@class="products-grid"]//li[contains(@class,"item")]//div[@class="item-wrap"]' ) for product in products: item = Sneaker() item['name'] = product.xpath('a/@title').extract()[0] item['url'] = product.xpath('a/@href').extract()[0] # item['image'] = product.xpath('a/img/@src').extract()[0] # item['size'] = '**NOT SUPPORTED YET**' yield item elif "back-door.it" in root: products = Selector(response).xpath( '//ul[@class="products clearfix"]//li') for product in products: item = Sneaker() item['name'] = product.xpath('a[1]/h6/text()').extract()[0] item['url'] = product.xpath('a[1]/@href').extract()[0] # item['image'] = product.xpath('div/a[2]/span/img/@src').extract()[0] # item['size'] = '**NOT SUPPORTED YET**' yield item elif "mrporter.com" in root: products = Selector(response).xpath( '//div[@class="pl-grid__column pl-grid__column--main"]//ul[@class="pl-products"]//li[contains(@class,"pl-products-item")]' ) for product in products: item = Sneaker() item['name'] = product.xpath( 'a/div[2]/div/span[2]/text()').extract()[0].replace( " Sneakers", "") item['url'] = "https://www.mrporter.com" + \ product.xpath('a/@href').extract()[0] # item['image'] = product.xpath('a/div[1]/img/@src').extract()[0] # item['size'] = '**NOT SUPPORTED YET**' yield item elif "titolo.ch" in root: products = Selector(response).xpath( '//ul[@class="small-block-grid-2 medium-block-grid-3 large-block-grid-4 no-bullet"]//li[contains(@class,"item")]//div[@class="list-inner-wrapper"]' ) for product in products: item = Sneaker() item['name'] = product.xpath('a/@title').extract()[0] item['url'] = product.xpath('a/@href').extract()[0] # item['image'] = product.xpath('div[1]/a/img/@src').extract()[0] # item['size'] = '**NOT SUPPORTED YET**' yield item elif "xileclothing.com" in root: products = Selector(response).xpath( '//ul[@class="itemsList"]/li/div[1]') for product in products: item = Sneaker() item['name'] = product.xpath('a/img/@alt').extract()[0] item['url'] = product.xpath('a/@href').extract()[0] # item['image'] = "https://www.xileclothing.com" + product.xpath('a/img/@src').extract()[0] # item['size'] = '**NOT SUPPORTED YET**' yield item
def validate_result(self, pattern_text: str):  # noqa D102
    result = tldextract.extract(pattern_text)
    return result.fqdn != ""
def extract_domain(url):
    ext = tldextract.extract(url)
    domain = ext.domain
    return domain
def get_alexa(num, address=ALEXA_1M, filename='top-1m.csv'):
    """Grabs Alexa 1M"""
    url = urlopen(address)
    zipfile = ZipFile(StringIO(url.read()))
    return [tldextract.extract(x.split(',')[1]).domain for x in
            zipfile.read(filename).split()[:num]]
def get_registered_domain(hostname):
    """Get the root DNS domain of an FQDN."""
    return tldextract.extract(hostname).registered_domain
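# Illustrative calls to get_registered_domain above (hypothetical hostnames):
# multi-label public suffixes are kept intact, and inputs without a recognised
# suffix yield an empty string.
print(get_registered_domain('forums.bbc.co.uk'))  # -> bbc.co.uk
print(get_registered_domain('localhost'))         # -> '' (no recognised suffix)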
def count_words_tld(address):
    """Count occurrences of keywords in domain"""
    count = count_words(tldextract.extract(address).domain)
    return count
def scrapper(url): if "www" in url: url = url.replace("www.", "") print(url) else: pass headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.76 Safari/537.36', "Upgrade-Insecure-Requests": "1", "DNT": "1", "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", "Accept-Language": "en-US,en;q=0.5", "Accept-Encoding": "gzip, deflate" } final_report = [] final_score = 0 from result_dict import result_dict domain = tldextract.extract(url).domain suffix = tldextract.extract(url).suffix subdomain = tldextract.extract(url).subdomain pattern = '<a [^>]*href=[\'|"](.*?)[\'"].*?>' # row 15 HTTPS test result = {'name': 'https_test', 'message': '', 'marks': ''} if "https" in url or "http" in url: print("if worked") try: a = url.split(":") a[0] = "https:" web = "".join(a) except: pass print("This is web ", web) try: print("try of if worked") r = requests.get(web, headers=headers) # req = urllib.request.Request(url, headers=headers) # r = urllib.request.urlopen(req) result[ 'message'] = 'Félicitations. Votre site les données transitants par votre site sont sécurisées avec un certificat SSL' result['marks'] = 4 except: try: a = url.split(":") a[0] = "http:" url3 = "".join(a) except: pass print("try of except worked") r = requests.get(url3, headers=headers, verify=False) url = url3 # req = urllib.request.Request(url, headers=headers) # r = urllib.request.urlopen(req) result['message'] = ''' Votre site ne dispose pas de certificat SSL. Les données qui y transitent peuvent donc être récupérés par des parties malveillantes. Google donne une grande importance à la sécurité des visiteurs. ''' result['marks'] = 0 print("HTTPS didn't worked") else: print("else worked") try: url2 = 'https://' + url r = requests.get(url2, headers=headers) url = url2 # req = urllib.request.Request(url, headers=headers) # r = urllib.request.urlopen(req) result[ 'message'] = 'Félicitations. Votre site les données transitants par votre site sont sécurisées avec un certificat SSL' result['marks'] = 4 except: url1 = 'http://' + url print("from else except ", url1) r = requests.get(url1, headers=headers, verify=False) url = url1 # req = urllib.request.Request(url, headers=headers) # r = urllib.request.urlopen(req) result['message'] = ''' Votre site ne dispose pas de certificat SSL. Les données qui y transitent peuvent donc être récupérés par des parties malveillantes. Google donne une grande importance à la sécurité des visiteurs. ''' result['marks'] = 0 result_dict['https_test'] = result final_score = final_score + result['marks'] soup = BeautifulSoup(r.text, "lxml") # This is for row 1 (title) try: title_content = soup.find('title').text title_ln = len(title_content) if title_ln < 70: result = { 'name': 'title', 'message': 'Félicitations votre site dispose d’un titre avec un nombre de caractères optimale soit moins de 70 caractères', 'title_length': title_ln, 'title_content': title_content, 'marks': 5 } final_score = final_score + 5 result_dict['title'] = result elif title_ln > 70: result = { 'name': 'title', 'message': 'Votre titre est trop long, le nombre de caractères optimal est de 70 caractères, essayez de le raccourcir', 'title_length': title_ln, 'title_content': title_content, 'marks': 2 } final_score = final_score + 2 result_dict['title'] = result except: result = { 'name': 'title', 'message': 'Votre site ne dispose pas de balise meta title. La balise <title> correspond au titre de votre page web. 
Il s’agit d’un champ essentiel à ne pas négliger dans le cadre d’une bonne stratégie d’optimisation du référencement naturel puisqu’elle est l’un des critères les plus importants pour les moteurs de recherche (Google, Bing...)', 'title_length': 0, 'marks': 0 } final_score = final_score + 0 result_dict['title'] = result # This is for row 2 (meta @description) name = 'meta_description' length_var_name = 'meta_desc_len' try: meta_tag = soup.find("meta", {"name": "description"}) desc_content = meta_tag['content'] desc_text_ln = len(desc_content) #desc_text_ln = int(desc_text_ln) if desc_text_ln < 150: result = { 'name': name, 'message': 'Votre méta-description est trop courte, le nombre de caractère optimale doit être entre 150 et 250 caractères.', length_var_name: desc_text_ln, 'desc_content': desc_content, 'marks': 1 } final_score = final_score + result['marks'] result_dict['meta_description'] = result print('try worked1') elif desc_text_ln > 150 and desc_text_ln < 250: result = { 'name': name, 'message': 'Félicitations votre site dispose d’une méta-description avec un nombre de caractère optimal entre 150 et 155 caractères', length_var_name: desc_text_ln, 'desc_content': desc_content, 'marks': 3 } final_score = final_score + result['marks'] result_dict['meta_description'] = result print('try worked2') elif desc_text_ln > 250: result = { 'name': name, 'message': ' Votre méta-description est trop longue, essayez de la raccourcir, le nombre optimal est entre 150 et 250 caractères, le reste risque d’être tronqué sur l’affichage du résultat sur les moteurs de recherche.', length_var_name: desc_text_ln, 'desc_content': desc_content, 'marks': 2 } final_score = final_score + result['marks'] result_dict['meta_description'] = result print('try worked3') except: result1 = { 'name': name, 'message': 'Votre site ne dispose pas de méta-description, La balise meta description manque sur votre page. Vous devez inclure cette balise afin de fournir une brève description de votre page pouvant être utilisée par les moteurs de recherche. Des méta-descriptions bien écrites et attrayantes peuvent également aider les taux de clics sur votre site dans les résultats de moteur de recherche.', length_var_name: 0, 'marks': 0 } final_score = final_score + result1['marks'] result_dict['meta_description'] = result1 print('except worked') # This is for row 3 (meta @keywords) name = 'meta_keywords' length_var_name = 'meta_key_len' try: meta_tag = soup.find("meta", {"name": "keywords"}) meta_key_content_ln = len(meta_tag['content']) #title_ln = int(meta_key_content_ln) if meta_key_content_ln: result = { 'name': name, 'message': 'Bravo vous avez spécifié des meta keywords . 
Vos mots-clés principaux doivent apparaître dans vos méta-tags pour vous aider à identifier le sujet de votre page Web dans les moteurs de recherche.', length_var_name: meta_key_content_ln, 'marks': 1 } final_score = final_score + result['marks'] result_dict['meta_keywords'] = result print('try worked1') except: result1 = { 'name': name, 'message': 'Vos mots-clés principaux doivent apparaître dans vos méta-tags pour vous aider à identifier le sujet de votre page Web dans les moteurs de recherche.', length_var_name: 0, 'marks': 0 } final_score = final_score + result1['marks'] result_dict['meta_keywords'] = result print('except worked') # This is for row 4 (meta @robots) name = 'meta_robots' length_var_name = 'meta_robots_len' try: meta_tag = soup.find("meta", {"name": "robots"}) meta_robots_content = len(meta_tag['content']) # title_ln = int(desc_text_ln) if meta_robots_content: result = { 'name': name, 'message': "Votre site dispose d'un fichier robots.txt", length_var_name: meta_robots_content, 'marks': 4 } final_score = final_score + result['marks'] result_dict['meta_robots'] = result print('try worked1') except: result1 = { 'name': name, 'message': ''' Votre site n’a pas de robot.txt Le robots.txt est un fichier texte utilisant un format précis qui permet à un Webmaster de contrôler quelles zones de son site un robot d'indexation est autorisé à analyser. Ce fichier texte sera disponible à une URL bien précise pour un site donné, par exemple http://www.monsite.com/robots.txt Pour bien comprendre à quoi sert un robots.txt, il faut comprendre la manière dont fonctionnent les robots d'indexation des moteurs de recherche (appelés aussi Web spiders, Web crawlers ou Bots) tels que Google, Yahoo ou Bing. Voici leurs actions lorsqu'ils analysent un site tel que www.monsite.com : ils commencent par télécharger et analyser le fichier http://www.monsite.com/robots.txt. ''', length_var_name: 0, 'marks': 0 } final_score = final_score + result1['marks'] result_dict['meta_robots'] = result1 print('except worked') # This is for row 5 (html lang) name = 'html_lang' length_var_name = 'html_lang' try: meta_tag = soup.find("html", {"lang": True}) lang_text = meta_tag['lang'] result = { 'name': name, 'message': "Félicitations. Vous avez spécifié une langue à votre page.", length_var_name: lang_text, 'marks': 3 } final_score = final_score + result['marks'] result_dict['html_lang'] = result print('try worked1') except: result1 = { 'name': name, 'message': ''' Vous devriez spécifier une langue pour votre site, les moteurs de recherches ne comprennent pas quand un site dispose de plusieurs langues par exemple ayant des mots techniques en anglais et un contenu texte en français. Il faut donc bien spécifier la langue. ''', length_var_name: 0, 'marks': 0 } final_score = final_score + result1['marks'] result_dict['html_lang'] = result1 print('except worked') # This is for row 6 (sitemap) url = url.strip() sitemap_url = url + '/sitemap.xml' print("Sitemap url ", sitemap_url) try: code = requests.get(sitemap_url, headers=headers).status_code name = 'sitemap' if code == 200: result = { 'name': name, 'message': "Félicitations, votre site dispose d’un fichier sitemap", 'marks': 2 } final_score = final_score + result['marks'] result_dict['sitemap'] = result else: result = { 'name': name, 'message': "Votre site Web ne dispose pas d'un fichier sitemap. Les sitemaps peuvent aider les robots à indexer votre contenu de manière plus complète et plus rapide. 
", 'marks': 0 } final_score = final_score + result['marks'] result_dict['sitemap'] = result except: result = { 'name': name, 'message': "Votre site Web ne dispose pas d'un fichier sitemap. Les sitemaps peuvent aider les robots à indexer votre contenu de manière plus complète et plus rapide. ", 'marks': 0 } final_score = final_score + result['marks'] result_dict['sitemap'] = result # This is for row 7 (google Analytics) searched_word = 'google-analytics' name = 'google_analytics' if searched_word in str(soup): print("Google analytics found") result = { 'name': name, 'message': "Félicitations, votre site dispose de l'outil Google Analytics", 'marks': 2 } final_score = final_score + result['marks'] result_dict['google_analytics'] = result else: result = { 'name': name, 'message': "Votre site ne dispose pas de l'outil Google Analytics.", 'marks': 0 } final_score = final_score + result['marks'] result_dict['google_analytics'] = result # This is for row 8 (page_cache) name = 'page_cache' length_var_name = 'page_cache_desc' try: meta_tag = soup.find("meta", {"http-equiv": "Cache-control"}) lang_text = meta_tag['content'] result = { 'name': name, 'message': "Vous avez activé le cache sur votre page, c'est très bien.", length_var_name: lang_text, 'marks': 3 } final_score = final_score + result['marks'] result_dict['page_cache'] = result print('try worked1') except: result1 = { 'name': name, 'message': "Vous n'avez pas activé la mise en cache sur vos pages. La mise en cache permet un chargement plus rapide des pages.", length_var_name: 0, 'marks': 0 } final_score = final_score + result1['marks'] result_dict['page_cache'] = result1 print('except worked') # API_KEY = AIzaSyD_RLUOcTN1JAq8PL8zJ79X6-kmHIDy_uM # This is for row 9 (Google safe browsing api) api_key = 'AIzaSyCVylpWnsOwzUoeTGg7akZRod-4YbhXoPU' sbl = SafeBrowsingList(api_key) bl = sbl.lookup_url(url) name = 'google_safe_browsing' print("google_safe_browsing ", url) if bl is None: print("Website is safe") result = { 'name': name, 'message': "Votre site est considéré comme sécurisé.", 'marks': 2 } final_score = final_score + result['marks'] result_dict['google_safe_browsing'] = result else: result = { 'name': name, 'message': "Votre site n'est pas considéré comme sécurisé. Google et les autres moteurs de recherche prennent en compte le niveau de sécurité de votre site pour garantir la sécurité des visiteurs.", 'marks': 0, 'threats': bl } final_score = final_score + result['marks'] result_dict['google_safe_browsing'] = result # This is for row 10 (responsive website test) name = 'responsive_test' length_var_name = 'responsive_test_desc' try: meta_tag = soup.find("meta", {"name": "viewport"}) lang_text = meta_tag['content'] result = { 'name': name, 'message': "Félicitations. Votre site est responsive.", length_var_name: lang_text, 'marks': 4 } final_score = final_score + result['marks'] result_dict['responsive_test'] = result print('try worked1') except: result1 = { 'name': name, 'message': ''' Nous n'avons pas détécté que votre site internet était responsive, soit adapté au mobile. Google prend énormément en compte ce critère pour un bon référencement. 
''', length_var_name: 0, 'marks': 0 } final_score = final_score + result1['marks'] result_dict['responsive_test'] = result1 print('except worked') # Html page size # mobile_friendliness_test print("mobile friendly ", url) data = { "url": url, "requestScreenshot": True, } r1 = requests.post( 'https://searchconsole.googleapis.com/v1/urlTestingTools/mobileFriendlyTest:run?key=AIzaSyDExRwe7TNEgHa_JLogOVjccqWNVoaH-EQ', data).json() # a = json.loads(r1.text) a = r1 imgstring = a['screenshot']['data'] if imgstring: print("image of mobile returned") else: print("image of mobile NOT returned") # import base64 # imgdata = base64.b64decode(imgstring) # filename = 'some_image.jpg' # I assume you have a way of picking unique filenames # with open(filename, 'wb') as f: # f.write(imgdata) name = 'mobile_friendliness_test' if a['mobileFriendliness'] == 'MOBILE_FRIENDLY': print("Website is mobile friendly") result = { 'name': name, 'message': "Félicitations. Votre site est Mobile friendly.", 'result': a['mobileFriendliness'], 'img_string': 'data:image/png;base64,' + urllib.parse.quote(imgstring), 'marks': 4 } final_score = final_score + result['marks'] result_dict['mobile_friendliness_test'] = result else: result = { 'name': name, 'message': "Votre site n'est pas optimisé pour le mobile. Les moteurs de recherches donnent une très grande importance à la compatibilité mobile.", 'marks': 0, 'result': a['mobileFriendliness'], 'img_string': 'data:image/png;base64,' + urllib.parse.quote(imgstring) } final_score = final_score + result['marks'] result_dict['mobile_friendliness_test'] = result # except: # result = { # 'name':name, # 'message':"Votre site n'est pas optimisé pour le mobile. Les moteurs de recherches donnent une très grande importance à la compatibilité mobile.", # 'marks':0, # 'result': "Not Mobile Friendly" # } # final_score = final_score + result['marks'] # result_dict['mobile_friendliness_test'] = result # # "mobileFriendlyIssues": [ # # { # # "rule": "TAP_TARGETS_TOO_CLOSE" # # }, # # { # # "rule": "USE_LEGIBLE_FONT_SIZES" # # }, # # { # # "rule": "CONFIGURE_VIEWPORT" # # } # # ], # # google page speed # print("Google page speed ",url) # r2 = requests.get('https://www.googleapis.com/pagespeedonline/v5/runPagespeed?url={}?key=AIzaSyAXf3ILJpeIs1nfDvvmLk0MsQDsuIsG5gM'.format(url)) # b = json.loads(r2.text) # name = "page_speed" # # speed_index = b['lighthouse']['audits']['speed-index']['.displayValue'] # #print("this is speed index",speed_index) # # final_report.append({ # # "google_page_speed_data":b # # }) # result_dict['page_speed'] = b # This is for row 13 (img alt attribute) name = 'img_alt' img_tags = soup.findAll("img") no_alt = [] empty_alt = [] alt_ok = [] empty_check = [] name = "img_alt" for img_tag in img_tags: try: if not img_tag['alt'].strip(): empty_alt.append(img_tag['src']) elif img_tag['alt'].strip(): alt_ok.append(img_tag['src']) except: no_alt.append(img_tag) total_alt_num = len(empty_alt) + len(alt_ok) img_alt_result = { 'name': name, 'message': '', 'marks': '', 'no_alt': no_alt, 'empty_altm': empty_alt } if len(img_tags) == len(alt_ok): img_alt_result[ 'message'] = 'Félicitations. Toutes vos images disposent de balises alt attributs' img_alt_result['marks'] = 3 print("every image tag contains alt and all have values") elif empty_alt and len(img_tags) == total_alt_num: img_alt_result[ 'message'] = 'Certaines de vos images manquent de balises alt attributs. 
Voir la liste complète' img_alt_result['marks'] = 1 print("Every img have alt tag but some have empty alt") elif len(img_tags) == len(no_alt): img_alt_result[ 'message'] = "Aucune de vos images n'a de balises alt attributs, elles sont essentielles pour permettre aux moteurs de recherche de comprendre ce que représente votre image." img_alt_result['marks'] = 0 print("No images have alt tag") if no_alt: img_alt_result[ 'message'] = "Aucune de vos images n'a de balises alt attributs, elles sont essentielles pour permettre aux moteurs de recherche de comprendre ce que représente votre image." img_alt_result['marks'] = 0 print("Some images have no alt tag") final_score = final_score + img_alt_result['marks'] result_dict['img_alt'] = img_alt_result # This is for row 14 (favicon test) name = 'favicon_test' length_var_name = 'favicon_link' favicon_list = [] link_tags = soup.findAll("link") for link in link_tags: if "favicon" in link['href']: favicon_list.append(link['href']) if favicon_list: result = { 'name': name, 'message': "Félicitations. Votre site dispose d'une favicon.", length_var_name: favicon_list, 'marks': 1 } final_score = final_score + result['marks'] result_dict['favicon_test'] = result print('if worked1') else: result1 = { 'name': name, 'message': "Votre site ne dispose pas de favicon. La favicon est la petite icone qui apparait en haut du navigateur à côté du titre de votre site. Au delà de l'aspect SEO, elle permet de donner une identité visuelle à votre site.", 'marks': 0 } final_score = final_score + result1['marks'] result_dict['favicon_test'] = result1 print('else worked') # This is for strong tag test name = 'strong_tag' length_var_name = 'strong_text' try: strong_tags = soup.findAll("strong") if strong_tags: result = { 'name': name, 'message': 'Félicitations. Vous avez spécifié des balises strong dans votre texte', length_var_name: strong_tags, 'marks': 2 } else: result = { 'name': name, 'message': " Vous n'avez spécifié aucune balise strong dans votre texte. Les balises strong permettent aux moteurs de recherche de savoir quel contenu est intéressant et pertinent dans votre texte.", 'marks': 0 } final_score = final_score + result['marks'] result_dict['strong_tag'] = result print('try worked1') except: result1 = { 'name': name, 'message': " Vous n'avez spécifié aucune balise strong dans votre texte. Les balises strong permettent aux moteurs de recherche de savoir quel contenu est intéressant et pertinent dans votre texte.", 'marks': 0 } final_score = final_score + result1['marks'] result_dict['strong_tag'] = result1 print('except worked') # This is for Microdata test (itemscope , itemtype) name = 'micro_data_test' try: soup.find(True, {'itemscope': True}) or soup.find( True, {'itemtype': True}) result = { 'name': name, 'message': "Félicitations. Votre site utilise des Microdonnées Schema.org", 'marks': 3 } final_score = final_score + result['marks'] result_dict['micro_data_test'] = result print('try worked1') except: result1 = { 'name': name, 'message': ''' Vos visiteurs aiment les beadcrumbs, mais Google aussi. Les beadcrumbs donnent à Google un autre moyen de comprendre la structure de votre site Web. Toutefois, comme indiqué précédemment, Google peut également utiliser vos beadcrumbs dans les résultats de recherche, ce qui rend votre résultat beaucoup plus attrayant pour les utilisateurs. 
''', 'marks': 0 } final_score = final_score + result1['marks'] result_dict['micro_data_test'] = result1 print('except worked') # This is for AMP Version name = 'amp_html_test' try: tag = soup.find('link', {'rel': "amphtml"}) result = { 'name': name, 'message': " Félicitations. Votre site dispose d'une version AMP", 'amp_html_link': tag['href'], 'marks': 3 } final_score = final_score + result['marks'] result_dict['amp_html_test'] = result print('try worked1') except: result1 = { 'name': name, 'message': '''L’objectif est que les pages AMP s’affichent presque de façon instantannée, c’est-à-dire généralement 90% plus rapidement que d’habitude. Grâce à cette grande vitesse, l’expérience utilisateur sur mobile se trouve largement améliorée, ce qui d’après des études fait chuter le taux de rebo ''', 'marks': 0 } final_score = final_score + result1['marks'] result_dict['amp_html_test'] = result1 print('except worked') # This is for Breadcrumps searched_word = 'breadcrumb' name = 'breadcrumb' if searched_word in str(soup).lower(): print("Breadcrum found") result = { 'name': name, 'message': "Félicitations, nous avons détécté l'utilisation de beadcrumbs sur votre site.", 'marks': 2 } final_score = final_score + result['marks'] result_dict['breadcrumb'] = result else: result = { 'name': name, 'message': "Nous n'avons pas détécté de Beadcrumb sur votre site. Les Beadcrumbs sont une partie importante de presque tous les bons sites Web. Ces petites aides à la navigation ne permettent pas seulement aux internautes de savoir où elles se trouvent sur votre site, elles aident également Google à déterminer la structure de votre site.", 'marks': 0 } final_score = final_score + result['marks'] result_dict['breadcrumb'] = result # Open graph Test name = 'open_graph_test' open_graph_tags = [] og_tags = soup.findAll('meta', {"property": True}) for og in og_tags: if "og" in str(og): open_graph_tags.append(og['property']) result = { 'name': name, 'message': "", 'marks': "", 'og_tags': open_graph_tags } if open_graph_tags: result[ 'message'] = 'Félicitations nous avons détécté des balises Open Graph.' result['marks'] = 1 print("If worked") else: result['message'] = ''' Les balises méta Open Graph sont conçues pour communiquer des informations sur votre site Web aux réseaux sociaux lorsque des liens vers votre site Web sont partagés. Ces balises vous permettent de créer des titres, des descriptions et des images personnalisés à utiliser lorsque vos pages sont partagées sur Facebook, LinkedIn et Google+. Ainsi, tout comme lorsque Google ou un autre moteur de recherche visite votre site et recherche les données (ou balises) appropriées afin d'afficher correctement votre site Web dans les résultats de recherche, les réseaux sociaux agissent de la même manière. La seule différence est que les réseaux sociaux recherchent ces tags spécifiques Open Graph (ou tags Twitter). ''' result['marks'] = 0 print("else worked") result_dict['open_graph_test'] = result final_score = final_score + result['marks'] # Twitter Test name = 'twitter_test' twitter_tags = [] og_tags = soup.findAll('meta', {"property": True}) for twitter in twitter_tags: if "twitter" in str(og_tags): twitter_tags.append(og['property']) result = { 'name': name, 'message': "", 'marks': "", 'og_tags': twitter_tags } if twitter_tags: result['message'] = ' Parfait. 
Vous avez spécifié des Twitter Cards' result['marks'] = 2 print("If worked") else: result[ 'message'] = "Twitter via les twitter Cards vous permet d'identifier l'auteur de la publication / de la page ainsi que l'éditeur, qui est généralement le nom du site Web. Ces deux valeurs ne sont pas obligatoires, mais permettent d’ajouter des données supplémentaires à ceux qui souhaiteraient l’ajouter." result['marks'] = 0 print("else worked") result_dict['twitter_test'] = result final_score = final_score + result['marks'] # This is for Social Media test fb = 'facebook.com' linkedin = 'linkedin.com' twitter = 'twitter.com' name = 'social_media_test' social_sites_found = [] if fb in str(soup): social_sites_found.append('facebook') print("facebook.com found") elif linkedin in str(soup): social_sites_found.append('linkedin') print("linkedin.com found") elif twitter in str(soup): social_sites_found.append('twitter') print("twitter.com found") result = { 'name': name, 'message': "", 'marks': '', 'social_sites_found': social_sites_found } if social_sites_found: result[ 'message'] = 'Nous avons détécté une liaison vers les réseaux sociaux sur votre site.' result['marks'] = 2 else: result[ 'message'] = " Nous n'avons pas détécté de lien vers vos réseaux sociaux sur votre site. Même si ça n'impacte pas grandement votre SEO, avoir des liens vers les réseaux sociaux de sa marque est plus agréable et utile pour les utilisateurs." result['marks'] = 0 final_score = final_score + result['marks'] result_dict['social_media_test'] = result # for H1/h2/h3 h1_tag = soup.findAll('h5') h_tags = [] for i in range(1, 6): h_tag = soup.find_all('h' + str(i)) result = {"tag": 'h' + str(i), "total_num": len(h_tag)} h_tags.append(result) result = { "name": "heading_tags_test", "message": "", "marks": "", "total_num _tags": h_tags } if h_tags[0] and h_tags[1] and h_tags[2]: result['message'] = "Félicitations vos en en-têtes sont structurées" result['marks'] = 3 elif h_tags[0] or h_tags[1] or h_tags[2] or h_tags[3] or h_tags[4]: result[ 'message'] = "FVos en-têtes ne sont pas structurés, il faut d'abord spécifier des en-têtes H1 puis H2 puis H3 etc.." result['marks'] = 1 else: result[ 'message'] = "Vous n'avez pas spécifié d'en têtes, c'est un élément essentiel du SEO, ça permet aux moteurs de recherche de savoir de quoi le chapitre ou la section va discuter." result['marks'] = 0 final_score = final_score + result['marks'] result_dict['heading_tags_test'] = result # This is for page characters name = 'page_characters' try: tags1 = soup.findAll('p') tags2 = soup.findAll('h1') tags3 = soup.findAll('h2') tags4 = soup.findAll('h3') tags5 = soup.findAll('h4') tags6 = soup.findAll('h5') tags = tags1 + tags2 + tags3 + tags4 + tags5 + tags6 text = "" for tag in tags: text = text + tag.text num_words = len(text.split(' ')) result = { 'name': name, 'message': "", 'marks': "", 'num_words': num_words } if num_words > 300: result[ 'message'] = "Félicitations, la quantité de texte est supérieur à 300 mots." result['marks'] = 5 else: result[ 'message'] = "La quantité de texte est insuffisante, il faut que vos pages contiennent plus de 300 mots pour que le contenu soit intéressant pour les moteurs de recherche." result['marks'] = 0 print('try worked1') except: result = { 'name': name, 'message': ''' La quantité de texte est insuffisante, il faut que vos pages contiennent plus de 300 mots pour que le contenu soit intéressant pour les moteurs de recherche. 
''', 'marks': 0 } print('except worked') final_score = final_score + result['marks'] result_dict['page_characters'] = result # page = requests.get(url,headers=headers).text # collecting all urls in website domain = tldextract.extract(url).domain suffix = tldextract.extract(url).suffix subdomain = tldextract.extract(url).subdomain pattern = '<a [^>]*href=[\'|"](.*?)[\'"].*?>' link_levels = [] found_links = re.findall(pattern, r.text) links = [] external_links = [] web = domain + '.' + suffix for link in found_links: if url not in link and "." not in link and "#" not in link: links.append(url + link) elif url not in link and "#" not in link and web not in link: external_links.append(link) links = list(dict.fromkeys(links)) # keywords in URL test && levels in url keywords_in_url = [] directories_in_url = [] for url in links: if 'https' in url: if subdomain: url1 = "https://" + subdomain + '.' + domain + '.' + suffix else: url1 = "https://" + domain + '.' + suffix elif 'http' in url: if subdomain: url1 = "http://" + subdomain + '.' + domain + '.' + suffix else: url1 = "http://" + domain + '.' + suffix a = url t = set(url1.split('/')) p = set(a.split('/')) e = p - t keywords = list(e) if keywords: for item in keywords: keywords_in_url.append(item) directories_in_url.append(len(keywords)) keywords_in_url = list(dict.fromkeys(keywords_in_url)) else: pass result = { "name": "keywords_in_url", "keywords": keywords_in_url, "message": "", "marks": '' } if keywords_in_url: result[ 'message'] = "Vos urls disposent de keywords, Veuillez vérifier qu'elles correspondent bien à ce que vous voulez mettre en avant sur votre site." result['marks'] = 1 else: result['message'] = "Vos urls ne semblent pas avoir de keywords." result['marks'] = 0 result_dict['keywords_in_url'] = result final_score = final_score + result['marks'] if directories_in_url: directories = max(directories_in_url) else: directories = 0 result = { "name": "directories_in_url", "directories": directories, "message": "", "marks": '' } if directories < 5: result[ 'message'] = "Félicitations, votre URL est composée de moins de 5 dossiers", result['marks'] = 2 else: result[ 'message'] = "Vos url sont composées de plus de 5 dossiers, veuillez en diminuer le nombre", result['marks'] = 0 result_dict['directories_in_url'] = result final_score = final_score + result['marks'] # # broken_link test # broken_links = [] # all_links = links + external_links # for link in all_links: # try: # print("Checking link health of ",link) # r1 = requests.get(url,headers = headers) # except: # broken_links.append(link) # result = { # "name":"broken_links_test", # "message":"", # "marks":'', # "broken_links":broken_links # } # if broken_links: # result['message'] = "Nous avons détécté un ou plusieurs liens qui ne fonctionnent plus sur votre site internet. Voir la liste complète" # result['marks'] = 0 # else: # result['message'] = "Félicitations, vous n'avez pas de brokenlinks." # result['marks'] = 3 # final_score = final_score + result['marks'] # result_dict['broken_links_test'] = result # external links test result = { "name": "external_links_test", "message": "", "marks": '', "external_links": external_links } if external_links: result[ 'message'] = "Félicitations, vous avez plusieurs external links. Voir la liste complète" result['marks'] = 9 else: result[ 'message'] = "Nous n'avons pas détécté de external links pour votre site internet. Les liens retour (external internal links) de qualité, sont primordiaux pour une bon référencement." 
result['marks'] = 0 final_score = final_score + result['marks'] result_dict['external_links_test'] = result #word cloud if text: cloud = WordCloud(background_color="white").generate(text) plt.imshow(cloud) plt.axis('off') image = io.BytesIO() plt.savefig(image, format='png') image.seek(0) # rewind the data string = base64.b64encode(image.read()) image_64 = 'data:image/png;base64,' + urllib.parse.quote(string) result = { "name": "word_cloud", "img": image_64, "message": "Nuage des mots les plus présents sur votre page" } result_dict['word_cloud'] = result else: result = { "name": "word_cloud", "img": "", "message": "Aucun contenu texte n'a été détécté" } # Internal links test result = { "name": "internal_links_test", "message": "", "marks": '', "internal_links": links } if links: result[ 'message'] = "Félicitations. Nous avons détécté l'utilisation de liens internes sur votre page." result['marks'] = 4 else: result[ 'message'] = "Nous n'avons pas détécté de liens internes sur votre page. En plus de faire la liaison entre vos différentes pages, les liens internes permettent de mieux guider les robots Google et mettent en évidence le lien entre vos différentes pages." result['marks'] = 0 final_score = final_score + result['marks'] result_dict['internal_links_test'] = result test_count = {"test_passed": "", "test_failed": "", "without_marks": ""} passed = 0 failed = 0 without_marks = 0 for k, v in result_dict.items(): try: if v['marks'] == 0: failed = failed + 1 elif v['marks'] > 0: passed = passed + 1 else: pass except: without_marks = without_marks + 1 test_count['test_passed'] = passed test_count['test_failed'] = failed test_count['without_marks'] = without_marks result_dict['test_count'] = test_count return (final_score, result_dict)
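# Hedged sketch, not part of the original script: two checks above look like they
# cannot trigger as written. The Twitter Card test loops over `twitter_tags`, which
# is still empty at that point, so nothing is ever collected; and the heading test
# checks `h_tags[0] and h_tags[1] and h_tags[2]`, but those entries are dicts and
# therefore always truthy, whatever their "total_num" is. Assuming a BeautifulSoup
# `soup` object, the intent could be expressed like this:

def find_twitter_cards(soup):
    """Collect twitter:* meta tags (Twitter Cards are commonly declared via the name attribute)."""
    return [tag['name'] for tag in soup.find_all('meta', {'name': True})
            if tag['name'].startswith('twitter:')]

def headings_are_structured(soup):
    """True when the page has at least one h1, one h2 and one h3."""
    counts = [len(soup.find_all('h%d' % level)) for level in range(1, 6)]
    return counts[0] > 0 and counts[1] > 0 and counts[2] > 0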
def count_symbols_tld(address):
    """Count occurrences of symbols in domain"""
    count = count_symbols(tldextract.extract(address).domain)
    return count
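# Usage sketch: count_symbols is defined elsewhere in this codebase; a plausible
# stand-in (purely an assumption for illustration) counts non-alphanumeric characters.
def count_symbols(text):
    return sum(1 for ch in text if not ch.isalnum())

# Only the registered-domain label is inspected, so the subdomain and suffix below
# contribute nothing to the count:
print(count_symbols_tld('http://secure-login.pay-pal-example.co.uk/'))  # 2 (hyphens in 'pay-pal-example')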
def _get_tld_extract(url: str) -> tldextract.tldextract.ExtractResult:
    extract_result = tldextract.extract(url)
    return extract_result
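# Usage sketch: the returned ExtractResult is a namedtuple, so the pieces can be
# read by attribute (or unpacked positionally).
parts = _get_tld_extract('https://forums.news.bbc.co.uk/path?q=1')
print(parts.subdomain, parts.domain, parts.suffix)  # forums.news bbc co.uk
print(parts.registered_domain)                      # bbc.co.uk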
def domain_length(address):
    """Extract the domain from a URL and count its characters."""
    domain = tldextract.extract(address)
    return len(domain.domain)
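# Usage sketch: only the registered-domain label is measured, so subdomains and the
# public suffix do not inflate the length.
print(domain_length('https://mail.google.com/inbox'))  # 6 ('google')
print(domain_length('http://example.co.uk'))           # 7 ('example')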
import time
import re
import tldextract
import socket
import smtplib
import dns
import dns.resolver
from selenium import webdriver  # required for webdriver.Chrome() below
from Check_Valid_Email import Main

# ext = tldextract.extract("https://acviss")
# print("0",ext[0])
# print("1",ext[1])
# print(ext[2])

search_term = input("Enter the domain name in this format[domain.com]:-")
ext = tldextract.extract(search_term)
url = f"https://www.linkedin.com/search/results/people/?keywords={ext[1]}&origin=GLOBAL_SEARCH_HEADER"
login_url = "https://www.linkedin.com/uas/login?session_redirect=https%3A%2F%2Fwww%2Elinkedin%2Ecom%2Ffeed%2F&fromSignIn=true&trk=cold_join_sign_in"

# To run LinkedIn in the background (headless), these options are helpful:
# from selenium.webdriver.chrome.options import Options
# options = Options()
# options.headless = True
# driver = webdriver.Chrome(options=options, executable_path=r"/usr/local/bin/chromedriver")
# print("headless")

driver = webdriver.Chrome("/usr/local/bin/chromedriver")
driver.get(login_url)
time.sleep(5)

# Log in using username and password
emailid = input("Enter the email:")
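# Note on the positional indexing above: for an ExtractResult, ext[0] is the
# subdomain, ext[1] the registered-domain label and ext[2] the suffix, so an input
# of "acviss.com" puts "acviss" into the LinkedIn search keyword.
ext_example = tldextract.extract("acviss.com")
assert ext_example[1] == "acviss" and ext_example[2] == "com"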
res.update({"visit_id": visit_id})
visited_site_visits_sites.append(site)

################################################################################
# Now we have an object containing all necessary information, saved in result array
################################################################################

# add Content-Length
for resObject in result:
    content_length = 0
    first_party_content_length = 0
    third_party_content_length = 0
    advertisements_content_length = 0
    if resObject['index'] <= display_index:
        ext = tldextract.extract(resObject["visited_site"])
        visited_tld = ext.domain
        for header, url in cur.execute(
                "SELECT headers, url"
                " FROM http_responses"
                " WHERE visit_id = ?"
                " AND crawl_id = ?",
                [resObject["visit_id"], resObject["crawl_id"]]):
            if "Content-Length" in header:
                # NOTE: str.index() gives the position of the substring inside the
                # header string, not the Content-Length value itself; parse the
                # numeric value out of the header if real byte counts are needed.
                current_length = header.index("Content-Length")
                content_length = content_length + current_length
            if "http" in url:
                if rules.should_block(url) is True:
                    advertisements_content_length = advertisements_content_length + current_length
                xt = tldextract.extract(url)
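# Hedged sketch: if the `headers` column holds a JSON list of (name, value) pairs
# (an assumption about this crawl database's layout, not confirmed here), the actual
# byte count could be pulled out like this instead of using str.index():
import json

def content_length_from_headers(header_json):
    try:
        for name, value in json.loads(header_json):
            if name.lower() == 'content-length':
                return int(value)
    except (ValueError, TypeError):
        pass
    return 0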
def dns_extractor(target):
    dnsr = dns.resolver
    # tldextract reduces both full URLs ('https://example.com/...') and bare
    # hostnames to their registered domain, so no URL-scheme check is needed
    # before extracting.
    ext = tldextract.extract(target)
    domain = ext.domain
    suffix = ext.suffix
    target = domain + '.' + suffix

    try:
        print()
        ns = dnsr.query(target, 'NS')
        for rs in ns:
            print(bold(green('NS records: ')) + str(rs))
    except dns.exception.DNSException:
        print(bad('Query failed > NS records.'))

    try:
        print()
        a = dnsr.query(target, 'A')
        for rs in a:
            print(bold(green('A records: ')) + str(rs))
    except dns.exception.DNSException:
        print(bad('Query failed > A records.'))

    try:
        print()
        mx = dnsr.query(target, 'MX')
        for rs in mx:
            print(bold(green('MX records: ')) + str(rs))
    except dns.exception.DNSException:
        print(bad('Query failed > MX records.'))

    try:
        print()
        txt = dnsr.query(target, 'TXT')
        for spf in txt:
            print(bold(green('SPF records: ')) + str(spf))
    except dns.exception.DNSException:
        print(bad('Query failed > SPF records.'))
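# Usage sketch: full URLs and bare hostnames both work, since the extraction keeps
# only the registered domain before querying. Note that dnspython 2.x renamed
# resolver.query() to resolver.resolve(); the calls above assume the 1.x-era API.
dns_extractor('https://www.example.co.uk/some/page')  # queries example.co.uk
dns_extractor('example.co.uk')                        # same lookups for a bare host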
def get_parent(self, response):
    site = tldextract.extract(response.url).domain
    if site:
        site = match_site(site)
    return site
def generate_targets(self, env, targets, collections_by_id): # Setup specific template: template = env.get_template('site-target-template.md') # Export targets for target in targets: # Skip blocked items: if target['field_crawl_frequency'] == 'NEVERCRAWL': logger.warning("The Target '%s' is blocked (NEVERCRAWL)." % target['title']) self.blocked_record_count += 1 # FIXME SHOULD DELETE THE FILE IF IT EXISTS! continue # Skip items that have no crawl permission? # hasOpenAccessLicense == False, and inScopeForLegalDeposit == False ? # Skip items with no URLs: if len(target['fieldUrls']) == 0: logger.warning("The Target '%s' has no URLs!" % target['title']) # FIXME SHOULD DELETE THE FILE IF IT EXISTS! continue # Skip hidden targets: if target['field_hidden']: logger.warning("The Target '%s' is hidden!" % target['title']) # FIXME SHOULD DELETE THE FILE IF IT EXISTS! continue # Get the ID, WCT ID preferred: tid = target['id'] if target.get('field_wct_id', None): tid = target['field_wct_id'] # Get the url, use the first: url = target['fieldUrls'][0]['url'] # Extract the domain: parsed_url = tldextract.extract(url) publisher = parsed_url.registered_domain # Lookup in CDX: #wayback_date_str = CdxIndex().get_first_capture_date(url) # Get date in '20130401120000' form. #if wayback_date_str is None: # logger.warning("The URL '%s' is not yet available, inScopeForLegalDeposit = %s" % (url, target['inScopeForLegalDeposit'])) # self.missing_record_count += 1 # continue start_date = self.get_target_start_date(target) wayback_date = datetime.datetime.strptime(start_date, '%Y-%m-%dT%H:%M:%SZ') wayback_date_str = wayback_date.strftime('%Y%m%dT%H%M%S') first_date = wayback_date.isoformat() record_id = "%s/%s" % ( wayback_date_str, base64.b64encode(hashlib.md5(url.encode('utf-8')).digest())) # Honour embargo #ago = datetime.datetime.now() - wayback_date #if ago.days <= 7: # self.embargoed_record_count += 1 # continue # Strip out Windows newlines if 'description' in target and target['description'] != None: target['description'] = target['description'].replace( '\r\n', '\n') # Otherwise, build the record: rec = { 'slug': tid, 'id': target['id'], 'wct_id': target.get('field_wct_id', None), 'record_id': record_id, 'date': first_date, 'target_url': url, 'title': target['title'], 'publisher': publisher, 'start_date': target['crawlStartDateISO'], 'end_date': target['crawlEndDateISO'], 'open_access': target['hasOpenAccessLicense'], 'npld': target['inScopeForLegalDeposit'], 'scope': target['field_scope'], 'nominating_organisation': target.get('nominating_organisation', {}).get('title', None), 'collections': [], 'subjects': [] } # Add any collection: for col_id in target['collectionIds']: col = collections_by_id.get(int(col_id), {}) if 'name' in col: rec['collections'].append({ 'id': col['id'], 'name': col['name'] }) # For subjects for sub_id in target['subjectIds']: pass #col = subjects.get(int(target['collectionIds'][0]), {}) #if 'name' in col: # rec['collections'].append({ # 'id': col['id'], # 'name': col['name'] # }) # And the organisation: if 'nominating_organisation' in target and target[ 'nominating_organisation'] != None: rec['organisation'] = { 'id': target['nominating_organisation']['id'], 'name': target['nominating_organisation']['title'], 'abbreviation': target['nominating_organisation']['field_abbreviation'] } # And write: file_path = self.get_target_file_path(target) target['file_path'] = file_path target_md = luigi.LocalTarget( "/Users/andy/Documents/workspace/ukwa-site/content/target/%s/index.en.md" % 
file_path) with target_md.open('w') as f: for part in template.generate({ "record": rec, "json": json.dumps(rec, indent=2), "description": target['description'] }): f.write(part.encode("utf-8"))
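# Hedged sketch: under Python 3, base64.b64encode() returns bytes, so the record_id
# formatted above would render as "20130401T120000/b'...'". If this task runs on
# Python 3, decoding keeps the identifier clean; this is a suggestion, not the
# original pipeline's code.
import base64
import hashlib

def make_record_id(wayback_date_str, url):
    digest = hashlib.md5(url.encode('utf-8')).digest()
    return "%s/%s" % (wayback_date_str, base64.b64encode(digest).decode('ascii'))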
def domain_is_in_blacklist(url):
    domain = tldextract.extract(url).domain
    return domain in domain_blacklist
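# Usage sketch (domain_blacklist is assumed to be a module-level set defined
# elsewhere). Matching on the registered-domain label means every subdomain of a
# blacklisted domain is caught too.
domain_blacklist = {'badsite'}                              # illustrative only
print(domain_is_in_blacklist('http://cdn.badsite.ru/x'))    # True
print(domain_is_in_blacklist('http://badsite-mirror.com'))  # False (different label)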
def extract_domain(value):
    import tldextract
    uri = tldextract.extract(value).registered_domain
    return uri
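# Usage sketch: registered_domain joins domain and suffix, and comes back as an
# empty string when there is no recognised suffix (bare hosts, IP addresses).
print(extract_domain('https://blog.example.co.uk/post/1'))  # example.co.uk
print(extract_domain('http://localhost:8000/'))             # '' (no suffix)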
def api_id(): if 'url' in request.args: url = str(request.args['url']) bb = get_bias(url) bias = bb[0] bias_desc = bb[1] print(bias_desc) if (bias_desc == 'Left'): n = 1 elif (bias_desc == 'Moderate Left'): n = 2 elif (bias_desc == 'Neutral'): n = 3 elif (bias_desc == 'Moderate Right'): n = 4 else: n = 5 """ f = open('C:/Users/sahil/Desktop/Files/Coding/PalyHacks-DevilsAdvocate/DevilsAdvocate/svg-doughnut-chart-with-animation-and-tooltip/dist/vals.js', 'r') # pass an appropriate path of the required file lines = f.readlines() lines[n-1] = str(int(lines[n].split(',')[0]) + 1) + ",\n" # n is the line number you want to edit; subtract 1 as indexing of list starts from 0 f.close() # close the file and reopen in write mode to enable writing to file; you can also open in append mode and use "seek", but you will have some unwanted old data if the new data is shorter in length. f = open('C:/Users/sahil/Desktop/Files/Coding/PalyHacks-DevilsAdvocate/DevilsAdvocate/svg-doughnut-chart-with-animation-and-tooltip/dist/vals.js', 'w') f.writelines(lines) # do the remaining operations on the file f.close() """ with open('vals.json', 'r') as f: vals = json.load(f)['vals'] vals[n - 1] = vals[n - 1] + 1 with open('vals.json', 'w') as f: json.dump({"vals": vals}, f) try: all_articles = similar_articles.get(url)["articles"] except: return jsonify({ 'bias': bias_desc, 'similar_articles': { "articles": [] } }) article_biases = [] articles_biases_text = [] for article in all_articles: print("article", article) b = get_bias_dict(article["url"]) article_biases.append(b[0]) articles_biases_text.append(b[1]) article_biases = np.array(article_biases) if ("Right" in bias_desc): k = min(3, len(article_biases) - 1) bs = np.argpartition(article_biases, k)[:k] elif ("Left" in bias_desc): k = min(3, len(article_biases) - 1) bs = np.argpartition(-1 * article_biases, k)[:k] if ("Neutral" in bias_desc): articles = all_articles else: articles = [] for b in bs: print( tldextract.extract( all_articles[b]["url"]).registered_domain, tldextract.extract(url).registered_domain) if not (tldextract.extract( all_articles[b]["url"]).registered_domain == tldextract.extract(url).registered_domain): articles.append(all_articles[b]) return jsonify({ 'bias': bias_desc, 'similar_articles': { "articles": articles } }) else: return "Error: No text field provided."
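# Hedged sketch (an addition, not part of the original endpoint): the duplicate-source
# filter above compares registered domains by calling tldextract twice per candidate;
# a small helper keeps that comparison in one place.
def same_registered_domain(url_a, url_b):
    return (tldextract.extract(url_a).registered_domain
            == tldextract.extract(url_b).registered_domain)

# e.g. keep only suggestions hosted somewhere other than the queried outlet:
# articles = [a for a in all_articles if not same_registered_domain(a["url"], url)]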
args.bro_log = os.path.expanduser(args.bro_log)

# Create a VirusTotal Query Class
vtq = vt_query.VTQuery()

# See our 'Risky Domains' Notebook for the analysis and
# statistical methods used to compute this risky set of TLDs
risky_tlds = set([
    'info', 'tk', 'xyz', 'online', 'club', 'ru', 'website', 'in', 'ws',
    'top', 'site', 'work', 'biz', 'name', 'tech', 'loan', 'win', 'pro'
])

# Run the bro reader on the dns.log file looking for risky TLDs
reader = bro_log_reader.BroLogReader(args.bro_log, tail=True)
for row in reader.readrows():

    # Pull out the TLD
    query = row['query']
    tld = tldextract.extract(query).suffix

    # Check if the TLD is in the risky group
    if tld in risky_tlds:
        # Make the query with the full query
        results = vtq.query_url(query)
        if results.get('positives', 0) > 1:  # At least two hits
            print('\nRisky Domain DNS Query Found')
            print('From: {:s} To: {:s} QType: {:s} RCode: {:s}'.format(
                row['id.orig_h'], row['id.resp_h'], row['qtype_name'], row['rcode_name']))
            pprint(results)
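# Usage sketch: tldextract returns the full public suffix, so multi-label TLDs come
# back joined ('co.uk', 'com.br'). The risky_tlds set above holds single-label
# suffixes only, so a query under such a multi-label TLD would not match it.
print(tldextract.extract('malicious.example.top').suffix)  # 'top'   -> in risky_tlds
print(tldextract.extract('shop.example.co.uk').suffix)     # 'co.uk' -> not in the set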
def has_suffix(url):
    """Return whether the url has a suffix using tldextract."""
    return bool(tldextract.extract(url).suffix)
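# Usage sketch: anything without a recognised public suffix (bare hostnames,
# IP addresses) comes back False.
print(has_suffix('https://example.org/page'))  # True
print(has_suffix('http://localhost:8080/'))    # False
print(has_suffix('http://192.168.0.1/'))       # False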
def get_next_page_url(self, base, page):
    parsed_uri = tldextract.extract(base)
    scene_path = match_page_scenepath(parsed_uri.domain)
    return self.format_url(base, scene_path % page)
def extract(serp_url,
            parser=None,
            lower_case=True,
            trimmed=True,
            collapse_whitespace=True,
            use_naive_method=False):
    """Parse a SERP URL and return information regarding the engine name,
    keyword and :class:`SearchEngineParser`.

    :param serp_url:            Suspected SERP URL to extract a keyword from.
    :type serp_url:             ``str`` or :class:`urlparse.ParseResult`

    :param parser:              Optionally pass in a parser if already
                                determined via call to get_parser.
    :type parser:               :class:`SearchEngineParser`

    :param lower_case:          Lower case the keyword.
    :type lower_case:           ``True`` or ``False``

    :param trimmed:             Trim keyword leading and trailing whitespace.
    :type trimmed:              ``True`` or ``False``

    :param collapse_whitespace: Collapse 2 or more ``\s`` characters into one
                                space ``' '``.
    :type collapse_whitespace:  ``True`` or ``False``

    :param use_naive_method:    In the event that a parser doesn't exist for the
                                given ``serp_url``, attempt to find an instance
                                of ``_naive_re_pattern`` in the netloc of the
                                ``serp_url``. If found, try to extract a keyword
                                using ``_naive_params``.
    :type use_naive_method:     ``True`` or ``False``

    :returns: an :class:`ExtractResult` instance if ``serp_url`` is valid,
              ``None`` otherwise
    """
    # Software should only work with Unicode strings internally, converting
    # to a particular encoding on output.
    url_parts = _unicode_urlparse(serp_url)
    if url_parts is None:
        return None

    result = None
    if parser is None:
        parser = get_parser(url_parts)
    if parser is None:
        if not use_naive_method:
            return None  # Tried to get keyword from non SERP URL

        # Try to use naive method of detection
        if _naive_re.search(url_parts.netloc):
            query = _unicode_parse_qs(url_parts.query, keep_blank_values=True)
            for param in _naive_params:
                if param in query:
                    import tldextract
                    tld_res = tldextract.extract(url_parts.netloc)
                    return ExtractResult(tld_res.domain, query[param][0], None)

        return None  # Naive method could not detect a keyword either

    result = parser.parse(url_parts)

    if result is None:
        return None

    if lower_case:
        result.keyword = result.keyword.lower()
    if trimmed:
        result.keyword = result.keyword.strip()
    if collapse_whitespace:
        # Pass re.UNICODE as a keyword flag; given positionally it would be
        # interpreted as the 'count' argument of re.sub().
        result.keyword = re.sub(r'\s+', ' ', result.keyword, flags=re.UNICODE)

    return result
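# Hedged illustration of the naive fallback above, in isolation: when no parser is
# known, the engine name is taken from the netloc's registered-domain label and the
# keyword from the first matching query parameter. The parameter name 'q' here is
# only an assumption for the example; the real candidates live in _naive_params.
from urllib.parse import urlparse, parse_qs
import tldextract

def naive_keyword(serp_url, params=('q',)):
    parts = urlparse(serp_url)
    query = parse_qs(parts.query, keep_blank_values=True)
    for param in params:
        if param in query:
            return tldextract.extract(parts.netloc).domain, query[param][0]
    return None

print(naive_keyword('https://search.example.com/search?q=blue+widgets'))
# ('example', 'blue widgets')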