Example #1
def compare_host(host1, host2):
    """ True if the domain.suffix part of both hosts is the same TAB05 """

    (_, domain1, suffix1) = tldextract.extract(host1)
    (_, domain2, suffix2) = tldextract.extract(host2)

    return domain1 == domain2 and suffix1 == suffix2
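A minimal sketch of the pieces the comparison relies on (arbitrary example hosts; tldextract assumed installed):

import tldextract

# arbitrary example hosts: extract() splits a host into (subdomain, domain, suffix)
print(tldextract.extract("forums.news.bbc.co.uk"))  # subdomain='forums.news', domain='bbc', suffix='co.uk'
print(tldextract.extract("www.bbc.co.uk"))          # subdomain='www', domain='bbc', suffix='co.uk'
# compare_host("forums.news.bbc.co.uk", "www.bbc.co.uk") therefore returns True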
Example #2
    def _html_to_dict(self, url):
        #r = requests.get(url).text
        r = Crawlera().get(url).text
        print url
        try:
            company_name = BeautifulSoup(r).find('h1',{'itemprop':'name'})
            company_name = company_name.find('strong').text
        except:
            return {"handle": url}
        address = BeautifulSoup(r).find('h1',{'itemprop':'name'}).find('span').text
        city = BeautifulSoup(r).find('span',{'itemprop':'addressLocality'}).text
        state = BeautifulSoup(r).find('span',{'itemprop':'addressRegion'}).text
        postal_code = BeautifulSoup(r).find('span',{'itemprop':'postalCode'}).text
        description = BeautifulSoup(r).find('article',{'itemprop':'description'}).text.strip().replace('\nMore...','')
        logo = BeautifulSoup(r).find('figure').find('img')['src']
        website = BeautifulSoup(r).find('li',{'class':'website'}).find('a')['href'].split('gourl?')[-1]
        domain = "{}.{}".format(tldextract.extract(website).domain, tldextract.extract(website).tld)
        ''' Phone '''
        main = BeautifulSoup(r).find('li',{'class':'phone'}).find('strong',{'class':'primary'}).text
        numbers = BeautifulSoup(r).find('li',{'class':'phone'}).findAll('li')
        nums = [number.find('span').text for number in numbers]
        names = [number.text.split(number.find('span').text)[0] for number in numbers]
        numbers = dict(zip(names, nums))
        numbers['main'] = main

        _vars = [company_name, address, city, state, postal_code, description, logo, website, domain]
        labels = ["name","address","city","state","postal_code", "description", "logo", "website", "domain"]
        company = dict(zip(labels, _vars))
        company["numbers"] = numbers
        company["handle"] = url
        return company
Example #3
    def handle(self):
	SO_ORIGINAL_DST = 80
	# self.request is the client connection/socket
	dst = self.request.getsockopt(socket.SOL_IP, SO_ORIGINAL_DST, 16) # Get the original destination IP before iptables redirect
	_, dst_port, ip1, ip2, ip3, ip4 = struct.unpack("!HHBBBB8x", dst)
	dst_ip = '%s.%s.%s.%s' % (ip1,ip2,ip3,ip4)
	peername = '%s:%s' % (self.request.getpeername()[0], self.request.getpeername()[1])
	print success('Client %s -> %s:443' % (peername, dst_ip))
	RemoteHostnames[dst_ip] = getCertHostnamesCached(dst_ip)
	#RemoteHostnames[dst_ip] = ['*.*.*.*','*.*.*','*.*','*'] # example fixed wildcard cert
	CN = RemoteHostnames[dst_ip][0] # SSL_Certificate_CN2 module will return CN as first list element
	if add_extra_hostnames:
		import tldextract
		domain = tldextract.extract(CN).domain
		tld = tldextract.extract(CN).tld
		bonus_hostnames = [] # kludge to work around lack of good support for SNI (server name indication) in python
		bonus_hostnames.append('www.%s.%s' % (domain,tld))
		bonus_hostnames.append('*.%s.%s' % (domain,tld))
		bonus_hostnames.append('%s.%s' % (domain,tld)) # without this, requests to (e.g.) https://google.com fail as the CN is 
		for extra_name in bonus_hostnames:             # www.google.com and there is no subjectAltName 'google.com' in the cert.
			if extra_name not in RemoteHostnames[dst_ip]:
			# however, adding extra hostnames as subjectAltNames makes other certs fail to validate, so disabled by default
				RemoteHostnames[dst_ip].append(extra_name)
	PhoneConnected = False
	CreateSignedX509Certificate(ip=dst_ip, hostnames=RemoteHostnames[dst_ip], peername=peername)
	try:
		(certfile, keyfile) = GeneratedCert[dst_ip]
		#print 'Setting up SSL socket using %s' % certfile
		stream_phone = ssl.wrap_socket(self.request, server_side=True, certfile=certfile,
					       keyfile=keyfile, ssl_version=ssl.PROTOCOL_TLSv1)
		PhoneConnected = True
	except (ssl.SSLError), e:
		print error('SSLError on connection to phone (%s)' % e)
		self.finish()
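The 16 bytes returned by getsockopt are a sockaddr_in; a self-contained sketch of the same unpacking, using synthetic data (the IP and port are arbitrary):

import socket
import struct

# build a synthetic 16-byte sockaddr_in: 2-byte family, 2-byte port, 4-byte IPv4 address, 8 bytes padding
raw = struct.pack("!HH4s8x", socket.AF_INET, 443, socket.inet_aton("93.184.216.34"))
_, dst_port, ip1, ip2, ip3, ip4 = struct.unpack("!HHBBBB8x", raw)
print("%d.%d.%d.%d:%d" % (ip1, ip2, ip3, ip4, dst_port))  # 93.184.216.34:443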
Example #4
    def start(self):
        for ext in file_extensions:
            if ext in url_file(self.url):
                db.collections.update_one({
                    'structure': '#URLEntry',
                    'url': self.url
                    }, {'$set': { 'last_scraped': time.strftime("%Y-%m-%d %H:%M:%S")}})
                print('Skipping: {}'.format(self.url))
                return None

        try:
            with self.sess as sess:
                html_doc = sess.get(self.url, timeout=3).text
        except (InvalidSchema, ConnectionError, Timeout, TooManyRedirects):
            db.collections.remove(
                        {
                            'structure': '#URLEntry',
                            'url': self.url
                        }
                    )
            return None

        soup = BeautifulSoup(html_doc, 'html.parser')
        urls = self.get_urls(soup)

        for url in urls:
            existing = db.collections.find_one({
                'structure': '#URLEntry',
                'url': url
                })

            if existing is None:

                try:
                    tld = tldextract.extract(url).suffix
                except:
                    tld = '*'

                entry = URLEntry(domain=self.get_domain(url), url=url, tld=tld)
                db.collections.insert_one(entry.export())

        this_existing = db.collections.find_one({
                'structure': '#URLEntry',
                'domain': self.get_domain(self.url),
                'url': self.url
                })
        if this_existing is not None:
            db.collections.update_one({
                    'structure': '#URLEntry',
                    'domain': self.get_domain(self.url),
                    'url': self.url
                    }, {'$set': { 'last_scraped': time.strftime("%Y-%m-%d %H:%M:%S")}})
        else:
            try:
                tld = tldextract.extract(self.url).suffix
            except:
                tld = '*'

            entry = URLEntry(domain=self.get_domain(self.url), url=self.url, tld=tld)
            db.collections.insert_one(entry.export())
Example #5
def test_tldextract():
    '''
    verify that tldextract parses just the netloc
    This is neither documented nor tested by tldextract (!)
    '''
    assert tldextract.extract('example.com').registered_domain == 'example.com'
    assert tldextract.extract('www.example.com').registered_domain == 'example.com'
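Two further assertions in the same spirit, as a standalone sketch assuming a reasonably recent tldextract:

import tldextract

# scheme, port, path and query string are all ignored; only the netloc is parsed
assert tldextract.extract('https://www.example.com:8080/path?q=1').registered_domain == 'example.com'
assert tldextract.extract('www.example.com').fqdn == 'www.example.com'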
def crawlList(list):

    main_dict = parsedDictionary.parsedDictionary()

    #iterate through domains
    for i in range(0, len(list)):
        print "Scripts present at " + list[i]
        scripts = getScripts(list[i])
        printList(scripts)

        #iterate through this domain's scripts
        #this codes checks if the script is linked externally or is hosted on the same domain (given by a relative URL)
        dict = parsedDictionary.parsedDictionary()
        for y in range(0, len(scripts)):
            full = ''
            if( (scripts[y].startswith("//")) or (scripts[y].startswith("http"))):
                full = tldextract.extract(scripts[y])
                if(len(full.domain) <= 1):
                    full = tldextract.extract(list[i])
            else:
                full = tldextract.extract(list[i])

            link = full.domain + '.' + full.suffix
            if(not dict.exists(link)):
                dict.addElement(link)
        main_dict.add(dict)
        print main_dict.Dict
        print "}}}}}"
        print dict.Dict
        print "\n -------------------------------"
    sortedlist = main_dict.sortByValue()
    print " \n Top scripts: "
    printList(sortedlist)
Example #7
 def _cache_html_to_df(self, html):
     company = BeautifulSoup(html)
     title = company.find('div',{'class':'companyTitle'})
     description = company.find('div',{'class':'companyDescription'})
     revenue = company.find('div',{'class':'companyRevenue'})
     address = company.find('div',{'class':'companyAddress'})
     employee_count = company.find('p',{'class':'companyEmployeeCountText'})
     website = company.find('div',{'class':'website'})
     phone = company.find('span',{'class':'hq'})
     industries = company.find('p', {'class':'industry'})
     industries = industries.find_all('span') if industries else []
     industries = [industry.text for industry in industries]
     
     data = [title, description, revenue, address, employee_count,
             website, phone]
     columns = ["name", "description", "revenue", "address",
                "headcount","website","phone"]
     # add industries
     data = [val.text.strip() if val else "" for val in data]
     data = dict(zip(columns, data))
     data["industry"] = industries
     print data
     data["domain"] = "{}.{}".format(tldextract.extract(data["website"]).domain,
                                     tldextract.extract(data["website"]).tld)
     try:
       data['logo'] = company.find('img',{'class':'companyLogo'})['src']
     except:
       data['logo'] = ""
     data["source"] = "zoominfo"
     data['headcount'] = data['headcount'].split('Employees')[0]
     data['description'] = data['description'].split('Company Description')[-1]
     data['revenue'] = data['revenue'].split('in Revenue')[0]
     # add fullcontact address support
     print data
     return data
Example #8
	def process_item(self, item, spider):
		domain_name=tldextract.extract(item['url']).domain
		db = self.connection[domain_name]  # use the domain name as the database name
		self.collection = db[settings['MONGODB_COLLECTION']]
		valid = True
		for data in item:
			if not data:
				valid = False
				raise DropItem("Missing {0}!".format(data))
			if valid:
				if domain_name in spider.crawledPagesPerSite and spider.crawledPagesPerSite[domain_name]>spider.maximumPagesPerSite:
					return None
					
				self.collection.insert(dict(item))
				if domain_name in spider.crawledPagesPerSite:
					spider.crawledPagesPerSite[domain_name]+=1
				else:
					spider.crawledPagesPerSite[domain_name]=1
				print "crawledPagesPerSite", spider.crawledPagesPerSite[domain_name]
				print "spider.allowed_domains", spider.allowed_domains
				print "spider.maximumPagesPerSite", spider.maximumPagesPerSite
				print "domain_name", domain_name, item['url']
				if spider.crawledPagesPerSite[domain_name]>spider.maximumPagesPerSite:
					suffix=tldextract.extract(item['url']).suffix
					domain_and_suffix=domain_name+"."+suffix
					print domain_and_suffix
					if domain_and_suffix in spider.allowed_domains:
						spider.allowed_domains.remove(domain_and_suffix)
						spider.dynamic_deny_domain.append(domain_name)
						#spider.rules[0].link_extractor.allow_domains.remove(domain_and_suffix)
						spider.rules[0].link_extractor.deny_domains.add(domain_and_suffix)
					print "spider.allowed_domains", spider.allowed_domains
					return None
				log.msg("Item added to MongoDB database!",level=log.DEBUG, spider=spider)
				return item
Example #9
def same_domain(url1, url2):
  url1_extract = tldextract.extract(url1)
  url2_extract = tldextract.extract(url2)
  if url1_extract.domain == url2_extract.domain:
    return True
  else:
    return False
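Note that same_domain compares only the domain attribute and ignores the suffix, so hosts on different TLDs can compare as equal; a quick illustration with example hosts:

import tldextract

# both extract to domain == 'example', so same_domain() above would return True for them
print(tldextract.extract('http://example.com').domain)    # 'example'
print(tldextract.extract('http://example.co.uk').domain)  # 'example'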
def loadLists(writer=sys.stdout):
  if isStale(suspect_file):
    print >> writer, "Updating ISC Suspicious Domains..."
    new_file = requests.get(isc_url)
    with open(suspect_file, 'w') as sf_buffer:
      sf_buffer.write(new_file.content)

  if safebrowsing_bootstrap:
      print("Initial download of SafeBrowsing DB... this will take a few minutes.")
      updateSafebrowsing()
  elif isStale(safebrowsing_db, maxTime=259200):
    print >> writer, "Updating Google Safebrowsing DB..."
    updateSafebrowsing()

  if isStale(topthousand_file, maxTime=2629743):
    print >> writer, "Updating Alexa Top 1000..."
    new_file = requests.get(topmillion_url)
    with zipfile.ZipFile(StringIO(new_file.content), 'r') as zipData:
      with zipData.open('top-1m.csv', 'r') as oneMil:
        with open(topthousand_file, 'w') as topThousand:
          for i in range(0,1000):
            topThousand.write(oneMil.readline())

  for sf_read in open(suspect_file):
    badDomain = tldextract.extract(sf_read)
    ISC_LIST.append(badDomain)

  for topthousand_read in open(topthousand_file):
    cleaned_line = topthousand_read.split(",")[1].strip()
    valuableDomain = tldextract.extract(cleaned_line)
    ALEXA_LIST.append(valuableDomain)
Example #11
def insert(data):
    if data.strip():
        con = MySQLdb.connect(host="localhost", # your host, usually localhost
                             user="******", # your username
                              passwd="1234", # your password
                              db="rabbitmq") # name of the data base

        cur = con.cursor()
        query="insert into rabbitmq (url,domain,ttl,class,type,ip,worker)values(%s,%s,%s,%s,%s,%s,%s)"
        tld=""
        try:
            tld=tldextract.extract(data).registered_domain
        except:
            traceback.format_exc()
        try:
            digs= os.popen("dig +tries=1 +timeout=1 +noall +answer "+tldextract.extract(tld).registered_domain).read()
            digs=str(digs).split('\n')
            for dig in digs:
                if(dig.strip()):
                    try:
                        dig=dig.replace("\t\t","\t")
                        dig=dig.replace("\t\t","\t")
                        temp=dig.split('\t')
                        print "Data: "+temp[0] +"\t Data: "+ temp[1]+"\t Data: "+ temp[2]+"\t Data: "+ temp[3]+"\t Data: "+ temp[4]
                        params=(data.strip(),tld.strip(),temp[1].strip(),temp[2].strip(),temp[3].strip(),temp[4].strip(),worker)
                        cur.execute(query,params)
                    except:
                        params=(data.strip(),tld.strip(),"","","","",worker)
                        cur.execute(query,params)
        except:
            params=(data.strip(),tld.strip(),"","","","",worker)
            cur.execute(query,params)
        con.commit()
        cur.close()
        con.close()
Example #12
def email_pattern_research():
    website = request.args['domain']
    domain = "{}.{}".format(tldextract.extract(website).domain,
                            tldextract.extract(website).tld)
    api_key = "9a31a1defcdc87a618e12970435fd44741d7b88794f7396cbec486b8"
    name = request.args['name'] if "name" in request.args.keys() else ""
    q.enqueue(EmailGuess().search_sources, domain, name, api_key, timeout=6000)
    return {'email_research_started':True}
Example #13
def is_same_domain(url1, url2):
    """Check seedurl and other url belongs to same domain.
    >>>is_same_domain("http://kracekumar.wordpress.com", "http://wordpress.com")
    True
    >>>is_same_domain("http://kracekumar.com", "http://tumblr.com")
    False
    """
    return tldextract.extract(url1).domain == tldextract.extract(url2).domain
Example #14
def mxsniff(email_or_domain, ignore_errors=False, cache=None):
    """
    Lookup MX records for a given email address, URL or domain name and identify the email service provider(s)
    from an internal list of known service providers.

    :param str email_or_domain: Email, domain or URL to lookup
    :return: Identified service provider, or a list if there's more than one (in unusual circumstances)

    >>> mxsniff('example.com')['match']
    ['nomx']
    >>> mxsniff('__invalid_domain_name__.com')['match']
    ['nomx']
    >>> mxsniff('*****@*****.**')['match']
    ['google-gmail']
    >>> sorted(mxsniff('https://google.com/').items())
    [('domain', 'google.com'), ('match', ['google-apps']), ('mx', [(10, 'aspmx.l.google.com'), (20, 'alt1.aspmx.l.google.com'), (30, 'alt2.aspmx.l.google.com'), (40, 'alt3.aspmx.l.google.com'), (50, 'alt4.aspmx.l.google.com')]), ('mx_tld', ['google.com']), ('query', 'https://google.com/')]
    """
    domain = get_domain(email_or_domain)
    if cache and domain in cache:
        return cache[domain]

    result = []
    tld = []

    try:
        answers = []  # Default value in case of verbose mode where an error occurs
        answers = sorted([(rdata.preference, rdata.exchange.to_text(omit_final_dot=True).lower())
            for rdata in dns.resolver.query(domain, 'MX')])
        for preference, exchange in answers:
            rdomain = tldextract.extract(exchange).registered_domain
            if rdomain not in tld:
                tld.append(rdomain)
            provider = provider_domains.get(exchange)
            if provider and provider not in result:
                result.append(provider)
    except (dns.resolver.NoAnswer, dns.resolver.NXDOMAIN, dns.resolver.NoNameservers):
        pass
    except dns.exception.DNSException as e:
        if ignore_errors:
            pass
        else:
            raise MXLookupException('{exc} {error} ({domain})'.format(
                exc=e.__class__.__name__, error=text_type(e), domain=domain))

    if not result:
        # Check for self-hosted email servers; identify them with the label 'self'
        if tldextract.extract(domain).registered_domain in tld:
            result.append('self')
        if not result:
            if answers:
                result.append('unknown')  # We don't know this one's provider
            else:
                result.append('nomx')  # This domain has no mail servers

    result = {'query': email_or_domain, 'domain': domain, 'match': result, 'mx': answers, 'mx_tld': tld}
    if cache:
        cache[domain] = result
    return result
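The core of the MX lookup above, pulled out as a standalone sketch (assuming dnspython 2.x, which spells the call dns.resolver.resolve; the snippet itself uses the older query name):

import dns.resolver
import tldextract

domain = 'google.com'  # arbitrary example domain
answers = sorted(
    (rdata.preference, rdata.exchange.to_text(omit_final_dot=True).lower())
    for rdata in dns.resolver.resolve(domain, 'MX')
)
mx_tlds = {tldextract.extract(exchange).registered_domain for _, exchange in answers}
print(answers)  # e.g. [(10, 'smtp.google.com')]
print(mx_tlds)  # e.g. {'google.com'} -> mail is hosted on the queried registered domain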
Example #15
    def check_domain_limit(self, url):
        for domain in self.limit_domain:
            ext = tldextract.extract(domain)
            # when the subdomain part is "*" (or empty), match any subdomain of this domain; otherwise require the full domain to match exactly
            if ((ext[0] == "*" or ext[0] == "") and tldextract.extract(url)[1] == ext[1]) or \
                    (".".join(tldextract.extract(url)) == domain):
                return True

        return False
Example #16
def check_match(_url, url):
    target_url = _url['target_url']
    allowed_domains = _url['allowed_domains']
    match = False
    url_domain = tldextract.extract(url).domain.lower()
    target_url_domain = tldextract.extract(target_url).domain.lower()
    if url_domain == target_url_domain or url_domain in allowed_domains:
        match = True
    return match
Example #17
 def _check_match(self, url):
     match = False
     url_domain = tldextract.extract(url).domain.lower()
     target_url_domain = tldextract.extract(
         self._data['url_data']['target_url']
     ).domain.lower()
     if url_domain == target_url_domain or \
             url_domain in self._data['url_data']['allowed_domains']:
         match = True
     return match
Example #18
def validRedirect(history):
	first=history[0]
	last=history[-1]
	if "blogspot" in first and "blogspot" in last:
		return True
	first = tldextract.extract(first)
	last = tldextract.extract(last)
	if first.domain!=last.domain:
		return False
	else:
		return True
def extract_HAR_features(harfile):
	"""
	Opens a HAR file (JSON), extracts features from it and store them in a dict.
	Returns the dict with the features.
	"""
	har_features = {}
	har = json.loads(open(harfile).read())

	domain = har["log"]["pages"][0]["id"]
	# Extract domain
	ext = tldextract.extract(domain)
	domain = ext.domain + '.' + ext.suffix
	domainNoTLD = ext.domain
	# initialize variables
	domainStringSent, firstparty_data, thirdparty_data, firstparty_html, thirdparty_html, firstparty_requests, thirdparty_requests = 0, 0, 0, 0, 0, 0, 0

	for entry in har["log"]["entries"]:
		requestUrl = str(entry["request"]["url"])
		ext = tldextract.extract(requestUrl)
		requestDomain = ext.domain + '.' + ext.suffix
		# Check if the domainNoTLD is passed in the parameters of the request
		url_parameters = re.search('https?:\/\/.*\/(.*)', requestUrl)
		if url_parameters:
			if domainNoTLD in url_parameters.group(1):
				domainStringSent += 1
		# Check if this is a first-party request (Request domain == site domain)
		result = re.search('https?:\/\/(.*)\/.*', requestUrl)
		if result:
			if domain in result.group(1):
				# print requestUrl, 'is FIRST party request of size', entry["response"]["bodySize"]
				firstparty_requests += 1
				firstparty_data += int(entry["response"]["bodySize"])
				if entry["response"]["content"]["mimeType"]:
					mimeType = entry["response"]["content"]["mimeType"]
					if 'text' in mimeType or 'javascript' in mimeType:
						firstparty_html += entry["response"]["bodySize"]
			else:
				# print requestUrl, 'is THIRD party request of size', entry["response"]["bodySize"]
				thirdparty_requests += 1
				thirdparty_data += int(entry["response"]["bodySize"])
				if entry["response"]["content"]["mimeType"]:
					mimeType = entry["response"]["content"]["mimeType"]
					if 'text' in mimeType or 'javascript' in mimeType:
						thirdparty_html += entry["response"]["bodySize"]

	har_features['TP_DataRatio'] = safe_division(thirdparty_data, firstparty_data + thirdparty_data)
	har_features['TP_HtmlRatio'] = safe_division(thirdparty_html, firstparty_html + thirdparty_html)
	har_features['TP_RequestRatio'] = safe_division(thirdparty_requests, firstparty_requests + thirdparty_requests)

	har_features['domainStringSent'] = domainStringSent
	har_features['initialResponseSize'] = har["log"]["entries"][0]["response"]["bodySize"]
	har_features['initialResponseRatio'] = safe_division(har_features['initialResponseSize'], firstparty_data + thirdparty_data)

	return har_features
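safe_division is used above but not shown; a minimal hypothetical stand-in consistent with that usage (returning 0 on a zero denominator) could be:

def safe_division(numerator, denominator):
    # hypothetical stand-in: avoid ZeroDivisionError for pages with no recorded traffic
    return numerator / denominator if denominator else 0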
Example #20
def url_as_diff(new, old):
    if new == old:
        return '<same>'
    if new == '-':
        return new
    old_parse = urlparse.urlsplit(old)
    new_parse = urlparse.urlsplit(new)

    changed = set()
    for f in old_parse._fields:
        new_f = getattr(new_parse, f)
        if new_f and new_f == getattr(old_parse, f):
            new_parse = new_parse._replace(**{f: '<{}>'.format(f)})
        elif new_f:
            changed.add(f)
    if tuple(changed) == ('scheme',):
        return '{}://<same>'.format(new_parse.scheme)

    if (not new_parse.netloc.startswith('<') and
            new_parse.port is None and old_parse.port is None):
        new_domain = tldextract.extract(new_parse.netloc)
        old_domain = tldextract.extract(old_parse.netloc)
        for f in old_domain._fields:
            new_f = getattr(new_domain, f)
            if new_f and new_f == getattr(old_domain, f):
                new_domain = new_domain._replace(**{f: '<{}>'.format(f)})
        new_domain = '.'.join(new_domain).replace('<domain>.<suffix>',
                                                  '<domain+>')
        new_parse = new_parse._replace(netloc=new_domain)

    if new_parse.path == old_parse.path + '/':
        new_parse = new_parse._replace(path='<path>/')
    if new_parse.path.startswith('/') and old_parse.path.startswith('/'):
        new_dirs = new_parse.path[1:].split('/')
        old_dirs = old_parse.path[1:].split('/')
        if new_dirs[-1] and new_dirs[-1] == old_dirs[-1]:
            new_dirs[-1] = '<basename>'
        old_dirs = {d: i for i, d in enumerate(old_dirs)}
        for i, new_dir in enumerate(new_dirs):
            if new_dir in old_dirs:
                new_dirs[i] = '<dir{}>'.format(old_dirs[new_dir] + 1)
        new_parse = new_parse._replace(path='/' + '/'.join(new_dirs))

    if (old_parse.query and new_parse.query and
            not new_parse.query.startswith('<')):
        old_query = set(old_parse.query.split('&'))
        new_query = set(new_parse.query.split('&'))
        if new_query > old_query:
            new_params = '&'.join(sorted(map(urllib.quote,
                                             new_query - old_query)))
            new_parse = new_parse._replace(query='<query>' + '&' + new_params)

    out = new_parse.geturl()
    return out
Example #21
    def _company_cache_html_to_df(self, html):
        company_info = pd.DataFrame()
        c = BeautifulSoup(html)
        #print c.find('dd',{'class','basic-info-about'}).text
        if True:
            cols = [i.find('h4').text
                    for i in c.find('dd',{'class','basic-info-about'}).findAll('li')]
            vals = [i.find('p').text.strip()
                    for i in c.find('dd',{'class','basic-info-about'}).findAll('li')]
            company_info = company_info.append(dict(zip(cols,vals)),ignore_index=True)
            company_info.columns = [col.replace(' ','_').strip().lower()
                                    for col in company_info.columns]
            description = c.find('div', {'class':'description'})
            description = description.text.strip() if description else None
            company_info['description'] = description
            # rename companies title columns
            img = c.find('div',{'class':'image-wrapper'}).find('img')['src']
            company_info['logo'] =  img
            # new code not in other methods in different file
            company_info['name'] = c.find('h1',{'class':'name'}).text.strip()
            employee_count = c.find('a',{'class':'employee-count'})
            if employee_count:
                company_info['employee_count'] = int(employee_count.text.replace(',',''))
            url = None
            for i in c.find_all("h3"):
                if i.find("a"):
                    url = i.find("a")["href"]
                    url = url.split("?")[-1]
                    args = dict([i.split("=") for i in url.split("&")])
                    if "f_CC" in args.keys():
                      url = "http://linkedin.com/company/{0}".format(args["f_CC"])
                    else:
                      url = None
            company_info["linkedin_url"] = url

            if 'headquarters' in company_info.columns:
                company_info['address'] = company_info['headquarters']
                company_info.drop('headquarters', axis=1, inplace=True)
            if 'industry' in company_info.columns:
                company_info['industry'] = [[company_info['industry'].ix[0]] for i in range(company_info.shape[0])]

            website = company_info['website'].ix[0]
            domain = "{}.{}".format(tldextract.extract(website).domain, 
                                    tldextract.extract(website).tld)
            company_info['domain'] = domain
            company_info['source'] = "linkedin"
            company_info['headcount'] = company_info['company_size']
            company_info['headcount'] = company_info['headcount'].ix[0].split(' ')[0]

            if 'company_size' in company_info.columns:
                company_info.drop('company_size', axis=1, inplace=True)
            return company_info
        '''
 def _company_profile(self, company_name, api_key=""):
     g = Google().search(company_name)
     g = g[~g.link_text.str.contains("Map for")]
     #print g
     #print g.link.tolist()[0]
     domain = g.link.tolist()[0]
     domain = "{}.{}".format(tldextract.extract(domain).domain,
                             tldextract.extract(domain).tld)
     print domain
     company = clearbit.Company.find(domain=domain, stream=True)
     company = company if company else {}
     company["company_name"] = company_name
     del company["founders"]
Example #23
def enhance_flow(flowDF, ftu):
    """
      Add some useful columns to a http dataframe.

      Parameters
      ----------
      flowDF : dataframe
          The HTTP log dataframe to enhance

      Returns
      -------
      flowDF: the dataframe with some columns added

    """

    #create some useful pre-features

    #stringify the port. probably no longer needed since we defensively stringify things elsewhere.
    #flowDF['resp_p_str'] = flowDF['resp_p'].apply(str)

    #extract the browser string from the user agent.
    if 'browser_string' in ftu:
      flowDF['browser_string'] = flowDF['user_agent'].apply(lambda agent: httpagentparser.simple_detect(agent)[1])
    
    def paramsSSV(uri):
        fullUri = 'http://bogus.com/'+uri
        parseResult = parse_qs(urlparse(fullUri).query)
        return ' '.join(parseResult.keys())

    #create a SSV of the URI parameter keys
    if 'URIparams' in ftu:
      flowDF['URIparams'] = flowDF['uri'].apply(paramsSSV)
    
    def tokensSSV(uri):
        fullUri = 'http://bogus.com/'+uri
        parseResult = parse_qs(urlparse(fullUri).query)
        return ' '.join([" ".join(vals) for vals in parseResult.values()])

    #create a SSV of the URI parameter values
    if 'URItokens' in ftu:
      flowDF['URItokens'] = flowDF['uri'].apply(tokensSSV)

    #extract the subdomain from the host
    if 'subdomain' in ftu:
      flowDF['subdomain'] = flowDF['host'].apply(lambda host: tldextract.extract(host)[0])

    #extract the 'tld' feature from the host (index 1 of the tldextract result, i.e. the domain part)
    if 'tld' in ftu:
      flowDF['tld'] = flowDF['host'].apply(lambda host: tldextract.extract(host)[1])

    return flowDF
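The same column-derivation pattern on a toy dataframe, as a self-contained sketch (pandas and tldextract assumed installed; the host values are made up):

import pandas as pd
import tldextract

flowDF = pd.DataFrame({'host': ['a.b.example.co.uk', 'www.python.org']})  # made-up hosts
flowDF['subdomain'] = flowDF['host'].apply(lambda host: tldextract.extract(host)[0])
flowDF['tld'] = flowDF['host'].apply(lambda host: tldextract.extract(host)[1])
print(flowDF[['host', 'subdomain', 'tld']])
# a.b.example.co.uk -> subdomain 'a.b', 'tld' column 'example'
# www.python.org    -> subdomain 'www', 'tld' column 'python'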
Example #24
    def ensure_add_cookie(self, cookie, override_domain=None):
        """Ensures a cookie gets added to the driver

        Selenium needs the driver to be currently at the domain of the cookie
        before allowing you to add it, so we need to work around this limitation.

        The cookie parameter is a dict which must contain the keys (name, value, domain) and
        may contain the keys (path, expiry).

        We first check that we aren't currently in the cookie's domain, if we aren't, we GET
        the cookie's domain and then add the cookies to the driver.

        We can override the cookie's domain using 'override_domain'. The use for this
        parameter is that sometimes GETting the cookie's domain redirects you to a different
        subdomain, and therefore adding the cookie fails. So sometimes the user may
        need to override the cookie's domain to a less strict one, e.g. 'site.com' instead of
        'home.site.com'; in this way even if the site redirects us to a subdomain, the cookie will
        stick. If you set the domain to '', the cookie gets added with whatever domain the browser
        is currently at (at least in chrome it does), so this ensures the cookie gets added.

        It also retries adding the cookie with a more permissive domain if it fails in the first
        try, and raises an exception if that fails. The standard selenium behaviour in this case
        was to not do anything, which was very hard to debug.
        """
        if override_domain:
            cookie['domain'] = override_domain

        cookie_domain = cookie['domain'] if cookie['domain'][0] != '.' else cookie['domain'][1:]
        try:
            browser_domain = tldextract.extract(self.current_url).fqdn
        except AttributeError:
            browser_domain = ''
        if cookie_domain not in browser_domain:
            # TODO Check if hardcoding 'http' causes trouble
            # TODO Consider using a new proxy for this next request to not cause an anomalous
            #      request. This way their server sees our ip address as continuously having the
            #      same cookies and not have a request mid-session with no cookies
            self.get('http://' + cookie_domain)

        # Fixes phantomjs bug, all domains must start with a period
        if self.name == "phantomjs": cookie['domain'] = '.' + cookie['domain']
        self.add_cookie(cookie)

        # If we fail adding the cookie, retry with a more permissive domain
        if not self.is_cookie_in_driver(cookie):
            cookie['domain'] = tldextract.extract(cookie['domain']).registered_domain
            self.add_cookie(cookie)
            if not self.is_cookie_in_driver(cookie):
                raise WebDriverException(
                    "Couldn't add the following cookie to the webdriver\n{}\n".format(cookie)
                )
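The permissive-domain retry above reduces to tldextract's registered_domain attribute; in isolation, with a made-up cookie domain:

import tldextract

cookie_domain = 'home.site.com'  # hypothetical cookie domain
print(tldextract.extract(cookie_domain).fqdn)               # 'home.site.com'
print(tldextract.extract(cookie_domain).registered_domain)  # 'site.com' -- the permissive fallback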
Example #25
File: google.py Project: yz-/ut
def parse_ad(rad):
    d = dict()
    t = rad.find('h3').find('a')

    dest_url = t.get('href')
    if dest_url:
        d['dest_url'] = dest_url
        # d = dict(d,**{'dest_url':dest_url})
        dest_url_parsed = parse_qs(dest_url)
        if dest_url_parsed:
            dest_url_parsed = {k:v[0] for k,v in dest_url_parsed.iteritems()}
            if dest_url_parsed:
                d['dest_url_parsed'] = dest_url_parsed
                if dest_url_parsed.has_key('adurl'):
                    adurl = dest_url_parsed['adurl']
                    if adurl:
                        d['adurl'] = adurl
                        d['adurl_domain'] = tldextract.extract(adurl).domain

    title = t.getText()
    if title:
        d['title'] = title
        #d = dict(d,**{'title':title})

    disp_url = rad.find('div','kv')
    if disp_url:
        d['disp_url'] = disp_url.getText()
        d['disp_url_domain'] = tldextract.extract(d['disp_url']).domain
    #
    ad_text_html = rad.find('span','ac')
    if ad_text_html:
        d['ad_text_html'] = ad_text_html.renderContents()
        ad_text_lines = [re.sub(r"</?b>","",x) for x in d['ad_text_html'].split('<br/>')]
        if len(ad_text_lines)>=1:
            d['ad_text_line_1'] = ad_text_lines[0]
            if len(ad_text_lines)>=2:
                d['ad_text_line_2'] = ad_text_lines[1]
            else:
                d['ad_text_line_2'] = ''
        else:
            d['ad_text_line_1'] = ''


    div_f_html = rad.find('div','f')
    if div_f_html:
        d['div_f_html'] = div_f_html.renderContents()
        d['div_f_text'] = div_f_html.get_text('|||')

    # ad_text = ttt.getText(separator='|||')
    return d
def parse_file(filename, output_file):
	input_file = open(filename,'r')
	domains_lookedup = []
	excluded_domains = []
	total_domain_count = 0
	if output_file != 0:
		data = [csv_headers()]
		noutput_file = output_file.split('.',1)[0]+'.csv'
		print """
****************** Writing output to %s ******************
"""%noutput_file
		for domain in input_file.readlines():
			ndomain = tldextract.extract(domain)
			tld_domain = ndomain[1]+'.'+ndomain[2]
			if tld_domain not in domains_lookedup:
				domains_lookedup.append(tld_domain)
				total_domain_count += 1
				whois_data = get_whois_data(tld_domain,1)
				if whois_data != 0:
					data.append(whois_data)
				else:
					excluded_domains.append(tld_domain)
				time.sleep(2)
		print """
Attempted to retrieve whois information for %s domains
Successful lookups: %s
Unsuccessful lookups: %s
"""%(str(total_domain_count),str(total_domain_count-len(excluded_domains)),str(len(excluded_domains)))				
		write_to_file(data,noutput_file)
	else:
		for domain in input_file.readlines():
			ndomain = tldextract.extract(domain)
			tld_domain = ndomain[1]+'.'+ndomain[2]
			if tld_domain not in domains_lookedup:
				domains_lookedup.append(tld_domain)
				total_domain_count += 1
				whois_info = get_whois_data(tld_domain,2)
				if whois_info != 0:
					print "\n****************** %s ******************"%tld_domain.strip()
					for key,value in whois_info.items():
						print key+": "+value
				else:
					excluded_domains.append(domain)
				time.sleep(2)
		print """
Attempted to retrieve whois information for %s domains
Successful lookups: %s
Unsuccessful lookups: %s
"""%(str(total_domain_count),str(total_domain_count-len(excluded_domains)),str(len(excluded_domains)))
		print excluded_domains
def clean_row(row):
    """ Clean a row to [0, 2, 27, 30, 31, 7, 17, 26, 32, 33, 34, 35, 37, 44, 58] """

    if len(row) < NUM_FIELDS:
        row.extend([""] * (NUM_FIELDS - len(row)))  # pad short rows with empty fields

    new_row = [row[0]]  # just id

    # format date
    unix_epoch = datetime(1970, 1, 1)
    sql_date = row[1]
    parsed_sql_date = datetime(int(sql_date[:4]), int(sql_date[4:6]), int(sql_date[6:8]))
    days_since_epoch = (parsed_sql_date - unix_epoch).days

    new_row.extend([days_since_epoch])

    # add other fields
    new_row.extend(
        [row[26], row[29], row[30], row[6], row[16], row[25], row[31], row[32], row[33], row[34], row[36], row[43]]
    )

    # format url
    domain = tldextract.extract(row[57]).domain
    new_row.extend([domain])

    return new_row
Example #28
def add_to_org(sender, **kwargs):
    org = sender.objects.last()
    user = org.creator

    member, created = OrganisationMember.objects.get_or_create(
        org=org, member=user)

    if created:
        tld = tldextract.extract(org.url)

        client = Client()
        client.name = org.name
        client.organisation = org
        client.schema_name = org.slug
        client.paid_until = datetime.now() + timedelta(days=90)
        try:
            client.domain_url = tld.domain
            client.save()
        except KeyError:
            try:
                client.domain_url = tld.domain + '-' + tld.subdomain
                client.save()
            except KeyError:
                client.domain_url = org.slug
                client.save()
    def _process_record(self, r):
        # user agent and browser
        os, browser = self.__process_user_agent(r['user_agent'])
        if os: self.data['os'].add(os)
        if browser: self.data['browser'].add(browser)
        
        # http basic auth usernames and passwords
        if r['username'] != '-': self.data['http-usernames'].add(r['username'])
        if r['password'] != '-': self.data['http-passwords'].add(r['password'])
        
        # ip address
        if r['id.orig_h'] != '-': self.data['device-ip'] = r['id.orig_h']
        
        host = r['host']
        uri = r['uri']

        if host != "-" or uri != "-": 
            data = (host, uri, r['ts'])
            self.data['http-queries'].add( data )
        
            # Also get referrer data
            ref = r['referrer']
            domain  = tldextract.extract(ref)
            if ref:
                refhost = "%s.%s" % (domain.domain, domain.suffix)
                refdata = (refhost, ref, r['ts'])
                self.data['http-queries'].add(refdata)
Example #30
def spider(url,lvl=1):
    tld_url = tldextract.extract(url)
    tld_url = '.'.join(tld_url[:3])
    pos = url.rfind('/')
    outFile = url[pos+1:]
    print (outFile)
    response = requests.get(url) #store the full response, headers included, in 'response'
    if response.status_code == 200:
        plain_text = response.text #sort source code and store only the plaintext
        convert_data = BeautifulSoup(plain_text) #converting plain_text to Beautiful Soup object so the library can sort thru it
        for link in convert_data.findAll('a'):  #sorting useful information
            if link.get('href').find('//') == 0: #address URLs that start with //
                href = 'https:' + link.get('href')
            elif validators.url(link.get('href')): #address absolute URLs
                href = link.get('href')
            else: #address relative URLs
                href = url + link.get('href') #Building a clickable url
            #insertSQL(href, convert_data)
            print(indent(lvl) +str(lvl) + '.  ' +href) #displaying the result back to the user
            #outData = codecs.open(saveLocation +'\\' +outFile +'.html', 'w', 'utf-8')
            #outData.write(plain_text)
            #outData.close()


            if lvl < max_depth:
                spider(href, lvl+1)
Example #31
def parse_source(url):
    """Return stripped url containing only domain and suffix.
    """
    return '{0.domain}.{0.suffix}'.format(extract(url))
Example #32
from urllib.request import urlopen
import lxml.html
import tldextract
url = 'https://www.google.com'

connection = urlopen(url)
dom = lxml.html.fromstring(connection.read())
extracted = tldextract.extract(url)
main_domain = extracted.domain
yes = 0
no = 0
count = 0
for link in dom.xpath(
        '//a/@href'):  # select the url in href for all a tags(links)
    link = str(link)
    count += 1
    if main_domain in link:
        yes += 1
    elif link.startswith('/'):
        yes += 1
    else:
        print(link)
        no += 1
print((yes / count) * 100)
print((no / count) * 100)
n_per = (no / count) * 100
if n_per > 70:
    score15 = -1
elif 60 <= n_per <= 70:
    score15 = 0
else:
 def get_domain_from_host(validation_dns_record):
     """ Given an FQDN, return the domain
         portion of a host
     """
     domain_tld_info = tldextract.extract(validation_dns_record)
     return "%s.%s" % (domain_tld_info.domain, domain_tld_info.suffix)
def get_domain_from_url(url):
    url_object = tldextract.extract(url)
    domain_name = url_object.domain + "." + url_object.suffix
    return domain_name
def processing(data):
    bytesOut = data['bytesOut']
    bytesIn = data['bytesIn']
    pktsOut = data['pktsOut']
    pktsIn = data['pktsIn']
    tlsSubject = data['tlsSubject']
    tlsIssuerDn = data['tlsIssuerDn']
    tlsSni = data['tlsSni']
    tlsVersion = data['tlsVersion']

    outRatio = []  # bytes out / packets out
    inRatio = []  # bytes in / packets in
    orgName = []
    sni = []
    '''
    Compute (bytes out / packets out) and (bytes in / packets in)
    '''

    for i in range(len(bytesIn)):
        outRatio.append(bytesOut[i] / pktsOut[i])
        inRatio.append(bytesIn[i] / pktsIn[i])
        # print('outRatio: {}, inRatio: {}'.format(bytesOut[i] / pktsOut[i], bytesIn[i] / pktsIn[i]))
    '''
    Filter the organization out of every tlsSubject and collect it in the orgName list
    '''
    pattern_O = 'O=.*?([,/]+|$)'

    for tmp in tlsSubject:  # read each tlsSubject value, cut out the O= field and save it to orgName; missing values get a placeholder string
        if pd.isna(tmp):
            orgName.append('NULL')
        else:
            res = re.search(pattern_O, tmp)
            if res:
                res = res.group()
                if res.startswith('O='):
                    res = res[2:]
                if res.endswith(','):
                    res = res[:-1]
                if res.endswith('.'):
                    res = res[:-1]
                if res.endswith('./'):
                    res = res[:-2]
                orgName.append(res)
            else:
                orgName.append('null')  # distinguish a missing O= field from a completely missing subject
    '''
    Filter the CN out of the Subject
    '''
    pattern_CN = 'CN=.*?(/|$)'
    commonName = []

    for tmp in tlsSubject:
        if pd.isna(tmp):
            commonName.append('NULL')
        else:
            res = re.search(pattern_CN, tmp)
            if res:
                res = res.group()
                if res.startswith('CN='):
                    res = res[3:]
                if res.endswith('/'):
                    res = res[:-1]
                commonName.append(res)
            else:
                commonName.append('null')
    '''
    Filter the CN out of tlsIssuerDn
    '''
    pattern_CN = 'CN=.*?(/|$)'
    dn_commonName = []

    for tmp in tlsIssuerDn:
        if pd.isna(tmp):
            dn_commonName.append('NULL')
        else:
            res = re.search(pattern_CN, tmp)
            if res:
                res = res.group()
                if res.startswith('CN='):
                    res = res[3:]
                if res.endswith('/'):
                    res = res[:-1]
                dn_commonName.append(res)
            else:
                dn_commonName.append('null')
    '''
    Take the domain part from tlsSni
    '''
    for tmp in tlsSni:
        if pd.isna(tmp):
            sni.append('NULL')
        else:
            tld = tldextract.extract(tmp)
            sni.append(tld.domain)

    X = pd.DataFrame({
        'O': orgName,
        'CN': commonName,
        'Dn': dn_commonName,
        'Sni': sni,
        'Version': tlsVersion,
        'OutRatio': outRatio,
        'InRatio': inRatio
    })
    return X
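A quick standalone check of the O= pattern used above, against a made-up tlsSubject value:

import re

pattern_O = 'O=.*?([,/]+|$)'
tls_subject = 'C=US/ST=CA/O=Example Corp/CN=example.com'  # made-up tlsSubject value
match = re.search(pattern_O, tls_subject).group()
print(match)                   # 'O=Example Corp/'
print(match[2:].rstrip(',/'))  # 'Example Corp' once the prefix and trailing delimiter are stripped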
Example #36
def get_domain(url):
    """Get the domain of a URL using tldextract."""
    return tldextract.extract(url).domain
Example #37
    '--verbose3',
    help='DEBUG level verbosity. Also displays headers and requests',
    action='store_true')

args = parser.parse_args()

if args.url.startswith(('http', 'https')):
    url = args.url
else:
    print 'ERROR: The URL must start with http or https'
    exit()
print('The URL being tested is ' + colorgrn.format(str(url)))
#extract base url string, will convert string like google.com to googlecom
#domain = str(str(url.split("//")[1:]))[1:-1]
#burl = domain.replace("'","")
ext = tldextract.extract(url)
domain = '.'.join(ext[1:])
#Set unique user Agent
if args.useragent == ('google'):
    headers = {'User-Agent': 'Googlebot/2.1 (+http://www.google.com/bot.html)'}
elif args.useragent == ('bing'):
    headers = {
        'User-Agent':
        'Mozilla/5.0 (compatible; bingbot/2.0 +http://www.bing.com/bingbot.htm)'
    }
elif args.useragent == ('ie6'):
    headers = {
        'User-Agent':
        'Mozilla/5.0 (Windows; U; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 2.0.50727)'
    }
elif args.useragent == ('ie10'):
Example #38
    def start_requests(self):
        urls = list()
        non_shopify_list = list()
        bots_list = list()

        # Get all urls to scrape
        with open(os.path.dirname(__file__) + self.url_file, "rt") as f:
            urls = [url.strip() for url in f.readlines()]

        # Supported non shopify sites list
        with open(os.path.dirname(__file__) + self.non_shopify_file,
                  "rt") as f:
            non_shopify_list = [url.strip() for url in f.readlines()]

        # Supported bots sites list
        with open(os.path.dirname(__file__) + self.bots_file, "rt") as f:
            bots_list = [url.strip() for url in f.readlines()]

        for url in urls:
            t = tldextract.extract(url)
            root = t.domain + '.' + t.suffix
            proxy_enabled = self.settings.get('PROXY_ENABLED')
            adidas_proxy_enabled = self.settings.get('ADIDAS_PROXY_ENABLED')

            # Adidas site (uses scrapy-splash)
            if "adidas.com" in url:
                # With proxy
                if adidas_proxy_enabled:
                    yield SplashRequest(url,
                                        self.adidas_parse,
                                        headers=self.adidas_headers(),
                                        args={
                                            'images_enabled': 'false',
                                            'proxy': self.random_proxy()
                                        })

                # Without proxy
                else:
                    yield SplashRequest(url,
                                        self.adidas_parse,
                                        headers=self.adidas_headers(),
                                        args={'images_enabled': 'false'})

            # Non shopify site
            elif any(root in s for s in non_shopify_list):
                # With proxy
                if proxy_enabled:
                    yield scrapy.Request(url,
                                         self.non_shoify,
                                         meta={'proxy': self.random_proxy()})

                # Without proxy
                else:
                    yield scrapy.Request(url, self.non_shoify)

            # Bots
            elif any(root in s for s in bots_list):
                # With proxy
                if proxy_enabled:
                    yield scrapy.Request(url,
                                         self.bots_parse,
                                         meta={'proxy': self.random_proxy()})

                # Without proxy
                else:
                    yield scrapy.Request(url, self.bots_parse)

            # Shopify sites
            else:
                # With proxy
                if proxy_enabled:
                    yield scrapy.Request(url,
                                         self.shopify_parse,
                                         meta={'proxy': self.random_proxy()})

                # Without proxy
                else:
                    yield scrapy.Request(url, self.shopify_parse)
Example #39
    def non_shoify(self, response):
        t = tldextract.extract(response.url)
        root = t.domain + '.' + t.suffix

        if "footshop.com" in root:
            products = Selector(response).xpath(
                '//div[@class="col-xs-6 col-md-4 col-lg-3"]')

            for product in products:
                item = Sneaker()
                item['name'] = product.xpath('a/@title').extract()[0]
                item['url'] = product.xpath('a/@href').extract()[0]
                # item['image'] = product.xpath('a/div/img/@data-src').extract()[0]
                # item['size'] = '**NOT SUPPORTED YET**'
                yield item

        elif "caliroots.com" in root:
            products = Selector(response).xpath(
                '//ul[@class="product-list row"]//li[contains(@class,"product")]'
            )

            for product in products:
                item = Sneaker()
                item['name'] = product.xpath('.//a/p[2]/text()').extract()[0]
                item['url'] = "https://caliroots.com" + \
                    product.xpath('.//a/@href').extract()[0]
                # item['image'] = product.xpath('.//a/div/img/@src').extract()[0]
                # item['size'] = '**NOT SUPPORTED YET**'
                yield item

        elif "size.co.uk" in root:
            products = Selector(response).xpath(
                '//ul[@class="listProducts productList"]//li[contains(@class,"productListItem")]'
            )

            for product in products:
                item = Sneaker()
                item['name'] = product.xpath(
                    './/span/span/span/a/text()').extract()[0]
                item['url'] = "https://www.size.co.uk" + \
                    product.xpath('.//span/span/span/a/@href').extract()[0]
                # item['image'] = product.xpath('.//span/a/img/@src').extract()[0]
                # item['size'] = '**NOT SUPPORTED YET**'
                yield item

        elif "jdsports.co.uk" in root:
            products = Selector(response).xpath(
                '//ul[@class="listProducts productList"]//li[contains(@class,"productListItem")]'
            )

            for product in products:
                item = Sneaker()
                item['name'] = product.xpath(
                    './/span/a/img/@title').extract()[0]
                item['url'] = "https://www.jdsports.co.uk" + \
                    product.xpath('.//span/a/@href').extract()[0]
                # item['image'] = product.xpath('.//span/a/img/@src').extract()[0]
                # item['size'] = '**NOT SUPPORTED YET**'
                yield item

        elif "5pointz.co.uk" in root:
            products = Selector(response).xpath(
                '//ol[@class="listing listing--grid"]//li[contains(@class,"listing-item")]//article//figure'
            )

            for product in products:
                item = Sneaker()
                item['name'] = product.xpath('a/@title').extract()[0]
                item['url'] = product.xpath('a/@href').extract()[0]
                # item['image'] = product.xpath('a/img/@src').extract()[0]
                # item['size'] = '**NOT SUPPORTED YET**'
                yield item

        elif "footasylum.com" in root:
            products = Selector(response).xpath(
                '//div[@class="productDataOnPage_inner"]//ul[@class="main-list row"]//li[contains(@class,"left")]'
            )

            for product in products:
                item = Sneaker()
                item['name'] = product.xpath(
                    'div/span[2]/img/@alt').extract()[0]
                item['url'] = product.xpath('div/span[1]/text()').extract()[0]
                # item['image'] = "https://www.footasylum.com" + product.xpath('div/span[2]/img/@data-original').extract()[0]
                # item['size'] = '**NOT SUPPORTED YET**'
                yield item

        elif "asphaltgold.de" in root:
            products = Selector(response).xpath(
                '//div[@class="product-grid"]//section[contains(@class,"item")]'
            )

            for product in products:
                item = Sneaker()
                item['name'] = product.xpath('a/@title').extract()[0]
                item['url'] = product.xpath('a/@href').extract()[0]
                # item['image'] = product.xpath('a/img//@src').extract()[0]
                # item['size'] = '**NOT SUPPORTED YET**'
                yield item

        elif "wellgosh.com" in root:
            products = Selector(response).xpath(
                '//div[@class="category-products row grid-mode"]//article[contains(@class,"small-6")]'
            )

            for product in products:
                item = Sneaker()
                item['name'] = product.xpath('.//figure/a/@title').extract()[0]
                item['url'] = product.xpath('.//figure/a/@href').extract()[0]
                # item['image'] = product.xpath('.//figure/a/img/@src').extract()[0]
                # item['size'] = '**NOT SUPPORTED YET**'
                yield item

        elif "hypedc.com" in root:
            products = Selector(response).xpath(
                '//div[@class="category-products row"]//div[contains(@class,"item")]'
            )

            for product in products:
                item = Sneaker()
                item['name'] = product.xpath('.//a/@title').extract()[0]
                item['url'] = product.xpath('.//a/@href').extract()[0]
                # item['image'] = product.xpath('.//a/div/img/@data-src').extract()[0]
                # item['size'] = '**NOT SUPPORTED YET**'
                yield item

        elif "bstnstore.com" in root:
            products = Selector(response).xpath(
                '//ul[@class="block-grid four-up mobile-two-up productlist"]//li[contains(@class,"item")]//div[@class="itemWrapper pOverlay"]//div[@class="pImageContainer"]//a[@class="plink image"]'
            )

            for product in products:
                item = Sneaker()
                item['name'] = product.xpath('div/@data-alt').extract()[0]
                item['url'] = "https://www.bstnstore.com" + \
                    product.xpath('@href').extract()[0]
                # item['image'] = "https://www.bstnstore.com" + product.xpath('div/div[2]/@data-src').extract()[0]
                # item['size'] = '**NOT SUPPORTED YET**'
                yield item

        elif "allikestore.com" in root:
            products = Selector(response).xpath(
                '//ul[@class="products-grid"]//li[contains(@class,"item")]//div[@class="item-wrap"]'
            )

            for product in products:
                item = Sneaker()
                item['name'] = product.xpath('a/@title').extract()[0]
                item['url'] = product.xpath('a/@href').extract()[0]
                # item['image'] = product.xpath('a/img/@src').extract()[0]
                # item['size'] = '**NOT SUPPORTED YET**'
                yield item

        elif "back-door.it" in root:
            products = Selector(response).xpath(
                '//ul[@class="products clearfix"]//li')

            for product in products:
                item = Sneaker()
                item['name'] = product.xpath('a[1]/h6/text()').extract()[0]
                item['url'] = product.xpath('a[1]/@href').extract()[0]
                # item['image'] = product.xpath('div/a[2]/span/img/@src').extract()[0]
                # item['size'] = '**NOT SUPPORTED YET**'
                yield item

        elif "mrporter.com" in root:
            products = Selector(response).xpath(
                '//div[@class="pl-grid__column pl-grid__column--main"]//ul[@class="pl-products"]//li[contains(@class,"pl-products-item")]'
            )

            for product in products:
                item = Sneaker()
                item['name'] = product.xpath(
                    'a/div[2]/div/span[2]/text()').extract()[0].replace(
                        " Sneakers", "")
                item['url'] = "https://www.mrporter.com" + \
                    product.xpath('a/@href').extract()[0]
                # item['image'] = product.xpath('a/div[1]/img/@src').extract()[0]
                # item['size'] = '**NOT SUPPORTED YET**'
                yield item

        elif "titolo.ch" in root:
            products = Selector(response).xpath(
                '//ul[@class="small-block-grid-2 medium-block-grid-3 large-block-grid-4 no-bullet"]//li[contains(@class,"item")]//div[@class="list-inner-wrapper"]'
            )

            for product in products:
                item = Sneaker()
                item['name'] = product.xpath('a/@title').extract()[0]
                item['url'] = product.xpath('a/@href').extract()[0]
                # item['image'] = product.xpath('div[1]/a/img/@src').extract()[0]
                # item['size'] = '**NOT SUPPORTED YET**'
                yield item

        elif "xileclothing.com" in root:
            products = Selector(response).xpath(
                '//ul[@class="itemsList"]/li/div[1]')

            for product in products:
                item = Sneaker()
                item['name'] = product.xpath('a/img/@alt').extract()[0]
                item['url'] = product.xpath('a/@href').extract()[0]
                # item['image'] = "https://www.xileclothing.com" + product.xpath('a/img/@src').extract()[0]
                # item['size'] = '**NOT SUPPORTED YET**'
                yield item
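
The fragments above only ever fill item['name'] and item['url'] (image and size are commented out). A minimal Sneaker item definition consistent with that usage, offered purely as an assumption about the original project, might look like:

import scrapy

class Sneaker(scrapy.Item):
    # Fields inferred from the parse code above; the real project may define more.
    name = scrapy.Field()
    url = scrapy.Field()
    image = scrapy.Field()
    size = scrapy.Field()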
Beispiel #40
0
 def validate_result(self, pattern_text: str):  # noqa D102
     result = tldextract.extract(pattern_text)
     return result.fqdn != ""
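
As a hedged illustration of the check above: tldextract only reports a non-empty fqdn when it recognises both a registered domain and a public suffix, so bare words and unknown hosts fail the test. The standalone helper below is hypothetical, not part of the original class.

import tldextract

def looks_like_domain(pattern_text: str) -> bool:
    # Same rule as validate_result above: a non-empty fqdn means tldextract
    # found both a domain and a known public suffix.
    return tldextract.extract(pattern_text).fqdn != ""

print(looks_like_domain("mail.example.co.uk"))  # True
print(looks_like_domain("localhost"))           # False, no public suffix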
Beispiel #41
0
def extract_domain(url):
    ext = tldextract.extract(url)
    domain = ext.domain
    return domain
Beispiel #42
0
def get_alexa(num, address=ALEXA_1M, filename='top-1m.csv'):
    """Grabs Alexa 1M"""
    url = urlopen(address)
    zipfile = ZipFile(StringIO(url.read()))
    return [tldextract.extract(x.split(',')[1]).domain for x in \
            zipfile.read(filename).split()[:num]]
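
The snippet above is Python 2 (StringIO, str-based zip contents). A rough Python 3 sketch of the same idea, assuming the archive still holds a rank,domain CSV, could look like this; the function name and parameters are illustrative only.

import io
from urllib.request import urlopen
from zipfile import ZipFile

import tldextract

def get_alexa_top_domains(num, address, filename='top-1m.csv'):
    """Return the domain part of the first `num` entries in the Alexa zip."""
    with urlopen(address) as resp:
        archive = ZipFile(io.BytesIO(resp.read()))
    lines = archive.read(filename).decode('utf-8').splitlines()
    return [tldextract.extract(line.split(',')[1]).domain
            for line in lines[:num]]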
Beispiel #43
0
def get_registered_domain(hostname):
    """Get the root DNS domain of an FQDN."""
    return tldextract.extract(hostname).registered_domain
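
For reference, registered_domain is just the domain joined to the suffix when both are known, and an empty string otherwise; a small illustrative run:

import tldextract

ext = tldextract.extract("https://forums.news.bbc.co.uk/page?id=1")
print(ext.subdomain)          # 'forums.news'
print(ext.domain)             # 'bbc'
print(ext.suffix)             # 'co.uk'
print(ext.registered_domain)  # 'bbc.co.uk'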
Beispiel #44
0
def count_words_tld(address):
    """Count occurrences of keywords in domain"""
    count = count_words(tldextract.extract(address).domain)
    return count
Beispiel #45
0
def scrapper(url):

    if "www" in url:
        url = url.replace("www.", "")
        print(url)
    else:
        pass

    headers = {
        'User-Agent':
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.76 Safari/537.36',
        "Upgrade-Insecure-Requests": "1",
        "DNT": "1",
        "Accept":
        "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.5",
        "Accept-Encoding": "gzip, deflate"
    }

    final_report = []
    final_score = 0
    from result_dict import result_dict

    domain = tldextract.extract(url).domain
    suffix = tldextract.extract(url).suffix
    subdomain = tldextract.extract(url).subdomain
    pattern = '<a [^>]*href=[\'|"](.*?)[\'"].*?>'

    # row 15 HTTPS test

    result = {'name': 'https_test', 'message': '', 'marks': ''}

    if "https" in url or "http" in url:
        print("if worked")

        try:
            a = url.split(":")
            a[0] = "https:"
            web = "".join(a)
        except:
            pass

        print("This is web  ", web)

        try:
            print("try of if worked")
            r = requests.get(web, headers=headers)
            # req = urllib.request.Request(url, headers=headers)
            # r = urllib.request.urlopen(req)
            result[
                'message'] = 'Félicitations. Les données transitant par votre site sont sécurisées avec un certificat SSL'
            result['marks'] = 4
        except:
            try:
                a = url.split(":")
                a[0] = "http:"
                url3 = "".join(a)
            except:
                pass

            print("try of except worked")
            r = requests.get(url3, headers=headers, verify=False)
            url = url3
            # req = urllib.request.Request(url, headers=headers)
            # r = urllib.request.urlopen(req)
            result['message'] = '''
            Votre site ne dispose pas de certificat SSL. Les données qui y transitent peuvent donc être récupérés par des parties malveillantes. Google donne une grande importance à la sécurité des visiteurs.
            '''
            result['marks'] = 0
            print("HTTPS didn't worked")

    else:
        print("else worked")
        try:
            url2 = 'https://' + url
            r = requests.get(url2, headers=headers)
            url = url2
            # req = urllib.request.Request(url, headers=headers)
            # r = urllib.request.urlopen(req)
            result[
                'message'] = 'Félicitations. Les données transitant par votre site sont sécurisées avec un certificat SSL'
            result['marks'] = 4

        except:
            url1 = 'http://' + url
            print("from else except ", url1)
            r = requests.get(url1, headers=headers, verify=False)
            url = url1
            # req = urllib.request.Request(url, headers=headers)
            # r = urllib.request.urlopen(req)
            result['message'] = '''
                Votre site ne dispose pas de certificat SSL. Les données qui y transitent peuvent donc être récupérés par des parties malveillantes. Google donne une grande importance à la sécurité des visiteurs.
                '''
            result['marks'] = 0

    # Record the HTTPS test result whichever branch ran above.
    result_dict['https_test'] = result
    final_score = final_score + result['marks']

    soup = BeautifulSoup(r.text, "lxml")

    # This is for row 1 (title)
    try:
        title_content = soup.find('title').text
        title_ln = len(title_content)

        if title_ln < 70:
            result = {
                'name': 'title',
                'message':
                'Félicitations votre site dispose d’un titre avec un nombre de caractères optimale soit moins de 70 caractères',
                'title_length': title_ln,
                'title_content': title_content,
                'marks': 5
            }
            final_score = final_score + 5
            result_dict['title'] = result
        else:
            result = {
                'name': 'title',
                'message':
                'Votre titre est trop long, le nombre de caractères optimal est de 70 caractères, essayez de le raccourcir',
                'title_length': title_ln,
                'title_content': title_content,
                'marks': 2
            }
            final_score = final_score + 2
            result_dict['title'] = result
    except:
        result = {
            'name': 'title',
            'message':
            'Votre site ne dispose pas de balise meta title. La balise <title> correspond au titre de votre page web. Il s’agit d’un champ essentiel à ne pas négliger dans le cadre d’une bonne stratégie d’optimisation du référencement naturel puisqu’elle est l’un des critères les plus importants pour les moteurs de recherche (Google, Bing...)',
            'title_length': 0,
            'marks': 0
        }
        final_score = final_score + 0
        result_dict['title'] = result

    # This is for row 2 (meta @description)
    name = 'meta_description'
    length_var_name = 'meta_desc_len'
    try:
        meta_tag = soup.find("meta", {"name": "description"})
        desc_content = meta_tag['content']
        desc_text_ln = len(desc_content)
        #desc_text_ln = int(desc_text_ln)

        if desc_text_ln < 150:
            result = {
                'name': name,
                'message':
                'Votre méta-description est trop courte, le nombre de caractère optimale doit être entre 150 et 250 caractères.',
                length_var_name: desc_text_ln,
                'desc_content': desc_content,
                'marks': 1
            }
            final_score = final_score + result['marks']
            result_dict['meta_description'] = result
            print('try worked1')

        elif 150 <= desc_text_ln <= 250:
            result = {
                'name': name,
                'message':
                'Félicitations votre site dispose d’une méta-description avec un nombre de caractère optimal entre 150 et 250 caractères',
                length_var_name: desc_text_ln,
                'desc_content': desc_content,
                'marks': 3
            }
            final_score = final_score + result['marks']
            result_dict['meta_description'] = result
            print('try worked2')

        else:
            result = {
                'name': name,
                'message':
                ' Votre méta-description est trop longue, essayez de la raccourcir, le nombre optimal est entre 150 et 250 caractères, le reste risque d’être tronqué sur l’affichage du résultat sur les moteurs de recherche.',
                length_var_name: desc_text_ln,
                'desc_content': desc_content,
                'marks': 2
            }
            final_score = final_score + result['marks']
            result_dict['meta_description'] = result
            print('try worked3')
    except:
        result1 = {
            'name': name,
            'message':
            'Votre site ne dispose pas de méta-description, La balise meta description manque sur votre page. Vous devez inclure cette balise afin de fournir une brève description de votre page pouvant être utilisée par les moteurs de recherche. Des méta-descriptions bien écrites et attrayantes peuvent également aider les taux de clics sur votre site dans les résultats de moteur de recherche.',
            length_var_name: 0,
            'marks': 0
        }
        final_score = final_score + result1['marks']
        result_dict['meta_description'] = result1
        print('except worked')

    # This is for row 3 (meta @keywords)
    name = 'meta_keywords'
    length_var_name = 'meta_key_len'
    try:
        meta_tag = soup.find("meta", {"name": "keywords"})
        meta_key_content_ln = len(meta_tag['content'])
        #title_ln = int(meta_key_content_ln)

        if meta_key_content_ln:
            result = {
                'name': name,
                'message':
                'Bravo vous avez spécifié des meta keywords . Vos mots-clés principaux doivent apparaître dans vos méta-tags pour vous aider à identifier le sujet de votre page Web dans les moteurs de recherche.',
                length_var_name: meta_key_content_ln,
                'marks': 1
            }
            final_score = final_score + result['marks']
            result_dict['meta_keywords'] = result
            print('try worked1')
    except:
        result1 = {
            'name': name,
            'message':
            'Vos mots-clés principaux doivent apparaître dans vos méta-tags pour vous aider à identifier le sujet de votre page Web dans les moteurs de recherche.',
            length_var_name: 0,
            'marks': 0
        }
        final_score = final_score + result1['marks']
        result_dict['meta_keywords'] = result1
        print('except worked')

    # This is for row 4 (meta @robots)
    name = 'meta_robots'
    length_var_name = 'meta_robots_len'
    try:
        meta_tag = soup.find("meta", {"name": "robots"})
        meta_robots_content = len(meta_tag['content'])
        # title_ln = int(desc_text_ln)

        if meta_robots_content:
            result = {
                'name': name,
                'message': "Votre site dispose d'un fichier robots.txt",
                length_var_name: meta_robots_content,
                'marks': 4
            }
            final_score = final_score + result['marks']
            result_dict['meta_robots'] = result
            print('try worked1')
    except:
        result1 = {
            'name': name,
            'message': '''
                        Votre site n’a pas de robot.txt
                        Le robots.txt est un fichier texte utilisant un format précis qui permet à un Webmaster de contrôler quelles zones de son site un robot d'indexation est autorisé à analyser. Ce fichier texte sera disponible à une URL bien précise pour un site donné, par exemple http://www.monsite.com/robots.txt
                        Pour bien comprendre à quoi sert un robots.txt, il faut comprendre la manière dont fonctionnent les robots d'indexation des moteurs de recherche (appelés aussi Web spiders, Web crawlers ou Bots) tels que Google, Yahoo ou Bing. Voici leurs actions lorsqu'ils analysent un site tel que www.monsite.com : ils commencent par télécharger et analyser le fichier http://www.monsite.com/robots.txt.
            ''',
            length_var_name: 0,
            'marks': 0
        }
        final_score = final_score + result1['marks']
        result_dict['meta_robots'] = result1
        print('except worked')

    # This is for row 5 (html lang)
    name = 'html_lang'
    length_var_name = 'html_lang'
    try:
        meta_tag = soup.find("html", {"lang": True})
        lang_text = meta_tag['lang']

        result = {
            'name': name,
            'message':
            "Félicitations. Vous avez spécifié une langue à votre page.",
            length_var_name: lang_text,
            'marks': 3
        }
        final_score = final_score + result['marks']
        result_dict['html_lang'] = result
        print('try worked1')
    except:
        result1 = {
            'name': name,
            'message': '''
            Vous devriez spécifier une langue pour votre site, les moteurs de recherches ne comprennent pas quand un site dispose de plusieurs langues par exemple ayant des mots techniques en anglais et un contenu texte en français. Il faut donc bien spécifier la langue.
            ''',
            length_var_name: 0,
            'marks': 0
        }
        final_score = final_score + result1['marks']
        result_dict['html_lang'] = result1
        print('except worked')

    # This is for row 6 (sitemap)
    url = url.strip()
    sitemap_url = url + '/sitemap.xml'
    print("Sitemap url ", sitemap_url)
    try:

        code = requests.get(sitemap_url, headers=headers).status_code

        name = 'sitemap'

        if code == 200:
            result = {
                'name': name,
                'message':
                "Félicitations, votre site dispose d’un fichier sitemap",
                'marks': 2
            }
            final_score = final_score + result['marks']
            result_dict['sitemap'] = result

        else:
            result = {
                'name': name,
                'message':
                "Votre site Web ne dispose pas d'un fichier sitemap. Les sitemaps peuvent aider les robots à indexer votre contenu de manière plus complète et plus rapide. ",
                'marks': 0
            }
            final_score = final_score + result['marks']
            result_dict['sitemap'] = result
    except:
        result = {
            'name': name,
            'message':
            "Votre site Web ne dispose pas d'un fichier sitemap. Les sitemaps peuvent aider les robots à indexer votre contenu de manière plus complète et plus rapide. ",
            'marks': 0
        }
        final_score = final_score + result['marks']
        result_dict['sitemap'] = result

    # This is for row 7 (google Analytics)
    searched_word = 'google-analytics'

    name = 'google_analytics'
    if searched_word in str(soup):
        print("Google analytics found")
        result = {
            'name': name,
            'message':
            "Félicitations, votre site dispose de l'outil Google Analytics",
            'marks': 2
        }
        final_score = final_score + result['marks']
        result_dict['google_analytics'] = result

    else:
        result = {
            'name': name,
            'message':
            "Votre site ne dispose pas de l'outil Google Analytics.",
            'marks': 0
        }
        final_score = final_score + result['marks']
        result_dict['google_analytics'] = result

    # This is for row 8 (page_cache)
    name = 'page_cache'
    length_var_name = 'page_cache_desc'
    try:
        meta_tag = soup.find("meta", {"http-equiv": "Cache-control"})
        lang_text = meta_tag['content']

        result = {
            'name': name,
            'message':
            "Vous avez activé le cache sur votre page, c'est très bien.",
            length_var_name: lang_text,
            'marks': 3
        }
        final_score = final_score + result['marks']
        result_dict['page_cache'] = result
        print('try worked1')
    except:
        result1 = {
            'name': name,
            'message':
            "Vous n'avez pas activé la mise en cache sur vos pages. La mise en cache permet un chargement plus rapide des pages.",
            length_var_name: 0,
            'marks': 0
        }
        final_score = final_score + result1['marks']
        result_dict['page_cache'] = result1
        print('except worked')

    # API_KEY = AIzaSyD_RLUOcTN1JAq8PL8zJ79X6-kmHIDy_uM
    # This is for row 9 (Google safe browsing api)

    api_key = 'AIzaSyCVylpWnsOwzUoeTGg7akZRod-4YbhXoPU'
    sbl = SafeBrowsingList(api_key)
    bl = sbl.lookup_url(url)

    name = 'google_safe_browsing'
    print("google_safe_browsing ", url)
    if bl is None:
        print("Website is safe")
        result = {
            'name': name,
            'message': "Votre site est considéré comme sécurisé.",
            'marks': 2
        }
        final_score = final_score + result['marks']
        result_dict['google_safe_browsing'] = result

    else:
        result = {
            'name': name,
            'message':
            "Votre site n'est pas considéré comme sécurisé. Google et les autres moteurs de recherche prennent en compte le niveau de sécurité de votre site pour garantir la sécurité des visiteurs.",
            'marks': 0,
            'threats': bl
        }
        final_score = final_score + result['marks']
        result_dict['google_safe_browsing'] = result

    # This is for row 10 (responsive website test)
    name = 'responsive_test'
    length_var_name = 'responsive_test_desc'
    try:
        meta_tag = soup.find("meta", {"name": "viewport"})
        lang_text = meta_tag['content']

        result = {
            'name': name,
            'message': "Félicitations. Votre site est responsive.",
            length_var_name: lang_text,
            'marks': 4
        }
        final_score = final_score + result['marks']
        result_dict['responsive_test'] = result
        print('try worked1')
    except:
        result1 = {
            'name': name,
            'message': '''
            Nous n'avons pas détécté que votre site internet était responsive, soit adapté au mobile. Google prend énormément en compte ce critère pour un bon référencement.
            ''',
            length_var_name: 0,
            'marks': 0
        }
        final_score = final_score + result1['marks']
        result_dict['responsive_test'] = result1
        print('except worked')

    # Html page size

    # mobile_friendliness_test
    print("mobile friendly ", url)
    data = {
        "url": url,
        "requestScreenshot": True,
    }

    r1 = requests.post(
        'https://searchconsole.googleapis.com/v1/urlTestingTools/mobileFriendlyTest:run?key=AIzaSyDExRwe7TNEgHa_JLogOVjccqWNVoaH-EQ',
        data).json()

    # a = json.loads(r1.text)
    a = r1
    imgstring = a['screenshot']['data']
    if imgstring:
        print("image of mobile returned")
    else:
        print("image of mobile NOT returned")

    # import base64
    # imgdata = base64.b64decode(imgstring)
    # filename = 'some_image.jpg'  # I assume you have a way of picking unique filenames
    # with open(filename, 'wb') as f:
    #     f.write(imgdata)

    name = 'mobile_friendliness_test'

    if a['mobileFriendliness'] == 'MOBILE_FRIENDLY':
        print("Website is mobile friendly")
        result = {
            'name': name,
            'message': "Félicitations. Votre site est Mobile friendly.",
            'result': a['mobileFriendliness'],
            'img_string':
            'data:image/png;base64,' + urllib.parse.quote(imgstring),
            'marks': 4
        }
        final_score = final_score + result['marks']
        result_dict['mobile_friendliness_test'] = result

    else:
        result = {
            'name': name,
            'message':
            "Votre site n'est pas optimisé pour le mobile. Les moteurs de recherches donnent une très grande importance à la compatibilité mobile.",
            'marks': 0,
            'result': a['mobileFriendliness'],
            'img_string':
            'data:image/png;base64,' + urllib.parse.quote(imgstring)
        }
        final_score = final_score + result['marks']
        result_dict['mobile_friendliness_test'] = result

    # except:
    #         result = {
    #             'name':name,
    #             'message':"Votre site n'est pas optimisé pour le mobile. Les moteurs de recherches donnent une très grande importance à la compatibilité mobile.",
    #             'marks':0,
    #             'result': "Not Mobile Friendly"
    #         }
    #         final_score = final_score + result['marks']
    #         result_dict['mobile_friendliness_test'] = result
    #     #  "mobileFriendlyIssues": [
    # #   {
    # #    "rule": "TAP_TARGETS_TOO_CLOSE"
    # #   },
    # #   {
    # #    "rule": "USE_LEGIBLE_FONT_SIZES"
    # #   },
    # #   {
    # #    "rule": "CONFIGURE_VIEWPORT"
    # #   }
    # #  ],

    # # google page speed
    # print("Google page speed ",url)
    # r2 = requests.get('https://www.googleapis.com/pagespeedonline/v5/runPagespeed?url={}?key=AIzaSyAXf3ILJpeIs1nfDvvmLk0MsQDsuIsG5gM'.format(url))
    # b = json.loads(r2.text)
    # name = "page_speed"

    # # speed_index =  b['lighthouse']['audits']['speed-index']['.displayValue']
    # #print("this is speed index",speed_index)

    # # final_report.append({
    # #     "google_page_speed_data":b
    # # })
    # result_dict['page_speed'] = b

    # This is for row 13 (img alt attribute)
    name = 'img_alt'
    img_tags = soup.findAll("img")

    no_alt = []
    empty_alt = []
    alt_ok = []
    empty_check = []

    name = "img_alt"

    for img_tag in img_tags:
        try:
            if not img_tag['alt'].strip():
                empty_alt.append(img_tag['src'])
            elif img_tag['alt'].strip():
                alt_ok.append(img_tag['src'])
        except:
            no_alt.append(img_tag)

    total_alt_num = len(empty_alt) + len(alt_ok)

    img_alt_result = {
        'name': name,
        'message': '',
        'marks': '',
        'no_alt': no_alt,
        'empty_alt': empty_alt
    }

    if len(img_tags) == len(alt_ok):
        img_alt_result[
            'message'] = 'Félicitations. Toutes vos images disposent de balises alt attributs'
        img_alt_result['marks'] = 3
        print("every image tag contains alt and all have values")

    elif empty_alt and len(img_tags) == total_alt_num:
        img_alt_result[
            'message'] = 'Certaines de vos images manquent de balises alt attributs. Voir la liste complète'
        img_alt_result['marks'] = 1
        print("Every img have alt tag but some have empty alt")

    elif len(img_tags) == len(no_alt):
        img_alt_result[
            'message'] = "Aucune de vos images n'a de balises alt attributs, elles sont essentielles pour permettre aux moteurs de recherche de comprendre ce que représente votre image."
        img_alt_result['marks'] = 0
        print("No images have alt tag")

    if no_alt:
        img_alt_result[
            'message'] = "Aucune de vos images n'a de balises alt attributs, elles sont essentielles pour permettre aux moteurs de recherche de comprendre ce que représente votre image."
        img_alt_result['marks'] = 0
        print("Some images have no  alt tag")

    final_score = final_score + img_alt_result['marks']
    result_dict['img_alt'] = img_alt_result

    # This is for row 14 (favicon test)
    name = 'favicon_test'
    length_var_name = 'favicon_link'

    favicon_list = []
    link_tags = soup.findAll("link")
    for link in link_tags:
        if "favicon" in link['href']:
            favicon_list.append(link['href'])
    if favicon_list:

        result = {
            'name': name,
            'message': "Félicitations. Votre site dispose d'une favicon.",
            length_var_name: favicon_list,
            'marks': 1
        }
        final_score = final_score + result['marks']
        result_dict['favicon_test'] = result
        print('if worked1')
    else:
        result1 = {
            'name': name,
            'message':
            "Votre site ne dispose pas de favicon. La favicon est la petite icone qui apparait en haut du navigateur à côté du titre de votre site. Au delà de l'aspect SEO, elle permet de donner une identité visuelle à votre site.",
            'marks': 0
        }
        final_score = final_score + result1['marks']
        result_dict['favicon_test'] = result1
        print('else worked')

    # This is for strong tag test
    name = 'strong_tag'
    length_var_name = 'strong_text'
    try:
        strong_tags = soup.findAll("strong")

        if strong_tags:
            result = {
                'name': name,
                'message':
                'Félicitations. Vous avez spécifié des balises strong dans votre texte',
                length_var_name: strong_tags,
                'marks': 2
            }
        else:
            result = {
                'name': name,
                'message':
                " Vous n'avez spécifié aucune balise strong dans votre texte. Les balises strong permettent aux moteurs de recherche de savoir quel contenu est intéressant et pertinent dans votre texte.",
                'marks': 0
            }
        final_score = final_score + result['marks']
        result_dict['strong_tag'] = result
        print('try worked1')
    except:
        result1 = {
            'name': name,
            'message':
            " Vous n'avez spécifié aucune balise strong dans votre texte. Les balises strong permettent aux moteurs de recherche de savoir quel contenu est intéressant et pertinent dans votre texte.",
            'marks': 0
        }
        final_score = final_score + result1['marks']
        result_dict['strong_tag'] = result1
        print('except worked')

    # This is for Microdata test (itemscope , itemtype)
    name = 'micro_data_test'
    try:
        # raise when no itemscope/itemtype attribute is found, so the except branch reports it
        if not (soup.find(True, {'itemscope': True})
                or soup.find(True, {'itemtype': True})):
            raise ValueError("no Schema.org microdata found")

        result = {
            'name': name,
            'message':
            "Félicitations. Votre site utilise des Microdonnées Schema.org",
            'marks': 3
        }
        final_score = final_score + result['marks']
        result_dict['micro_data_test'] = result
        print('try worked1')
    except:
        result1 = {
            'name': name,
            'message': '''
            Vos visiteurs aiment les beadcrumbs, mais Google aussi. Les beadcrumbs donnent à Google un autre moyen de comprendre la structure de votre site Web. Toutefois, comme indiqué précédemment, Google peut également utiliser vos beadcrumbs dans les résultats de recherche, ce qui rend votre résultat beaucoup plus attrayant pour les utilisateurs.
            ''',
            'marks': 0
        }
        final_score = final_score + result1['marks']
        result_dict['micro_data_test'] = result1
        print('except worked')

    # This is for AMP Version
    name = 'amp_html_test'
    try:
        tag = soup.find('link', {'rel': "amphtml"})

        result = {
            'name': name,
            'message': " Félicitations. Votre site dispose d'une version AMP",
            'amp_html_link': tag['href'],
            'marks': 3
        }
        final_score = final_score + result['marks']
        result_dict['amp_html_test'] = result
        print('try worked1')
    except:
        result1 = {
            'name': name,
            'message':
            '''L’objectif est que les pages AMP s’affichent presque de façon instantannée, c’est-à-dire généralement 90% plus rapidement que d’habitude.
    Grâce à cette grande vitesse, l’expérience utilisateur sur mobile se trouve largement améliorée, ce qui d’après des études fait chuter le taux de rebo
    ''',
            'marks': 0
        }
        final_score = final_score + result1['marks']
        result_dict['amp_html_test'] = result1
        print('except worked')

    # This is for Breadcrumps
    searched_word = 'breadcrumb'

    name = 'breadcrumb'
    if searched_word in str(soup).lower():
        print("Breadcrum found")
        result = {
            'name': name,
            'message':
            "Félicitations, nous avons détécté l'utilisation de beadcrumbs sur votre site.",
            'marks': 2
        }
        final_score = final_score + result['marks']
        result_dict['breadcrumb'] = result

    else:
        result = {
            'name': name,
            'message':
            "Nous n'avons pas détécté de Beadcrumb sur votre site. Les Beadcrumbs sont une partie importante de presque tous les bons sites Web. Ces petites aides à la navigation ne permettent pas seulement aux internautes de savoir où elles se trouvent sur votre site, elles aident également Google à déterminer la structure de votre site.",
            'marks': 0
        }
        final_score = final_score + result['marks']
        result_dict['breadcrumb'] = result

    # Open graph Test
    name = 'open_graph_test'
    open_graph_tags = []

    og_tags = soup.findAll('meta', {"property": True})
    for og in og_tags:
        if "og" in str(og):
            open_graph_tags.append(og['property'])

    result = {
        'name': name,
        'message': "",
        'marks': "",
        'og_tags': open_graph_tags
    }

    if open_graph_tags:
        result[
            'message'] = 'Félicitations nous avons détécté des balises Open Graph.'
        result['marks'] = 1
        print("If worked")
    else:
        result['message'] = '''
        Les balises méta Open Graph sont conçues pour communiquer des informations sur votre site Web aux réseaux sociaux lorsque des liens vers votre site Web sont partagés. Ces balises vous permettent de créer des titres, des descriptions et des images personnalisés à utiliser lorsque vos pages sont partagées sur Facebook, LinkedIn et Google+.

    Ainsi, tout comme lorsque Google ou un autre moteur de recherche visite votre site et recherche les données (ou balises) appropriées afin d'afficher correctement votre site Web dans les résultats de recherche, les réseaux sociaux agissent de la même manière. La seule différence est que les réseaux sociaux recherchent ces tags spécifiques Open Graph (ou tags Twitter).
        '''
        result['marks'] = 0
        print("else worked")
    result_dict['open_graph_test'] = result
    final_score = final_score + result['marks']

    # Twitter Test
    name = 'twitter_test'
    twitter_tags = []

    og_tags = soup.findAll('meta', {"property": True})
    for og in og_tags:
        if "twitter" in str(og):
            twitter_tags.append(og['property'])

    result = {
        'name': name,
        'message': "",
        'marks': "",
        'og_tags': twitter_tags
    }

    if twitter_tags:
        result['message'] = ' Parfait. Vous avez spécifié des Twitter Cards'
        result['marks'] = 2
        print("If worked")
    else:
        result[
            'message'] = "Twitter via les twitter Cards vous permet d'identifier l'auteur de la publication / de la page ainsi que l'éditeur, qui est généralement le nom du site Web. Ces deux valeurs ne sont pas obligatoires, mais permettent d’ajouter des données supplémentaires à ceux qui souhaiteraient l’ajouter."
        result['marks'] = 0
        print("else worked")
    result_dict['twitter_test'] = result
    final_score = final_score + result['marks']

    # This is for Social Media test
    fb = 'facebook.com'
    linkedin = 'linkedin.com'
    twitter = 'twitter.com'

    name = 'social_media_test'
    social_sites_found = []
    if fb in str(soup):
        social_sites_found.append('facebook')
        print("facebook.com found")
    if linkedin in str(soup):
        social_sites_found.append('linkedin')
        print("linkedin.com found")
    if twitter in str(soup):
        social_sites_found.append('twitter')
        print("twitter.com found")

    result = {
        'name': name,
        'message': "",
        'marks': '',
        'social_sites_found': social_sites_found
    }
    if social_sites_found:
        result[
            'message'] = 'Nous avons détécté une liaison vers les réseaux sociaux sur votre site.'
        result['marks'] = 2

    else:
        result[
            'message'] = " Nous n'avons pas détécté de lien vers vos réseaux sociaux sur votre site. Même si ça n'impacte pas grandement votre SEO, avoir des liens vers les réseaux sociaux de sa marque est plus agréable et utile pour les utilisateurs."
        result['marks'] = 0

    final_score = final_score + result['marks']
    result_dict['social_media_test'] = result

    # for H1/h2/h3
    h_tags = []
    for i in range(1, 6):
        h_tag = soup.find_all('h' + str(i))
        result = {"tag": 'h' + str(i), "total_num": len(h_tag)}
        h_tags.append(result)

    result = {
        "name": "heading_tags_test",
        "message": "",
        "marks": "",
        "total_num_tags": h_tags
    }
    # compare actual tag counts (the per-tag dicts are always truthy)
    if h_tags[0]['total_num'] and h_tags[1]['total_num'] and h_tags[2]['total_num']:
        result['message'] = "Félicitations vos en-têtes sont structurées"
        result['marks'] = 3

    elif any(h['total_num'] for h in h_tags):
        result[
            'message'] = "Vos en-têtes ne sont pas structurés, il faut d'abord spécifier des en-têtes H1 puis H2 puis H3 etc.."
        result['marks'] = 1

    else:
        result[
            'message'] = "Vous n'avez pas spécifié d'en têtes, c'est un élément essentiel du SEO, ça permet aux moteurs de recherche de savoir de quoi le chapitre ou la section va discuter."
        result['marks'] = 0

    final_score = final_score + result['marks']
    result_dict['heading_tags_test'] = result

    # This is for page characters
    name = 'page_characters'
    text = ""
    try:
        tags1 = soup.findAll('p')
        tags2 = soup.findAll('h1')
        tags3 = soup.findAll('h2')
        tags4 = soup.findAll('h3')
        tags5 = soup.findAll('h4')
        tags6 = soup.findAll('h5')
        tags = tags1 + tags2 + tags3 + tags4 + tags5 + tags6
        text = ""
        for tag in tags:
            text = text + tag.text

        num_words = len(text.split(' '))

        result = {
            'name': name,
            'message': "",
            'marks': "",
            'num_words': num_words
        }

        if num_words > 300:
            result[
                'message'] = "Félicitations, la quantité de texte est supérieur à 300 mots."
            result['marks'] = 5
        else:
            result[
                'message'] = "La quantité de texte est insuffisante, il faut que vos pages contiennent plus de 300 mots pour que le contenu soit intéressant pour les moteurs de recherche."
            result['marks'] = 0

        print('try worked1')
    except:
        result = {
            'name': name,
            'message': '''
            
    La quantité de texte est insuffisante, il faut que vos pages contiennent plus de 300 mots pour que le contenu soit intéressant pour les moteurs de recherche.
    ''',
            'marks': 0
        }

        print('except worked')
    final_score = final_score + result['marks']
    result_dict['page_characters'] = result

    # page = requests.get(url,headers=headers).text

    # collecting all urls in website

    domain = tldextract.extract(url).domain
    suffix = tldextract.extract(url).suffix
    subdomain = tldextract.extract(url).subdomain
    pattern = '<a [^>]*href=[\'|"](.*?)[\'"].*?>'

    link_levels = []
    found_links = re.findall(pattern, r.text)
    links = []
    external_links = []

    web = domain + '.' + suffix

    for link in found_links:
        if url not in link and "." not in link and "#" not in link:
            links.append(url + link)

        elif url not in link and "#" not in link and web not in link:
            external_links.append(link)

    links = list(dict.fromkeys(links))

    # keywords in URL test &&  levels in url

    keywords_in_url = []
    directories_in_url = []

    for url in links:
        if 'https' in url:
            if subdomain:
                url1 = "https://" + subdomain + '.' + domain + '.' + suffix
            else:
                url1 = "https://" + domain + '.' + suffix

        elif 'http' in url:
            if subdomain:
                url1 = "http://" + subdomain + '.' + domain + '.' + suffix
            else:
                url1 = "http://" + domain + '.' + suffix

        a = url
        t = set(url1.split('/'))
        p = set(a.split('/'))
        e = p - t
        keywords = list(e)

        if keywords:
            for item in keywords:
                keywords_in_url.append(item)
            directories_in_url.append(len(keywords))

            keywords_in_url = list(dict.fromkeys(keywords_in_url))
        else:
            pass

    result = {
        "name": "keywords_in_url",
        "keywords": keywords_in_url,
        "message": "",
        "marks": ''
    }
    if keywords_in_url:
        result[
            'message'] = "Vos urls disposent de keywords, Veuillez vérifier qu'elles correspondent bien à ce que vous voulez mettre en avant sur votre site."
        result['marks'] = 1
    else:
        result['message'] = "Vos urls ne semblent pas avoir de keywords."
        result['marks'] = 0

    result_dict['keywords_in_url'] = result
    final_score = final_score + result['marks']

    if directories_in_url:
        directories = max(directories_in_url)
    else:
        directories = 0
    result = {
        "name": "directories_in_url",
        "directories": directories,
        "message": "",
        "marks": ''
    }
    if directories < 5:
        result[
            'message'] = "Félicitations, votre URL est composée de moins de 5 dossiers"
        result['marks'] = 2
    else:
        result[
            'message'] = "Vos url sont composées de plus de 5 dossiers, veuillez en diminuer le nombre"
        result['marks'] = 0

    result_dict['directories_in_url'] = result
    final_score = final_score + result['marks']

    # # broken_link test
    # broken_links = []
    # all_links = links + external_links
    # for link in all_links:
    #     try:
    #         print("Checking link health of ",link)
    #         r1 = requests.get(url,headers = headers)
    #     except:
    #         broken_links.append(link)
    # result = {
    #     "name":"broken_links_test",
    #     "message":"",
    #     "marks":'',
    #     "broken_links":broken_links
    # }
    # if broken_links:
    #     result['message'] = "Nous avons détécté un ou plusieurs liens qui ne fonctionnent plus sur votre site internet. Voir la liste complète"
    #     result['marks'] = 0
    # else:
    #     result['message'] = "Félicitations, vous n'avez pas de brokenlinks."
    #     result['marks'] = 3

    # final_score = final_score + result['marks']
    # result_dict['broken_links_test'] = result

    # external links test
    result = {
        "name": "external_links_test",
        "message": "",
        "marks": '',
        "external_links": external_links
    }
    if external_links:
        result[
            'message'] = "Félicitations, vous avez plusieurs external links. Voir la liste complète"
        result['marks'] = 9
    else:
        result[
            'message'] = "Nous n'avons pas détécté de external links pour votre site internet. Les liens retour (external internal links) de qualité, sont primordiaux pour une bon référencement."
        result['marks'] = 0

    final_score = final_score + result['marks']
    result_dict['external_links_test'] = result

    #word cloud
    if text:
        cloud = WordCloud(background_color="white").generate(text)
        plt.imshow(cloud)
        plt.axis('off')

        image = io.BytesIO()
        plt.savefig(image, format='png')
        image.seek(0)  # rewind the data
        string = base64.b64encode(image.read())

        image_64 = 'data:image/png;base64,' + urllib.parse.quote(string)
        result = {
            "name": "word_cloud",
            "img": image_64,
            "message": "Nuage des mots les plus présents sur votre page"
        }
        result_dict['word_cloud'] = result
    else:
        result = {
            "name": "word_cloud",
            "img": "",
            "message": "Aucun contenu texte n'a été détécté"
        }
        result_dict['word_cloud'] = result

    # Internal links test
    result = {
        "name": "internal_links_test",
        "message": "",
        "marks": '',
        "internal_links": links
    }
    if links:
        result[
            'message'] = "Félicitations. Nous avons détécté l'utilisation de liens internes sur votre page."
        result['marks'] = 4
    else:
        result[
            'message'] = "Nous n'avons pas détécté de liens internes sur votre page. En plus de faire la liaison entre vos différentes pages, les liens internes permettent de mieux guider les robots Google et mettent en évidence le lien entre vos différentes pages."
        result['marks'] = 0

    final_score = final_score + result['marks']
    result_dict['internal_links_test'] = result

    test_count = {"test_passed": "", "test_failed": "", "without_marks": ""}
    passed = 0
    failed = 0
    without_marks = 0
    for k, v in result_dict.items():
        try:
            if v['marks'] == 0:
                failed = failed + 1
            elif v['marks'] > 0:
                passed = passed + 1
            else:
                pass
        except:
            without_marks = without_marks + 1

    test_count['test_passed'] = passed
    test_count['test_failed'] = failed
    test_count['without_marks'] = without_marks
    result_dict['test_count'] = test_count

    return (final_score, result_dict)
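
A hedged usage sketch for the audit function above, assuming its module already provides everything the function uses implicitly (requests, BeautifulSoup, tldextract, SafeBrowsingList, WordCloud, result_dict and so on):

# Illustrative call only; the report keys come from the function above.
final_score, report = scrapper("example.com")
print("SEO score:", final_score)
print("Tests passed:", report['test_count']['test_passed'])
print("Title check:", report['title']['message'])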
Beispiel #46
0
def count_symbols_tld(address):
    """Count occurrences of symbols in domain"""
    count = count_symbols(tldextract.extract(address).domain)
    return count
Beispiel #47
0
 def _get_tld_extract(url: str) -> tldextract.tldextract.ExtractResult:
     extract_result = tldextract.extract(url)
     return extract_result
Beispiel #48
0
def domain_length(address):
    """Extracts domain from URL and count characters"""
    domain = tldextract.extract(address)
    return len(domain.domain)
Beispiel #49
0
import time
import re
import tldextract
import socket
import smtplib
import dns
import dns.resolver
from selenium import webdriver
from Check_Valid_Email import Main

# ext=tldextract.extract("https://acviss")
# print("0",ext[0])
# print("1",ext[1])
# print(ext[2])

search_term = input("Enter the domain name in this format[domain.com]:-")
ext = tldextract.extract(search_term)
url = f"https://www.linkedin.com/search/results/people/?keywords={ext[1]}&origin=GLOBAL_SEARCH_HEADER"
login_url = "https://www.linkedin.com/uas/login?session_redirect=https%3A%2F%2Fwww%2Elinkedin%2Ecom%2Ffeed%2F&fromSignIn=true&trk=cold_join_sign_in"

# if you want to run linkedin on background then this option will be helpful
# options = Options()
# options.headless = True
# driver = webdriver.Chrome(options=options, executable_path=r"/usr/local/bin/chromedriver")
# print("headless")

driver = webdriver.Chrome("/usr/local/bin/chromedriver")

driver.get(login_url)
time.sleep(5)
# logging in using username and password
emailid = input("Enter the email:")
Beispiel #50
0
            res.update({"visit_id": visit_id})
    visited_site_visits_sites.append(site)

################################################################################
# Now we have an object containing all necessary information, saved in result array
#################################################################################

# add Content-Length
for resObject in result:
    content_length = 0
    first_party_content_length = 0
    third_party_content_length = 0
    advertisements_content_length = 0

    if resObject['index'] <= display_index:
        ext = tldextract.extract(resObject["visited_site"])
        visited_tld = ext.domain

        for header, url in cur.execute(
                "SELECT headers, url"
                " FROM http_responses"
                " WHERE visit_id = ?"
                " AND crawl_id = ?",
            [resObject["visit_id"], resObject["crawl_id"]]):
            if "Content-Length" in header:
                current_length = header.index("Content-Length")
                content_length = content_length + current_length
                if "http" in url:
                    if rules.should_block(url) is True:
                        advertisements_content_length = advertisements_content_length + current_length
                    xt = tldextract.extract(url)
Beispiel #51
0
def dns_extractor(target):
	dnsr = dns.resolver

	# tldextract strips any 'http://' or 'https://' scheme itself, so the
	# URL and bare-domain cases reduce to the same lookup code.
	ext = tldextract.extract(target)
	domain = ext.domain
	suffix = ext.suffix

	target = domain + '.' + suffix

	try:
		print()
		ns = dnsr.query(target, 'NS')
		for rs in ns:
			print(bold(green('NS records: ')) + str(rs))
	except dns.exception.DNSException:
		print(bad('Query failed > NS records.'))

	try:
		print()
		a = dnsr.query(target, 'A')
		for rs in a:
			print(bold(green('A records: ')) + str(rs))
	except dns.exception.DNSException:
		print(bad('Query failed > A records.'))

	try:
		print()
		mx = dnsr.query(target, 'MX')
		for rs in mx:
			print(bold(green('MX records: ')) + str(rs))
	except dns.exception.DNSException:
		print(bad('Query failed > MX records.'))

	try:
		print()
		txt = dnsr.query(target, 'TXT')
		for spf in txt:
			print(bold(green('SPF records: ')) + str(spf))
	except dns.exception.DNSException:
		print(bad('Query failed > SPF records.'))
Beispiel #52
0
 def get_parent(self, response):
     site = tldextract.extract(response.url).domain
     if site:
         site = match_site(site)
     return site
Beispiel #53
0
    def generate_targets(self, env, targets, collections_by_id):
        # Setup specific template:
        template = env.get_template('site-target-template.md')

        # Export targets
        for target in targets:
            # Skip blocked items:
            if target['field_crawl_frequency'] == 'NEVERCRAWL':
                logger.warning("The Target '%s' is blocked (NEVERCRAWL)." %
                               target['title'])
                self.blocked_record_count += 1
                # FIXME SHOULD DELETE THE FILE IF IT EXISTS!
                continue
            # Skip items that have no crawl permission?
            # hasOpenAccessLicense == False, and inScopeForLegalDeposit == False ?
            # Skip items with no URLs:
            if len(target['fieldUrls']) == 0:
                logger.warning("The Target '%s' has no URLs!" %
                               target['title'])
                # FIXME SHOULD DELETE THE FILE IF IT EXISTS!
                continue
            # Skip hidden targets:
            if target['field_hidden']:
                logger.warning("The Target '%s' is hidden!" % target['title'])
                # FIXME SHOULD DELETE THE FILE IF IT EXISTS!
                continue
            # Get the ID, WCT ID preferred:
            tid = target['id']
            if target.get('field_wct_id', None):
                tid = target['field_wct_id']
            # Get the url, use the first:
            url = target['fieldUrls'][0]['url']
            # Extract the domain:
            parsed_url = tldextract.extract(url)
            publisher = parsed_url.registered_domain
            # Lookup in CDX:
            #wayback_date_str = CdxIndex().get_first_capture_date(url) # Get date in '20130401120000' form.
            #if wayback_date_str is None:
            #    logger.warning("The URL '%s' is not yet available, inScopeForLegalDeposit = %s" % (url, target['inScopeForLegalDeposit']))
            #    self.missing_record_count += 1
            #    continue
            start_date = self.get_target_start_date(target)
            wayback_date = datetime.datetime.strptime(start_date,
                                                      '%Y-%m-%dT%H:%M:%SZ')
            wayback_date_str = wayback_date.strftime('%Y%m%dT%H%M%S')
            first_date = wayback_date.isoformat()
            record_id = "%s/%s" % (
                wayback_date_str,
                base64.b64encode(hashlib.md5(url.encode('utf-8')).digest()))

            # Honour embargo
            #ago = datetime.datetime.now() - wayback_date
            #if ago.days <= 7:
            #    self.embargoed_record_count += 1
            #    continue

            # Strip out Windows newlines
            if 'description' in target and target['description'] != None:
                target['description'] = target['description'].replace(
                    '\r\n', '\n')

            # Otherwise, build the record:
            rec = {
                'slug': tid,
                'id': target['id'],
                'wct_id': target.get('field_wct_id', None),
                'record_id': record_id,
                'date': first_date,
                'target_url': url,
                'title': target['title'],
                'publisher': publisher,
                'start_date': target['crawlStartDateISO'],
                'end_date': target['crawlEndDateISO'],
                'open_access': target['hasOpenAccessLicense'],
                'npld': target['inScopeForLegalDeposit'],
                'scope': target['field_scope'],
                'nominating_organisation': target.get('nominating_organisation', {}).get('title', None),
                'collections': [],
                'subjects': []
            }

            # Add any collection:
            for col_id in target['collectionIds']:
                col = collections_by_id.get(int(col_id), {})
                if 'name' in col:
                    rec['collections'].append({
                        'id': col['id'],
                        'name': col['name']
                    })

            # For subjects
            for sub_id in target['subjectIds']:
                pass
                #col = subjects.get(int(target['collectionIds'][0]), {})
                #if 'name' in col:
                #    rec['collections'].append({
                #        'id': col['id'],
                #        'name': col['name']
                #    })

            # And the organisation:
            if 'nominating_organisation' in target and target[
                    'nominating_organisation'] != None:
                rec['organisation'] = {
                    'id': target['nominating_organisation']['id'],
                    'name': target['nominating_organisation']['title'],
                    'abbreviation': target['nominating_organisation']['field_abbreviation']
                }

            # And write:
            file_path = self.get_target_file_path(target)
            target['file_path'] = file_path
            target_md = luigi.LocalTarget(
                "/Users/andy/Documents/workspace/ukwa-site/content/target/%s/index.en.md"
                % file_path)
            with target_md.open('w') as f:
                for part in template.generate({
                        "record": rec,
                        "json": json.dumps(rec, indent=2),
                        "description": target['description']
                }):
                    f.write(part.encode("utf-8"))
Beispiel #54
0
def domain_is_in_blacklist(url):
    domain = tldextract.extract(url).domain
    return domain in domain_blacklist
Beispiel #55
0
def extract_domain(value):
	import tldextract
	uri = tldextract.extract(value).registered_domain
	return uri
Beispiel #56
0
def api_id():

    if 'url' in request.args:
        url = str(request.args['url'])
        bb = get_bias(url)
        bias = bb[0]
        bias_desc = bb[1]
        print(bias_desc)

        if (bias_desc == 'Left'):
            n = 1
        elif (bias_desc == 'Moderate Left'):
            n = 2
        elif (bias_desc == 'Neutral'):
            n = 3
        elif (bias_desc == 'Moderate Right'):
            n = 4
        else:
            n = 5
        """
        f = open('C:/Users/sahil/Desktop/Files/Coding/PalyHacks-DevilsAdvocate/DevilsAdvocate/svg-doughnut-chart-with-animation-and-tooltip/dist/vals.js', 'r')    # pass an appropriate path of the required file
        lines = f.readlines()
        lines[n-1] = str(int(lines[n].split(',')[0]) + 1) + ",\n"  # n is the line number you want to edit; subtract 1 as indexing of list starts from 0
        f.close()   # close the file and reopen in write mode to enable writing to file; you can also open in append mode and use "seek", but you will have some unwanted old data if the new data is shorter in length.

        f = open('C:/Users/sahil/Desktop/Files/Coding/PalyHacks-DevilsAdvocate/DevilsAdvocate/svg-doughnut-chart-with-animation-and-tooltip/dist/vals.js', 'w')
        f.writelines(lines)
        # do the remaining operations on the file
        f.close()
        """
        with open('vals.json', 'r') as f:
            vals = json.load(f)['vals']

        vals[n - 1] = vals[n - 1] + 1

        with open('vals.json', 'w') as f:
            json.dump({"vals": vals}, f)

        try:
            all_articles = similar_articles.get(url)["articles"]
        except:
            return jsonify({
                'bias': bias_desc,
                'similar_articles': {
                    "articles": []
                }
            })

        article_biases = []
        articles_biases_text = []

        for article in all_articles:
            print("article", article)
            b = get_bias_dict(article["url"])
            article_biases.append(b[0])
            articles_biases_text.append(b[1])
        article_biases = np.array(article_biases)

        if ("Right" in bias_desc):
            k = min(3, len(article_biases) - 1)
            bs = np.argpartition(article_biases, k)[:k]
        elif ("Left" in bias_desc):
            k = min(3, len(article_biases) - 1)
            bs = np.argpartition(-1 * article_biases, k)[:k]

        if ("Neutral" in bias_desc):
            articles = all_articles
        else:
            articles = []
            for b in bs:
                print(
                    tldextract.extract(
                        all_articles[b]["url"]).registered_domain,
                    tldextract.extract(url).registered_domain)
                if not (tldextract.extract(
                        all_articles[b]["url"]).registered_domain
                        == tldextract.extract(url).registered_domain):
                    articles.append(all_articles[b])

        return jsonify({
            'bias': bias_desc,
            'similar_articles': {
                "articles": articles
            }
        })

    else:
        return "Error: No text field provided."
Beispiel #57
0
        args.bro_log = os.path.expanduser(args.bro_log)

        # Create a VirusTotal Query Class
        vtq = vt_query.VTQuery()

        # See our 'Risky Domains' Notebook for the analysis and
        # statistical methods used to compute this risky set of TLDs
        risky_tlds = set([
            'info', 'tk', 'xyz', 'online', 'club', 'ru', 'website', 'in', 'ws',
            'top', 'site', 'work', 'biz', 'name', 'tech', 'loan', 'win', 'pro'
        ])

        # Run the bro reader on the dns.log file looking for risky TLDs
        reader = bro_log_reader.BroLogReader(args.bro_log, tail=True)
        for row in reader.readrows():

            # Pull out the TLD
            query = row['query']
            tld = tldextract.extract(query).suffix

            # Check if the TLD is in the risky group
            if tld in risky_tlds:
                # Make the query with the full query
                results = vtq.query_url(query)
                if results.get('positives', 0) > 1:  # At least two hits
                    print('\nRisky Domain DNS Query Found')
                    print('From: {:s} To: {:s} QType: {:s} RCode: {:s}'.format(
                        row['id.orig_h'], row['id.resp_h'], row['qtype_name'],
                        row['rcode_name']))
                    pprint(results)
Beispiel #58
0
def has_suffix(url):
    """Return whether the url has a suffix using tldextract."""
    return bool(tldextract.extract(url).suffix)
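
An illustrative check of the suffix test above, assuming default tldextract behaviour: hosts without a recognised public suffix (localhost, raw IP addresses) come back falsy.

import tldextract

for candidate in ("https://example.org", "http://localhost:8000", "127.0.0.1"):
    # bool(suffix) mirrors has_suffix() above
    print(candidate, bool(tldextract.extract(candidate).suffix))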
Beispiel #59
0
 def get_next_page_url(self, base, page):
     parsed_uri = tldextract.extract(base)
     scene_path = match_page_scenepath(parsed_uri.domain)
     return self.format_url(base, scene_path % page)
Beispiel #60
0
def extract(serp_url,
            parser=None,
            lower_case=True,
            trimmed=True,
            collapse_whitespace=True,
            use_naive_method=False):
    """
    Parse a SERP URL and return information regarding the engine name,
    keyword and :class:`SearchEngineParser`.

    :param serp_url:            Suspected SERP URL to extract a keyword from.
    :type serp_url:             ``str`` or :class:`urlparse.ParseResult`

    :param parser:              Optionally pass in a parser if already
                                determined via call to get_parser.
    :type parser:               :class:`SearchEngineParser`

    :param lower_case:          Lower case the keyword.
    :type lower_case:           ``True`` or ``False``

    :param trimmed:             Trim keyword leading and trailing whitespace.
    :type trimmed:              ``True`` or ``False``

    :param collapse_whitespace: Collapse 2 or more ``\s`` characters into one
                                space ``' '``.
    :type collapse_whitespace:  ``True`` or ``False``

    :param use_naive_method:    In the event that a parser doesn't exist for
                                the given ``serp_url``, attempt to find an
                                instance of ``_naive_re_pattern`` in the netloc
                                of the ``serp_url``.  If found, try to extract
                                a keyword using ``_naive_params``.
    :type use_naive_method:     ``True`` or ``False``

    :returns: an :class:`ExtractResult` instance if ``serp_url`` is valid,
              ``None`` otherwise
    """
    # Software should only work with Unicode strings internally, converting
    # to a particular encoding on output.
    url_parts = _unicode_urlparse(serp_url)
    if url_parts is None:
        return None

    result = None
    if parser is None:
        parser = get_parser(url_parts)

    if parser is None:
        if not use_naive_method:
            return None  # Tried to get keyword from non SERP URL

        # Try to use naive method of detection
        if _naive_re.search(url_parts.netloc):
            query = _unicode_parse_qs(url_parts.query, keep_blank_values=True)
            for param in _naive_params:
                if param in query:
                    import tldextract
                    tld_res = tldextract.extract(url_parts.netloc)
                    return ExtractResult(tld_res.domain, query[param][0], None)

        return None  # Naive method could not detect a keyword either

    result = parser.parse(url_parts)

    if result is None:
        return None

    if lower_case:
        result.keyword = result.keyword.lower()
    if trimmed:
        result.keyword = result.keyword.strip()
    if collapse_whitespace:
        result.keyword = re.sub(r'\s+', ' ', result.keyword, flags=re.UNICODE)

    return result
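
A hedged usage sketch for the SERP helper above; get_parser, ExtractResult and the individual engine parsers live elsewhere in the same module, so only the calling convention is shown and the example URL is an assumption.

# Illustrative only: extract() returns an ExtractResult or None.
result = extract('https://www.google.com/search?q=keyword+research')
if result is not None:
    print(result.keyword)   # lower-cased, trimmed, whitespace-collapsed
else:
    print('Not recognised as a SERP URL')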