def getDomainWebContent(self, domain):
    """
    Collect web-content fingerprints (nilsimsa hash, image hash, redirect
    count) for a single domain.

    @param domain: a Domain object containing the domain that you want
                   web_content for (must expose .value and .tld).
    @return a hashmap with keys "nilsimsa", "image", "redirects"; each
            value is -1 when that piece of content could not be fetched.
            Any exception raised along the way is appended to
            self.web_exceptions.
    """
    url = domain.value + '.' + domain.tld
    wpg = None
    try:
        wpg = WebPageInfoGetter(url)
        wpg.setUpGetter(url)
    except Exception as e:
        self.web_exceptions.append(e)
        wpg = None  # setup failed: getter is unusable

    # Default every result to the -1 sentinel; only attempt the content
    # calls when setup succeeded.  (Previously a failed setup left `wpg`
    # unbound, so each call below appended a misleading NameError to
    # self.web_exceptions instead of the real cause.)
    nilsimsa = image = redirects = -1
    if wpg is not None:
        try:
            nilsimsa = wpg.getNilsimsaHash(url, False)
        except Exception as e:
            self.web_exceptions.append(e)
        try:
            image = wpg.getImageHash(url, False)
        except Exception as e:
            self.web_exceptions.append(e)
        try:
            redirects = wpg.getNumberOfRedirects(url, False)
        except Exception as e:
            self.web_exceptions.append(e)
    return {"nilsimsa": nilsimsa, "image": image, "redirects": redirects}
def work(self, index, domain):
    """
    Worker entry point: each thread does its work here.

    Depending on self.web_content, gathers either web-content info
    (nilsimsa/image hashes, redirects) or whois info (creation date,
    privacy protection, parking) for `domain`, then stores a formatted
    summary string into self.window[index % self.window_size].

    @param index: this worker's index; used to pick the window slot and
                  to make the page-getter id unique per thread.
    @param domain: bare domain label (TLD is taken from self.aTLD).
    """
    url = domain + '.' + self.aTLD
    exceptions = []
    if self.web_content:
        # Target web content with this thread.
        wpg = None
        try:
            wpg = WebPageInfoGetter(url)
            wpg.id += str(index)  # unique getter id per thread
            wpg.setUpGetter(url)
        except Exception as e:
            exceptions.append(e)
            wpg = None

        # Only attempt content calls when setup succeeded; previously a
        # failed setup left `wpg` unbound and each call below recorded a
        # NameError instead of the real failure.
        nilsimsa = image = redirects = None
        if wpg is not None:
            try:
                nilsimsa = wpg.getNilsimsaHash(url, False)
            except Exception as e:
                exceptions.append(e)
            try:
                image = wpg.getImageHash(url, False)
            except Exception as e:
                exceptions.append(e)
            try:
                redirects = wpg.getNumberOfRedirects(url, False)
            except Exception as e:
                exceptions.append(e)
        info = "-Domain: {}\nNilsimsa: {}\nImageHash: {}\nRedirects: {}\nExceptions: {}\n".format(
            url, nilsimsa, image, redirects, exceptions)
        self.window[index % self.window_size] = info
    else:
        # Target only the whois content with this thread.
        whois_parser = None
        whois_server = None
        try:
            whois_parser = Whois_Parser()
            whois_server = whois_parser.server_info['.' + self.aTLD][0]
        except Exception as e:
            exceptions.append(e)
            whois_server = None

        # Same guard as above: skip the lookups when we never obtained a
        # whois server rather than cascading NameErrors.
        creation_date = privacy_prot = is_parking = None
        if whois_server is not None:
            try:
                creation_date = whois_parser.getCreationDate(url, whois_server)
            except Exception as e:
                exceptions.append(e)
            try:
                privacy_prot = whois_parser.isWhoisPrivacyProtected(url, whois_server)
            except Exception as e:
                exceptions.append(e)
            try:
                is_parking = whois_parser.isParking(url, whois_server)
            except Exception as e:
                exceptions.append(e)
        info = "-Domain: {}\nCreationDate: {}\nPrivacy: {}\nParking: {}\nExceptions: {}\n".format(
            url, creation_date, privacy_prot, is_parking, exceptions)
        self.window[index % self.window_size] = info
def _record_domain_info(self, a_domain, a_tld, a_file, switch=True):
    """
    Record all information for a domain, appending it to a data file.

    Gathers web-content fingerprints and whois details for `a_domain`,
    plus its generated candidate typos, and writes one record to
    `a_file`.  Failures are collected and written as "Exception:" lines
    rather than aborting the record.

    @param a_domain: fully-qualified domain name string.
    @param a_tld: the TLD (without the leading dot) used for the whois
                  server lookup.
    @param a_file: path of the data file to append to.
    @param switch: currently unused; kept for backward compatibility
                   with existing callers -- TODO confirm and remove.
    """
    exceptions = []
    domain_ctypos = self._generate_ctypos_for_domain(a_domain)

    # First we grab all the content we can by loading up the url.
    wpg = None
    try:
        wpg = WebPageInfoGetter(a_domain)
        wpg.setUpGetter(a_domain)
    except Exception as e:
        exceptions.append(e)
        wpg = None

    # Skip the content calls when the getter never came up; previously a
    # failed setup left `wpg` unbound and each call recorded a NameError
    # instead of the real cause.
    nilsimsa = image = redirects = None
    if wpg is not None:
        try:
            nilsimsa = wpg.getNilsimsaHash(a_domain, False)
        except Exception as e:
            exceptions.append(e)
        try:
            image = wpg.getImageHash(a_domain, False)
        except Exception as e:
            exceptions.append(e)
        try:
            redirects = wpg.getNumberOfRedirects(a_domain, False)
        except Exception as e:
            exceptions.append(e)

    # Next we grab all the whois content, guarded the same way.
    whois_parser = None
    whois_server = None
    try:
        whois_parser = Whois_Parser()
        whois_server = whois_parser.server_info['.' + a_tld][0]
    except Exception as e:
        exceptions.append(e)
        whois_server = None

    creation_date = privacy_prot = is_parking = None
    if whois_server is not None:
        try:
            creation_date = whois_parser.getCreationDate(a_domain, whois_server)
        except Exception as e:
            exceptions.append(e)
        try:
            privacy_prot = whois_parser.isWhoisPrivacyProtected(a_domain, whois_server)
        except Exception as e:
            exceptions.append(e)
        try:
            is_parking = whois_parser.isParking(a_domain, whois_server)
        except Exception as e:
            exceptions.append(e)

    # NOTE: Alexa top-list lookup (self.alexa_reader.isDomainInAlexaTop)
    # was disabled here; re-enable deliberately if needed.

    with open(a_file, "a") as data_fp:
        # Write out all of our data to the file.
        data_fp.write("-Domain: {}\n".format(a_domain))
        data_fp.write("NumberOfCandidates: {}\n".format(len(domain_ctypos)))
        data_fp.write("Candidates: {}\n".format(str(domain_ctypos)))
        data_fp.write("Nilsimsa: {}\n".format(nilsimsa))
        data_fp.write("ImageHash: {}\n".format(image))
        data_fp.write("Redirects: {}\n".format(redirects))
        data_fp.write("CreationDate: {}\n".format(creation_date))
        data_fp.write("Privacy: {}\n".format(privacy_prot))
        data_fp.write("Parking: {}\n".format(is_parking))
        for exception in exceptions:
            data_fp.write("Exception: {}\n".format(exception))
def navigateZoneFile(self, aGzipFile, aTLD="com",
                     data_dir="/home/engelsjo/Documents/Research/tld_file_parser/data"):
    """
    Navigate all the domains -- and their candidates -- in a zone file.

    For every domain in the "current" file, finds candidate typo domains
    that exist in the previous/current/next files in memory, gathers web
    and whois info, and appends one record per domain to
    {data_dir}/{aTLD}_data/{dataFileName}.data.

    @param aGzipFile: name of the gzipped zone file to process; the part
                      before the first '.' names the output data file.
    @param aTLD: TLD of the zone file (default "com").
    @param data_dir: base output directory.  Defaults to the historical
                     hard-coded location for backward compatibility.
    """
    dataFileName = aGzipFile.split('.')[0]
    # Load the appropriate (previous/current/next) files into memory.
    tld_files = self._loadCurrPrevAndNextFromFile(aGzipFile)
    out_path = "{}/{}_data/{}.data".format(data_dir, aTLD, dataFileName)

    for domain in tld_files["current"]:
        # STORE ALL INFORMATION FOR THE FILE.
        candidates = []
        exceptions = []
        # Generate typos for the domain in question, then keep each typo
        # that exists in one of the in-memory files -- a candidate.
        gtypos = self._generate_typos_inhash(domain.lower())
        for typo in gtypos:
            if self.isDomainCandidate(typo, tld_files["previous"],
                                      tld_files["current"],
                                      tld_files["next"]) and typo not in candidates:
                candidates.append(typo)

        url = domain + '.' + aTLD

        # Grab the web content; guard against a failed getter setup so we
        # don't record NameErrors from an unbound `wpg`.
        wpg = None
        try:
            wpg = WebPageInfoGetter(url)
            wpg.setUpGetter(url)
        except Exception as e:
            exceptions.append(e)
            wpg = None

        nilsimsa = image = redirects = None
        if wpg is not None:
            try:
                nilsimsa = wpg.getNilsimsaHash(url, False)
            except Exception as e:
                exceptions.append(e)
            try:
                image = wpg.getImageHash(url, False)
            except Exception as e:
                exceptions.append(e)
            try:
                redirects = wpg.getNumberOfRedirects(url, False)
            except Exception as e:
                exceptions.append(e)

        # Next we grab all the whois content, with the same guard.
        whois_parser = None
        whois_server = None
        try:
            whois_parser = Whois_Parser()
            whois_server = whois_parser.server_info['.' + aTLD][0]
        except Exception as e:
            exceptions.append(e)
            whois_server = None

        creation_date = privacy_prot = is_parking = None
        if whois_server is not None:
            try:
                creation_date = whois_parser.getCreationDate(url, whois_server)
            except Exception as e:
                exceptions.append(e)
            try:
                privacy_prot = whois_parser.isWhoisPrivacyProtected(url, whois_server)
            except Exception as e:
                exceptions.append(e)
            try:
                is_parking = whois_parser.isParking(url, whois_server)
            except Exception as e:
                exceptions.append(e)

        with open(out_path, "a") as data_fp:
            # Write out all of our data to the file.
            data_fp.write("-Domain: {}\n".format(url))
            data_fp.write("NumberOfCandidates: {}\n".format(len(candidates)))
            data_fp.write("Candidates: {}\n".format(str(candidates)))
            data_fp.write("Nilsimsa: {}\n".format(nilsimsa))
            data_fp.write("ImageHash: {}\n".format(image))
            data_fp.write("Redirects: {}\n".format(redirects))
            data_fp.write("CreationDate: {}\n".format(creation_date))
            data_fp.write("Privacy: {}\n".format(privacy_prot))
            data_fp.write("Parking: {}\n".format(is_parking))
            for exception in exceptions:
                data_fp.write("Exception: {}\n".format(exception))
    print("done with file")