from traceback import format_exc

from tqdm import tqdm

# NsLog, url_rules and active_rules are project-internal modules; their exact
# import paths are not shown in this snippet and are assumed to be available.


class rule_extraction:

    def __init__(self):
        self.logger = NsLog("log")
        self.url_rules_o = url_rules()
        self.active_rules_o = active_rules()

    def extraction(self, parsed_domains):
        self.logger.info("rule_extraction.extraction() is running")
        domain_features = []
        try:
            for line in tqdm(parsed_domains):  # self.bar(parsed_domains)
                info = line
                # info['mail'] = 'to be pulled from whois'

                # this is where the URL rules are applied
                nlp_info, url_features = self.url_rules_o.rules_main(
                    info['domain'], info['tld'], info['subdomain'],
                    info['path'], info['words_raw'])

                info['nlp_info'] = nlp_info
                info['nlp_info']['words_raw'] = info['words_raw']
                info.pop("words_raw", None)

                # domain_info, dns_features = self.dns_rules_o.rules_main(line_lst)  # dns rules

                outputDict = {}
                # info['dns_records'] = domain_info
                outputDict['info'] = info
                outputDict['url_features'] = url_features
                # outputDict['dns_features'] = dns_features

                domain_features.append(outputDict)

            # domain_features = self.active_rules_o.goog_safe_browsing(domain_features)  # this is where the active rules are applied
        except Exception:
            self.logger.error("Error : {0}".format(format_exc()))

        return domain_features
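# A minimal usage sketch, assuming the project-internal modules above are
# importable. The single record below is a made-up example; only the keys
# that extraction() actually reads are filled in.
if __name__ == "__main__":
    sample = [{
        'domain': 'example', 'tld': 'com', 'subdomain': 'login',
        'path': '/verify', 'words_raw': ['example', 'login', 'verify'],
    }]
    features = rule_extraction().extraction(sample)
    # each element has the shape {'info': {...}, 'url_features': {...}}
    print(len(features))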
import re

import tldextract
from tqdm import tqdm

# NsLog is a project-internal logger; its import path is assumed to be available.


class domain_parser(object):

    def __init__(self):
        self.logger = NsLog("log")

    def parse(self, domain_list, class_info, count):
        self.logger.info("domain_parser.parse() is running")
        parsed_domain_list = []
        registered_domain_lst = []

        for line in tqdm(domain_list):
            domain = {}
            line = line.strip().replace('"', "").replace("'", '')
            extracted_domain = tldextract.extract(line)
            registered_domain_lst.append(extracted_domain.registered_domain)

            domain['url'] = line
            domain['domain'] = extracted_domain.domain
            domain['registered_domain'] = extracted_domain.registered_domain
            domain['tld'] = extracted_domain.suffix
            domain['subdomain'] = extracted_domain.subdomain
            domain['class'] = class_info
            domain['id'] = count
            count = count + 1

            if line.find('://') == -1:
                domain['protocol'] = ''
            else:
                domain['protocol'] = line.split("://")[0]

            # parse from the first '/' after the TLD --> path
            tmp = line[line.find(extracted_domain.suffix):len(line)]
            pth = tmp.partition("/")
            domain['path'] = pth[1] + pth[2]

            domain['words_raw'] = self.words_raw_extraction(
                extracted_domain.domain, extracted_domain.subdomain, pth[2])

            parsed_domain_list.append(domain)

        return parsed_domain_list

    def parse_nonlabeled_samples(self, domain_list, count=0):
        self.logger.info("domain_parser.parse_nonlabeled_samples() is running")
        parsed_domain_list = []
        registered_domain_lst = []

        for line in tqdm(domain_list):
            domain = {}
            line = line.strip().replace('"', "").replace("'", '')
            extracted_domain = tldextract.extract(line)
            registered_domain_lst.append(extracted_domain.registered_domain)

            domain['url'] = line  # .strip()
            domain['domain'] = extracted_domain.domain
            domain['registered_domain'] = extracted_domain.registered_domain
            domain['tld'] = extracted_domain.suffix
            domain['subdomain'] = extracted_domain.subdomain
            domain['id'] = count
            count = count + 1

            if line.find('://') == -1:
                domain['protocol'] = ''
            else:
                domain['protocol'] = line.split("://")[0]

            # parse from the first '/' after the TLD --> path
            tmp = line[line.find(extracted_domain.suffix):len(line)]
            pth = tmp.partition("/")
            domain['path'] = pth[1] + pth[2]
            # domain['path'].append(pth[1] + pth[2])  # the path could be stored as a list to extract statistics later

            domain['words_raw'] = self.words_raw_extraction(
                extracted_domain.domain, extracted_domain.subdomain, pth[2])

            parsed_domain_list.append(domain)

        return parsed_domain_list

    def words_raw_extraction(self, domain, subdomain, path):
        w_domain = re.split(r"\-|\.|\/|\?|\=|\@|\&|\%|\:|\_", domain.lower())
        w_subdomain = re.split(r"\-|\.|\/|\?|\=|\@|\&|\%|\:|\_", subdomain.lower())
        w_path = re.split(r"\-|\.|\/|\?|\=|\@|\&|\%|\:|\_", path.lower())
        raw_words = w_domain + w_path + w_subdomain
        # raw_words = list(set(raw_words))
        raw_words = list(filter(None, raw_words))
        return raw_words
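# A standalone illustration of the words_raw_extraction() split, repeated with
# plain re.split() so it runs without the NsLog dependency; the URL parts below
# (domain, path, subdomain) are made-up examples.
if __name__ == "__main__":
    pattern = r"\-|\.|\/|\?|\=|\@|\&|\%|\:|\_"
    parts = ("paypal-login", "account/update.php", "secure")  # domain, path, subdomain
    words = [w for part in parts for w in re.split(pattern, part.lower()) if w]
    print(words)  # ['paypal', 'login', 'account', 'update', 'php', 'secure']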
import datetime
import json
from traceback import format_exc

# NsLog, json2arff, domain_parser and rule_extraction are project-internal
# modules (the latter two are shown above); their import paths are assumed.


class Train:

    def __init__(self):
        self.logger = NsLog("log")
        self.json2arff_object = json2arff()
        self.parser_object = domain_parser()
        self.rule_calculation = rule_extraction()

        self.path_input = "../input/"
        self.path_arff = "../output/arff/"
        self.path_features = "../output/features/"
        self.path_parsed_domain = "../output/domain_parser/"

    def txt_to_list(self, txt_object):
        lst = []
        for line in txt_object:
            lst.append(line.strip())
        txt_object.close()
        return lst

    def domain_parser(self, param):
        parsed_domains = []
        for i in range(1, len(param), 2):
            try:
                if param[i + 1] == 'phish' or param[i + 1] == 'legitimate':
                    # dataset = self.txt_to_list(open("{0}{1}".format(self.path_input, param[i]), "r"))  # txt read
                    dataset = json.loads(
                        open("{0}{1}".format(self.path_input, param[i]), "r").read())  # json read
                    parsed_domains = parsed_domains + self.parser_object.parse(
                        dataset, param[i + 1], len(parsed_domains))
                else:
                    self.logger.debug(
                        "class labels must be one of (phish, legitimate)")
            except Exception:
                self.logger.error("an error occurred : {0}".format(format_exc()))
                self.logger.debug(
                    "an error occurred while the | {0}.txt | file was being processed".format(param))

        self.logger.info(
            "Domain parse process is done, {0} unique URLs were parsed".format(len(parsed_domains)))
        return parsed_domains

    def json_to_file(self, name, path, data):
        time_now = str(datetime.datetime.now())[0:19].replace(" ", "_")
        file_name = name + "_" + time_now + ".txt"
        file = open(path + file_name, "w")
        file.write(json.dumps(data))
        file.close()
        self.logger.info("{0} written to file.".format(name))

    def arff_to_file(self, name, path, data):
        time_now = str(datetime.datetime.now())[0:19].replace(" ", "_")
        file_name = name + "_" + time_now + ".txt"
        file = open(path + file_name, "w")
        file.write(data)
        file.close()
        self.logger.info("{0} written to file.".format(name))
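# A sketch of how Train ties the pieces together, assuming the project modules
# are importable and the input files exist under ../input/. The file names are
# hypothetical; param mimics a sys.argv-style list where input file names and
# class labels alternate (param[0] is ignored).
if __name__ == "__main__":
    trainer = Train()
    param = ["train.py", "phish_urls.json", "phish", "legit_urls.json", "legitimate"]
    parsed = trainer.domain_parser(param)
    features = trainer.rule_calculation.extraction(parsed)
    trainer.json_to_file("features", trainer.path_features, features)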