import datetime
import json
import re
from traceback import format_exc

import tldextract
from tqdm import tqdm

# NsLog, url_rules, active_rules and json2arff are project-local modules;
# they are assumed to be importable from this package.


class rule_extraction:
    def __init__(self):
        self.logger = NsLog("log")
        self.url_rules_o = url_rules()
        self.active_rules_o = active_rules()

    def extraction(self, parsed_domains):

        self.logger.info("rule_extraction.extraction() is running")

        domain_features = []
        try:
            for line in tqdm(parsed_domains):  # self.bar(parsed_domains)

                info = line

                #  info['mail'] = 'to be pulled from whois'

                nlp_info, url_features = self.url_rules_o.rules_main(
                    info['domain'], info['tld'], info['subdomain'],
                    info['path'],
                    info['words_raw'])  # where the URL rules are applied

                info['nlp_info'] = nlp_info
                info['nlp_info']['words_raw'] = info['words_raw']
                info.pop("words_raw", None)

                #  domain_info, dns_features = self.dns_rules_o.rules_main(line_lst)  # dns rules

                outputDict = {}

                #  info['dns_records'] = domain_info

                outputDict['info'] = info
                outputDict['url_features'] = url_features

                #  outputDict['dns_features'] = dns_features

                domain_features.append(outputDict)

            # domain_features = self.active_rules_o.goog_safe_browsing(domain_features)  # where the active rules run
        except Exception:
            self.logger.error("Error: {0}".format(format_exc()))

        return domain_features
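
# A minimal usage sketch (hypothetical input; assumes domain_parser below has
# already produced the parsed_domains structure that extraction() expects):
#
#     extractor = rule_extraction()
#     features = extractor.extraction(parsed_domains)
#     # each entry looks like {'info': {...}, 'url_features': {...}}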
class domain_parser(object):
    def __init__(self):

        self.logger = NsLog("log")

    def parse(self, domain_list, class_info, count):
        self.logger.info("domain_parser.parse() is running")

        parsed_domain_list = []
        registered_domain_lst = []

        for line in tqdm(domain_list):

            domain = {}
            line = line.strip().replace('"', "").replace("'", '')
            extracted_domain = tldextract.extract(line)

            registered_domain_lst.append(extracted_domain.registered_domain)

            domain['url'] = line
            domain['domain'] = extracted_domain.domain
            domain['registered_domain'] = extracted_domain.registered_domain
            domain['tld'] = extracted_domain.suffix
            domain['subdomain'] = extracted_domain.subdomain
            domain['class'] = class_info
            domain['id'] = count
            count = count + 1

            if '://' in line:
                domain['protocol'] = line.split("://")[0]
            else:
                domain['protocol'] = ''

            tmp = line[line.find(extracted_domain.suffix):]  # everything from the TLD onwards
            pth = tmp.partition("/")  # split at the first "/" after the TLD --> path

            domain['path'] = pth[1] + pth[2]

            domain['words_raw'] = self.words_raw_extraction(
                extracted_domain.domain, extracted_domain.subdomain, pth[2])

            parsed_domain_list.append(domain)

        return parsed_domain_list
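
    # Example of one parsed record (hypothetical URL; values follow
    # tldextract's split and the path logic above):
    #
    #     parser = domain_parser()
    #     parsed = parser.parse(['http://login.example.co.uk/signin?user=a'], 'phish', 0)
    #     parsed[0]['domain']     -> 'example'
    #     parsed[0]['subdomain']  -> 'login'
    #     parsed[0]['tld']        -> 'co.uk'
    #     parsed[0]['protocol']   -> 'http'
    #     parsed[0]['path']       -> '/signin?user=a'
    #     parsed[0]['words_raw']  -> ['example', 'signin', 'user', 'a', 'login']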

    def parse_nonlabeled_samples(self, domain_list, count=0):
        self.logger.info("domain_parser.parse_nonlabeled_samples() is running")
        parsed_domain_list = []
        registered_domain_lst = []

        for line in tqdm(domain_list):
            domain = {}
            line = line.strip().replace('"', "").replace("'", '')

            extracted_domain = tldextract.extract(line)

            registered_domain_lst.append(extracted_domain.registered_domain)

            domain['url'] = line
            domain['domain'] = extracted_domain.domain
            domain['registered_domain'] = extracted_domain.registered_domain
            domain['tld'] = extracted_domain.suffix
            domain['subdomain'] = extracted_domain.subdomain
            domain['id'] = count
            count = count + 1

            if '://' in line:
                domain['protocol'] = line.split("://")[0]
            else:
                domain['protocol'] = ''

            tmp = line[line.find(extracted_domain.suffix):]  # everything from the TLD onwards
            pth = tmp.partition("/")  # split at the first "/" after the TLD --> path

            domain['path'] = pth[1] + pth[2]
            # domain['path'].append(pth[1] + pth[2])  # the path could be stored as a list to derive statistics later

            domain['words_raw'] = self.words_raw_extraction(
                extracted_domain.domain, extracted_domain.subdomain, pth[2])

            parsed_domain_list.append(domain)

        return parsed_domain_list
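
    # Same parsing as parse(), minus the 'class' label (hypothetical input):
    #
    #     domain_parser().parse_nonlabeled_samples(['https://example.com/a'])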

    def words_raw_extraction(self, domain, subdomain, path):
        # one raw-string character class replaces the original chained
        # alternation ("\-|\.|..."), whose escapes are deprecated in Python 3
        delimiters = r"[-./?=@&%:_]"

        w_domain = re.split(delimiters, domain.lower())
        w_subdomain = re.split(delimiters, subdomain.lower())
        w_path = re.split(delimiters, path.lower())

        raw_words = w_domain + w_path + w_subdomain
        # raw_words = list(set(raw_words))  # optional de-duplication
        raw_words = list(filter(None, raw_words))  # drop empty tokens

        return raw_words
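
# Tokenization example (hypothetical values):
#
#     words_raw_extraction('paypal-secure', 'www', 'login.php?id=1')
#     -> ['paypal', 'secure', 'login', 'php', 'id', '1', 'www']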
class Train:
    def __init__(self):
        self.logger = NsLog("log")

        self.json2arff_object = json2arff()
        self.parser_object = domain_parser()
        self.rule_calculation = rule_extraction()

        self.path_input = "../input/"
        self.path_arff = "../output/arff/"
        self.path_features = "../output/features/"
        self.path_parsed_domain = "../output/domain_parser/"

    def txt_to_list(self, txt_object):

        lst = []

        for line in txt_object:
            lst.append(line.strip())

        txt_object.close()

        return lst

    def domain_parser(self, param):

        parsed_domains = []

        for i in range(1, len(param), 2):
            try:
                if param[i + 1] in ('phish', 'legitimate'):

                    # dataset = self.txt_to_list(open("{0}{1}".format(self.path_input, param[i]), "r"))  # txt read
                    with open("{0}{1}".format(self.path_input, param[i]),
                              "r") as dataset_file:
                        dataset = json.loads(dataset_file.read())  # json read

                    parsed_domains = parsed_domains + self.parser_object.parse(
                        dataset, param[i + 1], len(parsed_domains))

                else:
                    self.logger.debug(
                        "class labels must be one of (phish, legitimate)")

            except Exception:
                self.logger.error("an error occurred: {0}".format(
                    format_exc()))
                self.logger.debug(
                    "an error occurred while the | {0} | file was being processed"
                    .format(param[i]))

        self.logger.info(
            "Domain parse process is done; {0} URLs were parsed".format(
                len(parsed_domains)))

        return parsed_domains
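
    # Expected argv-style layout for param (hypothetical file names):
    #
    #     param = ['train.py', 'phish.json', 'phish', 'legit.json', 'legitimate']
    #     parsed = Train().domain_parser(param)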

    def json_to_file(self, name, path, data):
        time_now = str(datetime.datetime.now())[0:19].replace(" ", "_")
        file_name = name + "_" + time_now + ".txt"
        with open(path + file_name, "w") as out_file:
            out_file.write(json.dumps(data))
        self.logger.info("{0} written to file.".format(name))

    def arff_to_file(self, name, path, data):
        time_now = str(datetime.datetime.now())[0:19].replace(" ", "_")
        file_name = name + "_" + time_now + ".txt"
        with open(path + file_name, "w") as out_file:
            out_file.write(data)
        self.logger.info("{0} written to file.".format(name))
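
# A minimal end-to-end sketch (assumed input file name; json2arff's interface
# is project-specific, so only the parse/extract/save steps are exercised):
if __name__ == "__main__":
    trainer = Train()
    parsed = trainer.domain_parser(["train.py", "samples.json", "phish"])
    features = trainer.rule_calculation.extraction(parsed)
    trainer.json_to_file("url_features", trainer.path_features, features)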