Python TfidfVector.transformの例

プログラミング言語: Python

名前空間/パッケージ名: analog.bin.machine_learning.TfidfVector

クラス/型: TfidfVector

メソッド/関数: transform

hotexamples.comのコード掲載数: 3

Python TfidfVector.transform - 3件のコード例が見つかりました。すべてオープンソースプロジェクトから抽出されたPythonのanalog.bin.machine_learning.TfidfVector.TfidfVector.transformの実例で、最も評価が高いものを厳選しています。コード例の評価を行っていただくことで、より質の高いコード例が表示されるようになります。

よく使われるメソッド

表示 / 非表示

TfidfVector(4)

transform(3)

get_url(1)

よく使われるメソッド

TfidfVector (4)

transform (3)

get_url (1)

コード例 #1

ファイルを表示

ファイル: train.py プロジェクト: zqd1996/analog

    def get_model(self, queue=None):
        """Grid-search a one-class SVM and pickle the best-F1 model.

        Reads the training, black (abnormal) and white (normal) samples with
        ``self.read_txt``, vectorises the black/white samples with
        ``TfidfVector`` and evaluates every ``gamma``/``nu`` pair of the grid.
        The estimator refitted with the best-F1 parameters is written to
        ``analog/cache/model.pkl`` below ``self.root_path`` and
        ``self.complete`` is set to ``True``. While the search runs, ``print``
        output is redirected to ``analog/log/train_log.txt``; stdout is
        restored and the log file closed on exit, even when an error occurs.

        :param queue: optional queue that receives progress as strings: one
            fraction (four decimals) per grid point, then ``"1"`` once the
            model file has been written. ``None`` disables progress reports.
        """

        def report(value):
            # The default queue=None must not crash the training run.
            if queue is not None:
                queue.put_nowait(value)

        # Redirect stdout so every print() below lands in the training log.
        __console__ = sys.stdout
        log_file = open(
            os.path.join(self.root_path, "analog/log/train_log.txt"), 'w+')
        sys.stdout = log_file
        try:
            start = datetime.now()
            print("Start at {}".format(start.strftime("%Y/%m/%d %H:%M:%S")))

            train_example = []
            white_example = []
            black_example = []

            # Load the training set, the black samples and the white samples.
            self.read_txt(self.train_log_path, train_example)
            self.read_txt(self.test_black_path, black_example)
            self.read_txt(self.test_white_path, white_example)

            tf_idf_vector = TfidfVector()
            # Feature vectors of the training samples.
            # NOTE(review): fit_vector is read as an attribute, not called --
            # confirm TfidfVector exposes it as a property/attribute.
            train_vector = tf_idf_vector.fit_vector

            # Feature vectors of the white (normal) and black (abnormal) samples.
            test_normal_vector = tf_idf_vector.transform(white_example)
            test_abnormal_vector = tf_idf_vector.transform(black_example)

            y = [1] * (len(train_example))

            # Parameter grid searched for nu and gamma.
            grid = {
                'gamma': np.logspace(-8, 1, 10),
                'nu': np.linspace(0.01, 0.20, 20)
            }

            # Kernel function (rbf, linear, poly).
            kernel = 'rbf'

            # Best precision, recall and F1 seen so far.
            max_F1 = 0
            max_Re = 0
            max_Pr = 0

            # gamma at which each best score was reached.
            gamma_r_F1 = 0.01
            gamma_r_Re = 0.01
            gamma_r_Pr = 0.01

            # nu at which each best score was reached.
            nu_r_F1 = 0
            nu_r_Re = 0
            nu_r_Pr = 0

            svdd = OneClassSVM(kernel=kernel)
            # zero_count counts zero-score results; re_gamma remembers the
            # gamma of the most recent one. Once a gamma produced four of
            # them, its remaining grid points are skipped to save time.
            zero_count = 0
            re_gamma = 0

            total_loop = len(ParameterGrid(grid))
            process_count = 0
            for z in ParameterGrid(grid):
                process_count += 1

                report("{:0.4f}".format(process_count / total_loop))
                if re_gamma == z.get('gamma'):
                    if zero_count >= 4:
                        continue
                else:
                    zero_count = 0

                svdd.set_params(**z)
                svdd.fit(train_vector, y)
                k = svdd.get_params()

                # Predictions on the normal samples.
                f = svdd.predict(test_normal_vector)
                TP = f.tolist().count(1)  # True positive
                FN = f.tolist().count(-1)  # False Negative

                # Predictions on the abnormal samples.
                f = svdd.predict(test_abnormal_vector)
                FP = f.tolist().count(1)  # False positive

                Precision = 0 if TP == 0 else (TP / (TP + FP))
                Recall = 0 if TP == 0 else (TP / (TP + FN))
                if Recall == 0 or Precision == 0:
                    F1_score = 0
                    zero_count += 1
                    re_gamma = k.get('gamma')
                else:
                    F1_score = 2 * Precision * Recall / (Precision + Recall)

                if F1_score > max_F1:
                    max_F1 = F1_score
                    nu_r_F1 = k.get('nu')
                    gamma_r_F1 = k.get('gamma')

                if Recall > max_Re:
                    max_Re = Recall
                    nu_r_Re = k.get('nu')
                    gamma_r_Re = k.get('gamma')

                if Precision > max_Pr:
                    max_Pr = Precision
                    nu_r_Pr = k.get('nu')
                    gamma_r_Pr = k.get('gamma')

                print(
                    "========================== [{}] ===========================".
                    format(datetime.now().strftime("%Y/%m/%d %H:%M:%S")))
                print(
                    "nu: ",
                    k.get('nu'),
                    'gamma',
                    k.get('gamma'),
                )
                print("Precision: {}%".format(Precision * 100))
                print("Recall: {}%".format(Recall * 100))
                print("F1 score: {}".format(F1_score))
            print("========================== [{}] ===========================".
                  format(datetime.now().strftime("%Y/%m/%d %H:%M:%S")))

            print(
                "MAX Precision:  {:^20.6f}When Current nu: {:^20.6f} and gamma: {:0.8f}"
                .format(max_Pr, nu_r_Pr, gamma_r_Pr))
            print(
                "MAX Recall:     {:^20.6f}When Current nu: {:^20.6f} and gamma: {:0.8f}"
                .format(max_Re, nu_r_Re, gamma_r_Re))
            print(
                "MAX F1:         {:^20.6f}When Current nu: {:^20.6f} and gamma: {:0.8f}"
                .format(max_F1, nu_r_F1, gamma_r_F1))
            total_second = datetime.now() - start
            print("Cost {}s.".format(total_second.total_seconds()))

            # Refit with the best-F1 parameters *before* opening the cache
            # file, so a failing fit cannot leave a truncated model.pkl behind.
            svdd.set_params(kernel=kernel, nu=nu_r_F1, gamma=gamma_r_F1)
            svdd.fit(train_vector, y)
            with open(os.path.join(self.root_path, "analog/cache/model.pkl"),
                      'wb') as file:
                pickle.dump(svdd, file)
            # Report completion only once the model file exists; presumably the
            # consumer loads model.pkl as soon as it sees "1" -- verify.
            report("1")
        finally:
            sys.stdout = __console__
            log_file.close()
        self.complete = True

コード例 #2

ファイルを表示

class Controller:
    """Console controller; the class attributes configure its prompt."""

    # Word lists handed to AnalogCompleter for tab completion.
    # NOTE(review): presumably one list per word position -- confirm against
    # AnalogCompleter.
    analog_completer = AnalogCompleter(
        [
            [
                'show', 'set', 'get', 'train', 'retrain', 'locate', 'clear',
                'help', 'exit',
            ],
            [
                'statistics', 'analysis', 'log', "time", "offset", "progress",
                "model",
            ],
            ['requests', 'ip', 'ua', 'url'],
            ['current'],
            ['day', 'week', 'month', 'year', 'all'],
            ['top'],
        ],
        ignore_case=True,
    )

    # Style classes referenced by the prompt and by the HTML/FormattedText
    # messages printed from this class.
    style = Style.from_dict({
        # User input (default text).
        '': '#ff0066',
        # Prompt.
        'yellow': 'yellow',
        'green': 'green',
        'blue': 'blue',
        'black': 'black',
        'white': 'white',
        'analog': 'yellow',
        'help': 'blue',
        'cyan': 'ansicyan',
        'help_title': 'red',
    })

    # Prompt fragments shown before every command line.
    analog_prompt = [('class:yellow', 'analog> ')]

    def __init__(self, path=None, debug=False):
        """Load the configuration, set up storage and start the worker threads.

        :param path: project root directory; ``config.ini`` and the
            ``analog/...`` resources are resolved below it.
        :param debug: when true, ``mainloop`` prints tracebacks and re-raises
            unexpected errors instead of only reporting them.
        :raises DatabaseConfigError: if ``config.ini`` does not exist in
            ``path``.
        """

        # Configuration file paths.
        self.path = path
        self.debug = debug
        self.config_path = os.path.join(self.path, "config.ini")
        self.default_config_path = os.path.join(
            self.path, "analog/conf/default_config.ini")
        self.all_columns = [
            "remote_addr", "remote_user", "time_local", "request", "status",
            "body_bytes_sent", "http_referer", "http_user_agent",
            "http_x_forwarded_for"
        ]

        self.time = datetime.now()
        self.output = ColorOutput()
        self.output.print_banner("AnaLog")

        if os.path.exists(self.config_path):
            self.config = Config(self.config_path, self.default_config_path)
        else:
            raise DatabaseConfigError

        # Log parsing parameters.
        self.section_name_log = "Log"
        self.log_path = self.config.get(self.section_name_log, 'path')
        self.time_local_pattern = self.config.get(self.section_name_log,
                                                  "time_local_pattern")
        self.log_file_pattern = self.config.get(self.section_name_log,
                                                "log_file_pattern")
        self.log_pattern = self.config.get(self.section_name_log,
                                           'log_content_pattern')
        self.log_regx = re.compile(self.log_pattern)
        # Named groups of the log regex that are also known table columns,
        # kept in the regex's group order.
        self.acceptable_group_name = list(
            filter(lambda x: x in self.all_columns, self.log_regx.groupindex))

        # Data layer.
        self.db = db(self.config, root_path=self.path)
        self.logger = None
        self.analyser = None
        self.section_name_database = "Database"
        self.table_name = self.config.get(self.section_name_database,
                                          "table_name")
        self.ip_db = ipDatabase(os.path.join(self.path, "analog/ipdb/ip.ipdb"))
        self.pool = None

        # Create the database on first start, otherwise just connect.
        if self.config.get(self.section_name_database, 'initial') != '1':
            self.init_database()
            self.config.set(self.section_name_database, 'initial', '1')

            self.init_table()
            # Close the handle deterministically so the flag reaches the disk.
            with open(self.config_path, "w") as config_file:
                self.config.write(config_file)
        else:
            try:
                self.db.connect_db()
            except Exception as e:
                self.output.print_error(e)
                self.output.print_error(
                    "Connect DB failed. Please check your config file or make sure DB server is running."
                )
                self.output.print_info("Bye~")
                exit(0)
            self.output.print_info("Logs had been loaded before.Type " +
                                   self.output.Fore.BLUE + "reload" +
                                   self.output.Style.RESET_ALL +
                                   self.output.Fore.LIGHTYELLOW_EX +
                                   " command to reload logs.")

        self.statistic = Statistics(database=self.db,
                                    output=self.output,
                                    ipdb=self.ip_db,
                                    controller=self)

        self.session = PromptSession()
        self.mode = NORMAL_MODE
        self.key_bindings = None

        # Build the TF-IDF vectoriser; it is unavailable when train.txt is
        # empty or missing.
        tfidf_exist_flag = False
        self.tfidfVector = None
        try:
            self.tfidfVector = TfidfVector(self.path, self.config)
            tfidf_exist_flag = True
        except FileEmptyError:
            print_formatted_text(HTML(
                '<yellow>Detected that train.txt content is empty. '
                '<red>Disable abnormal detection.</red></yellow>'),
                                 style=self.style)
        except FileNotFound:
            print_formatted_text(HTML(
                '<yellow>Detected that train.txt does no exist. '
                '<red>Disable abnormal detection.</red></yellow>'),
                                 style=self.style)

        self.model = None
        self.progress_queue = None
        self.train_progress = None

        # Load the cached detection model when one exists.
        if tfidf_exist_flag is False:
            print_formatted_text(HTML(
                '<yellow>Cause of you lack of TF-IDF corpus'
                '(<red>analog/sample_set/train.txt</red>), We can\'t calculate TF-IDF value'
                'of each log item and can\'t use train model also</yellow>'),
                                 style=self.style)
        if self.check_model_cache_exist():
            print_formatted_text(HTML(
                '<yellow>Detection model cache file exist. Load model from it.\nYou can type '
                '<blue>retrain</blue> to train a new model.</yellow>'),
                                 style=self.style)
            self.load_model()

        # Audit (log browsing) and analysis modules.
        self.logger = Logger(database=self.db,
                             output=self.output,
                             section_name_log=self.section_name_log,
                             ipdb=self.ip_db,
                             controller=self,
                             tfidfvector=self.tfidfVector,
                             config=self.config,
                             model=self.model)
        self.analyser = Analyser(database=self.db,
                                 ipdb=self.ip_db,
                                 controller=self,
                                 tfidfvector=self.tfidfVector,
                                 model=self.model)

        # Log file supervisor.
        self.file_queue = Queue()
        self.supervisor = FileSupervisor(_path=self.log_path,
                                         _queue=self.file_queue,
                                         log_path=os.path.join(
                                             self.path,
                                             "analog/logs/file_log.log"))
        # Start the supervisor and the log-update thread.
        self.supervisor.start()
        self.output.print_info("Supervisor on.")
        self.update_thread = Thread(target=self.update_thread_func,
                                    daemon=True)
        self.update_thread_stop_flag = False
        self.update_thread.start()
        self.output.print_info("Updater on.")

    def mainloop(self):
        """Read commands from the prompt forever and hand them to the parser.

        Registers the up/down/escape key bindings, active only while
        ``self.mode`` is ``LOGGING_MODE``, then loops: prompt, parse, report
        errors. Unexpected exceptions are re-raised only in debug mode.
        """
        bindings = KeyBindings()
        self.key_bindings = bindings

        @Condition
        def is_logging_mode():
            return self.mode == LOGGING_MODE

        def turn_page(**direction):
            # Redraw the log view one page up or down, then the hint and prompt.
            self.clear_screen()
            self.logger.show_log(**direction)
            self.output.print_info("Log Mode")
            hint = HTML(
                '<yellow>[+] Press <blue>↑</blue> or <blue>↓</blue> to turn pages or '
                '<blue>Esc</blue> to quit log mode</yellow>')
            print_formatted_text(hint, style=self.style)
            print_formatted_text(FormattedText(self.analog_prompt), end="")

        @bindings.add('up')
        def _(event):
            turn_page(decrease=True)

        @bindings.add('down')
        def _(event):
            turn_page(increase=True)

        @bindings.add('escape')
        def _(event):
            self.mode = NORMAL_MODE
            self.logger.clear()
            print()
            self.output.print_info("Back to normal mode.")

        self.key_bindings = ConditionalKeyBindings(
            key_bindings=bindings, filter=is_logging_mode)

        while True:
            text = None
            try:
                text = self.session.prompt(
                    self.analog_prompt,
                    style=self.style,
                    completer=self.analog_completer,
                    key_bindings=self.key_bindings,
                    refresh_interval=True,
                )
                self.command_parser(text)
            except KeyboardInterrupt:
                if self.mode != NORMAL_MODE:
                    self.output.print_lastLine(
                        "Press key Esc to quit log mode.")
                else:
                    self.output.print_info(
                        "Are you wanna exit? Type 'exit' to quit the analog.")
            except AddCommandError:
                self.add_command_help()
            except CommandFormatError:
                shown = text if text else "(Failed to read command)"
                self.output.print_error(
                    "Unknown command: {}. Type \"help\" for help.".format(
                        shown))
            except Exception as e:
                if self.debug:
                    traceback.print_exc()
                    raise e
                self.output.print_error("Error: %s" % str(e))

    def train(self):
        """Start model training in a one-worker process pool.

        Returns ``False`` without starting anything when ``check_train_txt``
        fails. Otherwise submits ``Train.get_model`` with a fresh progress
        queue, resets ``self.train_progress`` to 0 and returns ``None``.
        """
        samples_ready = self.check_train_txt()
        if not samples_ready:
            return False

        trainer = Train(self.path, config=self.config, test_flag=False)
        self.progress_queue = multiprocessing.Manager().Queue()
        self.pool = multiprocessing.Pool(1)
        self.pool.apply_async(
            trainer.get_model,
            args=(self.progress_queue, ),
            callback=self.train_complete_callback,
            error_callback=self.train_error_callback,
        )
        # No further tasks will be submitted to this pool.
        self.pool.close()
        self.train_progress = 0

        notice = HTML(
            '<yellow>Start the model training process.'
            '\nYou can type <blue>get progress</blue> to get the progress of training.</yellow>'
        )
        print_formatted_text(notice, style=self.style)

    def train_error_callback(self, e):
        """Report a failed training task and re-raise its exception.

        Used as the ``error_callback`` of the training task.

        :param e: the exception raised by the training task.
        :raises: ``e`` itself, after it has been reported.
        """
        if self.debug:
            # This callback is not running inside an ``except`` block, so
            # traceback.print_exc() would only print "NoneType: None".
            # Print the traceback carried by ``e`` instead.
            traceback.print_exception(type(e), e, e.__traceback__)
        self.output.print_error(str(e))
        raise e

    def check_train_txt(self):
        """Check that every sample file needed for training is usable.

        Checks ``train.txt``, ``test_black_log.txt`` and
        ``test_white_log.txt`` under ``analog/sample_set`` with ``check_txt``
        and prints a message for each file that is empty or missing.

        :return: ``True`` only if no file was reported empty or missing.
        """
        sample_dir = "analog/sample_set/"
        check_path = [
            os.path.join(self.path, sample_dir + "train.txt"),
            os.path.join(self.path, sample_dir + "test_black_log.txt"),
            os.path.join(self.path, sample_dir + "test_white_log.txt"),
        ]
        flag = True
        for path in check_path:
            try:
                if check_txt(path) is False:
                    flag = False
                    print_formatted_text(
                        HTML('<yellow>Necessary file ' + path +
                             ' is <red>empty</red> ! </yellow>'),
                        style=self.style)
            except FileNotFound:
                # A required file that is missing cannot be trained on either;
                # previously this case left the flag True.
                # NOTE(review): assumes an auto-generated file starts out
                # empty -- confirm against check_txt.
                flag = False
                print_formatted_text(HTML('<yellow>Necessary file ' + path +
                                          '  <red>Not found</red> ! </yellow>'
                                          '\nGenerate file automatically.'),
                                     style=self.style)
        return flag

    def train_complete(self):
        """Load the freshly trained model, announce it and give it to the logger."""
        self.load_model()
        announcement = HTML(
            '\n\n<blue>Training task completed! '
            'Try to type command <white>show analysis</white> to show '
            'abnormal analysis!</blue>')
        print_formatted_text(announcement, style=self.style)
        self.logger.model = self.model

    def print_train_progress(self):
        """Print the training progress as a percentage, or finish up at 100%.

        The value from ``get_train_progress`` is converted with ``float`` and
        scaled to percent; exactly 100 triggers ``train_complete``.
        """
        percent = float(self.get_train_progress()) * 100
        if percent != 100:
            message = "Now train progress is {:0.2f}%".format(percent)
            self.output.print_info(message)
            return
        self.train_complete()

    def print_time(self):
        """Print the controller's current time, truncated to the hour."""
        hour_stamp = self.time.strftime("%Y/%m/%d %H:00:00")
        self.output.print_value("time", hour_stamp)

    def train_complete_callback(self, e):
        """Success callback of the training task.

        Reloads the model, marks training as finished (progress 1) and prints
        a completion notice.

        :param e: value passed by the pool to the callback; not used here.
        """
        self.load_model()
        self.train_progress = 1
        notice = "Training  completed."
        self.output.print_special(notice)

    def load_model(self):
        """Unpickle ``analog/cache/model.pkl`` and share it with the sub-modules.

        The loaded object is stored in ``self.model`` and also assigned to
        ``self.logger.model`` and ``self.analyser.model`` when those objects
        exist.
        """
        cache_file = os.path.join(self.path, r'analog/cache/model.pkl')
        # NOTE(review): pickle.load executes whatever the file contains; the
        # cache file must only ever be written by this application.
        with open(cache_file, 'rb') as handle:
            self.model = pickle.load(handle)
        for attr in ("logger", "analyser"):
            consumer = getattr(self, attr)
            if consumer is not None:
                consumer.model = self.model

    def command_parser(self, command: str):
        """Parse one console command line and dispatch it.

        The line is lower-cased and split on whitespace; the first word
        selects the command family (``show``, ``set``, ``get``, ``train`` /
        ``retrain``, ``locate``, ``clear``, ``help``, ``debug``, ``exit``,
        ``reload``, ``test``, ``add``). An empty line is ignored.

        :param command: raw text typed at the prompt.
        :raises CommandFormatError: for an unknown command, an unknown ``set``
            target, or when a required word is missing (any ``IndexError``
            raised while handling the command).
        :raises AddCommandError: when an ``add`` command cannot be handled.
        """
        command = command.lower().split()
        if len(command) == 0:
            return
        try:
            c = command[0]
            if c == 'show':
                if command[1] == 'statistics':
                    if command[2] == 'requests':
                        self.statistic.requests_num(
                            command[4],
                            command[3] == 'current',
                            command[-1] == 'c')
                    else:
                        if len(command) >= 7:
                            N = command[6]
                            self.statistic.top_n(
                                command[2],
                                command[4],
                                current_flag=command[3] == 'current',
                                N=N)
                        else:
                            self.statistic.top_n(
                                command[2],
                                command[4],
                                current_flag=command[3] == 'current')
                elif command[1] == 'analysis':
                    if command[2] == 'current':
                        self.analyser.show_analysis(command[3])

                elif command[1] == 'log':
                    flag = False
                    if command[2] == 'of' and command[3] == 'ip':
                        self.logger.clear()
                        self.logger.set_mode('ip')
                        self.logger.set_ip(command[4])
                        flag = self.logger.show_log()

                    elif command[2] in ['current', 'last']:
                        self.logger.clear()
                        self.logger.set_mode('date')
                        flag = self.logger.show_log(
                            when=command[3],
                            current_flag=command[2] == 'current')

                    # Enter log mode only when show_log returned a truthy value.
                    self.mode = LOGGING_MODE if flag else NORMAL_MODE

            elif c == 'set':
                if command[1] == 'date':
                    t = datetime.strptime(command[2], "%Y/%m/%d")
                    self.output.print_changed(
                        "time",
                        self.set_time(
                            year=t.year, month=t.month, day=t.day,
                            hour=t.hour).strftime("%Y/%m/%d %H:00:00"))
                elif command[1] in ['hour', 'day', 'month', 'year']:
                    d = {command[1]: int(command[2])}
                    self.output.print_changed(
                        "time",
                        self.set_time(**d).strftime("%Y/%m/%d %H:00:00"))
                elif command[1] == 'offset':
                    self.logger.set_offset(int(command[2]))
                else:
                    raise CommandFormatError

            elif c == 'get':
                if command[1] == "time":
                    self.print_time()
                elif command[1] == "date":
                    self.output.print_value("time",
                                            self.time.strftime("%Y/%m/%d"))
                elif command[1] == "progress":
                    self.print_train_progress()
                elif command[1] == 'offset':
                    self.output.print_value("offset", self.logger.offset)
                elif command[1] == 'model':
                    if self.model:
                        params = self.model.get_params('nu')
                        print_formatted_text(FormattedText([
                            ('class:yellow', 'nu: {}'.format(params.get('nu')))
                        ]),
                                             style=self.style)
                        print_formatted_text(FormattedText([
                            ('class:yellow',
                             'gamma: {}'.format(params.get('gamma')))
                        ]),
                                             style=self.style)
                    else:
                        self.output.print_info("No model has been loaded.")
            elif c == 'train' or c == 'retrain':
                if self.train_progress is not None and self.train_progress != 1:
                    self.output.print_warning("A training task is running.")
                    self.print_train_progress()
                elif len(command) == 1:
                    if self.model is not None:
                        self.output.print_warning(
                            "Model has been loaded before!")
                        self.output.print_warning("Overwrite this model?[N/y]")
                        ans = input()
                        if ans.lower() not in ["y", "yes"]:
                            return
                    self.train()
            elif c == "locate":
                if command[1] == 'ip':
                    self.output.print_value(
                        command[2],
                        "-".join(self.statistic.ip_geolocation(command[2])))
            elif c == 'clear':
                self.clear_screen()
            elif c == 'help':
                self.help()
            elif c == 'debug':
                self.debug = True
                self.output.print_info("Debug mode [On].")
            elif c == 'exit':
                self.output.print_info("Bye~")
                exit(0)

            elif c == 'reload':
                self.output.print_warning(
                    "Reload option will duplicate your logs if you don't erase DB table first."
                )
                self.output.print_warning(
                    "Would you like to erase table?[Y/n]")
                res = input()
                if res.lower() != "n":
                    self.erase_table(self.table_name)
                    self.output.print_warning("Erased table `%s`." %
                                              self.table_name)
                self.config.set(self.section_name_database, 'initial', '0')
                # Close the file before exiting so the reset flag is on disk.
                with open(self.config_path, "w") as config_file:
                    self.config.write(config_file)
                self.output.print_info("Please start analog once again.")
                exit(0)
            elif c == "test":
                self.test()
            elif c == "add":
                # Accepted unit names and abbreviations -> canonical unit.
                keys = {
                    "day": "days",
                    "d": "days",
                    "hour": "hours",
                    "h": "hours",
                    "week": "weeks",
                    "w": "weeks",
                    "month": "months",
                    "m": "months",
                    "year": "years",
                    "y": "years",
                    "offset": "offset",
                    "o": "offset",
                }
                try:
                    val = None
                    key = command[1].lower()
                    if len(command) == 3:
                        val = int(command[2].lower())

                    if key not in keys:
                        raise AddCommandError
                    if keys[key] == "offset":
                        if val is None:
                            val = 5
                        self.logger.set_offset(self.logger.offset + val)
                        self.output.print_value("offset", self.logger.offset)
                    elif keys[key] in ['hours', 'days', 'weeks', 'months']:
                        if val is None:
                            val = 1
                        self.time = self.time + timedelta(**{keys[key]: val})
                        self.print_time()
                    elif keys[key] == "years":
                        # timedelta has no "years" argument because a year has
                        # no fixed length, so the year is set directly.
                        if val is None:
                            val = 1
                        self.set_time(year=self.time.year + val)
                        self.print_time()
                except Exception:
                    # Any failure while handling "add" is reported as a
                    # malformed add command.
                    raise AddCommandError
            else:
                raise CommandFormatError("Command not match.")
        except IndexError:
            # A required word of the command was missing.
            raise CommandFormatError

    def erase_table(self, table_name):
        """Truncate ``table_name`` and commit the change."""
        # The table name is an identifier, so it is placed into the statement
        # text directly.
        statement = "truncate table `" + str(table_name) + "`;"
        self.db.execute(statement)
        self.db.commit()

    @staticmethod
    def clear_screen():
        """Clear the terminal with ``cls`` on Windows and ``clear`` elsewhere."""
        if os.name == 'nt':
            shell_command = 'cls'
        else:
            shell_command = 'clear'
        os.system(shell_command)

    def init_database(self):
        """Create the database and the log table if they do not exist yet.

        Calls ``self.db.create_db()``, opens a connection with
        ``self.db.connect_db()`` and runs ``CREATE TABLE IF NOT EXISTS`` for
        ``self.table_name``. The connection is closed even when creating the
        table fails.
        """
        self.db.create_db()
        conn = self.db.connect_db()
        try:
            cursor = conn.cursor()

            # Create Table: weblog
            sql = """CREATE TABLE IF NOT EXISTS %s (
                             remote_addr  CHAR(15) NOT NULL ,
                             remote_user  VARCHAR(20),
                             time_local DATETIME DEFAULT '1970-01-01 00:00:00',
                             request TEXT DEFAULT NULL,
                             status  SMALLINT  UNSIGNED DEFAULT 0, 
                             body_bytes_sent INT DEFAULT 0,
                             http_referer TEXT,
                             http_user_agent TEXT,
                             http_x_forwarded_for TEXT);
                             """ % self.table_name
            cursor.execute(sql)
            conn.commit()
        finally:
            # Do not leak the connection when the DDL or the commit raises.
            conn.close()
        self.output.print_info("Initial database => Done.")

    def init_table(self):
        """Load the log files into the database table.

        Builds an ``INSERT`` statement with one ``?`` placeholder per accepted
        column and passes it, with the log regex, to ``read_log_files``.
        """
        table = self.table_name
        columns = self.acceptable_group_name
        column_list = ",".join(columns)
        placeholders = ",".join("?" for _ in columns)
        sql = "INSERT INTO %s (%s) VALUES (%s)" % (table, column_list,
                                                   placeholders)
        self.read_log_files(self.log_regx, sql)

    def get_latest_time(self):
        """Return the newest ``time_local`` stored in the table as a datetime.

        A value that is not already a ``datetime`` is converted with
        ``db_time2datetime``.
        """
        query = ("SELECT time_local from %s ORDER BY 1 DESC limit 1" %
                 self.table_name)
        rows = self.db.execute(query).fetchall()
        newest = rows[0][0]
        if isinstance(newest, datetime):
            return newest
        return db_time2datetime(newest)

    def get_oldest_time(self):
        """Return the oldest ``time_local`` stored in the table as a datetime.

        A value that is not already a ``datetime`` is converted with
        ``db_time2datetime``.
        """
        query = ("SELECT time_local from %s ORDER BY 1 ASC limit 1" %
                 self.table_name)
        rows = self.db.execute(query).fetchall()
        oldest = rows[0][0]
        if isinstance(oldest, datetime):
            return oldest
        return db_time2datetime(oldest)

    def read_log_files(self, log_regx, sql):
        """Parse every log file and bulk-insert the matching lines.

        Files come from ``fp_gen(self.log_path, pattern=self.log_file_pattern)``.
        Each line matching ``log_regx`` becomes one row holding the accepted
        named groups, with ``time_local`` converted by ``log2db_time``. Rows
        are inserted with ``self.db.execute_many`` and committed every 5000
        rows and at the end of each file. Blank and non-matching lines are
        skipped. Every file is closed, also on error; errors propagate with
        their original type.

        :param log_regx: compiled regex with named groups for one log line.
        :param sql: parameterised INSERT statement for ``execute_many``.
        """
        count = 0
        arg_list = []

        def flush():
            # Insert and commit the pending rows; no database round trip when
            # nothing is pending.
            if not arg_list:
                return
            self.db.execute_many(sql, arg_list)
            self.output.print_lastLine(
                "Total process: {} logs".format(count))
            self.db.commit()
            arg_list.clear()

        for file in fp_gen(self.log_path, pattern=self.log_file_pattern):
            try:
                # readline() returns "" only at end of file, so a blank line
                # ("\n") no longer ends the file early.
                for raw_line in iter(file.readline, ""):
                    line = raw_line.strip("\r\n")
                    if not line:
                        continue
                    log_tuple = log_regx.search(line)
                    if log_tuple is None:
                        continue

                    # Keep only the groups that are accepted columns.
                    single_line_arg = [
                        log_tuple.group(group_name)
                        for group_name in log_regx.groupindex
                        if group_name in self.acceptable_group_name
                    ]

                    # Convert the log timestamp to the database time format so
                    # it can be inserted and used with database time functions.
                    single_line_arg[self.acceptable_group_name.index(
                        'time_local')] = log2db_time(
                            log_tuple.group('time_local'),
                            log_str=self.time_local_pattern)

                    arg_list.append(single_line_arg)
                    count += 1
                    if count % 5000 == 0:
                        flush()
                # Insert whatever is left of this file.
                flush()
            finally:
                file.close()
        self.output.print_info("Total process: {} logs".format(count))
        self.output.print_info("Load log => Done.")

    def update_logs(self, files, log_regx, sql):
        """
        Insert log lines that are newer than what the database already holds.

        Used for live updates of the server access log.  For each file, the
        lines produced by ``reverse_read_lines`` are processed until one is
        found whose ``time_local`` is not later than ``get_latest_time()``;
        that line and everything after it in the iteration are ignored.
        Blank lines and lines that do not match ``log_regx`` are skipped.
        Rows are written with ``self.db.execute_many`` in batches of 5000 and
        committed; rows still pending when a file is finished are written
        before the next file starts.

        :param files: paths of the log files to load into the database
        :param log_regx: compiled regular expression used to parse a line
        :param sql: parameterised INSERT statement
        :return: None
        :raises UpdateLogThreadException: wraps any error raised while
            parsing a line or writing a batch
        """
        batch_size = 5000
        arg_list = []
        latest_time = self.get_latest_time()

        def flush():
            # Never send an empty batch to the database.
            if not arg_list:
                return
            try:
                self.db.execute_many(sql, arg_list)
                self.db.commit()
            except Exception as e:
                raise UpdateLogThreadException(e) from e
            arg_list.clear()

        for file in files:
            for line, _is_last in stop_iter(reverse_read_lines(file)):
                try:
                    line = line.strip("\r\n")
                    log_tuple = log_regx.search(line) if line else None
                    # Check the match before touching its groups.
                    if log_tuple is None:
                        continue
                    current_time = datetime.strptime(
                        log_tuple.group('time_local'), self.time_local_pattern)
                    if current_time <= latest_time:
                        # Nothing later in this iteration is inserted, so
                        # there is no reason to keep scanning the file.
                        break

                    # Keep only the matched groups that are stored as columns.
                    single_line_arg = [
                        log_tuple.group(group_name)
                        for group_name in log_regx.groupindex
                        if group_name in self.acceptable_group_name
                    ]

                    # Convert the log timestamp to the database time format so
                    # it can be inserted and used with database time functions.
                    single_line_arg[self.acceptable_group_name.index(
                        'time_local')] = log2db_time(
                            log_tuple.group('time_local'),
                            log_str=self.time_local_pattern)
                except Exception as e:
                    raise UpdateLogThreadException(e) from e

                arg_list.append(single_line_arg)
                if len(arg_list) >= batch_size:
                    flush()

            # Flushing after the loop covers every way it can end: the
            # cut-off line, a trailing blank line, or plain exhaustion.
            flush()

    def update_thread_func(self):
        """
        Polling loop that feeds queued log files to ``update_logs``.

        Runs until ``self.update_thread_stop_flag`` is set.  Whenever
        ``self.file_queue`` holds an entry it is taken and passed to
        ``update_logs`` together with ``self.log_regx`` and the INSERT
        statement; otherwise the loop sleeps for one second before checking
        again.
        """
        columns = ",".join(self.acceptable_group_name)
        placeholders = ",".join(["?"] * len(self.acceptable_group_name))
        sql = "INSERT INTO %s (%s) VALUES (%s)" % (
            self.table_name, columns, placeholders)

        while not self.update_thread_stop_flag:
            changes = None
            try:
                if not self.file_queue.empty():
                    changes = self.file_queue.get(timeout=100)
            except Empty:
                # The queue was drained between empty() and get().
                sleep(1)
                continue

            if changes is None:
                sleep(1)
                continue
            self.update_logs(changes, self.log_regx, sql)

    def get_time_condition(self,
                           when: str,
                           current_flag=False,
                           time_change=False) -> str:
        """
        Build the SQL WHERE fragment restricting ``time_local`` to one period.

        :param when: 'hour', 'day', 'week', 'month' or 'year'
            (case-insensitive)
        :param current_flag: True selects the period that contains
            ``self.time`` ("current"); False selects the period one unit
            earlier ("last")
        :param time_change: when True, ``self.time`` is moved back by one unit
        :return: the condition, with a leading and a trailing space
        :raises ValueError: if ``when`` is not a supported period
        """
        when = when.lower()
        one_unit_back = {
            'hour': relativedelta(hours=-1),
            'day': relativedelta(days=-1),
            'week': relativedelta(weeks=-1),
            'month': relativedelta(months=-1),
            'year': relativedelta(years=-1),
        }
        if when not in one_unit_back:
            # Fail before touching self.time; previously an unknown period
            # combined with time_change=True left self.time set to None.
            raise ValueError("unsupported period: %r" % (when,))

        t_w = self.time + one_unit_back[when]

        # Previously a condition was only built when current_flag was set, so
        # the default call ended in ``None + " "`` (TypeError).  "Last" now
        # uses the same template anchored one unit earlier.
        anchor = self.time if current_flag else t_w

        if when == 'hour':
            time_condition_sql = " DATE(time_local) = '" + anchor.strftime("%Y-%m-%d") \
                                 + "' AND HOUR(time_local) = " + anchor.strftime("%H")
        elif when == 'day':
            time_condition_sql = " DATE(time_local) = '" + anchor.strftime(
                "%Y-%m-%d") + "'"
        elif when == 'week':
            # NOTE(review): assumes the database's WEEK() numbering matches
            # strftime("%W") (Monday-based, 00-53) -- TODO confirm.
            time_condition_sql = " YEAR(time_local) = " + anchor.strftime(
                "%Y") + " and  WEEK(time_local) =  " + anchor.strftime("%W")
        elif when == 'month':
            time_condition_sql = " YEAR(time_local) = " + anchor.strftime(
                "%Y") + " and MONTH(time_local) = " + anchor.strftime("%m")
        else:  # 'year'
            time_condition_sql = " YEAR(time_local) = " + anchor.strftime("%Y")

        if time_change:
            self.time = t_w

        return time_condition_sql + " "

    def get_date_list(self,
                      when: str,
                      d_list: list,
                      time: datetime,
                      d_format_list=None):
        """
        Append the component of ``time`` that matches ``when`` to ``d_list``.

        :param when: 'year' collects the month, 'month' and 'week' the day,
            'day' the hour and 'hour' the minute; any other value appends
            nothing
        :param d_list: list receiving the numeric component (modified in place)
        :param time: a ``datetime``; other values are first converted with
            ``db_time2datetime``
        :param d_format_list: optional list receiving the formatted label for
            the same point in time (modified in place)
        """
        if not isinstance(time, datetime):
            time = db_time2datetime(time)

        # granularity -> (datetime attribute to collect, label format)
        layout = {
            'year': ('month', "%Y-%m"),
            'month': ('day', "%Y-%m-%d"),
            'week': ('day', "%Y-%m-%d"),
            'day': ('hour', "%Y-%m-%d %H:00:00"),
            'hour': ('minute', "%H:%M:00"),
        }
        spec = layout.get(when)
        if spec is None:
            return

        attribute, label_format = spec
        d_list.append(getattr(time, attribute))
        if d_format_list is not None:
            d_format_list.append(time.strftime(label_format))

    def check_model_cache_exist(self):
        """
        Tell whether a non-empty cached model file is present.

        :return: True when ``analog/cache/model.pkl`` under ``self.path``
            exists with a non-zero size, False when it is missing or empty
        """
        cache_file = os.path.join(self.path, 'analog/cache/model.pkl')
        try:
            size = os.path.getsize(cache_file)
        except FileNotFoundError:
            return False
        return size != 0

    def get_train_progress(self):
        """
        Return the latest progress value reported by the training process.

        Drains ``self.progress_queue``, remembering the last value read in
        ``self.train_progress``.  Returns 0 when there is no queue, or when
        the queue is empty and no progress has been recorded yet.  Reading a
        ``None`` or empty-string entry stops the draining and returns the
        progress recorded so far.
        """
        while self.progress_queue:
            if self.progress_queue.empty():
                recorded = self.train_progress
                return recorded if recorded is not None else 0
            latest = self.progress_queue.get_nowait()
            if latest is None or latest == '':
                return self.train_progress
            self.train_progress = latest
        return 0

    def set_time(self,
                 year=None,
                 month=None,
                 day=None,
                 hour=None,
                 minute=0,
                 second=0) -> datetime:
        """
        Replace ``self.time`` with a new ``datetime`` and return it.

        :param year: new year; None keeps the year of the current ``self.time``
        :param month: new month; None keeps the current month
        :param day: new day; None keeps the current day
        :param hour: new hour; None keeps the current hour
        :param minute: new minute, 0 unless given
        :param second: new second, 0 unless given
        :return: the newly stored ``self.time``
        """

        def pick(explicit, field):
            # self.time is consulted only for fields the caller left unset.
            if explicit is not None:
                return explicit
            return getattr(self.time, field)

        self.time = datetime(pick(year, 'year'),
                             pick(month, 'month'),
                             pick(day, 'day'),
                             pick(hour, 'hour'),
                             minute,
                             second)
        return self.time

    def test(self):
        """
        Evaluate the loaded model on the white and black test sample sets.

        Reads both sample files with ``read_by_group``, vectorises them with
        ``self.tfidfVector``, predicts with ``self.model`` and computes
        precision, recall and F1 (a prediction of 1 is counted as "normal",
        -1 as "malicious").  The per-sample verdicts are written to
        ``white_test.txt`` and ``black_test.txt`` under ``analog/log`` and the
        scores are printed through ``self.output``.
        """
        test_white_path = os.path.join(self.path,
                                       "analog/sample_set/test_white_log.txt")
        test_black_path = os.path.join(self.path,
                                       "analog/sample_set/test_black_log.txt")
        pattern = self.config.get('log', 'log_content_pattern')
        white_example = []
        black_example = []
        read_by_group(test_white_path, white_example, pattern=pattern)
        read_by_group(test_black_path, black_example, pattern=pattern)

        w_tf_idf_vector = self.tfidfVector.transform(white_example)
        b_tf_idf_vector = self.tfidfVector.transform(black_example)

        y1 = self.model.predict(w_tf_idf_vector)
        white_labels = y1.tolist()
        TP = white_labels.count(1)  # True positive
        FN = white_labels.count(-1)  # False negative
        y2 = self.model.predict(b_tf_idf_vector)
        FP = y2.tolist().count(1)  # False positive

        Precision = 0 if TP == 0 else (TP / (TP + FP))
        Recall = 0 if TP == 0 else (TP / (TP + FN))
        if Precision == 0 or Recall == 0:
            # Without any true positive both scores are 0 and the harmonic
            # mean would divide by zero.
            F1_score = 0
        else:
            F1_score = 2 * Precision * Recall / (Precision + Recall)

        test_log_path = os.path.join(self.path, "analog/log")

        def write_report(file_name, examples, predictions):
            # One line per sample: the sample text followed by its verdict.
            with open(os.path.join(test_log_path, file_name),
                      "w",
                      encoding='utf-8') as file:
                for example, prediction in zip(examples, predictions):
                    file.write(example + " => " +
                               ("正常" if prediction == 1 else "恶意") + "请求\n")

        write_report("white_test.txt", white_example, y1)
        write_report("black_test.txt", black_example, y2)

        self.output.print_info("Test Done.")
        self.output.print_info("Precision:%s%%  Recall:%s%%  F1_score:%s  " %
                               (Precision * 100, Recall * 100, F1_score))

    def help(self):
        """
        Print the command overview of the console.

        The text is assembled as ``(style class, text)`` fragments, wrapped in
        ``FormattedText`` and printed with ``self.style``.
        """
        margin = ' ' * 4
        column_width = 50

        def title(text):
            return ('class:help_title', text)

        def command(text):
            # Command column: indented and padded so the descriptions line up.
            return ('class:yellow', margin + text.ljust(column_width))

        def note(text):
            return ('class:white', text)

        fragments = [
            title('\nUsage:\n'),
            ('class:green', margin + 'show  '),
            ('class:yellow', '<statistics|analysis|log>  '),
            ('class:yellow', '<IP|requests|UA|url>  '),
            ('class:green', '<current>  '),
            ('class:yellow', '<day|week|month|year|all>  '),
            ('class:yellow', '(top N)\n'),
            title('\nExample:\n'),
            command('show statistics requests current day'),
            note('Draw a chart to show statistics of website visit\n'),
            command('show statistics url last week top 10'),
            note('Draw a chart to show statistics of requests \n'),
            command('show analysis current day'),
            note('Display log analysis using abnormal detection model.\n'),
            command('show log current day'),
            note('Display the log in a table.\n'),
            title('\nMore:\n'),
            command('train|retrain'),
            note('Train your model\n'),
            command('get progress'),
            note('Get progress of training model\n'),
            command('get time|date|offset'),
            note('Display values\n'),
            command('set date 2019/8/3 '),
            note('Set date\n'),
            command('set day|month|year|offset N'),
            note('Set values\n'),
            command('get <values>'),
            note('Get values\n'),
            title('\nMore information:\n'),
            ('class:blue', margin + '<https://analog.testzero-wz.com>\n'),
            ('class:blue', margin + '<https://github.com/Testzero-wz>\n'),
        ]
        print_formatted_text(FormattedText(fragments), style=self.style)

    def add_command_help(self):
        """
        Print the usage of the ``add`` command in its long and short forms.

        The text is assembled as ``(style class, text)`` fragments, wrapped in
        ``FormattedText`` and printed with ``self.style``.
        """
        margin = ' ' * 4
        # (section heading, accepted unit names) for the two spellings.
        variants = (
            ('\nUsage:\n', '<hour|day|week|month|year|offset>  '),
            ('\nOr:\n', '<h|d|w|m|y|o>  '),
        )
        fragments = []
        for heading, units in variants:
            fragments.extend([
                ('class:help_title', heading),
                ('class:green', margin + 'add '),
                ('class:yellow', units),
                ('class:yellow', 'N\n '),
            ])
        print_formatted_text(FormattedText(fragments), style=self.style)

コード例 #3

ファイルを表示

    def get_model(self, queue=None):
        """
        Grid-search OneClassSVM parameters and cache the best model.

        For every (gamma, nu) combination of the grid a ``OneClassSVM`` is
        fitted on ``TfidfVector.fit_vector`` and scored on the white (normal)
        and black (abnormal) sample sets; precision, recall and F1 of each
        candidate are written to the training log.  Afterwards a model using
        the parameters with the highest F1 is fitted, pickled to
        ``analog/cache/model.pkl`` under ``self.root_path`` and
        ``self.complete`` is set to True.

        :param queue: optional queue receiving progress as strings: the
            processed fraction formatted with four decimals after each grid
            point, and "1" once the search is finished.  With None no
            progress is reported.
        """

        def report(progress):
            # ``queue`` defaults to None, so reporting has to be optional.
            if queue is not None:
                queue.put_nowait(progress)

        start = datetime.now()
        # Since logger is not pickleable until python 3.7,
        # we can init logger within this function.
        train_logger = Logger(logger_name="train_logger",
                              log_path=self.log_path)

        train_logger.register_log_function("calc", "CALCU")
        train_logger.register_log_function("split", "SPLIT")
        train_logger.register_log_function("start", "START")
        train_logger.register_log_function("end", "-END-")
        train_logger.register_log_function("result", "RESULT")
        train_logger.start("Start Training.")
        train_example = []
        white_example = []
        black_example = []

        pattern = self.config.get(self.section_name_log, 'log_content_pattern')
        # Read the training set.
        # NOTE(review): train_example is filled but not used below; the
        # training vectors come from tf_idf_vector.fit_vector.
        read_by_group(self.train_log_path, train_example, pattern=pattern)

        # Read the black (abnormal) sample set.
        read_by_group(self.test_black_path, black_example, pattern=pattern)

        # Read the white (normal) sample set.
        read_by_group(self.test_white_path, white_example, pattern=pattern)

        # Feature vectors of the training samples.
        tf_idf_vector = TfidfVector(self.root_path, self.config)
        train_vector = tf_idf_vector.fit_vector

        # Feature vectors of the white/black samples.
        test_normal_vector = tf_idf_vector.transform(white_example)
        test_abnormal_vector = tf_idf_vector.transform(black_example)
        # test_param_x = tf_idf_vector.transform(white_example + black_example)
        # test_param_y = [1] * len(white_example) + [-1] * len(black_example)
        # ===================== Grid of nu and gamma values to try =====================
        grid = {'gamma': np.logspace(-9, 1, 10),
                'nu': np.linspace(0.00001, 0.2, 100)}
        param_grid = ParameterGrid(grid)
        # ============== Alternative: GridSearchCV over nu and gamma ==============
        # scores = "f1"
        # clf = GridSearchCV(OneClassSVM(), grid, scoring=scores)
        # clf.fit(test_param_x, test_param_y)
        # =========================================================================

        # Kernel function (rbf, linear, poly)
        kernel = 'rbf'

        # Best precision, recall and F1 seen so far.
        max_F1 = 0
        max_Re = 0
        max_Pr = 0

        # gamma at the best precision, recall and F1.
        gamma_r_F1 = 0.01
        gamma_r_Re = 0.01
        gamma_r_Pr = 0.01

        # nu at the best precision, recall and F1.
        nu_r_F1 = 0
        nu_r_Re = 0
        nu_r_Pr = 0

        svdd = OneClassSVM(kernel=kernel)
        zero_count = 0
        re_gamma = 0

        total_loop = len(param_grid)
        process_count = 0
        for z in param_grid:
            process_count += 1
            report("{:0.4f}".format(process_count / total_loop))
            # Once six grid points with this gamma have scored zero, the
            # remaining nu values for the same gamma are skipped.
            if re_gamma == z.get('gamma'):
                if zero_count >= 6:
                    continue
            else:
                zero_count = 0
            svdd = OneClassSVM(**z)
            svdd.fit(train_vector)
            k = svdd.get_params()

            # Score on the normal samples.
            normal_labels = svdd.predict(test_normal_vector).tolist()
            TP = normal_labels.count(1)  # True positive
            FN = normal_labels.count(-1)  # False negative

            # Score on the abnormal samples.
            abnormal_labels = svdd.predict(test_abnormal_vector).tolist()
            FP = abnormal_labels.count(1)  # False positive

            Precision = 0 if TP == 0 else (TP / (TP + FP))  # Precision
            Recall = 0 if TP == 0 else (TP / (TP + FN))  # Recall
            if Recall == 0 or Precision == 0:
                F1_score = 0
                zero_count += 1
                re_gamma = k.get('gamma')
            else:
                F1_score = 2 * Precision * Recall / (Precision + Recall)  # F1 value

            if F1_score > max_F1:
                max_F1 = F1_score
                nu_r_F1 = k.get('nu')
                gamma_r_F1 = k.get('gamma')

            if Recall > max_Re:
                max_Re = Recall
                nu_r_Re = k.get('nu')
                gamma_r_Re = k.get('gamma')

            if Precision > max_Pr:
                max_Pr = Precision
                nu_r_Pr = k.get('nu')
                gamma_r_Pr = k.get('gamma')

            train_logger.split("=" * 60)
            train_logger.calc("nu: %.08f , gamma: %.04f" % (k.get('nu'), k.get('gamma')))
            train_logger.calc("precision: {}%".format(Precision * 100))
            train_logger.calc("recall: {}%".format(Recall * 100))
            train_logger.calc("f1 score: {}".format(F1_score))

        train_logger.split("=" * 60)
        train_logger.result(
            "MAX Precision:{:^20.6f}When Current nu: {:^20.6f} and gamma: {:0.8f}".format(max_Pr, nu_r_Pr,
                                                                                          gamma_r_Pr))
        train_logger.result(
            "MAX Recall:   {:^20.6f}When Current nu: {:^20.6f} and gamma: {:0.8f}".format(max_Re, nu_r_Re,
                                                                                          gamma_r_Re))
        train_logger.result(
            "MAX F1:       {:^20.6f}When Current nu: {:^20.6f} and gamma: {:0.8f}".format(max_F1, nu_r_F1,
                                                                                          gamma_r_F1))
        total_second = datetime.now() - start
        train_logger.end("Cost {}s.".format(total_second.total_seconds()))
        report("1")

        # Fit the final model before opening the cache file: opening with
        # 'wb' truncates it, so a failing fit would otherwise destroy a
        # previously cached model.
        # NOTE(review): if no grid point reached F1 > 0, nu_r_F1 is still 0,
        # which OneClassSVM presumably rejects -- confirm.
        # svdd = OneClassSVM(**clf.best_params_)
        svdd.set_params(kernel=kernel, nu=nu_r_F1, gamma=gamma_r_F1)
        svdd.fit(train_vector)
        with open(os.path.join(self.root_path, "analog/cache/model.pkl"), 'wb') as file:
            pickle.dump(svdd, file)
        self.complete = True