Example #1
def test_best_f_score():
    fill_step = 60
    use_plt = True
    src_threshold_value, test_scores, real_test_labels, real_test_missing = get_test_data()
    threshold_value, catch_num, catch_index, f_score, fp_index, fp_num, tp_index, tp_num, fn_index, fn_num, precision, recall \
        = catch_label_v2(use_plt, src_threshold_value, test_scores, real_test_labels, real_test_missing)
    print_text(use_plt, "捕捉到的异常数:{}".format(catch_num))
    print_text(use_plt, "默认阈值:{},最佳F分值:{},精度:{},召回率:{}".format(round(threshold_value, 7), f_score, precision, recall))
    tp_interval_num, tp_interval_str = get_constant_timestamp(tp_index, fill_step)
    print_text(use_plt, "【TP】成功监测出的异常点(数量:{}):\n 共有{}段连续 \n 具体为{}".format(tp_num, tp_interval_num, tp_interval_str))
    fp_interval_num, fp_interval_str = get_constant_timestamp(fp_index, fill_step)
    print_text(use_plt, "【FP】未标记但超过阈值的点(数量:{}):\n 共有{}段连续 \n 具体为{}".format(fp_num, fp_interval_num, fp_interval_str))
    fn_interval_num, fn_interval_str = get_constant_timestamp(fn_index, fill_step)
    print_text(use_plt, "【FN】漏报异常点(数量:{}):\n 共有{}段连续 \n 具体为{}".format(fn_num, fn_interval_num, fn_interval_str))
Example #2
    def fit(self,
            use_plt,
            train_values,
            train_labels,
            train_missing,
            test_values,
            test_labels,
            test_missing,
            train_mean,
            train_std,
            valid_num,
            excludes=None,
            summary_dir=None):
        """
        根据所给数据训练:class:`Donut`模型

        Args:
            use_plt: 展示方式
            valid_num: 测试数据数量
            test_missing: 测试数据缺失值
            test_labels: 测试数据异常标注
            test_values: 测试数据缺失值
            train_values (np.ndarray):一维32位浮点数组,标准化的KPI数据
            train_labels (np.ndarray):一维32位整型数组,异常标签
            train_missing (np.ndarray):一维32位数组,指出缺失点
            train_mean (float):标准化之前的平均值
            train_std (float):标准化之前的标准差
            excludes (np.ndarray):一维布尔数组,表明是否包含该点,如果包含,任何包含该点的窗口都包含在内(default :obj:`None`,没有点包含)
            summary_dir (str)::class:`tf.summary.FileWriter`的可选的概要目录。(default :obj:`None`,无目录)
        """
        # Get the default session
        sess = get_default_session_or_error()
        # Convert the training arrays
        train_values = np.asarray(train_values, dtype=np.float32)
        train_labels = np.asarray(train_labels, dtype=np.int32)
        train_missing = np.asarray(train_missing, dtype=np.int32)
        # The arrays must be one-dimensional
        if len(train_values.shape) != 1:
            raise ValueError('`values` must be a 1-D array')
        # The labels must have the same shape as the values
        if train_labels.shape != train_values.shape:
            raise ValueError('The shape of `labels` must match the shape of `values` ({} vs {})'.format(
                train_labels.shape, train_values.shape))
        # The missing indicators must have the same shape as the values
        if train_missing.shape != train_values.shape:
            raise ValueError('The shape of `missing` must match the shape of `values` ({} vs {})'.format(
                train_missing.shape, train_values.shape))
        v_y = np.logical_or(test_labels, test_missing).astype(np.int32)
        if excludes is None:
            train_excludes, valid_excludes = None, None
        else:
            train_excludes, valid_excludes = excludes[:-valid_num], excludes[-valid_num:]
        # Data augmentation object and sliding-window iterators
        aug = MissingDataInjection(train_mean, train_std,
                                   self._missing_data_injection_rate)
        train_sliding_window = BatchSlidingWindow(
            array_size=len(train_values),
            window_size=self.model.x_dims,
            batch_size=self._batch_size,
            excludes=train_excludes,
            shuffle=True,
            ignore_incomplete_batch=True,
        )
        valid_sliding_window = BatchSlidingWindow(
            array_size=len(test_values),
            window_size=self.model.x_dims,
            batch_size=self._valid_batch_size,
            excludes=valid_excludes,
        )

        # Initialize the trainer variables and verify them
        sess.run(self._trainer_initializer)
        ensure_variables_initialized(self._train_params)
        # Training loop
        lr = self._initial_lr
        epoch_list = []
        lr_list = []
        train_message = []
        tc = TimeCounter()
        with TrainLoop(param_vars=self._train_params,
                       early_stopping=True,
                       summary_dir=summary_dir,
                       max_epoch=self._max_epoch,
                       max_step=self._max_step) as loop:  # type: TrainLoop
            loop.print_training_summary()
            tc.start()
            for epoch in loop.iter_epochs():
                aug_values, aug_labels, aug_missing = aug.augment(
                    train_values, train_labels, train_missing)
                label_or_missing = np.logical_or(aug_labels,
                                                 aug_missing).astype(np.int32)
                train_iterator = train_sliding_window.get_iterator(
                    [aug_values, label_or_missing])
                for step, (batch_x,
                           batch_y) in loop.iter_steps(train_iterator):
                    # One training step
                    feed_dict_train = dict(six.iteritems(self._feed_dict))
                    feed_dict_train[self._learning_rate] = lr
                    feed_dict_train[self._input_x] = batch_x
                    feed_dict_train[self._input_y] = batch_y
                    loss, _ = sess.run([self._loss, self._train_op],
                                       feed_dict=feed_dict_train)
                    loop.collect_metrics({'loss': loss})
                    if step % self._valid_step_freq == 0:
                        # Collect summaries
                        if summary_dir is not None:
                            loop.add_summary(sess.run(self._summary_op))
                        # Run validation in batches
                        with loop.timeit('valid_time'), loop.metric_collector(
                                'valid_loss') as mc:
                            it = valid_sliding_window.get_iterator(
                                [test_values, v_y])
                            for b_x, b_y in it:
                                feed_dict_train = dict(
                                    six.iteritems(self._valid_feed_dict))
                                feed_dict_train[self._input_x] = b_x
                                feed_dict_train[self._input_y] = b_y
                                loss = sess.run(self._loss,
                                                feed_dict=feed_dict_train)
                                mc.collect(loss, weight=len(b_x))
                        # Print logs for the most recent steps
                        suffix, message = loop.print_logs(False)
                        train_message.append(suffix)
                        train_message.append(message)
                        print_text(use_plt, suffix)
                        print_text(use_plt, message)
                # Anneal the learning rate
                if self._lr_anneal_epochs and epoch % self._lr_anneal_epochs == 0:
                    lr *= self._lr_anneal_factor
                    loop.println('Learning rate decreased to {}'.format(lr),
                                 with_tag=True)
                    epoch_list.append(epoch)
                    lr_list.append(lr)
            tc.end()
        return epoch_list, lr_list, tc.get_s() + " seconds", train_message
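
For orientation, a minimal sketch of the batched sliding-window iteration the trainer relies on. It is a simplified stand-in for the project's BatchSlidingWindow (shuffling and the excludes mask are omitted) and shows only how windows are cut and batched:

import numpy as np

def iter_sliding_batches(arrays, window_size, batch_size):
    # Every window ends at some index `end`; the first valid end index
    # is window_size - 1.
    ends = np.arange(window_size - 1, len(arrays[0]))
    for i in range(0, len(ends), batch_size):
        batch_ends = ends[i:i + batch_size]
        # Stack the windows of each array (e.g. values and labels) into
        # a (batch_size, window_size) matrix.
        yield tuple(
            np.stack([a[e - window_size + 1:e + 1] for e in batch_ends])
            for a in arrays)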
Example #3
from donut.assessment import Assessment
from donut.cache import get_test_data
from donut.util.out.out import print_text
from donut.util.time_util import get_constant_timestamp

fill_step = 60
use_plt = True
a = 1
src_threshold_value, test_scores, real_test_labels, real_test_missing = get_test_data()
assessment = Assessment(src_threshold_value, test_scores, real_test_labels,
                        real_test_missing, a, use_plt)
threshold_value, f_score, catch_num, catch_index, fp_index, fp_num, tp_index, tp_num, fn_index, fn_num, precision, recall \
    = assessment.get_assessment()
print_text(use_plt, "捕捉到的异常数:{}".format(catch_num))
print_text(
    use_plt, "默认阈值:{},最佳F分值:{},精度:{},召回率:{}".format(round(threshold_value,
                                                          7), f_score,
                                                    precision, recall))
tp_interval_num, tp_interval_str = get_constant_timestamp(tp_index, fill_step)
print_text(
    use_plt,
    "【TP】成功监测出的异常点(数量:{}):\n 共有{}段连续  具体为{}".format(tp_num, tp_interval_num,
                                                    tp_interval_str))
fp_interval_num, fp_interval_str = get_constant_timestamp(fp_index, fill_step)
print_text(
    use_plt,
    "【FP】未标记但超过阈值的点(数量:{}):\n 共有{}段连续  具体为{}".format(fp_num, fp_interval_num,
                                                     fp_interval_str))
fn_interval_num, fn_interval_str = get_constant_timestamp(fn_index, fill_step)
print_text(
Example #4
def save_data_cache(
        use_plt, is_local, file_name, test_portion, src_threshold_value,
        src_timestamps, src_labels, src_values, src_data_num, src_label_num,
        src_label_proportion, first_time, fill_timestamps, fill_values,
        fill_data_num, fill_step, fill_num, second_time, third_time,
        train_data_num, train_label_num, train_label_proportion, test_data_num,
        test_label_num, test_label_proportion, train_mean, train_std,
        forth_time, epoch_list, lr_list, epoch_time, fifth_time, catch_num,
        labels_num, accuracy, special_anomaly_num, interval_num, interval_str,
        special_anomaly_t, special_anomaly_v, special_anomaly_s,
        test_timestamps, test_values, test_scores, model_time, trainer_time,
        predictor_time, fit_time, probability_time, threshold_value,
        train_message, train_timestamps, train_values, t_use, t_name,
        src_train_values, src_test_values):
    """
    保存缓存对象
    Args:
        src_test_values: 标准化前的测试数据
        src_train_values: 标准前的训练数据
        is_local: 展示本地图片
        t_name: 用时排序名称
        t_use: 用时排序用时
        train_values: 训练数据值
        train_timestamps: 训练时间戳
        train_message: 训练信息
        threshold_value: 阈值
        use_plt: 展示方式
        file_name: 文件名
        test_portion: 测试数据比例
        src_threshold_value: 初始阈值
        src_timestamps: 初始时间戳数据
        src_labels: 初始异常标签
        src_values: 初始值
        src_data_num: 初始数据数量
        src_label_num: 初始异常标签数量
        src_label_proportion: 初始异常标签所占比例
        first_time: 第一阶段使用时间
        fill_timestamps: 排序并填充的时间戳
        fill_values: 排序并填充的值
        fill_data_num: 填充后的数据数量
        fill_step: 填充后时间戳步长
        fill_num: 填充的数据的数量
        second_time: 第二阶段的用时
        third_time: 第三阶段用时
        train_data_num: 训练数据数量
        train_label_num: 训练数据标签数量
        train_label_proportion: 训练数据中的异常标签比例
        test_data_num: 测试数据数量
        test_label_num: 测试数据中异常标签数量
        test_label_proportion: 测试数据中异常标签比例
        train_mean: 平均值
        train_std: 标准差
        forth_time: 第四阶段用时
        epoch_list: 迭代遍数
        lr_list: 学习率
        epoch_time: 迭代时间
        fifth_time: 第五阶段用时
        catch_num: 根据阈值捕捉到的数量
        labels_num: 异常标注数量
        accuracy: 异常标注精确率
        special_anomaly_num: 被捕捉到的异常数量
        interval_num: 连续的异常段数量
        interval_str: 连续异常段字符串
        special_anomaly_t: 被捕捉到的异常的时间戳
        special_anomaly_v: 被捕捉到的异常的值
        special_anomaly_s: 被捕捉到的异常的分数
        test_timestamps: 测试数据时间戳
        test_values: 测试数据值
        test_scores: 测试数据分数
        model_time: 构建模型用时
        trainer_time: 构建训练器用时
        predictor_time: 构建预测期用时
        fit_time: 训练时长
        probability_time: 获得重构概率用时
    """
    tc = TimeCounter()
    print_text(use_plt, "缓存开始")
    tc.start()
    name = file_name_converter(file_name, test_portion, src_threshold_value,
                               is_local)
    db = shelve.open(name)
    db["src_timestamps"] = src_timestamps
    db["src_labels"] = src_labels
    db["src_values"] = src_values
    db["src_data_num"] = src_data_num
    db["src_label_num"] = src_label_num
    db["src_label_proportion"] = src_label_proportion
    db["first_time"] = first_time
    db["fill_timestamps"] = fill_timestamps
    db["fill_values"] = fill_values
    db["fill_data_num"] = fill_data_num
    db["fill_step"] = fill_step
    db["fill_num"] = fill_num
    db["second_time"] = second_time
    db["third_time"] = third_time
    db["train_data_num"] = train_data_num
    db["train_label_num"] = train_label_num
    db["train_label_proportion"] = train_label_proportion
    db["test_data_num"] = test_data_num
    db["test_label_num"] = test_label_num
    db["test_label_proportion"] = test_label_proportion
    db["train_mean"] = train_mean
    db["train_std"] = train_std
    db["forth_time"] = forth_time
    db["epoch_list"] = epoch_list
    db["lr_list"] = lr_list
    db["epoch_time"] = epoch_time
    db["fifth_time"] = fifth_time
    db["catch_num"] = catch_num
    db["labels_num"] = labels_num
    db["accuracy"] = accuracy
    db["special_anomaly_num"] = special_anomaly_num
    db["interval_num"] = interval_num
    db["interval_str"] = interval_str
    db["special_anomaly_t"] = special_anomaly_t
    db["special_anomaly_v"] = special_anomaly_v
    db["special_anomaly_s"] = special_anomaly_s
    db["src_threshold_value"] = src_threshold_value
    db["test_timestamps"] = test_timestamps
    db["test_values"] = test_values
    db["test_scores"] = test_scores
    db["model_time"] = model_time
    db["trainer_time"] = trainer_time
    db["predictor_time"] = predictor_time
    db["fit_time"] = fit_time
    db["probability_time"] = probability_time
    db["threshold_value"] = threshold_value
    db["train_message"] = train_message
    db["train_timestamps"] = train_timestamps
    db["train_values"] = train_values
    db["t_use"] = t_use
    db["t_name"] = t_name
    db["src_train_values"] = src_train_values
    db["src_test_values"] = src_test_values
    tc.end()
    print_info(use_plt, "缓存结束【共用时:{}】".format(tc.get_s() + "秒"))
    db.close()
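
The long run of db[...] assignments above can be written as one loop over keyword arguments; a minimal sketch (save_cache and its keyword interface are hypothetical helpers, not part of the project):

import shelve

def save_cache(path, **results):
    # The context manager guarantees the shelf is closed even if a write fails.
    with shelve.open(path) as db:
        for key, value in results.items():
            db[key] = value

# Hypothetical usage with a few of the keys cached above:
# save_cache(name, src_timestamps=src_timestamps, src_labels=src_labels,
#            train_mean=train_mean, train_std=train_std)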
Example #5
def gain_data_cache(use_plt, file_name, test_portion, src_threshold_value,
                    is_local):
    """
    获得缓存数据
    Args:
        is_local: 本地照片显示
        use_plt: 显示格式
        file_name: 数据文件名称
        test_portion: 测试数据比例
        src_threshold_value: 初始阈值

    Returns:
        test_portion: 测试数据比例
        src_threshold_value: 初始阈值
        src_timestamps: 初始时间戳数据
        src_labels: 初始异常标签
        src_values: 初始值
        src_data_num: 初始数据数量
        src_label_num: 初始异常标签数量
        src_label_proportion: 初始异常标签所占比例
        first_time: 第一阶段使用时间
        fill_timestamps: 排序并填充的时间戳
        fill_values: 排序并填充的值
        fill_data_num: 填充后的数据数量
        fill_step: 填充后时间戳步长
        fill_num: 填充的数据的数量
        second_time: 第二阶段的用时
        third_time: 第三阶段用时
        train_data_num: 训练数据数量
        train_label_num: 训练数据标签数量
        train_label_proportion: 训练数据中的异常标签比例
        test_data_num: 测试数据数量
        test_label_num: 测试数据中异常标签数量
        test_label_proportion: 测试数据中异常标签比例
        train_mean: 平均值
        train_std: 标准差
        forth_time: 第四阶段用时
        epoch_list: 迭代遍数
        lr_list: 学习率
        epoch_time: 迭代时间
        fifth_time: 第五阶段用时
        catch_num: 根据阈值捕捉到的数量
        labels_num: 异常标注数量
        accuracy: 异常标注精确率
        special_anomaly_num: 被捕捉到的异常数量
        interval_num: 连续的异常段数量
        interval_str: 连续异常段字符串
        special_anomaly_t: 被捕捉到的异常的时间戳
        special_anomaly_v: 被捕捉到的异常的值
        special_anomaly_s: 被捕捉到的异常的分数
        test_timestamps: 测试数据时间戳
        test_values: 测试数据值
        test_scores: 测试数据分数
        model_time: 构建模型用时
        trainer_time: 构建训练器用时
        predictor_time: 构建预测期用时
        fit_time: 训练时长
        probability_time: 获得重构概率用时
        threshold_value: 阈值
    """
    print_text(use_plt, "读取缓存开始")
    tc = TimeCounter()
    tc.start()
    name = file_name_converter(file_name, test_portion, src_threshold_value,
                               is_local)
    db = shelve.open(name)
    src_timestamps = db["src_timestamps"]
    src_labels = db["src_labels"]
    src_values = db["src_values"]
    src_data_num = db["src_data_num"]
    src_label_num = db["src_label_num"]
    src_label_proportion = db["src_label_proportion"]
    first_time = db["first_time"]
    fill_timestamps = db["fill_timestamps"]
    fill_values = db["fill_values"]
    fill_data_num = db["fill_data_num"]
    fill_step = db["fill_step"]
    fill_num = db["fill_num"]
    second_time = db["second_time"]
    third_time = db["third_time"]
    train_data_num = db["train_data_num"]
    train_label_num = db["train_label_num"]
    train_label_proportion = db["train_label_proportion"]
    test_data_num = db["test_data_num"]
    test_label_num = db["test_label_num"]
    test_label_proportion = db["test_label_proportion"]
    train_mean = db["train_mean"]
    train_std = db["train_std"]
    forth_time = db["forth_time"]
    epoch_list = db["epoch_list"]
    lr_list = db["lr_list"]
    epoch_time = db["epoch_time"]
    fifth_time = db["fifth_time"]
    catch_num = db["catch_num"]
    labels_num = db["labels_num"]
    accuracy = db["accuracy"]
    special_anomaly_num = db["special_anomaly_num"]
    interval_num = db["interval_num"]
    interval_str = db["interval_str"]
    special_anomaly_t = db["special_anomaly_t"]
    special_anomaly_v = db["special_anomaly_v"]
    special_anomaly_s = db["special_anomaly_s"]
    src_threshold_value = db["src_threshold_value"]
    test_timestamps = db["test_timestamps"]
    test_values = db["test_values"]
    test_scores = db["test_scores"]
    model_time = db["model_time"]
    trainer_time = db["trainer_time"]
    predictor_time = db["predictor_time"]
    threshold_value = db["threshold_value"]
    fit_time = db["fit_time"]
    probability_time = db["probability_time"]
    train_message = db["train_message"]
    train_timestamps = db["train_timestamps"]
    train_values = db["train_values"]
    t_use = db["t_use"]
    t_name = db["t_name"]
    src_train_values = db["src_train_values"]
    src_test_values = db["src_test_values"]
    tc.end()
    print_info(use_plt, "读取缓存数据结束【共用时:{}】".format(tc.get_s() + "秒"))
    db.close()
    return src_timestamps, src_labels, src_values, src_data_num, src_label_num, src_label_proportion, first_time, \
           fill_timestamps, fill_values, fill_data_num, fill_step, fill_num, second_time, third_time, \
           train_data_num, train_label_num, train_label_proportion, test_data_num, test_label_num, test_label_proportion, \
           train_mean, train_std, forth_time, epoch_list, lr_list, epoch_time, fifth_time, src_threshold_value, catch_num, \
           labels_num, accuracy, special_anomaly_num, interval_num, interval_str, special_anomaly_t, special_anomaly_v, \
           special_anomaly_s, test_timestamps, test_values, test_scores, model_time, trainer_time, predictor_time, \
           fit_time, probability_time, threshold_value, train_message, train_timestamps, train_values, t_use, t_name, \
           src_train_values, src_test_values
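
Symmetrically, the per-key reads above can be collapsed into one dict comprehension; a minimal sketch (load_cache is a hypothetical helper):

import shelve

def load_cache(path, keys):
    # Read a known set of keys back from the shelf into a dict.
    with shelve.open(path) as db:
        return {key: db[key] for key in keys}

# Hypothetical usage:
# cache = load_cache(name, ["src_timestamps", "src_labels", "train_mean"])
# src_timestamps = cache["src_timestamps"]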
Example #6
def train_prediction_v1(use_plt, train_values, train_labels, train_missing, test_values, test_missing, test_labels,
                        train_mean, train_std, valid_num):
    """
    训练与预测
    Args:
        test_labels: 测试数据异常标签
        valid_num: 测试数据数量
        use_plt: 使用plt输出
        train_values: 训练数据值
        train_labels: 训练数据异常标签
        train_missing: 训练数据缺失点
        test_values: 测试数据
        test_missing: 测试数据缺失点
        train_mean: 平均值
        train_std: 标准差

    Returns:
        refactor_probability:  重构概率
        epoch_list: 遍数列表
        lr_list: 学习率变化列表
        epoch_time: 遍数
        model_time: 构建Donut模型时间
        trainer_time: 构建训练器时间
        predictor_time: 构建预测期时间
        fit_time: 训练时间
        probability_time: 获得重构概率时间
    """
    # 1. Build the model
    tc = TimeCounter()
    tc.start()
    with tf.variable_scope('model') as model_vs:
        model = Donut(
            # Hidden network for `p(x|z)`
            hidden_net_p_x_z=Sequential([
                # units: output dimension of this layer.
                # kernel_regularizer: regularizer applied to the weights; L2 keeps the weights small with a mild penalty.
                # activation: activation function, ReLU
                K.layers.Dense(100, kernel_regularizer=K.regularizers.l2(0.001),
                               activation=tf.nn.relu),
                K.layers.Dense(100, kernel_regularizer=K.regularizers.l2(0.001),
                               activation=tf.nn.relu),
            ]),
            hidden_net_q_z_x=Sequential([
                K.layers.Dense(100, kernel_regularizer=K.regularizers.l2(0.001),
                               activation=tf.nn.relu),
                K.layers.Dense(100, kernel_regularizer=K.regularizers.l2(0.001),
                               activation=tf.nn.relu),
            ]),
            # number of x dimensions
            x_dims=120,
            # number of z dimensions
            z_dims=5,
        )
        tc.end()
        model_time = tc.get_s() + " seconds"
        print_info(use_plt, "5. Built the Donut model [total time {}]".format(model_time))
        # 2. Build the trainer
        tc.start()
        trainer = DonutTrainer(model=model, model_vs=model_vs)
        tc.end()
        trainer_time = tc.get_s() + " seconds"
        print_info(use_plt, "6. Built the trainer [total time {}]".format(trainer_time))
        # 3. Build the predictor
        tc.start()
        predictor = DonutPredictor(model)
        tc.end()
        predictor_time = tc.get_s() + " seconds"
        print_info(use_plt, "7. Built the predictor [total time {}]".format(predictor_time))
        with tf.Session().as_default():
            # 4. Train the model with the trainer
            tc.start()
            epoch_list, lr_list, epoch_time, train_message = \
                trainer.fit(use_plt, train_values, train_labels, train_missing, test_values, test_labels, test_missing,
                            train_mean, train_std, valid_num)
            tc.end()
            fit_time = tc.get_s() + " seconds"
            print_info(use_plt, "8. Trained the model with the trainer [total time {}]".format(fit_time))
            print_text(use_plt, "All epochs [total time: {}]".format(epoch_time))
            print_text(use_plt, "Annealed learning rate: learning rate vs. epoch")
            show_line_chart(use_plt, epoch_list, lr_list, 'annealing learning rate')
            # 5. Get the reconstruction probability from the predictor
            tc.start()
            refactor_probability = predictor.get_refactor_probability(test_values, test_missing)
            tc.end()
            probability_time = tc.get_s() + " seconds"
            print_info(use_plt, "9. Got the reconstruction probability from the predictor [total time {}]".format(probability_time))
            return refactor_probability, epoch_list, lr_list, epoch_time, model_time, trainer_time, predictor_time, fit_time, probability_time, train_message
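
Downstream, the reconstruction probabilities returned here are compared against a threshold to decide which points count as anomalies; in this project that logic lives in catch_label_v2 / Assessment (see Examples #1 and #3). A minimal sketch, assuming the scores are oriented so that larger values mean more anomalous:

import numpy as np

def catch_anomalies(scores, threshold):
    # Assumption: larger score = more anomalous. Returns the indices of points
    # whose score exceeds the threshold, and how many were caught.
    scores = np.asarray(scores)
    caught_index = np.where(scores > threshold)[0]
    return caught_index, len(caught_index)

# Hypothetical usage:
# caught_index, caught_num = catch_anomalies(test_scores, threshold_value)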