Example #1
    def test_fit_args(self):
        values, labels, missing, excludes = self._payload()

        tf.set_random_seed(1234)
        donut = Donut(
            h_for_p_x=lambda x: x, h_for_q_z=lambda x: x, x_dims=5,
            z_dims=3
        )
        trainer = DonutTrainer(donut, max_epoch=1)

        with self.test_session():
            # test no exclude
            trainer.fit(values=values, labels=labels, missing=missing,
                        mean=1., std=2.)

            # test shape error
            with pytest.raises(
                    ValueError, match='`values` must be a 1-D array'):
                trainer.fit(values=np.array([[1.]]), labels=labels,
                            missing=missing, mean=1., std=2.)
            with pytest.raises(
                    ValueError, match='The shape of `labels` does not agree '
                                      'with the shape of `values`'):
                trainer.fit(values=values, labels=labels[:-1],
                            missing=missing, mean=1., std=2.)
            with pytest.raises(
                    ValueError, match='The shape of `missing` does not agree '
                                      'with the shape of `values`'):
                trainer.fit(values=values, labels=labels,
                            missing=missing[:-1], mean=1., std=2.)
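
These tests (and the ones below) call a `self._payload()` helper that the snippets do not show; a minimal sketch of what it plausibly returns follows, where every name, shape, and rate is an assumption:

import numpy as np

def _payload(self):
    # hypothetical helper: four aligned 1-D arrays of equal length
    values = np.random.normal(size=100).astype(np.float32)
    labels = np.random.binomial(1, 0.05, size=100).astype(np.int32)
    missing = np.random.binomial(1, 0.05, size=100).astype(np.int32)
    excludes = np.logical_or(labels, missing)
    return values, labels, missing, excludes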
Example #2
    def test_fit(self):
        values, labels, missing, excludes = self._payload()

        with TemporaryDirectory() as tmpdir:
            tf.set_random_seed(1234)
            donut = Donut(
                h_for_p_x=lambda x: x, h_for_q_z=lambda x: x, x_dims=5,
                z_dims=3
            )
            trainer = DonutTrainer(
                donut, max_epoch=3, batch_size=7, valid_step_freq=50,
                lr_anneal_epochs=2
            )

            with self.test_session():
                trainer.fit(
                    values=values, labels=labels, missing=missing, mean=1.,
                    std=2., excludes=excludes, summary_dir=tmpdir
                )
Example #3
import csv
import time

import numpy as np
import tensorflow as tf
from tensorflow import keras as K
from tfsnippet.modules import Sequential

from donut import Donut, DonutTrainer, DonutPredictor

with tf.variable_scope('model') as model_vs:
    model = Donut(
        h_for_p_x=Sequential([
            K.layers.Dense(100,
                           kernel_regularizer=K.regularizers.l2(0.001),
                           activation=tf.nn.relu),
            K.layers.Dense(100,
                           kernel_regularizer=K.regularizers.l2(0.001),
                           activation=tf.nn.relu),
        ]),
        h_for_q_z=Sequential([
            K.layers.Dense(100,
                           kernel_regularizer=K.regularizers.l2(0.001),
                           activation=tf.nn.relu),
            K.layers.Dense(100,
                           kernel_regularizer=K.regularizers.l2(0.001),
                           activation=tf.nn.relu),
        ]),
        x_dims=120,
        z_dims=5,
    )

trainer = DonutTrainer(model=model, model_vs=model_vs, max_epoch=300)
predictor = DonutPredictor(model)

with tf.Session().as_default():
    trainer.fit(train_values, train_labels, train_missing, mean, std)

    print('Testing size:', np.shape(test_values))
    start_time = time.time()
    test_score = predictor.get_score(test_values, test_missing)
    end_time = time.time()
    print('time consumed:', end_time - start_time)

    writer = csv.writer(open('donut_result.csv', 'w', newline=''))
    print(len(test_labels), len(test_score), len(test_values))
    for i in range(len(test_score)):
        # the row contents are truncated in the original snippet; pairing the
        # index with its score is one plausible completion
        writer.writerow([i, test_score[i]])
Example #4
    with tf.variable_scope('model') as model_vs:
        model = Donut(
            h_for_p_x=Sequential([
                K.layers.Dense(100,
                               kernel_regularizer=K.regularizers.l2(0.001),
                               activation=tf.nn.relu),
                K.layers.Dense(100,
                               kernel_regularizer=K.regularizers.l2(0.001),
                               activation=tf.nn.relu),
            ]),
            h_for_q_z=Sequential([
                K.layers.Dense(100,
                               kernel_regularizer=K.regularizers.l2(0.001),
                               activation=tf.nn.relu),
                K.layers.Dense(100,
                               kernel_regularizer=K.regularizers.l2(0.001),
                               activation=tf.nn.relu),
            ]),
            x_dims=120,
            z_dims=5,
        )

    # Train the Donut model, then use the trained model for prediction
    trainer = DonutTrainer(model=model, model_vs=model_vs)
    predictor = DonutPredictor(model)

    with tf.Session().as_default():
        # The one-off training & saving path, kept commented for reference:
        # trainer.fit(train_values, train_labels, train_missing, mean, std)
        # var_dict = get_variables_as_dict(model_vs)
        # saver = VariableSaver(var_dict, "donut_without_label_2.ckpt")
        # saver.save()

        # Restore variables from the checkpoint directory.
        saver = VariableSaver(get_variables_as_dict(model_vs),
                              "donut_without_label_2.ckpt")
        saver.restore()
        test_score = predictor.get_score(test_values, test_missing)
        # scores start at the 120th point (x_dims == 120), hence the offset
        result = np.array([test_labels[119:], test_score])
        # the remaining arguments are truncated in the original snippet;
        # writing the array as comma-separated text is one plausible completion
        np.savetxt('result_arti_sin2.csv', result, delimiter=',')
Example #5
    def test_construction_args(self):
        values, labels, missing, excludes = self._payload()

        tf.set_random_seed(1234)
        donut = Donut(h_for_p_x=lambda x: x,
                      h_for_q_z=lambda x: x,
                      x_dims=5,
                      z_dims=3)

        # test feed_dict
        is_training = tf.placeholder(tf.bool, ())
        trainer = DonutTrainer(donut,
                               max_epoch=1,
                               feed_dict={is_training: True})
        with self.test_session():
            trainer.fit(values=values,
                        labels=labels,
                        missing=missing,
                        mean=1.,
                        std=2.,
                        excludes=excludes)

        # test valid_feed_dict
        trainer = DonutTrainer(donut,
                               max_epoch=1,
                               valid_feed_dict={is_training: True})
        with self.test_session():
            trainer.fit(values=values,
                        labels=labels,
                        missing=missing,
                        mean=1.,
                        std=2.,
                        excludes=excludes)

        # test max_epoch is None and max_step is None
        with pytest.raises(ValueError,
                           match='At least one of `max_epoch` and `max_step` '
                           'should be specified'):
            _ = DonutTrainer(donut, max_epoch=None, max_step=None)

        # test optimizer and optimizer_params
        trainer = DonutTrainer(donut,
                               max_epoch=1,
                               optimizer=tf.train.MomentumOptimizer,
                               optimizer_params={'momentum': 0.01})
        with self.test_session():
            trainer.fit(values=values,
                        labels=labels,
                        missing=missing,
                        mean=1.,
                        std=2.,
                        excludes=excludes)
Example #6
def donut_test(src_dir, output_dir, file, batch):
    """Train and evaluate Donut on one KPI file, appending the metrics to a
    shared performance CSV.  `src_dir` and `output_dir` are expected to end
    with a path separator, since all paths are built by concatenation."""
    if os.path.exists(output_dir + "performance-donut-" + str(batch) + ".csv"):
        perform = pd.read_csv(output_dir + "performance-donut-" + str(batch) +
                              ".csv")
    else:
        perform = pd.DataFrame({
            "file": [],
            "storage": [],
            "train-time": [],
            "codisp-time": [],
            "test-time": [],
            "precision": [],
            "recall": [],
            "best-F1": [],
            "best-threshold": []
        })
    perform = perform.append([{
        'file': file,
        "storage": 0.0,
        "train-time": 0.0,
        "codisp-time": 0.0,
        "test-time": 0.0,
        "precision": 0.0,
        "recall": 0.0,
        "best-F1": 0.0,
        "best-threshold": 0.0
    }],
                             ignore_index=True)
    perform.index = perform["file"]

    data = pd.read_csv(src_dir + file)
    timestamp, value, labels = data["timestamp"], data["value"], data[
        "anomaly"]
    missing = np.zeros(len(timestamp))

    test_portion = 0.5
    test_n = int(len(value) * test_portion)
    train_values, test_values = value[:-test_n], value[-test_n:]
    train_labels, test_labels = labels[:-test_n], labels[-test_n:]
    train_time, test_time = timestamp[:-test_n], timestamp[-test_n:]
    train_missing, test_missing = missing[:-test_n], missing[-test_n:]

    train_values, mean, std = standardize_kpi(train_values,
                                              excludes=np.logical_or(
                                                  train_labels, train_missing))
    test_values, _, _ = standardize_kpi(test_values, mean=mean, std=std)

    with tf.variable_scope('model') as model_vs:
        model = Donut(
            h_for_p_x=Sequential([
                K.layers.Dense(100,
                               kernel_regularizer=K.regularizers.l2(0.001),
                               activation=tf.nn.relu),
                K.layers.Dense(100,
                               kernel_regularizer=K.regularizers.l2(0.001),
                               activation=tf.nn.relu),
            ]),
            h_for_q_z=Sequential([
                K.layers.Dense(100,
                               kernel_regularizer=K.regularizers.l2(0.001),
                               activation=tf.nn.relu),
                K.layers.Dense(100,
                               kernel_regularizer=K.regularizers.l2(0.001),
                               activation=tf.nn.relu),
            ]),
            x_dims=120,
            z_dims=5,
        )
    trainer = DonutTrainer(model=model, model_vs=model_vs)
    predictor = DonutPredictor(model)
    with tf.Session().as_default():
        start = time.time()
        trainer.fit(train_values, train_labels, train_missing, mean, std)
        end = time.time()
        perform.loc[file, "train-time"] = end - start

        start = time.time()
        test_score = predictor.get_score(test_values, test_missing)
        end = time.time()
        perform.loc[file, "test-time"] = end - start

    storage = get_size(trainer) + get_size(predictor)
    perform.loc[file, "storage"] = storage

    pd.DataFrame({
        "timestamp": test_time[-len(test_score):],
        "score": test_score
    }).to_csv(output_dir + "test-donut" + file, index=False)
    best_F1, best_threshold, precision, recall = compute_best_F1(
        src_dir + file,
        output_dir + "test-donut" + file,
        reverse=True,
        mean_start=False)
    perform.loc[file, "best-F1"] = best_F1
    perform.loc[file, "best-threshold"] = best_threshold
    perform.loc[file, "precision"] = precision
    perform.loc[file, "recall"] = recall

    perform.to_csv(output_dir + "performance-donut-" + str(batch) + ".csv",
                   index=False)
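
Because donut_test joins every path by plain string concatenation, the directory arguments must end with a separator; a hypothetical invocation (all paths are assumptions):

donut_test(src_dir="data/kpi/", output_dir="results/",
           file="kpi_1.csv", batch=0)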
Example #7
def generate_score(number):
    # Read the raw data.
    data_dir_path = 'C:/Users/Administrator/Downloads/research/donut-master/SMD/data_concat/data-' + number + '.csv'
    data = np.array(pd.read_csv(data_dir_path, header=None), dtype=np.float64)
    tag_dir_path = './SMD/test_label/machine-' + number + '.csv'
    tag = np.array(pd.read_csv(tag_dir_path, header=None), dtype=np.int)
    labels = np.append(np.zeros(int(len(data) / 2)), tag)
    # pick one column
    values = data[:, 1]
    timestamp = np.arange(len(data)) + 1

    # If there is no label, simply use all zeros.
    # labels = np.zeros_like(values, dtype=np.int32)

    # Complete the timestamp, and obtain the missing point indicators.
    timestamp, missing, (values, labels) = \
        complete_timestamp(timestamp, (values, labels))

    # Split the training and testing data.
    test_portion = 0.5
    test_n = int(len(values) * test_portion)
    train_values = values[:-test_n]
    test_values = values[-len(train_values):]
    train_labels, test_labels = labels[:-test_n], labels[-test_n:]
    # split the missing indicator the same way as the values
    train_missing, test_missing = missing[:-test_n], missing[-len(train_values):]
    # print(len(test_values), len(test_labels))

    # Standardize the training and testing data.
    train_values, mean, std = standardize_kpi(train_values,
                                              excludes=train_labels)
    test_values, _, _ = standardize_kpi(test_values, mean=mean, std=std)

    import tensorflow as tf
    from donut import Donut
    from tensorflow import keras as K
    from tfsnippet.modules import Sequential

    # We build the entire model within the scope of `model_vs`,
    # it should hold exactly all the variables of `model`, including
    # the variables created by Keras layers.
    with tf.variable_scope('model') as model_vs:
        model = Donut(
            h_for_p_x=Sequential([
                K.layers.Dense(100,
                               kernel_regularizer=K.regularizers.l2(0.001),
                               activation=tf.nn.relu),
                K.layers.Dense(100,
                               kernel_regularizer=K.regularizers.l2(0.001),
                               activation=tf.nn.relu),
            ]),
            h_for_q_z=Sequential([
                K.layers.Dense(100,
                               kernel_regularizer=K.regularizers.l2(0.001),
                               activation=tf.nn.relu),
                K.layers.Dense(100,
                               kernel_regularizer=K.regularizers.l2(0.001),
                               activation=tf.nn.relu),
            ]),
            x_dims=120,
            z_dims=5,
        )

    from donut import DonutTrainer, DonutPredictor

    trainer = DonutTrainer(model=model, model_vs=model_vs)
    predictor = DonutPredictor(model)

    with tf.Session().as_default():
        trainer.fit(train_values, train_labels, train_missing, mean, std)
        test_score = predictor.get_score(test_values, test_missing)

    if not os.path.exists('./score'):
        os.makedirs('./score')

    np.save('./score/' + number + '.npy', test_score)
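
The saved score array can be reloaded later for thresholding or plotting, e.g. (assuming the same `number` string):

score = np.load('./score/' + number + '.npy')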
Example #8
def vae_donut(ts_obj,
              window_size,
              mcmc_iteration,
              latent_dim,
              gaussian_window_size,
              step_size,
              plot_reconstruction=False,
              plot_anomaly_score=False):
    # authors use window_size = 120
    # mcmc_iteration = 10

    # https://github.com/kratzert/finetune_alexnet_with_tensorflow/issues/8
    tf.reset_default_graph()

    start = time.time()

    # if there are missing time steps, we DO NOT fill them with NaNs because donut will replace them with 0s
    # using complete_timestamp
    # see line 6 in https://github.com/NetManAIOps/donut/blob/master/donut/preprocessing.py
    timestamp = ts_obj.dataframe["timestamp"].values
    values = ts_obj.dataframe["value"].values
    labels = np.zeros_like(values, dtype=np.int32)

    # print(len(timestamp))
    # print(len(values))
    # print(len(labels))

    # Complete the timestamp, and obtain the missing point indicators
    # replaces  missing with 0s.

    # donut cannot handle this date format for some reason
    if ts_obj.dateformat == "%Y-%m":
        rng = pd.date_range('2000-01-01', periods=len(values), freq='T')
        timestamp, missing, (values, labels) = complete_timestamp(
            rng, (values, labels))
    else:
        timestamp, missing, (values, labels) = complete_timestamp(
            timestamp, (values, labels))

    # print(len(timestamp))
    # print(len(values))
    # print(len(labels))
    # print(sum(missing))

    # Standardize the training and testing data.
    values, mean, std = standardize_kpi(values,
                                        excludes=np.logical_or(
                                            labels, missing))

    with tf.variable_scope('model') as model_vs:
        model = Donut(
            h_for_p_x=Sequential([
                K.layers.Dense(100,
                               kernel_regularizer=K.regularizers.l2(0.001),
                               activation=tf.nn.relu),
                K.layers.Dense(100,
                               kernel_regularizer=K.regularizers.l2(0.001),
                               activation=tf.nn.relu),
            ]),
            h_for_q_z=Sequential([
                K.layers.Dense(100,
                               kernel_regularizer=K.regularizers.l2(0.001),
                               activation=tf.nn.relu),
                K.layers.Dense(100,
                               kernel_regularizer=K.regularizers.l2(0.001),
                               activation=tf.nn.relu),
            ]),
            x_dims=window_size,
            z_dims=latent_dim,
        )

        trainer = DonutTrainer(model=model, model_vs=model_vs)
        predictor = DonutPredictor(model)

        with tf.Session().as_default():
            trainer.fit(values, labels, missing, mean, std)
            score = predictor.get_score(values, missing)

            # if time series is [1,2,3,4...] and ts_length is 3
            # this gives us [[1,2,3],[2,3,4]...]
            ts_strided = ah.as_sliding_window(values, window_size)
            ts_strided = my_func_float(np.array(ts_strided, dtype=np.float32))
            missing_strided = ah.as_sliding_window(missing, window_size)
            missing_strided = my_func_int(
                np.array(missing_strided, dtype=np.int32))

            # print(ts_strided)
            # print(missing_strided)

            x = model.vae.reconstruct(
                iterative_masked_reconstruct(reconstruct=model.vae.reconstruct,
                                             x=ts_strided,
                                             mask=missing_strided,
                                             iter_count=mcmc_iteration,
                                             back_prop=False))

            # `x` is a :class:`tfsnippet.stochastic.StochasticTensor`, from which
            # you may derive many useful outputs, for example:
            # print(x.tensor.eval())  # the `x` samples
            # print(x.log_prob(group_ndims=0).eval())  # element-wise log p(x|z) of sampled x
            # print(x.distribution.log_prob(ts_strided).eval())  # the reconstruction probability
            # print(x.distribution.mean.eval(), x.distribution.std.eval())  # mean and std of p(x|z)

            tensor_reconstruction_probabilities = x.distribution.log_prob(
                ts_strided).eval()

            # Because of the sliding windows, the first window contributes all
            # `window_size` of its scores, while each later window contributes
            # only the score of its last point.
            reconstruction_probabilities = list(
                tensor_reconstruction_probabilities[0])
            for slide in tensor_reconstruction_probabilities[1:]:
                reconstruction_probabilities.append(slide[-1])

    # print(len(reconstruction_probabilities))
    # print(len(ts_obj.dataframe))

    if ts_obj.miss:
        ref_date_range = ch.get_ref_date_range(ts_obj.dataframe,
                                               ts_obj.dateformat,
                                               ts_obj.timestep)
        gaps = ref_date_range[~ref_date_range.isin(ts_obj.
                                                   dataframe["timestamp"])]
        filled_df = ch.fill_df(ts_obj.dataframe, ts_obj.timestep,
                               ref_date_range, "fill_nan")
        # print("NaNs exist?: ",filled_df['value'].isnull().values.any())
        filled_df["reconstruction_probabilities"] = reconstruction_probabilities
        # remove nans
        filled_df = filled_df.dropna()
        reconstruction_probabilities = list(
            filled_df["reconstruction_probabilities"].values)

    # print(len(reconstruction_probabilities))
    # print(len(ts_obj.dataframe))

    reconstruction_probabilities = [
        abs(item) for item in reconstruction_probabilities
    ]

    anomaly_scores = ah.determine_anomaly_scores_error(
        reconstruction_probabilities,
        np.zeros_like(reconstruction_probabilities), ts_obj.get_length(),
        gaussian_window_size, step_size)

    end = time.time()

    if plot_reconstruction:
        plt.subplot(211)
        # see lines 98 to 100 of https://github.com/NetManAIOps/donut/blob/master/donut/prediction.py
        plt.title("Negative of Reconstruction Probabilities")
        plt.plot(reconstruction_probabilities)
        # plt.ylim([.99,1])
        plt.subplot(212)
        plt.title("Time Series")
        plt.plot(ts_obj.dataframe["value"].values)
        plt.axvline(ts_obj.get_probationary_index(),
                    color="black",
                    label="probationary line")
        plt.tight_layout()
        plt.show()

    if plot_anomaly_score:
        plt.subplot(211)
        plt.title("Anomaly Scores")
        plt.plot(anomaly_scores)
        plt.ylim([.998, 1])
        plt.subplot(212)
        plt.title("Time Series")
        plt.plot(ts_obj.dataframe["value"].values)
        plt.axvline(ts_obj.get_probationary_index(),
                    color="black",
                    label="probationary line")
        plt.tight_layout()
        plt.show()

    return {
        "Anomaly Scores": anomaly_scores,
        "Time": end - start,
        "Reconstruction Probabilities": reconstruction_probabilities
    }
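
A hypothetical invocation, using the window_size=120 and mcmc_iteration=10 defaults recorded in the comments above; the gaussian_window_size and step_size values are assumptions, and `ts` must be an object exposing the dataframe, miss, dateformat, timestep, get_length() and get_probationary_index() members that vae_donut reads:

result = vae_donut(ts, window_size=120, mcmc_iteration=10, latent_dim=5,
                   gaussian_window_size=100, step_size=10)
print(result["Time"], len(result["Anomaly Scores"]))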
Example #9
with tf.Session().as_default():
    with tf.variable_scope('model') as model_vs:
        model = Donut(
            h_for_p_x=Sequential([
                K.layers.Dense(100,
                               kernel_regularizer=K.regularizers.l2(0.001),
                               activation=tf.nn.relu),
                K.layers.Dense(100,
                               kernel_regularizer=K.regularizers.l2(0.001),
                               activation=tf.nn.relu),
            ]),
            h_for_q_z=Sequential([
                K.layers.Dense(100,
                               kernel_regularizer=K.regularizers.l2(0.001),
                               activation=tf.nn.relu),
                K.layers.Dense(100,
                               kernel_regularizer=K.regularizers.l2(0.001),
                               activation=tf.nn.relu),
            ]),
            x_dims=120,
            z_dims=5,
        )

# Remember to get the model variables after the birth of a
# `predictor` or a `trainer`.  The :class:`Donut` instance
# does not build the graph until :meth:`Donut.get_score` or
# :meth:`Donut.get_training_objective` is called, which is
# done in the `predictor` or the `trainer`.

# save variables to `save_dir`
    trainer = DonutTrainer(model=model, model_vs=model_vs)
    trainer.fit(train_values, train_labels, train_missing, mean, std)

    var_dict = get_variables_as_dict(model_vs)
    saver = VariableSaver(var_dict, save_dir)
    saver.save()

with tf.Session().as_default():
    with tf.variable_scope('model') as model_vs:
        model = Donut(
            h_for_p_x=Sequential([
                K.layers.Dense(100,
                               kernel_regularizer=K.regularizers.l2(0.001),
                               activation=tf.nn.relu),
                K.layers.Dense(100,
                               kernel_regularizer=K.regularizers.l2(0.001),