Beispiel #1
0
    def visualize_features(self, data, file_name, method='TSNE', show=False):
        """Project the feature vectors to two dimensions and save a scatter plot.

        Parameters
        ----------
        data: DataFrame
            Feature table with the columns ``is_anomaly`` and
            ``window_label``; all remaining columns are used as features.
        file_name: str
            Name of the feature file. ``.csv`` is removed and
            ``_<method>-features.png`` is appended to build the image path.
        method: str
            Projection to use, either ``'TSNE'`` or ``'UMAP'``.
        show: bool
            If true, display the figure after it has been saved.

        Raises
        ------
        ValueError
            If ``method`` is neither ``'TSNE'`` nor ``'UMAP'``.
        """
        # Validate up front so an unsupported method fails with a clear
        # message instead of a NameError on the missing embedding below.
        if method not in ('TSNE', 'UMAP'):
            raise ValueError('Unknown method {}. '.format(method) +
                             'Only TSNE and UMAP are supported.')

        fig = plt.figure()
        tc.yellow('Visualize features using {}...'.format(method))
        features = data.drop(['is_anomaly', 'window_label'], axis=1)

        if method == 'TSNE':
            embedded = TSNE(n_components=2).fit_transform(features)
        else:
            embedded = umap.UMAP().fit_transform(features)

        # Pick rows by position with boolean masks: index labels are only
        # valid row positions when the frame carries a default RangeIndex.
        anomaly_rows = (data.is_anomaly == 1).values
        normal_rows = (data.is_anomaly == 0).values

        normal = plt.scatter(embedded[normal_rows, 0],
                             embedded[normal_rows, 1],
                             c='blue',
                             s=2)
        anomaly = plt.scatter(embedded[anomaly_rows, 0],
                              embedded[anomaly_rows, 1],
                              c='red',
                              s=2)

        plt.legend((normal, anomaly), ('Normal', 'Anomaly'), loc='lower right')
        plt.title('{} projection of the features\n'.format(method))
        fig.tight_layout()
        file_name = file_name.replace('.csv', '')
        file_path = '{}_{}-features.png'.format(file_name, method)
        fig.savefig(file_path)
        if show:
            plt.show()
        # Close exactly the figure created here, not whichever is current.
        plt.close(fig)
        tc.green('Saved {} visualized features using {}'.format(
            method, file_path))
Beispiel #2
0
def visualize_labelled_series(anomaly_windows, show=True):
    """Plot the taxi test series and highlight the given anomaly windows.

    The test data is loaded from the sorted file names starting at position
    30. The plot is saved as ``plot_test_taxi_files_anomalies.png`` inside
    ``result_dir``.

    Parameters
    ----------
    anomaly_windows: iterable of str
        Window labels of the form ``'<start>-<end>'``; both parts are parsed
        as integers and used as positional slice bounds into the test data.
    show: bool
        If true, display the figure after it has been saved.
    """
    file_names = get_sorted_file_names()
    files_test = file_names[30:]
    file_count = len(files_test)
    fn = '{}/plot_test_taxi_files_anomalies.png'.format(result_dir)
    data_test = load_files(files_test, file_count)

    fig, ax = plt.subplots()
    ax.plot(data_test.index, data_test.values, color='blue')
    ax.set_title(
        '{} files nyc taxi (custom) dataset (test data)'.format(file_count))

    # Highlight anomalies by drawing the window again in red
    row_count = len(data_test)
    for window in anomaly_windows:
        start, end = (int(bound) for bound in window.split('-'))
        end = min(end, row_count)
        ax.plot(data_test.index[start:end],
                data_test.values[start:end],
                color='red')

    ax.get_xaxis().set_minor_locator(mpl.ticker.AutoMinorLocator())
    plt.setp(ax.get_xticklabels(), rotation=30, horizontalalignment='right')
    fig.tight_layout()
    fig.savefig(fn)
    tc.green('Saved plot in {}'.format(fn))
    if show:
        plt.show()
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)
Beispiel #3
0
def visualize_labelled_series(anomaly_windows, show=True):
    """Plot the ECG test series and highlight the given anomaly windows.

    Reads ``sel102.csv`` from ``result_dir``, drops the last half of the rows
    (``row_count // 2`` of them) and plots column ``V5`` of what remains.
    The plot is saved next to the CSV file as ``sel102.png``.

    Parameters
    ----------
    anomaly_windows: iterable of str
        Window labels of the form ``'<start>-<end>'``; both parts are parsed
        as integers and used as positional slice bounds into the test data.
    show: bool
        If true, display the figure after it has been saved.
    """
    csv_fn = '{}/sel102.csv'.format(result_dir)
    df = pd.read_csv(csv_fn, header=0)
    row_count = len(df.index)
    split = row_count // 2
    # Use an explicit end position: ``iloc[:-split]`` would return an empty
    # selection when split is 0 (files with fewer than two rows).
    data_test = df.iloc[:row_count - split].V5

    fig, ax = plt.subplots()
    ax.plot(data_test.index, data_test.values, color='blue')
    ax.set_title('ECG dataset (test data)')

    # Highlight anomalies by drawing the window again in red
    test_length = len(data_test)
    for window in anomaly_windows:
        start, end = (int(bound) for bound in window.split('-'))
        end = min(end, test_length)
        ax.plot(data_test.index[start:end],
                data_test.values[start:end],
                color='red')

    ax.get_xaxis().set_minor_locator(mpl.ticker.AutoMinorLocator())
    plt.setp(ax.get_xticklabels(), rotation=30, horizontalalignment='right')
    fig.tight_layout()
    image_fn = csv_fn.replace('.csv', '.png')
    fig.savefig(image_fn)
    tc.green('Saved plot in {}'.format(image_fn))
    if show:
        plt.show()
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)
Beispiel #4
0
    def generate_features(self,
                          timeseries,
                          anomaly_labels,
                          window_size,
                          file_name,
                          method='ARMA',
                          order=(2, 2),
                          stride=0):
        """Process the complete timeseries. Create windows first and then
        encode each window to reduce the dimensionality.

        Parameters
        ----------
        timeseries: sequence
            Values to split into windows (sliced as ``timeseries[a:b]``).
        anomaly_labels: sequence
            Per-sample labels; a window is marked anomalous when the labels
            inside its slice sum to at least 1.
        window_size: int
            Used to compute each window's end and the default stride.
        file_name: str
            CSV file the feature table is written to.
        method: str
            ``'ARMA'`` or ``'ARIMA'``; selects the fitting routine.
        order: tuple
            Passed through to the fitting routine.
        stride: int
            Distance between window starts. ``0`` selects half the window
            size (at least 1).

        Returns
        -------
        features: DataFrame
            List of features with anomaly labels, re-read from ``file_name``.

        Raises
        ------
        ValueError
            If ``method`` is neither ``'ARMA'`` nor ``'ARIMA'``.
        """
        if stride == 0:
            # Integer division keeps the step integral: a float step combined
            # with dtype=int makes np.arange space the starts incorrectly.
            # The lower bound avoids a zero step for window_size == 1.
            stride = max(1, int(window_size // 2))

        if method == 'ARMA':
            get_parameters = self.get_arma_params
        elif method == 'ARIMA':
            get_parameters = self.get_arima_params
        else:
            raise ValueError('Unknown method {}. '.format(method) +
                             'Only ARMA and ARIMA are supported.')

        features = pd.DataFrame()
        window_starts = np.arange(0, len(timeseries), step=stride, dtype=int)
        tc.yellow("Generating features...")

        for i, start in enumerate(tqdm(window_starts)):
            # NOTE(review): the slice excludes ``end``, so a window holds
            # window_size - 1 samples; confirm that this is intended.
            end = int(start + window_size - 1)
            window_data = timeseries[start:end]
            window_is_anomaly = min(1, sum(anomaly_labels[start:end]))

            fitted = get_parameters(window_data, order)
            if i == 0:
                # Column names are only known after the first fit
                feature_columns = np.append(fitted.data.param_names,
                                            ('is_anomaly', 'window_label'))
                features = pd.DataFrame(columns=feature_columns)
            window_label = '{}-{}'.format(start, end)
            # TODO: add fitted.sigma2
            new_row = np.append(fitted.params,
                                (window_is_anomaly, window_label))
            features.loc[i] = new_row

        features.is_anomaly = features.is_anomaly.astype(int)
        features.to_csv(file_name, index=False)  # Save features to file
        tc.green('Saved features in {}'.format(file_name))
        # Re-read the file so the caller gets the dtypes parsed from the CSV
        return pd.read_csv(file_name)
def visualize(df, show=True):
    """Plot column ``V5`` of ``df``, store the data and save the image.

    The data is written through ``save_data`` and the plot is saved as
    ``ecg.png`` inside ``result_dir``.

    Parameters
    ----------
    df: DataFrame
        Record to plot; must provide a ``V5`` column.
    show: bool
        If true, display the figure after it has been saved.
    """
    fig = plt.figure()

    plt.title('Record sel102 from Physionet')
    df.V5.plot()
    save_data(df)

    # Lay out and save before showing, so the displayed figure uses the
    # tight layout and the file does not depend on the interactive window.
    fig.tight_layout()
    file_path = '{}/ecg.png'.format(result_dir)
    fig.savefig(file_path)
    tc.green('Saved {}.'.format(file_path))

    if show:
        plt.show()
    plt.close(fig)
Beispiel #6
0
 def create_data_plot(self, data, show):
     """Plot the generated (stitched) data containing the anomalies.

     The whole ``value`` column is drawn in blue; the samples whose
     ``is_anomaly`` is not 0 are drawn on top in red. The image is saved
     under ``self.file_name`` with ``.csv`` replaced by ``.png``.

     Parameters
     ----------
     data: DataFrame
         Must provide the columns ``value`` and ``is_anomaly``.
     show: bool
         If true, display the figure after it has been saved.
     """
     fig = plt.figure()
     plt.title(self.get_title())
     plt.plot(data['value'], color='blue')
     # Masking the normal samples leaves only the anomalies to draw in red
     plt.plot(data.mask(data['is_anomaly'] == 0)['value'], color='red')
     plt.tight_layout()  # avoid overlapping plot titles
     image_file_name = self.file_name.replace('.csv', '.png')
     fig.savefig(image_file_name)
     tc.green('Saved data plot in {}'.format(image_file_name))
     if show:
         plt.show()
     # Close exactly the figure created here, not whichever is current.
     plt.close(fig)
Beispiel #7
0
def load_and_label_data(features_fn, threshold, scores_fn):
    """Label rows whose anomaly score exceeds ``threshold`` and save copies.

    Both CSV files are read, ``is_anomaly`` is set to 1 on every row whose
    ``anomaly_score`` (from the scores file) is above the threshold, and each
    table is written next to its source with a ``_labelled_<threshold>``
    suffix (dots in the threshold replaced by underscores).

    Returns
    -------
    anomaly_windows: Series
        The ``window_label`` values of the feature rows above the threshold.
    """
    feature_table = pd.read_csv(features_fn)
    score_table = pd.read_csv(scores_fn)
    above_threshold = score_table.anomaly_score > threshold
    suffix = '_labelled_{}.csv'.format(str(threshold).replace('.', '_'))

    # Scores first, then features: same labelling and saving steps for both
    for table, source_fn in ((score_table, scores_fn),
                             (feature_table, features_fn)):
        table.loc[above_threshold, 'is_anomaly'] = 1
        labelled_fn = source_fn.replace('.csv', suffix)
        table.to_csv(labelled_fn, index=False)
        tc.green('Saved file {}'.format(labelled_fn))

    return feature_table.loc[above_threshold].window_label
Beispiel #8
0
def visualize(data, file_count, file_name, show=False):
    """Plot a line graph with red markers for all anomalies.

    Parameters
    ----------
    data: DataFrame
        Must provide the columns ``value`` and ``is_anomaly``.
    file_count: int
        Number of files the data was loaded from; only used for the title.
    file_name: str
        Path the figure is saved to.
    show: bool
        If true, display the figure after it has been saved.
    """
    fig = plt.figure()
    data.value.plot.line(color='blue')
    if file_count == 1:
        title = 'First file of yahoo S5 dataset'
    else:
        title = 'First {} files of yahoo S5 dataset'.format(file_count)
    plt.title(title)

    # Add all anomaly markers with one scatter call instead of one per row
    anomalies = data.loc[data.is_anomaly == 1]
    plt.scatter(anomalies.index, anomalies['value'], marker='x', color='red')

    plt.tight_layout()
    fig.savefig(file_name)
    tc.green('Saved plot in {}'.format(file_name))
    if show:
        plt.show()
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)
Beispiel #9
0
def visualize_and_save(data,
                       labels,
                       file_name,
                       regularization_strength,
                       show=False):
    """Plot ``data`` on a y-axis fixed to [0, 1] and save it as an image.

    The image path is ``file_name`` with ``.csv`` replaced by ``.png``.
    ``labels`` is accepted but not used by the current implementation.
    The current figure is cleared at the end.
    """
    image_file = file_name.replace('.csv', '.png')
    plot_title = 'regularization_strength: {}'.format(regularization_strength)

    plt.gca().set_ylim([0, 1])
    plt.plot(data)  # plotting by columns
    plt.title(plot_title)
    plt.savefig(image_file)
    tc.green('Saved image {}'.format(image_file))

    if show:
        plt.show()
    plt.clf()
Beispiel #10
0
def visualize(data, file_count, file_name, show=False):
    """Plot the data as a blue line graph and save it.

    Parameters
    ----------
    data: DataFrame or Series
        Data to plot through its pandas ``plot`` method.
    file_count: int
        Number of files the data was loaded from; only used for the title.
    file_name: str
        Path the figure is saved to.
    show: bool
        If true, display the figure after it has been saved.
    """
    fig = plt.figure()
    ax = fig.add_subplot(111)
    # Pass the axes explicitly: without ``ax`` a DataFrame plot opens its own
    # figure, and the figure saved below would stay empty.
    data.plot(ax=ax, color='blue')
    ax.set_title('{} files of nyc taxi (custom) dataset'.format(file_count))

    # TODO: Research anomalies (such as Marathon, Blizzard etc.)
    # and add an 'is_anomaly' column to the csv files.
    # Add anomaly markers
    # for index, row in data.loc[data.is_anomaly == 1].iterrows():
    #     plt.scatter(index, row['value'], marker='x', color='red')

    fig.tight_layout()
    fig.savefig(file_name)
    tc.green('Saved plot in {}'.format(file_name))
    if show:
        plt.show()
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)
def visualize(data, anomalies, title):
    """Plot the generated (stitched) data containing the anomalies.

    The figure is saved as ``taxi_data.png`` inside ``folder``.

    Parameters
    ----------
    data: array-like
        Values to plot against their position; must provide ``size`` and
        support ``data[index]`` lookups.
    anomalies: iterable
        Indices into ``data`` that are marked with a red cross.
    title: str
        Title of the plot.
    """
    fig = plt.figure(1, figsize=(12, 3))
    # Figure number 1 is reused on every call; clear it so earlier plots do
    # not pile up underneath the new one.
    fig.clf()
    ax1 = fig.add_subplot(111)

    ax1.title.set_text(title)
    ax1.plot(np.arange(data.size), data, color='blue', zorder=1)

    # Add all anomaly markers with a single scatter call
    anomaly_positions = list(anomalies)
    anomaly_values = [data[position] for position in anomaly_positions]
    ax1.scatter(anomaly_positions,
                anomaly_values,
                marker='x',
                color='red',
                zorder=2)

    plt.tight_layout()  # avoid overlapping plot titles
    fn = '{}/taxi_data.png'.format(folder)
    fig.savefig(fn)
    tc.green('Created {}'.format(fn))
Beispiel #12
0
    def detect_anomalies(self, X, show=False):
        """Run every configured algorithm on ``X`` and plot the results.

        ``self.anomaly_algorithms`` is iterated as ``(name, algorithm)``
        pairs. Each algorithm gets its own subplot in one row; the combined
        figure is saved to ``self.file``.

        Parameters
        ----------
        X: array-like
            Data handed to each algorithm's fit/predict methods.
        show: bool
            If true, display the figure after it has been saved.
        """
        algorithm_count = len(self.anomaly_algorithms)
        plt.figure(figsize=(algorithm_count * 2 + 3, 6))

        for plot_num, (name, algorithm) in enumerate(self.anomaly_algorithms,
                                                     start=1):
            tc.yellow('Detecting anomalies using {}...'.format(name))
            plt.subplot(1, algorithm_count, plot_num)
            plt.title(name, size=18)

            # Fit the data and tag outliers. Each algorithm is fitted exactly
            # once here; a separate fit beforehand would only repeat the work.
            if name == "Local Outlier Factor":
                y_pred = algorithm.fit_predict(X)
            else:
                y_pred = algorithm.fit(X).predict(X)

            # Print and plot
            self.print_anomalies(name, y_pred)
            self.plot_anomalies(name, X, y_pred, plt)

        plt.tight_layout()
        plt.savefig(self.file)
        tc.green('Saved anomaly plot to {}'.format(self.file))
        if show:
            plt.show()
Beispiel #13
0
def run(training_data,
        test_data,
        test_labels,
        regularization_strength,
        file_name,
        epochs=100):
    """Train an autoencoder and write per-sample anomaly scores to a CSV.

    The network halves the layer width from the input dimension down to at
    least 2 units and mirrors those widths on the way back up. Each test
    sample is scored with the RMSE between the sample and its
    reconstruction.

    Parameters
    ----------
    training_data: 2-D array
        Samples the autoencoder is fitted on (input and target).
    test_data: 2-D array
        Samples to score; must have as many columns as ``training_data``.
    test_labels: sequence
        Stored next to the scores in the ``is_anomaly`` column.
    regularization_strength: float
        Factor of the L2 activity regularizer of every hidden layer.
    file_name: str
        CSV file the scores are written to. The model image is saved in the
        same directory.
    epochs: int
        Number of training epochs.
    """
    assert training_data.shape[1] == test_data.shape[1], \
        'training and test data need the same number of columns'

    # Train autoencoder network
    encoding_dim = 2
    model = Sequential()
    data_dim = test_data.shape[1]
    layers = [data_dim]
    hidden_dim = int(data_dim / 2)
    # Input layer and first encoding layer
    model.add(
        Dense(hidden_dim,
              input_dim=data_dim,
              activation='relu',
              activity_regularizer=l2(regularization_strength),
              name='encoding_{}'.format(hidden_dim)))
    layers.append(hidden_dim)

    # Add layers with decreasing size
    hidden_dim = int(hidden_dim / 2)
    while encoding_dim <= hidden_dim:
        model.add(
            Dense(hidden_dim,
                  activation='relu',
                  activity_regularizer=l2(regularization_strength),
                  name='encoding_{}'.format(hidden_dim)))
        layers.append(hidden_dim)
        hidden_dim = int(hidden_dim / 2)

    # Add layers with increasing size
    layers.pop()  # remove smallest element
    for hidden_dim in sorted(layers):
        model.add(
            Dense(hidden_dim,
                  activation='relu',
                  activity_regularizer=l2(regularization_strength),
                  name='decoding_{}'.format(hidden_dim)))

    # Output layer
    model.add(Dense(data_dim, name='output'))  # Multiple output neurons
    model.compile(loss='mean_squared_error', optimizer='adam')
    model.fit(training_data, training_data, verbose=1, epochs=epochs)

    # Save network structure to png. os.path.join keeps the path relative
    # when file_name has no directory part (a plain '{}/...' format would
    # point at the filesystem root in that case).
    fn = os.path.join(os.path.dirname(file_name), 'auto_encoder_model.png')
    plot_model(model, to_file=fn, show_shapes=True)
    tc.green('Saved model image as {}'.format(fn))

    pred = model.predict(training_data)
    score = np.sqrt(metrics.mean_squared_error(pred, training_data))
    tc.yellow("Training Normal Score (RMSE): {}".format(score))

    pred = model.predict(test_data)
    score = np.sqrt(metrics.mean_squared_error(pred, test_data))
    tc.yellow("Test Normal Score (RMSE): {}".format(score))

    # Predict / create anomaly scores: row-wise RMSE computed from the batch
    # prediction above instead of one model.predict call per sample.
    tc.yellow('Generating anomaly scores...')
    squared_error = np.square(pred - np.asarray(test_data))
    scores = np.sqrt(squared_error.mean(axis=1)).tolist()

    # Save scores (anomaly scores)
    df = pd.DataFrame({'anomaly_score': scores, 'is_anomaly': test_labels})
    df.to_csv(file_name, index=False)
    tc.green('Saved file {}'.format(file_name))

    visualize_and_save(scores, test_labels, file_name, regularization_strength)
Beispiel #14
0
 def save_data(self, data):
     """Store ``data`` as a CSV file at ``self.file_name``.

     ``data`` is wrapped in a DataFrame first and written without the
     index column.
     """
     target = self.file_name
     pd.DataFrame(data).to_csv(target, index=False)
     tc.green('Saved data in {}.'.format(target))
def save_data(df):
    """Store ``df`` as ``sel102.csv`` inside ``result_dir``.

    The frame is written without the index column.
    """
    target = '{}/sel102.csv'.format(result_dir)
    df.to_csv(target, index=False)
    tc.green('Saved data in {}.'.format(target))