def get_data_shape(file_list, best_features):
    """Return the shape of one seq2seq batch built from the first file.

    Only the first file in `file_list` is inspected.
    NOTE(review): relies on module-level `seq_length` and `batch_size`
    globals — confirm they are defined before this is called.
    """
    frame = pd.read_csv(file_list[0])

    if best_features is None:
        # NOTE(review): this drops the first ROW of the array, not a
        # column — verify that is the intended behavior when no feature
        # subset is supplied.
        values = frame.values[1:]
    else:
        values = frame[best_features].values

    batch = get_seq2seq_batch(values, seq_length, batch_size)
    return batch.shape
# Example #2
    def compute_set_loss(self, file_list, best_features):
        """Return the average test loss over every file in `file_list`.

        Each file is loaded, reduced to `best_features`, batched for
        seq2seq processing, and scored with `self.test_batch`.
        """
        per_file_losses = []

        for path in file_list:
            # Load, select features, and batch for the seq2seq model
            frame = keep_best_features(pd.read_csv(path), best_features)
            batch = get_seq2seq_batch(frame, self.seq_length,
                                      self.batch_size)

            # Score this file and keep its loss
            per_file_losses.append(self.test_batch(batch))

        # Mean loss across all files in the set
        return np.average(np.asarray(per_file_losses))
# Example #3
    def get_roc(self, strt_thr, end_thr, step, files_test_normal,
                files_test_anomalous, best_features):
        """Compute ROC curve points over a range of decision thresholds.

        For each threshold in [strt_thr, end_thr) with spacing `step`,
        scores every file in the normal and anomalous test sets with
        `self.predict` and accumulates confusion-matrix counts.

        Args:
            strt_thr: first threshold evaluated.
            end_thr: upper bound (exclusive) of the threshold sweep.
            step: spacing between consecutive thresholds.
            files_test_normal: CSV paths containing only normal samples.
            files_test_anomalous: CSV paths containing only anomalies.
            best_features: feature subset passed to keep_best_features.

        Returns:
            (a_fp, a_vp): per-threshold false-positive rates and
            true-positive rates, suitable for plotting a ROC curve.
        """
        # BUGFIX: total point count must span strt_thr..end_thr; the old
        # int(end_thr / step) over-counted whenever strt_thr > 0.
        n_points = int((end_thr - strt_thr) / step)

        print('[ + ] Computing ROC curve using :')
        # BUGFIX: display the actual start threshold (was hard-coded 0)
        print('\t[-->] Threshold from', strt_thr, 'to', end_thr)
        print('\t[-->] Step :', step)
        print('\t[-->] ROC curve has', n_points, 'points')

        point_count = 1

        # Per-threshold rate series (ROC uses a_fp vs a_vp)
        a_vp = []  # true-positive rates
        a_vn = []  # true-negative rates
        a_fp = []  # false-positive rates
        a_fn = []  # false-negative rates

        # For every threshold...
        for thr in drange(strt_thr, end_thr, step):
            tot_vn = 0
            tot_vp = 0
            tot_fn = 0
            tot_fp = 0
            n_sample_norm = 0  # samples seen in the normal test set
            n_sample_anom = 0  # samples seen in the anomalous test set

            # Normal set: any positive prediction is a false positive
            for file in files_test_normal:
                data = pd.read_csv(file)
                data = keep_best_features(data, best_features)
                data = get_seq2seq_batch(data, self.seq_length,
                                         self.batch_size)

                fp, vn = self.predict(data, thr)

                n_sample_norm += data.shape[1]
                tot_vn += vn
                tot_fp += fp

            # Anomalous set: any positive prediction is a true positive
            for file in files_test_anomalous:
                data = pd.read_csv(file)
                data = keep_best_features(data, best_features)
                data = get_seq2seq_batch(data, self.seq_length,
                                         self.batch_size)

                vp, fn = self.predict(data, thr)

                n_sample_anom += data.shape[1]
                tot_vp += vp
                tot_fn += fn

            vp_rate = tot_vp / n_sample_anom
            vn_rate = tot_vn / n_sample_norm
            fp_rate = tot_fp / n_sample_norm
            # BUGFIX: false negatives are drawn from the anomalous set,
            # so normalize by n_sample_anom (was n_sample_norm).
            fn_rate = tot_fn / n_sample_anom

            print(
                '\t[ + ] Point {}/{} \tTrue Positive Rate : {:.5f} \tFalse Positive Rate : {:.5f}'
                .format(point_count, n_points, vp_rate, fp_rate))
            point_count += 1

            a_vp.append(vp_rate)
            a_vn.append(vn_rate)
            a_fp.append(fp_rate)
            a_fn.append(fn_rate)

        return a_fp, a_vp
# Example #4
    def train(self,
              train_file_list,
              validation_file_list=None,
              anomalous_file_list=None,
              best_features=None,
              epoch=200):
        """Train the model for `epoch` epochs over `train_file_list`.

        Every 10 epochs the average train loss is recorded and, when the
        optional file lists are given, validation and anomalous set
        losses are computed via `compute_set_loss` and recorded too.

        Returns:
            (avg_train_losses, avg_valid_losses, avg_anomalous_losses):
            lists of the losses sampled every 10 epochs.
        """
        print('[ + ] Starting training !')

        avg_train_losses = []
        avg_valid_losses = []
        avg_anomalous_losses = []

        for e in range(epoch):

            epoch_losses = []

            for path in train_file_list:
                # Load, select features, and batch for the seq2seq model
                batch = get_seq2seq_batch(
                    keep_best_features(pd.read_csv(path), best_features),
                    self.seq_length, self.batch_size)

                # One optimization step on this file's batch
                epoch_losses.append(self.train_batch(batch))

            # Only log/record every 10th epoch
            if e % 10 != 0:
                continue

            # Average training loss over this epoch's files
            avg_loss = np.average(np.asarray(epoch_losses))
            avg_train_losses.append(avg_loss)

            # Validation loss (NaN when no validation set is provided)
            if validation_file_list is None:
                avg_val_loss = np.nan
            else:
                avg_val_loss = self.compute_set_loss(validation_file_list,
                                                     best_features)
                avg_valid_losses.append(avg_val_loss)

            # Anomalous set loss (NaN when no anomalous set is provided)
            if anomalous_file_list is None:
                avg_ano_loss = np.nan
            else:
                avg_ano_loss = self.compute_set_loss(anomalous_file_list,
                                                     best_features)
                avg_anomalous_losses.append(avg_ano_loss)

            # NOTE(review): the step number is displayed as e + 10 (so the
            # first line reads "Step 10/epoch") — confirm this offset is
            # intentional.
            print(
                '\t[ + ] Step {}/{} \tTrain loss : {:.4f} \tValidation loss : {:.4f} \tAnomalous set loss : {:.4f}'
                .format(e + 10, epoch, avg_loss, avg_val_loss,
                        avg_ano_loss))

        return avg_train_losses, avg_valid_losses, avg_anomalous_losses