Example 1
    def filterData(self, series=pd.DataFrame(), unwanted_columns=[]):
        # Setup autoencoder model
        anomaly_model = H2OAutoEncoderEstimator(
            activation=self.activation,
            hidden=self.layers,
            l1=1e-4,
            epochs=self.epochs,
        )

        # Split data frame
        pValidate = series.sample(frac=self.validation_ratio, random_state=200)
        pTrain = series.drop(pValidate.index)

        # Convert pandas to h2o frame - for anomaly detection
        hValidate = h2o.H2OFrame(pValidate)
        hValidate.set_names(list(pValidate.columns))

        hTrain = h2o.H2OFrame(pTrain)
        hTrain.set_names(list(pTrain.columns))

        # Select columns
        train_columns = [x for x in list(series.columns) if x not in unwanted_columns]

        # Train model
        anomaly_model.train(x=train_columns, training_frame=hTrain, validation_frame=hValidate)

        # Get reconstruction error
        reconstruction_error = anomaly_model.anomaly(test_data=hTrain, per_feature=False)
        error_str = reconstruction_error.get_frame_data()
        err_list = np.array(list(map(float, error_str.split("\n")[1:-1])))

        if self.anomaly_remove_function == 'iqr':
            print ""
Example 2
h_test.set_names(list(p_test.columns))

# Select columns for AutoEncoder
ac_train_columns = list(p_data.columns)  # Define autoencoder train columns
rm_columns = ['RUL', 'UnitNumber', 'Time', 'Setting1', 'Setting2',
              'Setting3']  # Columns to remove before training
# The autoencoder is used to remove noise from the sensor readings, so only
# the sensor-reading columns are selected for training.
for column in rm_columns:
    ac_train_columns.remove(column)

# Define AutoEncoder model
auto_encoder_model = H2OAutoEncoderEstimator(activation="Tanh",
                                             hidden=[18],
                                             epochs=150,
                                             loss='Quadratic',
                                             distribution='gaussian')

# Train AutoEncoder model
auto_encoder_model.train(x=ac_train_columns,
                         training_frame=h_train,
                         validation_frame=h_validate)

# Get reconstruction error
reconstruction_error = auto_encoder_model.anomaly(test_data=h_train,
                                                  per_feature=False)
error_str = reconstruction_error.get_frame_data()
err_list = list(map(float, error_str.split("\n")[1:-1]))

# Filter anomalies in reconstruction error
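The example stops at the filtering step. Example 6 below applies a threshold at 97% of the maximum reconstruction error and passes it to Filter.filterDataAutoEncoder; a minimal stand-in for that filter (the function in this sketch is hypothetical, the 0.97 factor is taken from Example 6):

import numpy as np

def filter_data_autoencoder(panda_frame, reconstruction_error, threshold):
    # Drop rows whose reconstruction error exceeds the threshold
    # (stand-in for Filter.filterDataAutoEncoder; exact behaviour is an assumption).
    err = np.asarray(reconstruction_error, dtype=float)
    return panda_frame[err < threshold].reset_index(drop=True)

# Usage, following the pattern in Example 6:
# threshold = np.amax(err_list) * 0.97
# p_filtered = filter_data_autoencoder(p_data, err_list, threshold)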
Example 3
# AutoEncoder anomaly removal process
p_train = ProcessData.trainData(bin_classification=True)
p_test = ProcessData.testData(bin_classification=True)

# Convert to H2O frames
h_test = h2o.H2OFrame(p_test)
h_test.set_names(list(p_test.columns))

h_train = h2o.H2OFrame(p_train)
h_train.set_names(list(p_train.columns))

# Define autoencoder
anomaly_model = H2OAutoEncoderEstimator(activation="Rectifier",
                                        hidden=[25, 12, 25],
                                        sparse=True,
                                        l1=1e-4,
                                        epochs=100)

# Select relevant features
anomaly_train_columns = list(p_train.columns)
print(anomaly_train_columns)
anomaly_train_columns.remove('RUL')
anomaly_train_columns.remove('BIN')
anomaly_train_columns.remove('UnitNumber')
anomaly_train_columns.remove('Time')
anomaly_train_columns.remove('Setting1')
anomaly_train_columns.remove('Setting2')
anomaly_train_columns.remove('Setting3')

# Train model
Example 4
pTrain.to_csv("Auto-Train.csv", index=False)

# Select relevant features
anomaly_train_columns = list(hTrain.columns)
anomaly_train_columns.remove(response_column)
anomaly_train_columns.remove('UnitNumber')
anomaly_train_columns.remove('Time')

column_count = len(anomaly_train_columns)

layers = [20, 6, 20]
print "Layers:", layers
# Define model
anomaly_model = H2OAutoEncoderEstimator(
    activation="Rectifier",
    hidden=layers,
    l1=1e-4,
    epochs=100,
)

# Train model
anomaly_model.train(x=anomaly_train_columns,
                    training_frame=hTrain,
                    validation_frame=hValidate)

# Get reconstruction error
reconstruction_error = anomaly_model.anomaly(test_data=hTrain,
                                             per_feature=False)
error_str = reconstruction_error.get_frame_data()
err_list = np.array(list(map(float, error_str.split("\n")[1:-1])))
Example 5
        return

    value_str = data_column.get_frame_data()
    splitter_list = value_str.split("\n")[1:-1]

    if data_type == "real":
        return list(map(float, splitter_list))
    elif data_type == "enum":
        return splitter_list


h2o.init()
pd_train = pd.read_csv('na_filled_random_forest.csv')
training_frame = h2o.H2OFrame(pd_train)
columns = list(pd_train.columns)
anomaly_model = H2OAutoEncoderEstimator()
anomaly_model.train(x=columns, training_frame=training_frame)
reconstruction_error = anomaly_model.anomaly(test_data=training_frame,
                                             per_feature=False)
reconstruction_error = list(map(float, h2OColumnToList(reconstruction_error)))
pd_train['reconstruction_error'] = reconstruction_error

pd_test = pd.read_csv('dataset/dengue_features_test.csv')
testing_frame = h2o.H2OFrame(pd_test)
columns = list(pd_test.columns)
anomaly_model = H2OAutoEncoderEstimator()
anomaly_model.train(x=columns, training_frame=testing_frame)
reconstruction_error = anomaly_model.anomaly(test_data=testing_frame,
                                             per_feature=False)
reconstruction_error = list(map(float, h2OColumnToList(reconstruction_error)))
pd_test['reconstruction_error'] = reconstruction_error
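Only the tail of h2OColumnToList is visible above (Example 5 starts mid-function). A plausible full version, reconstructed from that fragment (the signature and the empty-frame guard are assumptions; the rest follows the visible body):

def h2OColumnToList(data_column, data_type="real"):
    # Convert a single-column H2OFrame to a Python list
    # (signature and guard are assumptions; the body mirrors the fragment above).
    if data_column.nrow == 0:
        return

    value_str = data_column.get_frame_data()
    splitter_list = value_str.split("\n")[1:-1]

    if data_type == "real":
        return list(map(float, splitter_list))
    elif data_type == "enum":
        return splitter_list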
Example 6
def function():
    # AutoEncoder anomaly removal process
    p_train = ProcessData.trainData(moving_median_centered_average=True,
                                    standard_deviation=True,
                                    probability_distribution=True,
                                    bin_classification=True)
    p_test = ProcessData.testData(moving_median_centered_average=True,
                                  standard_deviation=True,
                                  probability_from_file=True,
                                  bin_classification=True)

    # Convert to H2O frames
    h_test = h2o.H2OFrame(p_test)
    h_test.set_names(list(p_test.columns))

    h_train = h2o.H2OFrame(p_train)
    h_train.set_names(list(p_train.columns))

    # Define autoencoder
    anomaly_model = H2OAutoEncoderEstimator(activation="Rectifier",
                                            hidden=[25, 12, 25],
                                            sparse=True,
                                            l1=1e-4,
                                            epochs=100)

    # Select relevant features
    anomaly_train_columns = list(p_train.columns)
    print(anomaly_train_columns)
    anomaly_train_columns.remove('RUL')
    anomaly_train_columns.remove('BIN')
    anomaly_train_columns.remove('UnitNumber')
    anomaly_train_columns.remove('Time')
    anomaly_train_columns.remove('Setting1')
    anomaly_train_columns.remove('Setting2')
    anomaly_train_columns.remove('Setting3')

    # Train model
    anomaly_model.train(x=anomaly_train_columns, training_frame=h_train)

    # Get reconstruction error
    reconstruction_error = anomaly_model.anomaly(test_data=h_train,
                                                 per_feature=False)
    error_str = reconstruction_error.get_frame_data()
    err_list = list(map(float, error_str.split("\n")[1:-1]))
    err_list = np.array(err_list)

    # Threshold
    threshold = np.amax(err_list) * 0.97

    print("Max Reconstruction Error       :", reconstruction_error.max())
    print("Threshold Reconstruction Error :", threshold)

    # Filter anomalies based on reconstruction error
    p_filter = Filter.filterDataAutoEncoder(panda_frame=p_train,
                                            reconstruction_error=err_list,
                                            threshold=threshold)

    # Drop features
    del p_filter['Setting3']
    del p_filter['Sensor1']
    del p_filter['Sensor5']
    del p_filter['Sensor10']
    del p_filter['Sensor16']
    del p_filter['Sensor18']
    del p_filter['Sensor19']

    h_filter = h2o.H2OFrame(p_filter)
    h_filter.set_names(list(p_filter.columns))

    h_test = h2o.H2OFrame(p_test)
    h_test.set_names(list(p_test.columns))

    training_columns = list(p_filter.columns)
    training_columns.remove('UnitNumber')
    training_columns.remove('Time')
    training_columns.remove('RUL')
    training_columns.remove('BIN')

    h_filter['BIN'] = h_filter['BIN'].asfactor()
    h_test['BIN'] = h_test['BIN'].asfactor()

    model = H2ODeepLearningEstimator(variable_importances=True, nfolds=10)
    model.train(x=training_columns,
                y='BIN',
                training_frame=h_filter)

    predict = model.predict(test_data=h_test)
    predict = DataFrameParser.h2oToList(predict['predict'])
    actual = DataFrameParser.h2oToList(h_test['BIN'])

    Measures.confusion_matrix(actual, predict)
    print(predict)
    print(actual)
print(list(iq_train.columns))
print("")

h2o.init()

print "Adding Reconstruction Error"
print "---------------------------"
print "Applying to SJ Train"
print "---------------------------"
columns = list(sj_train.columns)
columns.remove('total_cases')
sj_training_frame = h2o.H2OFrame(sj_train)
sj_training_frame.set_names(list(sj_train.columns))
sj_testing_frame = h2o.H2OFrame(sj_test)
sj_testing_frame.set_names(list(sj_test.columns))
sj_model = H2OAutoEncoderEstimator()
sj_model.train(x=columns, training_frame=sj_training_frame)
sj_reconstruction_error = sj_model.anomaly(test_data=sj_training_frame,
                                           per_feature=False)
sj_reconstruction_error = list(
    map(float, h2OColumnToList(sj_reconstruction_error)))
sj_reconstruction_error_test = sj_model.anomaly(test_data=sj_testing_frame,
                                                per_feature=False)
sj_reconstruction_error_test = list(
    map(float, h2OColumnToList(sj_reconstruction_error_test)))
sj_train['reconstruction_error'] = sj_reconstruction_error
sj_test['reconstruction_error'] = sj_reconstruction_error_test

print ""

print "Applying to IQ Train"