from sklearn.feature_selection import RFE
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn_pandas import DataFrameMapper
from dataprocessor import ProcessData, Filter

# Load the raw training and testing frames.
training_frame = ProcessData.trainData()
testing_frame = ProcessData.testData()

# Percentile-based anomaly removal on the training frame. Identifier,
# operational-setting and response columns are excluded from the check.
all_columns = list(training_frame.columns)
rm_columns = ['UnitNumber', 'Time', 'Setting1', 'Setting2', 'Setting3', 'RUL']
filter_columns = list(filter(lambda name: name not in rm_columns, all_columns))
training_frame = Filter.filterDataPercentile(
    panda_frame=training_frame,
    columns=filter_columns,
    lower_percentile=0.01,
    upper_percentile=0.99,
    column_err_threshold=2,
)

# Identifier columns are dropped from the training frame ...
del training_frame['UnitNumber']
del training_frame['Time']

# ... and from the testing frame.
del testing_frame['UnitNumber']
del testing_frame['Time']

# Every remaining training column except the response is a predictor.
response_column = 'RUL'
training_columns = list(training_frame.columns)
training_columns.remove(response_column)
Exemple #2
0
# Reconstruction error of the autoencoder on the training frame
# (one combined value rather than one per feature: per_feature=False).
reconstruction_error = anomaly_model.anomaly(test_data=h_train,
                                             per_feature=False)
error_str = reconstruction_error.get_frame_data()
# [1:-1] discards the first line and the last split element
# (presumably the header and a trailing empty string) before parsing.
err_list = np.array([float(value) for value in error_str.split("\n")[1:-1]])

# The threshold is the largest observed reconstruction error.
threshold = np.amax(err_list)

print("Max Reconstruction Error       :", reconstruction_error.max())
print("Threshold Reconstruction Error :", threshold)

# Filter anomalies based on reconstruction error
p_filter = Filter.filterDataAutoEncoder(
    panda_frame=p_train,
    reconstruction_error=err_list,
    threshold=threshold,
)

# Feature dropping is disabled in this variant; the candidates were
# Setting3, Sensor1, Sensor5, Sensor10, Sensor16, Sensor18 and Sensor19.

# Convert the filtered training frame and the test frame to h2o frames.
h_filter = h2o.H2OFrame(p_filter)
h_filter.set_names(list(p_filter.columns))

h_test = h2o.H2OFrame(p_test)
# Training data as loaded by ProcessData
pData = ProcessData.trainData()

# Columns checked by the percentile filter: everything except the
# identifiers, the response and the three operational settings.
selected_columns = list(pData.columns)
for excluded_name in ('UnitNumber', 'Time', 'RUL',
                      'Setting1', 'Setting2', 'Setting3'):
    selected_columns.remove(excluded_name)

# Percentile-filtered data frame
df = Filter.filterDataPercentile(
    panda_frame=pData,
    columns=selected_columns,
    lower_percentile=0.01,
    upper_percentile=0.99,
    column_err_threshold=1,
)

# Feature engineering: the same two options are enabled for the
# filtered training data and for the testing data.
data_frame = ProcessData.trainDataToFrame(
    df,
    moving_k_closest_average=True,
    standard_deviation=True,
)
testing_frame = ProcessData.testData(
    moving_k_closest_average=True,
    standard_deviation=True,
)

# Build the h2o frames
hData = h2o.H2OFrame(data_frame)
hData.set_names(list(data_frame.columns))

hTesting = h2o.H2OFrame(testing_frame)
Exemple #4
0
def _parse_reconstruction_error(csv_text):
    """Parse a single-column CSV dump into a NumPy array of floats.

    The first line is treated as the header and skipped. Blank lines are
    ignored, so the result does not depend on whether the text ends with a
    newline.
    """
    data_lines = csv_text.splitlines()[1:]
    return np.array([float(line) for line in data_lines if line.strip()])


def function():
    """Filter training anomalies with an autoencoder, then classify ``BIN``.

    Steps, in order:

    1. Load the engineered training and testing frames from ``ProcessData``.
    2. Train an H2O autoencoder on the training frame, using every column
       except the response, identifier and operational-setting columns.
    3. Compute the reconstruction error on the training frame and hand it,
       with a threshold of 97% of the largest error, to
       ``Filter.filterDataAutoEncoder``.
    4. Drop a fixed set of columns from the filtered frame and train an
       ``H2ODeepLearningEstimator`` (10-fold cross-validation) on ``BIN``.
    5. Predict on the testing frame, pass actual and predicted labels to
       ``Measures.confusion_matrix`` and print both lists.

    Returns:
        None. All results are printed.
    """
    # AutoEncoder anomaly removal process
    p_train = ProcessData.trainData(moving_median_centered_average=True,
                                    standard_deviation=True,
                                    probability_distribution=True,
                                    bin_classification=True)
    p_test = ProcessData.testData(moving_median_centered_average=True,
                                  standard_deviation=True,
                                  probability_from_file=True,
                                  bin_classification=True)

    # Converting to h2o frame. The test frame is converted later, right
    # before it is needed, so it is uploaded only once.
    h_train = h2o.H2OFrame(p_train)
    h_train.set_names(list(p_train.columns))

    # Define autoencoder
    anomaly_model = H2OAutoEncoderEstimator(activation="Rectifier",
                                            hidden=[25, 12, 25],
                                            sparse=True,
                                            l1=1e-4,
                                            epochs=100)

    # Select relevant features: everything except responses, identifiers
    # and operational settings. ``remove`` raises if a column is missing.
    anomaly_train_columns = list(p_train.columns)
    print(anomaly_train_columns)
    for excluded in ('RUL', 'BIN', 'UnitNumber', 'Time',
                     'Setting1', 'Setting2', 'Setting3'):
        anomaly_train_columns.remove(excluded)

    # Train model
    anomaly_model.train(x=anomaly_train_columns, training_frame=h_train)

    # Get reconstruction error
    reconstruction_error = anomaly_model.anomaly(test_data=h_train,
                                                 per_feature=False)
    err_list = _parse_reconstruction_error(
        reconstruction_error.get_frame_data())

    # Threshold: 97% of the largest reconstruction error
    threshold = np.amax(err_list) * 0.97

    print("Max Reconstruction Error       :", reconstruction_error.max())
    print("Threshold Reconstruction Error :", threshold)

    # Filter anomalies based on reconstruction error
    p_filter = Filter.filterDataAutoEncoder(panda_frame=p_train,
                                            reconstruction_error=err_list,
                                            threshold=threshold)

    # Drop features
    for dropped in ('Setting3', 'Sensor1', 'Sensor5', 'Sensor10',
                    'Sensor16', 'Sensor18', 'Sensor19'):
        del p_filter[dropped]

    h_filter = h2o.H2OFrame(p_filter)
    h_filter.set_names(list(p_filter.columns))

    h_test = h2o.H2OFrame(p_test)
    h_test.set_names(list(p_test.columns))

    # Predictors: remaining columns minus identifiers and responses
    training_columns = list(p_filter.columns)
    for excluded in ('UnitNumber', 'Time', 'RUL', 'BIN'):
        training_columns.remove(excluded)

    # BIN is converted to a factor in both frames before training on it
    h_filter['BIN'] = h_filter['BIN'].asfactor()
    h_test['BIN'] = h_test['BIN'].asfactor()

    model = H2ODeepLearningEstimator(variable_importances=True)
    model.train(x=training_columns,
                y='BIN',
                training_frame=h_filter,
                nfolds=10)

    predict = model.predict(test_data=h_test)
    predict = DataFrameParser.h2oToList(predict['predict'])
    actual = DataFrameParser.h2oToList(h_test['BIN'])

    Measures.confusion_matrix(actual, predict)
    print(predict)
    print(actual)
Exemple #5
0
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from parser import DataFrameParser
from presenting import Chart

# Initialize h2o server
h2o.init()

# Train / validation / test splits are read from CSV files.
pTrain = pd.read_csv("hTrainMy.csv")
pValidate = pd.read_csv("hValidateMy.csv")
pTest = pd.read_csv("hTestingMy.csv")

# Columns handed to the filter: all except the identifiers, the
# response and the three operational settings.
all_columns = list(pTrain.columns)
removing_columns = ['UnitNumber', 'Time', 'RUL', 'Setting1', 'Setting2', 'Setting3']
selected_columns = [name for name in all_columns if name not in removing_columns]

filtered = Filter.filterData(
    panda_frame=pTrain,
    columns=selected_columns,
    removal_method="iqr",
    threshold=3,
)

# NOTE(review): the names for hTrain come from pTrain, not from
# `filtered`; this matches only if filterData keeps the column set - confirm.
hTrain = h2o.H2OFrame(filtered)
hTrain.set_names(list(pTrain.columns))

hValidate = h2o.H2OFrame(pValidate)
hValidate.set_names(list(pValidate.columns))

hTest = h2o.H2OFrame(pTest)
hTest.set_names(list(pTest.columns))

# Predictor columns: everything except identifiers and the response.
training_columns = list(pTrain.columns)
for excluded_name in ('UnitNumber', 'Time', 'RUL'):
    training_columns.remove(excluded_name)
Exemple #6
0





# Load data
p_train = pd.read_csv('Training.csv')
p_test = pd.read_csv('Testing.csv')

# Columns handed to the noise filter: all except the identifiers, the
# response and the three operational settings.
all_columns = list(p_train.columns)
removing_columns = ['UnitNumber', 'Time', 'RUL', 'Setting1', 'Setting2', 'Setting3']
selected_columns = [x for x in all_columns if x not in removing_columns]

# Filter training dataset.
# The original call passed `columns=[]`, leaving `selected_columns`
# unused; the computed list is passed here instead.
p_noise_filtered = Filter.filterData(panda_frame=p_train,
                                     columns=selected_columns,
                                     removal_method="iqr",
                                     threshold=3)

# Predictors keep the operational settings; only the identifiers and
# the response are excluded.
removing_columns = ['UnitNumber', 'Time', 'RUL']
training_columns = [x for x in all_columns if x not in removing_columns]
response_column = 'RUL'

# Set mapper (no transformer is attached to either column group)
df_mapper = DataFrameMapper([(training_columns, None), (response_column, None)])

# Pandas to sklearn: fit on the training data only and reuse the fitted
# mapper for the test data instead of re-fitting it there.
train = df_mapper.fit_transform(p_noise_filtered)
test = df_mapper.transform(p_test)

# [row : column] - read the column count from the array shape so an
# empty training frame does not raise an IndexError.
column_count = train.shape[1]