from sklearn.feature_selection import RFE
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn_pandas import DataFrameMapper

from dataprocessor import ProcessData, Filter

training_frame = ProcessData.trainData()
testing_frame = ProcessData.testData()

# Remove anomalies in the training frame based on percentiles
all_columns = list(training_frame.columns)
rm_columns = ['UnitNumber', 'Time', 'Setting1', 'Setting2', 'Setting3', 'RUL']
filter_columns = [x for x in all_columns if x not in rm_columns]
training_frame = Filter.filterDataPercentile(panda_frame=training_frame, columns=filter_columns,
                                             lower_percentile=0.01, upper_percentile=0.99,
                                             column_err_threshold=2)

# Drop identifier columns from the training frame
del training_frame['UnitNumber']
del training_frame['Time']

# Drop identifier columns from the testing frame
del testing_frame['UnitNumber']
del testing_frame['Time']

training_columns = list(training_frame.columns)
training_columns.remove('RUL')
response_column = 'RUL'
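# The percentile filter above lives in the project's dataprocessor module.
# A minimal sketch of what Filter.filterDataPercentile might do, assuming it
# counts, per row, how many of the listed columns fall outside the percentile
# bounds and drops rows exceeding column_err_threshold (an assumption, not
# the confirmed implementation):

import numpy as np
import pandas as pd

def filter_data_percentile_sketch(panda_frame, columns, lower_percentile,
                                  upper_percentile, column_err_threshold):
    # Count, per row, how many columns fall outside their percentile bounds
    err_counts = pd.Series(0, index=panda_frame.index)
    for col in columns:
        lower = panda_frame[col].quantile(lower_percentile)
        upper = panda_frame[col].quantile(upper_percentile)
        outside = (panda_frame[col] < lower) | (panda_frame[col] > upper)
        err_counts += outside.astype(int)
    # Keep rows with at most column_err_threshold out-of-range columns
    return panda_frame[err_counts <= column_err_threshold].reset_index(drop=True)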
# Get reconstruction error for every training row
reconstruction_error = anomaly_model.anomaly(test_data=h_train, per_feature=False)
error_str = reconstruction_error.get_frame_data()
err_list = list(map(float, error_str.split("\n")[1:-1]))
err_list = np.array(err_list)

# Threshold: the maximum reconstruction error observed on the training data
threshold = np.amax(err_list)
print("Max Reconstruction Error :", reconstruction_error.max())
print("Threshold Reconstruction Error :", threshold)

# Filter anomalies based on reconstruction error
p_filter = Filter.filterDataAutoEncoder(panda_frame=p_train, reconstruction_error=err_list,
                                        threshold=threshold)

# Drop features
# del p_filter['Setting3']
# del p_filter['Sensor1']
# del p_filter['Sensor5']
# del p_filter['Sensor10']
# del p_filter['Sensor16']
# del p_filter['Sensor18']
# del p_filter['Sensor19']

h_filter = h2o.H2OFrame(p_filter)
h_filter.set_names(list(p_filter.columns))

h_test = h2o.H2OFrame(p_test)
h_test.set_names(list(p_test.columns))
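# Filter.filterDataAutoEncoder is another project helper. A plausible sketch,
# assuming it keeps only rows whose reconstruction error does not exceed the
# threshold (row order must match the error array):

import numpy as np

def filter_data_autoencoder_sketch(panda_frame, reconstruction_error, threshold):
    # Boolean mask over rows, aligned with the reconstruction error array
    mask = np.asarray(reconstruction_error) <= threshold
    return panda_frame[mask].reset_index(drop=True)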
# Load training data frame
pData = ProcessData.trainData()

# Select columns
selected_columns = list(pData.columns)
selected_columns.remove('UnitNumber')
selected_columns.remove('Time')
selected_columns.remove('RUL')
selected_columns.remove('Setting1')
selected_columns.remove('Setting2')
selected_columns.remove('Setting3')

# Filtered data frame
df = Filter.filterDataPercentile(panda_frame=pData, columns=selected_columns,
                                 lower_percentile=0.01, upper_percentile=0.99,
                                 column_err_threshold=1)

# Feature engineering
data_frame = ProcessData.trainDataToFrame(df, moving_k_closest_average=True, standard_deviation=True)
testing_frame = ProcessData.testData(moving_k_closest_average=True, standard_deviation=True)

# Create h2o frames
hData = h2o.H2OFrame(data_frame)
hData.set_names(list(data_frame.columns))

hTesting = h2o.H2OFrame(testing_frame)
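# The moving_k_closest_average and standard_deviation flags suggest rolling
# window features computed per engine unit. A rough sketch of how such
# features could be built with pandas; the window size, column naming, and
# exact windowing rule are assumptions, not the confirmed ProcessData logic:

import pandas as pd

def add_rolling_features_sketch(frame, sensor_columns, window=5):
    # Append a rolling mean and rolling standard deviation per sensor,
    # computed within each engine unit so units do not bleed into each other
    for col in sensor_columns:
        grouped = frame.groupby('UnitNumber')[col]
        frame[col + '_avg'] = grouped.transform(
            lambda s: s.rolling(window, min_periods=1).mean())
        frame[col + '_std'] = grouped.transform(
            lambda s: s.rolling(window, min_periods=1).std().fillna(0.0))
    return frame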
def function():
    # AutoEncoder anomaly removal process
    p_train = ProcessData.trainData(moving_median_centered_average=True, standard_deviation=True,
                                    probability_distribution=True, bin_classification=True)
    p_test = ProcessData.testData(moving_median_centered_average=True, standard_deviation=True,
                                  probability_from_file=True, bin_classification=True)

    # Convert to h2o frames
    h_test = h2o.H2OFrame(p_test)
    h_test.set_names(list(p_test.columns))

    h_train = h2o.H2OFrame(p_train)
    h_train.set_names(list(p_train.columns))

    # Define autoencoder
    anomaly_model = H2OAutoEncoderEstimator(activation="Rectifier", hidden=[25, 12, 25],
                                            sparse=True, l1=1e-4, epochs=100)

    # Select relevant features
    anomaly_train_columns = list(p_train.columns)
    print(anomaly_train_columns)
    anomaly_train_columns.remove('RUL')
    anomaly_train_columns.remove('BIN')
    anomaly_train_columns.remove('UnitNumber')
    anomaly_train_columns.remove('Time')
    anomaly_train_columns.remove('Setting1')
    anomaly_train_columns.remove('Setting2')
    anomaly_train_columns.remove('Setting3')

    # Train the autoencoder
    anomaly_model.train(x=anomaly_train_columns, training_frame=h_train)

    # Get reconstruction error for every training row
    reconstruction_error = anomaly_model.anomaly(test_data=h_train, per_feature=False)
    error_str = reconstruction_error.get_frame_data()
    err_list = list(map(float, error_str.split("\n")[1:-1]))
    err_list = np.array(err_list)

    # Threshold: keep rows below 97% of the maximum reconstruction error
    threshold = np.amax(err_list) * 0.97
    print("Max Reconstruction Error :", reconstruction_error.max())
    print("Threshold Reconstruction Error :", threshold)

    # Filter anomalies based on reconstruction error
    p_filter = Filter.filterDataAutoEncoder(panda_frame=p_train, reconstruction_error=err_list,
                                            threshold=threshold)

    # Drop features
    del p_filter['Setting3']
    del p_filter['Sensor1']
    del p_filter['Sensor5']
    del p_filter['Sensor10']
    del p_filter['Sensor16']
    del p_filter['Sensor18']
    del p_filter['Sensor19']

    h_filter = h2o.H2OFrame(p_filter)
    h_filter.set_names(list(p_filter.columns))

    h_test = h2o.H2OFrame(p_test)
    h_test.set_names(list(p_test.columns))

    training_columns = list(p_filter.columns)
    training_columns.remove('UnitNumber')
    training_columns.remove('Time')
    training_columns.remove('RUL')
    training_columns.remove('BIN')

    # Treat BIN as a categorical response
    h_filter['BIN'] = h_filter['BIN'].asfactor()
    h_test['BIN'] = h_test['BIN'].asfactor()

    # nfolds is an estimator parameter in h2o, not a train() argument
    model = H2ODeepLearningEstimator(variable_importances=True, nfolds=10)
    model.train(x=training_columns, y='BIN', training_frame=h_filter)

    predict = model.predict(test_data=h_test)
    predict = DataFrameParser.h2oToList(predict['predict'])
    actual = DataFrameParser.h2oToList(h_test['BIN'])

    Measures.confusion_matrix(actual, predict)
    print(predict)
    print(actual)
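# The evaluation step depends on two more project helpers. Minimal sketches,
# assuming DataFrameParser.h2oToList flattens a single-column H2OFrame and
# Measures.confusion_matrix tallies (actual, predicted) label pairs:

from collections import Counter

def h2o_to_list_sketch(h2o_column):
    # Pull a single-column H2OFrame into a flat Python list via pandas
    return h2o_column.as_data_frame(use_pandas=True).iloc[:, 0].tolist()

def confusion_matrix_sketch(actual, predicted):
    # Count each (actual, predicted) pair and print the tallies
    counts = Counter(zip(actual, predicted))
    for (a, p), n in sorted(counts.items()):
        print("actual=%s predicted=%s count=%d" % (a, p, n))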
import h2o
import pandas as pd

from h2o.estimators.gbm import H2OGradientBoostingEstimator
from parser import DataFrameParser
from presenting import Chart
from dataprocessor import Filter

# Initialize h2o server
h2o.init()

pTrain = pd.read_csv("hTrainMy.csv")
pValidate = pd.read_csv("hValidateMy.csv")
pTest = pd.read_csv("hTestingMy.csv")

all_columns = list(pTrain.columns)
removing_columns = ['UnitNumber', 'Time', 'RUL', 'Setting1', 'Setting2', 'Setting3']
selected_columns = [x for x in all_columns if x not in removing_columns]

filtered = Filter.filterData(panda_frame=pTrain, columns=selected_columns,
                             removal_method="iqr", threshold=3)

hTrain = h2o.H2OFrame(filtered)
hTrain.set_names(list(pTrain.columns))

hValidate = h2o.H2OFrame(pValidate)
hValidate.set_names(list(pValidate.columns))

hTest = h2o.H2OFrame(pTest)
hTest.set_names(list(pTest.columns))

training_columns = list(pTrain.columns)
training_columns.remove('UnitNumber')
training_columns.remove('Time')
training_columns.remove('RUL')
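# Filter.filterData with removal_method="iqr" is a project helper. A sketch
# assuming it drops any row lying more than `threshold` interquartile ranges
# below Q1 or above Q3 in any of the listed columns (an assumption):

import pandas as pd

def filter_data_iqr_sketch(panda_frame, columns, threshold):
    keep = pd.Series(True, index=panda_frame.index)
    for col in columns:
        q1 = panda_frame[col].quantile(0.25)
        q3 = panda_frame[col].quantile(0.75)
        iqr = q3 - q1
        # Row survives only if it is inside the widened IQR band
        keep &= panda_frame[col].between(q1 - threshold * iqr, q3 + threshold * iqr)
    return panda_frame[keep].reset_index(drop=True)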
import pandas as pd

from sklearn_pandas import DataFrameMapper
from dataprocessor import Filter

# Load data
p_train = pd.read_csv('Training.csv')
p_test = pd.read_csv('Testing.csv')

all_columns = list(p_train.columns)
removing_columns = ['UnitNumber', 'Time', 'RUL', 'Setting1', 'Setting2', 'Setting3']
selected_columns = [x for x in all_columns if x not in removing_columns]

# Filter training dataset (pass the selected sensor columns to the IQR filter)
p_noise_filtered = Filter.filterData(panda_frame=p_train, columns=selected_columns,
                                     removal_method="iqr", threshold=3)

removing_columns = ['UnitNumber', 'Time', 'RUL']
training_columns = [x for x in all_columns if x not in removing_columns]
response_column = 'RUL'

# Set mapper
df_mapper = DataFrameMapper([(training_columns, None), (response_column, None)])

# Pandas to sklearn
train = df_mapper.fit_transform(p_noise_filtered)
test = df_mapper.fit_transform(p_test)

# [row : column]
column_count = len(train[0, :])
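# With the mapper output, the features occupy the first column_count - 1
# columns and RUL is the last. A sketch of feeding these arrays to the RFE
# selector imported earlier; the base estimator and feature count here are
# assumptions, not the project's confirmed choice:

from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFE
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Split the mapped arrays into features and response
x_train, y_train = train[:, :column_count - 1], train[:, column_count - 1]
x_test, y_test = test[:, :column_count - 1], test[:, column_count - 1]

# Recursive feature elimination around an assumed base estimator
selector = RFE(RandomForestRegressor(n_estimators=50), n_features_to_select=10, step=1)
selector.fit(x_train, y_train)

predicted = selector.predict(x_test)
print("MAE :", mean_absolute_error(y_test, predicted))
print("MSE :", mean_squared_error(y_test, predicted))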