from keras.layers import Dense, LSTM, Dropout, Activation
from keras.models import Sequential
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import KFold, cross_val_score
from sklearn.preprocessing import MinMaxScaler
from dataprocessor import ProcessData
from sklearn_pandas import DataFrameMapper
import numpy
import math

# define response variable
response_column = 'RUL'

# load pre-processed data frames
training_frame = ProcessData.trainData()
testing_frame = ProcessData.testData()

del training_frame['UnitNumber']
del training_frame['Time']
del testing_frame['UnitNumber']
del testing_frame['Time']

# feature column names (exclude the response, as the sibling scripts do)
training_columns = list(training_frame.columns)
training_columns.remove(response_column)

# Set mapper
df_mapper = DataFrameMapper([(training_columns, None), (response_column, None)])
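# --- Hedged continuation (not in the original script): a minimal sketch of
# how the imports above are typically wired together - build a small dense
# network, wrap it for scikit-learn, and cross-validate. The layer sizes,
# epochs, and batch size are illustrative assumptions.
def build_model():
    model = Sequential()
    model.add(Dense(64, input_dim=len(training_columns), activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(1))
    model.compile(loss='mean_squared_error', optimizer='adam')
    return model

# Map the pandas frame to arrays and scale features to [0, 1]
train_array = df_mapper.fit_transform(training_frame)
x_train = MinMaxScaler().fit_transform(train_array[:, :-1])
y_train = train_array[:, -1]

# 10-fold cross-validation of the wrapped regressor
estimator = KerasRegressor(build_fn=build_model, epochs=10, batch_size=32, verbose=0)
scores = cross_val_score(estimator, x_train, y_train, cv=KFold(n_splits=10), scoring='neg_mean_squared_error')
print("CV MSE:", -scores.mean())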
import h2o
import numpy as np
from h2o.estimators import H2OAutoEncoderEstimator
from h2o.estimators import H2ODeepLearningEstimator
from h2o.estimators import H2ORandomForestEstimator
from dataprocessor import ProcessData, Filter
from featureeng import Measures
from parser import DataFrameParser

# Initialize server
h2o.init()

# AutoEncoder anomaly removal process
p_train = ProcessData.trainData(bin_classification=True)
p_test = ProcessData.testData(bin_classification=True)

# Convert to h2o frames
h_test = h2o.H2OFrame(p_test)
h_test.set_names(list(p_test.columns))
h_train = h2o.H2OFrame(p_train)
h_train.set_names(list(p_train.columns))

# Define autoencoder
anomaly_model = H2OAutoEncoderEstimator(activation="Rectifier", hidden=[25, 12, 25], sparse=True, l1=1e-4, epochs=100)

# Select relevant features
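# --- Hedged continuation (assumption): the script ends at the comment above;
# a plausible completion mirroring the full autoencoder flow used elsewhere
# in this repository (train, threshold on reconstruction error, filter rows).
anomaly_train_columns = list(p_train.columns)
anomaly_train_columns.remove('RUL')
anomaly_train_columns.remove('BIN')
anomaly_train_columns.remove('UnitNumber')
anomaly_train_columns.remove('Time')

anomaly_model.train(x=anomaly_train_columns, training_frame=h_train)

# Per-row reconstruction error; rows above the threshold count as anomalies
reconstruction_error = anomaly_model.anomaly(test_data=h_train, per_feature=False)
err_list = np.array(list(map(float, reconstruction_error.get_frame_data().split("\n")[1:-1])))
threshold = np.amax(err_list) * 0.97  # threshold factor borrowed from the sibling script
p_filter = Filter.filterDataAutoEncoder(panda_frame=p_train, reconstruction_error=err_list, threshold=threshold)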
import h2o
from dataprocessor import ProcessData

# Configuration
_validation_ratio_1 = 0.2  # For Auto Encoder
_validation_ratio_2 = 0.2  # For Predictive Model
_reconstruction_error_rate = 0.9
_nmodels = 50
_smodels = 10
_lim = 3

# Define response column
response_column = 'RUL'

# Initialize server
h2o.init()

# Load data frame
pData = ProcessData.trainData()

# Split data frame
pValidate = pData.sample(frac=_validation_ratio_1, random_state=200)
pTrain = pData.drop(pValidate.index)

# Convert pandas to h2o frame - for anomaly detection
hValidate = h2o.H2OFrame(pValidate)
hValidate.set_names(list(pValidate.columns))
hTrain = h2o.H2OFrame(pTrain)
hTrain.set_names(list(pTrain.columns))

# Select relevant features
anomaly_train_columns = list(hTrain.columns)
anomaly_train_columns.remove(response_column)
anomaly_train_columns.remove('UnitNumber')
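# --- Hedged continuation (assumption): the configuration above implies an
# autoencoder whose anomaly threshold comes from the validation split at the
# _reconstruction_error_rate percentile. A minimal sketch:
import numpy as np
from h2o.estimators import H2OAutoEncoderEstimator

anomaly_model = H2OAutoEncoderEstimator(activation="Rectifier", hidden=[25, 12, 25], sparse=True, l1=1e-4, epochs=100)
anomaly_model.train(x=anomaly_train_columns, training_frame=hTrain, validation_frame=hValidate)

reconstruction_error = anomaly_model.anomaly(test_data=hValidate, per_feature=False)
errors = np.array(list(map(float, reconstruction_error.get_frame_data().split("\n")[1:-1])))
threshold = np.percentile(errors, _reconstruction_error_rate * 100)
print("Reconstruction error threshold:", threshold)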
# Random Forest Regressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn_pandas import DataFrameMapper
import joblib
from dataprocessor import ProcessData
import numpy as np
import os

# Define response variable
response_column = "RUL"

# Process data
training_frame = ProcessData.trainData(moving_average=True, standard_deviation=True, moving_entropy=True)
testing_frame = ProcessData.testData(moving_average=True, standard_deviation=True, moving_entropy=True)

# Select training columns
training_columns = list(training_frame.columns)
training_columns.remove(response_column)  # Remove RUL
training_columns.remove("UnitNumber")     # Remove UnitNumber
training_columns.remove("Time")           # Remove Time

# Set mapper
df_mapper = DataFrameMapper([(training_columns, None), (response_column, None)])
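# --- Hedged continuation (assumption): fit the regressor on the mapped
# arrays and score the test frame; the tree count is illustrative.
train_array = df_mapper.fit_transform(training_frame)
test_array = df_mapper.transform(testing_frame)
x_train, y_train = train_array[:, :-1], train_array[:, -1]
x_test, y_test = test_array[:, :-1], test_array[:, -1]

model = RandomForestRegressor(n_estimators=100)
model.fit(x_train, y_train)

predictions = model.predict(x_test)
print("MSE :", mean_squared_error(y_test, predictions))
print("MAE :", mean_absolute_error(y_test, predictions))
print("RMSE:", np.sqrt(mean_squared_error(y_test, predictions)))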
from dataprocessor import ProcessData
from featureeng import Measures

train = ProcessData.trainData()
training_columns = list(train.columns)
training_columns.remove('RUL')

# Print the correlation of each feature against the response
for column in training_columns:
    print(column, Measures.correlation(train[column].values, train['RUL'].values))
# h2o testing
import h2o
from h2o.estimators import H2ODeepLearningEstimator
from dataprocessor import ProcessData
from featureeng.Math import moving_probability, probabilty_distribution

# initialize server
h2o.init()

# define response variable
response_column = 'RUL'

# load pre-processed data frames
training_frame = ProcessData.trainData(standard_deviation=True, moving_k_closest_average=True, moving_median_centered_average=True, moving_threshold_average=True)
testing_frame = ProcessData.testData(standard_deviation=True, moving_k_closest_average=True, moving_median_centered_average=True, moving_threshold_average=True)

# create h2o frames
train = h2o.H2OFrame(training_frame)
test = h2o.H2OFrame(testing_frame)
train.set_names(list(training_frame.columns))
test.set_names(list(testing_frame.columns))

# Feature selection
training_columns = list(training_frame.columns)
training_columns.remove(response_column)
training_columns.remove("UnitNumber")
training_columns.remove("Time")

# Build model
model = H2ODeepLearningEstimator(hidden=[200, 200], score_each_iteration=False, variable_importances=True)
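# --- Hedged continuation (assumption): train the network and score the test
# frame, as the sibling training scripts do.
model.train(x=training_columns, y=response_column, training_frame=train)
print(model.model_performance(test_data=test))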
# GridSearch for RandomForest
from h2o.estimators import H2ORandomForestEstimator
from h2o.grid import H2OGridSearch
from dataprocessor import ProcessData
from anomaly import Test
import numpy as np
import h2o

# Initialize h2o server
h2o.init()

# Load training data frame
pData = ProcessData.trainData()

# Select columns
selected_columns = list(pData.columns)
selected_columns.remove('UnitNumber')
selected_columns.remove('Time')
selected_columns.remove('RUL')
selected_columns.remove('Setting1')
selected_columns.remove('Setting2')
selected_columns.remove('Setting3')

tot = 0
anomaly_series = []
for column in selected_columns:
    series = pData[column]
    anomaly = Test.iqr(series, threshold=3)
    anomaly_series.extend(anomaly)
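# --- Hedged continuation (assumption): the `tot` counter above suggests the
# script goes on to summarise the flagged rows; a minimal sketch:
tot = len(np.unique(anomaly_series))
print("Rows flagged by the IQR test:", tot, "of", len(pData))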
import math
import h2o
from dataprocessor import ProcessData
from h2o.estimators import H2ODeepLearningEstimator
from sklearn.metrics import mean_squared_error, mean_absolute_error

# config
_nmodels = 10
_smodels = 5
_lim = 1
_validation_ratio = 0.8

# initialize server
h2o.init()

# get processed data
pTrain = ProcessData.trainData(moving_k_closest_average=True, standard_deviation=True, probability_distribution=True)
pTest = ProcessData.testData(moving_k_closest_average=True, standard_deviation=True, probability_from_file=True)

# convert to h2o frames
hTrain = h2o.H2OFrame(pTrain)
hTest = h2o.H2OFrame(pTest)
hTrain.set_names(list(pTrain.columns))
hTest.set_names(list(pTest.columns))

# select column names
response_column = 'RUL'
training_columns = list(pTrain.columns)
training_columns.remove(response_column)
training_columns.remove("UnitNumber")
training_columns.remove("Time")
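# --- Hedged continuation (assumption): the _nmodels/_smodels settings above
# suggest an ensemble scheme - train several networks on random splits and
# keep the best few by validation error. A minimal sketch; layer sizes and
# epochs are illustrative.
models = []
for i in range(_nmodels):
    frames = hTrain.split_frame(ratios=[_validation_ratio])
    model = H2ODeepLearningEstimator(hidden=[64, 64], epochs=10)
    model.train(x=training_columns, y=response_column, training_frame=frames[0], validation_frame=frames[1])
    models.append(model)

# Keep the _smodels models with the lowest validation MSE
models.sort(key=lambda m: m.mse(valid=True))
selected = models[:_smodels]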
import h2o
import numpy as np
from dataprocessor import ProcessData


# NOTE: the header of this helper is missing from the source; the name and
# signature below are reconstructed from the body.
def error_percentile(error_str, percentile):
    # Parse h2o's frame dump into a float array and return the requested percentile
    err_list = list(map(float, error_str.split("\n")[1:-1]))
    var = np.array(err_list)  # input array
    return np.percentile(var, percentile * 100)


_validation_ratio = 0.1
_reconstruction_error_rate = 0.9

# Define response column
response_column = 'RUL'

# initialize server
h2o.init()

# Load data frames
pData = ProcessData.trainData()
# pTest = ProcessData.testData()

# Split data frame
pValidate = pData.sample(frac=_validation_ratio, random_state=200)
pTrain = pData.drop(pValidate.index)

# Convert pandas to h2o frame - for anomaly detection
hValidate = h2o.H2OFrame(pValidate)
hValidate.set_names(list(pValidate.columns))
hTrain = h2o.H2OFrame(pTrain)
hTrain.set_names(list(pTrain.columns))

# Save validate and train frames
pValidate.to_csv("Auto-Validate.csv", index=False)
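# --- Hedged continuation (assumption): the comment above names both frames,
# so the train split is presumably written out the same way.
pTrain.to_csv("Auto-Train.csv", index=False)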
import h2o
import numpy as np
import math
from dataprocessor import ProcessData
from h2o.estimators import H2ODeepLearningEstimator
from sklearn.metrics import mean_squared_error, mean_absolute_error

# config
_nmodels = 10

# initialize server
h2o.init()

# get processed data
pTrain = ProcessData.trainData(moving_k_closest_average=True, standard_deviation=True)
pTest = ProcessData.testData(moving_threshold_average=True, standard_deviation=True)

# convert to h2o frames
hTrain = h2o.H2OFrame(pTrain)
hTest = h2o.H2OFrame(pTest)
hTrain.set_names(list(pTrain.columns))
hTest.set_names(list(pTest.columns))

# select column names
response_column = 'RUL'
training_columns = list(pTrain.columns)
training_columns.remove(response_column)
training_columns.remove("UnitNumber")
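# --- Hedged continuation (assumption): average the predictions of _nmodels
# networks on the test frame and score with the imported metrics. This
# assumes the train and test frames share the same engineered columns, and
# that 'Time' is removed as in the sibling scripts.
training_columns.remove("Time")  # assumed

predictions = np.zeros(hTest.nrows)
for i in range(_nmodels):
    model = H2ODeepLearningEstimator(hidden=[64, 64], epochs=10)
    model.train(x=training_columns, y=response_column, training_frame=hTrain)
    frame = model.predict(test_data=hTest)
    predictions += np.array(list(map(float, frame.get_frame_data().split("\n")[1:-1])))
predictions /= _nmodels

actual = pTest['RUL'].values
print("MSE :", mean_squared_error(actual, predictions))
print("RMSE:", math.sqrt(mean_squared_error(actual, predictions)))
print("MAE :", mean_absolute_error(actual, predictions))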
import h2o
from h2o.estimators import H2ODeepLearningEstimator
from h2o.estimators import H2ORandomForestEstimator
from h2o.grid import H2OGridSearch
from dataprocessor import ProcessData

# Initialize server
h2o.init()

data = ProcessData.trainData(moving_k_closest_average=True, standard_deviation=True, probability_distribution=True)
hData = h2o.H2OFrame(data)
hData.set_names(list(data.columns))

training_columns = list(data.columns)
training_columns.remove('RUL')
training_columns.remove('UnitNumber')
training_columns.remove('Time')

# hyper_parameters = {'ntrees': [10, 50], 'max_depth': [20, 10]}
# grid_search = H2OGridSearch(H2ORandomForestEstimator, hyper_params=hyper_parameters)
# grid_search.train(x=training_columns, y='RUL', training_frame=hData)
# grid_search.show()
# models = grid_search.sort_by("mse")
# print models

hyper_parameters = {
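    # --- Hedged completion (assumption): the dict is truncated in the source.
    # Illustrative candidates for a deep-learning grid, mirroring the
    # commented-out random-forest grid above:
    'hidden': [[32, 32], [64, 64], [200, 200]],
    'epochs': [10, 50, 100],
}
grid_search = H2OGridSearch(H2ODeepLearningEstimator, hyper_params=hyper_parameters)
grid_search.train(x=training_columns, y='RUL', training_frame=hData)
grid_search.show()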
import h2o
import numpy as np
import pandas as pd
from h2o.estimators import H2OAutoEncoderEstimator
from h2o.estimators import H2ODeepLearningEstimator
from dataprocessor import ProcessData, Filter
from parser import DataFrameParser
from presenting import Chart

# Connect to h2o instance
h2o.init()

# AutoEncoder anomaly removal process
p_train = ProcessData.trainData()
h_train = h2o.H2OFrame(p_train)
h_train.set_names(list(p_train.columns))

anomaly_model = H2OAutoEncoderEstimator(activation="Rectifier", hidden=[25, 12, 25], sparse=True, l1=1e-4, epochs=100)

# Select relevant features
anomaly_train_columns = list(p_train.columns)
anomaly_train_columns.remove('RUL')
anomaly_train_columns.remove('UnitNumber')
anomaly_train_columns.remove('Time')
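# --- Hedged continuation (assumption): train the autoencoder and inspect the
# per-row reconstruction error, following the pattern of the sibling scripts.
anomaly_model.train(x=anomaly_train_columns, training_frame=h_train)
reconstruction_error = anomaly_model.anomaly(test_data=h_train, per_feature=False)
errors = np.array(list(map(float, reconstruction_error.get_frame_data().split("\n")[1:-1])))
print("Max reconstruction error:", errors.max())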
# Ridge Regression
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn_pandas import DataFrameMapper
import joblib
from dataprocessor import ProcessData
import numpy as np
import os

# Define response variable
response_column = "RUL"

# Process data
training_frame = ProcessData.trainData(moving_average=True, standard_deviation=True, probability_distribution=True)
testing_frame = ProcessData.testData(moving_average=True, standard_deviation=True, probability_distribution=True)

# Select training columns
training_columns = list(training_frame.columns)
training_columns.remove(response_column)  # Remove RUL
training_columns.remove("UnitNumber")     # Remove UnitNumber
training_columns.remove("Time")           # Remove Time

# Set mapper
df_mapper = DataFrameMapper([(training_columns, None), (response_column, None)])
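# --- Hedged continuation (assumption): fit a ridge model on the mapped
# arrays and report test error; alpha is illustrative.
train_array = df_mapper.fit_transform(training_frame)
test_array = df_mapper.transform(testing_frame)
x_train, y_train = train_array[:, :-1], train_array[:, -1]
x_test, y_test = test_array[:, :-1], test_array[:, -1]

model = linear_model.Ridge(alpha=1.0)
model.fit(x_train, y_train)

predictions = model.predict(x_test)
print("MSE:", mean_squared_error(y_test, predictions))
print("MAE:", mean_absolute_error(y_test, predictions))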
import h2o
from h2o.estimators import H2OAutoEncoderEstimator
from h2o.estimators import H2ODeepLearningEstimator
from sklearn.model_selection import train_test_split
from dataprocessor import ProcessData
from featureeng import Progress

_validation_ratio = 0.1
_reconstruction_error_rate = 0.9

# Define response column
response_column = 'RUL'

# initialize server
h2o.init()

# Load data frames
pData = ProcessData.trainData()
pTest = ProcessData.testData()

# Split methods
# method 1: random split via scikit-learn
train, validate = train_test_split(pData, test_size=_validation_ratio)
print(len(train))
print(train)

# method 2: pandas sampling with a fixed seed
validate = pData.sample(frac=_validation_ratio, random_state=200)
train = pData.drop(validate.index)
print(len(train))
print(train)
# Recursive Feature Elimination
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFE
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn_pandas import DataFrameMapper
from dataprocessor import ProcessData, Filter

training_frame = ProcessData.trainData()
testing_frame = ProcessData.testData()

# Remove anomalies in training frame based on percentile
all_columns = list(training_frame.columns)
rm_columns = ['UnitNumber', 'Time', 'Setting1', 'Setting2', 'Setting3', 'RUL']
filter_columns = [x for x in all_columns if x not in rm_columns]
training_frame = Filter.filterDataPercentile(panda_frame=training_frame, columns=filter_columns, lower_percentile=0.01, upper_percentile=0.99, column_err_threshold=2)

# Training data columns
del training_frame['UnitNumber']
del training_frame['Time']

# Testing columns
del testing_frame['UnitNumber']
del testing_frame['Time']
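# --- Hedged continuation (assumption): run RFE with a random forest to rank
# the remaining features; the number of features to keep is illustrative.
training_columns = [c for c in training_frame.columns if c != 'RUL']
x_train = training_frame[training_columns].values
y_train = training_frame['RUL'].values

selector = RFE(estimator=RandomForestRegressor(n_estimators=50), n_features_to_select=10)
selector.fit(x_train, y_train)
for name, keep, rank in zip(training_columns, selector.support_, selector.ranking_):
    print(name, keep, rank)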
def function():
    # AutoEncoder anomaly removal process
    p_train = ProcessData.trainData(moving_median_centered_average=True, standard_deviation=True, probability_distribution=True, bin_classification=True)
    p_test = ProcessData.testData(moving_median_centered_average=True, standard_deviation=True, probability_from_file=True, bin_classification=True)

    # Convert to h2o frames
    h_test = h2o.H2OFrame(p_test)
    h_test.set_names(list(p_test.columns))
    h_train = h2o.H2OFrame(p_train)
    h_train.set_names(list(p_train.columns))

    # Define autoencoder
    anomaly_model = H2OAutoEncoderEstimator(activation="Rectifier", hidden=[25, 12, 25], sparse=True, l1=1e-4, epochs=100)

    # Select relevant features
    anomaly_train_columns = list(p_train.columns)
    print(anomaly_train_columns)
    anomaly_train_columns.remove('RUL')
    anomaly_train_columns.remove('BIN')
    anomaly_train_columns.remove('UnitNumber')
    anomaly_train_columns.remove('Time')
    anomaly_train_columns.remove('Setting1')
    anomaly_train_columns.remove('Setting2')
    anomaly_train_columns.remove('Setting3')

    # Train model
    anomaly_model.train(x=anomaly_train_columns, training_frame=h_train)

    # Get reconstruction error
    reconstruction_error = anomaly_model.anomaly(test_data=h_train, per_feature=False)
    error_str = reconstruction_error.get_frame_data()
    err_list = list(map(float, error_str.split("\n")[1:-1]))
    err_list = np.array(err_list)

    # Threshold
    threshold = np.amax(err_list) * 0.97
    print("Max Reconstruction Error :", reconstruction_error.max())
    print("Threshold Reconstruction Error :", threshold)

    # Filter anomalies based on reconstruction error
    p_filter = Filter.filterDataAutoEncoder(panda_frame=p_train, reconstruction_error=err_list, threshold=threshold)

    # Drop features
    del p_filter['Setting3']
    del p_filter['Sensor1']
    del p_filter['Sensor5']
    del p_filter['Sensor10']
    del p_filter['Sensor16']
    del p_filter['Sensor18']
    del p_filter['Sensor19']

    h_filter = h2o.H2OFrame(p_filter)
    h_filter.set_names(list(p_filter.columns))
    h_test = h2o.H2OFrame(p_test)
    h_test.set_names(list(p_test.columns))

    training_columns = list(p_filter.columns)
    training_columns.remove('UnitNumber')
    training_columns.remove('Time')
    training_columns.remove('RUL')
    training_columns.remove('BIN')

    # Treat BIN as a categorical response
    h_filter['BIN'] = h_filter['BIN'].asfactor()
    h_test['BIN'] = h_test['BIN'].asfactor()

    # nfolds belongs on the estimator, not on train()
    model = H2ODeepLearningEstimator(variable_importances=True, nfolds=10)
    model.train(x=training_columns, y='BIN', training_frame=h_filter)

    predict = model.predict(test_data=h_test)
    predict = DataFrameParser.h2oToList(predict['predict'])
    actual = DataFrameParser.h2oToList(h_test['BIN'])

    Measures.confusion_matrix(actual, predict)
    print(predict)
    print(actual)
# Check anomalies without autoencoders
from h2o.estimators import H2ODeepLearningEstimator
from h2o.grid import H2OGridSearch
from dataprocessor import ProcessData
from anomaly import Test
import numpy as np
import h2o

# Initialize h2o server
h2o.init()

# Load training data frame
pData = ProcessData.trainData()

# Select columns
selected_columns = list(pData.columns)
selected_columns.remove('UnitNumber')
selected_columns.remove('Time')
selected_columns.remove('RUL')
selected_columns.remove('Setting1')
selected_columns.remove('Setting2')
selected_columns.remove('Setting3')

tot = 0
anomaly_series = []
for column in selected_columns:
    series = pData[column]
    anomaly = Test.iqr(series, threshold=5)
    anomaly_series.extend(anomaly)
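# --- Hedged continuation (assumption): summarise the rows flagged at this
# stricter threshold.
tot = len(np.unique(anomaly_series))
print("Rows flagged by the IQR test (threshold=5):", tot)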
# NOTE: fragment - the imports and the definitions of `all_columns`,
# `sustain`, `train`, and `test` precede this excerpt in the original file.
for column in all_columns:
    if column not in sustain:
        del train[column]
        del test[column]

training_columns = sustain
training_columns.remove('UnitNumber')
training_columns.remove('RUL')
training_columns.remove('Time')

# filter_train = Process.filterData(panda_frame=train, columns=sustain, removal_method='iqr', threshold=4)
filter_train = train

feature_engineered_train = ProcessData.trainDataToFrame(training_frame=filter_train, moving_k_closest_average=True, standard_deviation=True)
feature_engineered_test = ProcessData.trainDataToFrame(training_frame=test, moving_k_closest_average=True, standard_deviation=True, rul=True)

h_train = h2o.H2OFrame(feature_engineered_train)
h_train.set_names(list(feature_engineered_train.columns))
h_test = h2o.H2OFrame(feature_engineered_test)
h_test.set_names(list(feature_engineered_test.columns))

model = H2ODeepLearningEstimator(epochs=100,
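                                 # --- Hedged completion (assumption): the argument
                                 # list is truncated in the source; the remaining
                                 # settings below are illustrative.
                                 hidden=[200, 200],
                                 variable_importances=True)
model.train(x=training_columns, y='RUL', training_frame=h_train)
print(model.model_performance(test_data=h_test))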
import h2o
import numpy as np
import pandas as pd
from dataprocessor import ProcessData
from sklearn.metrics import mean_squared_error
from featureeng.Math import moving_probability, probabilty_distribution

# initialize server
h2o.init()

# define response variable
response_column = 'RUL'

# load pre-processed data frames
training_frame = ProcessData.trainData(standard_deviation=True, moving_k_closest_average=True)
testing_frame = ProcessData.testData(standard_deviation=True, moving_k_closest_average=True)

# create h2o frames
train = h2o.H2OFrame(training_frame)
test = h2o.H2OFrame(testing_frame)
train.set_names(list(training_frame.columns))
test.set_names(list(testing_frame.columns))

# Feature selection
training_columns = list(training_frame.columns)
training_columns.remove(response_column)
training_columns.remove("UnitNumber")
training_columns.remove("Time")
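# --- Hedged continuation (assumption): the sibling scripts train a deep
# network at this point; a minimal sketch with RMSE scoring. Assumes the
# test frame carries an RUL column.
from h2o.estimators import H2ODeepLearningEstimator

model = H2ODeepLearningEstimator(hidden=[200, 200], epochs=100)
model.train(x=training_columns, y=response_column, training_frame=train)

predicted = model.predict(test_data=test)
predicted = np.array(list(map(float, predicted.get_frame_data().split("\n")[1:-1])))
actual = testing_frame[response_column].values
print("RMSE:", np.sqrt(mean_squared_error(actual, predicted)))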
import h2o
from h2o.estimators import H2ODeepLearningEstimator
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn_pandas import DataFrameMapper
from dataprocessor import ProcessData
import math

h2o.init()

_nmodels = 10
_lim = 2

# define response variable
response_column = 'RUL'

# Pandas frames
data_frame = ProcessData.trainData()
test_frame = ProcessData.testData()

# Create h2o frames
h2o_data = h2o.H2OFrame(data_frame)
h2o_data.set_names(list(data_frame.columns))
h2o_test = h2o.H2OFrame(test_frame)
h2o_test.set_names(list(test_frame.columns))

# split frame
data = h2o_data.split_frame(ratios=(0.9, 0.09))

# split data
train_data = data[0]
validate_data = data[1]
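# --- Hedged continuation (assumption): derive the feature list and train a
# network on the splits created above; layer sizes and epochs are
# illustrative.
training_columns = list(data_frame.columns)
training_columns.remove(response_column)
training_columns.remove('UnitNumber')
training_columns.remove('Time')

model = H2ODeepLearningEstimator(hidden=[200, 200], epochs=100)
model.train(x=training_columns, y=response_column, training_frame=train_data, validation_frame=validate_data)
print(model.model_performance(test_data=h2o_test))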