import h2o
from sklearn.feature_selection import RFE
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn_pandas import DataFrameMapper

from dataprocessor import ProcessData, Filter

# Load the raw training and testing frames from the project data loader.
training_frame = ProcessData.trainData()
testing_frame = ProcessData.testData()

# Remove anomalies in the training frame based on percentile.
# Every column except the ones listed in rm_columns is handed to the filter.
rm_columns = ['UnitNumber', 'Time', 'Setting1', 'Setting2', 'Setting3', 'RUL']
all_columns = list(training_frame.columns)
filter_columns = [name for name in all_columns if name not in rm_columns]
training_frame = Filter.filterDataPercentile(
    panda_frame=training_frame,
    columns=filter_columns,
    lower_percentile=0.01,
    upper_percentile=0.99,
    column_err_threshold=2,  # NOTE(review): exact meaning is defined in Filter - confirm
)

# Drop the 'UnitNumber' and 'Time' columns, first from the training frame and
# then from the testing frame (same order as deleting them one by one).
for frame in (training_frame, testing_frame):
    for column in ('UnitNumber', 'Time'):
        del frame[column]

# Response column, and the remaining training-frame columns without it.
response_column = 'RUL'
training_columns = list(training_frame.columns)
training_columns.remove(response_column)
# Load training data frame
pData = ProcessData.trainData()

# Select columns: start from every column of the training frame and remove
# the ones that must not be passed to the percentile filter. list.remove is
# kept on purpose so that a missing column still raises ValueError.
selected_columns = list(pData.columns)
for excluded in ('UnitNumber', 'Time', 'RUL',
                 'Setting1', 'Setting2', 'Setting3'):
    selected_columns.remove(excluded)

# Filtered data frame
df = Filter.filterDataPercentile(panda_frame=pData,
                                 columns=selected_columns,
                                 lower_percentile=0.01,
                                 upper_percentile=0.99,
                                 column_err_threshold=1)

# Feature engineering: the same two options are enabled for the training
# and the testing frame.
data_frame = ProcessData.trainDataToFrame(df,
                                          moving_k_closest_average=True,
                                          standard_deviation=True)
testing_frame = ProcessData.testData(moving_k_closest_average=True,
                                     standard_deviation=True)

# H2OFrame construction uploads the data to an H2O cluster, so a connection
# has to exist before the first frame is created. h2o.init() attaches to a
# running local instance or starts one if none is found.
h2o.init()

# Create h2o frame
hData = h2o.H2OFrame(data_frame)
hData.set_names(list(data_frame.columns))

hTesting = h2o.H2OFrame(testing_frame)