def setUp(self):
    '''
    Load the historical data fixture used by the tests in this class.

    Attributes set:
        config_map: configuration object returned by read_input.
        nan_data:   first 100 rows of the data exactly as loaded, taken
                    before any rows are dropped (presumably so tests can
                    exercise missing-value handling -- TODO confirm).
        input_data: loaded data with rows containing missing values removed
                    and the index renumbered from zero.
        data:       first 5 rows of the cleaned input_data.
    '''
    raw_frame, self.config_map = read_input(
        'Input/historical_data.csv', 'Input/data_config.json')

    # Snapshot the head of the raw frame before dropna() touches it.
    self.nan_data = raw_frame.head(100)

    # drop=True discards the old index instead of keeping it as a column.
    self.input_data = raw_frame.dropna().reset_index(drop=True)
    self.data = self.input_data.head(5)
def estimator_selection(
        estimators=None, data_path=None, config_path=None, Kfolds=10, n_jobs=-1):
    '''
    Cross Validator Best Estimator Analysis

    Cross-validates every candidate in ``estimators`` on the same
    pre-processed, min-max scaled data and returns the candidate with the
    lowest cross-validated RMSE.

    Args:
        estimators:  indexable collection of estimators accepted by
                     cross_val_score.
        data_path:   path handed to read_input for the data file.
        config_path: path handed to read_input for the config file.
        Kfolds:      forwarded to cross_val_score as ``cv``.
        n_jobs:      forwarded to cross_val_score as ``n_jobs``.

    Returns:
        The element of ``estimators`` with the smallest error; the first
        one wins on ties.

    Raises:
        ValueError: if estimators, data_path or config_path is None, or if
                    estimators is empty.
    '''
    if data_path is None or config_path is None or estimators is None:
        raise ValueError('Need Estimators, Data path and Config Path as arguments !')
    # Fail before any I/O instead of at min([]) after all the work is done.
    # len() rather than truthiness so array-like collections are accepted too.
    if len(estimators) == 0:
        raise ValueError('Need at least one Estimator to select from !')

    data, config_map = read_input(data_path, config_path)
    data = data_preprocessor(data, config_map, 5, 'string')

    # Accept either one column name or a list of names for the label.
    label = config_map['label']
    label_columns = [label] if isinstance(label, str) else list(label)
    scale_columns = config_map['scale_columns']

    # Copy the target so scaling it below never writes through to `data`,
    # then remove the label columns from the features *by name*.
    y = data[label_columns].copy()
    data = data.drop(columns=label_columns)

    data[scale_columns] = scaling_type(
        'min-max').fit_transform(data[scale_columns])
    y[label_columns] = scaling_type(
        'min-max').fit_transform(y[label_columns])

    rmse_errors = []
    for estimator in estimators:
        # The scorer returns negated MSE per fold; flip the sign back.
        fold_mse = -cross_val_score(
            estimator, data, y,
            scoring='neg_mean_squared_error', cv=Kfolds, n_jobs=n_jobs)
        # Mean over folds gives a real RMSE; since every estimator uses the
        # same number of folds the ranking equals that of a summed MSE.
        rmse_errors.append(np.sqrt(np.mean(fold_mse)))

    index_of_lowest_error = rmse_errors.index(min(rmse_errors))
    return estimators[index_of_lowest_error]
def setUp(self):
    '''
    Build a fitted linear-regression fixture from the test data set.

    Loads and pre-processes 'Input/test_data.csv', splits it into
    train / validation / test parts and trains a
    linear_model.LinearRegression estimator on the training part.

    Attributes set:
        data, config_map:           pre-processed data and its configuration.
        estimator:                  the LinearRegression instance given to
                                    training().
        X_train, y_train,
        X_validation, y_validation,
        X_test, y_test:             the six results of
                                    split_train_test_validation_data.
        X_scaler, y_scaler, model,
        training_rmse:              the four results of training().
    '''
    raw_frame, self.config_map = read_input(
        'Input/test_data.csv', 'Input/data_config.json')
    self.estimator = linear_model.LinearRegression()
    self.data = data_preprocessor(raw_frame, self.config_map, 5, 'string')

    # NOTE(review): 0.25 / 0.2 / 5 are presumably the split fractions and a
    # seed -- confirm against split_train_test_validation_data.
    partitions = split_train_test_validation_data(
        self.data, self.config_map, 0.25, 0.2, 5)
    (self.X_train, self.y_train,
     self.X_validation, self.y_validation,
     self.X_test, self.y_test) = partitions

    fitted = training(
        self.estimator, self.X_train, self.y_train, self.config_map)
    self.X_scaler, self.y_scaler, self.model, self.training_rmse = fitted
def run_fractional_stratification_model(
        estimator=None, data_path=None, config_path=None, num_iter=1, seed=None):
    '''
    Fractional Stratification Model Analysis

    Repeats a stratified train / validation / test split ``num_iter`` times,
    trains ``estimator`` on each training part and keeps only the runs whose
    training RMSE is strictly below their validation RMSE. Of the kept runs,
    the one with the lowest validation RMSE is reported and returned.

    Args:
        estimator:   estimator handed to training() on every iteration.
        data_path:   path handed to read_input for the data file.
        config_path: path handed to read_input for the config file.
        num_iter:    number of split-and-train iterations.
        seed:        forwarded to fractional_stratification.

    Returns:
        A dict describing the best run (estimator, config_map, the six data
        parts, both scalers, the model and the three RMSE values), or None
        when no iteration passed the training < validation filter.

    Raises:
        ValueError: if estimator, data_path or config_path is None.
    '''
    required = (estimator, data_path, config_path)
    if any(argument is None for argument in required):
        raise ValueError('Need Estimator, Data path and Config Path as arguments !')

    frame, config_map = read_input(data_path, config_path)
    frame = data_preprocessor(frame, config_map, 5, 'string')

    # Keyed by validation RMSE: runs with an identical score overwrite
    # each other, so the most recent such run is the one retained.
    candidates = {}
    for _ in range(num_iter):
        train_part, validation_part, test_part = fractional_stratification(
            frame, frame.columns, 4, [0.6, 0.2, 0.2], config_map, seed)

        X_train, y_train = split_data(train_part, config_map)
        X_validation, y_validation = split_data(validation_part, config_map)
        X_test, y_test = split_data(test_part, config_map)

        # NOTE(review): the same estimator object is reused on every pass;
        # if training() fits it in place, stored 'model' entries may alias
        # one another -- confirm against training().
        X_scaler, y_scaler, model, training_rmse = training(
            estimator, X_train, y_train, config_map)
        validation_rmse = calculate_rmse(
            X_validation, y_validation, X_scaler, y_scaler, model, config_map)
        testing_rmse = calculate_rmse(
            X_test, y_test, X_scaler, y_scaler, model, config_map)

        # Discard runs that do no better on training than on validation.
        if not (training_rmse < validation_rmse):
            continue

        candidates[validation_rmse] = {
            'estimator': estimator,
            'config_map': config_map,
            'X_train': X_train,
            'y_train': y_train,
            'X_validation': X_validation,
            'y_validation': y_validation,
            'X_test': X_test,
            'y_test': y_test,
            'X_scaler': X_scaler,
            'y_scaler': y_scaler,
            'model': model,
            'training_rmse': training_rmse,
            'validation_rmse': validation_rmse,
            'testing_rmse': testing_rmse,
        }

    if not candidates:
        return None

    winner = candidates[min(candidates)]
    reported = [
        round(winner[metric], 7)
        for metric in ('training_rmse', 'validation_rmse', 'testing_rmse')
    ]
    print('Best Model train error: {} | Best Model validation error: {} | Best Model test error: {}'.format(
        *reported))
    return winner
def run_train_test_model(
        estimator=None, data_path=None, config_path=None, num_iter=1, seed=None):
    '''
    Train/test Model Analysis

    Repeats a train / test split ``num_iter`` times, trains ``estimator`` on
    each training part and keeps only the runs whose training RMSE is
    strictly below their testing RMSE. Of the kept runs, the one with the
    lowest testing RMSE is reported and returned.

    Args:
        estimator:   estimator handed to training() on every iteration.
        data_path:   path handed to read_input for the data file.
        config_path: path handed to read_input for the config file.
        num_iter:    number of split-and-train iterations.
        seed:        forwarded to split_train_test_data.

    Returns:
        A dict describing the best run (estimator, config_map, the four data
        parts, both scalers, the model and the two RMSE values), or None
        when no iteration passed the training < testing filter.

    Raises:
        ValueError: if estimator, data_path or config_path is None.
    '''
    required = (estimator, data_path, config_path)
    if any(argument is None for argument in required):
        raise ValueError('Need Estimator, Data path and Config Path as arguments !')

    frame, config_map = read_input(data_path, config_path)
    frame = data_preprocessor(frame, config_map, 5, 'string')

    # Keyed by testing RMSE: runs with an identical score overwrite each
    # other, so the most recent such run is the one retained.
    candidates = {}
    for _ in range(num_iter):
        X_train, y_train, X_test, y_test = split_train_test_data(
            frame, config_map, 0.3, seed)

        # NOTE(review): the same estimator object is reused on every pass;
        # if training() fits it in place, stored 'model' entries may alias
        # one another -- confirm against training().
        X_scaler, y_scaler, model, training_rmse = training(
            estimator, X_train, y_train, config_map)
        testing_rmse = calculate_rmse(
            X_test, y_test, X_scaler, y_scaler, model, config_map)

        # Discard runs that do no better on training than on testing.
        if not (training_rmse < testing_rmse):
            continue

        candidates[testing_rmse] = {
            'estimator': estimator,
            'config_map': config_map,
            'X_train': X_train,
            'y_train': y_train,
            'X_test': X_test,
            'y_test': y_test,
            'X_scaler': X_scaler,
            'y_scaler': y_scaler,
            'model': model,
            'training_rmse': training_rmse,
            'testing_rmse': testing_rmse,
        }

    if not candidates:
        return None

    winner = candidates[min(candidates)]
    reported = [
        round(winner[metric], 7)
        for metric in ('training_rmse', 'testing_rmse')
    ]
    print('Best Model train error: {} | Best Model test error: {}'.format(
        *reported))
    return winner