def train(output, models=('linear', 'tree', 'forest', 'svr', 'cat')):
    """Train candidate regressors on primary-model predictions, pick the
    best one, refit it on the full data set, then save and upload it.

    Parameters
    ----------
    output : str
        Name of the target/output being modelled; also used as the
        artifact name suffix (``output + '_transfer'``).
    models : sequence of str, optional
        Regressor identifiers forwarded to ``tra.build``. A tuple is used
        as the default to avoid the mutable-default-argument pitfall
        (the original used a list).
    """
    data = get_predictions(output)
    print('Primary predictions loaded.')
    [X, y, X_train, y_train, X_test, y_test, X_scaled, y_scaled,
     X_train_scaled, y_train_scaled, X_test_scaled, y_scaler] = \
        pre.split_pipeline(data, output)
    print('Data preprocessed.')
    # NOTE(review): the last two "categorical" arguments to tra.build are
    # the plain encoded train sets (X_train, y_train) here, whereas
    # calculate() builds a separate unencoded split for them — confirm
    # this duplication is intended.
    regressors = tra.build(X_train, y_train, X_train_scaled, y_train_scaled,
                           X_train, y_train, models)
    best_regressor = tra.evaluate(
        regressors, X_train, y_train, X_train_scaled, y_train_scaled,
        X_test, y_test, X_test_scaled, y_scaler,
        X_train, y_train, X_test, y_test)
    print('Regressors evaluated. Best regressor is:\n' + str(best_regressor))
    # SVR models are built/evaluated on scaled data, so the final refit on
    # the full data set must also use the scaled variants.
    if 'SVR' in str(best_regressor):
        best_regressor.fit(X_scaled, y_scaled)
    else:
        best_regressor.fit(X, y)
    print('Regressor fit.')
    tra.print_results(best_regressor, X, X_scaled, y, y_scaler, X)
    tra.save(best_regressor, X, output + '_transfer')
    print('Regressor saved.')
    tra.upload(output + '_transfer')
    print('Regressor uploaded.')
def calculate(output, model):
    """Determine drop-column feature importance for the specified model.

    A benchmark clone is trained on all features; then, for every column,
    a fresh clone is trained with that column removed. The importance of a
    column is the drop in training-set accuracy caused by removing it
    (``benchmark_score - drop_col_score``). Results are printed in
    descending order of importance.

    Parameters
    ----------
    output : str
        Name of the target/output being modelled; forwarded to the
        trimming/preprocessing pipeline.
    model : str
        Single regressor identifier forwarded to ``tra.build``.
    """
    data = pd.read_csv('campaigns.csv')
    print('Data loaded.')
    data = tra.trim(data, output)
    print('Data trimmed.')
    # Add a reproducible random column as a noise baseline: any feature
    # whose importance falls below it carries no real signal.
    np.random.seed(seed=0)
    data['random'] = np.random.random(size=len(data))
    data, data_cat = pre.data_pipeline(data, output)
    [_, _, X_train, y_train, _, _, _, _, X_train_scaled, y_train_scaled,
     _, y_scaler] = pre.split_pipeline(data, output, encoded=True)
    [_, _, X_train_cat, y_train_cat, _, _] = \
        pre.split_pipeline(data_cat, output, encoded=False)
    print('Data preprocessed.')
    regressor = tra.build(X_train, y_train, X_train_scaled, y_train_scaled,
                          X_train_cat, y_train_cat, [model])[0]
    uses_scaled_data = 'SVR' in str(regressor)

    def _score(dropped_columns):
        """Fit a fresh clone without *dropped_columns* and return its
        mean relative accuracy against the unscaled training target."""
        model_clone = clone(regressor)
        model_clone.random_state = 0  # fixed seed for comparability
        if uses_scaled_data:
            features = X_train_scaled.drop(columns=dropped_columns)
            model_clone.fit(features, y_train_scaled)
            # BUGFIX: predictions must be mapped back to the original
            # target scale before scoring against y_train. Previously the
            # drop-column branch compared scaled predictions to
            # y_train_scaled while the benchmark used the raw scale,
            # so the two scores being subtracted were incomparable.
            predictions = \
                y_scaler.inverse_transform(model_clone.predict(features))
        else:
            features = X_train.drop(columns=dropped_columns)
            model_clone.fit(features, y_train)
            predictions = model_clone.predict(features)
        return hel.mean_relative_accuracy(predictions, y_train)

    # Benchmark: clone trained on all features (nothing dropped).
    benchmark_score = _score([])
    # Importance of a column = accuracy lost when it is removed.
    # (The original also kept an unused loop counter `i`; removed.)
    importances = [benchmark_score - _score([column])
                   for column in X_train.columns]
    importances_df = \
        pd.DataFrame({'column': X_train.columns, 'value': importances}) \
        .sort_values('value', ascending=False).reset_index(drop=True)
    print('Importances:')
    for row in importances_df.itertuples(index=False):
        print(str(row.column) + ': ' + str(row.value))