def get_all_pattern(self, thres=0.1, mode='same', verbose=False):
    patterns = None
    infos = None
    for stimulus in STIMULI:
        D = self.get_dict(stimulus)
        for i in range(len(self.ids)):
            patient_id = self.ids[i]
            try:
                landmarks, timestamps = get_data(stimulus, patient_id)
            except FileNotFoundError:
                print('Patient %s, stimulus %d not found' % (patient_id, stimulus))
                continue
            features = features_extraction(landmarks, timestamps, stimulus)
            signal = np.copy(features[0])
            x, H = preprocess_signal(signal)
            if verbose:
                fig = plt.figure()
                ax = fig.add_subplot(111)
                ax.plot(signal)
                ax.set_title(patient_id + ', stimulus ' + str(stimulus))
            if mode in ['weight', 'weight receptive']:
                # Masked samples are ignored by the weighted transforms.
                x[H == 0] = np.nan
            coef = D.transform(x, mode=mode)
            sc_sps = D.scale_spaces(coef)
            # Collect the local maxima over all scale spaces.
            maxima = np.array([], dtype=bool)
            for k in range(len(sc_sps)):
                sc_sp = sc_sps[k]
                res = sc_sp_local_maxima(sc_sp, thres=thres)
                maxima = np.append(maxima, res.flatten())
            ind = np.arange(len(coef))[maxima]
            for j in ind:
                _, scale, pos = D.get_atom_info(j)
                pattern = D.get_pattern(signal, scale, pos) / coef[j]
                pattern = rescale(pattern, scale=self.scale)
                info = np.array([i, stimulus, scale, pos, coef[j]])
                if patterns is None and infos is None:
                    patterns = np.copy(pattern)
                    infos = np.copy(info)
                else:
                    patterns = np.vstack((patterns, pattern))
                    infos = np.vstack((infos, info))
    return patterns, infos
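# Minimal, self-contained sketch (not part of the original code) of the
# stacking idiom used above: the first pattern initialises the array and every
# later pattern is appended row-wise with np.vstack.
import numpy as np

patterns = None
for pattern in (np.arange(5.0), np.ones(5), np.linspace(0.0, 1.0, 5)):
    if patterns is None:
        patterns = np.copy(pattern)
    else:
        patterns = np.vstack((patterns, pattern))
print(patterns.shape)  # (3, 5)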
def run_ml(save_folder=SAVE_FOLDER, domains=DOMAINS, n_jobs=N_JOBS,
           use_summary=USE_SUMMARY, type_of_analysis='standard',
           combination_length=(1, 5), target_thresh=0.6, n_perm=N_PERM,
           target_metric='AUC', seed=None, n_jobs_rf=N_JOBS_RF,
           cat_encoding=None):
    if not osp.exists(save_folder):
        os.makedirs(save_folder)
    if seed is None:
        seed = int(time())

    # Because of the extra analyses we always request the more descriptive
    # 'pureanxiety' column as well.
    target_col = ['persistance_anxiety', 'pureanxiety']
    print(type_of_analysis)

    for i_comb, comb_len in enumerate(combination_length):
        for i_dom, dom in enumerate(combinations(domains, comb_len)):
            dom = list(dom)
            random_state = np.random.RandomState(seed=seed)
            save_pattern = osp.join(save_folder, '_'.join(dom) + '_{}')
            print('Max domains: {} {}/{}; Domain combination: {}'.format(
                comb_len, i_dom + 1, int(binom(len(domains), comb_len)), dom))

            df, df_dtype, y = get_data(
                modality_name=dom,
                load_df=NESDA_FILE_MISSING,
                load_df_dtypes=NESDA_FILE_MISSING_DTYPE,
                load_df_summary=NESDA_FILE_MISSING_SUMMARY,
                load_df_dtypes_summary=NESDA_FILE_MISSING_SUMMARY_DTYPE,
                load_df_labels=NESDA_FILE_LABELS,
                use_summary=use_summary,
                target_col=target_col)
            print('Shape Data: {}'.format(df.shape))

            cat_vars = df_dtype.variable_name[
                (df_dtype.data_type == 'Nominal')].values
            other_vars = df_dtype.variable_name[
                (df_dtype.data_type != 'Nominal')].values
            y, multiclass = create_labels(y, type_of_analysis)

            res = run_cross_validation(df_X=df, y=y, cat_vars=cat_vars,
                                       other_vars=other_vars, n_jobs=n_jobs,
                                       random_state=random_state,
                                       n_jobs_rf=n_jobs_rf,
                                       cat_encoding=cat_encoding,
                                       multiclass=multiclass)
            df_perf_train, df_perf_test, df_pred_test, df_feat_import = get_results(res)

            df_perf_test.to_csv(save_pattern.format('performance_test.csv'),
                                index=False)
            df_perf_train.to_csv(save_pattern.format('performance_train.csv'),
                                 index=False)
            df_feat_import.to_csv(save_pattern.format('var_importance.csv'),
                                  index=False)
            df_pred_test.to_csv(save_pattern.format('predictions.csv'),
                                index=False)

            print()
            print('Training-Set:')
            print(df_perf_train.mean())
            print()
            print('Test-Set:')
            print(df_perf_test.mean())
            print()

            mean_target_cv = df_perf_test[target_metric].mean()
            if (mean_target_cv >= target_thresh) and (n_perm > 1):
                print('{}: {} >= {}'.format(target_metric, mean_target_cv,
                                            target_thresh))
                print('Running Permutations... (n={})'.format(n_perm))
                print()
                feat_imp_columns = df_feat_import.columns[
                    df_feat_import.columns.str.startswith('cv_')]
                var_names = df_feat_import.var_name.values
                df_feat_import_all_perm, df_perf_test_all_perm = run_permutations(
                    df_X=df, y=y, cat_vars=cat_vars, other_vars=other_vars,
                    perf_columns=df_perf_test.columns, var_names=var_names,
                    feat_imp_columns=feat_imp_columns, n_jobs=n_jobs,
                    n_perm=n_perm, random_state=random_state,
                    n_jobs_rf=n_jobs_rf, multiclass=multiclass)
                df_perf_test_all_perm.to_csv(
                    save_pattern.format('perf_permutations.csv'), index=False)
                df_feat_import_all_perm.to_csv(
                    save_pattern.format('var_imprt_permutations.csv'), index=False)

    np.savez(osp.join(save_folder, 'seed.npz'), np.array([seed]))
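# Hypothetical invocation of run_ml (a sketch only; the module-level constants
# SAVE_FOLDER, DOMAINS, N_JOBS, N_PERM and N_JOBS_RF referenced by the default
# arguments above are assumed to be defined elsewhere in the project).
if __name__ == '__main__':
    run_ml(type_of_analysis='standard',
           combination_length=(1, 2),
           target_thresh=0.6,
           target_metric='AUC',
           seed=42)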
from tkinter import *
import data_handling
from subprocess import call
import tkinter.messagebox as tkmsgbox

# Getting the data and the current user account id
data = data_handling.get_data()
user_id = data_handling.get_userID()


def button_action():
    # Handle the case where the user entered values that are not integers
    try:
        pass1 = int(new_pass1.get())
        pass2 = int(new_pass2.get())
    except ValueError:
        tkmsgbox.showwarning("Warning", "Password is numbers only")
        new_pass1.delete(0, "end")
        new_pass2.delete(0, "end")
        return
    # First ensure that the password is 4 digits
    if 1000 <= pass1 <= 9999:
        # Check that the password was entered the same twice
        if pass1 == pass2:
            # Only change the password if the new one differs from the old one
            if pass1 != int(data[user_id]['pass']):
                data[user_id]['pass'] = str(pass1)
                data_handling.save_data(data)
                tkmsgbox.showinfo("Info", "Password changed successfully")
                window.quit()
                window.withdraw()
                call(["python", "option_window.py"])
def main():
    # Get and load data
    get_data()
    housing = load_data()
    # display_data(housing)

    # Perform stratified sampling and split by strata
    strat_train_set, strat_test_set = do_stratified_sampling(housing)

    # Using the training set, play with the data
    # play_with_data(strat_train_set.copy())

    # Split data into predictors and labels
    housing = strat_train_set.drop("median_house_value", axis=1)
    housing_labels = strat_train_set["median_house_value"].copy()

    # Use an imputer to fill in missing values
    # We will fill in these values with the median
    imputer = SimpleImputer(strategy="median")

    # Get dataframe of only numerical vals
    housing_num = housing.drop("ocean_proximity", axis=1)

    # Let the imputer estimate based on the numerical housing vals
    imputer.fit(housing_num)
    # NOTE: The median of each attribute is stored in imputer.statistics_

    # Use the trained imputer to fill in gaps by transforming the data
    X = imputer.transform(housing_num)

    # Insert np array into a pandas DataFrame
    housing_tr = pd.DataFrame(X, columns=housing_num.columns,
                              index=housing_num.index)

    # Convert categorical attribute to numerical attribute
    housing_cat = housing[["ocean_proximity"]]

    # Use one-hot encoding instead of ordinal encoding
    # as the categories are not ordered.
    cat_encoder = OneHotEncoder()
    # NOTE: This gives a scipy sparse matrix which stores the locations
    # of the "hot" encodings (instead of potentially storing
    # many, many "cold" encodings (0's))
    # NOTE: Categories are stored in cat_encoder.categories_
    housing_cat_1hot = cat_encoder.fit_transform(housing_cat)

    # Adding combinational attributes
    attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
    housing_extra_attribs = attr_adder.transform(housing.values)

    # Pipeline for transformations on numerical values
    num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('attribs_adder', CombinedAttributesAdder()),
        ('std_scaler', StandardScaler()),
    ])
    housing_num_tr = num_pipeline.fit_transform(housing_num)

    # It is also possible to perform all of the above transformations
    # in one go
    num_attribs = list(housing_num)
    cat_attribs = ["ocean_proximity"]
    full_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attribs),
        ("cat", OneHotEncoder(), cat_attribs),
    ])

    # This is the final set of training data
    housing_prepared = full_pipeline.fit_transform(housing)

    # Fit the linear regression model on the prepared data
    lin_reg = LinearRegression()
    lin_reg.fit(housing_prepared, housing_labels)

    # Do some testing
    some_data = housing.iloc[:5]
    some_labels = housing_labels.iloc[:5]
    some_data_prepared = full_pipeline.transform(some_data)
    print("Predictions:", lin_reg.predict(some_data_prepared))
    print("Labels:", list(some_labels))

    # Get metrics
    housing_predictions = lin_reg.predict(housing_prepared)
    lin_mse = mean_squared_error(housing_labels, housing_predictions)
    lin_rmse = np.sqrt(lin_mse)
    print(lin_rmse)

    # The above results are unsatisfactory,
    # so try a decision tree regressor
    tree_reg = DecisionTreeRegressor()
    tree_reg.fit(housing_prepared, housing_labels)

    # Now do some testing on the tree regression model
    housing_predictions = tree_reg.predict(housing_prepared)
    tree_mse = mean_squared_error(housing_labels, housing_predictions)
    tree_rmse = np.sqrt(tree_mse)
    print(tree_rmse)
    # The above reports an error of 0.0 on the training set,
    # which strongly suggests overfitting.

    # Cross validation is performed on 10 folds (training and validating
    # 10 times, choosing a different fold for validation each time
    # and training on the remaining folds)
    scores = cross_val_score(tree_reg, housing_prepared, housing_labels,
                             scoring="neg_mean_squared_error", cv=10)
    # As cross validation expects a utility function (greater is better)
    # rather than a cost function, the scores come back negated,
    # so we must flip their sign before taking the square root.
    tree_rmse_scores = np.sqrt(-scores)

    # Double check against cross validation on the linear reg. model
    lin_scores = cross_val_score(lin_reg, housing_prepared, housing_labels,
                                 scoring="neg_mean_squared_error", cv=10)
    lin_rmse_scores = np.sqrt(-lin_scores)
    print("TREE RMSE SCORES")
    display_scores(tree_rmse_scores)
    print("LINEAR REG RMSE SCORES")
    display_scores(lin_rmse_scores)

    # This shows that the Decision Tree is overfitting,
    # therefore we try the Random Forest Regressor
    forest_reg = RandomForestRegressor()
    forest_reg.fit(housing_prepared, housing_labels)
    forest_scores = cross_val_score(forest_reg, housing_prepared,
                                    housing_labels,
                                    scoring="neg_mean_squared_error", cv=10)
    forest_rmse_scores = np.sqrt(-forest_scores)
    print("RANDOM FOREST REG RMSE SCORES")
    display_scores(forest_rmse_scores)

    # Fine-tuning by automatically searching for hyperparams.
    # The grid indicates to try first all permutations of the first dict,
    # followed by all permutations of the options in the second dict.
    param_grid = [
        {"n_estimators": [3, 10, 30], "max_features": [2, 4, 6, 8]},
        {"bootstrap": [False], "n_estimators": [3, 10],
         "max_features": [2, 3, 4]},
    ]
    forest_reg = RandomForestRegressor()
    # We use five-fold cross validation
    grid_search = GridSearchCV(forest_reg, param_grid, cv=5,
                               scoring="neg_mean_squared_error",
                               return_train_score=True)
    grid_search.fit(housing_prepared, housing_labels)

    # The best parameters are found using:
    print(f"Best hyperparams: {grid_search.best_params_}")
    # The best estimator:
    print(f"Best Estimator: {grid_search.best_estimator_}")
    # The evaluation scores:
    cvres = grid_search.cv_results_
    for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
        print(np.sqrt(-mean_score), params)

    # Examine the relative importance of each attribute for accurate predictions
    feature_importances = grid_search.best_estimator_.feature_importances_
    # Display the importance scores next to their attribute names
    extra_attribs = ["rooms_per_hhold", "pop_per_hhold", "bedrooms_per_room"]
    cat_encoder = full_pipeline.named_transformers_["cat"]
    cat_one_hot_attribs = list(cat_encoder.categories_[0])
    attributes = num_attribs + extra_attribs + cat_one_hot_attribs
    print(sorted(zip(feature_importances, attributes), reverse=True))
    # NOTE: The above may indicate which features can be dropped

    # Evaluation on the test set
    # Select the best estimator found by the grid search as the final model
    final_model = grid_search.best_estimator_

    # Separate test set into predictors and labels
    X_test = strat_test_set.drop("median_house_value", axis=1)
    y_test = strat_test_set["median_house_value"].copy()

    # NOTE: Only transform test data, DO NOT FIT the pipeline on test data
    X_test_prepared = full_pipeline.transform(X_test)
    final_predictions = final_model.predict(X_test_prepared)
    final_mse = mean_squared_error(y_test, final_predictions)
    final_rmse = np.sqrt(final_mse)

    # Compute a 95% confidence interval for the test RMSE
    confidence = 0.95
    squared_errors = (final_predictions - y_test) ** 2
    ci = np.sqrt(stats.t.interval(confidence, len(squared_errors) - 1,
                                  loc=squared_errors.mean(),
                                  scale=stats.sem(squared_errors)))
    print(final_rmse, ci)

    # The following is inserted into our SelectImportantFeatures'
    # fit method, however we add it here for testing later.
    top_k_feature_indices = top_importances(feature_importances, 5)

    # New pipeline, now reducing the data's features to be
    # restricted to the top 5 most important features
    prep_and_feature_pipeline = Pipeline([
        ("prep", full_pipeline),
        ("feature", SelectImportantFeatures(feature_importances, 5))
    ])
    trimmed_housing = prep_and_feature_pipeline.fit_transform(housing)
    # NOTE: If we were to do trimmed_housing[0:3] and
    # housing_prepared[0:3, top_k_feature_indices],
    # the output would be the same.
    print(trimmed_housing[0:3])
    print(housing_prepared[0:3, top_k_feature_indices])
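# The helpers top_importances and SelectImportantFeatures are used above but
# defined elsewhere in the project. The following is a minimal sketch of one
# plausible implementation (an assumption, not necessarily the original code).
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin


def top_importances(feature_importances, k):
    # Indices of the k largest importance scores, returned in ascending order
    return np.sort(np.argpartition(np.array(feature_importances), -k)[-k:])


class SelectImportantFeatures(BaseEstimator, TransformerMixin):
    def __init__(self, feature_importances, k):
        self.feature_importances = feature_importances
        self.k = k

    def fit(self, X, y=None):
        self.feature_indices_ = top_importances(self.feature_importances, self.k)
        return self

    def transform(self, X):
        # Keep only the columns of the k most important features
        return X[:, self.feature_indices_]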
def main():
    # Get and load data
    get_data()
    housing = load_data()
    # display_data(housing)

    # Perform stratified sampling and split by strata
    strat_train_set, strat_test_set = do_stratified_sampling(housing)

    # Using the training set, play with the data
    # play_with_data(strat_train_set.copy())

    # Split data into predictors and labels
    housing = strat_train_set.drop("median_house_value", axis=1)
    housing_labels = strat_train_set["median_house_value"].copy()

    # Use an imputer to fill in missing values
    # We will fill in these values with the median
    imputer = SimpleImputer(strategy="median")

    # Get dataframe of only numerical vals
    housing_num = housing.drop("ocean_proximity", axis=1)

    # Let the imputer estimate based on the numerical housing vals
    imputer.fit(housing_num)
    # NOTE: The median of each attribute is stored in imputer.statistics_

    # Use the trained imputer to fill in gaps by transforming the data
    X = imputer.transform(housing_num)

    # Insert np array into a pandas DataFrame
    housing_tr = pd.DataFrame(X, columns=housing_num.columns,
                              index=housing_num.index)

    # Convert categorical attribute to numerical attribute
    housing_cat = housing[["ocean_proximity"]]

    # Use one-hot encoding instead of ordinal encoding
    # as the categories are not ordered.
    cat_encoder = OneHotEncoder()
    # NOTE: This gives a scipy sparse matrix which stores the locations
    # of the "hot" encodings (instead of potentially storing
    # many, many "cold" encodings (0's))
    # NOTE: Categories are stored in cat_encoder.categories_
    housing_cat_1hot = cat_encoder.fit_transform(housing_cat)

    # Adding combinational attributes
    attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
    housing_extra_attribs = attr_adder.transform(housing.values)

    # Pipeline for transformations on numerical values
    num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('attribs_adder', CombinedAttributesAdder()),
        ('std_scaler', StandardScaler()),
    ])
    housing_num_tr = num_pipeline.fit_transform(housing_num)

    # It is also possible to perform all of the above transformations
    # in one go
    num_attribs = list(housing_num)
    cat_attribs = ["ocean_proximity"]
    full_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attribs),
        ("cat", OneHotEncoder(), cat_attribs),
    ])

    # This is the final set of training data
    housing_prepared = full_pipeline.fit_transform(housing)
    print("Finished preparing data")

    svr_reg = SVR()

    # # Try a support vector machine regressor with a grid search
    # param_grid = [
    #     {'kernel': ['linear'],
    #      'C': [10., 30., 100., 300., 1000., 3000., 10000., 30000.0]},
    #     {'kernel': ['rbf'], 'C': [1.0, 3.0, 10., 30., 100., 300., 1000.0],
    #      'gamma': [0.01, 0.03, 0.1, 0.3, 1.0, 3.0]},
    # ]
    #
    # grid_search = GridSearchCV(svr_reg, param_grid, cv=5,
    #                            scoring="neg_mean_squared_error",
    #                            return_train_score=True)
    # grid_search.fit(housing_prepared, housing_labels)
    #
    # # Best SVR score
    # best_svr_score = np.sqrt(-grid_search.best_score_)
    # print(f"Best SVR Estimator Score: {best_svr_score}")

    # Using a randomized search instead of a grid search
    param_distribs = {
        'kernel': ['linear', 'rbf'],
        'C': reciprocal(20, 200000),
        'gamma': expon(scale=1.0),
    }
    rnd_search = RandomizedSearchCV(svr_reg, param_distribs, n_iter=50, cv=5,
                                    scoring="neg_mean_squared_error",
                                    verbose=2, random_state=42)
    rnd_search.fit(housing_prepared, housing_labels)
    best_svr_score = np.sqrt(-rnd_search.best_score_)
    print(f"Best SVR Estimator Score: {best_svr_score}")
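# The randomized search above draws C from a reciprocal (log-uniform) prior and
# gamma from an exponential prior. A small self-contained sketch of what these
# distributions produce (illustrative only, not part of the original script):
from scipy.stats import expon, reciprocal

c_samples = reciprocal(20, 200000).rvs(5, random_state=42)
gamma_samples = expon(scale=1.0).rvs(5, random_state=42)
print("C samples:", c_samples)          # roughly uniform in log space over [20, 200000]
print("gamma samples:", gamma_samples)  # mostly small values with an occasional large one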
def run_perm_analysis(save_folder, domains='all', n_jobs=10, use_summary=False,
                      type_of_analysis='any_anxiety', n_perm=1000, seed=None,
                      n_jobs_rf=2, cat_encoding=None):
    if seed is None:
        seed = int(time())

    target_col = ['persistance_anxiety', 'pureanxiety']
    df, df_dtype, y = get_data(
        modality_name=domains,
        load_df=NESDA_FILE_MISSING,
        load_df_dtypes=NESDA_FILE_MISSING_DTYPE,
        load_df_summary=NESDA_FILE_MISSING_SUMMARY,
        load_df_dtypes_summary=NESDA_FILE_MISSING_SUMMARY_DTYPE,
        load_df_labels=NESDA_FILE_LABELS,
        use_summary=use_summary,
        target_col=target_col)
    y, multiclass = create_labels(y, type_of_analysis)
    df, cat_vars = impute_data(df, df_dtype)
    X, var_names = categorical_encoding(df, y, cat_vars,
                                        np.arange(df.shape[0]),
                                        method=cat_encoding)
    n_subj, n_features = X.shape
    estimator = get_classifier(n_subj, random_state=seed,
                               n_jobs_rf=n_jobs_rf, multiclass=multiclass)
    estimator.fit(X, y)
    feat_imp_true = estimator.feature_importances_

    perm_col = ['perm_{}'.format(i_perm + 1) for i_perm in range(n_perm)]
    df_feat_imp = pd.DataFrame(index=var_names,
                               columns=['true_feature_importances'] + perm_col)
    df_feat_imp['true_feature_importances'] = feat_imp_true

    for i_feature in range(X.shape[1]):
        print('{}/{}; Feature: {}'.format(i_feature + 1, X.shape[1],
                                          var_names[i_feature]))
        X_perm = X.copy()
        res = Parallel(n_jobs=n_jobs, verbose=1, pre_dispatch='2*n_jobs',
                       max_nbytes='50M')(
            delayed(permute_feature)(clone(estimator), X_perm, y, i_feature)
            for _ in range(n_perm))
        df_feat_imp.loc[var_names[i_feature], perm_col] = res

    df_feat_imp.to_csv(
        osp.join(save_folder,
                 'permuted_variable_importances_domains_{}.csv'.format(domains)))
    np.save(
        osp.join(save_folder,
                 'permuted_variable_importances_domains_{}_seed.npy'.format(domains)),
        np.array([seed]))
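# permute_feature is imported from elsewhere in the project. A minimal,
# self-contained sketch of one plausible implementation (an assumption, not
# necessarily the original helper): shuffle a single column, refit, and return
# the resulting importance of that feature.
import numpy as np


def permute_feature(estimator, X, y, i_feature, random_state=None):
    rng = np.random.RandomState(random_state)
    X_perm = np.array(X, copy=True)
    # Break the association between this feature and the target
    X_perm[:, i_feature] = rng.permutation(X_perm[:, i_feature])
    estimator.fit(X_perm, y)
    return estimator.feature_importances_[i_feature]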
# -*- coding: utf-8 -*-
"""
Created on Mon Feb 5 23:54:51 2018

@author: Eric Wang, Duoxiao Chang, Yipeng Zhu
"""
import matplotlib.pyplot as plt
# import analysis as ANA
from option_pricing import hist_vol
from portfolios import Port1, Port2, Port3, Port4
from data_handling import match_data, get_data

get_data()
data = match_data()
hist_vol(data, 30)
Port1(data)
Port2(data)
Port3(data)
Port4(data)

data = data.loc[121:, :]
data.to_csv('../Results/result.csv')

plt.hist(data[["P1_daily_return", "P2_daily_return",
               "P3_daily_return", "P4_daily_return"]],
         label=['P1', 'P2', 'P3', 'P4'])
plt.legend(loc='upper left')
plt.savefig('../Results/hist.png')

data.plot(y=['P1_value', 'P2_value', 'P3_value', 'P4_value'])