def get_all_pattern(self, thres=0.1, mode='same', verbose=False):
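        """Collect candidate patterns for every stimulus/patient pair.

        For each recording, the feature signal is encoded with the dictionary
        returned by ``get_dict``, local maxima are located in the dictionary's
        scale spaces, and the matching atoms are rescaled and stacked.
        Returns ``(patterns, infos)``, where each row of ``infos`` holds
        (patient index, stimulus, scale, position, coefficient); both are
        ``None`` if no data could be loaded.
        """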
        patterns = None
        infos = None
        for stimulus in STIMULI:
            D = self.get_dict(stimulus)
            for i in range(len(self.ids)):
                patient_id = self.ids[i]
                try:
                    landmarks, timestamps = get_data(stimulus, patient_id)
                except FileNotFoundError:
                    print('Patient %s, stimulus %d not found' %
                          (patient_id, stimulus))
                    continue

                features = features_extraction(landmarks, timestamps, stimulus)
                signal = np.copy(features[0])
                x, H = preprocess_signal(signal)

                if verbose:
                    fig = plt.figure()
                    ax = fig.add_subplot(111)
                    ax.plot(signal)
                    ax.set_title(patient_id + ', stimulus ' + str(stimulus))

                if mode in ['weight', 'weight receptive']:
                    x[H == 0] = np.nan
                coef = D.transform(x, mode=mode)
                sc_sps = D.scale_spaces(coef)
                maxima = np.array([], dtype=bool)
                for k in range(len(sc_sps)):
                    sc_sp = sc_sps[k]
                    res = sc_sp_local_maxima(sc_sp, thres=thres)
                    maxima = np.append(maxima, res.flatten())
                ind = np.arange(len(coef))[maxima]
                for j in ind:
                    _, scale, pos = D.get_atom_info(j)
                    pattern = D.get_pattern(signal, scale, pos) / coef[j]
                    pattern = rescale(pattern, scale=self.scale)
                    info = np.array([i, stimulus, scale, pos, coef[j]])
                    if patterns is None and infos is None:
                        patterns = np.copy(pattern)
                        infos = np.copy(info)
                    else:
                        patterns = np.vstack((patterns, pattern))
                        infos = np.vstack((infos, info))
        return patterns, infos
Example #2
def run_ml(save_folder=SAVE_FOLDER,
           domains=DOMAINS,
           n_jobs=N_JOBS,
           use_summary=USE_SUMMARY,
           type_of_analysis='standard',
           combination_length=(1, 5),
           target_thresh=0.6,
           n_perm=N_PERM,
           target_metric='AUC',
           seed=None,
           n_jobs_rf=N_JOBS_RF,
           cat_encoding=None):
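    """Run cross-validated classification for every combination of domains.

    For each domain combination the data are loaded, labels are created and a
    cross-validation run is performed; train/test performance, test-set
    predictions and variable importances are written to CSV files. When the
    mean test ``target_metric`` reaches ``target_thresh``, permutation tests
    are run as well. The random seed is stored alongside the results.
    """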

    if not osp.exists(save_folder):
        os.makedirs(save_folder)

    if seed is None:
        seed = int(time())

    # Because of all the extra analyses, we always request the more descriptive 'pureanxiety' column as well
    target_col = ['persistance_anxiety', 'pureanxiety']
    print(type_of_analysis)

    for i_comb, comb_len in enumerate(combination_length):
        for i_dom, dom in enumerate(combinations(domains, comb_len)):
            dom = list(dom)
            random_state = np.random.RandomState(seed=seed)
            save_pattern = osp.join(save_folder, '_'.join(dom) + '_{}')
            print('Max domains: {} {}/{}; Domain combination: {}'.format(
                comb_len, i_dom + 1, int(binom(len(domains), comb_len)), dom))
            df, df_dtype, y = get_data(
                modality_name=dom,
                load_df=NESDA_FILE_MISSING,
                load_df_dtypes=NESDA_FILE_MISSING_DTYPE,
                load_df_summary=NESDA_FILE_MISSING_SUMMARY,
                load_df_dtypes_summary=NESDA_FILE_MISSING_SUMMARY_DTYPE,
                load_df_labels=NESDA_FILE_LABELS,
                use_summary=use_summary,
                target_col=target_col)
            print('Shape Data: {}'.format(df.shape))
            cat_vars = df_dtype.variable_name[(
                df_dtype.data_type == 'Nominal')].values
            other_vars = df_dtype.variable_name[(df_dtype.data_type !=
                                                 'Nominal')].values

            y, multiclass = create_labels(y, type_of_analysis)

            res = run_cross_validation(df_X=df,
                                       y=y,
                                       cat_vars=cat_vars,
                                       other_vars=other_vars,
                                       n_jobs=n_jobs,
                                       random_state=random_state,
                                       n_jobs_rf=n_jobs_rf,
                                       cat_encoding=cat_encoding,
                                       multiclass=multiclass)
            df_perf_train, df_perf_test, df_pred_test, df_feat_import = get_results(
                res)

            df_perf_test.to_csv(save_pattern.format('performance_test.csv'),
                                index=False)
            df_perf_train.to_csv(save_pattern.format('performance_train.csv'),
                                 index=False)
            df_feat_import.to_csv(save_pattern.format('var_importance.csv'),
                                  index=False)
            df_pred_test.to_csv(save_pattern.format('predictions.csv'),
                                index=False)

            print()
            print("Training-Set:")
            print(df_perf_train.mean())
            print()
            print('Test-Set:')
            print(df_perf_test.mean())
            print()

            mean_target_cv = df_perf_test[target_metric].mean()

            if (mean_target_cv >= target_thresh) and (n_perm > 1):
                print('{}: {} >= {}'.format(target_metric, mean_target_cv,
                                            target_thresh))
                print('Running Permutations... (n={})'.format(n_perm))
                print()

                feat_imp_columns = df_feat_import.columns[
                    df_feat_import.columns.str.startswith('cv_')]
                var_names = df_feat_import.var_name.values
                df_feat_import_all_perm, df_perf_test_all_perm = run_permutations(
                    df_X=df,
                    y=y,
                    cat_vars=cat_vars,
                    other_vars=other_vars,
                    perf_columns=df_perf_test.columns,
                    var_names=var_names,
                    feat_imp_columns=feat_imp_columns,
                    n_jobs=n_jobs,
                    n_perm=n_perm,
                    random_state=random_state,
                    n_jobs_rf=n_jobs_rf,
                    multiclass=multiclass)

                df_perf_test_all_perm.to_csv(
                    save_pattern.format('perf_permutations.csv'), index=False)
                df_feat_import_all_perm.to_csv(
                    save_pattern.format('var_imprt_permutations.csv'),
                    index=False)

    np.savez(osp.join(save_folder, 'seed.npz'), np.array([seed]))
Example #3
from tkinter import * 
import data_handling
from subprocess import call 
import tkinter.messagebox as tkmsgbox

# Get the data and the current user's account id
data = data_handling.get_data()
user_id = data_handling.get_userID()

def button_action():
    # Handle the case where the user entered values that are not integers
    try:
        pass1 = int(new_pass1.get())
        pass2 = int(new_pass2.get())
    except ValueError:
        tkmsgbox.showwarning("Warning", "Password must contain numbers only")
        new_pass1.delete(0, "end")
        new_pass2.delete(0, "end")
        return

    # First, ensure that the password is 4 digits
    if 1000 <= pass1 <= 9999:
        # Check that the password was entered the same way twice
        if pass1 == pass2:
            # If the new password differs from the old one, change it
            if pass1 != int(data[user_id]['pass']):
                data[user_id]['pass'] = str(pass1)
                data_handling.save_data(data)
                tkmsgbox.showinfo("Info", "Password changed successfully")
                window.quit()
                window.withdraw()
                call(["python", "option_window.py"])
Example #4
def main():
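    """End-to-end housing regression walkthrough: stratified train/test
    split, median imputation, one-hot encoding and feature engineering via
    pipelines, comparison of linear, decision-tree and random-forest models
    with cross-validation, grid-search tuning, evaluation on the held-out
    test set, and a final feature-selection pipeline.
    """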
    # Get and load data
    get_data()
    housing = load_data()
    # display_data(housing)

    # Perform stratified sampling and split the data by strata
    strat_train_set, strat_test_set = do_stratified_sampling(housing)

    # Using the training set, play with the data
    # play_with_data(strat_train_set.copy())

    # Split data into predictors and labels
    housing = strat_train_set.drop("median_house_value", axis=1)
    housing_labels = strat_train_set["median_house_value"].copy()

    # Use an imputer to fill in missing values
    # We will fill in these values with the median
    imputer = SimpleImputer(strategy="median")
    # Get dataframe of only numerical vals
    housing_num = housing.drop("ocean_proximity", axis=1)

    # Let the imputer estimate based on the numerical housing vals
    imputer.fit(housing_num)
    # NOTE: The median of each attribute is stored in imputer.statistics_
    # Use trained imputer to fill in gaps by transforming the data
    X = imputer.transform(housing_num)
    # Insert np array into pandas DataFrame
    housing_tr = pd.DataFrame(X,
                              columns=housing_num.columns,
                              index=housing_num.index)

    # Convert categorical attribute to numerical attribute
    housing_cat = housing[["ocean_proximity"]]
    # Use one-hot encoding instead of ordinal encoding
    # as the categories are not ordered.
    cat_encoder = OneHotEncoder()

    # NOTE: This gives a SciPy sparse matrix which stores only the locations
    # of the "hot" entries (instead of potentially storing many
    # "cold" entries (0's))
    # NOTE: Categories are stored in cat_encoder.categories_
    housing_cat_1hot = cat_encoder.fit_transform(housing_cat)

    # Adding combinational attributes
    attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
    housing_extra_attribs = attr_adder.transform(housing.values)

    # Pipeline for transformations on numerical values
    num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('attribs_adder', CombinedAttributesAdder()),
        ('std_scaler', StandardScaler()),
    ])

    housing_num_tr = num_pipeline.fit_transform(housing_num)

    # It is also possible to perform all of the above transformations
    # in one go
    num_attribs = list(housing_num)
    cat_attribs = ["ocean_proximity"]

    full_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attribs),
        ("cat", OneHotEncoder(), cat_attribs),
    ])

    # This is the final set of training data
    housing_prepared = full_pipeline.fit_transform(housing)

    # Fit the linear regression model on prepared data
    lin_reg = LinearRegression()
    lin_reg.fit(housing_prepared, housing_labels)

    # Do some testing
    some_data = housing.iloc[:5]
    some_labels = housing_labels.iloc[:5]
    some_data_prepared = full_pipeline.transform(some_data)
    print("Predictions:", lin_reg.predict(some_data_prepared))
    print("Labels:", list(some_labels))

    # Get metrics
    housing_predictions = lin_reg.predict(housing_prepared)
    lin_mse = mean_squared_error(housing_labels, housing_predictions)
    lin_rmse = np.sqrt(lin_mse)
    print(lin_rmse)

    # Due to the above results being unsatisfactory
    # Try a decision tree regressor
    tree_reg = DecisionTreeRegressor()
    tree_reg.fit(housing_prepared, housing_labels)

    # Now do some testing on the tree regression model
    housing_predictions = tree_reg.predict(housing_prepared)
    tree_mse = mean_squared_error(housing_labels, housing_predictions)
    tree_rmse = np.sqrt(tree_mse)
    print(tree_rmse)

    # The above gives practically zero error, which hints at overfitting.
    # Cross validation is performed on 10 folds (training and validating
    # 10 times, choosing a different fold for validation each time
    # and training on the remaining folds)
    scores = cross_val_score(tree_reg,
                             housing_prepared,
                             housing_labels,
                             scoring="neg_mean_squared_error",
                             cv=10)
    # As cross validation expects a utility function (greater is better)
    # rather than a cost function, the scores are negated MSEs, so we must
    # flip their sign before taking the square root.
    tree_rmse_scores = np.sqrt(-scores)

    # Double check against cross validation on the linear reg. model
    lin_scores = cross_val_score(lin_reg,
                                 housing_prepared,
                                 housing_labels,
                                 scoring="neg_mean_squared_error",
                                 cv=10)
    lin_rmse_scores = np.sqrt(-lin_scores)

    print("TREE RSME SCORES")
    display_scores(tree_rmse_scores)

    print("LINEAR REG RMSE SCORES")
    display_scores(lin_rmse_scores)

    # This shows that the Decision Tree is overfitting
    # Therefore we try the Random Forest Regressor
    forest_reg = RandomForestRegressor()
    forest_reg.fit(housing_prepared, housing_labels)
    forest_scores = cross_val_score(forest_reg,
                                    housing_prepared,
                                    housing_labels,
                                    scoring="neg_mean_squared_error",
                                    cv=10)
    forest_rmse_scores = np.sqrt(-forest_scores)

    print("RANDOM FOREST REG RMSE SCORES")
    display_scores(forest_rmse_scores)

    # Fine-tuning by automatically searching for hyperparameters.
    # The grid tries every combination of the options in the first dict,
    # followed by every combination of the options in the second dict.
    param_grid = [
        {
            "n_estimators": [3, 10, 30],
            "max_features": [2, 4, 6, 8]
        },
        {
            "bootstrap": [False],
            "n_estimators": [3, 10],
            "max_features": [2, 3, 4]
        },
    ]

    forest_reg = RandomForestRegressor()
    # We use five-fold cross validation
    grid_search = GridSearchCV(forest_reg,
                               param_grid,
                               cv=5,
                               scoring="neg_mean_squared_error",
                               return_train_score=True)
    grid_search.fit(housing_prepared, housing_labels)

    # The best parameters are found using:
    print(f"Best hyperparams: {grid_search.best_params_}")
    # The best estimator:
    print(f"Best Estimator: {grid_search.best_estimator_}")
    # The evaluation scores:
    cvres = grid_search.cv_results_
    for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
        print(np.sqrt(-mean_score), params)

    # Examine the relative importance of each attribute for accurate predictions
    feature_importances = grid_search.best_estimator_.feature_importances_
    # Displaying the importance scores next to their attribute names
    extra_attribs = ["rooms_per_hhold", "pop_per_hhold", "bedrooms_per_room"]
    cat_encoder = full_pipeline.named_transformers_["cat"]
    cat_one_hot_attribs = list(cat_encoder.categories_[0])
    attributes = num_attribs + extra_attribs + cat_one_hot_attribs
    print(sorted(zip(feature_importances, attributes), reverse=True))
    # NOTE: The above may indicate which features may be dropped

    # Evaluation on test set
    # Select the best estimator found by the grid search as the final model
    final_model = grid_search.best_estimator_

    # Separate test set into predictors and labels
    X_test = strat_test_set.drop("median_house_value", axis=1)
    y_test = strat_test_set["median_house_value"].copy()

    # NOTE: Only transform test data, DO NOT FIT the model on test data
    X_test_prepared = full_pipeline.transform(X_test)

    final_predictions = final_model.predict(X_test_prepared)
    final_mse = mean_squared_error(y_test, final_predictions)
    final_rmse = np.sqrt(final_mse)

    # Compute a 95% confidence interval for the generalization RMSE
    confidence = 0.95
    squared_errors = (final_predictions - y_test)**2
    confidence_interval = np.sqrt(
        stats.t.interval(confidence,
                         len(squared_errors) - 1,
                         loc=squared_errors.mean(),
                         scale=stats.sem(squared_errors)))
    print(f"Final RMSE: {final_rmse}; 95% CI: {confidence_interval}")

    # The following is also done inside SelectImportantFeatures' fit method;
    # we compute it here as well so we can check the result later.
    top_k_feature_indices = top_importances(feature_importances, 5)

    # New pipeline, now reducing the data's features to be
    # restricted to the top 5 most important features
    prep_and_feature_pipeline = Pipeline([
        ("prep", full_pipeline),
        ("feature", SelectImportantFeatures(feature_importances, 5))
    ])

    trimmed_housing = prep_and_feature_pipeline.fit_transform(housing)
    # NOTE: If we were to do trimmed_housing[0:3] and
    # housing_prepared[0:3, top_k_feature_indices],
    # the output would be the same.
    print(trimmed_housing[0:3])
    print(housing_prepared[0:3, top_k_feature_indices])
Example #5
def main():
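    """Prepare the housing data with the same pipeline as the previous
    example, then tune an SVR model with RandomizedSearchCV and report the
    best cross-validated RMSE.
    """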
    # Get and load data
    get_data()
    housing = load_data()
    # display_data(housing)

    # Perform stratified sampling and split the data by strata
    strat_train_set, strat_test_set = do_stratified_sampling(housing)

    # Using the training set, play with the data
    # play_with_data(strat_train_set.copy())

    # Split data into predictors and labels
    housing = strat_train_set.drop("median_house_value", axis=1)
    housing_labels = strat_train_set["median_house_value"].copy()

    # Use an imputer to fill in missing values
    # We will fill in these values with the median
    imputer = SimpleImputer(strategy="median")
    # Get dataframe of only numerical vals
    housing_num = housing.drop("ocean_proximity", axis=1)

    # Let the imputer estimate based on the numerical housing vals
    imputer.fit(housing_num)
    # NOTE: The median of each attribute is stored in imputer.statistics_
    # Use trained imputer to fill in gaps by transforming the data
    X = imputer.transform(housing_num)
    # Insert np array into pandas DataFrame
    housing_tr = pd.DataFrame(X,
                              columns=housing_num.columns,
                              index=housing_num.index)

    # Convert categorical attribute to numerical attribute
    housing_cat = housing[["ocean_proximity"]]
    # Use one-hot encoding instead of ordinal encoding
    # as the categories are not ordered.
    cat_encoder = OneHotEncoder()

    # NOTE: This gives a SciPy sparse matrix which stores only the locations
    # of the "hot" entries (instead of potentially storing many
    # "cold" entries (0's))
    # NOTE: Categories are stored in cat_encoder.categories_
    housing_cat_1hot = cat_encoder.fit_transform(housing_cat)

    # Adding combinational attributes
    attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
    housing_extra_attribs = attr_adder.transform(housing.values)

    # Pipeline for transformations on numerical values
    num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('attribs_adder', CombinedAttributesAdder()),
        ('std_scaler', StandardScaler()),
    ])

    housing_num_tr = num_pipeline.fit_transform(housing_num)

    # It is also possible to perform all of the above transformations
    # in one go
    num_attribs = list(housing_num)
    cat_attribs = ["ocean_proximity"]

    full_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attribs),
        ("cat", OneHotEncoder(), cat_attribs),
    ])

    # This is the final set of training data
    housing_prepared = full_pipeline.fit_transform(housing)

    print("Finished preparing data")

    svr_reg = SVR()

    # # Try a support vector machine regressor
    # param_grid = [
    #         {'kernel': ['linear'], 'C': [10., 30., 100., 300., 1000., 3000., 10000., 30000.0]},
    #         {'kernel': ['rbf'], 'C': [1.0, 3.0, 10., 30., 100., 300., 1000.0],
    #          'gamma': [0.01, 0.03, 0.1, 0.3, 1.0, 3.0]},
    #     ]
    #
    # grid_search = GridSearchCV(svr_reg, param_grid, cv=5,
    #                             scoring="neg_mean_squared_error",
    #                             return_train_score=True)
    # grid_search.fit(housing_prepared, housing_labels)
    #
    # # Best svr score
    # best_svr_score = np.sqrt(-grid_search.best_score_)
    # print(f"Best SVR Estimator Score: {best_svr_score}")

    # Using a randomized search instead of a grid search
    param_distribs = {
        'kernel': ['linear', 'rbf'],
        'C': reciprocal(20, 200000),
        'gamma': expon(scale=1.0),
    }

    rnd_search = RandomizedSearchCV(svr_reg,
                                    param_distribs,
                                    n_iter=50,
                                    cv=5,
                                    scoring="neg_mean_squared_error",
                                    verbose=2,
                                    random_state=42)
    rnd_search.fit(housing_prepared, housing_labels)
    best_svr_score = np.sqrt(-rnd_search.best_score_)
    print(f"Best SVR Estimator Score: {best_svr_score}")
Example #6
def run_perm_analysis(save_folder,
                      domains='all',
                      n_jobs=10,
                      use_summary=False,
                      type_of_analysis='any_anxiety',
                      n_perm=1000,
                      seed=None,
                      n_jobs_rf=2,
                      cat_encoding=None):
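    """Fit the classifier once on the full data set, then permute each
    feature ``n_perm`` times in parallel and store the resulting variable
    importances next to the unpermuted ones for later comparison.
    """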

    if seed is None:
        seed = int(time())

    target_col = ['persistance_anxiety', 'pureanxiety']
    df, df_dtype, y = get_data(
        modality_name=domains,
        load_df=NESDA_FILE_MISSING,
        load_df_dtypes=NESDA_FILE_MISSING_DTYPE,
        load_df_summary=NESDA_FILE_MISSING_SUMMARY,
        load_df_dtypes_summary=NESDA_FILE_MISSING_SUMMARY_DTYPE,
        load_df_labels=NESDA_FILE_LABELS,
        use_summary=use_summary,
        target_col=target_col)

    y, multiclass = create_labels(y, type_of_analysis)

    df, cat_vars = impute_data(df, df_dtype)
    X, var_names = categorical_encoding(df,
                                        y,
                                        cat_vars,
                                        np.arange(df.shape[0]),
                                        method=cat_encoding)
    n_subj, n_features = X.shape
    estimator = get_classifier(n_subj,
                               random_state=seed,
                               n_jobs_rf=n_jobs_rf,
                               multiclass=multiclass)

    estimator.fit(X, y)
    feat_imp_true = estimator.feature_importances_
    perm_col = ['perm_{}'.format(i_perm + 1) for i_perm in range(n_perm)]

    df_feat_imp = pd.DataFrame(index=var_names,
                               columns=['true_feature_importances'] + perm_col)
    df_feat_imp['true_feature_importances'] = feat_imp_true

    for i_feature in range(X.shape[1]):
        print('{}/{}; Feature: {}'.format(i_feature + 1, X.shape[1],
                                          var_names[i_feature]))
        X_perm = X.copy()
        res = Parallel(n_jobs=n_jobs,
                       verbose=1,
                       pre_dispatch='2*n_jobs',
                       max_nbytes='50M')(delayed(permute_feature)(clone(
                           estimator), X_perm, y, i_feature)
                                         for _ in range(n_perm))
        df_feat_imp.loc[var_names[i_feature], perm_col] = res

    df_feat_imp.to_csv(
        osp.join(
            save_folder,
            'permuted_variable_importances_domains_{}.csv'.format(domains)))
    np.save(
        osp.join(
            save_folder,
            'permuted_variable_importances_domains_{}_seed.npy'.format(
                domains)), np.array([seed]))
Example #7
# -*- coding: utf-8 -*-
"""
Created on Mon Feb  5 23:54:51 2018

@author: Eric Wang, Duoxiao Chang, Yipeng Zhu
"""

import matplotlib.pyplot as plt
#import analysis as ANA
from option_pricing import hist_vol
from portfolios import Port1, Port2, Port3, Port4
from data_handling import match_data, get_data

get_data()
data = match_data()
hist_vol(data, 30)

Port1(data)
Port2(data)
Port3(data)
Port4(data)

data = data.loc[121:, ]
data.to_csv('../Results/result.csv')

plt.hist(data[["P1_daily_return", "P2_daily_return", "P3_daily_return", "P4_daily_return"]],
         label=['P1', 'P2', 'P3', 'P4'])

plt.legend(loc='upper left')
plt.savefig('../Results/hist.png')
data.plot(y=['P1_value', 'P2_value', 'P3_value', 'P4_value'])
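
# The final line plot above is neither shown nor saved in the original
# snippet; a plausible completion (the output file name is an assumption):
plt.savefig('../Results/portfolio_values.png')
plt.show()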