def cross_validate(data, node_min=4, node_max=16, n_nodes=7):
    """Nested leave-one-trial-out CV of a dense ANN envelope decoder,
    trained separately per (TA, SNR) condition.

    The inner loop selects the optimal number of hidden nodes; the outer
    loop retrains with that size and correlates the predicted envelope
    against the attended, unattended (mask) and a random-trial envelope.

    Parameters
    ----------
    data : pandas.DataFrame
        Load dataframe from data_load, using getData function. The first 16
        columns are the input features; "target", "mask", "trial", "TA" and
        "SNR" columns are required.
    node_min : int, optional
        Min nr. of nodes in range of nodes to loop over in inner CV loop.
    node_max : int, optional
        Max nr. of nodes in range of nodes to loop over in inner CV loop.
    n_nodes : int, optional
        Number, N, of nodes to loop over in inner CV loop.

    Returns
    -------
    pandas.DataFrame
        Result rows; also progressively pickled to disk per condition.
    """
    # Define range of nodes to optimize for training models
    nodes = np.linspace(node_min, node_max, n_nodes)
    nodes = nodes.astype(int)

    # Split into train/testing with leave-one-group-out
    logo = LeaveOneGroupOut()

    # Define result DataFrame
    df_cols = ["corr_true", "corr_mask", "corr_rand", "TA", "SNR",
               "Optimal Nr. Nodes"]
    df = pd.DataFrame(columns=df_cols)

    TAs = np.unique(data["TA"])
    for TA in TAs:
        SNRs = np.unique(data[data["TA"] == TA]["SNR"])
        for SNR in SNRs:
            data_sub = data[(data["TA"] == TA) & (data["SNR"] == SNR)]

            # Assign X, y and group variable (trial, as to do leave-trial-out)
            X = data_sub[data.columns[:16]]
            # Attended audio
            y = data_sub["target"]
            # Unattended audio
            masks = data_sub["mask"]
            groups = data_sub["trial"]
            n_outer_groups = len(np.unique(groups))

            ### Two-layer CV starts here ###
            # Outer fold
            i = 0
            for out_train_idx, out_test_idx in logo.split(X, y, groups):
                X_train = X.iloc[out_train_idx]
                y_train = y.iloc[out_train_idx]
                X_test = X.iloc[out_test_idx]
                y_test = y.iloc[out_test_idx]

                # Define inner groups, these are n - 1 of n total groups.
                # BUGFIX: the split indices are positions within data_sub,
                # so they must index the condition subset's trials (groups),
                # not the full dataframe -- data["trial"].iloc[out_train_idx]
                # picked trial labels from unrelated rows for every condition
                # after the first.
                inner_groups = groups.iloc[out_train_idx]
                n_inner_groups = len(np.unique(inner_groups))

                # Initiate errors for inner fold validations
                vals = np.zeros((n_inner_groups, n_nodes))

                # Inner fold
                j = 0
                for inn_train_idx, inn_test_idx in logo.split(
                        X_train, y_train, inner_groups):
                    print("TA = %i / %i\tSNR = %i / %i\nOuter fold %i / %i\t Inner fold %i / %i"
                          % (TA + 1, len(TAs), SNR + 1, len(SNRs),
                             i + 1, n_outer_groups, j + 1, n_inner_groups))
                    inn_X_train = X_train.iloc[inn_train_idx]
                    inn_y_train = y_train.iloc[inn_train_idx]
                    inn_X_test = X_train.iloc[inn_test_idx]
                    inn_y_test = y_train.iloc[inn_test_idx]

                    # Validate model with all parameters
                    k = 0
                    for l in nodes:
                        # Define model with l hidden units
                        model = Sequential()
                        # Batch normalization
                        model.add(BatchNormalization())
                        # NOTE(review): input_shape passes the sample count
                        # as a dimension; since the first layer has no input
                        # shape the model is built from the data on first
                        # fit, so this is presumably ignored -- confirm, and
                        # consider input_shape=(n_features,).
                        model.add(Dense(units=l, activation='tanh',
                                        input_shape=(inn_X_train.shape[0],
                                                     inn_X_train.shape[1])))
                        model.add(Dropout(0.2))
                        # Batch normalization
                        model.add(BatchNormalization())
                        model.add(Dense(units=2, activation='tanh'))
                        model.add(Dropout(0.5))
                        # Batch normalization
                        model.add(BatchNormalization())
                        model.add(Dense(units=1, activation='linear'))
                        model.compile(optimizer='rmsprop', loss=corr_loss)
                        model.fit(np.asarray(inn_X_train),
                                  np.asarray(inn_y_train),
                                  epochs=30, verbose=2, shuffle=False)
                        results = model.evaluate(np.asarray(inn_X_test),
                                                 np.asarray(inn_y_test))
                        # Compute Pearson R correlation for regressional value
                        val = results
                        vals[j, k] = val
                        k += 1
                    j += 1

                # Get optimal parameter.
                # NOTE(review): argmax assumes evaluate() returns a score to
                # maximize, but the all-SNR ANN variant uses argmin on the
                # same corr_loss -- verify the sign convention of corr_loss.
                param_score = np.sum(vals, axis=0)
                node_opt = nodes[np.argmax(param_score)]
                print("Optimal nodes = %f" % node_opt)

                # Train optimal model
                model_opt = Sequential()
                # Batch normalization
                model_opt.add(BatchNormalization())
                model_opt.add(Dense(units=node_opt, activation='tanh',
                                    input_shape=(X_train.shape[0],
                                                 X_train.shape[1])))
                model_opt.add(Dropout(0.2))
                # Batch normalization
                model_opt.add(BatchNormalization())
                model_opt.add(Dense(units=2, activation='tanh'))
                model_opt.add(Dropout(0.5))
                # Batch normalization
                model_opt.add(BatchNormalization())
                model_opt.add(Dense(units=1, activation='linear'))
                model_opt.compile(optimizer='rmsprop', loss=corr_loss)
                model_opt.fit(np.asarray(X_train), np.asarray(y_train),
                              epochs=30, verbose=2, shuffle=False)

                # Predict envelope
                y_pred = model_opt.predict(np.asarray(X_test))
                trial_test = np.unique(data_sub.iloc[out_test_idx]["trial"])[0]
                # Random speech
                y_rand = random_trial(data, TA=TA, trial=trial_test)["target"]

                # Compute Pearson R between predicted envelope and attended speech
                corr_true = corr(K.constant(np.asarray(y_test)),
                                 K.constant(y_pred))
                # Compute Pearson R between predicted envelope and unattended speech
                corr_mask = corr(K.constant(np.asarray(masks.iloc[out_test_idx])),
                                 K.constant(y_pred))
                # Compute Pearson R between predicted envelope and random speech
                corr_rand = corr(K.constant(np.asarray(y_rand)),
                                 K.constant(y_pred))

                ### Add correlations to dataframe ###
                # Convert to DataFrame
                data_results = np.zeros((1, len(df_cols)))
                data_results[:, 0] = np.asarray(corr_true)
                data_results[:, 1] = np.asarray(corr_mask)
                data_results[:, 2] = np.asarray(corr_rand)
                data_results[:, 3] = TA
                data_results[:, 4] = SNR
                data_results[:, 5] = node_opt
                df_ = pd.DataFrame(data=data_results, columns=df_cols)
                # Concatenate
                df = pd.concat([df, df_], ignore_index=True)
                print(df)
                i += 1

            # Save progressively so a crash mid-run keeps earlier conditions
            df.to_pickle("/content/Measuring_Cognitive_Load_DTU_WSAudiology/local_data/results/Seperate_SNR_2_layer_ANN_result_%i_%i.pkl"
                         % (TA, SNR))
    return df
def cross_validate(data, TA=None, lambda_config=(2e0, 2e20, 11),
                   t_config=(-.25, .1)):
    """Nested CV of a linear (Ridge) stimulus-reconstruction model trained
    across all SNRs of a TA, with a per-SNR balanced trial split (each fold
    holds out one trial per SNR).

    Parameters
    ----------
    data : pandas.DataFrame
        Load dataframe from data_load, using getData function.
    TA : int, optional
        Restrict the run to a single test-subject (TA); all TAs when None.
    lambda_config : tuple, optional
        (min, max, count) lambda range for Ridge regression. The default is
        (2e0, 2e20, 11). Range from Cross et al 2016 publication.
    t_config : tuple, optional
        Jitter lag range for MNE in ms. The default is (-.25, .1).

    Returns
    -------
    pandas.DataFrame
        Result rows; also progressively pickled per outer fold.
    """
    np.random.seed(999)
    # Define lambda values for training models
    lambdas = np.linspace(lambda_config[0], lambda_config[1], lambda_config[2])
    # Parameters for MNE
    tmin, tmax = t_config
    sfreq = 64

    # Define result DataFrame
    df_cols = ["corr_true", "corr_mask", "corr_rand", "TA", "SNR"]
    df = pd.DataFrame(columns=df_cols)

    # Idiom fix: None comparison with "is", not "=="
    if TA is None:
        TAs = np.unique(data["TA"])
    else:
        TAs = np.array([TA])

    for TA in TAs:
        data_sub = data[data["TA"] == TA]
        data_train = data_sub
        SNRs = np.unique(data_sub["SNR"])
        SNR_order = []
        for SNR in SNRs:
            # Get the trials
            trials = data_sub[data_sub["SNR"] == SNR]["trial"]
            # Get the unique trial indicies
            trials = np.unique(trials)
            # Shuffle the order of the trials
            np.random.shuffle(trials)
            # Store the order
            SNR_order.append(trials)

        # Get the lowest possible k for k-fold
        K = np.inf
        for order in SNR_order:
            if len(order) < K:
                K = len(order)

        # Outer fold
        for k in range(K):
            # Split into test and training
            data_train = data_sub
            data_test = pd.DataFrame()
            # Filter the test data away.
            # NOTE(review): SNR values are assumed to be coded 0..n-1 so the
            # loop index i doubles as the SNR value -- confirm against data.
            for i in range(len(SNR_order)):
                # BUGFIX: build the boolean mask on data_train itself -- the
                # original masked the shrinking data_train with a
                # data_sub-length mask, which relies on deprecated pandas
                # boolean-Series index alignment.
                data_test = pd.concat([
                    data_test,
                    data_train[(data_train["SNR"] == i)
                               & (data_train["trial"] == SNR_order[i][k])]
                ], ignore_index=True)
                data_train = data_train.drop(
                    data_train[(data_train["SNR"] == i)
                               & (data_train["trial"] == SNR_order[i][k])].index)

            # Initiate errors for inner fold validations
            vals = np.zeros((K - 1, lambda_config[2]))

            # Get the list of validation trials
            SNR_valid_order = SNR_order.copy()
            for i in range(len(SNR_order)):
                SNR_valid_order[i] = np.delete(SNR_valid_order[i], k)

            # Inner fold
            for j in range(K - 1):
                print("TA: %i / %i\n\tFold: %i / %i\n\tInner fold: %i / %i"
                      % (TA + 1, len(TAs), k + 1, K, j + 1, K - 1))
                # Find optimal hyperparameter
                data_valid_train = data_train
                data_valid_test = pd.DataFrame()
                for i in range(len(SNR_order)):
                    data_valid_test = pd.concat([
                        data_valid_test,
                        data_valid_train[(data_valid_train["SNR"] == i)
                                         & (data_valid_train["trial"]
                                            == SNR_valid_order[i][j])]
                    ], ignore_index=True)
                    data_valid_train = data_valid_train.drop(
                        data_valid_train[(data_valid_train["SNR"] == i)
                                         & (data_valid_train["trial"]
                                            == SNR_valid_order[i][j])].index)

                i = 0
                for l in lambdas:
                    # Define model with l parameter
                    model = ReceptiveField(tmin, tmax, sfreq,
                                           feature_names=list(data.columns[:16]),
                                           estimator=l, scoring="corrcoef")
                    # Fit model to inner fold training data
                    model.fit(np.asarray(data_valid_train[data.columns[:16]]),
                              np.asarray(data_valid_train["target"]))
                    # Compute cross correlation for regressional value,
                    # averaged over the per-SNR validation trials
                    val = np.zeros(len(SNR_order))
                    for i_ in range(len(SNR_order)):
                        val[i_] = model.score(
                            np.asarray(data_valid_test[
                                data_valid_test["SNR"] == i_][data.columns[:16]]),
                            np.asarray(data_valid_test[
                                data_valid_test["SNR"] == i_]["target"]))
                    # Add score to matrix
                    vals[j, i] = np.mean(val)
                    i += 1
                # (removed redundant "j += 1": the for-loop controls j)

            # Get optimal parameter
            param_score = np.sum(vals, axis=0)
            lambda_opt = lambdas[np.argmax(param_score)]
            print("Optimal lambda = %f" % lambda_opt)

            # Train optimal model
            model_opt = ReceptiveField(tmin, tmax, sfreq,
                                       feature_names=list(data.columns[:16]),
                                       estimator=lambda_opt,
                                       scoring="corrcoef")
            # Fit model to train data
            model_opt.fit(np.asarray(data_train[data.columns[:16]]),
                          np.asarray(data_train["target"]))

            for i in range(len(SNR_order)):
                # Predict envelope for this SNR's held-out trial
                data_test_SNR = data_test[data_test["SNR"] == i]
                y_pred = model_opt.predict(
                    np.asarray(data_test_SNR[data.columns[:16]]))
                y_rand = random_trial(data, TA=TA,
                                      trial=SNR_order[i][k])["target"]
                corr_true = pearsonr(y_pred,
                                     np.asarray(data_test_SNR["target"]))
                corr_mask = pearsonr(y_pred,
                                     np.asarray(data_test_SNR["mask"]))
                corr_rand = pearsonr(y_pred, np.asarray(y_rand))

                # Convert to DataFrame (pearsonr returns (r, p); keep r)
                data_results = np.zeros((1, len(df_cols)))
                data_results[:, 0] = corr_true[0]
                data_results[:, 1] = corr_mask[0]
                data_results[:, 2] = corr_rand[0]
                data_results[:, 3] = TA
                data_results[:, 4] = i
                df_ = pd.DataFrame(data=data_results, columns=df_cols)
                # Concatenate
                df = pd.concat([df, df_], ignore_index=True)

            # Save progressively per outer fold
            df.to_pickle("local_data/results/result_%i_%i.pkl" % (TA, k))
    print("Done")
    return df
def cross_validate(data, lambda_config=(2**0, 2**20, 11), t_config=(-.25, .1)):
    """Nested leave-one-trial-out CV of a linear (Ridge) stimulus
    reconstruction model, trained separately per (TA, SNR) condition.

    Parameters
    ----------
    data : pandas.DataFrame
        Load dataframe from data_load, using getData function.
    lambda_config : tuple, optional
        (min, max, count) lambda range for Ridge regression. The default is
        (2**0, 2**20, 11). Range from Cross et al 2016 publication.
    t_config : tuple, optional
        Jitter lag range for MNE in ms. The default is (-.25, .1).

    Returns
    -------
    pandas.DataFrame
        Result rows; also progressively pickled per condition.
    """
    # Define lambda values for training models
    lambdas = np.linspace(lambda_config[0], lambda_config[1], lambda_config[2])
    # Parameters for MNE
    tmin, tmax = t_config
    sfreq = 64

    # Split into train/testing with leave-one-group-out
    logo = LeaveOneGroupOut()

    # Define result DataFrame
    df_cols = ["corr_true", "corr_mask", "corr_rand", "TA", "SNR"]
    df = pd.DataFrame(columns=df_cols)

    TAs = np.unique(data["TA"])
    for TA in TAs:
        SNRs = np.unique(data[data["TA"] == TA]["SNR"])
        for SNR in SNRs:
            data_sub = data[(data["TA"] == TA) & (data["SNR"] == SNR)]

            # Assign X, y and group variable (trial, as to do leave-trial-out)
            X = data_sub[data.columns[:16]]
            y = data_sub["target"]
            masks = data_sub["mask"]
            groups = data_sub["trial"]
            n_outer_groups = len(np.unique(groups))

            ### Two-layer CV starts here ###
            # Outer fold
            i = 0
            for out_train_idx, out_test_idx in logo.split(X, y, groups):
                X_train = X.iloc[out_train_idx]
                y_train = y.iloc[out_train_idx]
                X_test = X.iloc[out_test_idx]
                y_test = y.iloc[out_test_idx]

                # Define inner groups, these are n - 1 of n total groups.
                # BUGFIX: the split indices are positions within data_sub,
                # so they must index the condition subset's trials (groups),
                # not the full dataframe -- data["trial"].iloc[out_train_idx]
                # picked trial labels from unrelated rows for every condition
                # after the first.
                inner_groups = groups.iloc[out_train_idx]
                n_inner_groups = len(np.unique(inner_groups))

                # Initiate errors for inner fold validations
                vals = np.zeros((n_inner_groups, lambda_config[2]))

                # Inner fold
                j = 0
                for inn_train_idx, inn_test_idx in logo.split(
                        X_train, y_train, inner_groups):
                    print(
                        "TA = %i / %i\tSNR = %i / %i\nOuter fold %i / %i\t Inner fold %i / %i"
                        % (TA + 1, len(TAs), SNR + 1, len(SNRs), i + 1,
                           n_outer_groups, j + 1, n_inner_groups))
                    inn_X_train = X_train.iloc[inn_train_idx]
                    inn_y_train = y_train.iloc[inn_train_idx]
                    inn_X_test = X_train.iloc[inn_test_idx]
                    inn_y_test = y_train.iloc[inn_test_idx]

                    # Validate model with all parameters
                    k = 0
                    for l in lambdas:
                        # Define model with l parameter
                        model = ReceptiveField(tmin, tmax, sfreq,
                                               feature_names=list(data.columns[:16]),
                                               estimator=l,
                                               scoring="corrcoef")
                        # Fit model to inner fold training data
                        model.fit(np.asarray(inn_X_train),
                                  np.asarray(inn_y_train))
                        # Compute cross correlation for regressional value
                        val = model.score(np.asarray(inn_X_test),
                                          np.asarray(inn_y_test))
                        # Add score to matrix
                        vals[j, k] = val
                        k += 1
                    j += 1

                # Get optimal parameter
                param_score = np.sum(vals, axis=0)
                lambda_opt = lambdas[np.argmax(param_score)]
                print("Optimal lambda = %f" % lambda_opt)

                # Train optimal model
                model_opt = ReceptiveField(tmin, tmax, sfreq,
                                           feature_names=list(data.columns[:16]),
                                           estimator=lambda_opt,
                                           scoring="corrcoef")
                # Fit model to train data
                model_opt.fit(np.asarray(X_train), np.asarray(y_train))

                # Predict envelope
                y_pred = model_opt.predict(np.asarray(X_test))
                trial_test = np.unique(data_sub.iloc[out_test_idx]["trial"])[0]
                y_rand = random_trial(data, TA=TA, trial=trial_test)["target"]
                corr_true = pearsonr(y_pred, np.asarray(y_test))
                corr_mask = pearsonr(y_pred,
                                     np.asarray(masks.iloc[out_test_idx]))
                corr_rand = pearsonr(y_pred, np.asarray(y_rand))

                ### Add correlations to dataframe ###
                # Convert to DataFrame (pearsonr returns (r, p); keep r)
                data_results = np.zeros((1, len(df_cols)))
                data_results[:, 0] = corr_true[0]
                data_results[:, 1] = corr_mask[0]
                data_results[:, 2] = corr_rand[0]
                data_results[:, 3] = TA
                data_results[:, 4] = SNR
                df_ = pd.DataFrame(data=data_results, columns=df_cols)
                # Concatenate
                df = pd.concat([df, df_], ignore_index=True)
                i += 1

            # Save progressively per condition
            df.to_pickle("local_data/results/result_%i_%i.pkl" % (TA, SNR))
    return df
def cross_validate(data, TA=None, node_min=4, node_max=16, n_nodes=7):
    """Nested CV of a dense ANN envelope decoder trained across all SNRs of
    a TA, with a per-SNR balanced trial split (each fold holds out one trial
    per SNR).

    Parameters
    ----------
    data : pandas.DataFrame
        Load dataframe from data_load, using getData function.
    TA : int, optional
        Restrict the run to a single test-subject (TA); all TAs when None.
    node_min : int, optional
        Min nr. of nodes in range of nodes to loop over in inner CV loop.
    node_max : int, optional
        Max nr. of nodes in range of nodes to loop over in inner CV loop.
    n_nodes : int, optional
        Number, N, of nodes to loop over in inner CV loop.

    Returns
    -------
    pandas.DataFrame
        Result rows; also progressively pickled per outer fold.
    """
    np.random.seed(999)
    # Define range of nodes to optimize for training models
    nodes = np.linspace(node_min, node_max, n_nodes)
    nodes = nodes.astype(int)

    # Define result DataFrame
    df_cols = [
        "corr_true", "corr_mask", "corr_rand", "TA", "SNR",
        "Optimal Nr. Nodes"
    ]
    df = pd.DataFrame(columns=df_cols)

    # Idiom fix: None comparison with "is", not "=="
    if TA is None:
        TAs = np.unique(data["TA"])
    else:
        TAs = np.array([TA])

    for TA in TAs:
        data_sub = data[data["TA"] == TA]
        data_train = data_sub
        SNRs = np.unique(data_sub["SNR"])
        SNR_order = []
        for SNR in SNRs:
            # Get the trials
            trials = data_sub[data_sub["SNR"] == SNR]["trial"]
            # Get the unique trial indicies
            trials = np.unique(trials)
            # Shuffle the order of the trials
            np.random.shuffle(trials)
            # Store the order
            SNR_order.append(trials)

        # Get the lowest possible k for k-fold
        H = np.inf
        for order in SNR_order:
            if len(order) < H:
                H = len(order)

        # Outer fold
        for k in range(H):
            # Split into test and training
            data_train = data_sub
            data_test = pd.DataFrame()
            # Filter the test data away.
            # NOTE(review): SNR values are assumed to be coded 0..n-1 so the
            # loop index i doubles as the SNR value -- confirm against data.
            for i in range(len(SNR_order)):
                # BUGFIX: build the boolean mask on data_train itself -- the
                # original masked the shrinking data_train with a
                # data_sub-length mask, which relies on deprecated pandas
                # boolean-Series index alignment.
                data_test = pd.concat([
                    data_test,
                    data_train[(data_train["SNR"] == i)
                               & (data_train["trial"] == SNR_order[i][k])]
                ], ignore_index=True)
                data_train = data_train.drop(
                    data_train[(data_train["SNR"] == i)
                               & (data_train["trial"] == SNR_order[i][k])].index)

            # Initiate errors for inner fold validations
            vals = np.zeros((H - 1, n_nodes))

            # Get the list of validation trials
            SNR_valid_order = SNR_order.copy()
            for i in range(len(SNR_order)):
                SNR_valid_order[i] = np.delete(SNR_valid_order[i], k)

            # Inner fold
            for j in range(H - 1):
                print("TA: %i / %i\n\tFold: %i / %i\n\tInner fold: %i / %i"
                      % (TA + 1, len(TAs), k + 1, H, j + 1, H - 1))
                # Find optimal hyperparameter
                data_valid_train = data_train
                data_valid_test = pd.DataFrame()
                for i in range(len(SNR_order)):
                    data_valid_test = pd.concat([
                        data_valid_test,
                        data_valid_train[(data_valid_train["SNR"] == i)
                                         & (data_valid_train["trial"]
                                            == SNR_valid_order[i][j])]
                    ], ignore_index=True)
                    data_valid_train = data_valid_train.drop(
                        data_valid_train[(data_valid_train["SNR"] == i)
                                         & (data_valid_train["trial"]
                                            == SNR_valid_order[i][j])].index)

                i = 0
                for l in nodes:
                    # Define model with l hidden units
                    model = Sequential()
                    model.add(BatchNormalization())
                    # NOTE(review): input_shape passes the sample count as a
                    # dimension; since the first layer has no input shape the
                    # model is built from the data on first fit, so this is
                    # presumably ignored -- confirm, consider (n_features,).
                    model.add(Dense(
                        units=l, activation='tanh',
                        input_shape=(
                            np.asarray(data_valid_train[data.columns[:16]]).shape[0],
                            np.asarray(data_valid_train[data.columns[:16]]).shape[1])))
                    model.add(Dropout(0.2))
                    # Batch normalization
                    model.add(BatchNormalization())
                    model.add(Dense(units=2, activation='tanh'))
                    model.add(Dropout(0.5))
                    # Batch normalization
                    model.add(BatchNormalization())
                    model.add(Dense(units=1, activation='linear'))
                    model.compile(optimizer='rmsprop', loss=corr_loss)
                    model.fit(np.asarray(data_valid_train[data.columns[:16]]),
                              np.asarray(data_valid_train["target"]),
                              epochs=30, verbose=2, shuffle=False)
                    # Compute cross correlation for regressional value,
                    # averaged over the per-SNR validation trials
                    val = np.zeros(len(SNR_order))
                    for i_ in range(len(SNR_order)):
                        val[i_] = model.evaluate(
                            np.asarray(data_valid_test[
                                data_valid_test["SNR"] == i_][data.columns[:16]]),
                            np.asarray(data_valid_test[
                                data_valid_test["SNR"] == i_]["target"]))
                    # Add score to matrix
                    vals[j, i] = np.mean(val)
                    i += 1
                # (removed redundant "j += 1": the for-loop controls j)

            # Get optimal parameter.
            # NOTE(review): argmin treats the summed evaluate() losses as a
            # quantity to minimize; the per-SNR ANN variant uses argmax on
            # the same corr_loss -- verify the sign convention of corr_loss.
            param_score = np.sum(vals, axis=0)
            node_opt = nodes[np.argmin(param_score)]
            # BUGFIX: message said "Optimal lambda" for a node count
            print("Optimal nodes = %f" % node_opt)

            # Train optimal model
            model_opt = Sequential()
            model_opt.add(BatchNormalization())
            model_opt.add(Dense(
                units=node_opt, activation='tanh',
                input_shape=(
                    np.asarray(data_train[data.columns[:16]]).shape[0],
                    np.asarray(data_train[data.columns[:16]]).shape[1])))
            model_opt.add(Dropout(0.2))
            # Batch normalization
            model_opt.add(BatchNormalization())
            model_opt.add(Dense(units=2, activation='tanh'))
            model_opt.add(Dropout(0.5))
            # Batch normalization
            model_opt.add(BatchNormalization())
            model_opt.add(Dense(units=1, activation='linear'))
            model_opt.compile(optimizer='rmsprop', loss=corr_loss)
            # Fit model to train data
            model_opt.fit(np.asarray(data_train[data.columns[:16]]),
                          np.asarray(data_train["target"]),
                          epochs=30, verbose=2, shuffle=False)

            for i in range(len(SNR_order)):
                # Predict envelope for this SNR's held-out trial
                data_test_SNR = data_test[data_test["SNR"] == i]
                y_pred = model_opt.predict(
                    np.asarray(data_test_SNR[data.columns[:16]]))
                y_rand = random_trial(data, TA=TA,
                                      trial=SNR_order[i][k])["target"]

                # Compute Pearson R between predicted envelope and attended speech
                corr_true = corr(
                    K.constant(np.asarray(data_test_SNR["target"])),
                    K.constant(y_pred))
                # Compute Pearson R between predicted envelope and unattended speech
                corr_mask = corr(K.constant(np.asarray(data_test_SNR["mask"])),
                                 K.constant(y_pred))
                # Compute Pearson R between predicted envelope and random speech
                corr_rand = corr(K.constant(np.asarray(y_rand)),
                                 K.constant(y_pred))

                # Convert to DataFrame.
                # BUGFIX: columns 4 and 5 were swapped relative to df_cols --
                # node_opt was stored under "SNR" and the SNR index under
                # "Optimal Nr. Nodes".
                data_results = np.zeros((1, len(df_cols)))
                data_results[:, 0] = np.asarray(corr_true)
                data_results[:, 1] = np.asarray(corr_mask)
                data_results[:, 2] = np.asarray(corr_rand)
                data_results[:, 3] = TA
                data_results[:, 4] = i
                data_results[:, 5] = node_opt
                df_ = pd.DataFrame(data=data_results, columns=df_cols)
                # Concatenate
                df = pd.concat([df, df_], ignore_index=True)

            # Save progressively per outer fold
            df.to_pickle(
                "/content/Measuring_Cognitive_Load_DTU_WSAudiology/local_data/results/ALL_SNR_2_layer_ANN_result_%i_%i.pkl"
                % (TA, k))
    print("Done")
    return df