# Train an LSTM regressor for every (track length, padding side) setting and
# save each model's weights to its own checkpoint pickle.
print("loading data")
dsets = ['clath_aux+gak_a7d2', 'clath_aux+gak', 'clath_aux+gak_a7d2_new',
         'clath_aux+gak_new', 'clath_gak', 'clath_aux_dynamin']
splits = ['train', 'test']
#feat_names = ['X_same_length_normalized'] + data.select_final_feats(data.get_feature_names(df)) #['mean_total_displacement', 'mean_square_displacement', 'lifetime']
meta = ['cell_num', 'Y_sig_mean', 'Y_sig_mean_normalized', 'y_consec_thresh']

# Cells held out from training; hoisted out of the loop (loop-invariant).
valid_cells = ['A7D2/1', 'CLTA-TagRFP EGFP-Aux1 EGFP-GAK F6/1',
               'CLTA-TagRFP EGFP-GAK A8/1', 'EGFP-GAK F6/1',
               '488-1.5mW 561-1.5mW 647-1.5mW Exp100ms Int1.5s_4_Pos0/1_1.5s',
               '488-1.5mW 561-1.5mW 647-1.5mW Exp100ms Int1.5s_4_Pos1/1_1.5s',
               '488-1.5mW 561-1.5mW 647-1.5mW Exp100ms Int1.5s_4_Pos2/1_1.5s']

for length in [40, 100, 200]:
    for padding in ['front', 'end']:
        dfs, feat_names = data.load_dfs_for_lstm(dsets=dsets, splits=splits,
                                                 meta=meta, length=length,
                                                 padding=padding)
        # concatenate every training split into one frame
        df_full = pd.concat([dfs[(k, s)] for (k, s) in dfs
                             if s == 'train'])[feat_names + meta]
        np.random.seed(42)  # reproducible weights per setting
        checkpoint_fname = f'../models/dnn_full_long_normalized_across_track_1_feat_dynamin_{length}_{padding}_tuning.pkl'
        valid = df_full['cell_num'].isin(valid_cells)
        df_full_train = df_full[~valid]
        dnn = neural_networks.neural_net_sklearn(D_in=length, H=20, p=0,
                                                 arch='lstm', epochs=200)
        # fit on the track feature only (first entry of feat_names)
        dnn.fit(df_full_train[feat_names[:1]],
                df_full_train['Y_sig_mean_normalized'].values,
                verbose=True, checkpoint_fname=checkpoint_fname)
        # fix: use a context manager -- the original leaked the file handle
        # from pkl.dump(..., open(checkpoint_fname, 'wb'))
        with open(checkpoint_fname, 'wb') as f:
            pkl.dump({'model_state_dict': dnn.model.state_dict()}, f)
# Train extended-track LSTMs, one per lifetime threshold, saving CPU weights
# to a checkpoint pickle for each threshold.
# NOTE(review): the opening `dsets = [` and the dataset names of this cell are
# truncated in this chunk -- the stray `]` below closes that missing list.
# Reconstruct the list (presumably the same dsets as the previous cell) before
# running; as written this line is not valid on its own.
] splits = ['train', 'test'] #feat_names = [''] + data.select_final_feats(data.get_feature_names(df)) #['mean_total_displacement', 'mean_square_displacement', 'lifetime'] length = 40 padding = 'end' feat_name = 'X_same_length_extended_normalized' # include buffer X_same_length_normalized outcome = 'Y_sig_mean_normalized' for lifetime_threshold in [5, 10, 15]: dfs, feat_names = data.load_dfs_for_lstm( dsets=dsets, splits=splits, lifetime_threshold=lifetime_threshold, length=length, padding=padding) df_full = pd.concat([dfs[(k, s)] for (k, s) in dfs if s == 'train']) np.random.seed(42) checkpoint_fname = f'../models/dnn_fit_extended_lifetimes>{lifetime_threshold}.pkl' dnn = neural_networks.neural_net_sklearn(D_in=length, H=20, p=0, arch='lstm', epochs=200, track_name=feat_name) dnn.fit(df_full[[feat_name]], df_full[outcome].values, verbose=True, checkpoint_fname=checkpoint_fname, device='cuda') pkl.dump({'model_state_dict': dnn.model.cpu().state_dict()}, open(checkpoint_fname, 'wb'))
def train_reg(df, feat_names, model_type='rf', outcome_def='Y_max_log',
              out_name='results/regression/test.pkl', seed=42, **kwargs):
    '''Train a regression model with leave-one-cell-out cross-validation.

    Fits `model_type` on df[feat_names] to predict df[outcome_def], scores
    each CV fold (folds are defined by unique `cell_num` values), refits on
    all the data, and pickles the results to `out_name`.

    Params
    ------
    df: pd.DataFrame
        must contain `feat_names`, `outcome_def`, and a `cell_num` column
    feat_names: list[str]
        feature column names
    model_type: str
        'rf', 'dt', 'linear', 'ridge', 'svm', 'gb', 'irf', or any string
        containing 'nn' for the neural-net wrapper
    outcome_def: str
        outcome column; a name containing 'log' triggers scoring on the
        exponentiated (original-scale) values
    out_name: str
        path for the output pickle
    seed: int
        numpy random seed
    **kwargs
        neural-net hyperparameters: fcnn_hidden_neurons, fcnn_epochs,
        fcnn_batch_size, track_name

    Raises
    ------
    ValueError if model_type is not one of the recognized values.
    '''
    np.random.seed(seed)
    X = df[feat_names]
    # X = (X - X.mean()) / X.std() # normalize the data
    y = df[outcome_def].values

    if model_type == 'rf':
        m = RandomForestRegressor(n_estimators=100)
    elif model_type == 'dt':
        m = DecisionTreeRegressor()
    elif model_type == 'linear':
        m = LinearRegression()
    elif model_type == 'ridge':
        m = RidgeCV()
    elif model_type == 'svm':
        m = SVR(gamma='scale')
    elif model_type == 'gb':
        m = GradientBoostingRegressor()
    elif model_type == 'irf':
        m = irf.ensemble.wrf()
    elif 'nn' in model_type:  # neural nets
        # fully connected / lstm neural-net wrapper; hyperparameters come
        # from kwargs with the same defaults as before (idiom: kwargs.get)
        H = kwargs.get('fcnn_hidden_neurons', 40)
        epochs = kwargs.get('fcnn_epochs', 1000)
        batch_size = kwargs.get('fcnn_batch_size', 1000)
        track_name = kwargs.get('track_name', 'X_same_length')
        D_in = len(df[track_name].iloc[0])
        m = neural_net_sklearn(D_in=D_in, H=H, p=len(feat_names) - 1,
                               epochs=epochs, batch_size=batch_size,
                               track_name=track_name, arch=model_type)
    else:
        # fix: the original fell through and crashed later with
        # UnboundLocalError on `m`; fail fast with a clear message
        raise ValueError(f'unrecognized model_type: {model_type}')

    # leave-one-cell-out CV
    # fix: sort the unique cell nums -- iterating a raw set gave a
    # nondeterministic fold order across runs
    cell_nums_train = np.array(sorted(set(df.cell_num.values)))
    kf = KFold(n_splits=len(cell_nums_train))

    num_pts_by_fold_cv = []
    y_preds = {}
    cv_score = []
    cv_pearsonr = []
    print("Looping over cv...")
    # loops over cv, where test set order is
    # cell_nums_train[0], ..., cell_nums_train[-1]
    for cv_idx, cv_val_idx in tqdm(kf.split(cell_nums_train)):
        # sample indices for this fold (rows belonging to the fold's cells)
        idxs_cv = df.cell_num.isin(cell_nums_train[np.array(cv_idx)])
        idxs_val_cv = df.cell_num.isin(cell_nums_train[np.array(cv_val_idx)])
        X_train_cv, Y_train_cv = X[idxs_cv], y[idxs_cv]
        X_val_cv, Y_val_cv = X[idxs_val_cv], y[idxs_val_cv]
        num_pts_by_fold_cv.append(X_val_cv.shape[0])

        m.fit(X_train_cv, Y_train_cv)
        preds = m.predict(X_val_cv)
        # key the predictions by the held-out cell (one cell per fold)
        y_preds[cell_nums_train[np.array(cv_val_idx)][0]] = preds

        # score in original (un-logged) space when the outcome is log-scale
        if 'log' in outcome_def:
            cv_score.append(r2_score(np.exp(Y_val_cv), np.exp(preds)))
            cv_pearsonr.append(pearsonr(np.exp(Y_val_cv), np.exp(preds))[0])
        else:
            print(r2_score(Y_val_cv, preds))
            cv_score.append(r2_score(Y_val_cv, preds))
            cv_pearsonr.append(pearsonr(Y_val_cv, preds)[0])

    print("Training with full data...")
    m.fit(X, y)

    results = {
        'y_preds': y_preds,
        'y': y,
        'cv': {
            'r2': cv_score,
            'pearsonr': cv_pearsonr
        },
        'model_type': model_type,
        'num_pts_by_fold_cv': np.array(num_pts_by_fold_cv),
    }
    # fix: only the nn wrapper has a `.model` attribute -- the original
    # called m.model.state_dict() unconditionally, raising AttributeError
    # for every sklearn estimator. Also saves 'dt' models now (previously
    # omitted from the save list, inconsistently with its siblings).
    if 'nn' in model_type:
        results['model_state_dict'] = m.model.state_dict()
    else:
        results['model'] = m

    # save results
    # os.makedirs(out_dir, exist_ok=True)
    # fix: close the output file -- the original leaked the handle
    with open(out_name, 'wb') as f:
        pkl.dump(results, f)
# Score the current model on the current df, then load the saved LSTM and
# score it on every test split, writing dataset-level results to csv.
# NOTE(review): relies on names defined earlier in the script (m, feat_set,
# outcome_def, get_all_scores, models, ds, feat_names, dataset_level_res).
# df = df.dropna()
X = df[feat_set]
X = X.fillna(X.mean())  # impute missing feature values with column means
#y = df['Y_sig_mean_normalized']
y_reg = df['Y_sig_mean_normalized'].values
y = df[outcome_def].values
preds = m.predict(X)
get_all_scores(y, preds, y_reg, df)

print("computing predictions for lstm")
models.append('lstm')
# fix: use a context manager -- the original leaked the handle from
# pkl.load(open(..., 'rb'))
with open('../models/dnn_full_long_normalized_across_track_1_feat_dynamin.pkl',
          'rb') as f:
    results = pkl.load(f)
dnn = neural_networks.neural_net_sklearn(D_in=40, H=20, p=0, arch='lstm')
dnn.model.load_state_dict(results['model_state_dict'])
# fix: dropped the unused enumerate index and the explicit .keys() --
# iterating the dict yields its (dataset, split) keys directly
for (k, v) in ds:
    if v == 'test':
        df = ds[(k, v)]
        X = df[feat_names[:1]]
        y_reg = df['Y_sig_mean_normalized'].values
        y = df[outcome_def].values
        #preds = np.logical_and(dnn.predict(X), df['X_max'] > 1500).values.astype(int)
        preds = dnn.predict(X)
        get_all_scores(y, preds, y_reg, df)

print('saving')
dataset_level_res = pd.DataFrame(dataset_level_res, index=models)
dataset_level_res.to_csv(f"../reports/dataset_level_res_{outcome_def}.csv")