Example #1
 print("loading data")
 dsets = ['clath_aux+gak_a7d2', 'clath_aux+gak', 'clath_aux+gak_a7d2_new', 'clath_aux+gak_new', 'clath_gak', 'clath_aux_dynamin']
 splits = ['train', 'test']
 #feat_names = ['X_same_length_normalized'] + data.select_final_feats(data.get_feature_names(df))
               #['mean_total_displacement', 'mean_square_displacement', 'lifetime']
 meta = ['cell_num', 'Y_sig_mean', 'Y_sig_mean_normalized', 'y_consec_thresh']
 for length in [40, 100, 200]:
     for padding in ['front', 'end']:
         dfs, feat_names = data.load_dfs_for_lstm(dsets=dsets, 
                                                  splits=splits, 
                                                  meta=meta,
                                                  length=length,
                                                  padding=padding)
         df_full = pd.concat([dfs[(k, s)]
                              for (k, s) in dfs
                              if s == 'train'])[feat_names + meta]
         np.random.seed(42)
         checkpoint_fname = f'../models/dnn_full_long_normalized_across_track_1_feat_dynamin_{length}_{padding}_tuning.pkl'
         valid_cells = ['A7D2/1', 
                        'CLTA-TagRFP EGFP-Aux1 EGFP-GAK F6/1', 
                        'CLTA-TagRFP EGFP-GAK A8/1', 
                        'EGFP-GAK F6/1',
                        '488-1.5mW 561-1.5mW 647-1.5mW Exp100ms Int1.5s_4_Pos0/1_1.5s',
                        '488-1.5mW 561-1.5mW 647-1.5mW Exp100ms Int1.5s_4_Pos1/1_1.5s',
                        '488-1.5mW 561-1.5mW 647-1.5mW Exp100ms Int1.5s_4_Pos2/1_1.5s']
         valid = df_full['cell_num'].isin(valid_cells)
         df_full_train = df_full[~valid]
         dnn = neural_networks.neural_net_sklearn(D_in=length, H=20, p=0, arch='lstm', epochs=200)
         dnn.fit(df_full_train[feat_names[:1]], df_full_train['Y_sig_mean_normalized'].values, verbose=True, checkpoint_fname=checkpoint_fname)
         pkl.dump({'model_state_dict': dnn.model.state_dict()}, open(checkpoint_fname, 'wb'))
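
A checkpoint saved this way can be restored by rebuilding the network with the same hyperparameters and loading the pickled state dict, the same pattern Example #4 uses below. A minimal sketch; the filename is one instance of the checkpoint_fname pattern above:

import pickle as pkl

# rebuild with the hyperparameters used at training time (here length=40, padding='end')
dnn = neural_networks.neural_net_sklearn(D_in=40, H=20, p=0, arch='lstm')
state = pkl.load(open('../models/dnn_full_long_normalized_across_track_1_feat_dynamin_40_end_tuning.pkl', 'rb'))
dnn.model.load_state_dict(state['model_state_dict'])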
Example #2
dsets = ['clath_aux+gak_a7d2', 'clath_aux+gak', 'clath_aux+gak_a7d2_new',
         'clath_aux+gak_new', 'clath_gak', 'clath_aux_dynamin']  # opening of this list is truncated in the source; assumed to match Example #1
splits = ['train', 'test']
# feat_names = ['X_same_length_normalized'] + data.select_final_feats(data.get_feature_names(df))
#              e.g. ['mean_total_displacement', 'mean_square_displacement', 'lifetime']
length = 40
padding = 'end'
feat_name = 'X_same_length_extended_normalized'  # extended track includes a buffer; 'X_same_length_normalized' omits it
outcome = 'Y_sig_mean_normalized'

# sweep over the minimum-lifetime cutoff used to filter tracks
for lifetime_threshold in [5, 10, 15]:
    dfs, feat_names = data.load_dfs_for_lstm(dsets=dsets,
                                             splits=splits,
                                             lifetime_threshold=lifetime_threshold,
                                             length=length,
                                             padding=padding)
    df_full = pd.concat([dfs[(k, s)] for (k, s) in dfs if s == 'train'])
    np.random.seed(42)
    checkpoint_fname = f'../models/dnn_fit_extended_lifetimes>{lifetime_threshold}.pkl'
    dnn = neural_networks.neural_net_sklearn(D_in=length,
                                             H=20,
                                             p=0,
                                             arch='lstm',
                                             epochs=200,
                                             track_name=feat_name)
    dnn.fit(df_full[[feat_name]],
            df_full[outcome].values,
            verbose=True,
            checkpoint_fname=checkpoint_fname,
            device='cuda')
    # move the model to cpu before pickling so the checkpoint loads without a gpu
    pkl.dump({'model_state_dict': dnn.model.cpu().state_dict()},
             open(checkpoint_fname, 'wb'))
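
The held-out split from this sweep can be scored directly once a model is trained. A minimal sketch, assuming r2_score from sklearn.metrics and that the wrapper predicts on cpu after training (the test splits are already present in dfs since splits includes 'test'):

from sklearn.metrics import r2_score

# evaluate the last-trained model on the pooled test splits
df_test = pd.concat([dfs[(k, s)] for (k, s) in dfs if s == 'test'])
preds = dnn.predict(df_test[[feat_name]])
print(r2_score(df_test[outcome].values, preds))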
Example #3
def train_reg(df,
              feat_names,
              model_type='rf',
              outcome_def='Y_max_log',
              out_name='results/regression/test.pkl',
              seed=42,
              **kwargs):
    '''Train a regression model with leave-one-cell-out cross-validation,
    refit it on the full data, and pickle the results to out_name.

    Model hyperparameters can be specified via **kwargs.
    '''
    np.random.seed(seed)
    X = df[feat_names]
    # X = (X - X.mean()) / X.std() # normalize the data
    y = df[outcome_def].values

    if model_type == 'rf':
        m = RandomForestRegressor(n_estimators=100)
    elif model_type == 'dt':
        m = DecisionTreeRegressor()
    elif model_type == 'linear':
        m = LinearRegression()
    elif model_type == 'ridge':
        m = RidgeCV()
    elif model_type == 'svm':
        m = SVR(gamma='scale')
    elif model_type == 'gb':
        m = GradientBoostingRegressor()
    elif model_type == 'irf':
        m = irf.ensemble.wrf()
    elif 'nn' in model_type:  # neural nets (fully connected or lstm)
        # hyperparameters, overridable through **kwargs
        H = kwargs.get('fcnn_hidden_neurons', 40)
        epochs = kwargs.get('fcnn_epochs', 1000)
        batch_size = kwargs.get('fcnn_batch_size', 1000)
        track_name = kwargs.get('track_name', 'X_same_length')
        D_in = len(df[track_name].iloc[0])  # length of each input track

        m = neural_net_sklearn(D_in=D_in,
                               H=H,
                               p=len(feat_names) - 1,  # number of scalar features besides the track
                               epochs=epochs,
                               batch_size=batch_size,
                               track_name=track_name,
                               arch=model_type)

    # leave-one-cell-out cross-validation: one fold per cell
    cell_nums_train = np.unique(df.cell_num.values)
    kf = KFold(n_splits=len(cell_nums_train))

    num_pts_by_fold_cv = []
    y_preds = {}
    cv_score = []
    cv_pearsonr = []
    print("Looping over cv...")
    # loops over cv, where test set order is cell_nums_train[0], ..., cell_nums_train[-1]
    for cv_idx, cv_val_idx in tqdm(kf.split(cell_nums_train)):
        # get sample indices

        idxs_cv = df.cell_num.isin(cell_nums_train[np.array(cv_idx)])
        idxs_val_cv = df.cell_num.isin(cell_nums_train[np.array(cv_val_idx)])
        X_train_cv, Y_train_cv = X[idxs_cv], y[idxs_cv]
        X_val_cv, Y_val_cv = X[idxs_val_cv], y[idxs_val_cv]
        num_pts_by_fold_cv.append(X_val_cv.shape[0])

        # resample training data

        # fit
        m.fit(X_train_cv, Y_train_cv)

        # predict on the held-out cell
        preds = m.predict(X_val_cv)
        y_preds[cell_nums_train[cv_val_idx][0]] = preds
        if 'log' in outcome_def:
            # score in the original (unlogged) units
            cv_score.append(r2_score(np.exp(Y_val_cv), np.exp(preds)))
            cv_pearsonr.append(pearsonr(np.exp(Y_val_cv), np.exp(preds))[0])
        else:
            cv_score.append(r2_score(Y_val_cv, preds))
            cv_pearsonr.append(pearsonr(Y_val_cv, preds)[0])

    print("Training with full data...")
    # cv_score = cv_score/len(cell_nums_train)
    m.fit(X, y)
    #print(cv_score)
    #test_preds = m.predict(X_test)
    results = {
        'y_preds': y_preds,
        'y': y,
        'model_state_dict': m.model.state_dict(),
        #'test_preds': test_preds,
        'cv': {
            'r2': cv_score,
            'pearsonr': cv_pearsonr
        },
        'model_type': model_type,
        #'model': m,
        'num_pts_by_fold_cv': np.array(num_pts_by_fold_cv),
    }
    if model_type in ['rf', 'linear', 'ridge', 'gb', 'svm', 'irf']:
        results['model'] = m
    # save results
    # os.makedirs(out_dir, exist_ok=True)

    pkl.dump(results, open(out_name, 'wb'))
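
A minimal invocation sketch: df is assumed to hold one row per track, with a cell_num column plus the feature and outcome columns (the feature names below are illustrative, taken from the commented-out list in Example #1):

feat_names = ['mean_total_displacement', 'mean_square_displacement', 'lifetime']
train_reg(df,
          feat_names=feat_names,
          model_type='rf',
          outcome_def='Y_max_log',
          out_name='results/regression/rf_Y_max_log.pkl')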
Example #4
                    X = df[feat_set]
                    X = X.fillna(X.mean())  # impute missing feature values with column means
                    y_reg = df['Y_sig_mean_normalized'].values
                    y = df[outcome_def].values
                    preds = m.predict(X)
                    get_all_scores(y, preds, y_reg, df)

    print("computing predictions for lstm")
    models.append('lstm')
    results = pkl.load(
        open(
            '../models/dnn_full_long_normalized_across_track_1_feat_dynamin.pkl',
            'rb'))
    dnn = neural_networks.neural_net_sklearn(D_in=40, H=20, p=0, arch='lstm')
    dnn.model.load_state_dict(results['model_state_dict'])
for (k, v) in ds:  # keys are (dataset, split) pairs
    if v == 'test':
        df = ds[(k, v)]
        X = df[feat_names[:1]]  # the track is the only input feature
        y_reg = df['Y_sig_mean_normalized'].values
        y = df[outcome_def].values
        preds = dnn.predict(X)
        get_all_scores(y, preds, y_reg, df)

print('saving')
dataset_level_res = pd.DataFrame(dataset_level_res, index=models)
dataset_level_res.to_csv(f"../reports/dataset_level_res_{outcome_def}.csv")
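
The saved table can be read back for a side-by-side comparison of models; a minimal sketch, assuming the same outcome_def:

dataset_level_res = pd.read_csv(f"../reports/dataset_level_res_{outcome_def}.csv", index_col=0)
print(dataset_level_res)  # one row per model, one column per score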