Example #1
def viz_errs_outliers_venn(X_test, preds, Y_test, num_feats_reduced=5):
    '''Compare outlier flags from four detectors to the model's errors in a Venn diagram
    '''
    feat_names = data.get_feature_names(X_test)
    X_feat = X_test[feat_names]

    if num_feats_reduced is not None:
        pca = decomposition.PCA(n_components=num_feats_reduced)
        X_reduced = pca.fit_transform(X_feat)
    else:
        X_reduced = X_feat

    R, C = 2, 2
    titles = [
        'isolation forest', 'local outlier factor', 'elliptic envelope',
        'one-class svm'
    ]
    plt.figure(figsize=(6, 5), dpi=200)
    for i in range(4):
        plt.subplot(R, C, i + 1)
        plt.title(titles[i])
        if i == 0:
            clf = IsolationForest(n_estimators=10, warm_start=True)
        elif i == 1:
            clf = LocalOutlierFactor(novelty=True)
        elif i == 2:
            clf = EllipticEnvelope()
        elif i == 3:
            clf = OneClassSVM()
        clf.fit(X_reduced)  # fit the current outlier detector
        is_outlier = clf.predict(X_reduced) == -1
        is_err = preds != Y_test
        idxs = np.arange(is_outlier.size)
        venn2([set(idxs[is_outlier]), set(idxs[is_err])],
              set_labels=['outliers', 'errors'])
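
A quick way to exercise this plot, sketched under assumptions: `clf` stands for any fitted sklearn classifier, and `X_test`/`y_test` for a held-out split whose columns `data.get_feature_names` recognizes (all three names are placeholders, not from the original code):

import matplotlib.pyplot as plt

# hypothetical usage: overlay one model's mistakes on the outlier flags
preds = clf.predict(X_test)
viz_errs_outliers_venn(X_test, preds, y_test, num_feats_reduced=5)
plt.tight_layout()
plt.show()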
Example #2
def test_reg(df,
             model,
             feat_names=None,
             outcome_def='Y_max_log',
             out_name='results/regression/test.pkl',
             seed=42):
    '''Predict with a fitted regression model and compute test metrics
    '''
    np.random.seed(seed)
    if not feat_names:
        feat_names = data.get_feature_names(df)
        feat_names = [
            x for x in feat_names if not x.startswith('sc_')
            and not x.startswith('nmf_') and not x in [
                'center_max', 'left_max', 'right_max', 'up_max', 'down_max',
                'X_max_around_Y_peak', 'X_max_after_Y_peak'
            ] and not x.startswith('pc_') and not 'log' in x
            and not 'binary' in x
            #               and not 'slope' in x
        ]
    X = df[feat_names]
    # X = (X - X.mean()) / X.std() # normalize the data
    test_preds = model.predict(X)
    results = {'preds': test_preds}
    if outcome_def in df.keys():
        y = df[outcome_def].values
        results['r2'] = r2_score(y, test_preds)
        results['pearsonr'] = pearsonr(y, test_preds)  # (statistic, p-value)
        results['kendalltau'] = kendalltau(y, test_preds)  # (statistic, p-value)

    return results
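
A minimal call sketch, assuming `df_test` carries the outcome column and `model` is already fitted (both names are placeholders):

results = test_reg(df_test, model, outcome_def='Y_max_log')
print(results['r2'])
print(results['pearsonr'][0])  # first element is the correlation statistic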
Example #3
def normalize_and_predict(m0,
                          feat_names_selected,
                          dset_name,
                          normalize_by_train,
                          exclude_easy_tracks=False,
                          outcome_def='y_consec_thresh'):
    df_new = data.get_data(dset=dset_name,
                           use_processed=True,
                           use_processed_dicts=True,
                           outcome_def=outcome_def,
                           previous_meta_file=oj(
                               DIR_PROCESSED,
                               'metadata_clath_aux+gak_a7d2.pkl'))
    if exclude_easy_tracks:
        df_new = df_new[
            df_new['valid']]  # exclude test cells, short/long tracks, hotspots

    # impute (only does anything for dynamin data)
    df_new = df_new.fillna(df_new.median())

    X_new = df_new[data.get_feature_names(df_new)]
    if normalize_by_train:
        X_new = (X_new - X_mean_train) / X_std_train  # training-set stats (module-level globals)
    else:
        X_new = (X_new - X_new.mean()) / X_new.std()
    y_new = df_new[outcome_def].values
    preds_new = m0.predict(X_new[feat_names_selected])
    preds_proba_new = m0.predict_proba(X_new[feat_names_selected])[:, 1]
    Y_maxes = df_new['Y_max']
    return df_new, y_new, preds_new, preds_proba_new, Y_maxes
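
For orientation, unpacking the five return values might look like this; `m0` and `feat_names_selected` are assumed to come from an earlier training step, and the dataset key is one that appears elsewhere in these examples:

df_new, y_new, preds_new, preds_proba_new, Y_maxes = normalize_and_predict(
    m0, feat_names_selected, 'clath_aux+gak_a7d2',
    normalize_by_train=True, outcome_def='y_consec_thresh')
print('accuracy:', (preds_new == y_new).mean())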
Example #4
def add_binary_features(df, outcome_def):
    '''binarize each feature at the midpoint between its two class means
    '''
    feat_names = data.get_feature_names(df)
    threshes = (df[df[outcome_def] == 1].mean() +
                df[df[outcome_def] == 0].mean()) / 2
    for k in tqdm(feat_names):
        thresh = threshes.loc[k]
        df[k + '_binary'] = df[k] >= thresh
    return df
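
A self-contained toy check of the midpoint rule (synthetic data; the `outcome` column name is illustrative):

import pandas as pd

df_toy = pd.DataFrame({'feat': [1.0, 2.0, 8.0, 9.0],
                       'outcome': [0, 0, 1, 1]})
# midpoint of the class means: (8.5 + 1.5) / 2 = 5.0
threshes = (df_toy[df_toy['outcome'] == 1].mean() +
            df_toy[df_toy['outcome'] == 0].mean()) / 2
assert list(df_toy['feat'] >= threshes.loc['feat']) == [False, False, True, True]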
Example #5
def normalize(df, outcome_def):
    X = df[data.get_feature_names(df)]
    X_mean = X.mean()
    X_std = X.std()
    ks = list(X.keys())

    norms = {k: {'mu': X_mean[k], 'std': X_std[k]} for k in ks}
    X = (X - X_mean) / X_std
    y = df[outcome_def].values
    return X, y, norms
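
The returned `norms` dict lets the same training-set statistics be reapplied to unseen data, e.g. (a sketch; `df_train` and `df_new` are placeholder splits):

X_train, y_train, norms = normalize(df_train, 'y_consec_thresh')
# standardize new data with the *training* mu/std rather than refitting
X_new = df_new[data.get_feature_names(df_new)].copy()
for k in X_new.columns:
    X_new[k] = (X_new[k] - norms[k]['mu']) / norms[k]['std']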
Example #6
def add_pcs(df):
    '''add the first 10 principal components of the standardized features
    '''
    feat_names = data.get_feature_names(df)
    X = df[feat_names]
    X = (X - X.mean()) / X.std()
    pca = decomposition.PCA(whiten=True)
    pca.fit(X[df.valid])  # fit PCA on valid rows only, then transform all rows
    X_reduced = pca.transform(X)
    for i in range(10):
        df['pc_' + str(i)] = X_reduced[:, i]
    return df
Example #7
def input_fn_dataset(df):
    features = df[data.get_feature_names()]
    labels = df[data.get_target_name()]

    dataset = tf.data.Dataset.from_tensor_slices((dict(features), labels))

    if param('shuffle_data'):
        dataset = dataset.shuffle(buffer_size=256)
    dataset = dataset.repeat(param('epochs'))
    dataset = dataset.batch(param('batch_size'))
    iterator = dataset.make_one_shot_iterator()  # TF1 API; use tf.compat.v1.data.make_one_shot_iterator(dataset) under TF2
    batch_features, batch_labels = iterator.get_next()
    return batch_features, batch_labels
Example #8
def input_fn_pandas(df):
    x = df[data.get_feature_names()]
    y = df[data.get_target_name()]

    return tf.estimator.inputs.pandas_input_fn(  # TF1 API; tf.compat.v1.estimator.inputs in TF2
        x,
        y,
        batch_size=param('batch_size'),
        num_epochs=param('epochs'),
        shuffle=param('shuffle_data'),
        queue_capacity=1000,
        num_threads=4,
        target_column=data.get_target_name()
    )
Example #9
def get_feature_columns():
    feature_names = data.get_feature_names()
    classifier_type = param('classifier_type')
    if classifier_type == 'linear':
        return [
            tf.feature_column.categorical_column_with_vocabulary_list(name, data.get_feature_values(name))
            for name in feature_names
        ]
    elif classifier_type == 'dnn':
        return [
            tf.feature_column.indicator_column(
                tf.feature_column.categorical_column_with_vocabulary_list(name, data.get_feature_values(name))
            )
            for name in feature_names
        ]
    raise ValueError('Unsupported classifier type: {}'.format(classifier_type))
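
A sketch of wiring these columns into a TF1-style estimator, assuming `param` and `input_fn_dataset` behave as in the surrounding examples (the `hidden_units` value and `df_train` are illustrative):

feature_columns = get_feature_columns()
if param('classifier_type') == 'linear':
    estimator = tf.estimator.LinearClassifier(feature_columns=feature_columns)
else:
    estimator = tf.estimator.DNNClassifier(feature_columns=feature_columns,
                                           hidden_units=[64, 32])
estimator.train(input_fn=lambda: input_fn_dataset(df_train))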
Example #10
def load_and_train(dset,
                   outcome_def,
                   out_dir,
                   feat_names=None,
                   use_processed=True):
    '''Load tracks for `dset`, split train/test, and fit several regression models
    '''
    df = pd.read_pickle(f'../data/tracks/tracks_{dset}.pkl')
    if dset == 'clath_aux_dynamin':
        df = df[df.catIdx.isin([1, 2])]
        df = df[df.lifetime > 15]
    else:
        df = df[df['valid'] == 1]
    df = features.add_basic_features(df)
    df = log_transforms(df)
    df = add_sig_mean(df)
    df_train = df[df.cell_num.isin(config.DSETS[dset]['train'])]
    df_test = df[df.cell_num.isin(config.DSETS[dset]['test'])]
    df_train = df_train.dropna()

    #outcome_def = 'Z_sig_mean'
    #out_dir = 'results/regression/Sep15'
    os.makedirs(out_dir, exist_ok=True)
    if not feat_names:
        feat_names = data.get_feature_names(df_train)
        feat_names = [
            x for x in feat_names if not x.startswith('sc_')
            and not x.startswith('nmf_') and not x in [
                'center_max', 'left_max', 'right_max', 'up_max', 'down_max',
                'X_max_around_Y_peak', 'X_max_after_Y_peak'
            ] and not x.startswith('pc_') and not 'log' in x
            and not 'binary' in x
            #               and not 'slope' in x
        ]
    for model_type in tqdm(['linear', 'gb', 'rf', 'svm', 'ridge']):
        out_name = f'{model_type}'
        #print(out_name)
        if use_processed and os.path.exists(f'{out_dir}/{out_name}.pkl'):
            continue
        train_reg(df_train,
                  feat_names=feat_names,
                  model_type=model_type,
                  outcome_def=outcome_def,
                  out_name=f'{out_dir}/{out_name}.pkl')
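
Invoking the pipeline end to end might look like this; the dataset key and outcome come from defaults seen elsewhere in these examples, while the output directory is illustrative:

load_and_train(dset='clath_aux+gak_a7d2',
               outcome_def='Y_max_log',
               out_dir='results/regression/test_run',
               use_processed=False)  # refit even if result pickles already exist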
Example #11
def add_trend_filtering(df):
    df_tf = deepcopy(df)
    # smooth each track's X series with trend filtering
    for i in range(len(df)):
        df_tf['X'].iloc[i] = trend_filtering.trend_filtering(
            y=df['X'].iloc[i], vlambda=len(df['X'].iloc[i]) * 5, order=1)
    df_tf = add_features(df_tf)
    feat_names = data.get_feature_names(df_tf)
    feat_names = [
        x for x in feat_names
        if not x.startswith('sc_') and not x.startswith('nmf_') and not x in [
            'center_max', 'left_max', 'right_max', 'up_max', 'down_max',
            'X_max_around_Y_peak', 'X_max_after_Y_peak',
            'X_max_diff_after_Y_peak', 'X_tf'
        ] and not x.startswith('pc_')
        #               and not 'local' in x
        #               and not 'X_peak' in x
        #               and not 'slope' in x
        #               and not x in ['fall_final', 'fall_slope', 'fall_imp', 'fall']
    ]
    for feat in feat_names:
        df[feat + '_tf_smooth'] = df_tf[feat]
    return df
Example #12
                    res[f'{feat}_{metric}'].append(test_preds[metric])
                else:
                    res[f'{feat}_{metric}'].append(test_preds[metric][0])
                    
    res_df = pd.DataFrame(res, index=datasets)
    res_df.to_pickle('results/regression/regression_results.pkl')
            
if __name__ == '__main__':
    
    print("loading data...")
    all_data = load_all_datasets()
    print("training regression models...")
    df = all_data[orig_dset]
    df_train = df[df.cell_num.isin(config.DSETS[orig_dset]['train'])] 
    df_train = df_train[df_train['valid'] == 1]
    feat_names = data.get_feature_names(df_train)
    feat_names = [x for x in feat_names 
                      if not x.startswith('sc_') 
                      and not x.startswith('nmf_')
                      and not x in ['center_max', 'left_max', 'right_max', 'up_max', 'down_max',
                                   'X_max_around_Y_peak', 'X_max_after_Y_peak', 'X_quantiles',
                                   'X_d1', 'X_d2', 'X_d3', 'slope_end'
                                   ]
                      and not x.startswith('pc_')
                      and not 'log' in x
                      and not 'binary' in x
        #               and not 'slope' in x
                     ]
    train_and_save_models(df_train, feat_names, outcome_def)
    print("testing regression models...")
    test_and_save_results()
Example #13
import os
from os.path import join as oj

import config
import data
from src import train
# from src.viz import *

if __name__ == '__main__':
    # some settings
    outcome_def = 'y_consec_thresh'
    out_dir = oj('/scratch/users/vision/chandan/abc', 'nov16')
    dset_key = 'clath_aux+gak_a7d2'
    dset = config.DSETS[dset_key]
    binarize = False  # True

    # get data
    df = data.get_data(dset=dset_key)
    df = df[df['valid']]  # exclude test cells, short/long tracks, hotspots
    feat_names = data.get_feature_names(df)
    feat_names = data.select_final_feats(feat_names, binarize=binarize)
    print('num feats', len(feat_names))
    print(feat_names)

    # run
    os.makedirs(out_dir, exist_ok=True)
    feature_selection_nums = [2, 3, 4, 5, 6, 7, 8]  # number of features to select
    # other values tried: 9, 15, len(feat_names); [3, 5, 7, 12, 16]; [4, 9, 11, 23, 35, 39]
    for calibrated in [True, False]:
        for feature_selection_num in feature_selection_nums:
            for feature_selection in ['select_lasso', 'select_rf']:  # options: select_lasso, select_rf, None
                if feature_selection is None and feature_selection_num > feature_selection_nums[0]:  # don't do extra computation