Exemple #1
0
def merge_seq_with_mpra(dataset):
    ''' Attach the DNA sequence around each variant to the MPRA train and
        test sets.

        The sequences are read into a dataframe with convert_seq_fa_to_df and
        given a 'pos' column computed as pos1 + 9 (presumably the offset from
        the start of the sequence window to the variant -- confirm). Each MPRA
        split is then inner-joined with the sequences on the columns they have
        in common (chr and pos for the column subsets selected here).

        dataset: identifier forwarded to load_train_set / load_test_set.

        Returns a (train, test) tuple of merged dataframes. '''
    mpra_cols = ['chr', 'pos', 'rs', 'Label']
    seq_cols = ['chr', 'pos', 'seq']

    splits = (load_train_set(dataset), load_test_set(dataset))

    sequences = convert_seq_fa_to_df(os.path.join(cfg.SEQ_DIR, SeqDatasets))
    sequences['pos'] = sequences['pos1'] + 9
    seq_subset = sequences[seq_cols]

    # no explicit `on`: pandas joins on the columns shared by both frames
    train_seq, test_seq = (
        pd.merge(split[mpra_cols], seq_subset, how='inner') for split in splits
    )
    return train_seq, test_seq
Exemple #2
0
def add_valley_scores(dataset='E116'):
    ''' Merge the valley scores of each variant with the MPRA train and test
        sets.

        Reads '<dataset>_valley.csv' from the 'bigwig' folder under
        cfg.DATA_DIR. The first column of that file is renamed to 'variant'
        and every other column gets a 'val' prefix; the table is then joined
        to each MPRA split by matching the split's 'rs' column against
        'variant'.

        dataset: identifier used both to load the train/test sets and to pick
            the valley file. Defaults to 'E116', the value that used to be
            hard-coded inside the function (the argument was previously
            overwritten and therefore ignored).

        Returns a (train, test) tuple of merged dataframes. '''
    train = load_train_set(dataset)
    test = load_test_set(dataset)

    valley_path = os.path.join(cfg.DATA_DIR, 'bigwig', '{0}_valley.csv'.format(dataset))
    valley = pd.read_csv(valley_path)

    # first column identifies the variant; the rest are prefixed with 'val'
    valley.columns = ['variant'] + ['val' + col for col in valley.columns[1:]]

    mpra_cols = ['chr', 'pos', 'rs', 'Label']
    val_train = pd.merge(train[mpra_cols], valley, left_on='rs', right_on='variant')
    val_test = pd.merge(test[mpra_cols], valley, left_on='rs', right_on='variant')

    return val_train, val_test
Exemple #3
0
    '''
    all_results = pd.DataFrame()
    param_storage = []

    random_param_list = [param_fn() for _ in range(iterations)]
    for iter in range(iterations):
        print('Iteration {0}: '.format(iter))

        params = random_param_list[iter]
        trials = splits_single_model(data, params, mod='mpra')

        params.update(trials)
        param_storage.append(params)

        # periodically save results to file and flush param_storage
        if iter % 2 == 1:
            all_results = pd.concat([all_results, pd.DataFrame(param_storage)])
            all_results.to_csv(outfile, index=False)
            param_storage = []


if __name__ == '__main__':
    # Script entry point: load the E116 training set and hand it to
    # execute_random_search for 50 iterations, drawing each parameter set
    # from get_random_params_vat and passing hparams_vat2.csv (in the
    # configured output directory) as the output file.
    outfile = join(cfg.OUTPUT_DIR, 'hparams_vat2.csv')
    data = load_train_set(dataset='E116')

    execute_random_search(
        data, outfile=outfile, iterations=50, param_fn=get_random_params_vat
    )
# save the figure built earlier in the script
fig.savefig(join(cfg.OUTPUT_DIR, 'f1.png'), bbox_inches='tight', dpi=150)

# scatter of the two t-SNE components, coloured by the 'Predicted' column
# (fit_reg=False turns off lmplot's regression line)
tsne2 = sns.lmplot(
    data=df_tsne,
    x='tsne-1',
    y='tsne-2',
    hue='Predicted',
    palette='Set2',
    fit_reg=False,
)
plt.title('t-SNE components by predicted label, threshold=0.03')

# lmplot returns a grid object; save its underlying figure
fig = tsne2.fig
fig.savefig(join(cfg.OUTPUT_DIR, 'f2.png'), bbox_inches='tight', dpi=150)

# error analysis: per-chromosome avgPR of the scores (top panel) against the
# summed training Label on the same chromosome (bottom panel)
res = pd.read_csv(join(cfg.OUTPUT_DIR, 'scores.csv'))

# chromosomes 1..22, used both as the filter values and as the bar positions
chrs = np.arange(1, 23)

train = load_train_set('E116')
# Sum only the Label column rather than summing every column and then picking
# Label, and align the result to `chrs` by chromosome label so each bar sits
# on its own chromosome; a chromosome absent from the training set gets 0.
# NOTE(review): assumes train['chr'] holds integer chromosome numbers, as the
# `res['chr'] == c` filter below already assumes for res -- confirm.
train_counts = train.groupby('chr')['Label'].sum().reindex(chrs, fill_value=0)

# despite its name, `errs` stores the avgPR value obtained on each chromosome
errs = np.zeros(len(chrs))
for i, c in enumerate(chrs):
    tmp = res[res['chr'] == c]
    errs[i] = avgPR(tmp['Label'], tmp['Score'])

col1 = '#66c2a5'
col2 = '#fc8d62'

# two stacked bar charts sharing the chromosome axis
fig, axes = plt.subplots(nrows=2, sharex=True)
axes[0].bar(chrs, errs, color=col1)
axes[1].bar(chrs, train_counts, color=col2)