def merge_seq_with_mpra(dataset):
    '''
    attaches the DNA sequence around each variant to the MPRA train and test sets

    dataset: identifier handed to load_train_set / load_test_set
    returns: (train_seq, test_seq) -- the 'chr', 'pos', 'rs', 'Label' columns of
        each set inner-joined with the matching 'seq' column
    '''
    mpra_cols = ['chr', 'pos', 'rs', 'Label']
    seq_cols = ['chr', 'pos', 'seq']

    train = load_train_set(dataset)
    test = load_test_set(dataset)

    # NOTE(review): SeqDatasets comes from module scope and is not derived from
    # `dataset` -- confirm it names the intended sequence file.
    seq_df = convert_seq_fa_to_df(os.path.join(cfg.SEQ_DIR, SeqDatasets))

    # 'pos' is the join coordinate; presumably the variant sits 9 bases after
    # the window start 'pos1' -- TODO confirm against the sequence extraction.
    seq_df['pos'] = seq_df['pos1'] + 9

    # No explicit `on`: the frames are joined on the columns they share,
    # which for these selections are 'chr' and 'pos'.
    train_seq, test_seq = (
        split[mpra_cols].merge(seq_df[seq_cols], how='inner')
        for split in (train, test)
    )
    return train_seq, test_seq
def add_valley_scores(dataset='E116'):
    '''
    merges per-variant valley scores with the MPRA train and test sets

    Reads '<dataset>_valley.csv' from the 'bigwig' folder under cfg.DATA_DIR,
    renames its first column to 'variant', prefixes every other column name
    with 'val', and joins the result onto the train and test sets by matching
    'rs' against 'variant' (pandas default inner join).

    dataset: identifier passed to load_train_set / load_test_set and used to
        build the valley file name. Defaults to 'E116', the value that was
        previously hard-coded inside the function and silently replaced
        whatever the caller passed.
    returns: (val_train, val_test) tuple of merged dataframes
    '''
    mpra_cols = ['chr', 'pos', 'rs', 'Label']

    train = load_train_set(dataset)
    test = load_test_set(dataset)

    valley_path = os.path.join(cfg.DATA_DIR, 'bigwig',
                               '{0}_valley.csv'.format(dataset))
    valley = pd.read_csv(valley_path)

    # First column holds the variant id used as the join key; the remaining
    # columns get a 'val' prefix.
    valley.columns = ['variant'] + ['val' + col for col in valley.columns[1:]]

    val_train = pd.merge(train[mpra_cols], valley,
                         left_on='rs', right_on='variant')
    val_test = pd.merge(test[mpra_cols], valley,
                        left_on='rs', right_on='variant')
    return val_train, val_test
''' all_results = pd.DataFrame() param_storage = [] random_param_list = [param_fn() for _ in range(iterations)] for iter in range(iterations): print('Iteration {0}: '.format(iter)) params = random_param_list[iter] trials = splits_single_model(data, params, mod='mpra') params.update(trials) param_storage.append(params) # periodically save results to file and flush param_storage if iter % 2 == 1: all_results = pd.concat([all_results, pd.DataFrame(param_storage)]) all_results.to_csv(outfile, index=False) param_storage = [] if __name__ == '__main__': data = load_train_set(dataset='E116') outfile = join(cfg.OUTPUT_DIR, 'hparams_vat2.csv') execute_random_search(data, outfile=outfile, iterations=50, param_fn=get_random_params_vat)
fig.savefig(join(cfg.OUTPUT_DIR, 'f1.png'), dpi=150, bbox_inches='tight')

# t-SNE scatter coloured by the predicted label
tsne2 = sns.lmplot(x='tsne-1', y='tsne-2', data=df_tsne, hue='Predicted',
                   fit_reg=False, palette='Set2')
plt.title('t-SNE components by predicted label, threshold=0.03')
fig = tsne2.fig
fig.savefig(join(cfg.OUTPUT_DIR, 'f2.png'), dpi=150, bbox_inches='tight')

# error analysis
res = pd.read_csv(join(cfg.OUTPUT_DIR, 'scores.csv'))
train = load_train_set('E116')

# Select 'Label' before aggregating so only that column is summed; summing
# the whole frame first is wasteful and fails on non-summable columns.
train_counts = train.groupby('chr')['Label'].sum()

# avgPR of the scores, computed separately for chromosomes 1..22
chrs = np.arange(1, 23)
errs = np.zeros(22)
for c in range(1, 23):
    tmp = res[res['chr'] == c]
    errs[c - 1] = avgPR(tmp['Label'], tmp['Score'])

col1 = '#66c2a5'
col2 = '#fc8d62'

# Top panel: per-chromosome metric; bottom panel: per-chromosome Label sum.
# NOTE(review): train_counts is plotted positionally against chrs, so this
# assumes train contains exactly chromosomes 1..22 in sorted order -- confirm.
fig, axes = plt.subplots(nrows=2, sharex=True)
axes[0].bar(chrs, errs, color=col1)
axes[1].bar(chrs, train_counts, color=col2)