def test_read(self):
    # A well-formed file must parse into the same entries as the fixture
    # held in self.example_gmt.
    good_path = os.path.join(FUNCTIONAL_TESTS_DIR, "test.gmt")
    parsed = gmt.read(good_path)
    expected = self.example_gmt
    self.assertEqual(len(expected), len(parsed))
    for position in (0, 1):
        self.assertEqual(expected[position], parsed[position])

    # Each malformed file must be rejected with an AssertionError whose
    # message contains the fragment paired with it below.
    malformed_cases = (
        ("test_bad.gmt", "3 tab-delimited items. line_num: 0"),
        ("test_bad2.gmt", "same set. line_num: 1"),
    )
    for file_name, message_fragment in malformed_cases:
        bad_path = os.path.join(FUNCTIONAL_TESTS_DIR, file_name)
        with self.assertRaises(AssertionError) as raised:
            gmt.read(bad_path)
        self.assertIn(message_fragment, str(raised.exception))
def test_write(self):
    # Round-trip check: write the fixture with gmt.write, read it back with
    # gmt.read, and compare length plus the first two entries.
    out_path = os.path.join(FUNCTIONAL_TESTS_DIR, "test_write.gmt")
    try:
        gmt.write(self.example_gmt, out_path)
        self.assertTrue(os.path.exists(out_path))

        read_back_in = gmt.read(out_path)
        self.assertEqual(len(self.example_gmt), len(read_back_in))
        self.assertEqual(self.example_gmt[0], read_back_in[0])
        self.assertEqual(self.example_gmt[1], read_back_in[1])
    finally:
        # Cleanup must run even when an assertion above fails; otherwise the
        # generated file is left behind in the fixtures directory.  The
        # existence guard keeps a failed write from being masked by a
        # secondary error from os.remove.
        if os.path.exists(out_path):
            os.remove(out_path)
def _ensure_dir(path):
    """Create ``path`` (and any missing parents) unless it already exists."""
    if not os.path.exists(path):
        os.makedirs(path)


def _column_enrichment_scores(frame, rids):
    """Return one ``compute_wtcs`` score per usable column of ``frame``.

    For each column, NaN rows are dropped, the positions of the remaining
    rows whose label is in ``rids`` are collected, and both are handed to
    ``compute_wtcs.compute_wtcs``.  A column is skipped when no row is left
    outside ``rids`` (this includes columns that are entirely NaN).
    """
    scores = []
    for column in frame:
        values = frame[column].dropna()
        hit_positions = np.where(values.index.isin(rids))[0].tolist()
        if len(values) - len(hit_positions) < 1:
            continue
        score, _ = compute_wtcs.compute_wtcs(values, pd.Series(hit_positions))
        scores.append(score)
    return scores


def _ecdf_heights(ecdf, scores):
    """Return, for each score, ``ecdf.y`` one slot before its insertion point in ``ecdf.x``."""
    return [ecdf.y[bisect.bisect_left(ecdf.x, score) - 1] for score in scores]


def wtks(gct, metadata, outfolder, gmt_path='/Users/elemire/Workspace/merino/full_sensitivities.gmt', group_col='weave_prefix'):
    """Score expected sensitivities per group and write plots and rank tables.

    For every distinct non-null value of ``metadata[group_col]`` the function
    creates ``<outfolder>/<group>/`` with ``enrichment_ecdf`` and
    ``mountain_plots`` subfolders, then for each entry of the map returned by
    ``make_sqi_map`` whose leading ``_``-separated token is one of the group's
    ``pert_id`` values:

    * computes an enrichment score with ``compute_wtcs.compute_wtcs`` and
      saves a mountain plot through ``plot_enrichment_score``;
    * scores every column of the group's data (rows listed in ``invariants``
      removed) against the same row-id set, builds an ECDF of those scores,
      and records where the entry's own score falls in it;
    * saves an ECDF figure that also marks the ``trt_poscon`` columns.

    Per-group ranks go to ``<group>_expected_sensitivity_ranks.txt`` and the
    combined table to ``<outfolder>/expected_sensitivity_ranks.txt`` (both
    tab-separated).

    Args:
        gct: object exposing a ``data_df`` DataFrame; its columns are used to
            select rows of ``metadata``.
        metadata: DataFrame indexed by the column ids of ``gct.data_df``; the
            columns ``group_col``, ``pert_id``, ``pert_iname`` and
            ``pert_type`` are read.
        outfolder: directory that receives all output.
        gmt_path: GMT file passed to ``gmt.read``.  NOTE(review): the default
            is an absolute path on one developer's machine, so callers
            should pass this explicitly.
        group_col: metadata column whose values define the groups.

    Returns:
        None.  All results are written to disk.

    NOTE(review): ``invariants`` is not defined in this function and is
    expected to come from module scope -- confirm it exists.  Keys of the
    ``make_sqi_map`` result are assumed to look like ``<pert_id>_<dose>``,
    based on how they are split and labelled below.
    """
    gmt_file = gmt.read(gmt_path)
    sensitivity_map = reformat_gmt(gmt_file)
    metadata = metadata.loc[gct.data_df.columns]
    expected_sensitivity_ranks = []

    for rep in metadata[group_col].dropna().unique():
        rep_folder = os.path.join(outfolder, rep)
        ecdf_folder = os.path.join(rep_folder, 'enrichment_ecdf')
        mountain_folder = os.path.join(rep_folder, 'mountain_plots')
        # Ensure each directory on its own: a group folder left over from an
        # interrupted run may be missing its subfolders.
        for folder in (rep_folder, ecdf_folder, mountain_folder):
            _ensure_dir(folder)

        ids = metadata[metadata[group_col] == rep].index
        data = gct.data_df[ids]
        data.index = [str(x) for x in data.index]
        col_meta = metadata.loc[ids]

        # Loop-invariant pieces, computed once per group instead of per key.
        known_pert_ids = set(col_meta['pert_id'].tolist())
        pos_dex = col_meta[col_meta['pert_type'] == 'trt_poscon'].index.tolist()
        bortez = data[pos_dex]
        background = data[~data.index.isin(invariants)]
        # The ctl_vehicle (DMSO) columns used to be scored here as well, but
        # the scatter that displayed them is disabled, so that work is gone.

        marks = {}
        s_qi_map = make_sqi_map(sensitivity_map, data=data, col_meta=col_meta)
        for key in s_qi_map:
            key_parts = key.split('_')
            brd = key_parts[0]
            if brd not in known_pert_ids:
                continue
            ranked_values = s_qi_map[key][0]
            set_positions = s_qi_map[key][1]
            # Explicit length checks: these may be array-likes whose
            # truthiness is ambiguous.
            if len(ranked_values) == 0 or len(set_positions) == 0:
                continue

            dose = key_parts[1]
            # .iloc[0] takes the first matching row by position; plain [0]
            # on a label-indexed Series relies on a deprecated fallback.
            pert_iname = col_meta[col_meta['pert_id'] == brd]['pert_iname'].iloc[0]

            # Enrichment score for this entry and its mountain plot.
            sensitivity_score, cumsum = compute_wtcs.compute_wtcs(
                pd.Series(ranked_values), pd.Series(set_positions))
            plot_enrichment_score.plot_enrichment_score(
                sensitivity_score,
                cumsum,
                title='Enrichment Score of {}, Set Size = {}'.format(
                    pert_iname, len(set_positions)),
                outfile=os.path.join(
                    mountain_folder,
                    '{}_{}_dose'.format(pert_iname, dose) + '.png'))

            # Scores of the positive-control columns and of every column in
            # the group, all against the same row-id set.
            rids = sensitivity_map[brd]
            bortezomib_scores = _column_enrichment_scores(bortez, rids)
            all_scores = _column_enrichment_scores(background, rids)

            ecdf = ECDF(all_scores)
            mark = bisect.bisect_left(ecdf.x, sensitivity_score)
            if mark == len(ecdf.y):
                # Score lies beyond the last ECDF point: clamp to the end.
                mark -= 1
            marks[pert_iname + '_' + str(dose)] = mark

            poscons_y = _ecdf_heights(ecdf, bortezomib_scores)

            fig = plt.figure()
            plt.plot(ecdf.x, ecdf.y)
            plt.scatter(bortezomib_scores, poscons_y, marker='o', color='g',
                        label='Bortezomib')
            plt.scatter(sensitivity_score, ecdf.y[mark], marker='o', color='r',
                        label='Sensitivity {}_dose'.format(dose))
            plt.xlabel(
                'Enrichment Score for {} Sensitivities Compound Rank = {}'
                .format(pert_iname, mark))
            plt.ylabel('Fraction of Compounds')
            plt.title(
                'ECDF of Enrichment Score by Compound, Set Size = {}'.format(
                    len(set_positions)))
            axes = plt.gca()
            # Exclude -inf entries of ecdf.x when picking the lower x-limit.
            axes.set_xlim(
                [np.nanmin(ecdf.x[ecdf.x != -np.inf]), max(ecdf.x)])
            axes.set_ylim([0, 1])
            axes.legend(bbox_to_anchor=(0., 0.8, 0.8, .102), loc=3,
                        borderaxespad=0.)
            plt.savefig(
                os.path.join(
                    ecdf_folder,
                    '{}_Competitive_Enrichment_ECDF_{}_dose.png'.format(
                        pert_iname, dose)))
            # Close rather than just clear: one figure is opened per key and
            # would otherwise stay alive for the whole run.
            plt.close(fig)

        plate_summary = pd.Series(marks, name=rep)
        pd.DataFrame(plate_summary).to_csv(
            os.path.join(rep_folder,
                         '{}_expected_sensitivity_ranks.txt'.format(rep)),
            sep='\t')

        # The combined table carries the group label next to its ranks.
        summary_row = dict(marks)
        summary_row['det_plate'] = rep
        expected_sensitivity_ranks.append(summary_row)

    summary = pd.DataFrame(expected_sensitivity_ranks)
    summary.set_index('det_plate', inplace=True)
    summary.to_csv(os.path.join(outfolder, 'expected_sensitivity_ranks.txt'),
                   sep='\t')