# Build a '.'-separated copy of each distil_id, use it as the row index,
# and save the annotation table as a tab-separated file.
cellFrm['mod_sig_id'] = cellFrm['distil_id'].str.replace(':', '.')
cellFrm.index = cellFrm['mod_sig_id']
cellFrm.to_csv(outF, sep='\t')

### make gene signature groups - gmt file
# One gmt entry per x_mutation_status value; the members of each entry are
# the row index labels (mod_sig_id) of the rows in that group.
# geneGrped = cellFrm.groupby('pert_mfc_desc')
geneGrped = cellFrm.groupby('x_mutation_status')
gmtList = []
for status, members in geneGrped:
    gmtList.append({
        'id': status,
        # desc is the string form of the distinct status values in the group
        # (previously: desc = group key)
        'desc': str(list(set(members['x_mutation_status']))),
        'sig': list(members.index.values),
    })
gmtOut = cellDir + '/mutation_status_oe_sig_id.gmt'
gmt.write(gmtList, gmtOut)

#########################
### Run NMF projection ##
#########################
# COMPZ.MODZ_SCORE
nComponents = 20
# Per-cell-line dimension labels.
# NOTE(review): the 'n<count>x978' strings look like matrix-size tags used in
# file names -- confirm against the code that consumes dimDict.
dimDict = {
    'A549': 'n4487x978',
    # 'AALE': 'n2235x978',
    'H1299': 'n1503x978',
    'SALE': 'n2128x978',
}
# ZSPCINF
### add lines for gct headers
# Two header lines are added to outFile: "<n rows>\t<n columns - 1>" and the
# "#1.2" version tag.
# NOTE(review): presumably line_pre_adder inserts at the top of the file, so
# the "#1.2" line added last ends up first -- confirm against line_pre_adder.
line_pre_adder(outFile, str(mtrx.shape[0]) + '\t' + str(mtrx.shape[1] - 1))
line_pre_adder(outFile, "#1.2")

### make gmts of gene shRNAs
# One gmt entry per pert_id; members are the sig_ids annotated with that pert_id.
geneGrped = annt.groupby('pert_id')
gmtList = [
    {'id': pert_id, 'desc': pert_id, 'sig': list(members.sig_id.values)}
    for pert_id, members in geneGrped
]
# gmtOut = wkdir + '/gene_shRNA_sig_id.gmt'
gmtOut = wkdir + '/gene_oe_sig_id.gmt'
gmt.write(gmtList, gmtOut)

### load core drivers - save sig_ids to new gmt
gFile = wkdir + '/core_lung_drivers.gmt'
coreGMT = gmt.read(gFile)
# gmt.read yields a list of {'id', 'desc', 'sig'} entries (it is iterated and
# indexed that way wherever else it is used), so the members have to be
# gathered from each entry; indexing the list itself with 'sig' raises
# TypeError.
coreOE = [pert_id for entry in coreGMT for pert_id in entry['sig']]
coreFrm = annt[annt.pert_id.isin(coreOE)]
sig_ids = list(coreFrm.sig_id.values)
gmtDict = {
    'id': 'core_lung_drivers',
    'desc': 'core_lung_drivers',
    'sig': sig_ids,
}
gmtOut = wkdir + '/core_lung_drivers_sig_id.gmt'
gmt.write([gmtDict], gmtOut)
# Load the self-connectivity table for the n368 compound-target groups and
# keep the rows whose median_rankpt is at or above the threshold.
cpd_targets_n368_file = '/xchip/cogs/sig_tools/sig_cliqueselect_tool/sample/cpd_targets_n368/summly/self_connectivity.txt'
n368 = pd.read_csv(cpd_targets_n368_file, sep='\t')
median_rnkpt_thresh = 73
passes_thresh = n368['median_rankpt'] >= median_rnkpt_thresh
cp_connected = n368[passes_thresh]

# load in clique annotations and matrix
cFile = '/xchip/cogs/projects/pharm_class/rnwork/cliques/cpd_targets_n368.gmt'
cliqueGMT = gmt.read(cFile)
cliqFrm = pd.DataFrame(cliqueGMT)
# limit only to drug-gene groups that have coherence
in_connected = cliqFrm['id'].isin(cp_connected['group_id'])
cliqFrm = cliqFrm[in_connected]

# write a new, shorter gmt file
# NOTE(review): the frame above is filtered on 'id' while the gmt entries are
# matched on 'desc'; the two agree only if desc values are unique per group.
keptDescs = cliqFrm['desc'].values
gmtUpdate = []
for geneSet in cliqueGMT:
    if geneSet['desc'] in keptDescs:
        gmtUpdate.append(geneSet)
outF = basedir + '/n69_drug_targets.gmt'
gmt.write(gmtUpdate, outF)

### set parameters
probeSpace = 'lm_epsilon'  # lm_epsilon or bing
nDMSO = 50
nKeep = 2  # number of signatures per drug

for cell in cellList:
    print(cell)
    # per-cell working directory: <basedir>/<cell>_drug_c9_<probeSpace>
    prefix = cell + '_drug_c9_' + probeSpace
    wkdir = basedir + '/' + prefix
    if not os.path.exists(wkdir):
        os.mkdir(wkdir)
    # set grouping structures: group id -> set of its members
    pclDict = {}
    for _, row in cliqFrm.iterrows():
        pclDict[row['id']] = set(row['sig'])
# Write the matrix as a tab-separated table.
mtrx.to_csv(outFile, sep='\t')

### add lines for gct headers
# Two header lines are added to outFile: "<n rows>\t<n columns - 1>" and the
# "#1.2" version tag.
# NOTE(review): presumably line_pre_adder inserts at the top of the file, so
# the "#1.2" line added last ends up first -- confirm against line_pre_adder.
line_pre_adder(outFile, str(mtrx.shape[0]) + '\t' + str(mtrx.shape[1] - 1))
line_pre_adder(outFile, "#1.2")

### make gmts of gene shRNAs
# One gmt entry per pert_id; members are the sig_ids annotated with that pert_id.
geneGrped = annt.groupby('pert_id')
gmtList = [
    {'id': pert_id, 'desc': pert_id, 'sig': list(members.sig_id.values)}
    for pert_id, members in geneGrped
]
# gmtOut = wkdir + '/gene_shRNA_sig_id.gmt'
gmtOut = wkdir + '/gene_oe_sig_id.gmt'
gmt.write(gmtList, gmtOut)

### load core drivers - save sig_ids to new gmt
gFile = wkdir + '/core_lung_drivers.gmt'
coreGMT = gmt.read(gFile)
# gmt.read yields a list of {'id', 'desc', 'sig'} entries (gmt.write is handed
# the same list-of-dicts shape just above), so the members have to be gathered
# from each entry; indexing the list itself with 'sig' raises TypeError.
coreOE = [pert_id for entry in coreGMT for pert_id in entry['sig']]
coreFrm = annt[annt.pert_id.isin(coreOE)]
sig_ids = list(coreFrm.sig_id.values)
gmtDict = {
    'id': 'core_lung_drivers',
    'desc': 'core_lung_drivers',
    'sig': sig_ids,
}
gmtOut = wkdir + '/core_lung_drivers_sig_id.gmt'
gmt.write([gmtDict], gmtOut)
# NOTE(review): this fragment begins part-way through building the "up" gmt
# entry; gmtDictUp, sig, upProbes, dnProbes and both gmt lists are created
# before the visible code. The original nesting (e.g. an enclosing loop over
# signatures) is not visible here -- confirm indentation against the full file.
gmtDictUp['desc'] = sig
gmtDictUp['sig'] = upProbes
gmtListUp.append(gmtDictUp)
# Dn
# Matching "down" entry: same id/desc as the up entry, with dnProbes as members.
gmtDictDn = {}
gmtDictDn['id'] = sig
gmtDictDn['desc'] = sig
gmtDictDn['sig'] = dnProbes
gmtListDn.append(gmtDictDn)
# make query directory
# Create <sig_path>/cmap_query if it does not exist yet.
queryDir = os.path.join(sig_path,'cmap_query')
if not os.path.exists(queryDir):
    os.mkdir(queryDir)
# write gmt file
# The up and down lists go to two separate gmt files inside queryDir.
gmtOutUp = queryDir + '/EMT_signatures_up.gmt'
gmt.write(gmtListUp,gmtOutUp)
gmtOutDn = queryDir + '/EMT_signatures_dn.gmt'
gmt.write(gmtListDn,gmtOutDn)
### run cmap query
# Assemble the sig_query_tool command line as one space-joined string and run
# it through the shell; the exit status returned by os.system is not checked.
metric = 'wtcs'
cmd = ' '.join(['rum -q local -f sig_query_tool',
    '--uptag ' + gmtOutUp,
    '--dntag ' + gmtOutDn,
    '--metric ' + metric,
    '--row_space full',
    '--column_space gold',
    '--out ' + queryDir,
    '--mkdir false',
    '--save_tail false'])
os.system(cmd)
# NOTE(review): this fragment begins part-way through building the "up" gmt
# entry; gmtDictUp, sig, sigPrefix, upProbes, dnProbes and both gmt lists are
# created before the visible code. The original nesting (e.g. an enclosing
# loop over signatures) is not visible here -- confirm indentation against the
# full file.
gmtDictUp['desc'] = sigPrefix
gmtDictUp['sig'] = upProbes
gmtListUp.append(gmtDictUp)
# Dn
# Matching "down" entry: id is sig, desc is sigPrefix, members are dnProbes.
gmtDictDn = {}
gmtDictDn['id'] = sig
gmtDictDn['desc'] = sigPrefix
gmtDictDn['sig'] = dnProbes
gmtListDn.append(gmtDictDn)
# make query directory
# Create <sig_path>/cmap_query if it does not exist yet.
queryDir = os.path.join(sig_path,'cmap_query')
if not os.path.exists(queryDir):
    os.mkdir(queryDir)
# write gmt file
# Output files are named after sig: <sig>_up.gmt and <sig>_dn.gmt.
gmtOutUp = os.path.join(queryDir,sig + '_up.gmt')
gmt.write(gmtListUp,gmtOutUp)
gmtOutDn = os.path.join(queryDir,sig + '_dn.gmt')
gmt.write(gmtListDn,gmtOutDn)
### run cmap query
# Assemble the sig_query_tool command line as one space-joined string and run
# it through the shell; the exit status returned by os.system is not checked.
metric = 'wtcs'
cmd = ' '.join(['rum -q local -f sig_query_tool',
    '--uptag ' + gmtOutUp,
    '--dntag ' + gmtOutDn,
    '--metric ' + metric,
    '--row_space full',
    '--column_space gold',
    '--out ' + queryDir,
    '--mkdir false',
    '--save_tail false'])
os.system(cmd)
### run summly
# Build a '.'-separated copy of each distil_id_original, use it as the row
# index, and save the annotation table as a tab-separated file.
cellFrm['mod_sig_id'] = cellFrm.distil_id_original.str.replace(':','.')
cellFrm.index = cellFrm.mod_sig_id
cellFrm.to_csv(outF,sep='\t')
### make gene signature groups - gmt file
# Rows are grouped on mtch_field; each group becomes one gmt entry whose
# members are the row index labels (mod_sig_id) of that group.
mtch_field = 'pert_iname'
geneGrped = cellFrm.groupby(mtch_field)
gmtList = []
for grp in geneGrped:
    # grp is a (group key, sub-frame) pair
    gmtDictUp = {}
    gmtDictUp['id'] = grp[0]
    gmtDictUp['desc'] = grp[0]
    # gmtDictUp['desc'] = str(list(set(grp[1][mtch_field])))
    gmtDictUp['sig'] = list(grp[1].index.values)
    gmtList.append(gmtDictUp)
gmtOut = cellDir + '/actomyosin_kd_distil_id.gmt'
gmt.write(gmtList,gmtOut)
#########################
### Run NMF projection ##
#########################
# COMPZ.MODZ_SCORE
nComponents = 20
# Disabled alternative that derived the dimension labels from the data:
# dimDict = {}
# for grp in cell_grped:
#     dimDict[grp[0]] = 'n'+str(grp[1].shape[0])+'x978'
# Hard-coded per-cell-line dimension labels.
# NOTE(review): this literal continues past the visible part of the file.
dimDict = {'A375': 'n1684x978',
    'A549': 'n1410x978',
    'ASC': 'n260x978',
    'HA1E': 'n1445x978',
    'HCC515': 'n1163x978',
# Keep the dose_set entries whose length is greater than 3.
dose_len = dose_set.apply(len)
is_at_dose = dose_len.gt(3)
cps_at_dose = dose_set[is_at_dose]
# which PCL members are at dose?
in_pcl = cps_at_dose.index.isin(brdAllGroups)
PCL_members_dose = cps_at_dose[in_pcl]

### make dose GMT
# Trim every clique to its members found in PCL_members_dose and drop the
# cliques left empty. The clique dicts are updated in place, as before.
dosed_index = PCL_members_dose.index
new_gmt = []
for clique in cliqueGMT:
    dosed_members = [brd for brd in clique['sig'] if brd in dosed_index]
    if len(dosed_members) == 0:
        continue
    clique['sig'] = dosed_members
    new_gmt.append(clique)
cFile = source_dir + '/PCL_compounds_at_dose.gmt'
gmt.write(new_gmt, cFile)

# load in new file
cliqueGMT = gmt.read(cFile)
cliqFrm = pd.DataFrame(cliqueGMT)

# set grouping structures: group id -> set of its members
pclDict = {}
for _, row in cliqFrm.iterrows():
    pclDict[row['id']] = set(row['sig'])

# create list of all compounds members (de-duplicated, plus 'DMSO')
all_members = [brd for group in pclDict for brd in pclDict[group]]
all_members.append('DMSO')
brdAllGroups = list(set(all_members))
testGroups = cliqFrm['id'].values
# Read the n368 self-connectivity table and select the rows whose
# median_rankpt reaches the threshold.
cpd_targets_n368_file = '/xchip/cogs/sig_tools/sig_cliqueselect_tool/sample/cpd_targets_n368/summly/self_connectivity.txt'
n368 = pd.read_csv(cpd_targets_n368_file, sep='\t')
median_rnkpt_thresh = 73
cp_connected = n368[n368['median_rankpt'] >= median_rnkpt_thresh]

# load in clique annotations and matrix
cFile = '/xchip/cogs/projects/pharm_class/rnwork/cliques/cpd_targets_n368.gmt'
cliqueGMT = gmt.read(cFile)
# limit only to drug-gene groups that have coherence
allCliques = pd.DataFrame(cliqueGMT)
cliqFrm = allCliques[allCliques['id'].isin(cp_connected['group_id'])]

# write a new, shorter gmt file
# NOTE(review): the frame is filtered on 'id' but the gmt entries are matched
# on 'desc'; these select the same groups only if desc is unique per group.
descsToKeep = cliqFrm['desc'].values
gmtUpdate = [entry for entry in cliqueGMT if entry['desc'] in descsToKeep]
outF = basedir + '/n69_drug_targets.gmt'
gmt.write(gmtUpdate, outF)

### set parameters
probeSpace = 'lm_epsilon'  # lm_epsilon or bing
nDMSO = 50
nKeep = 2  # number of signatures per drug

for cell in cellList:
    print(cell)
    # working directory for this cell line, created on first use
    prefix = cell + '_drug_c9_' + probeSpace
    wkdir = basedir + '/' + prefix
    if not os.path.exists(wkdir):
        os.mkdir(wkdir)
    # set grouping structures: group id -> set of its members
    pclDict = {}
    for _, clique in cliqFrm.iterrows():
        pclDict[clique['id']] = set(clique['sig'])