def fpr_calc_parallel(inSum, outSum, dmsoSum, matrixType, rnkptRange, graph=True, fpr_max=False):
    '''
    -calculate the rate of false positives for bioactive signatures vs. DMSO,
     one worker task per threshold, in parallel

    One task is submitted to _cdf_worker for every integer threshold in
    0..99; each task receives the tuple (inSum, dmsoSum, threshold). The
    per-task results are concatenated column-wise, keyed by each result's
    ``name`` attribute.

    Parameters:
    -----------
    inSum :
        first element of every worker tuple
    dmsoSum :
        second element of every worker tuple
    outSum, matrixType, rnkptRange, graph, fpr_max :
        accepted for signature compatibility; not read by this function
        NOTE(review): thresholds are hard-coded to range(0, 100) instead of
        rnkptRange, and fpr_max is not forwarded to the worker - confirm
        that this is intended.

    Returns:
    --------
    fpFrame : pandas DataFrame
        column-wise concatenation of the worker results
    '''
    #### false positive calculation in parallel ####
    n_procs = 30
    tupList = [(inSum, dmsoSum, x) for x in range(0, 100)]
    prog = update.DeterminateProgressBar('self-connection graph builder')
    pool = multiprocessing.Pool(n_procs)
    try:
        rs = pool.map_async(_cdf_worker, tupList)
        pool.close()  # No more work
        while not rs.ready():
            # _number_left is a private attribute of the async result object
            remaining = rs._number_left
            prog.show_message(
                'SVM evaluation - {0} tasks to complete'.format(remaining))
            time.sleep(0.1)
        results = rs.get()
    finally:
        # reap the worker processes whether the map succeeded or raised
        # (including an interrupt while polling)
        pool.terminate()
        pool.join()
    fpFrame = pd.concat(results, axis=1, keys=[s.name for s in results])
    return fpFrame
def read_gctx_row_meta(self, src, row_inds=None, verbose=True):
    '''
    read the row meta data from the file given in src. If row_inds is given,
    only those rows specified are read.

    One "row" meta table is created whose headers are 'ind' followed by the
    name of every entry in self.row_data; one table row is added per
    requested index, holding the index and the right-stripped string value
    of each field at that index.

    Parameters
    ----------
    src : passed to self._open_gctx
    row_inds : sequence of int, optional
        row indices to read. None or an empty sequence reads every row.
        numpy index arrays are accepted.
    verbose : bool
        show a progress bar while reading
    '''
    #open an update indicator
    if verbose:
        progress_bar = update.DeterminateProgressBar('GCTX_READER')

    #open the gctx file
    self._open_gctx(src)
    try:
        #set up the indices. An explicit length test is used instead of
        #truthiness so that numpy index arrays do not raise.
        if row_inds is None or len(row_inds) == 0:
            row_inds = range(len(self.row_id_node))

        #read in the row meta data
        row_headers = ['ind'] + [x.name for x in self.row_data]
        self._add_table_to_meta_db("row", row_headers)

        num_rows = len(row_inds)
        for i, ind in enumerate(row_inds):
            if verbose:
                progress_bar.update('reading row meta data', i, num_rows)
            data_list = [ind]
            data_list.extend(str(column[ind]).rstrip()
                             for column in self.row_data)
            self._add_row_to_meta_table("row", data_list)

        #clear the update indicator
        if verbose:
            progress_bar.clear()
    finally:
        #close the gctx file even when reading failed part-way
        self._close_gctx()
def read_gctx_col_meta(self, src, col_inds=None, verbose=True):
    '''
    read the column meta data from the file given in src. If col_inds is
    given, only those columns specified are read.

    One "col" meta table is created whose headers are 'ind' followed by the
    name of every entry in self.column_data; one table row is added per
    requested index, holding the index and the right-stripped string value
    of each field at that index.

    Parameters
    ----------
    src : passed to self._open_gctx
    col_inds : sequence of int, optional
        column indices to read. None or an empty sequence reads every
        column. numpy index arrays are accepted.
    verbose : bool
        show a progress bar while reading
    '''
    #open an update indicator
    if verbose:
        progress_bar = update.DeterminateProgressBar('GCTX_READER')

    #open the gctx file
    self._open_gctx(src)
    try:
        #set up the indices. An explicit length test is used instead of
        #truthiness so that numpy index arrays do not raise.
        if col_inds is None or len(col_inds) == 0:
            col_inds = range(len(self.column_id_node))

        #read in the column meta data
        column_headers = ['ind'] + [x.name for x in self.column_data]
        self._add_table_to_meta_db("col", column_headers)

        #progress is measured against the number of columns actually being
        #read, using the loop position rather than the column index, so the
        #bar is correct when only a subset is requested
        num_cols = len(col_inds)
        for i, col_ind in enumerate(col_inds):
            if verbose:
                progress_bar.update('reading column meta data', i, num_cols)
            data_list = [col_ind]
            data_list.extend(str(column[col_ind]).rstrip()
                             for column in self.column_data)
            self._add_row_to_meta_table("col", data_list)

        #clear the update indicator
        if verbose:
            progress_bar.clear()
    finally:
        #close the gctx file even when reading failed part-way
        self._close_gctx()
def _read_gct(self, src, verbose=True, frame=True):
    '''
    reads tab delimited gct file

    Layout consumed, in order:
      line 1 : version string (first field is stored in self.version)
      line 2 : four integers - number of data rows, number of data columns,
               number of row-meta columns, number of column-meta rows
      line 3 : titles; the first (row-meta count + 1) fields are row meta
               headers, the remainder are the column ids
      next   : one line per column-meta field (field name first)
      rest   : one line per data row (row meta fields, then the values)

    Populates self.src, self.version, self.matrix, the 'row' and 'col' meta
    tables and, when frame is True, self.frame.

    Parameters
    ----------
    src : str
        path of the gct file
    verbose : bool
        show a progress bar while the data rows are read
    frame : bool
        also build self.frame, a DataFrame of the matrix indexed by the
        row and column 'id' meta fields
    '''
    #open a update indicator
    if verbose:
        progress_bar = update.DeterminateProgressBar('GCT_READER')

    #the csv module wants a binary file on Python 2 and a text file on
    #Python 3; ``str is bytes`` is only true on Python 2
    mode = 'rb' if str is bytes else 'r'

    #the with-block guarantees the file is closed, also on a parse error
    with open(src, mode) as f:
        reader = csv.reader(f, delimiter='\t')
        self.src = src

        #read the gct file header information and build the empty
        #self.matrix array for later use
        self.version = next(reader)[0]
        dims = next(reader)
        num_rows = int(dims[0])
        num_cols = int(dims[1])
        self.matrix = numpy.ndarray([num_rows, num_cols])

        #index of the first data field on every line: the id field plus
        #the row-meta fields come before it
        data_start = int(dims[2]) + 1
        num_col_meta = int(dims[3])

        #parse the first line to get sample names and row meta_data headers
        titles = next(reader)
        cid = titles[data_start:]
        row_meta_headers = titles[:data_start]
        row_meta_headers.insert(0, 'ind')
        self._add_table_to_meta_db('row', row_meta_headers)

        #parse the _meta data for the columns
        col_meta_array = [[ii, c] for ii, c in enumerate(cid)]
        col_meta_headers = ['ind', 'id']
        for _ in range(num_col_meta):
            tmp_row = next(reader)
            col_meta_headers.append(tmp_row[0])
            for ii, item in enumerate(tmp_row[data_start:]):
                col_meta_array[ii].append(item)
        self._add_table_to_meta_db('col', col_meta_headers)
        for item in col_meta_array:
            self._add_row_to_meta_table('col', item)

        #parse the meta_data for the rows and store the data matrix
        for ii, row in enumerate(reader):
            row_meta_tmp = row[:data_start]
            row_meta_tmp.insert(0, ii)
            self._add_row_to_meta_table('row', row_meta_tmp)
            self.matrix[ii] = row[data_start:]
            if verbose:
                progress_bar.update('reading gct file: ', ii, num_rows)

    if verbose:
        progress_bar.clear()

    #populate a data frame
    if frame:
        self.frame = pd.DataFrame(self.matrix,
                                  index=self.get_row_meta('id'),
                                  columns=self.get_column_meta('id'))
def rates_of_DMSO_connections(inSum, outSum, dmsoSum, matrixType, rnkptRange, graph=True, fpr_max=False):
    '''
    -calculate the rate of false positives for bioactive signatures vs. DMSO

    For every threshold in rnkptRange and every row:
        observed rate = fraction of inSum columns   >= threshold
        dmso rate     = fraction of dmsoSum columns >= threshold
        false positive rate = dmso rate / observed rate
    A row with an observed rate of zero therefore divides by zero; no
    special handling is applied to the resulting inf/NaN values.

    Parameters:
    -----------
    inSum : pandas DataFrame
        observed values, one row per signature
    dmsoSum : pandas DataFrame
        DMSO values, rows matching inSum
    rnkptRange : sequence
        thresholds to evaluate; each becomes one column of the result
    fpr_max: bool
        if false positive rate is above 1, set to 1
    outSum, matrixType, graph :
        accepted for signature compatibility; not read by this function

    Returns:
    --------
    fpFrame : pandas DataFrame
        rows x thresholds matrix of false positive rates (an empty frame
        when rnkptRange is empty)
    '''
    #### false positive calculation with loop ####
    progress_bar = update.DeterminateProgressBar(
        'connection ratio-calculation')
    n_thresh = len(rnkptRange)
    n_obs = float(inSum.shape[1])
    n_dmso = float(dmsoSum.shape[1])

    # The original also tallied per-threshold counts into dictionaries that
    # were never returned, one of them through a name (fpThresh) that was
    # only defined in a comment; that dead bookkeeping has been removed.
    fp_columns = []
    for ii, rnkpt_thresh in enumerate(rnkptRange):
        progress_bar.update('observed to dmso', ii, n_thresh)
        # observed connection rate
        connRate = np.greater_equal(inSum, rnkpt_thresh).sum(axis=1) / n_obs
        # dmso connection rate
        dConnRate = (dmsoSum >= rnkpt_thresh).sum(axis=1) / n_dmso
        falsePosR = dConnRate / connRate  # dmso / obs
        # if false postive rate is above 1, set to 1
        if fpr_max:
            falsePosR[falsePosR >= 1] = 1
        falsePosR.name = rnkpt_thresh
        fp_columns.append(falsePosR)

    if not fp_columns:
        return pd.DataFrame()
    # a single concat instead of re-building the frame on every iteration;
    # keys are passed explicitly so the columns are labelled by threshold
    return pd.concat(fp_columns, axis=1, keys=[s.name for s in fp_columns])
def build_combine_null(Hmtrx,cliqFrm,topMeanFrm,nTop=3,nPerm=4000):
    '''
    Build a null distribution of the "top component" mean for every group
    size, by repeatedly drawing random signatures of that group size.

    Parameters
    ----------
    Hmtrx : pandas DataFrame
        -matrix of NMF weightings for each signature
         (n_signatures x n_components)
    cliqFrm : pandas DataFrame
        -signature annotations (not read by this function)
    topMeanFrm : pandas DataFrame
        -must provide a group_signature_counts column giving the number
         of signatures in each group
    nTop : int
        -number of top largest components to sort by
    nPerm : int
        -number of random draws per group size

    Returns
    ----------
    nullMean : pandas DataFrame
        -one row per unique group size (index named 'group_size', sorted
         ascending) and one column per permutation
    '''
    # unique group sizes across the input groups, ascending
    uniqueSizes = np.sort(list(set(topMeanFrm.group_signature_counts)))
    nullMean = pd.DataFrame(np.zeros([len(uniqueSizes), nPerm]),
                            index=uniqueSizes)
    nullMean.index.name = 'group_size'

    prog = update.DeterminateProgressBar('group size')
    nSizes = len(nullMean.index)
    for iSize, groupSize in enumerate(nullMean.index):
        prog.update(groupSize, iSize, nSizes)
        for iPerm in range(nPerm):
            # random signatures, drawn without replacement
            drawn = np.random.choice(Hmtrx.index.values, groupSize,
                                     replace=False)
            drawnH = Hmtrx.reindex(drawn)
            # the nTop components with the largest mean weight in this draw
            componentMeans = drawnH.mean()
            topComponents = componentMeans.order(ascending=False).index[:nTop]
            # per-signature sum over those components, then the group mean
            topWeights = drawnH.ix[:, topComponents].sort()
            perSignature = topWeights.sum(axis=1).order(ascending=False)
            nullMean.ix[groupSize, iPerm] = perSignature.mean()
    return nullMean
def read_gctx_col_meta(self, src, col_inds=None, verbose=True):
    '''
    read the column meta data from the file given in src. If col_inds is
    given, only those columns specified are read.

    Every metadata field is read once for all requested columns into a
    fixed-width (400 byte) string array, which is then written to the
    "col" meta table one column at a time. Consequences of that buffer:
    every stored value, including the leading 'ind' entry, is a byte
    string, and values longer than 400 bytes are truncated.

    Parameters
    ----------
    src : passed to self._open_gctx
    col_inds : sequence of int, optional
        column indices to read. None or an empty sequence reads every
        column. numpy index arrays are accepted.
    verbose : bool
        show a progress bar while the table rows are written
    '''
    #open an update indicator
    if verbose:
        progress_bar = update.DeterminateProgressBar('GCTX_READER')

    #open the gctx file
    self._open_gctx(src)
    try:
        #set up the indices. An explicit length test is used instead of
        #truthiness so that numpy index arrays do not raise.
        if col_inds is None or len(col_inds) == 0:
            col_inds = range(len(self.column_id_node))

        #read in the column meta data
        column_headers = ['ind'] + [x.name for x in self.column_data]
        self._add_table_to_meta_db("col", column_headers)

        num_rows = len(col_inds)
        #'S400' is the same fixed-width byte-string dtype as the 'a400'
        #alias, which newer numpy releases deprecate
        meta_data_array = numpy.empty([len(column_headers), num_rows],
                                      dtype=numpy.dtype('S400'))
        meta_data_array[0, :] = [str(x) for x in col_inds]
        for i, column in enumerate(self.column_data):
            #one bulk read per metadata field
            data = column[col_inds]
            meta_data_array[i + 1, :] = [str(x).rstrip() for x in data]

        for i in range(num_rows):
            if verbose:
                progress_bar.update('reading column meta data', i, num_rows)
            self._add_row_to_meta_table("col", list(meta_data_array[:, i]))

        #clear the update indicator
        if verbose:
            progress_bar.clear()
    finally:
        #close the gctx file even when reading failed part-way
        self._close_gctx()
def permutation_template(self,n_permutation=10000):
    '''
    creates null distribution of correlations
    returns correlations, p-values, and FDR correction
    must add_from_gct() before running

    For every perturbagen in self.perts_at_dose and every dose template
    (linear, log10, log2) this builds null distributions of
    template-vs-probe correlations by permutation and derives empirical
    p-values from them.

    NOTE(review): as written this function looks like work in progress.
    It reads several names that are assigned only in commented-out code
    or not at all inside the function (rho_vec before its assignment,
    nPerm, dp, p_vec, templateMatchInd, pvecDictParametric,
    probe_template_corrs). Unless those exist as module-level globals the
    call fails with a NameError/UnboundLocalError. The individual spots
    are flagged below.

    NOTE(review): the source this was recovered from had lost its
    indentation; the nesting below is a best-effort reconstruction and
    should be checked against the original file.

    Parameters
    ----------
    n_permutation : int
        number of permutations used for the null distributions
    '''
    prog = progress.DeterminateProgressBar('template matching')
    gcto = self.gct
    doses = gcto.get_column_meta('pert_dose')
    perts = gcto.get_column_meta('pert_id')
    rids = gcto.get_rids()
    examineList = self.perts_at_dose
    num_perts = len(examineList)
    # templateMatchInd = {} #nested dict for index of significant probes
    # pvecDictParametric = {}
    pvecDictEmpirical = {}
    corr_null_distribution = {}
    # probe_template_corrs = {}
    for icmpd,unique_pert in enumerate(examineList):
        prog.update('template match {0}'.format(unique_pert),icmpd,num_perts)
        # columns whose pert_id contains this perturbagen (substring match)
        cid_inds = [i for i,x in enumerate(perts) if unique_pert in x]
        pert_doses = [float(doses[x]) for x in cid_inds]
        # sort the columns by dose (relies on zip() returning a list, i.e. Python 2)
        tmp_tup = zip(pert_doses,cid_inds)
        tmp_tup.sort()
        pert_doses,cid_inds = zip(*tmp_tup)
        pert_data = gcto.matrix[:,cid_inds]
        template_names = ['linear', 'log10', 'log2']
        # templateMatchInd[unique_pert] = {}
        # pvecDictParametric[unique_pert] = {}
        pvecDictEmpirical[unique_pert] = {}
        corr_null_distribution[unique_pert] = {}
        # probe_template_corrs[unique_pert] = {}
        for istep,step in enumerate(template_names):
            template1 = step
            if step == 'linear':
                template_curve = np.array(pert_doses)
            elif step == 'log10':
                template_curve = np.log10(pert_doses)
            elif step == 'log2':
                template_curve = np.log2(pert_doses)
            else:
                print 'template name error'
            # calculate stats on observation of interest
            # cc_list = [stats.pearsonr(pert_data[x,:],template_curve) for x in range(len(rids))]
            # rho_vec = [cc_list[x][0] for x in range(len(rids))]
            # rho_vec = np.array(rho_vec)
            # p_vec = [cc_list[x][1] for x in range(len(rids))]
            # p_vec = np.array(p_vec)
            # pvecDictParametric[unique_pert][template1] = p_vec
            # probe_template_corrs[unique_pert][template1] = rho_vec

            # run permutations to create null distribution of corr values
            ### full matrix of permutations
            # integer division under Python 2
            nMtrxPerm = n_permutation/len(rids) + 1 #number of matrix permutations needed to reach desired probe perms
            permRhoMtrx = np.zeros((len(rids),nMtrxPerm))
            for perm in range(nMtrxPerm):
                # shuffle the dose order of the columns, then correlate
                # every probe with the template
                iRandObs = range(pert_data.shape[1])
                np.random.shuffle(iRandObs)
                corrs = np.corrcoef(template_curve,pert_data[:,iRandObs])
                # row 0 is the template; entries 1.. are its correlation with each probe
                permRhoMtrx[:,perm] = corrs[0,1:]
                #test to see if two calculations methods are the same to a given precision
                # cc_list = [stats.pearsonr(pert_data[x,iRandObs],template_curve) for x in range(len(rids))] #this takes too long
                # rho_vec = [cc_list[x][0] for x in range(len(rids))]
                # np.allclose(perm_list, np.array(rho_vec),rtol=1e-06)
            #calculate p-value based on null distribution
            # NOTE(review): rho_vec is read here before it is assigned a
            # few lines below, and nPerm is not defined in this function
            grtrMtrx = np.greater(np.abs(permRhoMtrx.T),np.abs(rho_vec))
            null_pVec1 = (1 + np.sum(grtrMtrx, axis=0)) / float(nPerm)
            #compare observed gene to all null genes
            # NOTE(review): dp is not defined in this function - presumably
            # this was meant to read from self; confirm
            rho_vec = dp.probe_template_corrs[unique_pert][step]
            null_pVec = np.zeros_like(rho_vec)
            for igene in range(len(rho_vec)):
                rho = rho_vec[igene]
                # fraction of all permuted correlations stronger than the observed one
                p = np.sum(np.abs(permRhoMtrx.flatten()) > np.abs(rho)) /float(len(permRhoMtrx.flatten()))
                null_pVec[igene] = p
            ## select probe perms1
            # second null distribution: each permuted curve takes the value
            # of a randomly chosen probe at every dose
            num_probes = self.gct.matrix.shape[0]
            probe_inds = range(num_probes)
            perm_cc = []
            for i in range(n_permutation):
                perm_curve_inds = [random.sample(probe_inds,1)[0] for x in range(len(pert_doses))]
                perm_curve = [pert_data[perm_curve_inds[x],x] for x in range(len(pert_doses))]
                perm_covar = np.corrcoef(perm_curve,template_curve)
                perm_cc.append(perm_covar[0][1])
            corr_null_distribution[unique_pert][template1] = perm_cc
            # this overwrites the null_pVec computed from permRhoMtrx above
            null_pVec = np.zeros_like(rho_vec)
            for igene in range(len(rho_vec)):
                rho = rho_vec[igene]
                p = np.sum(np.abs(perm_cc) > np.abs(rho)) /float(len(perm_cc))
                null_pVec[igene] = p
            pvecDictEmpirical[unique_pert][template1] = null_pVec
            ### thresholding
            q = .1 #FDR threshold
            # NOTE(review): p_vec is only assigned in the commented-out
            # parametric block above
            pID, pN = FDR.FDR(p_vec,q) #find FDR threshold
            # a list-typed pID is treated as "no probe passed the threshold"
            if type(pID) == list:
                print unique_pert + 'matching to ' + template1 + ' template - perterbation does not have any significant genes that pass the FDR threshold'
                # NOTE(review): templateMatchInd is only created in
                # commented-out code
                templateMatchInd[unique_pert][template1] = []
                continue
            else:
                pass_fdr = np.less_equal(p_vec,pID)
                ipass_fdr = np.array(range(len(rids)))[pass_fdr] #get indices which pass fdr
                iRhoSort = np.argsort(rho_vec[ipass_fdr])[::-1]
                iRhoSorted_passFDR = ipass_fdr[iRhoSort] #these are indices which pass FDR and are sorted by correlation
                data_pass_fdr = pert_data[iRhoSorted_passFDR,:]
                ordered_rids = [rids[i] for i in iRhoSorted_passFDR]
                templateMatchInd[unique_pert][template1] = iRhoSorted_passFDR
    # NOTE(review): the three names stored below are only created in
    # commented-out code, while the two dictionaries that are actually
    # filled (pvecDictEmpirical, corr_null_distribution) are not stored
    self.templateMatchInd = templateMatchInd
    self.pvecDictParametric = pvecDictParametric
    self.probe_template_corrs = probe_template_corrs
    # self.pvecDictEmpirical = pvecDictEmpirical
    # self.corr_null_distribution = corr_null_distribution
hogTargetDict = {} for brd in hogSet: if brd in targetDict: hogTargetDict[brd] = targetDict[brd] #################################### ### make query with only HOG plates #################################### dg = dgo.QueryTargetAnalysis(out=work_dir) pert_list = list(hogSet) is_gold = False genomic_pert = 'KD' brdCounts = [] fullPertList = [] ### for each drug perturbations of interest - find all instances in CMAP prog = progress.DeterminateProgressBar('perturbation cid query') if pert_list: for i, pert in enumerate(pert_list): prog.update('querying cps', i, len(pert_list)) CM = mu.CMapMongo() if is_gold == True: pert_query = CM.find( { 'sig_id': { '$regex': 'HOG' }, 'pert_id': { '$regex': pert }, 'is_gold': True }, {
ID_pass_vec = np.less_equal(p, ID_thresh_vec) if any(ID_pass_vec): pID = max(p[ID_pass_vec]) else: pID = [] # Nonparametric threshold N_thresh_vec = (I * q) / V / cVN N_pass_vec = np.less_equal(p, N_thresh_vec) if any(N_pass_vec): pN = max(p[N_pass_vec]) else: pN = [] return pID, pN prog = progress.DeterminateProgressBar('Template Heatmaps') cell = 'PC3' tim = '6H' cellLine = cell timeP = tim #load data refControl = 'pc' #use pc vs vc controled data # gctfile = glob.glob('/xchip/obelix/pod/brew/%s/PRISM001_%s_%s/by_pert_id_pert_dose/PRISM001_%s_%s_COMPZ.MODZ_SCORE_LM_*.gctx' % (refControl,cellLine,timeP,cellLine,timeP)) # load in zscore roast data # gctfile = glob.glob('/xchip/obelix/pod/brew/%s/PRISM001_%s_%s/PRISM001_%s_%s_ZSPCQNORM_*.gctx' % (refControl,cellLine,timeP,cellLine,timeP)) # load in brewed by rna well gctfile = glob.glob( '/xchip/obelix/pod/brew_tmp/%s/PRISM001_%s_%s/by_rna_well/PRISM001_%s_%s_COMPZ.MODZ_SCORE_LM_*.gctx' % (refControl, cellLine, timeP, cellLine, timeP)) # load in brewed by rna well - INFERED
# sco = sc.SC() # sco.add_sc_from_gctx_meta(gctfile, verbose=False) # dose = [float(x.split('::')[0].split(':')[2]) for x in sco.pid] # p-value by permutation ''' matches data to a dose template returns p-values, FDR correction must add_from_gct() before running ''' prog = progress.DeterminateProgressBar('template matching') gcto = dp.gct doses = gcto.get_column_meta('pert_dose') perts = gcto.get_column_meta('pert_id') unique_perts = list(set(perts)) rids = gcto.get_rids() examineList = dp.perts_at_dose num_perts = len(examineList) templateMatchInd = {} #nested dict for index of significant probes for icmpd,unique_pert in enumerate(examineList): prog.update('template match {0}'.format(unique_pert),icmpd,num_perts) cid_inds = [i for i,x in enumerate(perts) if unique_pert in x] pert_doses = [float(doses[x]) for x in cid_inds] tmp_tup = zip(pert_doses,cid_inds)
niter = 10000 direction = 'pos' # assert self.row_groups is not None, "Must define row_groups. Call build_groups first." # assert self.col_groups is not None, "Must define col_groups. Call build_groups first." # assert direction in ['pos','neg','both'], "direction must be 'pos','neg',or 'both'" # self.method=method # self.method_ = method summary_list = [] for q in ocl.row_groups.keys(): print "Connecting query group %s" % q # row_vals = ocl.query.pctrank.ix[ocl.row_groups[q]] testStats = [] counts = [] prog = progress.DeterminateProgressBar('row test statistic') for ig, g in enumerate(ocl.col_groups.keys()): prog.update('querying cps', ig, len(ocl.col_groups.keys())) rnk_vals = ocl.query.pctrank.ix[ocl.row_groups[q], ocl.col_groups[g]] testStat = rnk_vals.prod().prod() testStats.append(testStat) count = rnk_vals.count().sum() counts.append(count) csR = ocl.dfCS.ix[brd][ind] cpRank = self.dfRank.ix[brd] cpSmRank = cpRank / 100 # convert ranks back to 0 to 1 nCPs.append(cpRes.shape[0]) meanSer = cpRes.mean() meanRnk = cpRank.mean()
metric='wtcs' is_gold = False if not os.path.exists(work_dir): os.mkdir(work_dir) # dpathwayList = ['PIK3CA', 'PIK3CD', 'PIK3CG', 'MTOR', 'AKT1', 'AKT2', 'PTEN'] pathwayList = ['PIK3C', 'MTOR', 'AKT', 'PTEN'] drug_list = ['BRD-K05756698','BRD-K12184916'] #cgs cell lines CM = mu.CMapMongo() CGSbyCell = CM.find({'pert_type':'trt_sh.cgs'},{'cell_id':True}) cgsCells = list(set(CGSbyCell)) #loop through and write cell IDs prog = progress.DeterminateProgressBar('genomic pert query') for i,cell1 in enumerate(cgsCells): #get all CGS for a cell line prog.update('querying',i,len(cgsCells)) CM = mu.CMapMongo() for gene in pathwayList: CGSbyCell = CM.find({'pert_iname':{'$regex':gene},'pert_type':'trt_sh.cgs','is_gold':True,'cell_id':cell1},{'sig_id':True,'pert_iname':True}) if CGSbyCell: outdir = os.path.join(work_dir,cell1) if not os.path.exists(outdir): os.mkdir(outdir) nCGS = len(CGSbyCell) # sigF = os.path.join(outdir, cell1+ '_genomic_sig_ids_n' + str(nCGS) + '.grp') sigF = os.path.join(outdir, cell1+ '_genomic_sig_ids.grp') with open(sigF, 'a') as f: for sig in CGSbyCell:
def analyze_query(args,work_dir):
    '''
    Analyze the output from query_tool - find self-connections and create graphs

    For every column of the query gct (args.res) the result rows are ordered
    by that column's score, highest first. The positions at which the
    queried compound's own signatures sit in that ordering are drawn as a
    histogram and saved to work_dir as
    <pert_desc>_<pert_dose>_query_rank.png.

    Parameters
    ----------
    args : namespace providing res, result and resultDir
    work_dir : str
        directory searched for the combined result gctx and receiving the
        png files

    Raises
    ------
    ValueError
        if a signature returned by the database query is not a row of the
        result file
    '''
    #make a gct object
    db = gct.GCT()
    db.read(args.res)

    ##load query result - gctx file
    rslt = gct.GCT()
    # NOTE(review): when args.result is set the combined gctx from work_dir
    # is read, otherwise args.resultDir is read - confirm this is not
    # inverted relative to the intent.
    if args.result:
        #select combined result gctx in working dir created from build_query step
        outGctx = glob.glob(os.path.join(work_dir, '*COMBINED*.gctx'))
        rslt.read(outGctx[0])
    else:
        rslt.read(args.resultDir)

    rsltSigID = rslt.get_rids() #sig IDs from result file
    qPert = db.get_column_meta('pert_desc')
    qPertID = db.get_column_meta('pert_id')
    qDose = db.get_column_meta('pert_dose')
    ESmat = rslt.matrix
    #row order per column, highest score first
    iES = ESmat.argsort(axis=0)[::-1]
    #shape is used so that a single-column result also works
    n_inst = iES.shape[0]
    #x-axis limit: number of instances rounded UP to the next 100k, so the
    #axis never cuts off high ranks and never collapses to 0
    x_max = ((n_inst + 99999) // 100000) * 100000

    #loop through each of the perts - graph ranks of query
    prog1 = progress.DeterminateProgressBar('creating self-connection graphs')
    CM = mu.CMapMongo() #one client for all queries
    n_queries = len(qPert)
    for i, cmpd1 in enumerate(qPert):
        prog1.update('graphing {0}'.format(cmpd1),i,n_queries)
        #sig IDs in rank order for this query column
        sSigID = [rsltSigID[y] for y in iES[:,i]]
        #only the first 13 characters of the pert_id are used for the search
        qStr = qPertID[i][0:13]
        dose1 = qDose[i]

        #search for the BRD-xxxxxxxxxxx within the pert_id field in the db
        cmpdSigIds = CM.find({'pert_id':{'$regex':qStr}},{'sig_id':True})
        #where instances of the compound of interest sit on the rank list
        i1 = [sSigID.index(y) for y in cmpdSigIds]
        if len(i1) < 1:
            print(cmpd1 + ' has no instances in the cmap database')
            continue
        i2 = numpy.array(i1)

        #plot
        fname = cmpd1 + '_' + str(dose1) + '_query_rank.png'
        outf = os.path.join(work_dir,fname)
        fig = plt.figure(figsize=(8.0, 2.0))
        ax = fig.add_subplot(111)
        # the histogram of the data
        ax.hist(i2, 30, facecolor='green', alpha=0.75)
        ax.set_xlim(0, x_max)
        ax.set_xlabel('query rank')
        ax.set_ylabel('freq')
        ax.set_title('dose = '+ str(dose1) +'um')
        ax.grid(True)
        plt.savefig(outf, bbox_inches=0)
        #release the figure; otherwise one figure per query stays in memory
        plt.close(fig)
rowMedian = dmsoCM.median(axis=1) def no_diagonal_unstack(frm): 'return an unstacked matrix without the diagonal' np.fill_diagonal(frm.values, np.nan) overlapSer = frm.unstack() overlapSer = overlapSer[~overlapSer.isnull()] #remove nulls return overlapSer ### compare observed to null #construct graph = False pvalDict = {} progress_bar = update.DeterminateProgressBar('group p-val computation') for iicliq, icliq in enumerate(cliqueLabels.index): progress_bar.update('count', iicliq, len(cliqueLabels.index)) cName = cliqueLabels.ix[icliq, 'id'] pIds = cliqueLabels.ix[icliq, 'sig'] smFrm = sFrm.reindex(index=pIds, columns=pIds) uFrm = no_diagonal_unstack(smFrm) medObs = uFrm.median() rMed = rowMedian[pIds] fig = plt.figure(1, figsize=(10, 10)) # make matrix of equal size using null nperm = 10000 permDict = {} for iperm in range(nperm): iRand = np.random.choice(range(0, dmsoFrm.shape[1]), size=(len(pIds))) iRandCol = dmsoFrm.columns[iRand] #random column names
def build_probe_curves_and_summary(args,work_dir):
    '''
    builds dose response curves for the specified probe, one per
    perturbagen, and writes a summary report

    For every unique perturbagen found in the column ids of args.res:
      - the probe's scores are sorted by dose and plotted to
        <pert>_<probe>_dose_curve.png in work_dir
      - the dose with the lowest score relative to the lowest dose is
        reported as the "best" dose
      - when there is more than one dose, the curve is correlated with four
        prototype shapes (linear, log, half-log, quarter-log); the shape
        with the largest absolute correlation is called if any absolute
        correlation exceeds 0.8, otherwise 'none'
    One tab-delimited line per perturbagen is written to
    <probe>_summary.txt in work_dir.

    Column ids are assumed to be ':'-separated with the pert id in field 1
    and the dose in field 2, as that is how they are parsed here.

    Parameters
    ----------
    args : namespace providing res (gctx path) and probe (probe id)
    work_dir : str
        output directory
    '''
    # instantiate a progress object
    prog = progress.DeterminateProgressBar('Dose Analysis')

    # read the specified probe from the input gctx file
    gcto = gct.GCT()
    probe_ind = gcto.get_gctx_rid_inds(args.res,match_list=args.probe,exact=True)
    gcto.read_gctx_matrix(args.res,row_inds=probe_ind)

    # grab the cids from the file and mine dose information from them. Find
    # all of the unique perts
    cids = gcto.get_gctx_cid(args.res)
    doses = [float(x.split(':')[2]) for x in cids]
    perts = [x.split(':')[1] for x in cids]
    unique_perts = list(set(perts))

    # for each unique pert_id, find the best dose, do template matching to
    # prototype curves and output a report
    num_perts = len(unique_perts)
    CM = mu.CMapMongo()
    curve_names = ['linear','log','half-log','quarter-log']
    with open(os.path.join(work_dir,args.probe + '_summary.txt'),'w') as f:
        headers = ['pert_id','pert_desc','base_dose','base_z_score',
                   'best_dose','best_z_score', 'best_z_score_delta',
                   'linear','log','half-log','quarter-log','called shape']
        f.write('\t'.join(headers) + '\n')
        for i,unique_pert in enumerate(unique_perts):
            prog.update('analyzing {0}'.format(args.probe),i,num_perts)

            # grab the z-scores and doses for the current pert and sort the
            # pairs by dose. Columns are selected by exact match on the
            # parsed pert field; a substring test against the whole cid
            # could also pick up columns of other perts.
            cid_inds = [j for j,p in enumerate(perts) if p == unique_pert]
            pert_scores = gcto.matrix[0,cid_inds]
            pert_doses = [doses[j] for j in cid_inds]
            pert_doses,pert_scores = zip(*sorted(zip(pert_doses,pert_scores)))

            # build the dose response plot for the current pert and save it to disk
            plt.plot(pert_doses,pert_scores)
            plt.title('::'.join([unique_pert,args.probe]))
            plt.xlabel('dose')
            plt.ylabel('z-score')
            plt.savefig(os.path.join(work_dir,'_'.join([unique_pert.replace(':','_'),args.probe,'dose_curve.png'])))
            plt.close()

            # grab the pert_desc from mongo; '-666' marks a missing description
            pert_desc = CM.find({'pert_id':unique_pert},{'pert_desc':True},limit=1)
            if not pert_desc:
                pert_desc = ['-666']
            pert_desc = pert_desc[0]

            # find the best dose
            base_dose = pert_doses[0]
            base_z_score = pert_scores[0]
            z_delta = ((numpy.array(pert_scores) + 10) - (base_z_score + 10)).tolist()
            # NOTE(review): the "best" dose is the one with the most
            # negative delta. The original also computed the absolute
            # deltas but never used them - confirm whether the largest
            # absolute deviation was intended.
            best_ind = z_delta.index(numpy.min(z_delta))
            best_dose = pert_doses[best_ind]
            best_z_score = pert_scores[best_ind]
            best_z_score_delta = z_delta[best_ind]

            n_doses = len(pert_doses)
            if n_doses > 1:
                # build prototype curves if there is more than one dose;
                # _log_gen is defined elsewhere and is consumed as an
                # iterator, n_doses values per curve
                prototypes = [numpy.linspace(1,10,n_doses)]
                for step in (1, .5, .25):
                    log_gen = _log_gen(step)
                    prototypes.append([next(log_gen) for _ in range(n_doses)])
                curves = numpy.array(prototypes)

                # get the correlation coeficient for each of the curves and
                # the current pert dose curve; row 0 of the result is the
                # pert curve against [itself, linear, log, half-log, quarter-log]
                corrs = numpy.corrcoef(pert_scores,curves)
                linear_corr = corrs[0][1]
                log_corr = corrs[0][2]
                half_log_corr = corrs[0][3]
                quarter_log_corr = corrs[0][4]

                #report the best shape by finding the best absolute correlation
                abs_corr = numpy.abs(corrs[0][1:])
                if (abs_corr > .8).any():
                    max_curve_name = curve_names[int(numpy.argmax(abs_corr))]
                else:
                    max_curve_name = 'none'
            else:
                # if there is only one dose, set all corrs to 'nan'
                linear_corr = 'nan'
                log_corr = 'nan'
                half_log_corr = 'nan'
                quarter_log_corr = 'nan'
                max_curve_name = 'none'

            # write the dose data to the summary file
            data = [unique_pert,pert_desc,str(base_dose),str(base_z_score),
                    str(best_dose),str(best_z_score),str(best_z_score_delta),
                    str(linear_corr),str(log_corr),str(half_log_corr),
                    str(quarter_log_corr),max_curve_name]
            f.write('\t'.join(data) + '\n')
    prog.clear()
def classification_across_cell(self, loo_type='by_cp', n_procs=9):
    '''
    -build a single classifier treating observations from different cell
     lines equally
    -evaluate model with leave one out cross val.

    Reads self.signature_frame (needs a 'labels' column, the probe columns
    named in self.probe_ids and, for 'by_cp', a 'pert_id' column). Adds the
    columns 'svm_prediction' and 'correct_prediction', stores the frame
    back on self.signature_frame and stores the fraction of correct
    predictions on self.model_accuracy_across_cells.

    Parameters
    ----------
    loo_type : str
        strategy for leave one out validation:
            'by_cp' - leaves out all signatures for a given compounds;
                      one _svm_worker task per compound, run in a
                      process pool
            'by_sig' - leaves out individual signatures; a linear SVM is
                       fit serially for every signature
    n_procs : int
        number of cores to be used for analysis ('by_cp' only)
    '''
    zFrm = self.signature_frame
    ### perform leave one out validation
    if loo_type == 'by_cp':
        zFrm['svm_prediction'] = np.nan
        cpSet = set(zFrm['pert_id'])
        tupList = [(zFrm, brd, self.probe_ids) for brd in cpSet]
        # run SVM in parallel
        prog = update.DeterminateProgressBar(
            'self-connection graph builder')
        pool = multiprocessing.Pool(n_procs)
        try:
            rs = pool.map_async(_svm_worker, tupList)
            pool.close()  # No more work
            while not rs.ready():
                # _number_left is a private attribute of the async result
                remaining = rs._number_left
                prog.show_message(
                    'SVM evaluation - {0} tasks to complete'.format(remaining))
                time.sleep(0.1)
            results = rs.get()
        finally:
            # reap the worker processes whether the map succeeded or raised
            pool.terminate()
            pool.join()
        # one concat instead of appending the per-compound series one by
        # one; with no results the column keeps its NaN initialisation
        if results:
            zFrm['svm_prediction'] = pd.concat(results)
    if loo_type == 'by_sig':
        #start a update indicator
        progress_bar = update.DeterminateProgressBar('SVM calculation')
        predictDict = {}
        n_sigs = len(zFrm.index)
        C = 1.0  # SVM regularization parameter
        for ii, sig in enumerate(zFrm.index):
            progress_bar.update(
                'running SVM and signature validation - ' + sig, ii, n_sigs)
            # remove test signature from training
            droppedFrm = zFrm[zFrm.index != sig]
            trainFrm = droppedFrm.reindex(columns=self.probe_ids)
            labelsTrain = droppedFrm['labels'].values
            svc = svm.SVC(kernel='linear', C=C).fit(trainFrm.values,
                                                    labelsTrain)
            zTest = zFrm.ix[sig, self.probe_ids]
            # predict expects a 2-D (n_samples, n_features) array; a single
            # signature comes back 1-D and is promoted to one row
            linPred = svc.predict(np.atleast_2d(zTest.values))
            predictDict[sig] = linPred[0]
        predSer = pd.Series(predictDict)
        predSer.name = 'svm_prediction'
        zFrm = pd.concat([zFrm, pd.DataFrame(predSer)], axis=1)
    accuracyArray = zFrm['labels'] == zFrm['svm_prediction']
    accuracyRate = accuracyArray.sum() / float(accuracyArray.shape[0])
    zFrm['correct_prediction'] = accuracyArray
    self.model_accuracy_across_cells = accuracyRate
    self.signature_frame = zFrm
meanVec = grpH.describe().ix['mean'] #get top nTop = 3 # number of top largest components to sort by iTop3 = meanVec.order(ascending=False).index[:nTop] sortedTop = grpH.ix[:, iTop3].sort() topSum = sortedTop.sum(axis=1).order(ascending=False) topMeanDict[grp] = topSum.mean() topMeanSer = pd.Series(topMeanDict) ############################## ### build null distribution ## ############################## # shuffle signatures frtopom random drugs - keep same group size nPerm = 4000 zFrm = np.zeros([cliqFrm.shape[0], nPerm]) nullMean = pd.DataFrame(zFrm, index=cliqFrm['desc']) prog = update.DeterminateProgressBar('cliq group') for irr, r in enumerate(cliqFrm.iterrows()): grp = r[1]['id'] prog.update(grp, irr, len(cliqFrm.desc)) brds = r[1]['sig'] anntMtch = anntFrm[anntFrm.pert_id.isin(brds)] for ir in range(nPerm): nGrp = anntMtch.shape[0] iRand = np.random.choice(Hmtrx.index.values, nGrp, replace=False) grpH = Hmtrx.reindex(iRand) meanVec = grpH.mean() #get mean of top components nTop = 3 # number of top largest components to sort by iTop3 = meanVec.order(ascending=False).index[:nTop] sortedTop = grpH.ix[:, iTop3].sort() topSum = sortedTop.sum(axis=1).order(ascending=False)
val1 = params[0] val2 = params[1] sum1 = val1 + val2 time.sleep(10) return sum1 # make list of tuples tupList = [] for x1 in np.arange(3): for x2 in np.arange(3): tup1 = (x1, x2) tupList.append(tup1) #add to list of tuples # instantiate a progress object prog = progress.DeterminateProgressBar('self-connection graph builder') #build graphs in parallel n_procs = 4 pool = multiprocessing.Pool(n_procs) rs = pool.map_async(_add_two, tupList) pool.close() # No more work while (True): if (rs.ready()): break remaining = rs._number_left prog.show_message('Waiting for {0} tasks to complete...'.format(remaining)) time.sleep(0.1) rs.get() ### attempt 2 import sys, time, random, multiprocessing
def analyze_query(args,work_dir):
    '''
    Analyze the output from query_tool - find self-connections and create graphs

    The columns of the query gct (args.res) are grouped by pert_desc.
    Perturbagens with fewer than two columns, or with fewer than two
    signatures in the database, are skipped with a message. For every
    remaining column, taken in order of increasing dose, the result rows
    are ordered by that column's score (highest first) and the positions of
    the compound's own signatures in that ordering are drawn as a histogram
    saved to work_dir as <pert_desc>_<pert_dose>_query_rank.png.

    Parameters
    ----------
    args : namespace providing res, result and resultDir
    work_dir : str
        directory searched for the combined result gctx and receiving the
        png files

    Raises
    ------
    ValueError
        if a signature returned by the database query is not a row of the
        result file
    '''
    #make a gct object
    db = gct.GCT()
    db.read(args.res)

    ##load query result - gctx file
    rslt = gct.GCT()
    # NOTE(review): when args.result is set the combined gctx from work_dir
    # is read, otherwise args.resultDir is read - confirm this is not
    # inverted relative to the intent.
    if args.result:
        #select combined result gctx in working dir created from build_query step
        outGctx = glob.glob(os.path.join(work_dir, '*COMBINED*.gctx'))
        rslt.read(outGctx[0])
    else:
        rslt.read(args.resultDir)

    rsltSigID = rslt.get_rids() #sig IDs from result file
    qPert = db.get_column_meta('pert_desc')
    qPertID = db.get_column_meta('pert_id')
    qDose = db.get_column_meta('pert_dose')
    ESmat = rslt.matrix
    #row order per column, highest score first
    iES = ESmat.argsort(axis=0)[::-1]
    #shape is used so that a single-column result also works
    n_inst = iES.shape[0]
    #x-axis limit: number of instances rounded UP to the next 100k, so the
    #axis never cuts off high ranks and never collapses to 0
    x_max = ((n_inst + 99999) // 100000) * 100000

    prog1 = progress.DeterminateProgressBar('creating self-connection graphs')
    CM = mutil.CMapMongo() #one client for all queries

    #loop through each of the UNIQUE perts - graph ranks of query
    pertSet = set(qPert)
    n_perts = len(pertSet)
    for ipert, pert in enumerate(pertSet):
        prog1.update('graphing {0}'.format(pert),ipert,n_perts)
        iP = _all_indices(pert, qPert) #index of doses on plate
        if len(iP) < 2:
            print(pert + ' has only one instance')
            continue
        #order this pert's columns by increasing dose
        aDose = numpy.asarray([float(qDose[k]) for k in iP])
        iPo = [iP[k] for k in aDose.argsort()]

        #only the first 13 characters of the pert_id are used for the search
        qStr = qPertID[iPo[0]][0:13]
        #search for the BRD-xxxxxxxxxxx within the pert_id field in the db
        cmpdSigIds = CM.find({'pert_id':{'$regex':qStr}},{'sig_id':True})
        if len(cmpdSigIds) < 2:
            print(pert + ' has one or no instances in the cmap database')
            continue

        #loop through each dose
        for d in iPo:
            cmpd1 = qPert[d]
            dose1 = qDose[d]
            #sig IDs in rank order for this query column
            sSigID = [rsltSigID[y] for y in iES[:,d]]
            #where instances of the compound of interest sit on the rank list
            i2 = numpy.array([sSigID.index(y) for y in cmpdSigIds])

            #plot
            fname = cmpd1 + '_' + str(dose1) + '_query_rank.png'
            outf = os.path.join(work_dir,fname)
            fig = plt.figure(figsize=(8.0, 2.0))
            ax = fig.add_subplot(111)
            # the histogram of the data
            ax.hist(i2, 30, facecolor='green', alpha=0.75)
            ax.set_xlim(0, x_max)
            ax.set_xlabel('query rank')
            ax.set_ylabel('freq')
            ax.set_title('dose = '+ str(dose1) +'um')
            ax.grid(True)
            plt.savefig(outf, bbox_inches=0)
            #release the figure; otherwise one figure per dose stays in memory
            plt.close(fig)
def _save_template_heatmap(pert_data, probe_scores, out_path):
    '''
    save a heatmap of the 50 highest and 50 lowest scoring rows of pert_data

    Parameters:
    -----------
    pert_data: 2D numpy array, one row per probe
    probe_scores: array with one score per row of pert_data
    out_path: str
        file the figure is written to
    '''
    sort_ind = numpy.argsort(probe_scores)[::-1]
    top = pert_data[sort_ind[0:50], :]
    bot = pert_data[sort_ind[-50:], :]
    combined = numpy.vstack([top, bot])
    # shift each row by the absolute value of its minimum, then scale the
    # row so that it sums to one
    combined_row_normalized = combined + numpy.abs(numpy.array([numpy.min(combined, 1)]).T)
    row_sums = combined_row_normalized.sum(axis=1)
    combined_row_normalized = combined_row_normalized / row_sums[:, numpy.newaxis]
    plt.imshow(combined_row_normalized, interpolation='nearest', cmap='RdBu')
    plt.axis('off')
    plt.savefig(out_path)
    # start the next heatmap on a fresh figure instead of stacking images
    # on the current one
    plt.close()


def template_heatmap(args, work_dir):
    '''
    uses template matching to find the most dose responsive probesets for each
    compound in the dataset and generates a list of the top 50 and bottom 50
    most dose responsive probes. heatmaps across all of the doses are made
    using these probesets

    For every pert with more than one dose this writes
    <pert>_template_match_summary.txt and one heatmap png per template curve
    (linear, log, half-log, quarter-log) into work_dir.

    Parameters:
    -----------
    args: object
        .res is the gctx file to analyze; its column ids are split on ':'
        with the pert in field 1 and the dose in field 2
    work_dir: str
        output directory
    '''
    # (column label in the summary file, tag in the heatmap file name)
    templates = [('linear', 'linear'), ('log', 'log'),
                 ('half-log', 'half_log'), ('quarter-log', 'quarter_log')]
    n_templates = len(templates)

    # instantiate a progress object
    prog = progress.DeterminateProgressBar('Template Heatmaps')

    # read the data
    gcto = gct.GCT(args.res)
    gcto.read()

    # grab the cids from the file and mine dose information from them. Find
    # all of the unique perts
    cids = gcto.get_gctx_cid(args.res)
    pert_descs = gcto.get_column_meta('pert_desc')
    doses = [float(x.split(':')[2]) for x in cids]
    perts = [x.split(':')[1] for x in cids]
    unique_perts = list(set(perts))

    # grab the rid for use below
    rids = gcto.get_gctx_rid(args.res)

    num_perts = len(unique_perts)
    for i, unique_pert in enumerate(unique_perts):
        prog.update('analyzing {0}'.format(unique_pert), i, num_perts)

        # columns of the current pert; compare the parsed pert field so that
        # a pert whose id is contained in another cid is not picked up
        cid_inds = [j for j, p in enumerate(perts) if p == unique_pert]
        pert_desc = pert_descs[cid_inds[0]] #set pert desc to the first dose

        # sort the (dose, column index) pairs by dose
        dose_pairs = sorted(zip([doses[j] for j in cid_inds], cid_inds))
        pert_doses = [pair[0] for pair in dose_pairs]
        cid_inds = [pair[1] for pair in dose_pairs]
        n_doses = len(pert_doses)
        if n_doses < 2:
            # prototype curves need more than one dose
            continue

        # build prototype curves
        curve_list = [numpy.linspace(1, 10, n_doses)]
        for step in (1, .5, .25):
            log_gen = _log_gen(step)
            curve_list.append([next(log_gen) for _ in range(n_doses)])
        curves = numpy.array(curve_list)

        # correlate all of the probes in the data to the prototype curves
        pert_data = gcto.matrix[:, cid_inds]
        num_probes = pert_data.shape[0]
        cc = numpy.corrcoef(pert_data, curves)

        # correlation of every probe against each prototype curve
        probe_corrs = [cc[0:num_probes, num_probes + k]
                       for k in range(n_templates)]

        # compute the random correlation profile for this pert: each
        # permuted curve takes the value of a randomly drawn probe per dose
        probe_inds = range(num_probes)
        perm_cc = [[] for _ in range(n_templates)]
        for _ in range(1000):
            perm_rows = [random.sample(probe_inds, 1)[0] for _ in range(n_doses)]
            perm_curve = [pert_data[row, col] for col, row in enumerate(perm_rows)]
            perm_covar = numpy.corrcoef(perm_curve, curves)
            for k in range(n_templates):
                perm_cc[k].append(perm_covar[0][k + 1])

        # compute the nominal p values for all correlation values
        probe_ps = [numpy.array([stats.percentileofscore(perm_cc[k], x)
                                 for x in probe_corrs[k]])
                    for k in range(n_templates)]

        # write the p values and correlations out to file
        summary_path = os.path.join(work_dir, unique_pert + '_template_match_summary.txt')
        with open(summary_path, 'w') as f:
            headers = ['probeset']
            for label, _ in templates:
                headers.extend([label + ' corr', label + ' p'])
            f.write('\t'.join(headers) + '\n')
            for j in range(num_probes):
                fields = [rids[j]]
                for k in range(n_templates):
                    fields.extend([str(probe_corrs[k][j]), str(probe_ps[k][j])])
                f.write('\t'.join(fields) + '\n')

        # build one heatmap per template
        for k, (_, tag) in enumerate(templates):
            # NOTE(review): the quarter log heatmap is named after pert_desc
            # while the other three use the pert taken from the cid; kept
            # as found - confirm whether this is intended
            stem = pert_desc if tag == 'quarter_log' else unique_pert
            _save_template_heatmap(
                pert_data, probe_ps[k],
                os.path.join(work_dir, stem + '_' + tag + '_heatmap.png'))

    # clear that progress bar
    prog.clear()
def _sort_frame_by_column(frame, column):
    '''sort the rows of frame by one column on both old and current pandas'''
    if hasattr(frame, 'sort_values'):
        return frame.sort_values(by=column)
    return frame.sort(column)


def _save_fp_heatmap(frame, matrixType, title, out):
    '''draw frame as a gray scale heatmap with threshold ticks and save it'''
    plt.figure(1, figsize=(10, 10))
    plt.imshow(frame.values,
               interpolation='nearest',
               aspect='auto',
               cmap=cm.gray_r)
    # every fifth column up to column 40, limited to columns that exist
    tickRange = [t for t in range(0, 40, 5) if t < len(frame.columns)]
    xtcks = [str(frame.columns[t]) for t in tickRange]
    plt.xticks(tickRange, xtcks)
    plt.colorbar()
    plt.xlabel(matrixType + ' threshold')
    plt.ylabel('unique perturbations')
    plt.title(title)
    plt.savefig(out, bbox_inches='tight')
    plt.close()


def rates_of_DMSO_connections(inSum, outSum, dmsoSum, matrixType, rnkptRange, graph=True):
    '''
    -calculate the rate of false positives for bioactive signatures vs. DMSO
    -make heatmap

    For every threshold in rnkptRange the per-row fraction of values at or
    above the threshold is computed for inSum and for dmsoSum; the false
    positive rate is the dmso fraction divided by the observed fraction.

    Parameters:
    -----------
    inSum: DataFrame
        observed values, one row per perturbation
    outSum: unused
    dmsoSum: DataFrame
        DMSO values for the same rows
    matrixType: str
        used in axis labels and output file names
    rnkptRange: sequence of thresholds
    graph: bool
        if true, write the heatmaps and summary graphs to the module level
        wkdir; this needs a column for threshold 90 and an index level
        named pert_type

    Returns:
    --------
    fpFrame: DataFrame, rows as in inSum, one column per threshold
    '''
    ratioThresh = 3
    # the graph title below reports rates "bellow .25"
    fpThresh = .25
    ratioDict = {}
    fpDict = {}
    fpColumns = []
    progress_bar = update.DeterminateProgressBar('connection ratio-calculation')
    for ii, rnkpt_thresh in enumerate(rnkptRange):
        progress_bar.update('observed to dmso', ii, len(rnkptRange))
        # observed connection rate
        connRate = (inSum >= rnkpt_thresh).sum(axis=1) / float(inSum.shape[1])
        # dmso connection rate
        dConnRate = (dmsoSum >= rnkpt_thresh).sum(axis=1) / float(dmsoSum.shape[1])
        obsToDmso = connRate / dConnRate
        falsePosR = dConnRate / connRate # dmso / obs
        falsePosR.name = rnkpt_thresh
        fpColumns.append(falsePosR)
        ratioDict[rnkpt_thresh] = (obsToDmso >= ratioThresh).sum()
        fpDict[rnkpt_thresh] = (falsePosR <= fpThresh).sum()
    # assemble the frame once instead of growing it in the loop
    if fpColumns:
        fpFrame = pd.concat(fpColumns, axis=1, keys=[s.name for s in fpColumns])
    else:
        fpFrame = pd.DataFrame()

    # plot result
    if graph:
        # heatmap - order acording to highest false positive rate @ rnkpt 90
        fpSort = _sort_frame_by_column(fpFrame, 90)
        out = wkdir + '/false_positive_matrix_' + matrixType + '_threshold.png'
        _save_fp_heatmap(fpSort, matrixType,
                         'summly false positive rate - based on DMSO', out)

        # heatmap by pert_type
        fpGrped = fpFrame.groupby(level='pert_type')
        for grp in fpGrped.groups:
            grpSort = _sort_frame_by_column(fpGrped.get_group(grp), 90)
            out = wkdir + '/' + grp + '_false_positive_matrix_' + matrixType + '_threshold.png'
            _save_fp_heatmap(grpSort, matrixType,
                             grp +' summly false positive rate - based on DMSO', out)

        # graph false positive rate
        fpSer = pd.Series(fpDict)
        plt.plot(fpSer.index, fpSer.values)
        plt.ylabel('number of perturbations')
        plt.xlabel(matrixType + 'threshold')
        plt.title('false positive rates bellow .25 - (out of 7147)')
        outF = os.path.join(wkdir, 'false_positive_rates_by_' + matrixType + '_threshold.png')
        plt.savefig(outF, bbox_inches=0)
        plt.close()

        # graph - obs:dmso ratio
        ratioSer = pd.Series(ratioDict)
        plt.plot(ratioSer.index, ratioSer.values)
        plt.ylabel('number of connections')
        plt.xlabel(matrixType + ' threshold')
        plt.title('observed:dmso connection ratios above 3 - (out of 7147)')
        outF = os.path.join(wkdir, 'connection_ratio_by_' + matrixType + '_threshold.png')
        plt.savefig(outF, bbox_inches=0)
        plt.close()
    return fpFrame
def build_SC(args, work_dir):
    '''
    builds SC plots for the dose analysis

    Writes one <pert>_SC.png per non-DMSO perturbation and a tab delimited
    SC_summary.txt into work_dir. The summary compares, per perturbation,
    the lowest dose (base) with the dose that lies furthest from it in
    log fold change of ss and cc (best).

    Parameters:
    -----------
    args: object
        .res is the gctx file the SC object is built from
    work_dir: str
        output directory
    '''
    # instantiate a progress object
    prog = progress.DeterminateProgressBar('Dose Analysis')

    # make an SC object from the given gctx file
    sco = sc.SC()
    sco.add_sc_from_gctx_meta(args.res, verbose=False)
    sco.set_thresh_by_specificity(0.8)

    # find all of the unique perts in the data (perts is pert_desc),
    # leaving out the DMSO control
    perts = [x.split(':::')[0].split('::')[1] for x in sco.pid]
    unique_perts = set(perts)
    unique_perts.discard('DMSO')

    # grab the dose information
    dose = [float(x.split('::')[0].split(':')[2]) for x in sco.pid]

    # grab pert_descs
    desc = [x.split('::')[1].split(':::')[0] for x in sco.pid]

    # write sc plots to file
    num_perts = len(unique_perts)
    for i, unique_pert in enumerate(unique_perts):
        prog.update('making SC plots', i, num_perts)
        sco.plot(include=unique_pert, size=dose, title=unique_pert,
                 pos_con=['None'],
                 out=os.path.join(work_dir, '_'.join([unique_pert.replace(':','_'),'SC.png'])))

    # write SC summary table
    with open(os.path.join(work_dir, 'SC_summary.txt'), 'w') as f:
        headers = ['pert_id','pert_desc','base_dose','base_ss',
                   'base_cc','best_dose','best_ss','best_cc',
                   'best_ss_lfc','best_cc_lfc','best_sc_lfc_distance']
        f.write('\t'.join(headers) + '\n')
        for i, unique_pert in enumerate(unique_perts):
            prog.update('making SC summary', i, num_perts)
            # exact match, so that a pert whose name is contained in
            # another pert's name does not collect that pert's rows
            pert_inds = [j for j, p in enumerate(perts) if p == unique_pert]
            pert_dose = [dose[x] for x in pert_inds]
            pert_desc = desc[pert_inds[0]]
            pert_ss = [sco.s[x] for x in pert_inds]
            pert_cc = [sco.c[x] for x in pert_inds]
            # -666 is replaced by 0 before the ratios are taken
            pert_cc = [x if x != -666 else 0 for x in pert_cc]

            # base = lowest dose
            base_dose = numpy.min(pert_dose)
            base_ind = pert_dose.index(base_dose)
            base_ss = pert_ss[base_ind]
            base_cc = pert_cc[base_ind]

            # log fold change of every dose relative to the base dose
            ss_ratio = numpy.log(numpy.array(pert_ss)/base_ss)
            cc_ratio = numpy.log((numpy.array(pert_cc)+1)/(base_cc +1))
            sc_distance = (ss_ratio**2 + cc_ratio**2)**.5
            sc_distance = sc_distance.tolist()

            # best = dose furthest from the base in (ss, cc) lfc space
            best_ind = sc_distance.index(numpy.max(sc_distance))
            best_dose = pert_dose[best_ind]
            best_ss = pert_ss[best_ind]
            best_cc = pert_cc[best_ind]
            best_ss_ratio = ss_ratio[best_ind]
            best_cc_ratio = cc_ratio[best_ind]
            best_sc_distance = sc_distance[best_ind]

            # NOTE(review): the first column is headed pert_id but is filled
            # with unique_pert, which is a pert_desc here - confirm which
            # value consumers of SC_summary.txt expect
            data = [unique_pert,pert_desc,str(base_dose),str(base_ss),
                    str(base_cc),str(best_dose),str(best_ss),str(best_cc),
                    str(best_ss_ratio),str(best_cc_ratio),str(best_sc_distance)]
            f.write('\t'.join(data) + '\n')
]) processes.add(subprocess.Popen(cmd, shell=True)) if len(processes) >= max_processes: os.wait() processes.difference_update(p for p in processes if p.poll() is not None) ### make result frame # dg.make_result_frames(gp_type='OE',metric='spearman') gp_type = 'KD' work_dir = dg.outputdir #which cell lines have a result dir cellDirs = [ f for f in os.listdir(work_dir) if os.path.isdir(work_dir + '/' + f) ] prog = progress.DeterminateProgressBar('dataframe read') df = pd.DataFrame() dfRank = pd.DataFrame() #loop through each cell line add to df for icell, cell1 in enumerate(cellDirs): #define directories and load in outputs outdir = os.path.join(work_dir, cell1, 'sig_query_out') if not glob.glob(outdir + '/result_*.gctx'): print cell1 + ' no query result file' continue #if no results file, skip loop if metric == 'wtcs': rsltFile = glob.glob(outdir + '/result_WTCS.LM.COMBINED_n*.gctx')[0] if metric == 'spearman': rsltFile = glob.glob(outdir + '/result_SPEARMAN_n*.gctx')[0] rslt = gct.GCT() rslt.read(rsltFile)
from cmap.tools import sig_slice_tool
from cmap.io import gct, plategrp, rnk
import cmap.analytics.dgo as dgo
import cmap.util.progress as progress
import subprocess
import datetime
import cmap.util.tool_ops as to
import random

# NOTE(review): os and mu are used below but are not imported in this
# excerpt - presumably imported earlier in the script

# name of the similarity metric; not read within this excerpt
metric = 'wtcs'
# output directory of the analysis, created on first use
work_dir = '/xchip/cogs/projects/target_id/OE_KD_25June2013'
if not os.path.exists(work_dir):
    os.mkdir(work_dir)
prog = progress.DeterminateProgressBar('perturbation cid query')
#cell lines in which OEs were recorded
CM = mu.CMapMongo()
# query for pert_type 'trt_oe' with is_gold set, requesting the sig_id,
# pert_iname and cell_id fields
allOE = CM.find({
    'pert_type': 'trt_oe',
    'is_gold': True
}, {
    'sig_id': True,
    'pert_iname': True,
    'cell_id': True
})
cell_lines_tested = []
# cell_id of every returned record, then the distinct values
cellsAll = [sig['cell_id'] for sig in allOE]
uniqCells = list(set(cellsAll))
# second initialisation of the same empty list (duplicate of the one above)
cell_lines_tested = []
def ecdf_calc(inSum, dmsoSum, matrixType, graph=True, fpr_max=True):
    '''
    -create empirical cdf for observed and dmso
    -evaluate both on a fixed grid from -100 to 100 and take the ratio of
     their upper tails, (1 - dmso ecdf) / (1 - observed ecdf), per row

    Parameters:
    -----------
    inSum: DataFrame
        observed values; each index label is used to select one row
    dmsoSum: DataFrame
        DMSO values, selected with the index labels of inSum
    matrixType: str
        x axis label of the histogram panel
    graph: bool
        if true, save an ecdf / histogram figure per row as
        <wkdir>/<second element of the index label>_ecdf.png
    fpr_max: bool
        if true, ratios at or above 1 are set to 1

    Returns:
    --------
    DataFrame with one row per row of inSum, indexed by a
    (pert_type, pert_id) MultiIndex, and one column per grid point
    '''
    grid = np.linspace(-100, 100, 201)
    n_rows = len(inSum.index)
    bar = update.DeterminateProgressBar('ecdf calculation')

    def _save_row_figure(row_id, observed, dmso, observed_cdf, dmso_cdf):
        # upper panel: the two ecdf curves
        plt.figure(1, figsize=(10, 10))
        plt.subplot(2, 1, 1)
        plt.plot(grid, observed_cdf, color='b',
                 label='observed n=' + str(len(observed)))
        plt.plot(grid, dmso_cdf, color='r',
                 label='DMSO n=' + str(len(dmso)))
        plt.ylabel('F(x)', fontweight='bold')
        plt.title('ecdf for summly row - ' + row_id)
        # lower panel: normalized histograms of the raw values
        plt.subplot(2, 1, 2)
        plt.hist(observed, 30, color='b', range=[-100, 100],
                 label=['observed'], alpha=.4, normed=True)
        plt.hist(dmso, 30, color='r', range=[-100, 100],
                 label='DMSO', alpha=.3, normed=True)
        plt.ylabel('freq', fontweight='bold')
        plt.xlabel(matrixType, fontweight='bold')
        target = os.path.join(wkdir, row_id + '_ecdf.png')
        plt.savefig(target, bbox_inches='tight', dpi=200)
        plt.close()

    labels = []
    ratios = []
    #look at ecdf by row
    for position, label in enumerate(inSum.index):
        bar.update('count', position, n_rows)
        row_id = label[1]
        observed = inSum.ix[label]
        dmso = dmsoSum.ix[label]
        # evaluate both ecdfs on the grid
        observed_cdf = ECDF(observed)(grid)
        dmso_cdf = ECDF(dmso)(grid)
        # looking for positive connections
        ratio = pd.Series(data=(1 - dmso_cdf) / (1 - observed_cdf), index=grid)
        if fpr_max:
            ratio[ratio >= 1] = 1
        ratio.name = label
        labels.append(label)
        ratios.append(ratio)
        if graph:
            _save_row_figure(row_id, observed, dmso, observed_cdf, dmso_cdf)

    # one column per row label, then flip so that rows are perturbations
    result = pd.concat(ratios, axis=1, keys=labels).T
    result.index = pd.MultiIndex.from_tuples(result.index,
                                             names=['pert_type', 'pert_id'])
    return result
def read_gctx_matrix(self,
                     src=None,
                     cid=None,
                     rid=None,
                     col_inds=None,
                     row_inds=None,
                     verbose=True,
                     convert_to_double=False,
                     row_optimized=False):
    '''
    read just the matrix data from a gctx file

    The result is stored in self.matrix with one row per requested row
    index and one column per requested column index (the matrix node is
    indexed [column, row], so the data read is transposed at the end).

    Parameters:
    -----------
    src: str
        path of the gctx file; defaults to self.src
    cid, rid: lists of column / row ids
        used to look up the indices when col_inds / row_inds are not given
    col_inds, row_inds: lists of int
        indices to read; if they are still empty after the id lookup, all
        columns / rows are read. Both lists are sorted in place.
    verbose: bool
        show a progress indicator
    convert_to_double: bool
        convert the matrix to double precision
    row_optimized: bool
        iterate over the matrix node once between the smallest and largest
        requested column index and copy the requested entries, instead of
        using index based reads
    '''
    #open an update indicator
    if verbose:
        progress_bar = update.DeterminateProgressBar('GCTX_READER')
        progress_bar.show_message('reading matrix data')

    if not src:
        src = self.src

    #get the appropriate column indices
    if not col_inds:
        col_inds = self.get_gctx_cid_inds(src, match_list=cid)

    #get the appropriate row indices
    if not row_inds:
        row_inds = self.get_gctx_rid_inds(src, match_list=rid)

    #open the gctx file
    self._open_gctx(src)

    #set up the indices
    if not col_inds:
        col_inds = list(range(len(self.column_id_node)))
    if not row_inds:
        row_inds = list(range(len(self.row_id_node)))

    #check if we're reading just reading the epsilon landmark genes
    #if so, can get the matrix in one read
    if len(row_inds) == 978 and list(row_inds) == list(range(978)):
        self.matrix = self.matrix_node[col_inds, 0:978]
    #otherwise, figure out which direction reads the fewest elements
    # then read in that orientation
    else:
        ncols, nrows = self.matrix_node.shape
        n_bycol = nrows * len(col_inds)
        n_byrow = ncols * len(row_inds)
        if row_optimized:
            # pre-allocate the matrix to be filled as we iterate over the
            # HDF5 matrix on disk
            self.matrix = numpy.zeros(
                [len(col_inds), len(row_inds)], dtype=numpy.float32)

            # create a set of col_inds to check membership on each row
            # iteration
            col_ind_set = set(col_inds)

            # determine the range of columns we must read
            col_ind_min = int(numpy.min(col_inds))
            col_ind_max = int(numpy.max(col_inds))

            # set up an iterator for the progress indicator. This will be
            # iterated every time we read a row that is called for. The
            # progress will be logged every time we reach 1/50th more of
            # the data (at least every row, so the modulus is never zero)
            p_iter = 0
            p_max = len(col_inds)
            num_rows = len(row_inds)
            p_mod = max(1, int(numpy.round(p_max / 50.0)))
            # the iteration starts at col_ind_min, so the counter has to
            # start there as well to be comparable with the requested
            # (absolute) column indices
            node_rows = self.matrix_node.iterrows(start=col_ind_min,
                                                  stop=col_ind_max + 1)
            for i, row in enumerate(node_rows, col_ind_min):
                if i in col_ind_set:
                    self.matrix[p_iter, :] = numpy.take(row, row_inds)
                    p_iter += 1
                    if p_iter % p_mod == 0:
                        if verbose:
                            progress_bar.update(
                                "reading matrix data ({0},{1})".format(
                                    num_rows, p_max), p_iter, p_max)
        else:
            if n_bycol <= n_byrow:
                self.matrix = self.matrix_node[col_inds, :]
                self.matrix = self.matrix[:, row_inds]
            else:
                self.matrix = self.matrix_node[:, row_inds]
                self.matrix = self.matrix[col_inds, :]

    # NOTE(review): this section used to index the matrix with the return
    # value of col_inds.sort() / row_inds.sort(), which is None; that only
    # inserted axes which the reshape removed again. The effective result -
    # both index lists sorted in place, matrix data untouched - is kept
    # here. Confirm whether a real reordering of the data was intended.
    col_inds.sort()
    row_inds.sort()
    self.matrix = numpy.reshape(self.matrix, (len(col_inds), len(row_inds)))
    self.matrix = self.matrix.transpose()

    # convert data to double precision if called for
    if convert_to_double:
        self.matrix = self.matrix.astype(numpy.float64)

    #close the gctx file
    self._close_gctx()

    #clear the progress indicator
    if verbose:
        progress_bar.clear()
# run the dose analysis set-up once per cell line / time point combination
for cell in cellLst:
    for tim in timeLst:
        ### make SC plots
        cellLine = cell
        timeP = tim
        refControl = 'pc' #use pc vs vc controled data
        # locate the gctx file for this combination; the first match is used
        gctfile = glob.glob('/xchip/obelix/pod/brew/%s/PRISM001_%s_%s/by_pert_id_pert_dose/PRISM001_%s_%s_COMPZ.MODZ_SCORE_LM_*.gctx' % (refControl,cellLine,timeP,cellLine,timeP))
        gctfile = gctfile[0]
        # per combination output directory, created on first use
        work_dir = '/xchip/cogs/hogstrom/analysis/scratch/prism/%s_%s_%s' % (cell,timeP,refControl)
        if not os.path.exists(work_dir):
            os.mkdir(work_dir)
        db = gct.GCT() #make a gct object
        db.read(gctfile)
        ### copy stuff from query tool
        # instantiate a progress object
        prog = progress.DeterminateProgressBar('Dose Analysis')
        # make an SC object from the given gctx file
        sco = sc.SC()
        sco.add_sc_from_gctx_meta(gctfile, verbose=False)
        sco.set_thresh_by_specificity(0.8)
        # find all of the unique pert_ids in the data
        perts = [x.split(':::')[0].split('::')[1] for x in sco.pid] #perts is pert_desc
        # second ':' separated field of every pid
        pert_ids = [x.split(':')[1] for x in sco.pid]
        # unique_perts = set(perts)
        unique_perts = set(pert_ids)
        # drop the DMSO control from the set of perts
        ctl_perts = []
        for i, unique_pert in enumerate(unique_perts):
            if unique_pert == 'DMSO':
                ctl_perts.append(unique_pert)
        unique_perts.difference_update(set(ctl_perts))
        #make pairing of pert id and pert_desc
os.wait()
# collect the finished processes in a list first: removing from the set
# while a generator is still iterating over it raises RuntimeError
processes.difference_update([p for p in processes if p.poll() is not None])

# Create a pandas dataframe that lets you see connection results across
# cell lines it is structured as follows:
# index1 = BRD short
# index2 = perurbation sig_id
# each column - a unique gene ID/ time point - representing the CGS for that gene, matching cell line
# cell line listed as a column
gp_type = 'KD' # genetic perturbation type
#which cell lines have a result dir
cellDirs = [
    f for f in os.listdir(work_dir) if os.path.isdir(work_dir + '/' + f)
]
prog = progress.DeterminateProgressBar('Drug-target')
df = pd.DataFrame()
dfRank = pd.DataFrame()
#loop through each cell line add to df
# for icell, cell1 in enumerate(cgsCells):
for icell, cell1 in enumerate(cellDirs):
    #define directories and load in outputs
    outdir = os.path.join(work_dir, cell1, 'sig_query_out')
    # look the result file up once and reuse the match
    rsltMatches = glob.glob(outdir + '/result_WTCS.LM.COMBINED_n*.gctx')
    if not rsltMatches:
        print(cell1 + 'no query result file')
        continue #if no results file, skip loop
    rsltFile = rsltMatches[0]
    rslt = gct.GCT()
    rslt.read(rsltFile)
    # fill the placeholder with the cell line being analyzed
    prog.update('analyzing {0}'.format(cell1), icell, len(cellDirs))
    rsltF = rslt.frame
# load in results frslt = '/xchip/cogs/hogstrom/analysis/scratch/Nov20/dose_analysis_tool.1353449771597/nov20/my_analysis.query_tool.2012112017162991/result_ESLM.COMBINED_n85x398050.gctx' rslt = gct.GCT() rslt.read(frslt) rSigIds = rslt.get_rids() rsltSigID = rslt.get_rids() #sig IDs from result file qPert = db.get_column_meta('pert_desc') qPertID = db.get_column_meta('pert_id') qDose = db.get_column_meta('pert_dose') ESmat = rslt.matrix iES = ESmat.argsort(axis=0)[::-1] #sort ascending n_inst = len(iES[:, 1]) #loop through each of the perts - graph ranks of query prog1 = progress.DeterminateProgressBar('creating self-connection graphs') avRnk = [] medRnk = [] #loop through each of the UNIQUE perts - graph ranks of query pertSet = set(qPert) for pert in pertSet: cmpd1 = pert iP = _all_indices(pert, qPert) #index of doses on plate if len(iP) < 2: print pert + ' has only one instance' continue uDose = [qDose[i] for i in iP] fDose = [float(x) for x in uDose] #convert strings to float aDose = numpy.asarray(fDose) #convert to numpy array iD = aDose.argsort() #local ordering sDose = [fDose[j] for j in iD] #sort local doses