Example 1
def get_dos_signatures(dosBrds):
    "1) return signature info for all DOS compounds \
    2) return list counts of number of cell lines tested"

    CM = mu.CMapMongo()
    dosQuery = CM.find(
        {
            'pert_id': {
                '$in': list(dosBrds)
            },
            'pert_type': 'trt_cp'
        },  #, 
        {
            'sig_id': True,
            'pert_id': True,
            'cell_id': True,
            'pert_time': True,
            'is_gold': True,
            'pert_iname': True,
            'distil_ss': True,
            'distil_cc_q75': True
        },
        toDataFrame=True)
    dosQuery.index = dosQuery['sig_id']
    dosSetLen = len(set(dosQuery['pert_id']))
    dosGrped = dosQuery.groupby(['pert_id'])
    countDict = {}
    for grp in dosGrped:
        grpName = grp[0]
        cellSet = set(grp[1]['cell_id'])
        nCells = len(cellSet)
        countDict[grpName] = nCells
    countSer = pd.Series(countDict)
    countMax = max(countSer)
    return dosQuery, countSer
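
# A minimal standalone sketch (not part of the original example): the groupby
# loop above just counts distinct cell lines per pert_id. The same count can be
# reproduced on a toy frame whose columns mirror the query fields used above.
import pandas as pd

toyQuery = pd.DataFrame({'pert_id': ['BRD-A', 'BRD-A', 'BRD-B'],
                         'cell_id': ['MCF7', 'PC3', 'MCF7']})
toyCounts = toyQuery.groupby('pert_id')['cell_id'].apply(lambda x: len(set(x)))
print(toyCounts)  # BRD-A -> 2, BRD-B -> 1
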
def build_probe_curves(args,work_dir):
	'''
	builds dose response curves for the specified probe
	'''
	gcto = gct.GCT()
	probe_ind = gcto.get_gctx_rid_inds(args.res,match_list=args.probe,exact=True)
	gcto.read_gctx_matrix(args.res,row_inds=probe_ind)
	cids = gcto.get_gctx_cid(args.res)
	doses = [float(x.split(':')[2]) for x in cids]
	# mine the pert ids and set up progress reporting (needed by the loop below)
	perts = [x.split(':')[1] for x in cids]
	unique_perts = list(set(perts))
	num_perts = len(unique_perts)
	prog = progress.DeterminateProgressBar('Dose Analysis')
	CM = mu.CMapMongo()
	with open(os.path.join(work_dir,args.probe + '_summary.txt'),'w') as f:
		headers = ['pert_id','pert_desc','base_dose','base_z_score',
				   'best_dose','best_z_score', 'best_z_score_delta']
		f.write('\t'.join(headers) + '\n')
		for i,unique_pert in enumerate(unique_perts):
			prog.update('analyzing {0}'.format(args.probe),i,num_perts)
			cid_inds = [i for i,x in enumerate(cids) if unique_pert in x]
			pert_scores = gcto.matrix[0,cid_inds]
			pert_doses = [doses[x] for x in cid_inds]
			tmp_tup = zip(pert_doses,pert_scores)
			tmp_tup.sort()
			pert_doses,pert_scores = zip(*tmp_tup)
			plt.plot(pert_doses,pert_scores)
			plt.title('::'.join([unique_pert,args.probe]))
			plt.xlabel('dose')
			plt.ylabel('z-score')
			plt.savefig(os.path.join(work_dir,'_'.join([unique_pert.replace(':','_'),args.probe,'dose_curve.png'])))
			plt.close()
			
			pert_desc = CM.find({'pert_id':unique_pert},{'pert_desc':True},limit=1)
			if not pert_desc:
				pert_desc = ['-666']
			pert_desc = pert_desc[0]

			base_dose = pert_doses[0]
			base_z_score = pert_scores[0]

			z_delta = (numpy.array(pert_scores) + 10) - (base_z_score + 10)
			abs_z_delta = numpy.abs(z_delta)
			z_delta =  z_delta.tolist()
			abs_z_delta = abs_z_delta.tolist()
			
			best_ind = z_delta.index(numpy.min(z_delta))
			best_dose = pert_doses[best_ind]
			best_z_score = pert_scores[best_ind]
			best_z_score_delta = z_delta[best_ind]

			data = [unique_pert,pert_desc,str(base_dose),str(base_z_score),
					str(best_dose),str(best_z_score),str(best_z_score_delta)]
			f.write('\t'.join(data) + '\n')
	prog.clear()
 def get_inames(self):
     '''
     get pert_inames for each input compound
     '''
     cm = mu.CMapMongo()
     inameID = cm.find({'pert_id': {
         '$in': list(self.cpSet)
     }}, {
         'pert_id': True,
         'pert_iname': True
     },
                       toDataFrame=True)
     inameSer = pd.Series(data=inameID['pert_iname'])
     inameSer.index = inameID['pert_id']
     inameDict = inameSer.to_dict()
     self.inameDict = inameDict
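
# A minimal sketch (illustrative, not from the original): get_inames builds a
# pert_id -> pert_iname lookup from two query columns; the same mapping can be
# built from any two-column frame.
import pandas as pd

toyIname = pd.DataFrame({'pert_id': ['BRD-A', 'BRD-B'],
                         'pert_iname': ['vorinostat', 'trichostatin-a']})
toyDict = pd.Series(toyIname['pert_iname'].values,
                    index=toyIname['pert_id'].values).to_dict()
print(toyDict['BRD-A'])  # vorinostat
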
Example 4
    def PCL_vs_DMSO(self, max_signatures_per_cp=20, n_test_max=False):
        '''
        -grab equal amounts of DMSO and signatures from a PCL class
        -test one PCL at a time

        Parameters
        ----------
        n_test_max : int
            -max number of PCL groups to incorporate into the classifier 
            -if set to False, all groups are tested
        '''
        for group_name in self.test_groups:
            group_cps = self.pclDict[group_name]
            CM = mu.CMapMongo()
            # set minimum dose
            cpQuery = CM.find(
                {
                    'is_gold': True,
                    'pert_id': {
                        '$in': group_cps
                    },
                    'pert_dose': {
                        '$gt': 1
                    }
                },  #, 
                {
                    'sig_id': True,
                    'pert_id': True,
                    'cell_id': True,
                    'pert_time': True,
                    'is_gold': True,
                    'pert_iname': True
                },
                toDataFrame=True)
            # inameGrped = cpQuery.groupby('pert_iname')
            cpQuery.index = cpQuery['sig_id']
            cpQuery = self.set_class_labels(cpQuery)
            droppedQ = self.cut_signatures(cpQuery,
                                           nKeep=max_signatures_per_cp,
                                           cut_by='pert_iname')
            droppedGrped = droppedQ.groupby('pert_iname')
            droppedGrped.size()
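
# Hedged sketch: cut_signatures is not shown in this excerpt. A plausible
# equivalent that keeps at most nKeep signatures per pert_iname is a plain
# pandas groupby with head(); the column name mirrors the query field above.
import pandas as pd

def cap_signatures(frame, nKeep=20, cut_by='pert_iname'):
    # keep the first nKeep rows of each cut_by group
    return frame.groupby(cut_by, group_keys=False).apply(lambda g: g.head(nKeep))
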
Example 5
work_dir = '/xchip/cogs/projects/HOG/DG_connect'
#load in OMIM genes. Which ones have a CGS in > 4 cell lines? which ones are LM?
inFile = '/xchip/cogs/hogstrom/analysis/OMIM/OMIM_CGS.txt'
omimGeneList = []
with open(inFile, 'rt') as f:
    for string in f:
        splt = string.split('\r')
        for i, line in enumerate(splt):
            if i == 0:  # skip header
                continue
            splt2 = line.split('\t')
            geneID = splt2[0]  # the gene ID listed in the line
            omimGeneList.append(geneID)

CM = mutil.CMapMongo()
CGSall = CM.find({'pert_type': 'trt_sh.cgs'}, {
    'sig_id': True,
    'pert_iname': True,
    'cell_id': True,
    'pert_id': True
})
#which drugs to use --> informer set and HOG plate

### which genes have a CGS in > 4 cell lines
ominWithContext = []
for geneID in omimGeneList:
    cellLst = []
    sigIDLst = []
    for q in CGSall:
        if q['pert_iname'] == geneID:
Example 6
### use pert_info collection to get sig_ids in mongo
# cellList = []
# for pert in targetDict.keys()[:10]:
# 	pertdb = mutil.CMapMongo(mongo_location = None, collection = 'pert_info')
# 	p1 = pertdb.find({'pert_id':'BRD-M79902621'},{'sig_id':True})
# 	g = p1[0]
# 	gSplit = g.split('\'')
# 	sigIDs = [x for x in gSplit if len(x) >= 5]
# 	cells = [x.split('_')[1] for x in sigIDs]
# 	cellList.extend(cells)
# print p1
# type(p1[0])

### which targets have CGS signatures
#get all CGS gene IDs
CM = mu.CMapMongo()
# pert_List = CM.find({'pert_type':{'$regex':pert}},{'sig_id':True,'cell_id':True})
CGSbyCell = CM.find({'pert_type': 'trt_sh.cgs'}, {'pert_iname': True})
CGSgeneSyms = set(CGSbyCell)
#check overlap with DB targets
nestedTargets = targetDict.values()
DBtargets = [item for sublist in nestedTargets for item in sublist]
setDBtargets = set(DBtargets)
DBcgsOverlap = setDBtargets.intersection(CGSgeneSyms)

targetDictCGS = {}
for pert in targetDict:
    for gene in targetDict[pert]:
        if gene in DBcgsOverlap:
            if targetDictCGS.has_key(pert):
                targetDictCGS[pert].append(gene)
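
# Hedged sketch with toy values: the has_key() loop above (truncated here)
# accumulates a pert -> [genes with a CGS] mapping; setdefault does the same
# without the explicit membership test and also runs on Python 3.
toyTargets = {'BRD-A': ['HDAC1', 'HDAC2'], 'BRD-B': ['TUBB']}
toyOverlap = set(['HDAC1', 'TUBB'])
toyCGS = {}
for pert in toyTargets:
    for gene in toyTargets[pert]:
        if gene in toyOverlap:
            toyCGS.setdefault(pert, []).append(gene)
print(toyCGS)  # {'BRD-A': ['HDAC1'], 'BRD-B': ['TUBB']}
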
 def group_probe_frq_plot(self,
                          make_heatmaps=True,
                          sum_score_metric='sum_score_4',
                          rankpt_metric='mean_rankpt_4'):
     '''
     test relative occurrence of up/dn regulation of probes for a specific group
     
     '''
     brd = 'BRD-K02130563'
     sigs = self.sigIDdict[brd]
     sig = sigs[0]
     #
     afPath = cmap.score_path
     gt = gct.GCT()
     gt.read(src=afPath, cid=sigs, rid='lm_epsilon')
     zFrm = gt.frame
     # zFrm = pd.DataFrame(data=gt.matrix,
     #                     index=gt.get_rids(),
     #                     columns=sigs)
     # take modz of signature group
     modZed = modzsig.modzsig(zFrm)
     modZed = modZed.order()
     #pick a group
     # grpName = 'tubulin'
     grpName = 'HDAC-inhibitor'
     #get all sig_ids for that group
     grpSigList = []
     for brd in self.pclResultDict[grpName]:
         grpSigList.extend(self.sigIDdict[brd])
     #query for up/dn probes
     cm = mu.CMapMongo()
     regFrm = cm.find({'sig_id': {
         '$in': list(grpSigList)
     }}, {
         'sig_id': True,
         'pert_id': True,
         'pert_iname': True,
         'up50_lm': True,
         'dn50_lm': True
     },
                      toDataFrame=True)
     # count dn probe freq
     nInstances = regFrm.shape[0]
     dnNested = regFrm['dn50_lm'].values
     dnArray = [item for sublist in dnNested for item in sublist]
     dnSer = pd.Series(dnArray)
     dnCounts = dnSer.value_counts()
     zDnCounts = dnCounts.reindex_like(modZed)
     # count up probe freq
     upNested = regFrm['up50_lm'].values
     upArray = [item for sublist in upNested for item in sublist]
     upSer = pd.Series(upArray)
     upCounts = upSer.value_counts()
     zUpCounts = upCounts.reindex_like(modZed)
     # adjust marker size
     upPercMkrs = np.divide(
         zUpCounts, nInstances
     )  #divide by total instances to get relative frequency
     dnPercMkrs = np.divide(zDnCounts, nInstances)
     upMkrs = np.multiply(upPercMkrs, 100)
     dnMkrs = np.multiply(dnPercMkrs, 100)
     upMkrs = upMkrs.replace(np.nan, 0)
     dnMkrs = dnMkrs.replace(np.nan, 0)
     # make plot
     fig = plt.figure()
     ax = fig.add_subplot(111)
     # ax.plot(s,s,'b')
     for j, sl in enumerate(modZed):
         ax.plot(j, 1, 'r.', markersize=upMkrs[j], alpha=.25)
         ax.plot(j, 1, 'b.', markersize=dnMkrs[j], alpha=.25)
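
# Hedged sketch with toy data: the marker-size logic above turns up50_lm /
# dn50_lm membership counts into a percentage of instances per probe, via
# value_counts() reindexed against the probe order.
import pandas as pd

toyUpLists = [['p1', 'p2'], ['p1', 'p3']]                          # two toy up50_lm lists
toyProbes = pd.Series([0.5, -0.2, 1.1], index=['p1', 'p2', 'p3'])  # stand-in for modZed
toyCounts = pd.Series([p for lst in toyUpLists for p in lst]).value_counts()
toyCounts = toyCounts.reindex(toyProbes.index).fillna(0)
toyMkrs = 100 * toyCounts / float(len(toyUpLists))                 # percent of instances
print(toyMkrs)  # p1 -> 100.0, p2 -> 50.0, p3 -> 50.0
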
Example 8
def analyze_query(args,work_dir):
	'''
	Analyze the output from query_tool - find self-connections and create graphs
	'''
	#make a gct object
	db = gct.GCT()
	db.read(args.res)

	##load query result - gctx file
	rslt = gct.GCT()
	#if specific result directory is specified, use that - otherwise get gctx from working dir
	if args.result:
		outGctx = glob.glob(os.path.join(work_dir, '*COMBINED*.gctx')) #select combined result gctx in working dir created from build_query step
		rslt.read(outGctx[0])
	else:
		rslt.read(args.resultDir)

	rsltSigID = rslt.get_rids() #sig IDs from result file

	qPert = db.get_column_meta('pert_desc')
	qPertID = db.get_column_meta('pert_id')
	qDose = db.get_column_meta('pert_dose')
	ESmat = rslt.matrix
	iES = ESmat.argsort(axis=0)[::-1] #sort descending
	n_inst = len(iES[:,1])

	#loop through each of the perts - graph ranks of query
	prog1 = progress.DeterminateProgressBar('creating self-connection graphs')
	avRnk = []
	medRnk = []
	for i, x in enumerate(qPert):
		prog1.update('graphing {0}'.format(x),i,len(qPert))
		iE = iES[:,i] #ES sort index for one column
		sSigID = []
		for y in iE:
			sSigID.append(rsltSigID[y]) #make sorted sig ID list
		qStr = qPertID[i]
		cmpd1 = x
		dose1 = qDose[i]
		if len(qStr) >= 13:
			qStr = qStr[0:13] #shorten qPertID
		#i1 = IDsorted.index(qStr) #give first index of match

		#run pymongo query
		CM = mu.CMapMongo()
		#cmpdSigIds = CM.find({'pert_id':qStr},{'sig_id':True})
		cmpdSigIds = CM.find({'pert_id':{'$regex':qStr}},{'sig_id':True}) #search for the BRD-xxxxxxxxxxx within the pert_id field in the db

		#i1 = __all_indices(qStr,sSigID)
		i1 = [sSigID.index(y) for y in cmpdSigIds] #where instances of the compound of interest sit on the rank list
		if len(i1) < 1:
			print cmpd1 + ' has no instances in the cmap database'
			continue
		i2 = numpy.array(i1) #convert list to numpy array
		avr = sum(i2)/len(i2) #what is the average ES rank
		md = numpy.median(i2) # what is the median ES rank
		nAv = float(avr)/n_inst #normalize according to number of instances in db
		nMd = float(md)/len(iES[:,1]) #normalized median
		avRnk.append(nAv) #store average ES rank
		medRnk.append(nMd)
		#plot
		fname = cmpd1 + '_' + dose1 + '_query_rank.png'
		outf = os.path.join(work_dir,fname)
		fig = plt.figure(figsize=(8.0, 2.0))
		ax = fig.add_subplot(111)
		# the histogram of the data
		n, bins, patches = ax.hist(i2, 30, facecolor='green', alpha=0.75)
		#ax.set_xlim(0, n_inst)
		ax.set_xlim(0, int(round(n_inst,-5))) #round instances to nearest 100k
		ax.set_xlabel('query rank')
		ax.set_ylabel('freq')
		ax.set_title('dose = '+ str(dose1) +'um')
		ax.grid(True)
		plt.savefig(outf, bbox_inches=0)
Example 9
def build_probe_curves_and_summary(args,work_dir):
	'''
	builds a dose response curve for each perturbation at the specified probe
	'''
	# instantiate a progress object
	prog = progress.DeterminateProgressBar('Dose Analysis')

	# read the specified probe from the input gctx file
	gcto = gct.GCT()
	probe_ind = gcto.get_gctx_rid_inds(args.res,match_list=args.probe,exact=True)
	gcto.read_gctx_matrix(args.res,row_inds=probe_ind)

	# grab the cids from the file and mine dose information from them.  Find all of 
	# the unique perts
	cids = gcto.get_gctx_cid(args.res)
	doses = [float(x.split(':')[2]) for x in cids]
	perts = [x.split(':')[1] for x in cids]
	unique_perts = list(set(perts))
	
	# for each unique pert_id, find the dose that deviates from the base dose the most.
	# Do template matching to prototype curves. Output a report
	num_perts = len(unique_perts)
	CM = mu.CMapMongo()
	with open(os.path.join(work_dir,args.probe + '_summary.txt'),'w') as f:
		headers = ['pert_id','pert_desc','base_dose','base_z_score',
				   'best_dose','best_z_score', 'best_z_score_delta',
				   'linear','log','half-log','quarter-log','called shape']
		f.write('\t'.join(headers) + '\n')
		for i,unique_pert in enumerate(unique_perts):
			prog.update('analyzing {0}'.format(args.probe),i,num_perts)
			
			# grab the z-scores and doses for the current pert and sort the pairs
			# by dose
			cid_inds = [i for i,x in enumerate(cids) if unique_pert in x]
			pert_scores = gcto.matrix[0,cid_inds]
			pert_doses = [doses[x] for x in cid_inds]
			tmp_tup = zip(pert_doses,pert_scores)
			tmp_tup.sort()
			pert_doses,pert_scores = zip(*tmp_tup)

			# build the dose response plot for the current pert and save it to disk
			plt.plot(pert_doses,pert_scores)
			plt.title('::'.join([unique_pert,args.probe]))
			plt.xlabel('dose')
			plt.ylabel('z-score')
			plt.savefig(os.path.join(work_dir,'_'.join([unique_pert.replace(':','_'),args.probe,'dose_curve.png'])))
			plt.close()

			# grab the pert_desc from mongo
			pert_desc = CM.find({'pert_id':unique_pert},{'pert_desc':True},limit=1)
			if not pert_desc:
				pert_desc = ['-666']
			pert_desc = pert_desc[0]

			# find the best dose and cast them to lists
			base_dose = pert_doses[0]
			base_z_score = pert_scores[0]

			z_delta = (numpy.array(pert_scores) + 10) - (base_z_score + 10)
			abs_z_delta = numpy.abs(z_delta)
			z_delta =  z_delta.tolist()
			abs_z_delta = abs_z_delta.tolist()
			
			best_ind = z_delta.index(numpy.min(z_delta))
			best_dose = pert_doses[best_ind]
			best_z_score = pert_scores[best_ind]
			best_z_score_delta = z_delta[best_ind]

			if len(pert_doses) > 1:
				# build prototype curves if there is more than one dose
				linear = numpy.linspace(1,10,len(pert_doses))
				log_gen = _log_gen(1)
				log_curve = [log_gen.next() for x in range(len(pert_doses))]
				log_gen = _log_gen(.5)
				half_log_curve = [log_gen.next() for x in range(len(pert_doses))]
				log_gen = _log_gen(.25)
				quarter_log_curve = [log_gen.next() for x in range(len(pert_doses))]

				curves = numpy.array([linear,log_curve,
									  half_log_curve,quarter_log_curve])

				# get the correlation coefficient for each of the curves and the
				# current pert dose curve
				corrs = numpy.corrcoef(pert_scores,curves)
				linear_corr = corrs[0][1]
				log_corr = corrs[0][2]
				half_log_corr = corrs[0][3]
				quarter_log_corr = corrs[0][4]

				#report the best shape by finding the best absolute correlation
				abs_corr = numpy.abs(corrs[0][1:])
				if numpy.where(abs_corr > .8)[0].size > 0:
					abs_corr_max = max(abs_corr)
					abs_corr_max_ind = numpy.where(abs_corr == abs_corr_max)[0][0]
					curve_names = ['linear','log','half-log','quarter-log']
					max_curve_name = curve_names[abs_corr_max_ind]
				else:
					max_curve_name = 'none'

			else:
				# if there is only one dose, set all corrs to 'nan'
				linear_corr = 'nan'
				log_corr = 'nan'
				half_log_corr = 'nan'
				quarter_log_corr = 'nan'
				max_curve_name = 'none'



			# write the dose data to the summary file
			data = [unique_pert,pert_desc,str(base_dose),str(base_z_score),
					str(best_dose),str(best_z_score),str(best_z_score_delta),
					str(linear_corr),str(log_corr),str(half_log_corr),
					str(quarter_log_corr),max_curve_name]
			f.write('\t'.join(data) + '\n')
	prog.clear()
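
# Hedged sketch: the shape-calling step above correlates the observed dose
# curve with prototype curves via numpy.corrcoef; row 0 of the result holds the
# observed curve's correlation with each prototype. _log_gen is not shown in
# this excerpt, so a plain logarithmic ramp stands in for it here.
import numpy

toy_scores = [0.1, 0.8, 1.5, 2.9]                     # toy z-scores ordered by dose
linear = numpy.linspace(1, 10, len(toy_scores))
log_curve = numpy.log10(numpy.linspace(1, 10, len(toy_scores)))
curves = numpy.array([linear, log_curve])
corrs = numpy.corrcoef(toy_scores, curves)            # (1 + n_curves) x (1 + n_curves)
print(corrs[0][1:])                                   # correlation with each prototype
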
Example 10
    def load_expression_data(self,
                             max_signatures_per_cp=3,
                             groups_to_model=None,
                             keep_by_cell_line=False):
        '''
        -search for z-score data of compounds that fall into one of the different classes
        -limit the number of signatures per compound
        -load in z-score data signatures
        
        Parameters
        ----------
        groups_to_model : list
            -list of group names in the pclDict
            -default is to use all keys
        max_signatures_per_cp : int
            maximum number of signatures per compound to incorporate into the classifier
            (to avoid overfitting to compounds with many signatures)
        keep_by_cell_line : bool
            -if True - keep n signatures per cell line
            -if False - keep first n signatures regardless of cell line

        '''
        if groups_to_model is None:
            groups_to_model = self.pclDict.keys()
        brdAllGroups = []
        for group in groups_to_model:
            brdAllGroups.extend(self.pclDict[group])
        CM = mu.CMapMongo()
        # set minimum dose
        goldQuery = CM.find(
            {
                'is_gold': True,
                'pert_id': {
                    '$in': brdAllGroups
                },
                'pert_dose': {
                    '$gt': 1
                }
            },  #, 
            {
                'sig_id': True,
                'pert_id': True,
                'cell_id': True,
                'pert_time': True,
                'is_gold': True,
                'pert_iname': True
            },
            toDataFrame=True)
        goldQuery.index = goldQuery['sig_id']
        # assign drug class labels
        goldQuery = self.set_class_labels(goldQuery)
        # reduce signatures to prevent overfitting to one compound
        droppedQ = self.cut_signatures(goldQuery,
                                       nKeep=max_signatures_per_cp,
                                       keep_by_cell_line=keep_by_cell_line)
        sigList = droppedQ['sig_id'].values
        ### load in expression data for the two sets of signatures
        afPath = cmap.score_path
        gt = gct.GCT()
        gt.read(src=afPath, cid=sigList, rid='lm_epsilon')
        zFrm = gt.frame
        zFrm = zFrm.T
        probeIDs = zFrm.columns
        self.probe_ids = probeIDs
        ## merge z-score data with signature annotations
        zFrm = pd.concat([zFrm, droppedQ], axis=1)
        self.signature_frame = zFrm
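
# Hedged sketch with toy frames: after transposing, the z-score frame is
# indexed by sig_id, so pd.concat(axis=1) lines each signature's scores up with
# its annotation row (the same alignment used for self.signature_frame above).
import pandas as pd

toyZ = pd.DataFrame([[0.2, -1.1], [1.4, 0.3]],
                    index=['sig_a', 'sig_b'], columns=['probe1', 'probe2'])
toyAnnot = pd.DataFrame({'pert_iname': ['cpdA', 'cpdB'], 'labels': [0, 1]},
                        index=['sig_a', 'sig_b'])
merged = pd.concat([toyZ, toyAnnot], axis=1)
print(merged.shape)  # (2, 4)
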
Example 11
 def classification_by_cell(self, loo_type='by_cp'):
     '''
     -For each of the specified cell lines, build a separate classifier
     -evaluate the model with leave-one-out cross validation
     
     Parameters
     ----------
     loo_type : str
         strategy for leave one out validation:
             'by_cp' - leaves out all signatures for a given compound
             'by_sig' - leaves out individual signatures 
     '''
     combinedFrm = pd.DataFrame()
     accuracyDict = {}
     for cellLine in self.core_cell_lines:
         CM = mu.CMapMongo()
         # goldQuery = CM.find({'is_gold' : True,'pert_id':{'$in':brdAllGroups},'cell_id':cellLine}, #,
         #         {'sig_id':True,'pert_id':True,'cell_id':True,'pert_time':True,'is_gold':True,'pert_iname':True},
         #         toDataFrame=True)
         # set minimum dose
         goldQuery = CM.find(
             {
                 'is_gold': True,
                 'pert_id': {
                     '$in': self.all_group_cps
                 },
                 'cell_id': cellLine,
                 'pert_dose': {
                     '$gt': 1
                 }
             },  #, 
             {
                 'sig_id': True,
                 'pert_id': True,
                 'cell_id': True,
                 'pert_time': True,
                 'is_gold': True,
                 'pert_iname': True
             },
             toDataFrame=True)
         goldQuery.index = goldQuery['sig_id']
         # assign drug class labels
         goldQuery = self.set_class_labels(goldQuery)
         # reduce signatures to prevent overfitting to one compound
         droppedQ = self.cut_signatures(goldQuery)
         sigList = droppedQ['sig_id'].values
         ### load in expression data for the two sets of signatures
         afPath = cmap.score_path
         gt = gct.GCT()
         gt.read(src=afPath, cid=sigList, rid='lm_epsilon')
         zFrm = gt.frame
         zFrm = zFrm.T
         probeIDs = zFrm.columns
         ## merge z-score data with signature annotations
         zFrm = pd.concat([zFrm, droppedQ], axis=1)
         ### perform leave one out validation
         if loo_type == 'by_cp':
             zFrm['svm_prediction'] = np.nan
             cpSet = set(zFrm['pert_id'])
             # loop through the compounds - leave out in building the model then test
             for brd in cpSet:
                 brd_match = zFrm['pert_id'] == brd
                 droppedFrm = zFrm[
                     ~brd_match]  # remove test signature from training
                 trainFrm = droppedFrm.reindex(columns=probeIDs)
                 labelsTrain = droppedFrm['labels'].values
                 C = 1.0  # SVM regularization parameter
                 svc = svm.SVC(kernel='linear',
                               C=C).fit(trainFrm.values, labelsTrain)
                 zTest = zFrm.ix[brd_match, probeIDs]
                 linPred = svc.predict(zTest.values)
                 zFrm['svm_prediction'][zTest.index] = linPred
         if loo_type == 'by_sig':
             predictDict = {}
             for sig in zFrm.index:
                 droppedFrm = zFrm[
                     zFrm.index !=
                     sig]  # remove test signature from training
                 trainFrm = droppedFrm.reindex(columns=probeIDs)
                 labelsTrain = droppedFrm['labels'].values
                 C = 1.0  # SVM regularization parameter
                 svc = svm.SVC(kernel='linear',
                               C=C).fit(trainFrm.values, labelsTrain)
                 zTest = zFrm.ix[sig, probeIDs]
                 linPred = svc.predict(zTest.values)
                 predictDict[sig] = linPred[0]
             predSer = pd.Series(predictDict)
             predSer.name = 'svm_prediction'
             zFrm = pd.concat([zFrm, pd.DataFrame(predSer)], axis=1)
         combinedFrm = pd.concat([combinedFrm, zFrm], axis=0)
         accuracyArray = zFrm['labels'] == zFrm['svm_prediction']
         accuracyRate = accuracyArray.sum() / float(accuracyArray.shape[0])
         accuracyDict[cellLine] = accuracyRate
         self.modelFrame = combinedFrm
         self.model_accuracy = accuracyDict
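
# Hedged sketch: the 'by_cp' branch above is a leave-one-compound-out scheme --
# every signature of one pert_id is held out, a linear SVM is fit on the rest,
# and the held-out signatures are predicted. A minimal standalone version with
# toy data (column layout assumed, not taken from the original):
import numpy as np
import pandas as pd
from sklearn import svm

toy = pd.DataFrame(np.random.randn(8, 4))             # 8 toy signatures x 4 probes
toy['pert_id'] = ['a', 'a', 'b', 'b', 'c', 'c', 'd', 'd']
toy['labels'] = [0, 0, 0, 0, 1, 1, 1, 1]
preds = pd.Series(index=toy.index, dtype=float)
for brd in set(toy['pert_id']):
    held_out = toy['pert_id'] == brd
    train = toy[~held_out]
    clf = svm.SVC(kernel='linear', C=1.0).fit(train.iloc[:, :4].values,
                                              train['labels'].values)
    preds[held_out] = clf.predict(toy.loc[held_out].iloc[:, :4].values)
accuracy = (preds == toy['labels']).mean()
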
Example 12
with open(targetSheetF, 'rt') as f:
    for string in f:
        splt = string.split('\r')
        for i, line in enumerate(splt):
            splt2 = line.split('\t')
            pID = splt2[0]  #the pert_id listed in the line
            pDesc = splt2[1]
            targets = splt2[2]
            targets = targets.split(';')
            if targets[0] == '' or targets[0] == '?' or targets[0] == '-666':
                continue
            else:
                targetDict[pID] = targets
                pDescDict[pID] = pDesc

db = mu.CMapMongo()
test1 = db.find({
    'cell_id': 'A375',
    'is_gold': True,
    'pert_type': 'trt_oe'
}, {'sig_id': 1})
test2 = db.find({
    'cell_id': 'A375',
    'is_gold': True,
    'pert_type': 'trt_sh'
}, {'sig_id': 1})
test1 = test1[1:10]
test2 = test2[1:10]
# t = dgo.Oracle(test1,test2,out=work_dir + '/Oracle')
# t.compute_scores()
# t.get_results()
Example 13
def external_qq(args,work_dir):
	'''
	make a qq plot of each unique instance - plot the size of each probe according to how
	often it occurs in the affogato top/bottom 50 list
	'''
	#make a gct object
	db = gct.GCT()
	db.read(args.res)

	qPert = db.get_column_meta('pert_desc')
	qPertID = db.get_column_meta('pert_id')
	qDose = db.get_column_meta('pert_dose')
	probeIDs = db.get_row_meta('id')

	#set null distribution of z-scores (currently normal)
	ESmat = db.matrix
	#calculate null distribution
	mu, sigma = 0, 1
	s = numpy.random.normal(mu, sigma, len(ESmat[:,1]))
	s.sort()

	pertSet = set(qPert)
	for pert in pertSet:
		iP = _all_indices(pert, qPert) #index of doses on plate
		if len(iP) < 2:
			print pert + ' has only one instance'
			continue
		uDose = [qDose[i] for i in iP]
		fDose = [float(x) for x in uDose] #convert strings to float
		aDose = numpy.asarray(fDose) #convert to numpy array
		iD = aDose.argsort() #local ordering
		sDose = [fDose[j] for j in iD] #sort local doses
		iPo =  [iP[i] for i in iD] #ordered index
		#sMat = ESmat[:,iPo]
		#sMat.sort(axis=0)
		#mongo query for each unique pertID
		qStr = qPertID[iPo[0]] #set pertID
		if len(qStr) >= 13:
			qStr = qStr[0:13] #shorten qPertID
		CM = mutil.CMapMongo()
		#cmpdSigIds = CM.find({'pert_id':{'$regex':qStr}},{'sig_id':True})
		edge50Lst = CM.find({'pert_id':{'$regex':qStr}},{'sig_id':True,'up50_lm':True,'dn50_lm':True,'cell_id':True}) #search for the BRD-xxxxxxxxxxx within the pert_id field in the db
		nInstances = len(edge50Lst) #number of instances in db
		#count number of times a probe is in the top/bottom 50 genes of an instance
		upProbeCnts = [0] * len(probeIDs)
		dnProbeCnts = [0] * len(probeIDs)
		for j,inst in enumerate(edge50Lst):
			up50 = edge50Lst[j]['up50_lm']
			dn50 = edge50Lst[j]['dn50_lm']
			#loop through every gene in the top and bottom list - where does it live on the rank list?
			for prb in up50:
				if prb in probeIDs:
					iPrb = probeIDs.index(prb)
					upProbeCnts[iPrb] = upProbeCnts[iPrb] +1
			for prb in dn50:
				if prb in probeIDs:
					iPrb = probeIDs.index(prb)
					dnProbeCnts[iPrb] = dnProbeCnts[iPrb] +1
		#loop through each dose
		for d in iPo:
		#count probe enrichment and plot
				cmpd1 = qPert[d]
				dose1 = qDose[d]
				zLst = db.matrix[:,d]
				iLst = zLst.argsort() #sort z-scores and save index
				sLst = zLst[iLst]
				sUpProbeCnts = [upProbeCnts[l] for l in iLst] #sort probe counts according to z-score
				sDnProbeCnts = [dnProbeCnts[l] for l in iLst]
				#mkrs = numpy.sqrt(sprobeCnts) # non linear scaling of marker points
				sUpProbeCnts = [float(l) for l in sUpProbeCnts] #convert to float
				sDnProbeCnts = [float(l) for l in sDnProbeCnts] #convert to float
				# upPercMkrs = numpy.divide(sUpProbeCnts,max(sUpProbeCnts)) #divide by max count to make for relative frequency
				# dnPercMkrs = numpy.divide(sDnProbeCnts,max(sDnProbeCnts))
				upPercMkrs = numpy.divide(sUpProbeCnts,nInstances) #divide by total instances to get relative frequency
				dnPercMkrs = numpy.divide(sDnProbeCnts,nInstances)
				upMkrs = numpy.multiply(upPercMkrs,100)
				dnMkrs = numpy.multiply(dnPercMkrs,100)
				fig = plt.figure()
				ax = fig.add_subplot(111)
				ax.plot(s,s,'b')
				for j,sl in enumerate(sLst):
					ax.plot(s[j],sl,'r.',markersize=upMkrs[j],alpha=.25)
					ax.plot(s[j],sl,'b.',markersize=dnMkrs[j],alpha=.25)
				ax.set_ylabel('observed z-score')
				ax.set_xlabel('expected z-score')
				# set legend based on the number of instances
				r1 = ax.plot(0,0,'r.',markersize=100,alpha=.25)
				b1 = ax.plot(0,0,'b.',markersize=100,alpha=.25)
				legStrUp = 'probe in 100% of ' + str(nInstances) + ' UP instances'
				legStrDn = 'probe in 100% of ' + str(nInstances) + ' DN instances'
				plt.legend([r1, b1], [legStrUp, legStrDn], numpoints=1, loc=4)
				#plt.legdend([b1], ['probe in 100% of ' + str(nInstances) + 'instances' ], numpoints=1)
				ax.set_title(pert + ' dose = ' + dose1)
				fname = pert + '_' + dose1 + 'um_connection_qq.png'
				outf = os.path.join(work_dir,fname)
				plt.savefig(outf, bbox_inches=0)
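
# Hedged sketch: the qq comparison above boils down to plotting sorted observed
# z-scores against equally many sorted draws from a standard normal (the null
# distribution set earlier in this example).
import numpy
import matplotlib.pyplot as plt

observed = numpy.random.normal(0, 1.5, 978)        # toy z-scores for one signature
expected = numpy.random.normal(0, 1, len(observed))
observed.sort()
expected.sort()
plt.plot(expected, expected, 'b')                  # reference line
plt.plot(expected, observed, 'r.')
plt.xlabel('expected z-score')
plt.ylabel('observed z-score')
plt.savefig('toy_qq.png')
plt.close()
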
Example 14
def analyze_query(args,work_dir):
	'''
	Analyze the output from query_tool - find self-connections and create graphs
	'''
	#make a gct object
	db = gct.GCT()
	db.read(args.res)

	##load query result - gctx file
	rslt = gct.GCT()
	#if specific result directory is specified, use that - otherwise get gctx from working dir
	if args.result:
		outGctx = glob.glob(os.path.join(work_dir, '*COMBINED*.gctx')) #select combined result gctx in working dir created from build_query step
		rslt.read(outGctx[0])
	else:
		rslt.read(args.resultDir)

	rsltSigID = rslt.get_rids() #sig IDs from result file

	qPert = db.get_column_meta('pert_desc')
	qPertID = db.get_column_meta('pert_id')
	qDose = db.get_column_meta('pert_dose')
	ESmat = rslt.matrix
	iES = ESmat.argsort(axis=0)[::-1] #sort descending
	n_inst = len(iES[:,1])

	#loop through each of the perts - graph ranks of query
	prog1 = progress.DeterminateProgressBar('creating self-connection graphs')
	avRnk = []
	medRnk = []
	prRnk = []
	#loop through each of the UNIQUE perts - graph ranks of query
	pertSet = set(qPert)
	for pert in pertSet:
		cmpd1 = pert
		iP = _all_indices(pert, qPert) #index of doses on plate
		if len(iP) < 2:
			print pert + ' has only one instance'
			continue
		uDose = [qDose[i] for i in iP]
		fDose = [float(x) for x in uDose] #convert strings to float
		aDose = numpy.asarray(fDose) #convert to numpy array
		iD = aDose.argsort() #local ordering
		sDose = [fDose[j] for j in iD] #sort local doses
		iPo =  [iP[i] for i in iD] #ordered index
		qStr = qPertID[iPo[0]] #set pertID
		if len(qStr) >= 13:
			qStr = qStr[0:13] #shorten qPertID
		#run pymongo query
		CM = mutil.CMapMongo()
		#cmpdSigIds = CM.find({'pert_id':qStr},{'sig_id':True})
		cmpdSigIds = CM.find({'pert_id':{'$regex':qStr}},{'sig_id':True}) #search for the BRD-xxxxxxxxxxx within the pert_id field in the db
		if len(cmpdSigIds) < 2:
			print cmpd1 + ' has one or no instances in the cmap database'
			continue
		#loop through each dose
		for d in iPo:
		#count probe enrichment and plot
				cmpd1 = qPert[d]
				dose1 = qDose[d]
				iE = iES[:,d] #ES sort index for one column
				sSigID = []
				for y in iE:
					sSigID.append(rsltSigID[y]) #make sorted sig ID list
				i1 = [sSigID.index(y) for y in cmpdSigIds] #where instances of the compound of interest sit on the rank list
				i2 = numpy.array(i1) #convert list to numpy array
				avr = sum(i2)/len(i2) #what is the average ES rank
				md = numpy.median(i2) # what is the median ES rank
				nAv = float(avr)/n_inst #normalize according to number of instances in db
				nMd = float(md)/len(iES[:,1]) #normalized median
				i1.sort()
				nTop = 1000
				ntop = [x for x in i1 if x <= nTop]
				nPr = float(len(ntop))/(len(i1)) #fraction of instances at the top of the list
				prRnk.append(nPr)
				avRnk.append(nAv) #store average ES rank
				medRnk.append(nMd)
				#plot
				fname = cmpd1 + '_' + dose1 + '_query_rank.png'
				outf = os.path.join(work_dir,fname)
				fig = plt.figure(figsize=(8.0, 2.0))
				ax = fig.add_subplot(111)
				# the histogram of the data
				n, bins, patches = ax.hist(i2, 30, facecolor='green', alpha=0.75)
				#ax.set_xlim(0, n_inst)
				ax.set_xlim(0, int(round(n_inst,-5))) #round instances to nearest 100k
				ax.set_xlabel('query rank')
				ax.set_ylabel('freq')
				ax.set_title('dose = '+ str(dose1) +'um')
				ax.grid(True)
				plt.savefig(outf, bbox_inches=0)
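
# Hedged sketch: the self-connection metrics above reduce to where a compound's
# own signatures land in the sorted query ranks, normalized by the total number
# of instances; with toy rank positions:
import numpy

toy_ranks = numpy.array([12, 430, 55000])   # toy ranks of self-connections
toy_n_inst = 400000                         # toy total number of instances in the result
norm_mean = toy_ranks.mean() / toy_n_inst   # normalized average rank
norm_median = numpy.median(toy_ranks) / toy_n_inst
frac_top = numpy.mean(toy_ranks <= 1000)    # fraction of self-connections in the top 1000
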
Example 15
    os.mkdir(wkdir)

cliqueGMT = gmt.read(gFile)
cliqFrm = pd.DataFrame(cliqueGMT)
# unstack nested list
cliqMemberLong = [item for sublist in cliqFrm.sig.values for item in sublist]
cliqMemb = list(set(cliqMemberLong))

# load summly matrix
summMtrx = '/xchip/cogs/projects/connectivity/summly/matrices/matched_mrp4_n7147x7147.gctx'
gt = gct.GCT()
gt.read(summMtrx)
summFrm = gt.frame

### get info on drug signatures
MC = mu.CMapMongo()
pertInfo = MC.find({'pert_id': {
    '$in': cliqMemb
}}, {
    'sig_id': True,
    'cell_id': True,
    'pert_id': True,
    'pert_iname': True,
    'is_gold': True
},
                   toDataFrame=True)

# tabulate signature stats
pertGrped = pertInfo.groupby('pert_id')
nDrugs = len(pertGrped.groups)
Zs = np.zeros((nDrugs, 7))
Example 16
# get directory
dir1 = '/xchip/cogs/projects/TRIB1' 
wkdir = dir1 + '/TRIB1_analysis_Oct21'
if not os.path.exists(wkdir):
    os.mkdir(wkdir)

#define compounds of interest
trib1Cps = ['BRD-K75627148', 
    'BRD-K35860134', 
    'BRD-K67774729', 
    'BRD-K16956545', 
    'BRD-K16410418']

### 1 ) get signature info
CM = mu.CMapMongo()
qRes = CM.find({'pert_id':{'$in':trib1Cps}, 'is_gold' : True},
        {'sig_id':True,'pert_iname':True,'pert_id':True,'pert_mfc_id':True,'cell_id':True,'pert_time':True,'is_gold':True},
        toDataFrame=True)
qRes.index = qRes['sig_id']
outF = wkdir + '/TRIB1_signature_details.txt'
qRes.to_csv(outF,sep='\t',index=False)

grped = qRes.groupby('pert_id')
for grp in grped.groups:
    nSig = len(grped.groups[grp])
    # print grp + ' ' + str(nSig)
    print str(nSig)

### descriptive data on signatures
#number of cell lines per compound
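
# Hedged sketch completing the truncated step above: the number of cell lines
# per compound can be read off the query frame with a groupby; nunique counts
# distinct cell_id values per pert_id.
cellsPerCp = qRes.groupby('pert_id')['cell_id'].nunique()
print(cellsPerCp)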