Example #1
def fpr_calc_parallel(inSum,
                      outSum,
                      dmsoSum,
                      matrixType,
                      rnkptRange,
                      graph=True,
                      fpr_max=False):
    '''
    -calculate the rate of false positives for bioactive signatures vs. DMSO
    -make heatmap

    Parameters:
    -----------
    fpr_max: bool
        if false positive rate is above 1, set to 1

    '''
    #### false positive calculation in parallel ####
    n_procs = 30
    tupList = [(inSum, dmsoSum, x) for x in range(0, 100)]
    prog = update.DeterminateProgressBar('false positive calculation')
    pool = multiprocessing.Pool(n_procs)
    rs = pool.map_async(_cdf_worker, tupList)
    pool.close()  # No more work
    while (True):
        if (rs.ready()): break
        remaining = rs._number_left
        prog.show_message(
            'false positive calculation - {0} tasks to complete'.format(remaining))
        time.sleep(0.1)
    results = rs.get()
    fpFrame = pd.concat(results, axis=1, keys=[s.name for s in results])
    return fpFrame
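The pattern above fans work out with map_async, closes the pool, and polls rs.ready() while reporting rs._number_left (a private attribute of AsyncResult, so treat it as informational only). A minimal self-contained sketch of the same pattern, with a hypothetical _toy_worker standing in for _cdf_worker:

import multiprocessing
import time

def _toy_worker(params):
    '''stand-in worker: unpack a 2-tuple and return the sum'''
    a, b = params
    time.sleep(0.01)  # simulate work
    return a + b

if __name__ == '__main__':
    tupList = [(x, x + 1) for x in range(10)]
    pool = multiprocessing.Pool(4)
    rs = pool.map_async(_toy_worker, tupList)
    pool.close()  # no more work will be submitted
    while not rs.ready():  # poll until all tasks finish
        time.sleep(0.1)
    results = rs.get()  # re-raises here if any worker failed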
Example #2
    def read_gctx_row_meta(self, src, row_inds=None, verbose=True):
        '''
        read the row meta data from the file given in src.  If row_inds is given, only
        those rows specified are read.  
        '''
        #open an update indicator
        if verbose:
            progress_bar = update.DeterminateProgressBar('GCTX_READER')

        #open the gctx file
        self._open_gctx(src)

        #set up the indices
        if not row_inds:
            row_inds = range(len(self.row_id_node))

        #read in the row meta data
        row_headers = [x.name for x in self.row_data]
        row_headers.insert(0, 'ind')
        self._add_table_to_meta_db("row", row_headers)
        num_rows = len(row_inds)
        for i, ind in enumerate(row_inds):
            if verbose:
                progress_bar.update('reading row meta data', i, num_rows)
            data_list = [ind]
            for column in self.row_data:
                data_list.append(str(column[ind]).rstrip())
            self._add_row_to_meta_table("row", data_list)

        #clear the update indicator
        if verbose:
            progress_bar.clear()

        #close the gctx file
        self._close_gctx()
Example #3
    def read_gctx_col_meta(self, src, col_inds=None, verbose=True):
        '''
        read the column meta data from the file given in src.  If col_inds is given, only
        those columns specified are read.
        '''
        #open an update indicator
        if verbose:
            progress_bar = update.DeterminateProgressBar('GCTX_READER')

        #open the gctx file
        self._open_gctx(src)

        #set up the indices
        if not col_inds:
            col_inds = range(len(self.column_id_node))

        #read in the column meta data
        column_headers = [x.name for x in self.column_data]
        column_headers.insert(0, 'ind')
        self._add_table_to_meta_db("col", column_headers)
        num_rows = len(col_inds)
        for i, ind in enumerate(col_inds):
            if verbose:
                progress_bar.update('reading column meta data', i, num_rows)
            data_list = [ind]
            for column in self.column_data:
                data_list.append(str(column[ind]).rstrip())
            self._add_row_to_meta_table("col", data_list)

        #clear the update indicator
        if verbose:
            progress_bar.clear()

        #close the gctx file
        self._close_gctx()
Example #4
    def _read_gct(self, src, verbose=True, frame=True):
        '''
        reads tab delimited gct file
        '''
        #open an update indicator
        if verbose:
            progress_bar = update.DeterminateProgressBar('GCT_READER')

        #open the file
        f = open(src, 'rb')
        reader = csv.reader(f, delimiter='\t')
        self.src = src

        #read the gct file header information and build the empty self.matrix
        #array for later use
        self.version = reader.next()[0]
        dims = reader.next()
        self.matrix = numpy.ndarray([int(dims[0]), int(dims[1])])

        #parse the first line to get sample names and row meta_data headers
        titles = reader.next()
        cid = titles[int(dims[2]) + 1:]
        row_meta_headers = titles[:int(dims[2]) + 1]
        row_meta_headers.insert(0, 'ind')
        self._add_table_to_meta_db('row', row_meta_headers)

        #parse the _meta data for the columns
        col_meta_array = []
        for ii, c in enumerate(cid):
            col_meta_array.append([ii, c])
        current_row = 0
        col_meta_headers = ['ind', 'id']
        while current_row < int(dims[3]):
            tmp_row = reader.next()
            col_meta_headers.append(tmp_row[0])
            for ii, item in enumerate(tmp_row[int(dims[2]) + 1:]):
                col_meta_array[ii].append(item)
            current_row += 1
        self._add_table_to_meta_db('col', col_meta_headers)
        for item in col_meta_array:
            self._add_row_to_meta_table('col', item)

        #parse the meta_data for the rows and store the data matrix
        for ii, row in enumerate(reader):
            row_meta_tmp = row[:int(dims[2]) + 1]
            row_meta_tmp.insert(0, ii)
            self._add_row_to_meta_table('row', row_meta_tmp)
            self.matrix[ii] = row[int(dims[2]) + 1:]
            if verbose:
                progress_bar.update('reading gct file: ', ii, int(dims[0]))

        if verbose:
            progress_bar.clear()

        #populate a data frame
        if frame:
            self.frame = pd.DataFrame(self.matrix,
                                      index=self.get_row_meta('id'),
                                      columns=self.get_column_meta('id'))
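The reader above assumes the GCT text layout: a version line, a dimensions line, then a title row holding the row-metadata headers followed by the column ids. dims[0] and dims[1] size the data matrix, dims[2] counts the row-metadata columns after 'id', and dims[3] counts the column-metadata rows. A small sketch (hypothetical helper, same Python 2 csv idiom) that reads just that header:

import csv

def peek_gct_dims(path):
    '''return (version, dims) from the first two lines of a gct file'''
    with open(path, 'rb') as f:
        reader = csv.reader(f, delimiter='\t')
        version = reader.next()[0]
        dims = [int(x) for x in reader.next()]
    return version, dims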
Example #5
def rates_of_DMSO_connections(inSum,
                              outSum,
                              dmsoSum,
                              matrixType,
                              rnkptRange,
                              graph=True,
                              fpr_max=False):
    '''
    -calculate the rate of false positives for bioactive signatures vs. DMSO

    Parameters:
    -----------
    fpr_max: bool
        if false positive rate is above 1, set to 1

    '''
    # goldSum = pd.concat([inSum,outSum],axis=0)
    ratioThresh = 3  #
    fpThresh = .25
    ratioDict = {}
    fpDict = {}
    fpFrame = pd.DataFrame()
    #### false positive calculation with loop ####
    progress_bar = update.DeterminateProgressBar(
        'connection ratio-calculation')
    for ii, rnkpt_thresh in enumerate(rnkptRange):
        progress_bar.update('observed to dmso', ii, len(rnkptRange))
        # rnkpt_thresh = 90
        # grtrThresh = inSum >= rnkpt_thresh
        grtrThresh = np.greater_equal(inSum, rnkpt_thresh)
        grtrSum = grtrThresh.sum(axis=1)
        connRate = grtrSum / float(inSum.shape[1])
        # dmso
        grtrDMSO = dmsoSum >= rnkpt_thresh
        dSum = grtrDMSO.sum(axis=1)
        dConnRate = dSum / float(dmsoSum.shape[1])
        # summly space: dmso connection rate
        obsToDmso = connRate / dConnRate
        # falsePosR = dConnRate / (dConnRate + connRate) # dmso / (dmso + obs)
        falsePosR = dConnRate / connRate  # dmso / obs
        # if false positive rate is above 1, set to 1
        if fpr_max:
            falsePosR[falsePosR >= 1] = 1
        falsePosR.name = rnkpt_thresh
        fpFrame = pd.concat([fpFrame, pd.DataFrame(falsePosR)], axis=1)
        highRatioCount = (obsToDmso >= ratioThresh).sum()
        ratioDict[rnkpt_thresh] = highRatioCount
        fpDict[rnkpt_thresh] = (falsePosR <= fpThresh).sum()
        # deal with inf
        # isInf = np.isinf(obsToDmso)
        # obsToDmso[isInf] = grtrSum[isInf] # replace inf with obs sum
        # obsToDmso = obsToDmso[~np.isnan(obsToDmso)]# remove nan
    return fpFrame
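To make the per-threshold bookkeeping concrete, here is a toy run of a single threshold on made-up data (rows are perturbations, columns are query results; falsePosR is inf wherever a perturbation has no observed connections at the threshold):

import numpy as np
import pandas as pd

obs = pd.DataFrame(np.random.uniform(-100, 100, (5, 20)))
dmso = pd.DataFrame(np.random.uniform(-100, 100, (5, 20)))
rnkpt_thresh = 90
connRate = (obs >= rnkpt_thresh).sum(axis=1) / float(obs.shape[1])
dConnRate = (dmso >= rnkpt_thresh).sum(axis=1) / float(dmso.shape[1])
falsePosR = dConnRate / connRate  # dmso rate / observed rate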
def build_combine_null(Hmtrx,cliqFrm,topMeanFrm,nTop=3,nPerm=4000):
    '''
    shuffle signatures from random drugs - keep the same group size

    Parameters
    ----------
    Hmtrx : pandas DataFrame
        matrix of NMF weightings for each signature (n_signatures x n_components)
    cliqFrm : pandas DataFrame
        signature annotations
    topMeanFrm : pandas DataFrame
        for each group: group_signature_counts and top_mean_metric
    nTop : int
        number of top largest components to sort by

    Returns
    -------
    nullMean : pandas DataFrame
        matrix of null distributions for each group size
    '''
    # count the number of signatures in the input data that belong to each group
    cliqSize = topMeanFrm.group_signature_counts
    groupSizeSet = set(cliqSize) # unique group sizes across input groups
    zFrm = np.zeros([len(groupSizeSet),nPerm])
    nullMean = pd.DataFrame(zFrm,index=np.sort(list(groupSizeSet))) # one row for each group size
    nullMean.index.name = 'group_size'
    prog = update.DeterminateProgressBar('group size')
    for ix,nGrp in enumerate(nullMean.index):
        prog.update(nGrp,ix,len(nullMean.index))
        for ir in range(nPerm):
            iRand = np.random.choice(Hmtrx.index.values,nGrp,replace=False)
            grpH = Hmtrx.reindex(iRand)
            meanVec = grpH.mean()
            #get mean of top components
            iTop3 = meanVec.order(ascending=False).index[:nTop]
            sortedTop = grpH.ix[:,iTop3].sort()
            topSum = sortedTop.sum(axis=1).order(ascending=False)
            nullMean.ix[nGrp,ir] = topSum.mean()
    return nullMean
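A hypothetical follow-up showing how the returned null might be used: compare each group's observed top-mean to the null row for its group size (the column names are taken from the docstring above, and .ix matches the old pandas idiom of these examples):

import pandas as pd

def empirical_pval(topMeanFrm, nullMean):
    '''per-group fraction of null permutations at or above the observed metric'''
    pvals = {}
    for grp in topMeanFrm.index:
        n = topMeanFrm.ix[grp, 'group_signature_counts']
        obs = topMeanFrm.ix[grp, 'top_mean_metric']
        null = nullMean.ix[n]
        pvals[grp] = (null >= obs).sum() / float(len(null))
    return pd.Series(pvals)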
Example #7
    def read_gctx_col_meta(self, src, col_inds=None, verbose=True):
        '''
        read the column meta data from the file given in src.  If col_inds is given, only
        those columns specified are read.
        '''
        #open an update indicator
        if verbose:
            progress_bar = update.DeterminateProgressBar('GCTX_READER')

        #open the gctx file
        self._open_gctx(src)

        #set up the indices
        if not col_inds:
            col_inds = range(len(self.column_id_node))

        #read in the column meta data
        column_headers = [x.name for x in self.column_data]
        column_headers.insert(0, 'ind')
        self._add_table_to_meta_db("col", column_headers)
        num_rows = len(col_inds)
        meta_data_array = numpy.empty([len(column_headers), num_rows],
                                      dtype=numpy.dtype('a400'))
        meta_data_array[0, :] = [str(x) for x in col_inds]
        for i, column in enumerate(self.column_data):
            data = column[col_inds]
            meta_data_array[i + 1, :] = [str(x).rstrip() for x in data]
        for i, col_ind in enumerate(col_inds):
            if verbose:
                progress_bar.update('reading column meta data', i, num_rows)
            data_list = list(meta_data_array[:, i])
            self._add_row_to_meta_table("col", data_list)

        #clear the update indicator
        if verbose:
            progress_bar.clear()

        #close the gctx file
        self._close_gctx()
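This version vectorizes what Example #3 does row by row: each metadata column is read once and staged in a fixed-width numpy string array before being written to the meta DB. A toy illustration of that staging step:

import numpy

cols = [numpy.array([1.0, 2.0]), numpy.array(['a ', 'b '])]
meta = numpy.empty([len(cols) + 1, 2], dtype=numpy.dtype('a400'))
meta[0, :] = [str(x) for x in range(2)]  # the 'ind' row
for i, column in enumerate(cols):
    meta[i + 1, :] = [str(x).rstrip() for x in column]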
    def permutation_template(self,n_permutation=10000):
        '''
        creates null distribution of correlations 
        returns correlations, p-values, and FDR correction 

        must add_from_gct() before running
        '''
        prog = progress.DeterminateProgressBar('template matching')
        gcto = self.gct

        doses = gcto.get_column_meta('pert_dose')
        perts = gcto.get_column_meta('pert_id')
        rids = gcto.get_rids()

        examineList = self.perts_at_dose
        num_perts = len(examineList)
        templateMatchInd = {} #nested dict for index of significant probes
        pvecDictParametric = {}
        pvecDictEmpirical = {}
        corr_null_distribution = {}
        probe_template_corrs = {}
        for icmpd,unique_pert in enumerate(examineList):
            prog.update('template match {0}'.format(unique_pert),icmpd,num_perts)
            cid_inds = [i for i,x in enumerate(perts) if unique_pert in x]
            pert_doses = [float(doses[x]) for x in cid_inds]
            tmp_tup = zip(pert_doses,cid_inds)
            tmp_tup.sort()
            pert_doses,cid_inds = zip(*tmp_tup)
            pert_data = gcto.matrix[:,cid_inds]
            template_names = ['linear', 'log10', 'log2']
            templateMatchInd[unique_pert] = {}
            pvecDictParametric[unique_pert] = {}
            pvecDictEmpirical[unique_pert] = {}
            corr_null_distribution[unique_pert] = {}
            probe_template_corrs[unique_pert] = {}
            for istep,step in enumerate(template_names):
                template1 = step
                if step == 'linear':
                    template_curve = np.array(pert_doses)
                elif step == 'log10':
                    template_curve = np.log10(pert_doses)
                elif step == 'log2':
                    template_curve = np.log2(pert_doses)
                else:
                    print 'template name error'
                # calculate stats on observation of interest
                cc_list = [stats.pearsonr(pert_data[x,:],template_curve) for x in range(len(rids))]
                rho_vec = np.array([cc_list[x][0] for x in range(len(rids))])
                p_vec = np.array([cc_list[x][1] for x in range(len(rids))])
                pvecDictParametric[unique_pert][template1] = p_vec
                probe_template_corrs[unique_pert][template1] = rho_vec
                # run permutations to create null distribution of corr values
                ### full matrix of permutations
                nMtrxPerm = n_permutation/len(rids) + 1 #number of matrix permutations needed to reach desired probe perms
                permRhoMtrx = np.zeros((len(rids),nMtrxPerm))
                for perm in range(nMtrxPerm):
                    iRandObs = range(pert_data.shape[1])
                    np.random.shuffle(iRandObs)
                    corrs = np.corrcoef(template_curve,pert_data[:,iRandObs])
                    permRhoMtrx[:,perm] = corrs[0,1:]
                    #test to see if two calculation methods are the same to a given precision
                    # cc_list = [stats.pearsonr(pert_data[x,iRandObs],template_curve) for x in range(len(rids))] #this takes too long
                    # rho_vec = [cc_list[x][0] for x in range(len(rids))]                 
                    # np.allclose(perm_list, np.array(rho_vec),rtol=1e-06)
                #calculate p-value based on null distribution
                grtrMtrx = np.greater(np.abs(permRhoMtrx.T),np.abs(rho_vec))
                null_pVec1 = (1 + np.sum(grtrMtrx, axis=0)) / float(1 + nMtrxPerm)
                #compare observed gene to all null genes
                null_pVec = np.zeros_like(rho_vec)
                for igene in range(len(rho_vec)):
                    rho = rho_vec[igene]
                    p = np.sum(np.abs(permRhoMtrx.flatten()) > np.abs(rho)) /float(len(permRhoMtrx.flatten()))
                    null_pVec[igene] = p
                ## select probe perms1
                num_probes = self.gct.matrix.shape[0]
                probe_inds = range(num_probes)
                perm_cc = []
                for i in range(n_permutation):
                    perm_curve_inds = [random.sample(probe_inds,1)[0] for x in range(len(pert_doses))]
                    perm_curve = [pert_data[perm_curve_inds[x],x] for x in range(len(pert_doses))]
                    perm_covar = np.corrcoef(perm_curve,template_curve)
                    perm_cc.append(perm_covar[0][1])
                corr_null_distribution[unique_pert][template1] = perm_cc
                null_pVec = np.zeros_like(rho_vec)
                for igene in range(len(rho_vec)):
                    rho = rho_vec[igene]
                    p = np.sum(np.abs(perm_cc) > np.abs(rho)) /float(len(perm_cc))
                    null_pVec[igene] = p
                pvecDictEmpirical[unique_pert][template1] = null_pVec
                ### thresholding
                q = .1 #FDR threshold
                pID, pN = FDR.FDR(p_vec,q) #find FDR threshold
                if type(pID) == list:
                    print unique_pert + ' matching to ' + template1 + ' template - perturbation does not have any significant genes that pass the FDR threshold'
                    templateMatchInd[unique_pert][template1] = []
                    continue
                else:
                    pass_fdr = np.less_equal(p_vec,pID) 
                    ipass_fdr = np.array(range(len(rids)))[pass_fdr] #get indices which pass fdr
                    iRhoSort = np.argsort(rho_vec[ipass_fdr])[::-1]
                    iRhoSorted_passFDR = ipass_fdr[iRhoSort] #these are indices which pass FDR and are sorted by correlation
                    data_pass_fdr = pert_data[iRhoSorted_passFDR,:]
                    ordered_rids = [rids[i] for i in iRhoSorted_passFDR]
                    templateMatchInd[unique_pert][template1] = iRhoSorted_passFDR
        self.templateMatchInd = templateMatchInd
        self.pvecDictParametric = pvecDictParametric
        self.probe_template_corrs = probe_template_corrs
        self.pvecDictEmpirical = pvecDictEmpirical
        self.corr_null_distribution = corr_null_distribution
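For reference, the smoothed empirical p-value used in the matrix-permutation branch above has this standalone form (the add-one smoothing keeps p > 0 even when no null correlation beats the observed one):

import numpy as np

def perm_pval(rho, null_rhos):
    '''two-sided empirical p-value of rho against permuted correlations'''
    null_rhos = np.asarray(null_rhos)
    return (1 + np.sum(np.abs(null_rhos) >= np.abs(rho))) / float(1 + len(null_rhos))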
hogTargetDict = {}
for brd in hogSet:
    if brd in targetDict:
        hogTargetDict[brd] = targetDict[brd]

####################################
### make query with only HOG plates
####################################
dg = dgo.QueryTargetAnalysis(out=work_dir)
pert_list = list(hogSet)
is_gold = False
genomic_pert = 'KD'
brdCounts = []
fullPertList = []
### for each drug perturbations of interest - find all instances in CMAP
prog = progress.DeterminateProgressBar('perturbation cid query')
if pert_list:
    for i, pert in enumerate(pert_list):
        prog.update('querying cps', i, len(pert_list))
        CM = mu.CMapMongo()
        if is_gold == True:
            pert_query = CM.find(
                {
                    'sig_id': {
                        '$regex': 'HOG'
                    },
                    'pert_id': {
                        '$regex': pert
                    },
                    'is_gold': True
                }, {
Example #10
    ID_pass_vec = np.less_equal(p, ID_thresh_vec)
    if any(ID_pass_vec):
        pID = max(p[ID_pass_vec])
    else:
        pID = []
    # Nonparametric threshold
    N_thresh_vec = (I * q) / V / cVN
    N_pass_vec = np.less_equal(p, N_thresh_vec)
    if any(N_pass_vec):
        pN = max(p[N_pass_vec])
    else:
        pN = []
    return pID, pN
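The fragment above is the tail of a two-threshold FDR routine in the Benjamini-Hochberg style: pID for independent or positively dependent tests, pN for the nonparametric variant with a c(V) correction. A self-contained sketch of the independence case, not necessarily identical to the original FDR.FDR:

import numpy as np

def fdr_bh(p_values, q=0.1):
    '''largest p-value passing the Benjamini-Hochberg criterion, or None'''
    p = np.sort(np.asarray(p_values))
    m = len(p)
    ranks = np.arange(1, m + 1)
    passing = p <= ranks * q / m
    if passing.any():
        return p[passing].max()
    return None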


prog = progress.DeterminateProgressBar('Template Heatmaps')

cell = 'PC3'
tim = '6H'
cellLine = cell
timeP = tim
#load data
refControl = 'pc'  #use pc vs vc controlled data
# gctfile = glob.glob('/xchip/obelix/pod/brew/%s/PRISM001_%s_%s/by_pert_id_pert_dose/PRISM001_%s_%s_COMPZ.MODZ_SCORE_LM_*.gctx' % (refControl,cellLine,timeP,cellLine,timeP))
# load in zscore roast data
# gctfile = glob.glob('/xchip/obelix/pod/brew/%s/PRISM001_%s_%s/PRISM001_%s_%s_ZSPCQNORM_*.gctx' % (refControl,cellLine,timeP,cellLine,timeP))
# load in brewed by rna well
gctfile = glob.glob(
    '/xchip/obelix/pod/brew_tmp/%s/PRISM001_%s_%s/by_rna_well/PRISM001_%s_%s_COMPZ.MODZ_SCORE_LM_*.gctx'
    % (refControl, cellLine, timeP, cellLine, timeP))
# load in brewed by rna well - INFERRED
# sco = sc.SC()
# sco.add_sc_from_gctx_meta(gctfile, verbose=False)
# dose = [float(x.split('::')[0].split(':')[2]) for x in sco.pid]



# p-value by permutation 


'''
matches data to a dose template 
returns p-values, FDR correction 

must add_from_gct() before running
'''
prog = progress.DeterminateProgressBar('template matching')
gcto = dp.gct

doses = gcto.get_column_meta('pert_dose')
perts = gcto.get_column_meta('pert_id')
unique_perts = list(set(perts))
rids = gcto.get_rids()

examineList = dp.perts_at_dose
num_perts = len(examineList)
templateMatchInd = {} #nested dict for index of significant probes
for icmpd,unique_pert in enumerate(examineList):
    prog.update('template match {0}'.format(unique_pert),icmpd,num_perts)
    cid_inds = [i for i,x in enumerate(perts) if unique_pert in x]
    pert_doses = [float(doses[x]) for x in cid_inds]
    tmp_tup = zip(pert_doses,cid_inds)
Example #12
niter = 10000
direction = 'pos'

# assert self.row_groups is not None, "Must define row_groups. Call build_groups first."
# assert self.col_groups is not None, "Must define col_groups. Call build_groups first."
# assert direction in ['pos','neg','both'], "direction must be 'pos','neg',or 'both'"

# self.method=method
# self.method_ = method
summary_list = []
for q in ocl.row_groups.keys():
    print "Connecting query group %s" % q
    # row_vals = ocl.query.pctrank.ix[ocl.row_groups[q]]
    testStats = []
    counts = []
    prog = progress.DeterminateProgressBar('row test statistic')
    for ig, g in enumerate(ocl.col_groups.keys()):
        prog.update('querying cps', ig, len(ocl.col_groups.keys()))
        rnk_vals = ocl.query.pctrank.ix[ocl.row_groups[q], ocl.col_groups[g]]
        testStat = rnk_vals.prod().prod()
        testStats.append(testStat)
        count = rnk_vals.count().sum()
        counts.append(count)

    csR = ocl.dfCS.ix[brd][ind]

    cpRank = self.dfRank.ix[brd]
    cpSmRank = cpRank / 100  # convert ranks back to 0 to 1
    nCPs.append(cpRes.shape[0])
    meanSer = cpRes.mean()
    meanRnk = cpRank.mean()
Example #13
metric='wtcs'
is_gold = False
if not os.path.exists(work_dir):
    os.mkdir(work_dir)

# dpathwayList = ['PIK3CA', 'PIK3CD', 'PIK3CG', 'MTOR', 'AKT1', 'AKT2', 'PTEN']
pathwayList = ['PIK3C', 'MTOR', 'AKT', 'PTEN']
drug_list = ['BRD-K05756698','BRD-K12184916']

#cgs cell lines
CM = mu.CMapMongo()
CGSbyCell = CM.find({'pert_type':'trt_sh.cgs'},{'cell_id':True})
cgsCells = list(set(CGSbyCell))

#loop through and write cell IDs 
prog = progress.DeterminateProgressBar('genomic pert query')
for i,cell1 in enumerate(cgsCells):
    #get all CGS for a cell line
    prog.update('querying',i,len(cgsCells))
    CM = mu.CMapMongo()
    for gene in pathwayList:
        CGSbyCell = CM.find({'pert_iname':{'$regex':gene},'pert_type':'trt_sh.cgs','is_gold':True,'cell_id':cell1},{'sig_id':True,'pert_iname':True})
        if CGSbyCell:
            outdir = os.path.join(work_dir,cell1)
            if not os.path.exists(outdir):
                os.mkdir(outdir)
            nCGS = len(CGSbyCell)
            # sigF = os.path.join(outdir, cell1+ '_genomic_sig_ids_n' + str(nCGS) + '.grp')
            sigF = os.path.join(outdir, cell1+ '_genomic_sig_ids.grp')
            with open(sigF, 'a') as f:
                for sig in CGSbyCell:
Example #14
def analyze_query(args,work_dir):
	'''
	Analyze the output from query_tool - find self-connections and create graphs
	'''
	#make a gct object
	db = gct.GCT()
	db.read(args.res)

	##load query result - gctx file
	rslt = gct.GCT()
	#if specific result directory is specified, use that - otherwise get gctx from working dir
	if args.result:
		outGctx = glob.glob(os.path.join(work_dir, '*COMBINED*.gctx')) #select combined result gctx in working dir created from build_query step
		rslt.read(outGctx[0])
	else:
		rslt.read(args.resultDir)

	rsltSigID = rslt.get_rids() #sig IDs from result file

	qPert = db.get_column_meta('pert_desc')
	qPertID = db.get_column_meta('pert_id')
	qDose = db.get_column_meta('pert_dose')
	ESmat = rslt.matrix
	iES = ESmat.argsort(axis=0)[::-1] #sort descending
	n_inst = len(iES[:,1])

	#loop through each of the perts - graph ranks of query
	prog1 = progress.DeterminateProgressBar('creating self-connection graphs')
	avRnk = []
	medRnk = []
	for i, x in enumerate(qPert):
		prog1.update('graphing {0}'.format(x),i,len(qPert))
		iE = iES[:,i] #ES sort index for one column
		sSigID = []
		for y in iE:
			sSigID.append(rsltSigID[y]) #make sorted sig ID list
		qStr = qPertID[i]
		cmpd1 = x
		dose1 = qDose[i]
		if len(qStr) >= 13:
			qStr = qStr[0:13] #shorten qPertID
		#i1 = IDsorted.index(qStr) #give first index of match

		#run pymongo query
		CM = mu.CMapMongo()
		#cmpdSigIds = CM.find({'pert_id':qStr},{'sig_id':True})
		cmpdSigIds = CM.find({'pert_id':{'$regex':qStr}},{'sig_id':True}) #search for the BRD-xxxxxxxxxxx within the pert_id field in the db

		#i1 = __all_indices(qStr,sSigID)
		i1 = [sSigID.index(y) for y in cmpdSigIds] #where instances of the compound of interest sit on the rank list
		if len(i1) < 1:
			print cmpd1 + ' has no instances in the cmap database'
			continue
		i2 = numpy.array(i1) #convert list to numpy array
		avr = sum(i2)/len(i2) #what is the average ES rank
		md = numpy.median(i2) # what is the median ES rank
		nAv = float(avr)/n_inst #normalize according to number of instances in db
		nMd = float(md)/len(iES[:,1]) #normalized median
		avRnk.append(nAv) #store average ES rank
		medRnk.append(nMd)
		#plot
		fname = cmpd1 + '_' + dose1 + '_query_rank.png'
		outf = os.path.join(work_dir,fname)
		fig = plt.figure(figsize=(8.0, 2.0))
		ax = fig.add_subplot(111)
		# the histogram of the data
		n, bins, patches = ax.hist(i2, 30, facecolor='green', alpha=0.75)
		#ax.set_xlim(0, n_inst)
		ax.set_xlim(0, int(round(n_inst,-5))) #round instances to nearest 100k
		ax.set_xlabel('query rank')
		ax.set_ylabel('freq')
		ax.set_title('dose = '+ str(dose1) +'um')
		ax.grid(True)
		plt.savefig(outf, bbox_inches=0)
Example #15
rowMedian = dmsoCM.median(axis=1)


def no_diagonal_unstack(frm):
    'return an unstacked matrix without the diagonal'
    np.fill_diagonal(frm.values, np.nan)
    overlapSer = frm.unstack()
    overlapSer = overlapSer[~overlapSer.isnull()]  #remove nulls
    return overlapSer


### compare observed to null
#construct
graph = False
pvalDict = {}
progress_bar = update.DeterminateProgressBar('group p-val computation')
for iicliq, icliq in enumerate(cliqueLabels.index):
    progress_bar.update('count', iicliq, len(cliqueLabels.index))
    cName = cliqueLabels.ix[icliq, 'id']
    pIds = cliqueLabels.ix[icliq, 'sig']
    smFrm = sFrm.reindex(index=pIds, columns=pIds)
    uFrm = no_diagonal_unstack(smFrm)
    medObs = uFrm.median()
    rMed = rowMedian[pIds]
    fig = plt.figure(1, figsize=(10, 10))
    # make matrix of equal size using null
    nperm = 10000
    permDict = {}
    for iperm in range(nperm):
        iRand = np.random.choice(range(0, dmsoFrm.shape[1]), size=(len(pIds)))
        iRandCol = dmsoFrm.columns[iRand]  #random column names
Example #16
def build_probe_curves_and_summary(args,work_dir):
	'''
	builds dose response curves for the specified probe
	'''
	# instantiate a progress object
	prog = progress.DeterminateProgressBar('Dose Analysis')

	# read the specified probe from the input gctx file
	gcto = gct.GCT()
	probe_ind = gcto.get_gctx_rid_inds(args.res,match_list=args.probe,exact=True)
	gcto.read_gctx_matrix(args.res,row_inds=probe_ind)

	# grab the cids from the file and mine dose information from them.  Find all of 
	# the unique perts
	cids = gcto.get_gctx_cid(args.res)
	doses = [float(x.split(':')[2]) for x in cids]
	perts = [x.split(':')[1] for x in cids]
	unique_perts = list(set(perts))
	
	# for each unique pert_id, find the dose that deviates from the base dose the most.
	# Do template matching to prototype curves. Output a report
	num_perts = len(unique_perts)
	CM = mu.CMapMongo()
	with open(os.path.join(work_dir,args.probe + '_summary.txt'),'w') as f:
		headers = ['pert_id','pert_desc','base_dose','base_z_score',
				   'best_dose','best_z_score', 'best_z_score_delta',
				   'linear','log','half-log','quarter-log','called shape']
		f.write('\t'.join(headers) + '\n')
		for i,unique_pert in enumerate(unique_perts):
			prog.update('analyzing {0}'.format(args.probe),i,num_perts)
			
			# grab the z-scores and doses for the current pert and sort the pairs
			# by dose
			cid_inds = [i for i,x in enumerate(cids) if unique_pert in x]
			pert_scores = gcto.matrix[0,cid_inds]
			pert_doses = [doses[x] for x in cid_inds]
			tmp_tup = zip(pert_doses,pert_scores)
			tmp_tup.sort()
			pert_doses,pert_scores = zip(*tmp_tup)

			# build the dose response plot for the current pert and save it to disk
			plt.plot(pert_doses,pert_scores)
			plt.title('::'.join([unique_pert,args.probe]))
			plt.xlabel('dose')
			plt.ylabel('z-score')
			plt.savefig(os.path.join(work_dir,'_'.join([unique_pert.replace(':','_'),args.probe,'dose_curve.png'])))
			plt.close()

			# grab the pert_desc from mongo
			pert_desc = CM.find({'pert_id':unique_pert},{'pert_desc':True},limit=1)
			if not pert_desc:
				pert_desc = ['-666']
			pert_desc = pert_desc[0]

			# find the best dose and cast them to lists
			base_dose = pert_doses[0]
			base_z_score = pert_scores[0]

			z_delta = (numpy.array(pert_scores) + 10) - (base_z_score + 10)
			abs_z_delta = numpy.abs(z_delta)
			z_delta =  z_delta.tolist()
			abs_z_delta = abs_z_delta.tolist()
			
			best_ind = z_delta.index(numpy.min(z_delta))
			best_dose = pert_doses[best_ind]
			best_z_score = pert_scores[best_ind]
			best_z_score_delta = z_delta[best_ind]

			if len(pert_doses) > 1:
				# build prototype curves if there is more than one dose
				linear = numpy.linspace(1,10,len(pert_doses))
				log_gen = _log_gen(1)
				log_curve = [log_gen.next() for x in range(len(pert_doses))]
				log_gen = _log_gen(.5)
				half_log_curve = [log_gen.next() for x in range(len(pert_doses))]
				log_gen = _log_gen(.25)
				quarter_log_curve = [log_gen.next() for x in range(len(pert_doses))]

				curves = numpy.array([linear,log_curve,
									  half_log_curve,quarter_log_curve])

			# get the correlation coefficient for each of the curves and the
			# current pert dose curve
				corrs = numpy.corrcoef(pert_scores,curves)
				linear_corr = corrs[0][1]
				log_corr = corrs[0][2]
				half_log_corr = corrs[0][3]
				quarter_log_corr = corrs[0][4]

				#report the best shape by finding the best absolute correlation
				abs_corr = numpy.abs(corrs[0][1:])
				if numpy.where(abs_corr > .8)[0].size > 0:
					abs_corr_max = max(abs_corr)
					abs_corr_max_ind = numpy.where(abs_corr == abs_corr_max)[0][0]
					curve_names = ['linear','log','half-log','quarter-log']
					max_curve_name = curve_names[abs_corr_max_ind]
				else:
					max_curve_name = 'none'

			else:
				# if there is only one dose, set all corrs to 'nan'
				linear_corr = 'nan'
				log_corr = 'nan'
				half_log_corr = 'nan'
				quarter_log_corr = 'nan'
				max_curve_name = 'none'



			# write the dose data to the summary file
			data = [unique_pert,pert_desc,str(base_dose),str(base_z_score),
					str(best_dose),str(best_z_score),str(best_z_score_delta),
					str(linear_corr),str(log_corr),str(half_log_corr),
					str(quarter_log_corr),max_curve_name]
			f.write('\t'.join(data) + '\n')
	prog.clear()
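_log_gen is called above but not included in this snippet. From its use (a generator whose .next() yields successive template points, with 1, .5, and .25 controlling steepness), a plausible stand-in is:

import numpy

def _log_gen(scale):
    '''hypothetical generator of log-shaped template points'''
    k = 0
    while True:
        yield numpy.log2(2 + scale * k)
        k += 1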
Example #17
 def classification_across_cell(self, loo_type='by_cp', n_procs=9):
     '''
     -build a single classifier treating observations from different
     cell lines equally
     -evaluate model with leave one out cross val.
     
     Parameters
     ----------
     loo_type : str
         strategy for leave one out validation:
             'by_cp' - leaves out all signatures for a given compounds
             'by_sig' - leaves out individual signatures 
     n_procs : int
         number of cores to be used for analysis
     '''
     zFrm = self.signature_frame
     ### perform leave one out validation
     if loo_type == 'by_cp':
         zFrm['svm_prediction'] = np.nan
         cpSet = set(zFrm['pert_id'])
         tupList = [(zFrm, brd, self.probe_ids) for brd in cpSet]
         # run SVM in parallel
         prog = update.DeterminateProgressBar('SVM evaluation')
         pool = multiprocessing.Pool(n_procs)
         rs = pool.map_async(_svm_worker, tupList)
         pool.close()  # No more work
         while (True):
             if (rs.ready()): break
             remaining = rs._number_left
             prog.show_message(
                 'SVM evaluation - {0} tasks to complete'.format(remaining))
             time.sleep(0.1)
         results = rs.get()
         predictedSer = pd.Series()
         for result in results:
             predictedSer = predictedSer.append(result)
         zFrm['svm_prediction'] = predictedSer
     if loo_type == 'by_sig':
         #start a update indicator
         progress_bar = update.DeterminateProgressBar('SVM calculation')
         predictDict = {}
         for ii, sig in enumerate(zFrm.index):
             progress_bar.update(
                 'running SVM and signature validation - ' + sig, ii,
                 len(zFrm.index))
             droppedFrm = zFrm[zFrm.index !=
                               sig]  # remove test signature from training
             trainFrm = droppedFrm.reindex(columns=self.probe_ids)
             labelsTrain = droppedFrm['labels'].values
             C = 1.0  # SVM regularization parameter
             svc = svm.SVC(kernel='linear',
                           C=C).fit(trainFrm.values, labelsTrain)
             zTest = zFrm.ix[sig, self.probe_ids]
             linPred = svc.predict(zTest.values)
             predictDict[sig] = linPred[0]
         predSer = pd.Series(predictDict)
         predSer.name = 'svm_prediction'
         zFrm = pd.concat([zFrm, pd.DataFrame(predSer)], axis=1)
     accuracyArray = zFrm['labels'] == zFrm['svm_prediction']
     accuracyRate = accuracyArray.sum() / float(accuracyArray.shape[0])
     zFrm['correct_prediction'] = accuracyArray
     self.model_accuracy_across_cells = accuracyRate
     self.signature_frame = zFrm
     meanVec = grpH.describe().ix['mean']
     #get top
     nTop = 3  # number of top largest components to sort by
     iTop3 = meanVec.order(ascending=False).index[:nTop]
     sortedTop = grpH.ix[:, iTop3].sort()
     topSum = sortedTop.sum(axis=1).order(ascending=False)
     topMeanDict[grp] = topSum.mean()
 topMeanSer = pd.Series(topMeanDict)
 ##############################
 ### build null distribution ##
 ##############################
 # shuffle signatures from random drugs - keep same group size
 nPerm = 4000
 zFrm = np.zeros([cliqFrm.shape[0], nPerm])
 nullMean = pd.DataFrame(zFrm, index=cliqFrm['desc'])
 prog = update.DeterminateProgressBar('cliq group')
 for irr, r in enumerate(cliqFrm.iterrows()):
     grp = r[1]['id']
     prog.update(grp, irr, len(cliqFrm.desc))
     brds = r[1]['sig']
     anntMtch = anntFrm[anntFrm.pert_id.isin(brds)]
     for ir in range(nPerm):
         nGrp = anntMtch.shape[0]
         iRand = np.random.choice(Hmtrx.index.values, nGrp, replace=False)
         grpH = Hmtrx.reindex(iRand)
         meanVec = grpH.mean()
         #get mean of top components
         nTop = 3  # number of top largest components to sort by
         iTop3 = meanVec.order(ascending=False).index[:nTop]
         sortedTop = grpH.ix[:, iTop3].sort()
         topSum = sortedTop.sum(axis=1).order(ascending=False)
Example #19
def _add_two(params):
    '''worker executed in each pool process: return the sum of a 2-tuple'''
    val1 = params[0]
    val2 = params[1]
    sum1 = val1 + val2
    time.sleep(10)
    return sum1


# make list of tuples
tupList = []
for x1 in np.arange(3):
    for x2 in np.arange(3):
        tup1 = (x1, x2)
        tupList.append(tup1)  #add to list of tuples

# instantiate a progress object
prog = progress.DeterminateProgressBar('self-connection graph builder')
#build graphs in parallel
n_procs = 4
pool = multiprocessing.Pool(n_procs)
rs = pool.map_async(_add_two, tupList)
pool.close()  # No more work
while (True):
    if (rs.ready()): break
    remaining = rs._number_left
    prog.show_message('Waiting for {0} tasks to complete...'.format(remaining))
    time.sleep(0.1)
rs.get()

### attempt 2
import sys, time, random, multiprocessing
Example #20
def analyze_query(args,work_dir):
	'''
	Analyze the output from query_tool - find self-connections and create graphs
	'''
	#make a gct object
	db = gct.GCT()
	db.read(args.res)

	##load query result - gctx file
	rslt = gct.GCT()
	#if specific result directory is specified, use that - otherwise get gctx from working dir
	if args.result:
		outGctx = glob.glob(os.path.join(work_dir, '*COMBINED*.gctx')) #select combined result gctx in working dir created from build_query step
		rslt.read(outGctx[0])
	else:
		rslt.read(args.resultDir)

	rsltSigID = rslt.get_rids() #sig IDs from result file

	qPert = db.get_column_meta('pert_desc')
	qPertID = db.get_column_meta('pert_id')
	qDose = db.get_column_meta('pert_dose')
	ESmat = rslt.matrix
	iES = ESmat.argsort(axis=0)[::-1] #sort descending
	n_inst = len(iES[:,1])

	#loop through each of the perts - graph ranks of query
	prog1 = progress.DeterminateProgressBar('creating self-connection graphs')
	avRnk = []
	medRnk = []
	prRnk = []
	#loop through each of the UNIQUE perts - graph ranks of query
	pertSet = set(qPert)
	for pert in pertSet:
		cmpd1 = pert
		iP = _all_indices(pert, qPert) #index of doses on plate
		if len(iP) < 2:
			print pert + ' has only one instance'
			continue
		uDose = [qDose[i] for i in iP]
		fDose = [float(x) for x in uDose] #convert strings to float
		aDose = numpy.asarray(fDose) #convert to numpy array
		iD = aDose.argsort() #local ordering
		sDose = [fDose[j] for j in iD] #sort local doses
		iPo =  [iP[i] for i in iD] #ordered index
		qStr = qPertID[iPo[0]] #set pertID
		if len(qStr) >= 13:
			qStr = qStr[0:13] #shorten qPertID
		#run pymongo query
		CM = mutil.CMapMongo()
		#cmpdSigIds = CM.find({'pert_id':qStr},{'sig_id':True})
		cmpdSigIds = CM.find({'pert_id':{'$regex':qStr}},{'sig_id':True}) #search for the BRD-xxxxxxxxxxx within the pert_id field in the db
		if len(cmpdSigIds) < 2:
			print cmpd1 + ' has one or no instances in the cmap database'
			continue
		#loop through each dose
		for d in iPo:
		#count probe enrichment and plot
				cmpd1 = qPert[d]
				dose1 = qDose[d]
				iE = iES[:,d] #ES sort index for one column
				sSigID = []
				for y in iE:
					sSigID.append(rsltSigID[y]) #make sorted sig ID list
				i1 = [sSigID.index(y) for y in cmpdSigIds] #where instances of the compound of interest sit on the rank list
				i2 = numpy.array(i1) #convert list to numpy array
				avr = sum(i2)/len(i2) #what is the average ES rank
				md = numpy.median(i2) # what is the median ES rank
				nAv = float(avr)/n_inst #normalize according to number of instances in db
				nMd = float(md)/len(iES[:,1]) #normalized median
				i1.sort()
				topThresh = 1000
				ntop = [x for x in i1 if x <= topThresh]
				nPr = float(len(ntop))/(len(i1)) #percent of instances at the top of the list
				prRnk.append(nPr)
				avRnk.append(nAv) #store average ES rank
				medRnk.append(nMd)
				#plot
				fname = cmpd1 + '_' + dose1 + '_query_rank.png'
				outf = os.path.join(work_dir,fname)
				fig = plt.figure(figsize=(8.0, 2.0))
				ax = fig.add_subplot(111)
				# the histogram of the data
				n, bins, patches = ax.hist(i2, 30, facecolor='green', alpha=0.75)
				#ax.set_xlim(0, n_inst)
				ax.set_xlim(0, int(round(n_inst,-5))) #round instances to nearest 100k
				ax.set_xlabel('query rank')
				ax.set_ylabel('freq')
				ax.set_title('dose = '+ str(dose1) +'um')
				ax.grid(True)
				plt.savefig(outf, bbox_inches=0)
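_all_indices is used above but not defined in this snippet; a minimal sketch consistent with its call site (every position of a perturbation name on the plate):

def _all_indices(value, seq):
    '''return every index at which value occurs in seq'''
    return [i for i, x in enumerate(seq) if x == value]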
Example #21
def template_heatmap(args,work_dir):
	'''
	uses template matching to find the most dose responsive probesets for each compound in
	the dataset and generates a list of the top 50 and bottom 50 most dose responsive probes.
	heatmaps across all of the doses are made using these probesets
	'''
	# instantiate a progress object
	prog = progress.DeterminateProgressBar('Template Heatmaps')

	# read the data
	gcto = gct.GCT(args.res)
	gcto.read()

	# grab the cids from the file and mine dose information from them.  Find all of 
	# the unique perts
	cids = gcto.get_gctx_cid(args.res)
	pert_descs = gcto.get_column_meta('pert_desc')
	doses = [float(x.split(':')[2]) for x in cids]
	perts = [x.split(':')[1] for x in cids]
	unique_perts = list(set(perts))

	# grab the rid for use below
	rids = gcto.get_gctx_rid(args.res)

	num_perts = len(unique_perts)
	for i,unique_pert in enumerate(unique_perts):
		prog.update('analyzing {0}'.format(unique_pert),i,num_perts)

		# grab the z-scores and doses for the current pert and sort the pairs
		# by dose. put the cid_inds in the same sorted order
		cid_inds = [i for i,x in enumerate(cids) if unique_pert in x]
		pert_desc = pert_descs[cid_inds[0]] #set pert desc to the first dose
		pert_doses = [doses[x] for x in cid_inds]
		tmp_tup = zip(pert_doses,cid_inds)
		tmp_tup.sort()
		pert_doses,cid_inds = zip(*tmp_tup)

		if len(pert_doses) > 1:
			# build prototype curves if there is more than one dose
			linear = numpy.linspace(1,10,len(pert_doses))
			log_gen = _log_gen(1)
			log_curve = [log_gen.next() for x in range(len(pert_doses))]
			log_gen = _log_gen(.5)
			half_log_curve = [log_gen.next() for x in range(len(pert_doses))]
			log_gen = _log_gen(.25)
			quarter_log_curve = [log_gen.next() for x in range(len(pert_doses))]

			curves = numpy.array([linear,log_curve,
								  half_log_curve,quarter_log_curve])

			# correlate all of the probes in the data to the prototype curves
			pert_data = gcto.matrix[:,cid_inds]
			num_probes = pert_data.shape[0]
			cc = numpy.corrcoef(pert_data,curves)

			# grab the correlation values for all the probes against prototype curves
			linear_probe_corrs = cc[0:num_probes,num_probes]
			log_probe_corrs = cc[0:num_probes,num_probes + 1]
			half_log_probe_corrs = cc[0:num_probes,num_probes + 2]
			quarter_log_probe_corrs = cc[0:num_probes,num_probes + 3]

			# compute the random correlation profile for this pert
			num_probes = gcto.matrix.shape[0]
			probe_inds = range(num_probes)
			linear_perm_cc = []
			log_perm_cc = []
			half_log_perm_cc = []
			quarter_log_perm_cc = []
			for i in range(1000):
				perm_curve_inds = [random.sample(probe_inds,1)[0] for x in range(len(pert_doses))]
				perm_curve = [pert_data[perm_curve_inds[x],x] for x in range(len(pert_doses))]
				perm_covar = numpy.corrcoef(perm_curve,curves)
				linear_perm_cc.append(perm_covar[0][1])
				log_perm_cc.append(perm_covar[0][2])
				half_log_perm_cc.append(perm_covar[0][3])
				quarter_log_perm_cc.append(perm_covar[0][4])

			# compute the nominal p values for all correlation values
			linear_probe_corrs_p = numpy.array([stats.percentileofscore(linear_perm_cc,x) 
									for x in linear_probe_corrs])
			log_probe_corrs_p = numpy.array([stats.percentileofscore(log_perm_cc,x) 
									for x in log_probe_corrs])
			half_log_probe_corrs_p = numpy.array([stats.percentileofscore(half_log_perm_cc,x) 
									for x in half_log_probe_corrs])
			quarter_log_probe_corrs_p = numpy.array([stats.percentileofscore(quarter_log_perm_cc,x) 
									for x in quarter_log_probe_corrs])

			# write the p values and correlations out to file
			with open(os.path.join(work_dir,unique_pert + '_template_match_summary.txt'),'w') as f:
				f.write('\t'.join(['probeset','linear corr', 'linear p','log corr', 'log p',
					'half-log corr', 'half-log p','quarter-log corr', 'quarter-log p']) + '\n')
				for j in range(len(linear_probe_corrs)):
					f.write('\t'.join([rids[j],str(linear_probe_corrs[j]), str(linear_probe_corrs_p[j])
						,str(log_probe_corrs[j]), str(log_probe_corrs_p[j])
						,str(half_log_probe_corrs[j]), str(half_log_probe_corrs_p[j])
						,str(quarter_log_probe_corrs[j]), str(quarter_log_probe_corrs_p[j])]) + '\n')


			# build the linear heatmap
			linear_probe_corrs_sort_ind = numpy.argsort(linear_probe_corrs_p)[::-1]
			top = pert_data[linear_probe_corrs_sort_ind[0:50],:]
			bot = pert_data[linear_probe_corrs_sort_ind[-50:],:]
			combined = numpy.vstack([top,bot])
			combined_row_normalized =  combined + numpy.abs(numpy.array([numpy.min(combined,1)]).T)
			row_sums = combined_row_normalized.sum(axis=1)
			combined_row_normalized =  combined_row_normalized / row_sums[:,numpy.newaxis]
			plt.imshow(combined_row_normalized,interpolation='nearest',cmap='RdBu')
			plt.axis('off')
			plt.savefig(os.path.join(work_dir,unique_pert + '_linear_heatmap.png'))

			# build the log heatmap
			log_probe_corrs_sort_ind = numpy.argsort(log_probe_corrs_p)[::-1]
			top = pert_data[log_probe_corrs_sort_ind[0:50],:]
			bot = pert_data[log_probe_corrs_sort_ind[-50:],:]
			combined = numpy.vstack([top,bot])
			combined_row_normalized =  combined + numpy.abs(numpy.array([numpy.min(combined,1)]).T)
			row_sums = combined_row_normalized.sum(axis=1)
			combined_row_normalized =  combined_row_normalized / row_sums[:,numpy.newaxis]
			plt.imshow(combined_row_normalized,interpolation='nearest',cmap='RdBu')
			plt.axis('off')
			plt.savefig(os.path.join(work_dir,unique_pert + '_log_heatmap.png'))

			# build the half log heatmap
			half_log_probe_corrs_sort_ind = numpy.argsort(half_log_probe_corrs_p)[::-1]
			top = pert_data[half_log_probe_corrs_sort_ind[0:50],:]
			bot = pert_data[half_log_probe_corrs_sort_ind[-50:],:]
			combined = numpy.vstack([top,bot])
			combined_row_normalized =  combined + numpy.abs(numpy.array([numpy.min(combined,1)]).T)
			row_sums = combined_row_normalized.sum(axis=1)
			combined_row_normalized =  combined_row_normalized / row_sums[:,numpy.newaxis]
			plt.imshow(combined_row_normalized,interpolation='nearest',cmap='RdBu')
			plt.axis('off')
			plt.savefig(os.path.join(work_dir,unique_pert + '_half_log_heatmap.png'))

			# build the quarter log heatmap
			quarter_log_probe_corrs_sort_ind = numpy.argsort(quarter_log_probe_corrs_p)[::-1]
			top = pert_data[quarter_log_probe_corrs_sort_ind[0:50],:]
			bot = pert_data[quarter_log_probe_corrs_sort_ind[-50:],:]
			combined = numpy.vstack([top,bot])
			combined_row_normalized =  combined + numpy.abs(numpy.array([numpy.min(combined,1)]).T)
			row_sums = combined_row_normalized.sum(axis=1)
			combined_row_normalized =  combined_row_normalized / row_sums[:,numpy.newaxis]
			plt.imshow(combined_row_normalized,interpolation='nearest',cmap='RdBu')
			plt.axis('off')
			plt.savefig(os.path.join(work_dir,pert_desc + '_quarter_log_heatmap.png'))

			# clear that progress bar
			prog.clear()
def rates_of_DMSO_connections(inSum,outSum,dmsoSum,matrixType,rnkptRange,graph=True):
    '''
    -calculate the rate of false positives for bioactive signatures vs. DMSO
    -make heatmap

    '''
    # goldSum = pd.concat([inSum,outSum],axis=0)
    ratioThresh = 3 #
    fpThresh = .25
    ratioDict = {}
    fpDict = {}
    fpFrame = pd.DataFrame()
    progress_bar = update.DeterminateProgressBar('connection ratio-calculation')
    for ii,rnkpt_thresh in enumerate(rnkptRange):
        progress_bar.update('observed to dmso', ii, len(rnkptRange))
        # rnkpt_thresh = 90
        grtrThresh = inSum >= rnkpt_thresh
        grtrSum = grtrThresh.sum(axis=1)
        connRate = grtrSum/float(inSum.shape[1])
        # dmso
        grtrDMSO = dmsoSum >= rnkpt_thresh
        dSum = grtrDMSO.sum(axis=1)
        dConnRate = dSum/float(dmsoSum.shape[1])        
        # summly space: dmso connection rate
        obsToDmso = connRate/dConnRate
        # falsePosR = dConnRate / (dConnRate + connRate) # dmso / (dmso + obs)
        falsePosR = dConnRate / connRate # dmso / obs
        falsePosR.name = rnkpt_thresh
        fpFrame = pd.concat([fpFrame,pd.DataFrame(falsePosR)],axis=1)
        highRatioCount = (obsToDmso >= ratioThresh).sum()
        ratioDict[rnkpt_thresh] = highRatioCount
        fpDict[rnkpt_thresh] = (falsePosR <= fpThresh).sum()
        # deal with inf
        # isInf = np.isinf(obsToDmso)
        # obsToDmso[isInf] = grtrSum[isInf] # replace inf with obs sum
        # obsToDmso = obsToDmso[~np.isnan(obsToDmso)]# remove nan    
    #heatmap
    # order according to highest false positive rate @ rnkpt 90
    fpSort = fpFrame.sort(90)
    # plot result
    if graph == True:
        fig = plt.figure(1, figsize=(10, 10))
        plt.imshow(fpSort.values,
            interpolation='nearest',
            aspect='auto',
            cmap=cm.gray_r)
            # vmin=0, 
            # vmax=1,
        tickRange = range(0,40,5)
        xtcks = [str(x) for x in fpFrame.columns[tickRange]]
        plt.xticks(tickRange, xtcks)
        # plt.yticks(np.arange(len(ytcks)),ytcks)
        plt.colorbar()
        plt.xlabel(matrixType + ' threshold')
        plt.ylabel('unique perturbations')
        plt.title('summly false positive rate - based on DMSO')
        out = wkdir + '/false_positive_matrix_' + matrixType + '_threshold.png'
        plt.savefig(out, bbox_inches='tight')
        plt.close()
        # heatmap by pert_type
        fpGrped = fpFrame.groupby(level='pert_type')
        for grp in fpGrped.groups:
            grpFrm = fpGrped.get_group(grp)
            grpSort = grpFrm.sort(90)
            fig = plt.figure(1, figsize=(10, 10))
            plt.imshow(grpSort.values,
                interpolation='nearest',
                aspect='auto',
                cmap=cm.gray_r)
                # vmin=0, 
                # vmax=1,
            tickRange = range(0,40,5)
            xtcks = [str(x) for x in grpSort.columns[tickRange]]
            plt.xticks(tickRange, xtcks)
            # plt.yticks(np.arange(len(ytcks)),ytcks)
            plt.colorbar()
            plt.xlabel(matrixType + ' threshold')
            plt.ylabel('unique perturbations')
            plt.title(grp +' summly false positive rate - based on DMSO')
            out = wkdir + '/' + grp + '_false_positive_matrix_' + matrixType + '_threshold.png'
            plt.savefig(out, bbox_inches='tight')
            plt.close()
        # graph false positive rate
        fpSer = pd.Series(fpDict)
        plt.plot(fpSer.index,fpSer.values)
        plt.ylabel('number of perturbations')
        plt.xlabel(matrixType + 'threshold')
        plt.title('false positive rates below .25 - (out of 7147)')
        outF = os.path.join(wkdir,'false_positive_rates_by_' + matrixType + '_threshold.png')
        plt.savefig(outF, bbox_inches=0)
        plt.close()
        # graph - obs:dmso ratio
        ratioSer = pd.Series(ratioDict)
        plt.plot(ratioSer.index,ratioSer.values)
        plt.ylabel('number of connections')
        plt.xlabel(matrixType + ' threshold')
        plt.title('observed:dmso connection ratios above 3 - (out of 7147)')
        outF = os.path.join(wkdir,'connection_ratio_by_' + matrixType + '_threshold.png')
        plt.savefig(outF, bbox_inches=0)
        plt.close()
    return fpFrame
Example #23
def build_SC(args,work_dir):
	'''
	builds SC plots for the dose analysis
	'''
	# instantiate a progress object
	prog = progress.DeterminateProgressBar('Dose Analysis')

	# make an SC object from the given gctx file
	sco = sc.SC()
	sco.add_sc_from_gctx_meta(args.res, verbose=False)
	sco.set_thresh_by_specificity(0.8)

	# find all of the unique pert_ids in the data
	#perts = [':'.join(x.split('::')[0].split(':')[0:2]) for x in sco.pid] #perts as pert_id
	perts = [x.split(':::')[0].split('::')[1] for x in sco.pid] #perts is pert_desc
	pert_ids = [x.split(':')[1] for x in sco.pid]
	unique_perts = set(perts)
	ctl_perts = []
	for i, unique_pert in enumerate(unique_perts):
		#pert_id = unique_pert.split(':')[1]
		#if pert_id == 'DMSO' or pert_id =='CMAP-000':
			#ctl_perts.append(unique_pert)
		if unique_pert == 'DMSO':
			ctl_perts.append(unique_pert)
	unique_perts.difference_update(set(ctl_perts))

	# grab the dose information
	dose = [float(x.split('::')[0].split(':')[2]) for x in sco.pid]

	# grab pert_descs
	desc = [x.split('::')[1].split(':::')[0] for x in sco.pid]

	# write sc plots to file
	num_perts = len(unique_perts)
	for i,unique_pert in enumerate(unique_perts):
		prog.update('making SC plots',i,num_perts)
		sco.plot(include=unique_pert,size=dose,title=unique_pert,pos_con=['None'],out=os.path.join(work_dir,'_'.join([unique_pert.replace(':','_'),'SC.png'])))

	# write SC summary table
	with open(os.path.join(work_dir,'SC_summary.txt'),'w') as f:
		headers = ['pert_id','pert_desc','base_dose','base_ss',
				   'base_cc','best_dose','best_ss','best_cc',
				   'best_ss_lfc','best_cc_lfc','best_sc_lfc_distance']
		f.write('\t'.join(headers) + '\n')
		for i,unique_pert in enumerate(unique_perts):
			prog.update('making SC summary',i,num_perts)
			pert_inds = [i for i,x in enumerate(perts) if unique_pert in x]
			pert_dose = [dose[x] for x in pert_inds]
			pert_desc = desc[pert_inds[0]]
			pert_ss = [sco.s[x] for x in pert_inds]
			pert_cc = [sco.c[x] for x in pert_inds]
			pert_cc = [x if x != -666 else 0 for x in pert_cc]
			
			base_dose = numpy.min(pert_dose)
			base_ind = pert_dose.index(base_dose)
			base_ss = pert_ss[base_ind]
			base_cc = pert_cc[base_ind]
			
			ss_ratio = numpy.log(numpy.array(pert_ss)/base_ss)
			cc_ratio = numpy.log((numpy.array(pert_cc)+1)/(base_cc +1))
			sc_distance = (ss_ratio**2 + cc_ratio**2)**.5
			sc_distance = sc_distance.tolist()
			
			best_ind = sc_distance.index(numpy.max(sc_distance))
			best_dose = pert_dose[best_ind]
			best_ss = pert_ss[best_ind]
			best_cc = pert_cc[best_ind]
			best_ss_ratio = ss_ratio[best_ind]
			best_cc_ratio = cc_ratio[best_ind]
			best_sc_distance = sc_distance[best_ind]

			data = [unique_pert,pert_desc,str(base_dose),str(base_ss),
					str(base_cc),str(best_dose),str(best_ss),str(best_cc),
					str(best_ss_ratio),str(best_cc_ratio),str(best_sc_distance)]
			f.write('\t'.join(data) + '\n')
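The "best dose" above is the one farthest from the base dose in log-fold-change SC space. A toy worked example of that distance for a single dose (made-up ss and cc values; the +1 on cc guards against zero and -666-replaced correlations):

import numpy

ss_ratio = numpy.log(7.5 / 5.0)  # signature strength vs. base dose
cc_ratio = numpy.log((0.6 + 1) / (0.4 + 1))  # connectivity vs. base dose
sc_distance = (ss_ratio ** 2 + cc_ratio ** 2) ** .5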
Example #24
    ])
    processes.add(subprocess.Popen(cmd, shell=True))
    if len(processes) >= max_processes:
        os.wait()
        processes.difference_update(p for p in processes
                                    if p.poll() is not None)

### make result frame
# dg.make_result_frames(gp_type='OE',metric='spearman')
gp_type = 'KD'
work_dir = dg.outputdir
#which cell lines have a result dir
cellDirs = [
    f for f in os.listdir(work_dir) if os.path.isdir(work_dir + '/' + f)
]
prog = progress.DeterminateProgressBar('dataframe read')
df = pd.DataFrame()
dfRank = pd.DataFrame()
#loop through each cell line add to df
for icell, cell1 in enumerate(cellDirs):
    #define directories and load in outputs
    outdir = os.path.join(work_dir, cell1, 'sig_query_out')
    if not glob.glob(outdir + '/result_*.gctx'):
        print cell1 + ' no query result file'
        continue  #if no results file, skip loop
    if metric == 'wtcs':
        rsltFile = glob.glob(outdir + '/result_WTCS.LM.COMBINED_n*.gctx')[0]
    if metric == 'spearman':
        rsltFile = glob.glob(outdir + '/result_SPEARMAN_n*.gctx')[0]
    rslt = gct.GCT()
    rslt.read(rsltFile)
Example #25
from cmap.tools import sig_slice_tool
from cmap.io import gct, plategrp, rnk
import cmap.analytics.dgo as dgo
import cmap.util.progress as progress
import subprocess
import datetime
import cmap.util.tool_ops as to
import random

metric = 'wtcs'

work_dir = '/xchip/cogs/projects/target_id/OE_KD_25June2013'
if not os.path.exists(work_dir):
    os.mkdir(work_dir)

prog = progress.DeterminateProgressBar('perturbation cid query')
#cell lines in which OEs were recorded
CM = mu.CMapMongo()
allOE = CM.find({
    'pert_type': 'trt_oe',
    'is_gold': True
}, {
    'sig_id': True,
    'pert_iname': True,
    'cell_id': True
})
cell_lines_tested = []
cellsAll = [sig['cell_id'] for sig in allOE]
uniqCells = list(set(cellsAll))

cell_lines_tested = []
def ecdf_calc(inSum, dmsoSum, matrixType, graph=True, fpr_max=True):
    '''
    -create empirical cdf for observed and dmso

    Parameters:
    -----------
    fpr_max: bool
        if false positive rate is above 1, set to 1

    '''
    #look at edcf by row
    seriesList = []
    progress_bar = update.DeterminateProgressBar('ecdf calculation')
    for ii, ix in enumerate(inSum.index):
        progress_bar.update('count', ii, len(inSum.index))
        pID = ix[1]
        obsVec = inSum.ix[ix]
        dmsoVec = dmsoSum.ix[ix]
        # flip sign of rnkpt values
        # evaluate ecdf
        oecdf = ECDF(obsVec)
        decdf = ECDF(dmsoVec)
        # min1 = np.min([np.min(obsVec),np.min(dmsoVec)])
        # max1 = np.max([np.max(obsVec),np.max(dmsoVec)])
        # vals = np.linspace(min1,max1,100)
        vals = np.linspace(-100, 100, 201)
        oEval = oecdf(vals)
        dEval = decdf(vals)
        # make individual plots
        # fdrVec = dEval / oEval
        fdrVec = (1 - dEval) / (1 - oEval)  # looking for positive connections
        fdrSer = pd.Series(data=fdrVec, index=vals)
        if fpr_max:
            fdrSer[fdrSer >= 1] = 1
        fdrSer.name = ix
        seriesList.append(fdrSer)
        if graph:
            fig = plt.figure(1, figsize=(10, 10))
            plt.subplot(2, 1, 1)
            a1 = plt.plot(vals,
                          oEval,
                          color='b',
                          label='observed n=' + str(len(obsVec)))
            a3 = plt.plot(vals,
                          dEval,
                          color='r',
                          label='DMSO n=' + str(len(dmsoVec)))  #
            plt.legend(loc=2)
            plt.ylabel('F(x)', fontweight='bold')
            # plt.xlabel(matrixType,fontweight='bold')
            plt.title('ecdf for summly row - ' + pID)
            plt.subplot(2, 1, 2)
            h1 = plt.hist(obsVec,
                          30,
                          color='b',
                          range=[-100, 100],
                          label=['observed'],
                          alpha=.4,
                          normed=True)
            h2 = plt.hist(dmsoVec,
                          30,
                          color='r',
                          range=[-100, 100],
                          label='DMSO',
                          alpha=.3,
                          normed=True)
            # plt.legend()
            plt.ylabel('freq', fontweight='bold')
            plt.xlabel(matrixType, fontweight='bold')
            outF = os.path.join(work_dir, pID + '_ecdf.png')
            plt.savefig(outF, bbox_inches='tight', dpi=200)
            plt.close()
    fpFrame = pd.concat(seriesList, axis=1, keys=[s.name for s in seriesList])
    fpFrame = fpFrame.T
    mCol = pd.MultiIndex.from_tuples(fpFrame.index,
                                     names=['pert_type', 'pert_id'])
    fpFrame.index = mCol
    return fpFrame
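
# a usage sketch for ecdf_calc with toy data (the frame values and pert ids
# are invented): rows of the returned frame are (pert_type, pert_id) pairs,
# columns are score thresholds in [-100, 100], and values are the estimated
# false positive rates, capped at 1 since fpr_max defaults to True:
idx = [('trt_cp', 'BRD-A00000001'), ('trt_cp', 'BRD-A00000002')]
inSum = pd.DataFrame(np.random.uniform(-100, 100, (2, 50)), index=idx)
dmsoSum = pd.DataFrame(np.random.uniform(-40, 40, (2, 50)), index=idx)
fpFrame = ecdf_calc(inSum, dmsoSum, 'mean_rnkpt_4', graph=False)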
Beispiel #27
0
    def read_gctx_matrix(self,
                         src=None,
                         cid=None,
                         rid=None,
                         col_inds=None,
                         row_inds=None,
                         verbose=True,
                         convert_to_double=False,
                         row_optimized=False):
        '''
        read just the matrix data from a gctx file
        '''
        #open an update indicator
        if verbose:
            progress_bar = update.DeterminateProgressBar('GCTX_READER')
            progress_bar.show_message('reading matrix data')

        if not src:
            src = self.src

        #get the appropriate column indices
        if not col_inds:
            col_inds = self.get_gctx_cid_inds(src, match_list=cid)

        #get the appropriate row indices
        if not row_inds:
            row_inds = self.get_gctx_rid_inds(src, match_list=rid)
        #open the gctx file
        self._open_gctx(src)

        #set up the indices
        if not col_inds:
            col_inds = range(len(self.column_id_node))
        if not row_inds:
            row_inds = range(len(self.row_id_node))

        #check if we're just reading the epsilon landmark genes
        #if so, can get the matrix in one read
        if row_inds == range(978):
            self.matrix = self.matrix_node[col_inds, 0:978]
        #otherwise, figure out which direction reads the fewest elements
        # then read in that orientation
        else:
            ncols, nrows = self.matrix_node.shape
            n_bycol = nrows * len(col_inds)
            n_byrow = ncols * len(row_inds)
            if row_optimized:
                # pre-allocate the matrix to be filled as we iterate over the
                # HDF5 matrix on disk
                self.matrix = numpy.zeros(
                    [len(col_inds), len(row_inds)], dtype=numpy.float32)

                # create a set of col_inds to check membership on each row
                # iteration
                col_ind_set = set(col_inds)

                # determine the range of columns we must read
                col_ind_min = numpy.min(col_inds)
                col_ind_max = numpy.max(col_inds)

                # set up an iterator for the progress indicator.  This will be
                # iterated every time we read a row that is called for.  The
                # progress will be logged every time we reach 1/50th more of the
                # data
                p_iter = 0
                p_max = len(col_inds)
                num_rows = len(row_inds)
                p_mod = max(1, int(numpy.round(p_max / 50.0)))  # guard against a zero modulus for small reads
                for i, row in enumerate(
                        self.matrix_node.iterrows(start=col_ind_min,
                                                  stop=col_ind_max + 1)):
                    if i in col_ind_set:
                        self.matrix[p_iter, :] = numpy.take(row, row_inds)
                        p_iter += 1
                        if p_iter % p_mod == 0:
                            if verbose:
                                progress_bar.update(
                                    "reading matrix data ({0},{1})".format(
                                        num_rows, p_max), p_iter, p_max)

            else:
                if n_bycol <= n_byrow:
                    self.matrix = self.matrix_node[col_inds, :]
                    self.matrix = self.matrix[:, row_inds]
                else:
                    self.matrix = self.matrix_node[:, row_inds]
                    self.matrix = self.matrix[col_inds, :]
        # NOTE: the original code tried to reorder rows/columns here by indexing
        # with col_inds.sort() and row_inds.sort(), but list.sort() sorts in
        # place and returns None, so no reordering ever happened; the matrix is
        # left in the order it was read above, and the reshape below simply
        # restores the expected (n_cols, n_rows) shape
        self.matrix = numpy.reshape(self.matrix,
                                    (len(col_inds), len(row_inds)))
        self.matrix = self.matrix.transpose()
        # convert data to double precision if called for
        if convert_to_double:
            self.matrix = self.matrix.astype(numpy.float)

        #close the gctx file
        self._close_gctx()

        #clear the progress indicator
        if verbose:
            progress_bar.clear()
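
# a usage sketch (the file path is hypothetical), assuming this method lives
# on the cmap.io.gct.GCT reader used in the other examples:
from cmap.io import gct

g = gct.GCT()
g.read_gctx_matrix(src='/path/to/modzs_n100x978.gctx',
                   row_inds=range(978),  # landmark-only fast path: one read
                   col_inds=[0, 5, 9])   # just three columns
print g.matrix.shape  # (978, 3): rows x columns after the final transpose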
Beispiel #28
0
for cell in cellLst:
	for tim in timeLst:
		### make SC plots
		cellLine = cell
		timeP = tim
		refControl = 'pc' #use pc- vs vc-controlled data
		gctfile = glob.glob('/xchip/obelix/pod/brew/%s/PRISM001_%s_%s/by_pert_id_pert_dose/PRISM001_%s_%s_COMPZ.MODZ_SCORE_LM_*.gctx' % (refControl,cellLine,timeP,cellLine,timeP))
		gctfile = gctfile[0]
		work_dir = '/xchip/cogs/hogstrom/analysis/scratch/prism/%s_%s_%s' % (cell,timeP,refControl)
		if not os.path.exists(work_dir):
			os.mkdir(work_dir)
		db = gct.GCT() #make a gct object
		db.read(gctfile)
		### copy stuff from query tool 
		# instantiate a progress object
		prog = progress.DeterminateProgressBar('Dose Analysis')
		# make an SC object from the given gctx file
		sco = sc.SC()
		sco.add_sc_from_gctx_meta(gctfile, verbose=False)
		sco.set_thresh_by_specificity(0.8)
		# find all of the unique pert_ids in the data
		perts = [x.split(':::')[0].split('::')[1] for x in sco.pid] #perts is pert_desc
		pert_ids = [x.split(':')[1] for x in sco.pid]
		# unique_perts = set(perts)
		unique_perts = set(pert_ids)
		ctl_perts = []
		for i, unique_pert in enumerate(unique_perts):
			if unique_pert == 'DMSO':
				ctl_perts.append(unique_pert)
		unique_perts.difference_update(set(ctl_perts))
		#make pairing of pert id and pert_desc
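		# a minimal sketch of the pairing this comment anticipates (not the
		# original code), assuming perts (pert_desc) and pert_ids come from
		# the same sco.pid list and so align positionally:
		pert_pairs = dict(zip(pert_ids, perts))
		for unique_pert in unique_perts:
			pert_desc = pert_pairs[unique_pert]
			# ...per-pert SC plotting would continue from here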
Beispiel #29
0
        os.wait()
        processes.difference_update(p for p in processes
                                    if p.poll() is not None)

# Create a pandas dataframe that lets you see connection results across
# cell lines. It is structured as follows:
# 	index1 = BRD short
# 	index2 = perturbation sig_id
# 	each column = a unique gene ID/time point, representing the CGS for that
# 	gene in the matching cell line
# 	cell line listed as a column
# (a toy construction of this frame appears at the end of this example)
gp_type = 'KD'  # genetic perturbation type
#which cell lines have a result dir
cellDirs = [
    f for f in os.listdir(work_dir) if os.path.isdir(work_dir + '/' + f)
]
prog = progress.DeterminateProgressBar('Drug-target')
df = pd.DataFrame()
dfRank = pd.DataFrame()
#loop through each cell line add to df
# for icell, cell1 in enumerate(cgsCells):
for icell, cell1 in enumerate(cellDirs):
    #define directories and load in outputs
    outdir = os.path.join(work_dir, cell1, 'sig_query_out')
    rsltGlob = glob.glob(outdir + '/result_WTCS.LM.COMBINED_n*.gctx')
    if not rsltGlob:
        print cell1 + ' no query result file'
        continue  #if no results file, skip loop
    rsltFile = rsltGlob[0]
    rslt = gct.GCT()
    rslt.read(rsltFile)
    prog.update('analyzing {0}'.format(cell1), icell, len(cellDirs))
    rsltF = rslt.frame
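
# a toy construction of the frame described at the top of this example:
# index = (BRD short, perturbation sig_id), one column per gene/time-point
# CGS, plus the cell line as a column; all values here are invented:
import pandas as pd

ind = pd.MultiIndex.from_tuples(
    [('BRD-K12345678', 'SIG001'), ('BRD-K87654321', 'SIG002')],
    names=['brd_short', 'sig_id'])
toyFrame = pd.DataFrame({'TP53_96H': [0.42, -0.13],
                         'MDM2_96H': [-0.07, 0.91],
                         'cell_line': ['MCF7', 'PC3']}, index=ind)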
Beispiel #30
0
# load in results
frslt = '/xchip/cogs/hogstrom/analysis/scratch/Nov20/dose_analysis_tool.1353449771597/nov20/my_analysis.query_tool.2012112017162991/result_ESLM.COMBINED_n85x398050.gctx'
rslt = gct.GCT()
rslt.read(frslt)
rsltSigID = rslt.get_rids()  #sig IDs from result file
qPert = db.get_column_meta('pert_desc')
qPertID = db.get_column_meta('pert_id')
qDose = db.get_column_meta('pert_dose')
ESmat = rslt.matrix
iES = ESmat.argsort(axis=0)[::-1]  #sort descending (largest scores first)
n_inst = len(iES[:, 1])

#loop through each of the perts - graph ranks of query
prog1 = progress.DeterminateProgressBar('creating self-connection graphs')
avRnk = []
medRnk = []
#loop through each of the UNIQUE perts - graph ranks of query
pertSet = set(qPert)
for pert in pertSet:
    cmpd1 = pert
    iP = _all_indices(pert, qPert)  #index of doses on plate
    if len(iP) < 2:
        print pert + ' has only one instance'
        continue
    uDose = [qDose[i] for i in iP]
    fDose = [float(x) for x in uDose]  #convert strings to float
    aDose = numpy.asarray(fDose)  #convert to numpy array
    iD = aDose.argsort()  #local ordering
    sDose = [fDose[j] for j in iD]  #sort local doses
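
# _all_indices is not defined in this snippet; a minimal implementation
# consistent with how it is used above (all positions of a value in a list):
def _all_indices(value, seq):
    return [i for i, x in enumerate(seq) if x == value]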