Esempio n. 1
0
def analyze_query(args, work_dir):
    '''
    Analyze the output from query_tool - find self-connections and create graphs

    In the code visible here the function loads the dataset named by
    ``args.res``, creates an empty container for the query result and
    prints which result location was selected; the result matrix itself
    is not read.

    NOTE(review): the branch tests ``args.result`` but the other branch
    prints ``args.resultDir`` - confirm which attribute is meant to select
    an explicitly stated result location.
    '''
    # dataset the query was built from
    source_gct = gct.GCT()
    source_gct.read(args.res)

    # container for the query result (gctx file); not populated yet
    result_gct = gct.GCT()

    if not args.result:
        print(args.resultDir)
        print('read gct from explicitly stated result dir')
        return

    # combined result gctx created in the working dir by the build_query step
    combined_pattern = os.path.join(work_dir, '*COMBINED*.gctx')
    combined_hits = glob.glob(combined_pattern)
    print('read gct from working dir')
    print(args.result)
Esempio n. 2
0
def load_summly_independent(iGold, mtrxSummly):
    """Load the summly matrix split into DOS and non-DOS column sets.

    Parameters:
    iGold - object whose ``.values`` are the column indices of the DOS
        compounds in the summly matrix
    mtrxSummly - path of the summly gctx matrix

    Returns:
    (inSum, outSum) - frame restricted to the ``iGold`` columns and frame
    with every remaining column. Both carry a hierarchical
    (pert_id, sig_id) column index.
    """
    def read_labelled(col_inds):
        # read only the requested columns, then label them with a
        # hierarchical index - pert_id and sig_id
        reader = gct.GCT(mtrxSummly)
        reader.read(col_inds=col_inds)
        frame = reader.frame
        sig_ids = reader.get_column_meta('sig_id')
        pert_ids = reader.get_column_meta('pert_id')
        frame.columns = pd.MultiIndex.from_tuples(
            zip(pert_ids, sig_ids), names=['pert_id', 'sig_id'])
        return frame

    # DOS compounds with independent mode results
    inSum = read_labelled(list(iGold.values))

    # metadata only: needed to know how many columns the matrix has
    meta = gct.GCT()
    meta.read_gctx_col_meta(mtrxSummly)
    meta.read_gctx_row_meta(mtrxSummly)
    n_columns = len(meta.get_column_meta('sig_id'))

    # every column position that is not one of the DOS indices
    non_dos_inds = np.delete(np.arange(n_columns), iGold.values)
    outSum = read_labelled(list(non_dos_inds))

    return inSum, outSum
def get_summly_ind_compounds(dosGold, mtrxSummly):
    '''Return the non-DOS compound signatures present in the summly matrix.

    A signature qualifies when its ``pert_type`` column annotation is
    'trt_cp' and its sig_id does not occur in ``dosGold['sig_id']``.

    Returns:
    pandas series - index = sig_ids, values = indices in summly matrix
    '''
    meta = gct.GCT()
    meta.read_gctx_col_meta(mtrxSummly)
    meta.read_gctx_row_meta(mtrxSummly)
    sig_ids = meta.get_column_meta('sig_id')
    pert_types = meta.get_column_meta('pert_type')
    # fetched as in the original flow; the values are not used below
    inames = meta.get_column_meta('pert_iname')

    # sig_ids annotated as compound treatments
    type_by_sig = pd.Series(index=sig_ids, data=pert_types)
    compound_sigs = set(type_by_sig[type_by_sig == 'trt_cp'].index)

    # drop the ones that belong to the DOS gold set
    non_dos_sigs = compound_sigs - set(dosGold['sig_id'].values)

    # map each sig_id to its position in the matrix, keep the selected ones
    ordered_sigs = pd.Series(sig_ids)
    position_by_sig = pd.Series(index=ordered_sigs.values,
                                data=ordered_sigs.index)
    return position_by_sig[position_by_sig.index.isin(non_dos_sigs)]
Esempio n. 4
0
def main():
    """Save the landmark and target gene rows of the GTEx matrix as float64.

    Reads the GCTX file named by ``GTEx_GCTX``, looks up the row of every
    gene id listed in ``BGEDV2_LM_ID`` followed by every id listed in
    ``BGEDV2_TG_ID`` (first tab-separated field of each line), and writes
    the selected rows to ``GTEx_float64.npy``.

    Raises:
    ValueError - a listed gene id is not among the GTEx row ids
    """
    def read_first_column(path):
        # `with` closes the handle even if a line cannot be processed
        with open(path) as handle:
            return [line.strip('\n').split('\t')[0] for line in handle]

    GTEx_gctobj = gct.GCT(GTEx_GCTX)
    GTEx_gctobj.read()
    # row ids look like '<gene>.<suffix>'; keep the part before the first '.'
    GTEx_genes = [rid.split('.')[0] for rid in GTEx_gctobj.get_rids()]

    # first row position of every gene id: same answer as list.index(),
    # without rescanning the whole list for each lookup
    first_row = {}
    for row, gene in enumerate(GTEx_genes):
        first_row.setdefault(gene, row)

    def rows_of(gene_ids):
        try:
            return [first_row[gene] for gene in gene_ids]
        except KeyError as missing:
            # keep the exception type list.index() raised
            raise ValueError('%s is not in the GTEx row ids' % missing)

    lm_idx = rows_of(read_first_column(BGEDV2_LM_ID))
    tg_idx = rows_of(read_first_column(BGEDV2_TG_ID))

    # landmark genes first, then target genes
    genes_idx = lm_idx + tg_idx

    data = GTEx_gctobj.matrix[genes_idx, :].astype('float64')

    np.save('GTEx_float64.npy', data)
Esempio n. 5
0
def build_html(work_dir):
    '''
    builds summary html files from templates

    Reads the first ``*.rpt`` file found in ``work_dir``, loads the
    dataset it names under the 'res' key and writes into ``work_dir``:

    * ``index.html`` - one link per unique perturbagen
    * ``<pert_desc>_detail.html`` - query rank images for that
      perturbagen, plus the internal/external qq images unless the
      perturbagen is 'DMSO'

    A perturbagen without matching images gets a detail page with empty
    image lists.
    '''
    def images_by_dose(pattern, strip_chars=None):
        # Glob work_dir for `pattern` and return the hits as a tuple ordered
        # by the dose found in the second '_'-separated field of the file
        # name. `strip_chars` is passed to str.rstrip() before the float
        # conversion (used to drop the 'um' unit).
        keyed = []
        for path in glob.glob(os.path.join(work_dir, pattern)):
            token = os.path.basename(path).split('_')[1]
            if strip_chars:
                token = token.rstrip(strip_chars)
            keyed.append((float(token), path))
        keyed.sort()
        return tuple(path for _, path in keyed)

    # grab the cids from the file.  Find all of
    # the unique perts
    rpt_dict = tool_ops.parse_rpt(glob.glob(work_dir + '/*.rpt')[0])
    gcto = gct.GCT(rpt_dict['res'])
    gcto.read()
    cids = gcto.get_gctx_cid()
    pert_descs = gcto.get_column_meta('pert_desc')
    # the perturbagen is the second ':'-separated field of a column id
    perts = [x.split(':')[1] for x in cids]
    pert_desc_dict = dict(zip(perts, pert_descs))
    unique_perts = sorted(set(perts))

    # build an environment for jinja2
    cmap_base_dir = '/'.join(os.path.dirname(cmap.__file__).split('/')[0:-1])
    env = jinja2.Environment(loader=jinja2.FileSystemLoader(cmap_base_dir + '/templates'))

    # build an index page
    index_page_template = env.get_template('Link_List_Template.html')
    index_links = [pert_desc_dict[x] + '_detail.html' for x in unique_perts]
    with open(os.path.join(work_dir, 'index.html'), 'w') as f:
        f.write(index_page_template.render(title='Dose Analysis Results',
                                           links=index_links,
                                           labels=unique_perts))

    # for each unique_pert, make a detail page
    dose_response_compound_summary_template = env.get_template('Dose_Response_Compound_Summary_Template.html')
    for unique_pert in unique_perts:
        pert_desc = pert_desc_dict[unique_pert]
        query_images = images_by_dose(pert_desc + '*query_rank.png')

        # DMSO pages carry no qq images
        if unique_pert != 'DMSO':
            qq_images = images_by_dose(
                pert_desc + '*um_internal-external_qq.png', 'um')
        else:
            qq_images = []

        with open(os.path.join(work_dir, pert_desc + '_detail.html'), 'w') as f:
            f.write(dose_response_compound_summary_template.render(
                title=pert_desc,
                query_images=query_images,
                qq_images=qq_images))
Esempio n. 6
0
def gct2gctx(filepath):
    """Convert the .gct file at ``filepath`` into a .gctx file beside it.

    The output path is ``filepath`` with its trailing '.gct' extension
    swapped for '.gctx'; when ``filepath`` has a different extension,
    '.gctx' is appended so the input is never written over.

    Best effort: a failure while reading or writing is reported on stdout
    and the function returns normally.
    """
    import os

    g = gct.GCT()
    # only touch the extension, not a '.gct' that appears elsewhere in the path
    root, ext = os.path.splitext(filepath)
    if ext == '.gct':
        out_path = root + '.gctx'
    else:
        out_path = filepath + '.gctx'
    try:
        print("Reading...")
        g._read_gct(filepath)
        print("Writing...")
        g.write(out_path, mode='gctx')
    except Exception:
        # Exception rather than a bare except, so Ctrl-C and SystemExit
        # still propagate
        print("ERROR: could not process %s" % (filepath,))
Esempio n. 7
0
def main():
    """Dump the data matrix of a GCT/GCTX file to a NumPy file as float64.

    Command line: ``<script> <infile> <outfile>`` - the first argument is
    the file to read, the second is handed to ``np.save``.
    """
    source_path, target_path = sys.argv[1], sys.argv[2]

    parsed = gct.GCT(source_path)
    parsed.read()

    # full matrix, converted to double precision
    as_float64 = parsed.matrix[:, :].astype('float64')
    np.save(target_path, as_float64)
def get_summly_dos_indeces(dosBrds, mtrxSummly):
    """Return the summly matrix entries whose row label is in ``dosBrds``.

    Returns:
    pandas series - index = row labels of the summly frame, values = the
    column labels at the same positions, restricted to ``dosBrds``
    """
    reader = gct.GCT()
    reader.read(mtrxSummly)
    frame = reader.frame

    # pair row labels with column labels position by position
    column_by_row = pd.Series(index=frame.index, data=frame.columns)
    wanted = column_by_row.index.isin(dosBrds)
    return column_by_row[wanted]
def load_file(filename):
    '''
    Parse a gct file with the cmap/Zichen python reader.

    Returns the GCT object after a non-verbose ``read()``.
    '''
    import cmap.io.gct as gct
    # imported as in the original code path; not referenced below
    import cmap.io.plategrp as grp

    parsed = gct.GCT(filename)
    parsed.read(verbose=False)
    return parsed
Esempio n. 10
0
def build_query(args, work_dir):
    '''
    build query results

    For every column of the dataset ``args.res`` the ``n_edge`` highest
    and ``n_edge`` lowest scoring probes are written as one line each to
    ``up_list.gmt`` and ``dn_list.gmt`` in ``work_dir`` (existing files
    are overwritten). The process then changes into ``work_dir`` and runs
    ``query_tool`` on the two files through ``rum``.

    Column ids are expected to carry the dose as their third
    ':'-separated field.
    '''
    n_edge = 50  # probes taken from each end of the ranked profile

    # Absolute paths: the command below runs after os.chdir(work_dir), so a
    # path relative to the original cwd would no longer point at the files.
    fup = os.path.abspath(os.path.join(work_dir, 'up_list.gmt'))
    fdn = os.path.abspath(os.path.join(work_dir, 'dn_list.gmt'))

    db = gct.GCT()
    db.read(args.res)
    cids = db.get_cids()
    doses = [float(x.split(':')[2]) for x in cids]
    perts = db.get_column_meta('pert_desc')
    probes = db.get_rids()
    cellLs = db.get_column_meta('cell_id')
    timePs = db.get_column_meta('pert_time')
    mtrx = db.matrix  #matrix of data from gct file

    def gmt_line(name, members):
        # name, description (same as name), then the members; every field
        # including the last is followed by a tab, as before
        return '\t'.join([name, name] + members) + '\t\n'

    # open each list once; mode 'w' overwrites an existing file
    with open(fup, 'w') as f_up, open(fdn, 'w') as f_dn:
        #loop through each column of data
        for i in range(len(cids)):
            # probe indices ordered from highest to lowest score
            ranked = mtrx[:, i].argsort()[::-1]
            col_name = perts[i] + '_' + str(
                doses[i]) + 'um_' + cellLs[i] + '_' + timePs[i]
            f_up.write(gmt_line(col_name,
                                [probes[k] for k in ranked[:n_edge]]))
            f_dn.write(gmt_line(col_name,
                                [probes[k] for k in ranked[-n_edge:]]))

    #python system call
    os.chdir(work_dir)
    cmd = 'rum -q local query_tool --uptag ' + fup + ' --dntag ' + fdn + ' --metric wteslm --mkdir false'
    os.system(cmd)
def build_probe_curves(args,work_dir):
	'''
	builds dose response curves for the specified probe

	For every entry of unique_perts a dose/z-score curve of the probe
	args.probe is saved as a png in work_dir, and one row per entry is
	written to <work_dir>/<args.probe>_summary.txt (tab separated).

	NOTE(review): unique_perts, prog and num_perts are not defined in this
	function - they must exist at module level or the first loop iteration
	raises NameError. Confirm against the rest of the file.
	'''
	gcto = gct.GCT()
	# load only the matrix row(s) matching the probe; the scores below are
	# taken from row 0 of what was loaded
	probe_ind = gcto.get_gctx_rid_inds(args.res,match_list=args.probe,exact=True)
	gcto.read_gctx_matrix(args.res,row_inds=probe_ind)
	cids = gcto.get_gctx_cid(args.res)
	# the dose is the third ':'-separated field of a column id
	doses = [float(x.split(':')[2]) for x in cids]
	CM = mu.CMapMongo()
	with open(os.path.join(work_dir,args.probe + '_summary.txt'),'w') as f:
		headers = ['pert_id','pert_desc','base_dose','base_z_score',
				   'best_dose','best_z_score', 'best_z_score_delta']
		f.write('\t'.join(headers) + '\n')
		for i,unique_pert in enumerate(unique_perts):
			prog.update('analyzing {0}'.format(args.probe),i,num_perts)
			# columns whose id contains the pert id (substring match).
			# The comprehension reuses the name i; under Python 2 scoping
			# this overwrites the loop counter for the rest of the iteration.
			cid_inds = [i for i,x in enumerate(cids) if unique_pert in x]
			pert_scores = gcto.matrix[0,cid_inds]
			pert_doses = [doses[x] for x in cid_inds]
			# order the (dose, score) pairs by increasing dose; calling
			# sort() on the zip result requires zip to return a list
			tmp_tup = zip(pert_doses,pert_scores)
			tmp_tup.sort()
			pert_doses,pert_scores = zip(*tmp_tup)
			plt.plot(pert_doses,pert_scores)
			plt.title('::'.join([unique_pert,args.probe]))
			plt.xlabel('dose')
			plt.ylabel('z-score')
			plt.savefig(os.path.join(work_dir,'_'.join([unique_pert.replace(':','_'),args.probe,'dose_curve.png'])))
			plt.close()

			# look up the description; '-666' is the placeholder used when
			# the query returns nothing
			pert_desc = CM.find({'pert_id':unique_pert},{'pert_desc':True},limit=1)
			if not pert_desc:
				pert_desc = ['-666']
			pert_desc = pert_desc[0]

			# the lowest dose serves as the baseline
			base_dose = pert_doses[0]
			base_z_score = pert_scores[0]

			# the two +10 offsets cancel: this is score minus baseline score
			z_delta = (numpy.array(pert_scores) + 10) - (base_z_score + 10)
			abs_z_delta = numpy.abs(z_delta)
			z_delta =  z_delta.tolist()
			# NOTE(review): abs_z_delta is computed but not used below
			abs_z_delta = abs_z_delta.tolist()

			# "best" is the dose with the smallest (most negative) delta
			best_ind = z_delta.index(numpy.min(z_delta))
			best_dose = pert_doses[best_ind]
			best_z_score = pert_scores[best_ind]
			best_z_score_delta = z_delta[best_ind]

			data = [unique_pert,pert_desc,str(base_dose),str(base_z_score),
					str(best_dose),str(best_z_score),str(best_z_score_delta)]
			f.write('\t'.join(data) + '\n')
	prog.clear()
Esempio n. 12
0
def get_summly_dos_indeces(dosGold,mtrxSummly):
    """Obtain indices for all DOS compounds in the pre computed summly matrix.

    Returns:
    pandas series - index = sig_ids found both in the matrix and in
    ``dosGold['sig_id']``, values = their column positions in the matrix
    """
    meta = gct.GCT()
    meta.read_gctx_col_meta(mtrxSummly)
    meta.read_gctx_row_meta(mtrxSummly)
    sig_ids = meta.get_column_meta('sig_id')
    inames = meta.get_column_meta('pert_iname')
    iname_by_sig = pd.Series(index=sig_ids, data=inames)

    # which dos cps are in the summly independent matrix
    shared_sigs = set(iname_by_sig.index) & set(dosGold['sig_id'].values)

    # map each sig_id to its position, then pick the gold DOS ones
    ordered_sigs = pd.Series(sig_ids)
    position_by_sig = pd.Series(index=ordered_sigs.values,
                                data=ordered_sigs.index)
    return position_by_sig.reindex(list(shared_sigs))
    def write_pairwise_mtrx(self, inames_zip, mtrx, out):
        '''
        Write a square pairwise matrix as tab-delimited text and as gct.

        The same (brd, iname) labels are used for rows and columns. The
        text copy goes to ``out + '.txt'``; the gct copy is written
        through ``gct.GCT.write(out)``.

        Parameters
        ----------
        inames_zip : list of tuples
            brds paired with inames
        mtrx : numpy.ndarray
            matrix of data
        out : str
            output path - no file extension
        '''
        labels = pd.MultiIndex.from_tuples(inames_zip, names=['brd', 'iname'])
        labelled = pd.DataFrame(mtrx, index=labels, columns=labels)

        # plain-text copy
        text_path = out + '.txt'
        labelled.to_csv(text_path, sep='\t')

        # gct copy
        writer = gct.GCT()
        writer.build_from_DataFrame(labelled)
        writer.write(out)
Esempio n. 14
0
 def add_from_gct(self,src,ss_column_name='distil_ss', cc_column_name='distil_cc_q75'):
     '''
     Append the column metadata of the given gct or gctx file to this object.

     ``src`` is remembered in ``self.src``. The values of the
     ``ss_column_name`` and ``cc_column_name`` columns are converted to
     float and appended to ``self.s`` / ``self.c``; ids of the form
     ``<id>::<pert_desc>`` are appended to ``self.pid``; pert_id,
     pert_desc and pert_dose values are appended to ``self.pert_ids``,
     ``self.pert_descs`` and ``self.doses``.
     '''
     #set the src for the SC object
     self.src = src

     #read in the gct data
     reader = gct.GCT(src=src)
     reader.read()

     #grab the ss and cc data plus the descriptive columns
     column_meta = reader.get_column_meta
     raw_ss = column_meta(ss_column_name)
     raw_cc = column_meta(cc_column_name)
     pert_descs = column_meta('pert_desc')
     pert_ids = column_meta('pert_id')
     doses = column_meta('pert_dose')
     id_list = column_meta('id')
     pert_desc_list = column_meta('pert_desc')
     pid = [id_list[k] + '::' + pert_desc_list[k]
            for k in range(len(id_list))]

     #ensure that the stored s and c are lists not numpy arrays
     self.c = list(self.c)
     self.s = list(self.s)

     #convert ss and cc into float values
     ss_values = [float(value) for value in raw_ss]
     cc_values = [float(value) for value in raw_cc]

     #add pid, ss, and cc to the existing data
     self.pid.extend(pid)
     self.s.extend(ss_values)
     self.c.extend(cc_values)
     self.pert_ids.extend(pert_ids)
     self.pert_descs.extend(pert_descs)
     self.doses.extend(doses)
Esempio n. 15
0
# Working directory for this analysis run. The commented alternatives are
# other dataset variants; exactly one assignment should be active.
# wkdir = '/xchip/cogs/projects/NMF/TA_lung_OE_May_2014/TA_OE_qnorm'
wkdir = '/xchip/cogs/projects/NMF/TA_lung_OE_June_2014/TA_OE_ZSPCINF'
# wkdir = '/xchip/cogs/projects/NMF/TA_lung_OE_May_2014/TA_OE_ZSPC_LM'
# os.mkdir creates only the last path component, so the parent must exist
if not os.path.exists(wkdir):
    os.mkdir(wkdir)

################
### load data ##
################

# Candidate input matrices. Only file_zspcinf is read in the lines below;
# file_modz and file_qnorm are not referenced in this part of the script.
file_modz = '/xchip/cogs/web/icmap/custom/TA/tnwork/datasets/for_jun10/TA_JUN10_COMPZ.MODZ_SCORE_n13974x22268.gctx'
file_qnorm = '/xchip/cogs/web/icmap/custom/TA/tnwork/datasets/for_jun10/TA_JUN10_QNORM_n38534x978.gctx'
file_zspcinf = '/xchip/cogs/web/icmap/custom/TA/tnwork/datasets/for_jun10/TA_JUN10_ZSPCINF_n38534x22268.gctx'

# read the whole matrix and keep its frame
gt = gct.GCT(src=file_zspcinf)
gt.read()
ds = gt.frame

# signature subset
# file_lung_grp = '/cga/meyerson/brooks/TA/all_TA_for_jun10/all_TA_Lung_sig_ids.grp'
file_lung_grp = '/xchip/cga_home/brooks/TA/all_TA_for_jun10/all_TA_Lung_distil_ids.grp'
# the grp file is read as a single headerless column named 'sig_id'
lungSigs = pd.read_csv(file_lung_grp, header=None, names=['sig_id'])
# restrict the columns to the listed ids, in the order of the grp file
# NOTE(review): reindex yields all-NaN columns for ids missing from ds -
# confirm every listed id is present
ds_lung = ds.reindex(columns=lungSigs.sig_id.values)

# signature annotations
sFile = '/xchip/cogs/web/icmap/custom/TA/tnwork/datasets/for_jun10/inst.info'
sigInfo = pd.read_csv(sFile, sep='\t')
# index the annotation table by its distil_id column
sigInfo.index = sigInfo.distil_id

#####################################
Esempio n. 16
0
def main():
    """Command-line entry point: draw the impact-prediction figure of every gene.

    Parses the command line, parses the prediction and sig_info files,
    loads the null connectivity values and the two GCTX files, then for
    each gene of the prediction file builds one figure on a grid of
    4 rows x (number of alleles + 1) columns:

    * row 0 - replicate heatmap of the wild type (column 0) and of each
      allele
    * row 1 - wild type vs allele heatmap
    * row 2 - jitter plot of the row medians
    * row 3 - comparison against the null connectivity

    The figure is saved as ``<out_dir>/<gene>_impact_pred_plots.png`` (or
    ``.pdf`` with --pdf). ``out_dir`` is created when missing. The
    prediction column is fixed to ``DEF_PRED_COL``. Ends the process with
    exit status 0.
    """
    opt_parser = OptionParser()

    # Add Options. Required options should have default=None
    opt_parser.add_option("--pred_file",
                          dest="pred_file",
                          type="string",
                          help="""File containing the mutation impact
                                  predictions""",
                          default=None)
    opt_parser.add_option(
        "--sig_info",
        dest="sig_info",
        type="string",
        help="""sig info file with gene information and distil
                                  information""",
        default=None)
    opt_parser.add_option("--gctx",
                          dest="gctx",
                          type="string",
                          help="GCTX file with correlations",
                          default=None)
    opt_parser.add_option(
        "--sig_gctx",
        dest="sig_gctx",
        type="string",
        help="""GCTX containing signature data. For L1000, this
                                  would  the Z-score data""",
        default=None)
    opt_parser.add_option("--ref_allele_mode",
                          dest="ref_allele_mode",
                          action="store_true",
                          help="""Instead of organizing plots by gene, will use
                                  the wt column to determine what are the
                                  reference alleles.""",
                          default=False)
    opt_parser.add_option(
        "--null_conn",
        dest="null_conn",
        type="string",
        help="""File of null connectivity values. This file is
                                  given as output from
                                  eVIP_compare.py. The file ends with
                                  conn_null.txt""",
        default=None)
    opt_parser.add_option("--out_dir",
                          dest="out_dir",
                          type="string",
                          help="Output directory to put figures",
                          default=None)
    opt_parser.add_option("--ymin",
                          dest="ymin",
                          type="int",
                          help="Minimum y-value of rep value. DEF=%d" %
                          DEF_YMIN,
                          default=DEF_YMIN)
    opt_parser.add_option("--ymax",
                          dest="ymax",
                          type="int",
                          help="Maximum y-value of rep value. DEF=%d" %
                          DEF_YMAX,
                          default=DEF_YMAX)
    opt_parser.add_option(
        "--corr_val_str",
        dest="corr_val_str",
        type="string",
        help="String used to label the correlation value. DEF=\"%s\"" %
        DEF_CORR_VAL_STR,
        default=DEF_CORR_VAL_STR)
    opt_parser.add_option("--allele_col",
                          dest="allele_col",
                          type="string",
                          help="""Column name that indicates the allele names.
                                  DEF=%s""" % DEF_ALLELE_COL,
                          default=DEF_ALLELE_COL)
    opt_parser.add_option("--use_c_pval",
                          dest="use_c_pval",
                          action="store_true",
                          help="Use corrected p-val instead of raw pval",
                          default=False)
    opt_parser.add_option("--pdf",
                          dest="pdf",
                          action="store_true",
                          help="Makes figures in pdf format instead of png",
                          default=False)
    opt_parser.add_option(
        "--cell_id",
        dest="cell_id",
        type="string",
        help="""Indicates which cell line. Helps for filtering
                                  sig_info file""",
        default=None)
    opt_parser.add_option(
        "--plate_id",
        dest="plate_id",
        type="string",
        help="""Indicates which plate. Helps for filtering
                                  sig_info file""",
        default=None)

    (options, args) = opt_parser.parse_args()

    # validate the command line arguments
    opt_parser.check_required("--pred_file")
    opt_parser.check_required("--sig_info")
    opt_parser.check_required("--gctx")
    # the signature GCTX is read unconditionally below, so it is required too
    opt_parser.check_required("--sig_gctx")
    opt_parser.check_required("--null_conn")
    opt_parser.check_required("--out_dir")

    pdf = options.pdf
    use_c_pval = options.use_c_pval
    ymin = options.ymin
    ymax = options.ymax
    allele_col = options.allele_col
    ref_allele_mode = options.ref_allele_mode
    corr_val_str = options.corr_val_str
    cell_id = options.cell_id
    plate_id = options.plate_id

    # Process predictions
    # allele2pvals = {allele:[mut vs wt pval,
    #                         wt vs mut-wt pval,
    #                         mut-wt conn pval]
    # Parsed right where the file is opened so that the handle is closed
    # again and a missing file still fails before the large matrices load.
    with open(options.pred_file) as pred_file:
        (gene2wt, gene2allele_call, gene2num_alleles,
         allele2pvals) = parse_pred_file(pred_file, DEF_PRED_COL, use_c_pval,
                                         ref_allele_mode)

    if not os.path.exists(options.out_dir):
        os.makedirs(options.out_dir)
        print("Creating output directory: %s" %
              os.path.abspath(options.out_dir))
    out_dir = os.path.abspath(options.out_dir)

    with open(options.sig_info) as sig_info:
        allele2distil_ids = parse_sig_info(sig_info, allele_col, cell_id,
                                           plate_id)

    null_conn = getNullConnDist(options.null_conn)

    this_gctx = gct.GCT(options.gctx)
    this_gctx.read()

    sig_gctx = gct.GCT(options.sig_gctx)
    sig_gctx.read()

    for gene in gene2wt:
        wt_allele = gene2wt[gene]
        wt_ids = allele2distil_ids[wt_allele]

        this_fig = plt.figure()
        this_fig.set_size_inches((gene2num_alleles[gene] + 1) * 4, 4 * 3)

        grid_size = (4, gene2num_alleles[gene] + 1)

        wt_heatmap_ax = plt.subplot2grid(grid_size, (0, 0))
        wt_im = plot_rep_heatmap(wt_heatmap_ax, this_gctx.frame, wt_ids,
                                 wt_ids, wt_allele, ymin, ymax)

        # WT self connectivity
        _wt_self, wt_self_row_medians = getSelfConnectivity(
            this_gctx, wt_ids, len(wt_ids))

        # Create consistent x values for the wt reps when plotting:
        # drawn once per gene and reused for every allele column
        wt_x_vals = [random.randint(WT_RANGE[0], WT_RANGE[1])
                     for _ in wt_self_row_medians]

        # Plot color bar on this axis
        plt.colorbar(wt_im, ax=wt_heatmap_ax, shrink=0.7)

        # Plot allele data; column 0 holds the wild type
        col_counter = 1

        for pred_type in PRED_TYPE:
            for allele in gene2allele_call[gene][pred_type]:
                allele_ids = allele2distil_ids[allele]

                # CREATE SCATTERPLOT FIGURE
                plot_signatures(pdf, out_dir, sig_gctx.frame, wt_allele,
                                allele, wt_ids, allele_ids)

                # PLOT HEATMAP
                this_hm_ax = plt.subplot2grid(grid_size, (0, col_counter))
                plot_rep_heatmap(this_hm_ax, this_gctx.frame, allele_ids,
                                 allele_ids, pred_type + " - " + allele,
                                 ymin, ymax)

                # PLOT WT MUT heatmap
                this_wt_mut_ax = plt.subplot2grid(grid_size, (1, col_counter))
                plot_rep_heatmap(this_wt_mut_ax, this_gctx.frame, wt_ids,
                                 allele_ids, wt_allele + " vs " + allele,
                                 ymin, ymax)

                # PLOT RANKPOINT ROWS
                this_jitter_ax = plt.subplot2grid(grid_size, (2, col_counter))

                _mut_self, mt_self_row_medians = getSelfConnectivity(
                    this_gctx, allele_ids, len(allele_ids))
                _wt_mut, wt_mut_row_medians = getConnectivity(
                    this_gctx, wt_ids, allele_ids, len(allele_ids))

                plot_jitter(this_jitter_ax, col_counter, wt_x_vals,
                            wt_self_row_medians, mt_self_row_medians,
                            wt_mut_row_medians, allele2pvals[allele][0],
                            allele2pvals[allele][1], use_c_pval, ymin, ymax,
                            corr_val_str)

                # Compared to random connectivity
                conn_ax = plt.subplot2grid(grid_size, (3, col_counter))
                plot_conn(conn_ax, col_counter, null_conn, wt_mut_row_medians,
                          allele2pvals[allele][2], use_c_pval, corr_val_str)

                col_counter += 1

        if pdf:
            this_fig.savefig("%s/%s_impact_pred_plots.pdf" % (out_dir, gene),
                             format="pdf")
        else:
            this_fig.savefig("%s/%s_impact_pred_plots.png" % (out_dir, gene))
        plt.close(this_fig)

    sys.exit(0)
Esempio n. 17
0
def eVIP_run_main(pred_file=None,
                  sig_info=None,
                  gctx=None,
                  sig_gctx=None,
                  ref_allele_mode=None,
                  null_conn=None,
                  out_dir=None,
                  ymin=None,
                  ymax=None,
                  allele_col=None,
                  use_c_pval=None,
                  pdf=None,
                  cell_id=None,
                  plate_id=None,
                  corr_val_str=None):
    """Draw the impact-prediction figure of every gene (programmatic entry point).

    Parameters
    ----------
    pred_file : path of the prediction file, handed to parse_pred_file
    sig_info : path of the sig info file, handed to parse_sig_info
    gctx : path of the GCTX file used for the heatmaps and connectivity
    sig_gctx : path of the GCTX file handed to plot_signatures
    ref_allele_mode, use_c_pval : forwarded to parse_pred_file
    null_conn : path handed to getNullConnDist
    out_dir : output directory; created when it does not exist
    ymin, ymax : y-axis limits, converted with int(); default -1 and 1
    allele_col, cell_id, plate_id : forwarded to parse_sig_info
    pdf : when true, figures are saved as pdf instead of png
    corr_val_str : label forwarded to plot_jitter and plot_conn

    For each gene one figure on a grid of 4 rows x (number of alleles + 1)
    columns is saved as ``<out_dir>/<gene>_impact_pred_plots.png`` (or
    ``.pdf``). The prediction column is fixed to ``DEF_PRED_COL``.
    """
    #setting default values
    # NOTE(review): int() truncates, so fractional limits passed by the
    # caller lose their decimals - confirm that integer limits are intended
    ymin = int(ymin) if ymin is not None else -1
    ymax = int(ymax) if ymax is not None else 1

    # Parsed right where the file is opened so that the handle is closed
    # again and a missing file still fails before the large matrices load.
    with open(pred_file) as pred_handle:
        (gene2wt, gene2allele_call, gene2num_alleles,
         allele2pvals) = parse_pred_file(pred_handle, DEF_PRED_COL,
                                         use_c_pval, ref_allele_mode)

    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
        print("Creating output directory: %s" % os.path.abspath(out_dir))
    out_dir = os.path.abspath(out_dir)

    with open(sig_info) as sig_info_handle:
        allele2distil_ids = parse_sig_info(sig_info_handle, allele_col,
                                           cell_id, plate_id)

    null_conn = getNullConnDist(null_conn)

    this_gctx = gct.GCT(gctx)
    this_gctx.read()

    sig_gctx = gct.GCT(sig_gctx)
    sig_gctx.read()

    for gene in gene2wt:
        wt_allele = gene2wt[gene]
        wt_ids = allele2distil_ids[wt_allele]

        this_fig = plt.figure()
        this_fig.set_size_inches((gene2num_alleles[gene] + 1) * 4, 4 * 3)

        grid_size = (4, gene2num_alleles[gene] + 1)

        wt_heatmap_ax = plt.subplot2grid(grid_size, (0, 0))
        wt_im = plot_rep_heatmap(wt_heatmap_ax, this_gctx.frame, wt_ids,
                                 wt_ids, wt_allele, ymin, ymax)

        # WT self connectivity
        _wt_self, wt_self_row_medians = getSelfConnectivity(
            this_gctx, wt_ids, len(wt_ids))

        # Create consistent x values for the wt reps when plotting:
        # drawn once per gene and reused for every allele column
        wt_x_vals = [random.randint(WT_RANGE[0], WT_RANGE[1])
                     for _ in wt_self_row_medians]

        # Plot color bar on this axis
        plt.colorbar(wt_im, ax=wt_heatmap_ax, shrink=0.7)

        # Plot allele data; column 0 holds the wild type
        col_counter = 1

        for pred_type in PRED_TYPE:
            for allele in gene2allele_call[gene][pred_type]:
                allele_ids = allele2distil_ids[allele]

                # CREATE SCATTERPLOT FIGURE
                plot_signatures(pdf, out_dir, sig_gctx.frame, wt_allele,
                                allele, wt_ids, allele_ids)

                # PLOT HEATMAP
                this_hm_ax = plt.subplot2grid(grid_size, (0, col_counter))
                plot_rep_heatmap(this_hm_ax, this_gctx.frame, allele_ids,
                                 allele_ids, pred_type + " - " + allele,
                                 ymin, ymax)

                # PLOT WT MUT heatmap
                this_wt_mut_ax = plt.subplot2grid(grid_size, (1, col_counter))
                plot_rep_heatmap(this_wt_mut_ax, this_gctx.frame, wt_ids,
                                 allele_ids, wt_allele + " vs " + allele,
                                 ymin, ymax)

                # PLOT RANKPOINT ROWS
                this_jitter_ax = plt.subplot2grid(grid_size, (2, col_counter))

                _mut_self, mt_self_row_medians = getSelfConnectivity(
                    this_gctx, allele_ids, len(allele_ids))
                _wt_mut, wt_mut_row_medians = getConnectivity(
                    this_gctx, wt_ids, allele_ids, len(allele_ids))

                plot_jitter(this_jitter_ax, col_counter, wt_x_vals,
                            wt_self_row_medians, mt_self_row_medians,
                            wt_mut_row_medians, allele2pvals[allele][0],
                            allele2pvals[allele][1], use_c_pval, ymin, ymax,
                            corr_val_str)

                # Compared to random connectivity
                conn_ax = plt.subplot2grid(grid_size, (3, col_counter))
                plot_conn(conn_ax, col_counter, null_conn, wt_mut_row_medians,
                          allele2pvals[allele][2], use_c_pval, corr_val_str)

                col_counter += 1

        if pdf:
            this_fig.savefig("%s/%s_impact_pred_plots.pdf" % (out_dir, gene),
                             format="pdf")
        else:
            this_fig.savefig("%s/%s_impact_pred_plots.png" % (out_dir, gene))
        plt.close(this_fig)
Esempio n. 18
0
    os.mkdir(wkdir)

# load cliques
classGMT = '/xchip/cogs/projects/pharm_class/cp_cliques_current.gmt'
gmtDict = gmt.read(classGMT)
cliqueLabels = pd.DataFrame(gmtDict)
# create set of all clique members
# (flattens the per-clique member lists held in the 'sig' column)
cList = [item for sublist in cliqueLabels['sig'] for item in sublist]
cSet = set(cList)

# load observed score data
# thresholded
# rFile = '/xchip/cogs/projects/connectivity/null/clique_analysis/dmso_q_thresholded_asym_lass_matrix/jan28/my_analysis.sig_cliqueselect_tool.2014012814320559/summly/self_rankpt_n379x379.gctx'
# non-thresholded asym
rFile = '/xchip/cogs/projects/connectivity/null/clique_analysis/baseline_lass_asym_matrix/jan28/my_analysis.sig_cliqueselect_tool.2014012814364180/summly/self_rankpt_n379x379.gctx'
gt1 = gct.GCT()
gt1.read(rFile)
sFrm = gt1.frame
# relabel the columns with the pert_id column annotation
sFrm.columns = gt1.get_column_meta('pert_id')
#check that all clique members are in the observed matrix
# NOTE(review): the test below verifies that every row of the matrix is a
# clique member, not that every clique member has a row - confirm which
# direction is intended
if not (sFrm.index.isin(cSet)).all():
    print "not all clique data loaded"

# load null
dFile = '/xchip/cogs/projects/connectivity/null/dmso/lass_n1000x7147.gctx'
gt = gct.GCT(dFile)
gt.read()
dmsoFrm = gt.frame
# relabel the columns with the 'id' column annotation
dmsoFrm.columns = gt.get_column_meta('id')
# keep only the rows that belong to a clique member
dmsoCM = dmsoFrm[dmsoFrm.index.isin(cSet)]
# median across the columns for each remaining row
rowMedian = dmsoCM.median(axis=1)
Esempio n. 19
0
'''
This script contains examples for reading .gctx files in Python.
'''

import cmap.io.gct as gct
import cmap.io.plategrp as grp

# Path of the .gctx file used by every example below.
path_to_gctx_file = '/cmap/tools/l1ktools/data/modzs_n272x978.gctx'

# Example 1: read the full data file and print its matrix.
GCTObject = gct.GCT(path_to_gctx_file)
GCTObject.read()
print(GCTObject.matrix)

# Example 2: read a subset selected by position - row indices 0..99 and
# column indices 0..9.
GCTObject = gct.GCT(path_to_gctx_file)
GCTObject.read(row_inds=range(100), col_inds=range(10))
print(GCTObject.matrix)

# Example 3: read a subset of columns selected by column id. The ids are
# stored in the .grp file given below.
path_to_column_ids = '/cmap/tools/l1ktools/data/cids_n10.grp'
# read the column ids from the grp file
column_ids = grp.read_grp(path_to_column_ids)
GCTObject = gct.GCT(path_to_gctx_file)
# extract only the columns whose ids were read above
GCTObject.read(cid=column_ids)
print(GCTObject.matrix)

# get the available metadata headers for the data columns and rows
Esempio n. 20
0
    # Keep at most nKeep signatures per compound group; DMSO is kept in full.
    # (grpedBRD, keepList, nKeep, goldQuery, wkdir and cellLine are defined
    # before this fragment.)
    for brd in grpedBRD.groups:
        sigs = grpedBRD.groups[brd]
        if brd == 'DMSO':
            keepList.extend(sigs)  # keep all DMSO sigs
        else:
            keepList.extend(sigs[:nKeep])
    # Restrict the query frame to the retained rows and write it out as a
    # tab-delimited file without a header row.
    reducedSigFrm = goldQuery.reindex(index=keepList)
    outF = wkdir + '/' + cellLine + '_top_intra_connecting_compound_classes.v2.txt'
    reducedSigFrm.to_csv(outF, sep='\t', header=False)
    # Collect the sig_ids of the retained signatures.
    sigList = reducedSigFrm['sig_id'].values
    # Read the selected signature columns (rid='lm_epsilon') from the score
    # file and write them back out as a gctx file.
    afPath = cmap.score_path
    gt = gct.GCT()
    gt.read(src=afPath, cid=sigList, rid='lm_epsilon')
    outGCT = wkdir + '/' + cellLine + '_top_intra_connecting_compound_classes'
    gt.write(outGCT, mode='gctx')
    zFrm = gt.frame
    # zFrm = zFrm.T
    # probeIDs = zFrm.columns
    # ## merge data with
    # zFrm = pd.concat([zFrm,droppedQ],axis=1)

# convert gctx to gct
#use java-1.7
# convert gctx to gct so it can be read by R "convert-dataset -i MCF7_top_intra_connecting_compound_classes_n130x978.gctx"
# NOTE(review): os.system runs the command in a child shell, so whatever
# environment change 'use Java-1.7' makes presumably does not carry over to
# this process or to later os.system calls - confirm this has any effect.
cmd1 = 'use Java-1.7'
os.system(cmd1)
# Find the gctx file(s) written above (the writer appears to append a suffix
# to outGCT, hence the wildcard - TODO confirm).
globRes = glob.glob(outGCT + '*.gctx')
 def group_probe_frq_plot(self,
                          make_heatmaps=True,
                          sum_score_metric='sum_score_4',
                          rankpt_metric='mean_rankpt_4'):
     '''
     Plot how often each probe is up/down regulated within one compound group.

     The probes are ordered by the modz signature of a hard-coded compound
     (BRD-K02130563). For the hard-coded group 'HDAC-inhibitor', every
     signature of every group member is looked up in mongo and the number of
     times each probe appears in 'up50_lm' / 'dn50_lm' is counted. The counts
     are converted to a percentage of all returned signatures and used as the
     marker size of a red (up) and a blue (down) dot per probe.

     make_heatmaps, sum_score_metric and rankpt_metric are not used in the
     visible part of this method.

     NOTE(review): the first lookup reads sigIDdict from a name `po` that is
     not defined in this method, while the group lookup further down uses
     self.sigIDdict - confirm `po` is intentional.
     '''
     # reference compound whose signatures define the probe ordering
     brd = 'BRD-K02130563'
     sigs = po.sigIDdict[brd]
     sig = sigs[0]  # not used in the visible part of this method
     # read the reference compound's signatures from the score file
     afPath = cmap.score_path
     gt = gct.GCT()
     gt.read(src=afPath, cid=sigs, rid='lm_epsilon')
     zFrm = gt.frame
     # zFrm = pd.DataFrame(data=gt.matrix,
     #                     index=gt.get_rids(),
     #                     columns=sigs)
     # take modz of signature group and sort the result
     # (Series.order is the sort method of older pandas releases)
     modZed = modzsig.modzsig(zFrm)
     modZed = modZed.order()
     #pick a group
     # grpName = 'tubulin'
     grpName = 'HDAC-inhibitor'
     #get all sig_ids for that group
     grpSigList = []
     for brd in self.pclResultDict[grpName]:
         grpSigList.extend(self.sigIDdict[brd])
     #query for up/dn probes of every signature in the group
     cm = mu.CMapMongo()
     regFrm = cm.find({'sig_id': {
         '$in': list(grpSigList)
     }}, {
         'sig_id': True,
         'pert_id': True,
         'pert_iname': True,
         'up50_lm': True,
         'dn50_lm': True
     },
                      toDataFrame=True)
     # count dn probe freq: flatten the per-signature probe lists, count each
     # probe, then align the counts to the probe order of modZed
     nInstances = regFrm.shape[0]
     dnNested = regFrm['dn50_lm'].values
     dnArray = [item for sublist in dnNested for item in sublist]
     dnSer = pd.Series(dnArray)
     dnCounts = dnSer.value_counts()
     zDnCounts = dnCounts.reindex_like(modZed)
     # count up probe freq (same procedure as for the dn probes)
     upNested = regFrm['up50_lm'].values
     upArray = [item for sublist in upNested for item in sublist]
     upSer = pd.Series(upArray)
     upCounts = upSer.value_counts()
     zUpCounts = upCounts.reindex_like(modZed)
     # adjust marker size: relative frequency scaled to a percentage
     upPercMkrs = np.divide(
         zUpCounts, nInstances
     )  #divide by total instances to make for relative frequency
     dnPercMkrs = np.divide(zDnCounts, nInstances)
     upMkrs = np.multiply(upPercMkrs, 100)
     dnMkrs = np.multiply(dnPercMkrs, 100)
     # probes that never occur come back as NaN after reindexing; size them 0
     upMkrs = upMkrs.replace(np.nan, 0)
     dnMkrs = dnMkrs.replace(np.nan, 0)
     # make plot: one red (up) and one blue (dn) dot per probe, all at y=1
     fig = plt.figure()
     ax = fig.add_subplot(111)
     # ax.plot(s,s,'b')
     for j, sl in enumerate(modZed):
         ax.plot(j, 1, 'r.', markersize=upMkrs[j], alpha=.25)
         ax.plot(j, 1, 'b.', markersize=dnMkrs[j], alpha=.25)
Esempio n. 22
0
import cmap.analytics.signature_strength as ss
import numpy
import scipy
import cmap.io.gct as gct
#import ljh_dose_analysis_tool as dose

#plot tool
import pylab as pl
import matplotlib.pyplot as plt

# cell line / time point that select the input gctx file
cellLine = 'MCF7'
timeP = '24H'
gctfile = '/xchip/obelix/pod/brew/pc/ASG001_%s_%s/by_pert_id_pert_dose/ASG001_%s_%s_COMPZ.MODZ_SCORE_LM_n85x978.gctx' % (cellLine,timeP,cellLine,timeP)

#make a gct object
db = gct.GCT()
db.read(gctfile)

#make sc object for signature strength
# NOTE(review): `sc` is not among the imports visible above this block -
# confirm it is imported elsewhere.
sco = sc.SC()
sco.add_sc_from_gctx_meta(gctfile)
# NOTE: this rebinds `ss`, which the import block above bound to the
# cmap.analytics.signature_strength module.
ss = sco.s

#make signature for each dose
work_dir = '/xchip/cogs/hogstrom/analysis/scratch'
#work_dir = os.getcwd() #set work_dir var as pwd
fup = '/xchip/cogs/hogstrom/analysis/scratch/tmp_up_list.gmt'
fdn = '/xchip/cogs/hogstrom/analysis/scratch/tmp_dn_list.gmt'
# Truncate any existing gmt files. The handles are closed explicitly so the
# truncation does not depend on garbage collection to release them.
open(fup, 'w').close() #overwrite existing grp file
open(fdn, 'w').close() #overwrite existing grp file
n_edge = 50
Esempio n. 23
0
# Every sub-directory of work_dir is treated as one cell line.
cellDirs = [
    f for f in os.listdir(work_dir) if os.path.isdir(work_dir + '/' + f)
]
prog = progress.DeterminateProgressBar('Drug-target')
df = pd.DataFrame()
dfRank = pd.DataFrame()
#loop through each cell line add to df
# for icell, cell1 in enumerate(cgsCells):
for icell, cell1 in enumerate(cellDirs):
    #define directories and load in outputs
    outdir = os.path.join(work_dir, cell1, 'sig_query_out')
    # skip cell lines for which no combined query result was written
    if not glob.glob(outdir + '/result_WTCS.LM.COMBINED_n*.gctx'):
        print cell1 + 'no query result file'
        continue  #if no results file, skip loop
    # use the first matching result file
    rsltFile = glob.glob(outdir + '/result_WTCS.LM.COMBINED_n*.gctx')[0]
    rslt = gct.GCT()
    rslt.read(rsltFile)
    # NOTE(review): the message is passed with a literal '{0}' and no
    # .format() call - confirm whether the progress bar fills it in.
    prog.update('analyzing {0}', icell, len(cellDirs))
    # transpose the result matrix so the original columns become the index
    rsltF = rslt.frame
    rsltF = rsltF.T
    indVals = rsltF.index.values
    # second ':'-separated field of each index label, cut to 13 characters
    pertVals = [ind.split(':')[1][:13] for ind in indVals]
    #make the column name gene and pert time
    geneVals = []
    for ind in rsltF.columns:
        # gene: second ':' field; tp: last '_' token of the first ':' field
        gene = ind.split(':')[1]
        tp = ind.split(':')[0].split('_')[-1]
        gname = '_'.join([gene, tp])
        geneVals.append(gname)
    # report when two columns collapse onto the same gene_timepoint name
    if len(geneVals) > len(set(geneVals)):
        print 'duplicate CGS for this celline'
Esempio n. 24
0
def build_probe_curves_and_summary(args,work_dir):
	'''
	Build a dose response curve for every perturbagen at the specified probe.

	Only the row of the gctx file args.res that matches args.probe is read.
	Column ids are split on ':'; the second field is taken as the pert id and
	the third field as the dose. For each unique pert id a dose vs z-score
	plot is saved in work_dir and one tab-delimited row is written to
	<work_dir>/<args.probe>_summary.txt. The row holds the base (lowest) dose,
	the "best" dose and the correlation of the dose-sorted scores with four
	prototype curves (linear, log, half-log, quarter-log).

	args     -- object exposing `res` (path of the input gctx file) and
	            `probe` (id of the probe to analyze)
	work_dir -- directory that receives the plots and the summary file
	'''
	# instantiate a progress object
	prog = progress.DeterminateProgressBar('Dose Analysis')

	# read the specified probe from the input gctx file
	gcto = gct.GCT()
	probe_ind = gcto.get_gctx_rid_inds(args.res,match_list=args.probe,exact=True)
	gcto.read_gctx_matrix(args.res,row_inds=probe_ind)

	# grab the cids from the file and mine dose information from them.  Find all of
	# the unique perts
	cids = gcto.get_gctx_cid(args.res)
	doses = [float(x.split(':')[2]) for x in cids]
	perts = [x.split(':')[1] for x in cids]
	unique_perts = list(set(perts))

	curve_names = ['linear','log','half-log','quarter-log']

	# for each unique pert_id, pick the "best" dose relative to the base dose.
	# Do template matching to prototype curves. Output a report
	num_perts = len(unique_perts)
	CM = mu.CMapMongo()
	with open(os.path.join(work_dir,args.probe + '_summary.txt'),'w') as f:
		headers = ['pert_id','pert_desc','base_dose','base_z_score',
				   'best_dose','best_z_score', 'best_z_score_delta',
				   'linear','log','half-log','quarter-log','called shape']
		f.write('\t'.join(headers) + '\n')
		for i,unique_pert in enumerate(unique_perts):
			prog.update('analyzing {0}'.format(args.probe),i,num_perts)

			# select the columns of the current pert by exact pert id. A substring
			# test against the whole cid would also pick up every column whose id
			# merely contains this pert id (e.g. a longer id sharing the prefix).
			cid_inds = [j for j,pert in enumerate(perts) if pert == unique_pert]
			pert_scores = gcto.matrix[0,cid_inds]
			pert_doses = [doses[j] for j in cid_inds]
			# sort the (dose, score) pairs by dose
			pert_doses,pert_scores = zip(*sorted(zip(pert_doses,pert_scores)))

			# build the dose response plot for the current pert and save it to disk
			plt.plot(pert_doses,pert_scores)
			plt.title('::'.join([unique_pert,args.probe]))
			plt.xlabel('dose')
			plt.ylabel('z-score')
			plt.savefig(os.path.join(work_dir,'_'.join([unique_pert.replace(':','_'),args.probe,'dose_curve.png'])))
			plt.close()

			# grab the pert_desc from mongo; '-666' marks a missing description
			pert_desc = CM.find({'pert_id':unique_pert},{'pert_desc':True},limit=1)
			if not pert_desc:
				pert_desc = ['-666']
			pert_desc = pert_desc[0]

			# the lowest dose is the base line every other dose is compared to
			base_dose = pert_doses[0]
			base_z_score = pert_scores[0]

			# difference of every score to the base score (the two +10 offsets
			# cancel; they are kept so the written values stay unchanged)
			z_delta = ((numpy.array(pert_scores) + 10) - (base_z_score + 10)).tolist()

			# NOTE(review): the "best" dose is the one with the most negative delta,
			# not the largest absolute deviation from the base dose - confirm that
			# only decreases are of interest.
			best_ind = z_delta.index(numpy.min(z_delta))
			best_dose = pert_doses[best_ind]
			best_z_score = pert_scores[best_ind]
			best_z_score_delta = z_delta[best_ind]

			num_doses = len(pert_doses)
			if num_doses > 1:
				# build prototype curves if there is more than one dose
				linear = numpy.linspace(1,10,num_doses)
				log_gen = _log_gen(1)
				log_curve = [next(log_gen) for _ in range(num_doses)]
				log_gen = _log_gen(.5)
				half_log_curve = [next(log_gen) for _ in range(num_doses)]
				log_gen = _log_gen(.25)
				quarter_log_curve = [next(log_gen) for _ in range(num_doses)]

				curves = numpy.array([linear,log_curve,
									  half_log_curve,quarter_log_curve])

				# get the correlation coefficient for each of the curves and the
				# current pert dose curve; row 0 of the result is the pert itself
				corrs = numpy.corrcoef(pert_scores,curves)
				linear_corr = corrs[0][1]
				log_corr = corrs[0][2]
				half_log_corr = corrs[0][3]
				quarter_log_corr = corrs[0][4]

				# report the best shape by finding the best absolute correlation;
				# a shape is only called when that correlation exceeds 0.8
				abs_corr = numpy.abs(corrs[0][1:])
				if numpy.where(abs_corr > .8)[0].size > 0:
					abs_corr_max = max(abs_corr)
					abs_corr_max_ind = numpy.where(abs_corr == abs_corr_max)[0][0]
					max_curve_name = curve_names[abs_corr_max_ind]
				else:
					max_curve_name = 'none'

			else:
				# if there is only one dose, set all corrs to 'nan'
				linear_corr = 'nan'
				log_corr = 'nan'
				half_log_corr = 'nan'
				quarter_log_corr = 'nan'
				max_curve_name = 'none'

			# write the dose data to the summary file
			data = [unique_pert,pert_desc,str(base_dose),str(base_z_score),
					str(best_dose),str(best_z_score),str(best_z_score_delta),
					str(linear_corr),str(log_corr),str(half_log_corr),
					str(quarter_log_corr),max_curve_name]
			f.write('\t'.join(data) + '\n')
	prog.clear()
Esempio n. 25
0
# Example: read the full data file and print its matrix.
# ('path_to_gctx_file' is a literal placeholder string here; replace it with
# the path of a real .gctx file.)
import cmap.io.gct as gct
GCTObject = gct.GCT('path_to_gctx_file')
GCTObject.read()
print(GCTObject.matrix)

# Example: read a subset selected by position - row indices 0..99 and
# column indices 0..9.
import cmap.io.gct as gct
GCTObject = gct.GCT('path_to_gctx_file')
GCTObject.read(row_inds=range(100), col_inds=range(10))
print(GCTObject.matrix)

# get the available metadata headers for the data columns and rows
column_headers = GCTObject.get_chd()
row_headers = GCTObject.get_rhd()

# get the perturbagen description ('pert_desc') metadata field of the columns
descs = GCTObject.get_column_meta('pert_desc')

# get the gene symbol ('pr_gene_symbol') metadata field of the rows
symbols = GCTObject.get_row_meta('pr_gene_symbol')
Esempio n. 26
0
# label the current figure, save it to wkdir and release it
plt.xlabel('median summly connection overlap (out of 50)')
plt.ylabel('freq')
plt.title('connection consistency across signatures')
outF = os.path.join(wkdir, 'median_summly_connection_consistency.png')
plt.savefig(outF, bbox_inches=0)
# plt.close must be called; a bare `plt.close` only references the function
# and leaves the figure open for the next plot to draw into
plt.close()

def get_medians(x):
    """Return the median of ``x`` as computed by ``np.median``."""
    median_value = np.median(x)
    return median_value


## which DOS compounds are in summly space?
# Only one matrix path is active; the alternatives are kept commented out.
# mtrxSummly = '/xchip/cogs/projects/connectivity/summly/matrices/matched_lass_sym_n7322x7322.gctx'
mtrxSummly = '/xchip/cogs/projects/connectivity/summly/matrices/matched_lass_n7147x7147.gctx'
# mtrxSummly = '/xchip/cogs/projects/connectivity/summly/matrices/indep_lass_n39560x7147.gctx'
gt = gct.GCT()
# gt.read_gctx_col_meta(mtrxSummly)
# gt.read_gctx_row_meta(mtrxSummly)
gt.read(mtrxSummly)
# relabel the columns of the matrix with the 'pert_id' column metadata
columnPerts = gt.get_column_meta('pert_id')
summFrm = gt.frame
summFrm.columns = columnPerts
# find dos cps in summly space: intersect the row labels of the summly matrix
# with the index of countSer (defined before this fragment)
summBrds = summFrm.index.values
summSet = set(summBrds)
dosSet = set(countSer.index)
overlapSet = dosSet.intersection(summSet)
#plot hist of cell counts - dos in summly 
# NOTE(review): the overlap is computed from countSer but the counts are taken
# from countSerGold; labels missing from countSerGold become NaN on reindex -
# confirm the two series are meant to differ.
overlapSer = countSerGold.reindex(list(overlapSet))
overlapCount = len(overlapSer)
## plot
Esempio n. 27
0
def _save_template_heatmap(pert_data,corrs_p,out_path):
	'''
	Save a heatmap of the 50 top and 50 bottom ranked probes to out_path.

	Rows of pert_data are ranked by corrs_p in descending order. The first 50
	and the last 50 rows of that ranking are stacked, each row is shifted by
	the absolute value of its minimum and divided by its row sum, and the
	result is drawn with the 'RdBu' colormap. The figure is closed after it
	is saved so successive heatmaps do not pile up on the same axes.
	'''
	sort_ind = numpy.argsort(corrs_p)[::-1]
	top = pert_data[sort_ind[0:50],:]
	bot = pert_data[sort_ind[-50:],:]
	combined = numpy.vstack([top,bot])
	combined_row_normalized = combined + numpy.abs(numpy.array([numpy.min(combined,1)]).T)
	row_sums = combined_row_normalized.sum(axis=1)
	combined_row_normalized = combined_row_normalized / row_sums[:,numpy.newaxis]
	plt.imshow(combined_row_normalized,interpolation='nearest',cmap='RdBu')
	plt.axis('off')
	plt.savefig(out_path)
	plt.close()

def template_heatmap(args,work_dir):
	'''
	uses template matching to find the most dose responsive probesets for each compound in
	the dataset and generates a list of the top 50 and bottom 50 most dose responsive probes.
	heatmaps across all of the doses are made using these probesets

	Column ids of the gctx file args.res are split on ':'; the second field is
	taken as the pert id and the third field as the dose. For every pert id
	with more than one dose the function writes, into work_dir:
	  <pert>_template_match_summary.txt  correlation of every probe with the
	      linear / log / half-log / quarter-log prototype curves, plus the
	      percentile of that correlation within 1000 random profiles
	  <pert>_<shape>_heatmap.png         one heatmap per prototype curve
	Perts with a single dose are skipped.

	args     -- object exposing `res` (path of the input gctx file)
	work_dir -- directory that receives the output files
	'''
	# instantiate a progress object
	prog = progress.DeterminateProgressBar('Template Heatmaps')

	# read the data
	gcto = gct.GCT(args.res)
	gcto.read()

	# grab the cids from the file and mine dose information from them.  Find all of
	# the unique perts
	cids = gcto.get_gctx_cid(args.res)
	doses = [float(x.split(':')[2]) for x in cids]
	perts = [x.split(':')[1] for x in cids]
	unique_perts = list(set(perts))

	# grab the rid for use below
	rids = gcto.get_gctx_rid(args.res)

	num_perts = len(unique_perts)
	for i,unique_pert in enumerate(unique_perts):
		prog.update('analyzing {0}'.format(unique_pert),i,num_perts)

		# select the columns of the current pert by exact pert id (a substring
		# test against the whole cid would also match ids that merely contain
		# this pert id) and put them in ascending dose order
		cid_inds = [j for j,pert in enumerate(perts) if pert == unique_pert]
		pert_doses = [doses[j] for j in cid_inds]
		pert_doses,cid_inds = zip(*sorted(zip(pert_doses,cid_inds)))
		cid_inds = list(cid_inds)
		num_doses = len(pert_doses)

		if num_doses > 1:
			# build prototype curves if there is more than one dose
			linear = numpy.linspace(1,10,num_doses)
			log_gen = _log_gen(1)
			log_curve = [next(log_gen) for _ in range(num_doses)]
			log_gen = _log_gen(.5)
			half_log_curve = [next(log_gen) for _ in range(num_doses)]
			log_gen = _log_gen(.25)
			quarter_log_curve = [next(log_gen) for _ in range(num_doses)]

			curves = numpy.array([linear,log_curve,
								  half_log_curve,quarter_log_curve])

			# correlate all of the probes in the data to the prototype curves
			pert_data = gcto.matrix[:,cid_inds]
			num_probes = pert_data.shape[0]
			cc = numpy.corrcoef(pert_data,curves)

			# grab the correlation values for all the probes against prototype
			# curves; the curves are the last four rows/columns of cc
			linear_probe_corrs = cc[0:num_probes,num_probes]
			log_probe_corrs = cc[0:num_probes,num_probes + 1]
			half_log_probe_corrs = cc[0:num_probes,num_probes + 2]
			quarter_log_probe_corrs = cc[0:num_probes,num_probes + 3]

			# compute the random correlation profile for this pert: each random
			# profile takes its value at every dose from a randomly drawn probe
			probe_inds = range(num_probes)
			linear_perm_cc = []
			log_perm_cc = []
			half_log_perm_cc = []
			quarter_log_perm_cc = []
			for _ in range(1000):
				perm_curve_inds = [random.sample(probe_inds,1)[0] for x in range(num_doses)]
				perm_curve = [pert_data[perm_curve_inds[x],x] for x in range(num_doses)]
				perm_covar = numpy.corrcoef(perm_curve,curves)
				linear_perm_cc.append(perm_covar[0][1])
				log_perm_cc.append(perm_covar[0][2])
				half_log_perm_cc.append(perm_covar[0][3])
				quarter_log_perm_cc.append(perm_covar[0][4])

			# percentile of every observed correlation within the random profile
			linear_probe_corrs_p = numpy.array([stats.percentileofscore(linear_perm_cc,x)
									for x in linear_probe_corrs])
			log_probe_corrs_p = numpy.array([stats.percentileofscore(log_perm_cc,x)
									for x in log_probe_corrs])
			half_log_probe_corrs_p = numpy.array([stats.percentileofscore(half_log_perm_cc,x)
									for x in half_log_probe_corrs])
			quarter_log_probe_corrs_p = numpy.array([stats.percentileofscore(quarter_log_perm_cc,x)
									for x in quarter_log_probe_corrs])

			# write the p values and correlations out to file
			with open(os.path.join(work_dir,unique_pert + '_template_match_summary.txt'),'w') as f:
				f.write('\t'.join(['probeset','linear corr', 'linear p','log corr', 'log p',
					'half-log corr', 'half-log p','quarter-log corr', 'quarter-log p']) + '\n')
				for j in range(len(linear_probe_corrs)):
					f.write('\t'.join([rids[j],str(linear_probe_corrs[j]), str(linear_probe_corrs_p[j])
						,str(log_probe_corrs[j]), str(log_probe_corrs_p[j])
						,str(half_log_probe_corrs[j]), str(half_log_probe_corrs_p[j])
						,str(quarter_log_probe_corrs[j]), str(quarter_log_probe_corrs_p[j])]) + '\n')

			# build one heatmap per prototype curve. All outputs of a pert are
			# named by its pert id so they can be found together.
			_save_template_heatmap(pert_data,linear_probe_corrs_p,
				os.path.join(work_dir,unique_pert + '_linear_heatmap.png'))
			_save_template_heatmap(pert_data,log_probe_corrs_p,
				os.path.join(work_dir,unique_pert + '_log_heatmap.png'))
			_save_template_heatmap(pert_data,half_log_probe_corrs_p,
				os.path.join(work_dir,unique_pert + '_half_log_heatmap.png'))
			_save_template_heatmap(pert_data,quarter_log_probe_corrs_p,
				os.path.join(work_dir,unique_pert + '_quarter_log_heatmap.png'))

	# clear the progress bar once, after every pert has been handled
	prog.clear()
Esempio n. 28
0
import cmap.analytics.signature_strength as ss
import cmap.util.mongo_utils as mu

#plot tool
import pylab as pl
import matplotlib.pyplot as plt

# Working directory and the cell line / time point that select the input file.
#work_dir = '/xchip/cogs/hogstrom/analysis/scratch/Nov27' #MCF7 24h
work_dir = '/xchip/cogs/hogstrom/analysis/scratch/Nov29/dose_analysis_tool.1354211763774'  #pc3 6h
cellLine = 'PC3'
timeP = '6H'
gctfile = '/xchip/obelix/pod/brew/pc/ASG001_%s_%s/by_pert_id_pert_dose/ASG001_%s_%s_COMPZ.MODZ_SCORE_LM_n85x978.gctx' % (
    cellLine, timeP, cellLine, timeP)

#make a gct object and read the input file
db = gct.GCT()
db.read(gctfile)

### ss calculations ###
# signature strength is computed twice from the same file: once without and
# once with z-score thresholding
SS1 = ss.SigStrength()
SS1.sig_strength_from_gct_file(gctfile, do_zthresh=False)
SS2 = ss.SigStrength()
SS2.sig_strength_from_gct_file(gctfile, do_zthresh=True)  #ss with threshold

# column metadata of the input file: description, id and dose per column
qPert = db.get_column_meta('pert_desc')
qPertID = db.get_column_meta('pert_id')
qDose = db.get_column_meta('pert_dose')

## plot ss orig with dose
# un-thresholded signature strength and its maximum, ignoring NaN entries
SSin = SS1.ss
ssMax = numpy.nanmax(SSin)
Esempio n. 29
0
import cmap.util.mongo_utils as mu
import cmap.io.gct as gct
import cmap.io.gmt as gmt
import cmap.analytics.NMF_benchmarks as nmfb

### cell line gcts w/ annotation
# gFile = '/xchip/cogs/web/icmap/custom/TA/brew/pc/TA.OE013_A549_96H/TA.OE013_A549_96H_QNORM_n1117x978.gctx'
# gt_plate = gct.GCT(src=gFile)
# gt_plate.read()
# ds_plate = gt_plate.frame

# Input datasets. Only file_modz is read in this fragment; file_qnorm and
# file_zspcinf are defined for use elsewhere.
file_modz = '/xchip/cogs/web/icmap/custom/TA/tnwork/datasets/for_jun10/TA_JUN10_COMPZ.MODZ_SCORE_n13974x22268.gctx'
file_qnorm = '/xchip/cogs/web/icmap/custom/TA/tnwork/datasets/for_jun10/TA_JUN10_QNORM_n38534x978.gctx'
file_zspcinf = '/xchip/cogs/web/icmap/custom/TA/tnwork/datasets/for_jun10/TA_JUN10_ZSPCINF_n38534x22268.gctx'

# read the modz dataset and keep its data frame
gt = gct.GCT(src=file_modz)
gt.read()
ds = gt.frame

# output directory; created when missing (os.mkdir creates only the last path
# component, so the parent directory must already exist)
wkdir = '/xchip/cogs/projects/NMF/TA_lung_OE_June_2014/gctx_files_annotated/MODZ_INF'
if not os.path.exists(wkdir):
    os.mkdir(wkdir)

# # save matrix for each cell line in OE experiments
# is_oe = colFrame.plate.str.match('TA.OE0')
# oe = colFrame[is_oe]
# cell_grped = oe.groupby('cell_line')
# for grpT in cell_grped:
#     cell = grpT[0]
#     cellDir = wkdir + '/' + cell
#     if not os.path.exists(cellDir):
Esempio n. 30
0
def analyze_query(args,work_dir):
	'''
	Analyze the output from query_tool - find self-connections and create graphs
	'''
	#make a gct object
	db = gct.GCT()
	db.read(args.res)

	##load query result - gctx file
	rslt = gct.GCT()
	#if specific result directory is specified, use that - otherwise get gctx from working dir
	if args.result:
		outGctx = glob.glob(os.path.join(work_dir, '*COMBINED*.gctx')) #select combined result gctx in working dir created from build_query step
		rslt.read(outGctx[0])
	else:
		rslt.read(args.resultDir)

	rsltSigID = rslt.get_rids() #sig IDs from result file

	qPert = db.get_column_meta('pert_desc')
	qPertID = db.get_column_meta('pert_id')
	qDose = db.get_column_meta('pert_dose')
	ESmat = rslt.matrix
	iES = ESmat.argsort(axis=0)[::-1] #sort ascending
	n_inst = len(iES[:,1])

	#loop through each of the perts - graph ranks of query
	prog1 = progress.DeterminateProgressBar('creating self-connection graphs')
	avRnk = []
	medRnk = []
	for i, x in enumerate(qPert):
		prog1.update('graphing {0}',i,len(qPert))
		iE = iES[:,i] #ES sort index for one column
		sSigID = []
		for y in iE:
			sSigID.append(rsltSigID[y]) #make sorted sig ID list
		qStr = qPertID[i]
		cmpd1 = x
		dose1 = qDose[i]
		if len(qStr) >= 13:
			qStr = qStr[0:13] #shorten qPertID
		#i1 = IDsorted.index(qStr) #give first index of match

		#run pymongo query
		CM = mu.CMapMongo()
		#cmpdSigIds = CM.find({'pert_id':qStr},{'sig_id':True})
		cmpdSigIds = CM.find({'pert_id':{'$regex':qStr}},{'sig_id':True}) #search for the BRD-xxxxxxxxxxx within the pert_id field in the db

		#i1 = __all_indices(qStr,sSigID)
		i1 = [sSigID.index(y) for y in cmpdSigIds] #where instances of the compound of interest sit on the rank list
		if len(i1) < 1:
			print cmpd1 + ' has no instances in the cmap database'
			continue
		i2 = numpy.array(i1) #convert list to numpy array
		avr = sum(i2)/len(i2) #what is the average ES rank
		md = numpy.median(i2) # what is the median ES rank
		nAv = float(avr)/n_inst #normalize acording to number of instances in db
		nMd = float(md)/len(iES[:,1]) #normalized median
		avRnk.append(nAv) #store average ES rank
		medRnk.append(nMd)
		#plot
		fname = cmpd1 + '_' + dose1 + '_query_rank.png'
		outf = os.path.join(work_dir,fname)
		fig = plt.figure(figsize=(8.0, 2.0))
		ax = fig.add_subplot(111)
		# the histogram of the data
		n, bins, patches = ax.hist(i2, 30, facecolor='green', alpha=0.75)
		#ax.set_xlim(0, n_inst)
		ax.set_xlim(0, int(round(n_inst,-5))) #round instances to nearest 100k
		ax.set_xlabel('query rank')
		ax.set_ylabel('freq')
		ax.set_title('dose = '+ str(dose1) +'um')
		ax.grid(True)
		plt.savefig(outf, bbox_inches=0)