import numpy as np
from sqlalchemy.sql import select
from htsint.database import db_connect, Taxon, Gene
from htsint.tools import read_matrix, read_de_results, print_rest_table_contents
from htsint.blast import BlastMapper

## analysis settings
#assembly = 'dn'
threshold = 0.05
transcript = 'isoforms'
evalue = 0.00001

## project directories
homeDir = os.path.join(os.path.expanduser("~"), "sequencing", "pieris")
featuresDir = os.path.join(homeDir, "features")

## read the DESeq differential expression table
deseqResultsPath = os.path.join(featuresDir, "deseq.csv")
deseqMatIds, deseqMatColumns, deseqMat = read_de_results(
    deseqResultsPath, tool='DESeq')

## open the summary csv and write its header row
summaryHeader = [
    'transcript ID',
    'hitId',
    'hitNcbiId',
    'hitSpecies',
    'e-value',
    'DESeq-pval',
    'DESeq-adj-pval',
]
outPath = os.path.join(featuresDir, 'de-summary-%s.csv' % transcript)
fidout = open(outPath, 'w')
writer = csv.writer(fidout)
writer.writerow(summaryHeader)

## walk the tab-delimited gene2ref table
ref2gene = {}
reader = csv.reader(open("../gene2ref.tab", "r"), delimiter="\t")
for linja in reader:
    ## the gene id is read from the second column
    geneId = linja[1]
## fetch the taxa rows whose NCBI id is '8364'
s = select([Taxon.id, Taxon.ncbi_id, Taxon.name]).where(
    Taxon.ncbi_id.in_(['8364']))
_taxaQueries = conn.execute(s)
taxaQueries = _taxaQueries.fetchall()

## build three lookups keyed on the gene NCBI id (as a string)
gene2taxa, gene2desc, gene2sym = {}, {}, {}
for tquery in taxaQueries:
    s = select(
        [Gene.taxa_id, Gene.ncbi_id, Gene.description, Gene.symbol],
        Gene.taxa_id == tquery['id'])
    _geneQueries = conn.execute(s)
    geneQueries = _geneQueries.fetchall()
    ## one pass over the rows fills all three dictionaries
    for r in geneQueries:
        ncbiId = str(r['ncbi_id'])
        gene2taxa[ncbiId] = str(r['taxa_id'])
        gene2desc[ncbiId] = str(r['description'])
        gene2sym[ncbiId] = str(r['symbol'])

## load feature data
featuresDir = os.path.join(homeDir, "%s-trinity" % assembly, "features")

edgerResultsPath = os.path.join(
    featuresDir, "edger_%s_behavior_de.csv" % (transcript))
edgerIds, edgerColumns, edgerMat = read_de_results(
    edgerResultsPath, tool='edgeR')

deseqResultsPath = os.path.join(
    featuresDir, "deseq_%s_behavior_de.csv" % (transcript))
deseqIds, deseqColumns, deseqMat = read_de_results(
    deseqResultsPath, tool='DESeq')

dfeMatrixPath = os.path.join(
    featuresDir, "deseq_%s_behavior_de_samples.csv" % (transcript))
dfeIds, dfeColumns, dfeMat = read_matrix(dfeMatrixPath, mtype='float')

## load the blast map; asGenes is True only when working with 'genes'
summaryBlastPath = os.path.join(
    "..", "..", "blast", "summary_blast_%s.csv" % assembly)
blastMap = get_blast_map(summaryBlastPath,
                         taxaList=["8364"],
                         asGenes=(transcript == 'genes'))

## setup filters for the transcripts
threshold = 0.1
#!/usr/bin/python
"""
Load DESeq results ('deseq.csv') and the per-sample matrix
('deseq-samples.csv'), drop transcripts whose adjusted p-value is NaN
and then select at most 50 transcripts whose adjusted p-value is at or
below the threshold.
"""

import sys
import numpy as np
from htsint.tools import read_matrix,read_de_results,Heatmap

## load differential expression data
deseqIds, deseqColumns, deseqMat = read_de_results('deseq.csv',tool='DESeq')
dfeIds,dfeColumns,dfeMat = read_matrix('deseq-samples.csv',mtype='float')

## column index (as an index array) of the adjusted p-value
padjInd = np.where(deseqColumns == 'padj')[0]

## filter out nans
## print() with a single argument behaves the same on Python 2 and 3
print(deseqMat.shape)
size1 = deseqIds.shape[0]
filter1 = np.where(~np.isnan(deseqMat[:,padjInd]))[0]
deseqIds = deseqIds[filter1]
deseqMat = deseqMat[filter1,:]

## keep only the sample-matrix rows whose id survived the nan filter
mask = np.in1d(dfeIds,deseqIds)
dfeIds = dfeIds[mask]
dfeMat = dfeMat[mask,:]
print("... %s/%s transcripts pass nan filter"%(filter1.size,size1))

## filter for only the most significant transcripts (max 50)
## NOTE(review): [:50] keeps the first 50 passing rows in file order;
## that is "most significant" only if the rows are sorted by padj -- confirm
threshold = 0.5
size2 = deseqIds.shape[0]
filter2 = np.where(deseqMat[:,padjInd] <= threshold)[0][:50]
deseqIds = deseqIds[filter2]
## collect gene annotations for the taxon row `tquery`
## NOTE(review): `tquery` is not assigned in this excerpt; these statements
## presumably sit inside a loop over taxa query rows -- confirm the
## indentation against the enclosing code
s = select([Gene.taxa_id, Gene.ncbi_id, Gene.description, Gene.symbol],
           Gene.taxa_id == tquery['id'])
_geneQueries = conn.execute(s)
geneQueries = _geneQueries.fetchall()

## each lookup is keyed on the gene NCBI id (as a string)
gene2taxa.update(
    dict([(str(r['ncbi_id']), str(r['taxa_id'])) for r in geneQueries]))
gene2desc.update(
    dict([(str(r['ncbi_id']), str(r['description'])) for r in geneQueries]))
gene2sym.update(
    dict([(str(r['ncbi_id']), str(r['symbol'])) for r in geneQueries]))

## load feature data
featuresDir = os.path.join(homeDir, "%s-trinity" % assembly, "features")

## first comparison (files named *_de.csv)
edgerResultsPath1 = os.path.join(featuresDir,
                                 "edger_%s_de.csv" % (transcript))
edgerIds1, edgerColumns1, edgerMat1 = read_de_results(edgerResultsPath1,
                                                      tool='edgeR')
deseqResultsPath1 = os.path.join(featuresDir,
                                 "deseq_%s_de.csv" % (transcript))
## read from deseqResultsPath1: the un-suffixed name `deseqResultsPath`
## is not defined in this section, so the path built just above was
## being ignored
deseqIds1, deseqColumns1, deseqMat1 = read_de_results(deseqResultsPath1,
                                                      tool='DESeq')

## second comparison (files named *_behavior_de.csv)
edgerResultsPath2 = os.path.join(featuresDir,
                                 "edger_%s_behavior_de.csv" % (transcript))
edgerIds2, edgerColumns2, edgerMat2 = read_de_results(edgerResultsPath2,
                                                      tool='edgeR')
deseqResultsPath2 = os.path.join(featuresDir,
                                 "deseq_%s_behavior_de.csv" % (transcript))
deseqIds2, deseqColumns2, deseqMat2 = read_de_results(deseqResultsPath2,
                                                      tool='DESeq')
#dfeMatrixPath = os.path.join(featuresDir,"deseq_%s_behavior_de_samples.csv"%(transcript))
#dfeIds,dfeColumns,dfeMat = read_matrix(dfeMatrixPath,mtype='float')
#!/usr/bin/python
"""
Read the DESeq results table and the per-sample matrix, remove the
transcripts that have a NaN adjusted p-value, and pick up to 50
transcripts with an adjusted p-value no greater than the threshold.
"""

import sys
import numpy as np
from htsint.tools import read_matrix, read_de_results, Heatmap

## load differential expression data
deseqIds, deseqColumns, deseqMat = read_de_results('deseq.csv', tool='DESeq')
dfeIds, dfeColumns, dfeMat = read_matrix('deseq-samples.csv', mtype='float')

## index array locating the 'padj' column (computed once and reused)
padjInd = np.where(deseqColumns == 'padj')[0]

## filter out nans
## single-argument print() gives identical output on Python 2 and 3
print(deseqMat.shape)
size1 = deseqIds.shape[0]
filter1 = np.where(~np.isnan(deseqMat[:, padjInd]))[0]
deseqIds = deseqIds[filter1]
deseqMat = deseqMat[filter1, :]

## restrict the sample matrix to the ids that are still present
mask = np.in1d(dfeIds, deseqIds)
dfeIds = dfeIds[mask]
dfeMat = dfeMat[mask, :]
print("... %s/%s transcripts pass nan filter" % (filter1.size, size1))

## filter for only the most significant transcripts (max 50)
## NOTE(review): the slice keeps the first 50 rows that pass, in input
## order -- verify that the input is sorted by padj if "most
## significant" is required
threshold = 0.5
size2 = deseqIds.shape[0]
filter2 = np.where(deseqMat[:, padjInd] <= threshold)[0][:50]
deseqIds = deseqIds[filter2]
def write_summary(name, aspect, transcript, assembly, geneset):
    """
    Write a csv summary for one gene set.

    The file lists, for every transcript in the gene set, its DESeq
    p-value, the genes it maps to through the blast summary and the GO
    terms inferred from those genes.  A separator row follows, then a
    table of GO terms ordered by how many transcripts carry them.

    name, aspect -- used to build the pickle, gene set and output file names
    transcript   -- 'genes' loads the blast summary with trinityGene=True,
                    any other value with trinityGene=False; also used in
                    the DESeq results and output file names
    assembly     -- assembly label used in directory and file names
    geneset      -- key of the gene set (first column of the .gmt file)

    Raises Exception when the gene set (.gmt) file does not exist.
    """

    ## load the go dictionaries
    ## NOTE(review): unpickling executes arbitrary code; only use with
    ## trusted pickle files
    termsPath = os.path.join("..", "results",
                             "go-terms-%s-%s.pickle" % (name, aspect))
    with open(termsPath, 'r') as tmp:
        gene2go, go2gene = cPickle.load(tmp)

    ## load the blast map
    bm = BlastMapper()
    homeDir = os.path.join(os.path.expanduser("~"), "sequencing", "xenopus")
    summaryFile = os.path.join(homeDir, "%s-trinity" % (assembly),
                               'blast-%s-parsed_summary.csv' % assembly)
    bmap = bm.load_summary(summaryFile,
                           trinityGene=(transcript == 'genes'),
                           best=False,
                           taxaList=['8364', '8355', '9606'],
                           evalue=0.0001)

    ## get the differential expression results
    featuresDir = os.path.join(homeDir, "%s-trinity" % assembly, "features")
    deseqResultsPath = os.path.join(featuresDir,
                                    "deseq_%s_de.csv" % (transcript))
    deseqIds, deseqColumns, deseqMat = read_de_results(deseqResultsPath,
                                                       tool='DESeq')
    pvalInd = np.where(deseqColumns == 'pvalue')[0]

    ## input/output
    genesetSummaryFile = os.path.join(
        "..", "results", "genesets",
        "%s-%s-%s-%s-%s.csv" % (name, aspect, transcript, assembly,
                                re.sub("gs-", "", geneset)))
    genesetFile = os.path.join(
        "..", "results",
        "%s-%s-%s-%s.gmt" % (name, aspect, assembly, transcript))

    if not os.path.exists(genesetFile):
        raise Exception("cannot find gene set file")

    ## each line is tab separated: the first field is the gene set id,
    ## the second is skipped, the rest are the member transcripts
    allGenesets = {}
    with open(genesetFile, 'r') as fid:
        for linja in fid:
            linja = [re.sub(r"\s+", "", l) for l in linja.split("\t")]
            allGenesets[linja[0]] = linja[2:]

    gsTranscripts = allGenesets[geneset]

    ## map back to gene space, organizing the hits by species
    ## (hit[2] is used as the species key, hit[1] as the gene)
    transcript2genes = {}
    for t in gsTranscripts:
        hitsBySpecies = {}
        for hit in bmap[t]:
            hitsBySpecies.setdefault(hit[2], []).append(hit[1])
        transcript2genes[t] = hitsBySpecies

    ## get inferred go terms for each transcript (unique and sorted)
    transcript2go = {}
    for t, hit in transcript2genes.items():
        inferred = set()
        for genes in hit.values():
            for gene in genes:
                if gene in gene2go:
                    inferred.update(gene2go[gene])
        transcript2go[t] = sorted(inferred)

    ## write to file; the with-block guarantees the output is flushed
    ## and closed even if a lookup below fails
    with open(genesetSummaryFile, 'w') as fidout:
        writer = csv.writer(fidout)
        writer.writerow(["transcript", "p-value", "genes", "go-terms"])

        ## number of transcripts carrying each go term
        termCounts = {}
        for ts in gsTranscripts:
            pvalue = deseqMat[np.where(deseqIds == ts)[0], pvalInd][0]

            ## report at most two genes per species, without duplicates;
            ## sorted so that the output is reproducible
            reportedGenes = set()
            for genes in transcript2genes[ts].values():
                reportedGenes.update(genes[:2])
            ## a transcript without hits gets the same "None" placeholder
            ## that is used for missing go terms
            genes = ";".join(sorted(reportedGenes)) if reportedGenes else "None"

            terms = transcript2go[ts]
            for term in terms:
                termCounts[term] = termCounts.get(term, 0) + 1
            terms = ";".join(terms) if terms else "None"

            writer.writerow([ts, pvalue, genes, terms])

        writer.writerow(["--------"])

        ## write a summary of the go terms, most frequent first
        rankedTerms = sorted(termCounts.items(),
                             key=lambda item: item[1],
                             reverse=True)
        writer.writerow(["ID", "Counts", "Description"])
        for term, count in rankedTerms:
            record = session.query(GoTerm).filter(GoTerm.go_id == term).first()
            ## first() gives None when the go id is not in the database
            desc = record.name if record is not None else "None"
            ## counts are written as floats, as they always have been
            writer.writerow([term, float(count), desc])