## input files: the parsed BLAST results and one summary per search database
parsedFilePath = os.path.realpath(
    os.path.join(homeDir, "dn-trinity", "blast-dn-parsed.csv"))
summaryFile1 = os.path.join(homeDir, "blast", "blast-up-parsed_summary.csv")
summaryFile2 = os.path.join(homeDir, "blast", 'blast-dm-parsed_summary.csv')
summaryFile3 = os.path.join(homeDir, "blast", 'blast-dp-parsed_summary.csv')

bm = BlastMapper()

## load the isoform-level maps (best hit only), one per summary file
bmapSP, bmapDM, bmapDP = [
    bm.load_summary(_path, trinityGene=False, best=True)
    for _path in (summaryFile1, summaryFile2, summaryFile3)
]

## report each map under its own heading, in the order they were loaded
print("-----------")
for _title, _bmap in (("SwissProt - isoforms", bmapSP),
                      ("D. melanogaster - isoforms", bmapDM),
                      ("Danaus plexippus - isoforms", bmapDP)):
    print(_title)
    bm.print_summary(_bmap)

## taxa breakdown (figure + table) for the SwissProt isoform map only
bm.make_taxa_pie_chart_and_table(
    bmapSP,
    removeStrain=True,
    figName="dn-trinity-blast-pie-isoforms.png",
    csvName="dn-trinity-blast-species-isoforms.csv")

## NOTE: disabled call kept for reference; bmapInsects is not defined in this section
#bm.make_taxa_pie_chart_and_table(bmapInsects,removeStrain=True,
#                                 figName="dn-trinity-blast-pie-insects.png",
#                                 csvName="dn-trinity-blast-species-insects.csv")
def write(self, blastMap=None, transcriptMin=9, transcriptMax=1000,
          outFile="genesets.gmt"):
    """
    Write one gene set per cluster to a tab-delimited *.gmt file.

    Every label in self.allClusters yields a candidate row of the form
    [gs-<label>, description, member, member, ...].  The row is written only
    when the number of members lies within [transcriptMin, transcriptMax];
    otherwise the genes of that cluster are counted as failing the threshold.
    A short summary is printed to stdout.

    blastMap: BlastMap returned after loading summary file in BlastMapper.
              When provided, the genes of each cluster are replaced by their
              mapped transcripts (trailing ".<digit>" version removed) and a
              *.csv file with the gene to transcript mapping is also created.
    transcriptMin: minimum size for a gene set
    transcriptMax: maximum size for a gene set
    outFile: specifies the output file path (*.gmt)
    """

    print("---------------------")
    if self.gene2go:
        print('There are %s genes with at least one annotation' %
              (len(self.gene2go.keys())))
    print('There are %s genes in the labels file' % (len(self.genes)))

    gene2transcript = None
    if blastMap:
        bm = BlastMapper()
        bmGenes = bm.print_summary(blastMap)
        gene2transcript = bm.get_gene_dict(blastMap)

        if self.gene2go:
            usableGenes = list(
                set(bmGenes).intersection(set(self.gene2go.keys())))
        else:
            usableGenes = bmGenes

        print('There are %s genes with at least one BLAST hit' %
              (len(bmGenes)))
        print(
            'There are %s genes that have both a BLAST hit and an annotation'
            % (len(usableGenes)))

    ## compiled once; strips a trailing single-digit version such as ".1"
    versionSuffix = re.compile(r"\.[0-9]$")

    ## prepare outfiles (closed in the finally clause, even on error)
    failedThreshold = 0
    mapHandle = None
    gmtHandle = open(outFile, 'w')
    try:
        writer = csv.writer(gmtHandle, delimiter="\t")
        writerMap = None
        if blastMap:
            outFileMap = re.sub(r"\.gmt", ".csv", outFile)
            ## without a ".gmt" in outFile the substitution is a no-op and
            ## the map file would overwrite the gene sets
            if outFileMap == outFile:
                outFileMap = outFile + ".csv"
            mapHandle = open(outFileMap, 'w')
            writerMap = csv.writer(mapHandle)
            writerMap.writerow(["gene_set", "gene_id", "mapped_transcripts"])

        ## save gene sets to file
        for _k in self.allClusters:
            clusterInds = np.where(self.labels == _k)[0]
            clusterGenes = self.genes[clusterInds]
            gsName = "gs-" + str(_k)

            if self.gene2go:
                description = self.get_description(clusterGenes)
            else:
                description = "kegg pathway"

            ## map the genes
            if blastMap:
                mapped = set()
                for gene in clusterGenes:
                    if gene not in gene2transcript:
                        continue

                    ## unique, unversioned transcripts (sorted so that the
                    ## output is reproducible between runs)
                    geneTranscripts = sorted(
                        set(versionSuffix.sub("", g)
                            for g in gene2transcript[gene]))
                    writerMap.writerow(
                        [gsName, gene, ";".join(geneTranscripts)])
                    mapped.update(geneTranscripts)
                mapped = sorted(mapped)
            else:
                mapped = clusterGenes

            ## list + ndarray does not concatenate, so convert first
            if isinstance(mapped, np.ndarray):
                mapped = mapped.tolist()

            if transcriptMin <= len(mapped) <= transcriptMax:
                writer.writerow([gsName, description] + mapped)
            else:
                failedThreshold += clusterGenes.size
    finally:
        if mapHandle is not None:
            mapHandle.close()
        gmtHandle.close()

    print("-----------------")
    #print("sigma: %s"%self.sigma)
    #print("k: %s"%self.k)
    print('Total clusters: %s ' % self.allClusters.size)

    totalGenes = self.genes.size
    passedGenes = totalGenes - failedThreshold
    ## guard against an empty labels file
    percentAccepted = 100.0 * passedGenes / totalGenes if totalGenes else 0.0
    print("genes pass threshold %s/%s (%s%%)" %
          (passedGenes, totalGenes, round(percentAccepted, 2)))
def write(self, blastMap=None, transcriptMin=9, transcriptMax=1000,
          outFile="genesets.gmt"):
    """
    Write one gene set per cluster to a tab-delimited *.gmt file.

    Every label in self.allClusters yields a candidate row of the form
    [gs-<label>, description, member, member, ...].  The row is written only
    when the number of members lies within [transcriptMin, transcriptMax];
    otherwise the genes of that cluster are counted as failing the threshold.
    A short summary is printed to stdout.

    blastMap: BlastMap returned after loading summary file in BlastMapper.
              When provided, the genes of each cluster are replaced by their
              mapped transcripts (trailing ".<digit>" version removed) and a
              *.csv file with the gene to transcript mapping is also created.
    transcriptMin: minimum size for a gene set
    transcriptMax: maximum size for a gene set
    outFile: specifies the output file path (*.gmt)
    """

    print("---------------------")
    if self.gene2go:
        print('There are %s genes with at least one annotation'%(len(self.gene2go.keys())))
    print('There are %s genes in the labels file'%(len(self.genes)))

    gene2transcript = None
    if blastMap:
        bm = BlastMapper()
        bmGenes = bm.print_summary(blastMap)
        gene2transcript = bm.get_gene_dict(blastMap)

        if self.gene2go:
            usableGenes = list(set(bmGenes).intersection(set(self.gene2go.keys())))
        else:
            usableGenes = bmGenes

        print('There are %s genes with at least one BLAST hit'%(len(bmGenes)))
        print('There are %s genes that have both a BLAST hit and an annotation'%(len(usableGenes)))

    ## compiled once; strips a trailing single-digit version such as ".1"
    versionSuffix = re.compile(r"\.[0-9]$")

    ## prepare outfiles (closed in the finally clause, even on error)
    failedThreshold = 0
    mapHandle = None
    gmtHandle = open(outFile, 'w')
    try:
        writer = csv.writer(gmtHandle, delimiter="\t")
        writerMap = None
        if blastMap:
            outFileMap = re.sub(r"\.gmt", ".csv", outFile)
            ## without a ".gmt" in outFile the substitution is a no-op and
            ## the map file would overwrite the gene sets
            if outFileMap == outFile:
                outFileMap = outFile + ".csv"
            mapHandle = open(outFileMap, 'w')
            writerMap = csv.writer(mapHandle)
            writerMap.writerow(["gene_set", "gene_id", "mapped_transcripts"])

        ## save gene sets to file
        for _k in self.allClusters:
            clusterInds = np.where(self.labels == _k)[0]
            clusterGenes = self.genes[clusterInds]
            gsName = "gs-" + str(_k)

            if self.gene2go:
                description = self.get_description(clusterGenes)
            else:
                description = "kegg pathway"

            ## map the genes
            if blastMap:
                mapped = set()
                for gene in clusterGenes:
                    if gene not in gene2transcript:
                        continue

                    ## unique, unversioned transcripts (sorted so that the
                    ## output is reproducible between runs)
                    geneTranscripts = sorted(set(versionSuffix.sub("", g) for g in gene2transcript[gene]))
                    writerMap.writerow([gsName, gene, ";".join(geneTranscripts)])
                    mapped.update(geneTranscripts)
                mapped = sorted(mapped)
            else:
                mapped = clusterGenes

            ## self.genes[...] is an ndarray; list + ndarray does not
            ## concatenate, so convert before building the row
            if isinstance(mapped, np.ndarray):
                mapped = mapped.tolist()

            if transcriptMin <= len(mapped) <= transcriptMax:
                writer.writerow([gsName, description] + mapped)
            else:
                failedThreshold += clusterGenes.size
    finally:
        if mapHandle is not None:
            mapHandle.close()
        gmtHandle.close()

    print("-----------------")
    #print("sigma: %s"%self.sigma)
    #print("k: %s"%self.k)
    print('Total clusters: %s '%self.allClusters.size)

    totalGenes = self.genes.size
    passedGenes = totalGenes - failedThreshold
    ## guard against an empty labels file
    percentAccepted = 100.0 * passedGenes / totalGenes if totalGenes else 0.0
    print("genes pass threshold %s/%s (%s%%)"%(passedGenes, totalGenes, round(percentAccepted, 2)))
summaryFile1 = os.path.join(homeDir, sourceDir, "blast-%s-parsed_summary.csv" % (source)) summaryFile2 = os.path.join(homeDir, sourceDir, "blast-xt-parsed_summary.csv") bm = BlastMapper() ## load the gene and isoform maps bmapIsoforms = bm.load_summary(summaryFile1, trinityGene=False, best=True) bmapFrog = bm.load_summary(summaryFile1, trinityGene=False, best=True, taxaList=['8355', '8364']) bmapXT = bm.load_summary(summaryFile2, trinityGene=False, best=True) print("-----------") print("SwissProt - isoforms") bm.print_summary(bmapIsoforms) print("SwissProt [8355,8364] - isoforms") bm.print_summary(bmapFrog) print("X. tropicalis - isoforms") bm.print_summary(bmapXT) bm.make_taxa_pie_chart_and_table( bmapIsoforms, removeStrain=True, figName="%s-trinity-blast-pie-isoforms.png" % (source), csvName="%s-trinity-blast-species-isoforms.csv" % (source)) bm.make_taxa_pie_chart_and_table( bmapFrog, removeStrain=True, figName="%s-trinity-blast-pie-frog.png" % (source),