Exemple #1
0
## locations of the parsed BLAST output and of the per-database summaries
parsedFilePath = os.path.realpath(
    os.path.join(homeDir, "dn-trinity", "blast-dn-parsed.csv"))
summaryFile1 = os.path.join(homeDir, "blast", "blast-up-parsed_summary.csv")
summaryFile2 = os.path.join(homeDir, "blast", 'blast-dm-parsed_summary.csv')
summaryFile3 = os.path.join(homeDir, "blast", 'blast-dp-parsed_summary.csv')

bm = BlastMapper()

## load the isoform-level (trinityGene=False) best-hit map for each summary,
## in the same order as the summary files above
bmapSP, bmapDM, bmapDP = tuple(
    bm.load_summary(_summaryFile, trinityGene=False, best=True)
    for _summaryFile in (summaryFile1, summaryFile2, summaryFile3))

## report each map under its own heading
print("-----------")
for _heading, _bmap in (("SwissProt - isoforms", bmapSP),
                        ("D. melanogaster - isoforms", bmapDM),
                        ("Danaus plexippus - isoforms", bmapDP)):
    print(_heading)
    bm.print_summary(_bmap)

## taxa pie chart and species table for the SwissProt map only
bm.make_taxa_pie_chart_and_table(bmapSP,
                                 removeStrain=True,
                                 figName="dn-trinity-blast-pie-isoforms.png",
                                 csvName="dn-trinity-blast-species-isoforms.csv")

## NOTE: disabled insects-only chart; bmapInsects is not defined in this script
#bm.make_taxa_pie_chart_and_table(bmapInsects,removeStrain=True,
#                                 figName="dn-trinity-blast-pie-insects.png",
#                                 csvName="dn-trinity-blast-species-insects.csv")
Exemple #2
0
    def write(self,
              blastMap=None,
              transcriptMin=9,
              transcriptMax=1000,
              outFile="genesets.gmt"):
        """
        Write one gene set per cluster to a tab-delimited *.gmt file.

        Every row holds the gene set name ("gs-<cluster label>"), a
        description and the members of the set.  A set is only written when
        its number of members lies within [transcriptMin, transcriptMax];
        the genes of rejected clusters are counted and reported at the end.

        blastMap: BlastMap returned after loading summary file in BlastMapper.
                  When provided, the genes of each cluster are replaced by
                  their mapped transcripts (trailing ".<digit>" removed) and a
                  *.csv file with the gene to transcript mapping is written
                  as well
        transcriptMin: minimum size for a gene set
        transcriptMax: maximum size for a gene set
        outFile: output file path (*.gmt); the mapping file uses the same
                 path with ".gmt" replaced by ".csv"

        """

        print("---------------------")
        if self.gene2go:
            print('There are %s genes with at least one annotation' %
                  (len(self.gene2go.keys())))
        print('There are %s genes in the labels file' % (len(self.genes)))

        gene2transcript = None
        if blastMap:
            bm = BlastMapper()
            bmGenes = bm.print_summary(blastMap)
            gene2transcript = bm.get_gene_dict(blastMap)
            if self.gene2go:
                usableGenes = list(
                    set(bmGenes).intersection(set(self.gene2go.keys())))
            else:
                usableGenes = bmGenes

            print('There are %s genes with at least one BLAST hit' %
                  (len(bmGenes)))
            print(
                'There are %s genes that have both a BLAST hit and an annotation'
                % (len(usableGenes)))

            ## only a trailing ".gmt" is swapped; if there is none, append
            ## ".csv" so the mapping never overwrites the gene set file
            outFileMap = re.sub(r"\.gmt$", ".csv", outFile)
            if outFileMap == outFile:
                outFileMap = outFile + ".csv"

        ## strips a single-digit version suffix such as ".1" from a transcript
        versionSuffix = re.compile(r"\.[0-9]$")

        ## prepare outfiles (closed in the finally clause below)
        failedThreshold = 0
        gmtFile = open(outFile, 'w')
        mapFile = None
        try:
            writer = csv.writer(gmtFile, delimiter="\t")

            if blastMap:
                mapFile = open(outFileMap, 'w')
                writerMap = csv.writer(mapFile)
                writerMap.writerow(
                    ["gene_set", "gene_id", "mapped_transcripts"])

            ## save gene sets to file
            for _k in self.allClusters:
                clusterInds = np.where(self.labels == _k)[0]
                clusterGenes = self.genes[clusterInds]
                gsName = "gs-" + str(_k)
                if self.gene2go:
                    description = self.get_description(clusterGenes)
                else:
                    description = "kegg pathway"

                ## map the genes
                if blastMap:
                    mapped = set()
                    for gene in clusterGenes:
                        if gene not in gene2transcript:
                            continue
                        ## sorted so the output does not depend on set order
                        geneTranscripts = sorted(
                            set(
                                versionSuffix.sub("", g)
                                for g in gene2transcript[gene]))
                        writerMap.writerow(
                            [gsName, gene, ";".join(geneTranscripts)])
                        mapped.update(geneTranscripts)
                    mapped = sorted(mapped)
                else:
                    mapped = clusterGenes

                ## list + ndarray would be an element-wise add, not a
                ## concatenation, so convert before building the row
                if isinstance(mapped, np.ndarray):
                    mapped = mapped.tolist()

                if transcriptMin <= len(mapped) <= transcriptMax:
                    writer.writerow([gsName, description] + mapped)
                else:
                    failedThreshold += clusterGenes.size
        finally:
            gmtFile.close()
            if mapFile is not None:
                mapFile.close()

        print("-----------------")
        print('Total clusters: %s ' % self.allClusters.size)
        totalGenes = self.genes.size
        passedGenes = totalGenes - failedThreshold
        ## guard against an empty labels file
        if totalGenes:
            percentAccepted = 100.0 * passedGenes / totalGenes
        else:
            percentAccepted = 0.0
        print("genes pass threshold %s/%s (%s%%)" %
              (passedGenes, totalGenes, round(percentAccepted, 2)))
Exemple #3
0
    def write(self,blastMap=None, transcriptMin=9, transcriptMax=1000,outFile="genesets.gmt"):
        """
        Write one gene set per cluster to a tab-delimited *.gmt file.

        Every row holds the gene set name ("gs-<cluster label>"), a
        description and the members of the set.  A set is only written when
        its number of members lies within [transcriptMin, transcriptMax];
        the genes of rejected clusters are counted and reported at the end.

        blastMap: BlastMap returned after loading summary file in BlastMapper.
                  When provided, the genes of each cluster are replaced by
                  their mapped transcripts (trailing ".<digit>" removed) and a
                  *.csv file with the gene to transcript mapping is written
                  as well
        transcriptMin: minimum size for a gene set
        transcriptMax: maximum size for a gene set
        outFile: output file path (*.gmt); the mapping file uses the same
                 path with ".gmt" replaced by ".csv"

        """

        print("---------------------")
        if self.gene2go:
            print('There are %s genes with at least one annotation'%(len(self.gene2go.keys())))
        print('There are %s genes in the labels file'%(len(self.genes)))

        gene2transcript = None
        if blastMap:
            bm = BlastMapper()
            bmGenes = bm.print_summary(blastMap)
            gene2transcript = bm.get_gene_dict(blastMap)
            if self.gene2go:
                usableGenes = list(set(bmGenes).intersection(set(self.gene2go.keys())))
            else:
                usableGenes = bmGenes

            print('There are %s genes with at least one BLAST hit'%(len(bmGenes)))
            print('There are %s genes that have both a BLAST hit and an annotation'%(len(usableGenes)))

            ## only a trailing ".gmt" is swapped; if there is none, append
            ## ".csv" so the mapping never overwrites the gene set file
            outFileMap = re.sub(r"\.gmt$",".csv",outFile)
            if outFileMap == outFile:
                outFileMap = outFile + ".csv"

        ## strips a single-digit version suffix such as ".1" from a transcript
        versionSuffix = re.compile(r"\.[0-9]$")

        ## prepare outfiles (closed in the finally clause below)
        failedThreshold = 0
        gmtFile = open(outFile,'w')
        mapFile = None
        try:
            writer = csv.writer(gmtFile,delimiter="\t")

            if blastMap:
                mapFile = open(outFileMap,'w')
                writerMap = csv.writer(mapFile)
                writerMap.writerow(["gene_set","gene_id","mapped_transcripts"])

            ## save gene sets to file
            for _k in self.allClusters:
                clusterInds = np.where(self.labels==_k)[0]
                clusterGenes = self.genes[clusterInds]
                gsName = "gs-"+str(_k)
                if self.gene2go:
                    description = self.get_description(clusterGenes)
                else:
                    description = "kegg pathway"

                ## map the genes
                if blastMap:
                    mapped = set()
                    for gene in clusterGenes:
                        if gene not in gene2transcript:
                            continue
                        ## sorted so the output does not depend on set order
                        geneTranscripts = sorted(set(versionSuffix.sub("",g) for g in gene2transcript[gene]))
                        writerMap.writerow([gsName,gene,";".join(geneTranscripts)])
                        mapped.update(geneTranscripts)
                    mapped = sorted(mapped)
                else:
                    mapped = clusterGenes

                ## list + ndarray would be an element-wise add, not a
                ## concatenation, so convert before building the row
                if isinstance(mapped, np.ndarray):
                    mapped = mapped.tolist()

                if transcriptMin <= len(mapped) <= transcriptMax:
                    writer.writerow([gsName,description] + mapped)
                else:
                    failedThreshold+=clusterGenes.size
        finally:
            gmtFile.close()
            if mapFile is not None:
                mapFile.close()

        print("-----------------")
        print('Total clusters: %s '%self.allClusters.size)
        totalGenes = self.genes.size
        passedGenes = totalGenes-failedThreshold
        ## guard against an empty labels file
        if totalGenes:
            percentAccepted = 100.0 * passedGenes / totalGenes
        else:
            percentAccepted = 0.0
        print("genes pass threshold %s/%s (%s%%)"%(passedGenes,totalGenes,round(percentAccepted,2)))
Exemple #4
0
## summary for the current source plus the "xt" summary from the same directory
summaryFile1 = os.path.join(
    homeDir, sourceDir, "blast-%s-parsed_summary.csv" % (source))
summaryFile2 = os.path.join(homeDir, sourceDir, "blast-xt-parsed_summary.csv")
bm = BlastMapper()

## load the isoform-level best-hit maps; bmapFrog reads the same summary as
## bmapIsoforms but is restricted to taxa 8355 and 8364
_isoformBest = dict(trinityGene=False, best=True)
bmapIsoforms = bm.load_summary(summaryFile1, **_isoformBest)
bmapFrog = bm.load_summary(summaryFile1,
                           taxaList=['8355', '8364'],
                           **_isoformBest)
bmapXT = bm.load_summary(summaryFile2, **_isoformBest)

## report each map under its own heading
print("-----------")
for _heading, _bmap in (("SwissProt - isoforms", bmapIsoforms),
                        ("SwissProt [8355,8364] - isoforms", bmapFrog),
                        ("X. tropicalis - isoforms", bmapXT)):
    print(_heading)
    bm.print_summary(_bmap)

## taxa pie chart and species table for the unrestricted isoform map
bm.make_taxa_pie_chart_and_table(bmapIsoforms,
                                 removeStrain=True,
                                 figName="%s-trinity-blast-pie-isoforms.png" % (source),
                                 csvName="%s-trinity-blast-species-isoforms.csv" % (source))

bm.make_taxa_pie_chart_and_table(
    bmapFrog,
    removeStrain=True,
    figName="%s-trinity-blast-pie-frog.png" % (source),