Ejemplo n.º 1
0
    def setUp(self):
        """
        Prepare the per-test fixture: the path of the ``blast-parsed.csv``
        file located next to this module and a BlastMapper instance.
        """

        moduleDir = os.path.dirname(__file__)
        self.parsedFile = os.path.join(moduleDir, "blast-parsed.csv")
        self.bm = BlastMapper()
Ejemplo n.º 2
0
class BlastMapperTest(unittest.TestCase):
    """
    Exercise BlastMapper summary creation and loading against the
    ``blast-parsed.csv`` fixture stored next to this module.
    """

    def setUp(self):
        """
        Build the fixture paths and a BlastMapper for each test
        """

        self.parsedFile = os.path.join(os.path.dirname(__file__),"blast-parsed.csv")
        # raw string: "\." inside a plain literal is an invalid escape sequence
        self.summaryFile = re.sub(r"\.csv","",self.parsedFile)+"_summary.csv"
        self.bm = BlastMapper()

    def test01Summarize(self):
        """
        test the summarize function
        """

        # start from a clean slate so the assertion proves the file was created
        if os.path.exists(self.summaryFile):
            os.remove(self.summaryFile)

        self.bm.create_summarized(self.parsedFile,uniprot=True)
        self.assertTrue(os.path.exists(self.summaryFile))

    def test02Something(self):
        """
        read in the results summary
        """

        # do not depend on test01Summarize having run first
        if not os.path.exists(self.summaryFile):
            self.bm.create_summarized(self.parsedFile,uniprot=True)

        ## best hit per isoform
        bmap = self.bm.load_summary(self.summaryFile,taxaList=["10090"])
        self.assertEqual(bmap['GG11117|c2_g1_i1'][0],'INT1_MOUSE')
        self.assertEqual(bmap['GG11117|c2_g1_i1'][1],'68510')

        ## best hit keyed by the trinity gene instead of the isoform
        bmap = self.bm.load_summary(self.summaryFile,taxaList=["10090"],trinityGene=True)
        self.assertEqual(bmap['GG11117|c2_g1'][0],'INT1_MOUSE')

        ## best=False returns a list of hits per key
        bmap = self.bm.load_summary(self.summaryFile,taxaList=["10090"],trinityGene=True,best=False)
        self.assertEqual(bmap['GG11117|c2_g1'][0][0],'INT1_MOUSE')
        self.assertEqual(bmap['GG11117|c2_g1'][0][4],0.0)
Ejemplo n.º 3
0
## fill the gene lookup dictionaries, one taxon at a time
for tquery in taxaQueries:
    s = select([Gene.taxa_id, Gene.ncbi_id, Gene.description, Gene.symbol],
               Gene.taxa_id == tquery['id'])
    _geneQueries = conn.execute(s)
    geneQueries = _geneQueries.fetchall()

    # every dictionary is keyed by the NCBI gene id as a string
    for r in geneQueries:
        _ncbiId = str(r['ncbi_id'])
        gene2taxa[_ncbiId] = str(r['taxa_id'])
        gene2desc[_ncbiId] = str(r['description'])
        gene2sym[_ncbiId] = str(r['symbol'])

## locate the summary files and load one blast map per file
bm = BlastMapper()
_trinityDir = os.path.join(homeDir, "dn-trinity")
summaryFile1 = os.path.join(_trinityDir, 'blast-dn-parsed_summary.csv')
summaryFile2 = os.path.join(_trinityDir, 'blast-dm-parsed_summary.csv')
summaryFile3 = os.path.join(_trinityDir, "blast-mc-parsed_summary.csv")
summaryFile4 = os.path.join(_trinityDir, 'blast-dp-parsed_summary.csv')

# all four maps are loaded with the same options
_loadOptions = {'trinityGene': False, 'best': True}
bmapSP = bm.load_summary(summaryFile1, **_loadOptions)
bmapDM = bm.load_summary(summaryFile2, **_loadOptions)
bmapMC = bm.load_summary(summaryFile3, **_loadOptions)
bmapDP = bm.load_summary(summaryFile4, **_loadOptions)

## prepare supplement output
Ejemplo n.º 4
0
Take the parsed blast results and create a summary file to be read by BlastMapper

"""

import os, sys, csv, re, getopt, time

from htsint.blast import BlastMapper

## input locations
homeDir = os.path.join(os.path.expanduser("~"), "sequencing", "pieris")
_parsedFile = os.path.join(homeDir, "dn-trinity", "blast-dn-parsed.csv")
parsedFilePath = os.path.realpath(_parsedFile)

_blastDir = os.path.join(homeDir, "blast")
summaryFile1 = os.path.join(_blastDir, "blast-up-parsed_summary.csv")
summaryFile2 = os.path.join(_blastDir, 'blast-dm-parsed_summary.csv')
summaryFile3 = os.path.join(_blastDir, 'blast-dp-parsed_summary.csv')

bm = BlastMapper()

## load the isoform-level maps (trinityGene=False), best hit only
_loadOptions = {'trinityGene': False, 'best': True}
bmapSP = bm.load_summary(summaryFile1, **_loadOptions)
bmapDM = bm.load_summary(summaryFile2, **_loadOptions)
bmapDP = bm.load_summary(summaryFile3, **_loadOptions)

## print one summary per map
print("-----------")
for _title, _bmap in (("SwissProt - isoforms", bmapSP),
                      ("D. melanogaster - isoforms", bmapDM),
                      ("Danaus plexippus - isoforms", bmapDP)):
    print(_title)
    bm.print_summary(_bmap)

bm.make_taxa_pie_chart_and_table(
Ejemplo n.º 5
0
    def write(self,blastMap=None, transcriptMin=9, transcriptMax=1000,outFile="genesets.gmt"):
        """
        Write every cluster in self.allClusters as a gene set to a tab-delimited *.gmt file.

        When a blastMap is provided the genes of each cluster are replaced by their
        mapped transcripts and a companion *.csv file with the gene to transcript
        mapping is written next to the *.gmt file.

        blastMap: BlastMap returned after loading summary file in BlastMapper
        transcriptMin: minimum size for a gene set
        transcriptMax: maximum size for a gene set
        outFile: output file path (*.gmt)

        Gene sets whose size is outside [transcriptMin, transcriptMax] are not written.
        """

        print("---------------------")
        if self.gene2go:
            print('There are %s genes with at least one annotation'%(len(self.gene2go.keys())))
        print('There are %s genes in the labels file'%(len(self.genes)))

        gene2transcript = None
        if blastMap:
            bm = BlastMapper()
            bmGenes = bm.print_summary(blastMap)
            gene2transcript = bm.get_gene_dict(blastMap)
            if self.gene2go:
                usableGenes = list(set(bmGenes).intersection(set(self.gene2go.keys())))
            else:
                usableGenes = bmGenes

            print('There are %s genes with at least one BLAST hit'%(len(bmGenes)))
            print('There are %s genes that have both a BLAST hit and an annotation'%(len(usableGenes)))

        ## trailing single digit version suffix, e.g. "abc.1" -> "abc"
        versionSuffix = re.compile(r"\.[0-9]$")
        failedThreshold = 0

        ## prepare outfiles (closed in the finally clause, even on error)
        mapHandle = None
        gmtHandle = open(outFile,'w')
        try:
            writer = csv.writer(gmtHandle,delimiter="\t")

            if blastMap:
                outFileMap = re.sub(r"\.gmt",".csv",outFile)
                if outFileMap == outFile:
                    # outFile has no ".gmt" in it: do not overwrite the gene set file
                    outFileMap = outFile + ".csv"
                mapHandle = open(outFileMap,'w')
                writerMap = csv.writer(mapHandle)
                writerMap.writerow(["gene_set","gene_id","mapped_transcripts"])

            ## save gene sets to file
            for _k in self.allClusters:
                clusterInds = np.where(self.labels==_k)[0]
                clusterGenes = self.genes[clusterInds]
                gsName = "gs-"+str(_k)
                if self.gene2go:
                    description = self.get_description(clusterGenes)
                else:
                    description = "kegg pathway"

                ## map the genes
                if blastMap:
                    mapped = set()
                    for gene in clusterGenes:
                        if gene not in gene2transcript:
                            continue
                        # sorted so the output files are reproducible between runs
                        geneTranscripts = sorted(set(versionSuffix.sub("",g) for g in gene2transcript[gene]))
                        writerMap.writerow([gsName,gene,";".join(geneTranscripts)])
                        mapped.update(geneTranscripts)
                    mapped = sorted(mapped)
                else:
                    # a numpy array cannot be concatenated to the row list below
                    mapped = clusterGenes.tolist()

                if transcriptMin <= len(mapped) <= transcriptMax:
                    writer.writerow([gsName,description] + mapped)
                else:
                    failedThreshold += clusterGenes.size
        finally:
            gmtHandle.close()
            if mapHandle is not None:
                mapHandle.close()

        print("-----------------")
        print('Total clusters: %s '%self.allClusters.size)
        totalGenes = self.genes.size
        passedGenes = totalGenes - failedThreshold
        # guard against an empty labels file
        percentAccepted = 100.0 * passedGenes / totalGenes if totalGenes else 0.0
        print("genes pass threshold %s/%s (%s%%)"%(passedGenes,totalGenes,round(percentAccepted,2)))
Ejemplo n.º 6
0
    scps = SpectralClusterParamSearch(geneDistancePath,dtype='distance')
    scps.run(chunks=15)

## figure of the parameter search (skipped when the figure already exists)
psFigureFile = os.path.join(homeDir,"param-scan-%s.png"%_aspect)
if not os.path.exists(psFigureFile):
    scr = SpectralClusterResults(silvalFile,clustersFile)
    scr.plot(figName=psFigureFile)

## spectral clustering parameters
k,sigma = 20,0.08

## cluster and save the labels (skipped when the labels file already exists)
labelsPath = os.path.join(homeDir,"sc-labels-%s.csv"%_aspect)
if not os.path.exists(labelsPath):
    sc = SpectralCluster(geneDistancePath,dtype='distance')
    sc.run(k,sk=None,sigma=sigma,verbose=True)
    sc.save(labelsPath=labelsPath)

## blast map handed to the gene set writer
## NOTE(review): the summary path is relative to the current working directory
bm = BlastMapper()
_taxaList = ['8355','8364']
bmap = bm.load_summary('blast-parsed-summary.csv',best=False,taxaList=_taxaList)

## write the gene sets (skipped when the *.gmt file already exists)
transcriptMin = 9
transcriptMax = 1000
gsFile = os.path.join(homeDir,"%s.gmt"%_aspect)
if not os.path.exists(gsFile):
    gsc = GeneSetCollection(labelsPath,gene2go)
    gsc.write(blastMap=bmap,
              transcriptMin=transcriptMin,
              transcriptMax=transcriptMax,
              outFile=gsFile)

print("process complete.")
Ejemplo n.º 7
0
    taxaList = ['13037']
    uniprot = False
elif db == 'dm':
    species = 'Drosophila melanogaster'
    taxaList = ['7227']
    uniprot = False
else:
    species = None
    taxaList = []
    uniprot = True

## directory holding the blast results and the parsed file for the chosen db
_userHome = os.path.expanduser("~")
homeDir = os.path.join(_userHome, "sequencing", "pieris", "blast")
_parsedName = "blast-%s-parsed.csv" % db
parsedFilePath = os.path.realpath(os.path.join(homeDir, _parsedName))
bm = BlastMapper()

## read in the gene2ensembl file
if db == 'dm':
    fid = open(os.path.join(homeDir, 'gene2ensembl'), 'r')
    reader = csv.reader(fid, delimiter="\t")
    header = reader.next()
    id2gene = {}
    for linja in reader:
        if linja[0] != taxaList[0]:
            continue
        id2gene[linja[4]] = linja[1]
elif db == 'dp':
    transcript2uniprot = {}
    fid = open(
        os.path.join(homeDir, 'Danaus_plexippus.DanPle_1.0.28.uniprot.tsv'),
Ejemplo n.º 8
0
    def write(self,
              blastMap=None,
              transcriptMin=9,
              transcriptMax=1000,
              outFile="genesets.gmt"):
        """
        Write every cluster in self.allClusters as a gene set to a tab-delimited *.gmt file.

        When a blastMap is provided the genes of each cluster are replaced by their
        mapped transcripts and a companion *.csv file with the gene to transcript
        mapping is written next to the *.gmt file.

        blastMap: BlastMap returned after loading summary file in BlastMapper
        transcriptMin: minimum size for a gene set
        transcriptMax: maximum size for a gene set
        outFile: output file path (*.gmt)

        Gene sets whose size is outside [transcriptMin, transcriptMax] are not written.
        """

        print("---------------------")
        if self.gene2go:
            print('There are %s genes with at least one annotation' %
                  (len(self.gene2go.keys())))
        print('There are %s genes in the labels file' % (len(self.genes)))

        gene2transcript = None
        if blastMap:
            bm = BlastMapper()
            bmGenes = bm.print_summary(blastMap)
            gene2transcript = bm.get_gene_dict(blastMap)
            if self.gene2go:
                usableGenes = list(
                    set(bmGenes).intersection(set(self.gene2go.keys())))
            else:
                usableGenes = bmGenes

            print('There are %s genes with at least one BLAST hit' %
                  (len(bmGenes)))
            print(
                'There are %s genes that have both a BLAST hit and an annotation'
                % (len(usableGenes)))

        ## trailing single digit version suffix, e.g. "abc.1" -> "abc"
        versionSuffix = re.compile(r"\.[0-9]$")
        failedThreshold = 0

        ## prepare outfiles (closed in the finally clause, even on error)
        mapHandle = None
        gmtHandle = open(outFile, 'w')
        try:
            writer = csv.writer(gmtHandle, delimiter="\t")

            if blastMap:
                outFileMap = re.sub(r"\.gmt", ".csv", outFile)
                if outFileMap == outFile:
                    # outFile has no ".gmt" in it: do not overwrite the gene set file
                    outFileMap = outFile + ".csv"
                mapHandle = open(outFileMap, 'w')
                writerMap = csv.writer(mapHandle)
                writerMap.writerow(
                    ["gene_set", "gene_id", "mapped_transcripts"])

            ## save gene sets to file
            for _k in self.allClusters:
                clusterInds = np.where(self.labels == _k)[0]
                clusterGenes = self.genes[clusterInds]
                gsName = "gs-" + str(_k)
                if self.gene2go:
                    description = self.get_description(clusterGenes)
                else:
                    description = "kegg pathway"

                ## map the genes
                if blastMap:
                    mapped = set()
                    for gene in clusterGenes:
                        if gene not in gene2transcript:
                            continue
                        # sorted so the output files are reproducible between runs
                        geneTranscripts = sorted(
                            set(
                                versionSuffix.sub("", g)
                                for g in gene2transcript[gene]))
                        writerMap.writerow(
                            [gsName, gene, ";".join(geneTranscripts)])
                        mapped.update(geneTranscripts)
                    mapped = sorted(mapped)
                else:
                    mapped = clusterGenes

                ## a numpy array cannot be concatenated to the row list below
                if isinstance(mapped, np.ndarray):
                    mapped = mapped.tolist()

                if transcriptMin <= len(mapped) <= transcriptMax:
                    writer.writerow([gsName, description] + mapped)
                else:
                    failedThreshold += clusterGenes.size
        finally:
            gmtHandle.close()
            if mapHandle is not None:
                mapHandle.close()

        print("-----------------")
        print('Total clusters: %s ' % self.allClusters.size)
        totalGenes = self.genes.size
        passedGenes = totalGenes - failedThreshold
        # guard against an empty labels file
        percentAccepted = 100.0 * passedGenes / totalGenes if totalGenes else 0.0
        print("genes pass threshold %s/%s (%s%%)" %
              (passedGenes, totalGenes, round(percentAccepted, 2)))
Ejemplo n.º 9
0
def write_summary(name, aspect, transcript, assembly, geneset):
    """
    Write a csv report for a single gene set.

    The report lists, for every transcript of the gene set, its p-value,
    the genes it maps to and the GO terms inferred from those genes. A
    table of GO term counts with their descriptions follows.

    name, aspect: used to build the paths of the pickled GO dictionaries
                  and of the *.gmt file under ../results
    transcript: 'genes' loads the BLAST summary with trinityGene=True, any
                other value with trinityGene=False; it is also part of the
                DE results and *.gmt file names
    assembly: selects the <assembly>-trinity directory and its summary file
    geneset: name of the gene set, i.e. the first column of the *.gmt file

    Raises an Exception when the gene set file cannot be found.
    """

    ## load the go dictionaries
    termsPath = os.path.join("..", "results",
                             "go-terms-%s-%s.pickle" % (name, aspect))
    # NOTE(review): unpickling can execute arbitrary code, only load trusted files
    with open(termsPath, 'rb') as tmp:
        gene2go, _ = cPickle.load(tmp)

    ## load the blast map
    bm = BlastMapper()
    homeDir = os.path.join(os.path.expanduser("~"), "sequencing", "xenopus")

    summaryFile = os.path.join(homeDir, "%s-trinity" % (assembly),
                               'blast-%s-parsed_summary.csv' % assembly)
    bmap = bm.load_summary(summaryFile,
                           trinityGene=(transcript == 'genes'),
                           best=False,
                           taxaList=['8364', '8355', '9606'],
                           evalue=0.0001)

    ## get gene level differential expression results
    featuresDir = os.path.join(homeDir, "%s-trinity" % assembly, "features")
    deseqResultsPath = os.path.join(featuresDir,
                                    "deseq_%s_de.csv" % (transcript))
    deseqIds, deseqColumns, deseqMat = read_de_results(deseqResultsPath,
                                                       tool='DESeq')
    pvalInd = np.where(deseqColumns == 'pvalue')[0]

    ## input/output
    genesetSummaryFile = os.path.join(
        "..", "results", "genesets", "%s-%s-%s-%s-%s.csv" %
        (name, aspect, transcript, assembly, re.sub("gs-", "", geneset)))
    genesetFile = os.path.join(
        "..", "results",
        "%s-%s-%s-%s.gmt" % (name, aspect, assembly, transcript))

    if not os.path.exists(genesetFile):
        raise Exception("cannot find gene set file")

    ## read the gene sets: first column is the name, members start at the third
    allGenesets = {}
    with open(genesetFile, 'r') as fid:
        for linja in fid:
            fields = [re.sub(r"\s+", "", l) for l in linja.split("\t")]
            allGenesets[fields[0]] = fields[2:]

    gsTranscripts = allGenesets[geneset]

    ## map back to gene space, organizing the hits by hit[2]
    ## (presumably the species/taxon of the hit -- verify against BlastMapper)
    transcript2genes = {}
    for t in gsTranscripts:
        genesByTaxa = {}
        for hit in bmap[t]:
            genesByTaxa.setdefault(hit[2], []).append(hit[1])
        transcript2genes[t] = genesByTaxa

    ## get inferred go terms for each transcript
    transcript2go = {}
    for t, genesByTaxa in transcript2genes.items():
        terms = set()
        for genes in genesByTaxa.values():
            for gene in genes:
                if gene in gene2go:
                    terms.update(gene2go[gene])
        transcript2go[t] = sorted(terms)

    ## write to file
    termCounts = {}
    with open(genesetSummaryFile, 'w') as outHandle:
        writer = csv.writer(outHandle)
        writer.writerow(["transcript", "p-value", "genes", "go-terms"])

        for ts in gsTranscripts:
            pvalue = deseqMat[np.where(deseqIds == ts)[0], pvalInd][0]

            # report at most the first two genes from each group of hits
            reportedGenes = set()
            for genes in transcript2genes[ts].values():
                reportedGenes.update(genes[:2])
            # a transcript without hits gets "None" instead of an IndexError
            genes = ";".join(sorted(reportedGenes)) if reportedGenes else "None"

            terms = transcript2go[ts]
            for term in terms:
                termCounts[term] = termCounts.get(term, 0) + 1

            writer.writerow(
                [ts, pvalue, genes, ";".join(terms) if terms else "None"])

        writer.writerow(["--------"])

        ## write a summary of the go terms, most frequent first
        uniqueTerms = sorted(termCounts)
        allTerms = np.array(uniqueTerms)
        # float counts keep the numbers formatted as before (e.g. "3.0")
        allTermCounts = np.array([termCounts[term] for term in uniqueTerms],
                                 dtype=float)
        # stable sort so that ties come out in a reproducible order
        order = np.argsort(allTermCounts, kind='mergesort')[::-1]

        writer.writerow(["ID", "Counts", "Description"])
        for term, count in zip(allTerms[order], allTermCounts[order]):
            goTerm = session.query(GoTerm).filter(
                GoTerm.go_id == term).first()
            # the query returns None for a term missing from the database
            desc = goTerm.name if goTerm is not None else "None"
            writer.writerow([term, count, desc])
Ejemplo n.º 10
0
## figure of the parameter search (skipped when the figure already exists)
if not os.path.exists(psFigureFile):
    scr = SpectralClusterResults(silvalFile, clustersFile)
    scr.plot(figName=psFigureFile)

## spectral clustering parameters
k, sigma = 20, 0.08

## cluster and save the labels (skipped when the labels file already exists)
labelsPath = os.path.join(homeDir, "sc-labels-%s.csv" % _aspect)
if not os.path.exists(labelsPath):
    sc = SpectralCluster(geneDistancePath, dtype='distance')
    sc.run(k, sk=None, sigma=sigma, verbose=True)
    sc.save(labelsPath=labelsPath)

## blast map handed to the gene set writer
## NOTE(review): the summary path is relative to the current working directory
bm = BlastMapper()
_taxaList = ['8355', '8364']
bmap = bm.load_summary('blast-parsed-summary.csv',
                       best=False,
                       taxaList=_taxaList)

## write the gene sets (skipped when the *.gmt file already exists)
transcriptMin = 9
transcriptMax = 1000
gsFile = os.path.join(homeDir, "%s.gmt" % _aspect)
if not os.path.exists(gsFile):
    gsc = GeneSetCollection(labelsPath, gene2go)
    gsc.write(blastMap=bmap,
              transcriptMin=transcriptMin,
              transcriptMax=transcriptMax,
              outFile=gsFile)

print("process complete.")
Ejemplo n.º 11
0
## the -s option selects the source
for o, a in optlist:
    if o == '-s':
        source = a

## each source keeps its results in its own directory
homeDir = os.path.join(os.path.expanduser("~"), "sequencing", "xenopus")
if source == 'ref':
    sourceDir = "reference"
elif source in ['dn', 'gg']:
    sourceDir = "%s-trinity" % source
else:
    raise Exception("Bad source")

_resultsDir = os.path.join(homeDir, sourceDir)
summaryFile1 = os.path.join(_resultsDir,
                            "blast-%s-parsed_summary.csv" % source)
summaryFile2 = os.path.join(_resultsDir, "blast-xt-parsed_summary.csv")
bm = BlastMapper()

## load the isoform-level maps (trinityGene=False), best hit only;
## bmapFrog reads the same file as bmapIsoforms, restricted to two taxa
_loadOptions = {'trinityGene': False, 'best': True}
bmapIsoforms = bm.load_summary(summaryFile1, **_loadOptions)
bmapFrog = bm.load_summary(summaryFile1,
                           taxaList=['8355', '8364'],
                           **_loadOptions)
bmapXT = bm.load_summary(summaryFile2, **_loadOptions)

## print the summaries
print("-----------")
print("SwissProt - isoforms")
bm.print_summary(bmapIsoforms)
print("SwissProt [8355,8364] - isoforms")
bm.print_summary(bmapFrog)
print("X. tropicalis - isoforms")