Exemple #1
0
    def setUp(self):
        """
        Prepare the fixtures used by the tests: the path of the
        "blast-parsed.csv" file that sits next to this module and a
        BlastMapper instance.
        """

        moduleDir = os.path.dirname(__file__)
        self.parsedFile = os.path.join(moduleDir, "blast-parsed.csv")
        self.bm = BlastMapper()
Exemple #2
0
class BlastMapperTest(unittest.TestCase):
    """
    Run a number of tests on BlastMapper using the parsed BLAST results
    file (blast-parsed.csv) located next to this module
    """

    def setUp(self):
        """
        locate the parsed BLAST file, derive the path of its summary file
        and create the BlastMapper under test
        """

        self.parsedFile = os.path.join(os.path.dirname(__file__),"blast-parsed.csv")
        ## raw and anchored pattern: only the trailing '.csv' extension is
        ## stripped, a '.csv' elsewhere in the path is left untouched
        self.summaryFile = re.sub(r"\.csv$","",self.parsedFile)+"_summary.csv"
        self.bm = BlastMapper()

    def test01Summarize(self):
        """
        test the summarize function
        """

        ## start from a clean state so the assertion proves the file was created
        if os.path.exists(self.summaryFile):
            os.remove(self.summaryFile)

        self.bm.create_summarized(self.parsedFile,uniprot=True)
        self.assertTrue(os.path.exists(self.summaryFile))

    def test02Something(self):
        """
        read in the results summary
        """

        ## create the summary when it is missing so that this test does not
        ## depend on test01Summarize having been run first
        if not os.path.exists(self.summaryFile):
            self.bm.create_summarized(self.parsedFile,uniprot=True)

        ## default call: transcript id -> best hit
        bmap = self.bm.load_summary(self.summaryFile,taxaList=["10090"])
        self.assertEqual(bmap['GG11117|c2_g1_i1'][0],'INT1_MOUSE')
        self.assertEqual(bmap['GG11117|c2_g1_i1'][1],'68510')

        ## trinityGene=True: keys are gene ids (no isoform suffix)
        bmap = self.bm.load_summary(self.summaryFile,taxaList=["10090"],trinityGene=True)
        self.assertEqual(bmap['GG11117|c2_g1'][0],'INT1_MOUSE')

        ## best=False: each key maps to a list of hits
        bmap = self.bm.load_summary(self.summaryFile,taxaList=["10090"],trinityGene=True,best=False)
        self.assertEqual(bmap['GG11117|c2_g1'][0][0],'INT1_MOUSE')
        self.assertEqual(bmap['GG11117|c2_g1'][0][4],0.0)
Exemple #3
0
    def write(self,blastMap=None, transcriptMin=9, transcriptMax=1000,outFile="genesets.gmt"):
        """
        Write the clusters as gene sets to a tab-delimited *.gmt file.

        One row is written per cluster in self.allClusters: the gene set
        name ("gs-<cluster>"), a description and the members of the set.
        A cluster whose number of members is outside
        [transcriptMin, transcriptMax] is not written and its genes are
        counted as failing the threshold.  A short report is printed.

        blastMap: BlastMap returned after loading summary file in BlastMapper.
                  When provided the genes of each cluster are replaced by their
                  mapped transcripts and a *.csv file (outFile with the .gmt
                  extension replaced by .csv) with the gene transcript mapping
                  is also created
        transcriptMin: minimum size for a gene set
        transcriptMax: maximum size for a gene set
        outFile: specifies the output file path (*.gmt)

        """

        print("---------------------")
        if self.gene2go:
            print('There are %s genes with at least one annotation'%(len(self.gene2go.keys())))
        print('There are %s genes in the labels file'%(len(self.genes)))

        gene2transcript = None
        outFileMap = None
        if blastMap:
            bm = BlastMapper()
            bmGenes = bm.print_summary(blastMap)
            gene2transcript = bm.get_gene_dict(blastMap)
            if self.gene2go:
                usableGenes = list(set(bmGenes).intersection(set(self.gene2go.keys())))
            else:
                usableGenes = bmGenes

            print('There are %s genes with at least one BLAST hit'%(len(bmGenes)))
            print('There are %s genes that have both a BLAST hit and an annotation'%(len(usableGenes)))

            ## only the trailing extension is replaced; never let the mapping
            ## file collide with (and overwrite) the gene set file itself
            outFileMap = re.sub(r"\.gmt$",".csv",outFile)
            if outFileMap == outFile:
                outFileMap = outFile + ".csv"

        ## strips a single-digit version suffix such as '.1' from a transcript id
        versionPattern = re.compile(r"\.[0-9]$")

        ## save gene sets to file
        failedThreshold = 0

        ## prepare outfiles (closed in the finally clause so that the rows are
        ## flushed to disk even when an error occurs part way through)
        gmtHandle = open(outFile,'w')
        mapHandle = None
        try:
            writer = csv.writer(gmtHandle,delimiter="\t")
            writerMap = None
            if outFileMap is not None:
                mapHandle = open(outFileMap,'w')
                writerMap = csv.writer(mapHandle)
                writerMap.writerow(["gene_set","gene_id","mapped_transcripts"])

            for _k in self.allClusters:
                clusterInds = np.where(self.labels==_k)[0]
                clusterGenes = self.genes[clusterInds]
                gsName = "gs-"+str(_k)
                if self.gene2go:
                    description = self.get_description(clusterGenes)
                else:
                    description = "kegg pathway"

                ## map the genes
                if blastMap:
                    mapped = set()
                    for gene in clusterGenes:
                        ## 'in' works on both Python 2 and 3 (dict.has_key is Python 2 only)
                        if gene not in gene2transcript:
                            continue

                        ## unique, unversioned transcripts in a reproducible order
                        geneTranscripts = sorted(set(versionPattern.sub("",g) for g in gene2transcript[gene]))
                        writerMap.writerow([gsName,gene,";".join(geneTranscripts)])
                        mapped.update(geneTranscripts)
                    mapped = sorted(mapped)
                else:
                    ## clusterGenes is a numpy array: convert it so that '+' below
                    ## concatenates lists instead of invoking array arithmetic
                    mapped = list(clusterGenes)

                if transcriptMin <= len(mapped) <= transcriptMax:
                    writer.writerow([gsName,description] + mapped)
                else:
                    failedThreshold += clusterGenes.size
        finally:
            gmtHandle.close()
            if mapHandle is not None:
                mapHandle.close()

        print("-----------------")
        print('Total clusters: %s '%self.allClusters.size)
        totalGenes = self.genes.size
        passedGenes = totalGenes - failedThreshold
        ## guard against an empty labels file
        if totalGenes > 0:
            percentAccepted = 100.0 * float(passedGenes) / float(totalGenes)
        else:
            percentAccepted = 0.0
        print("genes pass threshold %s/%s (%s%%)"%(passedGenes,totalGenes,round(percentAccepted,2)))
Exemple #4
0
    scps = SpectralClusterParamSearch(geneDistancePath,dtype='distance')
    scps.run(chunks=15)

## plot the parameter search
## (skipped when the figure for this aspect already exists in homeDir)
psFigureFile = os.path.join(homeDir,"param-scan-%s.png"%(_aspect))
if not os.path.exists(psFigureFile):
    scr = SpectralClusterResults(silvalFile,clustersFile)
    scr.plot(figName=psFigureFile)

## run spectral clustering
## k and sigma are passed to SpectralCluster.run below
## NOTE(review): presumably chosen by inspecting the parameter scan figure - confirm
k = 20
sigma = 0.08

## the labels are computed and saved only once; an existing labels file is reused
labelsPath = os.path.join(homeDir,"sc-labels-%s.csv"%(_aspect))
if not os.path.exists(labelsPath):
    sc = SpectralCluster(geneDistancePath,dtype='distance')
    sc.run(k,sk=None,sigma=sigma,verbose=True)
    sc.save(labelsPath=labelsPath)

## Save gene sets
## the summary path is relative, so it is resolved against the current working directory
## NOTE(review): the taxaList entries look like NCBI taxonomy ids - confirm
## NOTE(review): the summary is loaded even when the gene set file below already exists
bm = BlastMapper()
bmap = bm.load_summary('blast-parsed-summary.csv',best=False,taxaList=['8355','8364'])

## size limits handed to GeneSetCollection.write
transcriptMin,transcriptMax = 9,1000
gsFile = os.path.join(homeDir,"%s.gmt"%(_aspect))
## an existing gene set file is never overwritten
if not os.path.exists(gsFile):
    gsc = GeneSetCollection(labelsPath,gene2go)
    gsc.write(blastMap=bmap,transcriptMin=transcriptMin,transcriptMax=transcriptMax,outFile=gsFile)

print("process complete.")