Beispiel #1
0
class BlastMapperTest(unittest.TestCase):
    """
    Exercise BlastMapper summary creation and loading against the
    'blast-parsed.csv' file that sits next to this test module.
    """

    def setUp(self):
        """
        Resolve the fixture paths and create a fresh BlastMapper
        """

        self.parsedFile = os.path.join(os.path.dirname(__file__), "blast-parsed.csv")
        # Same substitution the tests have always used (every ".csv" is removed,
        # not only a trailing one); the pattern is now a raw string so "\." is
        # not an invalid string escape.
        self.summaryFile = re.sub(r"\.csv", "", self.parsedFile) + "_summary.csv"
        self.bm = BlastMapper()

    def test01Summarize(self):
        """
        create_summarized must write the summary file next to the parsed file
        """

        # remove any stale output so the assertion proves the file was (re)written
        if os.path.exists(self.summaryFile):
            os.remove(self.summaryFile)

        self.bm.create_summarized(self.parsedFile, uniprot=True)
        self.assertTrue(os.path.exists(self.summaryFile))

    def test02Something(self):
        """
        load the summary with different options and check a known transcript
        """

        # Do not rely on test01 having run first: build the summary on demand
        # with the same call test01 uses.
        if not os.path.exists(self.summaryFile):
            self.bm.create_summarized(self.parsedFile, uniprot=True)

        # default options: keys carry the isoform suffix ("_i1")
        bmap = self.bm.load_summary(self.summaryFile, taxaList=["10090"])
        self.assertEqual(bmap['GG11117|c2_g1_i1'][0], 'INT1_MOUSE')
        self.assertEqual(bmap['GG11117|c2_g1_i1'][1], '68510')

        # trinityGene=True: the same hit is keyed without the isoform suffix
        bmap = self.bm.load_summary(self.summaryFile, taxaList=["10090"], trinityGene=True)
        self.assertEqual(bmap['GG11117|c2_g1'][0], 'INT1_MOUSE')

        # best=False: each key maps to a sequence of hits rather than a single hit
        # (index 4 of a hit is presumably the e-value -- confirm against BlastMapper)
        bmap = self.bm.load_summary(self.summaryFile, taxaList=["10090"],
                                    trinityGene=True, best=False)
        self.assertEqual(bmap['GG11117|c2_g1'][0][0], 'INT1_MOUSE')
        self.assertEqual(bmap['GG11117|c2_g1'][0][4], 0.0)
Beispiel #2
0
              for r in geneQueries]))
    gene2sym.update(
        dict([(str(r['ncbi_id']), str(r['symbol'])) for r in geneQueries]))

## load the best-hit, isoform-level blast maps from the four dn-trinity summaries
bm = BlastMapper()
summaryFile1, summaryFile2, summaryFile3, summaryFile4 = (
    os.path.join(homeDir, "dn-trinity", fileName)
    for fileName in ('blast-dn-parsed_summary.csv',
                     'blast-dm-parsed_summary.csv',
                     "blast-mc-parsed_summary.csv",
                     'blast-dp-parsed_summary.csv'))

bmapSP, bmapDM, bmapMC, bmapDP = (
    bm.load_summary(summaryPath, trinityGene=False, best=True)
    for summaryPath in (summaryFile1, summaryFile2, summaryFile3, summaryFile4))

## prepare supplement output
## row / head are the "+----+" and "+====+" rules of a text table; each cell is
## one character wider than its entry in columns
columns = [20, 22, 25, 50, 17, 17, 17]
row, head = "+", "+"
for col in columns:
    head = "%s%s+" % (head, "=" * (col + 1))
    row = "%s%s+" % (row, "-" * (col + 1))

print("\nGene sets\n_____________________")
print(row)
items = [
Beispiel #3
0
    scps = SpectralClusterParamSearch(geneDistancePath,dtype='distance')
    scps.run(chunks=15)

## render the parameter-search figure unless it is already on disk
psFigureFile = os.path.join(homeDir, "param-scan-%s.png" % (_aspect))
if not os.path.exists(psFigureFile):
    scr = SpectralClusterResults(silvalFile, clustersFile)
    scr.plot(figName=psFigureFile)

## spectral clustering with a fixed k and sigma; skipped when labels exist
k, sigma = 20, 0.08

labelsPath = os.path.join(homeDir, "sc-labels-%s.csv" % (_aspect))
if not os.path.exists(labelsPath):
    sc = SpectralCluster(geneDistancePath, dtype='distance')
    sc.run(k, sk=None, sigma=sigma, verbose=True)
    sc.save(labelsPath=labelsPath)

## write the gene sets (only when the .gmt file is missing)
bm = BlastMapper()
bmap = bm.load_summary(
    'blast-parsed-summary.csv',
    best=False,
    taxaList=['8355', '8364'],
)

transcriptMin = 9
transcriptMax = 1000
gsFile = os.path.join(homeDir, "%s.gmt" % (_aspect))
if not os.path.exists(gsFile):
    gsc = GeneSetCollection(labelsPath, gene2go)
    gsc.write(
        blastMap=bmap,
        transcriptMin=transcriptMin,
        transcriptMax=transcriptMax,
        outFile=gsFile,
    )

print("process complete.")
Beispiel #4
0
def _semicolon_joined(values):
    """Return "None" for an empty list, the lone item for one, else a ';'-joined string."""
    if not values:
        return "None"
    if len(values) > 1:
        return ";".join(values)
    return values[0]


def write_summary(name, aspect, transcript, assembly, geneset):
    """
    Write a csv report for one gene set.

    The report has one row per transcript of the gene set (transcript id,
    DESeq p-value, mapped genes, inferred GO terms), then a separator row,
    then a table of every GO term seen with the number of transcripts
    carrying it, most frequent first.

    Arguments:
        name, aspect -- used to build the pickle, .gmt and output file names
        transcript   -- 'genes' loads the blast map with trinityGene=True, any
                        other value with trinityGene=False; it is also part of
                        the DESeq, .gmt and output file names
        assembly     -- names the '<assembly>-trinity' directory and its files
        geneset      -- name of the gene set to report (first field of a .gmt
                        line); a "gs-" prefix is dropped in the output file name

    Raises:
        Exception -- when the .gmt gene set file does not exist
        KeyError  -- when geneset is not in the .gmt file, or one of its
                     transcripts is missing from the blast map
    """
    ## load the go dictionaries
    termsPath = os.path.join("..", "results",
                             "go-terms-%s-%s.pickle" % (name, aspect))
    # NOTE(review): unpickling runs arbitrary code -- only point this at
    # trusted result files. The open mode is left as the original 'r'.
    with open(termsPath, 'r') as tmp:
        gene2go, go2gene = cPickle.load(tmp)  # go2gene is not used below

    ## load the blast map
    bm = BlastMapper()
    homeDir = os.path.join(os.path.expanduser("~"), "sequencing", "xenopus")

    summaryFile = os.path.join(homeDir, "%s-trinity" % (assembly),
                               'blast-%s-parsed_summary.csv' % assembly)
    # gene-level and isoform-level runs differ only in the trinityGene flag
    bmap = bm.load_summary(summaryFile,
                           trinityGene=(transcript == 'genes'),
                           best=False,
                           taxaList=['8364', '8355', '9606'],
                           evalue=0.0001)

    ## get gene level differential exp results
    featuresDir = os.path.join(homeDir, "%s-trinity" % assembly, "features")
    deseqResultsPath = os.path.join(featuresDir,
                                    "deseq_%s_de.csv" % (transcript))
    deseqIds, deseqColumns, deseqMat = read_de_results(deseqResultsPath,
                                                       tool='DESeq')
    pvalInd = np.where(deseqColumns == 'pvalue')[0]

    ## input/output
    genesetSummaryFile = os.path.join(
        "..", "results", "genesets", "%s-%s-%s-%s-%s.csv" %
        (name, aspect, transcript, assembly, re.sub("gs-", "", geneset)))
    genesetFile = os.path.join(
        "..", "results",
        "%s-%s-%s-%s.gmt" % (name, aspect, assembly, transcript))

    if not os.path.exists(genesetFile):
        raise Exception("cannot find gene set file")

    ## read the tab-delimited gene sets: field 0 is the set name, field 1 is
    ## skipped and the remaining fields are the member transcripts
    allGenesets = {}
    with open(genesetFile, 'r') as fid:
        for linja in fid:
            fields = [re.sub(r"\s+", "", field) for field in linja.split("\t")]
            allGenesets[fields[0]] = fields[2:]

    gsTranscripts = allGenesets[geneset]

    ## map back to gene space: per transcript, group hit[1] (the gene) under
    ## hit[2] (the species/taxon of the hit)
    transcript2genes = {}
    for t in gsTranscripts:
        hitsBySpecies = {}
        for hit in bmap[t]:
            hitsBySpecies.setdefault(hit[2], []).append(hit[1])
        transcript2genes[t] = hitsBySpecies

    ## get inferred go terms for each transcript (unique and sorted)
    transcript2go = {}
    for t, hitsBySpecies in transcript2genes.items():
        terms = set()
        for genes in hitsBySpecies.values():
            for gene in genes:
                if gene in gene2go:
                    terms.update(gene2go[gene])
        transcript2go[t] = sorted(terms)

    ## write to file; the with-block closes the report even if a lookup fails
    with open(genesetSummaryFile, 'w') as outFid:
        writer = csv.writer(outFid)
        writer.writerow(["transcript", "p-value", "genes", "go-terms"])
        allTerms = []

        for ts in gsTranscripts:
            # row of this transcript, 'pvalue' column of the DESeq matrix
            pvalue = deseqMat[np.where(deseqIds == ts)[0], pvalInd][0]

            # report at most the first two genes per species, without duplicates
            reportedGenes = []
            for genes in transcript2genes[ts].values():
                reportedGenes.extend(genes[:2])
            reportedGenes = list(set(reportedGenes))

            terms = transcript2go[ts]
            allTerms.extend(terms)

            # a transcript without hits used to crash on reportedGenes[0];
            # it is now reported as "None", like a transcript without terms
            writer.writerow([ts, pvalue,
                             _semicolon_joined(reportedGenes),
                             _semicolon_joined(terms)])

        writer.writerow(["--------"])

        ## write a summary of the go terms
        ## each transcript contributed its unique terms to allTerms, so the
        ## multiplicity of a term there is the number of transcripts carrying it
        occurrences = {}
        for term in allTerms:
            occurrences[term] = occurrences.get(term, 0) + 1

        uniqueTerms = list(set(allTerms))
        allTerms = np.array(uniqueTerms)
        # float counts, as before, so the written values keep their format
        allTermCounts = np.array([occurrences[term] for term in uniqueTerms],
                                 dtype=float)

        order = np.argsort(allTermCounts)[::-1]
        sortedTerms = allTerms[order]
        sortedCounts = allTermCounts[order]
        writer.writerow(["ID", "Counts", "Description"])
        for t, term in enumerate(sortedTerms):
            # NOTE(review): if .first() finds no matching GoTerm this raises
            # AttributeError, as in the original -- confirm every term exists
            desc = session.query(GoTerm).filter(GoTerm.go_id == term).first().name
            writer.writerow([term, sortedCounts[t], desc])
Beispiel #5
0
## draw the parameter scan only when the figure file is missing
if not os.path.exists(psFigureFile):
    scr = SpectralClusterResults(silvalFile, clustersFile)
    scr.plot(figName=psFigureFile)

## clustering settings
k, sigma = 20, 0.08

## run spectral clustering unless the labels were already saved
labelsPath = os.path.join(homeDir, "sc-labels-%s.csv" % _aspect)
if not os.path.exists(labelsPath):
    sc = SpectralCluster(geneDistancePath, dtype='distance')
    sc.run(k, sigma=sigma, sk=None, verbose=True)
    sc.save(labelsPath=labelsPath)

## load every hit (best=False) for the two listed taxa
bm = BlastMapper()
bmap = bm.load_summary('blast-parsed-summary.csv', taxaList=['8355', '8364'], best=False)

## save the gene sets, bounded by transcript count, if not yet written
transcriptMin = 9
transcriptMax = 1000
gsFile = os.path.join(homeDir, "%s.gmt" % _aspect)
if not os.path.exists(gsFile):
    gsc = GeneSetCollection(labelsPath, gene2go)
    gsc.write(outFile=gsFile, blastMap=bmap,
              transcriptMin=transcriptMin, transcriptMax=transcriptMax)

print("process complete.")
Beispiel #6
0
## pick the results directory that belongs to the requested source
homeDir = os.path.join(os.path.expanduser("~"), "sequencing", "xenopus")
if source == 'ref':
    sourceDir = "reference"
elif source in ('dn', 'gg'):
    sourceDir = "%s-trinity" % (source)
else:
    raise Exception("Bad source")

summaryFile1 = os.path.join(
    homeDir, sourceDir, "blast-%s-parsed_summary.csv" % (source))
summaryFile2 = os.path.join(
    homeDir, sourceDir, "blast-xt-parsed_summary.csv")
bm = BlastMapper()

## best-hit isoform maps: all taxa, the two listed taxa only, and the xt summary
bmapIsoforms = bm.load_summary(summaryFile1, best=True, trinityGene=False)
bmapFrog = bm.load_summary(summaryFile1, best=True, trinityGene=False,
                           taxaList=['8355', '8364'])
bmapXT = bm.load_summary(summaryFile2, best=True, trinityGene=False)

## print one titled summary per map
print("-----------")
for sectionTitle, sectionMap in (
        ("SwissProt - isoforms", bmapIsoforms),
        ("SwissProt [8355,8364] - isoforms", bmapFrog),
        ("X. tropicalis - isoforms", bmapXT)):
    print(sectionTitle)
    bm.print_summary(sectionMap)

bm.make_taxa_pie_chart_and_table(