Exemple #1
0
def genRandomSeqDbCat(path, length, nCopies):
    """Generate random reference sequences as SeqDbFasta.

    The first sequence is generated as random, and each next
    one is a concatenation of N copies of the first.

    Parameters:
        path: passed to SeqDbFasta() as the location of the new database.
        length: passed to randomSeq() to create the base sequence.
        nCopies: number of records to write; record i (0-based) holds the
            base sequence tiled i + 1 times and uses i as its id.

    The length of every written sequence is stored in seqDb.opt.seqLen
    (keyed by record id) before the database is saved.
    """
    seqDb = SeqDbFasta(path, save=True)
    seqBase = randomSeq(length)
    seqLen = {}
    for iIcm in xrange(nCopies):
        idIcm = iIcm
        writer = seqDb.fastaWriter(idIcm)
        try:
            seq = n.tile(seqBase, iIcm + 1)
            writer.record(str(idIcm), seq)
        finally:
            # Always release the writer, even when building or writing
            # the record fails.
            writer.close()
        # NOTE(review): genRandomSeqDbIid calls seqDb.finById(idIcm) at this
        # point; this function never did - confirm whether that is intended.
        seqLen[idIcm] = len(seq)
    seqDb.opt.seqLen = seqLen
    seqDb.save()
Exemple #2
0
def genRandomSeqDbIid(path, lengths):
    """Generate random reference sequences as SeqDbFasta.

    Each generated sequence is generated independently.

    Parameters:
        path: passed to SeqDbFasta() as the location of the new database.
        lengths: iterable of values passed one by one to randomSeq(); one
            record is written per element, and the element's position in
            the iterable is used as the record id.

    The requested length of every record is stored in seqDb.opt.seqLen
    (keyed by record id) before the database is saved.
    """
    seqDb = SeqDbFasta(path, save=True)
    # assert len(set(lengths)) == len(lengths),"Reference lengths must be unique"
    seqLen = {}
    for (iIcm, length) in enumerate(lengths):
        idIcm = iIcm
        writer = seqDb.fastaWriter(idIcm)
        try:
            seq = randomSeq(length)
            writer.record(str(idIcm), seq)
        finally:
            # Always release the writer, even when generating or writing
            # the record fails.
            writer.close()
        # Reached only after a successful write and close of the record.
        seqDb.finById(idIcm)
        seqLen[idIcm] = length
    seqDb.opt.seqLen = seqLen
    seqDb.save()
Exemple #3
0
def genInput():
    """Generate the input data used by this script.

    Calls genRandomSeqDbIid(seqDbPath, seqDbLengths) and then
    genRandomQueryZeroMarkov(queryPath, queryLengths). All four arguments
    are names defined outside this function.
    """
    genRandomSeqDbIid(seqDbPath, seqDbLengths)
    genRandomQueryZeroMarkov(queryPath, queryLengths)


# Generate the input data only when the script is started without any
# command-line arguments.
if len(sys.argv) == 1:
    genInput()

# Container for all options set below.
opt = Struct()
opt.runMode = "batchDep"

# training opts

opt.seqDb = seqDbPath
seqDb = SeqDbFasta(opt.seqDb)
ids = seqDb.getIdList()
# Map every id to a one-element list holding that same id, dump the mapping
# to a file with dumpObj() and hand the file name over through opt.
immIdToSeqIds = dict(((id, [id]) for id in ids))
immIdsFile = pjoin(workDir, "test.immapp.seqids.pkl")
dumpObj(immIdToSeqIds, immIdsFile)
opt.immIdToSeqIds = immIdsFile
opt.immDb = pjoin(workDir, "icm")

# scoring opts

# The ids listed by the store opened at opt.immDb are dumped to a second
# file; immIdsFile is rebound to that new file name.
# NOTE(review): unlike the file name used in the training section, this one
# is not joined with workDir - confirm that this is intended.
immIds = ImmStore.open(path=opt.immDb, mode="r").listImmIdsWithIdScoreIdent()
immIdsFile = "test.immapp.immids.pkl"
dumpObj(immIds, immIdsFile)
opt.immIds = immIdsFile
opt.nImmBatches = 10
opt.inpSeq = queryPath
Exemple #4
0
from MGT.SeqDbFasta import *

seqDbPath = pjoin(options.testDataDir,"seqdb-fasta")

seqDb = SeqDbFasta(seqDbPath)
ids = seqDb.getIdList()
# For every id in the database, call writeFastaBothStrands() with an output
# file named "<id>.tmp.fna.gz".
# The loop variable is seqId rather than id so that the builtin id() is not
# rebound at module level.
for seqId in ids:
    seqDb.writeFastaBothStrands([seqId],"%s.tmp.fna.gz" % (seqId,))


Exemple #5
0
from MGT.ImmApp import *
from MGT.SeqDbFasta import *

seqDbPath = pjoin(options.testDataDir, "seqdb-fasta")

jobs = []

# Options for ImmApp; runMode "batchDep" is the alternative kept in the
# trailing comment.
opt = Struct()
opt.runMode = "inproc"  # "batchDep"
opt.seqDb = seqDbPath
opt.immDb = pabs("test.immdb")
seqDb = SeqDbFasta(opt.seqDb)
ids = seqDb.getIdList()
# Call finById() for every id in the database.
# The loop variable is seqId rather than id so that the builtin id() is not
# rebound at module level; the keyword argument name of finById() is unchanged.
for seqId in ids:
    seqDb.finById(id=seqId)
# Map every id to a one-element list holding that same id, dump the mapping
# with dumpObj() and pass the file name through opt.
immIdToSeqIds = dict((seqId, [seqId]) for seqId in ids)
immIdToSeqIdsFile = pabs("test.immapp.seqids.pkl")
dumpObj(immIdToSeqIds, immIdToSeqIdsFile)
opt.immIdToSeqIds = immIdToSeqIdsFile

opt.mode = "train"

ImmApp.fillWithDefaultOptions(opt)

# The ImmApp run itself is currently disabled:
# imm = ImmApp(opt=opt)
# jobs = imm.run()

# Iterate over both reduceScoresEarly settings (1 and 0), each paired with
# its own name in cwd (presumably a working directory - confirm).
# NOTE(review): neither reduceScoresEarly nor cwd is used in the visible part
# of the body; the remainder of the loop appears to be missing here.
for (reduceScoresEarly, cwd) in ((1, "imm.test.red_early_1"), (0, "imm.test.red_early_0")):

    # List the ids from the store opened read-only at opt.immDb.
    immIds = ImmStore.open(path=opt.immDb, mode="r").listImmIdsWithIdScoreIdent()
    immIdsFile = pabs("test.immapp.immids.pkl")
Exemple #6
0
from MGT.TaxaTreeUtils import *

"""Count taxonomic groups represented in SeqDbFasta object.
Produces an SQLite DB file containing a table with a "Linnean" 
(in a broad sense) lineage for each SeqDb record, as well as 
the aggregate count tables for each taxonomic level."""

refDataDir = "../../atovtchi/shannon_viral_IO_paper/shannon_viral_paper_2011.v.1"
workDir = refDataDir
#dbPath = pjoin(workDir,"refseq-taxa")
dbPath = "seq-db.mic"
#taxonomyDir = pjoin(workDir,"taxonomy")

# Only taxa that are under at least one of these top-level taxids are
# passed to the writer below.
topTaxids =  micTaxids

dbSeq = SeqDbFasta(path=dbPath)
taxids = dbSeq.getTaxaList()

#taxaTree = loadTaxaTree(ncbiDumpFile=pjoin(taxonomyDir,"nodes.dmp"),
#                ncbiNamesDumpFile=pjoin(taxonomyDir,"names.dmp"))
taxaTree = loadTaxaTree()

topNodes = [ taxaTree.getNode(topTaxid) for topTaxid in topTaxids ]

linwr = LinnWriter(taxaTree=taxaTree)
wr = linwr.newWriter()

for taxid in taxids:
    node = taxaTree.getNode(taxid)
    # any() expresses the "under at least one top node" test directly and
    # stops at the first match instead of summing every result.
    if any(node.isUnder(topNode) for topNode in topNodes):
        # weight is the sum of the "len" field of this taxid's seqLengths().
        wr.send(dict(taxid=taxid,weight=dbSeq.seqLengths(taxid)["len"].sum()))