def genRandomSeqDbCat(path, length, nCopies):
    """Generate random reference sequences as SeqDbFasta.

    One random base sequence of the given length is generated, and the
    record with 0-based id i is built as (i+1) concatenated copies of
    that base sequence.

    @param path directory path for the new SeqDbFasta store
    @param length length of the random base sequence
    @param nCopies number of records to create (max tiling factor)
    """
    seqDb = SeqDbFasta(path, save=True)
    seqBase = randomSeq(length)
    seqLen = {}
    for iIcm in xrange(nCopies):
        idIcm = iIcm
        writer = seqDb.fastaWriter(idIcm)
        # record i holds i+1 tiled copies of the base sequence
        seq = n.tile(seqBase, iIcm + 1)
        writer.record(str(idIcm), seq)
        writer.close()
        # finalize the per-id record, consistent with genRandomSeqDbIid
        seqDb.finById(idIcm)
        seqLen[idIcm] = len(seq)
    # remember per-record lengths in the store options and persist
    seqDb.opt.seqLen = seqLen
    seqDb.save()
def genRandomSeqDbIid(path, lengths):
    """Generate random reference sequences as SeqDbFasta.

    Each record is drawn independently at random; record ids are the
    0-based positions within 'lengths'.

    @param path directory path for the new SeqDbFasta store
    @param lengths iterable of per-record sequence lengths
    """
    seqDb = SeqDbFasta(path, save=True)
    seqLen = {}
    for (idIcm, recLen) in enumerate(lengths):
        writer = seqDb.fastaWriter(idIcm)
        writer.record(str(idIcm), randomSeq(recLen))
        writer.close()
        seqDb.finById(idIcm)
        seqLen[idIcm] = recLen
    # persist per-record lengths in the store options
    seqDb.opt.seqLen = seqLen
    seqDb.save()
def genInput():
    """Generate the random reference SeqDb and query sequences for this test."""
    genRandomSeqDbIid(seqDbPath, seqDbLengths)
    genRandomQueryZeroMarkov(queryPath, queryLengths)

# regenerate input data only when no extra command-line arguments are given
if len(sys.argv) == 1:
    genInput()

opt = Struct()
opt.runMode = "batchDep"

# training opts
opt.seqDb = seqDbPath
seqDb = SeqDbFasta(opt.seqDb)
ids = seqDb.getIdList()
# one model per sequence id
immIdToSeqIds = dict(((id, [id]) for id in ids))
immIdsFile = pjoin(workDir, "test.immapp.seqids.pkl")
dumpObj(immIdToSeqIds, immIdsFile)
opt.immIdToSeqIds = immIdsFile
opt.immDb = pjoin(workDir, "icm")

# scoring opts
immIds = ImmStore.open(path=opt.immDb, mode="r").listImmIdsWithIdScoreIdent()
# BUGFIX: place this pickle under workDir like the seqids file above,
# instead of dumping it into the current directory
immIdsFile = pjoin(workDir, "test.immapp.immids.pkl")
dumpObj(immIds, immIdsFile)
opt.immIds = immIdsFile
opt.nImmBatches = 10
opt.inpSeq = queryPath
from MGT.SeqDbFasta import *

seqDbPath = pjoin(options.testDataDir, "seqdb-fasta")

# Dump every record of the test SeqDb, with both strands, into one
# gzipped FASTA file per record id.
seqDb = SeqDbFasta(seqDbPath)
ids = seqDb.getIdList()
for id in ids:
    outFile = "%s.tmp.fna.gz" % (id,)
    seqDb.writeFastaBothStrands([id], outFile)
from MGT.ImmApp import *
from MGT.SeqDbFasta import *

# Build ImmApp options for training ICM models over the test SeqDb,
# then prepare per-variant scoring runs.
seqDbPath = pjoin(options.testDataDir, "seqdb-fasta")
jobs = []
opt = Struct()
opt.runMode = "inproc" # "batchDep"
opt.seqDb = seqDbPath
opt.immDb = pabs("test.immdb")
seqDb = SeqDbFasta(opt.seqDb)
ids = seqDb.getIdList()
# finalize every record before training — presumably flushes/commits
# each id's data; TODO confirm against SeqDbFasta.finById
for id in ids:
    seqDb.finById(id=id)
# one model per sequence id
immIdToSeqIds = dict(((id, [id]) for id in ids))
immIdToSeqIdsFile = pabs("test.immapp.seqids.pkl")
dumpObj(immIdToSeqIds, immIdToSeqIdsFile)
opt.immIdToSeqIds = immIdToSeqIdsFile
opt.mode = "train"
ImmApp.fillWithDefaultOptions(opt)
# imm = ImmApp(opt=opt)
# jobs = imm.run()
# Score twice, with early score reduction switched on and off, each in
# its own working directory (loop body may continue beyond this chunk).
for (reduceScoresEarly, cwd) in ((1, "imm.test.red_early_1"), (0, "imm.test.red_early_0")):
    immIds = ImmStore.open(path=opt.immDb, mode="r").listImmIdsWithIdScoreIdent()
    immIdsFile = pabs("test.immapp.immids.pkl")
from MGT.TaxaTreeUtils import *

"""Count taxonomic groups represented in SeqDbFasta object.
Produces an SQLite DB file containing a table with a "Linnean" (in a broad sense)
lineage for each SeqDb record, as well as the aggregate count tables for each
taxonomic level."""

refDataDir = "../../atovtchi/shannon_viral_IO_paper/shannon_viral_paper_2011.v.1"
workDir = refDataDir
dbPath = "seq-db.mic"
topTaxids = micTaxids

dbSeq = SeqDbFasta(path=dbPath)
taxids = dbSeq.getTaxaList()
taxaTree = loadTaxaTree()
topNodes = [taxaTree.getNode(topTaxid) for topTaxid in topTaxids]

linwr = LinnWriter(taxaTree=taxaTree)
wr = linwr.newWriter()
# Emit one weighted lineage record per taxid that falls under any of the
# top nodes; the weight is the total stored sequence length for that taxid.
for taxid in taxids:
    node = taxaTree.getNode(taxid)
    if any(node.isUnder(topNode) for topNode in topNodes):
        wr.send(dict(taxid=taxid, weight=dbSeq.seqLengths(taxid)["len"].sum()))