def mapFastaRecordsToTaxaTree(inSeqs,taxaTree,giToTaxa,
        storeHeader=False,storeSeq=False,storeSeqLen=False):
    """Attach FASTA records to the taxonomy tree nodes they map to.

    Every record yielded by ncbiFastaRecordsWithTaxa() is turned into a small
    Struct carrying its GI (plus optional header / sequence / length) and
    appended to the 'seq' list attribute of the node returned by
    taxaTree.getNode() for the record's taxid. The 'seq' attribute is created
    on a node the first time a record lands on it.

    @param inSeqs: iterable of inputs, each one is passed to FastaReader()
    @param taxaTree: taxonomy tree; if None, obtained from loadTaxaTree()
    @param giToTaxa: GI to taxid mapping; if None, obtained from loadGiTaxBin()
    @param storeHeader: if True, store the stripped FASTA header as .header
    @param storeSeq: if True, store the sequence as .seq
    @param storeSeqLen: if True, store the sequence length as .seqLen
    @return: the Struct that was passed as errorCounter to
    ncbiFastaRecordsWithTaxa() (presumably it accumulates the records that
    could not be mapped to taxonomy - confirm against that function)
    """
    from MGT.FastaIO import FastaReader
    if taxaTree is None:
        taxaTree = loadTaxaTree()
    if giToTaxa is None:
        giToTaxa = loadGiTaxBin()
    taxMis = Struct()
    for inSeq in inSeqs:
        inpSeq = FastaReader(inSeq)
        # try/finally so that the reader is released even when a record
        # fails to parse or to map onto the tree
        try:
            for rec in ncbiFastaRecordsWithTaxa(fastaReader=inpSeq,
                    taxaTree=taxaTree,
                    giToTaxa=giToTaxa,
                    errorCounter=taxMis):
                node = taxaTree.getNode(rec["meta_group"]["taxid"])
                if not hasattr(node,'seq'):
                    node.seq = []
                seqRec = Struct(gi=rec["meta"]["gi"])
                if storeHeader:
                    seqRec.header = rec["seq"].header().strip()
                # Reuse the length of the loaded sequence when we have it,
                # and only ask the record for its length otherwise
                seqLen = None
                if storeSeq:
                    seqRec.seq = rec["seq"].sequence()
                    seqLen = len(seqRec.seq)
                if storeSeqLen:
                    if seqLen is None:
                        seqLen = rec["seq"].seqLen()
                    seqRec.seqLen = seqLen
                node.seq.append(seqRec)
        finally:
            inpSeq.close()
    return taxMis
def _multi_iter():
    """Yield taxonomy-annotated records from all input files in turn.

    Generator closure: inSeqs, filt, taxaTree, giToTaxa and taxMis are taken
    from the enclosing scope. Each input is opened with FastaReader(), passed
    through filt() and then through ncbiFastaRecordsWithTaxa(), whose records
    are yielded unchanged.
    """
    for inSeq in inSeqs:
        inpSeq = FastaReader(inSeq)
        # The finally clause also runs when the consumer stops iterating
        # early and the generator is closed, so the reader never stays open
        try:
            for rec in ncbiFastaRecordsWithTaxa(fastaReader=filt(inpSeq),
                    taxaTree=taxaTree,
                    giToTaxa=giToTaxa,
                    errorCounter=taxMis):
                yield rec
        finally:
            inpSeq.close()
def loadProtIdsOrgType(self,db,idGenSeq,inserterSeq,orgType):
    """Load Fasta deflines for protein sequences generated by pullSeq,
    parse and insert them into SQL table.

    @param db: not used by this method (kept for the call signature)
    @param idGenSeq: callable returning a new sequence id on every call
    @param inserterSeq: callable that receives one tuple of values per record
    @param orgType: organism type; selects the '<orgType>.protein.cat.gz'
    file in self.store, and its first four characters are stored with
    every record
    """
    inFasta = self.store.getFilePath('%s.protein.cat.gz' % orgType)
    inp = FastaReader(inFasta)
    # try/finally so that the reader is closed if parsing or insertion fails
    try:
        for rec in inp.records():
            hdr = CvTreeId.splitFastaDeflineCvTree(rec.header())
            idSeq = idGenSeq()
            seqLen = rec.seqLen()
            values = (idSeq,hdr.taxid,seqLen,hdr.acc,hdr.acc_gen,orgType[:4])
            inserterSeq(values)
    finally:
        inp.close()
def loadGenomicIdsOrgType(self,db,idGenSeq,inserterSeq,orgType):
    """Load Fasta deflines for genomic sequences and insert the NC_ ones.

    Deflines are expected in the form 'gi|<gi>|ref|<accession>|<description>'.
    Records whose accession does not start with 'NC_' are skipped. Every
    remaining record is tagged with a genetic element type derived from its
    description ("pla", "ext", "tra" or "chr") and passed to inserterSeq.
    Note that plasmids are only tagged here, they are not dropped by this
    method.

    @param db: not used by this method (kept for the call signature)
    @param idGenSeq: callable returning a new sequence id on every call
    @param inserterSeq: callable that receives one tuple of values per record
    @param orgType: organism type; "outgroup" reads self.outGroupFna, any
    other value reads '<orgType>.genomic.fna.gz' from options.refSeqDataDir

    @todo It might be more robust to parse the GenBank file, e.g.
    FEATURES             Location/Qualifiers
         source          1..208369
                         /organism="Bacillus cereus ATCC 10987"
                         /mol_type="genomic DNA"
                         /strain="ATCC 10987"
                         /db_xref="ATCC:10987"
                         /db_xref="taxon:222523"
                         /plasmid="pBc10987"
         gene            join(207497..208369,1..687)
    We would have to fix the gap(unk100) bug first, and also check
    how the "extrachromosomal" is labeled in GB file.
    """
    if orgType == "outgroup":
        inFasta = self.outGroupFna
    else:
        inFasta = pjoin(options.refSeqDataDir,"%s.genomic.fna.gz" % orgType)
    inp = FastaReader(inFasta)
    # try/finally so that the reader is closed if a defline fails the format
    # checks below or the insertion raises
    try:
        for rec in inp.records():
            # drop the first character of the header (presumably the '>')
            line = rec.header()[1:]
            # at most 5 fields: the description may itself contain '|'
            parts = line.strip().split('|',4)
            assert parts[0] == 'gi'
            gi = int(parts[1])
            assert parts[2] == 'ref'
            acc = parts[3].strip()
            if acc[:3] == 'NC_':
                hdr = (' '.join(parts[4:])).strip()
                hlow = hdr.lower()
                ## genel values must differ by the first letter - this is used
                ## by the name generation method later
                # we can do re.search(r'\bplasmid\b',) instead, but this is
                # safer (there is a record called 'megaplasmid'):
                if 'plasmid' in hlow:
                    genel = "pla"
                elif 'extrachromosomal' in hlow or 'extra-chromosomal' in hlow:
                    genel = "ext"
                elif 'transposon' in hlow:
                    genel = "tra"
                else:
                    genel = "chr"
                idSeq = idGenSeq()
                seqLen = rec.seqLen()
                # the third field is always 0 here (presumably a placeholder
                # that is filled in later - confirm against the table schema)
                values = (idSeq,gi,0,seqLen,acc,genel,orgType[:4],hdr)
                inserterSeq(values)
    finally:
        inp.close()
def makeFeat(self):
    """Create feature vectors out of FASTA files.

    For every '*.fasta.gz' file found in self.store, feeds the first records
    of the file into a 2-mer KmerSparseFeatures counter, then prints the file
    name without the '.fasta.gz' suffix followed by the k-mer frequencies
    returned by the counter.
    """
    maxSampLen = 100000
    # at most this many records are processed from each file
    maxRecs = 101
    kmerCnt = KmerSparseFeatures(sampLen=maxSampLen,
            kmerLen=2,
            rcPolicy=RC_POLICY.MERGE,
            normPolicy=NORM_POLICY.FREQ)
    for fastaFile in self.store.getFilePaths("*.fasta.gz"):
        inpFasta = FastaReader(fastaFile)
        # try/finally so that the reader is closed if processing fails
        try:
            for iRec, rec in enumerate(inpFasta.records()):
                kmerCnt.process(rec.sequence(format="array"))
                if iRec + 1 >= maxRecs:
                    break
        finally:
            inpFasta.close()
        feat = kmerCnt.kmerFrequencies()
        # not named 'id' in order not to shadow the builtin
        idSamp = stripSfx(os.path.basename(fastaFile),".fasta.gz")
        # same output as 'print idSamp, feat', but also valid in Python 3
        print("%s %s" % (idSamp, feat))
def fastaToSvm(inFileFasta,outName,opt):
    """Convert a FASTA file into a sample file and save its metadata.

    The output writer is selected by opt.outFormat ("svm" or "fasta") and the
    conversion routine by opt.inFormat ("gos", "ca", or the generic one for
    any other value). After the conversion, prints how many samples were
    saved and a histogram of the original sample lengths in bins of 100, and
    dumps the metadata object into '<outName>.meta'.

    @param inFileFasta: input FASTA file, must not be the same path as outName
    @param outName: output file name
    @param opt: options object; the attributes read here are outFormat,
    fastaLineLen (only for "fasta" output), degenLen and inFormat. It is
    also passed to the conversion routine.
    @raise ValueError: if opt.outFormat is not one of "svm", "fasta"
    """
    assert not isSamePath(inFileFasta,outName)
    if opt.outFormat == "svm":
        svmWriter = SvmStringFeatureWriterTxt(outName)
    elif opt.outFormat == "fasta":
        svmWriter = SvmFastaFeatureWriterTxt(outName,lineLen=opt.fastaLineLen)
    else:
        # without this branch svmWriter would be left unbound and the
        # function would fail later with an obscure UnboundLocalError
        raise ValueError("Unknown output format: %r" % (opt.outFormat,))
    # nested try/finally blocks make sure that both the writer and the
    # reader are closed even if the conversion fails
    try:
        inpSeq = FastaReader(inFileFasta)
        try:
            # a negative degenLen disables compression of 'N' runs
            if opt.degenLen >= 0:
                symCompr = SymbolRunsCompressor('N',opt.degenLen)
            else:
                symCompr = lambda s: s
            if opt.inFormat == "gos":
                meta, allLen = gosToSvm(inpSeq,svmWriter,symCompr,opt)
            elif opt.inFormat == "ca":
                meta, allLen = caToSvm(inpSeq,svmWriter,symCompr,opt)
            else:
                meta, allLen = genericFastaToSvm(inpSeq,svmWriter,symCompr,opt)
        finally:
            inpSeq.close()
    finally:
        svmWriter.close()
    print("Saved %i samples out of %i total from file %s" % (len(meta.samp),len(allLen),inFileFasta))
    # allLen.max() raises on an empty array; skip the histogram in that case
    # so that the metadata still gets saved next to the (empty) output
    if len(allLen) > 0:
        lenHist = numpy.histogram(allLen,bins=numpy.arange(0,allLen.max()+100,100,dtype='f8'))
        print("Original sample length histogram:\n%s\n%s" % lenHist)
    dumpObj(meta,outName+".meta")