def loadProtIdsOrgType(self,db,idGenSeq,inserterSeq,orgType):
    """Load Fasta deflines for protein sequences generated by pullSeq,
    parse and insert them into SQL table.

    The input file is '<orgType>.protein.cat.gz', resolved through
    self.store.getFilePath(). One row is emitted per FASTA record.

    @param db: not used by this method (kept for a signature parallel to
        the other load*OrgType methods)
    @param idGenSeq: callable without arguments that returns the next
        sequence id
    @param inserterSeq: callable that receives one row tuple
        (idSeq, taxid, seqLen, acc, acc_gen, orgType[:4])
    @param orgType: organism type name; selects the input file, and its
        first four characters are stored in each row
    """
    inFasta = self.store.getFilePath('%s.protein.cat.gz' % orgType)
    inp = FastaReader(inFasta)
    # Close the reader even if parsing or insertion fails half-way through
    # the file, so that the underlying file handle is not leaked.
    try:
        for rec in inp.records():
            hdr = CvTreeId.splitFastaDeflineCvTree(rec.header())
            idSeq = idGenSeq()
            seqLen = rec.seqLen()
            values = (idSeq,hdr.taxid,seqLen,hdr.acc,hdr.acc_gen,orgType[:4])
            inserterSeq(values)
    finally:
        inp.close()
def loadGenomicIdsOrgType(self,db,idGenSeq,inserterSeq,orgType):
    """Load Fasta deflines for genomic sequences and insert one row per
    NC_ record.

    Deflines are expected in the form '>gi|<gi>|ref|<accession>|<title>'.
    Records whose accession does not start with 'NC_' are skipped.
    Plasmids and other non-chromosomal elements are NOT dropped: they are
    kept and tagged through the genetic element code
    ('pla', 'ext', 'tra' or 'chr').

    @param db: not used by this method
    @param idGenSeq: callable without arguments that returns the next
        sequence id
    @param inserterSeq: callable that receives one row tuple
        (idSeq, gi, 0, seqLen, acc, genel, orgType[:4], hdr)
    @param orgType: organism type name; "outgroup" reads self.outGroupFna,
        anything else reads '<orgType>.genomic.fna.gz' from
        options.refSeqDataDir
    @raise AssertionError: if a defline does not have the 'gi' and 'ref'
        tags in the expected positions

    @todo It might be more robust to parse the GenBank file, e.g.
    FEATURES             Location/Qualifiers
         source          1..208369
                         /organism="Bacillus cereus ATCC 10987"
                         /mol_type="genomic DNA"
                         /strain="ATCC 10987"
                         /db_xref="ATCC:10987"
                         /db_xref="taxon:222523"
                         /plasmid="pBc10987"
         gene            join(207497..208369,1..687)
    We would have to fix the gap(unk100) bug first, and also check
    how the "extrachromosomal" is labeled in GB file.
    """
    if orgType == "outgroup":
        inFasta = self.outGroupFna
    else:
        inFasta = pjoin(options.refSeqDataDir,"%s.genomic.fna.gz" % orgType)
    inp = FastaReader(inFasta)
    # Close the reader even if a defline is malformed or insertion fails,
    # so that the underlying file handle is not leaked.
    try:
        for rec in inp.records():
            # drop the leading '>' of the defline
            line = rec.header()[1:]
            parts = line.strip().split('|',4)
            # the offending defline is attached to make failures traceable
            assert parts[0] == 'gi', line
            gi = int(parts[1])
            assert parts[2] == 'ref', line
            acc = parts[3].strip()
            if acc[:3] != 'NC_':
                continue
            # whatever follows the accession field is the free-text title
            hdr = (' '.join(parts[4:])).strip()
            hlow = hdr.lower()
            ## genel values must differ by the first letter - this is used
            ## by the name generation method later
            # we can do re.search(r'\bplasmid\b',) instead, but this is safer
            # (there is a record called 'megaplasmid'):
            if 'plasmid' in hlow:
                genel = "pla"
            elif 'extrachromosomal' in hlow or 'extra-chromosomal' in hlow:
                genel = "ext"
            elif 'transposon' in hlow:
                genel = "tra"
            else:
                genel = "chr"
            idSeq = idGenSeq()
            seqLen = rec.seqLen()
            # NOTE(review): the third field is a literal 0 - presumably a
            # placeholder that is filled in later; confirm against the
            # table definition.
            values = (idSeq,gi,0,seqLen,acc,genel,orgType[:4],hdr)
            inserterSeq(values)
    finally:
        inp.close()
def makeFeat(self):
    """Create feature vectors out of FASTA files.

    For every '*.fasta.gz' file returned by self.store.getFilePaths(),
    feed up to the first 101 records into a shared KmerSparseFeatures
    counter (k-mer length 2), then print the file's base name (without
    the '.fasta.gz' suffix) followed by the resulting k-mer frequencies.

    NOTE(review): a single counter object is reused for all files; this
    presumably relies on kmerFrequencies() resetting the accumulated
    counts - confirm against KmerSparseFeatures.
    """
    maxSampLen = 100000
    kmerCnt = KmerSparseFeatures(sampLen=maxSampLen,
            kmerLen=2,
            rcPolicy=RC_POLICY.MERGE,
            normPolicy=NORM_POLICY.FREQ)
    for fastaFile in self.store.getFilePaths("*.fasta.gz"):
        inpFasta = FastaReader(fastaFile)
        # Close the reader even if processing a record fails, so that the
        # underlying file handle is not leaked.
        try:
            for iRec, rec in enumerate(inpFasta.records()):
                kmerCnt.process(rec.sequence(format="array"))
                # stop once records with indices 0..100 have been processed
                if iRec >= 100:
                    break
        finally:
            inpFasta.close()
        feat = kmerCnt.kmerFrequencies()
        # named sampId rather than 'id' to avoid shadowing the builtin
        sampId = stripSfx(os.path.basename(fastaFile),".fasta.gz")
        # Single-argument form: emits the same "<id> <feat>" line as the
        # print statement did, and is valid with or without print_function.
        print(" ".join((str(sampId), str(feat))))