Example #1
0
 def loadProtIdsOrgType(self,db,idGenSeq,inserterSeq,orgType):
     """Load Fasta deflines for protein sequences generated by pullSeq, parse them and pass them to the inserter.

     Reads the file '<orgType>.protein.cat.gz' obtained from self.store,
     parses every defline with CvTreeId.splitFastaDeflineCvTree() and calls
     inserterSeq once per record with the tuple
         (idSeq, taxid, seqLen, acc, acc_gen, orgType[:4])
     The reader is closed even if parsing or insertion raises.

     @param db not referenced by this method
     @param idGenSeq callable without arguments, called once per record to obtain the sequence id
     @param inserterSeq callable accepting one tuple of values (presumably inserts a row into an SQL table)
     @param orgType organism type name; selects the input file, and its first four characters are stored with every record
     """
     inFasta = self.store.getFilePath('%s.protein.cat.gz' % orgType)
     # the same short tag goes into every record, so compute it once
     orgTag = orgType[:4]
     inp = FastaReader(inFasta)
     try:
         for rec in inp.records():
             hdr = CvTreeId.splitFastaDeflineCvTree(rec.header())
             idSeq = idGenSeq()
             seqLen = rec.seqLen()
             inserterSeq((idSeq,hdr.taxid,seqLen,hdr.acc,hdr.acc_gen,orgTag))
     finally:
         # release the input file also when a defline fails to parse
         # or the inserter raises
         inp.close()
Example #2
0
    def loadGenomicIdsOrgType(self,db,idGenSeq,inserterSeq,orgType):
        """Load Fasta deflines for genomic sequences and pass the NC_ records to the inserter.

        Each defline (with its first character removed) is expected to look like
            gi|<gi number>|ref|<accession>|<description>
        Records whose accession does not start with 'NC_' are skipped.
        Every remaining record gets a genetic element label derived from
        its description:
            "pla" - description contains 'plasmid'
            "ext" - description contains 'extrachromosomal' or 'extra-chromosomal'
            "tra" - description contains 'transposon'
            "chr" - everything else
        and is passed to inserterSeq as the tuple
            (idSeq, gi, 0, seqLen, acc, genel, orgType[:4], hdr)
        Note that plasmids are only labeled here, they are not dropped.
        The reader is closed even if parsing or insertion raises.

        @param db not referenced by this method
        @param idGenSeq callable without arguments, called once per kept record to obtain the sequence id
        @param inserterSeq callable accepting one tuple of values
        @param orgType organism type name; "outgroup" selects self.outGroupFna, any other
        value selects '<orgType>.genomic.fna.gz' under options.refSeqDataDir

        @todo It might be more robust to parse the GenBank file, e.g.
        FEATURES             Location/Qualifiers
        source          1..208369
            /organism="Bacillus cereus ATCC 10987"
            /mol_type="genomic DNA"
            /strain="ATCC 10987"
            /db_xref="ATCC:10987"
            /db_xref="taxon:222523"
            /plasmid="pBc10987"
        gene            join(207497..208369,1..687)

        We would have to fix the gap(unk100) bug first, and also check how the "extrachromosomal" is labeled
        in GB file.
        """
        if orgType == "outgroup":
            inFasta = self.outGroupFna
        else:
            inFasta = pjoin(options.refSeqDataDir,"%s.genomic.fna.gz" % orgType)
        orgTag = orgType[:4]
        inp = FastaReader(inFasta)
        try:
            for rec in inp.records():
                # skip the first character of the header (presumably the '>' marker)
                line = rec.header()[1:]
                parts = line.strip().split('|',4)
                assert parts[0] == 'gi', "Defline does not start with 'gi': %s" % (line,)
                gi = int(parts[1])
                assert parts[2] == 'ref', "Defline has no 'ref' field after the gi number: %s" % (line,)
                acc = parts[3].strip()
                if not acc.startswith('NC_'):
                    continue
                # the description is optional: parts[4:] is empty when it is missing
                hdr = (' '.join(parts[4:])).strip()
                hlow = hdr.lower()
                ## genel values must differ by the first letter - this is used
                ## by the name generation method later
                # we can do re.search(r'\bplasmid\b',) instead, but the substring
                # test is safer (there is a record called 'megaplasmid'):
                if 'plasmid' in hlow:
                    genel = "pla"
                elif 'extrachromosomal' in hlow or 'extra-chromosomal' in hlow:
                    genel = "ext"
                elif 'transposon' in hlow:
                    genel = "tra"
                else:
                    genel = "chr"
                idSeq = idGenSeq()
                seqLen = rec.seqLen()
                # the third field is always 0 here (its meaning is defined by the inserter's table)
                inserterSeq((idSeq,gi,0,seqLen,acc,genel,orgTag,hdr))
        finally:
            # release the input file also when a defline is malformed
            # or the inserter raises
            inp.close()
Example #3
0
 def makeFeat(self):
     """Create feature vectors out of FASTA files.

     For every file matching '*.fasta.gz' in self.store, feeds the sequences
     of its leading records into a single KmerSparseFeatures counter
     (kmerLen=2, reverse complements merged, frequency normalization),
     then prints one line with the file id (base name without '.fasta.gz')
     followed by the result of kmerFrequencies().

     The record loop stops once the record counter exceeds 100, so at most
     101 records are processed per file. The same counter object is used
     for all files. Each reader is closed even if processing raises.
     """
     maxSampLen = 100000
     kmerCnt = KmerSparseFeatures(sampLen=maxSampLen,
             kmerLen=2,
             rcPolicy=RC_POLICY.MERGE,
             normPolicy=NORM_POLICY.FREQ)
     for fastaFile in self.store.getFilePaths("*.fasta.gz"):
         inpFasta = FastaReader(fastaFile)
         try:
             # count records starting from 1
             for iRec, rec in enumerate(inpFasta.records(),1):
                 kmerCnt.process(rec.sequence(format="array"))
                 if iRec > 100:
                     break
         finally:
             # release the input file also when sequence processing raises
             inpFasta.close()
         feat = kmerCnt.kmerFrequencies()
         fileId = stripSfx(os.path.basename(fastaFile),".fasta.gz")
         # a single pre-formatted string prints the same text as
         # "print id, feat" and is valid with or without the print statement
         print("%s %s" % (fileId, feat))