コード例 #1
0
ファイル: FastaSplitters.py プロジェクト: andreyto/mgtaxa
def mapFastaRecordsToTaxaTree(inSeqs,taxaTree,giToTaxa,
        storeHeader=False,storeSeq=False,storeSeqLen=False):
    """Attach per-sequence records from FASTA inputs to nodes of a taxonomy tree.

    Each record yielded by ncbiFastaRecordsWithTaxa() is turned into a Struct
    (always carrying 'gi') and appended to the 'seq' list of the tree node
    returned by taxaTree.getNode() for the record's taxid. The 'seq' list is
    created on a node the first time a record lands on it.

    @param inSeqs iterable of inputs, each one is passed to FastaReader()
    @param taxaTree tree object providing getNode(); loaded with loadTaxaTree() if None
    @param giToTaxa object forwarded to ncbiFastaRecordsWithTaxa(); loaded with
    loadGiTaxBin() if None
    @param storeHeader if True, save the stripped FASTA header as 'header'
    @param storeSeq if True, save the sequence itself as 'seq'
    @param storeSeqLen if True, save the sequence length as 'seqLen'
    @return the Struct that was handed to ncbiFastaRecordsWithTaxa() as errorCounter
    """
    from MGT.FastaIO import FastaReader
    if taxaTree is None:
        taxaTree = loadTaxaTree()
    if giToTaxa is None:
        giToTaxa = loadGiTaxBin()
    taxMis = Struct()
    for inSeq in inSeqs:
        inpSeq = FastaReader(inSeq)
        # try/finally guarantees the reader is closed even when a record
        # fails to parse or the tree lookup raises
        try:
            for rec in ncbiFastaRecordsWithTaxa(fastaReader=inpSeq,
                    taxaTree=taxaTree,
                    giToTaxa=giToTaxa,
                    errorCounter=taxMis):
                node = taxaTree.getNode(rec["meta_group"]["taxid"])
                if not hasattr(node,'seq'):
                    node.seq = []
                seqRec = Struct(gi=rec["meta"]["gi"])
                if storeHeader:
                    seqRec.header = rec["seq"].header().strip()
                seqLen = None
                if storeSeq:
                    seqRec.seq = rec["seq"].sequence()
                    # reuse the length of the sequence we already hold
                    seqLen = len(seqRec.seq)
                if storeSeqLen:
                    if seqLen is None:
                        # sequence body was not stored - ask the reader for its length
                        seqLen = rec["seq"].seqLen()
                    seqRec.seqLen = seqLen
                node.seq.append(seqRec)
        finally:
            inpSeq.close()
    return taxMis
コード例 #2
0
ファイル: FastaSplitters.py プロジェクト: andreyto/mgtaxa
 def _multi_iter():
     """Yield taxonomy-annotated records from every input in inSeqs, one input after another.

     Each input is opened with FastaReader(), wrapped by filt() and fed to
     ncbiFastaRecordsWithTaxa(). All of inSeqs, filt, taxaTree, giToTaxa and
     taxMis are taken from the enclosing scope.
     """
     for inSeq in inSeqs:
         inpSeq = FastaReader(inSeq)
         # The finally clause also runs when the consumer stops iterating early
         # and the generator gets closed, so the reader is not left open.
         try:
             for rec in ncbiFastaRecordsWithTaxa(fastaReader=filt(inpSeq),
                     taxaTree=taxaTree,
                     giToTaxa=giToTaxa,
                     errorCounter=taxMis):
                 yield rec
         finally:
             inpSeq.close()
コード例 #3
0
ファイル: HctApp.py プロジェクト: andreyto/mgtaxa
 def loadProtIdsOrgType(self,db,idGenSeq,inserterSeq,orgType):
     """Load Fasta deflines for protein sequences generated by pullSeq, parse and insert them into SQL table.

     Reads '<orgType>.protein.cat.gz' from self.store and, for every record,
     passes the tuple (id, taxid, seqLen, acc, acc_gen, orgType[:4]) to inserterSeq.

     @param db not used by this method
     @param idGenSeq callable returning a new sequence id on each call
     @param inserterSeq callable that accepts one tuple of values per record
     @param orgType string used to build the input file name; its first four
     characters are stored with each record
     """
     inFasta = self.store.getFilePath('%s.protein.cat.gz' % orgType)
     inp = FastaReader(inFasta)
     # close the reader even if defline parsing or the inserter raises
     try:
         for rec in inp.records():
             hdr = CvTreeId.splitFastaDeflineCvTree(rec.header())
             idSeq = idGenSeq()
             seqLen = rec.seqLen()
             values = (idSeq,hdr.taxid,seqLen,hdr.acc,hdr.acc_gen,orgType[:4])
             inserterSeq(values)
     finally:
         inp.close()
コード例 #4
0
ファイル: HctApp.py プロジェクト: andreyto/mgtaxa
    def loadGenomicIdsOrgType(self,db,idGenSeq,inserterSeq,orgType):
        """Load Fasta deflines for genomic sequences and insert the NC_ ones, labeled by genomic element type.

        Deflines are expected in the form 'gi|<gi>|ref|<accession>|<description>'.
        Only records whose accession starts with 'NC_' are passed to inserterSeq,
        as the tuple (id, gi, 0, seqLen, acc, genel, orgType[:4], description),
        where genel is one of 'pla', 'ext', 'tra' or 'chr' depending on keywords
        found in the lower-cased description.

        @param db not used by this method
        @param idGenSeq callable returning a new sequence id on each call
        @param inserterSeq callable that accepts one tuple of values per record
        @param orgType "outgroup" selects self.outGroupFna as input, any other value
        selects '<orgType>.genomic.fna.gz' under options.refSeqDataDir

        @todo It might be more robust to parse the GenBank file, e.g.
        FEATURES             Location/Qualifiers
             source          1..208369
                             /organism="Bacillus cereus ATCC 10987"
                             /mol_type="genomic DNA"
                             /strain="ATCC 10987"
                             /db_xref="ATCC:10987"
                             /db_xref="taxon:222523"
                             /plasmid="pBc10987"
             gene            join(207497..208369,1..687)

        We would have to fix the gap(unk100) bug first, and also check how the "extrachromosomal" is labeled
        in GB file.
        """
        if orgType == "outgroup":
            inFasta = self.outGroupFna
        else:
            inFasta = pjoin(options.refSeqDataDir,"%s.genomic.fna.gz" % orgType)
        inp = FastaReader(inFasta)
        # close the reader even if a defline check or the inserter raises
        try:
            for rec in inp.records():
                # drop the leading '>' of the defline
                line = rec.header()[1:]
                # at most 5 fields: everything after the accession stays in one piece
                parts = line.strip().split('|',4)
                assert parts[0] == 'gi', "Expected 'gi' as the first defline field: %r" % (line,)
                gi = int(parts[1])
                assert parts[2] == 'ref', "Expected 'ref' as the third defline field: %r" % (line,)
                acc = parts[3].strip()
                if acc[:3] == 'NC_':
                    hdr = (' '.join(parts[4:])).strip()
                    hlow = hdr.lower()
                    ## genel values must differ by the first letter - this is used
                    ## by the name generation method later
                    # we can do re.search(r'\bplasmid\b',) instead, but this is safer
                    # (there is a record called 'megaplasmid'):
                    if 'plasmid' in hlow:
                        genel = "pla"
                    elif 'extrachromosomal' in hlow or 'extra-chromosomal' in hlow:
                        genel = "ext"
                    elif 'transposon' in hlow:
                        genel = "tra"
                    else:
                        genel = "chr"
                    idSeq = idGenSeq()
                    seqLen = rec.seqLen()
                    values = (idSeq,gi,0,seqLen,acc,genel,orgType[:4],hdr)
                    inserterSeq(values)
        finally:
            inp.close()
コード例 #5
0
ファイル: GbFeatApp.py プロジェクト: andreyto/mgtaxa
 def makeFeat(self):
     """Create feature vectors out of FASTA files.

     For every '*.fasta.gz' file found through self.store, feeds the leading
     records of the file into a k-mer counter (kmerLen=2) and prints the file
     id (base name without the '.fasta.gz' suffix) followed by the result of
     kmerCnt.kmerFrequencies().
     """
     maxSampLen = 100000
     kmerCnt = KmerSparseFeatures(sampLen=maxSampLen,
             kmerLen=2,
             rcPolicy=RC_POLICY.MERGE,
             normPolicy=NORM_POLICY.FREQ)
     for fastaFile in self.store.getFilePaths("*.fasta.gz"):
         inpFasta = FastaReader(fastaFile)
         # close the reader even if k-mer counting raises
         try:
             iRec = 0
             for rec in inpFasta.records():
                 kmerCnt.process(rec.sequence(format="array"))
                 iRec += 1
                 # the check follows the increment, so up to 101 records are processed
                 if iRec > 100:
                     break
         finally:
             inpFasta.close()
         feat = kmerCnt.kmerFrequencies()
         # named sampId so that the builtin id() is not shadowed
         sampId = stripSfx(os.path.basename(fastaFile),".fasta.gz")
         # single pre-formatted argument prints the same under Python 2 and 3
         print("%s %s" % (sampId, feat))
コード例 #6
0
ファイル: fastaToSvm.py プロジェクト: andreyto/mgtaxa
def fastaToSvm(inFileFasta,outName,opt):
    """Convert a FASTA file into a feature file and save the accompanying metadata.

    The output writer is picked by opt.outFormat ("svm" or "fasta") and the
    conversion routine by opt.inFormat ("gos", "ca", or anything else for the
    generic converter). After conversion, prints how many samples were saved
    and a histogram of the original lengths in bins of 100, then dumps the
    metadata object to outName+".meta".

    @param inFileFasta input FASTA file, must not be the same path as outName
    @param outName output file name; the metadata goes to outName+".meta"
    @param opt options object; the attributes read here are outFormat,
    fastaLineLen (for "fasta" output), degenLen and inFormat
    @raise ValueError if opt.outFormat is neither "svm" nor "fasta"
    """
    assert not isSamePath(inFileFasta,outName)
    if opt.outFormat == "svm":
        svmWriter = SvmStringFeatureWriterTxt(outName)
    elif opt.outFormat == "fasta":
        svmWriter = SvmFastaFeatureWriterTxt(outName,lineLen=opt.fastaLineLen)
    else:
        # fail right away rather than with an unbound svmWriter further down
        raise ValueError("Unknown output format: %r" % (opt.outFormat,))
    # nested try/finally: both the writer and the reader get closed on any error,
    # in the same order as on success (reader first, then writer)
    try:
        inpSeq = FastaReader(inFileFasta)
        try:
            if opt.degenLen >= 0:
                symCompr = SymbolRunsCompressor('N',opt.degenLen)
            else:
                # negative degenLen: pass sequences through unchanged
                symCompr = lambda s: s
            if opt.inFormat == "gos":
                meta, allLen = gosToSvm(inpSeq,svmWriter,symCompr,opt)
            elif opt.inFormat == "ca":
                meta, allLen = caToSvm(inpSeq,svmWriter,symCompr,opt)
            else:
                meta, allLen = genericFastaToSvm(inpSeq,svmWriter,symCompr,opt)
        finally:
            inpSeq.close()
    finally:
        svmWriter.close()
    # single pre-formatted argument prints the same under Python 2 and 3
    print("Saved %i samples out of %i total from file %s" % (len(meta.samp),len(allLen),inFileFasta))
    lenHist = numpy.histogram(allLen,bins=numpy.arange(0,allLen.max()+100,100,dtype='f8'))
    print("Original sample length histogram:\n%s\n%s" % lenHist)
    dumpObj(meta,outName+".meta")