def create(self, infile):
    """Create the profile library from the alignments in *infile*.

    Opens the profile database (``self.mFilenameProfiles``) for binary
    writing and truncates the index file (``self.mFilenameIndex``).
    Multiple alignments are then read one after another from *infile*
    in "profile" format; each is converted to a prepared profile using
    ``self.mWeightor`` and stored via ``self.appendProfile`` under the
    name of the alignment.

    Returns a tuple ``(ninput, noutput)`` with the number of alignments
    read and the number of profiles stored.
    """
    self.mOutfileDatabase = open(self.mFilenameProfiles, "wb")
    # Opening in "w" mode truncates a previous index. Nothing is written
    # through this handle in this method.
    # NOTE(review): presumably appendProfile maintains the index itself
    # - confirm.
    outfile_index = open(self.mFilenameIndex, "w")

    # The original looped over a name `mali` that is not bound in this
    # method and always read from sys.stdin, ignoring *infile*.
    # NOTE(review): confirm no module-level `mali` object was intended.
    mali = Mali.Mali()

    ninput, noutput = 0, 0
    try:
        # loop until readFromFile() returns a false value
        while mali.readFromFile(infile, format="profile"):
            ninput += 1

            m = Mali.convertMali2Alignlib(mali)
            p = alignlib_lite.py_makeProfile(m, weightor=self.mWeightor)
            p.prepare()

            self.appendProfile(mali.getName(), p)
            noutput += 1
    finally:
        outfile_index.close()

    return ninput, noutput
def verify(self, infile):
    """Verify data in the database against the original data.

    Loads the index first if ``self.mIndex`` is empty.  Multiple
    alignments are read one after another from *infile* in "profile"
    format; for each one a profile is rebuilt and compared with the
    profile stored under the same name.  Two profiles count as equal
    when both their lengths and their string representations match.

    Returns a tuple ``(ninput, nfound, nnotfound, ndifferent)``.
    """
    if not self.mIndex:
        self.__loadIndex()

    # The original looped over a name `mali` that is not bound in this
    # method and always read from sys.stdin, ignoring *infile*.
    # NOTE(review): confirm no module-level `mali` object was intended.
    mali = Mali.Mali()

    # NOTE(review): nnotfound is never incremented in this method; how
    # getProfile() reports a missing entry is not visible here.
    ninput, nfound, nnotfound, ndifferent = 0, 0, 0, 0

    # loop until readFromFile() returns a false value
    while mali.readFromFile(infile, format="profile"):
        ninput += 1

        m = Mali.convertMali2Alignlib(mali)
        # NOTE(review): create() passes weightor=self.mWeightor when
        # building profiles, this call does not - confirm intended.
        p1 = alignlib_lite.py_makeProfile(m)
        p1.prepare()

        p2 = self.getProfile(mali.getName())

        if p1.getLength() != p2.getLength() or str(p1) != str(p2):
            ndifferent += 1
            continue

        nfound += 1

    return ninput, nfound, nnotfound, ndifferent
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Expects exactly two positional arguments, the filenames of two
    multiple alignments.  Both are converted to profiles and aligned
    against each other by dynamic programming; the merged multiple
    alignment is written to ``options.stdout``.

    Raises ValueError if the number of positional arguments is not two
    or the alignment mode is not recognised.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: cgat_script_template.py 2871 2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-o", "--gop", dest="gop", type="float",
                      help="gap opening penalty [default=%default].")

    parser.add_option("-e", "--gep", dest="gep", type="float",
                      help="gap extension penalty [default=%default].")

    parser.add_option("-m", "--mode", dest="mode", type="choice",
                      choices=("global", "local"),
                      help="alignment mode, global=nw, local=sw [default=%default].")

    parser.set_defaults(
        gop=-12.0,
        gep=-2.0,
        format="fasta",
        mode="local",
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    if len(args) != 2:
        raise ValueError(
            "please supply two multiple alignments in FASTA format.")

    def _read_mali(filename):
        # Load one alignment and release the file handle right away,
        # also when parsing fails.
        mali = Mali.Mali()
        infile = IOTools.openFile(filename, "r")
        try:
            mali.readFromFile(infile, format=options.format)
        finally:
            infile.close()
        return mali

    E.info("read 2 multiple alignments")
    mali1 = _read_mali(args[0])
    mali2 = _read_mali(args[1])

    cmali1 = Mali.convertMali2Alignlib(mali1)
    cmali2 = Mali.convertMali2Alignlib(mali2)

    if options.mode == "local":
        mode = alignlib_lite.py_ALIGNMENT_LOCAL
    elif options.mode == "global":
        mode = alignlib_lite.py_ALIGNMENT_GLOBAL
    else:
        # not reachable through the option parser (restricted choices),
        # but fail clearly instead of with an unbound local
        raise ValueError("unknown alignment mode '%s'" % options.mode)

    alignator = alignlib_lite.py_makeAlignatorDPFull(mode,
                                                     options.gop,
                                                     options.gep)

    # library-wide defaults, set before the profiles are built
    alignlib_lite.py_setDefaultEncoder(
        alignlib_lite.py_getEncoder(alignlib_lite.py_Protein20))
    alignlib_lite.py_setDefaultLogOddor(
        alignlib_lite.py_makeLogOddorDirichlet(0.3))
    alignlib_lite.py_setDefaultRegularizor(
        alignlib_lite.py_makeRegularizorDirichletPrecomputed())

    cprofile1 = alignlib_lite.py_makeProfile(cmali1)
    cprofile2 = alignlib_lite.py_makeProfile(cmali2)

    result = alignlib_lite.py_makeAlignmentVector()

    alignator.align(result, cprofile1, cprofile2)

    E.debug("result=\n%s" %
            alignlib_lite.py_AlignmentFormatEmissions(result))

    # merge the second alignment into the first along the
    # profile-profile alignment
    cmali1.add(cmali2, result)

    outmali = Mali.convertAlignlib2Mali(
        cmali1,
        identifiers=mali1.getIdentifiers() + mali2.getIdentifiers())

    outmali.writeToFile(options.stdout, format=options.format)

    # write footer and output benchmark information.
    E.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Expects exactly two positional arguments, the filenames of two
    multiple alignments.  Both are converted to profiles and aligned
    against each other by dynamic programming; the merged multiple
    alignment is written to ``options.stdout``.

    Raises ValueError if the number of positional arguments is not two
    or the alignment mode is not recognised.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: cgat_script_template.py 2871 2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-o", "--gop", dest="gop", type="float",
                      help="gap opening penalty [default=%default].")

    parser.add_option("-e", "--gep", dest="gep", type="float",
                      help="gap extension penalty [default=%default].")

    parser.add_option("-m", "--mode", dest="mode", type="choice",
                      choices=("global", "local"),
                      help="alignment mode, global=nw, local=sw [default=%default].")

    parser.set_defaults(
        gop=-12.0,
        gep=-2.0,
        format="fasta",
        mode="local",
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    if len(args) != 2:
        raise ValueError(
            "please supply two multiple alignments in FASTA format.")

    def _read_mali(filename):
        # Load one alignment and release the file handle right away,
        # also when parsing fails.
        mali = Mali.Mali()
        infile = IOTools.openFile(filename, "r")
        try:
            mali.readFromFile(infile, format=options.format)
        finally:
            infile.close()
        return mali

    E.info("read 2 multiple alignments")
    mali1 = _read_mali(args[0])
    mali2 = _read_mali(args[1])

    cmali1 = Mali.convertMali2Alignlib(mali1)
    cmali2 = Mali.convertMali2Alignlib(mali2)

    if options.mode == "local":
        mode = alignlib_lite.py_ALIGNMENT_LOCAL
    elif options.mode == "global":
        mode = alignlib_lite.py_ALIGNMENT_GLOBAL
    else:
        # not reachable through the option parser (restricted choices),
        # but fail clearly instead of with an unbound local
        raise ValueError("unknown alignment mode '%s'" % options.mode)

    alignator = alignlib_lite.py_makeAlignatorDPFull(mode,
                                                     options.gop,
                                                     options.gep)

    # library-wide defaults, set before the profiles are built
    alignlib_lite.py_setDefaultEncoder(
        alignlib_lite.py_getEncoder(alignlib_lite.py_Protein20))
    alignlib_lite.py_setDefaultLogOddor(
        alignlib_lite.py_makeLogOddorDirichlet(0.3))
    alignlib_lite.py_setDefaultRegularizor(
        alignlib_lite.py_makeRegularizorDirichletPrecomputed())

    cprofile1 = alignlib_lite.py_makeProfile(cmali1)
    cprofile2 = alignlib_lite.py_makeProfile(cmali2)

    result = alignlib_lite.py_makeAlignmentVector()

    alignator.align(result, cprofile1, cprofile2)

    E.debug("result=\n%s" %
            alignlib_lite.py_AlignmentFormatEmissions(result))

    # merge the second alignment into the first along the
    # profile-profile alignment
    cmali1.add(cmali2, result)

    outmali = Mali.convertAlignlib2Mali(
        cmali1,
        identifiers=mali1.getIdentifiers() + mali2.getIdentifiers())

    outmali.writeToFile(options.stdout, format=options.format)

    # write footer and output benchmark information.
    E.Stop()
def _alignToProfile(infile, outfile, min_score=0,
                    filename_mali="../data/mouse.fasta"):
    '''align sequences in *infile* against mali

    The reference multiple alignment is read from *filename_mali*.
    Each sequence in the fasta file *infile* and its complement (as
    returned by ``Genomics.complement``) are aligned against a profile
    built from the reference; the higher scoring of the two alignments
    is kept.

    Only alignments with a score higher than *min_score* are accepted.

    Output multiple alignment in fasta format to *outfile* and a table
    in :file:`outfile.info`.
    '''

    mali = Mali.Mali()
    with open(filename_mali) as inf:
        mali.readFromFile(inf)
    src_mali = Mali.convertMali2Alignlib(mali)

    E.debug("read mali: %i sequences x %i columns" %
            (mali.getNumSequences(), mali.getNumColumns()))

    # add pseudocounts: for every nucleotide, two sequences made up of
    # that nucleotide only are added to a copy of the alignment
    profile_mali = mali.getClone()
    n = profile_mali.getNumColumns()
    for x in "ACGT":
        for y in range(0, 2):
            profile_mali.addSequence("%s%i" % (x, y), 0, n, x * n)

    profile_mali = Mali.convertMali2Alignlib(profile_mali)
    alignlib.setDefaultEncoder(alignlib.getEncoder(alignlib.DNA4))
    alignlib.setDefaultLogOddor(alignlib.makeLogOddorUniform())

    # bg = alignlib.FrequencyVector()
    # bg.extend( ( 0.3, 0.1, 0.2, 0.2, 0.2) )
    # alignlib.setDefaultRegularizor( alignlib.makeRegularizorTatusov(
    #         alignlib.makeSubstitutionMatrixDNA4(),
    #         bg,
    #         "ACGTN",
    #         10.0, 1.0) )

    profile = alignlib.makeProfile(profile_mali)

    alignment_mode = alignlib.ALIGNMENT_WRAP
    alignator = alignlib.makeAlignatorDPFull(alignment_mode, -5.0, -0.5)

    map_seq2profile = alignlib.makeAlignmentVector()
    map_rseq2profile = alignlib.makeAlignmentVector()
    profile.prepare()

    # seed the output alignment with the reference alignment itself,
    # mapped onto columns 0..n along the diagonal
    build_mali = alignlib.makeMultAlignment()
    diagonal = alignlib.makeAlignmentVector()
    diagonal.addDiagonal(0, n, 0)
    build_mali.add(src_mali, diagonal)

    # ungapped sequences and identifiers, one entry per row added to
    # build_mali
    sequences = alignlib.StringVector()
    ids = []
    for pid in mali.getIdentifiers():
        sequences.append(re.sub("-", "", mali[pid]))
        ids.append(pid)

    c = E.Counter()

    with open(outfile, "w") as outf, \
            open(outfile + ".info", "w") as outf_log:

        outf_log.write(
            "read_id\tlength\tstart\tend\tparts\tcovered\tpcovered\tscore\tmali_start\tmali_end\tmali_covered\tmali_pcovered\n"
        )

        with open(infile) as inf:
            for s in FastaIterator.FastaIterator(inf):

                E.debug("adding %s" % s.title)
                c.input += 1

                rsequence = Genomics.complement(s.sequence)
                seq = alignlib.makeSequence(s.sequence)
                rseq = alignlib.makeSequence(rsequence)

                alignator.align(map_seq2profile, seq, profile)
                alignator.align(map_rseq2profile, rseq, profile)

                # keep whichever of the two alignments scores higher;
                # on a tie the complement is used
                if map_seq2profile.getScore() > map_rseq2profile.getScore():
                    m, sequence = map_seq2profile, s.sequence
                else:
                    m, sequence = map_rseq2profile, rsequence

                if m.getLength() == 0:
                    c.skipped += 1
                    continue

                if m.getScore() < min_score:
                    c.skipped += 1
                    continue

                # every part becomes a separate row of the output
                # alignment, all carrying the same sequence and title
                parts = getParts(m)

                covered = 0
                for part in parts:
                    build_mali.add(part)
                    sequences.append(sequence)
                    ids.append(s.title)
                    covered += part.getLength() - part.getNumGaps()

                mali_covered = m.getColTo() - m.getColFrom()

                outf_log.write("\t".join(map(str, (
                    s.title,
                    len(s.sequence),
                    m.getRowFrom(),
                    m.getRowTo(),
                    len(parts),
                    covered,
                    "%5.2f" % (100.0 * covered / len(s.sequence)),
                    m.getScore(),
                    m.getColFrom(),
                    m.getColTo(),
                    mali_covered,
                    "%5.2f" % ((100.0 * mali_covered) /
                               mali.getNumColumns())))) + "\n")

                c.output += 1

        result = str(alignlib.MultAlignmentFormatPlain(
            build_mali, sequences, alignlib.UnalignedStacked))

        # one tab-separated line (start, sequence, end) per row; start
        # is shifted by one when written to the fasta header
        for pid, data in zip(ids, result.split("\n")):
            start, sequence, end = data.split("\t")
            outf.write(">%s/%i-%i\n%s\n" %
                       (pid, int(start) + 1, int(end), sequence))

    E.info("%s\n" % str(c))
def _alignToProfile(infile, outfile, min_score=0,
                    filename_mali="../data/mouse.fasta"):
    '''align sequences in *infile* against mali

    The reference multiple alignment is read from *filename_mali*.
    Each sequence in the fasta file *infile* and its complement (as
    returned by ``Genomics.complement``) are aligned against a profile
    built from the reference; the higher scoring of the two alignments
    is kept.

    Only alignments with a score higher than *min_score* are accepted.

    Output multiple alignment in fasta format to *outfile* and a table
    in :file:`outfile.info`.
    '''

    mali = Mali.Mali()
    with open(filename_mali) as inf:
        mali.readFromFile(inf)
    src_mali = Mali.convertMali2Alignlib(mali)

    E.debug("read mali: %i sequences x %i columns" %
            (mali.getNumSequences(), mali.getNumColumns()))

    # add pseudocounts: for every nucleotide, two sequences made up of
    # that nucleotide only are added to a copy of the alignment
    profile_mali = mali.getClone()
    n = profile_mali.getNumColumns()
    for x in "ACGT":
        for y in range(0, 2):
            profile_mali.addSequence("%s%i" % (x, y), 0, n, x * n)

    profile_mali = Mali.convertMali2Alignlib(profile_mali)
    alignlib.setDefaultEncoder(alignlib.getEncoder(alignlib.DNA4))
    alignlib.setDefaultLogOddor(alignlib.makeLogOddorUniform())

    # bg = alignlib.FrequencyVector()
    # bg.extend( ( 0.3, 0.1, 0.2, 0.2, 0.2) )
    # alignlib.setDefaultRegularizor( alignlib.makeRegularizorTatusov(
    #         alignlib.makeSubstitutionMatrixDNA4(),
    #         bg,
    #         "ACGTN",
    #         10.0, 1.0) )

    profile = alignlib.makeProfile(profile_mali)

    alignment_mode = alignlib.ALIGNMENT_WRAP
    alignator = alignlib.makeAlignatorDPFull(alignment_mode, -5.0, -0.5)

    map_seq2profile = alignlib.makeAlignmentVector()
    map_rseq2profile = alignlib.makeAlignmentVector()
    profile.prepare()

    # seed the output alignment with the reference alignment itself,
    # mapped onto columns 0..n along the diagonal
    build_mali = alignlib.makeMultAlignment()
    diagonal = alignlib.makeAlignmentVector()
    diagonal.addDiagonal(0, n, 0)
    build_mali.add(src_mali, diagonal)

    # ungapped sequences and identifiers, one entry per row added to
    # build_mali
    sequences = alignlib.StringVector()
    ids = []
    for pid in mali.getIdentifiers():
        sequences.append(re.sub("-", "", mali[pid]))
        ids.append(pid)

    c = E.Counter()

    with open(outfile, "w") as outf, \
            open(outfile + ".info", "w") as outf_log:

        outf_log.write(
            "read_id\tlength\tstart\tend\tparts\tcovered\tpcovered\tscore\tmali_start\tmali_end\tmali_covered\tmali_pcovered\n"
        )

        with open(infile) as inf:
            for s in FastaIterator.FastaIterator(inf):

                E.debug("adding %s" % s.title)
                c.input += 1

                rsequence = Genomics.complement(s.sequence)
                seq = alignlib.makeSequence(s.sequence)
                rseq = alignlib.makeSequence(rsequence)

                alignator.align(map_seq2profile, seq, profile)
                alignator.align(map_rseq2profile, rseq, profile)

                # keep whichever of the two alignments scores higher;
                # on a tie the complement is used
                if map_seq2profile.getScore() > map_rseq2profile.getScore():
                    m, sequence = map_seq2profile, s.sequence
                else:
                    m, sequence = map_rseq2profile, rsequence

                if m.getLength() == 0:
                    c.skipped += 1
                    continue

                if m.getScore() < min_score:
                    c.skipped += 1
                    continue

                # every part becomes a separate row of the output
                # alignment, all carrying the same sequence and title
                parts = getParts(m)

                covered = 0
                for part in parts:
                    build_mali.add(part)
                    sequences.append(sequence)
                    ids.append(s.title)
                    covered += part.getLength() - part.getNumGaps()

                mali_covered = m.getColTo() - m.getColFrom()

                outf_log.write("\t".join(map(str, (
                    s.title,
                    len(s.sequence),
                    m.getRowFrom(),
                    m.getRowTo(),
                    len(parts),
                    covered,
                    "%5.2f" % (100.0 * covered / len(s.sequence)),
                    m.getScore(),
                    m.getColFrom(),
                    m.getColTo(),
                    mali_covered,
                    "%5.2f" % ((100.0 * mali_covered) /
                               mali.getNumColumns())))) + "\n")

                c.output += 1

        result = str(alignlib.MultAlignmentFormatPlain(
            build_mali, sequences, alignlib.UnalignedStacked))

        # one tab-separated line (start, sequence, end) per row; start
        # is shifted by one when written to the fasta header
        for pid, data in zip(ids, result.split("\n")):
            start, sequence, end = data.split("\t")
            outf.write(">%s/%i-%i\n%s\n" %
                       (pid, int(start) + 1, int(end), sequence))

    E.info("%s\n" % str(c))