def getAlignandum(self, nid):
    """Return the prepared alignandum (profile) object for *nid*.

    The profile is taken from ``self.mProfileLibrary`` when one is set and
    is otherwise built with ``self.getProfile``.  A freshly obtained profile
    is prepared and, if ``self.mMask`` is set, passed to ``self.mask``.

    When ``self.mCache`` is truthy, profiles are stored in and served from
    it; cached profiles are returned as they are.

    Returns ``None`` (after a warning) if the uncached lookup raises
    ``KeyError``.
    """
    # NOTE(review): an empty dict is falsy, so a cache initialised as ``{}``
    # would never be filled by this test -- confirm how mCache is set up.
    if self.mCache:
        if nid in self.mCache:
            a = self.mCache[nid]
        else:
            if self.mProfileLibrary:
                a = self.mProfileLibrary.getProfile(nid)
            else:
                a = self.getProfile(nid)
            self.mCache[nid] = a
            a.prepare()
            if self.mMask:
                self.mask(nid, a)
    else:
        try:
            if self.mProfileLibrary:
                # Fetch from the library exactly as the cached branch does.
                # This branch used to reference an undefined local name and
                # therefore always failed with a NameError.
                a = self.mProfileLibrary.getProfile(nid)
            else:
                a = self.getProfile(nid)
        except KeyError:
            self.warn("profile for sequence %s not found." % str(nid))
            return None
        a.prepare()
        if self.mMask:
            self.mask(nid, a)

    if self.mLogLevel >= 5:
        E.debug("alignandum for rep %s\n%s" % (nid, str(a)))

    return a
def getProfile(self, nid):
    """Build and return an alignlib profile for *nid*.

    The neighbours of *nid* are looked up in ``self.mIndexedNeighbours``,
    turned into a multiple alignment by ``self.mProfileBuilder`` and then
    converted with ``alignlib.makeProfile``.
    """
    hits = self.mIndexedNeighbours.getNeighbours(nid)
    return alignlib.makeProfile(self.mProfileBuilder.buildMali(nid, hits))
def _buildProfile(nid, start, end):
    """Return ``(sequence, profile)`` alignlib objects for *nid*.

    The profile is built from the neighbours of *nid*, restricted to the
    segment *start*..*end* via ``useSegment`` and prepared.  ``index``,
    ``align`` and ``fasta`` are taken from the enclosing scope.
    """
    hits = index.getNeighbours(nid)
    profile = alignlib.makeProfile(align.buildMali(nid, hits))
    E.info("nid: %i, neighours=%i" % (nid, len(hits)))
    profile.useSegment(start, end)
    profile.prepare()
    return alignlib.makeSequence(fasta.getSequence(nid)), profile
def applyMethod(self, neighbours):
    """Build a profile for one query and add it to the profile library.

    *neighbours* supplies the query identifier (``mQueryToken``) and its
    matches (``mMatches``).  A multiple alignment is built from the matches,
    converted into a profile with sparse storage, prepared if
    ``self.mPrepareProfile`` is set, and stored in ``self.mProfileLibrary``
    under the query identifier.

    While ``self.mContinueAt`` is set, queries are skipped; the marker is
    cleared once the query with that identifier has been seen.
    """
    if self.mContinueAt:
        if neighbours.mQueryToken == self.mContinueAt:
            self.info("continuing processing at %s" % str(self.mContinueAt))
            self.mContinueAt = None
        return

    query_nid = neighbours.mQueryToken

    def _log(template):
        # every progress message carries the query id and its neighbour count
        self.debug(template % (query_nid, len(neighbours.mMatches)))

    _log("working on profile %s with %i neighbours")

    mali = self.buildMali(query_nid, neighbours.mMatches)
    _log("built mali for %s with %i neighbours")

    profile = alignlib.makeProfile(mali)
    _log("built profile for %s with %i neighbours")

    profile.setStorageType(alignlib.Sparse)
    if self.mPrepareProfile:
        profile.prepare()
    _log("prepared profile for %s with %i neighbours")

    self.mProfileLibrary.add(query_nid, profile)
    _log("saved profile for %s with %i neighbours")
def create(self, infile):
    """Create the profile library from a stream of profiles.

    Opens ``self.mFilenameProfiles`` for binary writing (kept on
    ``self.mOutfileDatabase``) and creates/truncates
    ``self.mFilenameIndex``.  Entries are then read in "profile" format;
    each one is converted into a prepared alignlib profile and stored with
    ``self.add`` under the entry's name.

    :param infile: open file to read the profiles from.  ``sys.stdin`` is
        used when *infile* is ``None``.
    :return: tuple ``(ninput, noutput)`` with the number of entries read
        and the number of profiles added.
    """
    self.mOutfileDatabase = open(self.mFilenameProfiles, "wb")

    # The index file is only created/truncated here: the handle is never
    # written to or stored, so close it right away instead of leaking it.
    # NOTE(review): if add() is expected to write the index through this
    # handle it needs to be wired up -- confirm.
    open(self.mFilenameIndex, "w").close()

    # Honour the *infile* argument; it used to be ignored in favour of a
    # hard-coded sys.stdin.
    source = sys.stdin if infile is None else infile

    ninput, noutput = 0, 0

    # NOTE(review): `mali` is not defined inside this method; presumably a
    # module-level Mali object -- confirm.
    while mali.readFromFile(source, format="profile"):
        ninput += 1
        p = alignlib.makeProfile(Mali.convertMali2Alignlib(mali))
        p.prepare()
        self.add(mali.getName(), p)
        noutput += 1

    return ninput, noutput
def verify(self, infile):
    """Verify the profiles in the database against the original data.

    Entries are read in "profile" format, converted into prepared alignlib
    profiles and compared (length and string representation) with the
    profile stored under the same name.

    :param infile: open file to read the original profiles from.
        ``sys.stdin`` is used when *infile* is ``None``.
    :return: tuple ``(ninput, nfound, nnotfound, ndifferent)``: entries
        read, entries identical to the stored profile, entries without a
        stored profile and entries that differ from the stored profile.
    """
    if not self.mIndex:
        self.__loadIndex()

    # Honour the *infile* argument; it used to be ignored in favour of a
    # hard-coded sys.stdin.
    source = sys.stdin if infile is None else infile

    ninput, nfound, nnotfound, ndifferent = 0, 0, 0, 0

    # NOTE(review): `mali` is not defined inside this method; presumably a
    # module-level Mali object -- confirm.
    while mali.readFromFile(source, format="profile"):
        ninput += 1
        p1 = alignlib.makeProfile(Mali.convertMali2Alignlib(mali))
        p1.prepare()

        # nnotfound was returned but never counted.
        # NOTE(review): assumes getProfile signals a missing entry with
        # KeyError (or by returning None) -- confirm.
        try:
            p2 = self.getProfile(mali.getName())
        except KeyError:
            p2 = None
        if p2 is None:
            nnotfound += 1
            continue

        if p1.getLength() != p2.getLength() or str(p1) != str(p2):
            ndifferent += 1
            continue

        nfound += 1

    return ninput, nfound, nnotfound, ndifferent
def applyMethod(self, neighbours):
    """Create the profile of a single query and store it in the library.

    The query identifier is ``neighbours.mQueryToken`` and the alignment is
    built from ``neighbours.mMatches``.  The resulting profile uses sparse
    storage, is prepared when ``self.mPrepareProfile`` is set and is added
    to ``self.mProfileLibrary`` under the query identifier.

    As long as ``self.mContinueAt`` is set nothing is built; the marker is
    reset when the query carrying that identifier comes by.
    """
    resume_at = self.mContinueAt
    if resume_at:
        if neighbours.mQueryToken == resume_at:
            self.info("continuing processing at %s" % str(self.mContinueAt))
            self.mContinueAt = None
        return

    query_nid = neighbours.mQueryToken

    # (query id, current number of neighbours) for the progress messages
    stats = lambda: (query_nid, len(neighbours.mMatches))

    self.debug("working on profile %s with %i neighbours" % stats())
    mali = self.buildMali(query_nid, neighbours.mMatches)

    self.debug("built mali for %s with %i neighbours" % stats())
    profile = alignlib.makeProfile(mali)

    self.debug("built profile for %s with %i neighbours" % stats())
    profile.setStorageType(alignlib.Sparse)
    if self.mPrepareProfile:
        profile.prepare()

    self.debug("prepared profile for %s with %i neighbours" % stats())
    self.mProfileLibrary.add(query_nid, profile)

    self.debug("saved profile for %s with %i neighbours" % stats())
def main():
    """Command line entry point for inspecting an ADDA neighbourhood graph.

    Depending on ``--method`` the script

    * ``view``: writes the neighbours of every nid given on the command line,
    * ``pileup``: writes the multiple alignment built for one nid,
    * ``profile``: writes the profile built for one nid; if the argument
      contains ``_`` it is parsed by ``AddaIO.toTuple`` and the profile is
      restricted to the given segment,
    * ``align``: aligns the profiles of the two segments given as arguments
      and writes a summary line followed by the explicit alignment.

    Raises ``ValueError`` if too few arguments are given for the method.
    """
    parser = optparse.OptionParser(version="%prog version: $Id$", usage=USAGE)

    parser.add_option(
        "--method", dest="method", type="choice",
        choices=("view", "align", "pileup", "profile"),
        help="method to perform [default=%default].",
    )
    parser.add_option(
        "--mode", dest="mode", type="choice",
        choices=("global", "local"),
        help="alignment mode [default=%default].",
    )
    parser.add_option("--gop", dest="gop", type="float",
                      help="gap opening penalty [default=%default].")
    parser.add_option("--gep", dest="gep", type="float",
                      help="gap extension penalty [default=%default].")

    parser.set_defaults(
        filename_graph="adda.graph",
        filename_index="adda.graph.idx",
        method="view",
        filename_fasta="adda",
        filename_config="adda.ini",
        append=False,
        force=False,
        mode="local",
        gop=-10.0,
        gep=-1.0,
    )

    (options, args) = E.Start(parser)

    # Fail with a clear message instead of an IndexError further down.
    min_args = {"pileup": 1, "profile": 1, "align": 2}.get(options.method, 0)
    if len(args) < min_args:
        raise ValueError(
            "method '%s' requires %i argument(s), but %i were given"
            % (options.method, min_args, len(args)))

    config = AddaIO.ConfigParser()
    config.read(os.path.expanduser(options.filename_config))

    index = cadda.IndexedNeighbours(options.filename_graph,
                                    options.filename_index)

    toolkit = alignlib.getDefaultToolkit
    toolkit().setEncoder(alignlib.getEncoder(alignlib.Protein20))
    toolkit().setRegularizor(alignlib.makeRegularizorDirichletPrecomputed())
    toolkit().setLogOddor(alignlib.makeLogOddorDirichlet(0.3))
    toolkit().setWeightor(alignlib.makeWeightor())

    fasta = IndexedFasta.IndexedFasta(options.filename_fasta)
    align = AddaProfiles.AddaProfiles(config, fasta=fasta)

    def _parseToken(token):
        """Split an argument into ``(nid, start, end)``.

        Arguments containing ``_`` are handed to ``AddaIO.toTuple``; a plain
        nid yields ``None`` for both segment coordinates.
        """
        if "_" in token:
            return AddaIO.toTuple(token)
        return int(token), None, None

    if options.method == "view":
        # Convert each argument in turn; the loop used to convert args[0] on
        # every iteration and so printed the first nid repeatedly.
        for arg in args:
            nid = int(arg)
            for n in index.getNeighbours(nid):
                # written through options.stdout like all other methods
                options.stdout.write("%s\n" % str(n))

    elif options.method == "pileup":
        nid, start, end = _parseToken(args[0])
        neighbours = index.getNeighbours(nid)
        mali = align.buildMali(nid, neighbours)
        options.stdout.write("%s\n" % str(mali))

    elif options.method == "profile":
        nid, start, end = _parseToken(args[0])
        neighbours = index.getNeighbours(nid)
        mali = align.buildMali(nid, neighbours)
        prof = alignlib.makeProfile(mali)
        E.info("nid: %i, neighours=%i" % (nid, len(neighbours)))
        if start is not None:
            prof.useSegment(start, end)
        prof.prepare()
        options.stdout.write("%s\n" % str(prof))

    elif options.method == "align":
        nid1, start1, end1 = AddaIO.toTuple(args[0])
        nid2, start2, end2 = AddaIO.toTuple(args[1])

        align = AddaProfiles.AddaProfiles(config, fasta=fasta)

        if options.mode == "local":
            mode = alignlib.ALIGNMENT_LOCAL
        else:
            mode = alignlib.ALIGNMENT_GLOBAL

        alignator = alignlib.makeAlignatorDPFull(mode, options.gop, options.gep)

        def _buildProfile(nid, start, end):
            """Return the sequence and the prepared segment profile of *nid*."""
            neighbours = index.getNeighbours(nid)
            mali = align.buildMali(nid, neighbours)
            prof = alignlib.makeProfile(mali)
            E.info("nid: %i, neighours=%i" % (nid, len(neighbours)))
            prof.useSegment(start, end)
            prof.prepare()
            seq = fasta.getSequence(nid)
            return alignlib.makeSequence(seq), prof

        seq1, prof1 = _buildProfile(nid1, start1, end1)
        seq2, prof2 = _buildProfile(nid2, start2, end2)

        result = alignlib.makeAlignmentVector()
        alignator.align(result, prof1, prof2)

        E.debug("%s\n" % str(result))

        options.stdout.write(
            "%s vs %s: score=%5.2f, length=%i, numgaps=%i, row_from=%i, row_to=%i, col_from=%i, col_to=%i\n"
            % (
                nid1,
                nid2,
                result.getScore(),
                result.getLength(),
                result.getNumGaps(),
                result.getRowFrom(),
                result.getRowTo(),
                result.getColFrom(),
                result.getColTo(),
            )
        )

        f = alignlib.AlignmentFormatExplicit(result, seq1, seq2)
        options.stdout.write("%s\n" % str(f))

    E.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads the two multiple alignments named on the command line, aligns
    their profiles with a full dynamic programming aligner (local or global
    mode, gap penalties from ``--gop``/``--gep``), merges the second
    alignment into the first along the resulting alignment and writes the
    combined multiple alignment to ``options.stdout``.

    Raises ``ValueError`` unless exactly two file names are supplied.
    """
    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: cgat_script_template.py 2871 2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-o", "--gop", dest="gop", type="float",
                      help="gap opening penalty [default=%default].")

    parser.add_option("-e", "--gep", dest="gep", type="float",
                      help="gap extension penalty [default=%default].")

    parser.add_option("-m", "--mode", dest="mode", type="choice",
                      choices=("global", "local"),
                      help="alignment mode, global=nw, local=sw [default=%default].")

    parser.set_defaults(
        gop=-12.0,
        gep=-2.0,
        format="fasta",
        mode="local",
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    if len(args) != 2:
        raise ValueError("please supply two multiple alignments in FASTA format.")

    def _readMali(filename):
        """Read one multiple alignment and close the input file afterwards."""
        mali = Mali.Mali()
        infile = IOTools.openFile(filename, "r")
        try:
            mali.readFromFile(infile, format=options.format)
        finally:
            # the handles used to be left open
            infile.close()
        return mali

    mali1 = _readMali(args[0])
    mali2 = _readMali(args[1])

    # logged after reading, so that the message is true when it appears
    E.info("read 2 multiple alignments")

    cmali1 = Mali.convertMali2Alignlib(mali1)
    cmali2 = Mali.convertMali2Alignlib(mali2)

    if options.mode == "local":
        mode = alignlib.ALIGNMENT_LOCAL
    elif options.mode == "global":
        mode = alignlib.ALIGNMENT_GLOBAL

    alignator = alignlib.makeAlignatorDPFull(mode, options.gop, options.gep)

    alignlib.setDefaultEncoder(alignlib.getEncoder(alignlib.Protein20))
    alignlib.setDefaultLogOddor(alignlib.makeLogOddorDirichlet(0.3))
    alignlib.setDefaultRegularizor(alignlib.makeRegularizorDirichletPrecomputed())

    cprofile1 = alignlib.makeProfile(cmali1)
    cprofile2 = alignlib.makeProfile(cmali2)

    result = alignlib.makeAlignmentVector()
    alignator.align(result, cprofile1, cprofile2)

    E.debug("result=\n%s" % alignlib.AlignmentFormatEmissions(result))

    # merge the second alignment into the first along the profile alignment
    cmali1.add(cmali2, result)

    outmali = Mali.convertAlignlib2Mali(
        cmali1,
        identifiers=mali1.getIdentifiers() + mali2.getIdentifiers())

    outmali.writeToFile(options.stdout, format=options.format)

    # write footer and output benchmark information.
    E.Stop()
def _alignToProfile(infile, outfile, min_score=0):
    '''align sequences in *infile* against mali

    The multiple alignment is read from :file:`../data/mouse.fasta`.  Each
    sequence in *infile* and its complement (``Genomics.complement``) are
    aligned against the profile of the alignment; the better scoring of the
    two is kept.

    Empty alignments and alignments with a score below *min_score* are
    skipped.

    Output multiple alignment in fasta format to *outfile* and a table in
    :file:`outfile.info`.
    '''
    mali = Mali.Mali()
    mali_file = open("../data/mouse.fasta")
    try:
        mali.readFromFile(mali_file)
    finally:
        mali_file.close()

    src_mali = Mali.convertMali2Alignlib(mali)

    E.debug("read mali: %i sequences x %i columns" %
            (mali.getNumSequences(), mali.getNumColumns()))

    # add pseudocounts: two artificial homopolymer rows per nucleotide
    profile_mali = mali.getClone()
    n = profile_mali.getNumColumns()
    for x in "ACGT":
        for y in range(0, 2):
            profile_mali.addSequence("%s%i" % (x, y), 0, n, x * n)

    profile_mali = Mali.convertMali2Alignlib(profile_mali)
    alignlib.setDefaultEncoder(alignlib.getEncoder(alignlib.DNA4))
    alignlib.setDefaultLogOddor(alignlib.makeLogOddorUniform())

    # bg = alignlib.FrequencyVector()
    # bg.extend( ( 0.3, 0.1, 0.2, 0.2, 0.2) )
    # alignlib.setDefaultRegularizor( alignlib.makeRegularizorTatusov(
    #         alignlib.makeSubstitutionMatrixDNA4(),
    #         bg,
    #         "ACGTN",
    #         10.0, 1.0) )

    profile = alignlib.makeProfile(profile_mali)

    alignment_mode = alignlib.ALIGNMENT_WRAP
    alignator = alignlib.makeAlignatorDPFull(alignment_mode, -5.0, -0.5)

    # result containers, re-used for every read
    map_seq2profile = alignlib.makeAlignmentVector()
    map_rseq2profile = alignlib.makeAlignmentVector()
    profile.prepare()

    # the output alignment starts out with the source alignment itself
    build_mali = alignlib.makeMultAlignment()
    m = alignlib.makeAlignmentVector()
    m.addDiagonal(0, n, 0)
    build_mali.add(src_mali, m)

    c = E.Counter()

    # All handles are closed in the finally clause, also on errors.
    handles = []
    try:
        outf = open(outfile, "w")
        handles.append(outf)
        outf_log = open(outfile + ".info", "w")
        handles.append(outf_log)
        outf_log.write(
            "read_id\tlength\tstart\tend\tparts\tcovered\tpcovered\tscore\tmali_start\tmali_end\tmali_covered\tmali_pcovered\n"
        )

        # ungapped sequences and identifiers, parallel to the rows of build_mali
        sequences = alignlib.StringVector()
        ids = []
        for pid in mali.getIdentifiers():
            sequences.append(re.sub("-", "", mali[pid]))
            ids.append(pid)

        reads = open(infile)
        handles.append(reads)

        for s in FastaIterator.FastaIterator(reads):

            E.debug("adding %s" % s.title)
            c.input += 1

            rsequence = Genomics.complement(s.sequence)
            seq = alignlib.makeSequence(s.sequence)
            rseq = alignlib.makeSequence(rsequence)

            alignator.align(map_seq2profile, seq, profile)
            alignator.align(map_rseq2profile, rseq, profile)

            # keep the better scoring orientation
            if map_seq2profile.getScore() > map_rseq2profile.getScore():
                m, seq, sequence = map_seq2profile, seq, s.sequence
            else:
                m, seq, sequence = map_rseq2profile, rseq, rsequence

            if m.getLength() == 0:
                c.skipped += 1
                continue

            if m.getScore() < min_score:
                c.skipped += 1
                continue

            r = getParts(m)

            # every part becomes a separate row of the output alignment
            covered = 0
            for mm in r:
                build_mali.add(mm)
                sequences.append(sequence)
                ids.append(s.title)
                covered += mm.getLength() - mm.getNumGaps()

            mali_covered = m.getColTo() - m.getColFrom()

            outf_log.write("\t".join(map(str, (
                s.title,
                len(s.sequence),
                m.getRowFrom(),
                m.getRowTo(),
                len(r),
                covered,
                "%5.2f" % (100.0 * covered / len(s.sequence)),
                m.getScore(),
                m.getColFrom(),
                m.getColTo(),
                mali_covered,
                "%5.2f" % ((100.0 * mali_covered) / mali.getNumColumns())))) + "\n")

            c.output += 1

        result = str(alignlib.MultAlignmentFormatPlain(
            build_mali, sequences, alignlib.UnalignedStacked))

        # one tab-separated "start, sequence, end" line per row
        for pid, data in zip(ids, result.split("\n")):
            start, sequence, end = data.split("\t")
            outf.write(">%s/%i-%i\n%s\n" %
                       (pid, int(start) + 1, int(end), sequence))
    finally:
        for handle in handles:
            handle.close()

    E.info("%s\n" % str(c))
def _alignToProfile(infile, outfile, min_score=0):
    '''align sequences in *infile* against mali

    The multiple alignment is read from :file:`../data/mouse.fasta`.  Each
    sequence in *infile* and its complement (``Genomics.complement``) are
    aligned against the profile of the alignment; the better scoring of the
    two is kept.

    Empty alignments and alignments with a score below *min_score* are
    skipped.

    Output multiple alignment in fasta format to *outfile* and a table in
    :file:`outfile.info`.
    '''
    mali = Mali.Mali()
    mali_file = open("../data/mouse.fasta")
    try:
        mali.readFromFile(mali_file)
    finally:
        mali_file.close()

    src_mali = Mali.convertMali2Alignlib(mali)

    E.debug("read mali: %i sequences x %i columns" %
            (mali.getNumSequences(), mali.getNumColumns()))

    # add pseudocounts: two artificial homopolymer rows per nucleotide
    profile_mali = mali.getClone()
    n = profile_mali.getNumColumns()
    for x in "ACGT":
        for y in range(0, 2):
            profile_mali.addSequence("%s%i" % (x, y), 0, n, x * n)

    profile_mali = Mali.convertMali2Alignlib(profile_mali)
    alignlib.setDefaultEncoder(alignlib.getEncoder(alignlib.DNA4))
    alignlib.setDefaultLogOddor(alignlib.makeLogOddorUniform())

    # bg = alignlib.FrequencyVector()
    # bg.extend( ( 0.3, 0.1, 0.2, 0.2, 0.2) )
    # alignlib.setDefaultRegularizor( alignlib.makeRegularizorTatusov(
    #         alignlib.makeSubstitutionMatrixDNA4(),
    #         bg,
    #         "ACGTN",
    #         10.0, 1.0) )

    profile = alignlib.makeProfile(profile_mali)

    alignment_mode = alignlib.ALIGNMENT_WRAP
    alignator = alignlib.makeAlignatorDPFull(alignment_mode, -5.0, -0.5)

    # result containers, re-used for every read
    map_seq2profile = alignlib.makeAlignmentVector()
    map_rseq2profile = alignlib.makeAlignmentVector()
    profile.prepare()

    # the output alignment starts out with the source alignment itself
    build_mali = alignlib.makeMultAlignment()
    m = alignlib.makeAlignmentVector()
    m.addDiagonal(0, n, 0)
    build_mali.add(src_mali, m)

    c = E.Counter()

    # All handles are closed in the finally clause, also on errors.
    handles = []
    try:
        outf = open(outfile, "w")
        handles.append(outf)
        outf_log = open(outfile + ".info", "w")
        handles.append(outf_log)
        outf_log.write(
            "read_id\tlength\tstart\tend\tparts\tcovered\tpcovered\tscore\tmali_start\tmali_end\tmali_covered\tmali_pcovered\n"
        )

        # ungapped sequences and identifiers, parallel to the rows of build_mali
        sequences = alignlib.StringVector()
        ids = []
        for pid in mali.getIdentifiers():
            sequences.append(re.sub("-", "", mali[pid]))
            ids.append(pid)

        reads = open(infile)
        handles.append(reads)

        for s in FastaIterator.FastaIterator(reads):

            E.debug("adding %s" % s.title)
            c.input += 1

            rsequence = Genomics.complement(s.sequence)
            seq = alignlib.makeSequence(s.sequence)
            rseq = alignlib.makeSequence(rsequence)

            alignator.align(map_seq2profile, seq, profile)
            alignator.align(map_rseq2profile, rseq, profile)

            # keep the better scoring orientation
            if map_seq2profile.getScore() > map_rseq2profile.getScore():
                m, seq, sequence = map_seq2profile, seq, s.sequence
            else:
                m, seq, sequence = map_rseq2profile, rseq, rsequence

            if m.getLength() == 0:
                c.skipped += 1
                continue

            if m.getScore() < min_score:
                c.skipped += 1
                continue

            r = getParts(m)

            # every part becomes a separate row of the output alignment
            covered = 0
            for mm in r:
                build_mali.add(mm)
                sequences.append(sequence)
                ids.append(s.title)
                covered += mm.getLength() - mm.getNumGaps()

            mali_covered = m.getColTo() - m.getColFrom()

            fields = (
                s.title,
                len(s.sequence),
                m.getRowFrom(),
                m.getRowTo(),
                len(r),
                covered,
                "%5.2f" % (100.0 * covered / len(s.sequence)),
                m.getScore(),
                m.getColFrom(),
                m.getColTo(),
                mali_covered,
                "%5.2f" % ((100.0 * mali_covered) / mali.getNumColumns()),
            )
            outf_log.write("\t".join(map(str, fields)) + "\n")

            c.output += 1

        result = str(alignlib.MultAlignmentFormatPlain(
            build_mali, sequences, alignlib.UnalignedStacked))

        # one tab-separated "start, sequence, end" line per row
        for pid, data in zip(ids, result.split("\n")):
            start, sequence, end = data.split("\t")
            outf.write(">%s/%i-%i\n%s\n" %
                       (pid, int(start) + 1, int(end), sequence))
    finally:
        for handle in handles:
            handle.close()

    E.info("%s\n" % str(c))
def main():
    """Command line entry point for inspecting an ADDA neighbourhood graph.

    Depending on ``--method`` the script

    * ``view``: writes the neighbours of every nid given on the command line,
    * ``pileup``: writes the multiple alignment built for one nid,
    * ``profile``: writes the profile built for one nid; if the argument
      contains ``_`` it is parsed by ``AddaIO.toTuple`` and the profile is
      restricted to the given segment,
    * ``align``: aligns the profiles of the two segments given as arguments
      and writes a summary line followed by the explicit alignment.

    Raises ``ValueError`` if too few arguments are given for the method.
    """
    parser = optparse.OptionParser(version="%prog version: $Id$", usage=USAGE)

    parser.add_option("--method", dest="method", type="choice",
                      choices=("view", "align", "pileup", "profile"),
                      help="method to perform [default=%default].")

    parser.add_option("--mode", dest="mode", type="choice",
                      choices=("global", "local"),
                      help="alignment mode [default=%default].")

    parser.add_option("--gop", dest="gop", type="float",
                      help="gap opening penalty [default=%default].")

    parser.add_option("--gep", dest="gep", type="float",
                      help="gap extension penalty [default=%default].")

    parser.set_defaults(
        filename_graph="adda.graph",
        filename_index="adda.graph.idx",
        method="view",
        filename_fasta="adda",
        filename_config="adda.ini",
        append=False,
        force=False,
        mode="local",
        gop=-10.0,
        gep=-1.0,
    )

    (options, args) = E.Start(parser)

    # Fail with a clear message instead of an IndexError further down.
    min_args = {"pileup": 1, "profile": 1, "align": 2}.get(options.method, 0)
    if len(args) < min_args:
        raise ValueError(
            "method '%s' requires %i argument(s), but %i were given"
            % (options.method, min_args, len(args)))

    config = AddaIO.ConfigParser()
    config.read(os.path.expanduser(options.filename_config))

    index = cadda.IndexedNeighbours(options.filename_graph,
                                    options.filename_index)

    alignlib.getDefaultToolkit().setEncoder(
        alignlib.getEncoder(alignlib.Protein20))
    alignlib.getDefaultToolkit().setRegularizor(
        alignlib.makeRegularizorDirichletPrecomputed())
    alignlib.getDefaultToolkit().setLogOddor(
        alignlib.makeLogOddorDirichlet(0.3))
    alignlib.getDefaultToolkit().setWeightor(alignlib.makeWeightor())

    fasta = IndexedFasta.IndexedFasta(options.filename_fasta)
    align = AddaProfiles.AddaProfiles(config, fasta=fasta)

    def _parseToken(token):
        """Split an argument into ``(nid, start, end)``.

        Arguments containing ``_`` are handed to ``AddaIO.toTuple``; a plain
        nid yields ``None`` for both segment coordinates.
        """
        if "_" in token:
            return AddaIO.toTuple(token)
        return int(token), None, None

    if options.method == "view":
        # Convert each argument in turn; the loop used to convert args[0] on
        # every iteration and so printed the first nid repeatedly.
        for arg in args:
            nid = int(arg)
            for n in index.getNeighbours(nid):
                # written through options.stdout like all other methods
                options.stdout.write("%s\n" % str(n))

    elif options.method == "pileup":
        nid, start, end = _parseToken(args[0])
        neighbours = index.getNeighbours(nid)
        mali = align.buildMali(nid, neighbours)
        options.stdout.write("%s\n" % str(mali))

    elif options.method == "profile":
        nid, start, end = _parseToken(args[0])
        neighbours = index.getNeighbours(nid)
        mali = align.buildMali(nid, neighbours)
        prof = alignlib.makeProfile(mali)
        E.info("nid: %i, neighours=%i" % (nid, len(neighbours)))
        if start is not None:
            prof.useSegment(start, end)
        prof.prepare()
        options.stdout.write("%s\n" % str(prof))

    elif options.method == "align":
        nid1, start1, end1 = AddaIO.toTuple(args[0])
        nid2, start2, end2 = AddaIO.toTuple(args[1])

        align = AddaProfiles.AddaProfiles(config, fasta=fasta)

        if options.mode == "local":
            mode = alignlib.ALIGNMENT_LOCAL
        else:
            mode = alignlib.ALIGNMENT_GLOBAL

        alignator = alignlib.makeAlignatorDPFull(mode, options.gop, options.gep)

        def _buildProfile(nid, start, end):
            """Return the sequence and the prepared segment profile of *nid*."""
            neighbours = index.getNeighbours(nid)
            mali = align.buildMali(nid, neighbours)
            prof = alignlib.makeProfile(mali)
            E.info("nid: %i, neighours=%i" % (nid, len(neighbours)))
            prof.useSegment(start, end)
            prof.prepare()
            seq = fasta.getSequence(nid)
            return alignlib.makeSequence(seq), prof

        seq1, prof1 = _buildProfile(nid1, start1, end1)
        seq2, prof2 = _buildProfile(nid2, start2, end2)

        result = alignlib.makeAlignmentVector()
        alignator.align(result, prof1, prof2)

        E.debug("%s\n" % str(result))

        options.stdout.write(
            "%s vs %s: score=%5.2f, length=%i, numgaps=%i, row_from=%i, row_to=%i, col_from=%i, col_to=%i\n" %
            (nid1, nid2,
             result.getScore(),
             result.getLength(),
             result.getNumGaps(),
             result.getRowFrom(), result.getRowTo(),
             result.getColFrom(), result.getColTo()))

        f = alignlib.AlignmentFormatExplicit(result, seq1, seq2)
        options.stdout.write("%s\n" % str(f))

    E.Stop()