def alignIndels(all_alleles, colcounts, extend_by=0):
    '''Realign the indel columns of all alleles, in place.

    For every column whose entry in *colcounts* is larger than 1, the
    string stored at that column is collected from each allele of each
    id in *all_alleles* and the strings are multiply aligned. The
    aligned strings are written back into the alleles; a row that comes
    back empty is replaced by a run of gap characters. ``colcounts[x]``
    is set to the length of the longest realigned string.

    *extend_by* is accepted but not used by this function.
    '''
    # keep the pairwise alignator bound to a local while the multiple
    # alignator built on top of it is in use
    pairwise = alignlib.makeAlignatorDPFull(alignlib.ALIGNMENT_LOCAL, 0, 0)
    alignator = alignlib.makeMultipleAlignatorSimple(pairwise)

    ids = all_alleles.keys()

    for column, count in enumerate(colcounts):
        # nothing to realign in columns with a count of at most one
        if count <= 1:
            continue

        # flat list of allele objects, in the order their strings are
        # handed to the aligner
        alleles = [allele for sid in ids for allele in all_alleles[sid]]

        sequences = alignlib.StringVector()
        for allele in alleles:
            sequences.append(allele[column])

        mali = alignlib.makeMultAlignment()
        alignator.align(mali, sequences)

        formatted = str(alignlib.MultAlignmentFormatPlain(mali, sequences))
        # the piece after the final newline is dropped, as is the last
        # character of every row; the aligned string is the second
        # tab-separated field
        realigned = [row[:-1].split("\t")[1]
                     for row in formatted.split("\n")[:-1]]
        assert len(realigned) == len(sequences)

        width = max([len(r) for r in realigned])
        for i, allele in enumerate(alleles):
            aligned = realigned[i]
            allele[column] = aligned if aligned else "-" * width

        colcounts[column] = width
def alignIndels(all_alleles, colcounts, extend_by=0):
    '''Multiply align every indel column and store the result in place.

    Columns with ``colcounts[x] <= 1`` are left untouched. For all other
    columns the per-allele strings are aligned against each other,
    each allele receives its aligned string (or a gap-only string if
    its aligned string is empty) and ``colcounts[x]`` becomes the length
    of the longest aligned string.

    The argument *extend_by* is not used.
    '''
    aa = alignlib.makeAlignatorDPFull(alignlib.ALIGNMENT_LOCAL, 0, 0)
    alignator = alignlib.makeMultipleAlignatorSimple(aa)

    ids = all_alleles.keys()

    def iter_alleles():
        # every allele of every id; used both for collecting the input
        # strings and for writing the results back
        for sid in ids:
            for allele in all_alleles[sid]:
                yield allele

    for x, c in enumerate(colcounts):
        if c > 1:
            sequences = alignlib.StringVector()
            for allele in iter_alleles():
                sequences.append(allele[x])

            mali = alignlib.makeMultAlignment()
            alignator.align(mali, sequences)

            text = str(alignlib.MultAlignmentFormatPlain(mali, sequences))
            realigned = []
            # skip whatever follows the last newline
            for line in text.split("\n")[:-1]:
                # strip the last character of the row, then take the
                # second tab-separated field
                realigned.append(line[:-1].split("\t")[1])
            assert len(realigned) == len(sequences)

            longest = max(map(len, realigned))
            padding = "-" * longest
            for i, allele in enumerate(iter_alleles()):
                allele[x] = realigned[i] or padding

            colcounts[x] = longest
def CheckAlignments(peptide_sequences, query_token, other_tokens):
    """Check whether the query aligns to at least one other sequence.

    Returns True as soon as the local alignment of the query against one
    of *other_tokens* scores above ``param_min_alignment_score``. Tokens
    without an entry in *peptide_sequences* are skipped. If the query
    itself has no entry, True is returned without aligning anything.
    Returns False if no alignment passes the threshold.
    """
    if param_loglevel >= 3:
        print("# checking query %s and sbjcts %s" %
              (query_token, str(other_tokens)))
        sys.stdout.flush()

    # without a query sequence nothing can be checked
    if query_token not in peptide_sequences:
        return True

    result = alignlib.makeAlignmentVector()
    alignator = alignlib.makeAlignatorDPFull(alignlib.ALIGNMENT_LOCAL,
                                             -10.0, -1.0)
    row_seq = alignlib.makeSequence(peptide_sequences[query_token])

    # lazily restrict to tokens for which a sequence is available
    known_tokens = (t for t in other_tokens if t in peptide_sequences)
    for sbjct_token in known_tokens:
        col_seq = alignlib.makeSequence(peptide_sequences[sbjct_token])
        alignator.align(result, row_seq, col_seq)
        score = result.getScore()

        if param_loglevel >= 5:
            print("# %s - %s = %f" % (query_token, sbjct_token, score))

        if score > param_min_alignment_score:
            return True

    return False
def applyMethod(self, neighbours):
    """Realign each neighbour and compare with its reference alignment.

    For every match in ``neighbours.mMatches`` (except the query
    itself) the matched segments of query and sbjct are realigned
    globally. One tab-separated line is written to ``self.mOutfile``:
    query token, sbjct token, the value returned by
    ``alignlib.getAlignmentIdentity`` for the two alignments, and the
    number of aligned positions in the reference and in the
    realignment. ``self.mNIdentical`` is incremented when the identity
    value equals the number of aligned positions of the reference,
    otherwise ``self.mNDifferent`` is incremented.

    Raises ValueError if a neighbour has no reference alignment.
    """
    # NOTE(review): this multiple alignment is filled with the query
    # but never read again in this method; kept so the calls into
    # alignlib stay the same.
    mali = alignlib.makeMultipleAlignment()

    query_nid = neighbours.mQueryToken
    sequence = self.mFasta.getSequence(query_nid)
    mali.add(alignlib.makeAlignatum(sequence))

    qseq = alignlib.makeSequence(sequence)
    alignator = alignlib.makeAlignatorDPFull(alignlib.ALIGNMENT_GLOBAL,
                                             -10.0, -1.0,
                                             True, True, True, True)

    for n in neighbours.mMatches:
        # do not align the query against itself
        if n.mSbjctToken == query_nid:
            continue

        sequence = self.mFasta.getSequence(n.mSbjctToken)

        blast_query2sbjct = n.getAlignment()
        if blast_query2sbjct is None:
            raise ValueError(
                "AddaRealignment.py needs a reference alignment.")

        sseq = alignlib.makeSequence(sequence)

        # restrict both sequences to the segments of this match; qseq is
        # shared between iterations and re-segmented each time
        qseq.useSegment(n.mQueryFrom, n.mQueryTo)
        sseq.useSegment(n.mSbjctFrom, n.mSbjctTo)

        realign_query2sbjct = alignlib.makeAlignmentVector()
        alignator.align(realign_query2sbjct, qseq, sseq)

        nidentical = alignlib.getAlignmentIdentity(realign_query2sbjct,
                                                   blast_query2sbjct,
                                                   alignlib.RR)
        nblast = blast_query2sbjct.getNumAligned()
        nrealigned = realign_query2sbjct.getNumAligned()

        self.mOutfile.write(
            "%s\t%s\t%i\t%i\t%i\n" %
            (n.mQueryToken, n.mSbjctToken, nidentical, nblast, nrealigned))

        if nidentical == nblast:
            self.mNIdentical += 1
        else:
            self.mNDifferent += 1
def applyMethod(self, neighbours):
    """Compare a fresh global realignment with the stored alignment.

    Each entry of ``neighbours.mMatches`` whose sbjct differs from the
    query is realigned over the segments given by the match
    coordinates. A line ``query, sbjct, nidentical, nblast,
    nrealigned`` (tab-separated) is written to ``self.mOutfile``, where
    *nidentical* is the result of ``alignlib.getAlignmentIdentity`` and
    *nblast* / *nrealigned* are the aligned position counts of the
    stored alignment and the realignment. Depending on whether
    *nidentical* equals *nblast*, ``self.mNIdentical`` or
    ``self.mNDifferent`` is incremented.

    Raises ValueError when a match does not provide a stored alignment.
    """
    # NOTE(review): mali is populated with the query only and is not
    # used afterwards; left in place to keep the alignlib calls
    # unchanged.
    mali = alignlib.makeMultipleAlignment()

    query_nid = neighbours.mQueryToken
    query_sequence = self.mFasta.getSequence(query_nid)
    mali.add(alignlib.makeAlignatum(query_sequence))

    qseq = alignlib.makeSequence(query_sequence)
    alignator = alignlib.makeAlignatorDPFull(alignlib.ALIGNMENT_GLOBAL,
                                             -10.0, -1.0,
                                             True, True, True, True)

    for n in neighbours.mMatches:
        if n.mSbjctToken == query_nid:
            # self match
            continue

        sbjct_sequence = self.mFasta.getSequence(n.mSbjctToken)

        blast_query2sbjct = n.getAlignment()
        if blast_query2sbjct is None:
            raise ValueError(
                "AddaRealignment.py needs a reference alignment.")

        sseq = alignlib.makeSequence(sbjct_sequence)

        # the query sequence object is reused; only its active segment
        # changes from match to match
        qseq.useSegment(n.mQueryFrom, n.mQueryTo)
        sseq.useSegment(n.mSbjctFrom, n.mSbjctTo)

        # a single result container per match is sufficient
        realign_query2sbjct = alignlib.makeAlignmentVector()
        alignator.align(realign_query2sbjct, qseq, sseq)

        nidentical = alignlib.getAlignmentIdentity(realign_query2sbjct,
                                                   blast_query2sbjct,
                                                   alignlib.RR)
        nblast = blast_query2sbjct.getNumAligned()
        nrealigned = realign_query2sbjct.getNumAligned()

        self.mOutfile.write(
            "%s\t%s\t%i\t%i\t%i\n" %
            (n.mQueryToken, n.mSbjctToken, nidentical, nblast, nrealigned))

        if nidentical == nblast:
            self.mNIdentical += 1
        else:
            self.mNDifferent += 1
def AlignExhaustive(seq_wobble, seq_cds, seq_peptide, map_p2c, options,
                    diag_width=2):
    """Align *seq_wobble* against *seq_cds*; the result goes to *map_p2c*.

    A back-translation substitution matrix is built and installed as
    alignlib's default substitution matrix (a global side effect).

    If *seq_wobble* is shorter than 10000 residues, a full dynamic
    programming alignment is run. Otherwise only residue pairs close to
    the diagonal with a non-negative matrix score are collected as dots
    and a dot alignment is run on them.

    *seq_peptide* is not used. The *diag_width* argument is ignored: in
    the dot branch it is overwritten with the absolute length difference
    of the two sequences plus one, and the full branch does not use it.
    """
    gop, gep = -1.0, -1.0

    matrix = alignlib.makeSubstitutionMatrixBackTranslation(
        1, -10, 1, alignlib.getDefaultEncoder())
    alignlib.setDefaultSubstitutionMatrix(matrix)

    wobble_length = seq_wobble.getLength()

    if wobble_length < 10000:
        if options.loglevel >= 6:
            options.stdlog.write(
                "# using full dynamic programing matrix.\n")
            options.stdlog.flush()
        # do not penalize gaps at the end, because sometimes the last
        # codon might be missing
        alignator = alignlib.makeAlignatorDPFull(
            alignlib.ALIGNMENT_GLOBAL, gop, gep, 1, 1)
    else:
        cds_length = seq_cds.getLength()
        diag_width = abs(wobble_length - cds_length) + 1
        if options.loglevel >= 6:
            options.stdlog.write(
                "# using dot alignment with diagonal %i\n" % diag_width)
            options.stdlog.flush()

        dots = alignlib.makeAlignmentMatrixRow()

        for x in range(0, wobble_length):
            xr = seq_wobble.asResidue(x)
            # only look at columns within the band around the diagonal
            for y in range(max(0, x - diag_width),
                           min(cds_length, x + diag_width)):
                s = matrix.getValue(xr, seq_cds.asResidue(y))
                if s >= 0:
                    dots.addPair(x, y, float(s))

        if options.loglevel >= 6:
            # terminate the line like every other log message so that
            # the next log entry starts on a line of its own
            options.stdlog.write(
                "# finished adding %i dots\n" % dots.getLength())
            options.stdlog.flush()

        # keep the prebuilt alignator bound to a local while the dot
        # alignator built on top of it is in use
        alignator_dummy = alignlib.makeAlignatorPrebuilt(dots)
        alignator = alignlib.makeAlignatorDots(alignator_dummy, gop, gep)

    alignator.align(map_p2c, seq_wobble, seq_cds)
def PrintCluster(cluster, cluster_id, lengths, peptide_sequences=None,
                 regex_preferred=None):
    """Print one tab-separated line per member of *cluster*.

    The representative is the member with the largest entry in
    *lengths* (members without an entry count as length 0). If
    *regex_preferred* is given and at least one matching member has a
    length above 0, the longest matching member is used instead. If no
    member has a length above 0, the representative is None.

    Each line holds the representative, the member and the member's
    length. If *peptide_sequences* is given, a fourth column with the
    alignment of representative to member (emissions format) is added:
    a diagonal for the representative itself, a local alignment if both
    sequences are available, and an empty alignment otherwise.

    Returns *cluster_id* unchanged.
    """
    rx = re.compile(regex_preferred) if regex_preferred else None

    max_al = 0      # longest length seen over all members
    max_pl = 0      # longest length seen over preferred members
    rep_a = None
    rep_p = None
    for c in cluster:
        length = lengths[c] if c in lengths else 0
        if length > max_al:
            max_al = length
            rep_a = c
        if rx and rx.search(c) and length > max_pl:
            max_pl = length
            rep_p = c

    # a preferred member wins over a longer non-preferred one
    rep = rep_p if max_pl > 0 else rep_a

    # built on first use and then shared by all members
    alignator = None

    for mem in cluster:
        length = lengths[mem] if mem in lengths else 0

        if peptide_sequences:
            map_rep2mem = alignlib.makeAlignmentVector()

            if rep == mem and rep in lengths:
                alignlib.addDiagonal2Alignment(map_rep2mem,
                                               1, lengths[rep], 0)
            elif mem in peptide_sequences and rep in peptide_sequences:
                if alignator is None:
                    alignator = alignlib.makeAlignatorDPFull(
                        alignlib.ALIGNMENT_LOCAL, -10.0, -1.0)
                alignator.align(
                    map_rep2mem,
                    alignlib.makeSequence(peptide_sequences[rep]),
                    alignlib.makeSequence(peptide_sequences[mem]))

            f = alignlib.AlignmentFormatEmissions(map_rep2mem)
            fields = (rep, mem, length, f)
        else:
            fields = (rep, mem, length)

        sys.stdout.write("\t".join(map(str, fields)) + "\n")

    sys.stdout.flush()

    return cluster_id
def _alignToProfile(infile, outfile, min_score=0):
    '''align sequences in *infile* against mali

    The reference multiple alignment is read from
    :file:`../data/mouse.fasta`. Each sequence in *infile* and its
    complement (``Genomics.complement``) are aligned against a profile
    built from the reference; the higher scoring of the two is kept.
    Alignments that are empty or score below *min_score* are skipped.

    Output multiple alignment in fasta format to *outfile* and a
    tab-separated table with one row per accepted sequence to
    :file:`outfile.info`.

    Note that this function changes alignlib's default encoder and
    default log-oddor.
    '''

    mali = Mali.Mali()
    with open("../data/mouse.fasta") as mali_file:
        mali.readFromFile(mali_file)
    src_mali = Mali.convertMali2Alignlib(mali)

    E.debug("read mali: %i sequences x %i columns" %
            (mali.getNumSequences(), mali.getNumColumns()))

    # add pseudocounts: two sequences of each nucleotide spanning all
    # columns
    profile_mali = mali.getClone()
    n = profile_mali.getNumColumns()
    for x in "ACGT":
        for y in range(0, 2):
            profile_mali.addSequence("%s%i" % (x, y), 0, n, x * n)

    profile_mali = Mali.convertMali2Alignlib(profile_mali)
    alignlib.setDefaultEncoder(alignlib.getEncoder(alignlib.DNA4))
    alignlib.setDefaultLogOddor(alignlib.makeLogOddorUniform())

    # NOTE: no regularizor is installed here; an earlier Tatusov
    # regularizor setup had been disabled.

    profile = alignlib.makeProfile(profile_mali)

    alignment_mode = alignlib.ALIGNMENT_WRAP
    alignator = alignlib.makeAlignatorDPFull(alignment_mode, -5.0, -0.5)

    map_seq2profile = alignlib.makeAlignmentVector()
    map_rseq2profile = alignlib.makeAlignmentVector()
    profile.prepare()

    # seed the output alignment with the reference alignment
    build_mali = alignlib.makeMultAlignment()
    diagonal = alignlib.makeAlignmentVector()
    diagonal.addDiagonal(0, n, 0)
    build_mali.add(src_mali, diagonal)

    c = E.Counter()

    # output files are opened before the input is read, and are closed
    # even if an alignment step fails
    with open(outfile, "w") as outf:
        with open(outfile + ".info", "w") as outf_log:
            outf_log.write(
                "read_id\tlength\tstart\tend\tparts\tcovered\tpcovered\tscore\tmali_start\tmali_end\tmali_covered\tmali_pcovered\n")

            sequences = alignlib.StringVector()
            ids = []

            # ungapped reference sequences, in the order of the mali
            for pid in mali.getIdentifiers():
                sequences.append(re.sub("-", "", mali[pid]))
                ids.append(pid)

            with open(infile) as inf:
                for s in FastaIterator.FastaIterator(inf):

                    E.debug("adding %s" % s.title)
                    c.input += 1
                    rsequence = Genomics.complement(s.sequence)
                    seq = alignlib.makeSequence(s.sequence)
                    rseq = alignlib.makeSequence(rsequence)

                    alignator.align(map_seq2profile, seq, profile)
                    alignator.align(map_rseq2profile, rseq, profile)

                    # keep the better scoring orientation
                    if map_seq2profile.getScore() > map_rseq2profile.getScore():
                        m, sequence = map_seq2profile, s.sequence
                    else:
                        m, sequence = map_rseq2profile, rsequence

                    if m.getLength() == 0:
                        c.skipped += 1
                        continue

                    if m.getScore() < min_score:
                        c.skipped += 1
                        continue

                    r = getParts(m)

                    # every part becomes a row of its own in the output
                    covered = 0
                    for mm in r:
                        build_mali.add(mm)
                        sequences.append(sequence)
                        ids.append(s.title)
                        covered += mm.getLength() - mm.getNumGaps()

                    mali_covered = m.getColTo() - m.getColFrom()

                    outf_log.write("\t".join(map(str, (
                        s.title,
                        len(s.sequence),
                        m.getRowFrom(),
                        m.getRowTo(),
                        len(r),
                        covered,
                        "%5.2f" % (100.0 * covered / len(s.sequence)),
                        m.getScore(),
                        m.getColFrom(),
                        m.getColTo(),
                        mali_covered,
                        "%5.2f" % ((100.0 * mali_covered) /
                                   mali.getNumColumns())))) + "\n")

                    c.output += 1

            result = str(alignlib.MultAlignmentFormatPlain(
                build_mali, sequences, alignlib.UnalignedStacked))

            # rows of the formatted alignment are tab-separated:
            # start, aligned sequence, end
            for pid, data in zip(ids, result.split("\n")):
                start, sequence, end = data.split("\t")
                outf.write(">%s/%i-%i\n%s\n" %
                           (pid, int(start) + 1, int(end), sequence))

    E.info("%s\n" % str(c))
def startUp(self):
    """Set up alignlib, input indices and the output stream.

    Does nothing if ``self.isComplete()`` is true. Otherwise the
    default alignlib toolkit is configured with a Protein20 encoder, a
    Dirichlet log-oddor, a precomputed Dirichlet regularizor and a
    weightor; the optional profile library and the neighbour index are
    opened; the local alignator and the counters are initialised and
    the output stream is opened. The column header is written only
    when ``self.mContinueAt`` is None.
    """
    if self.isComplete():
        return

    ###############################################
    # create objects for algorithm
    alignlib.getDefaultToolkit().setEncoder(
        alignlib.getEncoder(alignlib.Protein20))
    self.mLogOddor = alignlib.makeLogOddorDirichlet(self.mScaleFactor)
    self.mRegularizor = alignlib.makeRegularizorDirichletPrecomputed()
    self.mWeightor = alignlib.makeWeightor()

    alignlib.getDefaultToolkit().setRegularizor(self.mRegularizor)
    alignlib.getDefaultToolkit().setLogOddor(self.mLogOddor)
    alignlib.getDefaultToolkit().setWeightor(self.mWeightor)

    if self.mUsePrebuiltProfiles:
        self.mProfileLibrary = ProfileLibrary.ProfileLibrary(
            self.mFilenameProfiles, "r")
        self.mProfileLibrary.setWeightor(self.mWeightor)
        self.mProfileLibrary.setLogOddor(self.mLogOddor)
        self.mProfileLibrary.setRegularizor(self.mRegularizor)
    else:
        self.mProfileLibrary = None

    self.mIndexedNeighbours = cadda.IndexedNeighbours(
        self.mFilenameGraph, self.mFilenameIndex)

    self.mChecker = self.checkLinkZScore
    self.mHeader = ("qdomain",
                    "sdomain",
                    "weight",
                    "passed",
                    "qstart",
                    "qend",
                    "qali",
                    "sstart",
                    "send",
                    "sali",
                    "score",
                    "naligned",
                    "ngaps",
                    "zscore")

    self.mAlignator = alignlib.makeAlignatorDPFull(
        alignlib.ALIGNMENT_LOCAL, self.mGop, self.mGep)

    # the cache to store alignandum objects
    self.mCache = {}

    alignlib.setDefaultEncoder(alignlib.getEncoder(alignlib.Protein20))

    # initialize counters
    self.mNPassed, self.mNFailed, self.mNNotFound = 0, 0, 0

    self.mOutfile = self.openOutputStream(self.mFilenameAlignments)

    # the header is only written when no continuation point is set
    if self.mContinueAt is None:
        self.mOutfile.write("\t".join(self.mHeader) + "\n")
        self.mOutfile.flush()

    self.mStartTime = time.time()
def main():
    """Command line entry point.

    Depending on ``--method`` the script

    * ``view``: prints all neighbours of every id given as argument,
    * ``pileup``: writes the multiple alignment built for the first
      argument,
    * ``profile``: writes the profile built from that alignment,
      restricted to a segment if the argument specifies one,
    * ``align``: aligns the profiles of the first two arguments and
      writes a summary line and the explicit alignment.
    """
    parser = optparse.OptionParser(version="%prog version: $Id$",
                                   usage=USAGE)

    parser.add_option(
        "--method",
        dest="method",
        type="choice",
        choices=("view", "align", "pileup", "profile"),
        help="method to perform [default=%default].",
    )

    parser.add_option(
        "--mode",
        dest="mode",
        type="choice",
        choices=("global", "local"),
        help="alignment mode [default=%default]."
    )

    parser.add_option("--gop", dest="gop", type="float",
                      help="gap opening penalty [default=%default].")

    parser.add_option("--gep", dest="gep", type="float",
                      help="gap extension penalty [default=%default].")

    parser.set_defaults(
        filename_graph="adda.graph",
        filename_index="adda.graph.idx",
        method="view",
        filename_fasta="adda",
        filename_config="adda.ini",
        append=False,
        force=False,
        mode="local",
        gop=-10.0,
        gep=-1.0,
    )

    (options, args) = E.Start(parser)

    config = AddaIO.ConfigParser()
    config.read(os.path.expanduser(options.filename_config))

    index = cadda.IndexedNeighbours(options.filename_graph,
                                    options.filename_index)

    alignlib.getDefaultToolkit().setEncoder(
        alignlib.getEncoder(alignlib.Protein20))
    alignlib.getDefaultToolkit().setRegularizor(
        alignlib.makeRegularizorDirichletPrecomputed())
    alignlib.getDefaultToolkit().setLogOddor(
        alignlib.makeLogOddorDirichlet(0.3))
    alignlib.getDefaultToolkit().setWeightor(alignlib.makeWeightor())

    fasta = IndexedFasta.IndexedFasta(options.filename_fasta)
    align = AddaProfiles.AddaProfiles(config, fasta=fasta)

    def _parseQuery(arg):
        # arguments containing "_" are split by AddaIO.toTuple into
        # (nid, start, end); plain arguments are ids without a segment
        if "_" in arg:
            return AddaIO.toTuple(arg)
        return int(arg), None, None

    if options.method == "view":
        for arg in args:
            # convert the current argument (not always the first one)
            nid = int(arg)
            neighbours = index.getNeighbours(nid)
            for n in neighbours:
                print(str(n))

    elif options.method == "pileup":
        nid, start, end = _parseQuery(args[0])

        neighbours = index.getNeighbours(nid)
        mali = align.buildMali(nid, neighbours)
        options.stdout.write("%s\n" % str(mali))

    elif options.method == "profile":
        nid, start, end = _parseQuery(args[0])

        neighbours = index.getNeighbours(nid)
        mali = align.buildMali(nid, neighbours)
        prof = alignlib.makeProfile(mali)
        E.info("nid: %i, neighours=%i" % (nid, len(neighbours)))
        if start is not None:
            prof.useSegment(start, end)
        prof.prepare()
        options.stdout.write("%s\n" % str(prof))

    elif options.method == "align":
        nid1, start1, end1 = AddaIO.toTuple(args[0])
        nid2, start2, end2 = AddaIO.toTuple(args[1])

        if options.mode == "local":
            mode = alignlib.ALIGNMENT_LOCAL
        else:
            mode = alignlib.ALIGNMENT_GLOBAL

        alignator = alignlib.makeAlignatorDPFull(mode,
                                                 options.gop, options.gep)

        def _buildProfile(nid, start, end):
            # returns the full sequence and the prepared profile
            # restricted to the segment start..end
            neighbours = index.getNeighbours(nid)
            mali = align.buildMali(nid, neighbours)
            prof = alignlib.makeProfile(mali)
            E.info("nid: %i, neighours=%i" % (nid, len(neighbours)))
            prof.useSegment(start, end)
            prof.prepare()
            seq = fasta.getSequence(nid)
            return alignlib.makeSequence(seq), prof

        seq1, prof1 = _buildProfile(nid1, start1, end1)
        seq2, prof2 = _buildProfile(nid2, start2, end2)

        result = alignlib.makeAlignmentVector()
        alignator.align(result, prof1, prof2)

        E.debug("%s\n" % str(result))

        options.stdout.write(
            "%s vs %s: score=%5.2f, length=%i, numgaps=%i, row_from=%i, row_to=%i, col_from=%i, col_to=%i\n" % (
                nid1,
                nid2,
                result.getScore(),
                result.getLength(),
                result.getNumGaps(),
                result.getRowFrom(),
                result.getRowTo(),
                result.getColFrom(),
                result.getColTo(),
            )
        )

        f = alignlib.AlignmentFormatExplicit(result, seq1, seq2)
        options.stdout.write("%s\n" % str(f))

    E.Stop()
def buildMali(self, query_nid, neighbours):
    """build a multiple alignment from a set of neighbours.

    The query sequence forms the first row. At most
    ``self.mMaxNumNeighbours`` neighbours are considered; the query
    itself is ignored and neighbours with an E-value above
    ``self.mMaxEvalue`` are skipped. If a neighbour carries no
    alignment, the matched segments are aligned locally. Neighbours
    with an empty or out-of-bounds alignment, or for which adding to
    the multiple alignment raises RuntimeError, are skipped with a
    warning.

    Returns the multiple alignment.
    """
    # build multiple alignment
    mali = alignlib.makeMultipleAlignment()

    query_sequence = self.mFasta.getSequence(query_nid)
    mali.add(alignlib.makeAlignatum(query_sequence))

    qseq = alignlib.makeSequence(query_sequence)
    alignator = alignlib.makeAlignatorDPFull(
        alignlib.ALIGNMENT_LOCAL, -10, -2)

    # NOTE(review): nskipped is counted but not reported anywhere in
    # this method.
    nskipped = 0

    for n in neighbours[:self.mMaxNumNeighbours]:

        if n.mSbjctToken == query_nid:
            continue
        if n.mEvalue > self.mMaxEvalue:
            nskipped += 1
            continue

        sequence = self.mFasta.getSequence(n.mSbjctToken)

        E.debug("adding %s" % str(n))

        map_query2sbjct = n.getAlignment()

        if map_query2sbjct is None:
            # no stored alignment: align the matched segments
            sseq = alignlib.makeSequence(sequence)
            qseq.useSegment(n.mQueryFrom, n.mQueryTo)
            sseq.useSegment(n.mSbjctFrom, n.mSbjctTo)
            map_query2sbjct = alignlib.makeAlignmentVector()
            alignator.align(map_query2sbjct, qseq, sseq)

            if map_query2sbjct.getLength() == 0:
                self.warn("empty alignment: %s" % str(n))
                nskipped += 1
                continue

        if map_query2sbjct.getRowTo() > len(query_sequence):
            self.warn(
                "alignment out of bounds for query: %i>%i, line=%s" %
                (map_query2sbjct.getRowTo(), len(query_sequence), str(n)))
            nskipped += 1
            continue
        elif map_query2sbjct.getColTo() > len(sequence):
            self.warn(
                "alignment out of bounds for sbjct: %i>%i, line=%s" %
                (map_query2sbjct.getColTo(), len(sequence), str(n)))
            nskipped += 1
            continue

        try:
            mali.add(alignlib.makeAlignatum(sequence),
                     map_query2sbjct,
                     mali_is_in_row=True,
                     insert_gaps_mali=False,
                     insert_gaps_alignatum=True,
                     use_end_mali=True,
                     use_end_alignatum=False)
        except RuntimeError as msg:
            self.warn("problem when building alignment for %s: msg=%s" %
                      (str(n), msg))
            nskipped += 1
            continue

    # hand the alignment back to the caller
    return mali
def Align(self, method, anchor=0, loglevel=1):
    """align a pair of sequences.

    *method* is either one of the names ``dialign``, ``blastz``,
    ``dialignlgs``, ``dba``, ``nw``, ``sw`` or a callable that is
    invoked as ``method(s1, s2, map_a2b)``. ``clustal`` is recognised
    but raises NotImplementedError.

    If *anchor* is non-zero, both sequences are flanked by *anchor*
    ``A`` characters before aligning; afterwards the flanks are removed
    from the alignment and the coordinates are shifted back.

    The alignment and values derived from it are stored on the object
    (``mAlignment``, ``mAlignedSequence1/2``, ``mAlignment1/2``,
    ``mAlignmentFrom1/To1/From2/To2``, ``mNumGaps``, ``mLength``,
    ``mAligned``, ``mMethod``, ``strand``), followed by calls to
    ``SetPercentIdentity()`` and ``SetBlockSizes()``.

    *loglevel* is not used.

    Raises AlignmentError if the resulting alignment is empty.

    get rid of this and use a method class instead in the future
    """
    map_a2b = alignlib.makeAlignmentVector()

    flank = "A" * anchor
    s1 = flank + self.mSequence1 + flank
    s2 = flank + self.mSequence2 + flank

    self.strand = "+"

    if method == "dialign":
        dialign = WrapperDialign.Dialign(self.mOptionsDialign)
        dialign.Align(s1, s2, map_a2b)
    elif method == "blastz":
        blastz = WrapperBlastZ.BlastZ(self.mOptionsBlastZ)
        blastz.Align(s1, s2, map_a2b)
        if blastz.isReverseComplement():
            self.strand = "-"
            self.mSequence2 = Genomics.complement(self.mSequence2)
    elif method == "dialignlgs":
        dialignlgs = WrapperDialign.Dialign(self.mOptionsDialignLGS)
        dialignlgs.Align(s1, s2, map_a2b)
    elif method == "dba":
        dba = WrapperDBA.DBA()
        dba.Align(s1, s2, map_a2b)
    elif method == "clustal":
        raise NotImplementedError("clustal wrapper needs to be updated")
    elif method == "nw":
        seq1 = alignlib.makeSequence(s1)
        seq2 = alignlib.makeSequence(s2)
        alignator = alignlib.makeAlignatorDPFull(alignlib.ALIGNMENT_GLOBAL,
                                                 gop=-12.0,
                                                 gep=-2.0)
        alignator.align(map_a2b, seq1, seq2)
    elif method == "sw":
        seq1 = alignlib.makeSequence(s1)
        seq2 = alignlib.makeSequence(s2)
        # NOTE(review): alignator_sw and min_score_sw are not defined in
        # this method; presumably module-level names - confirm.
        alignlib.performIterativeAlignment(
            map_a2b, seq1, seq2, alignator_sw, min_score_sw)
    else:
        # use callback function
        method(s1, s2, map_a2b)

    if map_a2b.getLength() == 0:
        raise AlignmentError("empty alignment")

    if anchor:
        # cut away the flanks on both sides of row and column, then
        # shift the coordinates back by the length of the left flank
        map_a2b.removeRowRegion(
            anchor + len(self.mSequence1) + 1, map_a2b.getRowTo())
        map_a2b.removeRowRegion(1, anchor)
        map_a2b.removeColRegion(
            anchor + len(self.mSequence2) + 1, map_a2b.getColTo())
        map_a2b.removeColRegion(1, anchor)
        map_a2b.moveAlignment(-anchor, -anchor)

    f = alignlib.AlignmentFormatExplicit(
        map_a2b,
        alignlib.makeSequence(self.mSequence1),
        alignlib.makeSequence(self.mSequence2))

    self.mMethod = method
    self.mAlignment = map_a2b
    self.mAlignedSequence1, self.mAlignedSequence2 = \
        f.mRowAlignment, f.mColAlignment

    f = alignlib.AlignmentFormatEmissions(map_a2b)
    self.mAlignment1, self.mAlignment2 = f.mRowAlignment, f.mColAlignment
    self.mAlignmentFrom1 = map_a2b.getRowFrom()
    self.mAlignmentTo1 = map_a2b.getRowTo()
    self.mAlignmentFrom2 = map_a2b.getColFrom()
    self.mAlignmentTo2 = map_a2b.getColTo()
    self.mNumGaps, self.mLength = map_a2b.getNumGaps(), map_a2b.getLength()
    self.mAligned = self.mLength - self.mNumGaps

    self.SetPercentIdentity()
    self.SetBlockSizes()
nintrons, nsplits, nstopcodons, pidentity, psimilarity, sequence, sbjct_genome_from, sbjct_genome_to, map_query2genome FROM %s AS p WHERE p.sbjct_token = '%s' AND p.sbjct_strand = '%s' AND OVERLAP( %i, %i, p.sbjct_genome_from, sbjct_genome_to) > 0 """ alignator = alignlib.makeAlignatorDPFull( alignlib.ALIGNMENT_LOCAL, param_gop, param_gep ) map_reference2target = alignlib.makeAlignmentVector() assignment_id = 0 for line in cr.fetchall(): reference = PredictionParser.PredictionParserEntry() reference.FillFromTable( line ) ct = dbhandle.cursor() ct.execute( statement % (param_tablename_predictions_target, reference.mSbjctToken, reference.mSbjctStrand, reference.mSbjctGenomeFrom, reference.mSbjctGenomeTo )) reference_exons = Exons.Alignment2Exons( reference.mMapPeptide2Genome, 0,
def EliminateRedundantEntries(rep,
                              data,
                              eliminated_predictions,
                              options,
                              peptides,
                              extended_peptides,
                              filter_quality=None,
                              this_quality=None):
    """Eliminate entries of *data* that are redundant with *rep*.

    An entry is eliminated (recorded in *eliminated_predictions* as
    ``mem_id -> rep_id``) with one of the following codes:

    * ``i``: its extended sequence equals the one of *rep*,
    * ``p``: its extended sequence is contained in the one of *rep*,
    * ``h``: percent identity to *rep* is at least
      ``options.min_identity`` and none of the coverage / identity /
      gap safety checks asks to keep it,
    * ``l``: percent identity is at least
      ``options.min_identity_non_genes``, *this_quality* is in
      ``options.quality_genes``, the entry's quality is not, and the
      coverage / identity safety check does not ask to keep it.

    Alignments are only computed for entries whose quality differs
    from *this_quality* or is listed in
    ``options.quality_exclude_same``. Entries already present in
    *eliminated_predictions* are ignored. Entries kept by a safety
    check produce a warning on ``options.stdlog``.

    *filter_quality* is not used.

    Returns a list of ``(mem_id, code)`` tuples for the entries
    eliminated by this call.
    """
    eliminated = []

    rep_id = rep.transcript_id
    rep_coverage, rep_pid = rep.mQueryCoverage, rep.mPid

    alignator = alignlib.makeAlignatorDPFull(
        alignlib.ALIGNMENT_LOCAL, options.gop, options.gep)
    result = alignlib.makeAlignmentVector()

    rep_seq = peptides[rep_id]
    rep_extended_seq = extended_peptides[rep_id]

    def _eliminate(member_id, code):
        # register the member as removed in favour of the representative
        eliminated_predictions[member_id] = rep_id
        eliminated.append((member_id, code))

    for entry in data:

        mem_id = entry.transcript_id
        mem_coverage = entry.mQueryCoverage
        mem_pid = entry.mPid
        mem_quality = entry.mQuality

        mem_seq = peptides[mem_id]
        mem_extended_seq = extended_peptides[mem_id]

        if options.loglevel >= 4:
            options.stdlog.write(
                "# processing: id=%s class=%s\n" % (mem_id, mem_quality))

        if mem_id in eliminated_predictions:
            continue

        if mem_extended_seq == rep_extended_seq:
            _eliminate(mem_id, "i")
            continue

        if mem_extended_seq in rep_extended_seq:
            _eliminate(mem_id, "p")
            continue

        # only align against members of a different quality class,
        # unless the class is explicitly listed
        if not (mem_quality != this_quality or
                mem_quality in options.quality_exclude_same):
            continue

        seq1 = alignlib.makeSequence(str(rep_seq))
        seq2 = alignlib.makeSequence(str(mem_seq))
        alignator.align(result, seq1, seq2)

        if options.loglevel >= 5:
            options.stdlog.write(
                "# ali\n%s\n" %
                alignlib.AlignmentFormatExplicit(result, seq1, seq2))

        pidentity = 100 * alignlib.calculatePercentIdentity(
            result, seq1, seq2)
        num_gaps = result.getNumGaps()

        if options.loglevel >= 4:
            options.stdlog.write(
                "# processing: id=%s class=%s pid=%5.2f rep_cov=%i mem_cov=%i\n" %
                (mem_id, mem_quality, pidentity, rep_coverage, mem_coverage))

        # member is clearly better covered or more identical than the
        # representative
        member_is_better = (
            rep_coverage < mem_coverage - options.safety_coverage or
            rep_pid < mem_pid - options.safety_pide)

        if pidentity >= options.min_identity:

            # reason stays None if the member may be removed
            reason = None
            if member_is_better:
                reason = "covpid"
            elif num_gaps >= options.max_gaps and \
                    mem_coverage > rep_coverage - options.safety_coverage:
                reason = "gaps"
            elif mem_coverage >= rep_coverage - options.safety_coverage and \
                    100 * (result.getColTo() - result.getColFrom()) / len(mem_seq) < options.max_member_coverage:
                reason = "memcov"

            if reason is not None:
                options.stdlog.write(
                    "# WARNING: not removing possibly good prediction: %s: rep = %s, mem = %s, rep_cov=%i, rep_pid=%i, mem_cov=%i, mem_pid=%i\n" %
                    (reason, rep_id, mem_id, rep_coverage, rep_pid,
                     mem_coverage, mem_pid))
            else:
                _eliminate(mem_id, "h")

        elif pidentity >= options.min_identity_non_genes and \
                this_quality in options.quality_genes and \
                mem_quality not in options.quality_genes:

            if member_is_better:
                options.stdlog.write(
                    "# WARNING: not removing possibly good prediction: rep = %s, mem = %s, rep_cov=%i, rep_pid=%i, mem_cov=%i, mem_pid=%i\n" %
                    (rep_id, mem_id, rep_coverage, rep_pid,
                     mem_coverage, mem_pid))
            else:
                _eliminate(mem_id, "l")

    return eliminated
def buildMali(self, query_nid, neighbours):
    """build a multiple alignment from a set of neighbours.

    The first row is the query sequence. Only the first
    ``self.mMaxNumNeighbours`` neighbours are looked at. Self matches
    are ignored; neighbours above ``self.mMaxEvalue`` are skipped. When
    a neighbour has no alignment attached, a local alignment of the
    matched segments is computed instead. A neighbour is skipped with
    a warning if its alignment is empty, extends beyond the query or
    sbjct sequence, or cannot be added (RuntimeError).

    Returns the multiple alignment.
    """
    # build multiple alignment
    mali = alignlib.makeMultipleAlignment()

    query_sequence = self.mFasta.getSequence(query_nid)
    query_length = len(query_sequence)
    mali.add(alignlib.makeAlignatum(query_sequence))

    qseq = alignlib.makeSequence(query_sequence)
    alignator = alignlib.makeAlignatorDPFull(
        alignlib.ALIGNMENT_LOCAL, -10, -2)

    # NOTE(review): the number of skipped neighbours is tracked but
    # not reported by this method.
    nskipped = 0

    for n in neighbours[:self.mMaxNumNeighbours]:

        if n.mSbjctToken == query_nid:
            continue

        if n.mEvalue > self.mMaxEvalue:
            nskipped += 1
            continue

        sequence = self.mFasta.getSequence(n.mSbjctToken)

        E.debug("adding %s" % str(n))

        map_query2sbjct = n.getAlignment()

        if map_query2sbjct is None:
            # fall back to aligning the matched segments ourselves
            sseq = alignlib.makeSequence(sequence)
            qseq.useSegment(n.mQueryFrom, n.mQueryTo)
            sseq.useSegment(n.mSbjctFrom, n.mSbjctTo)
            map_query2sbjct = alignlib.makeAlignmentVector()
            alignator.align(map_query2sbjct, qseq, sseq)

            if map_query2sbjct.getLength() == 0:
                self.warn("empty alignment: %s" % str(n))
                nskipped += 1
                continue

        if map_query2sbjct.getRowTo() > query_length:
            self.warn(
                "alignment out of bounds for query: %i>%i, line=%s" %
                (map_query2sbjct.getRowTo(), query_length, str(n)))
            nskipped += 1
            continue
        elif map_query2sbjct.getColTo() > len(sequence):
            self.warn(
                "alignment out of bounds for sbjct: %i>%i, line=%s" %
                (map_query2sbjct.getColTo(), len(sequence), str(n)))
            nskipped += 1
            continue

        try:
            mali.add(alignlib.makeAlignatum(sequence),
                     map_query2sbjct,
                     mali_is_in_row=True,
                     insert_gaps_mali=False,
                     insert_gaps_alignatum=True,
                     use_end_mali=True,
                     use_end_alignatum=False)
        except RuntimeError as msg:
            self.warn("problem when building alignment for %s: msg=%s" %
                      (str(n), msg))
            nskipped += 1
            continue

    # return the alignment that was built
    return mali
def ProcessRegion(predictions, region_id, region,
                  peptide_sequences=None,
                  filter_queries=None):
    """process a set of matches to a region.

    *predictions* is sorted in place by decreasing score. If
    *peptide_sequences* is given, predictions are clustered greedily:
    each prediction not yet assigned to a cluster starts a new one
    (incrementing *region_id*), and every later unassigned prediction
    that overlaps it on the genome and whose query is flagged as
    homologous joins it. Homology is decided by a local alignment of
    the query peptides, checked against the ``options.overlap_*``
    thresholds and cached in the module-level ``global_alignments``
    under a key built from both query tokens.

    Cluster members whose query is in *filter_queries* are counted as
    skipped and not output. Each cluster is written with
    ``PrintEdges``.

    Returns a tuple ``(region_id, noutput, nskipped)``.
    """
    if filter_queries is None:
        filter_queries = {}

    if options.loglevel >= 3:
        options.stdlog.write(
            "###################################################################\n")
        options.stdlog.write(
            "# resolving %i predictions in region %s\n" %
            (len(predictions), str(region)))
        sys.stdout.flush()

    # ascending sort followed by reverse() keeps the order of ties the
    # same as with the comparison-function based sort
    predictions.sort(key=lambda p: p.score)
    predictions.reverse()

    alignator = alignlib.makeAlignatorDPFull(
        alignlib.ALIGNMENT_LOCAL, options.gop, options.gep)
    result = alignlib.makeAlignmentVector()

    npredictions = len(predictions)

    # index of the cluster founder for each prediction; an entry equal
    # to its own index means "not assigned yet"
    map_sequence2cluster = list(range(npredictions))

    noutput, nskipped = 0, 0

    if peptide_sequences:
        for x in range(npredictions):
            this = predictions[x]

            if options.loglevel >= 5:
                options.stdlog.write(
                    "# filtering from %i with prediction %i: %s\n" %
                    (x, this.mPredictionId, this.mQueryToken))
                sys.stdout.flush()

            if map_sequence2cluster[x] != x:
                continue

            region_id += 1
            edges = []

            if this.mQueryToken not in filter_queries:
                edges.append(this)
            else:
                nskipped += 1

            for y in range(x + 1, npredictions):
                other = predictions[y]

                if map_sequence2cluster[y] != y:
                    continue

                # order-independent cache key
                if this.mQueryToken < other.mQueryToken:
                    key = "%s-%s" % (this.mQueryToken, other.mQueryToken)
                else:
                    key = "%s-%s" % (other.mQueryToken, this.mQueryToken)

                # check if predictions are overlapping on the genomic
                # sequence
                if min(this.mSbjctGenomeTo, other.mSbjctGenomeTo) - \
                        max(this.mSbjctGenomeFrom, other.mSbjctGenomeFrom) < 0:
                    if options.loglevel >= 4:
                        options.stdlog.write(
                            "# alignment of predictions %i and %i: no overlap on genomic sequence, thus skipped\n" %
                            (this.mPredictionId, other.mPredictionId))
                        sys.stdout.flush()
                    continue

                if key not in global_alignments:

                    seq1 = peptide_sequences[this.mQueryToken]
                    seq2 = peptide_sequences[other.mQueryToken]
                    result.clear()
                    s1 = alignlib.makeSequence(seq1)
                    s2 = alignlib.makeSequence(seq2)
                    alignator.align(result, s1, s2)

                    c1 = 100 * (result.getRowTo() - result.getRowFrom()) / len(seq1)
                    c2 = 100 * (result.getColTo() - result.getColFrom()) / len(seq2)
                    min_cov = min(c1, c2)
                    max_cov = max(c1, c2)

                    identity = alignlib.calculatePercentIdentity(
                        result, s1, s2) * 100

                    # check if predictions overlap and they are
                    # homologous
                    global_alignments[key] = bool(
                        result.getScore() >= options.overlap_min_score and
                        max_cov >= options.overlap_max_coverage and
                        min_cov >= options.overlap_min_coverage and
                        identity >= options.overlap_min_identity)

                    if options.loglevel >= 4:
                        options.stdlog.write(
                            "# alignment=%s score=%i pid=%5.2f c1=%i c2=%i min_cov=%i max_cov=%i homolog=%s\n" %
                            (key, result.getScore(), identity,
                             c1, c2, min_cov, max_cov,
                             global_alignments[key]))
                        sys.stdout.flush()

                if global_alignments[key]:
                    map_sequence2cluster[y] = x
                    if other.mQueryToken not in filter_queries:
                        edges.append(other)
                    else:
                        nskipped += 1

            noutput += PrintEdges(region_id, region, edges)

    return region_id, noutput, nskipped
def _alignToProfile(infile, outfile, min_score=0):
    '''align sequences in *infile* against mali

    A profile is built from the multiple alignment in
    :file:`../data/mouse.fasta`, with pseudocount sequences added.
    Every sequence of *infile* is aligned to the profile both as given
    and after ``Genomics.complement``; the orientation with the higher
    score is used. Sequences with an empty alignment or a score below
    *min_score* are counted as skipped.

    Output multiple alignment in fasta format to *outfile* and a
    tab-separated table describing each accepted sequence to
    :file:`outfile.info`.

    alignlib's default encoder and default log-oddor are replaced as a
    side effect.
    '''

    mali = Mali.Mali()
    with open("../data/mouse.fasta") as reference:
        mali.readFromFile(reference)
    src_mali = Mali.convertMali2Alignlib(mali)

    E.debug("read mali: %i sequences x %i columns" %
            (mali.getNumSequences(), mali.getNumColumns()))

    # add pseudocounts
    profile_mali = mali.getClone()
    n = profile_mali.getNumColumns()
    for x in "ACGT":
        for y in range(0, 2):
            profile_mali.addSequence("%s%i" % (x, y), 0, n, x * n)

    profile_mali = Mali.convertMali2Alignlib(profile_mali)
    alignlib.setDefaultEncoder(alignlib.getEncoder(alignlib.DNA4))
    alignlib.setDefaultLogOddor(alignlib.makeLogOddorUniform())

    # NOTE: a Tatusov regularizor setup used to be configured at this
    # point but had been disabled; no regularizor is set here.

    profile = alignlib.makeProfile(profile_mali)

    alignment_mode = alignlib.ALIGNMENT_WRAP
    alignator = alignlib.makeAlignatorDPFull(alignment_mode, -5.0, -0.5)

    map_seq2profile = alignlib.makeAlignmentVector()
    map_rseq2profile = alignlib.makeAlignmentVector()
    profile.prepare()

    # the reference alignment is the starting point of the output
    build_mali = alignlib.makeMultAlignment()
    map_src = alignlib.makeAlignmentVector()
    map_src.addDiagonal(0, n, 0)
    build_mali.add(src_mali, map_src)

    c = E.Counter()

    # both output files are created before the input is opened and are
    # closed on every exit path
    with open(outfile, "w") as outf:
        with open(outfile + ".info", "w") as outf_log:
            outf_log.write(
                "read_id\tlength\tstart\tend\tparts\tcovered\tpcovered\tscore\tmali_start\tmali_end\tmali_covered\tmali_pcovered\n")

            sequences = alignlib.StringVector()
            ids = []

            for pid in mali.getIdentifiers():
                # reference rows without their gap characters
                sequences.append(re.sub("-", "", mali[pid]))
                ids.append(pid)

            with open(infile) as reads:
                for s in FastaIterator.FastaIterator(reads):

                    E.debug("adding %s" % s.title)
                    c.input += 1

                    rsequence = Genomics.complement(s.sequence)
                    seq = alignlib.makeSequence(s.sequence)
                    rseq = alignlib.makeSequence(rsequence)

                    alignator.align(map_seq2profile, seq, profile)
                    alignator.align(map_rseq2profile, rseq, profile)

                    # pick the orientation that scores higher
                    if map_seq2profile.getScore() > map_rseq2profile.getScore():
                        m, sequence = map_seq2profile, s.sequence
                    else:
                        m, sequence = map_rseq2profile, rsequence

                    if m.getLength() == 0 or m.getScore() < min_score:
                        c.skipped += 1
                        continue

                    r = getParts(m)

                    covered = 0
                    for mm in r:
                        # one output row per part of the alignment
                        build_mali.add(mm)
                        sequences.append(sequence)
                        ids.append(s.title)
                        covered += mm.getLength() - mm.getNumGaps()

                    mali_covered = m.getColTo() - m.getColFrom()

                    row = (s.title,
                           len(s.sequence),
                           m.getRowFrom(),
                           m.getRowTo(),
                           len(r),
                           covered,
                           "%5.2f" % (100.0 * covered / len(s.sequence)),
                           m.getScore(),
                           m.getColFrom(),
                           m.getColTo(),
                           mali_covered,
                           "%5.2f" % ((100.0 * mali_covered) /
                                      mali.getNumColumns()))
                    outf_log.write("\t".join(map(str, row)) + "\n")

                    c.output += 1

            result = str(alignlib.MultAlignmentFormatPlain(
                build_mali, sequences, alignlib.UnalignedStacked))

            # each formatted row is "start<TAB>sequence<TAB>end"
            for pid, data in zip(ids, result.split("\n")):
                start, sequence, end = data.split("\t")
                outf.write(">%s/%i-%i\n%s\n" %
                           (pid, int(start) + 1, int(end), sequence))

    E.info("%s\n" % str(c))
if options.query_start and options.sbjct_start: options.sbjct_start = max( options.query_start, options.sbjct_start ) else: if not options.sbjct: print USAGE raise "please supply both a query and a sbjct." if options.alignment_mode == "compass": plib_query = ProfileLibraryCompass( options.query, "r" ) plib_sbjct = ProfileLibraryCompass( options.sbjct, "r" ) else: plib_query = ProfileLibrary( options.query, "r" ) plib_sbjct = ProfileLibrary( options.sbjct, "r" ) if options.alignment_mode == "iterative-profile": alignator1 = alignlib.makeAlignatorDPFull( alignlib.ALIGNMENT_LOCAL, -10.0, -2.0 ) alignator = alignlib.makeAlignatorIterative( alignator1, options.iterative_min_score ) elif options.alignment_mode == "iterative-sequence": class AlignatorSequence: def __init__(self): self.mAlignator1 = alignlib.makeAlignatorDPFull( alignlib.ALIGNMENT_LOCAL, -10.0, -2.0 ) self.mAlignator = alignlib.makeAlignatorIterative( self.mAlignator1, options.iterative_min_score ) def align(self, query, sbjct, map_query2sbjct): xrow = alignlib.makeSequence(query.asString()) xcol = alignlib.makeSequence(sbjct.asString()) self.mAlignator.align( xrow, xcol, map_query2sbjct) alignator = AlignatorSequence() elif options.alignment_mode == "compass":
def _damp_small_difference(delta, reference, threshold):
    """Return 0 if *delta* is negligible relative to *reference*, else *delta*.

    The difference is negligible when ``abs(delta) / reference`` is below
    *threshold*. A *reference* of zero cannot be used as a divisor; in that
    case *delta* is returned unchanged instead of raising ZeroDivisionError.
    """
    if reference == 0:
        return delta
    if float(abs(delta)) / float(reference) < threshold:
        return 0
    return delta


def FilterConflicts(old_predictions, new_predictions, removed_predictions,
                    min_overlap, peptide_sequences):
    """remove conflicts.

    Remove overlapping entries between different queries. Only remove
    those sequences, which are alignable. If they are alignable, take
    the sequence with the highest score and highest coverage. (Take
    both, if score and coverage are not correlated.)

    Arguments:
      old_predictions     -- predictions to filter. Sorted in place by
                             (mSbjctToken, mSbjctStrand, mSbjctGenomeFrom,
                             mSbjctGenomeTo) before filtering.
      new_predictions     -- output container; kept predictions are appended.
      removed_predictions -- output container; predictions that lose a
                             conflict are appended.
      min_overlap         -- not read by this function; the overlap threshold
                             is taken from the module-level
                             ``param_max_percent_overlap``.
      peptide_sequences   -- mapping from query token to peptide sequence.
                             If empty/None, every overlap above the threshold
                             is treated as a conflict without aligning the
                             queries.

    Returns the number of predictions appended to *new_predictions*.
    """

    ##########################################################################
    # sort predictions by genomic region
    if isinstance(old_predictions, PredictionFile.PredictionFile):
        old_predictions.sort(('mSbjctToken', 'mSbjctStrand',
                              'mSbjctGenomeFrom', 'mSbjctGenomeTo'))
    else:
        # A key function yields the same ordering as the tuple-wise cmp().
        old_predictions.sort(key=lambda p: (p.mSbjctToken, p.mSbjctStrand,
                                            p.mSbjctGenomeFrom, p.mSbjctGenomeTo))

    ##########################################################################
    # filter predictions and resolve conflicts based on genomic overlap
    # deleted segments are put in a temporary storage space.
    alignator = alignlib.makeAlignatorDPFull(alignlib.ALIGNMENT_LOCAL,
                                             param_gop, param_gep)
    result = alignlib.makeAlignmentVector()

    # cache of alignment scores, keyed by the ordered pair of query tokens
    alignments = {}
    noverlaps = 0
    nredundants = 0
    nnew = 0

    last_prediction = None
    last_query_gene = None

    for this_prediction in old_predictions:

        # The gene is only known if the query token has exactly four
        # whitespace-separated fields (peptide, status, gene, transcript).
        fields = re.split(r"\s+", this_prediction.mQueryToken)
        if len(fields) == 4:
            this_query_gene = fields[2]
        else:
            this_query_gene = None

        if last_prediction is None:
            last_prediction = this_prediction
            last_query_gene = this_query_gene
            continue

        overlap = min(last_prediction.mSbjctGenomeTo, this_prediction.mSbjctGenomeTo) - \
            max(last_prediction.mSbjctGenomeFrom, this_prediction.mSbjctGenomeFrom)
        union = max(last_prediction.mSbjctGenomeTo, this_prediction.mSbjctGenomeTo) - \
            min(last_prediction.mSbjctGenomeFrom, this_prediction.mSbjctGenomeFrom)

        is_overlap = False

        # resolve overlap between different genes
        if overlap > 0 and \
                (last_query_gene != this_query_gene or last_query_gene is None):

            noverlaps += 1
            # NOTE(review): with integer coordinates this is floor division
            # under Python 2 -- confirm that is intended.
            relative_overlap = 100 * overlap / union

            # Start conflict resolution, if overlap is above threshold.
            # Keep higher scoring segment.
            #
            # Check if queries are homologous.
            if relative_overlap >= param_max_percent_overlap:

                if peptide_sequences:
                    if last_prediction.mQueryToken < this_prediction.mQueryToken:
                        key = "%s-%s" % (last_prediction.mQueryToken,
                                         this_prediction.mQueryToken)
                    else:
                        key = "%s-%s" % (this_prediction.mQueryToken,
                                         last_prediction.mQueryToken)

                    if key not in alignments:
                        result.clear()
                        alignator.align(
                            result,
                            alignlib.makeSequence(
                                peptide_sequences[this_prediction.mQueryToken]),
                            alignlib.makeSequence(
                                peptide_sequences[last_prediction.mQueryToken]))
                        alignments[key] = result.getScore()
                        if alignments[key] >= param_min_score_overlap:
                            nredundants += 1

                    is_overlap = alignments[key] >= param_min_score_overlap
                else:
                    is_overlap = True

        if is_overlap:
            # take best prediction. If difference is very small, set
            # difference to 0 (difference does not matter). In this case,
            # the first prediction is taken.
            d1 = _damp_small_difference(
                last_prediction.mQueryCoverage - this_prediction.mQueryCoverage,
                last_prediction.mQueryCoverage,
                param_conflicts_min_difference)

            d2 = _damp_small_difference(
                last_prediction.score - this_prediction.score,
                this_prediction.score,
                param_conflicts_min_difference)

            if d1 >= 0 and d2 >= 0:
                # last_prediction wins: drop this_prediction, keep comparing
                # against last_prediction.
                if param_loglevel >= 2:
                    print("# CONFLICT: kept %i(%s-%i), overlap=%i(%5.2f), removed: %s" % (
                        last_prediction.mPredictionId,
                        last_prediction.mQueryToken,
                        last_prediction.mSbjctGenomeFrom,
                        overlap, relative_overlap,
                        str(this_prediction)))
                if param_benchmarks:
                    if CheckBenchmark(this_prediction, last_prediction):
                        print("# BENCHMARK KEPT with overlap=%i(%5.2f): %s" % (
                            overlap, relative_overlap, str(last_prediction)))

                removed_predictions.append(this_prediction)
                continue

            elif d1 <= 0 and d2 <= 0:
                # this_prediction wins: drop last_prediction and continue
                # with this_prediction as the new reference.
                if param_loglevel >= 2:
                    print("# CONFLICT: kept %i(%s-%i), overlap=%i(%5.2f), removed: %s" % (
                        this_prediction.mPredictionId,
                        this_prediction.mQueryToken,
                        this_prediction.mSbjctGenomeFrom,
                        overlap, relative_overlap,
                        str(last_prediction)))
                if param_benchmarks:
                    if CheckBenchmark(last_prediction, this_prediction):
                        print("# BENCHMARK KEPT with overlap=%i(%5.2f): %s" % (
                            overlap, relative_overlap, str(this_prediction)))

                removed_predictions.append(last_prediction)
                last_prediction = this_prediction
                last_query_gene = this_query_gene
                continue

            else:
                # score and coverage disagree: fall through and keep both.
                if param_loglevel >= 2:
                    print("# CONFLICT: non-correlated score/coverage. Keeping both %i(%s-%i) (%5.2f/%i/%i) and %i(%s-%i) (%5.2f/%i/%i)" %
                          (this_prediction.mPredictionId,
                           this_prediction.mQueryToken,
                           this_prediction.mSbjctGenomeFrom,
                           this_prediction.score,
                           this_prediction.mQueryCoverage,
                           this_prediction.mPercentIdentity,
                           last_prediction.mPredictionId,
                           last_prediction.mQueryToken,
                           last_prediction.mSbjctGenomeFrom,
                           last_prediction.score,
                           last_prediction.mQueryCoverage,
                           last_prediction.mPercentIdentity))

        new_predictions.append(last_prediction)
        nnew += 1
        last_query_gene = this_query_gene
        last_prediction = this_prediction

    # Flush the pending prediction; there is none if the input was empty.
    if last_prediction is not None:
        new_predictions.append(last_prediction)
        nnew += 1

    if param_loglevel >= 1:
        print("# calculated %i alignments for %i potential conflicts (%i above threshold)" %
              (len(alignments), noverlaps, nredundants))

    return nnew
def __init__(self):
    """Set up the two aligner objects held by this instance.

    ``mAlignator1`` is a full dynamic-programming aligner in local mode
    (gap opening -10.0, gap extension -2.0); ``mAlignator`` wraps it in
    an iterative aligner using ``options.iterative_min_score``.
    """
    base_alignator = alignlib.makeAlignatorDPFull(
        alignlib.ALIGNMENT_LOCAL, -10.0, -2.0)
    self.mAlignator1 = base_alignator
    self.mAlignator = alignlib.makeAlignatorIterative(
        base_alignator, options.iterative_min_score)
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Expects exactly two positional arguments: the file names of two
    multiple alignments. Both are converted to profiles, the profiles are
    aligned with a full dynamic-programming aligner, and the merged
    multiple alignment is written to ``options.stdout``.

    Raises ValueError if the number of positional arguments is not two.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: cgat_script_template.py 2871 2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-o", "--gop", dest="gop", type="float",
                      help="gap opening penalty [default=%default].")

    parser.add_option("-e", "--gep", dest="gep", type="float",
                      help="gap extension penalty [default=%default].")

    parser.add_option("-m", "--mode", dest="mode", type="choice",
                      choices=("global", "local"),
                      help="alignment mode, global=nw, local=sw [default=%default].")

    parser.set_defaults(
        gop=-12.0,
        gep=-2.0,
        format="fasta",
        mode="local",
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    if len(args) != 2:
        raise ValueError(
            "please supply two multiple alignments in FASTA format.")

    # Read both alignments, closing each input file as soon as it is parsed.
    malis = []
    for filename in args:
        mali = Mali.Mali()
        infile = IOTools.openFile(filename, "r")
        try:
            mali.readFromFile(infile, format=options.format)
        finally:
            infile.close()
        malis.append(mali)
    mali1, mali2 = malis

    E.info("read 2 multiple alignments")

    cmali1 = Mali.convertMali2Alignlib(mali1)
    cmali2 = Mali.convertMali2Alignlib(mali2)

    # --mode is restricted to these two choices by the option parser.
    modes = {"local": alignlib.ALIGNMENT_LOCAL,
             "global": alignlib.ALIGNMENT_GLOBAL}
    mode = modes[options.mode]

    alignator = alignlib.makeAlignatorDPFull(mode, options.gop, options.gep)

    # Library-wide defaults must be set before the profiles are built.
    alignlib.setDefaultEncoder(alignlib.getEncoder(alignlib.Protein20))
    alignlib.setDefaultLogOddor(alignlib.makeLogOddorDirichlet(0.3))
    alignlib.setDefaultRegularizor(
        alignlib.makeRegularizorDirichletPrecomputed())

    cprofile1 = alignlib.makeProfile(cmali1)
    cprofile2 = alignlib.makeProfile(cmali2)

    result = alignlib.makeAlignmentVector()
    alignator.align(result, cprofile1, cprofile2)

    E.debug("result=\n%s" % alignlib.AlignmentFormatEmissions(result))

    # Merge the second alignment into the first along the profile alignment.
    cmali1.add(cmali2, result)

    outmali = Mali.convertAlignlib2Mali(
        cmali1,
        identifiers=mali1.getIdentifiers() + mali2.getIdentifiers())

    outmali.writeToFile(options.stdout, format=options.format)

    # write footer and output benchmark information.
    E.Stop()
def main():
    """script main.

    Dispatches on ``--method``:

      view    -- print the neighbours of every id given on the command line.
      pileup  -- build and print the multiple alignment for one id.
      profile -- build and print the profile for one id, restricted to a
                 segment if the argument contains one.
      align   -- build the profiles of two segments, align them and print
                 the alignment summary and the explicit alignment.

    Raises ValueError if fewer positional arguments are given than the
    selected method needs.
    """
    parser = optparse.OptionParser(version="%prog version: $Id$",
                                   usage=USAGE)

    parser.add_option("--method", dest="method", type="choice",
                      choices=("view", "align", "pileup", "profile"),
                      help="method to perform [default=%default].")

    parser.add_option("--mode", dest="mode", type="choice",
                      choices=("global", "local"),
                      help="alignment mode [default=%default].")

    parser.add_option("--gop", dest="gop", type="float",
                      help="gap opening penalty [default=%default].")

    parser.add_option("--gep", dest="gep", type="float",
                      help="gap extension penalty [default=%default].")

    parser.set_defaults(
        filename_graph="adda.graph",
        filename_index="adda.graph.idx",
        method="view",
        filename_fasta="adda",
        filename_config="adda.ini",
        append=False,
        force=False,
        mode="local",
        gop=-10.0,
        gep=-1.0,
    )

    (options, args) = E.Start(parser)

    # Fail early with a clear message instead of an IndexError further down.
    if options.method in ("pileup", "profile") and len(args) < 1:
        raise ValueError("method '%s' requires one argument." % options.method)
    if options.method == "align" and len(args) < 2:
        raise ValueError("method 'align' requires two arguments.")

    config = AddaIO.ConfigParser()
    config.read(os.path.expanduser(options.filename_config))

    index = cadda.IndexedNeighbours(options.filename_graph,
                                    options.filename_index)

    toolkit = alignlib.getDefaultToolkit()
    toolkit.setEncoder(alignlib.getEncoder(alignlib.Protein20))
    toolkit.setRegularizor(alignlib.makeRegularizorDirichletPrecomputed())
    toolkit.setLogOddor(alignlib.makeLogOddorDirichlet(0.3))
    toolkit.setWeightor(alignlib.makeWeightor())

    fasta = IndexedFasta.IndexedFasta(options.filename_fasta)
    align = AddaProfiles.AddaProfiles(config, fasta=fasta)

    def _parse_query(token):
        """Return (nid, start, end); start and end are None without a segment."""
        if "_" in token:
            return AddaIO.toTuple(token)
        return int(token), None, None

    def _build_profile(nid):
        """Build the (unprepared) profile from the neighbours of *nid*."""
        neighbours = index.getNeighbours(nid)
        mali = align.buildMali(nid, neighbours)
        prof = alignlib.makeProfile(mali)
        E.info("nid: %i, neighours=%i" % (nid, len(neighbours)))
        return prof

    def _build_segment(nid, start, end):
        """Return (sequence, prepared profile) for a segment of *nid*."""
        prof = _build_profile(nid)
        prof.useSegment(start, end)
        prof.prepare()
        seq = fasta.getSequence(nid)
        return alignlib.makeSequence(seq), prof

    if options.method == "view":
        # Show the neighbours of each id given, not just the first one.
        for arg in args:
            nid = int(arg)
            for n in index.getNeighbours(nid):
                print(str(n))

    elif options.method == "pileup":
        nid, start, end = _parse_query(args[0])
        neighbours = index.getNeighbours(nid)
        mali = align.buildMali(nid, neighbours)
        options.stdout.write("%s\n" % str(mali))

    elif options.method == "profile":
        nid, start, end = _parse_query(args[0])
        prof = _build_profile(nid)
        if start is not None:
            prof.useSegment(start, end)
        prof.prepare()
        options.stdout.write("%s\n" % str(prof))

    elif options.method == "align":
        nid1, start1, end1 = AddaIO.toTuple(args[0])
        nid2, start2, end2 = AddaIO.toTuple(args[1])

        if options.mode == "local":
            mode = alignlib.ALIGNMENT_LOCAL
        else:
            mode = alignlib.ALIGNMENT_GLOBAL

        alignator = alignlib.makeAlignatorDPFull(mode, options.gop,
                                                 options.gep)

        seq1, prof1 = _build_segment(nid1, start1, end1)
        seq2, prof2 = _build_segment(nid2, start2, end2)

        result = alignlib.makeAlignmentVector()
        alignator.align(result, prof1, prof2)

        E.debug("%s\n" % str(result))

        options.stdout.write(
            "%s vs %s: score=%5.2f, length=%i, numgaps=%i, row_from=%i, row_to=%i, col_from=%i, col_to=%i\n" %
            (nid1, nid2,
             result.getScore(),
             result.getLength(),
             result.getNumGaps(),
             result.getRowFrom(), result.getRowTo(),
             result.getColFrom(), result.getColTo()))

        f = alignlib.AlignmentFormatExplicit(result, seq1, seq2)
        options.stdout.write("%s\n" % str(f))

    E.Stop()