def expand(self):
    """Build the old-to-new alignment map on first use.

    Nothing happens when ``self.mMapOld2New`` is already truthy. Otherwise a
    new alignment vector is stored in ``self.mMapOld2New`` and filled from
    the emission-format strings ``self.mOldAli`` / ``self.mNewAli`` with the
    start positions ``self.mOldFrom`` / ``self.mNewFrom``.
    """
    if self.mMapOld2New:
        return

    target = alignlib_lite.py_makeAlignmentVector()
    self.mMapOld2New = target

    emissions = alignlib_lite.py_AlignmentFormatEmissions(
        self.mOldFrom, self.mOldAli,
        self.mNewFrom, self.mNewAli)
    emissions.copy(target)
def expand(self):
    """Build the old-to-new alignment map on first use.

    Nothing happens when ``self.mMapOld2New`` is already truthy. Otherwise a
    new alignment vector is stored in ``self.mMapOld2New`` and filled from
    the emission-format strings ``self.mOldAli`` / ``self.mNewAli`` with the
    start positions ``self.mOldFrom`` / ``self.mNewFrom``.
    """
    if self.mMapOld2New:
        return

    target = alignlib_lite.py_makeAlignmentVector()
    self.mMapOld2New = target

    emissions = alignlib_lite.py_AlignmentFormatEmissions(
        self.mOldFrom, self.mOldAli,
        self.mNewFrom, self.mNewAli)
    emissions.copy(target)
def getMapPeptide2Cds(peptide_sequence, cds_sequence, options):
    """get map between peptide sequence and cds sequence.

    The returned alignment is in nucleotides.

    Arguments
    ---------
    peptide_sequence : string
        Peptide sequence; blanks are removed before use.
    cds_sequence : string
        Coding sequence; blanks, ``.`` and ``-`` are removed before use.
    options : object
        Must provide ``loglevel`` and ``stdlog``; it is also passed on to
        ``AlignCodonBased``.

    Raises
    ------
    ValueError
        If ``AlignCodonBased`` raises a ValueError; the message is prefixed
        with "mapping error for sequence".
    """
    # remove whitespaces from protein sequence
    p = re.sub(" ", "", peptide_sequence)

    # remove gaps and whitespaces from cds
    c = re.sub("[ .-]", "", cds_sequence)

    w = Genomics.Protein2Wobble(p.upper())

    if options.loglevel >= 6:
        options.stdlog.write("# peptide original (%5i): %s\n" % (len(p), p))
        options.stdlog.write("# cds original (%5i): %s\n" % (len(c), c))
        options.stdlog.write("# wobble sequence (%5i): %s\n" % (len(w), w))
        options.stdlog.flush()

    seq_wobble = alignlib_lite.py_makeSequence(w)
    seq_cds = alignlib_lite.py_makeSequence(c.upper())
    seq_peptide = alignlib_lite.py_makeSequence(p)

    map_p2c = alignlib_lite.py_makeAlignmentVector()

    try:
        AlignCodonBased(seq_wobble, seq_cds, seq_peptide, map_p2c,
                        options=options)
    except ValueError as msg:
        # "as" form works on Python 2.6+ and Python 3
        raise ValueError("mapping error for sequence: %s" % (msg))

    # NOTE(review): as visible here the function ends after the alignment
    # step and implicitly returns None although the docstring promises an
    # alignment - confirm whether the remainder of the body is missing.
def getMapPeptide2Cds(peptide_sequence, cds_sequence, options):
    """get map between peptide sequence and cds sequence.

    The returned alignment is in nucleotides.

    Arguments
    ---------
    peptide_sequence : string
        Peptide sequence; blanks are removed before use.
    cds_sequence : string
        Coding sequence; blanks, ``.`` and ``-`` are removed before use.
    options : object
        Must provide ``loglevel`` and ``stdlog``; it is also passed on to
        ``AlignCodonBased``.

    Raises
    ------
    ValueError
        If ``AlignCodonBased`` raises a ValueError; the message is prefixed
        with "mapping error for sequence".
    """
    # remove whitespaces from protein sequence
    p = re.sub(" ", "", peptide_sequence)

    # remove gaps and whitespaces from cds
    c = re.sub("[ .-]", "", cds_sequence)

    w = Genomics.Protein2Wobble(p.upper())

    if options.loglevel >= 6:
        options.stdlog.write("# peptide original (%5i): %s\n" % (len(p), p))
        options.stdlog.write("# cds original (%5i): %s\n" % (len(c), c))
        options.stdlog.write("# wobble sequence (%5i): %s\n" % (len(w), w))
        options.stdlog.flush()

    seq_wobble = alignlib_lite.py_makeSequence(w)
    # str.upper() replaces string.upper(), which no longer exists in Python 3
    seq_cds = alignlib_lite.py_makeSequence(c.upper())
    seq_peptide = alignlib_lite.py_makeSequence(p)

    map_p2c = alignlib_lite.py_makeAlignmentVector()

    try:
        AlignCodonBased(seq_wobble, seq_cds, seq_peptide, map_p2c,
                        options=options)
    except ValueError as msg:
        # "as" form works on Python 2.6+ and Python 3
        raise ValueError("mapping error for sequence: %s" % (msg))

    # NOTE(review): as visible here the function ends after the alignment
    # step and implicitly returns None although the docstring promises an
    # alignment - confirm whether the remainder of the body is missing.
def AlignPair(pair, anchor=0):
    """align a pair of introns.

    The aligner is selected by the module-level ``param_method``
    ("dialigned", "dialignedlgs" or "dbaligned"); verbosity is controlled by
    the module-level ``param_loglevel``.

    Arguments
    ---------
    pair : object
        Provides ``mToken1/2``, ``mIntronId1/2`` and ``mAlignedSequence1/2``.
        On success its ``mFrom*``, ``mTo*``, ``mAlignedSequence*``,
        ``mMethod``, ``mNumGaps``, ``mLength`` and ``mAligned`` attributes
        are overwritten with the alignment result.
    anchor : int
        Number of "A" characters added to both ends of both sequences
        before aligning. The anchored regions are removed from the
        alignment afterwards.

    Returns
    -------
    bool
        False if the resulting alignment is empty, True otherwise.

    Raises
    ------
    NotImplementedError
        If ``param_method`` is "clusaligned".
    """
    map_intron_a2b = alignlib_lite.py_makeAlignmentVector()

    if param_loglevel >= 1:
        # single-argument print(...) behaves identically on Python 2 and 3
        print("# aligning %s-%i with %s-%i: lengths %i and %i" % (
            pair.mToken1, pair.mIntronId1,
            pair.mToken2, pair.mIntronId2,
            len(pair.mAlignedSequence1), len(pair.mAlignedSequence2)))
        sys.stdout.flush()

    padding = "A" * anchor
    s1 = padding + pair.mAlignedSequence1 + padding
    s2 = padding + pair.mAlignedSequence2 + padding

    if param_method == "dialigned":
        dialign.Align(s1, s2, map_intron_a2b)
    elif param_method == "dialignedlgs":
        dialignlgs.Align(s1, s2, map_intron_a2b)
    elif param_method == "dbaligned":
        dba.Align(s1, s2, map_intron_a2b)
    elif param_method == "clusaligned":
        # the clustal call that used to follow this raise was unreachable
        raise NotImplementedError("clustalw wrapper not up-to-date")

    if anchor:
        # strip the anchored flanks and shift coordinates back
        map_intron_a2b.removeRowRegion(
            anchor + len(pair.mAlignedSequence1) + 1,
            map_intron_a2b.getRowTo())
        map_intron_a2b.removeRowRegion(1, anchor)
        map_intron_a2b.removeColRegion(
            anchor + len(pair.mAlignedSequence2) + 1,
            map_intron_a2b.getColTo())
        map_intron_a2b.removeColRegion(1, anchor)
        map_intron_a2b.moveAlignment(-anchor, -anchor)

    if map_intron_a2b.getLength() == 0:
        if param_loglevel >= 1:
            print("# Error: empty intron alignment")
        return False

    seq1 = alignlib_lite.py_makeSequence(pair.mAlignedSequence1)
    seq2 = alignlib_lite.py_makeSequence(pair.mAlignedSequence2)

    data = alignlib_lite.py_AlignmentFormatExplicit(map_intron_a2b, seq1, seq2)

    pair.mFrom1, pair.mAlignedSequence1, pair.mTo1 = \
        data.mRowFrom, data.mRowAlignment, data.mRowTo
    pair.mFrom2, pair.mAlignedSequence2, pair.mTo2 = \
        data.mColFrom, data.mColAlignment, data.mColTo
    pair.mMethod = param_method

    pair.mNumGaps = map_intron_a2b.getNumGaps()
    pair.mLength = map_intron_a2b.getLength()
    pair.mAligned = pair.mLength - pair.mNumGaps

    if param_loglevel >= 2:
        # same text as the former "print a, b, c" statement
        print("# alignment success %s %s" % (
            pair.mAlignedSequence1, pair.mAlignedSequence2))

    return True
def AlignPair(pair, anchor=0):
    """align a pair of introns.

    The aligner is selected by the module-level ``param_method``
    ("dialigned", "dialignedlgs" or "dbaligned"); verbosity is controlled by
    the module-level ``param_loglevel``.

    Arguments
    ---------
    pair : object
        Provides ``mToken1/2``, ``mIntronId1/2`` and ``mAlignedSequence1/2``.
        On success its ``mFrom*``, ``mTo*``, ``mAlignedSequence*``,
        ``mMethod``, ``mNumGaps``, ``mLength`` and ``mAligned`` attributes
        are overwritten with the alignment result.
    anchor : int
        Number of "A" characters added to both ends of both sequences
        before aligning. The anchored regions are removed from the
        alignment afterwards.

    Returns
    -------
    bool
        False if the resulting alignment is empty, True otherwise.

    Raises
    ------
    NotImplementedError
        If ``param_method`` is "clusaligned".
    """
    map_intron_a2b = alignlib_lite.py_makeAlignmentVector()

    if param_loglevel >= 1:
        # single-argument print(...) behaves identically on Python 2 and 3
        print("# aligning %s-%i with %s-%i: lengths %i and %i" % (
            pair.mToken1, pair.mIntronId1,
            pair.mToken2, pair.mIntronId2,
            len(pair.mAlignedSequence1), len(pair.mAlignedSequence2)))
        sys.stdout.flush()

    padding = "A" * anchor
    s1 = padding + pair.mAlignedSequence1 + padding
    s2 = padding + pair.mAlignedSequence2 + padding

    if param_method == "dialigned":
        dialign.Align(s1, s2, map_intron_a2b)
    elif param_method == "dialignedlgs":
        dialignlgs.Align(s1, s2, map_intron_a2b)
    elif param_method == "dbaligned":
        dba.Align(s1, s2, map_intron_a2b)
    elif param_method == "clusaligned":
        # the clustal call that used to follow this raise was unreachable
        raise NotImplementedError("clustalw wrapper not up-to-date")

    if anchor:
        # strip the anchored flanks and shift coordinates back
        map_intron_a2b.removeRowRegion(
            anchor + len(pair.mAlignedSequence1) + 1,
            map_intron_a2b.getRowTo())
        map_intron_a2b.removeRowRegion(1, anchor)
        map_intron_a2b.removeColRegion(
            anchor + len(pair.mAlignedSequence2) + 1,
            map_intron_a2b.getColTo())
        map_intron_a2b.removeColRegion(1, anchor)
        map_intron_a2b.moveAlignment(-anchor, -anchor)

    if map_intron_a2b.getLength() == 0:
        if param_loglevel >= 1:
            print("# Error: empty intron alignment")
        return False

    seq1 = alignlib_lite.py_makeSequence(pair.mAlignedSequence1)
    seq2 = alignlib_lite.py_makeSequence(pair.mAlignedSequence2)

    data = alignlib_lite.py_AlignmentFormatExplicit(map_intron_a2b, seq1, seq2)

    pair.mFrom1, pair.mAlignedSequence1, pair.mTo1 = \
        data.mRowFrom, data.mRowAlignment, data.mRowTo
    pair.mFrom2, pair.mAlignedSequence2, pair.mTo2 = \
        data.mColFrom, data.mColAlignment, data.mColTo
    pair.mMethod = param_method

    pair.mNumGaps = map_intron_a2b.getNumGaps()
    pair.mLength = map_intron_a2b.getLength()
    pair.mAligned = pair.mLength - pair.mNumGaps

    if param_loglevel >= 2:
        # same text as the former "print a, b, c" statement
        print("# alignment success %s %s" % (
            pair.mAlignedSequence1, pair.mAlignedSequence2))

    return True
def GetMap(self):
    """return map between the two segments.

    Returns None unless both ``self.mAlignmentFrom1`` and
    ``self.mAlignmentFrom2`` are truthy. Otherwise a new alignment vector is
    filled from the emission-format strings ``self.mAlignment1`` and
    ``self.mAlignment2`` and returned.
    """
    have_both_starts = self.mAlignmentFrom1 and self.mAlignmentFrom2
    if not have_both_starts:
        return None

    result = alignlib_lite.py_makeAlignmentVector()
    emissions = alignlib_lite.py_AlignmentFormatEmissions(
        self.mAlignmentFrom1, self.mAlignment1,
        self.mAlignmentFrom2, self.mAlignment2)
    emissions.copy(result)
    return result
def fillFromTable(self, table_row):
    """Fill this entry from a table row.

    Arguments
    ---------
    table_row : sequence
        At least 25 fields in the order: prediction id, query token, sbjct
        token, sbjct strand, rank, score, query from/to/ali, sbjct
        from/to/ali, query length, query coverage, ngaps, nframeshifts,
        nintrons, nsplits, nstopcodons, percent identity, percent
        similarity, translation, sbjct genome from/to, alignment string.
        A 26th field, if present, is stored in ``mNAssembled``; any further
        fields are ignored.

    Raises
    ------
    ValueError
        If the row has fewer than 25 fields.
    """
    nfields = len(table_row)
    if nfields < 25:
        # the original referenced an undefined name here and therefore
        # raised NameError instead of the intended ValueError
        raise ValueError("unknown format: %i fields" % nfields)

    (self.mPredictionId,
     self.mQueryToken, self.mSbjctToken, self.mSbjctStrand,
     self.mRank, self.score,
     self.mQueryFrom, self.mQueryTo, self.mQueryAli,
     self.mSbjctFrom, self.mSbjctTo, self.mSbjctAli,
     self.mQueryLength, self.mQueryCoverage,
     self.mNGaps, self.mNFrameShifts, self.mNIntrons,
     self.mNSplits, self.mNStopCodons,
     self.mPercentIdentity, self.mPercentSimilarity,
     self.mTranslation,
     self.mSbjctGenomeFrom, self.mSbjctGenomeTo,
     self.mAlignmentString) = table_row[:25]

    if nfields >= 26:
        self.mNAssembled = table_row[25]

    if self.mExpand:
        self.mMapPeptide2Translation = alignlib_lite.py_makeAlignmentVector()

        if self.mQueryAli != "" and self.mSbjctAli != "":
            alignlib_lite.py_AlignmentFormatEmissions(
                self.mQueryFrom, self.mQueryAli,
                self.mSbjctFrom, self.mSbjctAli).copy(
                    self.mMapPeptide2Translation)

        self.mMapPeptide2Genome = Genomics.String2Alignment(
            self.mAlignmentString)
def main(argv=None):
    """script main.

    Reads fasta records from stdin and writes one tab-separated line per
    record to ``options.stdout``: the record title, the literal "ref" and
    the blocks representation of an alignment built from the sequence
    (blanks removed) against a run of "X" of the same length.

    NOTE(review): *argv* defaults to sys.argv but is not forwarded to
    E.Start in this function - confirm how options reach the parser.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: sequence2alignment.py 2782 2009-09-10 11:40:29Z andreas $",
        usage=globals()["__doc__"])
    parser.set_defaults()

    (options, args) = E.Start(parser)

    records = FastaIterator.FastaIterator(sys.stdin)

    ninput = 0
    noutput = 0
    nskipped = 0

    options.stdout.write(
        "query\tsbjct\tquery_from\tquery_to\tsbjct_from\tsbjct_to\tquery_starts\tsbjct_starts\tblock_sizes\n")

    while True:
        try:
            record = records.next()
        except StopIteration:
            break

        ninput += 1

        residues = re.sub(" ", "", record.sequence)
        placeholder = "X" * len(residues)

        map_sequence2mali = alignlib_lite.py_makeAlignmentVector()
        explicit = alignlib_lite.py_AlignmentFormatExplicit(
            0, residues, 0, placeholder)
        explicit.copy(map_sequence2mali)

        blocks = str(alignlib_lite.py_AlignmentFormatBlocks(map_sequence2mali))
        columns = (record.title, "ref", blocks)
        options.stdout.write("\t".join(columns) + "\n")

        noutput += 1

    if options.loglevel >= 1:
        options.stdlog.write("# ninput=%i, noutput=%i, nskipped=%i.\n" %
                             (ninput, noutput, nskipped))

    E.Stop()
def Alignment2CDNA(alignment, query_from=0, sbjct_from=0, genome=None,
                   remove_frameshifts=0):
    """build cDNA sequence from genomic fragment and
    return alignment of query to it.

    Arguments
    ---------
    alignment : iterable
        Tuples of ``(state, l_query, l_sbjct)``. ``l_query`` is multiplied
        by three before use (for state "S" it is replaced by ``l_sbjct``).
        States "M" and "S" are always kept, "F" is kept unless
        *remove_frameshifts* is set, "G" is kept if ``l_sbjct > 0``; all
        other states (including "P") are skipped.
    query_from : int
        Start on the query; multiplied by three to give the start position
        in nucleotides.
    sbjct_from : int
        Start position on *genome*.
    genome : string
        If given, the slices covered by kept segments are collected and
        joined into the returned cDNA sequence.
    remove_frameshifts : bool
        If true, "F" segments are not kept.

    Returns
    -------
    tuple
        ``(map_query2sbjct, cdna)`` where ``cdna`` is the concatenation of
        the collected genome fragments (empty string without *genome*).
    """
    fragments = []
    map_query2sbjct = alignlib_lite.py_makeAlignmentVector()

    # count in nucleotides for query
    query_pos = query_from * 3
    sbjct_pos = sbjct_from

    # position in cDNA
    cdna_pos = 0

    for state, l_query, l_sbjct in alignment:

        # count as nucleotides
        l_query *= 3

        keep = False
        if state == "M":
            keep = True
        elif state == "S":
            l_query = l_sbjct
            keep = True
        elif state == "F" and not remove_frameshifts:
            keep = True
        elif state == "G":
            keep = l_sbjct > 0

        if keep:
            if genome:
                fragments.append(genome[sbjct_pos:sbjct_pos + l_sbjct])
            if l_query > 0 and l_sbjct > 0:
                alignlib_lite.py_addDiagonal2Alignment(
                    map_query2sbjct,
                    query_pos, query_pos + l_query,
                    cdna_pos - query_pos)
            cdna_pos += l_sbjct

        # NOTE(review): query/sbjct positions advance for every segment,
        # the cDNA position only for kept ones. This nesting was
        # reconstructed from whitespace-collapsed source - confirm.
        query_pos += l_query
        sbjct_pos += l_sbjct

    # join is a str method; the original called it on the list, which
    # raised AttributeError on every call
    return map_query2sbjct, "".join(fragments)
def main(argv=None):
    """script main.

    Reads fasta records from stdin and writes one tab-separated line per
    record to ``options.stdout``: the record title, the literal "ref" and
    the blocks representation of an alignment built from the sequence
    (blanks removed) against a run of "X" of the same length.

    NOTE(review): *argv* defaults to sys.argv but is not forwarded to
    E.Start in this function - confirm how options reach the parser.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: sequence2alignment.py 2782 2009-09-10 11:40:29Z andreas $",
        usage=globals()["__doc__"])
    parser.set_defaults()

    (options, args) = E.Start(parser)

    records = FastaIterator.FastaIterator(sys.stdin)

    ninput = 0
    noutput = 0
    nskipped = 0

    options.stdout.write(
        "query\tsbjct\tquery_from\tquery_to\tsbjct_from\tsbjct_to\tquery_starts\tsbjct_starts\tblock_sizes\n")

    while True:
        try:
            record = records.next()
        except StopIteration:
            break

        ninput += 1

        residues = re.sub(" ", "", record.sequence)
        placeholder = "X" * len(residues)

        map_sequence2mali = alignlib_lite.py_makeAlignmentVector()
        explicit = alignlib_lite.py_AlignmentFormatExplicit(
            0, residues, 0, placeholder)
        explicit.copy(map_sequence2mali)

        blocks = str(alignlib_lite.py_AlignmentFormatBlocks(map_sequence2mali))
        columns = (record.title, "ref", blocks)
        options.stdout.write("\t".join(columns) + "\n")

        noutput += 1

    if options.loglevel >= 1:
        options.stdlog.write("# ninput=%i, noutput=%i, nskipped=%i.\n" %
                             (ninput, noutput, nskipped))

    E.Stop()
def getCopy(self):
    """return a new copy.

    All scalar fields are copied to a fresh ``Prediction``. If
    ``self.mExpand`` is set, the peptide-to-translation map is copied into
    a new alignment vector and the peptide-to-genome map is rebuilt from
    the alignment string; otherwise both maps of the copy are None.

    This object is left unchanged.
    """
    new_entry = Prediction()

    copied_fields = (
        "mExpand",
        "mPredictionId",
        "mQueryToken", "mQueryFrom", "mQueryTo",
        "mSbjctToken", "mSbjctStrand", "mSbjctFrom", "mSbjctTo",
        "mRank", "score",
        "mQueryLength", "mQueryCoverage",
        "mNGaps", "mNFrameShifts", "mNIntrons", "mNSplits", "mNStopCodons",
        "mPercentIdentity", "mPercentSimilarity",
        "mTranslation",
        "mSbjctGenomeFrom", "mSbjctGenomeTo",
        "mAlignmentString",
        "mQueryAli", "mSbjctAli",
        # previously omitted, so copies silently fell back to the default
        "mNAssembled",
    )
    for name in copied_fields:
        setattr(new_entry, name, getattr(self, name))

    if self.mExpand:
        new_entry.mMapPeptide2Translation = \
            alignlib_lite.py_makeAlignmentVector()
        alignlib_lite.py_copyAlignment(new_entry.mMapPeptide2Translation,
                                       self.mMapPeptide2Translation)
        new_entry.mMapPeptide2Genome = Genomics.String2Alignment(
            new_entry.mAlignmentString)
    else:
        # only the copy is reset; the original chained assignment also
        # overwrote the maps of the object being copied
        new_entry.mMapPeptide2Translation = None
        new_entry.mMapPeptide2Genome = None

    return new_entry
def fillFromTable(self, table_row):
    """Fill this entry from a table row.

    Arguments
    ---------
    table_row : sequence
        At least 25 fields in the order: prediction id, query token, sbjct
        token, sbjct strand, rank, score, query from/to/ali, sbjct
        from/to/ali, query length, query coverage, ngaps, nframeshifts,
        nintrons, nsplits, nstopcodons, percent identity, percent
        similarity, translation, sbjct genome from/to, alignment string.
        A 26th field, if present, is stored in ``mNAssembled``; any further
        fields are ignored.

    Raises
    ------
    ValueError
        If the row has fewer than 25 fields.
    """
    nfields = len(table_row)
    if nfields < 25:
        # the original referenced an undefined name here and therefore
        # raised NameError instead of the intended ValueError
        raise ValueError("unknown format: %i fields" % nfields)

    (self.mPredictionId,
     self.mQueryToken, self.mSbjctToken, self.mSbjctStrand,
     self.mRank, self.score,
     self.mQueryFrom, self.mQueryTo, self.mQueryAli,
     self.mSbjctFrom, self.mSbjctTo, self.mSbjctAli,
     self.mQueryLength, self.mQueryCoverage,
     self.mNGaps, self.mNFrameShifts, self.mNIntrons,
     self.mNSplits, self.mNStopCodons,
     self.mPercentIdentity, self.mPercentSimilarity,
     self.mTranslation,
     self.mSbjctGenomeFrom, self.mSbjctGenomeTo,
     self.mAlignmentString) = table_row[:25]

    if nfields >= 26:
        self.mNAssembled = table_row[25]

    if self.mExpand:
        self.mMapPeptide2Translation = alignlib_lite.py_makeAlignmentVector()

        if self.mQueryAli != "" and self.mSbjctAli != "":
            alignlib_lite.py_AlignmentFormatEmissions(
                self.mQueryFrom, self.mQueryAli,
                self.mSbjctFrom, self.mSbjctAli).copy(
                    self.mMapPeptide2Translation)

        self.mMapPeptide2Genome = Genomics.String2Alignment(
            self.mAlignmentString)
def getCopy(self):
    """return a new copy.

    All scalar fields are copied to a fresh ``Prediction``. If
    ``self.mExpand`` is set, the peptide-to-translation map is copied into
    a new alignment vector and the peptide-to-genome map is rebuilt from
    the alignment string; otherwise both maps of the copy are None.

    This object is left unchanged.
    """
    new_entry = Prediction()

    copied_fields = (
        "mExpand",
        "mPredictionId",
        "mQueryToken", "mQueryFrom", "mQueryTo",
        "mSbjctToken", "mSbjctStrand", "mSbjctFrom", "mSbjctTo",
        "mRank", "score",
        "mQueryLength", "mQueryCoverage",
        "mNGaps", "mNFrameShifts", "mNIntrons", "mNSplits", "mNStopCodons",
        "mPercentIdentity", "mPercentSimilarity",
        "mTranslation",
        "mSbjctGenomeFrom", "mSbjctGenomeTo",
        "mAlignmentString",
        "mQueryAli", "mSbjctAli",
        # previously omitted, so copies silently fell back to the default
        "mNAssembled",
    )
    for name in copied_fields:
        setattr(new_entry, name, getattr(self, name))

    if self.mExpand:
        new_entry.mMapPeptide2Translation = \
            alignlib_lite.py_makeAlignmentVector()
        alignlib_lite.py_copyAlignment(new_entry.mMapPeptide2Translation,
                                       self.mMapPeptide2Translation)
        new_entry.mMapPeptide2Genome = Genomics.String2Alignment(
            new_entry.mAlignmentString)
    else:
        # only the copy is reset; the original chained assignment also
        # overwrote the maps of the object being copied
        new_entry.mMapPeptide2Translation = None
        new_entry.mMapPeptide2Genome = None

    return new_entry
def __init__(self, expand=1):
    """Create an empty entry.

    Numeric fields start at 0 and text fields at the empty string. If
    *expand* is true, ``mMapPeptide2Translation`` is a new alignment vector
    and ``mMapPeptide2Genome`` an empty list; otherwise both are None.
    """
    self.mExpand = expand

    zero_fields = (
        "mPredictionId",
        "mQueryToken", "mQueryFrom", "mQueryTo",
        "mSbjctToken", "mSbjctStrand", "mSbjctFrom", "mSbjctTo",
        "mRank", "score",
        "mQueryLength", "mQueryCoverage",
        "mNGaps", "mNFrameShifts", "mNIntrons", "mNSplits", "mNStopCodons",
        "mPercentIdentity", "mPercentSimilarity",
        "mSbjctGenomeFrom", "mSbjctGenomeTo",
        "mNAssembled",
    )
    for field in zero_fields:
        setattr(self, field, 0)

    text_fields = ("mTranslation", "mAlignmentString",
                   "mQueryAli", "mSbjctAli")
    for field in text_fields:
        setattr(self, field, "")

    if not self.mExpand:
        self.mMapPeptide2Translation = None
        self.mMapPeptide2Genome = None
    else:
        self.mMapPeptide2Translation = alignlib_lite.py_makeAlignmentVector()
        self.mMapPeptide2Genome = []
def __init__(self, expand=1):
    """Create an empty entry.

    Numeric fields start at 0 and text fields at the empty string. If
    *expand* is true, ``mMapPeptide2Translation`` is a new alignment vector
    and ``mMapPeptide2Genome`` an empty list; otherwise both are None.
    """
    self.mExpand = expand

    zero_fields = (
        "mPredictionId",
        "mQueryToken", "mQueryFrom", "mQueryTo",
        "mSbjctToken", "mSbjctStrand", "mSbjctFrom", "mSbjctTo",
        "mRank", "score",
        "mQueryLength", "mQueryCoverage",
        "mNGaps", "mNFrameShifts", "mNIntrons", "mNSplits", "mNStopCodons",
        "mPercentIdentity", "mPercentSimilarity",
        "mSbjctGenomeFrom", "mSbjctGenomeTo",
        "mNAssembled",
    )
    for field in zero_fields:
        setattr(self, field, 0)

    text_fields = ("mTranslation", "mAlignmentString",
                   "mQueryAli", "mSbjctAli")
    for field in text_fields:
        setattr(self, field, "")

    if not self.mExpand:
        self.mMapPeptide2Translation = None
        self.mMapPeptide2Genome = None
    else:
        self.mMapPeptide2Translation = alignlib_lite.py_makeAlignmentVector()
        self.mMapPeptide2Genome = []
continue if pair.mMethod == "unaligned": unaligned_pair = pair pair.mType1 = GetIntronType(unaligned_pair.mAlignedSequence1) pair.mType2 = GetIntronType(unaligned_pair.mAlignedSequence2) do_print = param_echo_unaligned else: do_print = 1 if param_is_compressed: if unaligned_pair and \ unaligned_pair.mToken1 == pair.mToken1 and \ unaligned_pair.mToken2 == pair.mToken2 and \ unaligned_pair.mIntronId1 == pair.mIntronId1: map_a2b = alignlib_lite.py_makeAlignmentVector() f = AlignmentFormatEmissions( pair.mFrom1, pair.mAlignedSequence1, pair.mFrom2, pair.mAlignedSequence2).copy(map_a2b) map_a2b.moveAlignment(-unaligned_pair.mFrom1 + 1, -unaligned_pair.mFrom2 + 1) data = alignlib_lite.py_AlignmentFormatExplicit(map_a2b, alignlib_lite.py_makeSequence( unaligned_pair.mAlignedSequence1), alignlib_lite.py_makeSequence(unaligned_pair.mAlignedSequence2)) from1, ali1, to1 = data.mRowFrom, data.mRowAlignment, data.mRowTo from2, ali2, to2 = data.mColFrom, data.mColAlignment, data.mColTo
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Every positional argument is read as a codeml output file. The parsed
    results are then reported once per ``--methods`` entry:

    summary-numbers
        counts of positive sites per model/analysis plus the outcome of the
        log-likelihood ratio test of each model against its nested model.
    jalview
        a jalview annotation file with one row per result.
    count-positive-sites
        the number of selected positive sites.
    positive-site-table
        table of selected sites, optionally mapped onto the alignment given
        by ``--filename-map-mali``.

    Raises
    ------
    ValueError
        For an unknown ``--analysis`` or ``--models`` entry, or if
        ``--filename-map-mali`` is given without ``--filename-mali``.

    NOTE(review): *argv* defaults to sys.argv but is not forwarded to
    E.Start in this function - confirm how options reach the parser.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: codemls2tsv.py 2781 2009-09-10 11:33:14Z andreas $")

    parser.add_option("--methods", dest="methods", type="choice",
                      action="append",
                      choices=("summary-numbers", "jalview",
                               "positive-site-table", "positive-site-list",
                               "count-positive-sites"),
                      help="methods for analysis.")

    parser.add_option("--selection-mode", dest="selection_mode",
                      type="choice",
                      choices=("all", "consistent", "emes"),
                      help="how to select positive sites.")

    parser.add_option("--prefix", dest="prefix", type="string",
                      help="prefix for rows.")

    parser.add_option("--pattern-input-filenames",
                      dest="pattern_input_filenames", type="string",
                      help="input pattern.")

    parser.add_option("--filter-probability", dest="filter_probability",
                      type="float",
                      help="threshold for probability above which to include positive sites [default=%default].")

    parser.add_option("--filter-omega", dest="filter_omega", type="float",
                      help="threshold for omega above which to include positive sites [default=%default].")

    parser.add_option("--models", dest="models", type="string",
                      help="restrict output to set of site specific models.")

    parser.add_option("--analysis", dest="analysis", type="string",
                      help="restrict output to set of analysis [beb|neb].")

    parser.add_option("--significance-threshold",
                      dest="significance_threshold", type="float",
                      help="significance threshold for log-likelihood test.")

    parser.add_option("--filter-mali", dest="filter_mali", type="choice",
                      choices=("none", "gaps"),
                      help="filter by mali to remove gapped positions.")

    parser.add_option("--filename-mali", dest="filename_mali", type="string",
                      help="filename with multiple alignment used for calculating sites - used for filtering")

    parser.add_option("--filename-map-mali", dest="filename_map_mali",
                      type="string",
                      help="filename with multiple alignment to map sites onto.")

    parser.add_option("--jalview-titles", dest="jalview_titles",
                      type="string",
                      help="comma separated list of jalview annotation titles.")

    parser.add_option("--jalview-symbol", dest="jalview_symbol",
                      type="string",
                      help="symbol to use in jalview.")

    parser.set_defaults(
        methods=[],
        prefix=None,
        filter_probability=0,
        filter_omega=0,
        models="",
        analysis="",
        significance_threshold=0.05,
        selection_mode="consistent",
        filename_mali=None,
        filename_map_mali=None,
        jalview_symbol="*",
        jalview_titles="",
        filter_mali=None,
    )

    (options, args) = E.Start(parser)

    if options.jalview_titles:
        options.jalview_titles = options.jalview_titles.split(",")
    else:
        options.jalview_titles = args

    options.models = options.models.split(",")
    options.analysis = options.analysis.split(",")

    # String exceptions are a TypeError on Python >= 2.6, so real
    # exceptions are raised for the validation errors below.
    for a in options.analysis:
        if a not in ("beb", "neb"):
            raise ValueError(
                "unknown analysis section: '%s', possible values are 'beb' and/or 'neb'" % a)

    for a in options.models:
        if a not in ("8", "2", "3"):
            raise ValueError(
                "unknown model: '%s', possible values are 2, 3, 8" % a)

    codeml = WrapperCodeML.CodeMLSites()

    # filter and extract functions
    def filter_f(x):
        return (x.mProbability >= options.filter_probability and
                x.mOmega >= options.filter_omega)

    def extract_f(x):
        return x.mResidue

    # read multiple results
    results = []
    ninput, noutput, nskipped = 0, 0, 0

    headers = []
    for f in args:
        ninput += 1
        try:
            with open(f, "r") as infile:
                lines = infile.readlines()
            results.append(codeml.parseOutput(lines))
        except WrapperCodeML.UsageError:
            if options.loglevel >= 1:
                options.stdlog.write("# no input from %s\n" % f)
            nskipped += 1
            continue
        noutput += 1
        headers.append(f)

    # map of nested model (key) to more general model
    map_nested_models = {'8': '7',
                         '2': '1',
                         '3': '0'}

    if options.filename_mali:
        mali = Mali.Mali()
        with open(options.filename_mali, "r") as infile:
            mali.readFromFile(infile)
    else:
        mali = None

    ###############################################################
    # use multiple alignment to map residues to a reference mali
    # or a sequence.
    ###############################################################
    if options.filename_map_mali:

        if not mali:
            raise ValueError(
                "please supply the input multiple alignment, if residues are to be mapped.")

        # translate the alignments
        def translate(s):
            sequence = s.mString
            s.mString = "".join(
                [Genomics.MapCodon2AA(sequence[i:i + 3])
                 for i in range(0, len(sequence), 3)])

        tmali = Mali.Mali()
        with open(options.filename_mali, "r") as infile:
            tmali.readFromFile(infile)
        tmali.apply(translate)

        tmap_mali = Mali.Mali()
        with open(options.filename_map_mali, "r") as infile:
            tmap_mali.readFromFile(infile)

        if tmap_mali.getAlphabet() == "na":
            tmap_mali.apply(translate)

        map_old2new = alignlib_lite.py_makeAlignmentVector()

        mali1 = alignlib_lite.py_makeProfileFromMali(convertMali2Mali(tmali))

        if tmap_mali.getLength() == 1:

            s = tmap_mali.values()[0].mString
            mali2 = alignlib_lite.py_makeSequence(s)
            # see if you can find an identical subsequence and then align
            # to this
            for x in tmali.values():
                if s in re.sub("[- .]+", "", x.mString):
                    mali1 = alignlib_lite.py_makeSequence(x.mString)
                    break
        else:
            mali2 = alignlib_lite.py_makeProfileFromMali(
                convertMali2Mali(tmap_mali))

        alignator = alignlib_lite.py_makeAlignatorDPFull(
            alignlib_lite.py_ALIGNMENT_LOCAL, -10.0, -2.0)
        alignator.align(map_old2new, mali1, mali2)

        consensus = tmap_mali.getConsensus()

        if options.loglevel >= 4:
            options.stdlog.write("# alphabet: %s\n" % tmap_mali.getAlphabet())
            options.stdlog.write("# orig  : %s\n" % tmali.getConsensus())
            options.stdlog.write("# mapped: %s\n" % consensus)
            options.stdlog.write("# alignment: %s\n" % map_old2new.Write())
    else:
        map_old2new = None

    for method in options.methods:

        if method == "summary-numbers":

            options.stdlog.write(
                """# Numbers of positive sites.
#
# The consistent row/column contains positive sites that are significant
# (above thresholds for probability and omega) for all models/analysis
# that have been selected (label: cons).
#
# The log-likelihood ratio test is performed for model pairs, depending
# on the output chosen.
# Significance threshold: %6.4f
# The pairs are 8 versus 7 and 2 versus 1 and 3 versus 0.
#
""" % options.significance_threshold)

            # write header
            if options.prefix:
                options.stdout.write("prefix\t")

            options.stdout.write("method\tnseq\t")
            h = []
            for model in options.models:
                for analysis in options.analysis:
                    h.append("%s%s" % (analysis, model))
                h.append("p%s" % (model))
                h.append("df%s" % (model))
                h.append("chi%s" % (model))
                h.append("lrt%s" % (model))

            options.stdout.write("\t".join(h))
            options.stdout.write("\tcons\tpassed\tfilename\n")

            nmethod = 0

            consistent_cols = [None for x in range(len(options.analysis))]
            passed_tests = {}
            for m in options.models:
                passed_tests[m] = 0

            for result in results:

                row_consistent = None

                if options.prefix:
                    # The tab keeps the prefix in its own column, matching
                    # the header and the "cons" row; it used to be fused
                    # with the method index.
                    options.stdout.write("%s\t" % (options.prefix))

                options.stdout.write("%i" % nmethod)
                options.stdout.write("\t%i" % (result.mNumSequences))

                npassed = 0

                for model in options.models:

                    sites = result.mSites[model]

                    # do significance test
                    full_model, null_model = model, map_nested_models[model]

                    lrt = Stats.doLogLikelihoodTest(
                        result.mSites[full_model].mLogLikelihood,
                        result.mSites[full_model].mNumParameters,
                        result.mSites[null_model].mLogLikelihood,
                        result.mSites[null_model].mNumParameters,
                        options.significance_threshold)

                    for x, analysis in enumerate(options.analysis):

                        if analysis == "neb":
                            s = set(map(extract_f,
                                        filter(filter_f,
                                               sites.mNEB.mPositiveSites)))
                        elif analysis == "beb":
                            s = set(map(extract_f,
                                        filter(filter_f,
                                               sites.mBEB.mPositiveSites)))

                        options.stdout.write("\t%i" % (len(s)))

                        # sites of models failing the test do not count
                        # towards the consistent sets
                        if not lrt.mPassed:
                            s = set()

                        if row_consistent is None:
                            row_consistent = s
                        else:
                            row_consistent = row_consistent.intersection(s)

                        if consistent_cols[x] is None:
                            consistent_cols[x] = s
                        else:
                            consistent_cols[x] = \
                                consistent_cols[x].intersection(s)

                    if lrt.mPassed:
                        c = "passed"
                        passed_tests[model] += 1
                        npassed += 1
                    else:
                        c = "failed"

                    options.stdout.write(
                        "\t%5.2e\t%i\t%5.2f\t%s" %
                        (lrt.mProbability,
                         lrt.mDegreesFreedom,
                         lrt.mChiSquaredValue,
                         c))

                options.stdout.write(
                    "\t%i\t%i\t%s\n" %
                    (len(row_consistent), npassed, headers[nmethod]))

                nmethod += 1

            if options.prefix:
                options.stdout.write("%s\t" % options.prefix)

            options.stdout.write("cons")

            row_consistent = None
            total_passed = 0
            for model in options.models:

                for x, analysis in enumerate(options.analysis):

                    s = consistent_cols[x]
                    if s is None:
                        s = set()

                    options.stdout.write("\t%i" % (len(s)))

                    if row_consistent is None:
                        row_consistent = s
                    else:
                        row_consistent = row_consistent.intersection(s)

                options.stdout.write("\tna\t%i" % passed_tests[model])
                total_passed += passed_tests[model]

            options.stdout.write(
                "\t%i\t%i\n" % (len(row_consistent), total_passed))

        elif method == "jalview":

            options.stdout.write("JALVIEW_ANNOTATION\n")
            options.stdout.write(
                "# Created: %s\n\n" %
                (time.asctime(time.localtime(time.time()))))

            # x indexes the titles and only advances for written rows
            x = 0
            for result in results:

                sites, significance = selectPositiveSites(
                    [result], options.selection_mode, options, mali)

                codes = [""] * result.mLength

                if len(sites) == 0:
                    continue

                for site in sites:
                    codes[site - 1] = options.jalview_symbol

                options.stdout.write(
                    "NO_GRAPH\t%s\t%s\n" %
                    (options.jalview_titles[x], "|".join(codes)))
                x += 1

        elif method == "count-positive-sites":

            sites, significance = selectPositiveSites(
                results, options.selection_mode, options, mali)

            options.stdout.write("%i\n" % (len(sites)))

        elif method in ("positive-site-table", ):

            sites, significance = selectPositiveSites(
                results, options.selection_mode, options, mali)

            # separate name: reusing "headers" clobbered the list of input
            # filenames that summary-numbers reads
            table_headers = ["site", "P"]
            if map_old2new:
                table_headers.append("mapped")
                table_headers.append("Pm")

            options.stdout.write("\t".join(table_headers) + "\n")

            sites = list(sites)
            sites.sort()
            nmapped, nunmapped = 0, 0
            for site in sites:
                values = [site, "%6.4f" % significance[site]]

                if map_old2new:
                    r = map_old2new.mapRowToCol(site)
                    if r == 0:
                        values.append("na")
                        values.append("")
                        nunmapped += 1
                        if options.loglevel >= 2:
                            options.stdlog.write(
                                "# unmapped residue: %i\n" % site)
                    else:
                        values.append(r)
                        values.append(consensus[r - 1])
                        nmapped += 1

                options.stdout.write("\t".join(map(str, (values))) + "\n")

            if options.loglevel >= 1:
                options.stdlog.write(
                    "# sites: ninput=%i, noutput=%i, nskipped=%i\n" %
                    (len(sites), nmapped, nunmapped))

    E.info("ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped))

    E.Stop()
def read(self, line):
    """Fill this entry from one tab-separated line.

    The last character of *line* is dropped (assumed to be the line
    terminator) before splitting at tabs. Supported layouts:

    26 fields
        all fields including the prediction id and ``mNAssembled``.
    25 fields
        as above without ``mNAssembled``.
    24 fields
        additionally without the prediction id.
    23 fields
        additionally without the alignment string, which is set to "".

    Fields missing from the line keep their current value. Afterwards the
    score, coverage and percentage fields are converted to float and the
    id, coordinate and count fields to int.

    Raises
    ------
    ValueError
        If the number of fields is not one of the above.
    """
    # str.split replaces string.split, which no longer exists in Python 3
    data = line[:-1].split("\t")
    nfields = len(data)

    if nfields == 26:
        (self.mPredictionId,
         self.mQueryToken, self.mSbjctToken, self.mSbjctStrand,
         self.mRank, self.score,
         self.mQueryFrom, self.mQueryTo, self.mQueryAli,
         self.mSbjctFrom, self.mSbjctTo, self.mSbjctAli,
         self.mQueryLength, self.mQueryCoverage,
         self.mNGaps, self.mNFrameShifts, self.mNIntrons,
         self.mNSplits, self.mNStopCodons,
         self.mPercentIdentity, self.mPercentSimilarity,
         self.mTranslation,
         self.mSbjctGenomeFrom, self.mSbjctGenomeTo,
         self.mAlignmentString,
         self.mNAssembled,
         ) = data
    elif nfields == 25:
        (self.mPredictionId,
         self.mQueryToken, self.mSbjctToken, self.mSbjctStrand,
         self.mRank, self.score,
         self.mQueryFrom, self.mQueryTo, self.mQueryAli,
         self.mSbjctFrom, self.mSbjctTo, self.mSbjctAli,
         self.mQueryLength, self.mQueryCoverage,
         self.mNGaps, self.mNFrameShifts, self.mNIntrons,
         self.mNSplits, self.mNStopCodons,
         self.mPercentIdentity, self.mPercentSimilarity,
         self.mTranslation,
         self.mSbjctGenomeFrom, self.mSbjctGenomeTo,
         self.mAlignmentString,
         ) = data
    elif nfields == 24:
        (self.mQueryToken, self.mSbjctToken, self.mSbjctStrand,
         self.mRank, self.score,
         self.mQueryFrom, self.mQueryTo, self.mQueryAli,
         self.mSbjctFrom, self.mSbjctTo, self.mSbjctAli,
         self.mQueryLength, self.mQueryCoverage,
         self.mNGaps, self.mNFrameShifts, self.mNIntrons,
         self.mNSplits, self.mNStopCodons,
         self.mPercentIdentity, self.mPercentSimilarity,
         self.mTranslation,
         self.mSbjctGenomeFrom, self.mSbjctGenomeTo,
         self.mAlignmentString,
         ) = data
    elif nfields == 23:
        (self.mQueryToken, self.mSbjctToken, self.mSbjctStrand,
         self.mRank, self.score,
         self.mQueryFrom, self.mQueryTo, self.mQueryAli,
         self.mSbjctFrom, self.mSbjctTo, self.mSbjctAli,
         self.mQueryLength, self.mQueryCoverage,
         self.mNGaps, self.mNFrameShifts, self.mNIntrons,
         self.mNSplits, self.mNStopCodons,
         self.mPercentIdentity, self.mPercentSimilarity,
         self.mTranslation,
         self.mSbjctGenomeFrom, self.mSbjctGenomeTo,
         ) = data
        self.mAlignmentString = ""
    else:
        raise ValueError("unknown format: %i fields in line %s" %
                         (nfields, line[:-1]))

    for name in ("score", "mQueryCoverage",
                 "mPercentIdentity", "mPercentSimilarity"):
        setattr(self, name, float(getattr(self, name)))

    for name in ("mPredictionId",
                 "mQueryFrom", "mQueryTo", "mQueryLength",
                 "mSbjctFrom", "mSbjctTo",
                 "mSbjctGenomeFrom", "mSbjctGenomeTo",
                 "mNGaps", "mNIntrons", "mNSplits", "mNStopCodons",
                 "mNFrameShifts", "mNAssembled"):
        setattr(self, name, int(getattr(self, name)))

    if self.mExpand:
        self.mMapPeptide2Translation = alignlib_lite.py_makeAlignmentVector()

        if self.mQueryAli != "" and self.mSbjctAli != "":
            # NOTE(review): this reader uses py_AlignmentFormatExplicit on
            # the aligned strings - confirm this is the intended format.
            alignlib_lite.py_AlignmentFormatExplicit(
                self.mQueryFrom, self.mQueryAli,
                self.mSbjctFrom, self.mSbjctAli).copy(
                    self.mMapPeptide2Translation)

        self.mMapPeptide2Genome = Genomics.String2Alignment(
            self.mAlignmentString)
def getMapPeptide2Cds(peptide_sequence, cds_sequence, options):
    """Align a peptide against its coding sequence.

    Blanks are stripped from the peptide; blanks, dots and dashes are
    stripped from the cds. The peptide is turned into a wobble sequence
    and aligned codon-wise against the cds. When that alignment contains
    more than five unexplained gaps, an exhaustive alignment replaces it.
    Codons that are not aligned in all three positions are removed.

    The returned alignment is in nucleotides.

    Raises ValueError if the codon based alignment fails or the final
    alignment is empty.
    """
    gap_limit = 5

    peptide = peptide_sequence.replace(" ", "")
    cds = re.sub("[ .-]", "", cds_sequence)
    wobble = Genomics.Protein2Wobble(peptide.upper())

    if options.loglevel >= 6:
        for template, seq in (
                ("# peptide original (%5i): %s\n", peptide),
                ("# cds original (%5i): %s\n", cds),
                ("# wobble sequence (%5i): %s\n", wobble)):
            options.stdlog.write(template % (len(seq), seq))
        options.stdlog.flush()

    seq_wobble = alignlib_lite.py_makeSequence(wobble)
    seq_cds = alignlib_lite.py_makeSequence(cds.upper())
    seq_peptide = alignlib_lite.py_makeSequence(peptide)

    alignment = alignlib_lite.py_makeAlignmentVector()
    try:
        AlignCodonBased(seq_wobble, seq_cds, seq_peptide, alignment,
                        options=options)
    except ValueError as msg:
        raise ValueError("mapping error for sequence: %s" % (msg))

    # Gaps explained by gap characters in the peptide or by a plain length
    # difference are not counted against the limit.
    num_peptide_gaps = peptide.count("-")
    ngaps = alignment.getNumGaps() - (num_peptide_gaps * 3) - \
        abs(len(wobble) - len(cds))

    if options.loglevel >= 6:
        options.stdlog.write(
            "# alignment between wobble and cds: ngaps=%i, npeptide_gaps=%i\n"
            % (ngaps, num_peptide_gaps))
        printPrettyAlignment(seq_wobble, seq_cds, peptide, alignment, options)

    if ngaps > gap_limit:
        if options.loglevel >= 2:
            options.stdlog.write(
                "# too many gaps (%i>%i), realigning exhaustively.\n"
                % (ngaps, gap_limit))
            options.stdlog.flush()

        exhaustive = alignlib_lite.py_makeAlignmentVector()
        AlignExhaustive(seq_wobble, seq_cds, seq_peptide, exhaustive, options)

        if options.loglevel >= 6:
            options.stdlog.write("# full alignment between wobble and cds:\n")
            options.stdlog.flush()
            printPrettyAlignment(seq_wobble, seq_cds, peptide, exhaustive,
                                 options)

        alignment = exhaustive

    # Drop every codon that has at least one unaligned position.
    for codon_start in range(0, len(peptide) * 3, 3):
        if any(alignment.mapRowToCol(codon_start + offset) < 0
               for offset in range(3)):
            alignment.removeRowRegion(codon_start, codon_start + 3)

    if alignment.getLength() == 0:
        if options.loglevel >= 1:
            options.stdlog.write("# WARNING: empty alignment\n")
        if options.loglevel >= 6:
            options.stdlog.write("# peptide original: %s\n" % peptide)
            options.stdlog.write("# cds original : %s\n" % cds)
            options.stdlog.write("# wobble sequence : %s\n" % wobble)
        raise ValueError("empty alignment")

    assert (alignment.getRowTo() <= seq_wobble.getLength())
    assert (alignment.getColTo() <= seq_cds.getLength())

    return alignment
ngaps = map_p2c.getNumGaps() - \ (num_peptide_gaps * 3) - abs(len(w) - len(c)) if options.loglevel >= 6: options.stdlog.write( "# alignment between wobble and cds: ngaps=%i, npeptide_gaps=%i\n" % (ngaps, num_peptide_gaps)) printPrettyAlignment(seq_wobble, seq_cds, p, map_p2c, options) if ngaps > max_gaps: if options.loglevel >= 2: options.stdlog.write( "# too many gaps (%i>%i), realigning exhaustively.\n" % (ngaps, max_gaps)) options.stdlog.flush() full_map_p2c = alignlib_lite.py_makeAlignmentVector() AlignExhaustive(seq_wobble, seq_cds, seq_peptide, full_map_p2c, options) if options.loglevel >= 6: options.stdlog.write("# full alignment between wobble and cds:\n") options.stdlog.flush() printPrettyAlignment(seq_wobble, seq_cds, p, full_map_p2c, options) map_p2c = full_map_p2c # remove incomplete codons x = 0 while x < len(p) * 3: if (map_p2c.mapRowToCol(x) < 0 or map_p2c.mapRowToCol(x + 1) < 0 or map_p2c.mapRowToCol(x + 2) < 0):
def main(argv=None):
    """Compare two FASTA files sequence by sequence.

    Sequences are matched by identifier (after ``MapIdentifiers`` has been
    applied with the ``--pattern1`` / ``--pattern2`` expressions) and
    compared case-insensitively. Differing pairs are classified as
    ``first``, ``prefix``, ``last``, ``selenocysteine``, ``masked`` or
    ``other``. With ``--correct-gap-shift`` an attempt is made to explain
    the difference by runs of ``N`` of different length.

    Parses command line options in sys.argv, unless *argv* is given.

    Raises
    ------
    ValueError
        If not exactly two files are given or one of them holds no
        sequences.
    ImportError
        If gap shift correction is requested but ``alignlib_lite`` cannot
        be imported.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-s", "--correct-gap-shift", dest="correct_shift",
                      action="store_true",
                      help="correct gap length shifts in alignments. "
                      "Requires alignlib_lite.py [%default]")

    parser.add_option(
        "-1", "--pattern1", dest="pattern1", type="string",
        help="pattern to extract identifier from in identifiers1. "
        "[%default]")

    parser.add_option(
        "-2", "--pattern2", dest="pattern2", type="string",
        help="pattern to extract identifier from in identifiers2. "
        "[%default]")

    parser.add_option("-o", "--output-section", dest="output", type="choice",
                      action="append",
                      choices=("diff", "missed", "seqdiff"),
                      help="what to output [%default]")

    parser.set_defaults(correct_shift=False,
                        pattern1=r"(\S+)",
                        pattern2=r"(\S+)",
                        output=[])

    # Hand argv on so that an explicitly supplied argument list is honoured.
    (options, args) = E.Start(parser, argv=argv)

    if len(args) != 2:
        raise ValueError("two files needed to compare.")

    if options.correct_shift:
        try:
            import alignlib_lite
        except ImportError:
            raise ImportError(
                "option --correct-shift requires alignlib_lite.py_ "
                "but alignlib not found")

    seqs1 = dict(
        (x.title, x.sequence)
        for x in FastaIterator.iterate(IOTools.openFile(args[0], "r")))
    seqs2 = dict(
        (x.title, x.sequence)
        for x in FastaIterator.iterate(IOTools.openFile(args[1], "r")))

    if not seqs1:
        raise ValueError("first file %s is empty." % (args[0]))
    if not seqs2:
        raise ValueError("second file %s is empty." % (args[1]))

    MapIdentifiers(seqs1, options.pattern1)
    MapIdentifiers(seqs2, options.pattern2)

    nsame = 0
    nmissed1 = 0
    nmissed2 = 0
    ndiff = 0
    ndiff_first = 0
    ndiff_last = 0
    ndiff_prefix = 0
    ndiff_selenocysteine = 0
    ndiff_masked = 0
    nfixed = 0
    found2 = {}

    write_missed1 = "missed" in options.output
    write_missed2 = "missed" in options.output
    write_seqdiff = "seqdiff" in options.output
    write_diff = "diff" in options.output or write_seqdiff

    for k in seqs1:
        if k not in seqs2:
            nmissed1 += 1
            if write_missed1:
                options.stdout.write("---- %s ---- %s\n" % (k, "missed1"))
            continue

        found2[k] = 1

        s1 = seqs1[k].upper()
        s2 = seqs2[k].upper()
        m = min(len(s1), len(s2))

        if s1 == s2:
            nsame += 1
        else:
            status = "other"

            ndiff += 1

            if s1[1:] == s2[1:]:
                ndiff_first += 1
                status = "first"
            elif s1[:m] == s2[:m]:
                ndiff_prefix += 1
                status = "prefix"
            elif s1[:-1] == s2[:-1]:
                ndiff_last += 1
                status = "last"
            else:
                if len(s1) == len(s2):
                    # get all differences: the first and last residues
                    # can be different for peptide sequences when
                    # comparing my translations with ensembl peptides.
                    differences = [(s1[x], s2[x])
                                   for x in range(1, len(s1) - 1)
                                   if s1[x] != s2[x]]

                    # check for selenocysteines: every difference involves
                    # a "U" (also true when there is no difference at all)
                    if all(r1 == "U" or r2 == "U"
                           for r1, r2 in differences):
                        ndiff_selenocysteine += 1
                        status = "selenocysteine"

                    # check for masked residues
                    elif all(r1 in "NX" or r2 in "NX"
                             for r1, r2 in differences):
                        ndiff_masked += 1
                        status = "masked"

                # correct for different gap lengths
                if options.correct_shift:

                    map_a2b = alignlib_lite.py_makeAlignmentVector()

                    a, b = 0, 0
                    keep = False

                    # NOTE(review): x is never advanced, so ``x < m`` only
                    # guards against empty sequences; the loop ends through
                    # the second condition or one of the breaks.
                    x = 0
                    while x < m and not (a == len(s1) and b == len(s2)):
                        try:
                            if s1[a] != s2[b]:
                                # skip the surplus of N in either sequence
                                while s1[a] == "N" and s2[b] != "N":
                                    a += 1
                                while s1[a] != "N" and s2[b] == "N":
                                    b += 1

                                if s1[a] != s2[b]:
                                    break
                        except IndexError:
                            print("# index error for %s: x=%i, a=%i, b=%i, l1=%i, l2=%i" % (
                                k, x, a, b, len(s1), len(s2)))
                            break

                        a += 1
                        b += 1
                        map_a2b.addPairExplicit(a, b, 0.0)
                    else:
                        # the loop ran to completion without a break:
                        # both sequences have been consumed.
                        keep = True
                        nfixed += 1
                        f = alignlib_lite.py_AlignmentFormatEmissions(map_a2b)
                        print("fix\t%s\t%s" % (k, str(f)))

                    if not keep:
                        print("# warning: not fixable: %s" % k)

            if write_diff:
                options.stdout.write("---- %s ---- %s\n" % (k, status))
                if write_seqdiff:
                    options.stdout.write("< %s\n> %s\n" %
                                         (seqs1[k], seqs2[k]))

    for k in seqs2:
        if k not in found2:
            nmissed2 += 1
            if write_missed2:
                options.stdout.write("---- %s ---- %s\n" % (k, "missed2"))

    # One legend entry per line so that every emitted line starts with "#".
    options.stdlog.write("""# Legend:
# seqs1: number of sequences in set 1
# seqs2: number of sequences in set 2
# same: number of identical sequences
# diff: number of sequences with differences
# nmissed1: sequences in set 1 that are not found in set 2
# nmissed2: sequences in set 2 that are not found in set 1
# Type of sequence differences
# first: only the first residue is different
# last: only the last residue is different
# prefix: one sequence is prefix of the other
# selenocysteine: difference due to selenocysteines
# masked: difference due to masked residues
# fixed: fixed differences
# other: other differences
""")

    E.info("seqs1=%i, seqs2=%i, same=%i, ndiff=%i, nmissed1=%i, nmissed2=%i" %
           (len(seqs1), len(seqs2), nsame, ndiff, nmissed1, nmissed2))

    E.info(
        "ndiff=%i: first=%i, last=%i, prefix=%i, selenocysteine=%i, masked=%i, fixed=%i, other=%i"
        % (ndiff, ndiff_first, ndiff_last, ndiff_prefix,
           ndiff_selenocysteine, ndiff_masked, nfixed,
           ndiff - ndiff_first - ndiff_last - ndiff_prefix -
           ndiff_selenocysteine - ndiff_masked - nfixed))

    E.Stop()
def main(argv=None):
    """script main.

    Reads pairwise links from stdin and writes the aligned sequence pairs.
    Optionally the alignments are expanded from peptide to nucleotide
    coordinates, mapped through old-to-new maps, split by exon and
    filtered against previously written pairs.

    parses command line options in sys.argv, unless *argv* is given.

    Raises
    ------
    ValueError
        If an alignment extends beyond its sequences, or if codons are
        required and the number of aligned positions is not a multiple
        of three.
    IndexError
        If exon boundaries were read but a token of a link has none.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: links2fasta.py 2446 2009-01-27 16:32:35Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-s", "--sequences", dest="filename_sequences",
                      type="string",
                      help="peptide sequence [Default=%default]")

    parser.add_option("-f", "--format", dest="format", type="string",
                      help="output format [Default=%default]")

    parser.add_option(
        "-e", "--expand", dest="expand", action="store_true",
        help="expand positions from peptide to nucleotide alignment [Default=%default]")

    parser.add_option("-m", "--map", dest="filename_map", type="string",
                      help="map alignments [Default=%default]")

    parser.add_option("-c", "--codons", dest="require_codons",
                      action="store_true",
                      help="require codons [Default=%default]")

    parser.add_option(
        "--one-based-coordinates", dest="one_based_coordinates",
        action="store_true",
        help="expect one-based coordinates. The default are zero based coordinates [Default=%default].")

    parser.add_option("--no-identical", dest="no_identical",
                      action="store_true",
                      help="do not output identical pairs [Default=%default]")

    parser.add_option(
        "-g", "--no-gaps", dest="no_gaps", action="store_true",
        help="remove all gaps from aligned sequences [Default=%default]")

    parser.add_option("-x", "--exons", dest="filename_exons", type="string",
                      help="filename with exon boundaries [Default=%default]")

    parser.add_option("-o", "--outfile", dest="filename_outfile",
                      type="string",
                      help="filename to save links [Default=%default]")

    parser.add_option("--min-length", dest="min_length", type="int",
                      help="minimum length of alignment [Default=%default]")

    parser.add_option(
        "--filter", dest="filename_filter", type="string",
        help="given a set of previous alignments, only write new pairs [Default=%default].")

    parser.set_defaults(filename_sequences=None,
                        filename_exons=None,
                        filename_map=None,
                        filename_outfile=None,
                        no_gaps=False,
                        format="fasta",
                        expand=False,
                        require_codons=False,
                        no_identical=False,
                        min_length=0,
                        report_step=100,
                        one_based_coordinates=False,
                        filename_filter=None)

    # Hand argv on so that an explicitly supplied argument list is honoured.
    (options, args) = E.Start(parser, argv=argv, add_mysql_options=True)

    t0 = time.time()

    if options.filename_sequences:
        sequences = Genomics.ReadPeptideSequences(
            open(options.filename_sequences, "r"))
    else:
        sequences = {}

    if options.loglevel >= 1:
        options.stdlog.write("# read %i sequences\n" % len(sequences))
        sys.stdout.flush()

    if options.filename_exons:
        exons = Exons.ReadExonBoundaries(open(options.filename_exons, "r"))
    else:
        exons = {}

    if options.loglevel >= 1:
        options.stdlog.write("# read %i exons\n" % len(exons))
        sys.stdout.flush()

    map_old2new = {}
    if options.filename_map:
        with open(options.filename_map, "r") as map_file:
            for line in map_file:
                if line[0] == "#":
                    continue
                m = Map()
                m.read(line)
                map_old2new[m.mToken] = m

    if options.loglevel >= 1:
        options.stdlog.write("# read %i maps\n" % len(map_old2new))
        sys.stdout.flush()

    if options.filename_filter:
        if options.loglevel >= 1:
            options.stdlog.write("# reading filtering information.\n")
            sys.stdout.flush()

        map_pair2hids = {}

        if os.path.exists(options.filename_filter):

            infile = open(options.filename_filter, "r")

            iterator = FastaIterator.FastaIterator(infile)

            # Records come in consecutive pairs; the iterator signals the
            # end of input by returning None.
            while 1:
                cur_record = iterator.next()
                if cur_record is None:
                    break

                record1 = cur_record

                cur_record = iterator.next()
                if cur_record is None:
                    break

                record2 = cur_record

                identifier1 = re.match(r"(\S+)", record1.title).groups()[0]
                identifier2 = re.match(r"(\S+)", record2.title).groups()[0]

                pair_id = "%s-%s" % (identifier1, identifier2)
                s = Genomics.GetHID(record1.sequence + ";" + record2.sequence)

                if pair_id not in map_pair2hids:
                    map_pair2hids[pair_id] = []

                map_pair2hids[pair_id].append(s)

            infile.close()

        if options.loglevel >= 1:
            options.stdlog.write(
                "# read filtering information for %i pairs.\n"
                % len(map_pair2hids))
            sys.stdout.flush()
    else:
        map_pair2hids = None

    if options.loglevel >= 1:
        options.stdlog.write("# finished input in %i seconds.\n" %
                             (time.time() - t0))

    if options.filename_outfile:
        outfile = open(options.filename_outfile, "w")
    else:
        outfile = None

    map_row2col = alignlib_lite.py_makeAlignmentVector()
    tmp1_map_row2col = alignlib_lite.py_makeAlignmentVector()
    counts = {}

    iterations = 0

    t1 = time.time()
    ninput, nskipped, noutput = 0, 0, 0

    for link in BlastAlignments.iterator_links(sys.stdin):

        iterations += 1
        ninput += 1

        if options.loglevel >= 1:
            if (iterations % options.report_step == 0):
                options.stdlog.write("# iterations: %i in %i seconds.\n" %
                                     (iterations, time.time() - t1))
                sys.stdout.flush()

        if link.mQueryToken not in sequences or \
                link.mSbjctToken not in sequences:
            nskipped += 1
            continue

        if options.loglevel >= 3:
            options.stdlog.write("# read link %s\n" % str(link))

        row_seq = alignlib_lite.py_makeSequence(sequences[link.mQueryToken])
        col_seq = alignlib_lite.py_makeSequence(sequences[link.mSbjctToken])

        if options.one_based_coordinates:
            link.mQueryFrom -= 1
            link.mSbjctFrom -= 1

        if options.expand:
            link.mQueryFrom = link.mQueryFrom * 3
            link.mSbjctFrom = link.mSbjctFrom * 3
            link.mQueryAli = ScaleAlignment(link.mQueryAli, 3)
            link.mSbjctAli = ScaleAlignment(link.mSbjctAli, 3)

        map_row2col.clear()

        alignlib_lite.py_AlignmentFormatEmissions(
            link.mQueryFrom, link.mQueryAli,
            link.mSbjctFrom, link.mSbjctAli).copy(map_row2col)

        if link.mQueryToken in map_old2new:
            tmp1_map_row2col.clear()
            map_old2new[link.mQueryToken].expand()
            if options.loglevel >= 3:
                options.stdlog.write("# combining in row with %s\n" % str(
                    alignlib_lite.py_AlignmentFormatEmissions(
                        map_old2new[link.mQueryToken].mMapOld2New)))

            alignlib_lite.py_combineAlignment(
                tmp1_map_row2col,
                map_old2new[link.mQueryToken].mMapOld2New,
                map_row2col,
                alignlib_lite.py_RR)
            map_old2new[link.mQueryToken].clear()
            alignlib_lite.py_copyAlignment(map_row2col, tmp1_map_row2col)

        if link.mSbjctToken in map_old2new:
            tmp1_map_row2col.clear()
            map_old2new[link.mSbjctToken].expand()
            if options.loglevel >= 3:
                options.stdlog.write("# combining in col with %s\n" % str(
                    alignlib_lite.py_AlignmentFormatEmissions(
                        map_old2new[link.mSbjctToken].mMapOld2New)))

            alignlib_lite.py_combineAlignment(
                tmp1_map_row2col,
                map_row2col,
                map_old2new[link.mSbjctToken].mMapOld2New,
                alignlib_lite.py_CR)
            map_old2new[link.mSbjctToken].clear()
            alignlib_lite.py_copyAlignment(map_row2col, tmp1_map_row2col)

        dr = row_seq.getLength() - map_row2col.getRowTo()
        dc = col_seq.getLength() - map_row2col.getColTo()
        if dr < 0 or dc < 0:
            raise ValueError(
                "out of bounds alignment: %s-%s: alignment out of bounds. row=%i col=%i ali=%s"
                % (link.mQueryToken,
                   link.mSbjctToken,
                   row_seq.getLength(),
                   col_seq.getLength(),
                   str(alignlib_lite.py_AlignmentFormatEmissions(
                       map_row2col))))

        if options.loglevel >= 2:
            options.stdlog.write(
                str(alignlib_lite.py_AlignmentFormatExplicit(
                    map_row2col, row_seq, col_seq)) + "\n")

        # check for incomplete codons
        if options.require_codons:

            naligned = map_row2col.getNumAligned()

            # turned off, while fixing alignlib_lite
            if naligned % 3 != 0:
                options.stdlog.write("# %s\n" % str(map_row2col))
                options.stdlog.write("# %s\n" % str(link))
                # A token need not have a map; .get() keeps the diagnostics
                # from raising KeyError before the ValueError below.
                options.stdlog.write(
                    "# %s\n" % str(map_old2new.get(link.mQueryToken)))
                options.stdlog.write(
                    "# %s\n" % str(map_old2new.get(link.mSbjctToken)))
                options.stdlog.write(
                    "#\n%s\n" % alignlib_lite.py_AlignmentFormatExplicit(
                        map_row2col, row_seq, col_seq))

                raise ValueError(
                    "incomplete codons %i in pair %s - %s" %
                    (naligned, link.mQueryToken, link.mSbjctToken))

        # if so desired, write on a per exon level:
        if exons:
            if link.mQueryToken not in exons:
                raise IndexError("%s not found in exons" %
                                 (link.mQueryToken))
            if link.mSbjctToken not in exons:
                raise IndexError("%s not found in exons" %
                                 (link.mSbjctToken))
            exons1 = exons[link.mQueryToken]
            exons2 = exons[link.mSbjctToken]

            # Get overlapping segments
            segments = Exons.MatchExons(map_row2col, exons1, exons2)

            for a, b in segments:
                tmp1_map_row2col.clear()

                # make sure you got codon boundaries. Note that frameshifts
                # in previous exons will cause the codons to start at
                # positions different from mod 3. The problem is that I
                # don't know where the frameshifts occur exactly. The exon
                # boundaries are given with respect to the cds, which
                # include the frame shifts. Unfortunately, phase
                # information seems to be incomplete in the input files.
                from1, to1 = GetAdjustedBoundaries(a, exons1)
                from2, to2 = GetAdjustedBoundaries(b, exons2)

                alignlib_lite.py_copyAlignment(tmp1_map_row2col, map_row2col,
                                               from1 + 1, to1,
                                               from2 + 1, to2)

                mode = Write(tmp1_map_row2col, row_seq, col_seq, link,
                             no_gaps=options.no_gaps,
                             no_identical=options.no_identical,
                             min_length=options.min_length,
                             suffix1="_%s" % str(a),
                             suffix2="_%s" % str(b),
                             outfile=outfile,
                             pair_filter=map_pair2hids,
                             format=options.format)

                if mode not in counts:
                    counts[mode] = 0
                counts[mode] += 1

        else:
            mode = Write(map_row2col, row_seq, col_seq, link,
                         min_length=options.min_length,
                         no_gaps=options.no_gaps,
                         no_identical=options.no_identical,
                         outfile=outfile,
                         pair_filter=map_pair2hids,
                         format=options.format)

            if mode not in counts:
                counts[mode] = 0
            counts[mode] += 1

        noutput += 1

    if outfile:
        outfile.close()

    if options.loglevel >= 1:
        options.stdlog.write("# %s\n" % ", ".join(
            "%s=%i" % item for item in counts.items()))
        options.stdlog.write("# ninput=%i, noutput=%i, nskipped=%i\n" %
                             (ninput, noutput, nskipped))

    E.Stop()
def Alignment2DNA(alignment, query_from=0, sbjct_from=0):
    """convert a peptide2genome alignment to a nucleotide2nucleotide
    alignment.

    Instead of peptide coordinates, the alignment will be in codon
    coordinates.

    Arguments
    ---------
    alignment : list
        List of ``(state, l_query, l_sbjct)`` tuples.
    query_from : int
        Start position of alignment on peptide sequence.
    sbjct_from : int
        Start position of alignment on nucleotide sequence.

    Returns
    -------
    alignment : object
        The alignment vector created with
        ``alignlib_lite.py_makeAlignmentVector`` with one diagonal added
        for every segment that advances both query and sbjct.

    """
    # Query length, in nucleotides, that replaces ``l_query * 3`` for the
    # single-letter codon-part states.
    partial_codon_length = {"A": 0, "B": 1, "C": 2,
                            "a": 0, "b": 2, "c": 1}

    map_query2sbjct = alignlib_lite.py_makeAlignmentVector()

    # count in nucleotides for query
    query_pos = query_from * 3
    sbjct_pos = sbjct_from

    for state, l_query, l_sbjct in alignment:

        # count as nucleotides
        l_query *= 3

        if state in partial_codon_length:
            l_query = partial_codon_length[state]
        elif state == "S":
            l_query = l_sbjct

        if l_query > 0 and l_sbjct > 0:
            alignlib_lite.py_addDiagonal2Alignment(map_query2sbjct,
                                                   query_pos,
                                                   query_pos + l_query,
                                                   sbjct_pos - query_pos)

        query_pos += l_query
        sbjct_pos += l_sbjct

    return map_query2sbjct
def Alignment2PeptideAlignment(alignment,
                               query_from=0,
                               sbjct_from=0,
                               genomic_sequence=None):
    """convert a Peptide2DNA alignment to a Peptide2Peptide alignment.

    Each ``(state, l_query, l_sbjct)`` segment advances the query, the
    translated sbjct and the genomic position. States ``M`` and ``G``
    advance the sbjct by ``l_sbjct // 3`` residues, ``S`` by one residue
    when ``l_query`` is non-zero, and ``P`` advances the query only.
    Segments that advance both query and sbjct are added as diagonals.

    If *genomic_sequence* is given, the nucleotides covered by the
    segments are collected and translated with ``MapCodon2AA``.

    How to handle frameshifts?

    Returns
    -------
    tuple
        ``(alignment, translation)`` where *translation* is the string of
        translated residues (empty without *genomic_sequence*).
    """
    map_query2sbjct = alignlib_lite.py_makeAlignmentVector()

    query_pos = query_from
    sbjct_pos = 0
    sbjct_genome_pos = sbjct_from
    sbjct_residues = []
    codon = ""

    for state, l_query, l_sbjct in alignment:

        query_increment = 0
        sbjct_increment = 0

        if state == "M":
            query_increment = l_query
            # floor division keeps residue positions integral on both
            # Python 2 and Python 3
            sbjct_increment = l_sbjct // 3
            if genomic_sequence:
                codon = genomic_sequence[
                    sbjct_genome_pos:sbjct_genome_pos + l_sbjct]

        elif state == "S":
            if l_query:
                sbjct_increment = 1
                query_increment = 1

            # nucleotides are collected for every "S" segment, so that
            # the parts of a codon end up in one string
            if genomic_sequence:
                codon += genomic_sequence[
                    sbjct_genome_pos:sbjct_genome_pos + l_sbjct]

        elif state == "G":
            query_increment = l_query
            sbjct_increment = l_sbjct // 3
            if genomic_sequence:
                codon += genomic_sequence[
                    sbjct_genome_pos:sbjct_genome_pos + l_sbjct]

        elif state == "P":
            # only increment query, sbjct does not advance.
            query_increment = l_query

        if query_increment and sbjct_increment:
            alignlib_lite.py_addDiagonal2Alignment(
                map_query2sbjct,
                query_pos, query_pos + query_increment,
                sbjct_pos - query_pos)

        if sbjct_increment and genomic_sequence:
            for x in range(0, len(codon), 3):
                sbjct_residues.append(MapCodon2AA(codon[x:x + 3]))
            codon = ""

        query_pos += query_increment
        sbjct_pos += sbjct_increment
        sbjct_genome_pos += l_sbjct

    return map_query2sbjct, "".join(sbjct_residues)
def read(self, line): data = string.split(line[:-1], "\t") if len(data) == 26: ( self.mPredictionId, self.mQueryToken, self.mSbjctToken, self.mSbjctStrand, self.mRank, self.score, self.mQueryFrom, self.mQueryTo, self.mQueryAli, self.mSbjctFrom, self.mSbjctTo, self.mSbjctAli, self.mQueryLength, self.mQueryCoverage, self.mNGaps, self.mNFrameShifts, self.mNIntrons, self.mNSplits, self.mNStopCodons, self.mPercentIdentity, self.mPercentSimilarity, self.mTranslation, self.mSbjctGenomeFrom, self.mSbjctGenomeTo, self.mAlignmentString, self.mNAssembled, ) = data elif len(data) == 25: ( self.mPredictionId, self.mQueryToken, self.mSbjctToken, self.mSbjctStrand, self.mRank, self.score, self.mQueryFrom, self.mQueryTo, self.mQueryAli, self.mSbjctFrom, self.mSbjctTo, self.mSbjctAli, self.mQueryLength, self.mQueryCoverage, self.mNGaps, self.mNFrameShifts, self.mNIntrons, self.mNSplits, self.mNStopCodons, self.mPercentIdentity, self.mPercentSimilarity, self.mTranslation, self.mSbjctGenomeFrom, self.mSbjctGenomeTo, self.mAlignmentString, ) = data elif len(data) == 24: ( self.mQueryToken, self.mSbjctToken, self.mSbjctStrand, self.mRank, self.score, self.mQueryFrom, self.mQueryTo, self.mQueryAli, self.mSbjctFrom, self.mSbjctTo, self.mSbjctAli, self.mQueryLength, self.mQueryCoverage, self.mNGaps, self.mNFrameShifts, self.mNIntrons, self.mNSplits, self.mNStopCodons, self.mPercentIdentity, self.mPercentSimilarity, self.mTranslation, self.mSbjctGenomeFrom, self.mSbjctGenomeTo, self.mAlignmentString, ) = data elif len(data) == 23: ( self.mQueryToken, self.mSbjctToken, self.mSbjctStrand, self.mRank, self.score, self.mQueryFrom, self.mQueryTo, self.mQueryAli, self.mSbjctFrom, self.mSbjctTo, self.mSbjctAli, self.mQueryLength, self.mQueryCoverage, self.mNGaps, self.mNFrameShifts, self.mNIntrons, self.mNSplits, self.mNStopCodons, self.mPercentIdentity, self.mPercentSimilarity, self.mTranslation, self.mSbjctGenomeFrom, self.mSbjctGenomeTo, ) = data self.mAlignmentString = "" else: raise 
ValueError, "unknown format: %i fields in line %s" % ( len(data), line[:-1]) (self.score, self.mQueryCoverage, self.mPercentIdentity, self.mPercentSimilarity) = map( float, (self.score, self.mQueryCoverage, self.mPercentIdentity, self.mPercentSimilarity)) (self.mPredictionId, self.mQueryFrom, self.mQueryTo, self.mQueryLength, self.mSbjctFrom, self.mSbjctTo, self.mSbjctGenomeFrom, self.mSbjctGenomeTo, self.mNGaps, self.mNIntrons, self.mNSplits, self.mNStopCodons, self.mNFrameShifts, self.mNAssembled) = map( int, (self.mPredictionId, self.mQueryFrom, self.mQueryTo, self.mQueryLength, self.mSbjctFrom, self.mSbjctTo, self.mSbjctGenomeFrom, self.mSbjctGenomeTo, self.mNGaps, self.mNIntrons, self.mNSplits, self.mNStopCodons, self.mNFrameShifts, self.mNAssembled)) if self.mExpand: self.mMapPeptide2Translation = alignlib_lite.py_makeAlignmentVector( ) if self.mQueryAli != "" and self.mSbjctAli != "": alignlib_lite.py_AlignmentFormatExplicit( self.mQueryFrom, self.mQueryAli, self.mSbjctFrom, self.mSbjctAli).copy(self.mMapPeptide2Translation) self.mMapPeptide2Genome = Genomics.String2Alignment( self.mAlignmentString)
def main(argv=None):
    """script main.

    Reads two multiple alignments, builds a profile from each, aligns the
    two profiles and writes the merged multiple alignment to stdout.

    parses command line options in sys.argv, unless *argv* is given.
    """
    argv = argv or sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: cgat_script_template.py 2871 2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option(
        "-o", "--gop", dest="gop", type="float",
        help="gap opening penalty [default=%default].")
    parser.add_option(
        "-e", "--gep", dest="gep", type="float",
        help="gap extension penalty [default=%default].")
    parser.add_option(
        "-m", "--mode", dest="mode", type="choice",
        choices=("global", "local"),
        help="alignment mode, global=nw, local=sw [default=%default].")

    parser.set_defaults(gop=-12.0, gep=-2.0, format="fasta", mode="local")

    # add common options (-h/--help, ...) and parse command line
    options, args = E.Start(parser, argv=argv)

    if len(args) != 2:
        raise ValueError(
            "please supply two multiple alignments in FASTA format.")

    first_mali, second_mali = Mali.Mali(), Mali.Mali()

    E.info("read 2 multiple alignments")

    for mali, filename in ((first_mali, args[0]), (second_mali, args[1])):
        mali.readFromFile(IOTools.openFile(filename, "r"),
                          format=options.format)

    first_converted = Mali.convertMali2Alignlib(first_mali)
    second_converted = Mali.convertMali2Alignlib(second_mali)

    if options.mode == "local":
        mode = alignlib_lite.py_ALIGNMENT_LOCAL
    elif options.mode == "global":
        mode = alignlib_lite.py_ALIGNMENT_GLOBAL

    alignator = alignlib_lite.py_makeAlignatorDPFull(
        mode, options.gop, options.gep)

    # library-wide defaults used when the profiles are built
    encoder = alignlib_lite.py_getEncoder(alignlib_lite.py_Protein20)
    alignlib_lite.py_setDefaultEncoder(encoder)
    logoddor = alignlib_lite.py_makeLogOddorDirichlet(0.3)
    alignlib_lite.py_setDefaultLogOddor(logoddor)
    regularizor = alignlib_lite.py_makeRegularizorDirichletPrecomputed()
    alignlib_lite.py_setDefaultRegularizor(regularizor)

    first_profile = alignlib_lite.py_makeProfile(first_converted)
    second_profile = alignlib_lite.py_makeProfile(second_converted)

    result = alignlib_lite.py_makeAlignmentVector()
    alignator.align(result, first_profile, second_profile)

    E.debug("result=\n%s" % alignlib_lite.py_AlignmentFormatEmissions(result))

    first_converted.add(second_converted, result)

    identifiers = first_mali.getIdentifiers() + second_mali.getIdentifiers()
    outmali = Mali.convertAlignlib2Mali(first_converted,
                                        identifiers=identifiers)

    outmali.writeToFile(options.stdout, format=options.format)

    # write footer and output benchmark information.
    E.Stop()
def Add(self,
        const_other,
        combine_contig=False,
        allow_overlap=False,
        contig_size=0,
        combine_queries=False,
        as_intron=False):
    """add one entry to another.

    This procedure allows to add

    - predictions on different contigs if combine_contig = True
    - overlapping predictions on the same query if allow_overlap = True
    - results from different queries if combine_queries = True
    - if as_intron is set to true, the new fragment is added as an intron.

    The work is done on expanded copies of both entries; *const_other* is
    not modified. The combined result is written back into *self*.

    Raises
    ------
    ValueError
        If the entries overlap on the query and *allow_overlap* is false,
        if the queries differ and *combine_queries* is false, or if the
        sbjct differs or overlaps and *combine_contig* is false.
    """
    # create working copies of each prediction
    other = const_other.getCopy()
    this = self.getCopy()

    other.Expand()
    this.Expand()

    # state code of the segment that bridges the two entries
    code = "I" if as_intron else "P"

    # Stays 0 when the queries differ, so the different-contig join below
    # can always use it.
    query_overlap = 0

    # check for query overlaps
    if this.mQueryToken == other.mQueryToken:

        query_overlap = max(
            0,
            min(this.mQueryTo, other.mQueryTo) -
            max(this.mQueryFrom, other.mQueryFrom) + 1)

        if query_overlap > 0:

            if allow_overlap:
                overlap = query_overlap
                # if queries overlap, truncate this before adding the other
                this.mMapPeptide2Translation.removeRowRegion(
                    this.mQueryTo - overlap + 1, this.mQueryTo)
                other.mMapPeptide2Translation.moveAlignment(0, -overlap)
                this.mQueryTo -= overlap
                this.mTranslation = this.mTranslation[:-overlap]

                # remove aligned residues from the back
                for x in range(len(this.mMapPeptide2Genome) - 1, 0, -1):
                    if this.mMapPeptide2Genome[x][1] <= overlap:
                        overlap -= this.mMapPeptide2Genome[x][1]
                        del this.mMapPeptide2Genome[x]
                    else:
                        break

                # shorten the last remaining segment by what is left
                this.mMapPeptide2Genome[-1] = (
                    this.mMapPeptide2Genome[-1][0],
                    this.mMapPeptide2Genome[-1][1] - overlap,
                    this.mMapPeptide2Genome[-1][2] - overlap * 3)
            else:
                raise ValueError(
                    "refusing to add overlapping entries: overlap = %i, queries:\n%s\n%s\n, set allow_overlap = True " % (
                        query_overlap, str(this), str(other)))
    else:
        if not combine_queries:
            raise ValueError(
                "refusing to add different queries - set combine_queries = True.")

    if this.mSbjctToken != other.mSbjctToken or \
            this.mSbjctStrand != other.mSbjctStrand:
        if combine_contig:
            this.mSbjctToken += "-" + other.mSbjctToken
            this.mSbjctStrand += other.mSbjctStrand
        else:
            raise ValueError("can not add different sbjct.")

    sbjct_overlap = max(
        0,
        min(this.mSbjctGenomeTo, other.mSbjctGenomeTo) -
        max(this.mSbjctGenomeFrom, other.mSbjctGenomeFrom))

    if sbjct_overlap > 0:
        if not combine_contig:
            raise ValueError(
                "refusing to add overlapping entries: overlap = %i, sbjct:\n%s\n%s\n" % (
                    sbjct_overlap, str(this), str(other)))

    if this.mSbjctToken == other.mSbjctToken:

        # set precedence
        if this.mSbjctGenomeFrom < other.mSbjctGenomeFrom:
            first = this
            second = other
        else:
            first = other
            second = this

        # get length of gap
        d_na = second.mSbjctGenomeFrom - first.mSbjctGenomeTo

        if this.mQueryToken != other.mQueryToken:
            d_aa = first.mQueryLength - first.mQueryTo
            # create a new virtual query by concatenating
            # the two queries
            this.mQueryToken += "-" + other.mQueryToken
            # sort out the alignment
            second.mMapPeptide2Translation.moveAlignment(
                first.mQueryLength, 0)
            this.mQueryLength = first.mQueryLength + second.mQueryLength
        else:
            d_aa = second.mQueryFrom - first.mQueryTo - 1

        this.mSbjctGenomeFrom = min(this.mSbjctGenomeFrom,
                                    other.mSbjctGenomeFrom)
        this.mSbjctGenomeTo = max(this.mSbjctGenomeTo,
                                  other.mSbjctGenomeTo)

        this.mMapPeptide2Genome = first.mMapPeptide2Genome + \
            [(code, d_aa, d_na)] + second.mMapPeptide2Genome
        this.mTranslation = first.mTranslation + second.mTranslation

        second.mMapPeptide2Translation.moveAlignment(0, first.mSbjctTo - 1)

    else:
        # join on different contigs
        # NOTE(review): ``first`` was unbound on this path; ``this`` is
        # joined in front of ``other`` here, so it is taken as the first
        # entry - confirm against callers.
        first = this

        d_na = contig_size - this.mSbjctGenomeTo + \
            other.mSbjctGenomeFrom + query_overlap * 3
        d_aa = other.mQueryFrom - this.mQueryTo - 1
        this.mMapPeptide2Genome += [(code, d_aa, d_na), ] + \
            other.mMapPeptide2Genome
        this.mTranslation += other.mTranslation

        other.mMapPeptide2Translation.moveAlignment(0, this.mSbjctTo - 1)

        this.mSbjctGenomeTo = contig_size + other.mSbjctGenomeTo

    # now fill self from first and this
    # NOTE(review): token and genome map are taken from ``first`` while
    # the combined values were stored on ``this`` - confirm this is
    # intended when ``first`` is ``other``.
    self.mQueryToken = first.mQueryToken
    self.mQueryLength = this.mQueryLength

    nthis = this.mMapPeptide2Translation.getLength() - \
        this.mMapPeptide2Translation.getNumGaps()
    nother = other.mMapPeptide2Translation.getLength() - \
        other.mMapPeptide2Translation.getNumGaps()

    self.mMapPeptide2Genome = first.mMapPeptide2Genome
    self.mSbjctGenomeFrom = this.mSbjctGenomeFrom
    self.mSbjctGenomeTo = this.mSbjctGenomeTo

    # there might be some reference counting issues, thus
    # do it the explicit way.
    alignlib_lite.py_addAlignment2Alignment(this.mMapPeptide2Translation,
                                            other.mMapPeptide2Translation)
    self.mMapPeptide2Translation = alignlib_lite.py_makeAlignmentVector()
    alignlib_lite.py_addAlignment2Alignment(self.mMapPeptide2Translation,
                                            this.mMapPeptide2Translation)

    self.mTranslation = this.mTranslation

    self.mQueryFrom = self.mMapPeptide2Translation.getRowFrom()
    self.mQueryTo = self.mMapPeptide2Translation.getRowTo()
    self.mSbjctFrom = self.mMapPeptide2Translation.getColFrom()
    self.mSbjctTo = self.mMapPeptide2Translation.getColTo()

    self.mQueryCoverage = 100.0 * \
        (self.mQueryTo - self.mQueryFrom + 1) / float(self.mQueryLength)

    # all segments, and all fields within a segment, separated by blanks
    self.mAlignmentString = " ".join(
        " ".join(map(str, segment)) for segment in self.mMapPeptide2Genome)

    f = alignlib_lite.py_AlignmentFormatEmissions(
        self.mMapPeptide2Translation)

    self.mQueryAli, self.mSbjctAli = f.mRowAlignment, f.mColAlignment

    # summary parameters
    self.mRank = max(this.mRank, other.mRank)
    self.score += other.score
    self.mNGaps += other.mNGaps
    self.mNFrameShifts += other.mNFrameShifts
    self.mNIntrons += other.mNIntrons + 1
    self.mNStopCodons += other.mNStopCodons

    nnew = self.mMapPeptide2Translation.getLength() - \
        self.mMapPeptide2Translation.getNumGaps()

    # percentages are averaged, weighted by the aligned length of each part
    self.mPercentIdentity = min(
        100.0,
        (self.mPercentIdentity * nthis +
         other.mPercentIdentity * nother) / nnew)
    self.mPercentSimilarity = min(
        100.0,
        (self.mPercentSimilarity * nthis +
         other.mPercentSimilarity * nother) / nnew)

    self.mNAssembled += 1 + other.mNAssembled
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads pairwise links from stdin and writes each alignment via ``Write``,
    either as a whole or, if exon boundaries were supplied with ``--exons``,
    once per pair of overlapping exons.  Optional inputs are peptide
    sequences, exon boundaries, old-to-new coordinate maps and a fasta file
    with previously written pairs used as a filter.

    Raises ValueError for alignments extending beyond the sequences or,
    with ``--codons``, for alignments with incomplete codons.  Raises
    IndexError if a token has no entry in the exon table.
    """

    # NOTE(review): argv is resolved here but not handed to E.Start below;
    # confirm whether E.Start should receive it.
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: links2fasta.py 2446 2009-01-27 16:32:35Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-s", "--sequences", dest="filename_sequences", type="string",
                      help="peptide sequence [Default=%default]")

    parser.add_option("-f", "--format", dest="format", type="string",
                      help="output format [Default=%default]")

    parser.add_option("-e", "--expand", dest="expand", action="store_true",
                      help="expand positions from peptide to nucleotide alignment [Default=%default]")

    parser.add_option("-m", "--map", dest="filename_map", type="string",
                      help="map alignments [Default=%default]")

    parser.add_option("-c", "--codons", dest="require_codons", action="store_true",
                      help="require codons [Default=%default]")

    parser.add_option("--one-based-coordinates", dest="one_based_coordinates", action="store_true",
                      help="expect one-based coordinates. The default are zero based coordinates [Default=%default].")

    parser.add_option("--no-identical", dest="no_identical", action="store_true",
                      help="do not output identical pairs [Default=%default]")

    parser.add_option("-g", "--no-gaps", dest="no_gaps", action="store_true",
                      help="remove all gaps from aligned sequences [Default=%default]")

    parser.add_option("-x", "--exons", dest="filename_exons", type="string",
                      help="filename with exon boundaries [Default=%default]")

    parser.add_option("-o", "--outfile", dest="filename_outfile", type="string",
                      help="filename to save links [Default=%default]")

    parser.add_option("--min-length", dest="min_length", type="int",
                      help="minimum length of alignment [Default=%default]")

    parser.add_option("--filter", dest="filename_filter", type="string",
                      help="given a set of previous alignments, only write new pairs [Default=%default].")

    parser.set_defaults(
        filename_sequences=None,
        filename_exons=None,
        filename_map=None,
        filename_outfile=None,
        no_gaps=False,
        format="fasta",
        expand=False,
        require_codons=False,
        no_identical=False,
        min_length=0,
        report_step=100,
        one_based_coordinates=False,
        filename_filter=None)

    (options, args) = E.Start(parser, add_mysql_options=True)

    t0 = time.time()

    # ------------------------------------------------------------------
    # read input tables; files are closed as soon as they have been read
    if options.filename_sequences:
        with open(options.filename_sequences, "r") as infile:
            sequences = Genomics.ReadPeptideSequences(infile)
    else:
        sequences = {}

    if options.loglevel >= 1:
        options.stdlog.write("# read %i sequences\n" % len(sequences))
        sys.stdout.flush()

    if options.filename_exons:
        with open(options.filename_exons, "r") as infile:
            exons = Exons.ReadExonBoundaries(infile)
    else:
        exons = {}

    if options.loglevel >= 1:
        options.stdlog.write("# read %i exons\n" % len(exons))
        sys.stdout.flush()

    map_old2new = {}
    if options.filename_map:
        with open(options.filename_map, "r") as infile:
            for line in infile:
                if line[0] == "#":
                    continue
                m = Map()
                m.read(line)
                map_old2new[m.mToken] = m

    if options.loglevel >= 1:
        options.stdlog.write("# read %i maps\n" % len(map_old2new))
        sys.stdout.flush()

    if options.filename_filter:

        if options.loglevel >= 1:
            options.stdlog.write("# reading filtering information.\n")
            sys.stdout.flush()

        map_pair2hids = {}

        if os.path.exists(options.filename_filter):

            with open(options.filename_filter, "r") as infile:

                iterator = FastaIterator.FastaIterator(infile)

                # records come in consecutive pairs; stop at the first
                # missing record.
                while 1:
                    record1 = iterator.next()
                    if record1 is None:
                        break

                    record2 = iterator.next()
                    if record2 is None:
                        break

                    identifier1 = re.match(r"(\S+)", record1.title).groups()[0]
                    identifier2 = re.match(r"(\S+)", record2.title).groups()[0]

                    pair_id = "%s-%s" % (identifier1, identifier2)
                    hid = Genomics.GetHID(
                        record1.sequence + ";" + record2.sequence)

                    map_pair2hids.setdefault(pair_id, []).append(hid)

        if options.loglevel >= 1:
            options.stdlog.write(
                "# read filtering information for %i pairs.\n" % len(map_pair2hids))
            sys.stdout.flush()
    else:
        map_pair2hids = None

    if options.loglevel >= 1:
        options.stdlog.write(
            "# finished input in %i seconds.\n" % (time.time() - t0))

    if options.filename_outfile:
        outfile = open(options.filename_outfile, "w")
    else:
        outfile = None

    map_row2col = alignlib_lite.py_makeAlignmentVector()
    tmp1_map_row2col = alignlib_lite.py_makeAlignmentVector()
    counts = {}

    iterations = 0

    t1 = time.time()
    ninput, nskipped, noutput = 0, 0, 0

    for link in BlastAlignments.iterator_links(sys.stdin):

        iterations += 1
        ninput += 1

        if options.loglevel >= 1:
            if (iterations % options.report_step == 0):
                options.stdlog.write(
                    "# iterations: %i in %i seconds.\n" % (iterations, time.time() - t1))
                sys.stdout.flush()

        # links without sequence for either partner are skipped
        if link.mQueryToken not in sequences or \
           link.mSbjctToken not in sequences:
            nskipped += 1
            continue

        if options.loglevel >= 3:
            options.stdlog.write("# read link %s\n" % str(link))

        row_seq = alignlib_lite.py_makeSequence(sequences[link.mQueryToken])
        col_seq = alignlib_lite.py_makeSequence(sequences[link.mSbjctToken])

        if options.one_based_coordinates:
            link.mQueryFrom -= 1
            link.mSbjctFrom -= 1

        if options.expand:
            link.mQueryFrom = link.mQueryFrom * 3
            link.mSbjctFrom = link.mSbjctFrom * 3
            link.mQueryAli = ScaleAlignment(link.mQueryAli, 3)
            link.mSbjctAli = ScaleAlignment(link.mSbjctAli, 3)

        map_row2col.clear()

        alignlib_lite.py_AlignmentFormatEmissions(
            link.mQueryFrom, link.mQueryAli,
            link.mSbjctFrom, link.mSbjctAli).copy(map_row2col)

        # apply old-to-new maps to the row (query) coordinates
        if link.mQueryToken in map_old2new:
            tmp1_map_row2col.clear()
            map_old2new[link.mQueryToken].expand()
            if options.loglevel >= 3:
                options.stdlog.write("# combining in row with %s\n" %
                                     str(alignlib_lite.py_AlignmentFormatEmissions(map_old2new[link.mQueryToken].mMapOld2New)))

            alignlib_lite.py_combineAlignment(tmp1_map_row2col,
                                              map_old2new[
                                                  link.mQueryToken].mMapOld2New,
                                              map_row2col,
                                              alignlib_lite.py_RR)
            map_old2new[link.mQueryToken].clear()
            alignlib_lite.py_copyAlignment(map_row2col, tmp1_map_row2col)

        # apply old-to-new maps to the column (sbjct) coordinates
        if link.mSbjctToken in map_old2new:
            tmp1_map_row2col.clear()
            map_old2new[link.mSbjctToken].expand()
            if options.loglevel >= 3:
                options.stdlog.write("# combining in col with %s\n" %
                                     str(alignlib_lite.py_AlignmentFormatEmissions(map_old2new[link.mSbjctToken].mMapOld2New)))

            alignlib_lite.py_combineAlignment(tmp1_map_row2col,
                                              map_row2col,
                                              map_old2new[
                                                  link.mSbjctToken].mMapOld2New,
                                              alignlib_lite.py_CR)
            map_old2new[link.mSbjctToken].clear()
            alignlib_lite.py_copyAlignment(map_row2col, tmp1_map_row2col)

        dr = row_seq.getLength() - map_row2col.getRowTo()
        dc = col_seq.getLength() - map_row2col.getColTo()
        if dr < 0 or dc < 0:
            raise ValueError("out of bounds alignment: %s-%s: alignment out of bounds. row=%i col=%i ali=%s" %
                             (link.mQueryToken,
                              link.mSbjctToken,
                              row_seq.getLength(),
                              col_seq.getLength(),
                              str(alignlib_lite.py_AlignmentFormatEmissions(map_row2col))))

        if options.loglevel >= 2:
            options.stdlog.write(
                str(alignlib_lite.py_AlignmentFormatExplicit(map_row2col,
                                                             row_seq,
                                                             col_seq)) + "\n")

        # check for incomplete codons
        if options.require_codons:

            naligned = map_row2col.getNumAligned()

            # turned off, while fixing alignlib_lite
            if naligned % 3 != 0:
                options.stdlog.write("# %s\n" % str(map_row2col))
                options.stdlog.write("# %s\n" % str(link))
                # .get(): a token without a map entry must not raise a
                # KeyError that hides the ValueError raised below.
                options.stdlog.write(
                    "# %s\n" % str(map_old2new.get(link.mQueryToken)))
                options.stdlog.write(
                    "# %s\n" % str(map_old2new.get(link.mSbjctToken)))
                options.stdlog.write("#\n%s\n" % alignlib_lite.py_AlignmentFormatExplicit(map_row2col,
                                                                                          row_seq,
                                                                                          col_seq))

                raise ValueError("incomplete codons %i in pair %s - %s" %
                                 (naligned, link.mQueryToken, link.mSbjctToken))

        # if so desired, write on a per exon level:
        if exons:
            if link.mQueryToken not in exons:
                raise IndexError("%s not found in exons" % (link.mQueryToken))
            if link.mSbjctToken not in exons:
                raise IndexError("%s not found in exons" % (link.mSbjctToken))
            exons1 = exons[link.mQueryToken]
            exons2 = exons[link.mSbjctToken]

            # Get overlapping segments
            segments = Exons.MatchExons(map_row2col, exons1, exons2)

            for a, b in segments:
                tmp1_map_row2col.clear()

                # make sure you got codon boundaries. Note that frameshifts
                # in previous exons will cause the codons to start at positions
                # different from mod 3. The problem is that I don't know where
                # the frameshifts occur exactly. The exon boundaries are given
                # with respect to the cds, which include the frame shifts.
                # Unfortunately, phase information seems to be incomplete in
                # the input files.
                from1, to1 = GetAdjustedBoundaries(a, exons1)
                from2, to2 = GetAdjustedBoundaries(b, exons2)

                alignlib_lite.py_copyAlignment(tmp1_map_row2col, map_row2col,
                                               from1 + 1, to1, from2 + 1, to2)

                # fixed: the filter table is called map_pair2hids; the
                # former name map_pair2hid was undefined.
                mode = Write(tmp1_map_row2col, row_seq, col_seq, link,
                             no_gaps=options.no_gaps,
                             no_identical=options.no_identical,
                             min_length=options.min_length,
                             suffix1="_%s" % str(a),
                             suffix2="_%s" % str(b),
                             outfile=outfile,
                             pair_filter=map_pair2hids,
                             format=options.format)

                counts[mode] = counts.get(mode, 0) + 1

        else:
            mode = Write(map_row2col, row_seq, col_seq, link,
                         min_length=options.min_length,
                         no_gaps=options.no_gaps,
                         no_identical=options.no_identical,
                         outfile=outfile,
                         pair_filter=map_pair2hids,
                         format=options.format)

            counts[mode] = counts.get(mode, 0) + 1

        noutput += 1

    if outfile:
        outfile.close()

    if options.loglevel >= 1:
        options.stdlog.write("# %s\n" % ", ".join(
            "%s=%i" % (mode, n) for mode, n in counts.items()))
        options.stdlog.write("# ninput=%i, noutput=%i, nskipped=%i\n" %
                             (ninput, noutput, nskipped))

    E.Stop()
def AlignCodonBased(seq_wobble, seq_cds, seq_peptide, map_p2c, options,
                    diag_width=2, max_advance=2):
    """Align a wobble sequence to a cds sequence, position by position.

    Walks through seq_wobble (index x) and seq_cds (index y) in parallel
    and adds one aligned pair per matching position to map_p2c.  When a
    position does not score positively, a window around it is re-aligned
    with AlignExhaustive and the pairs in that window are replaced.
    Takes care of frameshifts.

    NOTE(review): the original docstring stated that everything is in
    one-based coordinates due to alignlib, but x and y start at 0 below -
    confirm which convention applies.

    Arguments:
        seq_wobble: sequence object of the wobble (back-translated) peptide.
        seq_cds: sequence object of the cds.
        seq_peptide: sequence object of the peptide; only used for debug
            output.
        map_p2c: alignment object; cleared first, then filled in place.
        options: provides loglevel and stdlog for diagnostics and is passed
            on to AlignExhaustive.
        diag_width, max_advance: not referenced in the function body.

    Raises:
        ValueError: if the same window is about to be re-aligned twice in a
            row, if a re-aligned or regular pair scores negative, or if the
            window alignment is too short and more than the last codon is
            missing.
    """

    map_p2c.clear()

    # NOTE(review): gop and gep are assigned but never used below.
    gop, gep = -1.0, -1.0
    matrix = alignlib_lite.py_makeSubstitutionMatrixBackTranslation(
        1, -10, 1, alignlib_lite.py_getDefaultEncoder())

    pep_seq = seq_peptide.asString()
    cds_seq = seq_cds.asString()
    wobble_seq = seq_wobble.asString()

    lcds = seq_cds.getLength()
    lwobble = seq_wobble.getLength()
    y = 0
    x = 0

    # start coordinates of the previous re-alignment window, used to
    # detect that no progress is being made.
    last_start = None

    while x < lwobble and y < lcds:

        xr = seq_wobble.asResidue(x)
        # skip over masked chars in wobble - these are gaps
        if seq_wobble.asChar(x) == "X":
            x += 1
            continue

        # skip over masked chars in wobble - these are from
        # masked chars in the peptide sequence
        # Note to self: do not see all implications of this change
        # check later.
        if seq_wobble.asChar(x) == "N":
            x += 1
            continue

        # skip over gaps in wobble
        if seq_wobble.asChar(x) == "-":
            x += 1
            continue

        s = matrix.getValue(xr, seq_cds.asResidue(y))

        if options.loglevel >= 6:
            # NOTE(review): y + 1 and y + 2 are not checked against lcds
            # here - confirm asChar tolerates positions past the end.
            if (x % 3 == 0):
                c = seq_cds.asChar(y) + seq_cds.asChar(y + 1) + seq_cds.asChar(y + 2)
                options.stdlog.write(
                    "# c=%s, x=%i, y=%i, aa=%s target=%s\n" % (c, x, y,
                                                               Genomics.MapCodon2AA(c),
                                                               pep_seq[int(x / 3)]))

            options.stdlog.write(
                "# x=%i\twob=%s\ty=%i\tcds=%s\txr=%s\tcds=%i\tscore=%s\n" %
                (x, seq_wobble.asChar(x), y, seq_cds.asChar(y), xr, seq_cds.asResidue(y), str(s)))

        # deal with mismatches
        if s <= 0:

            tmp_map_p2c = alignlib_lite.py_makeAlignmentVector()

            # backtrack to previous three codons and align
            # three codons for double frameshifts that span two codons and
            # produce two X's and six WWWWWW.

            # number of nucleotides to extend (should be multiple of 3)
            # less than 12 caused failure for some peptides.
            d = 15

            # extend by amount dx; x % 3 moves the window start back to a
            # multiple of three in wobble coordinates.
            dx = (x % 3) + d

            x_start = max(0, x - dx)
            # map to ensure that no ambiguous residue mappings
            # exist after re-alignment
            y_start = max(0, map_p2c.mapRowToCol(x_start, alignlib_lite.py_RIGHT))

            if (x_start, y_start) == last_start:
                raise ValueError("infinite loop detected")

            last_start = (x_start, y_start)

            # the window covers at most 2 * d positions in each sequence
            x_end = min(x_start + 2 * d, len(wobble_seq))
            y_end = min(y_start + 2 * d, len(cds_seq))

            wobble_fragment = alignlib_lite.py_makeSequence(
                wobble_seq[x_start:x_end])
            cds_fragment = alignlib_lite.py_makeSequence(
                cds_seq[y_start:y_end])

            AlignExhaustive(wobble_fragment, cds_fragment, "", tmp_map_p2c, options)

            if options.loglevel >= 10:
                options.stdlog.write(
                    "# fragmented alignment from %i-%i, %i-%i:\n%s\n" % (x_start, x_end,
                                                                         y_start, y_end,
                                                                         str(
                                                                             alignlib_lite.py_AlignmentFormatExplicit(
                                                                                 tmp_map_p2c,
                                                                                 wobble_fragment,
                                                                                 cds_fragment))))
                options.stdlog.flush()

            # clear alignment
            map_p2c.removeRowRegion(x_start, x_end)
            # ngap counts consecutive window rows without an aligned column
            ngap = 0
            last_x, last_y = None, None
            # copy the window alignment back; x, y, xr and s are
            # deliberately overwritten so the outer loop resumes at the
            # last re-aligned pair.
            for xxx in range(tmp_map_p2c.getRowFrom(), tmp_map_p2c.getRowTo()):
                yyy = tmp_map_p2c.mapRowToCol(xxx)

                if yyy >= 0:
                    # translate window coordinates to full-sequence ones
                    x = xxx + x_start
                    y = yyy + y_start
                    xr = seq_wobble.asResidue(x)
                    s = matrix.getValue(seq_wobble.asResidue(x),
                                        seq_cds.asResidue(y))
                    if s < 0:
                        raise ValueError(
                            "mismatched residue wobble: %i (%s), cds: %i (%s)" % (x, seq_wobble.asChar(x), y, seq_cds.asChar(y)))

                    map_p2c.addPair(x, y, s)
                    last_x, last_y = x, y
                    if options.loglevel >= 6:
                        options.stdlog.write(
                            "# reset: x=%i\twob=%s\ty=%i\tcds=%s\txr=%s\tcds=%i\tscore=%i\n" %
                            (x, seq_wobble.asChar(x), y, seq_cds.asChar(y), xr, seq_cds.asResidue(y), s))
                        options.stdlog.flush()
                    ngap = 0
                else:
                    ngap += 1

                # treat special case of double frameshifts. They might cause a petide/wobble residue
                # to be eliminated and thus the translated sequences will differ.
                # simply delete the last residue between x and y and move to
                # next codon.
                # NOTE(review): last_x is still None if three unaligned
                # rows occur before the first aligned one in this window.
                if ngap == 3:
                    map_p2c.removeRowRegion(last_x, last_x + 1)

                    last_x += 1
                    map_p2c.addPair(last_x, last_y)
                    # xr and s logged here are the values of the last
                    # aligned pair, not of (last_x, last_y).
                    if options.loglevel >= 6:
                        options.stdlog.write(
                            "# double: x=%i\twob=%s\ty=%i\tcds=%s\txr=%s\tcds=%i\tscore=%i\n" %
                            (last_x, seq_wobble.asChar(last_x), last_y, seq_cds.asChar(last_y), xr, seq_cds.asResidue(last_y), s))
                        options.stdlog.flush()
                    ngap = 0

            # exit condition if alignment is shorter than problematic residue
            # need to catch this to avoid infinite loop.
            if tmp_map_p2c.getRowTo() < d:
                if lwobble - x <= 4:
                    # only last codon is missing, so ok
                    break
                else:
                    raise ValueError("failure to align in designated window.")

            # s is recomputed immediately below, so this reset has no effect
            s = 0

        s = matrix.getValue(xr, seq_cds.asResidue(y))

        if s < 0:
            raise ValueError("mis-matching residues.")

        map_p2c.addPair(x, y, float(s))

        # advance to next residues
        x += 1
        y += 1

    # sanity checks
    assert (map_p2c.getRowTo() <= seq_wobble.getLength())
    assert (map_p2c.getColTo() <= seq_cds.getLength())
def main():
    """Write regions of multiple alignments that are to be masked.

    Reads lines of ``cluster_id``, ``gene_id`` and alignment (tab-separated)
    from options.stdin.  Each gene is mapped onto the genome using the
    matches read from options.filename_map; alignment columns whose
    quality score lies below options.quality_threshold are collected,
    widened to a full frame if options.frame > 1, grouped into regions and
    written to options.stdout as ``cluster_id``, ``start``, ``end``.
    """

    parser = E.OptionParser(
        version="%prog version: $Id: malis2masks.py 2781 2009-09-10 11:33:14Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("--random-proportion", dest="random_proportion", type="float",
                      help="mask randomly columns in multiple alignments [default=%default]")

    parser.add_option("--random", dest="random", action="store_true",
                      help="shuffle quality scores before masking [default=%default]")

    parser.set_defaults(
        quality_threshold=40,
        quality_file="quality",
        filename_map=None,
        frame=3,
    )

    (options, args) = E.Start(parser)

    # ---------------------------------------------------------------
    # read map of genes to genome; gene identifiers have to be unique
    map_genes2genome = {}
    infile = open(options.filename_map)
    for match in Blat.iterator(infile):
        assert match.mQueryId not in map_genes2genome, "duplicate entry %s" % match.mQueryId
        map_genes2genome[match.mQueryId] = match
    infile.close()

    # ---------------------------------------------------------------
    # get quality scores
    quality = IndexedFasta.IndexedFasta(options.quality_file)
    quality.setTranslator(IndexedFasta.TranslatorBytes())

    # ---------------------------------------------------------------
    # main loop
    ninput = noutput = nmissed = 0

    options.stdout.write("cluster_id\tstart\tend\n")

    for line in options.stdin:
        if line.startswith("cluster_id"):
            continue

        ninput += 1

        cluster_id, gene_id, alignment = line[:-1].split("\t")

        if gene_id not in map_genes2genome:
            nmissed += 1
            E.warn("gene_id %s not found in map." % gene_id)
            continue

        match = map_genes2genome[gene_id]
        map_gene2genome = match.getMapQuery2Target()
        is_negative = match.strand == "-"

        # on the negative strand the coordinates refer to the negative
        # strand of the gene/query: revert the sequence in order to work
        # in the right coordinate system.
        if is_negative:
            alignment = alignment[::-1]

        # get map of gene to alignment
        map_gene2mali = alignlib_lite.py_makeAlignmentVector()
        fillAlignment(map_gene2mali, alignment)

        # get quality scores
        quality_scores = quality.getSequence(match.mSbjctId,
                                             "+",
                                             match.mSbjctFrom,
                                             match.mSbjctTo)

        map_mali2genome = alignlib_lite.py_makeAlignmentVector()
        alignlib_lite.py_combineAlignment(map_mali2genome,
                                          map_gene2mali,
                                          map_gene2genome,
                                          alignlib_lite.py_RR)

        # shuffle quality scores, but only those that are aligned
        if options.random:
            aligned = []
            for column, char in enumerate(alignment):
                if char == "-":
                    continue
                offset = map_mali2genome.mapRowToCol(column) - match.mSbjctFrom
                if offset >= 0:
                    aligned.append(offset)

            shuffled = [quality_scores[offset] for offset in aligned]
            random.shuffle(shuffled)
            for offset, score in zip(aligned, shuffled):
                quality_scores[offset] = score

        # collect the alignment positions to mask
        to_mask = []
        ncolumns = len(alignment)
        for column, char in enumerate(alignment):
            if char == "-":
                continue

            offset = map_mali2genome.mapRowToCol(column) - match.mSbjctFrom
            if offset < 0:
                continue

            if not quality_scores[offset] < options.quality_threshold:
                continue

            # report positions with respect to the un-reverted alignment
            position = ncolumns - 1 - column if is_negative else column

            E.debug("low quality base: id=%s, mali=%i, char=%s, contig=%s, strand=%s, pos=%i, quality=%i" %
                    (cluster_id, position, char, match.mSbjctId, match.strand,
                     map_mali2genome.mapRowToCol(column), quality_scores[offset]))

            if options.frame > 1:
                # mask the whole frame that contains the position
                first = (position // options.frame) * options.frame
                to_mask.extend(range(first, first + options.frame))
            else:
                to_mask.append(position)

        for start, end in Iterators.group_by_distance(sorted(to_mask)):
            options.stdout.write("%s\t%i\t%i\n" % (cluster_id, start, end))

        noutput += 1

    E.info("ninput=%i, noutput=%i, nmissed=%i" % (ninput, noutput, nmissed))

    E.Stop()
def Align(self, method, anchor=0, loglevel=1):
    """align a pair of sequences.

    get rid of this and use a method class instead in the future

    Aligns self.mSequence1 with self.mSequence2 and stores the alignment
    and summary values derived from it on the instance.

    Arguments:
        method: name of the aligner ("dialign", "blastz", "dialignlgs",
            "dba", "nw", "sw"), or a callable that is invoked as
            ``method(s1, s2, map_a2b)`` and fills map_a2b in place.
            "clustal" is recognized but not supported.
        anchor: number of "A" characters added to both ends of both
            sequences before aligning; the anchored regions are removed
            from the alignment afterwards.
        loglevel: not used in this method.

    Raises:
        NotImplementedError: if method is "clustal".
        AlignmentError: if the resulting alignment is empty.
    """

    map_a2b = alignlib_lite.py_makeAlignmentVector()
    s1 = "A" * anchor + self.mSequence1 + "A" * anchor
    s2 = "A" * anchor + self.mSequence2 + "A" * anchor

    self.strand = "+"

    if method == "dialign":
        dialign = WrapperDialign.Dialign(self.mOptionsDialign)
        dialign.Align(s1, s2, map_a2b)
    elif method == "blastz":
        blastz = WrapperBlastZ.BlastZ(self.mOptionsBlastZ)
        blastz.Align(s1, s2, map_a2b)
        if blastz.isReverseComplement():
            self.strand = "-"
            self.mSequence2 = Genomics.complement(self.mSequence2)

    elif method == "dialignlgs":
        dialignlgs = WrapperDialign.Dialign(self.mOptionsDialignLGS)
        dialignlgs.Align(s1, s2, map_a2b)
    elif method == "dba":
        dba = WrapperDBA.DBA()
        dba.Align(s1, s2, map_a2b)
    elif method == "clustal":
        # the statements that used to follow this raise were unreachable
        # and have been removed.
        raise NotImplementedError("clustal wrapper needs to be updated")
    elif method == "nw":
        seq1 = alignlib_lite.py_makeSequence(s1)
        seq2 = alignlib_lite.py_makeSequence(s2)
        alignator = alignlib_lite.py_makeAlignatorDPFull(alignlib_lite.py_ALIGNMENT_GLOBAL,
                                                         gop=-12.0,
                                                         gep=-2.0)
        alignator.align(map_a2b, seq1, seq2)
    elif method == "sw":
        # NOTE(review): alignator_sw and min_score_sw are not defined in
        # this method - presumably module-level names; verify they exist.
        seq1 = alignlib_lite.py_makeSequence(s1)
        seq2 = alignlib_lite.py_makeSequence(s2)
        alignlib_lite.py_performIterativeAlignment(
            map_a2b, seq1, seq2, alignator_sw, min_score_sw)
    else:
        # use callback function
        method(s1, s2, map_a2b)

    if map_a2b.getLength() == 0:
        raise AlignmentError("empty alignment")

    if anchor:
        # cut the anchors off both ends in rows and columns, then shift
        # the alignment back to the coordinates of the plain sequences.
        map_a2b.removeRowRegion(
            anchor + len(self.mSequence1) + 1, map_a2b.getRowTo())
        map_a2b.removeRowRegion(1, anchor)
        map_a2b.removeColRegion(
            anchor + len(self.mSequence2) + 1, map_a2b.getColTo())
        map_a2b.removeColRegion(1, anchor)
        map_a2b.moveAlignment(-anchor, -anchor)

    f = alignlib_lite.py_AlignmentFormatExplicit(map_a2b,
                                                 alignlib_lite.py_makeSequence(
                                                     self.mSequence1),
                                                 alignlib_lite.py_makeSequence(self.mSequence2))

    self.mMethod = method
    self.mAlignment = map_a2b
    self.mAlignedSequence1, self.mAlignedSequence2 = f.mRowAlignment, f.mColAlignment
    f = alignlib_lite.py_AlignmentFormatEmissions(map_a2b)
    self.mAlignment1, self.mAlignment2 = f.mRowAlignment, f.mColAlignment
    self.mAlignmentFrom1 = map_a2b.getRowFrom()
    self.mAlignmentTo1 = map_a2b.getRowTo()
    self.mAlignmentFrom2 = map_a2b.getColFrom()
    self.mAlignmentTo2 = map_a2b.getColTo()
    self.mNumGaps, self.mLength = map_a2b.getNumGaps(), map_a2b.getLength()
    self.mAligned = self.mLength - self.mNumGaps

    self.SetPercentIdentity()
    self.SetBlockSizes()
parser.add_option("-o", "--options", dest="options", type="string",
                  help="BlastZ options.")

parser.set_defaults(input_filename_seq1=None,
                    input_filename_seq2=None,
                    options="B=0 C=2")

(options, args) = E.Start(parser)

wrapper = BlastZ(options.options)

import alignlib_lite

# read both sequence collections; the files are closed once read
with open(options.input_filename_seq1, "r") as infile:
    seqs1 = Genomics.ReadPeptideSequences(infile)
with open(options.input_filename_seq2, "r") as infile:
    seqs2 = Genomics.ReadPeptideSequences(infile)

# only the first sequence of each collection is aligned.
# list(d.values())[0] works for both list- and view-returning dicts.
seq1 = list(seqs1.values())[0]
seq2 = list(seqs2.values())[0]

result = alignlib_lite.py_makeAlignmentVector()
wrapper.Align(seq1, seq2, result)

# single-argument print(...) is valid as statement and as function call
print(str(alignlib_lite.py_AlignmentFormatExplicit(
    result,
    alignlib_lite.py_makeSequence(seq1),
    alignlib_lite.py_makeSequence(seq2))))

E.Stop()
def AlignCodonBased( seq_wobble, seq_cds, seq_peptide, map_p2c, options,
                     diag_width = 2, max_advance = 2 ):
    """Align a wobble sequence to a cds sequence, position by position.

    Walks through seq_wobble (index x) and seq_cds (index y) in parallel
    and adds one aligned pair per matching position to map_p2c.  When a
    position does not score positively, a window around it is re-aligned
    with AlignExhaustive and the pairs in that window are replaced.
    Takes care of frameshifts.

    NOTE(review): the original docstring stated that everything is in
    one-based coordinates due to alignlib, but x and y start at 0 below -
    confirm which convention applies.

    Arguments:
        seq_wobble: sequence object of the wobble (back-translated) peptide.
        seq_cds: sequence object of the cds.
        seq_peptide: sequence object of the peptide; only used for debug
            output.
        map_p2c: alignment object; cleared first, then filled in place.
        options: provides loglevel and stdlog for diagnostics and is passed
            on to AlignExhaustive.
        diag_width, max_advance: not referenced in the function body.

    Raises:
        ValueError: if the same window is about to be re-aligned twice in a
            row, if a re-aligned or regular pair scores negative, or if the
            window alignment is too short and more than the last codon is
            missing.
    """

    map_p2c.clear()

    # NOTE(review): gop and gep are assigned but never used below.
    gop, gep = -1.0, -1.0
    matrix = alignlib_lite.py_makeSubstitutionMatrixBackTranslation( 1, -10, 1, alignlib_lite.py_getDefaultEncoder() )

    pep_seq = seq_peptide.asString()
    cds_seq = seq_cds.asString()
    wobble_seq = seq_wobble.asString()

    lcds = seq_cds.getLength()
    lwobble = seq_wobble.getLength()
    y = 0
    x = 0

    # start coordinates of the previous re-alignment window, used to
    # detect that no progress is being made.
    last_start = None

    while x < lwobble and y < lcds:

        xr = seq_wobble.asResidue( x )
        # skip over masked chars in wobble - these are gaps
        if seq_wobble.asChar(x) == "X":
            x += 1
            continue

        # skip over masked chars in wobble - these are from
        # masked chars in the peptide sequence
        # Note to self: do not see all implications of this change
        # check later.
        if seq_wobble.asChar(x) == "N":
            x += 1
            continue

        # skip over gaps in wobble
        if seq_wobble.asChar(x) == "-":
            x += 1
            continue

        s = matrix.getValue( xr, seq_cds.asResidue(y) )

        if options.loglevel >= 6:
            # NOTE(review): y+1 and y+2 are not checked against lcds
            # here - confirm asChar tolerates positions past the end.
            if (x % 3 == 0):
                c = seq_cds.asChar(y) + seq_cds.asChar(y+1) + seq_cds.asChar(y+2)
                options.stdlog.write( "# c=%s, x=%i, y=%i, aa=%s target=%s\n" % (c, x, y,
                                                                                Genomics.MapCodon2AA( c ),
                                                                                pep_seq[int(x/3)]) )

            options.stdlog.write( "# x=%i\twob=%s\ty=%i\tcds=%s\txr=%s\tcds=%i\tscore=%s\n" % \
                                  (x, seq_wobble.asChar(x), y, seq_cds.asChar(y), xr, seq_cds.asResidue(y), str(s) ))

        # deal with mismatches
        if s <= 0:

            tmp_map_p2c = alignlib_lite.py_makeAlignmentVector()

            ## backtrack to previous three codons and align
            ## three codons for double frameshifts that span two codons and
            ## produce two X's and six WWWWWW.

            ## number of nucleotides to extend (should be multiple of 3)
            ## less than 12 caused failure for some peptides.
            d = 15

            # extend by amount dx; x % 3 moves the window start back to a
            # multiple of three in wobble coordinates.
            dx = (x % 3) + d

            x_start = max(0, x - dx )
            # map to ensure that no ambiguous residue mappings
            # exist after re-alignment
            y_start = max(0, map_p2c.mapRowToCol( x_start, alignlib_lite.py_RIGHT ))

            if (x_start, y_start) == last_start:
                raise ValueError( "infinite loop detected" )

            last_start = (x_start, y_start)

            # the window covers at most 2 * d positions in each sequence
            x_end = min(x_start + 2 * d, len(wobble_seq) )
            y_end = min(y_start + 2 * d, len(cds_seq) )

            wobble_fragment = alignlib_lite.py_makeSequence(wobble_seq[x_start:x_end])
            cds_fragment = alignlib_lite.py_makeSequence(cds_seq[y_start:y_end])

            AlignExhaustive( wobble_fragment, cds_fragment, "", tmp_map_p2c, options )

            if options.loglevel >= 10:
                options.stdlog.write("# fragmented alignment from %i-%i, %i-%i:\n%s\n" % (x_start, x_end,
                                                                                         y_start, y_end,
                                                                                         str(alignlib_lite.py_AlignmentFormatExplicit( tmp_map_p2c,
                                                                                                                                       wobble_fragment,
                                                                                                                                       cds_fragment ))))
                options.stdlog.flush()

            ## clear alignment
            map_p2c.removeRowRegion( x_start, x_end )
            # ngap counts consecutive window rows without an aligned column
            ngap = 0
            last_x, last_y = None, None
            # copy the window alignment back; x, y, xr and s are
            # deliberately overwritten so the outer loop resumes at the
            # last re-aligned pair.
            for xxx in range( tmp_map_p2c.getRowFrom(), tmp_map_p2c.getRowTo() ):
                yyy = tmp_map_p2c.mapRowToCol(xxx)

                if yyy >= 0:
                    # translate window coordinates to full-sequence ones
                    x = xxx + x_start
                    y = yyy + y_start
                    xr = seq_wobble.asResidue(x)
                    s = matrix.getValue( seq_wobble.asResidue(x),
                                         seq_cds.asResidue(y) )
                    if s < 0:
                        raise ValueError("mismatched residue wobble: %i (%s), cds: %i (%s)" % (x, seq_wobble.asChar(x), y, seq_cds.asChar(y)))

                    map_p2c.addPair( x, y, s)
                    last_x, last_y = x, y
                    if options.loglevel >= 6:
                        options.stdlog.write( "# reset: x=%i\twob=%s\ty=%i\tcds=%s\txr=%s\tcds=%i\tscore=%i\n" % \
                                              (x, seq_wobble.asChar(x), y, seq_cds.asChar(y), xr, seq_cds.asResidue(y), s ))
                        options.stdlog.flush()
                    ngap = 0
                else:
                    ngap += 1

                # treat special case of double frameshifts. They might cause a petide/wobble residue
                # to be eliminated and thus the translated sequences will differ.
                # simply delete the last residue between x and y and move to next codon.
                # NOTE(review): last_x is still None if three unaligned
                # rows occur before the first aligned one in this window.
                if ngap == 3:
                    map_p2c.removeRowRegion( last_x, last_x + 1 )

                    last_x += 1
                    map_p2c.addPair( last_x, last_y )
                    # xr and s logged here are the values of the last
                    # aligned pair, not of (last_x, last_y).
                    if options.loglevel >= 6:
                        options.stdlog.write( "# double: x=%i\twob=%s\ty=%i\tcds=%s\txr=%s\tcds=%i\tscore=%i\n" % \
                                              (last_x, seq_wobble.asChar(last_x), last_y, seq_cds.asChar(last_y), xr, seq_cds.asResidue(last_y), s ))
                        options.stdlog.flush()
                    ngap = 0

            ## exit condition if alignment is shorter than problematic residue
            ## need to catch this to avoid infinite loop.
            if tmp_map_p2c.getRowTo() < d:
                if lwobble - x <= 4:
                    ## only last codon is missing, so ok
                    break
                else:
                    raise ValueError("failure to align in designated window.")

            # s is recomputed immediately below, so this reset has no effect
            s = 0

        s = matrix.getValue( xr, seq_cds.asResidue(y) )

        if s < 0:
            raise ValueError("mis-matching residues.")

        map_p2c.addPair( x, y, float(s) )

        # advance to next residues
        x += 1
        y += 1

    # sanity checks
    assert( map_p2c.getRowTo() <= seq_wobble.getLength() )
    assert( map_p2c.getColTo() <= seq_cds.getLength() )
parser.set_defaults(
    input_filename_seq1=None,
    input_filename_seq2=None,
    options="B=0 C=2")

(options, args) = E.Start(parser)

wrapper = BlastZ(options.options)

import alignlib_lite

# read both sequence collections; the files are closed once read
with open(options.input_filename_seq1, "r") as infile:
    seqs1 = Genomics.ReadPeptideSequences(infile)
with open(options.input_filename_seq2, "r") as infile:
    seqs2 = Genomics.ReadPeptideSequences(infile)

# only the first sequence of each collection is aligned.
# list(d.values())[0] works for both list- and view-returning dicts.
seq1 = list(seqs1.values())[0]
seq2 = list(seqs2.values())[0]

result = alignlib_lite.py_makeAlignmentVector()
wrapper.Align(seq1, seq2, result)

# single-argument print(...) is valid as statement and as function call
print(str(alignlib_lite.py_AlignmentFormatExplicit(
    result,
    alignlib_lite.py_makeSequence(seq1),
    alignlib_lite.py_makeSequence(seq2))))

E.Stop()
def Alignment2PeptideAlignment(alignment, query_from=0, sbjct_from=0, genomic_sequence=None):
    """convert a Peptide2DNA aligment to a Peptide2Peptide alignment.

    How to handle frameshifts?

    Arguments:
        alignment: iterable of (state, l_query, l_sbjct) tuples.  Only the
            states "M", "S", "G" and "P" advance the peptide coordinates;
            any other state only advances the genomic position by l_sbjct.
        query_from: first position on the query.
        sbjct_from: first position within genomic_sequence.
        genomic_sequence: optional sequence from which the aligned codons
            are taken and translated with MapCodon2AA.

    Returns:
        A tuple of the query-to-sbjct alignment object and the translated
        sbjct residues as a string (empty without genomic_sequence).
    """

    map_query2sbjct = alignlib_lite.py_makeAlignmentVector()

    query_pos = query_from
    sbjct_pos = 0
    sbjct_genome_pos = sbjct_from
    sbjct_residues = []
    codon = ""

    for state, l_query, l_sbjct in alignment:

        query_increment = 0
        sbjct_increment = 0

        if state == "M":
            query_increment = l_query
            # floor division keeps positions integral on Python 2 and 3
            sbjct_increment = l_sbjct // 3
            if genomic_sequence:
                codon = genomic_sequence[
                    sbjct_genome_pos:sbjct_genome_pos + l_sbjct]
        elif state == "S":
            # only the part that carries a query residue advances the
            # peptide coordinates; nucleotides are collected in any case.
            if l_query:
                sbjct_increment = 1
                query_increment = 1
            if genomic_sequence:
                codon += genomic_sequence[
                    sbjct_genome_pos:sbjct_genome_pos + l_sbjct]
        elif state == "G":
            query_increment = l_query
            sbjct_increment = l_sbjct // 3
            if genomic_sequence:
                codon += genomic_sequence[
                    sbjct_genome_pos:sbjct_genome_pos + l_sbjct]
        elif state == "P":
            # only increment query, sbjct does not advance.
            query_increment = l_query

        if query_increment and sbjct_increment:
            alignlib_lite.py_addDiagonal2Alignment(map_query2sbjct,
                                                   query_pos, query_pos +
                                                   query_increment,
                                                   sbjct_pos - query_pos)

        # translate the collected nucleotides once the sbjct advances;
        # until then the codon keeps accumulating across states.
        if sbjct_increment and genomic_sequence:
            sbjct_residues.extend(
                MapCodon2AA(codon[x:x + 3]) for x in range(0, len(codon), 3))
            codon = ""

        query_pos += query_increment
        sbjct_pos += sbjct_increment
        sbjct_genome_pos += l_sbjct

    return map_query2sbjct, "".join(sbjct_residues)
print globals()["__doc__"], msg sys.exit(2) for o, a in optlist: if o in ("-v", "--verbose"): param_loglevel = int(a) elif o in ("--version", ): print "version=" sys.exit(0) elif o in ("-h", "--help"): print globals()["__doc__"] sys.exit(0) alignator = alignlib_lite.py_makeAlignatorDPFull( alignlib_lite.py_ALIGNMENT_LOCAL, param_gop, param_gep) map_query2token = alignlib_lite.py_makeAlignmentVector() for line in sys.stdin: if line[0] == "#": continue query_token, sbjct_token, query_sequence, sbjct_sequence = string.split( line[:-1], "\t") map_query2token.clear() row = alignlib_lite.py_makeSequence(query_sequence) col = alignlib_lite.py_makeSequence(sbjct_sequence) alignator.align(map_query2token, row, col) pidentity = 100.0 * \ alignlib_lite.py_calculatePercentIdentity(
def main(argv=None):
    """Write regions of multiple alignments that are to be masked.

    Reads lines of ``cluster_id``, ``gene_id`` and alignment (tab-separated)
    from options.stdin.  Each gene is mapped onto the genome using the
    matches read from options.filename_map; alignment columns whose
    quality score lies below options.quality_threshold are collected,
    widened to a full frame if options.frame > 1, grouped into regions and
    written to options.stdout as ``cluster_id``, ``start``, ``end``.

    *argv* is accepted but not used in the body.
    """

    parser = E.OptionParser(
        version="%prog version: $Id: malis2masks.py 2781 2009-09-10 11:33:14Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("--random-proportion", dest="random_proportion", type="float",
                      help="mask randomly columns in multiple alignments [default=%default]")

    parser.add_option("--random", dest="random", action="store_true",
                      help="shuffle quality scores before masking [default=%default]")

    parser.set_defaults(
        quality_threshold=40,
        quality_file="quality",
        filename_map=None,
        frame=3,
    )

    (options, args) = E.Start(parser)

    # ---------------------------------------------------------------
    # read map of genes to genome; gene identifiers have to be unique
    map_genes2genome = {}
    infile = open(options.filename_map)
    for match in Blat.iterator(infile):
        assert match.mQueryId not in map_genes2genome, "duplicate entry %s" % match.mQueryId
        map_genes2genome[match.mQueryId] = match
    infile.close()

    # ---------------------------------------------------------------
    # get quality scores
    quality = IndexedFasta.IndexedFasta(options.quality_file)
    quality.setTranslator(IndexedFasta.TranslatorBytes())

    # ---------------------------------------------------------------
    # main loop
    ninput = noutput = nmissed = 0

    options.stdout.write("cluster_id\tstart\tend\n")

    for line in options.stdin:
        if line.startswith("cluster_id"):
            continue

        ninput += 1

        cluster_id, gene_id, alignment = line[:-1].split("\t")

        if gene_id not in map_genes2genome:
            nmissed += 1
            E.warn("gene_id %s not found in map." % gene_id)
            continue

        match = map_genes2genome[gene_id]
        map_gene2genome = match.getMapQuery2Target()
        is_negative = match.strand == "-"

        # on the negative strand the coordinates refer to the negative
        # strand of the gene/query: revert the sequence in order to work
        # in the right coordinate system.
        if is_negative:
            alignment = alignment[::-1]

        # get map of gene to alignment
        map_gene2mali = alignlib_lite.py_makeAlignmentVector()
        fillAlignment(map_gene2mali, alignment)

        # get quality scores
        quality_scores = quality.getSequence(match.mSbjctId,
                                             "+",
                                             match.mSbjctFrom,
                                             match.mSbjctTo)

        map_mali2genome = alignlib_lite.py_makeAlignmentVector()
        alignlib_lite.py_combineAlignment(map_mali2genome,
                                          map_gene2mali,
                                          map_gene2genome,
                                          alignlib_lite.py_RR)

        # shuffle quality scores, but only those that are aligned
        if options.random:
            aligned = []
            for column, char in enumerate(alignment):
                if char == "-":
                    continue
                offset = map_mali2genome.mapRowToCol(column) - match.mSbjctFrom
                if offset >= 0:
                    aligned.append(offset)

            shuffled = [quality_scores[offset] for offset in aligned]
            random.shuffle(shuffled)
            for offset, score in zip(aligned, shuffled):
                quality_scores[offset] = score

        # collect the alignment positions to mask
        to_mask = []
        ncolumns = len(alignment)
        for column, char in enumerate(alignment):
            if char == "-":
                continue

            offset = map_mali2genome.mapRowToCol(column) - match.mSbjctFrom
            if offset < 0:
                continue

            if not quality_scores[offset] < options.quality_threshold:
                continue

            # report positions with respect to the un-reverted alignment
            position = ncolumns - 1 - column if is_negative else column

            E.debug(
                "low quality base: id=%s, mali=%i, char=%s, contig=%s, strand=%s, pos=%i, quality=%i" %
                (cluster_id, position, char, match.mSbjctId, match.strand,
                 map_mali2genome.mapRowToCol(column), quality_scores[offset]))

            if options.frame > 1:
                # mask the whole frame that contains the position
                first = (position // options.frame) * options.frame
                to_mask.extend(range(first, first + options.frame))
            else:
                to_mask.append(position)

        for start, end in Iterators.group_by_distance(sorted(to_mask)):
            options.stdout.write("%s\t%i\t%i\n" % (cluster_id, start, end))

        noutput += 1

    E.info("ninput=%i, noutput=%i, nmissed=%i" % (ninput, noutput, nmissed))

    E.Stop()
def getMapPeptide2Cds(peptide_sequence, cds_sequence, options):
    """Build the alignment between a peptide and its coding sequence.

    The peptide is converted to a wobble nucleotide sequence, which is
    aligned codon-wise against the cds.  If that alignment contains more
    gaps than tolerated, an exhaustive alignment is computed and used
    instead.  Codons that are not aligned in all three positions are
    removed from the result.

    The returned alignment is in nucleotide coordinates.

    Raises ValueError if the codon based alignment fails or if nothing
    remains aligned after removing incomplete codons.
    """
    # blanks are dropped from the peptide; blanks and gap characters
    # ("." and "-") are dropped from the cds
    peptide = re.sub(" ", "", peptide_sequence)
    cds = re.sub("[ .-]", "", cds_sequence)
    wobble = Genomics.Protein2Wobble(peptide.upper())

    if options.loglevel >= 6:
        options.stdlog.write(
            "# peptide original (%5i): %s\n" % (len(peptide), peptide))
        options.stdlog.write(
            "# cds original (%5i): %s\n" % (len(cds), cds))
        options.stdlog.write(
            "# wobble sequence (%5i): %s\n" % (len(wobble), wobble))
        options.stdlog.flush()

    seq_wobble = alignlib_lite.py_makeSequence(wobble)
    seq_cds = alignlib_lite.py_makeSequence(cds.upper())
    seq_peptide = alignlib_lite.py_makeSequence(peptide)

    map_p2c = alignlib_lite.py_makeAlignmentVector()
    try:
        AlignCodonBased(seq_wobble, seq_cds, seq_peptide, map_p2c,
                        options=options)
    except ValueError as msg:
        raise ValueError("mapping error for sequence: %s" % (msg))

    # gaps that are explained by gaps in the peptide itself or by the
    # length difference of the two sequences are not counted
    max_gaps = 5
    num_peptide_gaps = len(re.sub("[^-]", "", peptide))
    length_difference = abs(len(wobble) - len(cds))
    ngaps = map_p2c.getNumGaps() - (num_peptide_gaps * 3) - length_difference

    if options.loglevel >= 6:
        options.stdlog.write(
            "# alignment between wobble and cds: ngaps=%i, npeptide_gaps=%i\n" %
            (ngaps, num_peptide_gaps))
        printPrettyAlignment(seq_wobble, seq_cds, peptide, map_p2c, options)

    # more than max_gaps unexplained gaps: fall back to exhaustive alignment
    if ngaps > max_gaps:
        if options.loglevel >= 2:
            options.stdlog.write(
                "# too many gaps (%i>%i), realigning exhaustively.\n" %
                (ngaps, max_gaps))
            options.stdlog.flush()

        full_map_p2c = alignlib_lite.py_makeAlignmentVector()
        AlignExhaustive(seq_wobble, seq_cds, seq_peptide, full_map_p2c,
                        options)

        if options.loglevel >= 6:
            options.stdlog.write("# full alignment between wobble and cds:\n")
            options.stdlog.flush()
            printPrettyAlignment(seq_wobble, seq_cds, peptide, full_map_p2c,
                                 options)

        map_p2c = full_map_p2c

    # drop every codon that has at least one unaligned position
    for codon_start in range(0, len(peptide) * 3, 3):
        codon_positions = (codon_start, codon_start + 1, codon_start + 2)
        if any(map_p2c.mapRowToCol(pos) < 0 for pos in codon_positions):
            map_p2c.removeRowRegion(codon_start, codon_start + 3)

    if map_p2c.getLength() == 0:
        if options.loglevel >= 1:
            options.stdlog.write("# WARNING: empty alignment\n")
        if options.loglevel >= 6:
            options.stdlog.write("# peptide original: %s\n" % peptide)
            options.stdlog.write("# cds original : %s\n" % cds)
            options.stdlog.write("# wobble sequence : %s\n" % wobble)
        raise ValueError("empty alignment")

    # the alignment must not extend beyond either sequence
    assert(map_p2c.getRowTo() <= seq_wobble.getLength())
    assert(map_p2c.getColTo() <= seq_cds.getLength())

    return map_p2c
continue if pair.mMethod == "unaligned": unaligned_pair = pair pair.mType1 = GetIntronType(unaligned_pair.mAlignedSequence1) pair.mType2 = GetIntronType(unaligned_pair.mAlignedSequence2) do_print = param_echo_unaligned else: do_print = 1 if param_is_compressed: if unaligned_pair and \ unaligned_pair.mToken1 == pair.mToken1 and \ unaligned_pair.mToken2 == pair.mToken2 and \ unaligned_pair.mIntronId1 == pair.mIntronId1: map_a2b = alignlib_lite.py_makeAlignmentVector() f = AlignmentFormatEmissions( pair.mFrom1, pair.mAlignedSequence1, pair.mFrom2, pair.mAlignedSequence2).copy(map_a2b) map_a2b.moveAlignment(-unaligned_pair.mFrom1 + 1, -unaligned_pair.mFrom2 + 1) data = alignlib_lite.py_AlignmentFormatExplicit( map_a2b, alignlib_lite.py_makeSequence( unaligned_pair.mAlignedSequence1), alignlib_lite.py_makeSequence( unaligned_pair.mAlignedSequence2)) from1, ali1, to1 = data.mRowFrom, data.mRowAlignment, data.mRowTo from2, ali2, to2 = data.mColFrom, data.mColAlignment, data.mColTo
raise ValueError( "mapping error for sequence: %s" % (msg) ) ## if there are more than five frameshifts - do exhaustive alignment max_gaps = 5 num_peptide_gaps = len( re.sub("[^-]", "", p ) ) ngaps = map_p2c.getNumGaps() - (num_peptide_gaps * 3) - abs(len(w)-len(c)) if options.loglevel >= 6: options.stdlog.write("# alignment between wobble and cds: ngaps=%i, npeptide_gaps=%i\n" % (ngaps, num_peptide_gaps) ) PrintPrettyAlignment( seq_wobble, seq_cds, p, map_p2c, options ) if ngaps > max_gaps: if options.loglevel >= 2: options.stdlog.write("# too many gaps (%i>%i), realigning exhaustively.\n" % (ngaps, max_gaps ) ) options.stdlog.flush() full_map_p2c = alignlib_lite.py_makeAlignmentVector() AlignExhaustive( seq_wobble, seq_cds, seq_peptide, full_map_p2c, options ) if options.loglevel >= 6: options.stdlog.write("# full alignment between wobble and cds:\n" ) options.stdlog.flush() PrintPrettyAlignment( seq_wobble, seq_cds, p, full_map_p2c, options ) map_p2c = full_map_p2c ## remove incomplete codons x = 0 while x < len(p) * 3: if (map_p2c.mapRowToCol( x ) < 0 or \ map_p2c.mapRowToCol( x+1 ) < 0 or \ map_p2c.mapRowToCol( x+2 ) < 0 ):
def main(argv=None):
    """script main.

    Parses command line options in sys.argv, unless *argv* is given.

    Expects exactly two positional arguments, each the name of a file
    with a multiple alignment.  Both alignments are converted to
    profiles, the profiles are aligned with a full dynamic programming
    aligner (local or global, see ``--mode``) and the combined multiple
    alignment is written to ``options.stdout``.

    Raises ValueError if the number of positional arguments is not two.
    """
    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: cgat_script_template.py 2871 2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-o", "--gop", dest="gop", type="float",
                      help="gap opening penalty [default=%default].")

    parser.add_option("-e", "--gep", dest="gep", type="float",
                      help="gap extension penalty [default=%default].")

    parser.add_option("-m", "--mode", dest="mode", type="choice",
                      choices=("global", "local"),
                      help="alignment mode, global=nw, local=sw [default=%default].")

    parser.set_defaults(
        gop=-12.0,
        gep=-2.0,
        format="fasta",
        mode="local",
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    if len(args) != 2:
        raise ValueError(
            "please supply two multiple alignments in FASTA format.")

    def read_mali(filename):
        """Read one multiple alignment and close the input afterwards."""
        mali = Mali.Mali()
        infile = IOTools.openFile(filename, "r")
        try:
            mali.readFromFile(infile, format=options.format)
        finally:
            # release the handle even if parsing fails
            infile.close()
        return mali

    E.info("read 2 multiple alignments")
    mali1 = read_mali(args[0])
    mali2 = read_mali(args[1])

    cmali1 = Mali.convertMali2Alignlib(mali1)
    cmali2 = Mali.convertMali2Alignlib(mali2)

    if options.mode == "local":
        mode = alignlib_lite.py_ALIGNMENT_LOCAL
    elif options.mode == "global":
        mode = alignlib_lite.py_ALIGNMENT_GLOBAL
    else:
        # not reachable through the option parser (restricted by
        # ``choices``), but fail clearly instead of with a NameError
        raise ValueError("unknown alignment mode '%s'" % options.mode)

    alignator = alignlib_lite.py_makeAlignatorDPFull(
        mode, options.gop, options.gep)

    # library-wide defaults used when building the profiles
    alignlib_lite.py_setDefaultEncoder(
        alignlib_lite.py_getEncoder(alignlib_lite.py_Protein20))
    alignlib_lite.py_setDefaultLogOddor(
        alignlib_lite.py_makeLogOddorDirichlet(0.3))
    alignlib_lite.py_setDefaultRegularizor(
        alignlib_lite.py_makeRegularizorDirichletPrecomputed())

    cprofile1 = alignlib_lite.py_makeProfile(cmali1)
    cprofile2 = alignlib_lite.py_makeProfile(cmali2)

    result = alignlib_lite.py_makeAlignmentVector()
    alignator.align(result, cprofile1, cprofile2)

    E.debug("result=\n%s" %
            alignlib_lite.py_AlignmentFormatEmissions(result))

    # merge the second alignment into the first along the profile alignment
    cmali1.add(cmali2, result)

    outmali = Mali.convertAlignlib2Mali(
        cmali1,
        identifiers=mali1.getIdentifiers() + mali2.getIdentifiers())

    outmali.writeToFile(options.stdout, format=options.format)

    # write footer and output benchmark information.
    E.Stop()
def main(argv=None):
    """script main.

    Compares the sequences of two fasta files, given as the two
    positional (unparsed) arguments.  Sequences are matched by
    identifier after applying ``--pattern1``/``--pattern2`` and compared
    case-insensitively.  Differing pairs are classified as ``first``,
    ``prefix``, ``last``, ``selenocysteine``, ``masked`` or ``other``.
    Depending on ``--output-section``, differences and missing
    identifiers are written to ``args.stdout``; summary counts are
    logged.

    Raises ValueError if not exactly two files are given or if either
    file contains no sequences, and ImportError if
    ``--correct-gap-shift`` is requested but alignlib_lite is missing.
    """
    if argv is None:
        argv = sys.argv

    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("--version", action='version', version="1.0")

    parser.add_argument(
        "-s", "--correct-gap-shift", dest="correct_shift",
        action="store_true",
        help="correct gap length shifts in alignments. "
        "Requires alignlib_lite.py ")

    parser.add_argument(
        "-1", "--pattern1", dest="pattern1", type=str,
        help="pattern to extract identifier from in identifiers1. ")

    parser.add_argument(
        "-2", "--pattern2", dest="pattern2", type=str,
        help="pattern to extract identifier from in identifiers2. ")

    parser.add_argument(
        "-o", "--output-section", dest="output", type=str,
        action="append",
        choices=("diff", "missed", "seqdiff"),
        help="what to output ")

    # raw strings: "\S" is not a valid escape in a normal string literal
    parser.set_defaults(correct_shift=False,
                        pattern1=r"(\S+)",
                        pattern2=r"(\S+)",
                        output=[])

    (args, unknown) = E.start(parser, argv=argv, unknowns=True)

    if len(unknown) != 2:
        raise ValueError("two files needed to compare.")

    if args.correct_shift:
        try:
            import alignlib_lite
        except ImportError:
            raise ImportError(
                "option --correct-gap-shift requires alignlib_lite.py_ "
                "but alignlib not found")

    # read both files completely, closing them afterwards
    with iotools.open_file(unknown[0], "r") as infile:
        seqs1 = {x.title: x.sequence
                 for x in FastaIterator.iterate(infile)}

    with iotools.open_file(unknown[1], "r") as infile:
        seqs2 = {x.title: x.sequence
                 for x in FastaIterator.iterate(infile)}

    if not seqs1:
        raise ValueError("first file %s is empty." % (unknown[0]))
    if not seqs2:
        raise ValueError("second file %s is empty." % (unknown[1]))

    MapIdentifiers(seqs1, args.pattern1)
    MapIdentifiers(seqs2, args.pattern2)

    nsame = 0
    nmissed1 = 0
    nmissed2 = 0
    ndiff = 0
    ndiff_first = 0
    ndiff_last = 0
    ndiff_prefix = 0
    ndiff_selenocysteine = 0
    ndiff_masked = 0
    nfixed = 0
    found2 = {}

    write_missed1 = "missed" in args.output
    write_missed2 = "missed" in args.output
    write_seqdiff = "seqdiff" in args.output
    write_diff = "diff" in args.output or write_seqdiff

    for k in sorted(seqs1):
        if k not in seqs2:
            nmissed1 += 1
            if write_missed1:
                args.stdout.write("---- %s ---- %s\n" % (k, "missed1"))
            continue

        found2[k] = 1

        s1 = seqs1[k].upper()
        s2 = seqs2[k].upper()
        m = min(len(s1), len(s2))

        if s1 == s2:
            nsame += 1
            continue

        status = "other"
        ndiff += 1

        if s1[1:] == s2[1:]:
            ndiff_first += 1
            status = "first"
        elif s1[:m] == s2[:m]:
            ndiff_prefix += 1
            status = "prefix"
        elif s1[:-1] == s2[:-1]:
            ndiff_last += 1
            status = "last"
        else:
            if len(s1) == len(s2):
                # get all differences: the first and last residues
                # can be different for peptide sequences when
                # comparing my translations with ensembl peptides.
                differences = [(r1, r2)
                               for r1, r2 in zip(s1[1:-1], s2[1:-1])
                               if r1 != r2]

                # check for selenocysteines: every difference involves "U"
                if all(r1 == "U" or r2 == "U" for r1, r2 in differences):
                    ndiff_selenocysteine += 1
                    status = "selenocysteine"

                # check for masked residues: every difference involves N/X
                elif all(r1 in "NX" or r2 in "NX"
                         for r1, r2 in differences):
                    ndiff_masked += 1
                    status = "masked"

            # correct for different gap lengths
            if args.correct_shift:

                map_a2b = alignlib_lite.py_makeAlignmentVector()

                a, b = 0, 0
                keep = False

                # NOTE(review): x is never advanced, so ``x < m`` only
                # guards against an empty sequence; the walk is driven
                # by a and b.  Kept as in the original.
                x = 0
                while x < m and not (a == len(s1) and b == len(s2)):
                    try:
                        if s1[a] != s2[b]:
                            # skip runs of N present in only one sequence
                            while s1[a] == "N" and s2[b] != "N":
                                a += 1
                            while s1[a] != "N" and s2[b] == "N":
                                b += 1

                            if s1[a] != s2[b]:
                                break
                    except IndexError:
                        print("# index error for %s: x=%i, a=%i, b=%i, l1=%i, l2=%i" %
                              (k, x, a, b, len(s1), len(s2)))
                        break

                    a += 1
                    b += 1
                    map_a2b.addPairExplicit(a, b, 0.0)
                else:
                    # loop ended without break: the end of both
                    # sequences has been reached
                    keep = True
                    nfixed += 1
                    f = alignlib_lite.py_AlignmentFormatEmissions(map_a2b)
                    print("fix\t%s\t%s" % (k, str(f)))

                if not keep:
                    print("# warning: not fixable: %s" % k)

        if write_diff:
            args.stdout.write("---- %s ---- %s\n" % (k, status))

        if write_seqdiff:
            args.stdout.write("< %s\n> %s\n" % (seqs1[k], seqs2[k]))

    for k in sorted(seqs2):
        if k not in found2:
            nmissed2 += 1
            if write_missed2:
                args.stdout.write("---- %s ---- %s\n" % (k, "missed2"))

    args.stdlog.write("""# Legend:
""")

    E.info("seqs1=%i, seqs2=%i, same=%i, ndiff=%i, nmissed1=%i, nmissed2=%i" %
           (len(seqs1), len(seqs2), nsame, ndiff, nmissed1, nmissed2))

    E.info(
        "ndiff=%i: first=%i, last=%i, prefix=%i, selenocysteine=%i, masked=%i, fixed=%i, other=%i" %
        (ndiff, ndiff_first, ndiff_last, ndiff_prefix,
         ndiff_selenocysteine, ndiff_masked, nfixed,
         ndiff - ndiff_first - ndiff_last - ndiff_prefix -
         ndiff_selenocysteine - ndiff_masked - nfixed))

    E.stop()
def main(argv=None):
    """script main.

    Parses command line options in sys.argv, unless *argv* is given.

    Each positional argument is a file with codeml output.  Files that
    cannot be parsed (``WrapperCodeML.UsageError``) are skipped.  For
    every entry in ``--methods`` a report is written to
    ``options.stdout``:

    summary-numbers
        counts of positive sites per model/analysis and the outcome of
        the log-likelihood ratio test per result file.
    jalview
        a jalview annotation track per result with positive sites.
    count-positive-sites
        the number of selected positive sites.
    positive-site-table
        selected sites with their significance, optionally mapped onto
        a reference alignment (``--filename-map-mali``).

    Raises ValueError for an unknown analysis section or model, or if
    ``--filename-map-mali`` is given without ``--filename-mali``.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: codemls2tsv.py 2781 2009-09-10 11:33:14Z andreas $")

    parser.add_option("--methods", dest="methods", type="choice", action="append",
                      choices=("summary-numbers", "jalview",
                               "positive-site-table", "positive-site-list",
                               "count-positive-sites"),
                      help="methods for analysis.")

    parser.add_option("--selection-mode", dest="selection_mode", type="choice",
                      choices=("all", "consistent", "emes"),
                      help="how to select positive sites.")

    parser.add_option("--prefix", dest="prefix", type="string",
                      help="prefix for rows.")

    parser.add_option("--pattern-input-filenames", dest="pattern_input_filenames", type="string",
                      help="input pattern.")

    parser.add_option("--filter-probability", dest="filter_probability", type="float",
                      help="threshold for probability above which to include positive sites [default=%default].")

    parser.add_option("--filter-omega", dest="filter_omega", type="float",
                      help="threshold for omega above which to include positive sites [default=%default].")

    parser.add_option("--models", dest="models", type="string",
                      help="restrict output to set of site specific models.")

    parser.add_option("--analysis", dest="analysis", type="string",
                      help="restrict output to set of analysis [beb|neb].")

    parser.add_option("--significance-threshold", dest="significance_threshold", type="float",
                      help="significance threshold for log-likelihood test.")

    parser.add_option("--filter-mali", dest="filter_mali", type="choice",
                      choices=("none", "gaps"),
                      help="filter by mali to remove gapped positions.")

    parser.add_option("--filename-mali", dest="filename_mali", type="string",
                      help="filename with multiple alignment used for calculating sites - used for filtering")

    parser.add_option("--filename-map-mali", dest="filename_map_mali", type="string",
                      help="filename with multiple alignment to map sites onto.")

    parser.add_option("--jalview-titles", dest="jalview_titles", type="string",
                      help="comma separated list of jalview annotation titles.")

    parser.add_option("--jalview-symbol", dest="jalview_symbol", type="string",
                      help="symbol to use in jalview.")

    parser.set_defaults(
        methods=[],
        prefix=None,
        filter_probability=0,
        filter_omega=0,
        models="",
        analysis="",
        significance_threshold=0.05,
        selection_mode="consistent",
        filename_mali=None,
        filename_map_mali=None,
        jalview_symbol="*",
        jalview_titles="",
        filter_mali=None,
    )

    (options, args) = E.Start(parser, argv=argv)

    # jalview titles default to the input filenames
    if options.jalview_titles:
        options.jalview_titles = options.jalview_titles.split(",")
    else:
        options.jalview_titles = args

    options.models = options.models.split(",")
    options.analysis = options.analysis.split(",")

    for a in options.analysis:
        if a not in ("beb", "neb"):
            raise ValueError(
                "unknown analysis section: '%s', possible values are 'beb' and/or 'neb'" % a)

    for a in options.models:
        if a not in ("8", "2", "3"):
            raise ValueError(
                "unknown model: '%s', possible values are 2, 3, 8" % a)

    codeml = WrapperCodeML.CodeMLSites()

    def filter_f(site):
        """Keep sites at or above the probability and omega thresholds."""
        return (site.mProbability >= options.filter_probability and
                site.mOmega >= options.filter_omega)

    # read multiple results
    results = []
    ninput, noutput, nskipped = 0, 0, 0
    # names of the files that were parsed successfully, parallel to results
    headers = []

    for f in args:
        ninput += 1
        try:
            with open(f, "r") as infile:
                results.append(codeml.parseOutput(infile.readlines()))
        except WrapperCodeML.UsageError:
            if options.loglevel >= 1:
                options.stdlog.write("# no input from %s\n" % f)
            nskipped += 1
            continue
        noutput += 1
        headers.append(f)

    # map of nested model (key) to more general model
    map_nested_models = {'8': '7',
                         '2': '1',
                         '3': '0'}

    if options.filename_mali:
        mali = Mali.Mali()
        with open(options.filename_mali, "r") as infile:
            mali.readFromFile(infile)
    else:
        mali = None

    ###############################################################
    # use multiple alignment to map residues to a reference mali
    # or a sequence.
    ###############################################################
    if options.filename_map_mali:

        if not mali:
            raise ValueError(
                "please supply the input multiple alignment, if residues are to be mapped.")

        def translate(s):
            """Replace the codon sequence of entry *s* by its translation."""
            sequence = s.mString
            s.mString = "".join(
                Genomics.MapCodon2AA(sequence[x:x + 3])
                for x in range(0, len(sequence), 3))

        # translated copy of the input alignment
        tmali = Mali.Mali()
        with open(options.filename_mali, "r") as infile:
            tmali.readFromFile(infile)
        tmali.apply(translate)

        # alignment to map onto; translated only if it is nucleotide
        tmap_mali = Mali.Mali()
        with open(options.filename_map_mali, "r") as infile:
            tmap_mali.readFromFile(infile)

        if tmap_mali.getAlphabet() == "na":
            tmap_mali.apply(translate)

        map_old2new = alignlib_lite.py_makeAlignmentVector()

        mali1 = alignlib_lite.py_makeProfileFromMali(convertMali2Mali(tmali))

        if tmap_mali.getLength() == 1:

            s = tmap_mali.values()[0].mString
            mali2 = alignlib_lite.py_makeSequence(s)
            # see if you can find an identical subsequence and then
            # align to this
            for x in tmali.values():
                if s in re.sub("[- .]+", "", x.mString):
                    mali1 = alignlib_lite.py_makeSequence(x.mString)
                    break
        else:
            mali2 = alignlib_lite.py_makeProfileFromMali(
                convertMali2Mali(tmap_mali))

        alignator = alignlib_lite.py_makeAlignatorDPFull(
            alignlib_lite.py_ALIGNMENT_LOCAL, -10.0, -2.0)
        alignator.align(map_old2new, mali1, mali2)

        consensus = tmap_mali.getConsensus()

        if options.loglevel >= 4:
            options.stdlog.write("# alphabet: %s\n" %
                                 tmap_mali.getAlphabet())
            options.stdlog.write("# orig : %s\n" % tmali.getConsensus())
            options.stdlog.write("# mapped: %s\n" % consensus)
            options.stdlog.write("# alignment: %s\n" % map_old2new.Write())
    else:
        map_old2new = None

    for method in options.methods:

        if method == "summary-numbers":

            options.stdlog.write(
                """# Numbers of positive sites.
#
# The consistent row/column contains positive sites that are significant
# (above thresholds for probability and omega) for all models/analysis
# that have been selected (label: cons).
#
# The log-likelihood ratio test is performed for model pairs, depending
# on the output chosen.
# Significance threshold: %6.4f
# The pairs are 8 versus 7 and 2 versus 1 and 3 versus 0.
#
""" % options.significance_threshold)

            # write header
            if options.prefix:
                options.stdout.write("prefix\t")

            options.stdout.write("method\tnseq\t")
            h = []
            for model in options.models:
                for analysis in options.analysis:
                    h.append("%s%s" % (analysis, model))
                h.append("p%s" % (model))
                h.append("df%s" % (model))
                h.append("chi%s" % (model))
                h.append("lrt%s" % (model))

            options.stdout.write("\t".join(h))
            options.stdout.write("\tcons\tpassed\tfilename\n")

            nmethod = 0

            # per analysis: sites shared by all results and models so far
            consistent_cols = [None] * len(options.analysis)
            passed_tests = {}
            for m in options.models:
                passed_tests[m] = 0

            for result in results:

                # sites shared by all models/analyses of this result
                row_consistent = None

                if options.prefix:
                    # tab-separated, as in the header and the cons row
                    options.stdout.write("%s\t" % (options.prefix))

                options.stdout.write("%i" % nmethod)
                options.stdout.write("\t%i" % (result.mNumSequences))

                npassed = 0

                for model in options.models:

                    sites = result.mSites[model]

                    # do significance test
                    full_model, null_model = model, map_nested_models[model]

                    lrt = Stats.doLogLikelihoodTest(
                        result.mSites[full_model].mLogLikelihood,
                        result.mSites[full_model].mNumParameters,
                        result.mSites[null_model].mLogLikelihood,
                        result.mSites[null_model].mNumParameters,
                        options.significance_threshold)

                    for x, analysis in enumerate(options.analysis):

                        # analysis has been validated to be neb or beb
                        if analysis == "neb":
                            positive = sites.mNEB.mPositiveSites
                        else:
                            positive = sites.mBEB.mPositiveSites

                        s = set(site.mResidue
                                for site in positive if filter_f(site))

                        options.stdout.write("\t%i" % (len(s)))

                        # sites of a model failing the test do not
                        # count towards consistency
                        if not lrt.mPassed:
                            s = set()

                        if row_consistent is None:
                            row_consistent = s
                        else:
                            row_consistent = row_consistent.intersection(s)

                        if consistent_cols[x] is None:
                            consistent_cols[x] = s
                        else:
                            consistent_cols[x] = consistent_cols[
                                x].intersection(s)

                    if lrt.mPassed:
                        c = "passed"
                        passed_tests[model] += 1
                        npassed += 1
                    else:
                        c = "failed"

                    options.stdout.write("\t%5.2e\t%i\t%5.2f\t%s" %
                                         (lrt.mProbability,
                                          lrt.mDegreesFreedom,
                                          lrt.mChiSquaredValue,
                                          c))

                options.stdout.write(
                    "\t%i\t%i\t%s\n" % (len(row_consistent), npassed,
                                        headers[nmethod]))

                nmethod += 1

            # summary row over all results
            if options.prefix:
                options.stdout.write("%s\t" % options.prefix)

            options.stdout.write("cons")

            row_consistent = None
            total_passed = 0
            for model in options.models:

                for x, analysis in enumerate(options.analysis):

                    s = consistent_cols[x]
                    if s is None:
                        s = set()

                    options.stdout.write("\t%i" % (len(s)))

                    if row_consistent is None:
                        row_consistent = s
                    else:
                        row_consistent = row_consistent.intersection(s)

                options.stdout.write("\tna\t%i" % passed_tests[model])
                total_passed += passed_tests[model]

            options.stdout.write(
                "\t%i\t%i\n" % (len(row_consistent), total_passed))

        elif method == "jalview":

            options.stdout.write("JALVIEW_ANNOTATION\n")
            options.stdout.write("# Created: %s\n\n" %
                                 (time.asctime(time.localtime(time.time()))))

            # index into the titles; only advanced for results that
            # produce an annotation line
            x = 0
            for result in results:

                sites, significance = selectPositiveSites(
                    [result], options.selection_mode, options, mali)

                codes = [""] * result.mLength

                if len(sites) == 0:
                    continue

                # sites are used as 1-based positions into codes
                for site in sites:
                    codes[site - 1] = options.jalview_symbol

                options.stdout.write(
                    "NO_GRAPH\t%s\t%s\n" % (options.jalview_titles[x],
                                            "|".join(codes)))
                x += 1

        elif method == "count-positive-sites":

            sites, significance = selectPositiveSites(
                results, options.selection_mode, options, mali)

            options.stdout.write("%i\n" % (len(sites)))

        elif method in ("positive-site-table", ):

            sites, significance = selectPositiveSites(
                results, options.selection_mode, options, mali)

            # separate name: ``headers`` holds the input filenames that
            # the summary-numbers method reports
            columns = ["site", "P"]
            if map_old2new:
                columns.append("mapped")
                columns.append("Pm")

            options.stdout.write("\t".join(columns) + "\n")

            sites = sorted(sites)
            nmapped, nunmapped = 0, 0
            for site in sites:
                values = [site, "%6.4f" % significance[site]]

                if map_old2new:
                    r = map_old2new.mapRowToCol(site)
                    # NOTE(review): 0 is treated as "unmapped" here -
                    # confirm against the alignlib_lite convention.
                    if r == 0:
                        values.append("na")
                        values.append("")
                        nunmapped += 1
                        if options.loglevel >= 2:
                            options.stdlog.write(
                                "# unmapped residue: %i\n" % site)
                    else:
                        values.append(r)
                        values.append(consensus[r - 1])
                        nmapped += 1

                options.stdout.write("\t".join(map(str, (values))) + "\n")

            if options.loglevel >= 1:
                options.stdlog.write("# sites: ninput=%i, noutput=%i, nskipped=%i\n" % (
                    len(sites), nmapped, nunmapped))

    E.info("ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped))

    E.Stop()
def Add(self, const_other,
        combine_contig=False,
        allow_overlap=False,
        contig_size=0,
        combine_queries=False,
        as_intron=False):
    """Add the prediction *const_other* to this prediction.

    Both predictions are copied and expanded before merging, so
    *const_other* itself is not modified.  The merged result is stored
    in *self*.

    combine_contig
        permit joining predictions on different contigs or strands;
        *contig_size* is then used to offset the coordinates of the
        second prediction.
    allow_overlap
        permit predictions that overlap on the same query; the
        overlapping part is removed from the end of this prediction.
    combine_queries
        permit joining predictions of different queries.
    as_intron
        join the two fragments with an intron ("I") instead of a
        protein ("P") segment.

    Raises ValueError if the predictions overlap or differ in query
    or sbjct and the corresponding flag is not set.
    """
    # create working copies of each prediction
    other = const_other.getCopy()
    this = self.getCopy()
    other.Expand()
    this.Expand()

    code = "I" if as_intron else "P"

    # check for query overlaps; stays 0 when the queries differ so that
    # it is always defined for the different-contig join below
    query_overlap = 0
    if this.mQueryToken == other.mQueryToken:

        query_overlap = max(0,
                            min(this.mQueryTo, other.mQueryTo) -
                            max(this.mQueryFrom, other.mQueryFrom) + 1)

        if query_overlap > 0:

            if not allow_overlap:
                raise ValueError(
                    "refusing to add overlapping entries: overlap = %i, queries:\n%s\n%s\n, set allow_overlap = True " %
                    (query_overlap, str(this), str(other)))

            overlap = query_overlap

            # if queries overlap, truncate this before adding the other
            this.mMapPeptide2Translation.removeRowRegion(
                this.mQueryTo - overlap + 1, this.mQueryTo)
            other.mMapPeptide2Translation.moveAlignment(0, -overlap)
            this.mQueryTo -= overlap
            this.mTranslation = this.mTranslation[:-overlap]

            # remove aligned residues from the back
            for x in range(len(this.mMapPeptide2Genome) - 1, 0, -1):
                if this.mMapPeptide2Genome[x][1] <= overlap:
                    overlap -= this.mMapPeptide2Genome[x][1]
                    del this.mMapPeptide2Genome[x]
                else:
                    break

            # shorten the last remaining segment by what is left of
            # the overlap
            this.mMapPeptide2Genome[-1] = (
                this.mMapPeptide2Genome[-1][0],
                this.mMapPeptide2Genome[-1][1] - overlap,
                this.mMapPeptide2Genome[-1][2] - overlap * 3)

    elif not combine_queries:
        raise ValueError(
            "refusing to add different queries - set combine_queries = True.")

    if this.mSbjctToken != other.mSbjctToken or \
            this.mSbjctStrand != other.mSbjctStrand:
        if combine_contig:
            this.mSbjctToken += "-" + other.mSbjctToken
            this.mSbjctStrand += other.mSbjctStrand
        else:
            raise ValueError("can not add different sbjct.")

    sbjct_overlap = max(0,
                        min(this.mSbjctGenomeTo, other.mSbjctGenomeTo) -
                        max(this.mSbjctGenomeFrom, other.mSbjctGenomeFrom))

    if sbjct_overlap > 0:
        if not combine_contig:
            raise ValueError(
                "refusing to add overlapping entries: overlap = %i, sbjct:\n%s\n%s\n" %
                (sbjct_overlap, str(this), str(other)))

    if this.mSbjctToken == other.mSbjctToken:

        # set precedence: the prediction starting first on the genome
        if this.mSbjctGenomeFrom < other.mSbjctGenomeFrom:
            first = this
            second = other
        else:
            first = other
            second = this

        # get length of gap
        d_na = second.mSbjctGenomeFrom - first.mSbjctGenomeTo

        if this.mQueryToken != other.mQueryToken:
            d_aa = first.mQueryLength - first.mQueryTo
            # create a new virtual query by concatenating
            # the two queries
            this.mQueryToken += "-" + other.mQueryToken
            # sort out the alignment
            second.mMapPeptide2Translation.moveAlignment(
                first.mQueryLength, 0)
            this.mQueryLength = first.mQueryLength + second.mQueryLength
        else:
            d_aa = second.mQueryFrom - first.mQueryTo - 1

        this.mSbjctGenomeFrom = min(this.mSbjctGenomeFrom,
                                    other.mSbjctGenomeFrom)
        this.mSbjctGenomeTo = max(this.mSbjctGenomeTo,
                                  other.mSbjctGenomeTo)

        this.mMapPeptide2Genome = first.mMapPeptide2Genome + \
            [(code, d_aa, d_na)] + second.mMapPeptide2Genome
        this.mTranslation = first.mTranslation + second.mTranslation
        second.mMapPeptide2Translation.moveAlignment(0, first.mSbjctTo - 1)

    else:
        # join on different contigs: this always precedes other
        first = this

        d_na = contig_size - this.mSbjctGenomeTo + \
            other.mSbjctGenomeFrom + query_overlap * 3
        d_aa = other.mQueryFrom - this.mQueryTo - 1
        this.mMapPeptide2Genome += [(code, d_aa, d_na), ] + \
            other.mMapPeptide2Genome
        this.mTranslation += other.mTranslation
        other.mMapPeptide2Translation.moveAlignment(0, this.mSbjctTo - 1)

        this.mSbjctGenomeFrom = this.mSbjctGenomeFrom
        this.mSbjctGenomeTo = contig_size + other.mSbjctGenomeTo

    # now fill self from first and this
    # NOTE(review): token and segment list are taken from ``first``
    # while all other fields come from ``this``; when ``first`` is
    # ``other`` these are the un-merged values - confirm intent.
    self.mQueryToken = first.mQueryToken
    self.mQueryLength = this.mQueryLength

    # numbers of aligned positions, used to weight the percentages below
    nthis = this.mMapPeptide2Translation.getLength() - \
        this.mMapPeptide2Translation.getNumGaps()
    nother = other.mMapPeptide2Translation.getLength() - \
        other.mMapPeptide2Translation.getNumGaps()

    self.mMapPeptide2Genome = first.mMapPeptide2Genome
    self.mSbjctGenomeFrom = this.mSbjctGenomeFrom
    self.mSbjctGenomeTo = this.mSbjctGenomeTo

    # there might be some reference counting issues, thus
    # do it the explicit way.
    alignlib_lite.py_addAlignment2Alignment(
        this.mMapPeptide2Translation, other.mMapPeptide2Translation)
    self.mMapPeptide2Translation = alignlib_lite.py_makeAlignmentVector()
    alignlib_lite.py_addAlignment2Alignment(
        self.mMapPeptide2Translation, this.mMapPeptide2Translation)

    self.mTranslation = this.mTranslation

    self.mQueryFrom = self.mMapPeptide2Translation.getRowFrom()
    self.mQueryTo = self.mMapPeptide2Translation.getRowTo()
    self.mSbjctFrom = self.mMapPeptide2Translation.getColFrom()
    self.mSbjctTo = self.mMapPeptide2Translation.getColTo()

    self.mQueryCoverage = 100.0 * \
        (self.mQueryTo - self.mQueryFrom + 1) / float(self.mQueryLength)

    # segments are written as blank separated fields
    self.mAlignmentString = " ".join(
        " ".join(map(str, segment)) for segment in self.mMapPeptide2Genome)

    f = alignlib_lite.py_AlignmentFormatEmissions(
        self.mMapPeptide2Translation)
    self.mQueryAli, self.mSbjctAli = f.mRowAlignment, f.mColAlignment

    # summary parameters
    self.mRank = max(this.mRank, other.mRank)
    self.score += other.score
    self.mNGaps += other.mNGaps
    self.mNFrameShifts += other.mNFrameShifts
    # the junction between the two predictions counts as an intron
    self.mNIntrons += other.mNIntrons + 1
    self.mNStopCodons += other.mNStopCodons

    nnew = self.mMapPeptide2Translation.getLength() - \
        self.mMapPeptide2Translation.getNumGaps()

    # averages weighted by aligned positions, capped at 100
    self.mPercentIdentity = min(
        100.0,
        (self.mPercentIdentity * nthis +
         other.mPercentIdentity * nother) / nnew)
    self.mPercentSimilarity = min(
        100.0,
        (self.mPercentSimilarity * nthis +
         other.mPercentSimilarity * nother) / nnew)

    self.mNAssembled += 1 + other.mNAssembled
def _read_peptide_sequences(filename):
    """Read the sequences stored in *filename* and close the file afterwards.

    Returns whatever ``Genomics.ReadPeptideSequences`` returns for the
    opened file (used by the caller as a mapping identifier -> sequence).
    """
    infile = IOTools.openFile(filename, "r")
    try:
        return Genomics.ReadPeptideSequences(infile)
    finally:
        # the handle used to be leaked; release it as soon as it is read.
        infile.close()


def _classify_difference(s1, s2):
    """Return a label describing how two unequal sequences differ.

    The checks are applied in this order and the first match wins:

    ``first``
        the sequences are identical after dropping the first residue.
    ``prefix``
        the sequences agree over the length of the shorter one.
    ``last``
        the sequences are identical after dropping the last residue.
    ``selenocysteine``
        equal length, and every differing interior position has a ``U``
        in at least one of the two sequences.
    ``masked``
        equal length, and every differing interior position has an ``N``
        or ``X`` in at least one of the two sequences.
    ``other``
        anything else.
    """
    m = min(len(s1), len(s2))

    if s1[1:] == s2[1:]:
        return "first"
    if s1[:m] == s2[:m]:
        return "prefix"
    if s1[:-1] == s2[:-1]:
        return "last"

    if len(s1) == len(s2):
        # Collect all interior differences: the first and last residues
        # can be different for peptide sequences when comparing
        # translations with ensembl peptides, so they are ignored here.
        differences = [(c1, c2)
                       for c1, c2 in zip(s1[1:-1], s2[1:-1])
                       if c1 != c2]

        # An empty list must not match: "all differences are X" is
        # vacuously true when there are no interior differences, which
        # previously mislabelled such pairs as selenocysteine.
        if differences:
            # check for selenocysteines
            if all(c1 == "U" or c2 == "U" for c1, c2 in differences):
                return "selenocysteine"
            # check for masked residues
            if all(c1 in "NX" or c2 in "NX" for c1, c2 in differences):
                return "masked"

    return "other"


def main(argv=None):
    """Compare the sequences of two files by identifier and report differences.

    Exactly two positional arguments (the two sequence files) are
    required.  Identifiers are mapped with ``--pattern1``/``--pattern2``,
    sequences are compared case-insensitively, and depending on
    ``--output`` the differing / missing identifiers are written to
    ``options.stdout``.  Summary counts are logged via ``E.info``.

    Arguments
    ---------
    argv : list, optional
        Command line; defaults to ``sys.argv``.
        NOTE(review): argv is not forwarded to ``E.Start`` here -- confirm
        whether ``E.Start`` should receive it.

    Raises
    ------
    ValueError
        If not exactly two files are given or one of them yields no
        sequences.
    ImportError
        If ``--correct-gap-shift`` is requested but ``alignlib_lite``
        cannot be imported.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id: diff_fasta.py 2781 2009-09-10 11:33:14Z andreas $",
                            usage=globals()["__doc__"])

    parser.add_option("-s", "--correct-gap-shift", dest="correct_shift",
                      action="store_true",
                      help="correct gap length shifts in alignments. Requires alignlib_lite.py_ "
                      "[%default]")

    parser.add_option("-1", "--pattern1", dest="pattern1", type="string",
                      help="pattern to extract identifier from in identifiers1. "
                      "[%default]")

    parser.add_option("-2", "--pattern2", dest="pattern2", type="string",
                      help="pattern to extract identifier from in identifiers2. "
                      "[%default]")

    parser.add_option("-o", "--output", dest="output", type="choice",
                      action="append",
                      choices=("diff", "missed", "seqdiff"),
                      help="what to output [%default]")

    # raw strings: "\S" is a regex escape, not a string escape.
    parser.set_defaults(correct_shift=False,
                        pattern1=r"(\S+)",
                        pattern2=r"(\S+)",
                        output=[])

    (options, args) = E.Start(parser)

    if len(args) != 2:
        raise ValueError("two files needed to compare.")

    if options.correct_shift:
        try:
            import alignlib_lite
        except ImportError:
            raise ImportError(
                "option --correct-gap-shift requires alignlib_lite.py_ but alignlib not found")

    seqs1 = _read_peptide_sequences(args[0])
    seqs2 = _read_peptide_sequences(args[1])

    if not seqs1:
        raise ValueError("first file %s is empty." % (args[0]))
    if not seqs2:
        raise ValueError("second file %s is empty." % (args[1]))

    MapIdentifiers(seqs1, options.pattern1)
    MapIdentifiers(seqs2, options.pattern2)

    nsame = 0
    nmissed1 = 0
    nmissed2 = 0
    ndiff = 0
    nfixed = 0
    # number of differing sequence pairs per difference type
    ndiff_by_status = {"first": 0,
                       "last": 0,
                       "prefix": 0,
                       "selenocysteine": 0,
                       "masked": 0,
                       "other": 0}
    # identifiers of set 1 that were also present in set 2
    found2 = set()

    write_missed1 = "missed" in options.output
    write_missed2 = "missed" in options.output
    write_seqdiff = "seqdiff" in options.output
    # sequence output implies the per-identifier diff header
    write_diff = "diff" in options.output or write_seqdiff

    for k in seqs1:
        if k not in seqs2:
            nmissed1 += 1
            if write_missed1:
                options.stdout.write("---- %s ---- %s\n" % (k, "missed1"))
            continue

        found2.add(k)

        s1 = seqs1[k].upper()
        s2 = seqs2[k].upper()
        m = min(len(s1), len(s2))

        if s1 == s2:
            nsame += 1
            continue

        ndiff += 1
        status = _classify_difference(s1, s2)
        ndiff_by_status[status] += 1

        # correct for different gap lengths
        if options.correct_shift:

            map_a2b = alignlib_lite.py_makeAlignmentVector()

            a, b = 0, 0
            keep = False

            # NOTE(review): x is never advanced, so "x < m" only guards
            # against an empty sequence; the walk ends via break or once
            # both positions have reached the end of their sequence.
            x = 0
            while x < m and not (a == len(s1) and b == len(s2)):
                try:
                    if s1[a] != s2[b]:
                        # skip over a run of "N" that is present in only
                        # one of the two sequences
                        while s1[a] == "N" and s2[b] != "N":
                            a += 1
                        while s1[a] != "N" and s2[b] == "N":
                            b += 1

                        # still different: not a pure gap-length shift
                        if s1[a] != s2[b]:
                            break
                except IndexError:
                    # one sequence ran out before the other
                    options.stdout.write(
                        "# index error for %s: x=%i, a=%i, b=%i, l1=%i, l2=%i\n" %
                        (k, x, a, b, len(s1), len(s2)))
                    break

                a += 1
                b += 1
                # the pair is recorded after both positions were advanced
                map_a2b.addPairExplicit(a, b, 0.0)
            else:
                # loop finished without break: we have reached the end
                keep = True
                nfixed += 1
                f = alignlib_lite.py_AlignmentFormatEmissions(map_a2b)
                options.stdout.write("fix\t%s\t%s\n" % (k, str(f)))

            if not keep:
                options.stdout.write("# warning: not fixable: %s\n" % k)

        if write_diff:
            options.stdout.write("---- %s ---- %s\n" % (k, status))
            if write_seqdiff:
                options.stdout.write("< %s\n> %s\n" % (seqs1[k], seqs2[k]))

    for k in seqs2:
        if k not in found2:
            nmissed2 += 1
            if write_missed2:
                options.stdout.write("---- %s ---- %s\n" % (k, "missed2"))

    options.stdlog.write("""# Legend:
# seqs1: number of sequences in set 1
# seqs2: number of sequences in set 2
# same: number of identical sequences
# diff: number of sequences with differences
# nmissed1: sequences in set 1 that are not found in set 2
# nmissed2: sequences in set 2 that are not found in set 1
# Type of sequence differences
# first: only the first residue is different
# last: only the last residue is different
# prefix: one sequence is prefix of the other
# selenocysteine: difference due to selenocysteines
# masked: difference due to masked residues
# fixed: fixed differences
# other: other differences
""")

    E.info("seqs1=%i, seqs2=%i, same=%i, ndiff=%i, nmissed1=%i, nmissed2=%i" %
           (len(seqs1), len(seqs2), nsame, ndiff, nmissed1, nmissed2))

    # NOTE(review): the reported "other" value is derived by subtraction
    # and also subtracts nfixed; a fix is attempted for every differing
    # pair, so nfixed may overlap with the typed categories -- confirm the
    # intended definition before relying on this number.
    E.info("ndiff=%i: first=%i, last=%i, prefix=%i, selenocysteine=%i, masked=%i, fixed=%i, other=%i" %
           (ndiff,
            ndiff_by_status["first"],
            ndiff_by_status["last"],
            ndiff_by_status["prefix"],
            ndiff_by_status["selenocysteine"],
            ndiff_by_status["masked"],
            nfixed,
            ndiff - ndiff_by_status["first"] - ndiff_by_status["last"] -
            ndiff_by_status["prefix"] - ndiff_by_status["selenocysteine"] -
            ndiff_by_status["masked"] - nfixed))

    E.Stop()
except getopt.error, msg: print globals()["__doc__"], msg sys.exit(2) for o, a in optlist: if o in ("-v", "--verbose"): param_loglevel = int(a) elif o in ("--version",): print "version=" sys.exit(0) elif o in ("-h", "--help"): print globals()["__doc__"] sys.exit(0) alignator = alignlib_lite.py_makeAlignatorDPFull(alignlib_lite.py_ALIGNMENT_LOCAL, param_gop, param_gep) map_query2token = alignlib_lite.py_makeAlignmentVector() for line in sys.stdin: if line[0] == "#": continue query_token, sbjct_token, query_sequence, sbjct_sequence = string.split(line[:-1], "\t") map_query2token.clear() row = alignlib_lite.py_makeSequence(query_sequence) col = alignlib_lite.py_makeSequence(sbjct_sequence) alignator.align(map_query2token, row, col) pidentity = 100.0 * alignlib_lite.py_calculatePercentIdentity(map_query2token, row, col) psimilarity = 100.0 * alignlib_lite.py_calculatePercentSimilarity(map_query2token) print string.join(