def applyMethod(self, neighbours):
    """Realign each match of a query and compare it to its reference alignment.

    For every entry in ``neighbours.mMatches`` (matches of the query against
    itself are skipped) the matched segments of query and sbjct are realigned
    with a global dynamic programming aligner (gap opening -10.0, gap
    extension -1.0). One tab-separated line is written to ``self.mOutfile``
    per match::

        query_token  sbjct_token  nidentical  nblast  nrealigned

    where ``nidentical`` is the value of ``alignlib.getAlignmentIdentity``
    for the realigned versus the reference alignment and ``nblast`` /
    ``nrealigned`` are the numbers of aligned positions in each.
    ``self.mNIdentical`` is incremented when ``nidentical == nblast``,
    otherwise ``self.mNDifferent`` is incremented.

    Raises:
        ValueError: if a match does not carry a reference alignment.
    """
    # NOTE(review): this multiple alignment is built but never read again in
    # this method -- confirm that it can be dropped.
    mali = alignlib.makeMultipleAlignment()
    query_nid = neighbours.mQueryToken
    sequence = self.mFasta.getSequence(query_nid)
    mali.add(alignlib.makeAlignatum(sequence))

    qseq = alignlib.makeSequence(sequence)
    # NOTE(review): the four trailing flags presumably control end-gap
    # penalties -- verify against the alignlib documentation.
    alignator = alignlib.makeAlignatorDPFull(
        alignlib.ALIGNMENT_GLOBAL, -10.0, -1.0, True, True, True, True)

    for n in neighbours.mMatches:
        # skip the match of the query against itself
        if n.mSbjctToken == query_nid:
            continue

        sequence = self.mFasta.getSequence(n.mSbjctToken)
        blast_query2sbjct = n.getAlignment()
        if blast_query2sbjct is None:
            raise ValueError(
                "AddaRealignment.py needs a reference alignment.")

        sseq = alignlib.makeSequence(sequence)
        # restrict both sequences to the segments covered by the match; the
        # query object is shared between iterations and re-segmented here
        qseq.useSegment(n.mQueryFrom, n.mQueryTo)
        sseq.useSegment(n.mSbjctFrom, n.mSbjctTo)

        realign_query2sbjct = alignlib.makeAlignmentVector()
        alignator.align(realign_query2sbjct, qseq, sseq)

        nidentical = alignlib.getAlignmentIdentity(
            realign_query2sbjct, blast_query2sbjct, alignlib.RR)
        nblast = blast_query2sbjct.getNumAligned()
        nrealigned = realign_query2sbjct.getNumAligned()

        self.mOutfile.write(
            "%s\t%s\t%i\t%i\t%i\n" %
            (n.mQueryToken, n.mSbjctToken, nidentical, nblast, nrealigned))

        if nidentical == nblast:
            self.mNIdentical += 1
        else:
            self.mNDifferent += 1
def checkLinkThreshold(self, query_nid, query_from, query_to,
                       sbjct_nid, sbjct_from, sbjct_to):
    """Check whether two domains are homologous.

    The two domain segments are aligned and the alignment score is compared
    against ``self.mMinAlignmentScore``.

    Returns:
        A tuple ``(passed, alignment, ())`` where ``passed`` is True if the
        score is strictly above the threshold and ``alignment`` is the
        alignment that was computed.
    """
    row = self.getAlignandum(query_nid)
    row.useSegment(query_from, query_to)

    col = self.getAlignandum(sbjct_nid)
    col.useSegment(sbjct_from, sbjct_to)

    result = alignlib.makeAlignmentVector()
    # NOTE(review): ``alignator`` is not bound inside this method; presumably
    # it is a module-level aligner -- confirm it is in scope at call time.
    alignator.align(result, row, col)

    summary = (query_nid, sbjct_nid,
               result.getScore(),
               result.getLength(),
               result.getNumGaps(),
               result.getRowFrom(), result.getRowTo(),
               result.getColFrom(), result.getColTo())
    self.debug("--> %i vs %i: score=%5.2f, length=%i, numgaps=%i, row_from=%i, row_to=%i, col_from=%i, col_to=%i" % summary)

    # undo the segment restriction applied above
    row.useFullLength()
    col.useFullLength()

    passed = bool(result.getScore() > self.mMinAlignmentScore)
    return passed, result, ()
def CheckAlignments(peptide_sequences, query_token, other_tokens):
    """Check whether the query aligns to at least one of the other sequences.

    Each token of *other_tokens* that has a sequence in *peptide_sequences*
    is aligned locally (gap opening -10.0, gap extension -1.0) against the
    query. The function returns True as soon as one alignment scores above
    ``param_min_alignment_score``.

    Returns:
        True if the query has no sequence in *peptide_sequences* or if any
        alignment score exceeds the threshold, otherwise False.
    """
    if param_loglevel >= 3:
        print("# checking query %s and sbjcts %s" % (query_token, str(other_tokens)))
        sys.stdout.flush()

    # nothing to compare against: treated as a pass
    if query_token not in peptide_sequences:
        return True

    result = alignlib.makeAlignmentVector()
    alignator = alignlib.makeAlignatorDPFull(alignlib.ALIGNMENT_LOCAL,
                                             -10.0, -1.0)
    row_seq = alignlib.makeSequence(peptide_sequences[query_token])

    for sbjct_token in other_tokens:
        if sbjct_token in peptide_sequences:
            col_seq = alignlib.makeSequence(peptide_sequences[sbjct_token])
            alignator.align(result, row_seq, col_seq)

            if param_loglevel >= 5:
                print("# %s - %s = %f" % (query_token, sbjct_token, result.getScore()))

            if result.getScore() > param_min_alignment_score:
                return True

    return False
def getMapPeptide2Cds(peptide_sequence, cds_sequence, options):
    """Get the map between a peptide sequence and its cds sequence.

    Spaces are removed from the peptide; spaces, ``.`` and ``-`` are removed
    from the cds. The peptide is converted to a wobble sequence with
    ``Genomics.Protein2Wobble`` and aligned to the upper-cased cds with
    ``AlignCodonBased``.

    Args:
        peptide_sequence: peptide sequence as a string.
        cds_sequence: coding sequence as a string, may contain gap characters.
        options: option object providing ``loglevel`` and ``stdlog``; it is
            also passed through to ``AlignCodonBased``.

    Returns:
        The alignment filled in by ``AlignCodonBased``. The returned
        alignment is in nucleotides.

    Raises:
        ValueError: if ``AlignCodonBased`` fails with a ValueError.
    """
    # remove spaces from the protein sequence
    p = re.sub(" ", "", peptide_sequence)
    # remove gaps and spaces from the cds
    c = re.sub("[ .-]", "", cds_sequence)
    w = Genomics.Protein2Wobble(p.upper())

    if options.loglevel >= 6:
        options.stdlog.write("# peptide original (%5i): %s\n" % (len(p), p))
        options.stdlog.write("# cds original (%5i): %s\n" % (len(c), c))
        options.stdlog.write("# wobble sequence (%5i): %s\n" % (len(w), w))
        options.stdlog.flush()

    seq_wobble = alignlib.makeSequence(w)
    seq_cds = alignlib.makeSequence(c.upper())
    seq_peptide = alignlib.makeSequence(p)

    map_p2c = alignlib.makeAlignmentVector()

    try:
        AlignCodonBased(seq_wobble, seq_cds, seq_peptide, map_p2c,
                        options=options)
    except ValueError as msg:
        raise ValueError("mapping error for sequence: %s" % (msg))

    # the docstring promises the alignment, so hand it back to the caller
    return map_p2c
def applyMethod(self, neighbours):
    """Realign each match of a query and compare it to its reference alignment.

    For every entry in ``neighbours.mMatches`` (matches of the query against
    itself are skipped) the matched segments of query and sbjct are realigned
    with a global dynamic programming aligner (gap opening -10.0, gap
    extension -1.0). One tab-separated line is written to ``self.mOutfile``
    per match::

        query_token  sbjct_token  nidentical  nblast  nrealigned

    where ``nidentical`` is the value of ``alignlib.getAlignmentIdentity``
    for the realigned versus the reference alignment and ``nblast`` /
    ``nrealigned`` are the numbers of aligned positions in each.
    ``self.mNIdentical`` is incremented when ``nidentical == nblast``,
    otherwise ``self.mNDifferent`` is incremented.

    Raises:
        ValueError: if a match does not carry a reference alignment.
    """
    # NOTE(review): this multiple alignment is built but never read again in
    # this method -- confirm that it can be dropped.
    mali = alignlib.makeMultipleAlignment()
    query_nid = neighbours.mQueryToken
    sequence = self.mFasta.getSequence(query_nid)
    mali.add(alignlib.makeAlignatum(sequence))

    qseq = alignlib.makeSequence(sequence)
    # NOTE(review): the four trailing flags presumably control end-gap
    # penalties -- verify against the alignlib documentation.
    alignator = alignlib.makeAlignatorDPFull(
        alignlib.ALIGNMENT_GLOBAL, -10.0, -1.0, True, True, True, True)

    for n in neighbours.mMatches:
        # skip the match of the query against itself
        if n.mSbjctToken == query_nid:
            continue

        sequence = self.mFasta.getSequence(n.mSbjctToken)
        blast_query2sbjct = n.getAlignment()
        if blast_query2sbjct is None:
            raise ValueError(
                "AddaRealignment.py needs a reference alignment.")

        sseq = alignlib.makeSequence(sequence)
        # restrict both sequences to the segments covered by the match; the
        # query object is shared between iterations and re-segmented here
        qseq.useSegment(n.mQueryFrom, n.mQueryTo)
        sseq.useSegment(n.mSbjctFrom, n.mSbjctTo)

        realign_query2sbjct = alignlib.makeAlignmentVector()
        alignator.align(realign_query2sbjct, qseq, sseq)

        nidentical = alignlib.getAlignmentIdentity(
            realign_query2sbjct, blast_query2sbjct, alignlib.RR)
        nblast = blast_query2sbjct.getNumAligned()
        nrealigned = realign_query2sbjct.getNumAligned()

        self.mOutfile.write(
            "%s\t%s\t%i\t%i\t%i\n" %
            (n.mQueryToken, n.mSbjctToken, nidentical, nblast, nrealigned))

        if nidentical == nblast:
            self.mNIdentical += 1
        else:
            self.mNDifferent += 1
def GetMap(self):
    """Return the alignment map between the two segments.

    The map is built from the stored start positions and alignment strings
    (``mAlignmentFrom1``/``mAlignment1`` and ``mAlignmentFrom2``/
    ``mAlignment2``).

    Returns:
        A new alignment object, or None if either start position is falsy.
    """
    # NOTE(review): a start position of 0 also takes this branch -- confirm
    # that 0 is never a valid start coordinate here.
    if not (self.mAlignmentFrom1 and self.mAlignmentFrom2):
        return None

    map_a2b = alignlib.makeAlignmentVector()
    emissions = alignlib.AlignmentFormatEmissions(
        self.mAlignmentFrom1, self.mAlignment1,
        self.mAlignmentFrom2, self.mAlignment2)
    emissions.copy(map_a2b)
    return map_a2b
def fillFromTable(self, table_row):
    """Fill this entry from one row of a predictions table.

    The first 25 fields are assigned, in order, to the prediction attributes
    (``mPredictionId`` ... ``mAlignmentString``). If the row has 26 or more
    fields, field 26 is stored in ``mNAssembled``; any further fields are
    ignored. If ``self.mExpand`` is set, the alignment objects
    ``mMapPeptide2Translation`` and ``mMapPeptide2Genome`` are rebuilt from
    the alignment strings.

    Args:
        table_row: sequence with at least 25 fields.

    Raises:
        ValueError: if the row has fewer than 25 fields.
    """
    field_names = (
        "mPredictionId", "mQueryToken", "mSbjctToken", "mSbjctStrand",
        "mRank", "score",
        "mQueryFrom", "mQueryTo", "mQueryAli",
        "mSbjctFrom", "mSbjctTo", "mSbjctAli",
        "mQueryLength", "mQueryCoverage",
        "mNGaps", "mNFrameShifts", "mNIntrons", "mNSplits", "mNStopCodons",
        "mPercentIdentity", "mPercentSimilarity",
        "mTranslation",
        "mSbjctGenomeFrom", "mSbjctGenomeTo",
        "mAlignmentString",
    )

    nfields = len(table_row)
    if nfields < len(field_names):
        # previously referenced an undefined name and raised NameError
        raise ValueError("unknown format: %i fields" % nfields)

    for name, value in zip(field_names, table_row):
        setattr(self, name, value)

    # the optional 26th column holds the assembly count
    if nfields > len(field_names):
        self.mNAssembled = table_row[len(field_names)]

    if self.mExpand:
        self.mMapPeptide2Translation = alignlib.makeAlignmentVector()

        if self.mQueryAli != "" and self.mSbjctAli != "":
            alignlib.AlignmentFormatEmissions(
                self.mQueryFrom, self.mQueryAli,
                self.mSbjctFrom, self.mSbjctAli).copy(
                    self.mMapPeptide2Translation)

        self.mMapPeptide2Genome = Genomics.String2Alignment(
            self.mAlignmentString)
def AlignPair(pair, anchor=0):
    """Align a pair of introns.

    The two sequences of *pair* are aligned with the method selected by the
    module-level ``param_method``. On success the aligned sequences,
    coordinates and summary statistics are stored back into *pair*.

    Args:
        pair: object providing ``mToken1/2``, ``mIntronId1/2`` and
            ``mAlignedSequence1/2``; it is updated in place.
        anchor: number of ``A`` residues padded onto both ends of both
            sequences before aligning. The padded regions are removed from
            the alignment again afterwards.

    Returns:
        True if a non-empty alignment was obtained, False otherwise.

    Raises:
        NotImplementedError: if ``param_method`` is ``"clusaligned"``.
    """
    map_intron_a2b = alignlib.makeAlignmentVector()

    if param_loglevel >= 1:
        print("# aligning %s-%i with %s-%i: lengths %i and %i" % (pair.mToken1, pair.mIntronId1,
                                                                 pair.mToken2, pair.mIntronId2,
                                                                 len(pair.mAlignedSequence1),
                                                                 len(pair.mAlignedSequence2)))
        sys.stdout.flush()

    padding = "A" * anchor
    s1 = padding + pair.mAlignedSequence1 + padding
    s2 = padding + pair.mAlignedSequence2 + padding

    if param_method == "dialigned":
        dialign.Align(s1, s2, map_intron_a2b)
    elif param_method == "dialignedlgs":
        dialignlgs.Align(s1, s2, map_intron_a2b)
    elif param_method == "dbaligned":
        dba.Align(s1, s2, map_intron_a2b)
    elif param_method == "clusaligned":
        # the clustal call that used to follow this raise was unreachable
        raise NotImplementedError("clustalw wrapper not up-to-date")

    if anchor:
        # strip the anchor columns and shift the alignment back
        map_intron_a2b.removeRowRegion(
            anchor + len(pair.mAlignedSequence1) + 1,
            map_intron_a2b.getRowTo())
        map_intron_a2b.removeRowRegion(1, anchor)
        map_intron_a2b.removeColRegion(
            anchor + len(pair.mAlignedSequence2) + 1,
            map_intron_a2b.getColTo())
        map_intron_a2b.removeColRegion(1, anchor)
        map_intron_a2b.moveAlignment(-anchor, -anchor)

    if map_intron_a2b.getLength() == 0:
        if param_loglevel >= 1:
            print("# Error: empty intron alignment")
        return False

    seq1 = alignlib.makeSequence(pair.mAlignedSequence1)
    seq2 = alignlib.makeSequence(pair.mAlignedSequence2)

    data = alignlib.AlignmentFormatExplicit(map_intron_a2b, seq1, seq2)

    pair.mFrom1, pair.mAlignedSequence1, pair.mTo1 = \
        data.mRowFrom, data.mRowAlignment, data.mRowTo
    pair.mFrom2, pair.mAlignedSequence2, pair.mTo2 = \
        data.mColFrom, data.mColAlignment, data.mColTo
    pair.mMethod = param_method

    pair.mNumGaps = map_intron_a2b.getNumGaps()
    pair.mLength = map_intron_a2b.getLength()
    pair.mAligned = pair.mLength - pair.mNumGaps

    if param_loglevel >= 2:
        print("# alignment success %s %s" % (pair.mAlignedSequence1,
                                             pair.mAlignedSequence2))

    return True
def buildAlignment(self):
    """Build the sbjct-to-query alignment for a match.

    ``mBlockSizes``, ``mQueryBlockStarts`` and ``mSbjctBlockStarts`` are
    comma-separated integer lists; the element after the last comma is
    dropped. One diagonal per block is added to a new alignment that is
    stored in ``self.mMapSbjct2Query``.
    """
    def _as_ints(field):
        return [int(value) for value in field.split(",")[:-1]]

    sizes = _as_ints(self.mBlockSizes)
    query_starts = _as_ints(self.mQueryBlockStarts)
    sbjct_starts = _as_ints(self.mSbjctBlockStarts)

    alignment = alignlib.makeAlignmentVector()
    self.mMapSbjct2Query = alignment

    for block, size in enumerate(sizes):
        sbjct_start = sbjct_starts[block]
        # diagonal = offset between query and sbjct start of this block
        alignment.addDiagonal(sbjct_start,
                              sbjct_start + size,
                              query_starts[block] - sbjct_start)
def buildAlignment(self):
    """Build the sbjct-to-query alignment for a match.

    ``mBlockSizes``, ``mQueryBlockStarts`` and ``mSbjctBlockStarts`` are
    comma-separated integer lists; the element after the last comma is
    dropped. One diagonal per block is added to a new alignment that is
    stored in ``self.mMapSbjct2Query``.
    """
    def _as_ints(field):
        return [int(value) for value in field.split(",")[:-1]]

    sizes = _as_ints(self.mBlockSizes)
    query_starts = _as_ints(self.mQueryBlockStarts)
    sbjct_starts = _as_ints(self.mSbjctBlockStarts)

    alignment = alignlib.makeAlignmentVector()
    self.mMapSbjct2Query = alignment

    for block, size in enumerate(sizes):
        sbjct_start = sbjct_starts[block]
        # diagonal = offset between query and sbjct start of this block
        alignment.addDiagonal(sbjct_start,
                              sbjct_start + size,
                              query_starts[block] - sbjct_start)
def getCopy(self):
    """Return a new copy of this prediction.

    All scalar attributes are copied onto a fresh ``Prediction``. If
    ``self.mExpand`` is set, the peptide-to-translation alignment is copied
    with ``alignlib.copyAlignment`` and the peptide-to-genome alignment is
    rebuilt from the alignment string; otherwise both are None on the copy.

    This entry itself is left untouched.
    """
    new_entry = Prediction()

    copied_attributes = (
        "mExpand",
        "mPredictionId",
        "mQueryToken", "mQueryFrom", "mQueryTo",
        "mSbjctToken", "mSbjctStrand", "mSbjctFrom", "mSbjctTo",
        "mRank", "score",
        "mQueryLength", "mQueryCoverage",
        "mNGaps", "mNFrameShifts", "mNIntrons", "mNSplits", "mNStopCodons",
        "mPercentIdentity", "mPercentSimilarity",
        "mTranslation",
        "mSbjctGenomeFrom", "mSbjctGenomeTo",
        "mAlignmentString",
        "mQueryAli", "mSbjctAli",
    )
    for name in copied_attributes:
        setattr(new_entry, name, getattr(self, name))
    # NOTE(review): mNAssembled is not copied -- confirm whether that is
    # intentional.

    if self.mExpand:
        new_entry.mMapPeptide2Translation = alignlib.makeAlignmentVector()
        alignlib.copyAlignment(new_entry.mMapPeptide2Translation,
                               self.mMapPeptide2Translation)
        new_entry.mMapPeptide2Genome = Genomics.String2Alignment(
            new_entry.mAlignmentString)
    else:
        # only the copy is reset; the former chained assignment also
        # overwrote the maps of the entry being copied
        new_entry.mMapPeptide2Translation = None
        new_entry.mMapPeptide2Genome = None

    return new_entry
def getParts(src):
    '''split a wrap-around alignment.

    Walks the rows of *src* in increasing order and starts a new part
    whenever the mapped column does not increase with respect to the
    previously mapped column. Rows mapping to a negative column are
    skipped.

    Returns a list of alignments, one per part.
    '''
    parts = []
    current = None
    previous_col = src.getColTo()

    for row in range(src.getRowFrom(), src.getRowTo()):
        col = src.mapRowToCol(row)
        if col < 0:
            continue

        if col <= previous_col:
            # column went backwards (or stalled): close the running part
            if current:
                parts.append(current)
            current = alignlib.makeAlignmentVector()

        previous_col = col
        current.addPair(col, row, 0)

    if current:
        parts.append(current)

    return parts
def getParts(src):
    '''split a wrap-around alignment.

    Walks the rows of *src* in increasing order and starts a new part
    whenever the mapped column does not increase with respect to the
    previously mapped column. Rows mapping to a negative column are
    skipped.

    Returns a list of alignments, one per part.
    '''
    parts = []
    current = None
    previous_col = src.getColTo()

    for row in range(src.getRowFrom(), src.getRowTo()):
        col = src.mapRowToCol(row)
        if col < 0:
            continue

        if col <= previous_col:
            # column went backwards (or stalled): close the running part
            if current:
                parts.append(current)
            current = alignlib.makeAlignmentVector()

        previous_col = col
        current.addPair(col, row, 0)

    if current:
        parts.append(current)

    return parts
def __init__(self, expand = 1):
    """Create an empty prediction entry.

    All numeric fields start at 0 and all string fields at "". If *expand*
    is true, ``mMapPeptide2Translation`` is a new alignment object and
    ``mMapPeptide2Genome`` an empty list; otherwise both are None.
    """
    self.mExpand = expand

    numeric_fields = (
        "mPredictionId",
        "mQueryToken", "mQueryFrom", "mQueryTo",
        "mSbjctToken", "mSbjctStrand", "mSbjctFrom", "mSbjctTo",
        "mRank", "score",
        "mQueryLength", "mQueryCoverage",
        "mNGaps", "mNFrameShifts", "mNIntrons", "mNSplits", "mNStopCodons",
        "mPercentIdentity", "mPercentSimilarity",
        "mSbjctGenomeFrom", "mSbjctGenomeTo",
        "mNAssembled",
    )
    for name in numeric_fields:
        setattr(self, name, 0)

    for name in ("mTranslation", "mAlignmentString",
                 "mQueryAli", "mSbjctAli"):
        setattr(self, name, "")

    # alignment objects are only allocated on request
    if not self.mExpand:
        self.mMapPeptide2Translation = None
        self.mMapPeptide2Genome = None
    else:
        self.mMapPeptide2Translation = alignlib.makeAlignmentVector()
        self.mMapPeptide2Genome = []
def buildMali(self, query_nid, neighbours):
    """Build a multiple alignment from a set of neighbours.

    The query sequence is the first row. At most ``self.mMaxNumNeighbours``
    neighbours are considered. A neighbour is skipped if it is the query
    itself, if its E-value is above ``self.mMaxEvalue``, if its alignment is
    empty or out of bounds, or if alignlib fails to add it. Neighbours
    without a stored alignment are aligned locally (gap opening -10, gap
    extension -2) on the matched segments.

    Args:
        query_nid: identifier of the query sequence in ``self.mFasta``.
        neighbours: sliceable collection of neighbour records.

    Returns:
        The multiple alignment.
    """
    mali = alignlib.makeMultipleAlignment()

    query_sequence = self.mFasta.getSequence(query_nid)
    mali.add(alignlib.makeAlignatum(query_sequence))

    qseq = alignlib.makeSequence(query_sequence)
    alignator = alignlib.makeAlignatorDPFull(alignlib.ALIGNMENT_LOCAL,
                                             -10, -2)

    # NOTE(review): nskipped is counted but not reported in this function.
    nskipped = 0

    for n in neighbours[:self.mMaxNumNeighbours]:

        if n.mSbjctToken == query_nid:
            continue

        if n.mEvalue > self.mMaxEvalue:
            nskipped += 1
            continue

        sequence = self.mFasta.getSequence(n.mSbjctToken)

        E.debug("adding %s" % str(n))

        map_query2sbjct = n.getAlignment()

        if map_query2sbjct is None:
            # no stored alignment: realign the matched segments
            sseq = alignlib.makeSequence(sequence)
            qseq.useSegment(n.mQueryFrom, n.mQueryTo)
            sseq.useSegment(n.mSbjctFrom, n.mSbjctTo)
            map_query2sbjct = alignlib.makeAlignmentVector()
            alignator.align(map_query2sbjct, qseq, sseq)

        if map_query2sbjct.getLength() == 0:
            self.warn("empty alignment: %s" % str(n))
            nskipped += 1
            continue

        if map_query2sbjct.getRowTo() > len(query_sequence):
            self.warn(
                "alignment out of bounds for query: %i>%i, line=%s" %
                (map_query2sbjct.getRowTo(), len(query_sequence), str(n)))
            nskipped += 1
            continue
        elif map_query2sbjct.getColTo() > len(sequence):
            self.warn(
                "alignment out of bounds for sbjct: %i>%i, line=%s" %
                (map_query2sbjct.getColTo(), len(sequence), str(n)))
            nskipped += 1
            continue

        try:
            mali.add(alignlib.makeAlignatum(sequence),
                     map_query2sbjct,
                     mali_is_in_row=True,
                     insert_gaps_mali=False,
                     insert_gaps_alignatum=True,
                     use_end_mali=True,
                     use_end_alignatum=False)
        except RuntimeError as msg:
            self.warn("problem when building alignment for %s: msg=%s" % (str(n), msg))
            nskipped += 1
            continue

    # callers use the result of this method, so the alignment is returned
    return mali
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads two multiple alignments, converts each into a profile, aligns the
    two profiles and writes the merged multiple alignment to
    ``options.stdout``.
    """
    argv = argv or sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: cgat_script_template.py 2871 2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-o", "--gop", dest="gop", type="float",
                      help="gap opening penalty [default=%default].")

    parser.add_option("-e", "--gep", dest="gep", type="float",
                      help="gap extension penalty [default=%default].")

    parser.add_option("-m", "--mode", dest="mode", type="choice",
                      choices=("global", "local"),
                      help="alignment mode, global=nw, local=sw [default=%default].")

    parser.set_defaults(
        gop=-12.0,
        gep=-2.0,
        format="fasta",
        mode="local",
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    if len(args) != 2:
        raise ValueError("please supply two multiple alignments in FASTA format.")

    first_mali = Mali.Mali()
    second_mali = Mali.Mali()

    E.info("read 2 multiple alignments")

    first_mali.readFromFile(IOTools.openFile(args[0], "r"),
                            format=options.format)
    second_mali.readFromFile(IOTools.openFile(args[1], "r"),
                             format=options.format)

    first_cmali = Mali.convertMali2Alignlib(first_mali)
    second_cmali = Mali.convertMali2Alignlib(second_mali)

    if options.mode == "local":
        mode = alignlib.ALIGNMENT_LOCAL
    elif options.mode == "global":
        mode = alignlib.ALIGNMENT_GLOBAL

    alignator = alignlib.makeAlignatorDPFull(mode,
                                             options.gop, options.gep)

    # library-wide defaults used when the profiles are built
    alignlib.setDefaultEncoder(alignlib.getEncoder(alignlib.Protein20))
    alignlib.setDefaultLogOddor(alignlib.makeLogOddorDirichlet(0.3))
    alignlib.setDefaultRegularizor(
        alignlib.makeRegularizorDirichletPrecomputed())

    first_profile = alignlib.makeProfile(first_cmali)
    second_profile = alignlib.makeProfile(second_cmali)

    result = alignlib.makeAlignmentVector()
    alignator.align(result, first_profile, second_profile)

    E.debug("result=\n%s" % alignlib.AlignmentFormatEmissions(result))

    # merge the second alignment into the first along the profile alignment
    first_cmali.add(second_cmali, result)

    identifiers = first_mali.getIdentifiers() + second_mali.getIdentifiers()
    outmali = Mali.convertAlignlib2Mali(first_cmali,
                                        identifiers=identifiers)

    outmali.writeToFile(options.stdout, format=options.format)

    # write footer and output benchmark information.
    E.Stop()
def main( argv = None ):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads predictions line by line from stdin, maps their sbjct coordinates
    onto a new assembly (either via per-sbjct alignments or via segment
    offsets read from ``--filename-map``), optionally checks that the
    translation is unchanged on the new genome, and writes the mapped
    predictions to ``options.stdout``. Summary counts are written to
    ``options.stdlog`` at the end.
    """

    # NOTE(review): argv is defaulted here but is not passed to E.Start below.
    if argv == None: argv = sys.argv

    # NOTE(review): ``parser`` is not created inside this function;
    # presumably it is defined elsewhere in the file -- confirm.
    parser.add_option("-m", "--filename-map", dest="filename_map", type="string",
                      help="filename with mapping information.")
    parser.add_option("-o", "--pattern-old", dest="pattern_old", type="string",
                      help="pattern for mapping new to old identifiers: extract string from old.")
    parser.add_option("-n", "--pattern-new", dest="pattern_new", type="string",
                      help="pattern for mapping new to old identifiers: put string into new.")
    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="genome_file.")
    parser.add_option("-p", "--peptides", dest="filename_peptides", type = "string",
                      help="filename with peptide sequences.")
    parser.add_option("-f", "--input-format", dest="input_format", type="choice",
                      help="format of mapping file", choices=("alignment", "offsets") )
    parser.add_option("-i", "--write-missed", dest="write_missed", type="string",
                      help="write missed identifiers to separate file.")
    parser.add_option("-a", "--filename-genes", dest="filename_genes", type="string",
                      help="filename with gene information.")
    parser.add_option("--filename-old-peptides", dest="filename_old_peptides", type="string",
                      help="filename with old peptide information.")
    parser.add_option("--no-renumber", dest="renumber", action="store_false",
                      help="do not renumber predictions.")
    parser.add_option("--contig-sizes-old", dest="contig_sizes_old", type="string",
                      help="contig sizes for old data.")
    parser.add_option("--contig-sizes-new", dest="contig_sizes_new", type="string",
                      help="contig sizes for new data.")
    parser.add_option("--skip-errors", dest="skip_errors", action="store_true",
                      help="skip entries with errors.")

    parser.set_defaults(
        filename_map = None,
        pattern_old = "(.+)",
        pattern_new = "%s",
        genome_file = None,
        filename_peptides = None,
        write_missed = None,
        filename_genes = None,
        filename_old_peptides = None,
        renumber = True,
        input_format = "alignment",
        contig_sizes_old = None,
        contig_sizes_new = None,
        skip_errors = None )

    (options, args) = E.Start( parser, add_pipe_options = True)

    predictor = PredictorExonerate()

    ## the different mapping criteria
    map_sbjcts = {}
    breakpoints = {}

    ############################################################
    ## read the transcript -> gene map (first two tab-separated columns
    ## are gene and transcript; lines starting with "#" are skipped)
    map_transcript2gene = {}

    if options.filename_genes:
        infile = open(options.filename_genes, "r")
        for gene, transcript in map( lambda x: x[:-1].split("\t")[:2], filter( lambda x: x[0] != "#", infile.readlines())):
            map_transcript2gene[transcript] = gene
        infile.close()

    ############################################################
    ## read the new peptide sequences
    peptides = {}
    if options.filename_peptides:
        peptides = Genomics.ReadPeptideSequences( open(options.filename_peptides, "r"))
        options.stdlog.write( "# read %i peptide sequences.\n" % len(peptides))

    ############################################################
    ## read old query sequences and compare against new query sequences
    ## this can be used to build a map between old and new queries
    query_map_old2new = {}
    if options.filename_old_peptides:
        old_peptides = Genomics.ReadPeptideSequences( open(options.filename_old_peptides, "r"))
        options.stdlog.write( "# read %i old peptide sequences.\n" % len(old_peptides))
        query_map_old2new, unmappable, unmapped = Genomics.MapSequences( old_peptides, peptides)
        options.stdlog.write( "# built map: unmappable=%i unmapped=%i.\n" % (len(unmappable), len(unmapped)))
        if options.loglevel >= 2:
            options.stdlog.write( "# unmappable: %s.\n" % ";".join(unmappable))
            options.stdlog.write( "# unmapped: %s.\n" % ";".join(unmapped))

    ############################################################
    ## read old/new contig sizes for mapping positive/negative coordinates
    contig_sizes_old = {}
    contig_sizes_new = {}
    if options.contig_sizes_old:
        contig_sizes_old = Genomics.ReadContigSizes( open(options.contig_sizes_old, "r") )
    if options.contig_sizes_new:
        contig_sizes_new = Genomics.ReadContigSizes( open(options.contig_sizes_new, "r") )

    ############################################################
    ## read the mapping information
    if options.filename_map:

        infile = open(options.filename_map)
        # NOTE(review): the declared choices for --input-format are
        # "alignment" and "offsets", but this branch tests for "alignments";
        # as written it cannot be selected from the command line -- confirm.
        if options.input_format == "alignments":
            for line in infile:
                if line[0] == "#": continue
                x, old_token, old_from, old_to, old_ali, new_from, new_to, new_ali = line[:-1].split("\t")
                map_sbjcts[old_token] = (old_from, old_ali, new_from, new_ali)

            if options.loglevel >= 1:
                options.stdlog.write( "# read %i alignments.\n" % len(map_sbjcts))

        elif options.input_format == "offsets":
            ## input is a list of segments and their offsets.
            breakpoints, endpoints, offsets = ReadOffsets( infile )
            if options.loglevel >= 1:
                options.stdlog.write( "# read breakpoints for %i chromosomes.\n" % len(breakpoints))

        infile.close()

    ############################################################
    ## end of input section
    ############################################################

    rx = re.compile(options.pattern_old)
    last_sbjct_token = None
    ninput = 0
    # NOTE(review): nerrors is reported in the summary but never incremented.
    nerrors = 0
    nerrors_map = 0
    nerrors_inconsistencies = 0
    nerrors_boundaries = 0
    nerrors_translation = 0
    nerrors_inconsequential = 0
    nerrors_realigned = 0
    nmapped = 0
    nfiltered = 0
    naligned = 0
    noutput = 0
    found_transcripts = {}
    nduplicates = 0
    # NOTE(review): nothing is ever stored in ``output`` in this function, so
    # the duplicate check further down can never trigger -- confirm.
    output = {}

    for line in sys.stdin:
        if line[0] == "#": continue

        entry = PredictionParser.PredictionParserEntry()

        entry.Read( line )

        ninput += 1
        is_positive = entry.mSbjctStrand == "+"

        is_error = False

        ## check if query token is mappable: using sequence map
        if (query_map_old2new and entry.mQueryToken not in query_map_old2new):
            options.stdlog.write("# skipping prediction %i: obsolete query %s\n" % (entry.mPredictionId, entry.mQueryToken) )
            nfiltered += 1
            continue
        else:
            ## check if query token is mappable: using filter
            if (peptides and entry.mQueryToken not in peptides):
                options.stdlog.write("# skipping prediction %i: obsolete query %s\n" % (entry.mPredictionId, entry.mQueryToken) )
                nfiltered += 1
                continue

        # new sbjct name: first group of pattern_old substituted into pattern_new
        new_sbjct_token = options.pattern_new % rx.search(entry.mSbjctToken).groups()[0]

        ############################################################
        ## Map via alignments
        if entry.mSbjctToken in map_sbjcts:
            nmapped += 1
            # the alignment object is only rebuilt when the sbjct changes
            if last_sbjct_token != entry.mSbjctToken:
                old_from, old_ali, new_from, new_ali = map_sbjcts[entry.mSbjctToken]
                map_a2b = alignlib.makeAlignmentVector()
                alignlib.AlignmentFormatExplicit(
                    int(old_from), old_ali,
                    int(new_from), new_ali).copy( map_a2b )

            last_sbjct_token = entry.mSbjctToken

            if options.loglevel >= 3:
                print "#", str(entry)
                print "#", map_sbjcts[entry.mSbjctToken]
                sys.stdout.flush()

            old_f, old_t = entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo

            ## convert to forward coordinates:
            if is_positive:
                f, t= old_f, old_t
                first_res, last_res = f + 1, t
            else:
                f, t = map_a2b.getRowTo() - old_f, map_a2b.getRowTo() - old_t
                first_res, last_res = f, t + 1

            ## map first and last residues
            mfirst_res = map_a2b.mapRowToCol( first_res )
            mlast_res = map_a2b.mapRowToCol( last_res )

            if (mfirst_res == 0 and old_f != 0) or (mlast_res == 0 and old_t != map_a2b.getRowTo() ):

                options.stderr.write("# mapping not possible for prediction %i on %s %s:%i-%i -> %i-%i -> %i-%i -> %i-%i -> %i-%i\n" % \
                                     (entry.mPredictionId, entry.mSbjctToken, entry.mSbjctStrand,
                                      old_f, old_t,
                                      f, t,
                                      first_res, last_res,
                                      mfirst_res, mlast_res,
                                      f, t))

                options.stderr.write("# %s\n" % str(map_sbjcts[entry.mSbjctToken]))
                options.stderr.write("# %s\n" % str(entry))
                options.stderr.flush()
                nerrors_boundaries += 1
                is_error = True

                ## get extended boundaries for alignment later on
                while mfirst_res == 0 and first_res > 1:
                    first_res -= 1
                    mfirst_res = map_a2b.mapRowToCol(first_res)
                while mlast_res == 0 and last_res < map_a2b.getRowTo():
                    last_res += 1
                    mlast_res = map_a2b.mapRowToCol(last_res)

            ## convert to genomic coordinates
            ## convert negative strand coordinates
            if is_positive:
                new_f = mfirst_res - 1
                new_t = mlast_res
            else:
                new_f = mfirst_res
                new_t = mlast_res - 1

                new_f = map_a2b.getColTo() - new_f
                new_t = map_a2b.getColTo() - new_t

            ## Now map the alignment.
            try:
                MapAlignment( entry, map_a2b )

            except ValueError:
                options.stderr.write("# alignment mapping not possible for prediction %i on %s %s:%i-%i -> %i-%i -> %i-%i -> %i-%i -> %i-%i -> %i-%i\n" % \
                                     (entry.mPredictionId, entry.mSbjctToken, entry.mSbjctStrand,
                                      old_f, old_t,
                                      f, t,
                                      first_res, last_res,
                                      mfirst_res, mlast_res,
                                      new_f, new_t,
                                      entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo))
                options.stderr.write("# %s\n" % str(map_sbjcts[entry.mSbjctToken]))
                options.stderr.flush()
                nerrors_map += 1
                is_error= True

            # boundaries mapped directly must agree with the mapped alignment
            if new_f != entry.mSbjctGenomeFrom or new_t != entry.mSbjctGenomeTo:
                options.stderr.write("# mapping inconsistency for prediction %i on %s %s:%i-%i -> %i-%i -> %i-%i -> %i-%i -> %i-%i <> %i-%i\n" % \
                                     (entry.mPredictionId, entry.mSbjctToken, entry.mSbjctStrand,
                                      old_f, old_t,
                                      f, t,
                                      first_res, last_res,
                                      mfirst_res, mlast_res,
                                      new_f, new_t,
                                      entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo))

                nerrors_inconsistencies += 1
                is_error = True

        ############################################################
        ## Map via offsets
        if entry.mSbjctToken in breakpoints:

            old_f, old_t = entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo

            ## convert to forward coordinates:
            if is_positive:
                f, t= old_f, old_t
            else:
                f, t = contig_sizes_old[entry.mSbjctToken] - old_t, contig_sizes_old[entry.mSbjctToken] - old_f

            o1 = GetOffset( f,
                            breakpoints[entry.mSbjctToken],
                            endpoints[entry.mSbjctToken],
                            offsets[entry.mSbjctToken] )
            o2 = GetOffset( t,
                            breakpoints[entry.mSbjctToken],
                            endpoints[entry.mSbjctToken],
                            offsets[entry.mSbjctToken] )

            # different offsets for start and end are counted as a mapping error
            if o1 != o2:
                options.stderr.write("# break within gene %s\n" % str(entry))
                nerrors_map += 1
                is_error = True

            f += o1
            t += o2

            if not is_positive:
                f, t = contig_sizes_new[entry.mSbjctToken] - t, contig_sizes_new[entry.mSbjctToken] - f

            entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo = f, t

            if entry.mSbjctGenomeFrom > entry.mSbjctGenomeTo:
                options.stderr.write("# mapping error: start after end %s\n" % str(entry))
                nerrors_map += 1
                is_error = True

        ############################################################
        ## do translation check, if genome is given
        if options.genome_file:
            genomic_sequence = Genomics.GetGenomicSequence( new_sbjct_token, entry.mSbjctStrand,
                                                            entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo,
                                                            options.genome_file,
                                                            loglevel = 0)

            map_peptide2translation, translation = Genomics.Alignment2PeptideAlignment( \
                entry.mMapPeptide2Genome, entry.mQueryFrom, 0, genomic_sequence )

            # translations are compared with all "X" characters removed
            if re.sub("X", "", translation) != re.sub("X", "", entry.mTranslation):
                # NOTE(review): old_f, old_t, f and t are only bound when one
                # of the two mapping sections above ran for this entry.
                options.stderr.write("# translation error for prediction %i on %s %s:%i-%i -> %i-%i <> %i-%i\n" % \
                                     (entry.mPredictionId, entry.mSbjctToken, entry.mSbjctStrand,
                                      old_f, old_t,
                                      f, t,
                                      entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo))
                if map_sbjcts:
                    options.stderr.write("# %s\n" % str(map_sbjcts[entry.mSbjctToken]))
                options.stderr.write("# old=%s\n# new=%s\n" % (entry.mTranslation, translation))
                options.stderr.write("# old=%s\n# new=%s\n" % (entry.mAlignmentString, Genomics.Alignment2String(entry.mMapPeptide2Genome)))
                nerrors_translation += 1
                is_error = True

                if peptides and entry.mQueryToken in peptides:
                    naligned += 1

                    options.stdlog.write( "# aligning: %s versus %s:%s: %i-%i\n" % ( \
                        entry.mQueryToken,
                        new_sbjct_token, entry.mSbjctStrand,
                        entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo))

                    # do a quick reprediction
                    if entry.mQueryToken in peptides:
                        # NOTE(review): no option with dest "genome_pattern"
                        # is declared in this function (the genome option
                        # uses dest "genome_file") -- confirm.
                        genomic_sequence = Genomics.GetGenomicSequence( new_sbjct_token, entry.mSbjctStrand,
                                                                        0, 0,
                                                                        genome_file = options.genome_pattern,
                                                                        loglevel = 0)
                        predictor.mLogLevel = 0

                        # NOTE(review): new_f and new_t are only bound in the
                        # "Map via alignments" section above.
                        result = predictor(entry.mQueryToken, peptides[entry.mQueryToken],
                                           entry.mSbjctToken, genomic_sequence,
                                           "--exhaustive --subopt FALSE --score '%s' " % str(80),
                                           new_f - 10, new_t + 10)
                        prediction_id = entry.mPredictionId
                        if result:
                            # replace the entry by the first new prediction,
                            # keeping the original prediction id
                            entry = result[0]
                            entry.mPredictionId = prediction_id
                            nerrors_realigned += 1
            else:
                # a mapping error was seen, but the translation is unchanged
                if is_error:
                    nerrors_inconsequential += 1

        entry.mSbjctToken = new_sbjct_token

        ## map query tokens
        if query_map_old2new:
            query_tokens = query_map_old2new[entry.mQueryToken]
        else:
            query_tokens = (entry.mQueryToken,)

        if options.skip_errors and is_error:
            continue

        for query_token in query_tokens:

            entry.mQueryToken = query_token

            # the hash is computed with the prediction id reset to 0
            prediction_id = entry.mPredictionId
            entry.mPredictionId = 0

            hid = Genomics.GetHID( str(entry) )
            if hid in output:
                nduplicates += 1
                continue

            noutput += 1
            if options.renumber: prediction_id = noutput

            entry.mPredictionId = prediction_id

            options.stdout.write( str(entry) + "\n")
            options.stdout.flush()
            found_transcripts[entry.mQueryToken] = 1

    ## write out found transcripts and genes
    nmissed_transcripts = 0
    missed_transcripts = []
    found_genes = {}
    if peptides:
        for x in peptides.keys():
            if x not in found_transcripts:
                nmissed_transcripts += 1
                missed_transcripts.append( x )
            else:
                # raises KeyError if x has no entry in map_transcript2gene
                found_genes[map_transcript2gene[x]] = 1

    missed_genes = {}
    nmissed_genes = 0
    if map_transcript2gene:

        for t in missed_transcripts:
            g = map_transcript2gene[t]
            if g not in found_genes:
                missed_genes[g] = 1
        nmissed_genes = len(missed_genes)

    if options.write_missed:
        outfile = open(options.write_missed, "w")
        for x in missed_transcripts:
            # NOTE(review): ``unmapped`` is only bound when
            # --filename-old-peptides was given -- confirm.
            if x in unmapped:
                status = "unmapped"
            else:
                status = "mapped"
            outfile.write( "%s\t%s\t%s\n" % ("transcript", x, status ))
        for x in missed_genes:
            status = "unknown"
            outfile.write( "%s\t%s\t%s\n" % ("gene", x, status ))

        outfile.close()

    options.stdlog.write("# input=%i, output=%i, filtered=%i, nduplicates=%i, mapped=%i, errors=%i\n" % (\
        ninput, noutput, nfiltered, nduplicates, nmapped, nerrors ))
    options.stdlog.write("# errors: inconsequental=%i, boundaries=%i, mapping=%i, inconsistencies=%i, translation=%i, realigned=%i\n" % (\
        nerrors_inconsequential, nerrors_boundaries, nerrors_map, nerrors_inconsistencies, nerrors_translation, nerrors_realigned ))
    options.stdlog.write("# peptides: input=%i, found=%i, missed=%i, found_genes=%i, missed_genes=%i\n" % (\
        len(peptides), len(found_transcripts), nmissed_transcripts, len(found_genes), nmissed_genes) )

    E.Stop()
def main():
    """Command line entry point for inspecting an ADDA neighbourhood graph.

    The action is selected with ``--method``:

    * ``view``    -- print the neighbours of every sequence id given as a
      positional argument.
    * ``pileup``  -- write the multiple alignment built from the neighbours
      of the first argument.
    * ``profile`` -- build a profile from that multiple alignment, restrict
      it to a segment if the argument carries one, and write it out.
    * ``align``   -- align the profiles of the segments given as the first
      two arguments and report score, extent and the explicit alignment.

    For ``pileup`` and ``profile`` the argument is either a plain integer id
    or, if it contains ``_``, anything ``AddaIO.toTuple`` parses into
    ``(nid, start, end)``.
    """
    parser = optparse.OptionParser(version="%prog version: $Id$", usage=USAGE)

    parser.add_option(
        "--method", dest="method", type="choice",
        choices=("view", "align", "pileup", "profile"),
        help="method to perform [default=%default].",
    )
    parser.add_option(
        "--mode", dest="mode", type="choice",
        choices=("global", "local"),
        help="alignment mode [default=%default]."
    )
    parser.add_option("--gop", dest="gop", type="float",
                      help="gap opening penalty [default=%default].")
    parser.add_option("--gep", dest="gep", type="float",
                      help="gap extension penalty [default=%default].")

    parser.set_defaults(
        filename_graph="adda.graph",
        filename_index="adda.graph.idx",
        method="view",
        filename_fasta="adda",
        filename_config="adda.ini",
        append=False,
        force=False,
        mode="local",
        gop=-10.0,
        gep=-1.0,
    )

    (options, args) = E.Start(parser)

    config = AddaIO.ConfigParser()
    config.read(os.path.expanduser(options.filename_config))

    index = cadda.IndexedNeighbours(options.filename_graph,
                                    options.filename_index)

    # configure the alignlib default toolkit used for profile building
    toolkit = alignlib.getDefaultToolkit()
    toolkit.setEncoder(alignlib.getEncoder(alignlib.Protein20))
    toolkit.setRegularizor(alignlib.makeRegularizorDirichletPrecomputed())
    toolkit.setLogOddor(alignlib.makeLogOddorDirichlet(0.3))
    toolkit.setWeightor(alignlib.makeWeightor())

    fasta = IndexedFasta.IndexedFasta(options.filename_fasta)
    align = AddaProfiles.AddaProfiles(config, fasta=fasta)

    def _parseSegment(token):
        """Return (nid, start, end); start/end are None for a plain id."""
        if "_" in token:
            return AddaIO.toTuple(token)
        return int(token), None, None

    def _buildProfile(nid, start, end):
        """Return (sequence, prepared profile) for a segment of *nid*."""
        neighbours = index.getNeighbours(nid)
        mali = align.buildMali(nid, neighbours)
        prof = alignlib.makeProfile(mali)
        E.info("nid: %i, neighours=%i" % (nid, len(neighbours)))
        prof.useSegment(start, end)
        prof.prepare()
        seq = fasta.getSequence(nid)
        return alignlib.makeSequence(seq), prof

    if options.method == "view":
        # The original converted args[0] on every iteration, so only the
        # first id was ever shown; convert the loop variable instead.
        for arg in args:
            nid = int(arg)
            for n in index.getNeighbours(nid):
                print(str(n))

    elif options.method == "pileup":
        nid, start, end = _parseSegment(args[0])
        neighbours = index.getNeighbours(nid)
        mali = align.buildMali(nid, neighbours)
        options.stdout.write("%s\n" % str(mali))

    elif options.method == "profile":
        nid, start, end = _parseSegment(args[0])
        neighbours = index.getNeighbours(nid)
        mali = align.buildMali(nid, neighbours)
        prof = alignlib.makeProfile(mali)
        E.info("nid: %i, neighours=%i" % (nid, len(neighbours)))
        if start is not None:
            prof.useSegment(start, end)
        prof.prepare()
        options.stdout.write("%s\n" % str(prof))

    elif options.method == "align":
        nid1, start1, end1 = AddaIO.toTuple(args[0])
        nid2, start2, end2 = AddaIO.toTuple(args[1])

        if options.mode == "local":
            mode = alignlib.ALIGNMENT_LOCAL
        else:
            mode = alignlib.ALIGNMENT_GLOBAL

        alignator = alignlib.makeAlignatorDPFull(mode, options.gop, options.gep)

        seq1, prof1 = _buildProfile(nid1, start1, end1)
        seq2, prof2 = _buildProfile(nid2, start2, end2)

        result = alignlib.makeAlignmentVector()
        alignator.align(result, prof1, prof2)

        E.debug("%s\n" % str(result))

        options.stdout.write(
            "%s vs %s: score=%5.2f, length=%i, numgaps=%i, row_from=%i, row_to=%i, col_from=%i, col_to=%i\n" % (
                nid1, nid2,
                result.getScore(),
                result.getLength(),
                result.getNumGaps(),
                result.getRowFrom(), result.getRowTo(),
                result.getColFrom(), result.getColTo(),
            )
        )

        f = alignlib.AlignmentFormatExplicit(result, seq1, seq2)
        options.stdout.write("%s\n" % str(f))

    E.Stop()
nsplits, nstopcodons, pidentity, psimilarity, sequence, sbjct_genome_from, sbjct_genome_to, map_query2genome FROM %s AS p WHERE p.sbjct_token = '%s' AND p.sbjct_strand = '%s' AND OVERLAP( %i, %i, p.sbjct_genome_from, sbjct_genome_to) > 0 """ alignator = alignlib.makeAlignatorDPFull( alignlib.ALIGNMENT_LOCAL, param_gop, param_gep ) map_reference2target = alignlib.makeAlignmentVector() assignment_id = 0 for line in cr.fetchall(): reference = PredictionParser.PredictionParserEntry() reference.FillFromTable( line ) ct = dbhandle.cursor() ct.execute( statement % (param_tablename_predictions_target, reference.mSbjctToken, reference.mSbjctStrand, reference.mSbjctGenomeFrom, reference.mSbjctGenomeTo )) reference_exons = Exons.Alignment2Exons( reference.mMapPeptide2Genome, 0, reference.mSbjctFrom)
def alignlibCombineVector():
    """Exercise ``alignlib.combineAlignment`` on the module-level vector.

    Combines ``alignlib_vector`` with itself using the ``alignlib.RR`` mode
    into a freshly created alignment vector; the result is discarded.
    """
    combined = alignlib.makeAlignmentVector()
    alignlib.combineAlignment(
        combined,
        alignlib_vector,
        alignlib_vector,
        alignlib.RR,
    )
def read(self, line):
    """Fill this entry from one tab-separated prediction line.

    The layout is recognised by the number of fields:

    * 23 fields: the core columns (query token ... genomic end coordinate);
      ``mAlignmentString`` is set to ``""``.
    * 24 fields: core columns followed by the alignment string.
    * 25 fields: a leading prediction id, then the 24-field layout.
    * 26 fields: the 25-field layout followed by the assembly count.

    Attributes that a short layout does not carry (``mPredictionId``,
    ``mNAssembled``) keep the value already stored on the instance; they are
    still passed through ``int`` below, so they must have been initialised.

    After parsing, score/coverage/identity/similarity are converted to
    ``float`` and the coordinate and count columns to ``int``. If
    ``self.mExpand`` is set, the alignment objects are rebuilt as well.

    Raises:
        ValueError: if the line has an unsupported number of fields, or a
            numeric column cannot be converted.
    """
    core_fields = (
        "mQueryToken", "mSbjctToken", "mSbjctStrand",
        "mRank", "score",
        "mQueryFrom", "mQueryTo", "mQueryAli",
        "mSbjctFrom", "mSbjctTo", "mSbjctAli",
        "mQueryLength", "mQueryCoverage",
        "mNGaps", "mNFrameShifts", "mNIntrons", "mNSplits", "mNStopCodons",
        "mPercentIdentity", "mPercentSimilarity",
        "mTranslation",
        "mSbjctGenomeFrom", "mSbjctGenomeTo",
    )
    layouts = {
        23: core_fields,
        24: core_fields + ("mAlignmentString",),
        25: ("mPredictionId",) + core_fields + ("mAlignmentString",),
        26: ("mPredictionId",) + core_fields + ("mAlignmentString", "mNAssembled"),
    }

    # Only strip a real line terminator; the original chopped the last
    # character unconditionally and corrupted lines without a newline.
    if line.endswith("\n"):
        line = line[:-1]

    data = line.split("\t")

    if len(data) not in layouts:
        raise ValueError("unknown format: %i fields in line %s" % (len(data), line))

    for name, value in zip(layouts[len(data)], data):
        setattr(self, name, value)

    if len(data) == 23:
        self.mAlignmentString = ""

    float_fields = ("score", "mQueryCoverage",
                    "mPercentIdentity", "mPercentSimilarity")
    int_fields = ("mPredictionId",
                  "mQueryFrom", "mQueryTo", "mQueryLength",
                  "mSbjctFrom", "mSbjctTo",
                  "mSbjctGenomeFrom", "mSbjctGenomeTo",
                  "mNGaps", "mNIntrons", "mNSplits", "mNStopCodons",
                  "mNFrameShifts", "mNAssembled")

    # Convert a whole group before assigning, so a bad value leaves the
    # group untouched (same all-or-nothing behaviour as the tuple assignment
    # this replaces).
    for fields, convert in ((float_fields, float), (int_fields, int)):
        converted = [convert(getattr(self, name)) for name in fields]
        for name, value in zip(fields, converted):
            setattr(self, name, value)

    if self.mExpand:
        self.mMapPeptide2Translation = alignlib.makeAlignmentVector()

        if self.mQueryAli != "" and self.mSbjctAli != "":
            alignlib.AlignmentFormatExplicit(
                self.mQueryFrom, self.mQueryAli,
                self.mSbjctFrom, self.mSbjctAli).copy(self.mMapPeptide2Translation)

        self.mMapPeptide2Genome = Genomics.String2Alignment(self.mAlignmentString)
def EliminateRedundantEntries( rep, data, eliminated_predictions, options, peptides, extended_peptides, filter_quality = None, this_quality = None ):
    """Remove members of *data* that are redundant with the representative *rep*.

    Every entry whose id is not yet in *eliminated_predictions* is compared
    with *rep*:

    * ``"i"`` -- its extended peptide is identical to the representative's,
    * ``"p"`` -- its extended peptide is contained in the representative's,
    * ``"h"`` -- local alignment identity is at least ``options.min_identity``
      and none of the coverage/identity/gap safety checks asks to keep it,
    * ``"l"`` -- identity is at least ``options.min_identity_non_genes``, the
      representative's quality is in ``options.quality_genes`` while the
      member's is not, and the safety check does not ask to keep it.

    The alignment step only runs when the member's quality differs from
    *this_quality* or is listed in ``options.quality_exclude_same``.

    *eliminated_predictions* is updated in place (member id -> representative
    id). Returns the list of ``(member_id, code)`` tuples removed by this
    call. *filter_quality* is accepted but not used here.
    """
    removed = []

    rep_id = rep.transcript_id
    rep_coverage = rep.mQueryCoverage
    rep_pid = rep.mPid

    aligner = alignlib.makeAlignatorDPFull( alignlib.ALIGNMENT_LOCAL, options.gop, options.gep )
    result = alignlib.makeAlignmentVector()

    rep_seq = peptides[rep_id]
    rep_extended_seq = extended_peptides[rep_id]

    def _drop(member_id, code):
        # record the elimination both in the shared map and the local list
        eliminated_predictions[member_id] = rep_id
        removed.append( (member_id, code) )

    for entry in data:

        mem_id = entry.transcript_id
        mem_coverage = entry.mQueryCoverage
        mem_pid = entry.mPid
        mem_quality = entry.mQuality

        mem_seq = peptides[mem_id]
        mem_extended_seq = extended_peptides[mem_id]

        if options.loglevel >= 4:
            options.stdlog.write( "# processing: id=%s class=%s\n" % (mem_id, mem_quality))

        if mem_id in eliminated_predictions:
            continue

        # cheap string comparisons first
        if mem_extended_seq == rep_extended_seq:
            _drop(mem_id, "i")
            continue

        if mem_extended_seq in rep_extended_seq:
            _drop(mem_id, "p")
            continue

        # members of the same quality class are only compared if requested
        if not (mem_quality != this_quality or
                mem_quality in options.quality_exclude_same):
            continue

        seq1 = alignlib.makeSequence( str(rep_seq) )
        seq2 = alignlib.makeSequence( str(mem_seq) )
        aligner.align( result, seq1, seq2 )

        if options.loglevel >= 5:
            options.stdlog.write( "# ali\n%s\n" % alignlib.AlignmentFormatExplicit( result, seq1, seq2 ) )

        pidentity = 100 * alignlib.calculatePercentIdentity( result, seq1, seq2 )
        num_gaps = result.getNumGaps()

        if options.loglevel >= 4:
            options.stdlog.write( "# processing: id=%s class=%s pid=%5.2f rep_cov=%i mem_cov=%i\n" %\
                                  ( mem_id, mem_quality, pidentity, rep_coverage, mem_coverage ) )

        # the member looks better than the representative on coverage or identity
        member_is_better = rep_coverage < mem_coverage - options.safety_coverage or \
            rep_pid < mem_pid - options.safety_pide

        if pidentity >= options.min_identity:

            reason = None
            if member_is_better:
                reason = "covpid"
            elif num_gaps >= options.max_gaps and \
                    mem_coverage > rep_coverage - options.safety_coverage:
                reason = "gaps"
            elif mem_coverage >= rep_coverage - options.safety_coverage and \
                    100 * (result.getColTo() - result.getColFrom()) / len(mem_seq) < options.max_member_coverage:
                reason = "memcov"

            if reason is None:
                _drop(mem_id, "h")
            else:
                options.stdlog.write( "# WARNING: not removing possibly good prediction: %s: rep = %s, mem = %s, rep_cov=%i, rep_pid=%i, mem_cov=%i, mem_pid=%i\n" %\
                                      (reason, rep_id, mem_id, rep_coverage, rep_pid, mem_coverage, mem_pid) )

        elif pidentity >= options.min_identity_non_genes and \
                this_quality in options.quality_genes and \
                mem_quality not in options.quality_genes:

            if member_is_better:
                options.stdlog.write( "# WARNING: not removing possibly good prediction: rep = %s, mem = %s, rep_cov=%i, rep_pid=%i, mem_cov=%i, mem_pid=%i\n" %\
                                      (rep_id, mem_id, rep_coverage, rep_pid, mem_coverage, mem_pid) )
            else:
                _drop(mem_id, "l")

    return removed
def checkLinkZScore( self, query_nid, query_from, query_to, sbjct_nid, sbjct_from, sbjct_to):
    """Decide whether two domains are homologous, using a z-score if needed.

    The two profiles are restricted to the given segments and aligned with
    ``self.mAlignator``. Clear cases are decided on the raw score alone:

    * empty alignment or score below ``self.mMinAlignmentScore`` -> rejected,
    * score above ``self.mSafetyThreshold * self.mMinAlignmentScore`` -> accepted.

    Otherwise z-score parameters are computed with
    ``self.mNumIterationsZScore`` iterations and the link is accepted if the
    z-score exceeds ``self.mMinZScore``.

    Returns a tuple ``(is_homologous, alignment, info)`` where ``info`` is
    ``("na",)`` when no z-score was computed and the formatted z-score
    otherwise. If a profile is missing, a warning is issued,
    ``self.mNNotFound`` is incremented and the link is rejected.
    """
    result = alignlib.makeAlignmentVector()

    query_profile = self.getAlignandum( query_nid )
    sbjct_profile = self.getAlignandum( sbjct_nid )

    if not (query_profile and sbjct_profile):
        self.warn( "could not compute link %s_%i_%i - %s_%i_%i\n" % \
                   (query_nid, query_from, query_to,
                    sbjct_nid, sbjct_from, sbjct_to) )
        self.mNNotFound += 1
        return False, result, ("na",)

    def _resetSegments():
        # undo the segment restriction applied below
        query_profile.useSegment()
        sbjct_profile.useSegment()

    query_profile.useSegment( query_from, query_to )
    sbjct_profile.useSegment( sbjct_from, sbjct_to )

    self.mAlignator.align( result, query_profile, sbjct_profile )

    self.debug( "# --> %s vs %s: score=%5.2f, length=%i, numgaps=%i, row_from=%i, row_to=%i, col_from=%i, col_to=%i" %\
                (query_nid, sbjct_nid,
                 result.getScore(),
                 result.getLength(),
                 result.getNumGaps(),
                 result.getRowFrom(), result.getRowTo(),
                 result.getColFrom(), result.getColTo()))

    # decide clear-cut cases on the raw score, without sampling
    if result.getLength() == 0:
        shortcut = False
    elif result.getScore() < self.mMinAlignmentScore:
        shortcut = False
    elif result.getScore() > self.mSafetyThreshold * self.mMinAlignmentScore:
        shortcut = True
    else:
        shortcut = None

    if shortcut is not None:
        _resetSegments()
        return shortcut, result, ("na",)

    z_params = alignlib.makeNormalDistributionParameters()
    alignlib.calculateZScoreParameters( z_params,
                                        query_profile,
                                        sbjct_profile,
                                        self.mAlignator,
                                        self.mNumIterationsZScore)

    mean = z_params.getMean()
    stddev = z_params.getStandardDeviation()
    # guard against division by zero
    if stddev == 0:
        stddev = 1

    zscore = (result.getScore() - mean) / stddev

    self.debug( "--> mean=%f, stdev=%f, zscore=%f" % (mean, stddev, zscore) )

    _resetSegments()

    return bool(zscore > self.mMinZScore), result, ( "%5.2f" % zscore,)
def Expand( self ):
    """Rebuild ``self.mMapOld2New`` from the stored alignment strings.

    A fresh alignment vector is attached to the instance and filled from
    ``mOldFrom``/``mOldAli`` and ``mNewFrom``/``mNewAli`` via
    ``alignlib.AlignmentFormatEmissions``.
    """
    self.mMapOld2New = alignlib.makeAlignmentVector()
    emissions = alignlib.AlignmentFormatEmissions(
        self.mOldFrom, self.mOldAli,
        self.mNewFrom, self.mNewAli )
    emissions.copy( self.mMapOld2New )
def main():
    """Translate low-quality genomic bases into masked alignment regions.

    Reads a psl-formatted map (``--filename-map``) linking the entries of a
    multiple alignment to the genome, and an indexed file of base quality
    scores (``--quality-file``). For every ``cluster_id / gene_id /
    alignment`` line on stdin, each aligned position whose genomic quality
    score is below ``--quality-threshold`` is collected; positions are
    widened to whole frames of ``options.frame`` columns, grouped with
    ``Iterators.group_by_distance`` and written to stdout as
    ``cluster_id / start / end`` rows.

    With ``--random`` the quality scores at aligned positions are shuffled
    before masking.

    Raises:
        ValueError: if no map file was given with ``--filename-map``.
    """
    parser = E.OptionParser( version = "%prog version: $Id: quality2masks.py 2781 2009-09-10 11:33:14Z andreas $",
                             usage = globals()["__doc__"])

    parser.add_option("--quality-threshold", dest="quality_threshold", type="int",
                      help="quality threshold for masking positions [default=%default]" )

    parser.add_option("--random", dest="random", action="store_true",
                      help="shuffle quality scores before masking [default=%default]" )

    parser.add_option("--filename-map", dest="filename_map", type="string",
                      help="filename in psl format mapping entries in multiple alignment to the genome [default=%default]" )

    parser.add_option("-q", "--quality-file", dest="quality_file", type="string",
                      help="filename with genomic base quality information [default=%default]." )

    parser.set_defaults(
        quality_threshold = 40,
        quality_file = "quality",
        filename_map = None,
        frame = 3,
        random = False,
        )

    (options, args) = E.Start( parser )

    ##################################################
    ## read map
    ##################################################
    if options.filename_map is None:
        # open(None) would otherwise fail with an unhelpful TypeError
        raise ValueError( "please supply a psl formatted map with --filename-map" )

    map_genes2genome = {}
    infile = open(options.filename_map)
    try:
        for match in Blat.iterator( infile ):
            assert match.mQueryId not in map_genes2genome, "duplicate entry %s" % match.mQueryId
            map_genes2genome[match.mQueryId] = match
    finally:
        # close the handle even if the duplicate check fires
        infile.close()

    ##################################################
    ## get quality scores
    ##################################################
    quality = IndexedFasta.IndexedFasta( options.quality_file )
    quality.setTranslator( IndexedFasta.TranslatorBytes() )

    ##################################################
    ## main loop
    ##################################################
    ninput, noutput, nmissed = 0, 0, 0

    options.stdout.write( "cluster_id\tstart\tend\n" )

    for line in options.stdin:
        # skip the header line
        if line.startswith("cluster_id"):
            continue

        ninput += 1
        cluster_id, gene_id, alignment = line[:-1].split("\t")

        if gene_id not in map_genes2genome:
            nmissed += 1
            E.warn( "gene_id %s not found in map." % gene_id )
            continue

        match = map_genes2genome[gene_id]
        map_gene2genome = match.getMapQuery2Target()
        is_negative = match.strand == "-"

        # if strand is negative, the coordinates are
        # on the negative strand of the gene/query
        # in order to work in the right coordinate system
        # revert the sequence
        if is_negative:
            alignment = alignment[::-1]

        # get map of gene to alignment
        map_gene2mali = alignlib.makeAlignmentVector()
        fillAlignment( map_gene2mali, alignment )

        # get quality scores
        try:
            quality_scores = quality.getSequence( match.mSbjctId, "+", match.mSbjctFrom, match.mSbjctTo)
        except ValueError as msg:
            nmissed += 1
            E.warn( "could not retrieve quality scores for %s:%i-%i: %s" % (match.mSbjctId, match.mSbjctFrom, match.mSbjctTo, msg) )
            continue

        map_mali2genome = alignlib.makeAlignmentVector()
        alignlib.combineAlignment( map_mali2genome, map_gene2mali, map_gene2genome, alignlib.RR )

        # shuffle quality scores, but only those that are aligned
        if options.random:
            positions = []
            for fp, c in enumerate(alignment):
                if c == "-":
                    continue
                y = map_mali2genome.mapRowToCol( fp ) - match.mSbjctFrom
                if y < 0:
                    continue
                positions.append( y )
            scores = [ quality_scores[ x ] for x in positions ]
            random.shuffle(scores)
            for p, q in zip( positions, scores ):
                quality_scores[p] = q

        to_mask = []
        ## reverse position, used to report columns of negative strand
        ## entries in the orientation of the input alignment
        rp = len(alignment)
        for fp, c in enumerate(alignment):
            rp -= 1
            if c == "-":
                continue
            y = map_mali2genome.mapRowToCol( fp ) - match.mSbjctFrom
            if y < 0:
                continue
            if quality_scores[y] < options.quality_threshold:
                if is_negative:
                    p = rp
                else:
                    p = fp
                E.debug( "low quality base: id=%s, mali=%i, char=%s, contig=%s, strand=%s, pos=%i, quality=%i" % \
                         (cluster_id, p, c, match.mSbjctId, match.strand, map_mali2genome.mapRowToCol( fp ), quality_scores[y] ) )
                if options.frame > 1:
                    # mask the whole frame that contains the position
                    start = (p // options.frame) * options.frame
                    to_mask.extend( list( range(start, start + options.frame) ) )
                else:
                    to_mask.append( p )

        regions = Iterators.group_by_distance( sorted(to_mask) )

        for start, end in regions:
            options.stdout.write( "%s\t%i\t%i\n" % (cluster_id, start, end ) )

        noutput += 1
# Write one block-format alignment row per record delivered by `iterator`.
# NOTE(review): the enclosing definition is not visible here; `iterator`,
# `options`, `re`, `alignlib` and `E` must be provided by the surrounding code.

# counters reported in the summary line below (nskipped is never incremented here)
ninput, noutput, nskipped = 0, 0, 0

# header of the output table
options.stdout.write( "query\tsbjct\tquery_from\tquery_to\tsbjct_from\tsbjct_to\tquery_starts\tsbjct_starts\tblock_sizes\n" )

while 1:
    # pull records until the iterator is exhausted
    try:
        cur_record = iterator.next()
    except StopIteration:
        break

    ninput += 1

    # remove blanks from the record's sequence
    sequence = re.sub( " ", "", cur_record.sequence)
    l = len(sequence)

    # build the alignment of the sequence against a string of l "X" characters
    map_sequence2mali = alignlib.makeAlignmentVector()
    alignlib.AlignmentFormatExplicit( 0, sequence, 0, "X" * l ).copy( map_sequence2mali )

    # the sbjct column is always the literal "ref"
    options.stdout.write( "\t".join( ( cur_record.title, "ref", str( alignlib.AlignmentFormatBlocks( map_sequence2mali ) ) ) ) + "\n" )

    noutput += 1

# summary line on the log stream
if options.loglevel >= 1:
    options.stdlog.write("# ninput=%i, noutput=%i, nskipped=%i.\n" % (ninput, noutput, nskipped))

E.Stop()
def _alignToProfile(infile, outfile, min_score=0, filename_mali="../data/mouse.fasta"):
    '''align sequences in *infile* against a profile built from a reference mali.

    The reference multiple alignment is read from *filename_mali*. Each
    sequence in *infile* and its complement (``Genomics.complement``) are
    aligned to the profile in wrap-around mode; the better scoring of the
    two is kept. Alignments that are empty or score less than *min_score*
    are skipped.

    Output the multiple alignment in fasta format to *outfile* and a
    tab-separated table with per-read statistics to :file:`outfile.info`.
    '''

    mali = Mali.Mali()
    with open(filename_mali) as inf:
        mali.readFromFile(inf)
    src_mali = Mali.convertMali2Alignlib(mali)

    E.debug("read mali: %i sequences x %i columns" %
            (mali.getNumSequences(), mali.getNumColumns()))

    # add pseudocounts: two artificial homopolymer rows per nucleotide
    profile_mali = mali.getClone()
    n = profile_mali.getNumColumns()
    for x in "ACGT":
        for y in range(0, 2):
            profile_mali.addSequence("%s%i" % (x, y), 0, n, x * n)

    profile_mali = Mali.convertMali2Alignlib(profile_mali)
    alignlib.setDefaultEncoder(alignlib.getEncoder(alignlib.DNA4))
    alignlib.setDefaultLogOddor(alignlib.makeLogOddorUniform())

    # bg = alignlib.FrequencyVector()
    # bg.extend( ( 0.3, 0.1, 0.2, 0.2, 0.2) )
    # alignlib.setDefaultRegularizor( alignlib.makeRegularizorTatusov(
    #         alignlib.makeSubstitutionMatrixDNA4(),
    #         bg,
    #         "ACGTN",
    #         10.0, 1.0) )

    profile = alignlib.makeProfile(profile_mali)

    alignment_mode = alignlib.ALIGNMENT_WRAP

    alignator = alignlib.makeAlignatorDPFull(alignment_mode,
                                             -5.0,
                                             -0.5)

    map_seq2profile = alignlib.makeAlignmentVector()
    map_rseq2profile = alignlib.makeAlignmentVector()
    profile.prepare()

    # seed the output alignment with the reference mali on the diagonal
    build_mali = alignlib.makeMultAlignment()
    m = alignlib.makeAlignmentVector()
    m.addDiagonal(0, n, 0)
    build_mali.add(src_mali, m)

    sequences = alignlib.StringVector()
    ids = []

    for pid in mali.getIdentifiers():
        sequences.append(re.sub("-", "", mali[pid]))
        ids.append(pid)

    c = E.Counter()

    outf = open(outfile, "w")
    try:
        outf_log = open(outfile + ".info", "w")
        try:
            outf_log.write(
                "read_id\tlength\tstart\tend\tparts\tcovered\tpcovered\tscore\tmali_start\tmali_end\tmali_covered\tmali_pcovered\n")

            with open(infile) as inf:
                for s in FastaIterator.FastaIterator(inf):

                    E.debug("adding %s" % s.title)
                    c.input += 1
                    rsequence = Genomics.complement(s.sequence)
                    seq = alignlib.makeSequence(s.sequence)
                    rseq = alignlib.makeSequence(rsequence)

                    alignator.align(map_seq2profile, seq, profile)
                    alignator.align(map_rseq2profile, rseq, profile)

                    # keep whichever orientation scores better
                    if map_seq2profile.getScore() > map_rseq2profile.getScore():
                        m, seq, sequence = map_seq2profile, seq, s.sequence
                    else:
                        m, seq, sequence = map_rseq2profile, rseq, rsequence

                    if m.getLength() == 0:
                        c.skipped += 1
                        continue

                    if m.getScore() < min_score:
                        c.skipped += 1
                        continue

                    # every part becomes its own row in the output alignment
                    r = getParts(m)

                    covered = 0
                    for mm in r:
                        build_mali.add(mm)
                        sequences.append(sequence)
                        ids.append(s.title)
                        covered += mm.getLength() - mm.getNumGaps()

                    mali_covered = m.getColTo() - m.getColFrom()

                    outf_log.write("\t".join(
                        map(str, (s.title,
                                  len(s.sequence),
                                  m.getRowFrom(),
                                  m.getRowTo(),
                                  len(r),
                                  covered,
                                  "%5.2f" % (100.0 * covered / len(s.sequence)),
                                  m.getScore(),
                                  m.getColFrom(),
                                  m.getColTo(),
                                  mali_covered,
                                  "%5.2f" % ((100.0 * mali_covered) / mali.getNumColumns())))) + "\n")

                    c.output += 1

            result = str(
                alignlib.MultAlignmentFormatPlain(build_mali,
                                                  sequences,
                                                  alignlib.UnalignedStacked))

            for pid, data in zip(ids, result.split("\n")):
                start, sequence, end = data.split("\t")
                outf.write(">%s/%i-%i\n%s\n" %
                           (pid, int(start) + 1, int(end), sequence))
        finally:
            outf_log.close()
    finally:
        outf.close()

    E.info("%s\n" % str(c))
def AlignCodonBased( seq_wobble, seq_cds, seq_peptide, map_p2c, options, diag_width = 2, max_advance = 2 ):
    """Advance through seq_wobble and match its residues to nucleotides in seq_cds.

    Walks both sequences in lock-step and records matching positions in
    *map_p2c* (cleared on entry and filled in place). Positions in the
    wobble sequence that read ``X``, ``N`` or ``-`` are skipped. When a
    residue pair does not score positively, a window around the position is
    re-aligned with ``AlignExhaustive`` and the corresponding part of
    *map_p2c* is replaced. This is how frameshifts are handled.

    *seq_peptide* is only used for log output. *diag_width* and
    *max_advance* are accepted but not referenced in this function.

    NOTE(review): the original docstring states "Due to alignlib this is all
    in one-based coordinates", yet x and y start at 0 below - confirm.

    Raises:
        ValueError: if the re-alignment would restart from the same window
            twice in a row, if re-aligned residues mismatch, if the window
            could not be aligned, or if the final residue pair mismatches.
    """
    map_p2c.clear()

    # NOTE(review): gop/gep are assigned but not used in this function.
    gop, gep = -1.0, -1.0

    # scoring matrix between wobble residues and cds residues
    matrix = alignlib.makeSubstitutionMatrixBackTranslation( 1, -10, 1, alignlib.getDefaultEncoder() )

    pep_seq = seq_peptide.asString()
    cds_seq = seq_cds.asString()
    wobble_seq = seq_wobble.asString()

    lcds = seq_cds.getLength()
    lwobble = seq_wobble.getLength()

    # y: current position in cds, x: current position in wobble
    y = 0
    x = 0

    # start coordinates of the previous re-alignment window, used to
    # detect that the loop is not making progress
    last_start = None

    while x < lwobble and y < lcds:

        xr = seq_wobble.asResidue( x )

        # skip over masked chars in wobble - these are gaps
        if seq_wobble.asChar(x) == "X":
            x += 1
            continue

        # skip over masked chars in wobble - these are from
        # masked chars in the peptide sequence
        # Note to self: do not see all implications of this change
        # check later.
        if seq_wobble.asChar(x) == "N":
            x += 1
            continue

        # skip over gaps in wobble
        if seq_wobble.asChar(x) == "-":
            x += 1
            continue

        s = matrix.getValue( xr, seq_cds.asResidue(y) )

        if options.loglevel >= 6:
            # at codon starts, also log the codon and its translation
            if (x % 3 == 0):
                c = seq_cds.asChar(y) + seq_cds.asChar(y+1) + seq_cds.asChar(y+2)
                options.stdlog.write( "# c=%s, x=%i, y=%i, aa=%s target=%s\n" % (c, x, y, Genomics.MapCodon2AA( c ), pep_seq[int(x/3)]) )

            options.stdlog.write( "# x=%i\twob=%s\ty=%i\tcds=%s\txr=%s\tcds=%i\tscore=%s\n" % \
                                  (x, seq_wobble.asChar(x), y, seq_cds.asChar(y), xr, seq_cds.asResidue(y), str(s) ))

        # deal with mismatches
        if s <= 0:

            tmp_map_p2c = alignlib.makeAlignmentVector()

            ## backtrack to previous three codons and align
            ## three codons for double frameshifts that span two codons and
            ## produce two X's and six WWWWWW.

            ## number of nucleotides to extend (should be multiple of 3)
            ## less than 12 caused failure for some peptides.
            d = 15

            # extend by amount dx, so that the window starts on a codon boundary
            dx = (x % 3) + d

            x_start = max(0, x - dx )
            # map to ensure that no ambiguous residue mappings
            # exist after re-alignment
            y_start = max(0, map_p2c.mapRowToCol( x_start, alignlib.RIGHT ))

            # restarting from the same window again means no progress was made
            if (x_start, y_start) == last_start:
                raise ValueError( "infinite loop detected" )

            last_start = (x_start, y_start)

            x_end = min(x_start + 2 * d, len(wobble_seq) )
            y_end = min(y_start + 2 * d, len(cds_seq) )

            wobble_fragment = alignlib.makeSequence(wobble_seq[x_start:x_end])
            cds_fragment = alignlib.makeSequence(cds_seq[y_start:y_end])

            # re-align the two fragments; coordinates in tmp_map_p2c are
            # relative to the fragment starts
            AlignExhaustive( wobble_fragment, cds_fragment, "", tmp_map_p2c, options )

            if options.loglevel >= 10:
                options.stdlog.write("# fragmented alignment from %i-%i, %i-%i:\n%s\n" % (x_start, x_end, y_start, y_end, str(alignlib.AlignmentFormatExplicit( tmp_map_p2c, wobble_fragment, cds_fragment ))))
                options.stdlog.flush()

            ## clear alignment
            map_p2c.removeRowRegion( x_start, x_end )

            # ngap counts consecutive unaligned rows of the fragment alignment
            ngap = 0
            last_x, last_y = None, None
            for xxx in range( tmp_map_p2c.getRowFrom(), tmp_map_p2c.getRowTo() ):
                yyy = tmp_map_p2c.mapRowToCol(xxx)

                if yyy >= 0:
                    # translate fragment coordinates back to full sequence
                    # coordinates; note that this moves x and y themselves
                    x = xxx + x_start
                    y = yyy + y_start
                    xr = seq_wobble.asResidue(x)
                    s = matrix.getValue( seq_wobble.asResidue(x), seq_cds.asResidue(y) )
                    if s < 0:
                        raise ValueError("mismatched residue wobble: %i (%s), cds: %i (%s)" % (x, seq_wobble.asChar(x), y, seq_cds.asChar(y)))

                    map_p2c.addPair( x, y, s)
                    last_x, last_y = x, y
                    if options.loglevel >= 6:
                        options.stdlog.write( "# reset: x=%i\twob=%s\ty=%i\tcds=%s\txr=%s\tcds=%i\tscore=%i\n" % \
                                              (x, seq_wobble.asChar(x), y, seq_cds.asChar(y), xr, seq_cds.asResidue(y), s ))
                        options.stdlog.flush()
                    ngap = 0
                else:
                    ngap += 1

                # treat special case of double frameshifts. They might cause a peptide/wobble residue
                # to be eliminated and thus the translated sequences will differ.
                # simply delete the last residue between x and y and move to next codon.
                # NOTE(review): last_x is still None here if three gaps occur
                # before any aligned pair - confirm this cannot happen.
                if ngap == 3:
                    map_p2c.removeRowRegion( last_x, last_x + 1 )

                    last_x += 1
                    map_p2c.addPair( last_x, last_y )
                    if options.loglevel >= 6:
                        options.stdlog.write( "# double: x=%i\twob=%s\ty=%i\tcds=%s\txr=%s\tcds=%i\tscore=%i\n" % \
                                              (last_x, seq_wobble.asChar(last_x), last_y, seq_cds.asChar(last_y), xr, seq_cds.asResidue(last_y), s ))
                        options.stdlog.flush()
                    ngap = 0

            ## exit condition if alignment is shorter than problematic residue
            ## need to catch this to avoid infinite loop.
            if tmp_map_p2c.getRowTo() < d:
                if lwobble - x <= 4:
                    ## only last codon is missing, so ok
                    break
                else:
                    raise ValueError("failure to align in designated window.")

            s = 0

        # score the current pair; after a re-alignment x, y and xr refer to
        # the last pair set in the loop above
        s = matrix.getValue( xr, seq_cds.asResidue(y) )

        if s < 0:
            raise ValueError("mis-matching residues.")

        map_p2c.addPair( x, y, float(s) )

        # advance to next residues
        x += 1
        y += 1

    # sanity checks
    assert( map_p2c.getRowTo() <= seq_wobble.getLength() )
    assert( map_p2c.getColTo() <= seq_cds.getLength() )
def Add( self, const_other, combine_contig = False, allow_overlap = False, contig_size = 0, combine_queries = False, as_intron = False ):
    """Add the prediction *const_other* to this entry.

    Both entries are copied and expanded first; *const_other* itself is not
    modified. This procedure allows to add

    - predictions on different contigs if combine_contig = True
    - overlapping predictions on the same query if allow_overlap = True
    - results from different queries if combine_queries = True
    - if as_intron is set to true, the new fragment is added as an intron
      (code ``"I"`` instead of ``"P"`` in the peptide-to-genome map).

    *contig_size* is used as offset for genomic coordinates when joining
    entries from different contigs.

    After the merge the coordinates, alignment strings, coverage and the
    summary counters of ``self`` are updated; percent identity/similarity
    are averaged weighted by the number of aligned positions.

    Raises:
        ValueError: if the queries overlap and allow_overlap is not set,
            if the queries differ and combine_queries is not set, or if
            the sbjct differs/overlaps and combine_contig is not set.
    """

    ## create working copies of each prediction
    other = const_other.getCopy()
    this = self.getCopy()

    other.Expand()
    this.Expand()

    code = "I" if as_intron else "P"

    # Stays 0 for different queries; the cross-contig branch below reads it
    # unconditionally (previously a NameError in that case).
    query_overlap = 0

    ## check for query overlaps
    if this.mQueryToken == other.mQueryToken:

        query_overlap = max( 0, min(this.mQueryTo, other.mQueryTo) -\
                             max(this.mQueryFrom, other.mQueryFrom) + 1)

        if query_overlap > 0:

            if not allow_overlap:
                raise ValueError( "refusing to add overlapping entries: overlap = %i, queries:\n%s\n%s\n, set allow_overlap = True " % (query_overlap, str(this), str(other)) )

            overlap = query_overlap

            ## if queries overlap, truncate this before adding the other
            this.mMapPeptide2Translation.removeRowRegion( this.mQueryTo - overlap + 1, this.mQueryTo )
            other.mMapPeptide2Translation.moveAlignment( 0, -overlap )
            this.mQueryTo -= overlap
            this.mTranslation = this.mTranslation[:-overlap]

            ## remove aligned residues from the back
            for x in range(len(this.mMapPeptide2Genome) - 1, 0, -1):
                if this.mMapPeptide2Genome[x][1] <= overlap:
                    overlap -= this.mMapPeptide2Genome[x][1]
                    del this.mMapPeptide2Genome[x]
                else:
                    break

            # shorten the last remaining segment by what is left of the overlap
            this.mMapPeptide2Genome[-1] = (this.mMapPeptide2Genome[-1][0],
                                           this.mMapPeptide2Genome[-1][1] - overlap,
                                           this.mMapPeptide2Genome[-1][2] - overlap * 3)

    elif not combine_queries:
        raise ValueError( "refusing to add different queries - set combine_queries = True." )

    if this.mSbjctToken != other.mSbjctToken or \
            this.mSbjctStrand != other.mSbjctStrand:
        if combine_contig:
            this.mSbjctToken += "-" + other.mSbjctToken
            this.mSbjctStrand += other.mSbjctStrand
        else:
            raise ValueError( "can not add different sbjct." )

    sbjct_overlap = max(0, min(this.mSbjctGenomeTo, other.mSbjctGenomeTo) -\
                        max(this.mSbjctGenomeFrom, other.mSbjctGenomeFrom))

    if sbjct_overlap > 0 and not combine_contig:
        raise ValueError( "refusing to add overlapping entries: overlap = %i, sbjct:\n%s\n%s\n" % (sbjct_overlap, str(this), str(other)) )

    if this.mSbjctToken == other.mSbjctToken:

        ## set precedence
        if this.mSbjctGenomeFrom < other.mSbjctGenomeFrom:
            first = this
            second = other
        else:
            first = other
            second = this

        ## get length of gap
        d_na = second.mSbjctGenomeFrom - first.mSbjctGenomeTo

        if this.mQueryToken != other.mQueryToken:
            d_aa = first.mQueryLength - first.mQueryTo
            # create a new virtual query by concatenating
            # the two queries
            this.mQueryToken += "-" + other.mQueryToken
            # sort out the alignment
            second.mMapPeptide2Translation.moveAlignment( first.mQueryLength, 0 )
            this.mQueryLength = first.mQueryLength + second.mQueryLength
        else:
            d_aa = second.mQueryFrom - first.mQueryTo - 1

        this.mSbjctGenomeFrom = min(this.mSbjctGenomeFrom, other.mSbjctGenomeFrom )
        this.mSbjctGenomeTo = max(this.mSbjctGenomeTo, other.mSbjctGenomeTo )

        this.mMapPeptide2Genome = first.mMapPeptide2Genome + [(code, d_aa, d_na)] + second.mMapPeptide2Genome
        this.mTranslation = first.mTranslation + second.mTranslation

        second.mMapPeptide2Translation.moveAlignment( 0, first.mSbjctTo - 1 )

    else:
        ## join on different contigs: this always precedes other here.
        ## ``first`` is read below and used to be undefined on this path.
        first = this

        d_na = contig_size - this.mSbjctGenomeTo + other.mSbjctGenomeFrom + query_overlap * 3
        d_aa = other.mQueryFrom - this.mQueryTo - 1
        this.mMapPeptide2Genome += [(code, d_aa, d_na),] + other.mMapPeptide2Genome
        this.mTranslation += other.mTranslation
        other.mMapPeptide2Translation.moveAlignment( 0, this.mSbjctTo - 1 )

        this.mSbjctGenomeTo = contig_size + other.mSbjctGenomeTo

    ## now fill self from first and this
    # NOTE(review): when ``other`` precedes ``this`` on the same contig,
    # ``first`` is ``other``, so the concatenated query token and the combined
    # peptide-to-genome map stored on ``this`` are not taken over - confirm
    # whether ``this`` was intended in the two ``first.`` reads below.
    self.mQueryToken = first.mQueryToken
    self.mQueryLength = this.mQueryLength

    # aligned positions of each part, used as weights for the averages below
    nthis = this.mMapPeptide2Translation.getLength() - this.mMapPeptide2Translation.getNumGaps()
    nother = other.mMapPeptide2Translation.getLength() - other.mMapPeptide2Translation.getNumGaps()

    self.mMapPeptide2Genome = first.mMapPeptide2Genome
    self.mSbjctGenomeFrom = this.mSbjctGenomeFrom
    self.mSbjctGenomeTo = this.mSbjctGenomeTo

    ## there might be some reference counting issues, thus
    ## do it the explicit way.
    alignlib.addAlignment2Alignment( this.mMapPeptide2Translation, other.mMapPeptide2Translation)
    self.mMapPeptide2Translation = alignlib.makeAlignmentVector()
    alignlib.addAlignment2Alignment( self.mMapPeptide2Translation, this.mMapPeptide2Translation )

    self.mTranslation = this.mTranslation

    self.mQueryFrom = self.mMapPeptide2Translation.getRowFrom()
    self.mQueryTo = self.mMapPeptide2Translation.getRowTo()
    self.mSbjctFrom = self.mMapPeptide2Translation.getColFrom()
    self.mSbjctTo = self.mMapPeptide2Translation.getColTo()

    self.mQueryCoverage = 100.0 * (self.mQueryTo - self.mQueryFrom + 1) / float(self.mQueryLength)

    # serialise the (code, aa, na) triples as one space-separated string
    self.mAlignmentString = " ".join(
        " ".join(map(str, x)) for x in self.mMapPeptide2Genome )

    # was ``AlignmentFormatEmssions`` (typo), which raised AttributeError
    f = alignlib.AlignmentFormatEmissions( self.mMapPeptide2Translation )
    self.mQueryAli, self.mSbjctAli = f.mRowAlignment, f.mColAlignment

    ## summary parameters
    self.mRank = max( this.mRank, other.mRank)
    self.score += other.score
    self.mNGaps += other.mNGaps
    self.mNFrameShifts += other.mNFrameShifts
    self.mNIntrons += other.mNIntrons + 1
    self.mNStopCodons += other.mNStopCodons

    nnew = self.mMapPeptide2Translation.getLength() - self.mMapPeptide2Translation.getNumGaps()

    self.mPercentIdentity = min( 100.0, (self.mPercentIdentity * nthis + other.mPercentIdentity * nother) / nnew )
    self.mPercentSimilarity = min( 100.0, (self.mPercentSimilarity * nthis + other.mPercentSimilarity * nother) / nnew )

    self.mNAssembled += 1 + other.mNAssembled
def PrintCluster( cluster, cluster_id, lengths, peptide_sequences = None, regex_preferred = None):
    """Print a cluster, one tab-separated line per member, to stdout.

    The member with the largest entry in *lengths* is taken as the
    representative (members missing from *lengths* count as length 0). If
    *regex_preferred* is given and at least one member with non-zero length
    matches it, the longest matching member is used instead.

    Each line holds ``representative, member, member length``. If
    *peptide_sequences* is given, a fourth column holds the alignment of
    representative to member: a diagonal for the representative itself, a
    local alignment if both sequences are available, and an empty alignment
    otherwise.

    Returns *cluster_id* unchanged.
    """
    rx = re.compile(regex_preferred) if regex_preferred else None

    # longest member overall (rep_a) and longest preferred member (rep_p)
    max_al = 0
    max_pl = 0
    rep_a = None
    rep_p = None
    for c in cluster:
        l = lengths[c] if c in lengths else 0

        if l > max_al:
            max_al = l
            rep_a = c

        if rx and rx.search(c) and l > max_pl:
            max_pl = l
            rep_p = c

    rep = rep_p if max_pl > 0 else rep_a

    # created on first use and shared by all members
    alignator = None

    for mem in cluster:
        l = lengths[mem] if mem in lengths else 0
        columns = [rep, mem, l]

        if peptide_sequences:
            map_rep2mem = alignlib.makeAlignmentVector()

            if rep == mem and rep in lengths:
                alignlib.addDiagonal2Alignment( map_rep2mem, 1, lengths[rep], 0)
            elif mem in peptide_sequences and \
                    rep in peptide_sequences:
                if alignator is None:
                    alignator = alignlib.makeAlignatorDPFull( alignlib.ALIGNMENT_LOCAL, -10.0, -1.0)
                alignator.align( map_rep2mem,
                                 alignlib.makeSequence( peptide_sequences[rep] ),
                                 alignlib.makeSequence( peptide_sequences[mem] ) )

            columns.append( alignlib.AlignmentFormatEmissions( map_rep2mem ) )

        sys.stdout.write( "\t".join( map(str, columns) ) + "\n" )

    sys.stdout.flush()

    return cluster_id
def __init__(self):
    """Set up an iterative aligner on top of a local dynamic-programming aligner."""
    # base aligner: local alignment; -10.0 / -2.0 are passed in the positions
    # that makeAlignatorDPFull uses for its gap penalties
    # (presumably gap opening / gap extension -- TODO confirm)
    self.mAlignator1 = alignlib.makeAlignatorDPFull( alignlib.ALIGNMENT_LOCAL, -10.0, -2.0 )
    # wrap the base aligner; options.iterative_min_score is read from an
    # ``options`` object defined outside this method
    self.mAlignator = alignlib.makeAlignatorIterative( self.mAlignator1, options.iterative_min_score )

def align(self, query, sbjct, map_query2sbjct):
    """Align ``query`` against ``sbjct`` as plain sequences.

    Both arguments are converted with ``asString()`` into alignlib
    sequences; ``map_query2sbjct`` is handed to the aligner as third argument.
    """
    xrow = alignlib.makeSequence(query.asString())
    xcol = alignlib.makeSequence(sbjct.asString())
    # NOTE(review): other alignlib call sites pass the result alignment as the
    # FIRST argument -- confirm the order this alignlib version expects.
    self.mAlignator.align( xrow, xcol, map_query2sbjct)

    # NOTE(review): from here on the excerpt is no longer part of ``align``.
    # It is the tail of an ``if``/``elif`` chain selecting the aligner by
    # options.alignment_mode; the opening ``if`` is not visible here and the
    # indentation below is a reconstruction.
    alignator = AlignatorSequence()
elif options.alignment_mode == "compass":
    alignator = AlignatorCompass()
else:
    # NOTE(review): raising a plain string is not a valid exception on
    # Python >= 2.6; an exception instance is needed here.
    raise "unknown alignment mode %s" % options.alignment_mode

# single alignment object shared by all calls to __align below
map_query2sbjct = alignlib.makeAlignmentVector()

def __align( query_profile, sbjct_profile ):
    """Align two profiles and write the result to ``options.stdout``.

    Uses ``alignator``, ``map_query2sbjct``, ``query``, ``sbjct`` and
    ``options`` from the enclosing scope. A line
    ``query <tab> sbjct <tab> score <tab> blocks`` is written only if the
    alignment has non-zero length. At log level >= 3 the raw alignment is
    also written to ``options.stdlog``.
    """
    alignator.align( query_profile, sbjct_profile, map_query2sbjct )
    blocks = alignlib.AlignedBlocks( map_query2sbjct )
    if options.loglevel >= 3:
        options.stdlog.write( str(map_query2sbjct) )
    if map_query2sbjct.getLength() > 0:
        options.stdout.write("%s\t%s\t%i\t%s\n" % (
            query, sbjct, map_query2sbjct.getScore(), str(blocks) ) )
        # NOTE(review): placement of this return inside the ``if`` is assumed;
        # whatever follows it is not visible in this excerpt.
        return 1
def IsParalogLink( link, cds1, cds2 ):
    """Decide whether an aligned transcript pair should be flagged as a paralog link.

    The alignment stored in ``link`` (``mQueryFrom``/``mQueryAli``/
    ``mSbjctFrom``/``mSbjctAli``) is rebuilt and used for two checks:

    1. Exon boundaries. The start of every exon but the first in ``cds1``
       is mapped through the alignment and compared to the exon starts of
       ``cds2``. Two boundaries agree if they differ by less than
       ``param_boundaries_max_slippage``. The boundaries of the shorter
       list are looked up in the longer one.
    2. Coverage. Used only if at least one transcript has a single exon;
       the smaller of the two coverages must reach ``param_min_coverage``.

    :param link: object with ``mQueryFrom``, ``mQueryAli``, ``mQueryLength``,
        ``mSbjctFrom``, ``mSbjctAli`` and ``mSbjctLength``.
    :param cds1: exons of the query transcript (objects with ``mPeptideFrom``).
    :param cds2: exons of the sbjct transcript (objects with ``mPeptideFrom``).
    :return: ``(True, "different exon boundaries")``, ``(True, "low coverage")``
        or ``(False, None)``.
    :raises ValueError: if the aligned region is longer than the sequence
        length recorded in ``link``.
    """
    map_a2b = alignlib.makeAlignmentVector()
    alignlib.AlignmentFormatEmissions(
        link.mQueryFrom, link.mQueryAli,
        link.mSbjctFrom, link.mSbjctAli ).copy( map_a2b )

    aligned_rows = map_a2b.getRowTo() - map_a2b.getRowFrom() + 1
    aligned_cols = map_a2b.getColTo() - map_a2b.getColFrom() + 1

    if link.mQueryLength < aligned_rows or link.mSbjctLength < aligned_cols:
        print("ERRONEOUS LINK: %s" % str(link))
        # a plain string cannot be raised on Python >= 2.6
        raise ValueError("length discrepancy")

    coverage_a = 100.0 * aligned_rows / link.mQueryLength
    coverage_b = 100.0 * aligned_cols / link.mSbjctLength

    def MyMap( a, x ):
        """Map row ``x`` to a column, moving right to the next aligned row.

        Returns 0 if ``x`` lies before the alignment or no aligned row is
        found up to the end of the alignment.
        """
        if x < a.getRowFrom():
            return 0
        while x <= a.getRowTo():
            c = a.mapRowToCol( x )
            if c:
                return c
            x += 1
        return 0

    ## check exon boundaries, look at starts, skip first exon.
    ## '//' keeps the integer division of the original on every Python
    ## version (assumes mPeptideFrom is an integer nucleotide offset).
    mapped_boundaries = UniquifyList(
        [MyMap( map_a2b, exon.mPeptideFrom // 3 + 1 ) for exon in cds1[1:]] )
    reference_boundaries = UniquifyList(
        [exon.mPeptideFrom // 3 + 1 for exon in cds2[1:]] )

    nmin = min( len(mapped_boundaries), len(reference_boundaries) )

    both_single_exon = len(cds1) == 1 and len(cds2) == 1
    one_single_exon = len(cds1) == 1 or len(cds2) == 1

    # look up the boundaries of the shorter list in the longer one
    if len(mapped_boundaries) < len(reference_boundaries):
        mless, mmore = mapped_boundaries, reference_boundaries
    else:
        mless, mmore = reference_boundaries, mapped_boundaries

    ## check if exon boundaries are ok
    nfound = 0
    nmissed = 0
    for x in mless:
        if any( abs(x - c) < param_boundaries_max_slippage for c in mmore ):
            nfound += 1
        else:
            nmissed += 1

    ## set is_ok dependent on exon boundaries;
    ## in single exon cases, require a check of coverage
    is_ok = False
    check_coverage = False
    if both_single_exon or one_single_exon:
        is_ok = True
        check_coverage = True
    elif nmin == 1:
        is_ok = nmissed == 0
    elif nmin == 2:
        is_ok = nmissed <= 1
    elif nmin > 2:
        is_ok = nfound >= 2

    cc = min( coverage_a, coverage_b )

    if param_loglevel >= 3:
        # joined by hand so that the line is identical on Python 2 and 3
        print(" ".join( map( str, (
            "# nquery=", len(cds1), "nsbjct=", len(cds2),
            "nmin=", nmin, "nmissed=", nmissed, "nfound=", nfound,
            "is_ok=", is_ok, "check_cov=", check_coverage,
            "min_cov=", cc, coverage_a, coverage_b,
            "mapped=", mapped_boundaries,
            "reference=", reference_boundaries ) ) ))

    if not is_ok:
        return True, "different exon boundaries"

    if check_coverage and cc < param_min_coverage:
        return True, "low coverage"

    return False, None
# Micro-benchmark script comparing a plain Python list with an alignlib
# alignment vector. The function bodies below are the code under test, so
# they are deliberately left in their simple loop form.
import timeit
import alignlib

# NOTE(review): NUM_SAMPLES is not used in the visible part of the script;
# presumably the number of timeit repetitions -- confirm further down.
NUM_SAMPLES=1000
# number of positions in the test alignment / list
ALISIZE=2000

# module-level fixture: alignlib vector with a single diagonal
alignlib_vector = alignlib.makeAlignmentVector()
alignlib_vector.addDiagonal( 0, ALISIZE, 0)

# module-level fixture: python list [0, 1, ..., ALISIZE - 1]
python_vector = []
for x in xrange(ALISIZE):
    python_vector.append(x)

def pythonBuildVector():
    """Benchmark: build a list of ALISIZE integers with append()."""
    vector = []
    for x in xrange(ALISIZE):
        vector.append(x)

def alignlibBuildVector():
    """Benchmark: build an alignlib alignment vector with one diagonal."""
    vector = alignlib.makeAlignmentVector()
    vector.addDiagonal( 0, ALISIZE, 0)

def pythonMapVector():
    """Benchmark: index every position of the pre-built python list.

    Python counterpart of a ``mapRowToCol`` lookup.
    """
    for x in xrange(ALISIZE):
        a = python_vector[x]

# NOTE(review): the body of this function is not visible in this excerpt.
def alignlibMapVector():
raise ValueError( "mapping error for sequence: %s" % (msg) ) ## if there are more than five frameshifts - do exhaustive alignment max_gaps = 5 num_peptide_gaps = len( re.sub("[^-]", "", p ) ) ngaps = map_p2c.getNumGaps() - (num_peptide_gaps * 3) - abs(len(w)-len(c)) if options.loglevel >= 6: options.stdlog.write("# alignment between wobble and cds: ngaps=%i, npeptide_gaps=%i\n" % (ngaps, num_peptide_gaps) ) PrintPrettyAlignment( seq_wobble, seq_cds, p, map_p2c, options ) if ngaps > max_gaps: if options.loglevel >= 2: options.stdlog.write("# too many gaps (%i>%i), realigning exhaustively.\n" % (ngaps, max_gaps ) ) options.stdlog.flush() full_map_p2c = alignlib.makeAlignmentVector() AlignExhaustive( seq_wobble, seq_cds, seq_peptide, full_map_p2c, options ) if options.loglevel >= 6: options.stdlog.write("# full alignment between wobble and cds:\n" ) options.stdlog.flush() PrintPrettyAlignment( seq_wobble, seq_cds, p, full_map_p2c, options ) map_p2c = full_map_p2c ## remove incomplete codons x = 0 while x < len(p) * 3: if (map_p2c.mapRowToCol( x ) < 0 or \ map_p2c.mapRowToCol( x+1 ) < 0 or \ map_p2c.mapRowToCol( x+2 ) < 0 ):
def alignlibBuildVector():
    """Benchmark body: create an alignlib alignment vector with one diagonal.

    The diagonal is added with the arguments ``(0, ALISIZE, 0)``; the
    resulting object is discarded.
    """
    alignment = alignlib.makeAlignmentVector()
    alignment.addDiagonal(0, ALISIZE, 0)
def Align( self, method, anchor = 0, loglevel = 1 ):
    """Align ``self.mSequence1`` against ``self.mSequence2`` and store the result.

    get rid of this and use a method class instead in the future

    :param method: one of ``"dialign"``, ``"blastz"``, ``"dialignlgs"``,
        ``"dba"``, ``"nw"``, ``"sw"``, or a callable that is invoked as
        ``method(s1, s2, map_a2b)`` and fills ``map_a2b``.
        ``"clustal"`` is recognised but not implemented.
    :param anchor: number of ``"A"`` residues added to both ends of both
        sequences before aligning. The anchored regions are removed from
        the alignment afterwards and the coordinates shifted back.
    :param loglevel: accepted for compatibility; not used in this method.
    :raises NotImplementedError: for ``method == "clustal"``.
    :raises AlignmentError: if the resulting alignment is empty.

    Side effects: sets ``strand`` (``"-"`` only if blastz reports a reverse
    complement hit, in which case ``mSequence2`` is replaced by
    ``Genomics.complement(mSequence2)``), ``mMethod``, ``mAlignment``,
    ``mAlignedSequence1/2``, ``mAlignment1/2``, ``mAlignmentFrom1/To1``,
    ``mAlignmentFrom2/To2``, ``mNumGaps``, ``mLength`` and ``mAligned``,
    then calls ``SetPercentIdentity()`` and ``SetBlockSizes()``.
    """
    map_a2b = alignlib.makeAlignmentVector()

    # flank both sequences with the anchor residues
    flank = "A" * anchor
    s1 = flank + self.mSequence1 + flank
    s2 = flank + self.mSequence2 + flank

    self.strand = "+"

    if method == "dialign":
        dialign = WrapperDialign.Dialign( self.mOptionsDialign )
        dialign.Align( s1, s2, map_a2b )
    elif method == "blastz":
        blastz = WrapperBlastZ.BlastZ( self.mOptionsBlastZ )
        blastz.Align( s1, s2, map_a2b )
        if blastz.isReverseComplement():
            self.strand = "-"
            self.mSequence2 = Genomics.complement( self.mSequence2 )
    elif method == "dialignlgs":
        dialignlgs = WrapperDialign.Dialign( self.mOptionsDialignLGS )
        dialignlgs.Align( s1, s2, map_a2b )
    elif method == "dba":
        dba = WrapperDBA.DBA()
        dba.Align( s1, s2, map_a2b )
    elif method == "clustal":
        # the statements that used to follow this raise could never run
        raise NotImplementedError( "clustal wrapper needs to be updated")
    elif method == "nw":
        seq1 = alignlib.makeSequence( s1 )
        seq2 = alignlib.makeSequence( s2 )
        alignator = alignlib.makeAlignatorDPFull( alignlib.ALIGNMENT_GLOBAL,
                                                  gop=-12.0, gep=-2.0 )
        alignator.align( map_a2b, seq1, seq2 )
    elif method == "sw":
        seq1 = alignlib.makeSequence( s1 )
        seq2 = alignlib.makeSequence( s2 )
        # alignator_sw / min_score_sw are defined outside this method
        alignlib.performIterativeAlignment( map_a2b, seq1, seq2,
                                            alignator_sw, min_score_sw )
    else:
        ## use callback function
        method(s1, s2, map_a2b)

    if map_a2b.getLength() == 0:
        raise AlignmentError("empty alignment")

    if anchor:
        # cut off everything aligned to the flanks, then shift back so that
        # coordinates refer to the unanchored sequences
        map_a2b.removeRowRegion( anchor + len(self.mSequence1) + 1, map_a2b.getRowTo() )
        map_a2b.removeRowRegion( 1, anchor)
        map_a2b.removeColRegion( anchor + len(self.mSequence2) + 1, map_a2b.getColTo() )
        map_a2b.removeColRegion( 1, anchor)
        map_a2b.moveAlignment( -anchor, -anchor )

    explicit = alignlib.AlignmentFormatExplicit(
        map_a2b,
        alignlib.makeSequence( self.mSequence1 ),
        alignlib.makeSequence( self.mSequence2 ) )

    self.mMethod = method
    self.mAlignment = map_a2b
    self.mAlignedSequence1 = explicit.mRowAlignment
    self.mAlignedSequence2 = explicit.mColAlignment

    emissions = alignlib.AlignmentFormatEmissions( map_a2b )
    self.mAlignment1 = emissions.mRowAlignment
    self.mAlignment2 = emissions.mColAlignment

    self.mAlignmentFrom1 = map_a2b.getRowFrom()
    self.mAlignmentTo1 = map_a2b.getRowTo()
    self.mAlignmentFrom2 = map_a2b.getColFrom()
    self.mAlignmentTo2 = map_a2b.getColTo()

    self.mNumGaps = map_a2b.getNumGaps()
    self.mLength = map_a2b.getLength()
    self.mAligned = self.mLength - self.mNumGaps

    self.SetPercentIdentity()
    self.SetBlockSizes()
def main():
    """Command line entry point: inspect ADDA neighbours, pile-ups and profiles.

    ``--method`` selects what is done with the positional arguments:

    * ``view``: print the neighbours of every nid given on the command line.
    * ``pileup``: write the multiple alignment built for the first argument.
    * ``profile``: write the profile built for the first argument; if the
      argument has the form accepted by ``AddaIO.toTuple`` (it contains
      ``"_"``), the profile is restricted to that segment.
    * ``align``: align the profiles of the first two arguments (both in
      ``AddaIO.toTuple`` form) with ``--mode``, ``--gop`` and ``--gep`` and
      write score, coordinates and the explicit alignment.
    """
    parser = optparse.OptionParser(version="%prog version: $Id$", usage=USAGE)

    parser.add_option("--method", dest="method", type="choice",
                      choices=("view", "align", "pileup", "profile"),
                      help="method to perform [default=%default].")

    parser.add_option("--mode", dest="mode", type="choice",
                      choices=("global", "local"),
                      help="alignment mode [default=%default].")

    parser.add_option("--gop", dest="gop", type="float",
                      help="gap opening penalty [default=%default].")

    parser.add_option("--gep", dest="gep", type="float",
                      help="gap extension penalty [default=%default].")

    parser.set_defaults(
        filename_graph="adda.graph",
        filename_index="adda.graph.idx",
        method="view",
        filename_fasta="adda",
        filename_config="adda.ini",
        append=False,
        force=False,
        mode="local",
        gop=-10.0,
        gep=-1.0,
    )

    (options, args) = E.Start(parser)

    config = AddaIO.ConfigParser()
    config.read(os.path.expanduser(options.filename_config))

    index = cadda.IndexedNeighbours(options.filename_graph,
                                    options.filename_index)

    # configure the alignlib default toolkit used when building profiles
    toolkit = alignlib.getDefaultToolkit()
    toolkit.setEncoder(alignlib.getEncoder(alignlib.Protein20))
    toolkit.setRegularizor(alignlib.makeRegularizorDirichletPrecomputed())
    toolkit.setLogOddor(alignlib.makeLogOddorDirichlet(0.3))
    toolkit.setWeightor(alignlib.makeWeightor())

    fasta = IndexedFasta.IndexedFasta(options.filename_fasta)
    align = AddaProfiles.AddaProfiles(config, fasta=fasta)

    def _parseTarget(arg):
        """Return (nid, start, end); start/end are None for a bare nid."""
        if "_" in arg:
            return AddaIO.toTuple(arg)
        return int(arg), None, None

    def _buildProfile(nid, start, end):
        """Return (sequence, prepared profile restricted to start..end)."""
        neighbours = index.getNeighbours(nid)
        mali = align.buildMali(nid, neighbours)
        prof = alignlib.makeProfile(mali)
        E.info("nid: %i, neighours=%i" % (nid, len(neighbours)))
        prof.useSegment(start, end)
        prof.prepare()
        seq = fasta.getSequence(nid)
        return alignlib.makeSequence(seq), prof

    if options.method == "view":
        # each argument is looked up in turn (previously every iteration
        # re-read args[0] and so showed the first nid repeatedly)
        for arg in args:
            for n in index.getNeighbours(int(arg)):
                print(str(n))

    elif options.method == "pileup":
        # a segment is accepted on the command line but not applied here
        nid, start, end = _parseTarget(args[0])
        neighbours = index.getNeighbours(nid)
        mali = align.buildMali(nid, neighbours)
        options.stdout.write("%s\n" % str(mali))

    elif options.method == "profile":
        nid, start, end = _parseTarget(args[0])
        neighbours = index.getNeighbours(nid)
        mali = align.buildMali(nid, neighbours)
        prof = alignlib.makeProfile(mali)
        E.info("nid: %i, neighours=%i" % (nid, len(neighbours)))
        if start is not None:
            prof.useSegment(start, end)
        prof.prepare()
        options.stdout.write("%s\n" % str(prof))

    elif options.method == "align":
        nid1, start1, end1 = AddaIO.toTuple(args[0])
        nid2, start2, end2 = AddaIO.toTuple(args[1])

        if options.mode == "local":
            mode = alignlib.ALIGNMENT_LOCAL
        else:
            mode = alignlib.ALIGNMENT_GLOBAL

        alignator = alignlib.makeAlignatorDPFull(mode, options.gop, options.gep)

        seq1, prof1 = _buildProfile(nid1, start1, end1)
        seq2, prof2 = _buildProfile(nid2, start2, end2)

        result = alignlib.makeAlignmentVector()
        alignator.align(result, prof1, prof2)

        E.debug("%s\n" % str(result))

        options.stdout.write(
            "%s vs %s: score=%5.2f, length=%i, numgaps=%i, row_from=%i, row_to=%i, col_from=%i, col_to=%i\n" %
            (nid1, nid2,
             result.getScore(),
             result.getLength(),
             result.getNumGaps(),
             result.getRowFrom(), result.getRowTo(),
             result.getColFrom(), result.getColTo()))

        f = alignlib.AlignmentFormatExplicit(result, seq1, seq2)
        options.stdout.write("%s\n" % str(f))

    E.Stop()