def PrintAlignedSequences( sequence1, sequence2, chain = None, format="modeller" ):
    """Align two sequences and print the resulting pairwise alignment.

    The sequences are aligned with a full dynamic-programming alignator
    (both gap penalties set to -0.0) and the alignment is rendered with
    alignlib.writePairAlignment.

    sequence1, sequence2 -- sequences to align (row and column).
    chain -- accepted for interface compatibility; not used here.
    format -- "modeller" prints a ``structure`` and a ``sequence`` entry
        built from the first two alignment lines; any other value
        prints the list of alignment lines as is.
    """
    row = alignlib.makeSequence( sequence1 )
    col = alignlib.makeSequence( sequence2 )

    ## align sequences by identity
    aligner = alignlib.makeAlignatorFullDP( -0.0, -0.0 )
    row2col = alignlib.makeAlignataVector()
    aligner.Align( row, col, row2col )

    lines = string.split( alignlib.writePairAlignment( row, col, row2col ), "\n" )

    if format != "modeller":
        print(lines)
        return

    # (header, entry template, label, index of the alignment line to use)
    entries = (
        ( ">P1;structure", "structureX: %s : %s : %s : %s : %s : : : : ", "structure", 0 ),
        ( ">P1;sequence", "sequence:%s : %s : %s : %s : %s : : : : ", "sequence", 1 ),
        )

    for header, template, label, index in entries:
        # each alignment line is tab-separated into three fields
        first_res, sequence, last_res = string.split( lines[index], "\t" )
        print(header)
        print(template % (label, first_res, "", last_res, ""))
        print("%s*" % sequence)
def GetAlignmentBetweenCorrespondingAtoms( coordinates1, coordinates2, cutoff ):
    """Align two lists of atom positions by spatial proximity.

    Every position in *coordinates1* is compared against every position
    in *coordinates2*. Each pair whose Euclidean distance is at most
    *cutoff* is stored as a dot (indices shifted by one, score 1). The
    dots are then chained by a dots-based alignator run over two dummy
    poly-"A" sequences with the same lengths as the coordinate lists.

    Returns the alignment, or None if ``dots.getLength()`` is 3 or less.
    """
    dots = alignlib.makeAlignataMatrixRow()

    # all-versus-all comparison; keep only pairs within the cutoff
    for row, (x1, y1, z1) in enumerate( coordinates1 ):
        for col, (x2, y2, z2) in enumerate( coordinates2 ):
            dx, dy, dz = x1 - x2, y1 - y2, z1 - z2
            distance = math.sqrt( dx * dx + dy * dy + dz * dz )
            if distance <= cutoff:
                dots.addPairExplicit( row + 1, col + 1, 1 )

    # placeholder sequences: only their lengths matter for the dot alignment
    seq1 = alignlib.makeSequence( "A" * len(coordinates1) )
    seq2 = alignlib.makeSequence( "A" * len(coordinates2) )

    if dots.getLength() <= 3:
        return None

    dottor = alignlib.makeAlignatorDummy( dots )
    alignator = alignlib.makeAlignatorDotsSquared( 0, 0, dottor )
    map_a2b = alignlib.makeAlignataVector()
    alignator.Align( seq1, seq2, map_a2b )

    return map_a2b
def CheckAlignments( peptide_sequences, query_token, other_tokens ):
    """Check whether the query aligns to at least one of the other tokens.

    peptide_sequences -- mapping of token to peptide sequence.
    query_token -- token of the query sequence.
    other_tokens -- tokens to align the query against; tokens without a
        sequence in *peptide_sequences* are ignored.

    Returns True as soon as one local alignment scores above the
    module-level ``param_min_alignment_score``, and also when the query
    itself has no sequence. Returns False otherwise.
    """
    if param_loglevel >= 3:
        print("# checking query %s and sbjcts %s" % (query_token, str(other_tokens)))
        sys.stdout.flush()

    # without a query sequence nothing can be checked: let it pass
    if query_token not in peptide_sequences:
        return True

    result = alignlib.makeAlignmentVector()
    alignator = alignlib.makeAlignatorDPFull( alignlib.ALIGNMENT_LOCAL,
                                              -10.0, -1.0 )
    row_seq = alignlib.makeSequence( peptide_sequences[query_token] )

    for sbjct_token in other_tokens:
        if sbjct_token in peptide_sequences:
            col_seq = alignlib.makeSequence( peptide_sequences[sbjct_token] )
            alignator.align( result, row_seq, col_seq )

            if param_loglevel >= 5:
                print("# %s - %s = %f" % (query_token, sbjct_token, result.getScore()))

            if result.getScore() > param_min_alignment_score:
                return True

    return False
def PrintAlignedSequences(sequence1, sequence2, chain=None, format="modeller"):
    """Align two sequences and print the resulting pairwise alignment.

    The sequences are aligned with a full dynamic-programming alignator
    (both gap penalties set to -0.0) and the alignment is rendered with
    alignlib.writePairAlignment.

    sequence1, sequence2 -- sequences to align (row and column).
    chain -- accepted for interface compatibility; not used here.
    format -- "modeller" prints a ``structure`` and a ``sequence`` entry
        built from the first two alignment lines; any other value
        prints the list of alignment lines as is.
    """
    row = alignlib.makeSequence(sequence1)
    col = alignlib.makeSequence(sequence2)

    ## align sequences by identity
    aligner = alignlib.makeAlignatorFullDP(-0.0, -0.0)
    row2col = alignlib.makeAlignataVector()
    aligner.Align(row, col, row2col)

    rendered = alignlib.writePairAlignment(row, col, row2col)
    lines = string.split(rendered, "\n")

    if format != "modeller":
        print(lines)
        return

    # (header, entry template, label, index of the alignment line to use)
    entries = (
        (">P1;structure",
         "structureX: %s : %s : %s : %s : %s : : : : ", "structure", 0),
        (">P1;sequence",
         "sequence:%s : %s : %s : %s : %s : : : : ", "sequence", 1),
    )

    for header, template, label, index in entries:
        # each alignment line is tab-separated into three fields
        first_res, sequence, last_res = string.split(lines[index], "\t")
        print(header)
        print(template % (label, first_res, "", last_res, ""))
        print("%s*" % sequence)
def getMapPeptide2Cds( peptide_sequence, cds_sequence, options ): """get map between peptide sequence and cds sequence. The returned alignment is in nucleotides. """ ## remove whitespaces form protein sequence p = re.sub(" ", "", peptide_sequence ) ## remove gaps and whitespaces from cds c = re.sub("[ .-]", "", cds_sequence ) w = Genomics.Protein2Wobble( p.upper() ) if options.loglevel >= 6: options.stdlog.write( "# peptide original (%5i): %s\n" % (len(p), p) ) options.stdlog.write( "# cds original (%5i): %s\n" % (len(c), c) ) options.stdlog.write( "# wobble sequence (%5i): %s\n" % (len(w), w) ) options.stdlog.flush() seq_wobble = alignlib.makeSequence( w ) seq_cds = alignlib.makeSequence( string.upper(c) ) seq_peptide = alignlib.makeSequence( p ) map_p2c = alignlib.makeAlignmentVector() try: AlignCodonBased( seq_wobble, seq_cds, seq_peptide, map_p2c, options = options ) except ValueError, msg: raise ValueError( "mapping error for sequence: %s" % (msg) )
def AlignPair( pair, anchor = 0 ):
    """align a pair of introns.

    pair -- object carrying the two sequences in ``mAlignedSequence1``
        and ``mAlignedSequence2``; on success its alignment attributes
        (``mFrom*``, ``mTo*``, ``mAlignedSequence*``, ``mMethod``,
        ``mNumGaps``, ``mLength``, ``mAligned``) are overwritten.
    anchor -- number of "A" characters added to both ends of both
        sequences before aligning; the anchored regions are removed from
        the alignment again afterwards.

    The aligner is selected by the module-level ``param_method``.

    Returns True on success and False if the alignment is empty.
    Raises NotImplementedError for ``param_method == "clusaligned"``.
    """
    map_intron_a2b = alignlib.makeAlignmentVector()

    if param_loglevel >= 1:
        print("# aligning %s-%i with %s-%i: lengths %i and %i" % (pair.mToken1, pair.mIntronId1,
                                                                pair.mToken2, pair.mIntronId2,
                                                                len(pair.mAlignedSequence1),
                                                                len(pair.mAlignedSequence2)))
        sys.stdout.flush()

    s1 = "A" * anchor + pair.mAlignedSequence1 + "A" * anchor
    s2 = "A" * anchor + pair.mAlignedSequence2 + "A" * anchor

    if param_method == "dialigned":
        dialign.Align( s1, s2, map_intron_a2b )
    elif param_method == "dialignedlgs":
        dialignlgs.Align( s1, s2, map_intron_a2b )
    elif param_method == "dbaligned":
        dba.Align( s1, s2, map_intron_a2b )
    elif param_method == "clusaligned":
        raise NotImplementedError("clustalw wrapper not up-to-date")

    if anchor:
        # cut the anchors off both ends and shift the alignment back so
        # that coordinates refer to the unpadded sequences
        map_intron_a2b.removeRowRegion( anchor + len(pair.mAlignedSequence1) + 1, map_intron_a2b.getRowTo() )
        map_intron_a2b.removeRowRegion( 1, anchor)
        map_intron_a2b.removeColRegion( anchor + len(pair.mAlignedSequence2) + 1, map_intron_a2b.getColTo() )
        map_intron_a2b.removeColRegion( 1, anchor)
        map_intron_a2b.moveAlignment( -anchor, -anchor )

    # an unknown param_method leaves the alignment untouched and ends here
    if map_intron_a2b.getLength() == 0:
        if param_loglevel >= 1:
            print("# Error: empty intron alignment")
        return False

    seq1 = alignlib.makeSequence( pair.mAlignedSequence1 )
    seq2 = alignlib.makeSequence( pair.mAlignedSequence2 )

    data = alignlib.AlignmentFormatExplicit( map_intron_a2b, seq1, seq2 )

    pair.mFrom1, pair.mAlignedSequence1, pair.mTo1 = data.mRowFrom, data.mRowAlignment, data.mRowTo
    pair.mFrom2, pair.mAlignedSequence2, pair.mTo2 = data.mColFrom, data.mColAlignment, data.mColTo
    pair.mMethod = param_method

    pair.mNumGaps, pair.mLength = map_intron_a2b.getNumGaps(), map_intron_a2b.getLength()
    pair.mAligned = pair.mLength - pair.mNumGaps

    if param_loglevel >= 2:
        print("# alignment success %s %s" % (pair.mAlignedSequence1, pair.mAlignedSequence2))

    return True
def applyMethod(self, neighbours):
    """apply the method.

    Re-align the query against each neighbour with a global
    dynamic-programming alignment, restricted to the segments reported
    for the match, and compare the result with the reference alignment
    returned by ``n.getAlignment()``.

    For every neighbour one tab-separated line is written to
    ``self.mOutfile``: query token, sbjct token, value returned by
    alignlib.getAlignmentIdentity, number of aligned positions in the
    reference alignment and in the re-alignment. ``self.mNIdentical`` or
    ``self.mNDifferent`` is incremented depending on whether the first
    two numbers agree.

    Raises ValueError if a neighbour has no reference alignment.
    """
    query_nid = neighbours.mQueryToken

    qseq = alignlib.makeSequence(self.mFasta.getSequence(query_nid))

    alignator = alignlib.makeAlignatorDPFull(alignlib.ALIGNMENT_GLOBAL,
                                             -10.0, -1.0,
                                             True, True, True, True)

    for n in neighbours.mMatches:

        # do not compare the query with itself
        if n.mSbjctToken == query_nid:
            continue

        sequence = self.mFasta.getSequence(n.mSbjctToken)

        blast_query2sbjct = n.getAlignment()

        if blast_query2sbjct is None:
            raise ValueError(
                "AddaRealignment.py needs a reference alignment.")

        sseq = alignlib.makeSequence(sequence)

        # restrict both sequences to the matched segments
        qseq.useSegment(n.mQueryFrom, n.mQueryTo)
        sseq.useSegment(n.mSbjctFrom, n.mSbjctTo)

        realign_query2sbjct = alignlib.makeAlignmentVector()
        alignator.align(realign_query2sbjct, qseq, sseq)

        nidentical = alignlib.getAlignmentIdentity(realign_query2sbjct,
                                                   blast_query2sbjct,
                                                   alignlib.RR)
        nblast = blast_query2sbjct.getNumAligned()
        nrealigned = realign_query2sbjct.getNumAligned()

        self.mOutfile.write("%s\t%s\t%i\t%i\t%i\n" %
                            (n.mQueryToken, n.mSbjctToken,
                             nidentical, nblast, nrealigned))

        if nidentical == nblast:
            self.mNIdentical += 1
        else:
            self.mNDifferent += 1
def _buildProfile(nid, start, end):
    """Return the sequence and a prepared profile for *nid*.

    The profile is built from the multiple alignment of *nid* with its
    neighbours (looked up through ``index`` and ``align`` from the
    enclosing scope) and restricted to the segment *start* to *end*.

    Returns a tuple (alignlib sequence of nid, profile).
    """
    hits = index.getNeighbours(nid)
    profile = alignlib.makeProfile(align.buildMali(nid, hits))

    E.info("nid: %i, neighours=%i" % (nid, len(hits)))

    # restrict the profile to the requested segment before preparing it
    profile.useSegment(start, end)
    profile.prepare()

    return alignlib.makeSequence(fasta.getSequence(nid)), profile
def applyMethod(self, neighbours ):
    """apply the method.

    Re-align the query against each neighbour with a global
    dynamic-programming alignment, restricted to the segments reported
    for the match, and compare the result with the reference alignment
    returned by ``n.getAlignment()``.

    For every neighbour one tab-separated line is written to
    ``self.mOutfile``: query token, sbjct token, value returned by
    alignlib.getAlignmentIdentity, number of aligned positions in the
    reference alignment and in the re-alignment. ``self.mNIdentical`` or
    ``self.mNDifferent`` is incremented depending on whether the first
    two numbers agree.

    Raises ValueError if a neighbour has no reference alignment.
    """
    query_nid = neighbours.mQueryToken

    qseq = alignlib.makeSequence( self.mFasta.getSequence( query_nid ) )

    alignator = alignlib.makeAlignatorDPFull( alignlib.ALIGNMENT_GLOBAL,
                                              -10.0, -1.0,
                                              True, True, True, True)

    for n in neighbours.mMatches:

        # do not compare the query with itself
        if n.mSbjctToken == query_nid: continue

        sequence = self.mFasta.getSequence( n.mSbjctToken )

        blast_query2sbjct = n.getAlignment()

        if blast_query2sbjct is None:
            raise ValueError( "AddaRealignment.py needs a reference alignment.")

        sseq = alignlib.makeSequence( sequence )

        # restrict both sequences to the matched segments
        qseq.useSegment( n.mQueryFrom, n.mQueryTo )
        sseq.useSegment( n.mSbjctFrom, n.mSbjctTo )

        realign_query2sbjct = alignlib.makeAlignmentVector()
        alignator.align( realign_query2sbjct, qseq, sseq )

        nidentical = alignlib.getAlignmentIdentity( realign_query2sbjct,
                                                    blast_query2sbjct,
                                                    alignlib.RR )
        nblast = blast_query2sbjct.getNumAligned()
        nrealigned = realign_query2sbjct.getNumAligned()

        self.mOutfile.write( "%s\t%s\t%i\t%i\t%i\n" % \
                             (n.mQueryToken, n.mSbjctToken, nidentical, nblast, nrealigned ) )

        if nidentical == nblast:
            self.mNIdentical += 1
        else:
            self.mNDifferent += 1
def CreateAlignandumObjects( self, sources ): tbl_nrdb = Table_nrdb( self.dbhandle ) alignanda = [] for id, nid, nid_from, nid_to in sources: if self.mLogLevel >= 2: print id, sys.stdout.flush() sequence = tbl_nrdb.Get_Sequence_From_NID( nid ) alignandum = alignlib.makeSequence( sequence[nid_from-1:nid_to] ) alignanda.append( (id, alignandum) ) if self.mLogLevel >= 2: print return alignanda
def main( argv = None ):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads predictions line by line from ``options.stdin``, builds the
    alignment between the query cds and the predicted genomic fragment
    and writes each pair as two FASTA entries to ``options.stdout``.
    Lines starting with ``#`` or ``"`` are ignored. Predictions that
    fail the length or translation checks are reported to
    ``options.stdlog`` and skipped.

    Raises ValueError if the genomic fragment of a prediction is empty.
    """

    if argv is None: argv = sys.argv

    parser = E.OptionParser( version = "%prog version: $Id: gpipe/prediction2pairs.py 2031 2008-07-15 09:19:05Z andreas $",
                             usage = globals()["__doc__"])

    parser.add_option( "-g", "--genome-file", dest="genome_file", type="string",
                       help="filename with genomic data (indexed)." )

    parser.add_option( "-c", "--cds", dest="filename_cds", type="string",
                       help="filename with cds seguences." )

    parser.add_option( "-f", "--format", dest="format", type="choice",
                       choices=("paired_fasta", ),
                       help="output format, valid options are: paired_fasta: concatenated pairwise alignments in FASTA format" )

    parser.set_defaults(
        genome_file = "genome",
        filename_cds = "cds.fasta",
        format = "paired_fasta",
        filename_suffix = ".fasta",
        filename_prefix = "",
        )

    (options, args) = E.Start( parser, add_psql_options = True )

    if len(args) > 0:
        print("%s no arguments required." % (USAGE,))
        sys.exit(1)

    fasta = IndexedFasta.IndexedFasta( options.genome_file )

    ## reading CDS sequences
    if options.filename_cds:
        infile = open(options.filename_cds, "r")
        try:
            cds_sequences = Genomics.ReadPeptideSequences( infile )
        finally:
            # do not leak the file handle
            infile.close()
    else:
        cds_sequences = {}

    if options.loglevel >= 1:
        options.stdlog.write( "# read %i CDS sequences\n" % len(cds_sequences) )

    p = PredictionParser.PredictionParserEntry()

    ninput, noutput, nsanity, n3, nlength = 0, 0, 0, 0, 0

    for line in options.stdin:

        if line[0] == "#": continue
        if line[0] == '"': continue

        p.Read(line)

        ninput += 1

        genomic_fragment = fasta.getSequence( p.mSbjctToken, p.mSbjctStrand,
                                              p.mSbjctGenomeFrom, p.mSbjctGenomeTo )

        if len(genomic_fragment) == 0:
            # string exceptions are not supported any more: raise a
            # proper exception that carries the offending line
            raise ValueError( "ERROR: empty fragment %s:%s for line %s" %
                              (p.mSbjctGenomeFrom, p.mSbjctGenomeTo, line) )

        try:
            cds_fragment = cds_sequences[p.mQueryToken]
        except KeyError:
            options.stdlog.write( "# ERROR: cds not found: query %s.\n" % p.mQueryToken )
            continue

        map_query2sbjct, genomic_fragment = Genomics.Alignment2CDNA( p.mMapPeptide2Genome,
                                                                     query_from = p.mQueryFrom,
                                                                     sbjct_from = 0,
                                                                     genome = genomic_fragment )

        ## check for errors:
        # a boundary shift is only reported, the prediction is still processed
        if map_query2sbjct.getRowTo() != p.mQueryTo * 3:
            options.stdlog.write( "# ERROR: boundary shift in query at line %s\n# %i %i\n" % (line, map_query2sbjct.getRowTo(), p.mQueryTo * 3 ) )

        if map_query2sbjct.getColTo() > len(genomic_fragment):
            options.stdlog.write( "# ERROR: length mismatch in line %s\n# genomic fragment (%i) shorter than last aligned residue (%i)\n" %\
                                  (line, len(genomic_fragment), map_query2sbjct.getColTo()) )
            options.stdlog.write( "# cds %s\n# genomic %s\n" % (str( cds_fragment ), genomic_fragment ))
            nlength += 1
            continue

        if map_query2sbjct.getRowTo() > len(cds_fragment):
            options.stdlog.write( "# ERROR: length mismatch in line %s\n# cds fragment (%i) shorter than last aligned residue (%i)\n" %\
                                  (line, len(cds_fragment), map_query2sbjct.getRowTo()) )
            options.stdlog.write( "# cds %s\n# genomic %s\n" % (str( cds_fragment ), genomic_fragment ))
            nlength += 1
            continue

        cds_seq = alignlib.makeSequence( cds_fragment )
        genomic_seq = alignlib.makeSequence( genomic_fragment )

        f = alignlib.AlignmentFormatExplicit( map_query2sbjct, cds_seq, genomic_seq )
        row_ali = f.mRowAlignment
        col_ali = f.mColAlignment

        row_ali, col_ali = Genomics.RemoveFrameShiftsFromAlignment(row_ali, col_ali)

        row_ali = Genomics.MaskStopCodons( row_ali )
        col_ali = Genomics.MaskStopCodons( col_ali )

        if len(row_ali) != len(col_ali):
            options.stdlog.write( "# ERROR: wrong alignment lengths.\n" )
            sys.exit(1)

        # incomplete codons are counted and reported, but not skipped
        if len(row_ali) % 3 or len(col_ali) % 3:
            options.stdlog.write( "# ERROR: sequences are not a multiple of 3 in line: %s\n" % line )
            options.stdlog.write( "# %6i %s\n# %6i %s\n" % (len(row_ali), str(row_ali),
                                                           len(col_ali), str(col_ali) ) )
            n3 += 1

        # sanity check: the translated genomic alignment has to agree
        # with the predicted translation once gaps and masked residues
        # are removed
        translation = re.sub( "[-X]", "", p.mTranslation )
        ref = re.sub( "[-X]", "", Genomics.TranslateDNA2Protein( col_ali ) )
        if translation != ref:
            if options.loglevel >= 1:
                options.stdlog.write("# sanity check failed for %s - %s\n# %6i %s\n# %6i %s\n" % (p.mPredictionId, p.mQueryToken,
                                                                                                    len(translation), translation,
                                                                                                    len(ref), ref ) )
            nsanity += 1
            continue

        options.stdout.write( ">%s\n%s\n" % (p.mPredictionId, row_ali) )
        options.stdout.write( ">%s_vs_%s_%s_%i_%i\n%s\n" % \
                              (p.mQueryToken, p.mSbjctToken, p.mSbjctStrand, p.mSbjctGenomeFrom, p.mSbjctGenomeTo, col_ali) )
        noutput += 1

    if options.loglevel >= 1:
        options.stdlog.write("# ninput=%i, noutput=%i, nsanity=%i, nlength=%i, n3=%i\n" % (ninput, noutput, nsanity, nlength, n3) )

    E.Stop()
def Align( self, method, anchor = 0, loglevel = 1 ):
    """align a pair of sequences.

    get rid of this and use a method class instead in the future

    method -- one of "dialign", "blastz", "dialignlgs", "dba", "nw" or
        "sw"; any other value is treated as a callable and invoked as
        ``method(s1, s2, map_a2b)``. "clustal" is recognised but not
        supported.
    anchor -- number of "A" characters added to both ends of both
        sequences before aligning; the anchored regions are removed from
        the alignment again afterwards.
    loglevel -- not used by this method.

    On success the alignment and derived values are stored on the
    instance (``mMethod``, ``mAlignment``, ``mAlignedSequence1/2``,
    ``mAlignment1/2``, ``mAlignmentFrom1/2``, ``mAlignmentTo1/2``,
    ``mNumGaps``, ``mLength``, ``mAligned``) and SetPercentIdentity()
    and SetBlockSizes() are called. ``self.strand`` is "+" unless blastz
    reports a reverse-complement match, in which case it is "-" and
    ``self.mSequence2`` is replaced by ``Genomics.complement`` of itself.

    Raises NotImplementedError for method "clustal" and AlignmentError
    if the resulting alignment is empty.
    """
    map_a2b = alignlib.makeAlignmentVector()
    s1 = "A" * anchor + self.mSequence1 + "A" * anchor
    s2 = "A" * anchor + self.mSequence2 + "A" * anchor

    self.strand = "+"

    if method == "dialign":
        dialign = WrapperDialign.Dialign( self.mOptionsDialign )
        dialign.Align( s1, s2, map_a2b )
    elif method == "blastz":
        blastz = WrapperBlastZ.BlastZ( self.mOptionsBlastZ )
        blastz.Align( s1, s2, map_a2b )
        if blastz.isReverseComplement():
            self.strand = "-"
            self.mSequence2 = Genomics.complement( self.mSequence2 )

    elif method == "dialignlgs":
        dialignlgs = WrapperDialign.Dialign( self.mOptionsDialignLGS )
        dialignlgs.Align( s1, s2, map_a2b )
    elif method == "dba":
        dba = WrapperDBA.DBA()
        dba.Align( s1, s2, map_a2b )
    elif method == "clustal":
        raise NotImplementedError( "clustal wrapper needs to be updated")
    elif method == "nw":
        seq1 = alignlib.makeSequence( s1 )
        seq2 = alignlib.makeSequence( s2 )
        alignator = alignlib.makeAlignatorDPFull( alignlib.ALIGNMENT_GLOBAL,
                                                  gop=-12.0,
                                                  gep=-2.0 )
        alignator.align( map_a2b, seq1, seq2 )
    elif method == "sw":
        seq1 = alignlib.makeSequence( s1 )
        seq2 = alignlib.makeSequence( s2 )
        # alignator_sw and min_score_sw are taken from module scope
        alignlib.performIterativeAlignment( map_a2b, seq1, seq2, alignator_sw, min_score_sw )
    else:
        ## use callback function
        method(s1, s2, map_a2b)

    if map_a2b.getLength() == 0:
        raise AlignmentError("empty alignment")

    if anchor:
        # cut the anchors off both ends and shift the alignment back so
        # that coordinates refer to the unpadded sequences
        map_a2b.removeRowRegion( anchor + len(self.mSequence1) + 1, map_a2b.getRowTo() )
        map_a2b.removeRowRegion( 1, anchor)
        map_a2b.removeColRegion( anchor + len(self.mSequence2) + 1, map_a2b.getColTo() )
        map_a2b.removeColRegion( 1, anchor)
        map_a2b.moveAlignment( -anchor, -anchor )

    f = alignlib.AlignmentFormatExplicit( map_a2b,
                                          alignlib.makeSequence( self.mSequence1),
                                          alignlib.makeSequence( self.mSequence2) )

    self.mMethod = method
    self.mAlignment = map_a2b
    self.mAlignedSequence1, self.mAlignedSequence2 = f.mRowAlignment, f.mColAlignment
    f = alignlib.AlignmentFormatEmissions( map_a2b )
    self.mAlignment1, self.mAlignment2 = f.mRowAlignment, f.mColAlignment
    self.mAlignmentFrom1 = map_a2b.getRowFrom()
    self.mAlignmentTo1 = map_a2b.getRowTo()
    self.mAlignmentFrom2 = map_a2b.getColFrom()
    self.mAlignmentTo2 = map_a2b.getColTo()
    self.mNumGaps, self.mLength = map_a2b.getNumGaps(), map_a2b.getLength()
    self.mAligned = self.mLength - self.mNumGaps

    self.SetPercentIdentity()
    self.SetBlockSizes()
t = target_exons[tt] if r.mGenomeTo < t.mGenomeFrom: rr += 1 continue elif t.mGenomeTo < r.mGenomeFrom: tt += 1 continue overlap += ( min(r.mGenomeTo, t.mGenomeTo) - max(r.mGenomeFrom, t.mGenomeFrom)) rr += 1 tt += 1 if overlap == 0: continue map_reference2target.clear() row = alignlib.makeSequence(reference.mTranslation) col = alignlib.makeSequence(target.mTranslation) alignator.align( map_reference2target, row, col ) f = alignlib.AlignmentFormatEmissions( map_reference2target ) row_ali, col_ali = f.mRowAlignment, f.mColAlignment pidentity = 100.0 * alignlib.calculatePercentIdentity( map_reference2target, row, col ) psimilarity = 100.0 * alignlib.calculatePercentSimilarity( map_reference2target ) union = max( reference.mSbjctGenomeTo, target.mSbjctGenomeTo) - \ min( reference.mSbjctGenomeFrom, target.mSbjctGenomeFrom ) inter = min( reference.mSbjctGenomeTo, target.mSbjctGenomeTo) - \ max( reference.mSbjctGenomeFrom, target.mSbjctGenomeFrom ) assignment_id += 1
def AlignCodonBased( seq_wobble, seq_cds, seq_peptide, map_p2c, options, diag_width = 2, max_advance = 2 ):
    """advance in codons in seq_wobble and match to nucleotides in seq_cds.

    Takes care of frameshifts.

    seq_wobble and seq_cds are walked in parallel, residue by residue.
    Matching residues are added to *map_p2c*. On a mismatch a window
    around the current position is re-aligned with AlignExhaustive and
    the affected part of *map_p2c* is replaced by the re-alignment.

    seq_wobble -- alignlib sequence of the back-translated peptide.
    seq_cds -- alignlib sequence of the cds.
    seq_peptide -- alignlib sequence of the peptide; only used for
        log output.
    map_p2c -- alignment to fill; it is cleared first.
    options -- object providing ``loglevel`` and ``stdlog``.
    diag_width, max_advance -- not used in this function.

    Raises ValueError if the re-alignment returns to the same start
    position twice, if residues still mismatch after re-alignment, or if
    no alignment is found within the window.

    NOTE(review): the original docstring stated "Due to alinglib this is
    all in one-based coordinates", but x and y start at 0 below - confirm
    which convention applies.
    """

    map_p2c.clear()

    # NOTE(review): gop and gep are assigned but not used below
    gop, gep = -1.0, -1.0
    matrix = alignlib.makeSubstitutionMatrixBackTranslation( 1, -10, 1, alignlib.getDefaultEncoder() )

    pep_seq = seq_peptide.asString()
    cds_seq = seq_cds.asString()
    wobble_seq = seq_wobble.asString()

    lcds = seq_cds.getLength()
    lwobble = seq_wobble.getLength()
    # current positions: x in the wobble sequence, y in the cds
    y = 0
    x = 0

    # start of the last re-alignment window, used to detect endless loops
    last_start = None

    while x < lwobble and y < lcds:

        xr = seq_wobble.asResidue( x )
        # skip over masked chars in wobble - these are gaps
        if seq_wobble.asChar(x) == "X":
            x += 1
            continue

        # skip over masked chars in wobble - these are from
        # masked chars in the peptide sequence
        # Note to self: do not see all implications of this change
        # check later.
        if seq_wobble.asChar(x) == "N":
            x += 1
            continue

        # skip over gaps in wobble
        if seq_wobble.asChar(x) == "-":
            x += 1
            continue

        s = matrix.getValue( xr, seq_cds.asResidue(y) )

        if options.loglevel >= 6:
            # at the start of a codon also report the codon and its translation
            if (x % 3 == 0):
                c = seq_cds.asChar(y) + seq_cds.asChar(y+1) + seq_cds.asChar(y+2)
                options.stdlog.write( "# c=%s, x=%i, y=%i, aa=%s target=%s\n" % (c, x, y,
                                                                             Genomics.MapCodon2AA( c ),
                                                                             pep_seq[int(x/3)]) )

            options.stdlog.write( "# x=%i\twob=%s\ty=%i\tcds=%s\txr=%s\tcds=%i\tscore=%s\n" % \
                                  (x, seq_wobble.asChar(x), y, seq_cds.asChar(y), xr, seq_cds.asResidue(y), str(s) ))

        # deal with mismatches
        if s <= 0:

            tmp_map_p2c = alignlib.makeAlignmentVector()

            ## backtrack to previous three codons and align
            ## three codons for double frameshifts that span two codons and
            ## produce two X's and six WWWWWW.

            ## number of nucleotides to extend (should be multiple of 3)
            ## less than 12 caused failure for some peptides.
            d = 15

            # extend by amount dx
            dx = (x % 3) + d

            x_start = max(0, x - dx )
            # map to ensure that no ambiguous residue mappings
            # exist after re-alignment
            y_start = max(0, map_p2c.mapRowToCol( x_start, alignlib.RIGHT ))

            # re-aligning the same window twice cannot make progress
            if (x_start, y_start) == last_start:
                raise ValueError( "infinite loop detected" )

            last_start = (x_start, y_start)

            x_end = min(x_start + 2 * d, len(wobble_seq) )
            y_end = min(y_start + 2 * d, len(cds_seq) )

            wobble_fragment = alignlib.makeSequence(wobble_seq[x_start:x_end])
            cds_fragment = alignlib.makeSequence(cds_seq[y_start:y_end])

            AlignExhaustive( wobble_fragment, cds_fragment, "", tmp_map_p2c, options )

            if options.loglevel >= 10:
                options.stdlog.write("# fragmented alignment from %i-%i, %i-%i:\n%s\n" % (x_start, x_end,
                                                                                          y_start, y_end,
                                                                                          str(alignlib.AlignmentFormatExplicit( tmp_map_p2c,
                                                                                                                                wobble_fragment,
                                                                                                                                cds_fragment ))))

                options.stdlog.flush()

            ## clear alignment
            map_p2c.removeRowRegion( x_start, x_end )
            # number of consecutive unaligned wobble positions
            ngap = 0
            last_x, last_y = None, None
            # copy the window alignment back, shifted to full-sequence coordinates
            for xxx in range( tmp_map_p2c.getRowFrom(), tmp_map_p2c.getRowTo() ):
                yyy = tmp_map_p2c.mapRowToCol(xxx)

                if yyy >= 0:
                    x = xxx + x_start
                    y = yyy + y_start
                    xr = seq_wobble.asResidue(x)
                    s = matrix.getValue( seq_wobble.asResidue(x), seq_cds.asResidue(y) )
                    if s < 0:
                        raise ValueError("mismatched residue wobble: %i (%s), cds: %i (%s)" % (x, seq_wobble.asChar(x), y, seq_cds.asChar(y)))

                    map_p2c.addPair( x, y, s)
                    last_x, last_y = x, y
                    if options.loglevel >= 6:
                        options.stdlog.write( "# reset: x=%i\twob=%s\ty=%i\tcds=%s\txr=%s\tcds=%i\tscore=%i\n" % \
                                              (x, seq_wobble.asChar(x), y, seq_cds.asChar(y), xr, seq_cds.asResidue(y), s ))
                        options.stdlog.flush()
                    ngap = 0
                else:
                    ngap += 1

                # treat special case of double frameshifts. They might cause a peptide/wobble residue
                # to be eliminated and thus the translated sequences will differ.
                # simply delete the last residue between x and y and move to next codon.
                # NOTE(review): last_x is still None if three gaps occur before
                # the first aligned pair of the window - confirm this cannot happen.
                if ngap == 3:
                    map_p2c.removeRowRegion( last_x, last_x + 1 )

                    last_x += 1
                    map_p2c.addPair( last_x, last_y )
                    if options.loglevel >= 6:
                        options.stdlog.write( "# double: x=%i\twob=%s\ty=%i\tcds=%s\txr=%s\tcds=%i\tscore=%i\n" % \
                                              (last_x, seq_wobble.asChar(last_x), last_y, seq_cds.asChar(last_y), xr, seq_cds.asResidue(last_y), s ))
                        options.stdlog.flush()
                    ngap = 0

            ## exit condition if alignment is shorter than problematic residue
            ## need to catch this to avoid infinite loop.
            if tmp_map_p2c.getRowTo() < d:
                if lwobble - x <= 4:
                    ## only last codon is missing, so ok
                    break
                else:
                    raise ValueError("failure to align in designated window.")

            # overwritten by the lookup right below
            s = 0

        # score at the current position; after a re-alignment x, y and xr
        # refer to the last pair copied from the window
        s = matrix.getValue( xr, seq_cds.asResidue(y) )

        if s < 0:
            raise ValueError("mis-matching residues.")

        map_p2c.addPair( x, y, float(s) )

        # advance to next residues
        x += 1
        y += 1

    # sanity checks
    assert( map_p2c.getRowTo() <= seq_wobble.getLength() )
    assert( map_p2c.getColTo() <= seq_cds.getLength() )
def FilterConflicts( old_predictions, new_predictions, removed_predictions,
                     min_overlap, peptide_sequences):
    """remove conflicts.

    Remove overlapping entries between different queries. Only remove
    those sequences, which are alignable.

    If they are alignable, take the sequence with the highest score and highest coverage.
    (Take both, if score and coverage are not correlated.)

    old_predictions -- predictions to filter; sorted in place by genomic
        region (a PredictionFile or a list).
    new_predictions -- list receiving the predictions that are kept.
    removed_predictions -- list receiving the predictions that are removed.
    min_overlap -- not used here; the overlap threshold is taken from the
        module-level ``param_max_percent_overlap``.
    peptide_sequences -- mapping of query token to peptide sequence. If
        empty, every overlap above the threshold counts as a conflict
        without checking whether the queries align.

    Returns the number of predictions appended to *new_predictions*.
    """

    ##################################################################################################
    ## sort predictions by genomic region
    if isinstance( old_predictions, PredictionFile.PredictionFile):
        old_predictions.sort( ('mSbjctToken', 'mSbjctStrand', 'mSbjctGenomeFrom', 'mSbjctGenomeTo' ) )
    else:
        old_predictions.sort( key = lambda x: (x.mSbjctToken, x.mSbjctStrand, x.mSbjctGenomeFrom, x.mSbjctGenomeTo) )

    ##################################################################################################
    ## filter predictions and resolve conflicts based on genomic overlap
    ## deleted segments are put in a temporary storage space.
    alignator = alignlib.makeAlignatorDPFull( alignlib.ALIGNMENT_LOCAL, param_gop, param_gep )
    result = alignlib.makeAlignmentVector()
    # cache of alignment scores, keyed by the sorted pair of query tokens
    alignments = {}
    noverlaps = 0
    nredundants = 0

    nnew = 0
    last_prediction = None

    for this_prediction in old_predictions:

        # query tokens made of four whitespace-separated fields carry the
        # gene in the third field; otherwise the gene is unknown
        try:
            this_query_peptide, this_query_status, this_query_gene, this_query_transcript = \
                                re.split(r"\s+", this_prediction.mQueryToken)
        except ValueError:
            this_query_gene = None

        if not last_prediction:
            last_prediction = this_prediction
            last_query_gene = this_query_gene
            continue

        overlap = min(last_prediction.mSbjctGenomeTo, this_prediction.mSbjctGenomeTo) - \
                  max(last_prediction.mSbjctGenomeFrom, this_prediction.mSbjctGenomeFrom)
        union = max(last_prediction.mSbjctGenomeTo, this_prediction.mSbjctGenomeTo) - \
                min(last_prediction.mSbjctGenomeFrom, this_prediction.mSbjctGenomeFrom)

        # resolve overlap between different genes
        if overlap > 0 and \
               (last_query_gene != this_query_gene or last_query_gene is None):

            noverlaps += 1
            relative_overlap = 100 * overlap / union

            # Start conflict resolution, if overlap is above threshold.
            # Keep higher scoring segment.
            #
            # Check if queries are homologous.
            if relative_overlap >= param_max_percent_overlap:

                if peptide_sequences:
                    if last_prediction.mQueryToken < this_prediction.mQueryToken:
                        key = "%s-%s" % (last_prediction.mQueryToken, this_prediction.mQueryToken)
                    else:
                        key = "%s-%s" % (this_prediction.mQueryToken, last_prediction.mQueryToken)

                    if key not in alignments:
                        result.clear()
                        alignator.align( result,
                                         alignlib.makeSequence( peptide_sequences[this_prediction.mQueryToken]),
                                         alignlib.makeSequence( peptide_sequences[last_prediction.mQueryToken]) )
                        alignments[key] = result.getScore()
                        if result.getScore() >= param_min_score_overlap:
                            nredundants += 1

                    if alignments[key] >= param_min_score_overlap:
                        is_overlap = 1
                    else:
                        is_overlap = 0
                else:
                    is_overlap = 1
            else:
                is_overlap = 0
        else:
            is_overlap = 0

        if is_overlap:
            # take best prediction. If difference is very small, set
            # difference to 0 (difference does not matter). In this case,
            # the first prediction is taken.
            d1 = last_prediction.mQueryCoverage - this_prediction.mQueryCoverage
            if float(abs(d1)) / float(last_prediction.mQueryCoverage) < param_conflicts_min_difference:
                d1 = 0
            d2 = last_prediction.score - this_prediction.score
            if float(abs(d2)) / float(this_prediction.score) < param_conflicts_min_difference:
                d2 = 0

            if d1 >= 0 and d2 >= 0:
                # last prediction is at least as good in both respects
                if param_loglevel >= 2:
                    print("# CONFLICT: kept %i(%s-%i), overlap=%i(%5.2f), removed: %s" % (last_prediction.mPredictionId,
                                                                                          last_prediction.mQueryToken,
                                                                                          last_prediction.mSbjctGenomeFrom,
                                                                                          overlap, relative_overlap,
                                                                                          str(this_prediction)))
                if param_benchmarks:
                    if CheckBenchmark( this_prediction, last_prediction ):
                        print("# BENCHMARK KEPT with overlap=%i(%5.2f): %s" % ( overlap, relative_overlap,
                                                                                str(last_prediction)))

                removed_predictions.append( this_prediction )
                continue
            elif d1 <= 0 and d2 <= 0:
                # this prediction is at least as good in both respects
                if param_loglevel >= 2:
                    print("# CONFLICT: kept %i(%s-%i), overlap=%i(%5.2f), removed: %s" % (this_prediction.mPredictionId,
                                                                                          this_prediction.mQueryToken,
                                                                                          this_prediction.mSbjctGenomeFrom,
                                                                                          overlap, relative_overlap,
                                                                                          str(last_prediction)))
                if param_benchmarks:
                    if CheckBenchmark( last_prediction, this_prediction ):
                        print("# BENCHMARK KEPT with overlap=%i(%5.2f): %s" % ( overlap, relative_overlap,
                                                                                str(this_prediction)))
                removed_predictions.append( last_prediction )
                last_prediction = this_prediction
                last_query_gene = this_query_gene
                continue
            else:
                if param_loglevel >= 2:
                    print("# CONFLICT: non-correlated score/coverage. Keeping both %i(%s-%i) (%5.2f/%i/%i) and %i(%s-%i) (%5.2f/%i/%i)" % \
                          (this_prediction.mPredictionId,
                           this_prediction.mQueryToken, this_prediction.mSbjctGenomeFrom,
                           this_prediction.score, this_prediction.mQueryCoverage,
                           this_prediction.mPercentIdentity,
                           last_prediction.mPredictionId,
                           last_prediction.mQueryToken, last_prediction.mSbjctGenomeFrom,
                           last_prediction.score, last_prediction.mQueryCoverage,
                           last_prediction.mPercentIdentity))

        new_predictions.append(last_prediction)
        nnew += 1
        last_query_gene = this_query_gene
        last_prediction = this_prediction

    # the last prediction is still pending; with empty input there is none
    if last_prediction is not None:
        new_predictions.append(last_prediction)
        nnew += 1

    if param_loglevel >= 1:
        print("# calculated %i alignments for %i potential conflicts (%i above threshold)" % \
              (len(alignments), noverlaps, nredundants))

    return nnew
def getAlignmentFull( m, q, t, options ):
    """Return the aligned query and target strings for alignment *m*.

    m -- alignment between query and target.
    q, t -- query and target sequence.
    options -- accepted for interface compatibility; not used here.

    Returns a tuple (row alignment, column alignment) as rendered by
    alignlib.AlignmentFormatExplicit.
    """
    query = alignlib.makeSequence(q)
    target = alignlib.makeSequence(t)
    formatted = alignlib.AlignmentFormatExplicit( m, query, target )
    return formatted.mRowAlignment, formatted.mColAlignment
def buildMali(self, query_nid, neighbours ): """build a multiple alignment from a set of neighbours. """ # build multiple alignment mali = alignlib.makeMultipleAlignment() query_sequence = self.mFasta.getSequence( query_nid ) mali.add( alignlib.makeAlignatum( query_sequence ) ) qseq = alignlib.makeSequence( query_sequence ) alignator = alignlib.makeAlignatorDPFull( alignlib.ALIGNMENT_LOCAL, -10, -2) nskipped = 0 for n in neighbours[:self.mMaxNumNeighbours]: if n.mSbjctToken == query_nid: continue if n.mEvalue > self.mMaxEvalue: nskipped += 1 continue sequence = self.mFasta.getSequence( n.mSbjctToken ) E.debug( "adding %s" % str(n) ) map_query2sbjct = n.getAlignment() if map_query2sbjct == None: sseq = alignlib.makeSequence( sequence ) qseq.useSegment( n.mQueryFrom, n.mQueryTo ) sseq.useSegment( n.mSbjctFrom, n.mSbjctTo ) map_query2sbjct = alignlib.makeAlignmentVector() alignator.align( map_query2sbjct, qseq, sseq ) if map_query2sbjct.getLength() == 0: self.warn( "empty alignment: %s" % str( n ) ) nskipped += 1 continue if map_query2sbjct.getRowTo() > len(query_sequence): self.warn( "alignment out of bounds for query: %i>%i, line=%s" %\ (map_query2sbjct.getRowTo(), len(query_sequence), str(n))) nskipped += 1 continue elif map_query2sbjct.getColTo() > len(sequence): self.warn( "alignment out of bounds for sbjct: %i>%i, line=%s" %\ (map_query2sbjct.getColTo(), len(sequence), str(n))) nskipped += 1 continue try: mali.add( alignlib.makeAlignatum( sequence ), map_query2sbjct, mali_is_in_row = True, insert_gaps_mali = False, insert_gaps_alignatum = True, use_end_mali = True, use_end_alignatum = False ) except RuntimeError, msg: self.warn( "problem when building alignment for %s: msg=%s" % (str(n), msg)) nskipped += 1 continue
def buildMapPdb2Sequence( sequence, filename_pdb, options, pdb_chain = ""):
    """build a map for residue numbers in pdb file to residue numbers on
    a sequence.

    sequence -- reference sequence the structure is aligned to.
    filename_pdb -- name of the pdb file.
    options -- object providing ``loglevel`` and ``stdlog``.
    pdb_chain -- chain identifier to use; only a chain whose ``chain_id``
        equals this value is processed.

    returns the following maps:

    map_structure2seq: mapping of residue numbers between structure and
        sequence. These are mappings that will work if you "renumber"
        the structure.

    map_pdb2seq, map_seq2pdb: mapping according to residue numbers in pdb file.

    In addition the last residue index minus one, the pdb numbers of the
    first and last residue of the chain (as strings) and the sequence of
    the chain are returned.

    NOTE(review): if the file does not exist, only ``(None, None)`` is
    returned, whereas the regular result has seven elements - callers
    that unpack the result need to handle both; confirm which is intended.
    """

    if not os.path.exists( filename_pdb ):
        return None, None

    structure = Scientific.IO.PDB.Structure( filename_pdb )

    # keyed by residue number as given in the pdb file (as string)
    map_pdb2seq = {}
    # keyed by residue position on the sequence
    map_seq2pdb = {}

    for chain in structure.peptide_chains:

        if chain.chain_id == pdb_chain:

            ## align pdb sequence to sequence
            map_structure2seq = alignlib.makeAlignataVector()
            alignator = alignlib.makeFullDP( -10.0, -2.0 )

            ## build sequence of pdb file
            # NOTE: this rebinds the name ``structure`` from the parsed
            # pdb structure to the one-letter sequence of the chain
            structure = ""
            for residue in chain.sequence():
                structure += AMINOACIDS[residue]

            ## align reference sequence to sequence of pdb file
            row = alignlib.makeSequence( structure )
            col = alignlib.makeSequence( sequence )
            alignator.Align(row, col, map_structure2seq)

            if options.loglevel >= 3:
                options.stdlog.write( "structure: %s\n" % structure )
                options.stdlog.write( "sequence : %s\n" % sequence )
                options.stdlog.write( "alignment of structure to sequence:\n" )
                options.stdlog.write( alignlib.writePairAlignment( row, col, map_structure2seq ) + "\n" )

            # print alignlib.writeAlignataTable(map_structure2seq)
            # position of the residue within the chain, counted from 1
            residue_number = 0

            for residue in chain.residues:

                residue_number += 1

                mapped_residue = map_structure2seq.mapRowToCol(residue_number)

                # a false value means the residue is not aligned to the sequence
                if not mapped_residue:
                    if options.loglevel >= 3:
                        options.stdlog.write( "# skipped residue %s=%s %i\n" % (str(residue.number), residue.name, residue_number))
                    continue

                r = str(residue.number)
                map_pdb2seq[r] = mapped_residue
                map_seq2pdb[mapped_residue] = r

            # NOTE(review): the nesting of this return was reconstructed; it
            # is placed in the matching-chain branch because every value it
            # uses is defined there. If no chain matches, the function
            # falls through and returns None - confirm against the caller.
            return map_structure2seq, map_pdb2seq, map_seq2pdb, residue_number-1, str(chain.residues[0].number), str(chain.residues[-1].number), structure
tmali.apply( translate ) tmap_mali = Mali.Mali() tmap_mali.readFromFile( open(options.filename_map_mali, "r") ) if tmap_mali.getAlphabet() == "na": tmap_mali.apply( translate ) map_old2new = alignlib.makeAlignmentVector() mali1 = alignlib.makeProfileFromMali( convertMali2Mali( tmali ) ) if tmap_mali.getLength() == 1: s = tmap_mali.values()[0].mString mali2 = alignlib.makeSequence( s ) ## see if you can find an identical subsequence and then align to thisD for x in tmali.values(): if s in re.sub( "[- .]+", "", x.mString): mali1 = alignlib.makeSequence( x.mString ) break else: mali2 = alignlib.makeProfileFromMali( convertMali2Mali( tmap_mali ) ) alignator = alignlib.makeAlignatorDPFull( alignlib.ALIGNMENT_LOCAL, -10.0, -2.0 ) alignator.align( map_old2new, mali1, mali2 ) consensus = tmap_mali.getConsensus() if options.loglevel >= 4: options.stdlog.write( "# alphabet: %s\n" % tmap_mali.getAlphabet() )
filename_sequences = None, format = "fasta", ) (options, args) = E.Start( parser ) if not options.filename_sequences: raise "please supply filename with sequences." sequences = Genomics.ReadPeptideSequences( open(options.filename_sequences, "r") ) if options.loglevel >= 1: print "# read %i sequences" % len(sequences) for k in sequences.keys(): sequences[k] = alignlib.makeSequence( sequences[k] ) if options.loglevel >= 2: print "# converted %i sequences" % len(sequences) ninput, noutput, nskipped, nfailed = 0, 0, 0, 0 link = BlastAlignments.Link() ali = alignlib.makeAlignataVector() for line in sys.stdin: if line[0] == "#": continue link.Read( line ) ninput += 1
def align(self, query, sbjct, map_query2sbjct):
    """Align *query* to *sbjct* using this object's alignator.

    Both arguments are turned into alignlib sequences from the string
    returned by their ``asString()`` method.  The sequences and
    *map_query2sbjct* are handed to ``self.mAlignator.align``; nothing
    is returned.
    """
    row_sequence = alignlib.makeSequence(query.asString())
    col_sequence = alignlib.makeSequence(sbjct.asString())
    self.mAlignator.align(row_sequence,
                          col_sequence,
                          map_query2sbjct)
print "version=" sys.exit(0) elif o in ( "-h", "--help" ): print globals()["__doc__"] sys.exit(0) alignator = alignlib.makeAlignatorDPFull( alignlib.ALIGNMENT_LOCAL, param_gop, param_gep ) map_query2token = alignlib.makeAlignmentVector() for line in sys.stdin: if line[0] == "#": continue query_token, sbjct_token, query_sequence, sbjct_sequence = string.split(line[:-1], "\t") map_query2token.clear() row = alignlib.makeSequence(query_sequence) col = alignlib.makeSequence(sbjct_sequence) alignator.align( map_query2token, row, col ) pidentity = 100.0 * alignlib.calculatePercentIdentity( map_query2token, row, col ) psimilarity = 100.0 * alignlib.calculatePercentSimilarity( map_query2token ) print string.join( map(str, ( query_token, sbjct_token, map_query2token.getScore(), alignlib.AlignmentFormatEmissions( map_query2token ), pidentity, psimilarity, map_query2token.getNumGaps()) ), "\t" )
def ProcessRegion( predictions, region_id, region,
                   peptide_sequences = None,
                   filter_queries = {} ):
    """process a set of matches to a region.

    resolve region according to homology.

    *predictions* is sorted in place by descending ``score``.  Each
    prediction that has not yet been assigned to a cluster seeds a new
    cluster (incrementing *region_id*).  Every later, still unassigned
    prediction is joined to it if

    * both overlap on the genomic sequence (``mSbjctGenomeFrom`` /
      ``mSbjctGenomeTo``) and
    * the local alignment of their query peptides passes the thresholds
      ``options.overlap_min_score``, ``options.overlap_max_coverage``,
      ``options.overlap_min_coverage`` and ``options.overlap_min_identity``.

    The homology verdict per query pair is cached in the module-level
    dictionary ``global_alignments``.  Members of a cluster whose query
    token is not in *filter_queries* are passed to ``PrintEdges``.
    *filter_queries* is only read, never modified.

    Nothing is clustered if *peptide_sequences* is empty or None.

    returns a tuple (region_id, noutput, nskipped).
    """

    if options.loglevel >= 3:
        options.stdlog.write( "###################################################################\n" )
        options.stdlog.write( "# resolving %i predictions in region %s\n" % ( len(predictions), str(region)) )
        sys.stdout.flush()

    # stable ascending sort followed by reverse(): same order (including
    # ties) as the former cmp-based sort.
    predictions.sort( key = lambda p: p.score )
    predictions.reverse()

    alignator = alignlib.makeAlignatorDPFull( alignlib.ALIGNMENT_LOCAL, options.gop, options.gep )
    result = alignlib.makeAlignmentVector()

    # map_sequence2cluster[i] == i as long as prediction i is unassigned.
    map_sequence2cluster = list(range(0,len(predictions)))
    edges = []

    noutput, nskipped = 0, 0

    if peptide_sequences:
        for x in range(len(predictions)):
            if options.loglevel >= 5:
                options.stdlog.write( "# filtering from %i with prediction %i: %s\n" %
                                      (x, predictions[x].mPredictionId, predictions[x].mQueryToken) )
                sys.stdout.flush()

            if map_sequence2cluster[x] != x: continue

            region_id += 1
            edges = []

            if predictions[x].mQueryToken not in filter_queries:
                edges.append( predictions[x] )
            else:
                nskipped += 1

            for y in range(x+1,len(predictions)):

                if map_sequence2cluster[y] != y: continue

                # cache key is independent of the order of the two queries
                if predictions[x].mQueryToken < predictions[y].mQueryToken:
                    key = "%s-%s" % (predictions[x].mQueryToken, predictions[y].mQueryToken)
                else:
                    key = "%s-%s" % (predictions[y].mQueryToken, predictions[x].mQueryToken)

                # check if predictions are overlapping on the genomic sequence
                if min(predictions[x].mSbjctGenomeTo, predictions[y].mSbjctGenomeTo) - \
                       max(predictions[x].mSbjctGenomeFrom, predictions[y].mSbjctGenomeFrom) < 0:
                    if options.loglevel >= 4:
                        options.stdlog.write( "# alignment of predictions %i and %i: no overlap on genomic sequence, thus skipped\n" %
                                              (predictions[x].mPredictionId, predictions[y].mPredictionId ) )
                        sys.stdout.flush()
                    continue

                if key not in global_alignments:

                    seq1 = peptide_sequences[predictions[x].mQueryToken]
                    seq2 = peptide_sequences[predictions[y].mQueryToken]
                    result.clear()
                    s1 = alignlib.makeSequence( seq1 )
                    s2 = alignlib.makeSequence( seq2 )
                    alignator.align( result, s1, s2 )

                    c1 = 100 * (result.getRowTo() - result.getRowFrom()) / len(seq1)
                    c2 = 100 * (result.getColTo() - result.getColFrom()) / len(seq2)
                    min_cov = min(c1,c2)
                    max_cov = max(c1,c2)

                    identity = alignlib.calculatePercentIdentity( result, s1, s2 ) * 100

                    # check if predictions overlap and they are homologous
                    if result.getScore() >= options.overlap_min_score and \
                           max_cov >= options.overlap_max_coverage and \
                           min_cov >= options.overlap_min_coverage and \
                           identity >= options.overlap_min_identity :
                        global_alignments[key] = True
                    else:
                        global_alignments[key] = False

                    if options.loglevel >= 4:
                        options.stdlog.write( "# alignment=%s score=%i pid=%5.2f c1=%i c2=%i min_cov=%i max_cov=%i homolog=%s\n" %
                                              (key, result.getScore(), identity, c1,c2, min_cov, max_cov, global_alignments[key]) )
                        sys.stdout.flush()

                if global_alignments[key]:
                    map_sequence2cluster[y] = x
                    if predictions[y].mQueryToken not in filter_queries:
                        edges.append( predictions[y] )
                    else:
                        nskipped += 1

            noutput += PrintEdges( region_id, region, edges )

    return region_id, noutput, nskipped
def _alignToProfile(infile, outfile, min_score=0):
    '''align sequences in *infile* against mali

    Each sequence in the fasta file *infile* and its complement (as
    returned by ``Genomics.complement``) are aligned against a profile
    built from the multiple alignment in :file:`../data/mouse.fasta`
    plus pseudocount sequences. The better scoring of the two
    alignments is used.

    Only alignments with a score of at least *min_score* are accepted;
    empty alignments are skipped as well.

    Output multiple alignment in fasta format to *outfile* and a table
    in :file:`outfile.info`.
    '''

    mali = Mali.Mali()
    with open("../data/mouse.fasta") as mali_file:
        mali.readFromFile(mali_file)
    src_mali = Mali.convertMali2Alignlib(mali)

    E.debug("read mali: %i sequences x %i columns" %
            (mali.getNumSequences(), mali.getNumColumns()))

    # add pseudocounts: two full-length homopolymers per nucleotide
    profile_mali = mali.getClone()
    n = profile_mali.getNumColumns()
    for x in "ACGT":
        for y in range(0, 2):
            profile_mali.addSequence("%s%i" % (x, y), 0, n, x * n)

    profile_mali = Mali.convertMali2Alignlib(profile_mali)
    alignlib.setDefaultEncoder(alignlib.getEncoder(alignlib.DNA4))
    alignlib.setDefaultLogOddor(alignlib.makeLogOddorUniform())

    # bg = alignlib.FrequencyVector()
    # bg.extend( ( 0.3, 0.1, 0.2, 0.2, 0.2) )
    # alignlib.setDefaultRegularizor( alignlib.makeRegularizorTatusov(
    #         alignlib.makeSubstitutionMatrixDNA4(),
    #         bg,
    #         "ACGTN",
    #         10.0, 1.0) )

    profile = alignlib.makeProfile(profile_mali)

    alignment_mode = alignlib.ALIGNMENT_WRAP
    alignator = alignlib.makeAlignatorDPFull(alignment_mode, -5.0, -0.5)

    map_seq2profile = alignlib.makeAlignmentVector()
    map_rseq2profile = alignlib.makeAlignmentVector()
    profile.prepare()

    # seed the output alignment with the source mali itself
    build_mali = alignlib.makeMultAlignment()
    m = alignlib.makeAlignmentVector()
    m.addDiagonal(0, n, 0)
    build_mali.add(src_mali, m)

    # sequences/ids run parallel to the rows of build_mali
    sequences = alignlib.StringVector()
    ids = []
    for pid in mali.getIdentifiers():
        sequences.append(re.sub("-", "", mali[pid]))
        ids.append(pid)

    c = E.Counter()

    # context managers make sure all handles are closed even if an
    # alignment step raises.
    with open(outfile, "w") as outf:
        with open(outfile + ".info", "w") as outf_log:
            outf_log.write(
                "read_id\tlength\tstart\tend\tparts\tcovered\tpcovered\tscore\tmali_start\tmali_end\tmali_covered\tmali_pcovered\n")

            with open(infile) as inf:
                for s in FastaIterator.FastaIterator(inf):

                    E.debug("adding %s" % s.title)
                    c.input += 1
                    rsequence = Genomics.complement(s.sequence)
                    seq = alignlib.makeSequence(s.sequence)
                    rseq = alignlib.makeSequence(rsequence)

                    alignator.align(map_seq2profile, seq, profile)
                    alignator.align(map_rseq2profile, rseq, profile)

                    # ties go to the complemented sequence
                    if map_seq2profile.getScore() > map_rseq2profile.getScore():
                        m, sequence = map_seq2profile, s.sequence
                    else:
                        m, sequence = map_rseq2profile, rsequence

                    if m.getLength() == 0:
                        c.skipped += 1
                        continue

                    if m.getScore() < min_score:
                        c.skipped += 1
                        continue

                    r = getParts(m)

                    covered = 0
                    for mm in r:
                        build_mali.add(mm)
                        sequences.append(sequence)
                        ids.append(s.title)
                        covered += mm.getLength() - mm.getNumGaps()

                    mali_covered = m.getColTo() - m.getColFrom()

                    outf_log.write("\t".join(map(str, (
                        s.title,
                        len(s.sequence),
                        m.getRowFrom(),
                        m.getRowTo(),
                        len(r),
                        covered,
                        "%5.2f" % (100.0 * covered / len(s.sequence)),
                        m.getScore(),
                        m.getColFrom(),
                        m.getColTo(),
                        mali_covered,
                        "%5.2f" % ((100.0 * mali_covered) / mali.getNumColumns())))) + "\n")

                    c.output += 1

        #build_mali.expand( aa )
        result = str(alignlib.MultAlignmentFormatPlain(build_mali,
                                                       sequences,
                                                       alignlib.UnalignedStacked))

        for pid, data in zip(ids, result.split("\n")):
            start, sequence, end = data.split("\t")
            outf.write(">%s/%i-%i\n%s\n" %
                       (pid, int(start) + 1, int(end), sequence))

    E.info("%s\n" % str(c))
def _alignToProfile( infile, outfile, min_score = 0 ):
    '''align sequences in *infile* against mali

    Each sequence in the fasta file *infile* and its complement (as
    returned by ``Genomics.complement``) are aligned against a profile
    built from the multiple alignment in :file:`../data/mouse.fasta`
    plus pseudocount sequences. The better scoring of the two
    alignments is used.

    Only alignments with a score of at least *min_score* are accepted;
    empty alignments are skipped as well.

    Output multiple alignment in fasta format to *outfile* and a table
    in :file:`outfile.info`.
    '''

    mali = Mali.Mali()
    with open("../data/mouse.fasta") as mali_file:
        mali.readFromFile( mali_file )
    src_mali = Mali.convertMali2Alignlib( mali )

    E.debug( "read mali: %i sequences x %i columns" % (mali.getNumSequences(), mali.getNumColumns() ))

    # add pseudocounts: two full-length homopolymers per nucleotide
    profile_mali = mali.getClone()
    n = profile_mali.getNumColumns()
    for x in "ACGT":
        for y in range(0,2):
            profile_mali.addSequence( "%s%i" % (x,y), 0, n, x * n )

    profile_mali = Mali.convertMali2Alignlib( profile_mali )
    alignlib.setDefaultEncoder( alignlib.getEncoder( alignlib.DNA4 ) )
    alignlib.setDefaultLogOddor( alignlib.makeLogOddorUniform() )

    # bg = alignlib.FrequencyVector()
    # bg.extend( ( 0.3, 0.1, 0.2, 0.2, 0.2) )
    # alignlib.setDefaultRegularizor( alignlib.makeRegularizorTatusov(
    #         alignlib.makeSubstitutionMatrixDNA4(),
    #         bg,
    #         "ACGTN",
    #         10.0, 1.0) )

    profile = alignlib.makeProfile( profile_mali )

    alignment_mode = alignlib.ALIGNMENT_WRAP
    alignator = alignlib.makeAlignatorDPFull( alignment_mode, -5.0, -0.5 )

    map_seq2profile = alignlib.makeAlignmentVector()
    map_rseq2profile = alignlib.makeAlignmentVector()
    profile.prepare()

    # seed the output alignment with the source mali itself
    build_mali = alignlib.makeMultAlignment()
    m = alignlib.makeAlignmentVector()
    m.addDiagonal( 0, n, 0 )
    build_mali.add( src_mali, m )

    # sequences/ids run parallel to the rows of build_mali
    sequences = alignlib.StringVector()
    ids = []
    for pid in mali.getIdentifiers():
        sequences.append( re.sub( "-", "", mali[pid] ) )
        ids.append( pid )

    c = E.Counter()

    # context managers make sure all handles are closed even if an
    # alignment step raises.
    with open( outfile, "w" ) as outf:
        with open( outfile + ".info", "w" ) as outf_log:
            outf_log.write( "read_id\tlength\tstart\tend\tparts\tcovered\tpcovered\tscore\tmali_start\tmali_end\tmali_covered\tmali_pcovered\n" )

            with open( infile ) as inf:
                for s in FastaIterator.FastaIterator( inf ):

                    E.debug("adding %s" % s.title )
                    c.input += 1
                    rsequence = Genomics.complement(s.sequence)
                    seq = alignlib.makeSequence( s.sequence )
                    rseq = alignlib.makeSequence( rsequence )

                    alignator.align( map_seq2profile, seq, profile )
                    alignator.align( map_rseq2profile, rseq, profile )

                    # ties go to the complemented sequence
                    if map_seq2profile.getScore() > map_rseq2profile.getScore():
                        m, sequence = map_seq2profile, s.sequence
                    else:
                        m, sequence = map_rseq2profile, rsequence

                    if m.getLength() == 0:
                        c.skipped += 1
                        continue

                    if m.getScore() < min_score:
                        c.skipped += 1
                        continue

                    r = getParts( m )

                    covered = 0
                    for mm in r:
                        build_mali.add( mm )
                        sequences.append( sequence )
                        ids.append( s.title )
                        covered += mm.getLength() - mm.getNumGaps()

                    mali_covered = m.getColTo() - m.getColFrom()

                    outf_log.write( "\t".join( map(str, (
                        s.title,
                        len(s.sequence),
                        m.getRowFrom(),
                        m.getRowTo(),
                        len(r),
                        covered,
                        "%5.2f" % (100.0 * covered / len(s.sequence) ),
                        m.getScore(),
                        m.getColFrom(),
                        m.getColTo(),
                        mali_covered,
                        "%5.2f" % ((100.0 * mali_covered) / mali.getNumColumns()) ) ) ) + "\n" )

                    c.output += 1

        #build_mali.expand( aa )
        result = str(alignlib.MultAlignmentFormatPlain( build_mali,
                                                        sequences,
                                                        alignlib.UnalignedStacked ))

        for pid, data in zip(ids, result.split("\n") ):
            start, sequence, end = data.split("\t")
            outf.write(">%s/%i-%i\n%s\n" % (pid, int(start)+1, int(end), sequence) )

    E.info( "%s\n" % str(c) )
def main( argv = None ):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads predictions line by line from stdin (lines starting with "#"
    are ignored), derives the exon boundaries of each prediction and
    compares them with the reference exon boundaries read from
    ``--boundaries``. One summary row per prediction is written to
    ``options.stdout``; if ``--exons`` is given, one row per compared
    exon is written to that file as well.

    NOTE(review): the block nesting below was reconstructed from a
    whitespace-collapsed copy of this function. Places where the nesting
    could not be derived unambiguously are marked with NOTE(review) -
    confirm against the original file.
    """

    if argv == None: argv = sys.argv

    parser = E.OptionParser( version = "%prog version: $Id: gpipe/compare_predictions2exons.py 2011 2008-07-04 10:40:51Z andreas $", usage = globals()["__doc__"] )

    parser.add_option( "-g", "--genome-file", dest="genome_file", type="string",
                       help="filename with genome." )

    parser.add_option( "-b", "--boundaries", dest="filename_boundaries", type="string",
                       help="filename with exon boundaries." )

    parser.add_option( "-e", "--exons", dest="filename_exons", type="string",
                       help="filename with exons (output)." )

    parser.add_option( "-p", "--peptides", dest="filename_peptides", type="string",
                       help="filename with peptide sequences." )

    parser.add_option( "-w", "--write-notfound", dest="write_notfound", action="store_true",
                       help="print exons for predictions not found in reference." )

    parser.add_option( "-q", "--quality-pide", dest="quality_threshold_pide", type="int",
                       help="quality threshold (pide) for exons." )

    parser.set_defaults(
        genome_file = "genome",
        filename_boundaries = None,
        filename_exons = None,
        filename_peptides = None,
        quality_threshold_pide = 0,
        write_notfound = False,
        ## allowed number of nucleotides for exon boundaries to
        ## be considered equivalent.
        slipping_exon_boundary = 9,
        ## stop codons to search for
        stop_codons = ("TAG", "TAA", "TGA"), )

    (options, args) = E.Start( parser, add_pipe_options = True )

    if len(args) > 0:
        print USAGE, "no arguments required."
        sys.exit(2)

    # reference exon boundaries; stays empty without --boundaries, in which
    # case every prediction is reported as having no exon boundaries.
    reference_exon_boundaries = {}
    if options.filename_boundaries:
        reference_exon_boundaries = Exons.ReadExonBoundaries( open( options.filename_boundaries, "r"),
                                                              do_invert = 1,
                                                              remove_utr = 1)
        E.info( "read exon boundaries for %i queries" % len(reference_exon_boundaries) )

    # optional per-exon output table
    if options.filename_exons:
        outfile_exons = open( options.filename_exons, "w")
        outfile_exons.write( "%s\n" % "\t".join( (
            "prediction_id", "exon_id", "exon_from", "exon_to", "exon_frame",
            "reference_id", "reference_from", "reference_to", "reference_phase",
            "pidentity", "psimilarity", "nframeshifts", "ngaps", "nstopcodons",
            "is_ok", "genome_exon_from", "genome_exon_to") ) )
    else:
        outfile_exons = None

    if options.filename_peptides:
        peptide_sequences = Genomics.ReadPeptideSequences( open(options.filename_peptides, "r") )
        E.info("read peptide sequences for %i queries" % len(peptide_sequences) )
    else:
        peptide_sequences = {}

    entry = PredictionParser.PredictionParserEntry()
    last_filename_genome = None

    # counters over all predictions
    nfound, nmissed_exons, nmissed_length = 0, 0, 0
    nempty_alignments = 0

    fasta = IndexedFasta.IndexedFasta( options.genome_file )

    # header of the per-prediction summary table
    options.stdout.write( "%s\n" % "\t".join( (
        "prediction_id", "number", "dubious_exons", "boundaries_sum", "boundaries_max",
        "identical_exons", "inserted_exons", "deleted_exons", "inserted_introns",
        "deleted_introns", "truncated_Nterminus", "truncated_Cterminus",
        "deleted_Nexons", "deleted_Cexons", "inserted_Nexons", "inserted_Cexons" ) ) )

    for line in sys.stdin:

        if line[0] == "#": continue

        try:
            entry.Read(line)
        except ValueError, msg:
            print "# parsing failed with msg %s in line %s" % (msg, line[:-1])
            sys.exit(1)

        exons = Genomics.Alignment2ExonBoundaries( entry.mMapPeptide2Genome,
                                                   query_from = entry.mQueryFrom,
                                                   sbjct_from = entry.mSbjctGenomeFrom,
                                                   add_stop_codon = 0 )

        # sanity check: the last exon should end where the prediction ends
        if exons[-1][4] != entry.mSbjctGenomeTo:
            print "# WARNING: discrepancy in exon calculation!!!"
            for e in exons:
                print "#", str(e)
            print "#", str(entry)

        if options.loglevel >= 5:
            for e in exons:
                print "#", str(e)

        genomic_fragment = fasta.getSequence( entry.mSbjctToken, entry.mSbjctStrand,
                                              entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo )

        # skip is set when the prediction can not be compared to a reference
        skip = False
        if peptide_sequences.has_key( entry.mQueryToken ):

            query_sequence = alignlib.makeSequence(peptide_sequences[entry.mQueryToken])
            sbjct_sequence = alignlib.makeSequence(entry.mTranslation)

            percent_similarity, percent_identity = 0, 0
            # the stored alignment must fit into both sequences
            if query_sequence.getLength() < entry.mMapPeptide2Translation.getRowTo():
                print "# WARNING: query sequence %s is too short: %i %i" % ( entry.mQueryToken,
                                                                             query_sequence.getLength(),
                                                                             entry.mMapPeptide2Translation.getRowTo())
                sys.stdout.flush()
                nmissed_length += 1
                skip = True

            elif sbjct_sequence.getLength() < entry.mMapPeptide2Translation.getColTo():
                print "# WARNING: sbjct sequence %s is too short: %i %i" % ( entry.mSbjctToken,
                                                                             sbjct_sequence.getLength(),
                                                                             entry.mMapPeptide2Translation.getColTo())
                sys.stdout.flush()
                nmissed_length += 1
                skip = True
            else:
                alignlib.rescoreAlignment( entry.mMapPeptide2Translation,
                                           query_sequence,
                                           sbjct_sequence,
                                           alignlib.makeScorer( query_sequence, sbjct_sequence ) )
                percent_identity = alignlib.calculatePercentIdentity( entry.mMapPeptide2Translation,
                                                                      query_sequence,
                                                                      sbjct_sequence ) * 100
                percent_similarity = alignlib.calculatePercentSimilarity( entry.mMapPeptide2Translation ) * 100

            # NOTE(review): this call may originally have been nested in the
            # else branch above - confirm.
            # NOTE(review): the message is labelled "identity/similarity" but
            # the values are passed as (similarity, identity) - confirm order.
            E.debug( "prediction %s: percent identity/similarity: before=%5.2f/%5.2f, realigned=%5.2f/%5.2f" % (
                str(entry.mPredictionId),
                entry.mPercentSimilarity,
                entry.mPercentIdentity,
                percent_similarity,
                percent_identity ) )

        else:
            query_sequence = None
            sbjct_sequence = None

        # default values
        exons_num_exons = "na"
        exons_boundaries_sum = "na"
        exons_boundaries_max = "na"
        dubious_exons = "na"

        ndeleted_exons, ninserted_exons, ndeleted_introns, ninserted_introns, nidentical_exons = 0,0,0,0,0
        truncated_Nterminal_exon, truncated_Cterminal_exon = 0,0
        ndeleted_Nexons, ndeleted_Cexons = 0, 0
        ninserted_Nexons, ninserted_Cexons = 0, 0

        exons_offset = exons[0][3]

        if not reference_exon_boundaries.has_key( entry.mQueryToken ):
            print "# WARNING: sequence %s has no exon boundaries" % ( entry.mQueryToken )
            sys.stdout.flush()
            nmissed_exons += 1
            skip = True

        if not skip:

            nfound += 1

            ref_exons = reference_exon_boundaries[entry.mQueryToken]

            # reference genome coordinates are made relative to the first reference exon
            ref_exons_offset = ref_exons[0].mGenomeFrom

            exons_num_exons = len(ref_exons) - len(exons)
            exons_boundaries_sum = 0
            exons_phase = 0
            exons_boundaries_max = 0
            dubious_exons = 0

            inserted_exons = 0
            temp_inserted_exons = 0

            if options.loglevel >= 3:
                for e in exons:
                    options.stdlog.write( "# %s\n" % str(e) )
                for e in ref_exons:
                    options.stdlog.write( "# %s\n" % str(e) )

            # minimum percent identity for an exon to count as "good"
            min_pide = entry.mPercentIdentity * options.quality_threshold_pide / 100

            # in_sync is set once an inserted/deleted intron has been seen and
            # is never reset for this prediction.
            in_sync = 0
            # e and r index the current predicted and reference exon
            e,r = 0,0

            while e < len(exons) and r < len(ref_exons):

                # 1-based exon numbers for output
                this_e, this_r = e+1, r+1
                percent_identity = 0
                percent_similarity = 0
                is_good_exon = 0

                if options.loglevel >= 4:
                    options.stdlog.write( "# current exons: %i and %i\n" % (e, r) )
                    sys.stdout.flush()

                exon_from, exon_to, exon_phase, exon_genome_from, exon_genome_to, exon_ali = exons[e][0:6]
                ref_from, ref_to, ref_phase, ref_genome_from, ref_genome_to = (ref_exons[r].mPeptideFrom,
                                                                               ref_exons[r].mPeptideTo,
                                                                               ref_exons[r].frame,
                                                                               ref_exons[r].mGenomeFrom,
                                                                               ref_exons[r].mGenomeTo)

                ref_genome_from -= ref_exons_offset
                ref_genome_to -= ref_exons_offset

                ## get percent identity for exon
                exon_percent_identity = 0
                exon_percent_similarity = 0

                if query_sequence and sbjct_sequence:

                    tmp_ali = alignlib.makeAlignmentVector()

                    # exon coordinates are divided by 3 before being used as
                    # positions on the peptide alignment
                    xquery_from = exon_from / 3
                    xquery_to = exon_to / 3

                    alignlib.copyAlignment( tmp_ali, entry.mMapPeptide2Translation, xquery_from, xquery_to )

                    if tmp_ali.getLength() == 0:
                        options.stdlog.write( "# WARNING: empty alignment %s\n" % str((ref_from, exon_from, ref_to, exon_to, xquery_from, xquery_to)))
                        nempty_alignments += 1
                    else:
                        if options.loglevel >= 5:
                            options.stdlog.write( "# %s\n" % str( alignlib.AlignmentFormatExplicit( tmp_ali, query_sequence, sbjct_sequence ) ) )

                        exon_percent_identity = alignlib.calculatePercentIdentity( tmp_ali,
                                                                                   query_sequence,
                                                                                   sbjct_sequence ) * 100
                        exon_percent_similarity = alignlib.calculatePercentSimilarity( tmp_ali ) * 100

                if exon_percent_identity >= min_pide:
                    is_good_exon = 1
                else:
                    is_good_exon = 0

                # look ahead to the next predicted exon (zeros if this is the last one)
                if e < len(exons) -1 :
                    (next_exon_from, next_exon_to, next_exon_phase,
                     next_exon_genome_from, next_exon_genome_to, next_exon_ali) = exons[e+1][0:6]
                else:
                    (next_exon_from, next_exon_to, next_exon_phase,
                     next_exon_genome_from, next_exon_genome_to, next_exon_ali) = 0, 0, 0, 0, 0, []

                # look ahead to the next reference exon (zeros if this is the last one)
                if r < len(ref_exons) - 1:
                    next_ref_from, next_ref_to, next_ref_phase = (ref_exons[r+1].mPeptideFrom,
                                                                  ref_exons[r+1].mPeptideTo,
                                                                  ref_exons[r+1].frame)
                else:
                    next_ref_from, next_ref_to, next_ref_phase = 0, 0, 0

                if options.loglevel >= 2:
                    options.stdlog.write( "# %s\n" % "\t".join( map(str, (entry.mQueryToken,
                                                                          exon_from, exon_to, exon_phase,
                                                                          exon_genome_from, exon_genome_to,
                                                                          ref_from, ref_to, ref_phase ))))
                    sys.stdout.flush()

                # beware of small exons.
                # if less than options.slipping_exon_boundary: boundary is 0
                # check if end is more than options.slipping_exon_boundary apart as well.
                if exon_to - exon_from <= options.slipping_exon_boundary or \
                       ref_to - ref_from <= options.slipping_exon_boundary:
                    boundary = 0
                else:
                    boundary = options.slipping_exon_boundary

                if ref_to <= exon_from + boundary and \
                       ref_to <= exon_to - options.slipping_exon_boundary:
                    ## no overlap
                    # reference exon lies before the predicted exon: count it
                    # as deleted and advance the reference only
                    is_good_exon = 0
                    if e == 0:
                        ndeleted_Nexons += 1
                    else:
                        ndeleted_exons += 1
                    r += 1
                    exon_from, exon_to, exon_phase, exon_genome_from, exon_genome_to = 0, 0, 0, 0, 0
                    overlap = 0
                elif exon_to <= ref_from + boundary and \
                         exon_to <= ref_to - options.slipping_exon_boundary:
                    ## no overlap
                    # predicted exon lies before the reference exon: count it
                    # as inserted and advance the prediction only
                    is_good_exon = 0
                    if r == 0:
                        ninserted_Nexons += 1
                    else:
                        ninserted_exons += 1
                    e += 1
                    ref_from, ref_to, ref_phase = 0, 0, 0
                    overlap = 0
                else:
                    ## overlap
                    overlap = 1
                    # absolute deviations of the start and end boundaries
                    dfrom = int(math.fabs(exon_from - ref_from))
                    dto = int(math.fabs(exon_to - ref_to))

                    ## get percent identity for overlapping fragment
                    if query_sequence and sbjct_sequence:
                        ## this the problem
                        tmp_ali = alignlib.makeAlignmentVector()

                        xquery_from = max( ref_from / 3, exon_from / 3)
                        xquery_to = min(ref_to / 3, exon_to / 3)

                        alignlib.copyAlignment( tmp_ali, entry.mMapPeptide2Translation, xquery_from, xquery_to )

                        if tmp_ali.getLength() == 0:
                            options.stdlog.write( "# warning: empty alignment %s\n" % str((ref_from, exon_from, ref_to, exon_to, xquery_from, xquery_to )))
                            percent_identity = 0
                            percent_similarity = 0
                        else:
                            if options.loglevel >= 5:
                                print str( alignlib.AlignmentFormatExplicit( tmp_ali, query_sequence, sbjct_sequence ) )

                            percent_identity = alignlib.calculatePercentIdentity( tmp_ali,
                                                                                  query_sequence,
                                                                                  sbjct_sequence ) * 100
                            percent_similarity = alignlib.calculatePercentSimilarity( tmp_ali ) * 100

                    if percent_identity >= min_pide:
                        is_good_exon = 1
                    else:
                        is_good_exon = 0
                        dubious_exons += 1

                    ## adjust regions for terminal exons
                    if e == 0 and r == 0 and dfrom <= (entry.mQueryFrom - 1) * 3 and dfrom > 0:
                        if is_good_exon:
                            truncated_Nterminal_exon = dfrom
                        dfrom = 0

                    ## truncated terminal exons
                    if e == len(exons)-1 and r == len(ref_exons)-1 and dto <= (entry.mQueryLength - entry.mQueryTo) * 3 and dto > 0:
                        if is_good_exon:
                            truncated_Cterminal_exon = dto
                        dto = 0

                    ## do not count deviations for terminal query exons
                    if e == 0 and dfrom <= entry.mQueryFrom * 3 and dfrom > 0:
                        dfrom = 0

                    if e == len(exons)-1 and dto <= (entry.mQueryLength - entry.mQueryTo) * 3 and dto > 0:
                        dto = 0

                    ## permit difference of one codon (assumed to be stop)
                    if e == len(exons)-1 and r == len(ref_exons)-1 and dto == 3:
                        dto = 0

                    ## deal with different boundary conditions:
                    if dfrom == 0 and dto == 0:
                        if is_good_exon: nidentical_exons += 1
                        e += 1
                        r += 1
                    ## next exon within this ref_exon
                    elif exon_to < ref_to and next_exon_to and next_exon_to <= ref_to + options.slipping_exon_boundary:
                        if is_good_exon: ninserted_introns += 1
                        e += 1
                        in_sync = 1
                        dto = 0
                    ## next ref_exon within this exon
                    elif ref_to < exon_to and next_ref_to and next_ref_to <= exon_to + options.slipping_exon_boundary:
                        if is_good_exon: ndeleted_introns += 1
                        r += 1
                        in_sync = 1
                        dto = 0
                    else:
                        e += 1
                        r += 1
                        # NOTE(review): this test may originally have followed
                        # the whole if/elif chain instead of being part of
                        # this branch - confirm.
                        if in_sync:
                            dfrom = 0

                    # only good exons contribute to the boundary statistics
                    if is_good_exon:
                        exons_boundaries_sum += dfrom + dto
                        exons_boundaries_max = max( dfrom, exons_boundaries_max )
                        exons_boundaries_max = max( dto, exons_boundaries_max )

                ###########################################################
                ## count inserted/deleted introns and misplaced boundaries
                ##
                ## if exon and next_exon in ref_exon: inserted intron
                ## if ref_exon and next_ref_exon in exon: deleted intron

                if outfile_exons:

                    if genomic_fragment and exon_genome_to:
                        nintrons, nframeshifts, ngaps, nsplits, nstopcodons, disruptions = Genomics.CountGeneFeatures( exon_genome_from - entry.mSbjctGenomeFrom,
                                                                                                                        exon_ali,
                                                                                                                        genomic_fragment,
                                                                                                                        border_stop_codon = 0 )
                    else:
                        nintrons, nframeshifts, ngaps, nsplits, nstopcodons = 0, 0, 0, 0, 0

                    # an exon zeroed above (no overlap) is reported with id 0
                    if exon_to == 0: this_e = 0
                    if ref_to == 0: this_r = 0
                    outfile_exons.write( string.join( map(str, (entry.mPredictionId,
                                                                this_e, exon_from, exon_to, exon_phase,
                                                                this_r, ref_from, ref_to, ref_phase,
                                                                percent_identity, percent_similarity,
                                                                nframeshifts, ngaps, nstopcodons,
                                                                is_good_exon,
                                                                exon_genome_from, exon_genome_to,
                                                                )), "\t") + "\n")

            # remaining predicted exons have no reference counterpart
            while e < len(exons):
                exon_from, exon_to, exon_phase, exon_genome_from, exon_genome_to = exons[e][0:5]
                e += 1
                ninserted_Cexons += 1

                if outfile_exons:
                    outfile_exons.write( string.join( map(str, (entry.mPredictionId,
                                                                e, exon_from, exon_to, exon_phase,
                                                                0, 0, 0, 0,
                                                                0, 0,
                                                                0, 0, 0,
                                                                1,
                                                                exon_genome_from, exon_genome_to,
                                                                )), "\t") + "\n")

            # remaining reference exons have no predicted counterpart
            while r < len(ref_exons):
                ref_from, ref_to, ref_phase, ref_genome_from, ref_genome_to = (ref_exons[r].mPeptideFrom,
                                                                               ref_exons[r].mPeptideTo,
                                                                               ref_exons[r].frame,
                                                                               ref_exons[r].mGenomeFrom,
                                                                               ref_exons[r].mGenomeTo)
                ndeleted_Cexons += 1
                ref_genome_from -= ref_exons_offset
                ref_genome_to -= ref_exons_offset
                r += 1
                if outfile_exons:
                    outfile_exons.write( string.join( map(str, (entry.mPredictionId,
                                                                0, 0, 0, 0,
                                                                r, ref_from, ref_to, ref_phase,
                                                                0, 0,
                                                                0, 0, 0,
                                                                0,
                                                                0, 0,
                                                                )), "\t") + "\n")
        else:
            # NOTE(review): outfile_exons is None unless --exons was given, so
            # --write-notfound without --exons fails on the write below; also,
            # nframeshifts/ngaps/nstopcodons are only assigned here when
            # genomic_fragment is non-empty - confirm intended behaviour.
            if options.write_notfound:
                this_e = 0
                ## use prediction's identity/similarity for exons.
                ## This will still then flag stop-codons in later analysis
                percent_identity = entry.mPercentIdentity
                percent_similarity = entry.mPercentSimilarity

                for exon in exons:
                    this_e += 1
                    exon_from, exon_to, exon_phase, exon_genome_from, exon_genome_to, exon_ali = exon[0:6]
                    if genomic_fragment:
                        nintrons, nframeshifts, ngaps, nsplits, nstopcodons, disruptions = Genomics.CountGeneFeatures( exon_genome_from - entry.mSbjctGenomeFrom,
                                                                                                                        exon_ali,
                                                                                                                        genomic_fragment )

                    outfile_exons.write( string.join( map(str, (entry.mPredictionId,
                                                                this_e, exon_from, exon_to, exon_phase,
                                                                0, 0, 0, 0,
                                                                percent_identity, percent_similarity,
                                                                nframeshifts, ngaps, nstopcodons,
                                                                1,
                                                                exon_genome_from, exon_genome_to,
                                                                )), "\t") + "\n")

        # one summary row per prediction; skipped predictions keep the "na"
        # and zero defaults set above
        options.stdout.write( "\t".join(map(str, (entry.mPredictionId,
                                                  exons_num_exons, dubious_exons,
                                                  exons_boundaries_sum, exons_boundaries_max,
                                                  nidentical_exons,
                                                  ninserted_exons, ndeleted_exons,
                                                  ninserted_introns, ndeleted_introns,
                                                  truncated_Nterminal_exon, truncated_Cterminal_exon,
                                                  ndeleted_Nexons, ndeleted_Cexons,
                                                  ninserted_Nexons, ninserted_Cexons))) + "\n" )
def buildMali(self, query_nid, neighbours):
    """build a multiple alignment from a set of neighbours.

    The sequence of *query_nid* (fetched from ``self.mFasta``) is the
    first entry of the multiple alignment. At most
    ``self.mMaxNumNeighbours`` entries of *neighbours* are considered.
    A neighbour is left out if

    * it is the query itself,
    * its ``mEvalue`` is larger than ``self.mMaxEvalue``,
    * its alignment to the query is empty or extends beyond either
      sequence, or
    * adding it to the multiple alignment raises a RuntimeError.

    If a neighbour carries no alignment (``getAlignment()`` returns
    None), query and sbjct segments are aligned locally.
    """
    # build multiple alignment
    mali = alignlib.makeMultipleAlignment()

    query_sequence = self.mFasta.getSequence(query_nid)

    mali.add(alignlib.makeAlignatum(query_sequence))

    qseq = alignlib.makeSequence(query_sequence)
    alignator = alignlib.makeAlignatorDPFull(alignlib.ALIGNMENT_LOCAL,
                                             -10, -2)

    # number of neighbours dropped for any reason but self-matches
    nskipped = 0

    for n in neighbours[:self.mMaxNumNeighbours]:

        if n.mSbjctToken == query_nid:
            continue
        if n.mEvalue > self.mMaxEvalue:
            nskipped += 1
            continue

        sequence = self.mFasta.getSequence(n.mSbjctToken)

        E.debug("adding %s" % str(n))

        map_query2sbjct = n.getAlignment()

        # identity test: does not rely on how the alignment object
        # implements comparison
        if map_query2sbjct is None:
            sseq = alignlib.makeSequence(sequence)
            # qseq is shared between neighbours; its segment is reset here
            # each time it is used.
            qseq.useSegment(n.mQueryFrom, n.mQueryTo)
            sseq.useSegment(n.mSbjctFrom, n.mSbjctTo)
            map_query2sbjct = alignlib.makeAlignmentVector()
            alignator.align(map_query2sbjct, qseq, sseq)

        if map_query2sbjct.getLength() == 0:
            self.warn("empty alignment: %s" % str(n))
            nskipped += 1
            continue

        if map_query2sbjct.getRowTo() > len(query_sequence):
            self.warn("alignment out of bounds for query: %i>%i, line=%s" %
                      (map_query2sbjct.getRowTo(), len(query_sequence), str(n)))
            nskipped += 1
            continue

        elif map_query2sbjct.getColTo() > len(sequence):
            self.warn("alignment out of bounds for sbjct: %i>%i, line=%s" %
                      (map_query2sbjct.getColTo(), len(sequence), str(n)))
            nskipped += 1
            continue

        try:
            mali.add(alignlib.makeAlignatum(sequence),
                     map_query2sbjct,
                     mali_is_in_row=True,
                     insert_gaps_mali=False,
                     insert_gaps_alignatum=True,
                     use_end_mali=True,
                     use_end_alignatum=False)
        except RuntimeError as msg:
            self.warn("problem when building alignment for %s: msg=%s" %
                      (str(n), msg))
            nskipped += 1
            continue

    # NOTE(review): within the visible part of this method neither mali nor
    # nskipped is returned or used further - confirm whether the method
    # continues beyond this point in the full file.
(options, args) = E.Start( parser, add_pipe_options = True ) if options.filename_sequences: infile = open(options.filename_sequences, "r") else: infile = sys.stdin parser = FastaIterator.FastaIterator( infile ) sequences = [] while 1: cur_record = iterator.next() if cur_record is None: break sequences.append( (cur_record.title, alignlib.makeSequence(re.sub( " ", "", cur_record.sequence)) ) ) if options.filename_sequences: infile.close() alignator = alignlib.makeAlignatorFullDP( options.gop, options.gep ) map_a2b = alignlib.makeAlignataVector() nsequences = len(sequences) for x in range(0,nsequences-1): for y in range(x+1, nsequences): alignator.Align( sequences[x][1], sequences[y][1], map_a2b) row_ali, col_ali = alignlib.writeAlignataCompressed( map_a2b ) options.stdout.write( "%s\t%s\t%i\t%i\t%i\t%s\t%i\t%i\t%s\t%i\t%i\t%i\t%i\n" % (\
def PrintCluster( cluster, cluster_id, lengths,
                  peptide_sequences = None,
                  regex_preferred = None):
    """print a cluster.

    Take longest sequence as representative. If preferred is given, only take
    genes matching preferred identifier.

    One tab-separated line is written to stdout per member of *cluster*:
    representative, member, member length and - if *peptide_sequences* is
    given - the alignment of representative to member in emissions format.

    cluster
        iterable of member identifiers.
    cluster_id
        returned unchanged.
    lengths
        maps identifier to length; missing members count as length 0.
    peptide_sequences
        optional map of identifier to peptide sequence.
    regex_preferred
        optional regular expression; the longest member matching it (with
        length > 0) is preferred as representative.

    If no member has a length > 0 the representative is None.
    """

    if regex_preferred:
        rx = re.compile(regex_preferred)
    else:
        rx = None

    # longest member overall (rep_a) and longest preferred member (rep_p)
    max_al = 0
    max_pl = 0
    rep_a = None
    rep_p = None
    for c in cluster:
        l = 0
        if c in lengths: l = lengths[c]

        if l > max_al:
            max_al = l
            rep_a = c

        if rx and rx.search(c) and l > max_pl:
            max_pl = l
            rep_p = c

    if max_pl > 0:
        rep = rep_p
    else:
        rep = rep_a

    # the alignator does not depend on the member: create it once, and
    # only if an alignment is actually required.
    alignator = None

    for mem in cluster:
        l = 0
        if mem in lengths: l = lengths[mem]

        if peptide_sequences:
            map_rep2mem = alignlib.makeAlignmentVector()

            if rep == mem and rep in lengths:
                # representative against itself: plain diagonal
                alignlib.addDiagonal2Alignment( map_rep2mem, 1, lengths[rep], 0)
            elif mem in peptide_sequences and \
                     rep in peptide_sequences:
                if alignator is None:
                    alignator = alignlib.makeAlignatorDPFull( alignlib.ALIGNMENT_LOCAL, -10.0, -1.0)
                alignator.align( map_rep2mem,
                                 alignlib.makeSequence( peptide_sequences[rep] ),
                                 alignlib.makeSequence( peptide_sequences[mem] ) )

            f = alignlib.AlignmentFormatEmissions( map_rep2mem )
            sys.stdout.write( "\t".join( map(str, (rep, mem, l, f)) ) + "\n" )

        else:
            sys.stdout.write( "\t".join( map(str, (rep, mem, l)) ) + "\n" )

    sys.stdout.flush()

    return cluster_id
print "#", cds_fragment print "# genomic" print "#",genomic_fragment continue if map_query2sbjct.getRowTo() > len(cds_fragment): print "# ERROR: length mismatch: cds fragment (%i) shorter than last aligned residue (%i)" %\ (len(cds_fragment), map_query2sbjct.getRowTo()) print "#", line print "# cds" print "#", cds_fragment print "# genomic" print "#",genomic_fragment continue cds_seq = alignlib.makeSequence( cds_fragment ) genomic_seq = alignlib.makeSequence( genomic_fragment ) data = map( lambda x: string.split(x, "\t"), string.split( alignlib.writePairAlignment( cds_seq, genomic_seq, map_query2sbjct ), "\n" )) row_ali, col_ali = Genomics.RemoveFrameShiftsFromAlignment(data[0][1], data[1][1]) row_ali = Genomics.MaskStopCodons( row_ali ) col_ali = Genomics.MaskStopCodons( col_ali ) if len(row_ali) != len(col_ali): print "# ERROR: wrong alignment lengths."
def EliminateRedundantEntries(rep, data, eliminated_predictions, options,
                              peptides, extended_peptides,
                              filter_quality=None, this_quality=None):
    """Eliminate entries of ``data`` that are redundant with ``rep``.

    Every eliminated member is recorded twice: in ``eliminated_predictions``
    (updated in place, member id -> representative id) and in the returned
    list as a ``(member id, code)`` tuple.  Members already present in
    ``eliminated_predictions`` are skipped.

    Codes:
        "i"  the extended sequence equals the representative's.
        "p"  the extended sequence is contained in the representative's.
        "h"  percent identity >= ``options.min_identity`` and none of the
             "keep" rules (covpid / gaps / memcov) applied.
        "l"  percent identity >= ``options.min_identity_non_genes``,
             ``this_quality`` is in ``options.quality_genes``, the member's
             quality is not, and the member is not better than the
             representative.

    The peptide alignment is only computed when the member's quality
    differs from ``this_quality`` or is listed in
    ``options.quality_exclude_same``.  A warning is written to
    ``options.stdlog`` whenever a similar member is kept.

    ``filter_quality`` is accepted but not used in this function.

    Returns:
        list of ``(member id, code)`` tuples for the members eliminated
        during this call.
    """
    rep_id = rep.transcript_id
    rep_coverage = rep.mQueryCoverage
    rep_pid = rep.mPid

    alignator = alignlib.makeAlignatorDPFull(alignlib.ALIGNMENT_LOCAL,
                                             options.gop, options.gep)
    result = alignlib.makeAlignmentVector()

    rep_seq = peptides[rep_id]
    rep_extended_seq = extended_peptides[rep_id]

    eliminated = []

    def _eliminate(member_id, code):
        # Record the member as redundant with the representative.
        eliminated_predictions[member_id] = rep_id
        eliminated.append((member_id, code))

    def _member_is_better(member_coverage, member_pid):
        # The member beats the representative by more than the safety
        # margin in coverage or in percent identity.
        return (rep_coverage < member_coverage - options.safety_coverage or
                rep_pid < member_pid - options.safety_pide)

    for entry in data:
        mem_id = entry.transcript_id
        mem_coverage = entry.mQueryCoverage
        mem_pid = entry.mPid
        mem_quality = entry.mQuality

        mem_seq = peptides[mem_id]
        mem_extended_seq = extended_peptides[mem_id]

        if options.loglevel >= 4:
            options.stdlog.write(
                "# processing: id=%s class=%s\n" % (mem_id, mem_quality))

        if mem_id in eliminated_predictions:
            continue

        # Cheap string checks first: identical or contained sequences.
        if mem_extended_seq == rep_extended_seq:
            _eliminate(mem_id, "i")
            continue

        if mem_extended_seq in rep_extended_seq:
            _eliminate(mem_id, "p")
            continue

        needs_alignment = (mem_quality != this_quality or
                           mem_quality in options.quality_exclude_same)
        if not needs_alignment:
            continue

        seq1 = alignlib.makeSequence(str(rep_seq))
        seq2 = alignlib.makeSequence(str(mem_seq))
        alignator.align(result, seq1, seq2)

        if options.loglevel >= 5:
            options.stdlog.write(
                "# ali\n%s\n" % alignlib.AlignmentFormatExplicit(result, seq1, seq2))

        pidentity = 100 * alignlib.calculatePercentIdentity(result, seq1, seq2)
        num_gaps = result.getNumGaps()

        if options.loglevel >= 4:
            options.stdlog.write(
                "# processing: id=%s class=%s pid=%5.2f rep_cov=%i mem_cov=%i\n" %
                (mem_id, mem_quality, pidentity, rep_coverage, mem_coverage))

        if pidentity >= options.min_identity:
            # The first matching rule decides why a similar member is kept.
            reason = None
            if _member_is_better(mem_coverage, mem_pid):
                reason = "covpid"
            elif (num_gaps >= options.max_gaps and
                  mem_coverage > rep_coverage - options.safety_coverage):
                reason = "gaps"
            elif (mem_coverage >= rep_coverage - options.safety_coverage and
                  100 * (result.getColTo() - result.getColFrom()) / len(mem_seq)
                  < options.max_member_coverage):
                reason = "memcov"

            if reason is None:
                _eliminate(mem_id, "h")
            else:
                options.stdlog.write(
                    "# WARNING: not removing possibly good prediction: %s: rep = %s, mem = %s, rep_cov=%i, rep_pid=%i, mem_cov=%i, mem_pid=%i\n" %
                    (reason, rep_id, mem_id, rep_coverage, rep_pid, mem_coverage, mem_pid))

        elif (pidentity >= options.min_identity_non_genes and
              this_quality in options.quality_genes and
              mem_quality not in options.quality_genes):
            if _member_is_better(mem_coverage, mem_pid):
                options.stdlog.write(
                    "# WARNING: not removing possibly good prediction: rep = %s, mem = %s, rep_cov=%i, rep_pid=%i, mem_cov=%i, mem_pid=%i\n" %
                    (rep_id, mem_id, rep_coverage, rep_pid, mem_coverage, mem_pid))
            else:
                _eliminate(mem_id, "l")

    return eliminated
if param_is_compressed: if unaligned_pair and \ unaligned_pair.mToken1 == pair.mToken1 and \ unaligned_pair.mToken2 == pair.mToken2 and \ unaligned_pair.mIntronId1 == pair.mIntronId1: map_a2b = alignlib.makeAlignmentVector() f = AlignmentFormatEmissions( pair.mFrom1, pair.mAlignedSequence1, pair.mFrom2, pair.mAlignedSequence2).copy( map_a2b ) map_a2b.moveAlignment( -unaligned_pair.mFrom1 + 1, -unaligned_pair.mFrom2 + 1 ) data = alignlib.AlignmentFormatExplicit( map_a2b, alignlib.makeSequence( unaligned_pair.mAlignedSequence1), alignlib.makeSequence( unaligned_pair.mAlignedSequence2) ) from1, ali1, to1 = data.mRowFrom, data.mRowAlignment, data.mRowTo from2, ali2, to2 = data.mColFrom, data.mColAlignment, data.mColTo pair.mAlignedSequence1 = ali1 pair.mAlignedSequence2 = ali2 else: raise "sequence not found for pair %s" % str(pair) if param_do_gblocks: if param_loglevel >= 4: print "# length before: %i %i" % (len(pair.mAlignedSequence1), pair.mAligned)
print "# WARNING: discrepancy in exon calculation!!!" for e in exons: print "#", str(e) print "#", str(entry) if options.loglevel >= 5: for e in exons: print "#", str(e) genomic_fragment = fasta.getSequence( entry.mSbjctToken, entry.mSbjctStrand, entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo ) skip = False if peptide_sequences.has_key( entry.mQueryToken ): query_sequence = alignlib.makeSequence(peptide_sequences[entry.mQueryToken]) sbjct_sequence = alignlib.makeSequence(entry.mTranslation) percent_similarity, percent_identity = 0, 0 if query_sequence.getLength() < entry.mMapPeptide2Translation.getRowTo(): print "# WARNING: query sequence %s is too short: %i %i" % ( entry.mQueryToken, query_sequence.getLength(), entry.mMapPeptide2Translation.getRowTo()) sys.stdout.flush() nmissed_length += 1 skip = True elif sbjct_sequence.getLength() < entry.mMapPeptide2Translation.getColTo(): print "# WARNING: sbjct sequence %s is too short: %i %i" % ( entry.mSbjctToken, sbjct_sequence.getLength(), entry.mMapPeptide2Translation.getColTo())
old_length = mali.getLength() new_mali = convertMali2Mali( mali ) if options.alignment_method == "sw": alignator = alignlib.makeAlignatorFullDP( options.gop, options.gep ) else: alignator = alignlib.makeAlignatorFullDPGlobal( options.gop, options.gep ) while 1: cur_record = iterator.next() if cur_record is None: break map_mali2seq = alignlib.makeAlignataVector() sequence = alignlib.makeSequence( cur_record.sequence ) profile = alignlib.makeProfileFromMali( new_mali ) if options.loglevel >= 4: options.stdlog.write(profile.Write()) alignator.Align( profile, sequence, map_mali2seq ) if options.loglevel >= 3: options.stdlog.write( map_mali2seq.Write() ) ## add sequence to mali a = alignlib.makeAlignatumFromString( cur_record.sequence ) a.thisown = 0 new_mali.addAlignatum( a, map_mali2seq, 1, 1, 1, 1, 1 )