def CheckAlignments(peptide_sequences, query_token, other_tokens):
    """Check whether the query aligns to at least one of the other tokens.

    Returns True as soon as a local alignment between the query peptide
    and one of the sbjct peptides scores above
    param_min_alignment_score; returns False if none does.

    NOTE(review): despite the historical wording "aligns to all others",
    the code returns True on the FIRST sufficiently scoring alignment,
    i.e. it checks "any", not "all".

    A query with no entry in peptide_sequences is accepted (True), as
    are sbjct tokens without sequences (they are skipped).
    """
    if param_loglevel >= 3:
        print "# checking query %s and sbjcts %s" % (query_token, str(other_tokens))
        sys.stdout.flush()

    # no sequence for the query: nothing to check, accept.
    if query_token not in peptide_sequences:
        return True

    result = alignlib_lite.makeAlignmentVector()
    # local Smith-Waterman-style alignment with gop=-10.0, gep=-1.0
    alignator = alignlib_lite.makeAlignatorDPFull(alignlib_lite.ALIGNMENT_LOCAL,
                                                  -10.0, -1.0)
    row_seq = alignlib_lite.makeSequence(peptide_sequences[query_token])

    for x in other_tokens:
        # skip sbjcts without a known peptide sequence
        if x not in peptide_sequences:
            continue
        col_seq = alignlib_lite.makeSequence(peptide_sequences[x])
        alignator.align(result, row_seq, col_seq)
        if param_loglevel >= 5:
            print "# %s - %s = %f" % (query_token, x, result.getScore())
        # accept on the first alignment above threshold
        if result.getScore() > param_min_alignment_score:
            return True

    return False
def CheckAlignments(peptide_sequences, query_token, other_tokens): """check wether query aligns to all others. """ if param_loglevel >= 3: print "# checking query %s and sbjcts %s" % (query_token, str(other_tokens)) sys.stdout.flush() if query_token not in peptide_sequences: return True result = alignlib_lite.makeAlignmentVector() alignator = alignlib_lite.makeAlignatorDPFull( alignlib_lite.ALIGNMENT_LOCAL, -10.0, -1.0) row_seq = alignlib_lite.makeSequence(peptide_sequences[query_token]) for x in other_tokens: if x not in peptide_sequences: continue col_seq = alignlib_lite.makeSequence(peptide_sequences[x]) alignator.align(result, row_seq, col_seq) if param_loglevel >= 5: print "# %s - %s = %f" % (query_token, x, result.getScore()) if result.getScore() > param_min_alignment_score: return True return False
def FilterConflicts(old_predictions, new_predictions, removed_predictions, min_overlap, peptide_sequences): """remove conflicts. Remove overlapping entries between different queries. Only remove those sequences, which are alignable. If they are alignable, take the sequence with the highest score and highest coverage. (Take both, if score and coverage are not correlated.) """ ########################################################################## # sort predictions by genomic region if isinstance(old_predictions, PredictionFile.PredictionFile): old_predictions.sort( ('mSbjctToken', 'mSbjctStrand', 'mSbjctGenomeFrom', 'mSbjctGenomeTo')) else: old_predictions.sort(lambda x, y: cmp((x.mSbjctToken, x.mSbjctStrand, x.mSbjctGenomeFrom, x.mSbjctGenomeTo), (y.mSbjctToken, y.mSbjctStrand, y.mSbjctGenomeFrom, y.mSbjctGenomeTo))) ########################################################################## # filter predictions and resolve conflicts based on genomic overlap # deleted segments are put in a temporary storage space. 
alignator = alignlib_lite.makeAlignatorDPFull( alignlib_lite.ALIGNMENT_LOCAL, param_gop, param_gep) result = alignlib_lite.makeAlignmentVector() alignments = {} noverlaps = 0 nredundants = 0 nnew = 0 last_prediction = None for this_prediction in old_predictions: try: (this_query_peptide, this_query_status, this_query_gene, this_query_transcript) = \ re.split("\s+", this_prediction.mQueryToken) except ValueError: this_query_gene = None if not last_prediction: last_prediction = this_prediction last_query_gene = this_query_gene continue overlap = min(last_prediction.mSbjctGenomeTo, this_prediction.mSbjctGenomeTo) -\ max(last_prediction.mSbjctGenomeFrom, this_prediction.mSbjctGenomeFrom) union = max(last_prediction.mSbjctGenomeTo, this_prediction.mSbjctGenomeTo) -\ min(last_prediction.mSbjctGenomeFrom, this_prediction.mSbjctGenomeFrom) # resolve overlap between different genes if overlap > 0 and \ (last_query_gene != this_query_gene or last_query_gene is None): noverlaps += 1 relative_overlap = 100 * overlap / union # Start conflict resolution, if overlap is above threshold. # Keep higher scoring segment. # # Check if queries are homologous. if relative_overlap >= param_max_percent_overlap: if peptide_sequences: if last_prediction.mQueryToken < this_prediction.mQueryToken: key = "%s-%s" % (last_prediction.mQueryToken, this_prediction.mQueryToken) else: key = "%s-%s" % (this_prediction.mQueryToken, last_prediction.mQueryToken) if not alignments.has_key(key): result.clear() alignator.align(result, alignlib_lite.makeSequence( peptide_sequences[this_prediction.mQueryToken]), alignlib_lite.makeSequence(peptide_sequences[last_prediction.mQueryToken])) alignments[key] = result.getScore() if result.getScore() >= param_min_score_overlap: nredundants += 1 if alignments[key] >= param_min_score_overlap: is_overlap = 1 else: is_overlap = 0 else: is_overlap = 1 else: is_overlap = 0 else: is_overlap = 0 if is_overlap: # take best prediction. 
If difference is very small, set # difference to 0 (difference does not matter). In this case, # the first prediction is taken. d1 = last_prediction.mQueryCoverage - \ this_prediction.mQueryCoverage if float(abs(d1)) / float(last_prediction.mQueryCoverage) < param_conflicts_min_difference: d1 = 0 d2 = last_prediction.score - this_prediction.score if float(abs(d2)) / float(this_prediction.score) < param_conflicts_min_difference: d2 = 0 if d1 >= 0 and d2 >= 0: if param_loglevel >= 2: print "# CONFLICT: kept %i(%s-%i), overlap=%i(%5.2f), removed: %s" % (last_prediction.mPredictionId, last_prediction.mQueryToken, last_prediction.mSbjctGenomeFrom, overlap, relative_overlap, str(this_prediction)) if param_benchmarks: if CheckBenchmark(this_prediction, last_prediction): print "# BENCHMARK KEPT with overlap=%i(%5.2f): %s" % (overlap, relative_overlap, str(last_prediction)) removed_predictions.append(this_prediction) continue elif d1 <= 0 and d2 <= 0: if param_loglevel >= 2: print "# CONFLICT: kept %i(%s-%i), overlap=%i(%5.2f), removed: %s" % (this_prediction.mPredictionId, this_prediction.mQueryToken, this_prediction.mSbjctGenomeFrom, overlap, relative_overlap, str(last_prediction)) if param_benchmarks: if CheckBenchmark(last_prediction, this_prediction): print "# BENCHMARK KEPT with overlap=%i(%5.2f): %s" % (overlap, relative_overlap, str(this_prediction)) removed_predictions.append(last_prediction) last_prediction = this_prediction last_query_gene = this_query_gene continue else: if param_loglevel >= 2: print "# CONFLICT: non-correlated score/coverage. 
Keeping both %i(%s-%i) (%5.2f/%i/%i) and %i(%s-%i) (%5.2f/%i/%i)" % \ (this_prediction.mPredictionId, this_prediction.mQueryToken, this_prediction.mSbjctGenomeFrom, this_prediction.score, this_prediction.mQueryCoverage, this_prediction.mPercentIdentity, last_prediction.mPredictionId, last_prediction.mQueryToken, last_prediction.mSbjctGenomeFrom, last_prediction.score, last_prediction.mQueryCoverage, last_prediction.mPercentIdentity) new_predictions.append(last_prediction) nnew += 1 last_query_gene = this_query_gene last_prediction = this_prediction new_predictions.append(last_prediction) nnew += 1 if param_loglevel >= 1: print "# calculated %i alignments for %i potential conflicts (%i above threshold)" % \ (len(alignments), noverlaps, nredundants) return nnew
def GetOrthologTranscripts(transcripts1, peptides1, cds1, transcripts2, peptides2, cds2): """sort out ortholog relationships between transcripts of orthologous genes. Orthologs have: the same number of exons compatible intron/exon boundaries For the remaining transcript pairs, take reciprocal bet hits. I see the following: 0: 0(100%), 1: 0(94%), 2: 0,1(100%) 0: 0(100%), 1: 0,1,2(100%) Selecting 1-0 first, would result in a suboptimal match, because one transcript is longer than the other, while matching up 0-0 and 2-1 would be better. Objective function: it is the maximal matching/assignment problem. Use greedy implementation instead. Assign as much as possible according to descending weights. """ alignator = alignlib_lite.makeAlignatorDPFull( alignlib_lite.ALIGNMENT_LOCAL, -10.0, -2.0) # for long sequence: use dot alignment with tuple size of three dottor = alignlib_lite.makeAlignatorTuples(3) alignator_dots = alignlib_lite.makeAlignatorDotsSquared( param_gop, param_gep, dottor) seqs1 = map(lambda x: alignlib_lite.makeSequence(peptides1[x[0]]), transcripts1) seqs2 = map(lambda x: alignlib_lite.makeSequence(peptides2[x[0]]), transcripts2) if param_loglevel >= 4: print "# building sequence 1" for i in range(len(seqs1)): if not cds1.has_key(transcripts1[i][0]): if param_loglevel >= 4: print "# %s not found" % transcripts1[i][0] if param_loglevel >= 4: print "# building sequence 2" for i in range(len(seqs2)): if not cds2.has_key(transcripts2[i][0]): if param_loglevel >= 4: print "# %s not found" % transcripts1[i][0] if param_loglevel >= 4: print "# all-vs-all alignment" # do all versus all alignment alis1 = [] alis2 = [] for i in range(len(seqs1)): alis1.append([]) for i in range(len(seqs2)): alis2.append([]) if param_loglevel >= 3: print "#################################" for i in range(len(seqs1)): for cd in cds1[transcripts1[i][0]]: print "#", str(cd) print "# versus" for i in range(len(seqs2)): for cd in cds2[transcripts2[i][0]]: print "#", str(cd) 
sys.stdout.flush() weights = {} for i in range(len(seqs1)): prediction_id1, sbjct_token1, sbjct_strand1, sbjct_from1, sbjct_to1 = transcripts1[ i] for j in range(len(seqs2)): prediction_id2, sbjct_token2, sbjct_strand2, sbjct_from2, sbjct_to2 = transcripts2[ j] map_a2b = alignlib_lite.makeAlignmentVector() m = seqs1[i].getLength() * seqs2[j].getLength() if param_loglevel >= 3: print "# Starting alignment of pair (%i,%i) of lengths %s:%i and %s:%i" %\ (i, j, prediction_id1, seqs1[ i].getLength(), prediction_id2, seqs2[j].getLength()) sys.stdout.flush() if m > param_max_matrix_size: # switch to tuple alignment if sequences are too large if param_loglevel >= 2: print "# WARNING: sequences are of length %i and %i: switching to dot alignment." % ( seqs1[i].getLength(), seqs2[j].getLength()) sys.stdout.flush() alignator_dots.align(map_a2b, seqs1[i], seqs2[j]) else: alignator.align(map_a2b, seqs1[i], seqs2[j]) coverage_a = 100.0 * \ (map_a2b.getRowTo() - map_a2b.getRowFrom() + 1) / \ seqs1[i].getLength() coverage_b = 100.0 * \ (map_a2b.getColTo() - map_a2b.getColFrom() + 1) / \ seqs2[j].getLength() # get copy of cds, but only those overlapping with alignment c1 = Exons.GetExonsRange( cds1[prediction_id1], (map_a2b.getRowFrom() - 1) * 3, (map_a2b.getRowTo()) * 3 + 1, full=False, min_overlap=param_min_alignment_exon_overlap, min_exon_size=param_min_exon_size) c2 = Exons.GetExonsRange( cds2[prediction_id2], (map_a2b.getColFrom() - 1) * 3, (map_a2b.getColTo()) * 3 + 1, full=False, min_overlap=param_min_alignment_exon_overlap, min_exon_size=param_min_exon_size) # check exon boundaries, look at starts, skip first exon def MyMap(a, x): while x <= a.getRowTo(): c = a.mapRowToCol(x) if c: return c x += 1 else: return 0 mapped_boundaries = map( lambda x: MyMap(map_a2b, x.mPeptideFrom / 3 + 1), c1[1:]) mapped_boundaries.sort() reference_boundaries = map(lambda x: x.mPeptideFrom / 3 + 1, c2[1:]) reference_boundaries.sort() nmissed_cmp2ref = Exons.CountMissedBoundaries( 
mapped_boundaries, reference_boundaries, param_boundaries_max_slippage) nmissed_ref2cmp = Exons.CountMissedBoundaries( reference_boundaries, mapped_boundaries, param_boundaries_max_slippage) min_nmissed = min(nmissed_cmp2ref, nmissed_ref2cmp) # set is_ok for the whole thing # no intron: is ok is_ok = 0 if (len(c1) == 1 and len(c2) == 1): is_ok = 1 else: # allow for missed boundaries, if param_boundaries_allow_missed # > 0 if min_nmissed == 0: is_ok = 1 else: if param_boundaries_allow_missed and \ len(mapped_boundaries) >= param_boundaries_allow_missed and \ min_nmissed <= param_boundaries_max_missed: is_ok = 1 cc = min(coverage_a, coverage_b) if cc >= param_min_coverage: is_ok_coverage = 1 else: is_ok_coverage = 0 # check for missing introns is_ok_exons = 1 if abs(len(c1) - len(c2)) != 0: if param_missing_max_missing: if ((abs(len(c1) - len(c2)) > param_missing_max_missing) or (min(len(c1), len(c2)) < param_missing_min_present)): is_ok_exons = 0 else: is_ok_exons = 0 if param_loglevel >= 3: print "# i=", i, "li=", len(c1), "j=", j, "lj=", len(c2), \ "boundaries_ok=", is_ok, \ "nexons_ok=", is_ok_exons, \ "missed_c2r=", nmissed_cmp2ref, \ "missed_r2c=", nmissed_ref2cmp, \ "min_cov=", cc, \ "mapped=", mapped_boundaries, \ "reference=", reference_boundaries print "#", string.join( map(str, (alignlib_lite.AlignmentFormatEmissions(map_a2b), map_a2b.getNumGaps(), coverage_a, coverage_b)), "\t") sys.stdout.flush() # dump out pairs for method in param_write_pairs: if method == "all": print string.join( map(str, ("pair", method, prediction_id1, prediction_id2, sbjct_token1, sbjct_strand1, sbjct_from1, sbjct_to1, seqs1[i].getLength(), sbjct_token2, sbjct_strand2, sbjct_from2, sbjct_to2, seqs2[j].getLength(), map_a2b.getRowFrom(), map_a2b.getRowTo(), row_ali, map_a2b.getColFrom(), map_a2b.getColTo(), col_ali, map_a2b.getNumGaps(), coverage_a, coverage_b, nmissed_cmp2ref, mapped_boundaries, nmissed_ref2cmp, reference_boundaries, i, j, len(c1), len(c2), cc, is_ok, is_ok_exons, 
is_ok_coverage)), "\t") elif method == "alignment": print string.join( map(str, ("pair", method, prediction_id1, prediction_id2, map_a2b.getRowFrom(), map_a2b.getRowTo(), row_ali, map_a2b.getColFrom(), map_a2b.getColTo(), col_ali, map_a2b.getNumGaps(), coverage_a, coverage_b)), "\t") elif method == "location": print string.join( map(str, ("pair", method, prediction_id1, prediction_id2, sbjct_token1, sbjct_strand1, sbjct_from1, sbjct_to1, seqs1[i].getLength(), sbjct_token2, sbjct_strand2, sbjct_from2, sbjct_to2, seqs2[j].getLength())), "\t") if not is_ok_exons: if param_loglevel >= 4: print "# rejected %i and %i: too many exons difference." % ( i, j) continue if param_check_exon_boundaries: if not is_ok: continue if cc < param_min_coverage: continue if not weights.has_key(cc): weights[cc] = [] alis1[i].append((coverage_a, j)) alis2[j].append((coverage_b, i)) weights[cc].append((i, j, map_a2b)) # sort out alignments ww = weights.keys() ww.sort() ww.reverse() pairs = [] assigned1 = {} assigned2 = {} if param_loglevel >= 3: print "# alis1=", alis1 print "# alis2=", alis2 print "# --------------------------------------" for w in ww: for i, j, map_a2b in weights[w]: if not assigned1.has_key(i) and not assigned2.has_key(j): pairs.append((transcripts1[i], transcripts2[j], w, map_a2b)) assigned1[i] = 1 assigned2[j] = 1 if len(assigned1) == len(transcripts1): break if len(assigned2) == len(transcripts2): break return pairs
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if argv == None: argv = sys.argv parser = E.OptionParser( version= "%prog version: $Id: gpipe/compare_predictions2exons.py 2011 2008-07-04 10:40:51Z andreas $", usage=globals()["__doc__"]) parser.add_option("-g", "--genome-file", dest="genome_file", type="string", help="filename with genome.") parser.add_option("-b", "--boundaries", dest="filename_boundaries", type="string", help="filename with exon boundaries.") parser.add_option("-e", "--exons", dest="filename_exons", type="string", help="filename with exons (output).") parser.add_option("-p", "--peptides", dest="filename_peptides", type="string", help="filename with peptide sequences.") parser.add_option( "-w", "--write-notfound", dest="write_notfound", action="store_true", help="print exons for predictions not found in reference.") parser.add_option("-q", "--quality-pide", dest="quality_threshold_pide", type="int", help="quality threshold (pide) for exons.") parser.set_defaults( genome_file="genome", filename_boundaries=None, filename_exons=None, filename_peptides=None, quality_threshold_pide=0, write_notfound=False, ## allowed number of nucleotides for exon boundaries to ## be considered equivalent. slipping_exon_boundary=9, ## stop codons to search for stop_codons=("TAG", "TAA", "TGA"), ) (options, args) = E.Start(parser, add_pipe_options=True) if len(args) > 0: print USAGE, "no arguments required." 
sys.exit(2) reference_exon_boundaries = {} if options.filename_boundaries: reference_exon_boundaries = Exons.ReadExonBoundaries(open( options.filename_boundaries, "r"), do_invert=1, remove_utr=1) E.info("read exon boundaries for %i queries" % len(reference_exon_boundaries)) if options.filename_exons: outfile_exons = open(options.filename_exons, "w") outfile_exons.write("%s\n" % "\t".join( ("prediction_id", "exon_id", "exon_from", "exon_to", "exon_frame", "reference_id", "reference_from", "reference_to", "reference_phase", "pidentity", "psimilarity", "nframeshifts", "ngaps", "nstopcodons", "is_ok", "genome_exon_from", "genome_exon_to"))) else: outfile_exons = None if options.filename_peptides: peptide_sequences = Genomics.ReadPeptideSequences( open(options.filename_peptides, "r")) E.info("read peptide sequences for %i queries" % len(peptide_sequences)) else: peptide_sequences = {} entry = PredictionParser.PredictionParserEntry() last_filename_genome = None nfound, nmissed_exons, nmissed_length = 0, 0, 0 nempty_alignments = 0 fasta = IndexedFasta.IndexedFasta(options.genome_file) options.stdout.write("%s\n" % "\t".join( ("prediction_id", "number", "dubious_exons", "boundaries_sum", "boundaries_max", "identical_exons", "inserted_exons", "deleted_exons", "inserted_introns", "deleted_introns", "truncated_Nterminus", "truncated_Cterminus", "deleted_Nexons", "deleted_Cexons", "inserted_Nexons", "inserted_Cexons"))) for line in sys.stdin: if line[0] == "#": continue try: entry.Read(line) except ValueError, msg: print "# parsing failed with msg %s in line %s" % (msg, line[:-1]) sys.exit(1) exons = Genomics.Alignment2ExonBoundaries( entry.mMapPeptide2Genome, query_from=entry.mQueryFrom, sbjct_from=entry.mSbjctGenomeFrom, add_stop_codon=0) if exons[-1][4] != entry.mSbjctGenomeTo: print "# WARNING: discrepancy in exon calculation!!!" 
for e in exons: print "#", str(e) print "#", str(entry) if options.loglevel >= 5: for e in exons: print "#", str(e) genomic_fragment = fasta.getSequence(entry.mSbjctToken, entry.mSbjctStrand, entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo) skip = False if peptide_sequences.has_key(entry.mQueryToken): query_sequence = alignlib_lite.makeSequence( peptide_sequences[entry.mQueryToken]) sbjct_sequence = alignlib_lite.makeSequence(entry.mTranslation) percent_similarity, percent_identity = 0, 0 if query_sequence.getLength( ) < entry.mMapPeptide2Translation.getRowTo(): print "# WARNING: query sequence %s is too short: %i %i" % ( entry.mQueryToken, query_sequence.getLength(), entry.mMapPeptide2Translation.getRowTo()) sys.stdout.flush() nmissed_length += 1 skip = True elif sbjct_sequence.getLength( ) < entry.mMapPeptide2Translation.getColTo(): print "# WARNING: sbjct sequence %s is too short: %i %i" % ( entry.mSbjctToken, sbjct_sequence.getLength(), entry.mMapPeptide2Translation.getColTo()) sys.stdout.flush() nmissed_length += 1 skip = True else: alignlib_lite.rescoreAlignment( entry.mMapPeptide2Translation, query_sequence, sbjct_sequence, alignlib_lite.makeScorer(query_sequence, sbjct_sequence)) percent_identity = alignlib_lite.calculatePercentIdentity( entry.mMapPeptide2Translation, query_sequence, sbjct_sequence) * 100 percent_similarity = alignlib_lite.calculatePercentSimilarity( entry.mMapPeptide2Translation) * 100 E.debug( "prediction %s: percent identity/similarity: before=%5.2f/%5.2f, realigned=%5.2f/%5.2f" % (str(entry.mPredictionId), entry.mPercentSimilarity, entry.mPercentIdentity, percent_similarity, percent_identity)) else: query_sequence = None sbjct_sequence = None # default values exons_num_exons = "na" exons_boundaries_sum = "na" exons_boundaries_max = "na" dubious_exons = "na" ndeleted_exons, ninserted_exons, ndeleted_introns, ninserted_introns, nidentical_exons = 0, 0, 0, 0, 0 truncated_Nterminal_exon, truncated_Cterminal_exon = 0, 0 ndeleted_Nexons, 
ndeleted_Cexons = 0, 0 ninserted_Nexons, ninserted_Cexons = 0, 0 exons_offset = exons[0][3] if not reference_exon_boundaries.has_key(entry.mQueryToken): print "# WARNING: sequence %s has no exon boundaries" % ( entry.mQueryToken) sys.stdout.flush() nmissed_exons += 1 skip = True if not skip: nfound += 1 ref_exons = reference_exon_boundaries[entry.mQueryToken] ref_exons_offset = ref_exons[0].mGenomeFrom exons_num_exons = len(ref_exons) - len(exons) exons_boundaries_sum = 0 exons_phase = 0 exons_boundaries_max = 0 dubious_exons = 0 inserted_exons = 0 temp_inserted_exons = 0 if options.loglevel >= 3: for e in exons: options.stdlog.write("# %s\n" % str(e)) for e in ref_exons: options.stdlog.write("# %s\n" % str(e)) min_pide = entry.mPercentIdentity * options.quality_threshold_pide / 100 in_sync = 0 e, r = 0, 0 while e < len(exons) and r < len(ref_exons): this_e, this_r = e + 1, r + 1 percent_identity = 0 percent_similarity = 0 is_good_exon = 0 if options.loglevel >= 4: options.stdlog.write("# current exons: %i and %i\n" % (e, r)) sys.stdout.flush() exon_from, exon_to, exon_phase, exon_genome_from, exon_genome_to, exon_ali = exons[ e][0:6] ref_from, ref_to, ref_phase, ref_genome_from, ref_genome_to = ( ref_exons[r].mPeptideFrom, ref_exons[r].mPeptideTo, ref_exons[r].frame, ref_exons[r].mGenomeFrom, ref_exons[r].mGenomeTo) ref_genome_from -= ref_exons_offset ref_genome_to -= ref_exons_offset ## get percent identity for exon exon_percent_identity = 0 exon_percent_similarity = 0 if query_sequence and sbjct_sequence: tmp_ali = alignlib_lite.makeAlignmentVector() xquery_from = exon_from / 3 xquery_to = exon_to / 3 alignlib_lite.copyAlignment(tmp_ali, entry.mMapPeptide2Translation, xquery_from, xquery_to) if tmp_ali.getLength() == 0: options.stdlog.write( "# WARNING: empty alignment %s\n" % str( (ref_from, exon_from, ref_to, exon_to, xquery_from, xquery_to))) nempty_alignments += 1 else: if options.loglevel >= 5: options.stdlog.write("# %s\n" % str( 
alignlib_lite.AlignmentFormatExplicit( tmp_ali, query_sequence, sbjct_sequence))) exon_percent_identity = alignlib_lite.calculatePercentIdentity( tmp_ali, query_sequence, sbjct_sequence) * 100 exon_percent_similarity = alignlib_lite.calculatePercentSimilarity( tmp_ali) * 100 if exon_percent_identity >= min_pide: is_good_exon = 1 else: is_good_exon = 0 if e < len(exons) - 1: (next_exon_from, next_exon_to, next_exon_phase, next_exon_genome_from, next_exon_genome_to, next_exon_ali) = exons[e + 1][0:6] else: (next_exon_from, next_exon_to, next_exon_phase, next_exon_genome_from, next_exon_genome_to, next_exon_ali) = 0, 0, 0, 0, 0, [] if r < len(ref_exons) - 1: next_ref_from, next_ref_to, next_ref_phase = ( ref_exons[r + 1].mPeptideFrom, ref_exons[r + 1].mPeptideTo, ref_exons[r + 1].frame) else: next_ref_from, next_ref_to, next_ref_phase = 0, 0, 0 if options.loglevel >= 2: options.stdlog.write("# %s\n" % "\t".join( map(str, (entry.mQueryToken, exon_from, exon_to, exon_phase, exon_genome_from, exon_genome_to, ref_from, ref_to, ref_phase)))) sys.stdout.flush() # beware of small exons. # if less than options.slipping_exon_boundary: boundary is 0 # check if end is more than options.splipping_exon_boundary apart as well. 
if exon_to - exon_from <= options.slipping_exon_boundary or \ ref_to - ref_from <= options.slipping_exon_boundary: boundary = 0 else: boundary = options.slipping_exon_boundary if ref_to <= exon_from + boundary and \ ref_to <= exon_to - options.slipping_exon_boundary: ## no overlap is_good_exon = 0 if e == 0: ndeleted_Nexons += 1 else: ndeleted_exons += 1 r += 1 exon_from, exon_to, exon_phase, exon_genome_from, exon_genome_to = 0, 0, 0, 0, 0 overlap = 0 elif exon_to <= ref_from + boundary and \ exon_to <= ref_to - options.slipping_exon_boundary: ## no overlap is_good_exon = 0 if r == 0: ninserted_Nexons += 1 else: ninserted_exons += 1 e += 1 ref_from, ref_to, ref_phase = 0, 0, 0 overlap = 0 else: ## overlap overlap = 1 dfrom = int(math.fabs(exon_from - ref_from)) dto = int(math.fabs(exon_to - ref_to)) ## get percent identity for overlapping fragment if query_sequence and sbjct_sequence: ## this the problem tmp_ali = alignlib_lite.makeAlignmentVector() xquery_from = max(ref_from / 3, exon_from / 3) xquery_to = min(ref_to / 3, exon_to / 3) alignlib_lite.copyAlignment( tmp_ali, entry.mMapPeptide2Translation, xquery_from, xquery_to) if tmp_ali.getLength() == 0: options.stdlog.write( "# warning: empty alignment %s\n" % str( (ref_from, exon_from, ref_to, exon_to, xquery_from, xquery_to))) percent_identity = 0 percent_similarity = 0 else: if options.loglevel >= 5: print str( alignlib_lite.AlignmentFormatExplicit( tmp_ali, query_sequence, sbjct_sequence)) percent_identity = alignlib_lite.calculatePercentIdentity( tmp_ali, query_sequence, sbjct_sequence) * 100 percent_similarity = alignlib_lite.calculatePercentSimilarity( tmp_ali) * 100 if percent_identity >= min_pide: is_good_exon = 1 else: is_good_exon = 0 dubious_exons += 1 ## adjust regions for terminal exons if e == 0 and r == 0 and dfrom <= (entry.mQueryFrom - 1) * 3 and dfrom > 0: if is_good_exon: truncated_Nterminal_exon = dfrom dfrom = 0 ## truncated terminal exons if e == len(exons) - 1 and r == len( ref_exons) - 1 
and dto <= ( entry.mQueryLength - entry.mQueryTo) * 3 and dto > 0: if is_good_exon: truncated_Cterminal_exon = dto dto = 0 ## do not count deviations for terminal query exons if e == 0 and dfrom <= entry.mQueryFrom * 3 and dfrom > 0: dfrom = 0 if e == len(exons) - 1 and dto <= ( entry.mQueryLength - entry.mQueryTo) * 3 and dto > 0: dto = 0 ## permit difference of one codon (assumed to be stop) if e == len(exons) - 1 and r == len( ref_exons) - 1 and dto == 3: dto = 0 ## deal with different boundary conditions: if dfrom == 0 and dto == 0: if is_good_exon: nidentical_exons += 1 e += 1 r += 1 ## next exon within this ref_exon elif exon_to < ref_to and next_exon_to and next_exon_to <= ref_to + options.slipping_exon_boundary: if is_good_exon: ninserted_introns += 1 e += 1 in_sync = 1 dto = 0 ## next ref_exon within this exon elif ref_to < exon_to and next_ref_to and next_ref_to <= exon_to + options.slipping_exon_boundary: if is_good_exon: ndeleted_introns += 1 r += 1 in_sync = 1 dto = 0 else: e += 1 r += 1 if in_sync: dfrom = 0 if is_good_exon: exons_boundaries_sum += dfrom + dto exons_boundaries_max = max(dfrom, exons_boundaries_max) exons_boundaries_max = max(dto, exons_boundaries_max) ########################################################### ## count inserted/deleted introns and misplaced boundaries ## ## if exon and next_exon in ref_exon: inserted intron ## if ref_exon and next_ref_exon in exon: deleted intron if outfile_exons: if genomic_fragment and exon_genome_to: nintrons, nframeshifts, ngaps, nsplits, nstopcodons, disruptions = Genomics.CountGeneFeatures( exon_genome_from - entry.mSbjctGenomeFrom, exon_ali, genomic_fragment, border_stop_codon=0) else: nintrons, nframeshifts, ngaps, nsplits, nstopcodons = 0, 0, 0, 0, 0 if exon_to == 0: this_e = 0 if ref_to == 0: this_r = 0 outfile_exons.write( string.join( map(str, ( entry.mPredictionId, this_e, exon_from, exon_to, exon_phase, this_r, ref_from, ref_to, ref_phase, percent_identity, percent_similarity, 
nframeshifts, ngaps, nstopcodons, is_good_exon, exon_genome_from, exon_genome_to, )), "\t") + "\n") while e < len(exons): exon_from, exon_to, exon_phase, exon_genome_from, exon_genome_to = exons[ e][0:5] e += 1 ninserted_Cexons += 1 if outfile_exons: outfile_exons.write( string.join( map(str, ( entry.mPredictionId, e, exon_from, exon_to, exon_phase, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, exon_genome_from, exon_genome_to, )), "\t") + "\n") while r < len(ref_exons): ref_from, ref_to, ref_phase, ref_genome_from, ref_genome_to = ( ref_exons[r].mPeptideFrom, ref_exons[r].mPeptideTo, ref_exons[r].frame, ref_exons[r].mGenomeFrom, ref_exons[r].mGenomeTo) ndeleted_Cexons += 1 ref_genome_from -= ref_exons_offset ref_genome_to -= ref_exons_offset r += 1 if outfile_exons: outfile_exons.write( string.join( map(str, ( entry.mPredictionId, 0, 0, 0, 0, r, ref_from, ref_to, ref_phase, 0, 0, 0, 0, 0, 0, 0, 0, )), "\t") + "\n") else: if options.write_notfound: this_e = 0 ## use prediction's identity/similarity for exons. 
## This will still then flag stop-codons in later analysis percent_identity = entry.mPercentIdentity percent_similarity = entry.mPercentSimilarity for exon in exons: this_e += 1 exon_from, exon_to, exon_phase, exon_genome_from, exon_genome_to, exon_ali = exon[ 0:6] if genomic_fragment: nintrons, nframeshifts, ngaps, nsplits, nstopcodons, disruptions = Genomics.CountGeneFeatures( exon_genome_from - entry.mSbjctGenomeFrom, exon_ali, genomic_fragment) outfile_exons.write( string.join( map(str, ( entry.mPredictionId, this_e, exon_from, exon_to, exon_phase, 0, 0, 0, 0, percent_identity, percent_similarity, nframeshifts, ngaps, nstopcodons, 1, exon_genome_from, exon_genome_to, )), "\t") + "\n") options.stdout.write("\t".join( map(str, (entry.mPredictionId, exons_num_exons, dubious_exons, exons_boundaries_sum, exons_boundaries_max, nidentical_exons, ninserted_exons, ndeleted_exons, ninserted_introns, ndeleted_introns, truncated_Nterminal_exon, truncated_Cterminal_exon, ndeleted_Nexons, ndeleted_Cexons, ninserted_Nexons, ninserted_Cexons))) + "\n")
alignator = alignlib_lite.makeAlignatorDPFull( alignlib_lite.ALIGNMENT_LOCAL, -10.0, -2.0) for q1, q2 in pairs: ninput += 1 if param_loglevel >= 1: print "# processing %s and %s" % (q1, q2) if q1 in transcripts1 and q2 in transcripts2: map_a2b = alignlib_lite.makeAlignmentVector() alignator.align(map_a2b, alignlib_lite.makeSequence(peptides1[q1]), alignlib_lite.makeSequence(peptides2[q2])) if map_a2b.getLength() == 0: if param_loglevel >= 1: print "# Alignment failed between %s and %s" % (q1, q2) sys.stdout.flush() ntotal_errors += 1 continue nerrors, nskipped = WriteExons(q1, peptides1[q1], cds1[q1], transcripts1[q1], q2, peptides2[q2], cds2[q2], transcripts2[q2], map_a2b)
def WriteExons(token1, peptide1, cds1, transcript1, token2, peptide2, cds2, transcript2, peptide_map_a2b): if param_loglevel >= 3: for cd in cds1: print "#", str(cd) for cd in cds2: print "#", str(cd) print "# peptide_map_a2b", str( alignlib_lite.AlignmentFormatExplicit(peptide_map_a2b)) sys.stdout.flush() dna_map_a2b = Genomics.AlignmentProtein2CDNA(peptide_map_a2b, cds1, cds2) if len(cds1) != len(cds2): if param_loglevel >= 4: print "" # WARNING: different number of exons!" seq1 = alignlib_lite.makeSequence(transcript1) seq2 = alignlib_lite.makeSequence(transcript2) tmp_map_a2b = alignlib_lite.makeAlignmentVector() dialign = WrapperDialign.Dialign("-n") dialignlgs = WrapperDialign.Dialign("-n -it -thr 2 -lmax 30 -smin 8") dba = WrapperDBA.DBA() #clustal = WrapperClustal.Clustal() matrix, gop, gep = global_substitution_matrix alignator_nw = alignlib_lite.makeAlignatorDPFullDP( alignlib_lite.ALIGNMENT_GLOBAL, gop, gep, matrix) alignator_sw = alignlib_lite.makeAlignatorDPFullDP( alignlib_lite.ALIGNMENT_LOCAL, gop, gep, matrix) # concatenated alignments for exons: # 1: only the common parts ali_common1 = "" ali_common2 = "" e1, e2 = 0, 0 while cds1[e1].mGenomeTo <= dna_map_a2b.getRowFrom(): e1 += 1 while cds2[e2].mGenomeTo <= dna_map_a2b.getColFrom(): e2 += 1 nskipped, nerrors = 0, 0 if param_loglevel >= 5: nmapped = 0 for x in range(dna_map_a2b.getRowFrom(), dna_map_a2b.getRowTo() + 1): if dna_map_a2b.mapRowToCol(x) >= 0: nmapped += 1 print "# nmapped=", nmapped print str(alignlib_lite.AlignmentFormatEmissions(dna_map_a2b)) # declare alignments used map_intron_a2b = alignlib_lite.makeAlignmentVector() result = Exons.CompareGeneStructures(cds1, cds2, map_cmp2ref=peptide_map_a2b) if param_loglevel >= 2: print result.Pretty("#") nskipped_exons, nskipped_introns = 0, 0 last_e1, last_e2 = None, None for link in result.mEquivalences: if link.mCoverage <= param_min_exon_coverage: nskipped_exons += 1 continue e1, e2 = link.mId1, link.mId2 c1 = cds1[e1] c2 = cds2[e2] 
exon_fragment1 = transcript1[c1.mGenomeFrom:c1.mGenomeTo] exon_fragment2 = transcript2[c2.mGenomeFrom:c2.mGenomeTo] ####################################################################### # write unaligned exons if param_write_exons: pair = AlignedPairs.UnalignedPair() pair.mCategory = "exon" pair.mToken1 = token1 pair.mId1 = e1 + 1 pair.mNum1 = len(cds1) pair.mLen1 = len(exon_fragment1) pair.mSequence1 = exon_fragment1 pair.mToken2 = token2 pair.mId2 = e2 + 1 pair.mNum2 = len(cds2) pair.mLen2 = len(exon_fragment2) pair.mSequence2 = exon_fragment2 pair.mFrom1, pair.mTo1 = c1.mGenomeFrom, c1.mGenomeTo, pair.mFrom2, pair.mTo2 = c2.mGenomeFrom, c2.mGenomeTo, print str(pair) sys.stdout.flush() ####################################################################### # build alignment for overlap of both exons # tmp_map_a2b.clear() # alignlib_lite.copyAlignment( tmp_map_a2b, dna_map_a2b, # c1.mGenomeFrom + 1, c1.mGenomeTo ) # if param_loglevel >= 5: # print "# alignment: %i-%i" % (c1.mGenomeFrom + 1, c1.mGenomeTo) # for x in alignlib_lite.writeAlignmentTable( tmp_map_a2b ).split("\n"): # print "#", x # if tmp_map_a2b.getLength() == 0: # if param_loglevel >= 1: # print "# WARNING: empty alignment between exon %i (from %i to %i) and exon %i" % \ ## (e1,c1.mGenomeFrom + 1, c1.mGenomeTo, e2) # print "## peptide_map_a2b", peptide_map_a2b.getRowFrom(), peptide_map_a2b.getRowTo(),\ ## peptide_map_a2b.getColFrom(), peptide_map_a2b.getColTo(), \ # Alignlib.writeAlignmentCompressed(peptide_map_a2b) # print "## dna_map_a2b", dna_map_a2b.getRowFrom(), dna_map_a2b.getRowTo(),\ ## dna_map_a2b.getColFrom(), dna_map_a2b.getColTo(), \ # Alignlib.writeAlignmentCompressed(dna_map_a2b) # for cd in cds1: print "##", str(cd) # for cd in cds2: print "##", str(cd) ## nerrors += 1 # continue ## data = map(lambda x: x.split("\t"), alignlib_lite.writePairAlignment( seq1, seq2, tmp_map_a2b ).split("\n")) # if "caligned" in param_write_exons : # print 
"exon\tcaligned\t%s\t%i\t%s\t%i\t%s\t%s\t%s\t%s\t%s\t%s" % ( token1, e1, ## token2, e2, ## data[0][0], data[0][2], ## data[1][0], data[1][2], # data[0][1], data[1][1] ) ## ali_common1 += data[0][1] ## ali_common2 += data[1][1] ####################################################################### # write alignment of introns for orthologous introns # orthologous introns are between orthologous exons if param_write_introns: if last_e1 is not None: if e1 - last_e1 != 1 or e2 - last_e2 != 1: nskipped_introns += 1 else: pair = AlignedPairs.UnalignedPair() intron_from1 = cds1[e1 - 1].mGenomeTo intron_to1 = cds1[e1].mGenomeFrom intron_from2 = cds2[e2 - 1].mGenomeTo intron_to2 = cds2[e2].mGenomeFrom intron_fragment1 = transcript1[intron_from1:intron_to1] intron_fragment2 = transcript2[intron_from2:intron_to2] if len(intron_fragment1) == 0 or len( intron_fragment2) == 0: print "## ERROR: empty intron fragments: %i-%i out of %i and %i-%i out of %i." %\ (intron_from1, intron_to1, len(transcript1), intron_from2, intron_to2, len(transcript2)) continue pair.mCategory = "intron" pair.mToken1 = token1 pair.mId1 = e1 + 1 pair.mNum1 = len(cds1) - 1 pair.mLen1 = len(intron_fragment1) pair.mFrom1 = intron_from1 pair.mTo1 = intron_to1 pair.mSequence1 = intron_fragment1 pair.mToken2 = token2 pair.mId2 = e2 + 1 pair.mNum1 = len(cds2) - 1 pair.mLen2 = len(intron_fragment2) pair.mFrom2 = intron_from2 pair.mTo2 = intron_to2 pair.mSequence2 = intron_fragment2 if (param_min_intron_length and len(intron_fragment1) < param_min_intron_length) or \ (param_min_intron_length and len(intron_fragment2) < param_min_intron_length) or \ (param_max_intron_length and len(intron_fragment1) > param_max_intron_length) or \ (param_max_intron_length and len(intron_fragment2) > param_max_intron_length): if param_loglevel >= 1: print "# skipped: fragment lengths out of bounds for: %s\t%s\t%s\t%s\t%i\t%i" %\ (token1, e1, token2, e2, len(intron_fragment1), len(intron_fragment2)) sys.stdout.flush() nskipped += 1 
print str(pair) # else: ## anchored_from1 = intron_from1 - param_extend_introns ## anchored_to1 = intron_to1 + param_extend_introns ## anchored_from2 = intron_from2 - param_extend_introns ## anchored_to2 = intron_to2 + param_extend_introns ## anchored_fragment1 = transcript1[anchored_from1:anchored_to1] ## anchored_fragment2 = transcript2[anchored_from2:anchored_to2] # for method in param_write_introns: # if param_loglevel >= 2: # print "## aligning with method %s" % method # sys.stdout.flush # map_intron_a2b.clear() # if method == "unaligned": ## from1, to1, ali1, from2, to2, ali2 = 0, 0, intron_fragment1, 0, 0, intron_fragment2 # elif method in ("dialigned", "dbaligned", "clusaligned", "dialignedlgs"): ## tmp_intron_a2b = alignlib_lite.makeAlignmentVector() # if param_loglevel >= 1: # print "# aligning with method %s two fragments of length %i and %i" % (method, # len(anchored_fragment1), # len(anchored_fragment2)) # sys.stdout.flush() # if method == "dialigned": ## result = dialign.Align( anchored_fragment1, anchored_fragment2, tmp_intron_a2b ) # elif method == "dialignedlgs": ## result = dialignlgs.Align( anchored_fragment1, anchored_fragment2, tmp_intron_a2b ) # elif method == "dbaligned": ## result = dba.Align( anchored_fragment1, anchored_fragment2, tmp_intron_a2b ) # elif method == "clusaligned": ## result = clustal.Align( anchored_fragment1, anchored_fragment2, tmp_intron_a2b ) # if not result or result.getLength() == 0: # if param_loglevel >= 1: # print "# Error: empty intron alignment" # sys.stdout.flush() ## nerrors += 1 # continue ## tmp_intron_a2b.moveAlignment( anchored_from1, anchored_from2 ) # alignlib_lite.copyAlignment( map_intron_a2b, tmp_intron_a2b, ## intron_from1 + 1, intron_to1, # intron_from2 + 1, intron_to2 ) # elif method == "nwaligned": ## seq1.useSegment( cds1[e1-1].mGenomeTo + 1, cds1[e1].mGenomeFrom ) ## seq2.useSegment( cds2[e2-1].mGenomeTo + 1, cds2[e2].mGenomeFrom ) ## alignator_nw.Align( seq1, seq2, map_intron_a2b ) # 
seq1.useFullLength() # seq2.useFullLength() # elif method == "swaligned": ## seq1.useSegment( cds1[e1-1].mGenomeTo + 1, cds1[e1].mGenomeFrom ) ## seq2.useSegment( cds2[e2-1].mGenomeTo + 1, cds2[e2].mGenomeFrom ) ## alignlib_lite.performIterativeAlignment( map_intron_a2b, seq1, seq2, alignator_sw, param_min_score_sw ) # seq1.useFullLength() # seq2.useFullLength() # else: ## raise "unknown method %s" % method # if map_intron_a2b.getLength() > 0: # if param_compress: ## from1, to1 = map_intron_a2b.getRowFrom(), map_intron_a2b.getRowTo() ## from2, to2 = map_intron_a2b.getColFrom(), map_intron_a2b.getColTo() ## ali1, ali2 = Alignlib.writeAlignmentCompressed( map_intron_a2b ) # else: # data = map(lambda x: x.split("\t"), # alignlib_lite.writePairAlignment( seq1, seq2, map_intron_a2b ).split("\n")) # if len(data) < 2: ## data=[ ( 0, "", 0), (0, "", 0)] ## from1, ali1, to1 = data[0] ## from2, ali2, to2 = data[1] # print string.join(map(str, ("intron", # method, ## token1, e1, len(cds1) - 1, len(intron_fragment1), ## token2, e2, len(cds2) - 1, len(intron_fragment2), # map_intron_a2b.getNumGaps(), # map_intron_a2b.getLength(), ## map_intron_a2b.getLength() - map_intron_a2b.getNumGaps(), ## from1, to1, ali1, ## from2, to2, ali2, ## intron_from1, intron_to1, # intron_from2, intron_to2)), "\t") # sys.stdout.flush() last_e1, last_e2 = e1, e2 ########################################################################## # write concatenated exons # for method in param_write_exons: # if method == "common": # print "exon\tcommon\t%s\t%i\t%s\t%i\t%s\t%s\t%s\t%s\t%s\t%s" % ( token1, 0, ## token2, 0, ## 0, 0, ## 0, 0, # ali_common1, ali_common2 ) # elif method == "exons": # Write full alignment without gaps. # This will not care about exon boundaries and gaps. 
# data = map(lambda x: x.split("\t"), # alignlib_lite.writePairAlignment( seq1, seq2, dna_map_a2b ).split("\n")) # try: ## from1, s1, to1, from2, s2, to2 = data[0] + data[1] # except ValueError: ## from1, to1, from2, to2 = 0, 0, 0, 0 ## s1, s2 = "", "" ## nerrors += 1 # except IndexError: ## from1, to1, from2, to2 = 0, 0, 0, 0 ## s1, s2 = "", "" ## nerrors += 1 # if from1: # if len(s1) != len(s2): # print "# WARNING: alignment of different lengths: %i and %i" % (len(s1), len(s2)) ## nerrors += 1 ## from1, to1, from2, to2 = 0, 0, 0, 0 ## s1, s2 = "", "" # else: ## a1, a2 = [], [] # for x in range( min(len(s1), len(s2)) ): # if s1[x] != "-" and s2[x] != "-": ## a1.append( s1[x] ) ## a2.append( s2[x] ) ## s1 = string.join(a1, "") ## s2 = string.join(a2, "") # print "exon\texons\t%s\t%i\t%s\t%i\t%s\t%s\t%s\t%s\t%s\t%s" % ( (token1, 0, ## token2, 0, ## from1, to1, ## from2, to2, # s1, s2 ) ) # elif method == "full": # write full alignment (do not care about exon boundaries) # data = map(lambda x: x.split("\t"), # alignlib_lite.writePairAlignment( seq1, seq2, dna_map_a2b ).split("\n")) ## if len(data) < 2: data=[ ( 0, "", 0), (0, "", 0)] # print "exon\tfull\t%s\t%i\t%s\t%i\t%s\t%s\t%s\t%s\t%s\t%s" % ( token1, 0, ## token2, 0, ## data[0][0], data[0][2], ## data[1][0], data[1][2], # data[0][1], data[1][1] ) if param_loglevel >= 3: print "# skipped_exons=%i, skipped_introns=%i" % (nskipped_exons, nskipped_introns) return nerrors, nskipped
def ProcessRegion(predictions, region_id, region,
                  peptide_sequences=None,
                  filter_queries=None):
    """process a set of matches to a region.

    Resolve a region according to homology: predictions are sorted by
    descending score and clustered by single linkage.  Two predictions
    join the same cluster when they overlap on the genomic sequence and
    their peptide sequences align with sufficient score, coverage and
    identity (thresholds taken from the module-level ``options``).
    Pairwise alignment verdicts are memoized in ``global_alignments``.

    Each cluster is written out via PrintEdges() under a fresh
    region_id.  Predictions whose query token appears in
    *filter_queries* are counted as skipped instead of emitted.

    If *peptide_sequences* is empty or None, nothing is resolved or
    printed.

    returns (region_id, noutput, nskipped) where region_id is the last
    region id used.
    """
    # avoid a shared mutable default argument; {} was only ever read
    if filter_queries is None:
        filter_queries = {}

    if options.loglevel >= 3:
        options.stdlog.write(
            "###################################################################\n")
        options.stdlog.write(
            "# resolving %i predictions in region %s\n" % (
                len(predictions), str(region)))
        sys.stdout.flush()

    # highest-scoring prediction first: it becomes the cluster seed
    predictions.sort(lambda x, y: cmp(x.score, y.score))
    predictions.reverse()

    alignator = alignlib_lite.makeAlignatorDPFull(
        alignlib_lite.ALIGNMENT_LOCAL, options.gop, options.gep)
    result = alignlib_lite.makeAlignmentVector()

    # map_sequence2cluster[i] == i <=> prediction i is still unclustered
    map_sequence2cluster = range(0, len(predictions))

    noutput, nskipped = 0, 0

    if peptide_sequences:
        for x in range(len(predictions)):

            if options.loglevel >= 5:
                options.stdlog.write("# filtering from %i with prediction %i: %s\n" % (
                    x, predictions[x].mPredictionId,
                    predictions[x].mQueryToken))
                sys.stdout.flush()

            if map_sequence2cluster[x] != x:
                # already absorbed into an earlier cluster
                continue

            region_id += 1
            edges = []

            if predictions[x].mQueryToken not in filter_queries:
                edges.append(predictions[x])
            else:
                nskipped += 1

            for y in range(x + 1, len(predictions)):

                if map_sequence2cluster[y] != y:
                    continue

                # canonical (sorted) key for the memoization cache
                if predictions[x].mQueryToken < predictions[y].mQueryToken:
                    key = "%s-%s" % (predictions[x].mQueryToken,
                                     predictions[y].mQueryToken)
                else:
                    key = "%s-%s" % (predictions[y].mQueryToken,
                                     predictions[x].mQueryToken)

                # check if predictions are overlapping on the genomic
                # sequence
                if min(predictions[x].mSbjctGenomeTo, predictions[y].mSbjctGenomeTo) - \
                        max(predictions[x].mSbjctGenomeFrom, predictions[y].mSbjctGenomeFrom) < 0:
                    if options.loglevel >= 4:
                        options.stdlog.write("# alignment of predictions %i and %i: no overlap on genomic sequence, thus skipped\n" % (
                            predictions[x].mPredictionId,
                            predictions[y].mPredictionId))
                        sys.stdout.flush()
                    continue

                # 'key in dict' replaces the deprecated dict.has_key()
                if key not in global_alignments:
                    seq1 = peptide_sequences[predictions[x].mQueryToken]
                    seq2 = peptide_sequences[predictions[y].mQueryToken]

                    result.clear()
                    s1 = alignlib_lite.makeSequence(seq1)
                    s2 = alignlib_lite.makeSequence(seq2)
                    alignator.align(result, s1, s2)

                    # NOTE: Python 2 integer division - coverages are
                    # truncated to whole percent
                    c1 = 100 * \
                        (result.getRowTo() - result.getRowFrom()) / len(seq1)
                    c2 = 100 * \
                        (result.getColTo() - result.getColFrom()) / len(seq2)
                    min_cov = min(c1, c2)
                    max_cov = max(c1, c2)

                    identity = alignlib_lite.calculatePercentIdentity(
                        result, s1, s2) * 100

                    # check if predictions overlap and they are
                    # homologous
                    if result.getScore() >= options.overlap_min_score and \
                            max_cov >= options.overlap_max_coverage and \
                            min_cov >= options.overlap_min_coverage and \
                            identity >= options.overlap_min_identity:
                        global_alignments[key] = True
                    else:
                        global_alignments[key] = False

                    if options.loglevel >= 4:
                        options.stdlog.write("# alignment=%s score=%i pid=%5.2f c1=%i c2=%i min_cov=%i max_cov=%i homolog=%s\n" % (
                            key,
                            result.getScore(),
                            identity, c1, c2, min_cov, max_cov,
                            global_alignments[key]))
                        sys.stdout.flush()

                if global_alignments[key]:
                    map_sequence2cluster[y] = x
                    if predictions[y].mQueryToken not in filter_queries:
                        edges.append(predictions[y])
                    else:
                        nskipped += 1

            noutput += PrintEdges(region_id, region, edges)

    return region_id, noutput, nskipped
def main( argv = None ): """script main. parses command line options in sys.argv, unless *argv* is given. """ if argv == None: argv = sys.argv parser = E.OptionParser( version = "%prog version: $Id: gpipe/prediction2pairs.py 2031 2008-07-15 09:19:05Z andreas $", usage = globals()["__doc__"]) parser.add_option( "-g", "--genome-file", dest="genome_file", type="string", help="filename with genomic data (indexed)." ) parser.add_option( "-c", "--cds", dest="filename_cds", type="string", help="filename with cds seguences." ) parser.add_option( "-f", "--format", dest="format", type="choice", choices=("paired_fasta", ), help="output format, valid options are: paired_fasta: concatenated pairwise alignments in FASTA format" ) parser.set_defaults( genome_file = "genome", filename_cds = "cds.fasta", format = "paired_fasta", filename_suffix = ".fasta", filename_prefix = "", ) (options, args) = E.Start( parser, add_psql_options = True ) if len(args) > 0: print USAGE, "no arguments required." sys.exit(1) fasta = IndexedFasta.IndexedFasta( options.genome_file ) ## reading CDS sequences if options.filename_cds: cds_sequences = Genomics.ReadPeptideSequences( open(options.filename_cds, "r") ) else: cds_sequences = {} if options.loglevel >= 1: options.stdlog.write( "# read %i CDS sequences\n" % len(cds_sequences) ) last_filename_genome = None p = PredictionParser.PredictionParserEntry() ninput, noutput, nsanity, n3, nlength = 0, 0, 0, 0, 0 for line in options.stdin: if line[0] == "#": continue if line[0] == '"': continue p.Read(line) ninput += 1 genomic_fragment = fasta.getSequence( p.mSbjctToken, p.mSbjctStrand, p.mSbjctGenomeFrom, p.mSbjctGenomeTo ) if len(genomic_fragment) == 0: raise "ERROR: empty fragment %s:%s for line" % (p.mSbjctGenomeFrom, p.mSbjctGenomeTo), line try: cds_fragment = cds_sequences[p.mQueryToken] except KeyError: options.stdlog.write( "# ERROR: cds not found: query %s.\n" % p.mQueryToken ) continue map_query2sbjct, genomic_fragment = Genomics.Alignment2CDNA( 
p.mMapPeptide2Genome, query_from = p.mQueryFrom, sbjct_from = 0, genome = genomic_fragment ) ## check for errors: if map_query2sbjct.getRowTo() != p.mQueryTo * 3: options.stdlog.write( "# ERROR: boundary shift in query at line %s\n# %i %i\n" % (line, map_query2sbjct.getRowTo(), p.mQueryTo * 3 ) ) if map_query2sbjct.getColTo() > len(genomic_fragment): options.stdlog.write( "# ERROR: length mismatch in line %s\n# genomic fragment (%i) shorter than last aligned residue (%i)\n" %\ (line, len(genomic_fragment), map_query2sbjct.getColTo()) ) options.stdlog.write( "# cds %s\n# genomic %s\n" % (str( cds_fragment ), genomic_fragment )) nlength += 1 continue if map_query2sbjct.getRowTo() > len(cds_fragment): options.stdlog.write( "# ERROR: length mismatch in line %s\n# cds fragment (%i) shorter than last aligned residue (%i)\n" %\ (line, len(cds_fragment), map_query2sbjct.getRowTo()) ) options.stdlog.write( "# cds %s\n# genomic %s\n" % (str( cds_fragment ), genomic_fragment )) nlength += 1 continue cds_seq = alignlib_lite.makeSequence( cds_fragment ) genomic_seq = alignlib_lite.makeSequence( genomic_fragment ) f = alignlib_lite.AlignmentFormatExplicit( map_query2sbjct, cds_seq, genomic_seq ) row_ali = f.mRowAlignment col_ali = f.mColAlignment row_ali, col_ali = Genomics.RemoveFrameShiftsFromAlignment(row_ali, col_ali) row_ali = Genomics.MaskStopCodons( row_ali ) col_ali = Genomics.MaskStopCodons( col_ali ) if len(row_ali) != len(col_ali): options.stdlog.write( "# ERROR: wrong alignment lengths.\n" ) sys.exit(1) if len(row_ali) % 3 or len(col_ali) % 3: options.stdlog.write( "# ERROR: sequences are not a multiple of 3 in line: %s\n" % line ) options.stdlog.write( "# %6i %s\n# %6i %s\n" % (len(row_ali), str(row_ali), len(col_ali), str(col_ali) ) ) n3 += 1 input = re.sub( "[-X]", "", p.mTranslation ) ref = re.sub( "[-X]", "", Genomics.TranslateDNA2Protein( col_ali ) ) if input != ref: if options.loglevel >= 1: options.stdlog.write("# sanity check failed for %s - %s\n# %6i %s\n# 
%6i %s\n" % (p.mPredictionId, p.mQueryToken, len(input), input, len(ref), ref ) ) nsanity += 1 continue options.stdout.write( ">%s\n%s\n" % (p.mPredictionId, row_ali) ) options.stdout.write( ">%s_vs_%s_%s_%i_%i\n%s\n" % \ (p.mQueryToken, p.mSbjctToken, p.mSbjctStrand, p.mSbjctGenomeFrom, p.mSbjctGenomeTo, col_ali) ) noutput += 1 if options.loglevel >= 1: options.stdlog.write("# ninput=%i, noutput=%i, nsanity=%i, nlength=%i, n3=%i\n" % (ninput, noutput, nsanity, nlength, n3) ) E.Stop()
if r.mGenomeTo < t.mGenomeFrom: rr += 1 continue elif t.mGenomeTo < r.mGenomeFrom: tt += 1 continue overlap += (min(r.mGenomeTo, t.mGenomeTo) - max(r.mGenomeFrom, t.mGenomeFrom)) rr += 1 tt += 1 if overlap == 0: continue map_reference2target.clear() row = alignlib_lite.makeSequence(reference.mTranslation) col = alignlib_lite.makeSequence(target.mTranslation) alignator.align(map_reference2target, row, col) f = alignlib_lite.AlignmentFormatEmissions(map_reference2target) row_ali, col_ali = f.mRowAlignment, f.mColAlignment pidentity = 100.0 * \ alignlib_lite.calculatePercentIdentity( map_reference2target, row, col) psimilarity = 100.0 * \ alignlib_lite.calculatePercentSimilarity(map_reference2target) union = max( reference.mSbjctGenomeTo, target.mSbjctGenomeTo) - \ min(reference.mSbjctGenomeFrom, target.mSbjctGenomeFrom) inter = min( reference.mSbjctGenomeTo, target.mSbjctGenomeTo) - \ max(reference.mSbjctGenomeFrom, target.mSbjctGenomeFrom)
def EliminateRedundantEntries(rep,
                              data,
                              eliminated_predictions,
                              options,
                              peptides,
                              extended_peptides,
                              filter_quality=None,
                              this_quality=None):
    """eliminate redundant entries in a set.

    *rep* is the representative prediction; *data* are the candidate
    member predictions.  A member is eliminated (recorded in
    *eliminated_predictions* as member_id -> rep_id and returned in the
    result list) with one of these codes:

    * "i": its extended peptide is identical to the representative's
    * "p": its extended peptide is a substring of the representative's
    * "h": sufficiently identical by alignment (>= options.min_identity)
    * "l": identical above options.min_identity_non_genes, where the
      representative is a gene and the member is not

    Members that look better than the representative (higher coverage /
    pid within the safety margins, too many gaps, or low member
    coverage of the alignment) are kept with a warning instead.

    returns the list of (member_id, code) tuples.

    NOTE(review): *filter_quality* is accepted but never read in this
    function - presumably filtering happens in the caller; confirm.
    """
    eliminated = []

    rep_id = rep.transcript_id
    rep_coverage, rep_pid = rep.mQueryCoverage, rep.mPid

    alignator = alignlib_lite.makeAlignatorDPFull(alignlib_lite.ALIGNMENT_LOCAL,
                                                  options.gop, options.gep)
    result = alignlib_lite.makeAlignmentVector()

    rep_seq = peptides[rep_id]
    rep_extended_seq = extended_peptides[rep_id]

    for entry in data:

        mem_id, mem_coverage, mem_pid, mem_quality = (entry.transcript_id,
                                                      entry.mQueryCoverage,
                                                      entry.mPid,
                                                      entry.mQuality,)

        mem_seq = peptides[mem_id]
        mem_extended_seq = extended_peptides[mem_id]

        if options.loglevel >= 4:
            options.stdlog.write("# processing: id=%s class=%s\n" %
                                 (mem_id, mem_quality))

        # already eliminated in an earlier call/iteration
        if mem_id in eliminated_predictions:
            continue

        if mem_extended_seq == rep_extended_seq:
            # identical extended peptide
            eliminated_predictions[mem_id] = rep_id
            eliminated.append((mem_id, "i"))

        elif mem_extended_seq in rep_extended_seq:
            # member is contained in the representative (substring)
            eliminated_predictions[mem_id] = rep_id
            eliminated.append((mem_id, "p"))

        else:
            if mem_quality != this_quality or \
                    mem_quality in options.quality_exclude_same:

                seq1 = alignlib_lite.makeSequence(str(rep_seq))
                seq2 = alignlib_lite.makeSequence(str(mem_seq))

                alignator.align(result, seq1, seq2)

                if options.loglevel >= 5:
                    options.stdlog.write("# ali\n%s\n" % alignlib_lite.AlignmentFormatExplicit(
                        result, seq1, seq2))

                pidentity = 100 * \
                    alignlib_lite.calculatePercentIdentity(
                        result, seq1, seq2)

                num_gaps = result.getNumGaps()

                if options.loglevel >= 4:
                    options.stdlog.write("# processing: id=%s class=%s pid=%5.2f rep_cov=%i mem_cov=%i\n" %
                                         (mem_id, mem_quality, pidentity, rep_coverage, mem_coverage))

                if pidentity >= options.min_identity:

                    keep = False
                    # do not eliminate members that might be better
                    # predictions than the representative
                    if rep_coverage < mem_coverage - options.safety_coverage or \
                            rep_pid < mem_pid - options.safety_pide:
                        keep = True
                        reason = "covpid"
                    elif num_gaps >= options.max_gaps and \
                            mem_coverage > rep_coverage - options.safety_coverage:
                        keep = True
                        reason = "gaps"
                    elif mem_coverage >= rep_coverage - options.safety_coverage and \
                            100 * (result.getColTo() - result.getColFrom()) / len(mem_seq) < options.max_member_coverage:
                        # NOTE: Python 2 integer division - the member
                        # coverage percentage is truncated
                        keep = True
                        reason = "memcov"

                    if keep:
                        options.stdlog.write("# WARNING: not removing possibly good prediction: %s: rep = %s, mem = %s, rep_cov=%i, rep_pid=%i, mem_cov=%i, mem_pid=%i\n" %
                                             (reason, rep_id, mem_id, rep_coverage, rep_pid, mem_coverage, mem_pid))
                    else:
                        eliminated_predictions[mem_id] = rep_id
                        eliminated.append((mem_id, "h"))

                elif pidentity >= options.min_identity_non_genes and \
                        this_quality in options.quality_genes and \
                        mem_quality not in options.quality_genes:

                    if rep_coverage < mem_coverage - options.safety_coverage or \
                            rep_pid < mem_pid - options.safety_pide:
                        options.stdlog.write("# WARNING: not removing possibly good prediction: rep = %s, mem = %s, rep_cov=%i, rep_pid=%i, mem_cov=%i, mem_pid=%i\n" %
                                             (rep_id, mem_id, rep_coverage, rep_pid, mem_coverage, mem_pid))
                    else:
                        eliminated_predictions[mem_id] = rep_id
                        eliminated.append((mem_id, "l"))

    return eliminated
def WriteExons(token1, peptide1, cds1, transcript1, token2, peptide2, cds2, transcript2, peptide_map_a2b): if param_loglevel >= 3: for cd in cds1: print "#", str(cd) for cd in cds2: print "#", str(cd) print "# peptide_map_a2b", str(alignlib_lite.AlignmentFormatExplicit(peptide_map_a2b)) sys.stdout.flush() dna_map_a2b = Genomics.AlignmentProtein2CDNA(peptide_map_a2b, cds1, cds2) if len(cds1) != len(cds2): if param_loglevel >= 4: print "" # WARNING: different number of exons!" seq1 = alignlib_lite.makeSequence(transcript1) seq2 = alignlib_lite.makeSequence(transcript2) tmp_map_a2b = alignlib_lite.makeAlignmentVector() dialign = WrapperDialign.Dialign("-n") dialignlgs = WrapperDialign.Dialign("-n -it -thr 2 -lmax 30 -smin 8") dba = WrapperDBA.DBA() #clustal = WrapperClustal.Clustal() matrix, gop, gep = global_substitution_matrix alignator_nw = alignlib_lite.makeAlignatorDPFullDP( alignlib_lite.ALIGNMENT_GLOBAL, gop, gep, matrix) alignator_sw = alignlib_lite.makeAlignatorDPFullDP( alignlib_lite.ALIGNMENT_LOCAL, gop, gep, matrix) # concatenated alignments for exons: # 1: only the common parts ali_common1 = "" ali_common2 = "" e1, e2 = 0, 0 while cds1[e1].mGenomeTo <= dna_map_a2b.getRowFrom(): e1 += 1 while cds2[e2].mGenomeTo <= dna_map_a2b.getColFrom(): e2 += 1 nskipped, nerrors = 0, 0 if param_loglevel >= 5: nmapped = 0 for x in range(dna_map_a2b.getRowFrom(), dna_map_a2b.getRowTo() + 1): if dna_map_a2b.mapRowToCol(x) >= 0: nmapped += 1 print "# nmapped=", nmapped print str(alignlib_lite.AlignmentFormatEmissions(dna_map_a2b)) # declare alignments used map_intron_a2b = alignlib_lite.makeAlignmentVector() result = Exons.CompareGeneStructures( cds1, cds2, map_cmp2ref=peptide_map_a2b) if param_loglevel >= 2: print result.Pretty("#") nskipped_exons, nskipped_introns = 0, 0 last_e1, last_e2 = None, None for link in result.mEquivalences: if link.mCoverage <= param_min_exon_coverage: nskipped_exons += 1 continue e1, e2 = link.mId1, link.mId2 c1 = cds1[e1] c2 = cds2[e2] 
exon_fragment1 = transcript1[c1.mGenomeFrom:c1.mGenomeTo] exon_fragment2 = transcript2[c2.mGenomeFrom:c2.mGenomeTo] ####################################################################### # write unaligned exons if param_write_exons: pair = AlignedPairs.UnalignedPair() pair.mCategory = "exon" pair.mToken1 = token1 pair.mId1 = e1 + 1 pair.mNum1 = len(cds1) pair.mLen1 = len(exon_fragment1) pair.mSequence1 = exon_fragment1 pair.mToken2 = token2 pair.mId2 = e2 + 1 pair.mNum2 = len(cds2) pair.mLen2 = len(exon_fragment2) pair.mSequence2 = exon_fragment2 pair.mFrom1, pair.mTo1 = c1.mGenomeFrom, c1.mGenomeTo, pair.mFrom2, pair.mTo2 = c2.mGenomeFrom, c2.mGenomeTo, print str(pair) sys.stdout.flush() ####################################################################### # build alignment for overlap of both exons # tmp_map_a2b.clear() # alignlib_lite.copyAlignment( tmp_map_a2b, dna_map_a2b, # c1.mGenomeFrom + 1, c1.mGenomeTo ) # if param_loglevel >= 5: # print "# alignment: %i-%i" % (c1.mGenomeFrom + 1, c1.mGenomeTo) # for x in alignlib_lite.writeAlignmentTable( tmp_map_a2b ).split("\n"): # print "#", x # if tmp_map_a2b.getLength() == 0: # if param_loglevel >= 1: # print "# WARNING: empty alignment between exon %i (from %i to %i) and exon %i" % \ ## (e1,c1.mGenomeFrom + 1, c1.mGenomeTo, e2) # print "## peptide_map_a2b", peptide_map_a2b.getRowFrom(), peptide_map_a2b.getRowTo(),\ ## peptide_map_a2b.getColFrom(), peptide_map_a2b.getColTo(), \ # Alignlib.writeAlignmentCompressed(peptide_map_a2b) # print "## dna_map_a2b", dna_map_a2b.getRowFrom(), dna_map_a2b.getRowTo(),\ ## dna_map_a2b.getColFrom(), dna_map_a2b.getColTo(), \ # Alignlib.writeAlignmentCompressed(dna_map_a2b) # for cd in cds1: print "##", str(cd) # for cd in cds2: print "##", str(cd) ## nerrors += 1 # continue ## data = map(lambda x: x.split("\t"), alignlib_lite.writePairAlignment( seq1, seq2, tmp_map_a2b ).split("\n")) # if "caligned" in param_write_exons : # print 
"exon\tcaligned\t%s\t%i\t%s\t%i\t%s\t%s\t%s\t%s\t%s\t%s" % ( token1, e1, ## token2, e2, ## data[0][0], data[0][2], ## data[1][0], data[1][2], # data[0][1], data[1][1] ) ## ali_common1 += data[0][1] ## ali_common2 += data[1][1] ####################################################################### # write alignment of introns for orthologous introns # orthologous introns are between orthologous exons if param_write_introns: if last_e1 is not None: if e1 - last_e1 != 1 or e2 - last_e2 != 1: nskipped_introns += 1 else: pair = AlignedPairs.UnalignedPair() intron_from1 = cds1[e1 - 1].mGenomeTo intron_to1 = cds1[e1].mGenomeFrom intron_from2 = cds2[e2 - 1].mGenomeTo intron_to2 = cds2[e2].mGenomeFrom intron_fragment1 = transcript1[intron_from1:intron_to1] intron_fragment2 = transcript2[intron_from2:intron_to2] if len(intron_fragment1) == 0 or len(intron_fragment2) == 0: print "## ERROR: empty intron fragments: %i-%i out of %i and %i-%i out of %i." %\ (intron_from1, intron_to1, len(transcript1), intron_from2, intron_to2, len(transcript2)) continue pair.mCategory = "intron" pair.mToken1 = token1 pair.mId1 = e1 + 1 pair.mNum1 = len(cds1) - 1 pair.mLen1 = len(intron_fragment1) pair.mFrom1 = intron_from1 pair.mTo1 = intron_to1 pair.mSequence1 = intron_fragment1 pair.mToken2 = token2 pair.mId2 = e2 + 1 pair.mNum1 = len(cds2) - 1 pair.mLen2 = len(intron_fragment2) pair.mFrom2 = intron_from2 pair.mTo2 = intron_to2 pair.mSequence2 = intron_fragment2 if (param_min_intron_length and len(intron_fragment1) < param_min_intron_length) or \ (param_min_intron_length and len(intron_fragment2) < param_min_intron_length) or \ (param_max_intron_length and len(intron_fragment1) > param_max_intron_length) or \ (param_max_intron_length and len(intron_fragment2) > param_max_intron_length): if param_loglevel >= 1: print "# skipped: fragment lengths out of bounds for: %s\t%s\t%s\t%s\t%i\t%i" %\ (token1, e1, token2, e2, len(intron_fragment1), len(intron_fragment2)) sys.stdout.flush() nskipped += 1 
print str(pair) # else: ## anchored_from1 = intron_from1 - param_extend_introns ## anchored_to1 = intron_to1 + param_extend_introns ## anchored_from2 = intron_from2 - param_extend_introns ## anchored_to2 = intron_to2 + param_extend_introns ## anchored_fragment1 = transcript1[anchored_from1:anchored_to1] ## anchored_fragment2 = transcript2[anchored_from2:anchored_to2] # for method in param_write_introns: # if param_loglevel >= 2: # print "## aligning with method %s" % method # sys.stdout.flush # map_intron_a2b.clear() # if method == "unaligned": ## from1, to1, ali1, from2, to2, ali2 = 0, 0, intron_fragment1, 0, 0, intron_fragment2 # elif method in ("dialigned", "dbaligned", "clusaligned", "dialignedlgs"): ## tmp_intron_a2b = alignlib_lite.makeAlignmentVector() # if param_loglevel >= 1: # print "# aligning with method %s two fragments of length %i and %i" % (method, # len(anchored_fragment1), # len(anchored_fragment2)) # sys.stdout.flush() # if method == "dialigned": ## result = dialign.Align( anchored_fragment1, anchored_fragment2, tmp_intron_a2b ) # elif method == "dialignedlgs": ## result = dialignlgs.Align( anchored_fragment1, anchored_fragment2, tmp_intron_a2b ) # elif method == "dbaligned": ## result = dba.Align( anchored_fragment1, anchored_fragment2, tmp_intron_a2b ) # elif method == "clusaligned": ## result = clustal.Align( anchored_fragment1, anchored_fragment2, tmp_intron_a2b ) # if not result or result.getLength() == 0: # if param_loglevel >= 1: # print "# Error: empty intron alignment" # sys.stdout.flush() ## nerrors += 1 # continue ## tmp_intron_a2b.moveAlignment( anchored_from1, anchored_from2 ) # alignlib_lite.copyAlignment( map_intron_a2b, tmp_intron_a2b, ## intron_from1 + 1, intron_to1, # intron_from2 + 1, intron_to2 ) # elif method == "nwaligned": ## seq1.useSegment( cds1[e1-1].mGenomeTo + 1, cds1[e1].mGenomeFrom ) ## seq2.useSegment( cds2[e2-1].mGenomeTo + 1, cds2[e2].mGenomeFrom ) ## alignator_nw.Align( seq1, seq2, map_intron_a2b ) # 
seq1.useFullLength() # seq2.useFullLength() # elif method == "swaligned": ## seq1.useSegment( cds1[e1-1].mGenomeTo + 1, cds1[e1].mGenomeFrom ) ## seq2.useSegment( cds2[e2-1].mGenomeTo + 1, cds2[e2].mGenomeFrom ) ## alignlib_lite.performIterativeAlignment( map_intron_a2b, seq1, seq2, alignator_sw, param_min_score_sw ) # seq1.useFullLength() # seq2.useFullLength() # else: ## raise "unknown method %s" % method # if map_intron_a2b.getLength() > 0: # if param_compress: ## from1, to1 = map_intron_a2b.getRowFrom(), map_intron_a2b.getRowTo() ## from2, to2 = map_intron_a2b.getColFrom(), map_intron_a2b.getColTo() ## ali1, ali2 = Alignlib.writeAlignmentCompressed( map_intron_a2b ) # else: # data = map(lambda x: x.split("\t"), # alignlib_lite.writePairAlignment( seq1, seq2, map_intron_a2b ).split("\n")) # if len(data) < 2: ## data=[ ( 0, "", 0), (0, "", 0)] ## from1, ali1, to1 = data[0] ## from2, ali2, to2 = data[1] # print string.join(map(str, ("intron", # method, ## token1, e1, len(cds1) - 1, len(intron_fragment1), ## token2, e2, len(cds2) - 1, len(intron_fragment2), # map_intron_a2b.getNumGaps(), # map_intron_a2b.getLength(), ## map_intron_a2b.getLength() - map_intron_a2b.getNumGaps(), ## from1, to1, ali1, ## from2, to2, ali2, ## intron_from1, intron_to1, # intron_from2, intron_to2)), "\t") # sys.stdout.flush() last_e1, last_e2 = e1, e2 ########################################################################## # write concatenated exons # for method in param_write_exons: # if method == "common": # print "exon\tcommon\t%s\t%i\t%s\t%i\t%s\t%s\t%s\t%s\t%s\t%s" % ( token1, 0, ## token2, 0, ## 0, 0, ## 0, 0, # ali_common1, ali_common2 ) # elif method == "exons": # Write full alignment without gaps. # This will not care about exon boundaries and gaps. 
# data = map(lambda x: x.split("\t"), # alignlib_lite.writePairAlignment( seq1, seq2, dna_map_a2b ).split("\n")) # try: ## from1, s1, to1, from2, s2, to2 = data[0] + data[1] # except ValueError: ## from1, to1, from2, to2 = 0, 0, 0, 0 ## s1, s2 = "", "" ## nerrors += 1 # except IndexError: ## from1, to1, from2, to2 = 0, 0, 0, 0 ## s1, s2 = "", "" ## nerrors += 1 # if from1: # if len(s1) != len(s2): # print "# WARNING: alignment of different lengths: %i and %i" % (len(s1), len(s2)) ## nerrors += 1 ## from1, to1, from2, to2 = 0, 0, 0, 0 ## s1, s2 = "", "" # else: ## a1, a2 = [], [] # for x in range( min(len(s1), len(s2)) ): # if s1[x] != "-" and s2[x] != "-": ## a1.append( s1[x] ) ## a2.append( s2[x] ) ## s1 = string.join(a1, "") ## s2 = string.join(a2, "") # print "exon\texons\t%s\t%i\t%s\t%i\t%s\t%s\t%s\t%s\t%s\t%s" % ( (token1, 0, ## token2, 0, ## from1, to1, ## from2, to2, # s1, s2 ) ) # elif method == "full": # write full alignment (do not care about exon boundaries) # data = map(lambda x: x.split("\t"), # alignlib_lite.writePairAlignment( seq1, seq2, dna_map_a2b ).split("\n")) ## if len(data) < 2: data=[ ( 0, "", 0), (0, "", 0)] # print "exon\tfull\t%s\t%i\t%s\t%i\t%s\t%s\t%s\t%s\t%s\t%s" % ( token1, 0, ## token2, 0, ## data[0][0], data[0][2], ## data[1][0], data[1][2], # data[0][1], data[1][1] ) if param_loglevel >= 3: print "# skipped_exons=%i, skipped_introns=%i" % (nskipped_exons, nskipped_introns) return nerrors, nskipped
def GetOrthologTranscripts(transcripts1, peptides1, cds1, transcripts2, peptides2, cds2): """sort out ortholog relationships between transcripts of orthologous genes. Orthologs have: the same number of exons compatible intron/exon boundaries For the remaining transcript pairs, take reciprocal bet hits. I see the following: 0: 0(100%), 1: 0(94%), 2: 0,1(100%) 0: 0(100%), 1: 0,1,2(100%) Selecting 1-0 first, would result in a suboptimal match, because one transcript is longer than the other, while matching up 0-0 and 2-1 would be better. Objective function: it is the maximal matching/assignment problem. Use greedy implementation instead. Assign as much as possible according to descending weights. """ alignator = alignlib_lite.makeAlignatorDPFull( alignlib_lite.ALIGNMENT_LOCAL, -10.0, -2.0) # for long sequence: use dot alignment with tuple size of three dottor = alignlib_lite.makeAlignatorTuples(3) alignator_dots = alignlib_lite.makeAlignatorDotsSquared( param_gop, param_gep, dottor) seqs1 = map(lambda x: alignlib_lite.makeSequence( peptides1[x[0]]), transcripts1) seqs2 = map(lambda x: alignlib_lite.makeSequence( peptides2[x[0]]), transcripts2) if param_loglevel >= 4: print "# building sequence 1" for i in range(len(seqs1)): if not cds1.has_key(transcripts1[i][0]): if param_loglevel >= 4: print "# %s not found" % transcripts1[i][0] if param_loglevel >= 4: print "# building sequence 2" for i in range(len(seqs2)): if not cds2.has_key(transcripts2[i][0]): if param_loglevel >= 4: print "# %s not found" % transcripts1[i][0] if param_loglevel >= 4: print "# all-vs-all alignment" # do all versus all alignment alis1 = [] alis2 = [] for i in range(len(seqs1)): alis1.append([]) for i in range(len(seqs2)): alis2.append([]) if param_loglevel >= 3: print "#################################" for i in range(len(seqs1)): for cd in cds1[transcripts1[i][0]]: print "#", str(cd) print "# versus" for i in range(len(seqs2)): for cd in cds2[transcripts2[i][0]]: print "#", str(cd) 
sys.stdout.flush() weights = {} for i in range(len(seqs1)): prediction_id1, sbjct_token1, sbjct_strand1, sbjct_from1, sbjct_to1 = transcripts1[ i] for j in range(len(seqs2)): prediction_id2, sbjct_token2, sbjct_strand2, sbjct_from2, sbjct_to2 = transcripts2[ j] map_a2b = alignlib_lite.makeAlignmentVector() m = seqs1[i].getLength() * seqs2[j].getLength() if param_loglevel >= 3: print "# Starting alignment of pair (%i,%i) of lengths %s:%i and %s:%i" %\ (i, j, prediction_id1, seqs1[ i].getLength(), prediction_id2, seqs2[j].getLength()) sys.stdout.flush() if m > param_max_matrix_size: # switch to tuple alignment if sequences are too large if param_loglevel >= 2: print "# WARNING: sequences are of length %i and %i: switching to dot alignment." % (seqs1[i].getLength(), seqs2[j].getLength()) sys.stdout.flush() alignator_dots.align(map_a2b, seqs1[i], seqs2[j]) else: alignator.align(map_a2b, seqs1[i], seqs2[j]) coverage_a = 100.0 * \ (map_a2b.getRowTo() - map_a2b.getRowFrom() + 1) / \ seqs1[i].getLength() coverage_b = 100.0 * \ (map_a2b.getColTo() - map_a2b.getColFrom() + 1) / \ seqs2[j].getLength() # get copy of cds, but only those overlapping with alignment c1 = Exons.GetExonsRange(cds1[prediction_id1], (map_a2b.getRowFrom() - 1) * 3, (map_a2b.getRowTo()) * 3 + 1, full=False, min_overlap=param_min_alignment_exon_overlap, min_exon_size=param_min_exon_size) c2 = Exons.GetExonsRange(cds2[prediction_id2], (map_a2b.getColFrom() - 1) * 3, (map_a2b.getColTo()) * 3 + 1, full=False, min_overlap=param_min_alignment_exon_overlap, min_exon_size=param_min_exon_size) # check exon boundaries, look at starts, skip first exon def MyMap(a, x): while x <= a.getRowTo(): c = a.mapRowToCol(x) if c: return c x += 1 else: return 0 mapped_boundaries = map( lambda x: MyMap(map_a2b, x.mPeptideFrom / 3 + 1), c1[1:]) mapped_boundaries.sort() reference_boundaries = map( lambda x: x.mPeptideFrom / 3 + 1, c2[1:]) reference_boundaries.sort() nmissed_cmp2ref = Exons.CountMissedBoundaries( 
mapped_boundaries, reference_boundaries, param_boundaries_max_slippage) nmissed_ref2cmp = Exons.CountMissedBoundaries( reference_boundaries, mapped_boundaries, param_boundaries_max_slippage) min_nmissed = min(nmissed_cmp2ref, nmissed_ref2cmp) # set is_ok for the whole thing # no intron: is ok is_ok = 0 if (len(c1) == 1 and len(c2) == 1): is_ok = 1 else: # allow for missed boundaries, if param_boundaries_allow_missed # > 0 if min_nmissed == 0: is_ok = 1 else: if param_boundaries_allow_missed and \ len(mapped_boundaries) >= param_boundaries_allow_missed and \ min_nmissed <= param_boundaries_max_missed: is_ok = 1 cc = min(coverage_a, coverage_b) if cc >= param_min_coverage: is_ok_coverage = 1 else: is_ok_coverage = 0 # check for missing introns is_ok_exons = 1 if abs(len(c1) - len(c2)) != 0: if param_missing_max_missing: if ((abs(len(c1) - len(c2)) > param_missing_max_missing) or (min(len(c1), len(c2)) < param_missing_min_present)): is_ok_exons = 0 else: is_ok_exons = 0 if param_loglevel >= 3: print "# i=", i, "li=", len(c1), "j=", j, "lj=", len(c2), \ "boundaries_ok=", is_ok, \ "nexons_ok=", is_ok_exons, \ "missed_c2r=", nmissed_cmp2ref, \ "missed_r2c=", nmissed_ref2cmp, \ "min_cov=", cc, \ "mapped=", mapped_boundaries, \ "reference=", reference_boundaries print "#", string.join(map(str, (alignlib_lite.AlignmentFormatEmissions(map_a2b), map_a2b.getNumGaps(), coverage_a, coverage_b)), "\t") sys.stdout.flush() # dump out pairs for method in param_write_pairs: if method == "all": print string.join(map(str, ( "pair", method, prediction_id1, prediction_id2, sbjct_token1, sbjct_strand1, sbjct_from1, sbjct_to1, seqs1[ i].getLength(), sbjct_token2, sbjct_strand2, sbjct_from2, sbjct_to2, seqs2[ j].getLength(), map_a2b.getRowFrom(), map_a2b.getRowTo(), row_ali, map_a2b.getColFrom(), map_a2b.getColTo(), col_ali, map_a2b.getNumGaps(), coverage_a, coverage_b, nmissed_cmp2ref, mapped_boundaries, nmissed_ref2cmp, reference_boundaries, i, j, len(c1), len(c2), cc, is_ok, is_ok_exons, 
is_ok_coverage)), "\t") elif method == "alignment": print string.join(map(str, ( "pair", method, prediction_id1, prediction_id2, map_a2b.getRowFrom(), map_a2b.getRowTo(), row_ali, map_a2b.getColFrom(), map_a2b.getColTo(), col_ali, map_a2b.getNumGaps(), coverage_a, coverage_b)), "\t") elif method == "location": print string.join(map(str, ( "pair", method, prediction_id1, prediction_id2, sbjct_token1, sbjct_strand1, sbjct_from1, sbjct_to1, seqs1[ i].getLength(), sbjct_token2, sbjct_strand2, sbjct_from2, sbjct_to2, seqs2[j].getLength())), "\t") if not is_ok_exons: if param_loglevel >= 4: print "# rejected %i and %i: too many exons difference." % (i, j) continue if param_check_exon_boundaries: if not is_ok: continue if cc < param_min_coverage: continue if not weights.has_key(cc): weights[cc] = [] alis1[i].append((coverage_a, j)) alis2[j].append((coverage_b, i)) weights[cc].append((i, j, map_a2b)) # sort out alignments ww = weights.keys() ww.sort() ww.reverse() pairs = [] assigned1 = {} assigned2 = {} if param_loglevel >= 3: print "# alis1=", alis1 print "# alis2=", alis2 print "# --------------------------------------" for w in ww: for i, j, map_a2b in weights[w]: if not assigned1.has_key(i) and not assigned2.has_key(j): pairs.append((transcripts1[i], transcripts2[j], w, map_a2b)) assigned1[i] = 1 assigned2[j] = 1 if len(assigned1) == len(transcripts1): break if len(assigned2) == len(transcripts2): break return pairs
alignator = alignlib_lite.makeAlignatorDPFull( alignlib_lite.ALIGNMENT_LOCAL, -10.0, -2.0) for q1, q2 in pairs: ninput += 1 if param_loglevel >= 1: print "# processing %s and %s" % (q1, q2) if q1 in transcripts1 and q2 in transcripts2: map_a2b = alignlib_lite.makeAlignmentVector() alignator.align(map_a2b, alignlib_lite.makeSequence(peptides1[q1]), alignlib_lite.makeSequence(peptides2[q2])) if map_a2b.getLength() == 0: if param_loglevel >= 1: print "# Alignment failed between %s and %s" % (q1, q2) sys.stdout.flush() ntotal_errors += 1 continue nerrors, nskipped = WriteExons(q1, peptides1[q1], cds1[q1], transcripts1[q1], q2, peptides2[q2], cds2[ q2], transcripts2[q2], map_a2b)
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads gene predictions from stdin, extracts the matching genomic and CDS
    fragments, builds the peptide-to-genome alignment as a pair of gapped
    strings, sanity-checks the translation and writes the pair as two FASTA
    records to stdout.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: gpipe/prediction2pairs.py 2031 2008-07-15 09:19:05Z andreas $", usage=globals()["__doc__"])

    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="filename with genomic data (indexed).")

    parser.add_option("-c", "--cds-gtf-file", dest="filename_cds", type="string",
                      help="filename with cds seguences.")

    parser.add_option("-f", "--format", dest="format", type="choice",
                      choices=("paired_fasta", ),
                      help="output format, valid options are: paired_fasta: concatenated pairwise alignments in FASTA format")

    parser.set_defaults(
        genome_file="genome",
        filename_cds="cds.fasta",
        format="paired_fasta",
        filename_suffix=".fasta",
        filename_prefix="",
    )

    (options, args) = E.Start(parser, add_database_options=True)

    if len(args) > 0:
        print USAGE, "no arguments required."
        sys.exit(1)

    fasta = IndexedFasta.IndexedFasta(options.genome_file)

    # reading CDS sequences
    if options.filename_cds:
        cds_sequences = Genomics.ReadPeptideSequences(
            open(options.filename_cds, "r"))
    else:
        cds_sequences = {}

    if options.loglevel >= 1:
        options.stdlog.write("# read %i CDS sequences\n" % len(cds_sequences))

    last_filename_genome = None

    p = PredictionParser.PredictionParserEntry()

    # counters: read, written, failed sanity check, length mismatch,
    # alignment length not a multiple of 3
    ninput, noutput, nsanity, n3, nlength = 0, 0, 0, 0, 0

    for line in options.stdin:

        # skip comment lines and quoted header lines
        if line[0] == "#":
            continue
        if line[0] == '"':
            continue

        p.Read(line)

        ninput += 1

        genomic_fragment = fasta.getSequence(p.mSbjctToken, p.mSbjctStrand,
                                             p.mSbjctGenomeFrom, p.mSbjctGenomeTo)

        # NOTE(review): Python 2 string exception — raising a string fails
        # with TypeError on Python 2.6+; should be a real exception class.
        if len(genomic_fragment) == 0:
            raise "ERROR: empty fragment %s:%s for line" % (
                p.mSbjctGenomeFrom, p.mSbjctGenomeTo), line

        try:
            cds_fragment = cds_sequences[p.mQueryToken]
        except KeyError:
            options.stdlog.write(
                "# ERROR: cds not found: query %s.\n" % p.mQueryToken)
            continue

        map_query2sbjct, genomic_fragment = Genomics.Alignment2CDNA(p.mMapPeptide2Genome,
                                                                    query_from=p.mQueryFrom,
                                                                    sbjct_from=0,
                                                                    genome=genomic_fragment)

        # check for errors:
        if map_query2sbjct.getRowTo() != p.mQueryTo * 3:
            options.stdlog.write("# ERROR: boundary shift in query at line %s\n# %i %i\n" % (
                line, map_query2sbjct.getRowTo(), p.mQueryTo * 3))

        if map_query2sbjct.getColTo() > len(genomic_fragment):
            options.stdlog.write("# ERROR: length mismatch in line %s\n# genomic fragment (%i) shorter than last aligned residue (%i)\n" %
                                 (line, len(genomic_fragment), map_query2sbjct.getColTo()))
            options.stdlog.write(
                "# cds %s\n# genomic %s\n" % (str(cds_fragment), genomic_fragment))
            nlength += 1
            continue

        if map_query2sbjct.getRowTo() > len(cds_fragment):
            options.stdlog.write("# ERROR: length mismatch in line %s\n# cds fragment (%i) shorter than last aligned residue (%i)\n" %
                                 (line, len(cds_fragment), map_query2sbjct.getRowTo()))
            options.stdlog.write(
                "# cds %s\n# genomic %s\n" % (str(cds_fragment), genomic_fragment))
            nlength += 1
            continue

        cds_seq = alignlib_lite.makeSequence(cds_fragment)
        genomic_seq = alignlib_lite.makeSequence(genomic_fragment)

        # render the alignment as two gapped strings (row = cds, col = genome)
        f = alignlib_lite.AlignmentFormatExplicit(
            map_query2sbjct, cds_seq, genomic_seq)
        row_ali = f.mRowAlignment
        col_ali = f.mColAlignment

        row_ali, col_ali = Genomics.RemoveFrameShiftsFromAlignment(
            row_ali, col_ali)

        row_ali = Genomics.MaskStopCodons(row_ali)
        col_ali = Genomics.MaskStopCodons(col_ali)

        if len(row_ali) != len(col_ali):
            options.stdlog.write("# ERROR: wrong alignment lengths.\n")
            sys.exit(1)

        # NOTE(review): n3 is counted here but the entry is NOT skipped —
        # non-multiple-of-3 alignments are still written out. TODO confirm
        # this is intended.
        if len(row_ali) % 3 or len(col_ali) % 3:
            options.stdlog.write(
                "# ERROR: sequences are not a multiple of 3 in line: %s\n" % line)
            options.stdlog.write("# %6i %s\n# %6i %s\n" % (
                len(row_ali), str(row_ali),
                len(col_ali), str(col_ali)))
            n3 += 1

        # sanity check: the back-translated genomic alignment must reproduce
        # the predicted translation (gaps and masked X residues ignored).
        # NOTE(review): 'input' shadows the builtin.
        input = re.sub("[-X]", "", p.mTranslation)
        ref = re.sub("[-X]", "", Genomics.TranslateDNA2Protein(col_ali))

        if input != ref:
            if options.loglevel >= 1:
                options.stdlog.write("# sanity check failed for %s - %s\n# %6i %s\n# %6i %s\n" % (p.mPredictionId,
                                                                                                  p.mQueryToken,
                                                                                                  len(input),
                                                                                                  input,
                                                                                                  len(ref),
                                                                                                  ref))
            nsanity += 1
            continue

        options.stdout.write(">%s\n%s\n" % (p.mPredictionId, row_ali))
        options.stdout.write(">%s_vs_%s_%s_%i_%i\n%s\n" % (
            p.mQueryToken, p.mSbjctToken, p.mSbjctStrand,
            p.mSbjctGenomeFrom, p.mSbjctGenomeTo, col_ali))
        noutput += 1

    if options.loglevel >= 1:
        options.stdlog.write("# ninput=%i, noutput=%i, nsanity=%i, nlength=%i, n3=%i\n" % (
            ninput, noutput, nsanity, nlength, n3))

    E.Stop()
def main( argv = None ):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads gene predictions from stdin and compares each prediction's exon
    structure against reference exon boundaries, counting identical,
    inserted, deleted and truncated exons/introns; writes one summary row
    per prediction and optionally a per-exon table.
    """

    if argv == None: argv = sys.argv

    parser = E.OptionParser( version = "%prog version: $Id: gpipe/compare_predictions2exons.py 2011 2008-07-04 10:40:51Z andreas $",
                                    usage = globals()["__doc__"] )

    parser.add_option( "-g", "--genome-file", dest="genome_file", type="string",
                       help="filename with genome." )

    parser.add_option( "-b", "--boundaries", dest="filename_boundaries", type="string",
                       help="filename with exon boundaries." )

    parser.add_option( "-e", "--exons", dest="filename_exons", type="string",
                       help="filename with exons (output)." )

    parser.add_option( "-p", "--peptides", dest="filename_peptides", type="string",
                       help="filename with peptide sequences." )

    parser.add_option( "-w", "--write-notfound", dest="write_notfound", action="store_true",
                       help="print exons for predictions not found in reference." )

    parser.add_option( "-q", "--quality-pide", dest="quality_threshold_pide", type="int",
                       help="quality threshold (pide) for exons." )

    parser.set_defaults(
        genome_file = "genome",
        filename_boundaries = None,
        filename_exons = None,
        filename_peptides = None,
        quality_threshold_pide = 0,
        write_notfound = False,
        ## allowed number of nucleotides for exon boundaries to
        ## be considered equivalent.
        slipping_exon_boundary = 9,
        ## stop codons to search for
        stop_codons = ("TAG", "TAA", "TGA"), )

    (options, args) = E.Start( parser, add_pipe_options = True )

    if len(args) > 0:
        print USAGE, "no arguments required."
        sys.exit(2)

    # reference exon boundaries, keyed by query token
    reference_exon_boundaries = {}
    if options.filename_boundaries:
        reference_exon_boundaries = Exons.ReadExonBoundaries( open( options.filename_boundaries, "r"),
                                                              do_invert = 1,
                                                              remove_utr = 1)
        E.info( "read exon boundaries for %i queries" % len(reference_exon_boundaries) )

    if options.filename_exons:
        outfile_exons = open( options.filename_exons, "w")
        outfile_exons.write( "%s\n" % "\t".join( (
            "prediction_id", "exon_id", "exon_from", "exon_to", "exon_frame",
            "reference_id", "reference_from", "reference_to", "reference_phase",
            "pidentity", "psimilarity",
            "nframeshifts", "ngaps", "nstopcodons",
            "is_ok", "genome_exon_from", "genome_exon_to") ) )
    else:
        outfile_exons = None

    if options.filename_peptides:
        peptide_sequences = Genomics.ReadPeptideSequences( open(options.filename_peptides, "r") )
        E.info("read peptide sequences for %i queries" % len(peptide_sequences) )
    else:
        peptide_sequences = {}

    entry = PredictionParser.PredictionParserEntry()
    last_filename_genome = None

    nfound, nmissed_exons, nmissed_length = 0, 0, 0
    nempty_alignments = 0

    fasta = IndexedFasta.IndexedFasta( options.genome_file )

    # per-prediction summary header
    options.stdout.write( "%s\n" % "\t".join( (
        "prediction_id",
        "number",
        "dubious_exons",
        "boundaries_sum",
        "boundaries_max",
        "identical_exons",
        "inserted_exons",
        "deleted_exons",
        "inserted_introns",
        "deleted_introns",
        "truncated_Nterminus",
        "truncated_Cterminus",
        "deleted_Nexons",
        "deleted_Cexons",
        "inserted_Nexons",
        "inserted_Cexons" ) ) )

    for line in sys.stdin:

        if line[0] == "#": continue

        try:
            entry.Read(line)
        except ValueError, msg:
            print "# parsing failed with msg %s in line %s" % (msg, line[:-1])
            sys.exit(1)

        # exon list derived from the peptide-to-genome alignment:
        # tuples of (peptide_from, peptide_to, phase, genome_from,
        # genome_to, alignment) — presumably; verify against
        # Genomics.Alignment2ExonBoundaries
        exons = Genomics.Alignment2ExonBoundaries( entry.mMapPeptide2Genome,
                                                   query_from = entry.mQueryFrom,
                                                   sbjct_from = entry.mSbjctGenomeFrom,
                                                   add_stop_codon = 0 )

        if exons[-1][4] != entry.mSbjctGenomeTo:
            print "# WARNING: discrepancy in exon calculation!!!"
            for e in exons:
                print "#", str(e)
            print "#", str(entry)

        if options.loglevel >= 5:
            for e in exons:
                print "#", str(e)

        genomic_fragment = fasta.getSequence( entry.mSbjctToken, entry.mSbjctStrand,
                                              entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo )

        skip = False
        if peptide_sequences.has_key( entry.mQueryToken ):

            query_sequence = alignlib_lite.makeSequence(peptide_sequences[entry.mQueryToken])
            sbjct_sequence = alignlib_lite.makeSequence(entry.mTranslation)

            percent_similarity, percent_identity = 0, 0
            # sequences shorter than the alignment extent indicate stale
            # input; skip those predictions
            if query_sequence.getLength() < entry.mMapPeptide2Translation.getRowTo():
                print "# WARNING: query sequence %s is too short: %i %i" % ( entry.mQueryToken,
                                                                             query_sequence.getLength(),
                                                                             entry.mMapPeptide2Translation.getRowTo())
                sys.stdout.flush()
                nmissed_length += 1
                skip = True

            elif sbjct_sequence.getLength() < entry.mMapPeptide2Translation.getColTo():
                print "# WARNING: sbjct sequence %s is too short: %i %i" % ( entry.mSbjctToken,
                                                                             sbjct_sequence.getLength(),
                                                                             entry.mMapPeptide2Translation.getColTo())
                sys.stdout.flush()
                nmissed_length += 1
                skip = True
            else:
                # rescore the stored alignment against the actual sequences
                alignlib_lite.rescoreAlignment( entry.mMapPeptide2Translation,
                                                query_sequence,
                                                sbjct_sequence,
                                                alignlib_lite.makeScorer( query_sequence, sbjct_sequence ) )
                percent_identity = alignlib_lite.calculatePercentIdentity( entry.mMapPeptide2Translation,
                                                                           query_sequence,
                                                                           sbjct_sequence ) * 100
                percent_similarity = alignlib_lite.calculatePercentSimilarity( entry.mMapPeptide2Translation ) * 100

            E.debug( "prediction %s: percent identity/similarity: before=%5.2f/%5.2f, realigned=%5.2f/%5.2f" % (
                str(entry.mPredictionId),
                entry.mPercentSimilarity,
                entry.mPercentIdentity,
                percent_similarity,
                percent_identity ) )

        else:
            query_sequence = None
            sbjct_sequence = None

        # default values
        exons_num_exons = "na"
        exons_boundaries_sum = "na"
        exons_boundaries_max = "na"
        dubious_exons = "na"

        ndeleted_exons, ninserted_exons, ndeleted_introns, ninserted_introns, nidentical_exons = 0,0,0,0,0
        truncated_Nterminal_exon, truncated_Cterminal_exon = 0,0
        ndeleted_Nexons, ndeleted_Cexons = 0, 0
        ninserted_Nexons, ninserted_Cexons = 0, 0

        exons_offset = exons[0][3]

        if not reference_exon_boundaries.has_key( entry.mQueryToken ):
            print "# WARNING: sequence %s has no exon boundaries" % ( entry.mQueryToken )
            sys.stdout.flush()
            nmissed_exons += 1
            skip = True

        if not skip:

            nfound += 1

            ref_exons = reference_exon_boundaries[entry.mQueryToken]

            ref_exons_offset = ref_exons[0].mGenomeFrom

            exons_num_exons = len(ref_exons) - len(exons)
            exons_boundaries_sum = 0
            exons_phase = 0
            exons_boundaries_max = 0
            dubious_exons = 0

            inserted_exons = 0
            temp_inserted_exons = 0

            if options.loglevel >= 3:
                for e in exons:
                    options.stdlog.write( "# %s\n" % str(e) )
                for e in ref_exons:
                    options.stdlog.write( "# %s\n" % str(e) )

            # minimum exon pide to count an exon as "good"
            min_pide = entry.mPercentIdentity * options.quality_threshold_pide / 100

            # walk both exon lists in parallel with cursors e (prediction)
            # and r (reference)
            in_sync = 0
            e,r = 0,0

            while e < len(exons) and r < len(ref_exons):

                this_e, this_r = e+1, r+1
                percent_identity = 0
                percent_similarity = 0
                is_good_exon = 0

                if options.loglevel >= 4:
                    options.stdlog.write( "# current exons: %i and %i\n" % (e, r) )
                    sys.stdout.flush()

                exon_from, exon_to, exon_phase, exon_genome_from, exon_genome_to, exon_ali = exons[e][0:6]
                ref_from, ref_to, ref_phase, ref_genome_from, ref_genome_to = (ref_exons[r].mPeptideFrom,
                                                                               ref_exons[r].mPeptideTo,
                                                                               ref_exons[r].frame,
                                                                               ref_exons[r].mGenomeFrom,
                                                                               ref_exons[r].mGenomeTo)

                ref_genome_from -= ref_exons_offset
                ref_genome_to -= ref_exons_offset

                ## get percent identity for exon
                exon_percent_identity = 0
                exon_percent_similarity = 0

                if query_sequence and sbjct_sequence:

                    tmp_ali = alignlib_lite.makeAlignmentVector()

                    xquery_from = exon_from / 3
                    xquery_to = exon_to / 3

                    alignlib_lite.copyAlignment( tmp_ali, entry.mMapPeptide2Translation, xquery_from, xquery_to )

                    if tmp_ali.getLength() == 0:
                        options.stdlog.write( "# WARNING: empty alignment %s\n" % str((ref_from, exon_from, ref_to, exon_to, xquery_from, xquery_to)))
                        nempty_alignments += 1
                    else:
                        if options.loglevel >= 5:
                            options.stdlog.write( "# %s\n" % str(
                                alignlib_lite.AlignmentFormatExplicit( tmp_ali, query_sequence, sbjct_sequence ) ) )
                        exon_percent_identity = alignlib_lite.calculatePercentIdentity( tmp_ali,
                                                                                        query_sequence,
                                                                                        sbjct_sequence ) * 100
                        exon_percent_similarity = alignlib_lite.calculatePercentSimilarity( tmp_ali ) * 100

                if exon_percent_identity >= min_pide:
                    is_good_exon = 1
                else:
                    is_good_exon = 0

                # look-ahead values for the next exon on either side
                if e < len(exons) -1 :
                    (next_exon_from, next_exon_to, next_exon_phase, next_exon_genome_from, next_exon_genome_to, next_exon_ali) = exons[e+1][0:6]
                else:
                    (next_exon_from, next_exon_to, next_exon_phase, next_exon_genome_from, next_exon_genome_to, next_exon_ali) = 0, 0, 0, 0, 0, []

                if r < len(ref_exons) - 1:
                    next_ref_from, next_ref_to, next_ref_phase = (ref_exons[r+1].mPeptideFrom,
                                                                  ref_exons[r+1].mPeptideTo,
                                                                  ref_exons[r+1].frame)
                else:
                    next_ref_from, next_ref_to, next_ref_phase = 0, 0, 0

                if options.loglevel >= 2:
                    options.stdlog.write( "# %s\n" % "\t".join( map(str, (entry.mQueryToken,
                                                                          exon_from, exon_to, exon_phase,
                                                                          exon_genome_from, exon_genome_to,
                                                                          ref_from, ref_to, ref_phase ))))
                    sys.stdout.flush()

                # beware of small exons.
                # if less than options.slipping_exon_boundary: boundary is 0
                # check if end is more than options.splipping_exon_boundary apart as well.
                if exon_to - exon_from <= options.slipping_exon_boundary or \
                        ref_to - ref_from <= options.slipping_exon_boundary:
                    boundary = 0
                else:
                    boundary = options.slipping_exon_boundary

                if ref_to <= exon_from + boundary and \
                   ref_to <= exon_to - options.slipping_exon_boundary:
                    ## no overlap: reference exon lies before prediction exon
                    is_good_exon = 0
                    if e == 0:
                        ndeleted_Nexons += 1
                    else:
                        ndeleted_exons += 1
                    r += 1
                    exon_from, exon_to, exon_phase, exon_genome_from, exon_genome_to = 0, 0, 0, 0, 0
                    overlap = 0
                elif exon_to <= ref_from + boundary and \
                     exon_to <= ref_to - options.slipping_exon_boundary:
                    ## no overlap: prediction exon lies before reference exon
                    is_good_exon = 0
                    if r == 0:
                        ninserted_Nexons += 1
                    else:
                        ninserted_exons += 1
                    e += 1
                    ref_from, ref_to, ref_phase = 0, 0, 0
                    overlap = 0
                else:
                    ## overlap
                    overlap = 1

                    dfrom = int(math.fabs(exon_from - ref_from))
                    dto = int(math.fabs(exon_to - ref_to))

                    ## get percent identity for overlapping fragment
                    if query_sequence and sbjct_sequence:
                        ## this the problem
                        tmp_ali = alignlib_lite.makeAlignmentVector()

                        xquery_from = max( ref_from / 3, exon_from / 3)
                        xquery_to = min(ref_to / 3, exon_to / 3)

                        alignlib_lite.copyAlignment( tmp_ali, entry.mMapPeptide2Translation, xquery_from, xquery_to )

                        if tmp_ali.getLength() == 0:
                            options.stdlog.write( "# warning: empty alignment %s\n" % str((ref_from, exon_from, ref_to, exon_to, xquery_from, xquery_to )))
                            percent_identity = 0
                            percent_similarity = 0
                        else:
                            if options.loglevel >= 5:
                                print str( alignlib_lite.AlignmentFormatExplicit( tmp_ali, query_sequence, sbjct_sequence ) )
                            percent_identity = alignlib_lite.calculatePercentIdentity( tmp_ali,
                                                                                       query_sequence,
                                                                                       sbjct_sequence ) * 100
                            percent_similarity = alignlib_lite.calculatePercentSimilarity( tmp_ali ) * 100

                    if percent_identity >= min_pide:
                        is_good_exon = 1
                    else:
                        is_good_exon = 0
                        dubious_exons += 1

                    ## adjust regions for terminal exons
                    if e == 0 and r == 0 and dfrom <= (entry.mQueryFrom - 1) * 3 and dfrom > 0:
                        if is_good_exon:
                            truncated_Nterminal_exon = dfrom
                        dfrom = 0

                    ## truncated terminal exons
                    if e == len(exons)-1 and r == len(ref_exons)-1 and dto <= (entry.mQueryLength - entry.mQueryTo) * 3 and dto > 0:
                        if is_good_exon:
                            truncated_Cterminal_exon = dto
                        dto = 0

                    ## do not count deviations for terminal query exons
                    if e == 0 and dfrom <= entry.mQueryFrom * 3 and dfrom > 0:
                        dfrom = 0

                    if e == len(exons)-1 and dto <= (entry.mQueryLength - entry.mQueryTo) * 3 and dto > 0:
                        dto = 0

                    ## permit difference of one codon (assumed to be stop)
                    if e == len(exons)-1 and r == len(ref_exons)-1 and dto == 3:
                        dto = 0

                    ## deal with different boundary conditions:
                    if dfrom == 0 and dto == 0:
                        if is_good_exon:
                            nidentical_exons += 1
                        e += 1
                        r += 1
                    ## next exon within this ref_exon
                    elif exon_to < ref_to and next_exon_to and next_exon_to <= ref_to + options.slipping_exon_boundary:
                        if is_good_exon:
                            ninserted_introns += 1
                        e += 1
                        in_sync = 1
                        dto = 0
                    ## next ref_exon within this exon
                    elif ref_to < exon_to and next_ref_to and next_ref_to <= exon_to + options.slipping_exon_boundary:
                        if is_good_exon:
                            ndeleted_introns += 1
                        r += 1
                        in_sync = 1
                        dto = 0
                    else:
                        e += 1
                        r += 1
                        if in_sync:
                            dfrom = 0

                    if is_good_exon:
                        exons_boundaries_sum += dfrom + dto
                        exons_boundaries_max = max( dfrom, exons_boundaries_max )
                        exons_boundaries_max = max( dto, exons_boundaries_max )

                    ###########################################################
                    ## count inserted/deleted introns and misplaced boundaries
                    ##
                    ## if exon and next_exon in ref_exon: inserted intron
                    ## if ref_exon and next_ref_exon in exon: deleted intron

                if outfile_exons:

                    if genomic_fragment and exon_genome_to:
                        nintrons, nframeshifts, ngaps, nsplits, nstopcodons, disruptions = Genomics.CountGeneFeatures( exon_genome_from - entry.mSbjctGenomeFrom,
                                                                                                                       exon_ali,
                                                                                                                       genomic_fragment,
                                                                                                                       border_stop_codon = 0 )
                    else:
                        nintrons, nframeshifts, ngaps, nsplits, nstopcodons = 0, 0, 0, 0, 0

                    # exon id 0 flags a side with no exon in this row
                    if exon_to == 0: this_e = 0
                    if ref_to == 0: this_r = 0
                    outfile_exons.write( string.join( map(str, (entry.mPredictionId,
                                                                this_e, exon_from, exon_to, exon_phase,
                                                                this_r, ref_from, ref_to, ref_phase,
                                                                percent_identity,
                                                                percent_similarity,
                                                                nframeshifts, ngaps, nstopcodons,
                                                                is_good_exon,
                                                                exon_genome_from, exon_genome_to,
                                                                )), "\t") + "\n")

            # trailing prediction exons beyond the last reference exon
            while e < len(exons):
                exon_from, exon_to, exon_phase, exon_genome_from, exon_genome_to = exons[e][0:5]
                e += 1
                ninserted_Cexons += 1

                if outfile_exons:
                    outfile_exons.write( string.join( map(str, (entry.mPredictionId,
                                                                e, exon_from, exon_to, exon_phase,
                                                                0, 0, 0, 0,
                                                                0, 0,
                                                                0, 0, 0,
                                                                1,
                                                                exon_genome_from, exon_genome_to,
                                                                )), "\t") + "\n")

            # trailing reference exons beyond the last prediction exon
            while r < len(ref_exons):
                ref_from, ref_to, ref_phase, ref_genome_from, ref_genome_to = (ref_exons[r].mPeptideFrom,
                                                                               ref_exons[r].mPeptideTo,
                                                                               ref_exons[r].frame,
                                                                               ref_exons[r].mGenomeFrom,
                                                                               ref_exons[r].mGenomeTo)
                ndeleted_Cexons += 1
                ref_genome_from -= ref_exons_offset
                ref_genome_to -= ref_exons_offset
                r += 1

                if outfile_exons:
                    outfile_exons.write( string.join( map(str, (entry.mPredictionId,
                                                                0, 0, 0, 0,
                                                                r, ref_from, ref_to, ref_phase,
                                                                0, 0,
                                                                0, 0, 0,
                                                                0,
                                                                0, 0,
                                                                )), "\t") + "\n")
        else:
            if options.write_notfound:
                this_e = 0
                ## use prediction's identity/similarity for exons.
                ## This will still then flag stop-codons in later analysis
                percent_identity = entry.mPercentIdentity
                percent_similarity = entry.mPercentSimilarity

                # NOTE(review): this branch writes to outfile_exons without
                # checking it is not None, and the Genomics.CountGeneFeatures
                # results are unbound when genomic_fragment is empty — both
                # would raise if exercised; TODO confirm.
                for exon in exons:
                    this_e += 1
                    exon_from, exon_to, exon_phase, exon_genome_from, exon_genome_to, exon_ali = exon[0:6]
                    if genomic_fragment:
                        nintrons, nframeshifts, ngaps, nsplits, nstopcodons, disruptions = Genomics.CountGeneFeatures( exon_genome_from - entry.mSbjctGenomeFrom,
                                                                                                                       exon_ali,
                                                                                                                       genomic_fragment )
                    outfile_exons.write( string.join( map(str, (entry.mPredictionId,
                                                                this_e, exon_from, exon_to, exon_phase,
                                                                0, 0, 0, 0,
                                                                percent_identity, percent_similarity,
                                                                nframeshifts, ngaps, nstopcodons,
                                                                1,
                                                                exon_genome_from, exon_genome_to,
                                                                )), "\t") + "\n")

        # one summary row per prediction
        options.stdout.write( "\t".join(map(str, (entry.mPredictionId,
                                                  exons_num_exons,
                                                  dubious_exons,
                                                  exons_boundaries_sum,
                                                  exons_boundaries_max,
                                                  nidentical_exons,
                                                  ninserted_exons,
                                                  ndeleted_exons,
                                                  ninserted_introns,
                                                  ndeleted_introns,
                                                  truncated_Nterminal_exon,
                                                  truncated_Cterminal_exon,
                                                  ndeleted_Nexons,
                                                  ndeleted_Cexons,
                                                  ninserted_Nexons,
                                                  ninserted_Cexons))) + "\n" )
def PrintCluster(cluster, cluster_id, lengths, peptide_sequences=None, regex_preferred=None): """print a cluster. Take longest sequence as representative. If preferred is given, only take genes matching preferred identifier. """ if regex_preferred: rx = re.compile(regex_preferred) else: rx = None max_al = 0 max_pl = 0 rep_a = None rep_p = None for c in cluster: l = 0 if c in lengths: l = lengths[c] if l > max_al: max_al = l rep_a = c if rx and rx.search(c) and l > max_pl: max_pl = l rep_p = c if max_pl > 0: max_l = max_pl rep = rep_p else: max_l = max_al rep = rep_a for mem in cluster: l = 0 if mem in lengths: l = lengths[mem] if peptide_sequences: map_rep2mem = alignlib_lite.makeAlignmentVector() if rep == mem and rep in lengths: alignlib_lite.addDiagonal2Alignment( map_rep2mem, 1, lengths[rep], 0) elif mem in peptide_sequences and \ rep in peptide_sequences: alignator = alignlib_lite.makeAlignatorDPFull( alignlib_lite.ALIGNMENT_LOCAL, -10.0, -1.0) alignator.align(map_rep2mem, alignlib_lite.makeSequence( peptide_sequences[rep]), alignlib_lite.makeSequence(peptide_sequences[mem])) f = alignlib_lite.AlignmentFormatEmissions(map_rep2mem) print string.join(map(str, (rep, mem, l, f)), "\t") else: print string.join(map(str, (rep, mem, l)), "\t") sys.stdout.flush() return cluster_id
def FilterConflicts(old_predictions, new_predictions, removed_predictions, min_overlap, peptide_sequences):
    """remove conflicts.

    Remove overlapping entries between different queries.

    Only remove those sequences, which are alignable. If they are
    alignable, take the sequence with the highest score and highest
    coverage. (Take both, if score and coverage are not correlated.)

    old_predictions: input predictions, sorted in place by genomic region.
    new_predictions: list receiving the kept predictions.
    removed_predictions: list receiving the discarded predictions.
    min_overlap: unused in the visible body (kept for interface
        compatibility).
    peptide_sequences: mapping query token -> peptide sequence; when given,
        conflicting queries are only treated as redundant if their peptides
        align above param_min_score_overlap.

    Returns the number of predictions appended to new_predictions.
    """
    ##########################################################################
    # sort predictions by genomic region
    if isinstance(old_predictions, PredictionFile.PredictionFile):
        old_predictions.sort(
            ('mSbjctToken', 'mSbjctStrand', 'mSbjctGenomeFrom', 'mSbjctGenomeTo'))
    else:
        old_predictions.sort(lambda x, y: cmp((x.mSbjctToken, x.mSbjctStrand, x.mSbjctGenomeFrom, x.mSbjctGenomeTo),
                                              (y.mSbjctToken, y.mSbjctStrand, y.mSbjctGenomeFrom, y.mSbjctGenomeTo)))

    ##########################################################################
    # filter predictions and resolve conflicts based on genomic overlap
    # deleted segments are put in a temporary storage space.
    alignator = alignlib_lite.makeAlignatorDPFull(
        alignlib_lite.ALIGNMENT_LOCAL, param_gop, param_gep)
    result = alignlib_lite.makeAlignmentVector()
    # cache of pairwise peptide alignment scores, keyed by sorted token pair
    alignments = {}
    noverlaps = 0
    nredundants = 0
    nnew = 0
    last_prediction = None

    for this_prediction in old_predictions:
        # extract the gene id from the query token; tokens without four
        # whitespace-separated fields get no gene id
        try:
            this_query_peptide, this_query_status, this_query_gene, this_query_transcript = \
                re.split("\s+", this_prediction.mQueryToken)
        except ValueError:
            this_query_gene = None

        if not last_prediction:
            last_prediction = this_prediction
            last_query_gene = this_query_gene
            continue

        overlap = min(last_prediction.mSbjctGenomeTo, this_prediction.mSbjctGenomeTo) - \
            max(last_prediction.mSbjctGenomeFrom, this_prediction.mSbjctGenomeFrom)
        union = max(last_prediction.mSbjctGenomeTo, this_prediction.mSbjctGenomeTo) - \
            min(last_prediction.mSbjctGenomeFrom, this_prediction.mSbjctGenomeFrom)

        # resolve overlap between different genes
        if overlap > 0 and \
                (last_query_gene != this_query_gene or last_query_gene == None):

            noverlaps += 1

            # NOTE(review): integer division under Python 2, so
            # relative_overlap is truncated to whole percent
            relative_overlap = 100 * overlap / union

            # Start conflict resolution, if overlap is above threshold.
            # Keep higher scoring segment.
            #
            # Check if queries are homologous.
            if relative_overlap >= param_max_percent_overlap:
                if peptide_sequences:
                    # canonical key: lexicographically smaller token first
                    if last_prediction.mQueryToken < this_prediction.mQueryToken:
                        key = "%s-%s" % (last_prediction.mQueryToken, this_prediction.mQueryToken)
                    else:
                        key = "%s-%s" % (this_prediction.mQueryToken, last_prediction.mQueryToken)

                    if not alignments.has_key(key):
                        result.clear()
                        alignator.align(result,
                                        alignlib_lite.makeSequence(peptide_sequences[this_prediction.mQueryToken]),
                                        alignlib_lite.makeSequence(peptide_sequences[last_prediction.mQueryToken]))
                        alignments[key] = result.getScore()
                        if result.getScore() >= param_min_score_overlap:
                            nredundants += 1

                    if alignments[key] >= param_min_score_overlap:
                        is_overlap = 1
                    else:
                        is_overlap = 0
                else:
                    # no peptides available: treat any large overlap as conflict
                    is_overlap = 1
            else:
                is_overlap = 0
        else:
            is_overlap = 0

        if is_overlap:
            # take best prediction. If difference is very small, set
            # difference to 0 (difference does not matter). In this case,
            # the first prediction is taken.
            d1 = last_prediction.mQueryCoverage - this_prediction.mQueryCoverage
            # NOTE(review): d1 is normalized by last's coverage but d2 by
            # this's score — looks inconsistent; confirm intent before changing.
            if float(abs(d1)) / float(last_prediction.mQueryCoverage) < param_conflicts_min_difference:
                d1 = 0
            d2 = last_prediction.score - this_prediction.score
            if float(abs(d2)) / float(this_prediction.score) < param_conflicts_min_difference:
                d2 = 0
            if d1 >= 0 and d2 >= 0:
                # last wins (or tie): drop this_prediction
                if param_loglevel >= 2:
                    print "# CONFLICT: kept %i(%s-%i), overlap=%i(%5.2f), removed: %s" % (
                        last_prediction.mPredictionId,
                        last_prediction.mQueryToken,
                        last_prediction.mSbjctGenomeFrom,
                        overlap, relative_overlap,
                        str(this_prediction))
                # NOTE(review): original indentation was lost; the benchmark
                # check is assumed to sit beside the loglevel check — confirm.
                if param_benchmarks:
                    if CheckBenchmark(this_prediction, last_prediction):
                        print "# BENCHMARK KEPT with overlap=%i(%5.2f): %s" % (
                            overlap, relative_overlap, str(last_prediction))
                removed_predictions.append(this_prediction)
                continue
            elif d1 <= 0 and d2 <= 0:
                # this wins: drop last_prediction and advance
                if param_loglevel >= 2:
                    print "# CONFLICT: kept %i(%s-%i), overlap=%i(%5.2f), removed: %s" % (
                        this_prediction.mPredictionId,
                        this_prediction.mQueryToken,
                        this_prediction.mSbjctGenomeFrom,
                        overlap, relative_overlap,
                        str(last_prediction))
                if param_benchmarks:
                    if CheckBenchmark(last_prediction, this_prediction):
                        print "# BENCHMARK KEPT with overlap=%i(%5.2f): %s" % (
                            overlap, relative_overlap, str(this_prediction))
                removed_predictions.append(last_prediction)
                last_prediction = this_prediction
                last_query_gene = this_query_gene
                continue
            else:
                # score and coverage disagree: keep both predictions
                if param_loglevel >= 2:
                    print "# CONFLICT: non-correlated score/coverage. Keeping both %i(%s-%i) (%5.2f/%i/%i) and %i(%s-%i) (%5.2f/%i/%i)" % \
                        (this_prediction.mPredictionId,
                         this_prediction.mQueryToken,
                         this_prediction.mSbjctGenomeFrom,
                         this_prediction.score,
                         this_prediction.mQueryCoverage,
                         this_prediction.mPercentIdentity,
                         last_prediction.mPredictionId,
                         last_prediction.mQueryToken,
                         last_prediction.mSbjctGenomeFrom,
                         last_prediction.score,
                         last_prediction.mQueryCoverage,
                         last_prediction.mPercentIdentity)

        # no conflict (or both kept): emit last and advance the window
        new_predictions.append(last_prediction)
        nnew += 1
        last_query_gene = this_query_gene
        last_prediction = this_prediction

    # flush the final pending prediction
    new_predictions.append(last_prediction)
    nnew += 1

    if param_loglevel >= 1:
        print "# calculated %i alignments for %i potential conflicts (%i above threshold)" % \
            (len(alignments), noverlaps, nredundants)

    return nnew
print "#", cds_fragment print "# genomic" print "#", genomic_fragment continue if map_query2sbjct.getRowTo() > len(cds_fragment): print "# ERROR: length mismatch: cds fragment (%i) shorter than last aligned residue (%i)" %\ (len(cds_fragment), map_query2sbjct.getRowTo()) print "#", line print "# cds" print "#", cds_fragment print "# genomic" print "#", genomic_fragment continue cds_seq = alignlib_lite.makeSequence(cds_fragment) genomic_seq = alignlib_lite.makeSequence(genomic_fragment) data = map(lambda x: string.split(x, "\t"), string.split(alignlib_lite.writePairAlignment(cds_seq, genomic_seq, map_query2sbjct), "\n")) row_ali, col_ali = Genomics.RemoveFrameShiftsFromAlignment( data[0][1], data[1][1]) row_ali = Genomics.MaskStopCodons(row_ali) col_ali = Genomics.MaskStopCodons(col_ali) if len(row_ali) != len(col_ali): print "# ERROR: wrong alignment lengths."
if r.mGenomeTo < t.mGenomeFrom: rr += 1 continue elif t.mGenomeTo < r.mGenomeFrom: tt += 1 continue overlap += (min(r.mGenomeTo, t.mGenomeTo) - max(r.mGenomeFrom, t.mGenomeFrom)) rr += 1 tt += 1 if overlap == 0: continue map_reference2target.clear() row = alignlib_lite.makeSequence(reference.mTranslation) col = alignlib_lite.makeSequence(target.mTranslation) alignator.align(map_reference2target, row, col) f = alignlib_lite.AlignmentFormatEmissions(map_reference2target) row_ali, col_ali = f.mRowAlignment, f.mColAlignment pidentity = 100.0 * alignlib_lite.calculatePercentIdentity( map_reference2target, row, col) psimilarity = 100.0 * alignlib_lite.calculatePercentSimilarity( map_reference2target) union = max( reference.mSbjctGenomeTo, target.mSbjctGenomeTo) - \ min( reference.mSbjctGenomeFrom, target.mSbjctGenomeFrom ) inter = min( reference.mSbjctGenomeTo, target.mSbjctGenomeTo) - \ max( reference.mSbjctGenomeFrom, target.mSbjctGenomeFrom )
def ProcessRegion(predictions, region_id, region, peptide_sequences=None, filter_queries={}):
    """process a set of matches to a region.

    resolve region according to homology: predictions are greedily clustered
    by pairwise peptide homology (alignment score, coverage and identity
    thresholds from options), and each cluster's edges are written out via
    PrintEdges.

    predictions: matches falling into this region; sorted in place by
        descending score.
    region_id: running cluster counter; incremented per cluster started.
    region: region descriptor, used for logging and passed to PrintEdges.
    peptide_sequences: mapping query token -> peptide; without it nothing
        is clustered or printed.
    filter_queries: query tokens to exclude from output (counted as skipped).
        NOTE(review): mutable default argument — benign here since it is
        only read, but fragile if the function ever mutates it.

    Returns (region_id, noutput, nskipped).
    """
    if options.loglevel >= 3:
        options.stdlog.write(
            "###################################################################\n")
        options.stdlog.write("# resolving %i predictions in region %s\n" % (
            len(predictions), str(region)))
        sys.stdout.flush()

    # process highest-scoring predictions first so the best match seeds
    # each cluster
    predictions.sort(lambda x, y: cmp(x.score, y.score))
    predictions.reverse()

    alignator = alignlib_lite.makeAlignatorDPFull(
        alignlib_lite.ALIGNMENT_LOCAL, options.gop, options.gep)
    result = alignlib_lite.makeAlignmentVector()

    cluster = []

    # map_sequence2cluster[i] == i marks an unclustered prediction; joining
    # a cluster stores the seed's index instead
    map_sequence2cluster = range(0, len(predictions))
    edges = []

    noutput, nskipped = 0, 0

    if peptide_sequences:
        for x in range(len(predictions)):
            if options.loglevel >= 5:
                options.stdlog.write("# filtering from %i with prediction %i: %s\n" % (
                    x, predictions[x].mPredictionId, predictions[x].mQueryToken))
                sys.stdout.flush()

            if map_sequence2cluster[x] != x:
                # already assigned to an earlier cluster
                continue

            region_id += 1
            edges = []

            if predictions[x].mQueryToken not in filter_queries:
                edges.append(predictions[x])
            else:
                nskipped += 1

            for y in range(x + 1, len(predictions)):

                if map_sequence2cluster[y] != y:
                    continue

                # canonical cache key: lexicographically smaller token first
                if predictions[x].mQueryToken < predictions[y].mQueryToken:
                    key = "%s-%s" % (predictions[x].mQueryToken, predictions[y].mQueryToken)
                else:
                    key = "%s-%s" % (predictions[y].mQueryToken, predictions[x].mQueryToken)

                # check if predictions are overlapping on the genomic sequence
                if min(predictions[x].mSbjctGenomeTo, predictions[y].mSbjctGenomeTo) - \
                        max(predictions[x].mSbjctGenomeFrom, predictions[y].mSbjctGenomeFrom) < 0:
                    if options.loglevel >= 4:
                        options.stdlog.write(
                            "# alignment of predictions %i and %i: no overlap on genomic sequence, thus skipped\n" % (
                                predictions[x].mPredictionId, predictions[y].mPredictionId))
                        sys.stdout.flush()
                    continue

                # global_alignments is a module-level cache of homology
                # verdicts shared across regions
                if not global_alignments.has_key(key):

                    seq1 = peptide_sequences[predictions[x].mQueryToken]
                    seq2 = peptide_sequences[predictions[y].mQueryToken]
                    result.clear()
                    s1 = alignlib_lite.makeSequence(seq1)
                    s2 = alignlib_lite.makeSequence(seq2)
                    alignator.align(result, s1, s2)

                    # coverage of each sequence by the aligned range, in
                    # whole percent (Python 2 integer division)
                    c1 = 100 * (result.getRowTo() - result.getRowFrom()) / len(seq1)
                    c2 = 100 * (result.getColTo() - result.getColFrom()) / len(seq2)
                    min_cov = min(c1, c2)
                    max_cov = max(c1, c2)

                    identity = alignlib_lite.calculatePercentIdentity(
                        result, s1, s2) * 100

                    # check if predictions overlap and they are homologous
                    if result.getScore() >= options.overlap_min_score and \
                            max_cov >= options.overlap_max_coverage and \
                            min_cov >= options.overlap_min_coverage and \
                            identity >= options.overlap_min_identity:
                        global_alignments[key] = True
                    else:
                        global_alignments[key] = False

                    if options.loglevel >= 4:
                        options.stdlog.write(
                            "# alignment=%s score=%i pid=%5.2f c1=%i c2=%i min_cov=%i max_cov=%i homolog=%s\n" % (
                                key, result.getScore(), identity, c1, c2,
                                min_cov, max_cov, global_alignments[key]))
                        sys.stdout.flush()

                if global_alignments[key]:
                    # homologous: fold y into x's cluster
                    map_sequence2cluster[y] = x
                    if predictions[y].mQueryToken not in filter_queries:
                        edges.append(predictions[y])
                    else:
                        nskipped += 1

            noutput += PrintEdges(region_id, region, edges)

    return region_id, noutput, nskipped
def PrintCluster(cluster, cluster_id, lengths, peptide_sequences=None, regex_preferred=None): """print a cluster. Take longest sequence as representative. If preferred is given, only take genes matching preferred identifier. """ if regex_preferred: rx = re.compile(regex_preferred) else: rx = None max_al = 0 max_pl = 0 rep_a = None rep_p = None for c in cluster: l = 0 if c in lengths: l = lengths[c] if l > max_al: max_al = l rep_a = c if rx and rx.search(c) and l > max_pl: max_pl = l rep_p = c if max_pl > 0: max_l = max_pl rep = rep_p else: max_l = max_al rep = rep_a for mem in cluster: l = 0 if mem in lengths: l = lengths[mem] if peptide_sequences: map_rep2mem = alignlib_lite.makeAlignmentVector() if rep == mem and rep in lengths: alignlib_lite.addDiagonal2Alignment(map_rep2mem, 1, lengths[rep], 0) elif mem in peptide_sequences and \ rep in peptide_sequences: alignator = alignlib_lite.makeAlignatorDPFull( alignlib_lite.ALIGNMENT_LOCAL, -10.0, -1.0) alignator.align( map_rep2mem, alignlib_lite.makeSequence(peptide_sequences[rep]), alignlib_lite.makeSequence(peptide_sequences[mem])) f = alignlib_lite.AlignmentFormatEmissions(map_rep2mem) print string.join(map(str, (rep, mem, l, f)), "\t") else: print string.join(map(str, (rep, mem, l)), "\t") sys.stdout.flush() return cluster_id
def EliminateRedundantEntries(rep,
                              data,
                              eliminated_predictions,
                              options,
                              peptides,
                              extended_peptides,
                              filter_quality=None,
                              this_quality=None):
    """eliminate redundant entries in a set.

    Compare each member of data against the representative rep and mark
    members as redundant when their (extended) peptide is identical ("i"),
    a substring ("p"), a sufficiently identical homolog ("h"), or a
    lower-quality near-match of a gene-quality representative ("l").

    rep: representative prediction; members are compared against it.
    data: candidate predictions to test for redundancy.
    eliminated_predictions: mapping member id -> representative id; updated
        in place and also used to skip already-eliminated members.
    options: provides gop/gep, loglevel, identity/coverage thresholds and
        quality class sets.
    peptides / extended_peptides: mappings transcript id -> sequence.
    filter_quality: unused in the visible body (kept for interface
        compatibility).
    this_quality: quality class of the representative.

    Returns a list of (member_id, code) tuples for the eliminated members.
    """
    eliminated = []

    rep_id = rep.transcript_id
    rep_coverage, rep_pid = rep.mQueryCoverage, rep.mPid

    alignator = alignlib_lite.makeAlignatorDPFull(
        alignlib_lite.ALIGNMENT_LOCAL, options.gop, options.gep)
    result = alignlib_lite.makeAlignmentVector()

    rep_seq = peptides[rep_id]
    rep_extended_seq = extended_peptides[rep_id]

    for entry in data:

        mem_id, mem_coverage, mem_pid, mem_quality = (entry.transcript_id,
                                                      entry.mQueryCoverage,
                                                      entry.mPid,
                                                      entry.mQuality)

        mem_seq = peptides[mem_id]
        mem_extended_seq = extended_peptides[mem_id]

        if options.loglevel >= 4:
            options.stdlog.write("# processing: id=%s class=%s\n" % (mem_id, mem_quality))

        if mem_id in eliminated_predictions:
            continue

        if mem_extended_seq == rep_extended_seq:
            # identical extended peptide
            eliminated_predictions[mem_id] = rep_id
            eliminated.append((mem_id, "i"))
        elif mem_extended_seq in rep_extended_seq:
            # member is a proper substring of the representative
            eliminated_predictions[mem_id] = rep_id
            eliminated.append((mem_id, "p"))
        else:
            # different sequence: decide by alignment, but only across
            # quality classes unless the class allows same-class removal
            if mem_quality != this_quality or \
                    mem_quality in options.quality_exclude_same:

                seq1 = alignlib_lite.makeSequence(str(rep_seq))
                seq2 = alignlib_lite.makeSequence(str(mem_seq))
                alignator.align(result, seq1, seq2)

                if options.loglevel >= 5:
                    options.stdlog.write("# ali\n%s\n" % alignlib_lite.AlignmentFormatExplicit(result, seq1, seq2))

                pidentity = 100 * alignlib_lite.calculatePercentIdentity(result, seq1, seq2)

                num_gaps = result.getNumGaps()

                if options.loglevel >= 4:
                    options.stdlog.write("# processing: id=%s class=%s pid=%5.2f rep_cov=%i mem_cov=%i\n" %
                                         (mem_id, mem_quality, pidentity, rep_coverage, mem_coverage))

                if pidentity >= options.min_identity:

                    keep = False
                    # safety checks: do not remove a member that might be
                    # better than the representative
                    if rep_coverage < mem_coverage - options.safety_coverage or \
                            rep_pid < mem_pid - options.safety_pide:
                        keep = True
                        reason = "covpid"
                    elif num_gaps >= options.max_gaps and \
                            mem_coverage > rep_coverage - options.safety_coverage:
                        keep = True
                        reason = "gaps"
                    elif mem_coverage >= rep_coverage - options.safety_coverage and \
                            100 * (result.getColTo() - result.getColFrom()) / len(mem_seq) < options.max_member_coverage:
                        keep = True
                        reason = "memcov"

                    if keep:
                        options.stdlog.write("# WARNING: not removing possibly good prediction: %s: rep = %s, mem = %s, rep_cov=%i, rep_pid=%i, mem_cov=%i, mem_pid=%i\n" %
                                             (reason, rep_id, mem_id, rep_coverage, rep_pid, mem_coverage, mem_pid))
                    else:
                        # homologous and safely worse: eliminate
                        eliminated_predictions[mem_id] = rep_id
                        eliminated.append((mem_id, "h"))

                elif pidentity >= options.min_identity_non_genes and \
                        this_quality in options.quality_genes and \
                        mem_quality not in options.quality_genes:
                    # lower identity accepted when a gene-quality rep
                    # absorbs a non-gene member
                    if rep_coverage < mem_coverage - options.safety_coverage or \
                            rep_pid < mem_pid - options.safety_pide:
                        options.stdlog.write("# WARNING: not removing possibly good prediction: rep = %s, mem = %s, rep_cov=%i, rep_pid=%i, mem_cov=%i, mem_pid=%i\n" %
                                             (rep_id, mem_id, rep_coverage, rep_pid, mem_coverage, mem_pid))
                    else:
                        eliminated_predictions[mem_id] = rep_id
                        eliminated.append((mem_id, "l"))

    return eliminated