def CheckAlignments(peptide_sequences, query_token, other_tokens):
    """Check whether the query aligns to at least one of the other tokens.

    The query peptide is locally aligned (gap opening -10.0, gap extension
    -1.0) against every token of ``other_tokens`` that has an entry in
    ``peptide_sequences``; tokens without a sequence are ignored.

    Returns True as soon as one alignment scores above
    ``param_min_alignment_score``, and also when the query itself has no
    entry in ``peptide_sequences``. Returns False otherwise.
    """
    if param_loglevel >= 3:
        print("# checking query %s and sbjcts %s" %
              (query_token, str(other_tokens)))
        sys.stdout.flush()

    # A query without a peptide sequence cannot be tested; it is accepted.
    if query_token not in peptide_sequences:
        return True

    result = alignlib_lite.makeAlignmentVector()
    alignator = alignlib_lite.makeAlignatorDPFull(
        alignlib_lite.ALIGNMENT_LOCAL, -10.0, -1.0)
    row_seq = alignlib_lite.makeSequence(peptide_sequences[query_token])

    for sbjct_token in other_tokens:
        if sbjct_token in peptide_sequences:
            col_seq = alignlib_lite.makeSequence(
                peptide_sequences[sbjct_token])
            alignator.align(result, row_seq, col_seq)

            if param_loglevel >= 5:
                print("# %s - %s = %f" %
                      (query_token, sbjct_token, result.getScore()))

            if result.getScore() > param_min_alignment_score:
                return True

    return False
def CheckAlignments(peptide_sequences, query_token, other_tokens):
    """Check whether the query aligns to at least one of the other tokens.

    The query peptide is locally aligned (gap opening -10.0, gap extension
    -1.0) against every token of ``other_tokens`` that has an entry in
    ``peptide_sequences``; tokens without a sequence are ignored.

    Returns True as soon as one alignment scores above
    ``param_min_alignment_score``, and also when the query itself has no
    entry in ``peptide_sequences``. Returns False otherwise.
    """
    if param_loglevel >= 3:
        print("# checking query %s and sbjcts %s" %
              (query_token, str(other_tokens)))
        sys.stdout.flush()

    # A query without a peptide sequence cannot be tested; it is accepted.
    if query_token not in peptide_sequences:
        return True

    result = alignlib_lite.makeAlignmentVector()
    alignator = alignlib_lite.makeAlignatorDPFull(
        alignlib_lite.ALIGNMENT_LOCAL, -10.0, -1.0)
    row_seq = alignlib_lite.makeSequence(peptide_sequences[query_token])

    for sbjct_token in other_tokens:
        if sbjct_token in peptide_sequences:
            col_seq = alignlib_lite.makeSequence(
                peptide_sequences[sbjct_token])
            alignator.align(result, row_seq, col_seq)

            if param_loglevel >= 5:
                print("# %s - %s = %f" %
                      (query_token, sbjct_token, result.getScore()))

            if result.getScore() > param_min_alignment_score:
                return True

    return False
def FilterConflicts(old_predictions, new_predictions, removed_predictions,
                    min_overlap, peptide_sequences):
    """Remove conflicts between predictions overlapping on the genome.

    Overlapping entries of different queries are resolved. Only
    predictions whose query peptides are alignable are treated as
    conflicting. Of two conflicting predictions the one with the higher
    score and higher coverage is kept; both are kept if score and coverage
    are not correlated.

    Arguments:
        old_predictions: predictions to filter; sorted in place by
            (mSbjctToken, mSbjctStrand, mSbjctGenomeFrom, mSbjctGenomeTo).
        new_predictions: list receiving the predictions that are kept.
        removed_predictions: list receiving the predictions that lost a
            conflict.
        min_overlap: not used by this function; the relative overlap
            threshold is read from ``param_max_percent_overlap``.
        peptide_sequences: map of query token to peptide sequence. If
            empty/None, every sufficiently overlapping pair is a conflict.

    Returns:
        The number of predictions appended to ``new_predictions``
        (0 for empty input).
    """
    ##########################################################################
    # sort predictions by genomic region
    if isinstance(old_predictions, PredictionFile.PredictionFile):
        old_predictions.sort(
            ('mSbjctToken', 'mSbjctStrand',
             'mSbjctGenomeFrom', 'mSbjctGenomeTo'))
    else:
        old_predictions.sort(
            key=lambda p: (p.mSbjctToken, p.mSbjctStrand,
                           p.mSbjctGenomeFrom, p.mSbjctGenomeTo))

    ##########################################################################
    # filter predictions and resolve conflicts based on genomic overlap
    # deleted segments are put in a temporary storage space.
    alignator = alignlib_lite.makeAlignatorDPFull(
        alignlib_lite.ALIGNMENT_LOCAL, param_gop, param_gep)
    result = alignlib_lite.makeAlignmentVector()

    # cache of alignment scores, keyed by the ordered pair of query tokens
    alignments = {}
    noverlaps = 0
    nredundants = 0
    nnew = 0

    last_prediction = None
    last_query_gene = None

    for this_prediction in old_predictions:

        # the gene is the third of four whitespace separated fields of the
        # query token; any other layout yields no gene.
        try:
            (_peptide, _status, this_query_gene, _transcript) = \
                re.split(r"\s+", this_prediction.mQueryToken)
        except ValueError:
            this_query_gene = None

        if last_prediction is None:
            last_prediction = this_prediction
            last_query_gene = this_query_gene
            continue

        # NOTE(review): only coordinates are compared here, not
        # mSbjctToken/mSbjctStrand - confirm callers pass one
        # contig/strand at a time.
        overlap = min(last_prediction.mSbjctGenomeTo,
                      this_prediction.mSbjctGenomeTo) - \
            max(last_prediction.mSbjctGenomeFrom,
                this_prediction.mSbjctGenomeFrom)
        union = max(last_prediction.mSbjctGenomeTo,
                    this_prediction.mSbjctGenomeTo) - \
            min(last_prediction.mSbjctGenomeFrom,
                this_prediction.mSbjctGenomeFrom)

        is_overlap = 0

        # resolve overlap between different genes
        if overlap > 0 and \
                (last_query_gene != this_query_gene or
                 last_query_gene is None):

            noverlaps += 1
            relative_overlap = 100 * overlap / union

            # Start conflict resolution, if overlap is above threshold.
            # Keep higher scoring segment.
            #
            # Check if queries are homologous.
            if relative_overlap >= param_max_percent_overlap:
                if peptide_sequences:
                    key = "%s-%s" % (
                        min(last_prediction.mQueryToken,
                            this_prediction.mQueryToken),
                        max(last_prediction.mQueryToken,
                            this_prediction.mQueryToken))

                    if key not in alignments:
                        result.clear()
                        alignator.align(
                            result,
                            alignlib_lite.makeSequence(
                                peptide_sequences[
                                    this_prediction.mQueryToken]),
                            alignlib_lite.makeSequence(
                                peptide_sequences[
                                    last_prediction.mQueryToken]))
                        alignments[key] = result.getScore()
                        if result.getScore() >= param_min_score_overlap:
                            nredundants += 1

                    if alignments[key] >= param_min_score_overlap:
                        is_overlap = 1
                else:
                    is_overlap = 1

        if is_overlap:
            # take best prediction. If difference is very small, set
            # difference to 0 (difference does not matter). In this case,
            # the first prediction is taken.
            # A reference value of 0 cannot be used for normalisation; the
            # raw difference is kept instead of dividing by zero.
            d1 = last_prediction.mQueryCoverage - \
                this_prediction.mQueryCoverage
            if last_prediction.mQueryCoverage and \
                    float(abs(d1)) / float(last_prediction.mQueryCoverage) < \
                    param_conflicts_min_difference:
                d1 = 0

            d2 = last_prediction.score - this_prediction.score
            if this_prediction.score and \
                    float(abs(d2)) / float(this_prediction.score) < \
                    param_conflicts_min_difference:
                d2 = 0

            if d1 >= 0 and d2 >= 0:
                kept, removed, keep_last = \
                    last_prediction, this_prediction, True
            elif d1 <= 0 and d2 <= 0:
                kept, removed, keep_last = \
                    this_prediction, last_prediction, False
            else:
                kept = removed = None

            if kept is not None:
                if param_loglevel >= 2:
                    print("# CONFLICT: kept %i(%s-%i), overlap=%i(%5.2f), removed: %s" % (
                        kept.mPredictionId,
                        kept.mQueryToken,
                        kept.mSbjctGenomeFrom,
                        overlap, relative_overlap,
                        str(removed)))
                if param_benchmarks:
                    if CheckBenchmark(removed, kept):
                        print("# BENCHMARK KEPT with overlap=%i(%5.2f): %s" % (
                            overlap, relative_overlap, str(kept)))

                removed_predictions.append(removed)
                if not keep_last:
                    # the current prediction replaces the pending one
                    last_prediction = this_prediction
                    last_query_gene = this_query_gene
                continue

            if param_loglevel >= 2:
                print("# CONFLICT: non-correlated score/coverage. Keeping both %i(%s-%i) (%5.2f/%i/%i) and %i(%s-%i) (%5.2f/%i/%i)" %
                      (this_prediction.mPredictionId,
                       this_prediction.mQueryToken,
                       this_prediction.mSbjctGenomeFrom,
                       this_prediction.score,
                       this_prediction.mQueryCoverage,
                       this_prediction.mPercentIdentity,
                       last_prediction.mPredictionId,
                       last_prediction.mQueryToken,
                       last_prediction.mSbjctGenomeFrom,
                       last_prediction.score,
                       last_prediction.mQueryCoverage,
                       last_prediction.mPercentIdentity))

        # no conflict (or both kept): the pending prediction is final.
        new_predictions.append(last_prediction)
        nnew += 1
        last_query_gene = this_query_gene
        last_prediction = this_prediction

    # flush the pending prediction; nothing is pending for empty input.
    if last_prediction is not None:
        new_predictions.append(last_prediction)
        nnew += 1

    if param_loglevel >= 1:
        print("# calculated %i alignments for %i potential conflicts (%i above threshold)" %
              (len(alignments), noverlaps, nredundants))

    return nnew
def ProcessRegion(predictions, region_id, region, peptide_sequences=None, filter_queries={}):
    """Resolve a set of predictions matching a region by homology.

    Predictions are ordered by descending score. Each prediction not yet
    assigned to a cluster starts a new cluster (and a new region id) and
    collects all later, still unassigned predictions that overlap it on
    the genome and whose query peptides are homologous according to the
    ``options.overlap_*`` thresholds. Homology decisions are cached in
    ``global_alignments``. Each cluster is written with ``PrintEdges``;
    predictions whose query token is in ``filter_queries`` are counted as
    skipped instead of being output.

    Nothing is processed if ``peptide_sequences`` is empty or None.

    Returns the tuple (region_id, noutput, nskipped).
    """
    if options.loglevel >= 3:
        options.stdlog.write(
            "###################################################################\n"
        )
        options.stdlog.write("# resolving %i predictions in region %s\n" %
                             (len(predictions), str(region)))
        sys.stdout.flush()

    # best scoring predictions first
    predictions.sort(key=lambda p: p.score)
    predictions.reverse()

    alignator = alignlib_lite.makeAlignatorDPFull(
        alignlib_lite.ALIGNMENT_LOCAL, options.gop, options.gep)
    result = alignlib_lite.makeAlignmentVector()

    npredictions = len(predictions)
    # entry i holds the index of the cluster seed that prediction i joined
    map_sequence2cluster = list(range(npredictions))
    noutput, nskipped = 0, 0

    if not peptide_sequences:
        return region_id, noutput, nskipped

    for x in range(npredictions):
        seed = predictions[x]

        if options.loglevel >= 5:
            options.stdlog.write(
                "# filtering from %i with prediction %i: %s\n" %
                (x, seed.mPredictionId, seed.mQueryToken))
            sys.stdout.flush()

        # already member of an earlier cluster
        if map_sequence2cluster[x] != x:
            continue

        region_id += 1
        edges = []

        if seed.mQueryToken in filter_queries:
            nskipped += 1
        else:
            edges.append(seed)

        for y in range(x + 1, npredictions):
            if map_sequence2cluster[y] != y:
                continue

            candidate = predictions[y]
            key = "%s-%s" % (min(seed.mQueryToken, candidate.mQueryToken),
                             max(seed.mQueryToken, candidate.mQueryToken))

            # check if predictions are overlapping on the genomic sequence
            genomic_overlap = \
                min(seed.mSbjctGenomeTo, candidate.mSbjctGenomeTo) - \
                max(seed.mSbjctGenomeFrom, candidate.mSbjctGenomeFrom)
            if genomic_overlap < 0:
                if options.loglevel >= 4:
                    options.stdlog.write(
                        "# alignment of predictions %i and %i: no overlap on genomic sequence, thus skipped\n" %
                        (seed.mPredictionId, candidate.mPredictionId))
                    sys.stdout.flush()
                continue

            if key not in global_alignments:
                seq1 = peptide_sequences[seed.mQueryToken]
                seq2 = peptide_sequences[candidate.mQueryToken]
                result.clear()
                s1 = alignlib_lite.makeSequence(seq1)
                s2 = alignlib_lite.makeSequence(seq2)
                alignator.align(result, s1, s2)

                c1 = 100 * \
                    (result.getRowTo() - result.getRowFrom()) / len(seq1)
                c2 = 100 * \
                    (result.getColTo() - result.getColFrom()) / len(seq2)
                min_cov = min(c1, c2)
                max_cov = max(c1, c2)

                identity = alignlib_lite.calculatePercentIdentity(
                    result, s1, s2) * 100

                # check if predictions overlap and they are homologous
                if result.getScore() >= options.overlap_min_score and \
                        max_cov >= options.overlap_max_coverage and \
                        min_cov >= options.overlap_min_coverage and \
                        identity >= options.overlap_min_identity:
                    global_alignments[key] = True
                else:
                    global_alignments[key] = False

                if options.loglevel >= 4:
                    options.stdlog.write(
                        "# alignment=%s score=%i pid=%5.2f c1=%i c2=%i min_cov=%i max_cov=%i homolog=%s\n" %
                        (key, result.getScore(), identity, c1, c2,
                         min_cov, max_cov, global_alignments[key]))
                    sys.stdout.flush()

            if global_alignments[key]:
                map_sequence2cluster[y] = x
                if candidate.mQueryToken in filter_queries:
                    nskipped += 1
                else:
                    edges.append(candidate)

        noutput += PrintEdges(region_id, region, edges)

    return region_id, noutput, nskipped
def EliminateRedundantEntries(
    rep, data, eliminated_predictions, options, peptides, extended_peptides, filter_quality=None, this_quality=None
):
    """Eliminate entries of ``data`` that are redundant with ``rep``.

    For every entry not yet present in ``eliminated_predictions`` the
    following tests are applied in order:

    * identical extended peptide sequence: eliminated with code "i";
    * extended peptide is a substring of the representative's: code "p";
    * otherwise, if the entry's quality differs from ``this_quality`` or
      is listed in ``options.quality_exclude_same``, the peptides are
      aligned. With a percent identity of at least
      ``options.min_identity`` the entry is eliminated with code "h"
      unless one of the coverage/identity/gap safety checks asks to keep
      it (a warning is written instead). With at least
      ``options.min_identity_non_genes``, a representative of gene
      quality and a member that is not, the entry is eliminated with code
      "l" unless the coverage/identity safety check fails.

    Eliminated ids are recorded in ``eliminated_predictions`` (mapped to
    the representative's id). ``filter_quality`` is not used here.

    Returns the list of (member id, code) tuples eliminated in this call.
    """
    eliminated = []

    rep_id = rep.transcript_id
    rep_coverage, rep_pid = rep.mQueryCoverage, rep.mPid

    alignator = alignlib_lite.makeAlignatorDPFull(alignlib_lite.ALIGNMENT_LOCAL, options.gop, options.gep)
    result = alignlib_lite.makeAlignmentVector()

    rep_seq = peptides[rep_id]
    rep_extended_seq = extended_peptides[rep_id]

    def _eliminate(member_id, code):
        # record the member as redundant with the representative
        eliminated_predictions[member_id] = rep_id
        eliminated.append((member_id, code))

    for entry in data:
        mem_id = entry.transcript_id
        mem_coverage = entry.mQueryCoverage
        mem_pid = entry.mPid
        mem_quality = entry.mQuality

        mem_seq = peptides[mem_id]
        mem_extended_seq = extended_peptides[mem_id]

        if options.loglevel >= 4:
            options.stdlog.write("# processing: id=%s class=%s\n" % (mem_id, mem_quality))

        if mem_id in eliminated_predictions:
            continue

        if mem_extended_seq == rep_extended_seq:
            _eliminate(mem_id, "i")
            continue

        if mem_extended_seq in rep_extended_seq:
            _eliminate(mem_id, "p")
            continue

        # members of the same quality class are only compared on request
        if not (mem_quality != this_quality or mem_quality in options.quality_exclude_same):
            continue

        seq1 = alignlib_lite.makeSequence(str(rep_seq))
        seq2 = alignlib_lite.makeSequence(str(mem_seq))
        alignator.align(result, seq1, seq2)

        if options.loglevel >= 5:
            options.stdlog.write("# ali\n%s\n" % alignlib_lite.AlignmentFormatExplicit(result, seq1, seq2))

        pidentity = 100 * alignlib_lite.calculatePercentIdentity(result, seq1, seq2)
        num_gaps = result.getNumGaps()

        if options.loglevel >= 4:
            options.stdlog.write(
                "# processing: id=%s class=%s pid=%5.2f rep_cov=%i mem_cov=%i\n"
                % (mem_id, mem_quality, pidentity, rep_coverage, mem_coverage)
            )

        if pidentity >= options.min_identity:

            # reason for keeping a member despite its similarity, if any
            reason = None
            if rep_coverage < mem_coverage - options.safety_coverage or rep_pid < mem_pid - options.safety_pide:
                reason = "covpid"
            elif num_gaps >= options.max_gaps and mem_coverage > rep_coverage - options.safety_coverage:
                reason = "gaps"
            elif (
                mem_coverage >= rep_coverage - options.safety_coverage
                and 100 * (result.getColTo() - result.getColFrom()) / len(mem_seq) < options.max_member_coverage
            ):
                reason = "memcov"

            if reason is None:
                _eliminate(mem_id, "h")
            else:
                options.stdlog.write(
                    "# WARNING: not removing possibly good prediction: %s: rep = %s, mem = %s, rep_cov=%i, rep_pid=%i, mem_cov=%i, mem_pid=%i\n"
                    % (reason, rep_id, mem_id, rep_coverage, rep_pid, mem_coverage, mem_pid)
                )

        elif (
            pidentity >= options.min_identity_non_genes
            and this_quality in options.quality_genes
            and mem_quality not in options.quality_genes
        ):
            if rep_coverage < mem_coverage - options.safety_coverage or rep_pid < mem_pid - options.safety_pide:
                options.stdlog.write(
                    "# WARNING: not removing possibly good prediction: rep = %s, mem = %s, rep_cov=%i, rep_pid=%i, mem_cov=%i, mem_pid=%i\n"
                    % (rep_id, mem_id, rep_coverage, rep_pid, mem_coverage, mem_pid)
                )
            else:
                _eliminate(mem_id, "l")

    return eliminated
def PrintCluster(cluster, cluster_id, lengths, peptide_sequences=None, regex_preferred=None):
    """Print a cluster as tab-separated lines on stdout.

    The longest member (according to ``lengths``; members without an entry
    count as length 0) is taken as representative. If ``regex_preferred``
    is given and at least one member of non-zero length matches it, the
    longest matching member is the representative instead.

    One line is written per member: representative, member and member
    length. If ``peptide_sequences`` is given, a fourth column holds the
    alignment between representative and member: a diagonal for the
    representative itself, a local alignment if both peptide sequences
    are known, and an empty alignment otherwise.

    Returns ``cluster_id`` unchanged.
    """
    rx = re.compile(regex_preferred) if regex_preferred else None

    # longest member overall (rep_a) and longest preferred member (rep_p)
    max_al, rep_a = 0, None
    max_pl, rep_p = 0, None
    for c in cluster:
        l = lengths[c] if c in lengths else 0
        if l > max_al:
            max_al, rep_a = l, c
        if rx and rx.search(c) and l > max_pl:
            max_pl, rep_p = l, c

    rep = rep_p if max_pl > 0 else rep_a

    # the aligner does not depend on the member; build it once, on demand
    alignator = None

    for mem in cluster:
        l = lengths[mem] if mem in lengths else 0

        if not peptide_sequences:
            print("\t".join(map(str, (rep, mem, l))))
            continue

        map_rep2mem = alignlib_lite.makeAlignmentVector()

        if rep == mem and rep in lengths:
            alignlib_lite.addDiagonal2Alignment(
                map_rep2mem, 1, lengths[rep], 0)
        elif mem in peptide_sequences and rep in peptide_sequences:
            if alignator is None:
                alignator = alignlib_lite.makeAlignatorDPFull(
                    alignlib_lite.ALIGNMENT_LOCAL, -10.0, -1.0)
            alignator.align(
                map_rep2mem,
                alignlib_lite.makeSequence(peptide_sequences[rep]),
                alignlib_lite.makeSequence(peptide_sequences[mem]))

        f = alignlib_lite.AlignmentFormatEmissions(map_rep2mem)
        print("\t".join(map(str, (rep, mem, l, f))))

    sys.stdout.flush()

    return cluster_id
def FilterConflicts(old_predictions, new_predictions, removed_predictions,
                    min_overlap, peptide_sequences):
    """Remove conflicts between predictions overlapping on the genome.

    Overlapping entries of different queries are resolved. Only
    predictions whose query peptides are alignable are treated as
    conflicting. Of two conflicting predictions the one with the higher
    score and higher coverage is kept; both are kept if score and coverage
    are not correlated.

    Arguments:
        old_predictions: predictions to filter; sorted in place by
            (mSbjctToken, mSbjctStrand, mSbjctGenomeFrom, mSbjctGenomeTo).
        new_predictions: list receiving the predictions that are kept.
        removed_predictions: list receiving the predictions that lost a
            conflict.
        min_overlap: not used by this function; the relative overlap
            threshold is read from ``param_max_percent_overlap``.
        peptide_sequences: map of query token to peptide sequence. If
            empty/None, every sufficiently overlapping pair is a conflict.

    Returns:
        The number of predictions appended to ``new_predictions``
        (0 for empty input).
    """
    ##########################################################################
    # sort predictions by genomic region
    if isinstance(old_predictions, PredictionFile.PredictionFile):
        old_predictions.sort(
            ('mSbjctToken', 'mSbjctStrand',
             'mSbjctGenomeFrom', 'mSbjctGenomeTo'))
    else:
        old_predictions.sort(
            key=lambda p: (p.mSbjctToken, p.mSbjctStrand,
                           p.mSbjctGenomeFrom, p.mSbjctGenomeTo))

    ##########################################################################
    # filter predictions and resolve conflicts based on genomic overlap
    # deleted segments are put in a temporary storage space.
    alignator = alignlib_lite.makeAlignatorDPFull(
        alignlib_lite.ALIGNMENT_LOCAL, param_gop, param_gep)
    result = alignlib_lite.makeAlignmentVector()

    # cache of alignment scores, keyed by the ordered pair of query tokens
    alignments = {}
    noverlaps = 0
    nredundants = 0
    nnew = 0

    last_prediction = None
    last_query_gene = None

    for this_prediction in old_predictions:

        # the gene is the third of four whitespace separated fields of the
        # query token; any other layout yields no gene.
        try:
            (_peptide, _status, this_query_gene, _transcript) = \
                re.split(r"\s+", this_prediction.mQueryToken)
        except ValueError:
            this_query_gene = None

        if last_prediction is None:
            last_prediction = this_prediction
            last_query_gene = this_query_gene
            continue

        # NOTE(review): only coordinates are compared here, not
        # mSbjctToken/mSbjctStrand - confirm callers pass one
        # contig/strand at a time.
        overlap = min(last_prediction.mSbjctGenomeTo,
                      this_prediction.mSbjctGenomeTo) - \
            max(last_prediction.mSbjctGenomeFrom,
                this_prediction.mSbjctGenomeFrom)
        union = max(last_prediction.mSbjctGenomeTo,
                    this_prediction.mSbjctGenomeTo) - \
            min(last_prediction.mSbjctGenomeFrom,
                this_prediction.mSbjctGenomeFrom)

        is_overlap = 0

        # resolve overlap between different genes
        if overlap > 0 and \
                (last_query_gene != this_query_gene or
                 last_query_gene is None):

            noverlaps += 1
            relative_overlap = 100 * overlap / union

            # Start conflict resolution, if overlap is above threshold.
            # Keep higher scoring segment.
            #
            # Check if queries are homologous.
            if relative_overlap >= param_max_percent_overlap:
                if peptide_sequences:
                    key = "%s-%s" % (
                        min(last_prediction.mQueryToken,
                            this_prediction.mQueryToken),
                        max(last_prediction.mQueryToken,
                            this_prediction.mQueryToken))

                    if key not in alignments:
                        result.clear()
                        alignator.align(
                            result,
                            alignlib_lite.makeSequence(
                                peptide_sequences[
                                    this_prediction.mQueryToken]),
                            alignlib_lite.makeSequence(
                                peptide_sequences[
                                    last_prediction.mQueryToken]))
                        alignments[key] = result.getScore()
                        if result.getScore() >= param_min_score_overlap:
                            nredundants += 1

                    if alignments[key] >= param_min_score_overlap:
                        is_overlap = 1
                else:
                    is_overlap = 1

        if is_overlap:
            # take best prediction. If difference is very small, set
            # difference to 0 (difference does not matter). In this case,
            # the first prediction is taken.
            # A reference value of 0 cannot be used for normalisation; the
            # raw difference is kept instead of dividing by zero.
            d1 = last_prediction.mQueryCoverage - \
                this_prediction.mQueryCoverage
            if last_prediction.mQueryCoverage and \
                    float(abs(d1)) / float(last_prediction.mQueryCoverage) < \
                    param_conflicts_min_difference:
                d1 = 0

            d2 = last_prediction.score - this_prediction.score
            if this_prediction.score and \
                    float(abs(d2)) / float(this_prediction.score) < \
                    param_conflicts_min_difference:
                d2 = 0

            if d1 >= 0 and d2 >= 0:
                kept, removed, keep_last = \
                    last_prediction, this_prediction, True
            elif d1 <= 0 and d2 <= 0:
                kept, removed, keep_last = \
                    this_prediction, last_prediction, False
            else:
                kept = removed = None

            if kept is not None:
                if param_loglevel >= 2:
                    print("# CONFLICT: kept %i(%s-%i), overlap=%i(%5.2f), removed: %s" % (
                        kept.mPredictionId,
                        kept.mQueryToken,
                        kept.mSbjctGenomeFrom,
                        overlap, relative_overlap,
                        str(removed)))
                if param_benchmarks:
                    if CheckBenchmark(removed, kept):
                        print("# BENCHMARK KEPT with overlap=%i(%5.2f): %s" % (
                            overlap, relative_overlap, str(kept)))

                removed_predictions.append(removed)
                if not keep_last:
                    # the current prediction replaces the pending one
                    last_prediction = this_prediction
                    last_query_gene = this_query_gene
                continue

            if param_loglevel >= 2:
                print("# CONFLICT: non-correlated score/coverage. Keeping both %i(%s-%i) (%5.2f/%i/%i) and %i(%s-%i) (%5.2f/%i/%i)" %
                      (this_prediction.mPredictionId,
                       this_prediction.mQueryToken,
                       this_prediction.mSbjctGenomeFrom,
                       this_prediction.score,
                       this_prediction.mQueryCoverage,
                       this_prediction.mPercentIdentity,
                       last_prediction.mPredictionId,
                       last_prediction.mQueryToken,
                       last_prediction.mSbjctGenomeFrom,
                       last_prediction.score,
                       last_prediction.mQueryCoverage,
                       last_prediction.mPercentIdentity))

        # no conflict (or both kept): the pending prediction is final.
        new_predictions.append(last_prediction)
        nnew += 1
        last_query_gene = this_query_gene
        last_prediction = this_prediction

    # flush the pending prediction; nothing is pending for empty input.
    if last_prediction is not None:
        new_predictions.append(last_prediction)
        nnew += 1

    if param_loglevel >= 1:
        print("# calculated %i alignments for %i potential conflicts (%i above threshold)" %
              (len(alignments), noverlaps, nredundants))

    return nnew
def GetOrthologTranscripts(transcripts1, peptides1, cds1, transcripts2, peptides2, cds2):
    """Sort out ortholog relationships between transcripts of orthologous genes.

    Orthologs have:
        the same number of exons
        compatible intron/exon boundaries

    For the remaining transcript pairs, take reciprocal best hits.

    I see the following:
    0: 0(100%), 1: 0(94%), 2: 0,1(100%)
    0: 0(100%), 1: 0,1,2(100%)

    Selecting 1-0 first, would result in a suboptimal match, because one
    transcript is longer than the other, while matching up 0-0 and 2-1
    would be better.

    Objective function: it is the maximal matching/assignment problem. Use
    greedy implementation instead. Assign as much as possible according to
    descending weights.

    Arguments:
        transcripts1, transcripts2: sequences of 5-tuples (prediction id,
            sbjct token, sbjct strand, sbjct from, sbjct to).
        peptides1, peptides2: maps of prediction id to peptide sequence.
        cds1, cds2: maps of prediction id to the list of exons.

    Returns:
        A list of tuples (transcript1, transcript2, weight, alignment),
        where weight is the smaller of the two alignment coverages
        (in percent).
    """

    def _first_mapped_column(alignment, row):
        # column aligned to the first aligned row at or after ``row``;
        # 0 if no such row exists up to the end of the alignment.
        while row <= alignment.getRowTo():
            col = alignment.mapRowToCol(row)
            if col:
                return col
            row += 1
        return 0

    alignator = alignlib_lite.makeAlignatorDPFull(
        alignlib_lite.ALIGNMENT_LOCAL, -10.0, -2.0)

    # for long sequence: use dot alignment with tuple size of three
    dottor = alignlib_lite.makeAlignatorTuples(3)
    alignator_dots = alignlib_lite.makeAlignatorDotsSquared(
        param_gop, param_gep, dottor)

    seqs1 = [alignlib_lite.makeSequence(peptides1[t[0]])
             for t in transcripts1]
    seqs2 = [alignlib_lite.makeSequence(peptides2[t[0]])
             for t in transcripts2]

    if param_loglevel >= 4:
        print("# building sequence 1")
    for i in range(len(seqs1)):
        if transcripts1[i][0] not in cds1:
            if param_loglevel >= 4:
                print("# %s not found" % transcripts1[i][0])

    if param_loglevel >= 4:
        print("# building sequence 2")
    for i in range(len(seqs2)):
        if transcripts2[i][0] not in cds2:
            if param_loglevel >= 4:
                # report the id from the second set (was transcripts1)
                print("# %s not found" % transcripts2[i][0])

    if param_loglevel >= 4:
        print("# all-vs-all alignment")

    # do all versus all alignment
    alis1 = [[] for _ in seqs1]
    alis2 = [[] for _ in seqs2]

    if param_loglevel >= 3:
        print("#################################")

        for i in range(len(seqs1)):
            for cd in cds1[transcripts1[i][0]]:
                print("# " + str(cd))
        print("# versus")
        for i in range(len(seqs2)):
            for cd in cds2[transcripts2[i][0]]:
                print("# " + str(cd))
        sys.stdout.flush()

    # the row/column alignment strings are only needed for these dumps
    need_alignment_strings = ("all" in param_write_pairs or
                              "alignment" in param_write_pairs)

    # map of weight (minimum coverage) to list of (i, j, alignment)
    weights = {}
    for i in range(len(seqs1)):
        prediction_id1, sbjct_token1, sbjct_strand1, sbjct_from1, sbjct_to1 = \
            transcripts1[i]

        for j in range(len(seqs2)):
            prediction_id2, sbjct_token2, sbjct_strand2, sbjct_from2, sbjct_to2 = \
                transcripts2[j]

            map_a2b = alignlib_lite.makeAlignmentVector()

            m = seqs1[i].getLength() * seqs2[j].getLength()

            if param_loglevel >= 3:
                print("# Starting alignment of pair (%i,%i) of lengths %s:%i and %s:%i" %
                      (i, j, prediction_id1, seqs1[i].getLength(),
                       prediction_id2, seqs2[j].getLength()))
                sys.stdout.flush()

            if m > param_max_matrix_size:
                # switch to tuple alignment if sequences are too large
                if param_loglevel >= 2:
                    print("# WARNING: sequences are of length %i and %i: switching to dot alignment." % (
                        seqs1[i].getLength(), seqs2[j].getLength()))
                    sys.stdout.flush()

                alignator_dots.align(map_a2b, seqs1[i], seqs2[j])
            else:
                alignator.align(map_a2b, seqs1[i], seqs2[j])

            coverage_a = 100.0 * \
                (map_a2b.getRowTo() - map_a2b.getRowFrom() + 1) / \
                seqs1[i].getLength()
            coverage_b = 100.0 * \
                (map_a2b.getColTo() - map_a2b.getColFrom() + 1) / \
                seqs2[j].getLength()

            # get copy of cds, but only those overlapping with alignment
            c1 = Exons.GetExonsRange(
                cds1[prediction_id1],
                (map_a2b.getRowFrom() - 1) * 3,
                (map_a2b.getRowTo()) * 3 + 1,
                full=False,
                min_overlap=param_min_alignment_exon_overlap,
                min_exon_size=param_min_exon_size)
            c2 = Exons.GetExonsRange(
                cds2[prediction_id2],
                (map_a2b.getColFrom() - 1) * 3,
                (map_a2b.getColTo()) * 3 + 1,
                full=False,
                min_overlap=param_min_alignment_exon_overlap,
                min_exon_size=param_min_exon_size)

            # check exon boundaries, look at starts, skip first exon
            mapped_boundaries = sorted(
                _first_mapped_column(map_a2b, x.mPeptideFrom / 3 + 1)
                for x in c1[1:])
            reference_boundaries = sorted(
                x.mPeptideFrom / 3 + 1 for x in c2[1:])

            nmissed_cmp2ref = Exons.CountMissedBoundaries(
                mapped_boundaries, reference_boundaries,
                param_boundaries_max_slippage)
            nmissed_ref2cmp = Exons.CountMissedBoundaries(
                reference_boundaries, mapped_boundaries,
                param_boundaries_max_slippage)

            min_nmissed = min(nmissed_cmp2ref, nmissed_ref2cmp)

            # set is_ok for the whole thing
            # no intron: is ok
            is_ok = 0
            if len(c1) == 1 and len(c2) == 1:
                is_ok = 1
            elif min_nmissed == 0:
                is_ok = 1
            elif param_boundaries_allow_missed and \
                    len(mapped_boundaries) >= param_boundaries_allow_missed and \
                    min_nmissed <= param_boundaries_max_missed:
                # allow for missed boundaries, if
                # param_boundaries_allow_missed > 0
                is_ok = 1

            cc = min(coverage_a, coverage_b)
            if cc >= param_min_coverage:
                is_ok_coverage = 1
            else:
                is_ok_coverage = 0

            # check for missing introns
            is_ok_exons = 1
            nexons_difference = abs(len(c1) - len(c2))
            if nexons_difference != 0:
                if param_missing_max_missing:
                    if nexons_difference > param_missing_max_missing or \
                            min(len(c1), len(c2)) < param_missing_min_present:
                        is_ok_exons = 0
                else:
                    is_ok_exons = 0

            if param_loglevel >= 3:
                print(" ".join(map(str, (
                    "# i=", i, "li=", len(c1), "j=", j, "lj=", len(c2),
                    "boundaries_ok=", is_ok,
                    "nexons_ok=", is_ok_exons,
                    "missed_c2r=", nmissed_cmp2ref,
                    "missed_r2c=", nmissed_ref2cmp,
                    "min_cov=", cc,
                    "mapped=", mapped_boundaries,
                    "reference=", reference_boundaries))))
                print("# " + "\t".join(
                    map(str, (alignlib_lite.AlignmentFormatEmissions(map_a2b),
                              map_a2b.getNumGaps(),
                              coverage_a, coverage_b))))
                sys.stdout.flush()

            if need_alignment_strings:
                # row_ali/col_ali were referenced below without ever being
                # assigned (NameError).
                # NOTE(review): attribute names assumed from the emissions
                # format of alignlib - confirm against alignlib_lite.
                ali_format = alignlib_lite.AlignmentFormatEmissions(map_a2b)
                row_ali = ali_format.mRowAlignment
                col_ali = ali_format.mColAlignment

            # dump out pairs
            for method in param_write_pairs:
                if method == "all":
                    print("\t".join(
                        map(str, ("pair", method,
                                  prediction_id1,
                                  prediction_id2,
                                  sbjct_token1, sbjct_strand1,
                                  sbjct_from1, sbjct_to1,
                                  seqs1[i].getLength(),
                                  sbjct_token2, sbjct_strand2,
                                  sbjct_from2, sbjct_to2,
                                  seqs2[j].getLength(),
                                  map_a2b.getRowFrom(), map_a2b.getRowTo(),
                                  row_ali,
                                  map_a2b.getColFrom(), map_a2b.getColTo(),
                                  col_ali,
                                  map_a2b.getNumGaps(),
                                  coverage_a, coverage_b,
                                  nmissed_cmp2ref, mapped_boundaries,
                                  nmissed_ref2cmp, reference_boundaries,
                                  i, j, len(c1), len(c2),
                                  cc, is_ok, is_ok_exons,
                                  is_ok_coverage))))
                elif method == "alignment":
                    print("\t".join(
                        map(str, ("pair", method,
                                  prediction_id1, prediction_id2,
                                  map_a2b.getRowFrom(), map_a2b.getRowTo(),
                                  row_ali,
                                  map_a2b.getColFrom(), map_a2b.getColTo(),
                                  col_ali,
                                  map_a2b.getNumGaps(),
                                  coverage_a, coverage_b))))
                elif method == "location":
                    print("\t".join(
                        map(str, ("pair", method,
                                  prediction_id1,
                                  prediction_id2,
                                  sbjct_token1, sbjct_strand1,
                                  sbjct_from1, sbjct_to1,
                                  seqs1[i].getLength(),
                                  sbjct_token2, sbjct_strand2,
                                  sbjct_from2, sbjct_to2,
                                  seqs2[j].getLength()))))

            if not is_ok_exons:
                if param_loglevel >= 4:
                    print("# rejected %i and %i: too many exons difference." % (
                        i, j))
                continue

            if param_check_exon_boundaries:
                if not is_ok:
                    continue

            if cc < param_min_coverage:
                continue

            alis1[i].append((coverage_a, j))
            alis2[j].append((coverage_b, i))
            weights.setdefault(cc, []).append((i, j, map_a2b))

    # sort out alignments: greedy assignment by descending weight
    pairs = []
    assigned1 = set()
    assigned2 = set()

    if param_loglevel >= 3:
        print("# alis1= " + str(alis1))
        print("# alis2= " + str(alis2))
        print("# --------------------------------------")

    for w in sorted(weights, reverse=True):
        for i, j, map_a2b in weights[w]:
            if i not in assigned1 and j not in assigned2:
                pairs.append((transcripts1[i], transcripts2[j], w, map_a2b))
                assigned1.add(i)
                assigned2.add(j)
        # stop once one side is completely assigned
        if len(assigned1) == len(transcripts1):
            break
        if len(assigned2) == len(transcripts2):
            break

    return pairs
elif param_mode == "transcripts": transcript_ids1 = {} transcript_ids2 = {} for x, y in pairs: transcript_ids1[x] = 1 transcript_ids2[y] = 1 transcripts1, transcripts2, cds1, cds2 = ReadTranscriptsAndCds( transcript_ids1, transcript_ids2) if param_loglevel >= 1: print "# reading has finished." sys.stdout.flush() alignator = alignlib_lite.makeAlignatorDPFull( alignlib_lite.ALIGNMENT_LOCAL, -10.0, -2.0) for q1, q2 in pairs: ninput += 1 if param_loglevel >= 1: print "# processing %s and %s" % (q1, q2) if q1 in transcripts1 and q2 in transcripts2: map_a2b = alignlib_lite.makeAlignmentVector() alignator.align(map_a2b, alignlib_lite.makeSequence(peptides1[q1]), alignlib_lite.makeSequence(peptides2[q2]))
nintrons, nsplits, nstopcodons, pidentity, psimilarity, sequence, sbjct_genome_from, sbjct_genome_to, map_query2genome FROM %s AS p WHERE p.sbjct_token = '%s' AND p.sbjct_strand = '%s' AND OVERLAP( %i, %i, p.sbjct_genome_from, sbjct_genome_to) > 0 """ alignator = alignlib_lite.makeAlignatorDPFull( alignlib_lite.ALIGNMENT_LOCAL, param_gop, param_gep) map_reference2target = alignlib_lite.makeAlignmentVector() assignment_id = 0 for line in cr.fetchall(): reference = PredictionParser.PredictionParserEntry() reference.FillFromTable(line) ct = dbhandle.cursor() ct.execute(statement % (param_tablename_predictions_target, reference.mSbjctToken, reference.mSbjctStrand, reference.mSbjctGenomeFrom, reference.mSbjctGenomeTo)) reference_exons = Exons.Alignment2Exons(reference.mMapPeptide2Genome, 0,
def PrintCluster(cluster, cluster_id, lengths, peptide_sequences=None, regex_preferred=None):
    """Print a cluster as tab-separated lines on stdout.

    The longest member (according to ``lengths``; members without an entry
    count as length 0) is taken as representative. If ``regex_preferred``
    is given and at least one member of non-zero length matches it, the
    longest matching member is the representative instead.

    One line is written per member: representative, member and member
    length. If ``peptide_sequences`` is given, a fourth column holds the
    alignment between representative and member: a diagonal for the
    representative itself, a local alignment if both peptide sequences
    are known, and an empty alignment otherwise.

    Returns ``cluster_id`` unchanged.
    """
    rx = re.compile(regex_preferred) if regex_preferred else None

    # longest member overall (rep_a) and longest preferred member (rep_p)
    max_al, rep_a = 0, None
    max_pl, rep_p = 0, None
    for c in cluster:
        l = lengths[c] if c in lengths else 0
        if l > max_al:
            max_al, rep_a = l, c
        if rx and rx.search(c) and l > max_pl:
            max_pl, rep_p = l, c

    rep = rep_p if max_pl > 0 else rep_a

    # the aligner does not depend on the member; build it once, on demand
    alignator = None

    for mem in cluster:
        l = lengths[mem] if mem in lengths else 0

        if not peptide_sequences:
            print("\t".join(map(str, (rep, mem, l))))
            continue

        map_rep2mem = alignlib_lite.makeAlignmentVector()

        if rep == mem and rep in lengths:
            alignlib_lite.addDiagonal2Alignment(
                map_rep2mem, 1, lengths[rep], 0)
        elif mem in peptide_sequences and rep in peptide_sequences:
            if alignator is None:
                alignator = alignlib_lite.makeAlignatorDPFull(
                    alignlib_lite.ALIGNMENT_LOCAL, -10.0, -1.0)
            alignator.align(
                map_rep2mem,
                alignlib_lite.makeSequence(peptide_sequences[rep]),
                alignlib_lite.makeSequence(peptide_sequences[mem]))

        f = alignlib_lite.AlignmentFormatEmissions(map_rep2mem)
        print("\t".join(map(str, (rep, mem, l, f))))

    sys.stdout.flush()

    return cluster_id
def GetOrthologTranscripts(transcripts1, peptides1, cds1,
                           transcripts2, peptides2, cds2):
    """sort out ortholog relationships between transcripts of orthologous genes.

    Orthologs have:
        the same number of exons
        compatible intron/exon boundaries

    For the remaining transcript pairs, take reciprocal best hits.

    I see the following:
    0: 0(100%), 1: 0(94%), 2: 0,1(100%)
    0: 0(100%), 1: 0,1,2(100%)

    Selecting 1-0 first would result in a suboptimal match, because one
    transcript is longer than the other, while matching up 0-0 and 2-1
    would be better.

    Objective function: it is the maximal matching/assignment problem.
    Use greedy implementation instead. Assign as much as possible
    according to descending weights, where the weight of a pair is the
    smaller of the two alignment coverages.

    Arguments
    ---------
    transcripts1, transcripts2 : sequences of 5-tuples
        ``(prediction_id, sbjct_token, sbjct_strand, sbjct_from, sbjct_to)``.
    peptides1, peptides2 : mappings of prediction_id to peptide sequence.
    cds1, cds2 : mappings of prediction_id to a list of exons (passed to
        ``Exons.GetExonsRange``).

    Behaviour is controlled by module-level ``param_*`` settings
    (log level, gap penalties, coverage / boundary / exon-count
    thresholds, ``param_write_pairs``).

    Returns
    -------
    list of ``(transcript1, transcript2, weight, alignment)`` tuples; each
    transcript index is used at most once.
    """

    def map_row_to_col(alignment, row):
        # First mapped column at or after ``row`` (scanning up to the last
        # aligned row), or 0 if none is found.
        while row <= alignment.getRowTo():
            col = alignment.mapRowToCol(row)
            if col:
                return col
            row += 1
        return 0

    def emit(fields):
        print("\t".join(map(str, fields)))

    alignator = alignlib_lite.makeAlignatorDPFull(
        alignlib_lite.ALIGNMENT_LOCAL, -10.0, -2.0)

    # for long sequence: use dot alignment with tuple size of three
    dottor = alignlib_lite.makeAlignatorTuples(3)
    alignator_dots = alignlib_lite.makeAlignatorDotsSquared(
        param_gop, param_gep, dottor)

    seqs1 = [alignlib_lite.makeSequence(peptides1[t[0]]) for t in transcripts1]
    seqs2 = [alignlib_lite.makeSequence(peptides2[t[0]]) for t in transcripts2]

    if param_loglevel >= 4:
        print("# building sequence 1")
    for t in transcripts1:
        if t[0] not in cds1:
            if param_loglevel >= 4:
                print("# %s not found" % t[0])

    if param_loglevel >= 4:
        print("# building sequence 2")
    for t in transcripts2:
        if t[0] not in cds2:
            if param_loglevel >= 4:
                # fixed: the original reported transcripts1[i][0] here
                print("# %s not found" % t[0])

    if param_loglevel >= 4:
        print("# all-vs-all alignment")

    # do all versus all alignment
    alis1 = [[] for _ in seqs1]
    alis2 = [[] for _ in seqs2]

    if param_loglevel >= 3:
        print("#################################")
        for t in transcripts1:
            for cd in cds1[t[0]]:
                print("# " + str(cd))
        print("# versus")
        for t in transcripts2:
            for cd in cds2[t[0]]:
                print("# " + str(cd))
        sys.stdout.flush()

    weights = {}
    for i, seq1 in enumerate(seqs1):

        (prediction_id1, sbjct_token1, sbjct_strand1,
         sbjct_from1, sbjct_to1) = transcripts1[i]
        len1 = seq1.getLength()

        for j, seq2 in enumerate(seqs2):

            (prediction_id2, sbjct_token2, sbjct_strand2,
             sbjct_from2, sbjct_to2) = transcripts2[j]
            len2 = seq2.getLength()

            map_a2b = alignlib_lite.makeAlignmentVector()

            if param_loglevel >= 3:
                print("# Starting alignment of pair (%i,%i) of lengths %s:%i and %s:%i" %
                      (i, j, prediction_id1, len1, prediction_id2, len2))
                sys.stdout.flush()

            if len1 * len2 > param_max_matrix_size:
                # switch to tuple alignment if sequences are too large
                if param_loglevel >= 2:
                    print("# WARNING: sequences are of length %i and %i: switching to dot alignment." % (len1, len2))
                    sys.stdout.flush()
                alignator_dots.align(map_a2b, seq1, seq2)
            else:
                alignator.align(map_a2b, seq1, seq2)

            coverage_a = 100.0 * \
                (map_a2b.getRowTo() - map_a2b.getRowFrom() + 1) / len1
            coverage_b = 100.0 * \
                (map_a2b.getColTo() - map_a2b.getColFrom() + 1) / len2

            # get copy of cds, but only those overlapping with alignment
            c1 = Exons.GetExonsRange(
                cds1[prediction_id1],
                (map_a2b.getRowFrom() - 1) * 3,
                (map_a2b.getRowTo()) * 3 + 1,
                full=False,
                min_overlap=param_min_alignment_exon_overlap,
                min_exon_size=param_min_exon_size)
            c2 = Exons.GetExonsRange(
                cds2[prediction_id2],
                (map_a2b.getColFrom() - 1) * 3,
                (map_a2b.getColTo()) * 3 + 1,
                full=False,
                min_overlap=param_min_alignment_exon_overlap,
                min_exon_size=param_min_exon_size)

            # check exon boundaries, look at starts, skip first exon.
            # NOTE(review): "/" is kept from the original; it floors only
            # under Python 2 with integer mPeptideFrom -- confirm before
            # porting.
            mapped_boundaries = sorted(
                map_row_to_col(map_a2b, e.mPeptideFrom / 3 + 1)
                for e in c1[1:])
            reference_boundaries = sorted(
                e.mPeptideFrom / 3 + 1 for e in c2[1:])

            nmissed_cmp2ref = Exons.CountMissedBoundaries(
                mapped_boundaries, reference_boundaries,
                param_boundaries_max_slippage)
            nmissed_ref2cmp = Exons.CountMissedBoundaries(
                reference_boundaries, mapped_boundaries,
                param_boundaries_max_slippage)

            min_nmissed = min(nmissed_cmp2ref, nmissed_ref2cmp)

            # set is_ok for the whole thing
            # no intron: is ok
            is_ok = 0
            if len(c1) == 1 and len(c2) == 1:
                is_ok = 1
            elif min_nmissed == 0:
                is_ok = 1
            elif param_boundaries_allow_missed and \
                    len(mapped_boundaries) >= param_boundaries_allow_missed and \
                    min_nmissed <= param_boundaries_max_missed:
                # allow for missed boundaries, if
                # param_boundaries_allow_missed > 0
                is_ok = 1

            cc = min(coverage_a, coverage_b)
            if cc >= param_min_coverage:
                is_ok_coverage = 1
            else:
                is_ok_coverage = 0

            # check for missing introns
            is_ok_exons = 1
            exon_difference = abs(len(c1) - len(c2))
            if exon_difference != 0:
                if param_missing_max_missing:
                    if exon_difference > param_missing_max_missing or \
                            min(len(c1), len(c2)) < param_missing_min_present:
                        is_ok_exons = 0
                else:
                    is_ok_exons = 0

            if param_loglevel >= 3:
                print(" ".join(map(str, (
                    "# i=", i, "li=", len(c1), "j=", j, "lj=", len(c2),
                    "boundaries_ok=", is_ok,
                    "nexons_ok=", is_ok_exons,
                    "missed_c2r=", nmissed_cmp2ref,
                    "missed_r2c=", nmissed_ref2cmp,
                    "min_cov=", cc,
                    "mapped=", mapped_boundaries,
                    "reference=", reference_boundaries))))
                print("# " + "\t".join(map(str, (
                    alignlib_lite.AlignmentFormatEmissions(map_a2b),
                    map_a2b.getNumGaps(),
                    coverage_a, coverage_b))))
                sys.stdout.flush()

            # dump out pairs
            for method in param_write_pairs:
                if method in ("all", "alignment"):
                    # The original referenced row_ali / col_ali without
                    # ever assigning them (NameError on these paths).
                    # NOTE(review): assumes AlignmentFormatEmissions
                    # exposes mRowAlignment / mColAlignment -- confirm
                    # against the installed alignlib.
                    emissions = alignlib_lite.AlignmentFormatEmissions(map_a2b)
                    row_ali = emissions.mRowAlignment
                    col_ali = emissions.mColAlignment

                if method == "all":
                    emit((
                        "pair", method,
                        prediction_id1, prediction_id2,
                        sbjct_token1, sbjct_strand1, sbjct_from1, sbjct_to1,
                        len1,
                        sbjct_token2, sbjct_strand2, sbjct_from2, sbjct_to2,
                        len2,
                        map_a2b.getRowFrom(), map_a2b.getRowTo(), row_ali,
                        map_a2b.getColFrom(), map_a2b.getColTo(), col_ali,
                        map_a2b.getNumGaps(), coverage_a, coverage_b,
                        nmissed_cmp2ref, mapped_boundaries,
                        nmissed_ref2cmp, reference_boundaries,
                        i, j, len(c1), len(c2), cc,
                        is_ok, is_ok_exons, is_ok_coverage))
                elif method == "alignment":
                    emit((
                        "pair", method,
                        prediction_id1, prediction_id2,
                        map_a2b.getRowFrom(), map_a2b.getRowTo(), row_ali,
                        map_a2b.getColFrom(), map_a2b.getColTo(), col_ali,
                        map_a2b.getNumGaps(), coverage_a, coverage_b))
                elif method == "location":
                    emit((
                        "pair", method,
                        prediction_id1, prediction_id2,
                        sbjct_token1, sbjct_strand1, sbjct_from1, sbjct_to1,
                        len1,
                        sbjct_token2, sbjct_strand2, sbjct_from2, sbjct_to2,
                        len2))

            if not is_ok_exons:
                if param_loglevel >= 4:
                    print("# rejected %i and %i: too many exons difference." % (i, j))
                continue

            if param_check_exon_boundaries and not is_ok:
                continue

            if cc < param_min_coverage:
                continue

            alis1[i].append((coverage_a, j))
            alis2[j].append((coverage_b, i))
            weights.setdefault(cc, []).append((i, j, map_a2b))

    # sort out alignments: greedy assignment by descending weight
    pairs = []
    assigned1 = set()
    assigned2 = set()

    if param_loglevel >= 3:
        print("# alis1= " + str(alis1))
        print("# alis2= " + str(alis2))
        print("# --------------------------------------")

    for w in sorted(weights, reverse=True):
        for i, j, map_a2b in weights[w]:
            if i not in assigned1 and j not in assigned2:
                pairs.append((transcripts1[i], transcripts2[j], w, map_a2b))
                assigned1.add(i)
                assigned2.add(j)
        # nothing left to assign on one side: stop early
        if len(assigned1) == len(transcripts1):
            break
        if len(assigned2) == len(transcripts2):
            break

    return pairs
def ProcessRegion(predictions, region_id, region,
                  peptide_sequences=None, filter_queries=None):
    """process a set of matches to a region.

    resolve region according to homology.

    ``predictions`` is sorted in place by descending ``score``. Going
    down that list, every prediction not yet assigned to a cluster
    starts a new cluster (``region_id`` is incremented). Each later,
    still unassigned prediction joins it if

    * the two predictions overlap on the genomic sequence
      (``min(to) - max(from) >= 0``), and
    * their query peptides align with score, min/max coverage and
      percent identity at or above the ``options.overlap_*`` thresholds.

    Pairwise homology decisions are cached in the module-level
    ``global_alignments`` under the key ``"<tokenA>-<tokenB>"`` (tokens
    in sorted order). The members of each cluster whose query token is
    not in ``filter_queries`` are passed to ``PrintEdges``.

    Nothing is clustered or printed if ``peptide_sequences`` is empty
    or None.

    Arguments
    ---------
    predictions : list of prediction entries (reordered in place).
    region_id : last region id used so far.
    region : region descriptor, passed through to ``PrintEdges`` and the
        log.
    peptide_sequences : mapping of query token to peptide sequence.
    filter_queries : optional container of query tokens to leave out of
        the output; defaults to no filtering.

    Returns
    -------
    tuple ``(region_id, noutput, nskipped)``: the last region id used,
    the summed return values of ``PrintEdges`` and the number of
    predictions left out because of ``filter_queries``.
    """
    # None instead of a shared mutable default; behaves like the former {}.
    if filter_queries is None:
        filter_queries = {}

    if options.loglevel >= 3:
        options.stdlog.write(
            "###################################################################\n")
        options.stdlog.write(
            "# resolving %i predictions in region %s\n" % (len(predictions), str(region)))
        sys.stdout.flush()

    # Ascending sort followed by reverse() (rather than reverse=True) keeps
    # the original ordering of predictions with equal scores.
    predictions.sort(key=lambda p: p.score)
    predictions.reverse()

    alignator = alignlib_lite.makeAlignatorDPFull(
        alignlib_lite.ALIGNMENT_LOCAL, options.gop, options.gep)
    result = alignlib_lite.makeAlignmentVector()

    # map_sequence2cluster[k] == k means prediction k is not yet a member
    # of another prediction's cluster. list() keeps it assignable.
    map_sequence2cluster = list(range(len(predictions)))

    noutput, nskipped = 0, 0

    if not peptide_sequences:
        return region_id, noutput, nskipped

    for x, pred_x in enumerate(predictions):

        if options.loglevel >= 5:
            options.stdlog.write("# filtering from %i with prediction %i: %s\n" % (
                x, pred_x.mPredictionId, pred_x.mQueryToken))
            sys.stdout.flush()

        if map_sequence2cluster[x] != x:
            continue

        region_id += 1
        edges = []

        if pred_x.mQueryToken not in filter_queries:
            edges.append(pred_x)
        else:
            nskipped += 1

        for y in range(x + 1, len(predictions)):

            if map_sequence2cluster[y] != y:
                continue

            pred_y = predictions[y]

            if pred_x.mQueryToken < pred_y.mQueryToken:
                key = "%s-%s" % (pred_x.mQueryToken, pred_y.mQueryToken)
            else:
                key = "%s-%s" % (pred_y.mQueryToken, pred_x.mQueryToken)

            # check if predictions are overlapping on the genomic sequence
            if min(pred_x.mSbjctGenomeTo, pred_y.mSbjctGenomeTo) - \
                    max(pred_x.mSbjctGenomeFrom, pred_y.mSbjctGenomeFrom) < 0:
                if options.loglevel >= 4:
                    options.stdlog.write("# alignment of predictions %i and %i: no overlap on genomic sequence, thus skipped\n" %
                                         (pred_x.mPredictionId, pred_y.mPredictionId))
                    sys.stdout.flush()
                continue

            if key not in global_alignments:

                seq1 = peptide_sequences[pred_x.mQueryToken]
                seq2 = peptide_sequences[pred_y.mQueryToken]
                result.clear()
                s1 = alignlib_lite.makeSequence(seq1)
                s2 = alignlib_lite.makeSequence(seq2)
                alignator.align(result, s1, s2)

                c1 = 100 * \
                    (result.getRowTo() - result.getRowFrom()) / len(seq1)
                c2 = 100 * \
                    (result.getColTo() - result.getColFrom()) / len(seq2)
                min_cov = min(c1, c2)
                max_cov = max(c1, c2)

                identity = alignlib_lite.calculatePercentIdentity(
                    result, s1, s2) * 100

                # check if predictions overlap and they are homologous
                if result.getScore() >= options.overlap_min_score and \
                        max_cov >= options.overlap_max_coverage and \
                        min_cov >= options.overlap_min_coverage and \
                        identity >= options.overlap_min_identity:
                    global_alignments[key] = True
                else:
                    global_alignments[key] = False

                if options.loglevel >= 4:
                    options.stdlog.write("# alignment=%s score=%i pid=%5.2f c1=%i c2=%i min_cov=%i max_cov=%i homolog=%s\n" %
                                         (key, result.getScore(), identity, c1, c2, min_cov, max_cov, global_alignments[key]))
                    sys.stdout.flush()

            if global_alignments[key]:
                map_sequence2cluster[y] = x
                if pred_y.mQueryToken not in filter_queries:
                    edges.append(pred_y)
                else:
                    nskipped += 1

        noutput += PrintEdges(region_id, region, edges)

    return region_id, noutput, nskipped
nintrons, nsplits, nstopcodons, pidentity, psimilarity, sequence, sbjct_genome_from, sbjct_genome_to, map_query2genome FROM %s AS p WHERE p.sbjct_token = '%s' AND p.sbjct_strand = '%s' AND OVERLAP( %i, %i, p.sbjct_genome_from, sbjct_genome_to) > 0 """ alignator = alignlib_lite.makeAlignatorDPFull( alignlib_lite.ALIGNMENT_LOCAL, param_gop, param_gep) map_reference2target = alignlib_lite.makeAlignmentVector() assignment_id = 0 for line in cr.fetchall(): reference = PredictionParser.PredictionParserEntry() reference.FillFromTable(line) ct = dbhandle.cursor() ct.execute(statement % (param_tablename_predictions_target, reference.mSbjctToken, reference.mSbjctStrand, reference.mSbjctGenomeFrom, reference.mSbjctGenomeTo)) reference_exons = Exons.Alignment2Exons(reference.mMapPeptide2Genome,
def EliminateRedundantEntries(rep, data, eliminated_predictions, options,
                              peptides, extended_peptides,
                              filter_quality=None, this_quality=None):
    """eliminate entries in ``data`` that are redundant with ``rep``.

    Every entry not already present in ``eliminated_predictions`` is
    compared to the representative:

    * ``"i"``: its extended peptide equals the representative's.
    * ``"p"``: its extended peptide is contained in the representative's.
    * ``"h"``: its peptide aligns to the representative's with percent
      identity >= ``options.min_identity`` and none of the safety checks
      (coverage/identity margin, gaps, member coverage) asks to keep it.
    * ``"l"``: percent identity >= ``options.min_identity_non_genes``,
      ``this_quality`` is in ``options.quality_genes``, the entry's
      quality is not, and the coverage/identity margin does not ask to
      keep it.

    The alignment-based tests are only run if the entry's quality
    differs from ``this_quality`` or is listed in
    ``options.quality_exclude_same``. Entries kept by a safety check are
    reported as warnings on ``options.stdlog``.

    Eliminated entries are recorded in ``eliminated_predictions``
    (member id -> representative id), which is modified in place.
    ``filter_quality`` is accepted but not used.

    Returns
    -------
    list of ``(member id, code)`` tuples for the entries eliminated by
    this call.
    """
    removed = []

    rep_id = rep.transcript_id
    rep_coverage = rep.mQueryCoverage
    rep_pid = rep.mPid

    alignator = alignlib_lite.makeAlignatorDPFull(
        alignlib_lite.ALIGNMENT_LOCAL, options.gop, options.gep)
    result = alignlib_lite.makeAlignmentVector()

    rep_seq = peptides[rep_id]
    rep_extended_seq = extended_peptides[rep_id]

    def drop(member_id, code):
        # record the member as redundant with the representative
        eliminated_predictions[member_id] = rep_id
        removed.append((member_id, code))

    for entry in data:

        mem_id = entry.transcript_id
        mem_coverage = entry.mQueryCoverage
        mem_pid = entry.mPid
        mem_quality = entry.mQuality

        mem_seq = peptides[mem_id]
        mem_extended_seq = extended_peptides[mem_id]

        if options.loglevel >= 4:
            options.stdlog.write(
                "# processing: id=%s class=%s\n" % (mem_id, mem_quality))

        if mem_id in eliminated_predictions:
            continue

        # identical / contained extended sequences need no alignment
        if mem_extended_seq == rep_extended_seq:
            drop(mem_id, "i")
            continue

        if mem_extended_seq in rep_extended_seq:
            drop(mem_id, "p")
            continue

        # same quality class as the representative and not explicitly
        # listed for same-class comparison: leave untouched
        if mem_quality == this_quality and \
                mem_quality not in options.quality_exclude_same:
            continue

        seq1 = alignlib_lite.makeSequence(str(rep_seq))
        seq2 = alignlib_lite.makeSequence(str(mem_seq))
        alignator.align(result, seq1, seq2)

        if options.loglevel >= 5:
            options.stdlog.write(
                "# ali\n%s\n" % alignlib_lite.AlignmentFormatExplicit(result, seq1, seq2))

        pidentity = 100 * alignlib_lite.calculatePercentIdentity(result, seq1, seq2)
        num_gaps = result.getNumGaps()

        if options.loglevel >= 4:
            options.stdlog.write(
                "# processing: id=%s class=%s pid=%5.2f rep_cov=%i mem_cov=%i\n" %
                (mem_id, mem_quality, pidentity, rep_coverage, mem_coverage))

        if pidentity >= options.min_identity:

            # first matching safety check decides why the member is kept
            if rep_coverage < mem_coverage - options.safety_coverage or \
                    rep_pid < mem_pid - options.safety_pide:
                reason = "covpid"
            elif num_gaps >= options.max_gaps and \
                    mem_coverage > rep_coverage - options.safety_coverage:
                reason = "gaps"
            elif mem_coverage >= rep_coverage - options.safety_coverage and \
                    100 * (result.getColTo() - result.getColFrom()) / len(mem_seq) < options.max_member_coverage:
                reason = "memcov"
            else:
                reason = None

            if reason is None:
                drop(mem_id, "h")
            else:
                options.stdlog.write(
                    "# WARNING: not removing possibly good prediction: %s: rep = %s, mem = %s, rep_cov=%i, rep_pid=%i, mem_cov=%i, mem_pid=%i\n" %
                    (reason, rep_id, mem_id, rep_coverage, rep_pid, mem_coverage, mem_pid))

        elif pidentity >= options.min_identity_non_genes and \
                this_quality in options.quality_genes and \
                mem_quality not in options.quality_genes:

            member_looks_better = \
                rep_coverage < mem_coverage - options.safety_coverage or \
                rep_pid < mem_pid - options.safety_pide

            if member_looks_better:
                options.stdlog.write(
                    "# WARNING: not removing possibly good prediction: rep = %s, mem = %s, rep_cov=%i, rep_pid=%i, mem_cov=%i, mem_pid=%i\n" %
                    (rep_id, mem_id, rep_coverage, rep_pid, mem_coverage, mem_pid))
            else:
                drop(mem_id, "l")

    return removed