def _fix_stop_codon(transcript):
    """Fold the stop codon of a GTF2-derived transcript into its CDS.

    GTF2 places the stop codon outside of the CDS. This private function
    widens the terminal CDS segment with the stop codon segment that abuts
    it (moving the phase entry over to the widened segment), adds whatever
    stop codon segment is left to the CDS as a segment of its own, and then
    shortens or removes the UTR segments overlapping the terminal CDS segment.

    :param transcript: the transcript to fix. Its ``stop_codon``,
        ``combined_cds``, ``phases`` and ``combined_utr`` are modified in place.
    :return: the same transcript object.
    :raises InvalidCDS: if ``overlap`` returns more than 3 for a UTR segment
        and the terminal CDS segment.
    """

    if transcript.strand == "-":
        # Minus strand: the stop codon lies to the left of the first CDS segment.
        # Merge the two only when they are contiguous, i.e. in the same exon.
        if transcript.stop_codon[-1][1] == transcript.combined_cds[0][0] - 1:
            carried_phase = transcript.phases.pop(transcript.combined_cds[0], None)
            codon_start = transcript.stop_codon.pop(-1)[0]
            transcript.combined_cds[0] = (codon_start, transcript.combined_cds[0][1])
            transcript.phases[transcript.combined_cds[0]] = carried_phase
        # Whatever is left of the stop codon becomes a CDS segment of its own.
        leftover = [tuple(piece) for piece in transcript.stop_codon]
        transcript.combined_cds = leftover + transcript.combined_cds

        for slot, segment in enumerate(transcript.combined_utr):
            if segment[0] > transcript.combined_cds[-1][1]:
                continue  # Skip the 5'
            shared = overlap(segment, transcript.combined_cds[0])
            if shared < 0:
                continue
            if shared > 3:
                raise InvalidCDS("Invalid overlap between UTR and CDS found")
            if shared == segment[1] - segment[0] + 1:
                # The UTR is entirely covered: mark it for removal.
                transcript.combined_utr[slot] = None
            else:
                trimmed_end = max(segment[0], transcript.combined_cds[0][0] - 1)
                transcript.combined_utr[slot] = (segment[0], trimmed_end)
    else:
        # Plus strand: the stop codon lies to the right of the last CDS segment.
        if transcript.stop_codon[0][0] == transcript.combined_cds[-1][1] + 1:
            carried_phase = transcript.phases.pop(transcript.combined_cds[-1], None)
            cds_start = transcript.combined_cds[-1][0]
            transcript.combined_cds[-1] = (cds_start, transcript.stop_codon.pop(0)[1])
            transcript.phases[transcript.combined_cds[-1]] = carried_phase
        leftover = [tuple(piece) for piece in transcript.stop_codon]
        transcript.combined_cds.extend(leftover)

        for slot, segment in enumerate(transcript.combined_utr):
            if segment[1] < transcript.combined_cds[0][0]:
                continue  # Skip the 5'
            shared = overlap(segment, transcript.combined_cds[-1])
            if shared < 0:
                continue
            if shared > 3:
                raise InvalidCDS("Invalid overlap between UTR and CDS found")
            if shared == segment[1] - segment[0] + 1:
                # The UTR is entirely covered: mark it for removal.
                transcript.combined_utr[slot] = None
            else:
                trimmed_start = min(segment[1], transcript.combined_cds[-1][1] + 1)
                transcript.combined_utr[slot] = (trimmed_start, segment[1])

    # Drop the UTR segments marked for removal above.
    transcript.combined_utr = [
        segment for segment in transcript.combined_utr if segment is not None
    ]
    return transcript
def __check_internal_orf(transcript, index):
    """
    Method that verifies that an internal ORF does not have any internal gap.

    If the ORF is trusted (``transcript._trust_orf`` is True and ``index`` is 0)
    and the transcript is either non-coding or has stored phases, the CDS
    segments are annotated with the phases found in ``transcript.phases``
    and no further check is performed.

    Otherwise the function verifies that the ORF segments (CDS plus flanking
    UTR) begin at ``transcript.start`` and finish at ``transcript.end``, that
    the CDS segments fall on consecutive exons, and that the phases are
    valid. Phases are recalculated through ``__calculate_phases`` and stored
    in ``transcript.phases``; the ORF is then rebuilt as a sorted list of
    UTR, CDS (with phase) and exon segments.

    :param transcript: the transcript to analyse
    :type transcript: Mikado.loci.Transcript

    :param index: index of the internal orf to check
    :type index: int

    :return: the updated transcript
    :rtype: Mikado.loci.Transcript

    :raises InvalidCDS: if the ORF has no CDS segment, does not match the
        transcript boundaries, has the wrong number of coding exons, or is
        truncated in a way that is incompatible with the UTRs present.
    :raises ValueError: if a CDS segment is malformed, carries an invalid
        phase, or the recalculated phases do not cover every CDS segment.
    """

    if transcript._trust_orf is True and index == 0:
        if (transcript.is_coding and transcript.phases) or not transcript.is_coding:
            # Trusted ORF: just attach the stored phase to each CDS segment.
            trusted_orf = []
            for segment in transcript.internal_orfs[index]:
                if segment[0] == "CDS":
                    segment = (segment[0], segment[1], transcript.phases[segment[1]])
                trusted_orf.append(segment)
            transcript.internal_orfs[index] = trusted_orf
            return transcript

    orf = transcript.internal_orfs[index]
    on_minus = (transcript.strand == "-")

    exons = sorted(transcript.exons, reverse=on_minus)

    coding = sorted([_ for _ in orf if _[0] == "CDS"], key=operator.itemgetter(1))
    transcript.logger.debug("ORF for %s: %s", transcript.id, coding)
    if not coding:
        raise InvalidCDS("No ORF for {}, index {}!".format(transcript.id, index))

    # UTR segments lying entirely before / after the coding region (genomic order).
    before = sorted(
        [_ for _ in orf if _[0] == "UTR" and _[1][1] < coding[0][1][0]],
        key=operator.itemgetter(1))
    after = sorted(
        [_ for _ in orf if _[0] == "UTR" and _[1][0] > coding[-1][1][1]],
        key=operator.itemgetter(1))

    first = min(coding[0][1][0], float("inf") if not before else before[0][1][0])
    last = max(coding[-1][1][1], float("-inf") if not after else after[-1][1][1])

    if first != transcript.start or last != transcript.end:
        raise InvalidCDS(
            ("Invalid start and stop of the ORF for {} First: {} Start: {} "
             "Last: {} End {} Coding: {} Before: {} After: {} dict: {}").format(
                 transcript.id,
                 first, transcript.start,
                 last, transcript.end,
                 coding,
                 before, after,
                 transcript.__dict__))

    # Check that the number of exons with a coding section is correct
    # and that they are in the correct order.
    coding_exons = [
        _ for _ in enumerate(exons)
        if _[1][1] >= coding[0][1][1] and _[1][0] <= coding[-1][1][0]
    ]
    if (len(coding_exons) != len(coding) or
            coding_exons[-1][0] - coding_exons[0][0] + 1 != len(coding)):
        # The message used to start with a stray double quote (four opening quotes).
        raise InvalidCDS(
            ("Invalid number of coding exons for {} ({} vs {}) "
             "Coding: {} Coding_exons (recalculated): {}").format(
                 transcript.id,
                 len(coding), len(coding_exons),
                 coding, coding_exons))

    # Now it's time to check the phases. From here on "coding" and the UTRs
    # are in transcriptional (5' to 3') order.
    if on_minus:
        coding = list(reversed(coding))
        five_utr = list(reversed(after))
        three_utr = list(reversed(before))
    else:
        five_utr = before
        three_utr = after

    del before, after

    # Collect the phases already attached to the CDS segments, if any.
    phase_orf = []
    for segment in coding:
        if len(segment) == 3:
            if segment[2] not in (None, 0, 1, 2):
                raise ValueError("Invalid phase value for {}".format(transcript.id))
            phase_orf.append(segment[2])
        elif len(segment) == 2:
            continue
        else:
            raise ValueError("Invalid CDS fragment: {}".format(segment))

    if len(phase_orf) != 0 and len(phase_orf) != len(coding):
        # Only some of the segments carry a phase: discard them all.
        transcript.logger.warning("Invalid phases for %s. Resetting.", transcript.id)
        phase_orf = []

    # Decide which phase to start the recalculation from.
    if not phase_orf and transcript.phases:
        phases_keys = sorted(transcript.phases.keys(), reverse=on_minus)
        phase_orf = [transcript.phases[_] for _ in phases_keys]
        previous = phase_orf[0]
    elif not phase_orf and transcript._first_phase is not None:
        previous = transcript._first_phase
        phase_orf = []
    elif phase_orf:
        previous = phase_orf[0]
    else:
        phase_orf = []
        for segment in sorted(orf, key=operator.itemgetter(1), reverse=on_minus):
            if segment[0] != "CDS":
                continue
            if len(segment) == 3:
                phase_orf.append(segment[2])
            else:
                break
        if phase_orf and len(phase_orf) == len(coding):
            previous = phase_orf[0]
        else:
            previous = 0
            phase_orf = []

    total_cds_length, calculated_phases = __calculate_phases(coding, previous)

    new_phase_orf = [calculated_phases[_]
                     for _ in sorted(calculated_phases.keys(), reverse=on_minus)]

    if len(calculated_phases) != len(coding):
        # This is a mistake which should crash the program
        raise ValueError("Error in calculating the phases!")

    if phase_orf and new_phase_orf != phase_orf:
        transcript.logger.debug(
            "Wrong phases for %s, using recalculated ones (\n%s\nvs\n%s)",
            transcript.id, phase_orf, calculated_phases)
    else:
        transcript.logger.debug("Correct phases for %s: %s",
                                transcript.id, calculated_phases)

    if total_cds_length % 3 != 0 and three_utr and five_utr:
        # The transcript is truncated.
        # The message used to start with a stray double quote (four opening quotes).
        raise InvalidCDS(
            ("Both UTR presents with a truncated ORF (length {}, modulo {}) in {}; "
             "5'UTR: {} 3' UTR: {}").format(
                 total_cds_length, total_cds_length % 3, transcript.id,
                 five_utr, three_utr))
    elif total_cds_length % 3 != 0 and three_utr:
        # Try every starting phase until the CDS length is a multiple of three.
        for num in (0, 1, 2):
            total_cds_length, calculated_phases = __calculate_phases(coding, num)
            if total_cds_length % 3 == 0:
                break

        if total_cds_length % 3 != 0:
            # The original passed logging-style arguments to the exception,
            # so the transcript ID was never interpolated into the message.
            raise InvalidCDS(
                "Persistently wrong ORF for {} at 5' end".format(transcript.id))

    # The phases may have been recalculated above: re-derive the key order.
    ordered_keys = sorted(calculated_phases.keys(), reverse=on_minus)
    if calculated_phases[ordered_keys[0]] != 0 and five_utr:
        raise InvalidCDS(
            "5'UTR present with a truncated ORF at 5' end for {}".format(
                transcript.id))

    transcript.phases = calculated_phases

    transcript.logger.debug("Total CDS length %d", total_cds_length)

    # Rebuild the ORF: 5' UTR, CDS segments with their phase, 3' UTR, exons.
    new_orf = five_utr[:]
    new_orf.extend([
        (segment[0], segment[1], phase)
        for segment, phase in zip(coding, [calculated_phases[_] for _ in ordered_keys])
    ])
    new_orf.extend(three_utr)
    new_orf.extend([("exon", _) for _ in transcript.exons])
    new_orf = sorted(new_orf, key=operator.itemgetter(1, 0))

    transcript.internal_orfs[index] = new_orf
    return transcript
def _check_cdna_vs_utr(transcript):
    """
    Verify that cDNA + UTR in the transcript add up.

    Nothing is done unless the cDNA is longer than UTR and CDS together.
    In that case, for a coding transcript, the UTR is thrown away and
    inferred again: every CDS segment is assigned to the exon containing it
    and the parts of each exon not covered by CDS become UTR segments.

    :param transcript: the transcript to check; ``combined_utr`` and
        ``combined_cds`` may be reassigned.
    :return: None
    :raises InvalidCDS: if a CDS segment maps to zero or several exons,
        exceeds its exon, overlaps or abuts another CDS segment in the same
        exon, or if the lengths still do not add up afterwards.
    """

    transcript.logger.debug("Checking the cDNA for %s", transcript.id)
    if transcript.cdna_length <= transcript.combined_utr_length + transcript.combined_cds_length:
        # No base of the cDNA is unaccounted for: nothing to recalculate.
        return

    if transcript.combined_utr == transcript.combined_cds == []:
        # non-coding transcript
        transcript.logger.debug("%s is non coding, returning", transcript.id)
        return
    assert transcript.combined_cds != []

    transcript.logger.debug(
        "Recalculating the UTR for %s. Reason: cDNA length %s, UTR %s, CDS %s (total %s)",
        transcript.id,
        transcript.cdna_length,
        transcript.combined_utr_length,
        transcript.combined_cds_length,
        transcript.combined_utr_length + transcript.combined_cds_length)
    transcript.combined_utr = []  # Reset
    transcript.combined_cds = sorted(transcript.combined_cds, key=operator.itemgetter(0, 1))

    # These trees are not queried below; they are still built exactly as before,
    # since their construction might itself reject malformed segments.
    cds_tree = IntervalTree.from_tuples(transcript.combined_cds)
    orf_trees = [
        IntervalTree.from_tuples([part[1] for part in orf if part[0] == "CDS"])
        for orf in transcript.internal_orfs
    ]
    assert isinstance(cds_tree, IntervalTree)

    exon_tree = IntervalTree.from_intervals([Interval(*exon) for exon in transcript.exons])

    # Assign each CDS segment to the single exon hosting it.
    cds_by_exon = defaultdict(list)
    for cds in transcript.combined_cds:
        hosts = exon_tree.find(cds[0] - 1, cds[1], strict=False)
        if len(hosts) > 1:
            raise InvalidCDS(
                "{} has a CDS ({}) which straddles {} different exons ({}).".format(
                    transcript.id, cds, len(hosts), hosts))
        if len(hosts) == 0:
            raise InvalidCDS(
                "{} has a CDS ({}) which is not mapped to any exon.".format(
                    transcript.id, cds, len(hosts), hosts))
        cds_by_exon[hosts[0]].append(cds)

    for exon in transcript.exons:
        if exon not in cds_by_exon:
            # No CDS on this exon: it is UTR in its entirety.
            transcript.combined_utr.append(exon)
            continue

        segments = cds_by_exon[exon]
        if len(segments) == 1:
            cds = segments[0]
            if cds[0] == exon[0] and exon[1] == cds[1]:
                continue  # The exon is completely coding
            before = None
            after = None
            if cds[0] < exon[0] or cds[1] > exon[1]:
                raise InvalidCDS("{} in {} debords its exon {}".format(
                    cds, transcript.id, exon))
            if cds[0] > exon[0]:
                before = (exon[0], max(cds[0] - 1, exon[0]))
                transcript.combined_utr.append(before)
            if cds[1] < exon[1]:
                after = (min(cds[1] + 1, exon[1]), exon[1])
                transcript.combined_utr.append(after)
            assert before or after, (exon, cds)
            continue

        # Several CDS segments on one exon: UTR is whatever lies before the
        # first, between consecutive ones, and after the last.
        transcript.logger.debug("Starting to find the UTRs for %s", exon)
        ordered = sorted(segments)
        utrs = []
        if exon[0] < ordered[0][0]:
            utrs.append((exon[0], max(exon[0], ordered[0][0] - 1)))
        for current, following in zip(ordered, ordered[1:]):
            if current[1] + 1 > following[0] - 1:
                raise InvalidCDS(
                    "Error while inferring the UTR for a transcript with multiple ORFs: overlapping CDS found.")
            utrs.append((current[1] + 1, following[0] - 1))
        if exon[1] > ordered[-1][1]:
            utrs.append((min(exon[1], ordered[-1][1] + 1), exon[1]))

        assert utrs, ordered
        utr_sum = sum([piece[1] - piece[0] + 1 for piece in utrs])
        cds_sum = sum(piece[1] - piece[0] + 1 for piece in ordered)
        assert utr_sum + cds_sum == exon[1] - exon[0] + 1, (
            utr_sum, cds_sum, exon[1] - exon[0] + 1, utrs, ordered)
        transcript.combined_utr.extend(utrs)

    # If no CDS and no UTR are present, all good
    equality_one = (transcript.combined_cds_length == transcript.combined_utr_length == 0)
    # Otherwise, if cDNA length == UTR + CDS, all good
    equality_two = (transcript.cdna_length ==
                    transcript.combined_utr_length + transcript.combined_cds_length)
    if equality_one or equality_two:
        return

    # Something fishy going on
    raise InvalidCDS(
        ('"Failed to create the UTR: ID: {} Exons: {} Combined CDS: {} Combined UTR: {} '
         'CDS == UTR == 0: {} CDNA == CDS + UTR: {} CDNA == {} CDS == {} UTR == {}').format(
             transcript.id,
             transcript.exons,
             transcript.combined_cds,
             transcript.combined_utr,
             equality_one,
             equality_two,
             transcript.cdna_length,
             transcript.combined_cds_length,
             transcript.combined_utr_length))
def _check_cdna_vs_utr(transcript):
    """
    Verify that cDNA + UTR in the transcript add up.

    Nothing is done unless the cDNA is longer than UTR and CDS together.
    In that case, for a coding transcript, the UTR is thrown away and
    inferred again by looking up, for each exon, the CDS segments found on
    it: the parts of the exon not covered by CDS become UTR segments.

    :param transcript: the transcript to check; ``combined_utr`` and
        ``combined_cds`` may be reassigned.
    :return: None
    :raises InvalidCDS: if an exon carries more CDS segments than there are
        internal ORFs, carries several CDS segments of the same ORF,
        carries overlapping or abutting CDS segments, or if the lengths
        still do not add up afterwards.
    """

    transcript.logger.debug("Checking the cDNA for %s", transcript.id)
    if transcript.cdna_length <= transcript.combined_utr_length + transcript.combined_cds_length:
        # No base of the cDNA is unaccounted for: nothing to recalculate.
        return

    if transcript.combined_utr == transcript.combined_cds == []:
        # non-coding transcript
        transcript.logger.debug("%s is non coding, returning", transcript.id)
        return
    assert transcript.combined_cds != []

    transcript.logger.debug(
        "Recalculating the UTR for %s. Reason: cDNA length %s, UTR %s, CDS %s (total %s)",
        transcript.id,
        transcript.cdna_length,
        transcript.combined_utr_length,
        transcript.combined_cds_length,
        transcript.combined_utr_length + transcript.combined_cds_length)
    transcript.combined_utr = []  # Reset
    transcript.combined_cds = sorted(transcript.combined_cds, key=operator.itemgetter(0, 1))

    cds_tree = IntervalTree.from_tuples(transcript.combined_cds)
    orf_trees = [
        IntervalTree.from_tuples([part[1] for part in orf if part[0] == "CDS"])
        for orf in transcript.internal_orfs
    ]
    assert isinstance(cds_tree, IntervalTree)

    for exon in transcript.exons:
        assert isinstance(exon, tuple), type(exon)
        hits = cds_tree.find(exon[0], exon[1])

        if len(hits) == 0:
            # The membership test works around a BUG noted by the original
            # author: an exon identical to a CDS segment can come back with
            # no hit, and must not be mistaken for a noncoding exon.
            if exon not in transcript.combined_cds:
                # Exon completely noncoding
                transcript.combined_utr.append(exon)
            continue

        if len(hits) == 1:
            hit = hits[0]
            if hit.start == exon[0] and hit.end == exon[1]:
                # The exon is completely coding
                continue
            # Find the regions of the exon which are not coding
            before = None
            after = None
            if hit.start > exon[0]:
                before = (exon[0], max(hit.start - 1, exon[0]))
                transcript.combined_utr.append(before)
            if hit.end < exon[1]:
                after = (min(hit.end + 1, exon[1]), exon[1])
                transcript.combined_utr.append(after)
            assert before or after, (exon, hit)
            continue

        # The exon overlaps several CDS segments.
        # This is valid *only* if there are multiple ORFs.
        if len(hits) > len(transcript.internal_orfs):
            raise InvalidCDS(
                "Found in {} an exon ({}) which is overlapping with more CDS segments than there are ORFs.".format(
                    transcript.id, exon))
        # Each single ORF may contribute at most one CDS segment to the exon.
        for orf_tree in orf_trees:
            orf_hits = orf_tree.find(exon[0], exon[1])
            if len(orf_hits) > 1:
                raise InvalidCDS(
                    "Found in {} an exon ({}) which is overlapping with more CDS segments in a single ORF.".format(
                        transcript.id, exon))

        # The internal UTR is legit: UTR is whatever lies before the first
        # CDS segment, between consecutive ones, and after the last.
        transcript.logger.debug("Starting to find the UTRs for %s", exon)
        ordered = sorted(hits)
        utrs = []
        if exon[0] < ordered[0].start:
            utrs.append((exon[0], max(exon[0], ordered[0].start - 1)))
        for current, following in zip(ordered, ordered[1:]):
            if current.end + 1 > following.start - 1:
                raise InvalidCDS(
                    "Error while inferring the UTR for a transcript with multiple ORFs: overlapping CDS found.")
            utrs.append((current.end + 1, following.start - 1))
        if exon[1] > ordered[-1].end:
            utrs.append((min(exon[1], ordered[-1].end + 1), exon[1]))

        assert utrs, ordered
        utr_sum = sum([piece[1] - piece[0] + 1 for piece in utrs])
        cds_sum = sum(piece.end - piece.start + 1 for piece in ordered)
        assert utr_sum + cds_sum == exon[1] - exon[0] + 1, (
            utr_sum, cds_sum, exon[1] - exon[0] + 1, utrs, ordered)
        transcript.combined_utr.extend(utrs)

    # If no CDS and no UTR are present, all good
    equality_one = (transcript.combined_cds_length == transcript.combined_utr_length == 0)
    # Otherwise, if cDNA length == UTR + CDS, all good
    equality_two = (transcript.cdna_length ==
                    transcript.combined_utr_length + transcript.combined_cds_length)
    if equality_one or equality_two:
        return

    # Something fishy going on
    raise InvalidCDS(
        ('"Failed to create the UTR: ID: {} Exons: {} Combined CDS: {} Combined UTR: {} '
         'CDS == UTR == 0: {} CDNA == CDS + UTR: {} CDNA == {} CDS == {} UTR == {}').format(
             transcript.id,
             transcript.exons,
             transcript.combined_cds,
             transcript.combined_utr,
             equality_one,
             equality_two,
             transcript.cdna_length,
             transcript.combined_cds_length,
             transcript.combined_utr_length))
def _check_cdna_vs_utr(transcript):
    """
    Verify that cDNA + UTR in the transcript add up.

    Nothing is done unless the cDNA is longer than UTR and CDS together.
    In that case, for a coding transcript, the UTR is thrown away and
    inferred again by comparing each exon with the first and last CDS
    segments (the CDS is sorted beforehand).

    :param transcript: the transcript to check; ``combined_utr`` and
        ``combined_cds`` may be reassigned.
    :return: None
    :raises InvalidCDS: if an exon would need a UTR between two CDS
        segments, or if the lengths still do not add up afterwards.
    """

    transcript.logger.debug("Checking the cDNA for %s", transcript.id)
    if transcript.cdna_length <= transcript.combined_utr_length + transcript.combined_cds_length:
        # No base of the cDNA is unaccounted for: nothing to recalculate.
        return

    if transcript.combined_utr == transcript.combined_cds == []:
        # non-coding transcript
        transcript.logger.debug("%s is non coding, returning", transcript.id)
        return
    assert transcript.combined_cds != []

    transcript.logger.debug("Recalculating the UTR for %s", transcript.id)
    transcript.combined_utr = []  # Reset
    transcript.combined_cds = sorted(transcript.combined_cds, key=operator.itemgetter(0, 1))

    for exon in transcript.exons:
        assert isinstance(exon, tuple)
        segments = transcript.combined_cds
        if exon in segments:
            continue  # The exon is completely coding

        first_cds = segments[0]
        last_cds = segments[-1]
        if exon[1] < first_cds[0] or exon[0] > last_cds[1]:
            # The exon ends before the first CDS segment starts, or starts
            # after the last one ends: it is UTR in its entirety.
            transcript.combined_utr.append(exon)
        elif exon[0] < first_cds[0] and exon[1] == first_cds[1]:
            # The exon shares its end with the first CDS segment: UTR before.
            transcript.combined_utr.append((exon[0], first_cds[0] - 1))
        elif exon[1] > last_cds[1] and exon[0] == last_cds[0]:
            # The exon shares its start with the last CDS segment: UTR after.
            transcript.combined_utr.append((last_cds[1] + 1, exon[1]))
        elif len(segments) == 1:
            # The ORF is contained inside a single exon, with UTR at both
            # sides: create the two UTR segments.
            transcript.combined_utr.append((exon[0], first_cds[0] - 1))
            transcript.combined_utr.append((last_cds[1] + 1, exon[1]))
        else:
            # This means there is an INTERNAL UTR region between
            # two CDS segments: something is clearly wrong!
            raise InvalidCDS("Error while inferring the UTR",
                             exon, transcript.id,
                             transcript.exons, transcript.combined_cds)

    # If no CDS and no UTR are present, all good
    equality_one = (transcript.combined_cds_length == transcript.combined_utr_length == 0)
    # Otherwise, if cDNA length == UTR + CDS, all good
    equality_two = (transcript.cdna_length ==
                    transcript.combined_utr_length + transcript.combined_cds_length)
    if equality_one or equality_two:
        return

    # Something fishy going on
    raise InvalidCDS(
        ('"Failed to create the UTR: ID: {} Exons: {} Combined CDS: {} Combined UTR: {} '
         'CDS == UTR == 0: {} CDNA == CDS + UTR: {} CDNA == {} CDS == {} UTR == {}').format(
             transcript.id,
             transcript.exons,
             transcript.combined_cds,
             transcript.combined_utr,
             equality_one,
             equality_two,
             transcript.cdna_length,
             transcript.combined_cds_length,
             transcript.combined_utr_length))