예제 #1
0
def _fix_stop_codon(transcript):

    """Fold a stop codon that lies outside of the CDS (as in GTF2 input) back into the CDS.

    The stop codon segment that is directly adjacent to the terminal CDS segment is fused with
    it, and the phase stored for that segment is moved to the new coordinates. Every stop codon
    segment still left is appended to the CDS as a segment of its own. UTR segments that are
    not skipped as 5' UTR are then compared against the terminal CDS segment with ``overlap``:
    they are trimmed, or removed when the reported overlap equals their length.

    :param transcript: object exposing ``strand``, ``stop_codon``, ``combined_cds``,
        ``combined_utr`` and ``phases``; it is modified in place.
    :return: the same transcript object.
    :raises InvalidCDS: if ``overlap`` reports a value greater than 3 for a UTR segment.
    """

    if transcript.strand == "-":
        # Minus strand: the stop codon precedes, in genomic coordinates, the first CDS segment.
        if transcript.stop_codon[-1][1] == transcript.combined_cds[0][0] - 1:
            stored_phase = transcript.phases.pop(transcript.combined_cds[0], None)
            codon_start = transcript.stop_codon.pop(-1)[0]
            transcript.combined_cds[0] = (codon_start, transcript.combined_cds[0][1])
            transcript.phases[transcript.combined_cds[0]] = stored_phase
        leftover = [tuple(codon) for codon in transcript.stop_codon]
        transcript.combined_cds = leftover + transcript.combined_cds
        for index, segment in enumerate(transcript.combined_utr):
            if segment[0] > transcript.combined_cds[-1][1]:
                continue  # 5' UTR on this strand, nothing to trim
            shared = overlap(segment, transcript.combined_cds[0])
            if shared < 0:
                continue
            if shared > 3:
                raise InvalidCDS("Invalid overlap between UTR and CDS found")
            if shared == segment[1] - segment[0] + 1:
                # The UTR segment is just a fragment of the stop codon: mark it for removal.
                transcript.combined_utr[index] = None
            else:
                trimmed_end = max(segment[0], transcript.combined_cds[0][0] - 1)
                transcript.combined_utr[index] = (segment[0], trimmed_end)
    else:
        # Plus strand: the stop codon follows the last CDS segment.
        if transcript.stop_codon[0][0] == transcript.combined_cds[-1][1] + 1:
            stored_phase = transcript.phases.pop(transcript.combined_cds[-1], None)
            codon_end = transcript.stop_codon.pop(0)[1]
            transcript.combined_cds[-1] = (transcript.combined_cds[-1][0], codon_end)
            transcript.phases[transcript.combined_cds[-1]] = stored_phase
        leftover = [tuple(codon) for codon in transcript.stop_codon]
        transcript.combined_cds.extend(leftover)
        for index, segment in enumerate(transcript.combined_utr):
            if segment[1] < transcript.combined_cds[0][0]:
                continue  # 5' UTR on this strand, nothing to trim
            shared = overlap(segment, transcript.combined_cds[-1])
            if shared < 0:
                continue
            if shared > 3:
                raise InvalidCDS("Invalid overlap between UTR and CDS found")
            if shared == segment[1] - segment[0] + 1:
                # The UTR segment is just a fragment of the stop codon: mark it for removal.
                transcript.combined_utr[index] = None
            else:
                trimmed_start = min(segment[1], transcript.combined_cds[-1][1] + 1)
                transcript.combined_utr[index] = (trimmed_start, segment[1])

    # Drop the UTR segments that were marked for removal above.
    transcript.combined_utr = [kept for kept in transcript.combined_utr if kept is not None]
    return transcript
예제 #2
0
def __check_internal_orf(transcript, index):
    """
    Method that verifies that an internal ORF does not have any internal gap.

    The CDS segments of the selected ORF are checked against the transcript boundaries and
    against the exons, their phases are recalculated through ``__calculate_phases``, and the
    ORF is written back to ``transcript.internal_orfs[index]`` as a sorted list of UTR
    segments, CDS segments (with their phase) and exon segments. ``transcript.phases`` is
    replaced by the recalculated phases.

    If the ORF is trusted (``transcript._trust_orf is True`` and ``index == 0``) and the
    transcript either has stored phases or is not coding, the stored phase is simply attached
    to each CDS segment and no further check is performed.

    :param transcript: the transcript to analyse
    :type transcript: Mikado.loci.Transcript
    :param index: index of the internal orf to check
    :type index: int

    :return: the updated transcript
    :rtype: Mikado.loci.Transcript

    :raises InvalidCDS: if the ORF has no CDS segment, if its extremes do not coincide with
        the transcript start and end, if the number or order of coding exons is inconsistent,
        if a truncated ORF has UTR at both ends, if no starting phase yields a CDS length
        multiple of three, or if a 5'UTR precedes a CDS whose first phase is not 0.
    :raises ValueError: if a CDS segment carries an invalid phase or has an unexpected number
        of fields, or if the number of recalculated phases differs from the number of CDS
        segments.
    """

    if transcript._trust_orf is True and index == 0:
        if (transcript.is_coding
                and transcript.phases) or not transcript.is_coding:
            # Trusted ORF: only attach the stored phase to each CDS segment.
            new_orf = []
            for segment in transcript.internal_orfs[index]:
                if segment[0] == "CDS":
                    segment = (segment[0], segment[1],
                               transcript.phases[segment[1]])
                new_orf.append(segment)
            transcript.internal_orfs[index] = new_orf
            return transcript

    orf = transcript.internal_orfs[index]

    exons = sorted(transcript.exons, reverse=(transcript.strand == "-"))

    coding = sorted([_ for _ in orf if _[0] == "CDS"],
                    key=operator.itemgetter(1))
    transcript.logger.debug("ORF for %s: %s", transcript.id, coding)

    if not coding:
        raise InvalidCDS("No ORF for {}, index {}!".format(
            transcript.id, index))
    # UTR segments lying, in genomic coordinates, before and after the CDS.
    before = sorted(
        [_ for _ in orf if _[0] == "UTR" and _[1][1] < coding[0][1][0]],
        key=operator.itemgetter(1))
    after = sorted(
        [_ for _ in orf if _[0] == "UTR" and _[1][0] > coding[-1][1][1]],
        key=operator.itemgetter(1))

    first = min(coding[0][1][0],
                float("inf") if not before else before[0][1][0])
    last = max(coding[-1][1][1],
               float("-inf") if not after else after[-1][1][1])

    if first != transcript.start or last != transcript.end:
        raise InvalidCDS("""Invalid start and stop of the ORF for {}
First: {} Start: {}
Last: {} End {}
Coding: {}
Before: {}
After: {}
dict: {}""".format(transcript.id, first, transcript.start, last,
                   transcript.end, coding, before, after, transcript.__dict__))

    # Check that the number of exons with a coding section is correct and that they are in the correct order.
    coding_exons = [
        _ for _ in enumerate(exons)
        if _[1][1] >= coding[0][1][1] and _[1][0] <= coding[-1][1][0]
    ]
    if len(coding_exons) != len(coding) or coding_exons[-1][0] - coding_exons[
            0][0] + 1 != len(coding):
        raise InvalidCDS("""Invalid number of coding exons for {} ({} vs {})
Coding: {}
Coding_exons (recalculated): {}""".format(transcript.id, len(coding),
                                          len(coding_exons), coding,
                                          coding_exons))

    # Now it's time to check the phases
    if transcript.strand == "-":
        # On the minus strand the transcript order is the reverse of the genomic order.
        coding = list(reversed(coding))
        five_utr = list(reversed(after))
        three_utr = list(reversed(before))
    else:
        five_utr = before
        three_utr = after

    del before, after

    # Collect the phases already attached to the CDS segments, if any.
    phase_orf = []
    for _ in coding:
        if len(_) == 3:
            if _[2] not in (None, 0, 1, 2):
                raise ValueError("Invalid phase value for {}".format(
                    transcript.id))
            phase_orf.append(_[2])
        elif len(_) == 2:
            continue
        else:
            raise ValueError("Invalid CDS fragment: {}".format(_))

    if len(phase_orf) != 0 and len(phase_orf) != len(coding):
        # Only some of the segments carry a phase: discard them all.
        transcript.logger.warning("Invalid phases for %s. Resetting.",
                                  transcript.id)
        phase_orf = []

    # Choose the phase of the first CDS segment ("previous") used to seed the recalculation.
    if not phase_orf and transcript.phases:
        phases_keys = sorted(transcript.phases.keys(),
                             reverse=(transcript.strand == "-"))
        phase_orf = [transcript.phases[_] for _ in phases_keys]
        previous = phase_orf[0]
    elif not phase_orf and transcript._first_phase is not None:
        previous = transcript._first_phase
        phase_orf = []
    elif phase_orf:
        previous = phase_orf[0]
    else:
        phase_orf = []
        for segment in sorted(orf,
                              key=operator.itemgetter(1),
                              reverse=(transcript.strand == "-")):
            if segment[0] != "CDS":
                continue
            else:
                if len(segment) == 3:
                    phase_orf.append(segment[2])
                else:
                    break
        if phase_orf and len(phase_orf) == len(coding):
            previous = phase_orf[0]
        else:
            previous = 0
            phase_orf = []

    total_cds_length, __calculated_phases = __calculate_phases(
        coding, previous)
    new_phases_keys = sorted(__calculated_phases.keys(),
                             reverse=(transcript.strand == "-"))
    new_phase_orf = [__calculated_phases[_] for _ in new_phases_keys]

    if len(__calculated_phases) != len(coding):
        # This is a mistake which should crash the program
        raise ValueError("Error in calculating the phases!")

    if phase_orf and new_phase_orf != phase_orf:
        transcript.logger.debug(
            "Wrong phases for %s, using recalculated ones (\n%s\nvs\n%s)",
            transcript.id, phase_orf, __calculated_phases)
    else:
        transcript.logger.debug("Correct phases for %s: %s", transcript.id,
                                __calculated_phases)
    if total_cds_length % 3 != 0 and three_utr and five_utr:
        # The transcript is truncated.
        raise InvalidCDS(
            """Both UTR presents with a truncated ORF (length {}, modulo {}) in {};
5'UTR: {}
3' UTR: {}""".format(total_cds_length, total_cds_length % 3, transcript.id,
                     five_utr, three_utr))
    elif total_cds_length % 3 != 0 and three_utr:
        # Only the 3' UTR is present: try every starting phase until the length fits.
        for num in (0, 1, 2):
            total_cds_length, __calculated_phases = __calculate_phases(
                coding, num)
            if total_cds_length % 3 == 0:
                break

        if total_cds_length % 3 != 0:
            # The message is formatted explicitly: exceptions do not interpolate %-style
            # arguments the way logging calls do.
            raise InvalidCDS(
                "Persistently wrong ORF for {} at 5' end".format(
                    transcript.id))

    if ((__calculated_phases[sorted(__calculated_phases.keys(),
                                    reverse=(transcript.strand == "-"))[0]] !=
         0) and five_utr):
        raise InvalidCDS(
            "5'UTR present with a truncated ORF at 5' end for {}".format(
                transcript.id))

    transcript.phases = __calculated_phases

    transcript.logger.debug("Total CDS length %d", total_cds_length)

    # Rebuild the ORF: 5' UTR, CDS with the recalculated phases, 3' UTR, then the exons.
    ordered_phases = [
        __calculated_phases[key]
        for key in sorted(__calculated_phases.keys(),
                          reverse=(transcript.strand == "-"))
    ]
    new_orf = five_utr[:]
    new_orf.extend([(segment[0], segment[1], phase)
                    for segment, phase in zip(coding, ordered_phases)])
    new_orf.extend(three_utr)

    new_orf.extend([("exon", _) for _ in transcript.exons])
    new_orf = sorted(new_orf, key=operator.itemgetter(1, 0))

    transcript.internal_orfs[index] = new_orf
    return transcript
예제 #3
0
def _check_cdna_vs_utr(transcript):
    """
    Verify that cDNA + UTR in the transcript add up.

    When the cDNA is longer than the sum of the UTR and CDS lengths, the UTR is discarded and
    rebuilt from the combined CDS: each CDS segment is assigned to the exon containing it,
    exons without CDS become UTR, and the parts of a coding exon that are not covered by its
    CDS segment(s) become UTR segments.

    :param transcript: the transcript to check; ``combined_utr`` and ``combined_cds`` may be
        reassigned.
    :return: None
    :raises InvalidCDS: if a CDS segment maps to no exon or to more than one, if it extends
        beyond its exon, if two CDS segments in the same exon overlap or are adjacent, or if
        the rebuilt UTR plus the CDS still do not add up to the cDNA length.
    """

    transcript.logger.debug("Checking the cDNA for %s", transcript.id)
    if transcript.cdna_length > transcript.combined_utr_length + transcript.combined_cds_length:
        if transcript.combined_utr == transcript.combined_cds == []:
            # non-coding transcript
            transcript.logger.debug("%s is non coding, returning",
                                    transcript.id)
            return
        assert transcript.combined_cds != []

        transcript.logger.debug(
            "Recalculating the UTR for %s. Reason: cDNA length %s, UTR %s, CDS %s (total %s)",
            transcript.id, transcript.cdna_length,
            transcript.combined_utr_length, transcript.combined_cds_length,
            transcript.combined_utr_length + transcript.combined_cds_length)
        transcript.combined_utr = []  # Reset
        transcript.combined_cds = sorted(transcript.combined_cds,
                                         key=operator.itemgetter(0, 1))

        exons = IntervalTree.from_intervals(
            [Interval(*exon) for exon in transcript.exons])

        # Group the CDS segments by the exon that contains them.
        # NOTE(review): the lookups below use exon tuples against keys that are the objects
        # returned by ``find``; this presumably relies on Interval hashing and comparing like
        # a (start, end) tuple - confirm against the Interval implementation.
        mapper = defaultdict(list)
        for cds in transcript.combined_cds:
            fexon = exons.find(cds[0] - 1, cds[1], strict=False)
            if len(fexon) > 1:
                raise InvalidCDS(
                    "{} has a CDS ({}) which straddles {} different exons ({})."
                    .format(transcript.id, cds, len(fexon), fexon))
            elif len(fexon) == 0:
                raise InvalidCDS(
                    "{} has a CDS ({}) which is not mapped to any exon.".
                    format(transcript.id, cds))
            mapper[fexon[0]].append(cds)

        for exon in transcript.exons:
            if exon not in mapper:
                # Exon without any CDS: it is UTR in its entirety.
                transcript.combined_utr.append(exon)
                continue
            elif len(mapper[exon]) == 1:
                cds = mapper[exon][0]
                if cds[0] == exon[0] and exon[1] == cds[1]:
                    # Completely coding exon.
                    continue
                else:
                    before = None
                    after = None
                    if cds[0] < exon[0] or cds[1] > exon[1]:
                        raise InvalidCDS("{} in {} debords its exon {}".format(
                            cds, transcript.id, exon))
                    if cds[0] > exon[0]:
                        before = (exon[0], max(cds[0] - 1, exon[0]))
                        transcript.combined_utr.append(before)
                    if cds[1] < exon[1]:
                        after = (min(cds[1] + 1, exon[1]), exon[1])
                        transcript.combined_utr.append(after)
                    assert before or after, (exon, cds)
            else:
                # Several CDS segments in the same exon: the UTR is made of the flanks of
                # the exon plus the gaps between consecutive CDS segments.
                transcript.logger.debug("Starting to find the UTRs for %s",
                                        exon)
                found = sorted(mapper[exon])
                utrs = []
                for pos, interval in enumerate(found):
                    if pos == len(found) - 1:
                        if exon[1] > interval[1]:
                            utrs.append((min(exon[1],
                                             interval[1] + 1), exon[1]))
                        continue
                    if pos == 0 and exon[0] < interval[0]:
                        utrs.append((exon[0], max(exon[0], interval[0] - 1)))
                    next_interval = found[pos + 1]
                    if not (interval[1] + 1 <= next_interval[0] - 1):
                        raise InvalidCDS(
                            "Error while inferring the UTR for a transcript with multiple ORFs: overlapping CDS found."
                        )
                    utrs.append((interval[1] + 1, next_interval[0] - 1))
                assert utrs, found
                utr_sum = sum(_[1] - _[0] + 1 for _ in utrs)
                cds_sum = sum(_[1] - _[0] + 1 for _ in found)
                assert utr_sum + cds_sum == exon[1] - exon[0] + 1, (
                    utr_sum, cds_sum, exon[1] - exon[0] + 1, utrs, found)
                transcript.combined_utr.extend(utrs)

        # If no CDS and no UTR are present, all good
        equality_one = (transcript.combined_cds_length ==
                        transcript.combined_utr_length == 0)
        # Otherwise, if cDNA length == UTR + CDS, all good
        equality_two = (
            transcript.cdna_length == transcript.combined_utr_length +
            transcript.combined_cds_length)
        if not (equality_one or equality_two):
            # Something fishy going on
            raise InvalidCDS("""Failed to create the UTR:
ID: {}
Exons: {}
Combined CDS: {}
Combined UTR: {}
CDS == UTR == 0: {}
CDNA == CDS + UTR: {}
CDNA == {}
CDS == {}
UTR == {}""".format(transcript.id, transcript.exons, transcript.combined_cds,
                    transcript.combined_utr, equality_one, equality_two,
                    transcript.cdna_length, transcript.combined_cds_length,
                    transcript.combined_utr_length))
예제 #4
0
파일: finalizing.py 프로젝트: benhg/mikado
def _check_cdna_vs_utr(transcript):

    """
    Check that UTR and CDS account for the whole cDNA, rebuilding the UTR when they do not.

    Nothing is changed unless the cDNA is longer than UTR + CDS. In that case the UTR is
    emptied and recomputed exon by exon from the CDS segments overlapping each exon.

    :param transcript: the transcript to check; ``combined_utr`` and ``combined_cds`` may be
        reassigned.
    :return: None
    :raises InvalidCDS: if an exon overlaps more CDS segments than there are internal ORFs,
        if it overlaps more than one CDS segment of a single ORF, if two CDS segments in the
        same exon overlap or are adjacent, or if the lengths still do not add up at the end.
    """

    log = transcript.logger
    log.debug("Checking the cDNA for %s", transcript.id)
    if not transcript.cdna_length > transcript.combined_utr_length + transcript.combined_cds_length:
        return

    if transcript.combined_utr == transcript.combined_cds == []:
        # Neither CDS nor UTR: the transcript is non-coding.
        log.debug("%s is non coding, returning", transcript.id)
        return
    assert transcript.combined_cds != []

    log.debug("Recalculating the UTR for %s. Reason: cDNA length %s, UTR %s, CDS %s (total %s)",
              transcript.id, transcript.cdna_length, transcript.combined_utr_length,
              transcript.combined_cds_length,
              transcript.combined_utr_length + transcript.combined_cds_length)
    transcript.combined_utr = []  # Start from scratch
    transcript.combined_cds = sorted(transcript.combined_cds,
                                     key=operator.itemgetter(0, 1))

    cds_tree = IntervalTree.from_tuples(transcript.combined_cds)
    orf_trees = [
        IntervalTree.from_tuples([segment[1] for segment in orf if segment[0] == "CDS"])
        for orf in transcript.internal_orfs
    ]
    assert isinstance(cds_tree, IntervalTree)

    for exon in transcript.exons:
        assert isinstance(exon, tuple), type(exon)
        hits = cds_tree.find(exon[0], exon[1])

        if len(hits) == 0:
            # NOTE(review): the original author flags the membership test as a workaround
            # for a bug; presumably ``find`` can miss an exon identical to a CDS segment.
            if exon not in transcript.combined_cds:
                # Completely non-coding exon.
                transcript.combined_utr.append(exon)
            continue

        if len(hits) == 1:
            hit = hits[0]
            if hit.start == exon[0] and hit.end == exon[1]:
                # Completely coding exon.
                continue
            # Whatever lies to the left or to the right of the CDS is UTR.
            left = right = None
            if hit.start > exon[0]:
                left = (exon[0], max(hit.start - 1, exon[0]))
                transcript.combined_utr.append(left)
            if hit.end < exon[1]:
                right = (min(hit.end + 1, exon[1]), exon[1])
                transcript.combined_utr.append(right)
            assert left or right, (exon, hit)
            continue

        # More than one CDS segment in the exon: legitimate only with multiple ORFs.
        if len(hits) > len(transcript.internal_orfs):
            message = "Found in {} an exon ({}) which is overlapping with more CDS segments than there are ORFs."
            raise InvalidCDS(message.format(transcript.id, exon))
        for tree in orf_trees:
            within_orf = tree.find(exon[0], exon[1])
            if len(within_orf) > 1:
                message = "Found in {} an exon ({}) which is overlapping with more CDS segments in a single ORF."
                raise InvalidCDS(message.format(transcript.id, exon))

        # The internal UTR is acceptable: collect the flanks and the gaps between the CDSs.
        log.debug("Starting to find the UTRs for %s", exon)
        hits = sorted(hits)
        final = len(hits) - 1
        gaps = []
        for position, current in enumerate(hits):
            if position == final:
                if exon[1] > current.end:
                    gaps.append((min(exon[1], current.end + 1), exon[1]))
                continue
            if position == 0 and exon[0] < current.start:
                gaps.append((exon[0], max(exon[0], current.start - 1)))
            following = hits[position + 1]
            if current.end + 1 > following.start - 1:
                raise InvalidCDS(
                    "Error while inferring the UTR for a transcript with multiple ORFs: overlapping CDS found.")
            gaps.append((current.end + 1, following.start - 1))
        assert gaps, hits
        utr_total = sum(gap[1] - gap[0] + 1 for gap in gaps)
        cds_total = sum(segment.end - segment.start + 1 for segment in hits)
        exon_length = exon[1] - exon[0] + 1
        assert utr_total + cds_total == exon_length, (utr_total, cds_total,
                                                      exon_length, gaps, hits)
        transcript.combined_utr.extend(gaps)

    # Either there is neither CDS nor UTR ...
    equality_one = (transcript.combined_cds_length == transcript.combined_utr_length == 0)
    # ... or UTR and CDS together must cover the cDNA.
    equality_two = (transcript.cdna_length ==
                    transcript.combined_utr_length + transcript.combined_cds_length)
    if equality_one or equality_two:
        return

    template = "\n".join([
        "\"Failed to create the UTR:",
        "ID: {}",
        "Exons: {}",
        "Combined CDS: {}",
        "Combined UTR: {}",
        "CDS == UTR == 0: {}",
        "CDNA == CDS + UTR: {}",
        "CDNA == {}",
        "CDS == {}",
        "UTR == {}",
    ])
    raise InvalidCDS(template.format(
        transcript.id, transcript.exons, transcript.combined_cds,
        transcript.combined_utr, equality_one, equality_two,
        transcript.cdna_length, transcript.combined_cds_length,
        transcript.combined_utr_length))
예제 #5
0
def _check_cdna_vs_utr(transcript):
    """
    Check that UTR and CDS account for the whole cDNA, rebuilding the UTR when they do not.

    Nothing is changed unless the cDNA is longer than UTR + CDS. In that case the UTR is
    emptied and recomputed by comparing each exon with the first and last CDS segments.

    :param transcript: the transcript to check; ``combined_utr`` and ``combined_cds`` may be
        reassigned.
    :return: None
    :raises InvalidCDS: if an exon implies a UTR region between two CDS segments, or if the
        lengths still do not add up after the UTR has been rebuilt.
    """

    transcript.logger.debug("Checking the cDNA for %s", transcript.id)
    if not transcript.cdna_length > transcript.combined_utr_length + transcript.combined_cds_length:
        return

    if transcript.combined_utr == transcript.combined_cds == []:
        # Neither CDS nor UTR: the transcript is non-coding.
        transcript.logger.debug("%s is non coding, returning",
                                transcript.id)
        return
    assert transcript.combined_cds != []
    transcript.logger.debug("Recalculating the UTR for %s", transcript.id)
    transcript.combined_utr = []  # Start from scratch
    transcript.combined_cds = sorted(transcript.combined_cds,
                                     key=operator.itemgetter(0, 1))

    for exon in transcript.exons:
        assert isinstance(exon, tuple)
        if exon in transcript.combined_cds:
            # Completely coding exon.
            continue
        first_cds = transcript.combined_cds[0]
        last_cds = transcript.combined_cds[-1]

        if exon[1] < first_cds[0] or exon[0] > last_cds[1]:
            # The exon lies entirely outside of the CDS span: it is UTR as a whole.
            transcript.combined_utr.append(exon)
        elif exon[0] < first_cds[0] and exon[1] == first_cds[1]:
            # The exon ends together with the first CDS segment: UTR before the CDS.
            transcript.combined_utr.append((exon[0], first_cds[0] - 1))
        elif exon[1] > last_cds[1] and exon[0] == last_cds[0]:
            # The exon starts together with the last CDS segment: UTR after the CDS.
            transcript.combined_utr.append((last_cds[1] + 1, exon[1]))
        elif len(transcript.combined_cds) == 1:
            # A single CDS segment with UTR on both of its sides.
            transcript.combined_utr.append((exon[0], first_cds[0] - 1))
            transcript.combined_utr.append((last_cds[1] + 1, exon[1]))
        else:
            # There would be an internal UTR between two CDS segments: not acceptable.
            raise InvalidCDS("Error while inferring the UTR", exon,
                             transcript.id, transcript.exons,
                             transcript.combined_cds)

    # Either there is neither CDS nor UTR ...
    equality_one = (transcript.combined_cds_length ==
                    transcript.combined_utr_length == 0)
    # ... or UTR and CDS together must cover the cDNA.
    equality_two = (
        transcript.cdna_length == transcript.combined_utr_length +
        transcript.combined_cds_length)
    if equality_one or equality_two:
        return

    template = "\n".join([
        "\"Failed to create the UTR:",
        "ID: {}",
        "Exons: {}",
        "Combined CDS: {}",
        "Combined UTR: {}",
        "CDS == UTR == 0: {}",
        "CDNA == CDS + UTR: {}",
        "CDNA == {}",
        "CDS == {}",
        "UTR == {}",
    ])
    raise InvalidCDS(template.format(
        transcript.id, transcript.exons, transcript.combined_cds,
        transcript.combined_utr, equality_one, equality_two,
        transcript.cdna_length, transcript.combined_cds_length,
        transcript.combined_utr_length))