Beispiel #1
0
def _fix_stop_codon(transcript):
    """Fold a stop codon annotated outside of the CDS back into the CDS.

    Transcripts coming from GTF2 carry the stop codon as a feature of its own,
    lying (incorrectly, for our purposes) outside of the CDS. This private
    function edits ``transcript`` in place:

    * if the stop codon segment closest to the CDS is directly adjacent to the
      terminal CDS segment (the first one on the "-" strand, the last one
      otherwise), that CDS segment is stretched over it and its entry in
      ``phases`` is re-keyed under the new coordinates;
    * any stop codon segment left is added to ``combined_cds`` as a segment of
      its own;
    * UTR segments overlapping the terminal CDS segment are trimmed, or
      removed when the overlap equals the length of the UTR segment.

    :param transcript: object exposing ``strand``, ``stop_codon``,
        ``combined_cds``, ``combined_utr`` and ``phases``.
    :return: the very same ``transcript`` object.
    :raises InvalidCDS: if ``overlap`` reports a value greater than 3 between
        a UTR segment and the terminal CDS segment.
    """

    if transcript.strand != "-":
        # Forward strand: the stop codon follows the last CDS segment.
        if transcript.stop_codon[0][0] == transcript.combined_cds[-1][1] + 1:
            last_segment = transcript.combined_cds[-1]
            carried_phase = transcript.phases.pop(last_segment, None)
            codon_part = transcript.stop_codon.pop(0)
            grown = (last_segment[0], codon_part[1])
            transcript.combined_cds[-1] = grown
            transcript.phases[grown] = carried_phase

        leftover = [tuple(part) for part in transcript.stop_codon]
        transcript.combined_cds.extend(leftover)

        for index, segment in enumerate(transcript.combined_utr):
            if segment[1] < transcript.combined_cds[0][0]:
                continue  # Lies before the first CDS segment: 5' UTR, untouched
            shared = overlap(segment, transcript.combined_cds[-1])
            if shared < 0:
                continue
            if shared > 3:
                raise InvalidCDS("Invalid overlap between UTR and CDS found")
            if shared == segment[1] - segment[0] + 1:
                # The whole UTR segment is covered: mark it for removal.
                trimmed = None
            else:
                trimmed = (min(segment[1], transcript.combined_cds[-1][1] + 1),
                           segment[1])
            transcript.combined_utr[index] = trimmed
    else:
        # Reverse strand: the stop codon precedes the first CDS segment.
        # It is merged only when it actually abuts that segment.
        if transcript.stop_codon[-1][1] == transcript.combined_cds[0][0] - 1:
            first_segment = transcript.combined_cds[0]
            carried_phase = transcript.phases.pop(first_segment, None)
            codon_part = transcript.stop_codon.pop(-1)
            grown = (codon_part[0], first_segment[1])
            transcript.combined_cds[0] = grown
            transcript.phases[grown] = carried_phase

        leftover = [tuple(part) for part in transcript.stop_codon]
        transcript.combined_cds = leftover + transcript.combined_cds

        for index, segment in enumerate(transcript.combined_utr):
            if segment[0] > transcript.combined_cds[-1][1]:
                continue  # Lies after the last CDS segment: 5' UTR, untouched
            shared = overlap(segment, transcript.combined_cds[0])
            if shared < 0:
                continue
            if shared > 3:
                raise InvalidCDS("Invalid overlap between UTR and CDS found")
            if shared == segment[1] - segment[0] + 1:
                # The whole UTR segment is covered: mark it for removal.
                trimmed = None
            else:
                trimmed = (segment[0],
                           max(segment[0], transcript.combined_cds[0][0] - 1))
            transcript.combined_utr[index] = trimmed

    # Drop the UTR segments marked for removal above.
    surviving = [segment for segment in transcript.combined_utr
                 if segment is not None]
    transcript.combined_utr = surviving
    return transcript
    def _analyse_cDNAs(self, cdnas, beds, peps):
        """Compare the exon structures of two cDNAs and, if possible, of their CDS.

        A CIGAR is obtained for the two cDNAs through
        ``transfer.get_and_prepare_cigar``; the blocks of both BED records are
        projected onto it with ``self.transfer_exon_coordinates`` and scored
        with ``array_compare``. The same is then done for the coding part of
        the blocks, but only when the cDNA identity is above zero, both BED
        records are flagged as coding and both peptides are non-empty; the
        coding identity is derived from the CIGAR of the two peptides.

        :param cdnas: pair of cDNA sequences (first and second transcript).
        :param beds: pair of BED-like records for the two transcripts; the
            code reads ``blocks``, ``coding``, ``thick_start``, ``thick_end``,
            ``chrom`` and ``name`` from them.
        :param peps: pair of peptide sequences for the two transcripts.
        :return: the tuple ``(c_t1_exons, c_t2_exons, identity, result, ccode,
            c_t1_coding, c_t2_coding, coding_identity, coding_result,
            coding_ccode)``. ``identity`` and ``coding_identity`` are
            percentages rounded to two decimals; ``result`` and
            ``coding_result`` are 2x3 arrays and ``ccode``/``coding_ccode``
            integers, all taken from the output of ``array_compare``. When the
            coding comparison is skipped, or the peptides share no matching
            position, the coding fields are ``"NA"``, ``"NA"``, ``0``, a 2x3
            array of zeros and ``0``.
            NOTE(review): the meaning of the six values in the 2x3 arrays is
            defined by ``array_compare``, which is not visible here.
        :raises ValueError: if ``self.transfer_exon_coordinates`` fails with a
            ``ValueError`` or an ``AssertionError`` (chained as the cause).
        :raises AssertionError: if the end of the last block of a BED record
            differs from the length of its cDNA, or if a coding record yields
            no coding block.
        """

        def coding_blocks(bed):
            # Clip the blocks to the thick interval of the record, keeping only
            # those for which ``overlap`` with that interval is positive.
            thick = (bed.thick_start - 1, bed.thick_end)
            return [(max(thick[0], block[0]), min(thick[1], block[1]))
                    for block in bed.blocks
                    if overlap(block, thick) > 0]

        # Only the CIGAR is used; the first returned element is not needed.
        _, cigar = transfer.get_and_prepare_cigar(*cdnas)

        t1bed, t2bed = beds

        assert sorted(t1bed.blocks)[-1][1] == len(cdnas[0]), (t1bed.blocks,
                                                              len(cdnas[0]))
        assert sorted(t2bed.blocks)[-1][1] == len(cdnas[1]), (t2bed.blocks,
                                                              len(cdnas[1]))

        try:
            c_t1_exons, c_t2_exons, common = self.transfer_exon_coordinates(
                cigar, t1bed.blocks, t2bed.blocks)
        except (ValueError, AssertionError) as exc:
            # Callers get a single exception type; keep the original as cause.
            raise ValueError(exc) from exc

        # Common: list(zip(query_array, target_array))

        identical = sum(length for length, op in cigar if op == "=")
        if identical == 0:
            identity = 0
        else:
            identity = round(100 * identical / len(common), 2)
        result = array_compare(np.ravel(np.array(c_t1_exons)),
                               np.ravel(np.array(c_t2_exons)), identity)
        # The last value is the class code, the six before it the 2x3 table.
        result, ccode = result[:-1].reshape((2, 3)), int(result[-1])

        # Values reported whenever the CDS cannot be compared meaningfully.
        c_t1_coding = "NA"
        c_t2_coding = "NA"
        coding_identity = 0
        coding_result = np.zeros((2, 3))
        coding_ccode = 0

        # Now that we have analysed the cDNAs, it is time for the CDS
        if identity > 0 and t1bed.coding and t2bed.coding and all(peps):

            t1_coding_exons = coding_blocks(t1bed)
            assert t1_coding_exons, (t1bed.blocks, t1bed.block_starts,
                                     t1bed.block_sizes, t1bed.thick_start,
                                     t1bed.thick_end)
            t2_coding_exons = coding_blocks(t2bed)
            assert t2_coding_exons

            query_array, target_array = list(zip(*common))
            t1_transferred = transfer.transfer_exons(t1_coding_exons,
                                                     query_array)
            t2_transferred = transfer.transfer_exons(t2_coding_exons,
                                                     target_array)

            t1pep, t2pep = peps

            self.log.debug("CDS: %s:%s-%s: %s", t2bed.chrom,
                           t2bed.thick_start - 2, t2bed.thick_end, t2pep)

            _, coding_cigar = transfer.get_and_prepare_cigar(t1pep, t2pep)
            coding_common = transfer.cigar_length_in_common(coding_cigar)
            coding_identical = sum(length for length, op in coding_cigar
                                   if op in ("=", "M"))
            if coding_identical == 0:
                # Nothing to compare: keep the "NA" defaults set above.
                self.log.warning(
                    "No protein overlap at all for %s and %s.\nProtein 1: %s\nProtein 2: %s",
                    t1bed.name, t2bed.name, t1pep, t2pep)
            else:
                c_t1_coding = t1_transferred
                c_t2_coding = t2_transferred
                coding_identity = round(100 * coding_identical / coding_common,
                                        2)

                # ``np.int`` was removed from NumPy; the builtin ``int`` is the
                # exact type that alias used to point to.
                coding_result = array_compare(
                    np.ravel(np.array(c_t1_coding, dtype=int)),
                    np.ravel(np.array(c_t2_coding, dtype=int)),
                    coding_identity)
                coding_result, coding_ccode = coding_result[:-1].reshape(
                    (2, 3)), int(coding_result[-1])

        return (c_t1_exons, c_t2_exons, identity, result, ccode, c_t1_coding,
                c_t2_coding, coding_identity, coding_result, coding_ccode)