Esempio n. 1
0
    def shift_sites(self, sites):
        """
        Map splice sites given on the original string to their positions on
        the mutated string.

        Sites present in self._removed_sites are skipped. The remaining
        sites are consumed as (acceptor, donor) pairs.

        @arg sites: Splice sites on the original string.
        @type sites: list(int)

        @return: Splice sites on the mutated string.
        @rtype: list(int)


        Example 1 (DNA): NG_012772.1(BRCA2_v001)

                  ...---------[=========]----------...
                              ^         ^
                            18964     19013

          Variant           Expected new location for splice site 18964
          g.18963del        18963
          g.18964del        18964
          g.18963_18964ins  18964
          g.18964_18965ins  18964

          Variant           Expected new location for splice site 19013
          g.19013del        19012
          g.19014del        19013
          g.19013_19014ins  19014


        Example 2 (RNA): NM_000088.3

                      ...============][==============...
                                     /\
                                  229  230

          Variant           Expected new location for splice sites 229,230
          n.228del          228,229
          n.229del          228,229
          n.230del          229,230
          n.231del          229,230
          n.228_229ins      230,231
          n.229_230ins      229,230 or 230,231
          n.230_231ins      229,230
        """

        # Two ways of shifting a site are used below:
        #
        #   plain:    shift(i)
        #   "dance":  shift(i - 1) + 1 for acceptors,
        #             shift(i + 1) - 1 for donors.
        #
        # The dance shifts the neighbouring position outside the exon and
        # steps back, so that an insertion directly before or after an exon
        # ends up inside that exon:
        #
        #   -----SPLICE[======]SPLICE----------SPLICE[=======]SPLICE-----
        #                      ^                    ^
        #                      ins                  ins
        #
        # Neither insertion hits the (biological) splice site, so both
        # should be counted as part of the exon they touch.
        #
        # For donors the dance has a second purpose. Only positions after a
        # deletion receive a shift, while a donor is stored as the position
        # of the last exon base. Without the dance, deleting that last base
        # would not decrement the donor position.

        shifted = []
        kept_sites = [site for site in sites
                      if site not in self._removed_sites]
        previous_donor = None

        for acceptor, donor in util.grouper(kept_sites):

            # The acceptor is shifted plainly (no dance) when
            # 1) this is the first pair in the list, or
            # 2) the previous exon ends directly before this one, or
            # 3) there is a deletion directly before the exon.
            #
            # Because of 2), an insertion between two directly adjacent
            # exons counts as an insertion in the first of them.
            #
            # Because of 1), an insertion directly in front of CDS start is
            # not pulled into the CDS. Translation start is affected in the
            # same way, which should be no problem.
            plain_acceptor = (not previous_donor
                              or previous_donor == acceptor - 1
                              or self._shift_minus_at(acceptor))
            if plain_acceptor:
                new_acceptor = self.shift(acceptor)
            else:
                new_acceptor = self.shift(acceptor - 1) + 1
            shifted.append(new_acceptor)

            # Splice sites come in pairs, so a missing donor should never
            # happen.
            if not donor:
                continue

            # The very last site of the input is shifted plainly, so that
            # an insertion directly at CDS end is not pulled into the CDS.
            # Translation end is affected in the same way, which should be
            # no problem.
            if donor == sites[-1]:
                new_donor = self.shift(donor)
            else:
                new_donor = self.shift(donor + 1) - 1
            shifted.append(new_donor)

            previous_donor = donor

        return shifted
Esempio n. 2
0
    def shift_sites(self, sites):
        """
        Map splice sites given on the original string to their positions on
        the mutated string.

        Sites present in self._removed_sites are skipped. The remaining
        sites are consumed as (acceptor, donor) pairs.

        @arg sites: Splice sites on the original string.
        @type sites: list(int)

        @return: Splice sites on the mutated string.
        @rtype: list(int)


        Example 1 (DNA): NG_012772.1(BRCA2_v001)

                  ...---------[=========]----------...
                              ^         ^
                            18964     19013

          Variant           Expected new location for splice site 18964
          g.18963del        18963
          g.18964del        18964
          g.18963_18964ins  18964
          g.18964_18965ins  18964

          Variant           Expected new location for splice site 19013
          g.19013del        19012
          g.19014del        19013
          g.19013_19014ins  19014


        Example 2 (RNA): NM_000088.3

                      ...============][==============...
                                     /\
                                  229  230

          Variant           Expected new location for splice sites 229,230
          n.228del          228,229
          n.229del          228,229
          n.230del          229,230
          n.231del          229,230
          n.228_229ins      230,231
          n.229_230ins      229,230 or 230,231
          n.230_231ins      229,230
        """

        # Two ways of shifting a site are used below:
        #
        #   plain:    shift(i)
        #   "dance":  shift(i - 1) + 1 for acceptors,
        #             shift(i + 1) - 1 for donors.
        #
        # The dance shifts the neighbouring position outside the exon and
        # steps back, so that an insertion directly before or after an exon
        # ends up inside that exon:
        #
        #   -----SPLICE[======]SPLICE----------SPLICE[=======]SPLICE-----
        #                      ^                    ^
        #                      ins                  ins
        #
        # Neither insertion hits the (biological) splice site, so both
        # should be counted as part of the exon they touch.
        #
        # For donors the dance has a second purpose. Only positions after a
        # deletion receive a shift, while a donor is stored as the position
        # of the last exon base. Without the dance, deleting that last base
        # would not decrement the donor position.

        shifted = []
        kept_sites = [site for site in sites
                      if site not in self._removed_sites]
        previous_donor = None

        for acceptor, donor in util.grouper(kept_sites):

            # The acceptor is shifted plainly (no dance) when
            # 1) this is the first pair in the list, or
            # 2) the previous exon ends directly before this one, or
            # 3) there is a deletion directly before the exon.
            #
            # Because of 2), an insertion between two directly adjacent
            # exons counts as an insertion in the first of them.
            #
            # Because of 1), an insertion directly in front of CDS start is
            # not pulled into the CDS. Translation start is affected in the
            # same way, which should be no problem.
            plain_acceptor = (not previous_donor
                              or previous_donor == acceptor - 1
                              or self._shift_minus_at(acceptor))
            if plain_acceptor:
                new_acceptor = self.shift(acceptor)
            else:
                new_acceptor = self.shift(acceptor - 1) + 1
            shifted.append(new_acceptor)

            # Splice sites come in pairs, so a missing donor should never
            # happen.
            if not donor:
                continue

            # The very last site of the input is shifted plainly, so that
            # an insertion directly at CDS end is not pulled into the CDS.
            # Translation end is affected in the same way, which should be
            # no problem.
            if donor == sites[-1]:
                new_donor = self.shift(donor)
            else:
                new_donor = self.shift(donor + 1) - 1
            shifted.append(new_donor)

            previous_donor = donor

        return shifted
Esempio n. 3
0
    def checkRecord(self):
        """
        Check if the record in self.record is compatible with mutalyzer.
        Update the mRNA PList with the exon and CDS data.

        For every transcript that has no mRNA yet, the mRNA is taken from
        the exon list when that list is usable, otherwise from the CDS, and
        otherwise a new PList carrying the gene location is created.
        After that a Crossmap is stored in the transcript's CM attribute
        when an mRNA position list is available; if not, the transcript
        description is set to "?".

        Warnings are reported through the output object. Nothing is
        returned; the genes and transcripts of the record are modified in
        place.

        @todo: This function should really check the record for minimal
        requirements
        """
        for gene in self.record.geneList:
            for transcript in gene.transcriptList:
                if not transcript.mRNA:
                    usableExonList = self.__checkExonList(transcript.exon, transcript.CDS)

                    if self.record.molType == "n" and transcript.exon:
                        # Ignore the first and the last position; every
                        # remaining pair of positions must be consecutive.
                        innerPositions = transcript.exon.positionList[1:-1]
                        if not all(p1 + 1 == p2 for p1, p2 in util.grouper(innerPositions)):
                            code = "WEXON_ANNOTATION" if transcript.current else "WEXON_ANNOTATION_OTHER"
                            self.__output.addMessage(
                                __file__,
                                2,
                                code,
                                "Exons for gene %s, transcript variant %s were "
                                "found not to be adjacent. This signifies a "
                                "possible problem in the annotation of the "
                                "reference sequence." % (gene.name, transcript.name),
                            )

                    if transcript.exon and usableExonList:
                        # Usable exons serve as the mRNA. No warning is
                        # issued for this case.
                        transcript.mRNA = transcript.exon
                    else:
                        if self.record.molType == "g":
                            code = "WNOMRNA" if transcript.current else "WNOMRNA_OTHER"
                            self.__output.addMessage(
                                __file__,
                                2,
                                code,
                                "No mRNA field found for gene %s, transcript "
                                "variant %s in record, constructing "
                                "it from CDS. Please note that descriptions "
                                "exceeding CDS boundaries are invalid." % (gene.name, transcript.name),
                            )
                        if transcript.exon and transcript.exon.positionList and not usableExonList:
                            code = "WNOMRNA" if transcript.current else "WNOMRNA_OTHER"
                            self.__output.addMessage(
                                __file__,
                                2,
                                code,
                                "Exons were found for gene %s, transcript "
                                "variant %s but were not usable. "
                                "Please note that descriptions "
                                "exceeding CDS boundaries are invalid." % (gene.name, transcript.name),
                            )
                        if transcript.CDS:
                            # The mRNA and the CDS are one and the same
                            # object from here on, so a position list set
                            # through the mRNA is also set on the CDS.
                            transcript.mRNA = transcript.CDS
                            if not transcript.CDS.positionList:
                                transcript.mRNA.positionList = transcript.CDS.location
                            transcript.linkMethod = "construction"
                            transcript.transcribe = True
                            transcript.translate = True
                        else:
                            self.__output.addMessage(
                                __file__,
                                2,
                                "WNOCDS",
                                "No CDS found for gene %s, transcript "
                                "variant %s in record, "
                                "constructing it from gene location." % (gene.name, transcript.name),
                            )
                            # Only the location is copied from the gene; the
                            # position list is filled in below.
                            transcript.CDS = None
                            transcript.mRNA = PList()
                            transcript.mRNA.location = gene.location
                            transcript.molType = "n"

                # Fall back on the location when no position list is known.
                if not transcript.mRNA.positionList:
                    transcript.mRNA.positionList = transcript.mRNA.location

                if (
                    transcript.mRNA.positionList
                    and transcript.CDS
                    and transcript.CDS.positionList is not None
                ):
                    if not transcript.CDS.positionList:
                        # The mRNA position list is guaranteed to be
                        # non-empty by the enclosing condition, so it can be
                        # used directly to construct the CDS position list.
                        transcript.CDS.positionList = self.__constructCDS(
                            transcript.mRNA.positionList, transcript.CDS.location
                        )
                        transcript.transcribe = True
                        transcript.translate = True
                    transcript.CM = Crossmap.Crossmap(
                        transcript.mRNA.positionList, transcript.CDS.location, gene.orientation
                    )
                else:
                    # No CDS information to work with: build the crossmap
                    # with an empty CDS, or give up on this transcript.
                    transcript.molType = "n"
                    if transcript.mRNA.positionList:
                        transcript.CM = Crossmap.Crossmap(transcript.mRNA.positionList, [], gene.orientation)
                        transcript.transcribe = True
                    else:
                        transcript.description = "?"
Esempio n. 4
0
    def checkRecord(self):
        """
        Check if the record in self.record is compatible with mutalyzer.
        Update the mRNA PList with the exon and CDS data.

        For every transcript that has no mRNA yet, the mRNA is taken from
        the exon list when that list is usable, otherwise from the CDS, and
        otherwise a new PList carrying the gene location is created.
        After that a Crossmap is stored in the transcript's CM attribute
        when an mRNA position list is available; if not, the transcript
        description is set to '?'.

        Warnings are reported through the output object. Nothing is
        returned; the genes and transcripts of the record are modified in
        place.

        @todo: This function should really check the record for minimal
        requirements
        """
        for gene in self.record.geneList:
            for transcript in gene.transcriptList:
                if not transcript.mRNA:
                    usableExonList = self.__checkExonList(transcript.exon,
                                                          transcript.CDS)

                    if self.record.molType == 'n' and transcript.exon:
                        # Ignore the first and the last position; every
                        # remaining pair of positions must be consecutive.
                        innerPositions = transcript.exon.positionList[1:-1]
                        if not all(p1 + 1 == p2 for p1, p2 in
                                   util.grouper(innerPositions)):
                            code = 'WEXON_ANNOTATION' if transcript.current \
                                else 'WEXON_ANNOTATION_OTHER'
                            self.__output.addMessage(
                                __file__, 2, code,
                                "Exons for gene %s, transcript variant %s were "
                                "found not to be adjacent. This signifies a "
                                "possible problem in the annotation of the "
                                "reference sequence." % (
                                    gene.name, transcript.name))

                    if transcript.exon and usableExonList:
                        # Usable exons serve as the mRNA. No warning is
                        # issued for this case.
                        transcript.mRNA = transcript.exon
                    else:
                        if self.record.molType == 'g':
                            code = 'WNOMRNA' if transcript.current \
                                else 'WNOMRNA_OTHER'
                            self.__output.addMessage(
                                __file__, 2, code,
                                "No mRNA field found for gene %s, transcript "
                                "variant %s in record, constructing "
                                "it from CDS. Please note that descriptions "
                                "exceeding CDS boundaries are invalid." % (
                                    gene.name, transcript.name))
                        if transcript.exon and transcript.exon.positionList \
                                and not usableExonList:
                            code = 'WNOMRNA' if transcript.current \
                                else 'WNOMRNA_OTHER'
                            self.__output.addMessage(
                                __file__, 2, code,
                                "Exons were found for gene %s, transcript "
                                "variant %s but were not usable. "
                                "Please note that descriptions "
                                "exceeding CDS boundaries are invalid." % (
                                    gene.name, transcript.name))
                        if transcript.CDS:
                            # The mRNA and the CDS are one and the same
                            # object from here on, so a position list set
                            # through the mRNA is also set on the CDS.
                            transcript.mRNA = transcript.CDS
                            if not transcript.CDS.positionList:
                                transcript.mRNA.positionList = \
                                    transcript.CDS.location
                            transcript.linkMethod = "construction"
                            transcript.transcribe = True
                            transcript.translate = True
                        else:
                            self.__output.addMessage(
                                __file__, 2, "WNOCDS",
                                "No CDS found for gene %s, transcript "
                                "variant %s in record, "
                                "constructing it from gene location." % (
                                    gene.name, transcript.name))
                            # Only the location is copied from the gene; the
                            # position list is filled in below.
                            transcript.CDS = None
                            transcript.mRNA = PList()
                            transcript.mRNA.location = gene.location
                            transcript.molType = 'n'

                # Fall back on the location when no position list is known.
                if not transcript.mRNA.positionList:
                    transcript.mRNA.positionList = transcript.mRNA.location

                if transcript.mRNA.positionList and transcript.CDS and \
                        transcript.CDS.positionList is not None:
                    if not transcript.CDS.positionList:
                        # The mRNA position list is guaranteed to be
                        # non-empty by the enclosing condition, so it can be
                        # used directly to construct the CDS position list.
                        transcript.CDS.positionList = self.__constructCDS(
                            transcript.mRNA.positionList,
                            transcript.CDS.location)
                        transcript.transcribe = True
                        transcript.translate = True
                    transcript.CM = Crossmap.Crossmap(
                        transcript.mRNA.positionList,
                        transcript.CDS.location, gene.orientation)
                else:
                    # No CDS information to work with: build the crossmap
                    # with an empty CDS, or give up on this transcript.
                    transcript.molType = 'n'
                    if transcript.mRNA.positionList:
                        transcript.CM = Crossmap.Crossmap(
                            transcript.mRNA.positionList, [],
                            gene.orientation)
                        transcript.transcribe = True
                    else:
                        transcript.description = '?'