def overlapping(self, release, chromosome, start, end, padding=0):
    """Return ``RegElement`` objects overlapping the padded query interval.

    An element is selected when its ``start`` is at most ``end + padding``
    and its ``end`` is at least ``start - padding`` (clamped at 0), on the
    given ``release`` and ``chromosome``.

    The bin pre-selection is computed over the same padded interval as the
    coordinate filters.  Using the unpadded interval for the bins would drop
    elements that lie only in the padding region and are stored in a bin
    that does not overlap the unpadded query.

    :param release: value matched against the ``release`` field
    :param chromosome: value matched against the ``chromosome`` field
    :param start: first position of the query interval (presumably 1-based
        inclusive, given the ``start - 1`` handed to ``binning`` -- confirm)
    :param end: last position of the query interval
    :param padding: number of positions by which the query is widened on
        both sides
    :return: the filtered query set
    """
    padded_start = max(0, start - padding)
    padded_end = end + padding

    # ``binning`` works on zero-based half-open intervals, hence ``- 1`` on
    # the lower bound.  Clamp both bounds so that padding alone can never
    # push the interval outside the range the binning scheme supports; an
    # ``end`` that is out of range by itself is still passed through
    # unchanged.
    bin_start = max(0, start - 1 - padding)
    bin_stop = min(padded_end, max(end, binning.MAX_POSITION + 1))

    return (super().all().filter(
        release=release,
        chromosome=chromosome,
        bin__in=binning.overlapping_bins(bin_start, bin_stop),
        start__lte=padded_end,
        end__gte=padded_start,
    ))
def test_overlapping(intervals, interval):
    """Check that bin-based pre-selection keeps every overlapping interval.

    ``intervals`` is an iterable of ``(start, stop)`` pairs and ``interval``
    is the ``(start, stop)`` query.  Every pair that overlaps the query must
    also be picked up when selecting by overlapping bins.
    """
    start, stop = interval

    # Reference answer: pairs that really overlap the query interval.
    truly_overlapping = {
        (lo, hi) for lo, hi in intervals if lo < stop and start < hi
    }

    # Candidates found through the bin each pair is assigned to.
    preselected = set()
    for lo, hi in intervals:
        if binning.assign_bin(lo, hi) in binning.overlapping_bins(start, stop):
            preselected.add((lo, hi))

    assert truly_overlapping <= preselected
Exemple #3
0
    def chrom2c(self, variant, rt, gene=None):
        """
        Describe a chromosomal variant on transcript mappings.

        @arg variant: a variant description
        @type variant: unicode
        @arg rt: the return type; "list" gives the flat list, any other
            value gives the dictionary keyed by gene name
        @type rt: unicode
        @kwarg gene: Optional gene name. If given, return variant descriptions
            on all transcripts for this gene; otherwise transcripts are
            selected by position, 5000 positions around the variant(s).
        @type gene: unicode

        @return: None when an error message was added to the output object;
            otherwise the descriptions, either as a list or grouped per gene
        @rtype: dictionary or list
        """
        if not self._parseInput(variant):
            return None

        acc = self.parseTree.LrgAcc or self.parseTree.RefSeqAcc
        accession = '%s.%s' % (acc, self.parseTree.Version)

        chromosome = Chromosome.query.filter_by(
            assembly=self.assembly, accession=accession).first()
        if not chromosome:
            self.__output.addMessage(
                __file__, 4, "ENOTINDB",
                "Accession number %s could not be found in our database or is "
                "not suitable for the requested conversion." % acc)
            return None

        # An allele set yields one raw variant per member; a plain
        # description yields exactly one.
        allele_set = self.parseTree.SingleAlleleVarSet
        if allele_set:
            raw_vars = [member.RawVar for member in allele_set]
        else:
            raw_vars = [self.parseTree.RawVar]

        # Determine the overall range spanned by all raw variants.
        lowest, highest = 9000000000, 0
        for raw_var in raw_vars:
            #FIXME This should be a proper conversion.
            first = int(raw_var.StartLoc.PtLoc.Main)
            last = int(raw_var.EndLoc.PtLoc.Main) if raw_var.EndLoc else first

            if last < first:
                self.__output.addMessage(
                    __file__, 3, 'ERANGE', 'End position is '
                    'smaller than the begin position.')
                return None

            lowest = min(lowest, first)
            highest = max(highest, last)

        if gene:
            mappings = chromosome.transcript_mappings.filter_by(gene=gene)
        else:
            # Select by position, using the bins overlapping the range
            # widened by 5000 on each side as a pre-filter.
            start = max(lowest - 5000, 1)
            stop = min(highest + 5000, binning.MAX_POSITION + 1)
            ordering = (
                TranscriptMapping.start, TranscriptMapping.stop,
                TranscriptMapping.gene, TranscriptMapping.accession,
                TranscriptMapping.version, TranscriptMapping.transcript)
            mappings = chromosome.transcript_mappings.filter(
                TranscriptMapping.bin.in_(
                    binning.overlapping_bins(start - 1, stop)),
                TranscriptMapping.start <= stop,
                TranscriptMapping.stop >= start).order_by(*ordering)

        per_gene = defaultdict(list)
        flat = []
        for mapping in mappings:
            self._reset()
            self.mapping = mapping
            core_mapping = self._coreMapping()
            if not core_mapping:
                # Nothing came back for this transcript; move on.
                continue

            reference = self.mapping.reference
            gene_name = self.mapping.gene
            on_forward = self.mapping.orientation == 'forward'

            # A plain CDS check decides between c. and n. notation. An
            # earlier check based on crossmap.info() was dropped because it
            # broke n. notation on non-coding mtDNA transcripts.
            mtype = 'c' if self.crossmap.CDS else 'n'

            parts = []
            for raw_var, cmap in zip(raw_vars, core_mapping):
                try:
                    forward_change = _construct_change(raw_var)
                    reverse_change = _construct_change(raw_var, reverse=True)
                except NotImplementedError as e:
                    self.__output.addMessage(__file__,
                                             4, "ENOTIMPLEMENTEDERROR",
                                             unicode(e))
                    return None

                start_pos = self.crossmap.tuple2string(
                    (cmap.startmain, cmap.startoffset))
                end_pos = self.crossmap.tuple2string(
                    (cmap.endmain, cmap.endoffset))

                # On the reverse orientation the reversed change is used and
                # the two positions trade places.
                change, left, right = forward_change, start_pos, end_pos
                if not on_forward:
                    change, left, right = reverse_change, end_pos, start_pos

                if cmap.start_g != cmap.end_g:
                    location = "%s_%s" % (left, right)
                else:
                    location = "%s" % left

                parts.append('%s%s' % (location, change))

            mutation = (parts[0] if len(parts) == 1
                        else '[' + ';'.join(parts) + ']')

            description = "%s:%c.%s" % (reference, mtype, mutation)
            per_gene[gene_name].append(description)
            flat.append(description)

        return flat if rt == "list" else per_gene
Exemple #4
0
    def chrom2c(self, variant, rt, gene=None):
        """
        Convert a chromosomal variant description to transcript notation.

        @arg variant: a variant description
        @type variant: unicode
        @arg rt: the return type; the list is returned for "list", the
            dictionary for every other value
        @type rt: unicode
        @kwarg gene: Optional gene name. If given, return variant descriptions
            on all transcripts for this gene. If omitted, transcripts are
            looked up by position with a margin of 5000 on both sides.
        @type gene: unicode

        @return: None if a message was reported through the output object;
            else a list of descriptions, or a dictionary mapping each gene
            name to its list of descriptions
        @rtype: dictionary or list
        """
        if not self._parseInput(variant):
            return None

        acc = self.parseTree.LrgAcc or self.parseTree.RefSeqAcc
        full_accession = '%s.%s' % (acc, self.parseTree.Version)

        chromosome = Chromosome.query.filter_by(
            assembly=self.assembly, accession=full_accession).first()
        if not chromosome:
            self.__output.addMessage(
                __file__, 4, "ENOTINDB",
                "Accession number %s could not be found in our database or is "
                "not suitable for the requested conversion." % acc)
            return None

        # Collect the raw variants: one per allele-set member, or the single
        # top-level one.
        allele_set = self.parseTree.SingleAlleleVarSet
        if allele_set:
            raw_variants = [item.RawVar for item in allele_set]
        else:
            raw_variants = [self.parseTree.RawVar]

        # Track the smallest begin and largest end over all raw variants.
        range_begin = 9000000000
        range_end = 0
        for raw in raw_variants:
            #FIXME This should be a proper conversion.
            begin = int(raw.StartLoc.PtLoc.Main)
            end = int(raw.EndLoc.PtLoc.Main) if raw.EndLoc else begin

            if end < begin:
                self.__output.addMessage(
                    __file__, 3, 'ERANGE', 'End position is '
                    'smaller than the begin position.')
                return None

            range_begin = min(range_begin, begin)
            range_end = max(range_end, end)

        if gene:
            mappings = chromosome.transcript_mappings.filter_by(gene=gene)
        else:
            # Positional lookup: bins overlapping the widened range act as a
            # pre-filter next to the start/stop comparison.
            start = max(range_begin - 5000, 1)
            stop = min(range_end + 5000, binning.MAX_POSITION + 1)
            candidate_bins = binning.overlapping_bins(start - 1, stop)
            positional = chromosome.transcript_mappings.filter(
                TranscriptMapping.bin.in_(candidate_bins),
                TranscriptMapping.start <= stop,
                TranscriptMapping.stop >= start)
            mappings = positional.order_by(
                TranscriptMapping.start,
                TranscriptMapping.stop,
                TranscriptMapping.gene,
                TranscriptMapping.accession,
                TranscriptMapping.version,
                TranscriptMapping.transcript)

        descriptions_by_gene = defaultdict(list)
        all_descriptions = []
        for mapping in mappings:
            self._reset()
            self.mapping = mapping
            core_mapping = self._coreMapping()
            if not core_mapping:
                # No core mapping for this transcript; skip it.
                continue

            reference = self.mapping.reference
            gene_name = self.mapping.gene
            is_forward = self.mapping.orientation == 'forward'

            # Coding transcripts get c. notation, all others n. notation.
            # This simple CDS check replaced a crossmap.info() based one
            # that broke n. notation on non-coding mtDNA transcripts.
            mtype = 'c' if self.crossmap.CDS else 'n'

            pieces = []
            for raw, cmap in zip(raw_variants, core_mapping):
                try:
                    plain_change = _construct_change(raw)
                    flipped_change = _construct_change(raw, reverse=True)
                except NotImplementedError as e:
                    self.__output.addMessage(__file__, 4,
                                             "ENOTIMPLEMENTEDERROR",
                                             unicode(e))
                    return None

                begin_text = self.crossmap.tuple2string(
                    (cmap.startmain, cmap.startoffset))
                end_text = self.crossmap.tuple2string(
                    (cmap.endmain, cmap.endoffset))

                # For a non-forward orientation, use the reversed change and
                # swap the two position strings.
                if is_forward:
                    change = plain_change
                else:
                    change = flipped_change
                    begin_text, end_text = end_text, begin_text

                if cmap.start_g != cmap.end_g:
                    location = "%s_%s" % (begin_text, end_text)
                else:
                    location = "%s" % begin_text

                pieces.append('%s%s' % (location, change))

            # A single change stands alone; several are bracketed and
            # separated by semicolons.
            if len(pieces) == 1:
                mutation = pieces[0]
            else:
                mutation = '[' + ';'.join(pieces) + ']'

            description = "%s:%c.%s" % (reference, mtype, mutation)
            descriptions_by_gene[gene_name].append(description)
            all_descriptions.append(description)

        if rt == "list":
            return all_descriptions
        return descriptions_by_gene
def test_overlapping_bins(start, stop, expected):
    """Check ``binning.overlapping_bins(start, stop)`` against ``expected``."""
    bins = binning.overlapping_bins(start, stop)
    assert bins == expected