Example #1
0
def test_normalize_chromosome():
    """
    Check that normalize_chromosome maps equivalent chromosome names to a
    single canonical string and raises on unusable input.
    """
    # X chromosome: with and without "chr" prefix, upper and lower case
    assert normalize_chromosome("X") == "X"
    assert normalize_chromosome("chrX") == "X"
    assert normalize_chromosome("x") == "X"

    # numbered chromosome given as an int, a string, or with "chr" prefix
    assert normalize_chromosome(1) == "1"
    assert normalize_chromosome("1") == "1"
    assert normalize_chromosome("chr1") == "1"

    # every mitochondrial spelling is expected to come back as "MT"
    assert normalize_chromosome("chrM") == "MT"
    assert normalize_chromosome("chrMT") == "MT"
    assert normalize_chromosome("M") == "MT"
    assert normalize_chromosome("MT") == "MT"

    # assert_raises is used purely as a context manager, so it receives
    # only the expected exception type (no callable argument)
    with assert_raises(TypeError):
        normalize_chromosome({"a": "b"})

    with assert_raises(TypeError):
        normalize_chromosome([])

    with assert_raises(TypeError):
        normalize_chromosome(None)

    # values of an accepted type which cannot name a chromosome
    with assert_raises(ValueError):
        normalize_chromosome("")

    with assert_raises(ValueError):
        normalize_chromosome(0)
Example #2
0
    def __init__(
            self,
            contig,
            start,
            ref,
            alt,
            ensembl=ensembl_grch38,
            allow_extended_nucleotides=False,
            normalize_contig_name=True):
        """
        Create a Variant from a contig, a position, and ref/alt alleles.

        Parameters
        ----------
        contig : str
            Name of the chromosome this variant lies on

        start : int
            1-based chromosomal position of the first reference nucleotide

        ref : str
            Reference nucleotide(s)

        alt : str
            Alternate nucleotide(s)

        ensembl : Genome, int, or str
            Source of gene/transcript annotations: a pyensembl Genome
            object, an Ensembl release number, or a reference genome name

        allow_extended_nucleotides : bool
            Extended nucleotides include 'Y' for pyrimidines or 'N' for any
            base

        normalize_contig_name : bool
            When True (the default) the contig name is normalized by
            trimming a 'chr' prefix and converting all letters to
            upper-case. Pass False to keep the contig exactly as given.

        Raises
        ------
        TypeError
            If `ensembl` is not an int, a str, or a Genome instance.
        """

        # caches for the lists of overlapping pyensembl Gene and Transcript
        # objects; nothing has been looked up yet
        self._genes = None
        self._transcripts = None

        # resolve whatever the caller supplied into an annotation object
        if isinstance(ensembl, Genome):
            annotation_source = ensembl
        elif isinstance(ensembl, int):
            annotation_source = cached_release(ensembl)
        elif isinstance(ensembl, str):
            annotation_source = genome_for_reference_name(ensembl)
        else:
            message = (
                "Expected ensembl to be an int, string, or pyensembl.Genome "
                "instance, got %s : %s")
            raise TypeError(message % (type(ensembl), str(ensembl)))
        self.ensembl = annotation_source

        self.normalize_contig_name = normalize_contig_name
        self.allow_extended_nucleotides = allow_extended_nucleotides

        # keep the contig as given alongside the (optionally) normalized one
        self.original_contig = contig
        if normalize_contig_name:
            self.contig = normalize_chromosome(contig)
        else:
            self.contig = contig

        is_simple_substitution = (
            ref != alt and
            ref in STANDARD_NUCLEOTIDES and
            alt in STANDARD_NUCLEOTIDES)
        if is_simple_substitution:
            # Fast path for the common case: ref and alt are two different
            # members of STANDARD_NUCLEOTIDES, so they are stored untouched
            # and the variant covers exactly one position.
            self.original_ref = ref
            self.ref = ref
            self.original_alt = alt
            self.alt = alt
            position = int(start)
            self.original_start = position
            self.start = position
            self.end = position
            return

        # General case. Two views of the variant are stored:
        #    Variant.{original_ref, original_alt, original_start}
        # hold the values as they appeared in the source VCF or MAF file,
        # whereas
        #    Variant.{ref, alt, start, end}
        # hold the variant after shared prefixes/suffixes of ref and alt
        # have been trimmed away, e.g. g.10 CTT>T becomes a deletion of CT.
        #
        # The original entries keep their number of nucleotides, but
        # placeholders such as '-' and '.' are still normalized into ''.
        cleaned_ref = normalize_nucleotide_string(
            ref,
            allow_extended_nucleotides=allow_extended_nucleotides)
        self.original_ref = cleaned_ref
        cleaned_alt = normalize_nucleotide_string(
            alt,
            allow_extended_nucleotides=allow_extended_nucleotides)
        self.original_alt = cleaned_alt
        position = int(start)
        self.original_start = position

        # strip whatever ref and alt share at either end; only the shared
        # prefix matters for shifting the position
        new_ref, new_alt, shared_prefix, _ = trim_shared_flanking_strings(
            cleaned_ref, cleaned_alt)
        self.ref = new_ref
        self.alt = new_alt

        ref_length = len(new_ref)
        prefix_length = len(shared_prefix)
        if ref_length > 0:
            # substitutions and deletions: [start:end] is an inclusive
            # selection of reference nucleotides
            first_base = position + prefix_length
            self.start = first_base
            self.end = first_base + ref_length - 1
        else:
            # Insertions: the position means "insert the alt nucleotides
            # after this position", so start and end are both the 1-based
            # position of the nucleotide before the insertion.
            #
            # If the trimmed alt is empty as well, this is a "null" variant
            # (probably from a VCF generated by force-calling mutations
            # which weren't actually found in the sample). It is
            # interpreted as inserting zero nucleotides after the whole
            # reference sequence.
            anchor = position + max(0, prefix_length - 1)
            self.start = anchor
            self.end = anchor
Example #3
0
    def __init__(self,
                 contig,
                 start,
                 ref,
                 alt,
                 genome=None,
                 ensembl=None,
                 allow_extended_nucleotides=False,
                 normalize_contig_names=True,
                 convert_ucsc_contig_names=None):
        """
        Create a Variant from a contig, a position, and ref/alt alleles.

        Parameters
        ----------
        contig : str
            Name of the chromosome this variant lies on

        start : int
            1-based chromosomal position of the first reference nucleotide

        ref : str
            Reference nucleotide(s)

        alt : str
            Alternate nucleotide(s)

        genome : Genome, EnsemblRelease, or str, or int
            Name of reference genome, Ensembl release number, or object
            derived from pyensembl.Genome. When neither `genome` nor
            `ensembl` is given, "GRCh38" is used.

        ensembl : Genome, EnsemblRelease, or str, or int (DEPRECATED)
            Older name for `genome`. It is only consulted when `genome` is
            None; the two arguments are meant to be mutually exclusive.

        allow_extended_nucleotides : bool
            Extended nucleotides include 'Y' for pyrimidines or 'N' for any
            base

        normalize_contig_names : bool
            By default the contig name is normalized by converting integers
            to strings (e.g. 1 -> "1") and converting any letters after
            "chr" to uppercase (e.g. "chrx" -> "chrX"). To disable this
            behavior pass normalize_contig_names=False.

        convert_ucsc_contig_names : bool, optional
            When True, UCSC chromosome names are converted, such as "chr1"
            to "1". When left as None, the conversion is enabled exactly
            when the `genome` argument was recognized as a UCSC genome.
        """

        # lazily filled caches for overlapping pyensembl Gene / Transcript
        # objects and for their IDs and names
        self._genes = None
        self._transcripts = None
        self._gene_ids = None
        self._gene_names = None

        # remember the options that influence how this variant's
        # properties get transformed
        self.normalize_contig_names = normalize_contig_names
        self.allow_extended_nucleotides = allow_extended_nucleotides

        # fall back to the deprecated 'ensembl' argument, then to "GRCh38"
        if genome is None:
            genome = "GRCh38" if ensembl is None else ensembl

        # the caller may have given a release number, a reference name, or
        # a pyensembl.Genome object; infer_genome resolves it and reports
        # whether the original was a UCSC genome
        self.original_genome = genome
        resolved_genome, was_ucsc = infer_genome(genome)
        self.genome = resolved_genome
        self.original_genome_was_ucsc = was_ucsc

        reference_name = resolved_genome.reference_name
        self.reference_name = reference_name
        if was_ucsc:
            # report the reference under the UCSC name the caller used
            self.original_reference_name = ensembl_to_ucsc_reference_names[
                reference_name]
        else:
            self.original_reference_name = reference_name

        # keep the contig as given alongside the (optionally) normalized one
        self.original_contig = contig
        if normalize_contig_names:
            self.contig = normalize_chromosome(contig)
        else:
            self.contig = contig

        # an explicit setting wins; otherwise follow the genome's origin
        if convert_ucsc_contig_names is None:
            convert_ucsc = was_ucsc
        else:
            convert_ucsc = convert_ucsc_contig_names
        self.convert_ucsc_contig_names = convert_ucsc

        if convert_ucsc:
            # trim off the starting "chr" from hg19 chromosome names to
            # make them match GRCh37, also convert "chrM" to "MT"
            self.contig = self._convert_ucsc_contig_name_to_ensembl(
                self.contig)

        is_simple_substitution = (
            ref != alt and
            ref in STANDARD_NUCLEOTIDES and
            alt in STANDARD_NUCLEOTIDES)
        if is_simple_substitution:
            # Fast path for the common case: ref and alt are two different
            # members of STANDARD_NUCLEOTIDES, so they are stored untouched
            # and the variant covers exactly one position.
            self.original_ref = ref
            self.ref = ref
            self.original_alt = alt
            self.alt = alt
            position = int(start)
            self.original_start = position
            self.start = position
            self.end = position
            return

        # General case. Two views of the variant are stored:
        #    Variant.{original_ref, original_alt, original_start}
        # hold the values as they appeared in the source VCF or MAF file,
        # whereas
        #    Variant.{ref, alt, start, end}
        # hold the variant after shared prefixes/suffixes of ref and alt
        # have been trimmed away, e.g. g.10 CTT>T becomes a deletion of CT.
        #
        # The original entries keep their number of nucleotides, but
        # placeholders such as '-' and '.' are still normalized into ''.
        cleaned_ref = normalize_nucleotide_string(
            ref, allow_extended_nucleotides=allow_extended_nucleotides)
        self.original_ref = cleaned_ref
        cleaned_alt = normalize_nucleotide_string(
            alt, allow_extended_nucleotides=allow_extended_nucleotides)
        self.original_alt = cleaned_alt
        position = int(start)
        self.original_start = position

        # strip whatever ref and alt share at either end; only the shared
        # prefix matters for shifting the position
        new_ref, new_alt, shared_prefix, _ = trim_shared_flanking_strings(
            cleaned_ref, cleaned_alt)
        self.ref = new_ref
        self.alt = new_alt

        ref_length = len(new_ref)
        prefix_length = len(shared_prefix)
        if ref_length > 0:
            # substitutions and deletions: [start:end] is an inclusive
            # selection of reference nucleotides
            first_base = position + prefix_length
            self.start = first_base
            self.end = first_base + ref_length - 1
        else:
            # Insertions: the position means "insert the alt nucleotides
            # after this position", so start and end are both the 1-based
            # position of the nucleotide before the insertion.
            #
            # If the trimmed alt is empty as well, this is a "null" variant
            # (probably from a VCF generated by force-calling mutations
            # which weren't actually found in the sample). It is
            # interpreted as inserting zero nucleotides after the whole
            # reference sequence.
            anchor = position + max(0, prefix_length - 1)
            self.start = anchor
            self.end = anchor
Example #4
0
    def __init__(
            self,
            contig,
            start,
            ref,
            alt,
            ensembl=ensembl_grch38,
            allow_extended_nucleotides=False):
        """
        Create a Variant from a contig, a position, and ref/alt alleles.

        Parameters
        ----------
        contig : str
            Name of the chromosome this variant lies on

        start : int
            1-based chromosomal position of the first reference nucleotide

        ref : str
            Reference nucleotide(s)

        alt : str
            Alternate nucleotide(s)

        ensembl : Genome, int, or str
            Source of gene/transcript annotations: a pyensembl Genome
            object, an Ensembl release number, or a reference genome name

        allow_extended_nucleotides : bool
            Forwarded to normalize_nucleotide_string when ref and alt are
            normalized

        Raises
        ------
        TypeError
            If `ensembl` is not an int, a str, or a Genome instance.
        """
        self.contig = normalize_chromosome(contig)

        # resolve whatever the caller supplied into an annotation object
        if isinstance(ensembl, Genome):
            annotation_source = ensembl
        elif isinstance(ensembl, int):
            annotation_source = cached_release(ensembl)
        elif isinstance(ensembl, str):
            annotation_source = genome_for_reference_name(ensembl)
        else:
            message = (
                "Expected ensembl to be an int, string, or pyensembl.Genome "
                "instance, got %s : %s")
            raise TypeError(message % (type(ensembl), str(ensembl)))
        self.ensembl = annotation_source

        is_simple_substitution = (
            ref in STANDARD_NUCLEOTIDES and
            alt in STANDARD_NUCLEOTIDES and
            ref != alt)
        if is_simple_substitution:
            # Fast path for the common case: ref and alt are two different
            # members of STANDARD_NUCLEOTIDES, so they are stored untouched
            # and the variant covers exactly one position.
            self.original_ref = ref
            self.ref = ref
            self.original_alt = alt
            self.alt = alt
            position = int(start)
            self.original_start = position
            self.start = position
            self.end = position
            return

        # General case. Two views of the variant are stored:
        #    Variant.{original_ref, original_alt, original_start}
        # hold the values as they appeared in the source VCF or MAF file,
        # whereas
        #    Variant.{ref, alt, start, end}
        # hold the variant after shared prefixes/suffixes of ref and alt
        # have been trimmed away, e.g. g.10 CTT>T becomes a deletion of CT.
        #
        # The original entries keep their number of nucleotides, but
        # placeholders such as '-' and '.' are still normalized into ''.
        cleaned_ref = normalize_nucleotide_string(
            ref,
            allow_extended_nucleotides=allow_extended_nucleotides)
        self.original_ref = cleaned_ref
        cleaned_alt = normalize_nucleotide_string(
            alt,
            allow_extended_nucleotides=allow_extended_nucleotides)
        self.original_alt = cleaned_alt
        position = int(start)
        self.original_start = position

        # strip whatever ref and alt share at either end; only the shared
        # prefix matters for shifting the position
        new_ref, new_alt, shared_prefix, _ = trim_shared_flanking_strings(
            cleaned_ref, cleaned_alt)
        self.ref = new_ref
        self.alt = new_alt

        ref_length = len(new_ref)
        prefix_length = len(shared_prefix)
        if ref_length > 0:
            # substitutions and deletions: [start:end] is an inclusive
            # selection of reference nucleotides
            first_base = position + prefix_length
            self.start = first_base
            self.end = first_base + ref_length - 1
        else:
            # insertions: the position means "insert the alt nucleotides
            # after this position", so start and end are both the
            # nucleotide before the insertion
            anchor = position + max(0, prefix_length - 1)
            self.start = anchor
            self.end = anchor
Example #5
0
    def __init__(self,
                 contig,
                 start,
                 ref,
                 alt,
                 ensembl=ensembl_grch38,
                 allow_extended_nucleotides=False,
                 normalize_contig_name=True):
        """
        Create a Variant from a contig, a position, and ref/alt alleles.

        Parameters
        ----------
        contig : str
            Name of the chromosome this variant lies on

        start : int
            1-based chromosomal position of the first reference nucleotide

        ref : str
            Reference nucleotide(s)

        alt : str
            Alternate nucleotide(s)

        ensembl : Genome, int, or str
            Source of gene/transcript annotations: a pyensembl Genome
            object, an Ensembl release number, or a reference genome name

        allow_extended_nucleotides : bool
            Extended nucleotides include 'Y' for pyrimidines or 'N' for any
            base

        normalize_contig_name : bool
            When True (the default) the contig name is normalized by
            trimming a 'chr' prefix and converting all letters to
            upper-case. Pass False to keep the contig exactly as given.

        Raises
        ------
        TypeError
            If `ensembl` is not an int, a str, or a Genome instance.
        """

        # caches for the lists of overlapping pyensembl Gene and Transcript
        # objects; nothing has been looked up yet
        self._genes = None
        self._transcripts = None

        # reject unsupported annotation sources up front, then resolve the
        # supported ones (Genome object, release number, reference name)
        if not isinstance(ensembl, (Genome, int, str)):
            raise TypeError(
                ("Expected ensembl to be an int, string, or pyensembl.Genome "
                 "instance, got %s : %s") % (type(ensembl), str(ensembl)))
        if isinstance(ensembl, Genome):
            self.ensembl = ensembl
        elif isinstance(ensembl, int):
            self.ensembl = cached_release(ensembl)
        else:
            # only a str can reach this branch
            self.ensembl = genome_for_reference_name(ensembl)

        self.normalize_contig_name = normalize_contig_name
        self.allow_extended_nucleotides = allow_extended_nucleotides

        # keep the contig as given alongside the (optionally) normalized one
        self.original_contig = contig
        if normalize_contig_name:
            self.contig = normalize_chromosome(contig)
        else:
            self.contig = contig

        alleles_differ = ref != alt
        if (alleles_differ and
                ref in STANDARD_NUCLEOTIDES and
                alt in STANDARD_NUCLEOTIDES):
            # Fast path for the common case: ref and alt are two different
            # members of STANDARD_NUCLEOTIDES, so they are stored untouched
            # and the variant covers exactly one position.
            self.original_ref = ref
            self.ref = ref
            self.original_alt = alt
            self.alt = alt
            self.original_start = self.start = self.end = int(start)
            return

        # General case. Two views of the variant are stored:
        #    Variant.{original_ref, original_alt, original_start}
        # hold the values as they appeared in the source VCF or MAF file,
        # whereas
        #    Variant.{ref, alt, start, end}
        # hold the variant after shared prefixes/suffixes of ref and alt
        # have been trimmed away, e.g. g.10 CTT>T becomes a deletion of CT.
        #
        # The original entries keep their number of nucleotides, but
        # placeholders such as '-' and '.' are still normalized into ''.
        cleaned_ref = normalize_nucleotide_string(
            ref, allow_extended_nucleotides=allow_extended_nucleotides)
        self.original_ref = cleaned_ref
        cleaned_alt = normalize_nucleotide_string(
            alt, allow_extended_nucleotides=allow_extended_nucleotides)
        self.original_alt = cleaned_alt
        position = int(start)
        self.original_start = position

        # strip whatever ref and alt share at either end; only the shared
        # prefix matters for shifting the position
        new_ref, new_alt, shared_prefix, _ = trim_shared_flanking_strings(
            cleaned_ref, cleaned_alt)
        self.ref = new_ref
        self.alt = new_alt

        ref_length = len(new_ref)
        if ref_length == 0:
            # Insertions: the position means "insert the alt nucleotides
            # after this position", so start and end are both the 1-based
            # position of the nucleotide before the insertion.
            #
            # If the trimmed alt is empty as well, this is a "null" variant
            # (probably from a VCF generated by force-calling mutations
            # which weren't actually found in the sample). It is
            # interpreted as inserting zero nucleotides after the whole
            # reference sequence.
            self.start = self.end = position + max(0, len(shared_prefix) - 1)
        else:
            # substitutions and deletions: [start:end] is an inclusive
            # selection of reference nucleotides
            first_base = position + len(shared_prefix)
            self.start = first_base
            self.end = first_base + ref_length - 1
Example #6
0
def test_normalize_chromosome():
    """
    Check that normalize_chromosome maps equivalent chromosome names to a
    single canonical string and raises on unusable input.
    """
    # (input, expected canonical name) pairs, checked in order
    expected_names = [
        # X chromosome
        ("X", "X"),
        ("chrX", "X"),
        ("x", "X"),
        # numbered chromosome as int, string, or with "chr" prefix
        (1, "1"),
        ("1", "1"),
        ("chr1", "1"),
        # every mitochondrial spelling should come back as "MT"
        ("chrM", "MT"),
        ("chrMT", "MT"),
        ("M", "MT"),
        ("MT", "MT"),
        ("m", "MT"),
        ("chrm", "MT"),
        ("mt", "MT"),
    ]
    for raw_name, canonical_name in expected_names:
        assert normalize_chromosome(raw_name) == canonical_name

    # inputs of an unsupported type
    for wrong_type in ({"a": "b"}, [], None):
        with assert_raises(TypeError):
            normalize_chromosome(wrong_type)

    # inputs of a supported type which cannot name a chromosome
    for bad_value in ("", 0):
        with assert_raises(ValueError):
            normalize_chromosome(bad_value)