Esempio n. 1
0
def test_genome_arg_to_load_vcf_cached_75():
    """
    Loading the hg19 VCF with an explicit release-75 genome must equal the
    default load when UCSC contig names are converted, and must differ from
    it when they are not.
    """
    default_variants = load_vcf(HG19_VCF_FILENAME)
    converted_variants = load_vcf(
        HG19_VCF_FILENAME,
        genome=cached_release(75),
        convert_ucsc_contig_names=True)
    eq_(default_variants, converted_variants)

    # Load the default collection again so the comparison below is made
    # against a freshly loaded object.
    default_variants_again = load_vcf(HG19_VCF_FILENAME)
    unconverted_variants = load_vcf(
        HG19_VCF_FILENAME,
        genome=cached_release(75),
        convert_ucsc_contig_names=False)
    assert default_variants_again != unconverted_variants
Esempio n. 2
0
def infer_genome(genome_object_string_or_int):
    """
    Resolve a loosely specified genome into a PyEnsembl Genome object.

    Accepted inputs:
      - a PyEnsembl Genome, which is returned unchanged
      - an integer Ensembl version, which is handed to cached_release
      - a string reference name, which is mapped to its canonical name
        (e.g. hg19 -> GRCh37) and then looked up with
        genome_for_reference_name

    Raises TypeError for any other kind of input.
    """
    genome = genome_object_string_or_int
    if isinstance(genome, Genome):
        return genome
    if is_integer(genome):
        return cached_release(genome)
    if is_string(genome):
        # canonicalize the name first, then fetch the matching Genome
        return genome_for_reference_name(infer_reference_name(genome))
    message = (
        "Expected genome to be an int, string, or pyensembl.Genome "
        "instance, got %s : %s")
    raise TypeError(message % (str(genome), type(genome)))
Esempio n. 3
0
def infer_genome(genome_object_string_or_int):
    """
    Turn a Genome, an Ensembl version number, or a reference name into a
    PyEnsembl Genome object.

    A Genome instance is passed through untouched, an integer is resolved
    with cached_release, and a string is resolved by first inferring its
    canonical reference name. Anything else raises TypeError.
    """
    candidate = genome_object_string_or_int

    # Already a Genome: nothing to resolve.
    if isinstance(candidate, Genome):
        return candidate

    # Integers are treated as Ensembl release numbers.
    if is_integer(candidate):
        return cached_release(candidate)

    # Only strings remain acceptable past this point.
    if not is_string(candidate):
        raise TypeError(
            ("Expected genome to be an int, string, or pyensembl.Genome "
                "instance, got %s : %s") % (str(candidate), type(candidate)))

    # Map aliases such as hg19 onto the canonical name (GRCh37) before
    # asking for the associated Genome object.
    canonical_name = infer_reference_name(candidate)
    return genome_for_reference_name(canonical_name)
Esempio n. 4
0
    def __setstate__(self, fields):
        """
        Restore instance state from the `fields` dictionary.

        The "release" entry is removed from `fields` and converted with
        cached_release into the object stored on `self.ensembl`; every
        remaining entry becomes an attribute of the same name.
        """
        # "release" is not stored as-is, it is turned back into an object.
        release = fields.pop("release")
        self.ensembl = cached_release(release)

        # Whatever is left maps one-to-one onto plain attributes.
        for attribute_name, attribute_value in fields.items():
            setattr(self, attribute_name, attribute_value)
Esempio n. 5
0
def test_pandas_and_pyvcf_implementations_equivalent():
    """
    Yield one check per VCF input verifying that load_vcf_fast and load_vcf
    return equal collections for the same keyword arguments.
    """
    paths = [
        dict(path=data_path("somatic_hg19_14muts.vcf")),
        dict(path=data_path("somatic_hg19_14muts.space_in_sample_name.vcf")),
        dict(path="/" + data_path("somatic_hg19_14muts.vcf")),
        dict(path=data_path("somatic_hg19_14muts.vcf.gz")),
        dict(path=data_path("multiallelic.vcf")),
        dict(path=data_path("mutect-example.vcf")),
        dict(path=data_path("strelka-example.vcf")),
        # this input is loaded with an explicitly supplied genome
        dict(
            path=data_path("mutect-example-headerless.vcf"),
            genome=cached_release(75)),
    ]
    if RUN_TESTS_REQUIRING_INTERNET:
        paths.extend([
            {'path': VCF_EXTERNAL_URL},
            {'path': VCF_EXTERNAL_URL + ".gz"},
        ])

    def do_test(load_kwargs):
        from_pandas = load_vcf_fast(**load_kwargs)
        from_pyvcf = load_vcf(**load_kwargs)
        eq_(from_pandas, from_pyvcf)
        eq_(len(from_pandas), len(from_pyvcf))
        eq_(from_pandas.elements, from_pyvcf.elements)
        eq_(from_pandas.metadata, from_pyvcf.metadata)
        # every input is expected to hold more than one variant
        assert len(from_pandas) > 1
        assert len(from_pyvcf) > 1

    for load_kwargs in paths:
        yield (do_test, load_kwargs)
Esempio n. 6
0
def test_genome_arg_to_load_vcf():
    """
    The `genome` argument of load_vcf may be a release number, a genome
    object or a reference name; all of the values below must load the same
    variants as the default call.
    """
    expected = load_vcf(VCF_FILENAME)
    eq_(expected, load_vcf(VCF_FILENAME, genome=75))
    eq_(expected, load_vcf(VCF_FILENAME, genome=cached_release(75)))
    # TODO: actually make hg19 different from b37! They should use
    # different MT sequences
    for reference_name in ("grch37", "GRCh37", "b37", "hg19"):
        eq_(expected, load_vcf(VCF_FILENAME, genome=reference_name))
Esempio n. 7
0
def test_genome_arg_to_load_vcf():
    """
    Check that load_vcf returns the same variants as the default call when
    `genome` is given as an int, a genome object, or a reference name.
    """
    baseline = load_vcf(VCF_FILENAME)

    def check_same_as_baseline(genome):
        eq_(baseline, load_vcf(VCF_FILENAME, genome=genome))

    check_same_as_baseline(75)
    check_same_as_baseline(cached_release(75))
    check_same_as_baseline("grch37")
    check_same_as_baseline("GRCh37")
    check_same_as_baseline("b37")
    # TODO: actually make hg19 different from b37! They should use
    # different MT sequences
    check_same_as_baseline("hg19")
def test_transcript_support_level():
    """
    Check Transcript.support_level against known Ensembl records.

    The Transcript Support Level (TSL) highlights well-supported and
    poorly-supported transcript models, based on the type and quality of the
    alignments used to annotate the transcript. In the Ensembl database it
    is a value 1 through 5, or reported as NA, or missing, or absent
    altogether in older releases. It is translated to an integer value,
    otherwise to None.
    """
    def support_level_of(release, transcript_name):
        # use the first transcript carrying this name
        return release.transcripts_by_name(transcript_name)[0].support_level

    ensembl93 = cached_release(93)
    eq_(support_level_of(ensembl93, "DDX11L1-202"), 1)

    # The transcript_support_level value is missing in the database record
    # of this transcript:
    eq_(support_level_of(ensembl93, "OR4G11P-202"), None)

    # Features such as pseudogenes, single exon transcripts, HLA, T-cell
    # receptor and Ig transcripts are not analysed in terms of TSL and are
    # reported as "NA" in Ensembl. NA is translated to None as well.
    eq_(support_level_of(ensembl93, "MIR1302-2-201"), None)

    # The transcript_support_level column was missing completely in GRCh37
    # and older releases of GRCh38:
    ensembl77 = cached_release(77)
    eq_(support_level_of(ensembl77, "DDX11L1-002"), None)
def test_transcript_support_level():
    """
    Transcript.support_level must be the integer Transcript Support Level
    (TSL) when Ensembl provides one and None otherwise.

    TSL rates how well a transcript model is supported by the alignments
    used to annotate it. Ensembl stores a value 1 through 5, "NA", nothing
    at all, or (in older releases) has no such column.
    """
    release_93 = cached_release(93)
    expectations_93 = [
        # a transcript with an assigned support level
        ("DDX11L1-202", 1),
        # the value is missing in the database record of this transcript
        ("OR4G11P-202", None),
        # reported as "NA" in Ensembl (pseudogenes, single exon transcripts,
        # HLA, T-cell receptor and Ig transcripts are not analysed in terms
        # of TSL); NA is translated to None as well
        ("MIR1302-2-201", None),
    ]
    for transcript_name, expected_level in expectations_93:
        transcript = release_93.transcripts_by_name(transcript_name)[0]
        eq_(transcript.support_level, expected_level)

    # The transcript_support_level column was missing completely in GRCh37
    # and older releases of GRCh38:
    release_77 = cached_release(77)
    old_transcript = release_77.transcripts_by_name("DDX11L1-002")[0]
    eq_(old_transcript.support_level, None)
Esempio n. 10
0
def test_ensembl_releases(*versions):
    """
    Decorator factory: wrap a test taking one release argument so that it
    is run once for each selected release.

    With no arguments the releases in `major_releases` are used; otherwise
    each given version is resolved with cached_release.
    """
    if versions:
        ensembl_releases = [cached_release(version) for version in versions]
    else:
        ensembl_releases = major_releases

    def decorator(test_fn):
        @functools.wraps(test_fn)
        def run_for_each_release():
            for release in ensembl_releases:
                test_fn(release)
        return run_for_each_release
    return decorator
Esempio n. 11
0
def test_pandas_and_pyvcf_implementations_equivalent():
    """
    Generate one comparison per VCF source: load_vcf_fast and load_vcf are
    called with identical keyword arguments and must agree.
    """
    paths = [
        {'path': data_path("somatic_hg19_14muts.vcf")},
        {'path': data_path("somatic_hg19_14muts.space_in_sample_name.vcf")},
        {'path': "/" + data_path("somatic_hg19_14muts.vcf")},
        {'path': data_path("somatic_hg19_14muts.vcf.gz")},
        {'path': data_path("multiallelic.vcf")},
        {'path': data_path("mutect-example.vcf")},
        {'path': data_path("strelka-example.vcf")},
        # loaded with an explicitly supplied genome
        {'path': data_path("mutect-example-headerless.vcf"),
         'genome': cached_release(75)},
    ]
    if RUN_TESTS_REQUIRING_INTERNET:
        paths += [
            {'path': VCF_EXTERNAL_URL},
            {'path': VCF_EXTERNAL_URL + ".gz"},
        ]

    def do_test(kwargs):
        fast_result = load_vcf_fast(**kwargs)
        reference_result = load_vcf(**kwargs)
        eq_(fast_result, reference_result)
        eq_(len(fast_result), len(reference_result))
        eq_(fast_result.elements, reference_result.elements)
        eq_(fast_result.metadata, reference_result.metadata)
        # each source is expected to contain more than one variant
        assert len(fast_result) > 1
        assert len(reference_result) > 1

    for source_kwargs in paths:
        yield (do_test, source_kwargs)
Esempio n. 12
0
def test_ensembl_releases(*versions):
    """
    Build a decorator that runs a single-argument test once per release.

    The releases are `major_releases` when no versions are given, and the
    result of cached_release for every given version otherwise.
    """
    ensembl_releases = (
        major_releases
        if len(versions) == 0
        else [cached_release(version) for version in versions])

    def decorator(test_fn):
        @functools.wraps(test_fn)
        def wrapped_test():
            # feed every selected release to the wrapped test in turn
            for ensembl in ensembl_releases:
                test_fn(ensembl)

        return wrapped_test

    return decorator
Esempio n. 13
0
def test_ensembl_releases(*versions):
    """
    Decorator factory: run a test that takes one release argument against
    several releases.

    Without arguments `major_releases` is used. Otherwise every version is
    resolved with cached_release, after checking that none of them exceeds
    MAX_ENSEMBL_RELEASE (ValueError if one does).
    """
    if versions:
        has_invalid_version = any(
            version > MAX_ENSEMBL_RELEASE for version in versions)
        if has_invalid_version:
            raise ValueError("Invalid ensembl release numbers: %s" % (versions,))
        ensembl_releases = [cached_release(version) for version in versions]
    else:
        ensembl_releases = major_releases

    def decorator(test_fn):
        @functools.wraps(test_fn)
        def run_for_each_release():
            for release in ensembl_releases:
                test_fn(release)
        return run_for_each_release
    return decorator
Esempio n. 14
0
def test_ensembl_releases(*versions):
    """
    Return a decorator that calls the decorated test once for each release.

    The releases default to `major_releases`. When versions are given they
    are validated against MAX_ENSEMBL_RELEASE first (ValueError on the
    first one that is too large) and then resolved with cached_release.
    """
    # Validate before resolving anything; a no-op when no versions are given.
    for version in versions:
        if version > MAX_ENSEMBL_RELEASE:
            raise ValueError("Invalid ensembl release numbers: %s" %
                             (versions, ))

    if len(versions) == 0:
        ensembl_releases = major_releases
    else:
        ensembl_releases = [cached_release(version) for version in versions]

    def decorator(test_fn):
        @functools.wraps(test_fn)
        def wrapped_test():
            for ensembl in ensembl_releases:
                test_fn(ensembl)

        return wrapped_test

    return decorator
Esempio n. 15
0
"""
Test all methods which return collections of gene IDs that aren't converting
from some other type of name or ID.

TODO: Implement tests for EnsemblRelease.gene_ids
"""
from __future__ import absolute_import

from nose.tools import assert_raises, ok_
from pyensembl import ensembl_grch38, cached_release

from .common import test_ensembl_releases

# Release 77 object for the "human" species, obtained once at import time
# (presumably used by tests not shown in this excerpt).
ensembl77 = cached_release(77, "human")


def test_gene_ids_grch38_hla_a():
    """
    The only gene ID reported at chr6:29,945,884 must be that of HLA-A.
    """
    # HLA-A has gene ID ENSG00000206503 and covers this position, based on:
    # http://useast.ensembl.org/Homo_sapiens/Gene/
    # Summary?db=core;g=ENSG00000206503;r=6:29941260-29945884
    expected = "ENSG00000206503"
    ids = ensembl_grch38.gene_ids_at_locus(6, 29945884)
    failure_message = "Expected HLA-A, gene ID = %s, got: %s"
    assert ids == [expected], failure_message % (expected, ids)


def test_gene_ids_of_gene_name_hla_grch38():
    """
    Looking up gene IDs by the name "HLA-A" must include ENSG00000206503.
    """
    expected_gene_id = 'ENSG00000206503'
    gene_ids = ensembl_grch38.gene_ids_of_gene_name("HLA-A")
    # the looked-up IDs double as the failure message
    assert expected_gene_id in gene_ids, gene_ids
Esempio n. 16
0
    def __init__(self,
                 contig,
                 start,
                 ref,
                 alt,
                 ensembl=ensembl_grch38,
                 allow_extended_nucleotides=False,
                 normalize_contig_name=True):
        """
        Construct a Variant object.

        Both the values as given (original_contig, original_start,
        original_ref, original_alt) and their normalized/trimmed
        counterparts (contig, start, end, ref, alt) are stored.

        Parameters
        ----------
        contig : str
            Chromosome that this variant is on

        start : int
            1-based position on the chromosome of first reference nucleotide

        ref : str
            Reference nucleotide(s)

        alt : str
            Alternate nucleotide(s)

        ensembl : Genome or EnsemblRelease, int, or str
            Object used for determining gene/transcript annotations. An int
            is resolved with cached_release and a str with
            genome_for_reference_name.

        allow_extended_nucleotides : bool
            Extended nucleotides include 'Y' for pyrimidines or 'N' for any
            base

        normalize_contig_name : bool
            By default the contig name will be normalized by trimming a 'chr'
            prefix and converting all letters to upper-case. If we don't want
            this behavior then pass normalize_contig_name=False.

        Raises
        ------
        TypeError
            If `ensembl` is not a Genome, an int, or a str.
        """

        # first initialize the _genes and _transcripts fields we use to cache
        # lists of overlapping pyensembl Gene and Transcript objects
        self._genes = self._transcripts = None

        # user might supply Ensembl release as an integer, reference name,
        # or pyensembl.Genome object
        if isinstance(ensembl, Genome):
            self.ensembl = ensembl
        elif isinstance(ensembl, int):
            self.ensembl = cached_release(ensembl)
        elif isinstance(ensembl, str):
            self.ensembl = genome_for_reference_name(ensembl)
        else:
            # note: the message reports the type first and the value second
            raise TypeError(
                ("Expected ensembl to be an int, string, or pyensembl.Genome "
                 "instance, got %s : %s") % (type(ensembl), str(ensembl)))

        # remember the constructor options and the contig exactly as given
        self.normalize_contig_name = normalize_contig_name
        self.allow_extended_nucleotides = allow_extended_nucleotides
        self.original_contig = contig
        self.contig = normalize_chromosome(
            contig) if normalize_contig_name else contig

        if ref != alt and ref in STANDARD_NUCLEOTIDES and alt in STANDARD_NUCLEOTIDES:
            # Optimization for common case: when ref and alt are two distinct
            # members of STANDARD_NUCLEOTIDES the values are stored as given,
            # skipping the normalization and trimming below, and start == end.
            self.original_ref = self.ref = ref
            self.original_alt = self.alt = alt
            self.original_start = self.start = self.end = int(start)
            return

        # we want to preserve the ref/alt/pos both as they appeared in the
        # original VCF or MAF file but also normalize variants to get rid
        # of shared prefixes/suffixes between the ref and alt nucleotide
        # strings e.g. g.10 CTT>T can be normalized into g.10delCT
        #
        # The untrimmed variant properties go into fields
        #    Variant.{original_ref, original_alt, original_start}
        # whereas the trimmed fields are:
        #    Variant.{ref, alt, start, end}

        # the original entries must preserve the number of nucleotides in
        # ref and alt but we still want to normalize e.g. '-' and '.' into ''
        self.original_ref = normalize_nucleotide_string(
            ref, allow_extended_nucleotides=allow_extended_nucleotides)
        self.original_alt = normalize_nucleotide_string(
            alt, allow_extended_nucleotides=allow_extended_nucleotides)
        self.original_start = int(start)

        # normalize the variant by trimming any shared prefix or suffix
        # between ref and alt nucleotide sequences and then
        # offset the variant position by the length of the trimmed prefix
        (trimmed_ref, trimmed_alt, prefix,
         suffix) = (trim_shared_flanking_strings(self.original_ref,
                                                 self.original_alt))

        self.ref = trimmed_ref
        self.alt = trimmed_alt

        if len(trimmed_ref) == 0:
            # insertions must be treated differently since the meaning of a
            # position for an insertion is:
            #   "insert the alt nucleotides after this position"
            #
            # Aside: what if both trimmed ref and alt strings are empty?
            # This means we had a "null" variant, probably from a VCF
            # generated by force-calling mutations which weren't actually
            # found in the sample.
            # Null variants are interpreted as inserting zero nucleotides
            # after the whole reference sequence.
            #
            # Start and end both are base-1 nucleotide position before
            # insertion.
            self.start = self.original_start + max(0, len(prefix) - 1)
            self.end = self.start
        else:
            # for substitutions and deletions the [start:end] interval is
            # an inclusive selection of reference nucleotides
            self.start = self.original_start + len(prefix)
            self.end = self.start + len(trimmed_ref) - 1
Esempio n. 17
0
    StartLoss,
    AlternateStartCodon,
    PrematureStop,
    FrameShift,
    ExonLoss,
    ExonicSpliceSite,
    FrameShiftTruncation,
    # TODO: SpliceDonor, SpliceReceptor
)
from pyensembl import ensembl_grch37, cached_release

from .common import expect_effect

# Release 81 object bound to the module-level name ensembl_grch38 and used
# by the tests below.
# tried using more recent releases but found that many of them
# are very specific to Ensembl data between releases 77-81
ensembl_grch38 = cached_release(81)

def test_incomplete():
    """
    A substitution in transcript ENST00000450046 must be reported as an
    IncompleteTranscript effect that changes neither the coding nor the
    protein sequence.
    """
    # transcript EGFR-009 (ENST00000450046 in Ensembl 78)
    # has an incomplete 3' end
    # chrom. 7 starting at 55,109,723
    # first exon begins: ATCATTCCTTTGGGCCTAGGA

    # change the first nucleotide of the 5' UTR A>T
    variant = Variant("7", 55109723, "A", "T", ensembl=ensembl_grch38)
    expected = dict(
        transcript_id="ENST00000450046",
        effect_class=IncompleteTranscript,
        modifies_coding_sequence=False,
        modifies_protein_sequence=False)
    expect_effect(variant, **expected)
"""Make sure we're getting the correct transcript sequence from Ensembl and
that it's a sequence type which correctly implements `complement`
and `reverse_complement`
"""

from __future__ import absolute_import
from nose.tools import eq_
from pyensembl import cached_release

# Release objects obtained once at import time. ensembl54 is used by the
# test below; ensembl83 is presumably used by tests not shown in this excerpt.
ensembl54 = cached_release(54)
ensembl83 = cached_release(83)


def test_transcript_sequence_ensembl54():
    """
    The sequence of transcript ENST00000321606 in release 54 must be the
    known 414 nucleotides, whether fetched by ID or via a Transcript object.
    """
    transcript_id = "ENST00000321606"
    # the expected sequence, written as six adjacent string literals
    expected_sequence = (
        "CATGTCACCCACCTTCAGGCGGCCCAAGACACTGCGACTCCGGAGGCAGCCCAGATATCCTCGGAAGAG"
        "CACCCCCAGGAGAAACAAGCTTGGCCACTATGCTATCATCAAGTTTCCGCTGACCACTGAGTCGGCCGT"
        "GAAGAAGATAGAAGAAAACAACACGCTTGTGTTCACTGTGGATGTTAAAGCCAACAAGCACCAGATCAG"
        "ACAGGCTGTGAAGAAGCTCTATGACAGTGATGTGGCCAAGGTCACCACCCTGATTTGTCCTGATAAAGA"
        "GAACAAGGCATATGTTCGACTTGCTCCTGATTATGATGCTTTCGATGTTGTAACAAAATTGGGATCACC"
        "TAAACTGAGTCCAGCTGGCTAACTCTAAATATATGTGTATCTTTTCAGCATAAAAAAATAATGTTTTTC"
    )

    seq = ensembl54.transcript_sequence(transcript_id)
    length_message = (
        "Expected transcript ENST00000321606 to have 414nt, got %s : %d")
    assert len(seq) == 414, length_message % (seq, len(seq))
    eq_(str(seq), expected_sequence)

    # now get the same sequence via a Transcript object
    transcript = ensembl54.transcript_by_id(transcript_id)
    eq_(transcript.sequence, seq)
Esempio n. 19
0
from pyensembl import cached_release, genome_for_reference_name

# Genome object looked up by the reference name "grcm38".
grcm38 = genome_for_reference_name("grcm38")
# Release 87 object bound to the name grch38.
grch38 = cached_release(87)
Esempio n. 20
0
    StartLoss,
    AlternateStartCodon,
    PrematureStop,
    FrameShift,
    ExonLoss,
    ExonicSpliceSite,
    FrameShiftTruncation,
    # TODO: SpliceDonor, SpliceReceptor
)
from pyensembl import ensembl_grch37, cached_release

from .common import expect_effect

# Release 81 object bound to the module-level name ensembl_grch38 and used
# by the tests below.
# tried using more recent releases but found that many of them
# are very specific to Ensembl data between releases 77-81
ensembl_grch38 = cached_release(81)


def test_incomplete():
    """
    Expect an IncompleteTranscript effect on ENST00000450046 for an A>T
    substitution at chromosome 7 position 55109723, with no change to the
    coding or protein sequence.
    """
    # EGFR-009 (ENST00000450046 in Ensembl 78) has an incomplete 3' end.
    # It lies on chrom. 7 starting at 55,109,723 and its first exon begins
    # with ATCATTCCTTTGGGCCTAGGA, so this substitution changes the first
    # nucleotide of the 5' UTR.
    first_utr_base_change = Variant(
        "7", 55109723, "A", "T", ensembl=ensembl_grch38)
    expect_effect(
        first_utr_base_change,
        transcript_id="ENST00000450046",
        effect_class=IncompleteTranscript,
        modifies_coding_sequence=False,
        modifies_protein_sequence=False)
Esempio n. 21
0
"""
Test all methods which return collections of gene IDs that aren't converting
from some other type of name or ID.

TODO: Implement tests for EnsemblRelease.gene_ids
"""
from __future__ import absolute_import

from nose.tools import assert_raises, ok_
from pyensembl import ensembl_grch38, cached_release

from .common import test_ensembl_releases

# Release 77 object for the "human" species, obtained once at import time
# (presumably used by tests not shown in this excerpt).
ensembl77 = cached_release(77, "human")

def test_gene_ids_grch38_hla_a():
    """
    gene_ids_at_locus for chr6:29,945,884 must return exactly the HLA-A
    gene ID.
    """
    # chr6:29,945,884 is a position for HLA-A (gene ID ENSG00000206503),
    # based on:
    # http://useast.ensembl.org/Homo_sapiens/Gene/
    # Summary?db=core;g=ENSG00000206503;r=6:29941260-29945884
    expected = "ENSG00000206503"
    ids = ensembl_grch38.gene_ids_at_locus(6, 29945884)
    ids_match = (ids == [expected])
    assert ids_match, \
        "Expected HLA-A, gene ID = %s, got: %s" % (expected, ids)

def test_gene_ids_of_gene_name_hla_grch38():
    """
    Looking up gene IDs by the name "HLA-A" must include ENSG00000206503.
    """
    hla_a_gene_ids = ensembl_grch38.gene_ids_of_gene_name("HLA-A")
    # the looked-up IDs double as the failure message
    assert 'ENSG00000206503' in hla_a_gene_ids, hla_a_gene_ids

    # NOTE(review): the HLA-B result is not checked in the visible code; this
    # excerpt may be truncated - confirm against the full test file.
    hla_b_gene_ids = ensembl_grch38.gene_ids_of_gene_name("HLA-B")
Esempio n. 22
0
    def __init__(
            self,
            contig,
            start,
            ref,
            alt,
            ensembl=ensembl_grch38,
            allow_extended_nucleotides=False):
        """
        Construct a Variant object.

        Both the values as given (original_start, original_ref,
        original_alt) and their trimmed counterparts (start, end, ref, alt)
        are stored; the contig is stored only in normalized form.

        Parameters
        ----------
        contig : str
            Chromosome that this variant is on

        start : int
            1-based position on the chromosome of first reference nucleotide

        ref : str
            Reference nucleotide(s)

        alt : str
            Alternate nucleotide(s)

        ensembl : Genome or EnsemblRelease, int, or str
            Object used for determining gene/transcript annotations. An int
            is resolved with cached_release and a str with
            genome_for_reference_name.

        allow_extended_nucleotides : bool
            Forwarded to normalize_nucleotide_string when the ref and alt
            strings are normalized.

        Raises
        ------
        TypeError
            If `ensembl` is not a Genome, an int, or a str.
        """
        self.contig = normalize_chromosome(contig)

        # user might supply Ensembl release as an integer, reference name,
        # or pyensembl.Genome object
        if isinstance(ensembl, Genome):
            self.ensembl = ensembl
        elif isinstance(ensembl, int):
            self.ensembl = cached_release(ensembl)
        elif isinstance(ensembl, str):
            self.ensembl = genome_for_reference_name(ensembl)
        else:
            # note: the message reports the type first and the value second
            raise TypeError(
                ("Expected ensembl to be an int, string, or pyensembl.Genome "
                 "instance, got %s : %s") % (type(ensembl), str(ensembl)))

        if (ref in STANDARD_NUCLEOTIDES and
                alt in STANDARD_NUCLEOTIDES and
                ref != alt):

            # Optimization for common case: when ref and alt are two distinct
            # members of STANDARD_NUCLEOTIDES the values are stored as given,
            # skipping the normalization and trimming below, and start == end.
            self.original_ref = self.ref = ref
            self.original_alt = self.alt = alt
            self.original_start = self.start = self.end = int(start)
            return

        # we want to preserve the ref/alt/pos both as they appeared in the
        # original VCF or MAF file but also normalize variants to get rid
        # of shared prefixes/suffixes between the ref and alt nucleotide
        # strings e.g. g.10 CTT>T can be normalized into g.10delCT
        #
        # The untrimmed variant properties go into fields
        #    Variant.{original_ref, original_alt, original_start}
        # whereas the trimmed fields are:
        #    Variant.{ref, alt, start, end}

        # the original entries must preserve the number of nucleotides in
        # ref and alt but we still want to normalize e.g. '-' and '.' into ''
        self.original_ref = normalize_nucleotide_string(ref,
            allow_extended_nucleotides=allow_extended_nucleotides)
        self.original_alt = normalize_nucleotide_string(alt,
            allow_extended_nucleotides=allow_extended_nucleotides)
        self.original_start = int(start)

        # normalize the variant by trimming any shared prefix or suffix
        # between ref and alt nucleotide sequences and then
        # offset the variant position by the length of the trimmed prefix
        (trimmed_ref, trimmed_alt, prefix, suffix) = (
            trim_shared_flanking_strings(self.original_ref, self.original_alt))

        self.ref = trimmed_ref
        self.alt = trimmed_alt

        # insertions must be treated differently since the meaning of a
        # position for an insertion is
        #   "insert the alt nucleotides after this position"
        if len(trimmed_ref) == 0:
            # start and end both are nucleotide before insertion
            self.start = self.original_start + max(0, len(prefix) - 1)
            self.end = self.start
        else:
            # for substitutions and deletions the [start:end] interval is
            # an inclusive selection of reference nucleotides
            self.start = self.original_start + len(prefix)
            self.end = self.start + len(trimmed_ref) - 1
from .common import test_ensembl_releases
from .data import (
    FOXP3_001_transcript_id,
    CTNNBIP1_004_transcript_id,
    CTNNBIP1_004_UTR5,
    CTNNBIP1_004_UTR3,
    CTNNBIP1_004_CDS,
    CTNNBIP1_004_locus,
    CTTNNIP1_004_exon_lengths,
    CTTNNIP1_004_exon_ids,
    EGFR_001_protein_sequence,
    TP53_gene_id,
)

# Release 77 object obtained once at import time; the tests below look up
# transcripts on it.
ensembl77 = cached_release(77)

def test_transcript_start_codon():
    """
    test_transcript_start_codon : Check that fields Transcript
    (for transcript named CTNNBIP1-004) matches known values.
    """
    CTNNBIP1_004_transcript = ensembl77.transcript_by_id(
        CTNNBIP1_004_transcript_id)
    assert Locus.__eq__(CTNNBIP1_004_locus, CTNNBIP1_004_transcript), \
        "Expected locus %s but got %s" % (
            CTNNBIP1_004_locus, Locus.__str__(CTNNBIP1_004_transcript))

    start_offsets = CTNNBIP1_004_transcript.start_codon_spliced_offsets
    assert len(start_offsets) == 3, \
        "Wrong length for start codon: %d (%s)" % (
Esempio n. 24
0
"""
Exon IDs of the TP53 gene and one of its transcripts (TP53-026) were copied
from the Ensembl website, make sure same IDs are found by pyensembl.
"""
from __future__ import absolute_import

from pyensembl import cached_release

# Release 77 object obtained once at import time; the exon IDs listed below
# were copied for this release.
ensembl = cached_release(77)

# all exons associated with TP53 gene in Ensembl release 77
TP53_EXON_IDS_RELEASE_77 = [
    'ENSE00002337729', 'ENSE00002419584',
    'ENSE00003625790', 'ENSE00003518480',
    'ENSE00003723991', 'ENSE00003712342',
    'ENSE00001657961', 'ENSE00003725258',
    'ENSE00003740946', 'ENSE00002204316',
    'ENSE00002064269', 'ENSE00003750554',
    'ENSE00003634848', 'ENSE00003492844',
    'ENSE00003735852', 'ENSE00003545950',
    'ENSE00003605891', 'ENSE00002051192',
    'ENSE00002084733', 'ENSE00003726882',
    'ENSE00001146308', 'ENSE00002667911',
    'ENSE00003752869', 'ENSE00003739898',
    'ENSE00003753508', 'ENSE00002034209',
    'ENSE00002030826', 'ENSE00001596491',
    'ENSE00002037735', 'ENSE00003736616',
    'ENSE00002672443', 'ENSE00002226620',
    'ENSE00003715195', 'ENSE00003750794',
    'ENSE00003745267', 'ENSE00003746220',
    'ENSE00003656695', 'ENSE00003669712',
Esempio n. 25
0
# Read the exported Differential Gene Expression file to find the Drug Candidates
df = pd.read_csv("diff_exp_results.csv")

# Strip the 5-character prefix in front of each Ensembl ID
df['Gene'] = df['Gene'].str[5:]

# Keep only the rows whose log2 fold change is above the threshold of 3
df = df[df.log2FoldChange > 3]

# Store the newly found drugs for the thresholded genes in df_DG
df_DG = pd.DataFrame(columns=('Gene', 'Drug'))

# Import annotations. You will need to install:
# pyensembl install --release 100 --species homo_sapiens
# if the file doesn't work
geneDB = cached_release(100, "human")

# Find the drugs available for each remaining gene
print("finding drugs available for the genes...")
genes_list = []
drugs_list = []
# Iterate over the values themselves. After the filter above the index labels
# are no longer 0..len-1, so df['Gene'][i] would look up a label that is
# usually missing and the gene would be skipped without any notice.
for gene_id in df['Gene']:
    try:
        genename = geneDB.gene_name_of_gene_id(gene_id)
        drugs = list(drugsfinder(genename))
    except Exception as error:
        # Best effort: a gene that cannot be resolved is skipped, but the
        # reason is reported instead of being swallowed silently.
        print("skipping gene %s: %s" % (gene_id, error))
        continue
    # one (gene, drug) pair per drug found, kept in two parallel lists
    genes_list.extend([genename] * len(drugs))
    drugs_list.extend(drugs)

df_DG['Gene'] = genes_list
Esempio n. 26
0
    def __init__(
            self,
            contig,
            start,
            ref,
            alt,
            ensembl=ensembl_grch38,
            allow_extended_nucleotides=False,
            normalize_contig_name=True):
        """
        Construct a Variant object.

        Both the values as given (original_contig, original_start,
        original_ref, original_alt) and their normalized/trimmed
        counterparts (contig, start, end, ref, alt) are stored.

        Parameters
        ----------
        contig : str
            Chromosome that this variant is on

        start : int
            1-based position on the chromosome of first reference nucleotide

        ref : str
            Reference nucleotide(s)

        alt : str
            Alternate nucleotide(s)

        ensembl : Genome or EnsemblRelease, int, or str
            Object used for determining gene/transcript annotations. An int
            is resolved with cached_release and a str with
            genome_for_reference_name.

        allow_extended_nucleotides : bool
            Extended nucleotides include 'Y' for pyrimidines or 'N' for any
            base

        normalize_contig_name : bool
            By default the contig name will be normalized by trimming a 'chr'
            prefix and converting all letters to upper-case. If we don't want
            this behavior then pass normalize_contig_name=False.

        Raises
        ------
        TypeError
            If `ensembl` is not a Genome, an int, or a str.
        """

        # first initialize the _genes and _transcripts fields we use to cache
        # lists of overlapping pyensembl Gene and Transcript objects
        self._genes = self._transcripts = None

        # user might supply Ensembl release as an integer, reference name,
        # or pyensembl.Genome object
        if isinstance(ensembl, Genome):
            self.ensembl = ensembl
        elif isinstance(ensembl, int):
            self.ensembl = cached_release(ensembl)
        elif isinstance(ensembl, str):
            self.ensembl = genome_for_reference_name(ensembl)
        else:
            # note: the message reports the type first and the value second
            raise TypeError(
                ("Expected ensembl to be an int, string, or pyensembl.Genome "
                 "instance, got %s : %s") % (type(ensembl), str(ensembl)))

        # remember the constructor options and the contig exactly as given
        self.normalize_contig_name = normalize_contig_name
        self.allow_extended_nucleotides = allow_extended_nucleotides
        self.original_contig = contig
        self.contig = normalize_chromosome(contig) if normalize_contig_name else contig

        if ref != alt and ref in STANDARD_NUCLEOTIDES and alt in STANDARD_NUCLEOTIDES:
            # Optimization for common case: when ref and alt are two distinct
            # members of STANDARD_NUCLEOTIDES the values are stored as given,
            # skipping the normalization and trimming below, and start == end.
            self.original_ref = self.ref = ref
            self.original_alt = self.alt = alt
            self.original_start = self.start = self.end = int(start)
            return

        # we want to preserve the ref/alt/pos both as they appeared in the
        # original VCF or MAF file but also normalize variants to get rid
        # of shared prefixes/suffixes between the ref and alt nucleotide
        # strings e.g. g.10 CTT>T can be normalized into g.10delCT
        #
        # The untrimmed variant properties go into fields
        #    Variant.{original_ref, original_alt, original_start}
        # whereas the trimmed fields are:
        #    Variant.{ref, alt, start, end}

        # the original entries must preserve the number of nucleotides in
        # ref and alt but we still want to normalize e.g. '-' and '.' into ''
        self.original_ref = normalize_nucleotide_string(
            ref,
            allow_extended_nucleotides=allow_extended_nucleotides)
        self.original_alt = normalize_nucleotide_string(
            alt,
            allow_extended_nucleotides=allow_extended_nucleotides)
        self.original_start = int(start)

        # normalize the variant by trimming any shared prefix or suffix
        # between ref and alt nucleotide sequences and then
        # offset the variant position by the length of the trimmed prefix
        (trimmed_ref, trimmed_alt, prefix, suffix) = (
            trim_shared_flanking_strings(self.original_ref, self.original_alt))

        self.ref = trimmed_ref
        self.alt = trimmed_alt

        if len(trimmed_ref) == 0:
            # insertions must be treated differently since the meaning of a
            # position for an insertion is:
            #   "insert the alt nucleotides after this position"
            #
            # Aside: what if both trimmed ref and alt strings are empty?
            # This means we had a "null" variant, probably from a VCF
            # generated by force-calling mutations which weren't actually
            # found in the sample.
            # Null variants are interpreted as inserting zero nucleotides
            # after the whole reference sequence.
            #
            # Start and end both are base-1 nucleotide position before
            # insertion.
            self.start = self.original_start + max(0, len(prefix) - 1)
            self.end = self.start
        else:
            # for substitutions and deletions the [start:end] interval is
            # an inclusive selection of reference nucleotides
            self.start = self.original_start + len(prefix)
            self.end = self.start + len(trimmed_ref) - 1
Esempio n. 27
0
from .common import test_ensembl_releases
from .data import (
    FOXP3_001_transcript_id,
    CTNNBIP1_004_transcript_id,
    CTNNBIP1_004_UTR5,
    CTNNBIP1_004_UTR3,
    CTNNBIP1_004_CDS,
    CTNNBIP1_004_locus,
    CTTNNIP1_004_exon_lengths,
    CTTNNIP1_004_exon_ids,
    EGFR_001_protein_sequence,
    TP53_gene_id,
)

# module-level Ensembl release 77 object shared by the tests below
ensembl77 = cached_release(77)


def test_transcript_start_codon():
    """
    test_transcript_start_codon : Check that the fields of the Transcript
    (for transcript named CTNNBIP1-004) match known values.
    """
    CTNNBIP1_004_transcript = ensembl77.transcript_by_id(
        CTNNBIP1_004_transcript_id)
    assert Locus.__eq__(CTNNBIP1_004_locus, CTNNBIP1_004_transcript), \
        "Expected locus %s but got %s" % (
            CTNNBIP1_004_locus, Locus.__str__(CTNNBIP1_004_transcript))

    start_offsets = CTNNBIP1_004_transcript.start_codon_spliced_offsets
    assert len(start_offsets) == 3, \
"""Make sure we're getting correct transcript sequence from Ensembl and that
it's a sequence type which correctly implements `complement`
and `reverse_complement`
"""

from __future__ import absolute_import
from nose.tools import eq_
from pyensembl import cached_release

# module-level Ensembl release objects (releases 54 and 83) for the tests
# in this module
# NOTE(review): ensembl83 is not referenced by the test defined directly
# below -- confirm it is used elsewhere before relying on it
ensembl54 = cached_release(54)
ensembl83 = cached_release(83)

def test_transcript_sequence_ensembl54():
    """
    Transcript ENST00000321606 in Ensembl release 54 should be 414nt long
    and match the expected nucleotide sequence, both when the sequence is
    fetched by transcript ID from the release object and when it is read
    from the `sequence` attribute of the corresponding Transcript object.
    """
    transcript_id = "ENST00000321606"

    # sequence looked up directly by transcript ID
    seq = ensembl54.transcript_sequence(transcript_id)
    seq_length = len(seq)
    assert seq_length == 414, \
        "Expected transcript ENST00000321606 to have 414nt, got %s : %d" % (
            seq, seq_length)

    # expected sequence, written as six adjacent string literals of 69nt each
    expected_sequence = (
        "CATGTCACCCACCTTCAGGCGGCCCAAGACACTGCGACTCCGGAGGCAGCCCAGATATCCTCGGAAGAG"
        "CACCCCCAGGAGAAACAAGCTTGGCCACTATGCTATCATCAAGTTTCCGCTGACCACTGAGTCGGCCGT"
        "GAAGAAGATAGAAGAAAACAACACGCTTGTGTTCACTGTGGATGTTAAAGCCAACAAGCACCAGATCAG"
        "ACAGGCTGTGAAGAAGCTCTATGACAGTGATGTGGCCAAGGTCACCACCCTGATTTGTCCTGATAAAGA"
        "GAACAAGGCATATGTTCGACTTGCTCCTGATTATGATGCTTTCGATGTTGTAACAAAATTGGGATCACC"
        "TAAACTGAGTCCAGCTGGCTAACTCTAAATATATGTGTATCTTTTCAGCATAAAAAAATAATGTTTTTC"
    )
    eq_(str(seq), expected_sequence)

    # the Transcript object must expose the very same sequence
    transcript = ensembl54.transcript_by_id(transcript_id)
    eq_(transcript.sequence, seq)
Esempio n. 29
0
"""
Exon IDs of the TP53 gene and one of its transcripts (TP53-026) were copied
from the Ensembl website, make sure same IDs are found by pyensembl.
"""
from __future__ import absolute_import

from pyensembl import cached_release

# module-level Ensembl release 77 object for the tests in this module
ensembl = cached_release(77)

# IDs of all exons associated with the TP53 gene in Ensembl release 77
# (per the module docstring, these were copied from the Ensembl website)
TP53_EXON_IDS_RELEASE_77 = [
    'ENSE00002337729', 'ENSE00002419584', 'ENSE00003625790', 'ENSE00003518480',
    'ENSE00003723991', 'ENSE00003712342', 'ENSE00001657961', 'ENSE00003725258',
    'ENSE00003740946', 'ENSE00002204316', 'ENSE00002064269', 'ENSE00003750554',
    'ENSE00003634848', 'ENSE00003492844', 'ENSE00003735852', 'ENSE00003545950',
    'ENSE00003605891', 'ENSE00002051192', 'ENSE00002084733', 'ENSE00003726882',
    'ENSE00001146308', 'ENSE00002667911', 'ENSE00003752869', 'ENSE00003739898',
    'ENSE00003753508', 'ENSE00002034209', 'ENSE00002030826', 'ENSE00001596491',
    'ENSE00002037735', 'ENSE00003736616', 'ENSE00002672443', 'ENSE00002226620',
    'ENSE00003715195', 'ENSE00003750794', 'ENSE00003745267', 'ENSE00003746220',
    'ENSE00003656695', 'ENSE00003669712', 'ENSE00002051873', 'ENSE00002048269',
    'ENSE00002670535', 'ENSE00002677565', 'ENSE00003532881', 'ENSE00003520683',
    'ENSE00002076714', 'ENSE00002062958', 'ENSE00002073243', 'ENSE00003670707',
    'ENSE00002065802', 'ENSE00002362269'
]


def test_exon_ids_of_gene_id():
    """
    test_exon_ids_of_gene_id: Ensure that gene_id ENSG00000141510 (name=TP53),