def test_genome_arg_to_load_vcf_cached_75():
    # Passing the cached release 75 genome explicitly must give the same
    # collection as the default load when UCSC contig names are converted...
    default_load = load_vcf(HG19_VCF_FILENAME)
    converted_load = load_vcf(
        HG19_VCF_FILENAME,
        genome=cached_release(75),
        convert_ucsc_contig_names=True)
    eq_(default_load, converted_load)

    # ...and a different collection when the conversion is switched off.
    default_reload = load_vcf(HG19_VCF_FILENAME)
    unconverted_load = load_vcf(
        HG19_VCF_FILENAME,
        genome=cached_release(75),
        convert_ucsc_contig_names=False)
    assert default_reload != unconverted_load
def infer_genome(genome_object_string_or_int):
    """
    Turn a flexible genome specification into a PyEnsembl Genome.

    Parameters
    ----------
    genome_object_string_or_int : Genome, int, or str
        * Genome: returned unchanged.
        * int: treated as an Ensembl version and resolved through
          `cached_release`.
        * str: first mapped to a canonical reference name
          (e.g. hg19 -> GRCh37) with `infer_reference_name`, then resolved
          through `genome_for_reference_name`.

    Raises
    ------
    TypeError
        If the argument is none of the accepted kinds.
    """
    spec = genome_object_string_or_int

    if isinstance(spec, Genome):
        return spec

    if is_integer(spec):
        return cached_release(spec)

    if is_string(spec):
        # canonical reference name first, then the matching Genome object
        canonical_name = infer_reference_name(spec)
        return genome_for_reference_name(canonical_name)

    message_template = (
        "Expected genome to be an int, string, or pyensembl.Genome "
        "instance, got %s : %s")
    raise TypeError(message_template % (str(spec), type(spec)))
def __setstate__(self, fields):
    """
    Restore this object's attributes from the state dictionary `fields`.

    The "release" entry is removed from `fields` (the dictionary is mutated)
    and converted into `self.ensembl` via `cached_release`; every remaining
    entry is assigned as a plain attribute.
    """
    # The release needs special handling: it is stored as a value that
    # cached_release() turns back into a genome object.
    release = fields.pop("release")
    self.ensembl = cached_release(release)

    # Everything else is a simple property that is just set by name.
    for attribute_name, attribute_value in fields.items():
        setattr(self, attribute_name, attribute_value)
def test_pandas_and_pyvcf_implementations_equivalent():
    # Generator test: yields one (do_test, kwargs) pair per VCF source so that
    # load_vcf_fast and load_vcf are compared on each input.
    def do_test(kwargs):
        vcf_pandas = load_vcf_fast(**kwargs)
        vcf_pyvcf = load_vcf(**kwargs)
        eq_(vcf_pandas, vcf_pyvcf)
        eq_(len(vcf_pandas), len(vcf_pyvcf))
        eq_(vcf_pandas.elements, vcf_pyvcf.elements)
        eq_(vcf_pandas.metadata, vcf_pyvcf.metadata)
        assert len(vcf_pandas) > 1
        assert len(vcf_pyvcf) > 1

    leading_names = (
        "somatic_hg19_14muts.vcf",
        "somatic_hg19_14muts.space_in_sample_name.vcf",
    )
    trailing_names = (
        "somatic_hg19_14muts.vcf.gz",
        "multiallelic.vcf",
        "mutect-example.vcf",
        "strelka-example.vcf",
    )

    paths = [{'path': data_path(name)} for name in leading_names]
    # same file again, with an extra leading slash in front of the path
    paths.append({'path': "/" + data_path("somatic_hg19_14muts.vcf")})
    paths.extend({'path': data_path(name)} for name in trailing_names)
    # the headerless file is loaded with an explicitly supplied genome
    paths.append({
        'path': data_path("mutect-example-headerless.vcf"),
        'genome': cached_release(75),
    })

    if RUN_TESTS_REQUIRING_INTERNET:
        for url in (VCF_EXTERNAL_URL, VCF_EXTERNAL_URL + ".gz"):
            paths.append({'path': url})

    for kwargs in paths:
        yield (do_test, kwargs)
def test_genome_arg_to_load_vcf():
    # Every accepted spelling of the genome argument must load the same
    # variants as the default call.
    def genome_specs():
        # Lazily produced so each value is only evaluated right before the
        # load_vcf call that uses it.
        yield 75
        yield cached_release(75)
        yield "grch37"
        yield "GRCh37"
        yield "b37"
        # TODO: actually make hg19 different from b37! They should use
        # different MT sequences
        yield "hg19"

    variants = load_vcf(VCF_FILENAME)
    for genome_spec in genome_specs():
        eq_(variants, load_vcf(VCF_FILENAME, genome=genome_spec))
def test_transcript_support_level():
    """
    The Transcript Support Level (TSL) is a method to highlight the
    well-supported and poorly-supported transcript models for users, based on
    the type and quality of the alignments used to annotate the transcript.

    In the Ensembl database it can be a value 1 through 5, reported as NA,
    missing for a record, or missing completely in older releases.
    It is translated to an integer value, otherwise to None.
    """
    def support_level_of(genome, transcript_name):
        # support level of the first transcript found under this name
        first_match = genome.transcripts_by_name(transcript_name)[0]
        return first_match.support_level

    ensembl93 = cached_release(93)
    eq_(support_level_of(ensembl93, "DDX11L1-202"), 1)

    # For this transcript, the transcript_support_level value is missing in
    # the database record:
    eq_(support_level_of(ensembl93, "OR4G11P-202"), None)

    # Some features are reported as "NA" in Ensembl: those are features like
    # pseudogenes, single exon transcripts, HLA, T-cell receptor and Ig
    # transcripts that are not analysed in terms of TSL and therefore not
    # given any of the TSL categories. NA is translated to None as well.
    eq_(support_level_of(ensembl93, "MIR1302-2-201"), None)

    # The transcript_support_level column was missing completely in GRCh37
    # and older releases of GRCh38:
    ensembl77 = cached_release(77)
    eq_(support_level_of(ensembl77, "DDX11L1-002"), None)
def test_ensembl_releases(*versions):
    """
    Build a decorator that runs a unit test, which takes an EnsemblRelease as
    its argument, once for each of several releases.

    With no `versions` given the module-level `major_releases` are used
    (most recent for each reference genome); otherwise each version is
    resolved through `cached_release`.
    """
    if versions:
        ensembl_releases = [cached_release(number) for number in versions]
    else:
        ensembl_releases = major_releases

    def decorator(test_fn):
        @functools.wraps(test_fn)
        def run_on_every_release():
            for release in ensembl_releases:
                test_fn(release)
        return run_on_every_release

    return decorator
def test_pandas_and_pyvcf_implementations_equivalent():
    # Generator test comparing the results of load_vcf_fast and load_vcf for
    # a series of VCF inputs; one (do_test, kwargs) pair is yielded per input.
    paths = []

    def add_case(path, **extra_kwargs):
        case = {'path': path}
        case.update(extra_kwargs)
        paths.append(case)

    add_case(data_path("somatic_hg19_14muts.vcf"))
    add_case(data_path("somatic_hg19_14muts.space_in_sample_name.vcf"))
    add_case("/" + data_path("somatic_hg19_14muts.vcf"))
    add_case(data_path("somatic_hg19_14muts.vcf.gz"))
    add_case(data_path("multiallelic.vcf"))
    add_case(data_path("mutect-example.vcf"))
    add_case(data_path("strelka-example.vcf"))
    # headerless input: the genome is passed explicitly
    add_case(
        data_path("mutect-example-headerless.vcf"),
        genome=cached_release(75))

    if RUN_TESTS_REQUIRING_INTERNET:
        add_case(VCF_EXTERNAL_URL)
        add_case(VCF_EXTERNAL_URL + ".gz")

    def do_test(kwargs):
        vcf_pandas = load_vcf_fast(**kwargs)
        vcf_pyvcf = load_vcf(**kwargs)

        # whole collections, then their individual parts
        eq_(vcf_pandas, vcf_pyvcf)
        eq_(len(vcf_pandas), len(vcf_pyvcf))
        eq_(vcf_pandas.elements, vcf_pyvcf.elements)
        eq_(vcf_pandas.metadata, vcf_pyvcf.metadata)

        # guard against trivially-equal near-empty results
        assert len(vcf_pandas) > 1
        assert len(vcf_pyvcf) > 1

    for kwargs in paths:
        yield (do_test, kwargs)
def test_ensembl_releases(*versions):
    """
    Build a decorator that runs a unit test, which takes an EnsemblRelease as
    its argument, once for each of several releases.

    With no `versions` given the module-level `major_releases` are used
    (most recent for each reference genome); otherwise each version is
    resolved through `cached_release`.

    Raises
    ------
    ValueError
        If any requested version is greater than MAX_ENSEMBL_RELEASE.
    """
    if not versions:
        ensembl_releases = major_releases
    else:
        has_unknown_release = any(
            number > MAX_ENSEMBL_RELEASE for number in versions)
        if has_unknown_release:
            raise ValueError(
                "Invalid ensembl release numbers: %s" % (versions,))
        ensembl_releases = [cached_release(number) for number in versions]

    def decorator(test_fn):
        @functools.wraps(test_fn)
        def run_on_every_release():
            for release in ensembl_releases:
                test_fn(release)
        return run_on_every_release

    return decorator
def test_ensembl_releases(*versions):
    """
    Return a decorator which calls the wrapped unit test once per
    EnsemblRelease.

    The releases are `major_releases` when no versions are passed (most
    recent for each reference genome), or `cached_release(version)` for each
    version otherwise.

    Raises
    ------
    ValueError
        If any requested version is greater than MAX_ENSEMBL_RELEASE.
    """
    if versions:
        # reject the whole request as soon as one version is too large
        for requested in versions:
            if requested > MAX_ENSEMBL_RELEASE:
                raise ValueError(
                    "Invalid ensembl release numbers: %s" % (versions,))
        ensembl_releases = []
        for requested in versions:
            ensembl_releases.append(cached_release(requested))
    else:
        ensembl_releases = major_releases

    def decorator(test_fn):
        def call_with_each_release():
            for ensembl in ensembl_releases:
                test_fn(ensembl)
        return functools.wraps(test_fn)(call_with_each_release)

    return decorator
""" Test all methods which return collections of gene IDs that aren't converting from some other type of name or ID. TODO: Implement tests for EnsemblRelease.gene_ids """ from __future__ import absolute_import from nose.tools import assert_raises, ok_ from pyensembl import ensembl_grch38, cached_release from .common import test_ensembl_releases ensembl77 = cached_release(77, "human") def test_gene_ids_grch38_hla_a(): # chr6:29,945,884 is a position for HLA-A # Gene ID = ENSG00000206503 # based on: # http://useast.ensembl.org/Homo_sapiens/Gene/ # Summary?db=core;g=ENSG00000206503;r=6:29941260-29945884 ids = ensembl_grch38.gene_ids_at_locus(6, 29945884) expected = "ENSG00000206503" assert ids == ["ENSG00000206503"], \ "Expected HLA-A, gene ID = %s, got: %s" % (expected, ids) def test_gene_ids_of_gene_name_hla_grch38(): hla_a_gene_ids = ensembl_grch38.gene_ids_of_gene_name("HLA-A") assert 'ENSG00000206503' in hla_a_gene_ids, hla_a_gene_ids
def __init__(
        self,
        contig,
        start,
        ref,
        alt,
        ensembl=ensembl_grch38,
        allow_extended_nucleotides=False,
        normalize_contig_name=True):
    """
    Construct a Variant object.

    Parameters
    ----------
    contig : str
        Chromosome that this variant is on

    start : int
        1-based position on the chromosome of first reference nucleotide

    ref : str
        Reference nucleotide(s)

    alt : str
        Alternate nucleotide(s)

    ensembl : Genome, int, or str
        Source of gene/transcript annotations. A Genome is used as-is, an
        int is resolved with `cached_release` and a str with
        `genome_for_reference_name`.

    allow_extended_nucleotides : bool
        Extended nucleotides include 'Y' for pyrimidines or 'N' for any base

    normalize_contig_name : bool
        When True (the default) the contig is passed through
        `normalize_chromosome`; pass False to keep the name as given.

    Raises
    ------
    TypeError
        If `ensembl` is not a Genome, int, or str.
    """
    # caches for the overlapping pyensembl Gene and Transcript objects
    self._genes = None
    self._transcripts = None

    # the genome may be given as an object, a release number or a
    # reference name
    if isinstance(ensembl, Genome):
        genome = ensembl
    elif isinstance(ensembl, int):
        genome = cached_release(ensembl)
    elif isinstance(ensembl, str):
        genome = genome_for_reference_name(ensembl)
    else:
        message_template = (
            "Expected ensembl to be an int, string, or pyensembl.Genome "
            "instance, got %s : %s")
        raise TypeError(message_template % (type(ensembl), str(ensembl)))
    self.ensembl = genome

    self.normalize_contig_name = normalize_contig_name
    self.allow_extended_nucleotides = allow_extended_nucleotides
    self.original_contig = contig
    if normalize_contig_name:
        self.contig = normalize_chromosome(contig)
    else:
        self.contig = contig

    is_simple_substitution = (
        ref != alt and
        ref in STANDARD_NUCLEOTIDES and
        alt in STANDARD_NUCLEOTIDES)
    if is_simple_substitution:
        # Shortcut for the common case: nothing to normalize or trim.
        self.original_ref = ref
        self.ref = ref
        self.original_alt = alt
        self.alt = alt
        position = int(start)
        self.original_start = position
        self.start = position
        self.end = position
        return

    # Two views of the variant are kept:
    #   * original_ref / original_alt / original_start follow the input
    #     (e.g. a VCF or MAF record), except that the nucleotide strings
    #     go through normalize_nucleotide_string (e.g. '-' and '.' -> '')
    #   * ref / alt / start / end describe the variant after shared
    #     prefixes and suffixes are trimmed,
    #     e.g. g.10 CTT>T can be normalized into g.10delCT
    self.original_ref = normalize_nucleotide_string(
        ref,
        allow_extended_nucleotides=allow_extended_nucleotides)
    self.original_alt = normalize_nucleotide_string(
        alt,
        allow_extended_nucleotides=allow_extended_nucleotides)
    self.original_start = int(start)

    trimmed_ref, trimmed_alt, prefix, _suffix = trim_shared_flanking_strings(
        self.original_ref,
        self.original_alt)
    self.ref = trimmed_ref
    self.alt = trimmed_alt

    if trimmed_ref:
        # Substitutions and deletions: [start, end] is an inclusive
        # selection of reference nucleotides.
        self.start = self.original_start + len(prefix)
        self.end = self.start + len(trimmed_ref) - 1
    else:
        # Insertions: the position means "insert the alt nucleotides after
        # this position", so start and end are both the base-1 position of
        # the nucleotide before the insertion.
        #
        # If the trimmed alt is empty as well this is a "null" variant
        # (probably from a VCF generated by force-calling mutations which
        # weren't actually found in the sample); it is interpreted as
        # inserting zero nucleotides after the whole reference sequence.
        self.start = self.original_start + max(0, len(prefix) - 1)
        self.end = self.start
StartLoss, AlternateStartCodon, PrematureStop, FrameShift, ExonLoss, ExonicSpliceSite, FrameShiftTruncation, # TODO: SpliceDonor, SpliceReceptor ) from pyensembl import ensembl_grch37, cached_release from .common import expect_effect # tried using more recent releases but found that many of them # are very specific to Ensembl data between releases 77-81 ensembl_grch38 = cached_release(81) def test_incomplete(): # transcript EGFR-009 (ENST00000450046 in Ensembl 78) # has an incomplete 3' end # chrom. 7 starting at 55,109,723 # first exon begins: ATCATTCCTTTGGGCCTAGGA # change the first nucleotide of the 5' UTR A>T variant = Variant("7", 55109723, "A", "T", ensembl=ensembl_grch38) expect_effect( variant, transcript_id="ENST00000450046", effect_class=IncompleteTranscript, modifies_coding_sequence=False, modifies_protein_sequence=False)
"""Make sure we're getting correct transcritp sequence from Ensembl and that it's a sequence type which correctly implements `complement` and `reverse_complement` """ from __future__ import absolute_import from nose.tools import eq_ from pyensembl import cached_release ensembl54 = cached_release(54) ensembl83 = cached_release(83) def test_transcript_sequence_ensembl54(): seq = ensembl54.transcript_sequence("ENST00000321606") assert len(seq) == 414, \ "Expected transcript ENST00000321606 to have 414nt, got %s : %d" % ( seq, len(seq)) nucleotide_lines = [ "CATGTCACCCACCTTCAGGCGGCCCAAGACACTGCGACTCCGGAGGCAGCCCAGATATCCTCGGAAGAG", "CACCCCCAGGAGAAACAAGCTTGGCCACTATGCTATCATCAAGTTTCCGCTGACCACTGAGTCGGCCGT", "GAAGAAGATAGAAGAAAACAACACGCTTGTGTTCACTGTGGATGTTAAAGCCAACAAGCACCAGATCAG", "ACAGGCTGTGAAGAAGCTCTATGACAGTGATGTGGCCAAGGTCACCACCCTGATTTGTCCTGATAAAGA", "GAACAAGGCATATGTTCGACTTGCTCCTGATTATGATGCTTTCGATGTTGTAACAAAATTGGGATCACC", "TAAACTGAGTCCAGCTGGCTAACTCTAAATATATGTGTATCTTTTCAGCATAAAAAAATAATGTTTTTC" ] full_transcript_sequence = "".join(nucleotide_lines) eq_(str(seq), full_transcript_sequence) # now get the same sequence via a Transcript object eq_(ensembl54.transcript_by_id("ENST00000321606").sequence, seq)
from pyensembl import cached_release, genome_for_reference_name

# Genome object looked up by the reference name "grcm38".
grcm38 = genome_for_reference_name("grcm38")
# Cached Ensembl release 87.
# NOTE(review): the name suggests this release uses the GRCh38 reference; confirm.
grch38 = cached_release(87)
StartLoss, AlternateStartCodon, PrematureStop, FrameShift, ExonLoss, ExonicSpliceSite, FrameShiftTruncation, # TODO: SpliceDonor, SpliceReceptor ) from pyensembl import ensembl_grch37, cached_release from .common import expect_effect # tried using more recent releases but found that many of them # are very specific to Ensembl data between releases 77-81 ensembl_grch38 = cached_release(81) def test_incomplete(): # transcript EGFR-009 (ENST00000450046 in Ensembl 78) # has an incomplete 3' end # chrom. 7 starting at 55,109,723 # first exon begins: ATCATTCCTTTGGGCCTAGGA # change the first nucleotide of the 5' UTR A>T variant = Variant("7", 55109723, "A", "T", ensembl=ensembl_grch38) expect_effect(variant, transcript_id="ENST00000450046", effect_class=IncompleteTranscript, modifies_coding_sequence=False, modifies_protein_sequence=False)
""" Test all methods which return collections of gene IDs that aren't converting from some other type of name or ID. TODO: Implement tests for EnsemblRelease.gene_ids """ from __future__ import absolute_import from nose.tools import assert_raises, ok_ from pyensembl import ensembl_grch38, cached_release from .common import test_ensembl_releases ensembl77 = cached_release(77, "human") def test_gene_ids_grch38_hla_a(): # chr6:29,945,884 is a position for HLA-A # Gene ID = ENSG00000206503 # based on: # http://useast.ensembl.org/Homo_sapiens/Gene/ # Summary?db=core;g=ENSG00000206503;r=6:29941260-29945884 ids = ensembl_grch38.gene_ids_at_locus(6, 29945884) expected = "ENSG00000206503" assert ids == ["ENSG00000206503"], \ "Expected HLA-A, gene ID = %s, got: %s" % (expected, ids) def test_gene_ids_of_gene_name_hla_grch38(): hla_a_gene_ids = ensembl_grch38.gene_ids_of_gene_name("HLA-A") assert 'ENSG00000206503' in hla_a_gene_ids, hla_a_gene_ids hla_b_gene_ids = ensembl_grch38.gene_ids_of_gene_name("HLA-B")
def __init__(
        self,
        contig,
        start,
        ref,
        alt,
        ensembl=ensembl_grch38,
        allow_extended_nucleotides=False):
    """
    Construct a Variant object.

    Parameters
    ----------
    contig : str
        Chromosome that this variant is on

    start : int
        1-based position on the chromosome of first reference nucleotide

    ref : str
        Reference nucleotide(s)

    alt : str
        Alternate nucleotide(s)

    ensembl : Genome, int, or str
        Source of gene/transcript annotations. A Genome is used as-is, an
        int is resolved with `cached_release` and a str with
        `genome_for_reference_name`.

    allow_extended_nucleotides : bool
        Forwarded to `normalize_nucleotide_string` when the ref and alt
        strings are normalized.

    Raises
    ------
    TypeError
        If `ensembl` is not a Genome, int, or str.
    """
    self.contig = normalize_chromosome(contig)

    # the genome may be given as an object, a release number or a
    # reference name
    if isinstance(ensembl, Genome):
        genome = ensembl
    elif isinstance(ensembl, int):
        genome = cached_release(ensembl)
    elif isinstance(ensembl, str):
        genome = genome_for_reference_name(ensembl)
    else:
        message_template = (
            "Expected ensembl to be an int, string, or pyensembl.Genome "
            "instance, got %s : %s")
        raise TypeError(message_template % (type(ensembl), str(ensembl)))
    self.ensembl = genome

    both_standard = (
        ref in STANDARD_NUCLEOTIDES and alt in STANDARD_NUCLEOTIDES)
    if both_standard and ref != alt:
        # Shortcut for the common case: nothing to normalize or trim.
        self.original_ref = ref
        self.ref = ref
        self.original_alt = alt
        self.alt = alt
        position = int(start)
        self.original_start = position
        self.start = position
        self.end = position
        return

    # Two views of the variant are kept:
    #   * original_ref / original_alt / original_start follow the input
    #     (e.g. a VCF or MAF record), except that the nucleotide strings
    #     go through normalize_nucleotide_string (e.g. '-' and '.' -> '')
    #   * ref / alt / start / end describe the variant after shared
    #     prefixes and suffixes are trimmed,
    #     e.g. g.10 CTT>T can be normalized into g.10delCT
    self.original_ref = normalize_nucleotide_string(
        ref,
        allow_extended_nucleotides=allow_extended_nucleotides)
    self.original_alt = normalize_nucleotide_string(
        alt,
        allow_extended_nucleotides=allow_extended_nucleotides)
    self.original_start = int(start)

    trimmed_ref, trimmed_alt, prefix, _suffix = trim_shared_flanking_strings(
        self.original_ref,
        self.original_alt)
    self.ref = trimmed_ref
    self.alt = trimmed_alt

    if trimmed_ref:
        # Substitutions and deletions: [start, end] is an inclusive
        # selection of reference nucleotides.
        self.start = self.original_start + len(prefix)
        self.end = self.start + len(trimmed_ref) - 1
    else:
        # Insertions: the position means "insert the alt nucleotides after
        # this position", so start and end are both the nucleotide before
        # the insertion.
        self.start = self.original_start + max(0, len(prefix) - 1)
        self.end = self.start
from .common import test_ensembl_releases from .data import ( FOXP3_001_transcript_id, CTNNBIP1_004_transcript_id, CTNNBIP1_004_UTR5, CTNNBIP1_004_UTR3, CTNNBIP1_004_CDS, CTNNBIP1_004_locus, CTTNNIP1_004_exon_lengths, CTTNNIP1_004_exon_ids, EGFR_001_protein_sequence, TP53_gene_id, ) ensembl77 = cached_release(77) def test_transcript_start_codon(): """ test_transcript_start_codon : Check that fields Transcript (for transcript named CTNNBIP1-004) matches known values. """ CTNNBIP1_004_transcript = ensembl77.transcript_by_id( CTNNBIP1_004_transcript_id) assert Locus.__eq__(CTNNBIP1_004_locus, CTNNBIP1_004_transcript), \ "Expected locus %s but got %s" % ( CTNNBIP1_004_locus, Locus.__str__(CTNNBIP1_004_transcript)) start_offsets = CTNNBIP1_004_transcript.start_codon_spliced_offsets assert len(start_offsets) == 3, \ "Wrong length for start codon: %d (%s)" % (
""" Exon IDs of the TP53 gene and one of its transcripts (TP53-026) were copied from the Ensembl website, make sure same IDs are found by pyensembl. """ from __future__ import absolute_import from pyensembl import cached_release ensembl = cached_release(77) # all exons associated with TP53 gene in Ensembl release 77 TP53_EXON_IDS_RELEASE_77 = [ 'ENSE00002337729', 'ENSE00002419584', 'ENSE00003625790', 'ENSE00003518480', 'ENSE00003723991', 'ENSE00003712342', 'ENSE00001657961', 'ENSE00003725258', 'ENSE00003740946', 'ENSE00002204316', 'ENSE00002064269', 'ENSE00003750554', 'ENSE00003634848', 'ENSE00003492844', 'ENSE00003735852', 'ENSE00003545950', 'ENSE00003605891', 'ENSE00002051192', 'ENSE00002084733', 'ENSE00003726882', 'ENSE00001146308', 'ENSE00002667911', 'ENSE00003752869', 'ENSE00003739898', 'ENSE00003753508', 'ENSE00002034209', 'ENSE00002030826', 'ENSE00001596491', 'ENSE00002037735', 'ENSE00003736616', 'ENSE00002672443', 'ENSE00002226620', 'ENSE00003715195', 'ENSE00003750794', 'ENSE00003745267', 'ENSE00003746220', 'ENSE00003656695', 'ENSE00003669712',
# Read the exported differential gene expression results to find the drug
# candidates.
df = pd.read_csv("diff_exp_results.csv")

# Drop the first five characters of every Gene value (stripping the prefix in
# front of the Ensembl ID).
# NOTE(review): assumes a fixed 5-character prefix; confirm against the CSV.
df['Gene'] = df['Gene'].str[5:]
# Keep only the genes above the log2 fold-change threshold.
df = df[df.log2FoldChange > 3]

# Stores the newly found drugs for the thresholded genes.
df_DG = pd.DataFrame(columns=('Gene', 'Drug'))

# Import annotations. You will need to install:
# pyensembl install --release 100 --species homo_sapiens
# if the file doesn't work
geneDB = cached_release(100, "human")

# Find the drugs available for the genes.
print("finding drugs available for the genes...")
genes_list = []
drugs_list = []
# Iterate over the values themselves. After the threshold filter the frame
# keeps its original index labels, so df['Gene'][i] for i in range(len(df))
# is a label lookup: it fails for every filtered-out label and never reaches
# surviving rows whose label is >= len(df).
for gene_id in df['Gene']:
    try:
        genename = geneDB.gene_name_of_gene_id(gene_id)
        drugs = list(drugsfinder(genename))
    except Exception:
        # Best effort: skip genes that cannot be resolved or queried, without
        # swallowing KeyboardInterrupt/SystemExit like a bare except would.
        continue
    # Extend both lists together so they always stay the same length.
    genes_list.extend([genename] * len(drugs))
    drugs_list.extend(drugs)
df_DG['Gene'] = genes_list
def __init__(
        self,
        contig,
        start,
        ref,
        alt,
        ensembl=ensembl_grch38,
        allow_extended_nucleotides=False,
        normalize_contig_name=True):
    """
    Construct a Variant object.

    Parameters
    ----------
    contig : str
        Chromosome that this variant is on

    start : int
        1-based position on the chromosome of first reference nucleotide

    ref : str
        Reference nucleotide(s)

    alt : str
        Alternate nucleotide(s)

    ensembl : Genome, int, or str
        Source of gene/transcript annotations. A Genome is used as-is, an
        int is resolved with `cached_release` and a str with
        `genome_for_reference_name`.

    allow_extended_nucleotides : bool
        Extended nucleotides include 'Y' for pyrimidines or 'N' for any base

    normalize_contig_name : bool
        When True (the default) the contig is passed through
        `normalize_chromosome`; pass False to keep the name as given.

    Raises
    ------
    TypeError
        If `ensembl` is not a Genome, int, or str.
    """
    # caches for the overlapping pyensembl Gene and Transcript objects
    self._genes = None
    self._transcripts = None

    # Resolve the genome, which may be given as an object, a release number
    # or a reference name.
    if isinstance(ensembl, Genome):
        self.ensembl = ensembl
    elif isinstance(ensembl, int):
        self.ensembl = cached_release(ensembl)
    elif isinstance(ensembl, str):
        self.ensembl = genome_for_reference_name(ensembl)
    else:
        described_input = (type(ensembl), str(ensembl))
        raise TypeError(
            ("Expected ensembl to be an int, string, or pyensembl.Genome "
             "instance, got %s : %s") % described_input)

    self.normalize_contig_name = normalize_contig_name
    self.allow_extended_nucleotides = allow_extended_nucleotides
    self.original_contig = contig
    self.contig = contig if not normalize_contig_name else (
        normalize_chromosome(contig))

    if (ref != alt
            and ref in STANDARD_NUCLEOTIDES
            and alt in STANDARD_NUCLEOTIDES):
        # Shortcut for the common case: nothing to normalize or trim.
        self.original_ref = ref
        self.ref = ref
        self.original_alt = alt
        self.alt = alt
        single_position = int(start)
        self.original_start = single_position
        self.start = single_position
        self.end = single_position
        return

    # The original_* fields follow the input record (e.g. from a VCF or MAF
    # file), with nucleotide strings passed through
    # normalize_nucleotide_string (e.g. '-' and '.' become ''), while
    # ref/alt/start/end describe the variant once shared prefixes and
    # suffixes are removed, e.g. g.10 CTT>T can be normalized into g.10delCT
    normalization_options = dict(
        allow_extended_nucleotides=allow_extended_nucleotides)
    self.original_ref = normalize_nucleotide_string(
        ref, **normalization_options)
    self.original_alt = normalize_nucleotide_string(
        alt, **normalization_options)
    self.original_start = int(start)

    trimmed = trim_shared_flanking_strings(
        self.original_ref, self.original_alt)
    (trimmed_ref, trimmed_alt, prefix, _suffix) = trimmed
    self.ref = trimmed_ref
    self.alt = trimmed_alt

    prefix_length = len(prefix)
    if trimmed_ref:
        # Substitutions and deletions: [start, end] is an inclusive
        # selection of reference nucleotides.
        self.start = self.original_start + prefix_length
        self.end = self.start + len(trimmed_ref) - 1
    else:
        # Insertions: the position means "insert the alt nucleotides after
        # this position", so start and end are both the base-1 position of
        # the nucleotide before the insertion.
        #
        # If the trimmed alt is empty as well this is a "null" variant
        # (probably from a VCF generated by force-calling mutations which
        # weren't actually found in the sample); it is interpreted as
        # inserting zero nucleotides after the whole reference sequence.
        self.start = self.original_start + max(0, prefix_length - 1)
        self.end = self.start
from .common import test_ensembl_releases from .data import ( FOXP3_001_transcript_id, CTNNBIP1_004_transcript_id, CTNNBIP1_004_UTR5, CTNNBIP1_004_UTR3, CTNNBIP1_004_CDS, CTNNBIP1_004_locus, CTTNNIP1_004_exon_lengths, CTTNNIP1_004_exon_ids, EGFR_001_protein_sequence, TP53_gene_id, ) ensembl77 = cached_release(77) def test_transcript_start_codon(): """ test_transcript_start_codon : Check that fields Transcript (for transcript named CTNNBIP1-004) matches known values. """ CTNNBIP1_004_transcript = ensembl77.transcript_by_id( CTNNBIP1_004_transcript_id) assert Locus.__eq__(CTNNBIP1_004_locus, CTNNBIP1_004_transcript), \ "Expected locus %s but got %s" % ( CTNNBIP1_004_locus, Locus.__str__(CTNNBIP1_004_transcript)) start_offsets = CTNNBIP1_004_transcript.start_codon_spliced_offsets assert len(start_offsets) == 3, \
""" Exon IDs of the TP53 gene and one of its transcripts (TP53-026) were copied from the Ensembl website, make sure same IDs are found by pyensembl. """ from __future__ import absolute_import from pyensembl import cached_release ensembl = cached_release(77) # all exons associated with TP53 gene in Ensembl release 77 TP53_EXON_IDS_RELEASE_77 = [ 'ENSE00002337729', 'ENSE00002419584', 'ENSE00003625790', 'ENSE00003518480', 'ENSE00003723991', 'ENSE00003712342', 'ENSE00001657961', 'ENSE00003725258', 'ENSE00003740946', 'ENSE00002204316', 'ENSE00002064269', 'ENSE00003750554', 'ENSE00003634848', 'ENSE00003492844', 'ENSE00003735852', 'ENSE00003545950', 'ENSE00003605891', 'ENSE00002051192', 'ENSE00002084733', 'ENSE00003726882', 'ENSE00001146308', 'ENSE00002667911', 'ENSE00003752869', 'ENSE00003739898', 'ENSE00003753508', 'ENSE00002034209', 'ENSE00002030826', 'ENSE00001596491', 'ENSE00002037735', 'ENSE00003736616', 'ENSE00002672443', 'ENSE00002226620', 'ENSE00003715195', 'ENSE00003750794', 'ENSE00003745267', 'ENSE00003746220', 'ENSE00003656695', 'ENSE00003669712', 'ENSE00002051873', 'ENSE00002048269', 'ENSE00002670535', 'ENSE00002677565', 'ENSE00003532881', 'ENSE00003520683', 'ENSE00002076714', 'ENSE00002062958', 'ENSE00002073243', 'ENSE00003670707', 'ENSE00002065802', 'ENSE00002362269' ] def test_exon_ids_of_gene_id(): """ test_exon_ids_of_gene_id: Ensure that gene_id ENSG00000141510 (name=TP53),