def test_normalize_chromosome():
    """
    Check that normalize_chromosome maps equivalent contig names onto one
    canonical spelling and rejects unusable input.
    """
    # "chr" prefix and letter case do not matter for X
    assert normalize_chromosome("X") == "X"
    assert normalize_chromosome("chrX") == "X"
    assert normalize_chromosome("x") == "X"

    # numeric contigs come back as strings, whether given as int or str
    assert normalize_chromosome(1) == "1"
    assert normalize_chromosome("1") == "1"
    assert normalize_chromosome("chr1") == "1"

    # every mitochondrial spelling collapses to "MT"
    assert normalize_chromosome("chrM") == "MT"
    assert normalize_chromosome("chrMT") == "MT"
    assert normalize_chromosome("M") == "MT"
    assert normalize_chromosome("MT") == "MT"

    # assert_raises is used purely as a context manager here, so only the
    # exception type is passed; the old `None` placeholder in the callable
    # slot is a deprecated calling convention.
    with assert_raises(TypeError):
        normalize_chromosome({"a": "b"})
    with assert_raises(TypeError):
        normalize_chromosome([])
    with assert_raises(TypeError):
        normalize_chromosome(None)

    # values of an accepted type but with no usable name
    with assert_raises(ValueError):
        normalize_chromosome("")
    with assert_raises(ValueError):
        normalize_chromosome(0)
def __init__(
        self,
        contig,
        start,
        ref,
        alt,
        ensembl=ensembl_grch38,
        allow_extended_nucleotides=False,
        normalize_contig_name=True):
    """
    Construct a Variant object.

    Parameters
    ----------
    contig : str
        Chromosome that this variant is on

    start : int
        1-based position on the chromosome of first reference nucleotide

    ref : str
        Reference nucleotide(s)

    alt : str
        Alternate nucleotide(s)

    ensembl : Genome, int, or str
        Object used for determining gene/transcript annotations. A
        Genome instance is stored as-is, an int is handed to
        cached_release and a str to genome_for_reference_name.

    allow_extended_nucleotides : bool
        Forwarded to normalize_nucleotide_string. Extended nucleotides
        include 'Y' for pyrimidines or 'N' for any base.

    normalize_contig_name : bool
        When true (the default) the contig is passed through
        normalize_chromosome; pass False to keep it exactly as given.
        The unmodified value is always kept in `original_contig`.

    Raises
    ------
    TypeError
        If `ensembl` is not a Genome, int, or str.
    """
    # caches for overlapping pyensembl Gene and Transcript objects
    self._genes = None
    self._transcripts = None

    # turn whatever the caller supplied for `ensembl` into a single object
    if isinstance(ensembl, Genome):
        resolved_genome = ensembl
    elif isinstance(ensembl, int):
        resolved_genome = cached_release(ensembl)
    elif isinstance(ensembl, str):
        resolved_genome = genome_for_reference_name(ensembl)
    else:
        message_template = (
            "Expected ensembl to be an int, string, or pyensembl.Genome "
            "instance, got %s : %s")
        raise TypeError(message_template % (type(ensembl), str(ensembl)))
    self.ensembl = resolved_genome

    self.normalize_contig_name = normalize_contig_name
    self.allow_extended_nucleotides = allow_extended_nucleotides

    self.original_contig = contig
    if normalize_contig_name:
        self.contig = normalize_chromosome(contig)
    else:
        self.contig = contig

    # Shortcut for the common case: ref and alt differ and are both
    # members of STANDARD_NUCLEOTIDES, so there is nothing to clean or
    # trim and the variant occupies exactly one position.
    if (ref != alt
            and ref in STANDARD_NUCLEOTIDES
            and alt in STANDARD_NUCLEOTIDES):
        self.original_ref = ref
        self.ref = ref
        self.original_alt = alt
        self.alt = alt
        position = int(start)
        self.original_start = position
        self.start = position
        self.end = position
        return

    # Two views of the variant are kept:
    #   Variant.{original_ref, original_alt, original_start}
    #       the values from the input file, only cleaned up by
    #       normalize_nucleotide_string (e.g. '-' and '.' become '')
    #   Variant.{ref, alt, start, end}
    #       the same variant with shared prefixes/suffixes removed,
    #       e.g. g.10 CTT>T becomes g.10delCT
    cleaned_ref = normalize_nucleotide_string(
        ref,
        allow_extended_nucleotides=allow_extended_nucleotides)
    self.original_ref = cleaned_ref
    cleaned_alt = normalize_nucleotide_string(
        alt,
        allow_extended_nucleotides=allow_extended_nucleotides)
    self.original_alt = cleaned_alt
    original_position = int(start)
    self.original_start = original_position

    # only the shared prefix matters for positioning; the shared suffix
    # is discarded
    trimmed_ref, trimmed_alt, shared_prefix, _ = trim_shared_flanking_strings(
        cleaned_ref, cleaned_alt)
    self.ref = trimmed_ref
    self.alt = trimmed_alt

    prefix_length = len(shared_prefix)
    ref_length = len(trimmed_ref)
    if ref_length > 0:
        # substitutions and deletions: [start, end] is an inclusive
        # interval of reference nucleotides
        first_base = original_position + prefix_length
        self.start = first_base
        self.end = first_base + ref_length - 1
    else:
        # Insertions: the position means "insert the alt nucleotides
        # after this position", so start and end are both the 1-based
        # position of the nucleotide before the insertion.
        #
        # If the trimmed alt is empty as well this is a "null" variant
        # (probably from a VCF generated by force-calling mutations which
        # weren't found in the sample); it is interpreted as inserting
        # zero nucleotides after the whole reference sequence.
        anchor = original_position + max(0, prefix_length - 1)
        self.start = anchor
        self.end = anchor
def __init__(
        self,
        contig,
        start,
        ref,
        alt,
        genome=None,
        ensembl=None,
        allow_extended_nucleotides=False,
        normalize_contig_names=True,
        convert_ucsc_contig_names=None):
    """
    Construct a Variant object.

    Parameters
    ----------
    contig : str
        Chromosome that this variant is on

    start : int
        1-based position on the chromosome of first reference nucleotide

    ref : str
        Reference nucleotide(s)

    alt : str
        Alternate nucleotide(s)

    genome : Genome, EnsemblRelease, or str, or int
        Name of reference genome, Ensembl release number, or object
        derived from pyensembl.Genome. The value is resolved by
        infer_genome. Falls back to `ensembl`, and then to "GRCh38",
        when left as None.

    ensembl : Genome, EnsemblRelease, or str, or int (DEPRECATED)
        Previous name of the `genome` argument. It is only consulted
        when `genome` is None.

    allow_extended_nucleotides : bool
        Forwarded to normalize_nucleotide_string. Extended nucleotides
        include 'Y' for pyrimidines or 'N' for any base.

    normalize_contig_names : bool
        When true (the default) the contig is passed through
        normalize_chromosome. Pass normalize_contig_names=False to keep
        it exactly as given.

    convert_ucsc_contig_names : bool, optional
        Setting this argument to True causes UCSC chromosome names to be
        converted, such as "chr1" to "1". If the default value (None) is
        used then it follows whether infer_genome reported that a UCSC
        genome was passed in.
    """
    # caches for overlapping pyensembl Gene and Transcript objects and
    # for their IDs and names
    self._genes = None
    self._transcripts = None
    self._gene_ids = None
    self._gene_names = None

    # options that affect how properties of this variant are transformed
    self.normalize_contig_names = normalize_contig_names
    self.allow_extended_nucleotides = allow_extended_nucleotides

    # `genome` wins when given; otherwise use the legacy `ensembl`
    # argument, and "GRCh38" when that is missing too
    if genome is None:
        genome = "GRCh38" if ensembl is None else ensembl

    self.original_genome = genome
    inferred_genome, was_ucsc = infer_genome(genome)
    self.genome = inferred_genome
    self.original_genome_was_ucsc = was_ucsc

    reference_name = inferred_genome.reference_name
    self.reference_name = reference_name
    # remember the reference name in the naming scheme the caller used
    self.original_reference_name = (
        ensembl_to_ucsc_reference_names[reference_name]
        if was_ucsc
        else reference_name)

    self.original_contig = contig
    if normalize_contig_names:
        self.contig = normalize_chromosome(contig)
    else:
        self.contig = contig

    # an explicit True/False from the caller overrides the inferred flag
    should_convert = (
        was_ucsc
        if convert_ucsc_contig_names is None
        else convert_ucsc_contig_names)
    self.convert_ucsc_contig_names = should_convert
    if should_convert:
        # rewrite a UCSC-style contig name (e.g. "chr1", "chrM") into
        # the corresponding Ensembl-style name
        self.contig = self._convert_ucsc_contig_name_to_ensembl(self.contig)

    # Shortcut for the common case: ref and alt differ and are both
    # members of STANDARD_NUCLEOTIDES, so there is nothing to clean or
    # trim and the variant occupies exactly one position.
    if (ref != alt
            and ref in STANDARD_NUCLEOTIDES
            and alt in STANDARD_NUCLEOTIDES):
        self.original_ref = ref
        self.ref = ref
        self.original_alt = alt
        self.alt = alt
        position = int(start)
        self.original_start = position
        self.start = position
        self.end = position
        return

    # Two views of the variant are kept:
    #   Variant.{original_ref, original_alt, original_start}
    #       the values from the input file, only cleaned up by
    #       normalize_nucleotide_string (e.g. '-' and '.' become '')
    #   Variant.{ref, alt, start, end}
    #       the same variant with shared prefixes/suffixes removed,
    #       e.g. g.10 CTT>T becomes g.10delCT
    cleaned_ref = normalize_nucleotide_string(
        ref,
        allow_extended_nucleotides=allow_extended_nucleotides)
    self.original_ref = cleaned_ref
    cleaned_alt = normalize_nucleotide_string(
        alt,
        allow_extended_nucleotides=allow_extended_nucleotides)
    self.original_alt = cleaned_alt
    original_position = int(start)
    self.original_start = original_position

    # only the shared prefix matters for positioning; the shared suffix
    # is discarded
    trimmed_ref, trimmed_alt, shared_prefix, _ = trim_shared_flanking_strings(
        cleaned_ref, cleaned_alt)
    self.ref = trimmed_ref
    self.alt = trimmed_alt

    prefix_length = len(shared_prefix)
    ref_length = len(trimmed_ref)
    if ref_length > 0:
        # substitutions and deletions: [start, end] is an inclusive
        # interval of reference nucleotides
        first_base = original_position + prefix_length
        self.start = first_base
        self.end = first_base + ref_length - 1
    else:
        # Insertions: the position means "insert the alt nucleotides
        # after this position", so start and end are both the 1-based
        # position of the nucleotide before the insertion.
        #
        # If the trimmed alt is empty as well this is a "null" variant
        # (probably from a VCF generated by force-calling mutations which
        # weren't found in the sample); it is interpreted as inserting
        # zero nucleotides after the whole reference sequence.
        anchor = original_position + max(0, prefix_length - 1)
        self.start = anchor
        self.end = anchor
def __init__(
        self,
        contig,
        start,
        ref,
        alt,
        ensembl=ensembl_grch38,
        allow_extended_nucleotides=False):
    """
    Construct a Variant object.

    Parameters
    ----------
    contig : str
        Chromosome that this variant is on; always passed through
        normalize_chromosome before being stored

    start : int
        1-based position on the chromosome of first reference nucleotide

    ref : str
        Reference nucleotide(s)

    alt : str
        Alternate nucleotide(s)

    ensembl : Genome, int, or str
        Object used for determining gene/transcript annotations. A
        Genome instance is stored as-is, an int is handed to
        cached_release and a str to genome_for_reference_name.

    allow_extended_nucleotides : bool
        Forwarded to normalize_nucleotide_string when cleaning up the
        ref and alt strings

    Raises
    ------
    TypeError
        If `ensembl` is not a Genome, int, or str.
    """
    self.contig = normalize_chromosome(contig)

    # turn whatever the caller supplied for `ensembl` into a single object
    if isinstance(ensembl, Genome):
        resolved_genome = ensembl
    elif isinstance(ensembl, int):
        resolved_genome = cached_release(ensembl)
    elif isinstance(ensembl, str):
        resolved_genome = genome_for_reference_name(ensembl)
    else:
        message_template = (
            "Expected ensembl to be an int, string, or pyensembl.Genome "
            "instance, got %s : %s")
        raise TypeError(message_template % (type(ensembl), str(ensembl)))
    self.ensembl = resolved_genome

    # Shortcut for the common case: ref and alt are both members of
    # STANDARD_NUCLEOTIDES and differ, so there is nothing to clean or
    # trim and the variant occupies exactly one position.
    if (ref in STANDARD_NUCLEOTIDES
            and alt in STANDARD_NUCLEOTIDES
            and ref != alt):
        self.original_ref = ref
        self.ref = ref
        self.original_alt = alt
        self.alt = alt
        position = int(start)
        self.original_start = position
        self.start = position
        self.end = position
        return

    # Two views of the variant are kept:
    #   Variant.{original_ref, original_alt, original_start}
    #       the values from the input file, only cleaned up by
    #       normalize_nucleotide_string (e.g. '-' and '.' become '')
    #   Variant.{ref, alt, start, end}
    #       the same variant with shared prefixes/suffixes removed,
    #       e.g. g.10 CTT>T becomes g.10delCT
    cleaned_ref = normalize_nucleotide_string(
        ref,
        allow_extended_nucleotides=allow_extended_nucleotides)
    self.original_ref = cleaned_ref
    cleaned_alt = normalize_nucleotide_string(
        alt,
        allow_extended_nucleotides=allow_extended_nucleotides)
    self.original_alt = cleaned_alt
    original_position = int(start)
    self.original_start = original_position

    # only the shared prefix matters for positioning; the shared suffix
    # is discarded
    trimmed_ref, trimmed_alt, shared_prefix, _ = trim_shared_flanking_strings(
        cleaned_ref, cleaned_alt)
    self.ref = trimmed_ref
    self.alt = trimmed_alt

    prefix_length = len(shared_prefix)
    ref_length = len(trimmed_ref)
    if ref_length > 0:
        # substitutions and deletions: [start, end] is an inclusive
        # interval of reference nucleotides
        first_base = original_position + prefix_length
        self.start = first_base
        self.end = first_base + ref_length - 1
    else:
        # insertions: the position means "insert the alt nucleotides
        # after this position", so start and end are both the
        # nucleotide before the insertion
        anchor = original_position + max(0, prefix_length - 1)
        self.start = anchor
        self.end = anchor
def __init__(
        self,
        contig,
        start,
        ref,
        alt,
        ensembl=ensembl_grch38,
        allow_extended_nucleotides=False,
        normalize_contig_name=True):
    """
    Construct a Variant object.

    Parameters
    ----------
    contig : str
        Chromosome that this variant is on

    start : int
        1-based position on the chromosome of first reference nucleotide

    ref : str
        Reference nucleotide(s)

    alt : str
        Alternate nucleotide(s)

    ensembl : Genome, int, or str
        Object used for determining gene/transcript annotations. A
        Genome instance is stored as-is, an int is handed to
        cached_release and a str to genome_for_reference_name.

    allow_extended_nucleotides : bool
        Forwarded to normalize_nucleotide_string. Extended nucleotides
        include 'Y' for pyrimidines or 'N' for any base.

    normalize_contig_name : bool
        When true (the default) the contig is passed through
        normalize_chromosome; pass False to keep it exactly as given.
        The unmodified value is always kept in `original_contig`.

    Raises
    ------
    TypeError
        If `ensembl` is not a Genome, int, or str.
    """
    # caches for overlapping pyensembl Gene and Transcript objects
    self._genes = None
    self._transcripts = None

    # reject unsupported `ensembl` values up front, then dispatch on the
    # three accepted kinds
    if not isinstance(ensembl, (Genome, int, str)):
        raise TypeError(
            "Expected ensembl to be an int, string, or pyensembl.Genome "
            "instance, got %s : %s" % (type(ensembl), str(ensembl)))
    if isinstance(ensembl, Genome):
        self.ensembl = ensembl
    elif isinstance(ensembl, int):
        self.ensembl = cached_release(ensembl)
    else:
        self.ensembl = genome_for_reference_name(ensembl)

    self.normalize_contig_name = normalize_contig_name
    self.allow_extended_nucleotides = allow_extended_nucleotides

    self.original_contig = contig
    if normalize_contig_name:
        self.contig = normalize_chromosome(contig)
    else:
        self.contig = contig

    # Shortcut for the common case: ref and alt differ and are both
    # members of STANDARD_NUCLEOTIDES, so there is nothing to clean or
    # trim and the variant occupies exactly one position.
    if (ref != alt
            and ref in STANDARD_NUCLEOTIDES
            and alt in STANDARD_NUCLEOTIDES):
        self.original_ref = ref
        self.ref = ref
        self.original_alt = alt
        self.alt = alt
        single_position = int(start)
        self.original_start = single_position
        self.start = single_position
        self.end = single_position
        return

    # Two views of the variant are kept:
    #   Variant.{original_ref, original_alt, original_start}
    #       the values from the input file, only cleaned up by
    #       normalize_nucleotide_string (e.g. '-' and '.' become '')
    #   Variant.{ref, alt, start, end}
    #       the same variant with shared prefixes/suffixes removed,
    #       e.g. g.10 CTT>T becomes g.10delCT
    nucleotide_options = {
        "allow_extended_nucleotides": allow_extended_nucleotides,
    }
    input_ref = normalize_nucleotide_string(ref, **nucleotide_options)
    self.original_ref = input_ref
    input_alt = normalize_nucleotide_string(alt, **nucleotide_options)
    self.original_alt = input_alt
    input_start = int(start)
    self.original_start = input_start

    # only the shared prefix matters for positioning; the shared suffix
    # is discarded
    trimmed = trim_shared_flanking_strings(input_ref, input_alt)
    trimmed_ref, trimmed_alt, shared_prefix, _ = trimmed
    self.ref = trimmed_ref
    self.alt = trimmed_alt

    n_prefix = len(shared_prefix)
    n_ref = len(trimmed_ref)
    if n_ref == 0:
        # Insertions: the position means "insert the alt nucleotides
        # after this position", so start and end are both the 1-based
        # position of the nucleotide before the insertion.
        #
        # If the trimmed alt is empty as well this is a "null" variant
        # (probably from a VCF generated by force-calling mutations which
        # weren't found in the sample); it is interpreted as inserting
        # zero nucleotides after the whole reference sequence.
        insertion_anchor = input_start + max(0, n_prefix - 1)
        self.start = insertion_anchor
        self.end = insertion_anchor
        return

    # substitutions and deletions: [start, end] is an inclusive interval
    # of reference nucleotides
    first_changed = input_start + n_prefix
    self.start = first_changed
    self.end = first_changed + n_ref - 1
def test_normalize_chromosome():
    """
    Check that normalize_chromosome maps equivalent contig names onto one
    canonical spelling and rejects unusable input.
    """
    # (input, expected canonical name) pairs, checked in order
    expected_names = [
        ("X", "X"),
        ("chrX", "X"),
        ("x", "X"),
        (1, "1"),
        ("1", "1"),
        ("chr1", "1"),
        ("chrM", "MT"),
        ("chrMT", "MT"),
        ("M", "MT"),
        ("MT", "MT"),
        ("m", "MT"),
        ("chrm", "MT"),
        ("mt", "MT"),
    ]
    for raw_name, canonical_name in expected_names:
        assert normalize_chromosome(raw_name) == canonical_name

    # values of an unsupported type
    for wrong_type in ({"a": "b"}, [], None):
        with assert_raises(TypeError):
            normalize_chromosome(wrong_type)

    # values of an accepted type but with no usable name
    for unusable_value in ("", 0):
        with assert_raises(ValueError):
            normalize_chromosome(unusable_value)