def test_is_integer():
    """
    is_integer should accept ints of any sign or magnitude and reject
    strings, lists, arbitrary objects and None.
    """
    # 10**30 covers the big integer case
    for accepted in (0, -1, 1, 10**30):
        assert is_integer(accepted)

    for rejected in ("", "a", [], [1], object(), None):
        assert not is_integer(rejected)
def infer_genome(genome_object_string_or_int):
    """
    Resolve a loosely-typed genome specification to a genome object.

    - A PyEnsembl Genome is returned unchanged.
    - An integer is treated as an Ensembl version and resolved through
      cached_release.
    - A string is treated as a reference name: it is first mapped to its
      canonical name (e.g. hg19 -> GRCh37) and the genome associated with
      that reference name is returned.

    Raises TypeError for any other kind of input.
    """
    spec = genome_object_string_or_int

    if isinstance(spec, Genome):
        return spec

    if is_integer(spec):
        return cached_release(spec)

    if not is_string(spec):
        raise TypeError(
            ("Expected genome to be an int, string, or pyensembl.Genome "
             "instance, got %s : %s") % (str(spec), type(spec)))

    # canonicalize the reference name before looking up its genome
    canonical_name = infer_reference_name(spec)
    return genome_for_reference_name(canonical_name)
def normalize_chromosome(c):
    """
    Convert a chromosome name (string or integer) to a canonical string.

    A leading lowercase "chr" is removed, the remainder is upper-cased and
    "M" is rewritten to "MT". Results are memoized in
    NORMALIZE_CHROMOSOME_CACHE, keyed by the original argument.

    Raises TypeError if `c` is accepted by neither is_string nor is_integer,
    and ValueError if the name (after removing any "chr" prefix) is empty
    or "0".
    """
    try:
        return NORMALIZE_CHROMOSOME_CACHE[c]
    except KeyError:
        pass

    if not (is_string(c) or is_integer(c)):
        raise TypeError("Chromosome cannot be '%s' : %s" % (c, type(c)))

    result = str(c)

    # only strip off lowercase chr since some of the non-chromosomal
    # contigs start with "CHR"
    if result.startswith("chr"):
        result = result[3:]

    # validate *after* removing the prefix, otherwise "chr" and "chr0"
    # would slip through as "" and "0"
    if result == "0":
        raise ValueError("Chromosome name cannot be 0")
    elif result == "":
        raise ValueError("Chromosome name cannot be empty")

    # just in case someone is being lazy, capitalize "M", "MT", "X", "Y"
    result = result.upper()

    # standardize mitochondrial genome to be "MT"
    if result == "M":
        result = "MT"

    # interning strings since the chromosome names probably get constructed
    # or parsed millions of times, can save memory in tight situations
    # (such as parsing GTF files)
    result = intern(result)

    NORMALIZE_CHROMOSOME_CACHE[c] = result
    return result
def normalize_chromosome(c):
    """
    Convert a chromosome name (string or integer) to a canonical string.

    A leading lowercase "chr" is removed, the remainder is upper-cased and
    the mitochondrial name "M" (in any letter case) is rewritten to "MT".
    Results are memoized in NORMALIZE_CHROMOSOME_CACHE, keyed by the
    original argument.

    Raises TypeError if `c` is accepted by neither is_string nor is_integer,
    and ValueError if the name (after removing any "chr" prefix) is empty
    or "0".
    """
    try:
        return NORMALIZE_CHROMOSOME_CACHE[c]
    except KeyError:
        pass

    if not (is_string(c) or is_integer(c)):
        raise TypeError("Chromosome cannot be '%s' : %s" % (c, type(c)))

    result = str(c)

    # only strip off lowercase chr since some of the non-chromosomal
    # contigs start with "CHR"
    if result.startswith("chr"):
        result = result[3:]

    # validate *after* removing the prefix, otherwise "chr" and "chr0"
    # would slip through as "" and "0"
    if result == "0":
        raise ValueError("Chromosome name cannot be 0")
    elif result == "":
        raise ValueError("Chromosome name cannot be empty")

    # just in case someone is being lazy, capitalize "X" and "Y"; doing
    # this before the mitochondrial check means lowercase "m" is
    # standardized too instead of coming out as "M"
    result = result.upper()

    # standardize mitochondrial genome to be "MT"
    if result == "M":
        result = "MT"

    # interning strings since the chromosome names probably get constructed
    # or parsed millions of times, can save memory in tight situations
    # (such as parsing GTF files)
    result = intern(result)

    NORMALIZE_CHROMOSOME_CACHE[c] = result
    return result
def normalize_chromosome(c):
    """
    Convert a contig name (string or integer) to a canonical string.

    Integers are converted with str(); anything else is checked with
    require_string(..., nonempty=True). A leading lowercase "chr" is
    removed, the remainder is upper-cased and the mitochondrial name "M"
    (in any letter case) is rewritten to "MT". Results are memoized in
    NORMALIZE_CHROMOSOME_CACHE, keyed by the original argument.

    Raises ValueError if `c` is the integer 0. Non-string or empty input is
    presumably rejected by require_string -- confirm against that helper.
    """
    try:
        return NORMALIZE_CHROMOSOME_CACHE[c]
    except KeyError:
        pass

    result = c
    if is_integer(result):
        if result == 0:
            raise ValueError("Contig cannot be 0")
        result = str(result)
    else:
        require_string(result, "contig name", nonempty=True)

    # only strip off lowercase chr since some of the non-chromosomal
    # contigs start with "CHR"
    # NOTE(review): a bare "chr" becomes "" here and is returned as-is;
    # confirm whether that should be rejected like other empty names
    if result.startswith("chr"):
        result = result[3:]

    # just in case someone is being lazy, capitalize "X" and "Y"; doing
    # this before the mitochondrial check means lowercase "m" is
    # standardized too instead of coming out as "M"
    result = result.upper()

    # standardize mitochondrial genome to be "MT"
    if result == "M":
        result = "MT"

    NORMALIZE_CHROMOSOME_CACHE[c] = result
    return result
def infer_genome(genome_object_string_or_int):
    """
    Resolve a loosely-typed genome specification to a genome object.

    - A PyEnsembl Genome is used as given.
    - An integer is treated as an Ensembl version and resolved to the
      human EnsemblRelease via cached_ensembl_release.
    - A string is treated as a reference name; a UCSC name (e.g. hg19) is
      converted to the equivalent Ensembl reference (e.g. GRCh37).

    Returns a pair (Genome, bool). The bool is True when the input asked
    for a UCSC genome and an Ensembl genome was substituted for it; it is
    always False for Genome and integer inputs.

    Raises TypeError for any other kind of input.
    """
    spec = genome_object_string_or_int

    if isinstance(spec, Genome):
        return spec, False

    if is_integer(spec):
        return cached_ensembl_release(spec), False

    if is_string(spec):
        genome, was_ucsc = infer_genome_for_reference_name(spec)
        return genome, was_ucsc

    raise TypeError(
        ("Expected genome to be an int, string, or pyensembl.Genome "
         "instance, got %s : %s") % (str(spec), type(spec)))
def normalize_chromosome(c):
    """
    Convert a chromosome name (string or integer) to a canonical string.

    Any "chr" prefix is kept. For "chr"-prefixed names without an
    underscore, the part after the prefix is upper-cased; otherwise purely
    alphabetic names are upper-cased as a whole. Results are memoized in
    NORMALIZE_CHROMOSOME_CACHE, keyed by the original argument.

    Raises TypeError if `c` is accepted by neither is_string nor is_integer,
    and ValueError if its string form is empty or "0".
    """
    try:
        return NORMALIZE_CHROMOSOME_CACHE[c]
    except KeyError:
        # not seen before, compute it below
        pass

    acceptable_type = is_string(c) or is_integer(c)
    if not acceptable_type:
        raise TypeError("Chromosome cannot be '%s' : %s" % (c, type(c)))

    name = str(c)
    if name == "":
        raise ValueError("Chromosome name cannot be empty")
    if name == "0":
        raise ValueError("Chromosome name cannot be 0")

    has_chr_prefix = name.startswith("chr")
    if has_chr_prefix and "_" not in name:
        # names containing "_" (like "chrUn_gl000212") are left untouched;
        # everything else gets its suffix capitalized, "chrx" -> "chrX"
        suffix = name[3:]
        name = "chr" + suffix.upper()
    elif name.isalpha():
        # bare letters such as "x" -> "X"
        name = name.upper()

    # chromosome names are likely built or parsed millions of times (for
    # example while reading GTF files), so intern them to save memory
    name = intern(name)
    NORMALIZE_CHROMOSOME_CACHE[c] = name
    return name