def test_is_string():
    assert is_string("hello")
    assert is_string("")
    assert is_string(u"Ѽ")
    assert is_string(u"ﮚ")
    assert not is_string(1)
    assert not is_string(1.0)
    assert not is_string([])
    assert not is_string(object())
    assert not is_string(None)

def infer_genome(genome_object_string_or_int):
    """
    If given an integer, get the human EnsemblRelease object for that
    Ensembl version.

    If given a string, return the latest EnsemblRelease which has an
    equivalent reference. If the given name is a UCSC genome (e.g. hg19)
    then convert it to the equivalent Ensembl reference (e.g. GRCh37).

    If given a PyEnsembl Genome, simply use it.

    Returns a pair of (Genome, bool) where the bool corresponds to whether
    the input requested a UCSC genome (e.g. "hg19") and an Ensembl reference
    (e.g. GRCh37) was returned as a substitute.
    """
    converted_ucsc_to_ensembl = False
    if isinstance(genome_object_string_or_int, Genome):
        genome = genome_object_string_or_int
    elif is_integer(genome_object_string_or_int):
        genome = cached_ensembl_release(genome_object_string_or_int)
    elif is_string(genome_object_string_or_int):
        genome, converted_ucsc_to_ensembl = \
            infer_genome_for_reference_name(genome_object_string_or_int)
    else:
        raise TypeError(
            ("Expected genome to be an int, string, or pyensembl.Genome "
             "instance, got %s : %s") % (
                str(genome_object_string_or_int),
                type(genome_object_string_or_int)))
    return genome, converted_ucsc_to_ensembl

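A minimal usage sketch for this tuple-returning variant, assuming pyensembl (with its annotation data downloaded) and the helper functions referenced above are importable from the same module; the release number and reference names are illustrative only.

# Hypothetical calls exercising the three dispatch branches.
genome, converted = infer_genome(75)       # human EnsemblRelease 75, converted is False
genome, converted = infer_genome("hg19")   # UCSC name mapped to GRCh37, converted is True
genome, converted = infer_genome(genome)   # an existing Genome passes through unchanged
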
def __init__(self, hla=None, hla_dataframe=None, donor_to_hla=None):
    """
    Specify exactly one of hla, hla_dataframe, or donor_to_hla.

    Parameters
    ----------
    hla : list of string (or space separated string of alleles)
        HLA alleles to use for all donors

    hla_dataframe : pandas.DataFrame with columns 'donor' and 'hla'
        DataFrame giving HLA alleles for each donor. The 'hla' column
        should be a space separated list of alleles for that donor.

    donor_to_hla : dict of string -> string list
        Map from donor to HLA alleles for that donor.
    """
    if bool(hla) + (hla_dataframe is not None) + bool(donor_to_hla) != 1:
        raise TypeError(
            "Must specify exactly one of hla, hla_dataframe, donor_to_hla")
    self.hla = (
        self.string_to_hla_alleles(hla) if typechecks.is_string(hla) else hla)
    self.donor_to_hla = donor_to_hla
    if hla_dataframe is not None:
        self.donor_to_hla = {}
        for (i, row) in hla_dataframe.iterrows():
            if row.donor in self.donor_to_hla:
                raise ValueError("Multiple rows for donor: %s" % row.donor)
            if pandas.isnull(row.hla):
                self.donor_to_hla[row.donor] = None
            else:
                self.donor_to_hla[row.donor] = self.string_to_hla_alleles(
                    row.hla)
    assert self.hla is not None or self.donor_to_hla is not None

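The enclosing class is not shown in this snippet; the sketch below uses a hypothetical HLAMap name purely to illustrate the three mutually exclusive constructor forms, and assumes pandas is importable. Allele strings are examples only.

import pandas  # assumed available, as in the constructor above

# Hypothetical class name; only the constructor shown above is real.
same_for_all = HLAMap(hla=["HLA-A*02:01", "HLA-B*07:02"])
per_donor = HLAMap(donor_to_hla={"donor1": ["HLA-A*02:01"]})
from_table = HLAMap(hla_dataframe=pandas.DataFrame(
    {"donor": ["donor1"], "hla": ["HLA-A*02:01 HLA-B*07:02"]}))
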
def normalize_chromosome(c):
    try:
        return NORMALIZE_CHROMOSOME_CACHE[c]
    except KeyError:
        pass

    if not (is_string(c) or is_integer(c)):
        raise TypeError("Chromosome cannot be '%s' : %s" % (c, type(c)))

    result = str(c)

    if result == "0":
        raise ValueError("Chromosome name cannot be 0")
    elif result == "":
        raise ValueError("Chromosome name cannot be empty")

    if result.startswith("chr") and "_" not in result:
        # excluding "_" for names like "chrUn_gl000212"
        # capitalize "chrx" -> "chrX"
        result = "chr" + result[3:].upper()
    elif result.isalpha():
        # capitalize e.g. "x" -> "X"
        result = result.upper()

    # interning strings since the chromosome names probably get constructed
    # or parsed millions of times, can save memory in tight situations
    # (such as parsing GTF files)
    result = intern(result)

    NORMALIZE_CHROMOSOME_CACHE[c] = result
    return result

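A short behavioral sketch for this chr-preserving version, assuming a module-level NORMALIZE_CHROMOSOME_CACHE dict and an intern function (the builtin on Python 2, sys.intern on Python 3). The expected values follow directly from the branches above.

NORMALIZE_CHROMOSOME_CACHE = {}  # assumed module-level cache

assert normalize_chromosome("chrx") == "chrX"  # "chr" kept, suffix upper-cased
assert normalize_chromosome("x") == "X"        # bare letters upper-cased
assert normalize_chromosome(1) == "1"          # integers stringified
assert normalize_chromosome("chrUn_gl000212") == "chrUn_gl000212"  # "_" contigs untouched
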
def write(self, file=None, format=None, indent=None):
    close_on_exit = False
    if typechecks.is_string(file):
        fd = open(file, "w")
        close_on_exit = True
        if format is None:
            if file.endswith(".json"):
                format = "json"
            elif file.endswith(".py"):
                format = "python"
            else:
                raise ValueError(
                    "Couldn't guess format from filename: %s" % file)
    elif not file:
        fd = sys.stdout
        if format is None:
            format = "json"
    else:
        fd = file
    try:
        extra_args = {} if indent is None else {"indent": indent}
        if format == "json":
            value = self.to_json(**extra_args)
        elif format == "python":
            value = self.to_python(**extra_args)
        else:
            raise ValueError("Unsupported format: %s" % format)
        fd.write(value)
    finally:
        if close_on_exit:
            fd.close()

def normalize_chromosome(c):
    try:
        return NORMALIZE_CHROMOSOME_CACHE[c]
    except KeyError:
        pass

    if not (is_string(c) or is_integer(c)):
        raise TypeError("Chromosome cannot be '%s' : %s" % (c, type(c)))

    result = str(c)

    if result == "0":
        raise ValueError("Chromosome name cannot be 0")
    elif result == "":
        raise ValueError("Chromosome name cannot be empty")

    # only strip off lowercase chr since some of the non-chromosomal
    # contigs start with "CHR"
    if result.startswith("chr"):
        result = result[3:]

    # standardize mitochondrial genome to be "MT"
    if result == "M":
        result = "MT"
    else:
        # just in case someone is being lazy, capitalize "X" and "Y"
        result = result.upper()

    # interning strings since the chromosome names probably get constructed
    # or parsed millions of times, can save memory in tight situations
    # (such as parsing GTF files)
    result = intern(result)

    NORMALIZE_CHROMOSOME_CACHE[c] = result
    return result

def infer_genome(genome_object_string_or_int):
    """
    If given an integer, return the associated human EnsemblRelease for
    that Ensembl version.

    If given a string, return the latest EnsemblRelease which has a
    reference of the same name.

    If given a PyEnsembl Genome, simply return it.
    """
    if isinstance(genome_object_string_or_int, Genome):
        return genome_object_string_or_int
    if is_integer(genome_object_string_or_int):
        return cached_release(genome_object_string_or_int)
    elif is_string(genome_object_string_or_int):
        # first infer the canonical reference name, e.g. mapping hg19 -> GRCh37
        # and then get the associated PyEnsembl Genome object
        reference_name = infer_reference_name(genome_object_string_or_int)
        return genome_for_reference_name(reference_name)
    else:
        raise TypeError(
            ("Expected genome to be an int, string, or pyensembl.Genome "
             "instance, got %s : %s") % (
                str(genome_object_string_or_int),
                type(genome_object_string_or_int)))

def evaluate_variant_expression(
        expression,
        collection,
        variant,
        error_value=evaluation.RAISE,
        extra_bindings={}):
    if typechecks.is_string(expression):
        variant_metadata = collection.metadata.get(variant, {})
        extra_bindings = {
            'inclusive_start': variant.start,
            'inclusive_end': variant.end,
            'interbase_start': variant.start - 1,
            'interbase_end': variant.end,
            'variant': variant,
            'collection': collection,
            'metadata': variant_metadata,
        }
        extra_bindings.update(variant_metadata)
        bindings = evaluation.EvaluationEnvironment([variant], extra_bindings)
        return evaluation.evaluate_expression(
            expression, bindings, error_value=error_value)
    else:
        return expression(variant)

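A hedged usage sketch: both a string expression (evaluated against the bindings constructed above) and a plain callable are accepted. The collection and variant objects are assumed to come from the surrounding variant library and are not defined here.

# String form: interbase_start/interbase_end come from the bindings above.
length = evaluate_variant_expression(
    "interbase_end - interbase_start", collection, variant)

# Callable form: the callable simply receives the variant.
start = evaluate_variant_expression(lambda v: v.start, collection, variant)
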
def normalize_chromosome(c):
    try:
        return NORMALIZE_CHROMOSOME_CACHE[c]
    except KeyError:
        pass

    if not (is_string(c) or is_integer(c)):
        raise TypeError("Chromosome cannot be '%s' : %s" % (c, type(c)))

    result = str(c)

    if result == "0":
        raise ValueError("Chromosome name cannot be 0")
    elif result == "":
        raise ValueError("Chromosome name cannot be empty")

    # only strip off lowercase chr since some of the non-chromosomal
    # contigs start with "CHR"
    if result.startswith("chr"):
        result = result[3:]

    # just in case someone is being lazy, capitalize "M", "MT", "X", "Y"
    result = result.upper()

    # standardize mitochondrial genome to be "MT"
    if result == "M":
        result = "MT"

    # interning strings since the chromosome names probably get constructed
    # or parsed millions of times, can save memory in tight situations
    # (such as parsing GTF files)
    result = intern(result)

    NORMALIZE_CHROMOSOME_CACHE[c] = result
    return result

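A short sketch contrasting this chr-stripping version with the chr-preserving one earlier, assuming the same module-level NORMALIZE_CHROMOSOME_CACHE dict and intern function. Expected values follow from the code above.

assert normalize_chromosome("chr1") == "1"     # lowercase "chr" prefix stripped
assert normalize_chromosome("chrM") == "MT"    # mitochondrial name standardized
assert normalize_chromosome("x") == "X"        # everything upper-cased
assert normalize_chromosome("CHR1") == "CHR1"  # uppercase "CHR" contigs left alone
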
def evaluate_read_expression(
        expression,
        alignment,
        error_value=evaluation.RAISE,
        extra_bindings={}):
    if typechecks.is_string(expression):
        bindings = evaluation.EvaluationEnvironment([alignment], extra={})
        return evaluation.evaluate_expression(
            expression, bindings, error_value=error_value)
    else:
        return expression(alignment)

def write(self, file=None, format=None, indent=None):
    """
    Serialize this collection to disk.

    Parameters
    ----------
    file : string or file handle [optional, default: sys.stdout]
        Path or file handle to write to.

    format : string, one of "python" or "json" [optional]
        Output format. If not specified, it is guessed from the filename
        extension.

    indent : int [optional]
        Number of spaces to use for indentation.
    """
    close_on_exit = False
    if typechecks.is_string(file):
        fd = open(file, "w")
        close_on_exit = True
        if format is None:
            if file.endswith(".json"):
                format = "json"
            elif file.endswith(".py"):
                format = "python"
            else:
                raise ValueError(
                    "Couldn't guess format from filename: %s" % file)
    elif not file:
        fd = sys.stdout
        if format is None:
            format = "python"
    else:
        fd = file
    try:
        extra_args = {} if indent is None else {"indent": indent}
        if format == "json":
            value = self.to_json(**extra_args)
        elif format == "python":
            value = self.to_python(**extra_args)
        else:
            raise ValueError("Unsupported format: %s" % format)
        fd.write(value)
    finally:
        if close_on_exit:
            fd.close()

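A hedged usage sketch for this method, assuming a hypothetical collection object that exposes the to_json/to_python serializers used above; the paths are examples only.

# Format guessed from the ".json" extension; indent forwarded to to_json.
collection.write("resources.json", indent=2)

# Writing to an already-open handle: format must then be given explicitly,
# since extension-based guessing only applies to string paths.
with open("resources.py", "w") as fd:
    collection.write(fd, format="python")

# No file argument: writes to sys.stdout, defaulting to the "python" format.
collection.write()
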
def evaluate_pileup_element_expression(
        expression,
        collection,
        pileup,
        element,
        error_value=evaluation.RAISE,
        extra_bindings={}):
    if typechecks.is_string(expression):
        bindings = evaluation.EvaluationEnvironment(
            [element, element.alignment, pileup], extra={
                'element': element,
                'pileup': pileup,
                'collection': collection,
            })
        return evaluation.evaluate_expression(
            expression, bindings, error_value=error_value)
    else:
        return expression(pileup)

def is_valid_human_transcript_id(transcript_id):
    """Is the argument a valid identifier for human Ensembl transcripts?"""
    return is_string(transcript_id) and transcript_id.startswith("ENST")

def evaluate(self, expression, error_value=RAISE, extra_bindings={}):
    """
    Evaluate a Python expression or callable in the context of this
    resource.

    Parameters
    ----------
    expression : string or callable
        If a string, then it should give a valid Python expression. This
        expression will be evaluated with the attributes of this resource
        in the local namespace. For example, since the resource has a
        ``name`` attribute, the expression "name.lower()" would return
        the name in lower case.

        Tags can be accessed through the ``tags`` variable. If the
        resource has a tag called ``foo``, then the expression "tags.foo"
        will evaluate to ``True``. If there is no such tag, then
        "tags.foo" will evaluate to ``False``.

        A few common modules are included in the evaluation namespace,
        including ``os``, ``sys``, ``collections``, ``re``, and ``json``.
        The resource object itself is also available in the ``resource``
        variable.

        As a hack to support a primitive form of exception handling, a
        function called ``on_error`` is also included in the evaluation
        namespace. This function takes a single argument, ``value``, of
        any type and returns None. If ``on_error`` is called while
        evaluating the expression, and the expression subsequently raises
        an exception, then the exception is caught and ``value`` is
        returned as the value of the expression. This means you can write
        expressions like:

            ``on_error(False) or foo.startswith("bar")``

        and if the right side of the expression raises an error (for
        example, if there is no such attribute ``foo`` in the resource),
        then the value ``False`` will be used as the expression's value.
        Note that you must write the expression as it is here: put the
        ``on_error`` clause first, and connect it with the main
        expression with ``or`` (this ensures that it gets called before
        the rest of the expression).

        If ``expression`` is a callable, then it will be called and
        passed this Resource instance as its argument.

    error_value : object [optional]
        If evaluating the expression results in an uncaught exception,
        the ``error_value`` value will be returned instead. If not
        specified, then ``evaluate`` will raise the exception to the
        caller.

    extra_bindings : dict [optional]
        Additional local variables to include in the evaluation context.

    Returns
    ----------
    The Python object returned by evaluating the expression.
    """
    # Since Python 2 doesn't have a nonlocal keyword, we have to box up the
    # error_value, so we can reassign to it in the ``on_error`` function
    # below.
    error_box = [error_value]
    try:
        if typechecks.is_string(expression):
            # Give some basic modules.
            environment = dict(STANDARD_EVALUATION_ENVIRONMENT)
            environment["resource"] = self

            # We also add our "on_error" hack.
            def on_error(value):
                error_box[0] = value
            environment["on_error"] = on_error

            environment.update(extra_bindings)
            return eval(expression, environment, self)
        else:
            return expression(self)
    except Exception as e:
        if error_box[0] is not Resource.RAISE:
            return error_box[0]
        extra = "Error while evaluating: \n\t%s\non resource:\n%s" % (
            expression, self)
        traceback = sys.exc_info()[2]
        raise_(ValueError, str(e) + "\n" + extra, traceback)

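A hedged usage sketch, assuming a Resource instance (named resource here) from the surrounding library with at least the name attribute the docstring guarantees; the expressions mirror the examples given in the docstring above.

# Plain attribute expression evaluated in the resource's namespace.
lower_name = resource.evaluate("name.lower()")

# The on_error hack: if "foo" is not an attribute of the resource, the
# expression evaluates to False instead of raising.
has_bar_prefix = resource.evaluate('on_error(False) or foo.startswith("bar")')

# Callable form: the callable receives the Resource instance itself.
upper_name = resource.evaluate(lambda r: r.name.upper(), error_value=None)
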
def from_bam(pysam_samfile, loci):
    """
    Create a PileupCollection for a set of loci from a BAM file.

    Parameters
    ----------
    pysam_samfile : `pysam.csamfile.Samfile` instance, or filename string
        to a BAM file. The BAM file must be indexed.

    loci : list of Locus instances
        Loci to collect pileups for.

    Returns
    ----------
    PileupCollection instance containing pileups for the specified loci.
    All alignments in the BAM file are included (e.g. duplicate reads,
    secondary alignments, etc.). See `PileupCollection.filter` if these
    need to be removed.
    """
    loci = [to_locus(obj) for obj in loci]

    close_on_completion = False
    if typechecks.is_string(pysam_samfile):
        pysam_samfile = Samfile(pysam_samfile)
        close_on_completion = True

    try:
        # Map from pyensembl normalized chromosome names used in Variant to
        # the names used in the BAM file.
        chromosome_name_map = {}
        for name in pysam_samfile.references:
            normalized = pyensembl.locus.normalize_chromosome(name)
            chromosome_name_map[normalized] = name

        result = PileupCollection({})

        # Optimization: we sort variants so our BAM reads are localized.
        locus_iterator = itertools.chain.from_iterable(
            (Locus.from_interbase_coordinates(locus_interval.contig, pos)
                for pos in locus_interval.positions)
            for locus_interval in sorted(loci))
        for locus in locus_iterator:
            result.pileups[locus] = Pileup(locus, [])
            try:
                chromosome = chromosome_name_map[locus.contig]
            except KeyError:
                logging.warn("No such contig in bam: %s" % locus.contig)
                continue
            columns = pysam_samfile.pileup(
                chromosome,
                locus.position,
                locus.position + 1,  # exclusive, 0-indexed
                truncate=True,
                stepper="nofilter")
            try:
                column = next(columns)
            except StopIteration:
                # No reads align to this locus.
                continue

            # Note that storing the pileups here is necessary, since the
            # subsequent assertion will invalidate our column.
            pileups = column.pileups
            assert list(columns) == []  # column is invalid after this.
            for pileup_read in pileups:
                if not pileup_read.is_refskip:
                    element = PileupElement.from_pysam_alignment(
                        locus, pileup_read)
                    result.pileups[locus].append(element)
        return result
    finally:
        if close_on_completion:
            pysam_samfile.close()

def is_valid_ensembl_id(ensembl_id):
    """Is the argument a valid ID for any Ensembl feature?"""
    return is_string(ensembl_id) and ensembl_id.startswith("ENS")

def select(self, *expressions, **kwargs):
    """
    Select fields (or expressions) from each resource as a pandas
    DataFrame.

    Parameters
    ----------
    *expressions : string, callable, or (string, string or callable) pair
        One or more expressions giving the fields to select. Each
        expression can be either a ``string`` expression, a ``callable``,
        or a ``(string, string or callable)`` pair giving a label and an
        expression.

        Labels give the column names in the result. Labels can be
        specified either by giving a ``(label, expression)`` pair, or
        giving a string of the form "LABEL: EXPRESSION", such as
        "upper_name: name.upper()". Here "upper_name" is the label, and
        "name.upper()" is the expression that will be evaluated. If not
        specified, labels default to the text of the ``expression`` if
        ``expression`` is a string, and an automatically generated label
        if ``expression`` is a callable.

        Each ``expression`` will be passed to `Resource.evaluate` for
        each resource in the collection. See that method's docs for
        details on expression evaluation.

    if_error : string, one of "raise", "skip", or "none" [default: "raise"]
        Must be specified as a keyword argument. Controls the behavior
        when evaluation of an expression raises an uncaught exception.
        One of:

        raise
            Raise the exception to the caller. This is the default.

        skip
            Skip resources where evaluation of any of the expressions
            raises an error. These resources will be omitted from the
            result.

        none
            If evaluating an expression on a resource raises an
            exception, set that entry in the result to ``None``.

    Returns
    -------
    A `pandas.DataFrame`. Rows correspond to resources. Columns
    correspond to the specified expressions.
    """
    if_error = kwargs.pop("if_error", "raise")
    if if_error == "raise" or if_error == "skip":
        error_value = Resource.RAISE
    elif if_error == "none":
        error_value = None
    else:
        raise TypeError("if_error should be 'raise', 'skip', or 'none'")
    if kwargs:
        raise TypeError("Invalid keyword arguments: %s" % " ".join(kwargs))

    labels_and_expressions = []
    expr_num = 1
    for expression in expressions:
        if isinstance(expression, tuple):
            (label, expression) = expression
        elif typechecks.is_string(expression):
            match = re.match(r"^([\w\- ]+):(.*)$", expression)
            if match is None:
                label = expression
            else:
                (label, expression) = match.groups()
        else:
            label = "expr_%d" % expr_num
            expr_num += 1
        labels_and_expressions.append((label, expression))

    df_dict = collections.OrderedDict(
        (label, []) for (label, _) in labels_and_expressions)
    extra_bindings = {key: None for key in self.attributes}

    def values_for_resource(resource):
        result = []
        for (label, expression) in labels_and_expressions:
            try:
                value = resource.evaluate(
                    expression,
                    error_value=error_value,
                    extra_bindings=extra_bindings)
            except:
                if if_error == "raise":
                    raise
                elif if_error == "skip":
                    return None
                elif if_error == "none":
                    value = None
            result.append(value)
        return result

    for resource in self:
        row = values_for_resource(resource)
        if row is not None:
            for ((label, _), value) in zip(labels_and_expressions, row):
                df_dict[label].append(value)

    return pandas.DataFrame(df_dict)

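A hedged usage sketch, assuming a hypothetical collection of resources that each expose a name attribute; it exercises the "LABEL: EXPRESSION" string form and the (label, callable) tuple form described in the docstring above.

# String expression, labeled string expression, and (label, callable) pair.
df = collection.select(
    "name",
    "upper_name: name.upper()",
    ("name_length", lambda resource: len(resource.name)),
    if_error="none")  # failed evaluations become None instead of raising
print(df.columns.tolist())  # ['name', 'upper_name', 'name_length']
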
def from_bam(pysam_samfile, loci, normalized_contig_names=True):
    '''
    Create a PileupCollection for a set of loci from a BAM file.

    Parameters
    ----------
    pysam_samfile : `pysam.Samfile` instance, or filename string
        to a BAM file. The BAM file must be indexed.

    loci : list of Locus instances
        Loci to collect pileups for.

    normalized_contig_names : whether the contig names have been normalized
        (e.g. pyensembl removes the 'chr' prefix). Set to true to
        de-normalize the names when querying the BAM file.

    Returns
    ----------
    PileupCollection instance containing pileups for the specified loci.
    All alignments in the BAM file are included (e.g. duplicate reads,
    secondary alignments, etc.). See `PileupCollection.filter` if these
    need to be removed.
    '''
    loci = [to_locus(obj) for obj in loci]

    close_on_completion = False
    if typechecks.is_string(pysam_samfile):
        pysam_samfile = Samfile(pysam_samfile)
        close_on_completion = True

    try:
        # Map from pyensembl normalized chromosome names used in Variant to
        # the names used in the BAM file.
        if normalized_contig_names:
            chromosome_name_map = {}
            for name in pysam_samfile.references:
                normalized = pyensembl.locus.normalize_chromosome(name)
                chromosome_name_map[normalized] = name
                chromosome_name_map[name] = name
        else:
            chromosome_name_map = None

        result = PileupCollection({})

        # Optimization: we sort variants so our BAM reads are localized.
        locus_iterator = itertools.chain.from_iterable(
            (Locus.from_interbase_coordinates(locus_interval.contig, pos)
                for pos in locus_interval.positions)
            for locus_interval in sorted(loci))
        for locus in locus_iterator:
            result.pileups[locus] = Pileup(locus, [])
            if normalized_contig_names:
                try:
                    chromosome = chromosome_name_map[locus.contig]
                except KeyError:
                    logging.warn("No such contig in bam: %s" % locus.contig)
                    continue
            else:
                chromosome = locus.contig
            columns = pysam_samfile.pileup(
                chromosome,
                locus.position,
                locus.position + 1,  # exclusive, 0-indexed
                truncate=True,
                stepper="nofilter")
            try:
                column = next(columns)
            except StopIteration:
                # No reads align to this locus.
                continue

            # Note that storing the pileups here is necessary, since the
            # subsequent assertion will invalidate our column.
            pileups = column.pileups
            assert list(columns) == []  # column is invalid after this.
            for pileup_read in pileups:
                if not pileup_read.is_refskip:
                    element = PileupElement.from_pysam_alignment(
                        locus, pileup_read)
                    result.pileups[locus].append(element)
        return result
    finally:
        if close_on_completion:
            pysam_samfile.close()

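A hedged usage sketch, assuming from_bam is exposed on PileupCollection (as its construction of PileupCollection({}) suggests) and that an indexed BAM file exists at the hypothetical path below; the contig and coordinate are illustrative only.

# Hypothetical path and coordinate; the BAM must be indexed (.bai present).
loci = [Locus.from_interbase_coordinates("20", 10000)]
collection = PileupCollection.from_bam("sample.bam", loci)
for locus, pileup in collection.pileups.items():
    print(locus)  # one Pileup per requested interbase position
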
def is_valid_human_protein_id(protein_id):
    """Is the argument a valid identifier for human Ensembl proteins?"""
    return is_string(protein_id) and protein_id.startswith("ENSP")

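A short sketch of the three is_valid_* helpers above; the identifiers are illustrative strings chosen only to match or violate the documented prefixes.

assert is_valid_ensembl_id("ENSG00000000001")           # any "ENS" feature ID
assert is_valid_human_transcript_id("ENST00000000001")  # transcripts start with "ENST"
assert is_valid_human_protein_id("ENSP00000000001")     # proteins start with "ENSP"
assert not is_valid_human_transcript_id("ENSP00000000001")
assert not is_valid_ensembl_id(42)                      # non-strings are rejected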