コード例 #1
0
ファイル: test_strings.py プロジェクト: openvax/typechecks
def test_is_string():
    """Check that is_string accepts text values and rejects everything else."""
    # Plain, empty, and non-ASCII unicode literals must all count as strings.
    for text in ("hello", "", u"Ѽ", u"ﮚ"):
        assert is_string(text)

    # Numbers, containers, arbitrary objects, and None must be rejected.
    for other in (1, 1.0, [], object(), None):
        assert not is_string(other)
コード例 #2
0
ファイル: reference.py プロジェクト: yusuf1759/varcode
def infer_genome(genome_object_string_or_int):
    """
    Resolve a flexible genome specification to a Genome object.

    Accepted inputs:

    * a PyEnsembl ``Genome`` instance, which is used as-is;
    * an integer, which is looked up with ``cached_ensembl_release``;
    * a string, which is resolved with ``infer_genome_for_reference_name``.
      That helper also reports whether a UCSC name (e.g. "hg19") was
      replaced by an Ensembl reference (e.g. GRCh37).

    Returns
    -------
    A pair ``(genome, converted_ucsc_to_ensembl)``. The flag is False for
    Genome and integer inputs, and whatever the reference-name helper
    reports for string inputs.

    Raises
    ------
    TypeError
        If the argument is not a Genome, an integer, or a string.
    """
    spec = genome_object_string_or_int

    if isinstance(spec, Genome):
        return spec, False

    if is_integer(spec):
        return cached_ensembl_release(spec), False

    if is_string(spec):
        # Unpack explicitly so a malformed helper result still fails here.
        genome, converted_ucsc_to_ensembl = \
            infer_genome_for_reference_name(spec)
        return genome, converted_ucsc_to_ensembl

    message = (
        "Expected genome to be an int, string, or pyensembl.Genome "
        "instance, got %s : %s")
    raise TypeError(message % (str(spec), type(spec)))
コード例 #3
0
ファイル: variant_includes.py プロジェクト: hammerlab/varlens
    def __init__(self, hla=None, hla_dataframe=None, donor_to_hla=None):
        """
        Initialize from exactly one source of HLA alleles.

        Parameters
        -----------
        hla : string or list of string
            HLA alleles to use for all donors. A string is converted with
            ``self.string_to_hla_alleles``; any other value is stored as
            given.

        hla_dataframe : pandas.DataFrame with columns 'donor' and 'hla'
            DataFrame giving HLA alleles for each donor. The 'hla' column
            should be a space separated list of alleles for that donor.
            Null entries are stored as None for that donor.

        donor_to_hla : dict of string -> string list
            Map from donor to HLA alleles for that donor.

        Raises
        ------
        TypeError
            If zero or more than one of the three sources is given.
        ValueError
            If ``hla_dataframe`` has more than one row for a donor.
        """
        sources_given = sum([
            bool(hla),
            hla_dataframe is not None,
            bool(donor_to_hla),
        ])
        if sources_given != 1:
            raise TypeError(
                "Must specify exactly one of hla, hla_dataframe, donor_to_hla")

        if typechecks.is_string(hla):
            self.hla = self.string_to_hla_alleles(hla)
        else:
            self.hla = hla
        self.donor_to_hla = donor_to_hla

        if hla_dataframe is not None:
            # The per-donor map is rebuilt from the DataFrame rows.
            alleles_by_donor = self.donor_to_hla = {}
            for (_, record) in hla_dataframe.iterrows():
                donor = record.donor
                if donor in alleles_by_donor:
                    raise ValueError("Multiple rows for donor: %s" % donor)
                if pandas.isnull(record.hla):
                    alleles_by_donor[donor] = None
                else:
                    alleles_by_donor[donor] = self.string_to_hla_alleles(
                        record.hla)

        assert self.hla is not None or self.donor_to_hla is not None
コード例 #4
0
ファイル: variant_includes.py プロジェクト: openvax/varlens
    def __init__(self, hla=None, hla_dataframe=None, donor_to_hla=None):
        """
        Set up the HLA allele source; exactly one argument must be supplied.

        Parameters
        -----------
        hla : string or list of string
            HLA alleles shared by all donors. Strings are parsed with
            ``self.string_to_hla_alleles``; other values are kept unchanged.

        hla_dataframe : pandas.DataFrame with columns 'donor' and 'hla'
            One row per donor. The 'hla' column should be a space separated
            list of alleles for that donor; a null value maps the donor to
            None.

        donor_to_hla : dict of string -> string list
            Map from donor to HLA alleles for that donor.

        Raises
        ------
        TypeError
            Unless exactly one of the three arguments is specified.
        ValueError
            If a donor appears in more than one row of ``hla_dataframe``.
        """
        provided = (
            bool(hla), hla_dataframe is not None, bool(donor_to_hla))
        if sum(provided) != 1:
            raise TypeError(
                "Must specify exactly one of hla, hla_dataframe, donor_to_hla")

        hla_is_text = typechecks.is_string(hla)
        self.hla = self.string_to_hla_alleles(hla) if hla_is_text else hla
        self.donor_to_hla = donor_to_hla

        if hla_dataframe is not None:
            self.donor_to_hla = {}
            for (_, row) in hla_dataframe.iterrows():
                donor = row.donor
                if donor in self.donor_to_hla:
                    raise ValueError("Multiple rows for donor: %s" % donor)
                raw_alleles = row.hla
                self.donor_to_hla[donor] = (
                    None if pandas.isnull(raw_alleles)
                    else self.string_to_hla_alleles(raw_alleles))

        assert self.hla is not None or self.donor_to_hla is not None
コード例 #5
0
def normalize_chromosome(c):
    """
    Return the canonical, interned spelling of a chromosome name.

    ``c`` must be a string or an integer. Results are memoized in
    NORMALIZE_CHROMOSOME_CACHE, keyed by the original argument.

    Normalization rules:

    * names starting with "chr" and containing no "_" keep the lowercase
      prefix and have the remainder upper-cased ("chrx" -> "chrX");
    * purely alphabetic names are upper-cased ("x" -> "X");
    * everything else (e.g. "chrUn_gl000212", "12") is left unchanged.

    Raises
    ------
    TypeError
        If ``c`` is neither a string nor an integer.
    ValueError
        If the name is "0" or empty.
    """
    if c in NORMALIZE_CHROMOSOME_CACHE:
        return NORMALIZE_CHROMOSOME_CACHE[c]

    if not is_string(c) and not is_integer(c):
        raise TypeError("Chromosome cannot be '%s' : %s" % (c, type(c)))

    name = str(c)

    if name == "0":
        raise ValueError("Chromosome name cannot be 0")
    if name == "":
        raise ValueError("Chromosome name cannot be empty")

    has_chr_prefix = name.startswith("chr")
    if has_chr_prefix and "_" not in name:
        # Names with "_" such as "chrUn_gl000212" are deliberately skipped.
        name = "chr" + name[3:].upper()
    elif name.isalpha():
        name = name.upper()

    # Chromosome names are built or parsed millions of times (e.g. while
    # reading GTF files); interning keeps a single copy of each in memory.
    name = intern(name)

    NORMALIZE_CHROMOSOME_CACHE[c] = name
    return name
コード例 #6
0
ファイル: resource_collection.py プロジェクト: arahuja/sefara
 def write(self, file=None, format=None, indent=None):
     """
     Serialize this collection to a path, a file handle, or standard output.

     Parameters
     ----------
     file : string or file handle [optional, default: sys.stdout]
         Path or file handle to write to. A path is opened for writing and
         closed again before returning. A handle supplied by the caller is
         written to but left open.

     format : string, one of "python" or "json" [optional]
         Output format. If omitted, it is guessed from the filename
         extension (".json" or ".py") when ``file`` is a path, and defaults
         to "json" when writing to standard output. It must be given
         explicitly when ``file`` is a file handle.

     indent : int [optional]
         Forwarded as the ``indent`` keyword to ``to_json`` / ``to_python``
         when not None.

     Raises
     ------
     ValueError
         If the format cannot be guessed from the filename or is not
         supported.
     """
     path = file if typechecks.is_string(file) else None

     # Resolve and validate the format BEFORE touching the filesystem, so a
     # bad filename or format never leaks an open handle or truncates an
     # existing file.
     if format is None:
         if path is not None:
             if path.endswith(".json"):
                 format = "json"
             elif path.endswith(".py"):
                 format = "python"
             else:
                 raise ValueError(
                     "Couldn't guess format from filename: %s" % file)
         elif not file:
             format = "json"

     extra_args = {} if indent is None else {"indent": indent}
     if format == "json":
         value = self.to_json(**extra_args)
     elif format == "python":
         value = self.to_python(**extra_args)
     else:
         raise ValueError("Unsupported format: %s" % format)

     if path is not None:
         # Only files opened here are closed here.
         with open(path, "w") as fd:
             fd.write(value)
     else:
         fd = file if file else sys.stdout
         fd.write(value)
コード例 #7
0
def normalize_chromosome(c):
    """
    Return the canonical, interned spelling of a chromosome name.

    ``c`` must be a string or an integer. Results are memoized in
    NORMALIZE_CHROMOSOME_CACHE, keyed by the original argument.

    Normalization rules:

    * a leading lowercase "chr" is removed;
    * the remainder is upper-cased ("x" -> "X");
    * the mitochondrial name "M" becomes "MT". This check runs after
      upper-casing, so "m" and "chrm" normalize the same way as "M" and
      "chrM".

    Raises
    ------
    TypeError
        If ``c`` is neither a string nor an integer.
    ValueError
        If the name is "0" or empty.
    """
    try:
        return NORMALIZE_CHROMOSOME_CACHE[c]
    except KeyError:
        pass

    if not (is_string(c) or is_integer(c)):
        raise TypeError("Chromosome cannot be '%s' : %s" % (c, type(c)))

    result = str(c)
    if result == "0":
        raise ValueError("Chromosome name cannot be 0")
    elif result == "":
        raise ValueError("Chromosome name cannot be empty")

    # only strip off lowercase chr since some of the non-chromosomal
    # contigs start with "CHR"
    if result.startswith("chr"):
        result = result[3:]

    # just in case someone is being lazy, capitalize "m", "x" and "y".
    # This must happen before the mitochondrial check below, otherwise a
    # lowercase "m" would be returned as "M" instead of "MT".
    result = result.upper()

    # standardize mitochondrial genome to be "MT"
    if result == "M":
        result = "MT"

    # interning strings since the chromosome names probably get constructed
    # or parsed millions of times, can save memory in tight situations
    # (such as parsing GTF files)
    result = intern(result)
    NORMALIZE_CHROMOSOME_CACHE[c] = result
    return result
コード例 #8
0
ファイル: reference.py プロジェクト: Al3n70rn/varcode
def infer_genome(genome_object_string_or_int):
    """
    Resolve a flexible genome specification to a Genome object.

    * A PyEnsembl ``Genome`` is returned unchanged.
    * An integer is treated as an Ensembl version and looked up with
      ``cached_release``.
    * A string is first mapped to a canonical reference name (e.g.
      hg19 -> GRCh37) and then resolved with ``genome_for_reference_name``.

    Raises
    ------
    TypeError
        If the argument is not a Genome, an integer, or a string.
    """
    spec = genome_object_string_or_int

    if isinstance(spec, Genome):
        return spec

    if is_integer(spec):
        return cached_release(spec)

    if is_string(spec):
        return genome_for_reference_name(infer_reference_name(spec))

    message = (
        "Expected genome to be an int, string, or pyensembl.Genome "
        "instance, got %s : %s")
    raise TypeError(message % (str(spec), type(spec)))
コード例 #9
0
ファイル: variants_util.py プロジェクト: melsiddieg/varlens
def evaluate_variant_expression(
        expression,
        collection,
        variant,
        error_value=evaluation.RAISE,
        extra_bindings=None):
    """
    Evaluate a string expression or a callable against a single variant.

    Parameters
    ----------
    expression : string or callable
        A string is evaluated with ``evaluation.evaluate_expression``.
        Anything else is called with ``variant`` as its only argument and
        its result is returned.

    collection : object
        Collection the variant belongs to. Its ``metadata`` mapping is
        consulted for this variant's metadata (an empty dict if absent).

    variant : object
        The variant to evaluate against; must expose ``start`` and ``end``.

    error_value : object [optional]
        Passed through to ``evaluation.evaluate_expression``.

    extra_bindings : dict [optional]
        Additional names made available to string expressions. They are
        applied last, so they take precedence over the built-in names and
        the variant metadata. Not used when ``expression`` is a callable.

    Returns
    -------
    The value produced by the expression or callable.
    """
    if not typechecks.is_string(expression):
        return expression(variant)

    variant_metadata = collection.metadata.get(variant, {})
    # Build a fresh dict on every call: the caller's mapping is never
    # mutated, and it is merged in rather than discarded.
    all_bindings = {
        'inclusive_start': variant.start,
        'inclusive_end': variant.end,
        'interbase_start': variant.start - 1,
        'interbase_end': variant.end,
        'variant': variant,
        'collection': collection,
        'metadata': variant_metadata,
    }
    all_bindings.update(variant_metadata)
    if extra_bindings:
        all_bindings.update(extra_bindings)

    bindings = evaluation.EvaluationEnvironment([variant], all_bindings)
    return evaluation.evaluate_expression(
        expression,
        bindings,
        error_value=error_value)
コード例 #10
0
ファイル: locus.py プロジェクト: BioInfoTools/pyensembl
def normalize_chromosome(c):
    """
    Return the canonical, interned spelling of a chromosome name.

    ``c`` must be a string or an integer. Results are memoized in
    NORMALIZE_CHROMOSOME_CACHE, keyed by the original argument.

    A leading lowercase "chr" is dropped, the remainder is upper-cased,
    and the mitochondrial name "M" is standardized to "MT".

    Raises
    ------
    TypeError
        If ``c`` is neither a string nor an integer.
    ValueError
        If the name is "0" or empty.
    """
    if c in NORMALIZE_CHROMOSOME_CACHE:
        return NORMALIZE_CHROMOSOME_CACHE[c]

    if not is_string(c) and not is_integer(c):
        raise TypeError("Chromosome cannot be '%s' : %s" % (c, type(c)))

    text = str(c)
    if text == "0":
        raise ValueError("Chromosome name cannot be 0")
    if text == "":
        raise ValueError("Chromosome name cannot be empty")

    # Only the lowercase prefix is removed, because some non-chromosomal
    # contigs legitimately start with "CHR".
    without_prefix = text[3:] if text.startswith("chr") else text

    # Tolerate lazily typed names such as "m", "mt", "x" and "y".
    upper_name = without_prefix.upper()

    # The mitochondrial genome is always reported as "MT".
    canonical = "MT" if upper_name == "M" else upper_name

    # Chromosome names are built or parsed millions of times (e.g. while
    # reading GTF files); interning keeps a single copy of each in memory.
    canonical = intern(canonical)

    NORMALIZE_CHROMOSOME_CACHE[c] = canonical
    return canonical
コード例 #11
0
ファイル: reference.py プロジェクト: barryhicks/varcode
def infer_genome(genome_object_string_or_int):
    """
    Turn a Genome, an Ensembl version number, or a reference name into a
    Genome object.

    A PyEnsembl ``Genome`` is passed straight through. An integer is
    looked up with ``cached_release``. A string is canonicalized with
    ``infer_reference_name`` (e.g. hg19 -> GRCh37) and then resolved with
    ``genome_for_reference_name``.

    Raises
    ------
    TypeError
        For any other kind of argument.
    """
    requested = genome_object_string_or_int

    if isinstance(requested, Genome):
        genome = requested
    elif is_integer(requested):
        genome = cached_release(requested)
    elif is_string(requested):
        reference_name = infer_reference_name(requested)
        genome = genome_for_reference_name(reference_name)
    else:
        template = (
            "Expected genome to be an int, string, or pyensembl.Genome "
            "instance, got %s : %s")
        described = (str(requested), type(requested))
        raise TypeError(template % described)
    return genome
コード例 #12
0
def evaluate_read_expression(expression,
                             alignment,
                             error_value=evaluation.RAISE,
                             extra_bindings=None):
    """
    Evaluate a string expression or a callable against a single alignment.

    Parameters
    ----------
    expression : string or callable
        A string is evaluated with ``evaluation.evaluate_expression``.
        Anything else is called with ``alignment`` as its only argument.

    alignment : object
        The read alignment to evaluate against.

    error_value : object [optional]
        Passed through to ``evaluation.evaluate_expression``.

    extra_bindings : dict [optional]
        Additional names made available to string expressions. Not used
        when ``expression`` is a callable.

    Returns
    -------
    The value produced by the expression or callable.
    """
    if not typechecks.is_string(expression):
        return expression(alignment)

    # Copy the caller's mapping so it is honored (not discarded) and never
    # shared with the evaluation environment.
    extra = dict(extra_bindings) if extra_bindings else {}
    bindings = evaluation.EvaluationEnvironment([alignment], extra=extra)
    return evaluation.evaluate_expression(expression,
                                          bindings,
                                          error_value=error_value)
コード例 #13
0
ファイル: reads_util.py プロジェクト: melsiddieg/varlens
def evaluate_read_expression(
        expression,
        alignment,
        error_value=evaluation.RAISE,
        extra_bindings=None):
    """
    Evaluate a string expression or a callable against a single alignment.

    Parameters
    ----------
    expression : string or callable
        A string is evaluated with ``evaluation.evaluate_expression``.
        Anything else is called with ``alignment`` as its only argument.

    alignment : object
        The read alignment to evaluate against.

    error_value : object [optional]
        Passed through to ``evaluation.evaluate_expression``.

    extra_bindings : dict [optional]
        Additional names made available to string expressions. Not used
        when ``expression`` is a callable.

    Returns
    -------
    The value produced by the expression or callable.
    """
    if not typechecks.is_string(expression):
        return expression(alignment)

    # Copy the caller's mapping so it is honored (not discarded) and never
    # shared with the evaluation environment.
    extra = dict(extra_bindings) if extra_bindings else {}
    bindings = evaluation.EvaluationEnvironment(
        [alignment],
        extra=extra)
    return evaluation.evaluate_expression(
        expression,
        bindings,
        error_value=error_value)
コード例 #14
0
    def write(self, file=None, format=None, indent=None):
        """
        Serialize this collection to disk, a file handle, or standard output.

        Parameters
        ----------
        file : string or file handle [optional, default: sys.stdout]
            Path or file handle to write to. A path is opened for writing
            and closed again before returning. A handle supplied by the
            caller is written to but left open.

        format : string, one of "python" or "json" [optional]
            Output format. If not specified, it is guessed from the filename
            extension (".json" or ".py") when ``file`` is a path, and
            defaults to "python" when writing to standard output. It must be
            given explicitly when ``file`` is a file handle.

        indent : int [optional]
            Number of spaces to use for indentation.

        Raises
        ------
        ValueError
            If the format cannot be guessed from the filename or is not
            supported.
        """
        path = file if typechecks.is_string(file) else None

        # Resolve and validate the format BEFORE touching the filesystem, so
        # a bad filename or format never leaks an open handle or truncates
        # an existing file.
        if format is None:
            if path is not None:
                if path.endswith(".json"):
                    format = "json"
                elif path.endswith(".py"):
                    format = "python"
                else:
                    raise ValueError(
                        "Couldn't guess format from filename: %s" % file)
            elif not file:
                format = "python"

        extra_args = {} if indent is None else {"indent": indent}
        if format == "json":
            value = self.to_json(**extra_args)
        elif format == "python":
            value = self.to_python(**extra_args)
        else:
            raise ValueError("Unsupported format: %s" % format)

        if path is not None:
            # Only files opened here are closed here.
            with open(path, "w") as fd:
                fd.write(value)
        else:
            fd = file if file else sys.stdout
            fd.write(value)
コード例 #15
0
def evaluate_pileup_element_expression(expression,
                                       collection,
                                       pileup,
                                       element,
                                       error_value=evaluation.RAISE,
                                       extra_bindings=None):
    """
    Evaluate a string expression or a callable for one pileup element.

    Parameters
    ----------
    expression : string or callable
        A string is evaluated with ``evaluation.evaluate_expression``.
        Anything else is called with ``pileup`` as its only argument.

    collection : object
        Exposed to string expressions under the name ``collection``.

    pileup : object
        Exposed to string expressions under the name ``pileup``.

    element : object
        Exposed to string expressions under the name ``element``; must
        have an ``alignment`` attribute.

    error_value : object [optional]
        Passed through to ``evaluation.evaluate_expression``.

    extra_bindings : dict [optional]
        Additional names made available to string expressions. They are
        applied last, so they take precedence over the built-in names. Not
        used when ``expression`` is a callable.

    Returns
    -------
    The value produced by the expression or callable.
    """
    if not typechecks.is_string(expression):
        # NOTE(review): the callable receives the pileup, not the element --
        # confirm this is intended.
        return expression(pileup)

    extra = {
        'element': element,
        'pileup': pileup,
        'collection': collection,
    }
    # Merge the caller's bindings instead of silently discarding them.
    if extra_bindings:
        extra.update(extra_bindings)

    bindings = evaluation.EvaluationEnvironment(
        [element, element.alignment, pileup],
        extra=extra)
    return evaluation.evaluate_expression(expression,
                                          bindings,
                                          error_value=error_value)
コード例 #16
0
ファイル: reads_util.py プロジェクト: melsiddieg/varlens
def evaluate_pileup_element_expression(
        expression,
        collection,
        pileup,
        element,
        error_value=evaluation.RAISE,
        extra_bindings=None):
    """
    Evaluate a string expression or a callable for one pileup element.

    Parameters
    ----------
    expression : string or callable
        A string is evaluated with ``evaluation.evaluate_expression``.
        Anything else is called with ``pileup`` as its only argument.

    collection : object
        Exposed to string expressions under the name ``collection``.

    pileup : object
        Exposed to string expressions under the name ``pileup``.

    element : object
        Exposed to string expressions under the name ``element``; must
        have an ``alignment`` attribute.

    error_value : object [optional]
        Passed through to ``evaluation.evaluate_expression``.

    extra_bindings : dict [optional]
        Additional names made available to string expressions. They are
        applied last, so they take precedence over the built-in names. Not
        used when ``expression`` is a callable.

    Returns
    -------
    The value produced by the expression or callable.
    """
    if not typechecks.is_string(expression):
        # NOTE(review): the callable receives the pileup, not the element --
        # confirm this is intended.
        return expression(pileup)

    extra = {
        'element': element,
        'pileup': pileup,
        'collection': collection,
    }
    # Merge the caller's bindings instead of silently discarding them.
    if extra_bindings:
        extra.update(extra_bindings)

    bindings = evaluation.EvaluationEnvironment(
        [element, element.alignment, pileup],
        extra=extra)
    return evaluation.evaluate_expression(
        expression,
        bindings,
        error_value=error_value)
コード例 #17
0
ファイル: variants_util.py プロジェクト: melsiddieg/varlens
def evaluate_variant_expression(expression,
                                collection,
                                variant,
                                error_value=evaluation.RAISE,
                                extra_bindings=None):
    """
    Evaluate a string expression or a callable against a single variant.

    Parameters
    ----------
    expression : string or callable
        A string is evaluated with ``evaluation.evaluate_expression``.
        Anything else is called with ``variant`` as its only argument and
        its result is returned.

    collection : object
        Collection the variant belongs to. Its ``metadata`` mapping is
        consulted for this variant's metadata (an empty dict if absent).

    variant : object
        The variant to evaluate against; must expose ``start`` and ``end``.

    error_value : object [optional]
        Passed through to ``evaluation.evaluate_expression``.

    extra_bindings : dict [optional]
        Additional names made available to string expressions. They are
        applied last, so they take precedence over the built-in names and
        the variant metadata. Not used when ``expression`` is a callable.

    Returns
    -------
    The value produced by the expression or callable.
    """
    if not typechecks.is_string(expression):
        return expression(variant)

    variant_metadata = collection.metadata.get(variant, {})
    # Build a fresh dict on every call: the caller's mapping is never
    # mutated, and it is merged in rather than discarded.
    all_bindings = {
        'inclusive_start': variant.start,
        'inclusive_end': variant.end,
        'interbase_start': variant.start - 1,
        'interbase_end': variant.end,
        'variant': variant,
        'collection': collection,
        'metadata': variant_metadata,
    }
    all_bindings.update(variant_metadata)
    if extra_bindings:
        all_bindings.update(extra_bindings)

    bindings = evaluation.EvaluationEnvironment([variant], all_bindings)
    return evaluation.evaluate_expression(expression,
                                          bindings,
                                          error_value=error_value)
コード例 #18
0
ファイル: common.py プロジェクト: chrinide/pyensembl
def is_valid_human_transcript_id(transcript_id):
    """Return whether the argument is a string that starts with "ENST"."""
    if not is_string(transcript_id):
        return False
    return transcript_id.startswith("ENST")
コード例 #19
0
ファイル: resource.py プロジェクト: timodonnell/sefara
    def evaluate(self, expression, error_value=RAISE, extra_bindings={}):
        """
        Evaluate a Python expression string, or call a callable, in the
        context of this resource.

        Parameters
        ----------
        expression : string or callable
            A string must be a valid Python expression. It is evaluated
            with the attributes of this resource in the local namespace,
            so for a resource with a ``name`` attribute the expression
            "name.lower()" gives the lower-cased name. Tags are reachable
            through the ``tags`` variable: "tags.foo" evaluates to ``True``
            if the resource has a tag called ``foo`` and to ``False``
            otherwise.

            The evaluation namespace also contains a few common modules,
            including ``os``, ``sys``, ``collections``, ``re``, and
            ``json``, and the resource object itself under the name
            ``resource``.

            For a primitive form of exception handling the namespace
            provides ``on_error(value)``, which takes one argument of any
            type and returns None. If ``on_error`` has been called and the
            expression subsequently raises, the exception is caught and
            ``value`` becomes the result of the expression. For example:

                ``on_error(False) or foo.startswith("bar")``

            yields ``False`` if the right-hand side raises (for instance
            because the resource has no attribute ``foo``). The
            ``on_error`` clause has to come first and be joined to the
            main expression with `or`, so that it is called before the
            rest of the expression runs.

            A callable is called with this Resource instance as its only
            argument.

        error_value : object [optional]
            Value to return if evaluation results in an uncaught
            exception. If not specified, the exception is raised to the
            caller.

        extra_bindings : dict [optional]
            Additional variables to include in the evaluation context.

        Returns
        ----------
        The Python object returned by evaluating the expression.

        Raises
        ----------
        ValueError
            If evaluation fails and no error value applies. The message
            holds the original error text plus the expression and the
            resource.
        """
        # Python 2 has no ``nonlocal``, so the fallback value lives in a
        # one-element list that the ``on_error`` closure can overwrite.
        fallback = [error_value]

        def on_error(value):
            fallback[0] = value

        try:
            if not typechecks.is_string(expression):
                return expression(self)

            # Start from the standard modules, then layer on the resource,
            # the ``on_error`` hack, and finally the caller's bindings.
            namespace = dict(STANDARD_EVALUATION_ENVIRONMENT)
            namespace["resource"] = self
            namespace["on_error"] = on_error
            namespace.update(extra_bindings)

            # The resource itself is passed as the locals mapping.
            return eval(expression, namespace, self)
        except Exception as error:
            if fallback[0] is not Resource.RAISE:
                return fallback[0]
            context = "Error while evaluating: \n\t%s\non resource:\n%s" % (
                expression, self)
            # Re-raise as ValueError while keeping the original traceback.
            tb = sys.exc_info()[2]
            raise_(ValueError, str(error) + "\n" + context, tb)
コード例 #20
0
    def from_bam(pysam_samfile, loci):
        """
        Create a PileupCollection for a set of loci from a BAM file.

        Parameters
        ----------
        pysam_samfile : `pysam.csamfile.Samfile` instance, or filename string
            to a BAM file. The BAM file must be indexed. A file opened here
            from a filename is closed before returning; an instance passed
            in by the caller is left open.

        loci : list of Locus instances
            Loci to collect pileups for. Each item is converted with
            ``to_locus``.

        Returns
        ----------
        PileupCollection instance containing pileups for the specified loci.
        All alignments in the BAM file are included (e.g. duplicate reads,
        secondary alignments, etc.). See `PileupCollection.filter` if these
        need to be removed. Loci whose contig is not in the BAM file, or
        that no reads align to, get an empty Pileup.
        """

        loci = [to_locus(obj) for obj in loci]

        close_on_completion = False
        if typechecks.is_string(pysam_samfile):
            pysam_samfile = Samfile(pysam_samfile)
            close_on_completion = True

        try:
            # Map from pyensembl normalized chromosome names used in Variant to
            # the names used in the BAM file.
            chromosome_name_map = {}
            for name in pysam_samfile.references:
                normalized = pyensembl.locus.normalize_chromosome(name)
                chromosome_name_map[normalized] = name

            result = PileupCollection({})

            # Optimization: we sort variants so our BAM reads are localized.
            locus_iterator = itertools.chain.from_iterable(
                (Locus.from_interbase_coordinates(locus_interval.contig, pos)
                 for pos in locus_interval.positions)
                for locus_interval in sorted(loci)
            )
            for locus in locus_iterator:
                # Every requested locus gets an entry, even if it stays empty.
                result.pileups[locus] = Pileup(locus, [])
                try:
                    chromosome = chromosome_name_map[locus.contig]
                except KeyError:
                    # logging.warn is a deprecated alias; use warning() and
                    # let the logging module do the formatting.
                    logging.warning(
                        "No such contig in bam: %s", locus.contig)
                    continue
                columns = pysam_samfile.pileup(
                    chromosome,
                    locus.position,
                    locus.position + 1,  # exclusive, 0-indexed
                    truncate=True,
                    stepper="nofilter",
                )
                try:
                    column = next(columns)
                except StopIteration:
                    # No reads align to this locus.
                    continue

                # Note that storing the pileups here is necessary, since the
                # subsequent assertion will invalidate our column.
                pileups = column.pileups
                assert list(columns) == []  # column is invalid after this.
                for pileup_read in pileups:
                    if not pileup_read.is_refskip:
                        element = PileupElement.from_pysam_alignment(
                            locus, pileup_read)
                        result.pileups[locus].append(element)
            return result
        finally:
            if close_on_completion:
                pysam_samfile.close()
コード例 #21
0
def is_valid_human_transcript_id(transcript_id):
    """Tell whether the argument is a string with the "ENST" prefix."""
    return (
        transcript_id.startswith("ENST")
        if is_string(transcript_id)
        else False)
コード例 #22
0
def is_valid_ensembl_id(ensembl_id):
    """Return whether the argument is a string that starts with "ENS"."""
    if not is_string(ensembl_id):
        return False
    return ensembl_id.startswith("ENS")
コード例 #23
0
    def select(self, *expressions, **kwargs):
        """
        Select fields (or expressions) from each resource as a pandas
        DataFrame.

        Parameters
        ----------
        *expressions : string, callable, or (string, string or callable) pair
            One or more expressions giving the fields to select.

            Each expression can be either a ``string`` expression, a
            ``callable``, or a ``(string, string or callable)`` pair giving a
            label and an expression.

            Labels give the column names in the result. Labels can be specified
            either by giving a ``(label, expression)`` pair, or giving a string
            of the form "LABEL: EXPRESSION", such as
            "upper_name: name.upper()". Here "upper_name" is the label, and
            "name.upper()" is the expression that will be evaluated. If not
            specified, labels default to the text of the ``expression`` if
            ``expression`` is a string, and an automatically generated label if
            ``expression`` is a callable.

            Each ``expression`` will be passed to `Resource.evaluate` for each
            resource in the collection. See that method's docs for details on
            expression evaluation.

        if_error : string, one of "raise", "skip", or "none" [default: "raise"]
            Must be specified as a keyword argument. Controls the behavior when
            evaluation of an expression raises an uncaught exception. One of:

            raise
                Raise the exception to the caller. This is the default.

            skip
                Skip resources where evaluation of any of the expressions
                raises an error. These resources will be omitted from the
                result.

            none
                If evaluating an expression on a resource raises an exception,
                set that entry in the result to ``None``.

        Returns
        -------
        A `pandas.DataFrame`. Rows correspond to resources. Columns correspond
        to the specified expressions.

        Raises
        ------
        TypeError
            If ``if_error`` has an unknown value or any other keyword
            argument is given.
        """
        if_error = kwargs.pop("if_error", "raise")
        if if_error == "raise" or if_error == "skip":
            # In both modes evaluate() raises; "skip" is handled by the
            # except clause in values_for_resource below.
            error_value = Resource.RAISE
        elif if_error == "none":
            error_value = None
        else:
            raise TypeError("if_error should be 'raise', 'skip', or 'none'")
        if kwargs:
            raise TypeError("Invalid keyword arguments: %s" % " ".join(kwargs))

        # Normalize every argument to a (label, expression) pair.
        labels_and_expressions = []
        expr_num = 1
        for expression in expressions:
            if isinstance(expression, tuple):
                (label, expression) = expression
            elif typechecks.is_string(expression):
                # "LABEL: EXPRESSION" strings carry their own label.
                match = re.match(r"^([\w\- ]+):(.*)$", expression)
                if match is None:
                    label = expression
                else:
                    (label, expression) = match.groups()
            else:
                # Callables get generated labels: expr_1, expr_2, ...
                label = "expr_%d" % expr_num
                expr_num += 1
            labels_and_expressions.append((label, expression))

        df_dict = collections.OrderedDict(
            (label, []) for (label, _) in labels_and_expressions)

        # Every attribute known to the collection is bound to None and
        # passed to evaluate() as extra bindings.
        extra_bindings = {key: None for key in self.attributes}

        def values_for_resource(resource):
            """Return the row of values for a resource, or None to skip it."""
            result = []
            for (label, expression) in labels_and_expressions:
                try:
                    value = resource.evaluate(
                        expression,
                        error_value=error_value,
                        extra_bindings=extra_bindings)
                except Exception:
                    # Only ordinary errors are handled here. A bare except
                    # would also swallow KeyboardInterrupt and SystemExit
                    # in the "skip" and "none" modes.
                    if if_error == "raise":
                        raise
                    elif if_error == "skip":
                        return None
                    elif if_error == "none":
                        value = None
                result.append(value)
            return result

        for resource in self:
            row = values_for_resource(resource)
            if row is not None:
                for ((label, _), value) in zip(labels_and_expressions, row):
                    df_dict[label].append(value)

        return pandas.DataFrame(df_dict)
コード例 #24
0
ファイル: common.py プロジェクト: chrinide/pyensembl
def is_valid_ensembl_id(ensembl_id):
    """Tell whether the argument is a string with the "ENS" prefix."""
    return (
        ensembl_id.startswith("ENS")
        if is_string(ensembl_id)
        else False)
コード例 #25
0
    def from_bam(pysam_samfile, loci, normalized_contig_names=True):
        '''
        Create a PileupCollection for a set of loci from a BAM file.

        Parameters
        ----------
        pysam_samfile : `pysam.Samfile` instance, or filename string
            to a BAM file. The BAM file must be indexed. A file opened here
            from a filename is closed before returning; an instance passed
            in by the caller is left open.

        loci : list of Locus instances
            Loci to collect pileups for. Each item is converted with
            ``to_locus``.

        normalized_contig_names : whether the contig names have been normalized
            (e.g. pyensembl removes the 'chr' prefix). Set to true to
            de-normalize the names when querying the BAM file. When false,
            each locus contig is passed to the BAM file unchanged.

        Returns
        ----------
        PileupCollection instance containing pileups for the specified loci.
        All alignments in the BAM file are included (e.g. duplicate reads,
        secondary alignments, etc.). See `PileupCollection.filter` if these
        need to be removed. Loci that no reads align to, or whose contig
        cannot be mapped to a BAM reference, get an empty Pileup.
        '''

        loci = [to_locus(obj) for obj in loci]

        close_on_completion = False
        if typechecks.is_string(pysam_samfile):
            pysam_samfile = Samfile(pysam_samfile)
            close_on_completion = True

        try:
            # Map from pyensembl normalized chromosome names used in Variant to
            # the names used in the BAM file. The BAM's own names are mapped
            # to themselves, so loci that already use them resolve too.
            if normalized_contig_names:
                chromosome_name_map = {}
                for name in pysam_samfile.references:
                    normalized = pyensembl.locus.normalize_chromosome(name)
                    chromosome_name_map[normalized] = name
                    chromosome_name_map[name] = name
            else:
                chromosome_name_map = None

            result = PileupCollection({})

            # Optimization: we sort variants so our BAM reads are localized.
            locus_iterator = itertools.chain.from_iterable(
                (Locus.from_interbase_coordinates(locus_interval.contig, pos)
                 for pos in locus_interval.positions)
                for locus_interval in sorted(loci))
            for locus in locus_iterator:
                # Every requested locus gets an entry, even if it stays empty.
                result.pileups[locus] = Pileup(locus, [])
                if normalized_contig_names:
                    try:
                        chromosome = chromosome_name_map[locus.contig]
                    except KeyError:
                        # logging.warn is a deprecated alias; use warning()
                        # and let the logging module do the formatting.
                        logging.warning(
                            "No such contig in bam: %s", locus.contig)
                        continue
                else:
                    chromosome = locus.contig
                columns = pysam_samfile.pileup(
                    chromosome,
                    locus.position,
                    locus.position + 1,  # exclusive, 0-indexed
                    truncate=True,
                    stepper="nofilter")
                try:
                    column = next(columns)
                except StopIteration:
                    # No reads align to this locus.
                    continue

                # Note that storing the pileups here is necessary, since the
                # subsequent assertion will invalidate our column.
                pileups = column.pileups
                assert list(columns) == []  # column is invalid after this.
                for pileup_read in pileups:
                    if not pileup_read.is_refskip:
                        element = PileupElement.from_pysam_alignment(
                            locus, pileup_read)
                        result.pileups[locus].append(element)
            return result
        finally:
            if close_on_completion:
                pysam_samfile.close()
コード例 #26
0
ファイル: common.py プロジェクト: chrinide/pyensembl
def is_valid_human_protein_id(protein_id):
    """Return whether the argument is a string that starts with "ENSP"."""
    if not is_string(protein_id):
        return False
    return protein_id.startswith("ENSP")
コード例 #27
0
def is_valid_human_protein_id(protein_id):
    """Tell whether the argument is a string with the "ENSP" prefix."""
    return (
        protein_id.startswith("ENSP")
        if is_string(protein_id)
        else False)