Beispiel #1
0
def evaluate_variant_expression(
        expression,
        collection,
        variant,
        error_value=evaluation.RAISE,
        extra_bindings=None):
    """
    Evaluate an expression against a single variant.

    Parameters
    ----------
    expression : string or callable
        A string (as judged by typechecks.is_string) is evaluated with
        evaluation.evaluate_expression. Anything else is called with
        `variant` as its only argument.

    collection : object with a `metadata` mapping
        `collection.metadata.get(variant, {})` provides the metadata for
        this variant; each metadata key is also exposed as a name.

    variant : object with `start` and `end` attributes

    error_value : passed through unchanged to evaluation.evaluate_expression

    extra_bindings : dict, optional
        Additional names to expose to a string expression. The names
        defined by this function (and the variant's metadata keys) take
        precedence over entries given here. The dict is not modified.

    Returns
    -------
    The value of evaluation.evaluate_expression for a string expression,
    otherwise the return value of `expression(variant)`.
    """
    if not typechecks.is_string(expression):
        return expression(variant)

    variant_metadata = collection.metadata.get(variant, {})

    # Start from a copy of the caller's bindings (previously these were
    # silently discarded). Layering them first means the names below resolve
    # exactly as they did before for every existing expression.
    names = dict(extra_bindings) if extra_bindings else {}
    names.update({
        'inclusive_start': variant.start,
        'inclusive_end': variant.end,
        'interbase_start': variant.start - 1,
        'interbase_end': variant.end,
        'variant': variant,
        'collection': collection,
        'metadata': variant_metadata,
    })
    # Metadata keys are applied last, so they win over the names above.
    names.update(variant_metadata)

    bindings = evaluation.EvaluationEnvironment([variant], names)
    return evaluation.evaluate_expression(
        expression,
        bindings,
        error_value=error_value)
Beispiel #2
0
    def __init__(self, hla=None, hla_dataframe=None, donor_to_hla=None):
        """
        Set up HLA alleles from exactly one of three sources.

        Parameters
        -----------
        hla : list of string
            HLA alleles to use for all donors. A single string is also
            accepted and is parsed with `string_to_hla_alleles`.

        hla_dataframe : pandas.DataFrame with columns 'donor' and 'hla'
            DataFrame giving HLA alleles for each donor. The 'hla' column
            should be a space separated list of alleles for that donor.
            A null 'hla' value is stored as None for that donor.

        donor_to_hla : dict of string -> string list
            Map from donor to HLA alleles for that donor.

        Raises
        ------
        TypeError
            If the number of sources given is not exactly one.
        ValueError
            If `hla_dataframe` has more than one row for the same donor.
        """
        sources_given = sum([
            bool(hla),
            hla_dataframe is not None,
            bool(donor_to_hla),
        ])
        if sources_given != 1:
            raise TypeError(
                "Must specify exactly one of hla, hla_dataframe, donor_to_hla")

        if typechecks.is_string(hla):
            self.hla = self.string_to_hla_alleles(hla)
        else:
            self.hla = hla

        self.donor_to_hla = donor_to_hla
        if hla_dataframe is not None:
            # Build the per-donor map directly on the instance, one row at a
            # time, rejecting repeated donors.
            alleles_by_donor = {}
            self.donor_to_hla = alleles_by_donor
            for (_, record) in hla_dataframe.iterrows():
                donor = record.donor
                if donor in alleles_by_donor:
                    raise ValueError("Multiple rows for donor: %s" % donor)
                if pandas.isnull(record.hla):
                    alleles_by_donor[donor] = None
                else:
                    alleles_by_donor[donor] = self.string_to_hla_alleles(
                        record.hla)

        assert self.hla is not None or self.donor_to_hla is not None
Beispiel #3
0
 def write(self, file=None, format=None, indent=None):
     """
     Serialize this object and write it to a path, a handle, or stdout.

     Parameters
     ----------
     file : string, file-like object, or None
         A string is treated as a path and opened for writing. A falsy
         non-string value (e.g. None) means sys.stdout. Any other value
         is used as an already-open handle and is not closed here.

     format : string, optional
         Either "json" or "python". If omitted, it is guessed from a
         ".json" / ".py" filename suffix, or defaults to "json" when
         writing to stdout.

     indent : optional
         If not None, forwarded as the `indent` keyword argument to
         `to_json` / `to_python`.

     Raises
     ------
     ValueError
         If the format cannot be guessed from the filename, or if the
         format is not one of the supported values.
     """
     is_path = typechecks.is_string(file)
     use_stdout = (not is_path) and (not file)

     if format is None:
         if is_path:
             if file.endswith(".json"):
                 format = "json"
             elif file.endswith(".py"):
                 format = "python"
             else:
                 raise ValueError(
                     "Couldn't guess format from filename: %s" % file)
         elif use_stdout:
             format = "json"

     # Serialize before touching the destination: opening a path with "w"
     # truncates it, so a bad format or a serialization error must be
     # raised first rather than leaving an emptied (or leaked) file behind.
     extra_args = {} if indent is None else {"indent": indent}
     if format == "json":
         value = self.to_json(**extra_args)
     elif format == "python":
         value = self.to_python(**extra_args)
     else:
         raise ValueError("Unsupported format: %s" % format)

     if is_path:
         with open(file, "w") as fd:
             fd.write(value)
     elif use_stdout:
         sys.stdout.write(value)
     else:
         file.write(value)
0
def infer_genome(genome_object_string_or_int):
    """
    Resolve the argument to a genome object.

    A PyEnsembl Genome is returned as-is.

    An integer is treated as an Ensembl version and resolved through
    `cached_release` (the associated human EnsemblRelease).

    A string is first mapped to its canonical reference name
    (e.g. hg19 -> GRCh37) and then resolved through
    `genome_for_reference_name`.

    Any other type raises TypeError.
    """
    candidate = genome_object_string_or_int

    if isinstance(candidate, Genome):
        return candidate

    if is_integer(candidate):
        return cached_release(candidate)

    if is_string(candidate):
        canonical_name = infer_reference_name(candidate)
        return genome_for_reference_name(canonical_name)

    template = (
        "Expected genome to be an int, string, or pyensembl.Genome "
        "instance, got %s : %s")
    raise TypeError(template % (str(candidate), type(candidate)))
Beispiel #5
0
def normalize_chromosome(c):
    """
    Return the normalized, interned name for chromosome `c`.

    `c` must be a string or an integer. A leading lowercase "chr" is
    removed, the remainder is upper-cased, and "M" becomes "MT". Results
    are memoized in NORMALIZE_CHROMOSOME_CACHE keyed by the original
    argument.

    Raises TypeError for other argument types and ValueError when the
    string form of `c` is "0" or empty.
    """
    try:
        return NORMALIZE_CHROMOSOME_CACHE[c]
    except KeyError:
        # Not seen before; compute and memoize below.
        pass

    if not is_string(c) and not is_integer(c):
        raise TypeError("Chromosome cannot be '%s' : %s" % (c, type(c)))

    name = str(c)
    if name == "0":
        raise ValueError("Chromosome name cannot be 0")
    if name == "":
        raise ValueError("Chromosome name cannot be empty")

    # Only the lowercase prefix is dropped, because some non-chromosomal
    # contig names begin with an uppercase "CHR".
    without_prefix = name[3:] if name.startswith("chr") else name

    # Upper-casing tolerates lazily written "m", "mt", "x", "y".
    upper_name = without_prefix.upper()

    # The mitochondrial genome is always reported as "MT".
    canonical = "MT" if upper_name == "M" else upper_name

    # Chromosome names may be built or parsed millions of times (e.g. while
    # reading GTF files); interning lets equal names share one object.
    canonical = intern(canonical)

    NORMALIZE_CHROMOSOME_CACHE[c] = canonical
    return canonical
Beispiel #6
0
def evaluate_read_expression(
        expression,
        alignment,
        error_value=evaluation.RAISE,
        extra_bindings=None):
    """
    Evaluate an expression against a single read alignment.

    Parameters
    ----------
    expression : string or callable
        A string (as judged by typechecks.is_string) is evaluated with
        evaluation.evaluate_expression. Anything else is called with
        `alignment` as its only argument.

    alignment : object the expression is evaluated against

    error_value : passed through unchanged to evaluation.evaluate_expression

    extra_bindings : dict, optional
        Additional names to expose to a string expression. A copy is
        handed to the evaluation environment; the dict is not modified.

    Returns
    -------
    The value of evaluation.evaluate_expression for a string expression,
    otherwise the return value of `expression(alignment)`.
    """
    if not typechecks.is_string(expression):
        return expression(alignment)

    # Previously an empty dict was always passed here, so any bindings the
    # caller supplied were silently ignored.
    extra = dict(extra_bindings) if extra_bindings else {}
    bindings = evaluation.EvaluationEnvironment([alignment], extra=extra)
    return evaluation.evaluate_expression(
        expression,
        bindings,
        error_value=error_value)
Beispiel #7
0
def evaluate_pileup_element_expression(
        expression,
        collection,
        pileup,
        element,
        error_value=evaluation.RAISE,
        extra_bindings=None):
    """
    Evaluate an expression against a single pileup element.

    Parameters
    ----------
    expression : string or callable
        A string (as judged by typechecks.is_string) is evaluated with
        evaluation.evaluate_expression in an environment built from
        `element`, `element.alignment` and `pileup`. Anything else is
        called with `pileup` as its only argument.

    collection : exposed to string expressions as the name 'collection'

    pileup : exposed to string expressions as the name 'pileup'

    element : object with an `alignment` attribute
        Exposed to string expressions as the name 'element'.

    error_value : passed through unchanged to evaluation.evaluate_expression

    extra_bindings : dict, optional
        Additional names to expose to a string expression. The names
        'element', 'pileup' and 'collection' defined here take precedence
        over entries given by the caller. The dict is not modified.

    Returns
    -------
    The value of evaluation.evaluate_expression for a string expression,
    otherwise the return value of `expression(pileup)`.
    """
    if not typechecks.is_string(expression):
        # NOTE(review): the callable receives the pileup, not the element,
        # unlike the string branch - confirm this is intended.
        return expression(pileup)

    # Start from a copy of the caller's bindings (previously these were
    # silently ignored); the standard names below are applied on top so
    # they keep their existing meaning.
    extra = dict(extra_bindings) if extra_bindings else {}
    extra.update({
        'element': element,
        'pileup': pileup,
        'collection': collection,
    })
    bindings = evaluation.EvaluationEnvironment(
        [element, element.alignment, pileup],
        extra=extra)
    return evaluation.evaluate_expression(
        expression,
        bindings,
        error_value=error_value)
Beispiel #8
0
def is_valid_human_protein_id(protein_id):
    """Check that the argument is a string beginning with "ENSP", the
    prefix of human Ensembl protein identifiers."""
    if is_string(protein_id):
        return protein_id.startswith("ENSP")
    return False
Beispiel #9
0
def is_valid_human_transcript_id(transcript_id):
    """Check that the argument is a string beginning with "ENST", the
    prefix of human Ensembl transcript identifiers."""
    if is_string(transcript_id):
        return transcript_id.startswith("ENST")
    return False
Beispiel #10
0
def is_valid_ensembl_id(ensembl_id):
    """Check that the argument is a string beginning with "ENS", the
    prefix shared by the IDs of all Ensembl features."""
    if is_string(ensembl_id):
        return ensembl_id.startswith("ENS")
    return False
Beispiel #11
0
    def from_bam(pysam_samfile, loci):
        """
        Create a PileupCollection for a set of loci from a BAM file.

        Parameters
        ----------
        pysam_samfile : `pysam.csamfile.Samfile` instance, or filename string
            to a BAM file. The BAM file must be indexed. A file opened here
            from a filename is closed before returning; a Samfile instance
            passed in by the caller is left open.

        loci : list of Locus instances
            Loci to collect pileups for. Each item is converted with
            `to_locus`.

        Returns
        -------
        PileupCollection instance containing pileups for the specified loci.
        All alignments in the BAM file are included (e.g. duplicate reads,
        secondary alignments, etc.). See `PileupCollection.filter` if these
        need to be removed. Loci whose contig is missing from the BAM file,
        or that no reads align to, get an empty pileup.
        """

        loci = [to_locus(obj) for obj in loci]

        close_on_completion = False
        if typechecks.is_string(pysam_samfile):
            # Opened here from a path, so this function must close it.
            pysam_samfile = Samfile(pysam_samfile)
            close_on_completion = True

        try:
            # Map from pyensembl normalized chromosome names used in Variant to
            # the names used in the BAM file.
            chromosome_name_map = {
                pyensembl.locus.normalize_chromosome(name): name
                for name in pysam_samfile.references
            }

            result = PileupCollection({})

            # Optimization: we sort variants so our BAM reads are localized.
            # Each (possibly multi-base) locus is expanded into one
            # single-position locus per position.
            locus_iterator = itertools.chain.from_iterable(
                (
                    Locus.from_interbase_coordinates(
                        locus_interval.contig, pos)
                    for pos in locus_interval.positions
                )
                for locus_interval in sorted(loci)
            )
            for locus in locus_iterator:
                # Register an empty pileup first so every requested position
                # is present in the result even if nothing is found for it.
                result.pileups[locus] = Pileup(locus, [])
                try:
                    chromosome = chromosome_name_map[locus.contig]
                except KeyError:
                    # logging.warn is a deprecated alias; use warning() and
                    # let logging do the formatting.
                    logging.warning(
                        "No such contig in bam: %s", locus.contig)
                    continue
                columns = pysam_samfile.pileup(
                    chromosome,
                    locus.position,
                    locus.position + 1,  # exclusive, 0-indexed
                    truncate=True,
                    stepper="nofilter",
                )
                try:
                    column = next(columns)
                except StopIteration:
                    # No reads align to this locus.
                    continue

                # Note that storing the pileups here is necessary, since the
                # subsequent assertion will invalidate our column.
                pileups = column.pileups
                assert list(columns) == []  # column is invalid after this.
                for pileup_read in pileups:
                    if not pileup_read.is_refskip:
                        element = PileupElement.from_pysam_alignment(
                            locus, pileup_read)
                        result.pileups[locus].append(element)
            return result
        finally:
            if close_on_completion:
                pysam_samfile.close()