Example 1
    def _create_table(self, table_name, column_types, primary=None, nullable=()):
        """Creates a sqlite3 table from the given metadata.

        Parameters
        ----------

        table_name : str
            Name of the table to create

        column_types : list of (str, str) pairs
            First element of each tuple is the column name, second element is the sqlite3 type

        primary : str, optional
            Which column is the primary key

        nullable : iterable, optional
            Names of columns which have null values
        """
        require_string(table_name, "table name")
        require_iterable_of(column_types, tuple, name="rows")
        if primary is not None:
            require_string(primary, "primary")
        require_iterable_of(nullable, str, name="nullable")

        column_decls = []
        for column_name, column_type in column_types:
            decl = "%s %s" % (column_name, column_type)
            if column_name == primary:
                decl += " UNIQUE PRIMARY KEY"
            if column_name not in nullable:
                decl += " NOT NULL"
            column_decls.append(decl)
        column_decl_str = ", ".join(column_decls)
        create_table_sql = \
            "CREATE TABLE %s (%s)" % (table_name, column_decl_str)
        self.execute_sql(create_table_sql)
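
The method above only assembles and executes a CREATE TABLE statement. A minimal standalone sketch of the SQL it would produce for a hypothetical "variants" table (the table and column names here are illustrative, not from the library):

column_types = [("id", "TEXT"), ("start", "INT"), ("description", "TEXT")]
primary = "id"
nullable = ("description",)

column_decls = []
for column_name, column_type in column_types:
    decl = "%s %s" % (column_name, column_type)
    if column_name == primary:
        decl += " UNIQUE PRIMARY KEY"
    if column_name not in nullable:
        decl += " NOT NULL"
    column_decls.append(decl)

print("CREATE TABLE %s (%s)" % ("variants", ", ".join(column_decls)))
# CREATE TABLE variants (id TEXT UNIQUE PRIMARY KEY NOT NULL,
#                        start INT NOT NULL, description TEXT)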
Example 2
def normalize_chromosome(c):
    try:
        return NORMALIZE_CHROMOSOME_CACHE[c]
    except KeyError:
        pass

    result = c
    if is_integer(result):
        if result == 0:
            raise ValueError("Contig cannot be 0")
        result = str(result)
    else:
        require_string(result, "contig name", nonempty=True)

        # only strip off lowercase chr since some of the non-chromosomal
        # contigs start with "CHR"
        if result.startswith("chr"):
            result = result[3:]

        # standardize mitochondrial genome to be "MT"
        if result == "M":
            result = "MT"
        else:
            # just in case someone is being lazy, capitalize "X" and "Y"
            result = result.upper()
        
    NORMALIZE_CHROMOSOME_CACHE[c] = result
    return result
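
A usage sketch of the normalization rules above; the expected values are read directly off the branches in the code, and the import assumes this is the normalize_chromosome exposed by pyensembl.locus, as Examples 11 and 12 below suggest:

from pyensembl.locus import normalize_chromosome

normalize_chromosome("chr1")   # -> "1"   lowercase "chr" prefix stripped
normalize_chromosome(5)        # -> "5"   integers converted to strings
normalize_chromosome("M")      # -> "MT"  mitochondrial genome standardized
normalize_chromosome("x")      # -> "X"   uppercased just in case
normalize_chromosome(0)        # raises ValueError("Contig cannot be 0")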
Example 3
File: vcf.py Project: gmazz/varcode
    def __init__(self, path):
        """
        Construct a new wrapper.

        Parameters
        ----------
        path : string or pyvcf Reader instance
            Path or URL to load, or Reader instance.
        """
        self.path = None  # string path, if available.
        self.vcf_reader = None  # vcf_reader. Will always be set.
        self._to_close = None  # object to call close() on when we're done.

        if isinstance(path, pyvcf.Reader):
            self.vcf_reader = path
        else:
            require_string(path, "Path or URL to VCF")
            self.path = path
            parsed_path = parse_url_or_path(path)
            if not parsed_path.scheme or parsed_path.scheme.lower() == 'file':
                self.vcf_reader = pyvcf.Reader(filename=parsed_path.path,
                                               strict_whitespace=True)
            elif parsed_path.scheme.lower() in ("http", "https", "ftp"):
                self._to_close = response = requests.get(path, stream=True)
                response.raise_for_status()  # raise error on 404, etc.
                if path.endswith(".gz"):
                    lines = stream_gzip_decompress_lines(
                        response.iter_content())
                else:
                    lines = response.iter_lines(decode_unicode=True)
                self.vcf_reader = pyvcf.Reader(fsock=lines,
                                               compressed=False,
                                               strict_whitespace=True)
            else:
                raise ValueError("Unsupported scheme: %s" % parsed_path.scheme)
Example 4
def evaluate_expression(expression, bindings, error_value=RAISE):
    typechecks.require_string(expression)

    # Since Python 2 doesn't have a nonlocal keyword, we have to box up the
    # error_value, so we can reassign to it in the ``on_error`` function
    # below.
    error_box = [error_value]
    try:
        # Give some basic modules.
        standard_environment = dict(STANDARD_EVALUATION_ENVIRONMENT)

        # Add our "on_error" hack.
        def on_error(value):
            error_box[0] = value

        standard_environment["on_error"] = on_error

        return eval(expression, standard_environment, bindings)
    except Exception as e:
        if error_box[0] is not RAISE:
            return error_box[0]
        extra = "Error while evaluating: \n\t%s\non:\n%s" % (expression,
                                                             bindings)
        traceback = sys.exc_info()[2]
        raise_(ValueError, str(e) + "\n" + extra, traceback)
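
A usage sketch of the evaluation contract above, assuming evaluate_expression is in scope; the bindings and expressions are illustrative:

bindings = {"ref": "A", "alt": "T", "start": 10}

evaluate_expression("ref != alt", bindings)    # -> True
evaluate_expression("start + 1", bindings)     # -> 11

# A failing expression normally raises ValueError with the expression and
# bindings appended to the message, unless the expression itself calls the
# injected on_error(...) hook to register a fallback value first:
evaluate_expression("on_error(None) or undefined_name", bindings)  # -> None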
Example 5
    def __init__(self, path):
        """
        Construct a new wrapper.

        Parameters
        ----------
        path : string or pyvcf Reader instance
            Path or URL to load, or Reader instance.
        """
        self.path = None  # string path, if available.
        self.vcf_reader = None  # vcf_reader. Will always be set.
        self._to_close = None  # object to call close() on when we're done.

        if isinstance(path, vcf.Reader):
            self.vcf_reader = path
        else:
            require_string(path, "Path or URL to VCF")
            self.path = path
            parsed_path = parse_url_or_path(path)
            if not parsed_path.scheme or parsed_path.scheme.lower() == 'file':
                self.vcf_reader = vcf.Reader(filename=parsed_path.path)
            elif parsed_path.scheme.lower() in ("http", "https", "ftp"):
                self._to_close = response = requests.get(path, stream=True)
                response.raise_for_status()  # raise error on 404, etc.
                if path.endswith(".gz"):
                    lines = stream_gzip_decompress_lines(
                        response.iter_content())
                else:
                    lines = response.iter_lines(decode_unicode=True)
                self.vcf_reader = vcf.Reader(fsock=lines, compressed=False)
            else:
                raise ValueError("Unsupported scheme: %s" % parsed_path.scheme)
Example 6
    def column_values_at_locus(
            self,
            column_name,
            feature,
            contig,
            position,
            end=None,
            strand=None,
            distinct=False,
            sorted=False):
        """
        Get the non-null values of a column from the database
        at a particular range of loci
        """

        # TODO: combine with the query method, since they overlap
        # significantly
        require_string(column_name, "column_name", nonempty=True)

        contig = normalize_chromosome(contig)

        require_integer(position, "position")

        if end is None:
            end = position

        require_integer(end, "end")

        if not self.column_exists(feature, column_name):
            raise ValueError("Table %s doesn't have column %s" % (
                feature, column_name,))

        if distinct:
            distinct_string = "DISTINCT "
        else:
            distinct_string = ""

        query = """
            SELECT %s%s
            FROM %s
            WHERE seqname = ?
            AND start <= ?
            AND end >= ?

        """ % (distinct_string, column_name, feature)

        query_params = [contig, end, position]

        if strand:
            query += " AND strand = ?"
            query_params.append(strand)

        tuples = self.connection.execute(query, query_params).fetchall()

        # each result is a tuple, so pull out its first element
        results = [t[0] for t in tuples if t[0] is not None]

        if sorted:
            results.sort()
        return results
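
Usage sketch (assumes `db` is an instance of the class this method belongs to; the column, feature, and locus values are illustrative):

gene_names = db.column_values_at_locus(
    "gene_name",          # column to pull values from
    feature="gene",       # table to query
    contig="chr1",        # normalized to "1" before querying
    position=1000000,
    end=2000000,
    strand="+",
    distinct=True,
    sorted=True)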
Example 7
    def dataframe(self, contig=None, feature=None, strand=None):
        """
        Load genome entries as a DataFrame, optionally restricted to
        particular contig or feature type.
        """
        if contig:
            contig = normalize_chromosome(contig)

        if strand:
            strand = normalize_strand(strand)

        if feature is not None:
            require_string(feature, "feature")

        key = (contig, feature, strand)

        if key not in self._dataframes:
            csv_path = self.cached_data_file_path(
                contig=contig,
                feature=feature,
                strand=strand,
                distinct=False)

            def cached_loader_fn():
                # pylint: disable=no-member
                # pylint has trouble with df.seqname and similar
                # statements in this function.

                full_df = self._load_full_dataframe()
                assert len(full_df) > 0, \
                    "Dataframe representation of genomic database empty!"

                # rename since we're going to be filtering the entries but
                # may still want to access the full dataset
                df = full_df
                if contig:
                    df = df[df.seqname == contig]
                    if len(df) == 0:
                        raise ValueError("Contig not found: %s" % (contig,))

                if feature:
                    df = df[df.feature == feature]
                    if len(df) == 0:
                        # check to make sure feature was somewhere in
                        # the full dataset before returning an empty dataframe
                        features = full_df.feature.unique()
                        if feature not in features:
                            raise ValueError(
                                "Feature not found: %s" % (feature,))
                if strand:
                    df = df[df.strand == strand]

                return df

            self._dataframes[key] = cached_dataframe(csv_path, cached_loader_fn)

        return self._dataframes[key]
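
Usage sketch for the method above (assumes `db` is an instance of the containing class; the argument values are illustrative):

forward_genes = db.dataframe(contig="1", feature="gene", strand="+")
all_entries = db.dataframe()   # cached under the (contig, feature, strand) key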
Example 8
 def from_interbase_coordinates(contig, start, end=None):
     '''
     Given coordinates in 0-based interbase coordinates, return a Locus
     instance.
     '''
     typechecks.require_string(contig)
     typechecks.require_integer(start)
     if end is None:
         end = start + 1
     typechecks.require_integer(end)
     return Locus(contig, start, end)
Example 9
 def from_inclusive_coordinates(contig, start, end=None):
     '''
     Given coordinates in 1-based coordinates that are inclusive on start
     and end, return a Locus instance. Locus instances are always 0-based
     "interbase" coordinates.
     '''
     typechecks.require_string(contig)
     typechecks.require_integer(start)
     if end is None:
         end = start
     typechecks.require_integer(end)
     return Locus(contig, start - 1, end)
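
Examples 8 and 9 differ only in the coordinate convention they accept. Assuming both helpers are in scope, the same genomic bases map to the same Locus arguments (contig, 0-based interbase start, end):

from_inclusive_coordinates("1", 100)        # -> Locus("1", 99, 100)
from_interbase_coordinates("1", 99)         # -> Locus("1", 99, 100)

from_inclusive_coordinates("1", 100, 102)   # -> Locus("1", 99, 102)
from_interbase_coordinates("1", 99, 102)    # -> Locus("1", 99, 102)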
Example 10
def _create_cached_db(db_path, tables, version=1):
    """
    Either create or retrieve sqlite database.

    Parameters
    ----------
    db_path : str
        Path to sqlite3 database file

    tables : iterable of datacache.DatabaseTable
        Table objects to create in the database

    version : int, optional
        Version acceptable as cached data.

    Returns sqlite3 connection
    """
    require_string(db_path, "db_path")
    require_iterable_of(tables, DatabaseTable)
    require_integer(version, "version")

    # if the database file doesn't already exist and we encounter an error
    # later, delete the file before raising an exception
    delete_on_error = not exists(db_path)

    # if the database already exists, contains all the table
    # names and has the right version, then just return it
    db = Database(db_path)

    # make sure to delete the database file in case anything goes wrong
    # to avoid leaving behind an empty DB
    table_names = [table.name for table in tables]
    try:
        if db.has_tables(table_names) and \
                db.has_version() and \
                db.version() == version:
            logger.info("Found existing table in database %s", db_path)
        else:
            if len(db.table_names()) > 0:
                logger.info("Dropping tables from database %s: %s", db_path,
                            ", ".join(db.table_names()))
                db.drop_all_tables()
            logger.info("Creating database %s containing: %s", db_path,
                        ", ".join(table_names))
            db.create(tables, version)
    except:
        logger.warning("Failed to create tables %s in database %s",
                       table_names, db_path)
        db.close()
        if delete_on_error:
            remove(db_path)
        raise
    return db.connection
Example 11
 def from_interbase_coordinates(contig, start, end=None):
     '''
     Given coordinates in 0-based interbase coordinates, return a Locus
     instance.
     '''
     typechecks.require_string(contig)
     typechecks.require_integer(start)
     if end is None:
         end = start + 1
     typechecks.require_integer(end)
     contig = pyensembl.locus.normalize_chromosome(contig)
     return Locus(contig, start, end)
Example 12
 def from_inclusive_coordinates(contig, start, end=None):
     '''
     Given coordinates in 1-based coordinates that are inclusive on start
     and end, return a Locus instance. Locus instances are always 0-based
     "interbase" coordinates.
     '''
     typechecks.require_string(contig)
     typechecks.require_integer(start)
     if end is None:
         end = start
     typechecks.require_integer(end)
     contig = pyensembl.locus.normalize_chromosome(contig)
     return Locus(contig, start - 1, end)
Example 13
    def _create_indices(self, table_name, indices):
        """
        Create multiple indices (each over multiple columns) on a given table.

        Parameters
        ----------
        table_name : str

        indices : iterable of tuples
            Multiple groups of columns, each of which should be indexed.
        """
        require_string(table_name, "table_name")
        require_iterable_of(indices, (tuple, list))
        for index_column_set in indices:
            self._create_index(table_name, index_column_set)
Example 14
    def _create_indices(self, table_name, indices):
        """
        Create multiple indices (each over multiple columns) on a given table.

        Parameters
        ----------
        table_name : str

        indices : iterable of tuples
            Multiple groups of columns, each of which should be indexed.
        """
        require_string(table_name, "table_name")
        require_iterable_of(indices, (tuple, list))
        for index_column_set in indices:
            self._create_index(table_name, index_column_set)
Example 15
    def _fill_table(self, table_name, rows):
        require_string(table_name, "table_name")
        require_iterable_of(rows, tuple, "rows")

        if not self.has_table(table_name):
            raise ValueError("Table '%s' does not exist in database" %
                             (table_name, ))
        if len(rows) == 0:
            raise ValueError("Rows must be non-empty sequence")

        first_row = rows[0]
        n_columns = len(first_row)
        if not all(len(row) == n_columns for row in rows):
            raise ValueError("Rows must all have %d values" % n_columns)
        blank_slots = ", ".join("?" for _ in range(n_columns))
        logger.info("Inserting %d rows into table %s", len(rows), table_name)
        sql = "INSERT INTO %s VALUES (%s)" % (table_name, blank_slots)
        self.connection.executemany(sql, rows)
Example 16
    def _fill_table(self, table_name, rows):
        require_string(table_name, "table_name")
        require_iterable_of(rows, tuple, "rows")

        if not self.has_table(table_name):
            raise ValueError(
                "Table '%s' does not exist in database" % (table_name,))
        if len(rows) == 0:
            raise ValueError("Rows must be non-empty sequence")

        first_row = rows[0]
        n_columns = len(first_row)
        if not all(len(row) == n_columns for row in rows):
            raise ValueError("Rows must all have %d values" % n_columns)
        blank_slots = ", ".join("?" for _ in range(n_columns))
        logging.info("Inserting %d rows into table %s", len(rows), table_name)
        sql = "INSERT INTO %s VALUES (%s)" % (table_name, blank_slots)
        self.connection.executemany(sql, rows)
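
A standalone sketch of the INSERT statement the method above builds; the table name and rows are illustrative:

rows = [("TP53", "17", 7571720), ("EGFR", "7", 55086714)]
n_columns = len(rows[0])                               # 3
blank_slots = ", ".join("?" for _ in range(n_columns))
sql = "INSERT INTO %s VALUES (%s)" % ("genes", blank_slots)
print(sql)   # INSERT INTO genes VALUES (?, ?, ?)
# connection.executemany(sql, rows) then inserts all rows in one call,
# letting sqlite3 bind each "?" placeholder safely.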
Example 17
def normalize_nucleotide_string(nucleotides,
                                allow_extended_nucleotides=False,
                                empty_chars=".-",
                                treat_nan_as_empty=True):
    """
    Normalizes a nucleotide string by converting various ways of encoding empty
    strings into "", making all letters upper case, and checking to make sure
    all letters in the string are actually nucleotides.

    Parameters
    ----------
    nucleotides : str
        Sequence of nucleotides, e.g. "ACCTG"

    allow_extended_nucleotides : bool
        Allow non-canonical nucleotide characters like 'X' for unknown base

    empty_chars : str
        Characters which encode empty strings, such as "." used in VCF format
        or "-" used in MAF format

    treat_nan_as_empty : bool
        Some MAF files represent deletions/insertions with NaN ref/alt values
    """
    # check for NaN first: a float left operand would make the
    # `in empty_chars` containment test below raise TypeError
    if treat_nan_as_empty and isinstance(nucleotides,
                                         float) and np.isnan(nucleotides):
        return ""
    elif nucleotides in empty_chars:
        return ""

    require_string(nucleotides, name="nucleotide string")

    nucleotides = nucleotides.upper()

    if allow_extended_nucleotides:
        valid_nucleotides = EXTENDED_NUCLEOTIDES
    else:
        valid_nucleotides = STANDARD_NUCLEOTIDES

    if not set(nucleotides) <= valid_nucleotides:
        raise ValueError("Invalid character(s) in nucleotide string: %s" %
                         (",".join(set(nucleotides) - valid_nucleotides), ))

    return nucleotides
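
Expected behaviour of the function above, read off its branches (assumes it is in scope; float("nan") stands in for a NaN ref/alt cell loaded from a MAF file):

normalize_nucleotide_string("acct")          # -> "ACCT"
normalize_nucleotide_string(".")             # -> ""   VCF-style empty allele
normalize_nucleotide_string("-")             # -> ""   MAF-style empty allele
normalize_nucleotide_string(float("nan"))    # -> ""   NaN treated as empty
normalize_nucleotide_string("ACXT")          # raises ValueError unless
                                             # allow_extended_nucleotides=True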
Example 18
def load_maf_dataframe(path, nrows=None, verbose=False):
    """
    Load the guaranteed columns of a TCGA MAF file into a DataFrame
    """
    require_string(path, "Path to MAF")

    n_basic_columns = len(MAF_COLUMN_NAMES)

    # pylint: disable=no-member
    # pylint gets confused by read_csv
    df = pandas.read_csv(
        path,
        comment="#",
        sep="\t",
        low_memory=False,
        skip_blank_lines=True,
        header=0)

    if len(df.columns) < n_basic_columns:
        raise ValueError(
            "Too few columns in MAF file %s, expected %d but got  %d : %s" % (
                path, n_basic_columns, len(df.columns), df.columns))

    # check each pair of expected/actual column names to make sure they match
    for expected, actual in zip(MAF_COLUMN_NAMES, df.columns):
        if expected != actual:
            # MAFs in the wild have capitalization differences in their
            # column names, normalize them to always use the names above
            if expected.lower() == actual.lower():
                # using DataFrame.rename in Python 2.7.x doesn't seem to
                # work for some files, possibly because Pandas treats
                # unicode vs. str columns as different?
                df[expected] = df[actual]
                del df[actual]
            else:
                raise ValueError("Expected column %s but got %s" % (
                    expected, actual))
    return df
Example 19
def normalize_nucleotide_string(nucleotides, allow_extended_nucleotides=False):
    """
    Normalizes a nucleotide string by converting various ways of encoding empty
    strings into "", making all letters upper case, and checking to make sure
    all letters in the string are actually nucleotides.

    Parameters
    ----------
    nucleotides : str
        Sequence of nucleotides, e.g. "ACCTG"

    allow_extended_nucleotides : bool
        Allow non-canonical nucleotide characters like 'X' for unknown base
    """
    # some MAF files represent deletions/insertions with NaN ref/alt values
    if isinstance(nucleotides, float) and np.isnan(nucleotides):
        return ""

    # VCF files sometimes have '.' ref or alt for insertions and deletions, and
    # MAF files sometimes have '-' ref or alt for insertions and deletions.
    if nucleotides == "." or nucleotides == "-":
        return ""

    typechecks.require_string(nucleotides, "nucleotide string")

    nucleotides = nucleotides.upper()

    if allow_extended_nucleotides:
        valid_nucleotides = EXTENDED_NUCLEOTIDES
    else:
        valid_nucleotides = STANDARD_NUCLEOTIDES

    if not set(nucleotides) <= valid_nucleotides:
        raise ValueError(
            "Invalid character(s) in nucleotide string: %s" % (
                ",".join(set(nucleotides) - valid_nucleotides),))

    return nucleotides
Example 20
def load_maf_dataframe(path, nrows=None, verbose=False):
    """
    Load the guaranteed columns of a TCGA MAF file into a DataFrame
    """
    require_string(path, "Path to MAF")

    n_basic_columns = len(MAF_COLUMN_NAMES)

    # pylint: disable=no-member
    # pylint gets confused by read_csv
    df = pandas.read_csv(
        path,
        comment="#",
        sep="\t",
        low_memory=False,
        skip_blank_lines=True,
        header=0)

    if len(df.columns) < n_basic_columns:
        raise ValueError(
            "Too few columns in MAF file %s, expected %d but got  %d : %s" % (
                path, n_basic_columns, len(df.columns), df.columns))

    # check each pair of expected/actual column names to make sure they match
    for expected, actual in zip(MAF_COLUMN_NAMES, df.columns):
        if expected != actual:
            # MAFs in the wild have capitalization differences in their
            # column names, normalize them to always use the names above
            if expected.lower() == actual.lower():
                # using DataFrame.rename in Python 2.7.x doesn't seem to
                # work for some files, possibly because Pandas treats
                # unicode vs. str columns as different?
                df[expected] = df[actual]
                del df[actual]
            else:
                raise ValueError("Expected column %s but got %s" % (
                    expected, actual))
    return df
Example 21
def normalize_nucleotide_string(nucleotides, allow_extended_nucleotides=False):
    """
    Normalizes a nucleotide string by converting various ways of encoding empty
    strings into "", making all letters upper case, and checking to make sure
    all letters in the string are actually nucleotides.

    Parameters
    ----------
    nucleotides : str
        Sequence of nucleotides, e.g. "ACCTG"

    allow_extended_nucleotides : bool
        Allow non-canonical nucleotide characters like 'X' for unknown base
    """
    # some MAF files represent deletions/insertions with NaN ref/alt values
    if isinstance(nucleotides, float) and np.isnan(nucleotides):
        return ""

    # VCF files sometimes have '.' ref or alt for insertions and deletions, and
    # MAF files sometimes have '-' ref or alt for insertions and deletions.
    if nucleotides == "." or nucleotides == "-":
        return ""

    typechecks.require_string(nucleotides, "nucleotide string")

    nucleotides = nucleotides.upper()

    if allow_extended_nucleotides:
        valid_nucleotides = EXTENDED_NUCLEOTIDES
    else:
        valid_nucleotides = STANDARD_NUCLEOTIDES

    if not set(nucleotides) <= valid_nucleotides:
        raise ValueError("Invalid character(s) in nucleotide string: %s" %
                         ",".join(set(nucleotides) - valid_nucleotides))

    return nucleotides
Example 22
    def _create_table(self,
                      table_name,
                      column_types,
                      primary=None,
                      nullable=()):
        """Creates a sqlite3 table from the given metadata.

        Parameters
        ----------

        table_name : str
            Name of the table to create

        column_types : list of (str, str) pairs
            First element of each tuple is the column name, second element is the sqlite3 type

        primary : str, optional
            Which column is the primary key

        nullable : iterable, optional
            Names of columns which have null values
        """
        require_string(table_name, "table name")
        require_iterable_of(column_types, tuple, name="rows")
        if primary is not None:
            require_string(primary, "primary")
        require_iterable_of(nullable, str, name="nullable")

        column_decls = []
        for column_name, column_type in column_types:
            decl = "%s %s" % (column_name, column_type)
            if column_name == primary:
                decl += " UNIQUE PRIMARY KEY"
            if column_name not in nullable:
                decl += " NOT NULL"
            column_decls.append(decl)
        column_decl_str = ", ".join(column_decls)
        create_table_sql = \
            "CREATE TABLE %s (%s)" % (table_name, column_decl_str)
        self.execute_sql(create_table_sql)
Example 23
def load_vcf_fast(path,
                  genome=None,
                  reference_vcf_key="reference",
                  only_passing=True,
                  allow_extended_nucleotides=False,
                  include_info=True,
                  chunk_size=10**5,
                  max_variants=None):
    """
    Load reference name and Variant objects from the given VCF filename.

    This is an experimental faster implementation of `load_vcf`. It is
    typically about 2X faster, and with `include_info=False`, about 4X faster.
    If most of the records in the VCF have failed filters (and
    only_passing=True), this function can be orders of magnitude faster than
    `load_vcf`.

    Currently only local files are supported by this function (no http). If you
    call this on an HTTP URL, it will fall back to `load_vcf`.

    Parameters
    ----------

    path : str
        Path to VCF (*.vcf) or compressed VCF (*.vcf.gz).

    genome : {pyensembl.Genome, reference name, Ensembl version int}, optional
        Optionally pass in a PyEnsembl Genome object, name of reference, or
        PyEnsembl release version to specify the reference associated with a VCF
        (otherwise infer reference from VCF using reference_vcf_key)

    reference_vcf_key : str, optional
        Name of metadata field which contains path to reference FASTA
        file (default = 'reference')

    only_passing : boolean, optional
        If True, any entry whose FILTER field is not one of "." or "PASS" is
        dropped.

    allow_extended_nucleotides : boolean, default False
        Allow characters other than A,C,T,G in the ref and alt strings.

    include_info : boolean, default True
        Whether to parse the info column. If you don't need that column, set to
        False for faster parsing.

    chunk_size: int, optional
        Number of records to load in memory at once.

    max_variants : int, optional
        If specified, return only the first max_variants variants.
    """

    require_string(path, "Path or URL to VCF")
    parsed_path = parse_url_or_path(path)

    if parsed_path.scheme and parsed_path.scheme.lower() != "file":
        # pandas.read_table nominally supports HTTP, but it tends to crash on
        # large files and does not support gzip. Switching to the python-based
        # implementation of read_table (with engine="python") helps with some
        # issues but introduces a new set of problems (e.g. the dtype parameter
        # is not accepted). For these reasons, we're currently not attempting
        # to load VCFs over HTTP with pandas, and fall back to the pyvcf
        # implementation here.
        return load_vcf(path,
                        genome=genome,
                        reference_vcf_key=reference_vcf_key,
                        only_passing=only_passing,
                        allow_extended_nucleotides=allow_extended_nucleotides,
                        max_variants=max_variants)

    # Loading a local file.
    # The file will be opened twice: first to parse the header with pyvcf, then
    # by pandas to read the data.

    # PyVCF reads the metadata immediately and stops at the first line with
    # data. We can close the file after that.
    handle = PyVCFReaderFromPathOrURL(path)
    handle.close()
    genome = infer_genome_from_vcf(genome, handle.vcf_reader,
                                   reference_vcf_key)

    df_iterator = read_vcf_into_dataframe(path,
                                          include_info=include_info,
                                          chunk_size=chunk_size)

    return dataframes_to_variant_collection(
        df_iterator,
        info_parser=handle.vcf_reader._parse_info if include_info else None,
        only_passing=only_passing,
        max_variants=max_variants,
        variant_kwargs={
            'ensembl': genome,
            'allow_extended_nucleotides': allow_extended_nucleotides
        },
        variant_collection_kwargs={"path": path})
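
Usage sketch for load_vcf_fast; the path is illustrative and assumes a local (optionally gzipped) VCF file:

variants = load_vcf_fast(
    "/path/to/somatic_calls.vcf.gz",
    genome="GRCh37",        # skip inferring the reference from the VCF header
    only_passing=True,      # drop records whose FILTER is not "." or "PASS"
    include_info=False)     # roughly 4X faster when INFO fields are not needed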
Example 24
def load_vcf(
        path,
        genome=None,
        reference_vcf_key="reference",
        only_passing=True,
        allow_extended_nucleotides=False,
        include_info=True,
        chunk_size=10 ** 5,
        max_variants=None,
        sort_key=variant_ascending_position_sort_key,
        distinct=True):
    """
    Load reference name and Variant objects from the given VCF filename.

    Local files are read directly; remote URLs are first downloaded to a
    temporary local file and then loaded from there.

    Parameters
    ----------

    path : str
        Path to VCF (*.vcf) or compressed VCF (*.vcf.gz).

    genome : {pyensembl.Genome, reference name, Ensembl version int}, optional
        Optionally pass in a PyEnsembl Genome object, name of reference, or
        PyEnsembl release version to specify the reference associated with a
        VCF (otherwise infer reference from VCF using reference_vcf_key)

    reference_vcf_key : str, optional
        Name of metadata field which contains path to reference FASTA
        file (default = 'reference')

    only_passing : boolean, optional
        If True, any entry whose FILTER field is not one of "." or "PASS" is
        dropped.

    allow_extended_nucleotides : boolean, default False
        Allow characters other than A,C,T,G in the ref and alt strings.

    include_info : boolean, default True
        Whether to parse the INFO and per-sample columns. If you don't need
        these, set to False for faster parsing.

    chunk_size: int, optional
        Number of records to load in memory at once.

    max_variants : int, optional
        If specified, return only the first max_variants variants.

    sort_key : fn
        Function which maps each element to a sorting criterion.
        Set to None to leave the variants unsorted.

    distinct : boolean, default True
        Don't keep repeated variants
    """

    require_string(path, "Path or URL to VCF")
    parsed_path = parse_url_or_path(path)

    if parsed_path.scheme and parsed_path.scheme.lower() != "file":
        # pandas.read_table nominally supports HTTP, but it tends to crash on
        # large files and does not support gzip. Switching to the python-based
        # implementation of read_table (with engine="python") helps with some
        # issues but introduces a new set of problems (e.g. the dtype parameter
        # is not accepted). For these reasons, we're currently not attempting
        # to load VCFs over HTTP with pandas directly, and instead download it
        # to a temporary file and open that.

        (filename, headers) = urllib.request.urlretrieve(path)
        try:
            # The downloaded file has no file extension, which confuses pyvcf
            # for gziped files in Python 3. We rename it to have the correct
            # file extension.
            new_filename = "%s.%s" % (
                filename, parsed_path.path.split(".")[-1])
            os.rename(filename, new_filename)
            filename = new_filename
            return load_vcf(
                filename,
                genome=genome,
                reference_vcf_key=reference_vcf_key,
                only_passing=only_passing,
                allow_extended_nucleotides=allow_extended_nucleotides,
                include_info=include_info,
                chunk_size=chunk_size,
                max_variants=max_variants,
                sort_key=sort_key,
                distinct=distinct)
        finally:
            logger.info("Removing temporary file: %s", filename)
            os.unlink(filename)

    # Loading a local file.
    # The file will be opened twice: first to parse the header with pyvcf, then
    # by pandas to read the data.

    # PyVCF reads the metadata immediately and stops at the first line with
    # data. We can close the file after that.
    handle = PyVCFReaderFromPathOrURL(path)
    handle.close()
    genome = infer_genome_from_vcf(
        genome,
        handle.vcf_reader,
        reference_vcf_key)

    df_iterator = read_vcf_into_dataframe(
        path,
        include_info=include_info,
        sample_names=handle.vcf_reader.samples if include_info else None,
        chunk_size=chunk_size)

    if include_info:
        def sample_info_parser(unparsed_sample_info_strings, format_string):
            """
            Given a format string like "GT:AD:ADP:DP:FS"
            and a list of sample info strings where each entry is like
            "0/1:3,22:T=3,G=22:25:33", return a dict that maps:
            sample name -> field name -> value. Uses pyvcf to parse the fields.
            """
            return pyvcf_calls_to_sample_info_list(
                handle.vcf_reader._parse_samples(
                    unparsed_sample_info_strings, format_string, None))
    else:
        sample_info_parser = None

    return dataframes_to_variant_collection(
        df_iterator,
        source_path=path,
        info_parser=handle.vcf_reader._parse_info if include_info else None,
        only_passing=only_passing,
        max_variants=max_variants,
        sample_names=handle.vcf_reader.samples if include_info else None,
        sample_info_parser=sample_info_parser,
        variant_kwargs={
            'ensembl': genome,
            'allow_extended_nucleotides': allow_extended_nucleotides},
        variant_collection_kwargs={
            'sort_key': sort_key,
            'distinct': distinct})
Example 25
def load_maf_dataframe(path, nrows=None, raise_on_error=True, encoding=None):
    """
    Load the guaranteed columns of a TCGA MAF file into a DataFrame

    Parameters
    ----------
    path : str
        Path to MAF file

    nrows : int
        Optional limit to number of rows loaded

    raise_on_error : bool
        Raise an exception upon encountering an error or log an error

    encoding : str, optional
        Encoding to use for UTF when reading MAF file.
    """
    require_string(path, "Path to MAF")

    n_basic_columns = len(MAF_COLUMN_NAMES)

    # pylint: disable=no-member
    # pylint gets confused by read_csv
    df = pandas.read_csv(
        path,
        comment="#",
        sep="\t",
        low_memory=False,
        skip_blank_lines=True,
        header=0,
        nrows=nrows,
        encoding=encoding)

    if len(df.columns) < n_basic_columns:
        error_message = (
            "Too few columns in MAF file %s, expected %d but got  %d : %s" % (
                path, n_basic_columns, len(df.columns), df.columns))
        if raise_on_error:
            raise ValueError(error_message)
        else:
            logging.warn(error_message)

    # check each pair of expected/actual column names to make sure they match
    for expected, actual in zip(MAF_COLUMN_NAMES, df.columns):
        if expected != actual:
            # MAFs in the wild have capitalization differences in their
            # column names, normalize them to always use the names above
            if expected.lower() == actual.lower():
                # using DataFrame.rename in Python 2.7.x doesn't seem to
                # work for some files, possibly because Pandas treats
                # unicode vs. str columns as different?
                df[expected] = df[actual]
                del df[actual]
            else:
                error_message = (
                    "Expected column %s but got %s" % (expected, actual))
                if raise_on_error:
                    raise ValueError(error_message)
                else:
                    logging.warn(error_message)

    return df
Example 26
def test_require_string():
    require_string("", nonempty=False)
    with assert_raises(TypeError):
        require_string(0, nonempty=False)
    with assert_raises(TypeError):
        require_string(0, nonempty=True)
    with assert_raises(ValueError):
        require_string("", nonempty=True)
    require_string("1", nonempty=False)
    require_string("1", nonempty=True)
Example 27
def load_maf_dataframe(path, nrows=None, raise_on_error=True, encoding=None):
    """
    Load the guaranteed columns of a TCGA MAF file into a DataFrame

    Parameters
    ----------
    path : str
        Path to MAF file

    nrows : int
        Optional limit to number of rows loaded

    raise_on_error : bool
        Raise an exception upon encountering an error or log an error

    encoding : str, optional
        Encoding to use for UTF when reading MAF file.
    """
    require_string(path, "Path to MAF")

    n_basic_columns = len(MAF_COLUMN_NAMES)

    # pylint: disable=no-member
    # pylint gets confused by read_csv
    df = pandas.read_csv(
        path,
        comment="#",
        sep="\t",
        low_memory=False,
        skip_blank_lines=True,
        header=0,
        nrows=nrows,
        encoding=encoding)

    if len(df.columns) < n_basic_columns:
        error_message = (
            "Too few columns in MAF file %s, expected %d but got  %d : %s" % (
                path, n_basic_columns, len(df.columns), df.columns))
        if raise_on_error:
            raise ValueError(error_message)
        else:
            logging.warn(error_message)

    # check each pair of expected/actual column names to make sure they match
    for expected, actual in zip(MAF_COLUMN_NAMES, df.columns):
        if expected != actual:
            # MAFs in the wild have capitalization differences in their
            # column names, normalize them to always use the names above
            if expected.lower() == actual.lower():
                # using DataFrame.rename in Python 2.7.x doesn't seem to
                # work for some files, possibly because Pandas treats
                # unicode vs. str columns as different?
                df[expected] = df[actual]
                del df[actual]
            else:
                error_message = (
                    "Expected column %s but got %s" % (expected, actual))
                if raise_on_error:
                    raise ValueError(error_message)
                else:
                    logging.warn(error_message)

    return df
Example 28
File: vcf.py Project: gmazz/varcode
def load_vcf(path,
             genome=None,
             reference_vcf_key="reference",
             only_passing=True,
             allow_extended_nucleotides=False,
             include_info=True,
             chunk_size=10**5,
             max_variants=None,
             sort_key=variant_ascending_position_sort_key,
             distinct=True):
    """
    Load reference name and Variant objects from the given VCF filename.

    Local files are read directly; remote URLs are first downloaded to a
    temporary local file and then loaded from there.

    Parameters
    ----------

    path : str
        Path to VCF (*.vcf) or compressed VCF (*.vcf.gz).

    genome : {pyensembl.Genome, reference name, Ensembl version int}, optional
        Optionally pass in a PyEnsembl Genome object, name of reference, or
        PyEnsembl release version to specify the reference associated with a
        VCF (otherwise infer reference from VCF using reference_vcf_key)

    reference_vcf_key : str, optional
        Name of metadata field which contains path to reference FASTA
        file (default = 'reference')

    only_passing : boolean, optional
        If True, any entry whose FILTER field is not one of "." or "PASS" is
        dropped.

    allow_extended_nucleotides : boolean, default False
        Allow characters other than A,C,T,G in the ref and alt strings.

    include_info : boolean, default True
        Whether to parse the INFO and per-sample columns. If you don't need
        these, set to False for faster parsing.

    chunk_size: int, optional
        Number of records to load in memory at once.

    max_variants : int, optional
        If specified, return only the first max_variants variants.

    sort_key : fn
        Function which maps each element to a sorting criterion.
        Set to None to leave the variants unsorted.

    distinct : boolean, default True
        Don't keep repeated variants
    """

    require_string(path, "Path or URL to VCF")
    parsed_path = parse_url_or_path(path)

    if parsed_path.scheme and parsed_path.scheme.lower() != "file":
        # pandas.read_table nominally supports HTTP, but it tends to crash on
        # large files and does not support gzip. Switching to the python-based
        # implementation of read_table (with engine="python") helps with some
        # issues but introduces a new set of problems (e.g. the dtype parameter
        # is not accepted). For these reasons, we're currently not attempting
        # to load VCFs over HTTP with pandas directly, and instead download it
        # to a temporary file and open that.

        (filename, headers) = urllib.request.urlretrieve(path)
        try:
            # The downloaded file has no file extension, which confuses pyvcf
            # for gziped files in Python 3. We rename it to have the correct
            # file extension.
            new_filename = "%s.%s" % (filename,
                                      parsed_path.path.split(".")[-1])
            os.rename(filename, new_filename)
            filename = new_filename
            return load_vcf(
                filename,
                genome=genome,
                reference_vcf_key=reference_vcf_key,
                only_passing=only_passing,
                allow_extended_nucleotides=allow_extended_nucleotides,
                include_info=include_info,
                chunk_size=chunk_size,
                max_variants=max_variants,
                sort_key=sort_key,
                distinct=distinct)
        finally:
            logger.info("Removing temporary file: %s", filename)
            os.unlink(filename)

    # Loading a local file.
    # The file will be opened twice: first to parse the header with pyvcf, then
    # by pandas to read the data.

    # PyVCF reads the metadata immediately and stops at the first line with
    # data. We can close the file after that.
    handle = PyVCFReaderFromPathOrURL(path)
    handle.close()
    genome = infer_genome_from_vcf(genome, handle.vcf_reader,
                                   reference_vcf_key)

    df_iterator = read_vcf_into_dataframe(
        path,
        include_info=include_info,
        sample_names=handle.vcf_reader.samples if include_info else None,
        chunk_size=chunk_size)

    if include_info:

        def sample_info_parser(unparsed_sample_info_strings, format_string):
            """
            Given a format string like "GT:AD:ADP:DP:FS"
            and a list of sample info strings where each entry is like
            "0/1:3,22:T=3,G=22:25:33", return a dict that maps:
            sample name -> field name -> value. Uses pyvcf to parse the fields.
            """
            return pyvcf_calls_to_sample_info_list(
                handle.vcf_reader._parse_samples(unparsed_sample_info_strings,
                                                 format_string, None))
    else:
        sample_info_parser = None

    return dataframes_to_variant_collection(
        df_iterator,
        source_path=path,
        info_parser=handle.vcf_reader._parse_info if include_info else None,
        only_passing=only_passing,
        max_variants=max_variants,
        sample_names=handle.vcf_reader.samples if include_info else None,
        sample_info_parser=sample_info_parser,
        variant_kwargs={
            'ensembl': genome,
            'allow_extended_nucleotides': allow_extended_nucleotides
        },
        variant_collection_kwargs={
            'sort_key': sort_key,
            'distinct': distinct
        })
Example 29
    def dataframe(
            self,
            contig=None,
            feature=None,
            strand=None,
            save_to_disk=False):
        """
        Load genome entries as a DataFrame, optionally restricted to
        particular contig or feature type.
        """
        if contig:
            contig = normalize_chromosome(contig)

        if strand:
            strand = normalize_strand(strand)

        if feature is not None:
            require_string(feature, "feature")

        key = (contig, feature, strand)

        if key not in self._dataframes:
            def _construct_df():
                full_df = self._load_full_dataframe_cached()

                assert len(full_df) > 0, \
                    "Dataframe representation of genomic database empty!"

                # rename since we're going to be filtering the entries but
                # may still want to access the full dataset
                df = full_df
                if contig:
                    df = df[df["seqname"] == contig]
                    if len(df) == 0:
                        raise ValueError("Contig not found: %s" % (contig,))

                if feature:
                    df = df[df["feature"] == feature]
                    if len(df) == 0:
                        # check to make sure feature was somewhere in
                        # the full dataset before returning an empty dataframe
                        features = full_df["feature"].unique()
                        if feature not in features:
                            raise ValueError(
                                "Feature not found: %s" % (feature,))
                if strand:
                    df = df[df["strand"] == strand]

                return df
            if save_to_disk:
                csv_path = self.data_subset_path(
                    contig=contig,
                    feature=feature,
                    strand=strand,
                    distinct=False)
                df = self.memory_cache.cached_dataframe(
                    csv_path=csv_path,
                    compute_fn=_construct_df)
            else:
                df = _construct_df()
            self._dataframes[key] = df

        return self._dataframes[key]
Example 30
    def __init__(
        self,
        program_name,
        alleles,
        parse_output_fn,
        supported_alleles_flag,
        input_file_flag,
        length_flag,
        allele_flag,
        peptide_mode_flags=["-p"],
        tempdir_flag=None,
        extra_flags=[],
        max_peptides_per_file=10**4,
        process_limit=-1,
        default_peptide_lengths=[9],
        group_peptides_by_length=False,
        min_peptide_length=8,
        max_peptide_length=None,
    ):
        """
        Parameters
        ----------
        program_name : str
            Name of prediction program to run
            (e.g. "netMHCcons" or "netMHCIIpan")

        alleles : list of str
            MHC alleles

        supported_alleles_flag : str
            Flag to pass to the predictor to get a list of supported alleles
            (e.g. "-A", "-list", "-listMHC")

        parse_output_fn : fn
            Takes the stdout string from the predictor and returns a collection
            of BindingPrediction objects

        input_file_flag : str
            How to specify the input FASTA file of source sequences (e.g. "-f")

        length_flag : str
            How to specify the desired predicted peptide length (e.g. "-length")

        allele_flag : str
            How to specify the allele we want predictions for (e.g. "-a")

        peptide_mode_flags : list of str
            How to switch from the default FASTA subsequences input mode to
            where peptides are explicitly given one per line of a text file.

        tempdir_flag : str, optional
            How to specify the predictor's temporary directory (e.g. "-tdir")

        extra_flags : list of str
            Extra flags to pass to the predictor

        max_peptides_per_file : int, optional
            Maximum number of lines per file when predicting peptides directly.

        process_limit : int, optional
            Maximum number of parallel processes to start
            (0 for no limit, -1 for use all available processors)

        default_peptide_lengths : list of int, optional
            When making predictions across subsequences of protein sequences,
            what peptide lengths to predict for.

        group_peptides_by_length : bool
            Run commandline predictor on groups of peptides of equal length

        min_peptide_length : int
            Shortest peptide this predictor can handle

        max_peptide_length : int
            Longest peptide this predictor can handle
        """
        require_string(program_name, "Predictor program name")
        self.program_name = program_name

        if supported_alleles_flag is not None:
            require_string(supported_alleles_flag, "Supported alleles flag")
        self.supported_alleles_flag = supported_alleles_flag

        require_string(input_file_flag, "Input file flag")
        self.input_file_flag = input_file_flag

        require_string(length_flag, "Peptide length flag")
        self.length_flag = length_flag

        require_string(allele_flag, "Allele flag")
        self.allele_flag = allele_flag

        require_iterable_of(peptide_mode_flags, string_types)
        self.peptide_mode_flags = peptide_mode_flags

        if tempdir_flag is not None:
            require_string(tempdir_flag, "Temporary directory flag")
        self.tempdir_flag = tempdir_flag

        require_iterable_of(extra_flags, string_types)
        self.extra_flags = extra_flags

        require_integer(max_peptides_per_file,
                        "Maximum number of lines in a peptides input file")
        self.max_peptides_per_file = max_peptides_per_file

        require_integer(process_limit, "Maximum number of processes")
        self.process_limit = process_limit

        self.parse_output_fn = parse_output_fn

        if isinstance(default_peptide_lengths, int):
            default_peptide_lengths = [default_peptide_lengths]

        self.group_peptides_by_length = group_peptides_by_length

        if self.supported_alleles_flag:
            valid_alleles = self._determine_supported_alleles(
                self.program_name, self.supported_alleles_flag)
        else:
            # if we're not running the tool to determine supported alleles
            # then at least try running it by itself to determine
            # if it's present
            try:
                run_command([self.program_name])
            except:
                raise SystemError("Failed to run %s" % self.program_name)
            valid_alleles = None

        try:
            BasePredictor.__init__(
                self,
                alleles=alleles,
                valid_alleles=valid_alleles,
                default_peptide_lengths=default_peptide_lengths,
                min_peptide_length=min_peptide_length,
                max_peptide_length=max_peptide_length)
        except UnsupportedAllele as e:
            if self.supported_alleles_flag:
                additional_message = (
                    "\nRun command %s %s to see a list of valid alleles" %
                    (self.program_name, self.supported_alleles_flag))
            else:
                additional_message = ""
            raise UnsupportedAllele(str(e) + additional_message)
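
A construction sketch using the flag examples from the docstring above; the class name is hypothetical, the output parser is a placeholder, and constructing the object would actually invoke the external netMHCcons program, so this only runs if that binary is installed:

def parse_netmhc_stdout(stdout):
    # placeholder: would turn predictor output into BindingPrediction objects
    return []

predictor = CommandlinePredictor(        # hypothetical class name
    program_name="netMHCcons",
    alleles=["HLA-A*02:01"],
    parse_output_fn=parse_netmhc_stdout,
    supported_alleles_flag="-listMHC",
    input_file_flag="-f",
    length_flag="-length",
    allele_flag="-a",
    tempdir_flag="-tdir",
    default_peptide_lengths=[9])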
Example 31
def load_vcf(path,
             genome=None,
             reference_vcf_key="reference",
             only_passing=True,
             allow_extended_nucleotides=False,
             include_info=True,
             chunk_size=10**5,
             max_variants=None,
             sort_key=variant_ascending_position_sort_key,
             distinct=True,
             normalize_contig_names=True,
             convert_ucsc_contig_names=None):
    """
    Load reference name and Variant objects from the given VCF filename.

    Local files are read directly; remote URLs are first downloaded to a
    temporary local file and then loaded from there.

    Parameters
    ----------

    path : str
        Path to VCF (*.vcf) or compressed VCF (*.vcf.gz).

    genome : {pyensembl.Genome, reference name, Ensembl version int}, optional
        Optionally pass in a PyEnsembl Genome object, name of reference, or
        PyEnsembl release version to specify the reference associated with a
        VCF (otherwise infer reference from VCF using reference_vcf_key)

    reference_vcf_key : str, optional
        Name of metadata field which contains path to reference FASTA
        file (default = 'reference')

    only_passing : bool, optional
        If True, any entry whose FILTER field is not one of "." or "PASS" is
        dropped.

    allow_extended_nucleotides : bool, default False
        Allow characters other than A,C,T,G in the ref and alt strings.

    include_info : bool, default True
        Whether to parse the INFO and per-sample columns. If you don't need
        these, set to False for faster parsing.

    chunk_size: int, optional
        Number of records to load in memory at once.

    max_variants : int, optional
        If specified, return only the first max_variants variants.

    sort_key : fn
        Function which maps each element to a sorting criterion.
        Set to None to leave the variants unsorted.

    distinct : bool, default True
        Don't keep repeated variants

    normalize_contig_names : bool, default True
        By default contig names will be normalized by converting integers
        to strings (e.g. 1 -> "1"), and converting any letters after "chr"
        to uppercase (e.g. "chrx" -> "chrX"). If you don't want
        this behavior then pass normalize_contig_names=False.

    convert_ucsc_contig_names : bool
        Convert chromosome names from hg19 (e.g. "chr1") to equivalent names
        for GRCh37 (e.g. "1"). By default this is set to True if the genome
        of the VCF is a UCSC reference and otherwise set to False.
    """

    require_string(path, "Path or URL to VCF")
    parsed_path = parse_url_or_path(path)

    if parsed_path.scheme and parsed_path.scheme.lower() != "file":
        # pandas.read_table nominally supports HTTP, but it tends to crash on
        # large files and does not support gzip. Switching to the python-based
        # implementation of read_table (with engine="python") helps with some
        # issues but introduces a new set of problems (e.g. the dtype parameter
        # is not accepted). For these reasons, we're currently not attempting
        # to load VCFs over HTTP with pandas directly, and instead download it
        # to a temporary file and open that.

        (filename, headers) = urllib.request.urlretrieve(path)
        try:
            # The downloaded file has no file extension, which confuses pyvcf
            # for gziped files in Python 3. We rename it to have the correct
            # file extension.
            new_filename = "%s.%s" % (filename,
                                      parsed_path.path.split(".")[-1])
            os.rename(filename, new_filename)
            filename = new_filename
            return load_vcf(
                filename,
                genome=genome,
                reference_vcf_key=reference_vcf_key,
                only_passing=only_passing,
                allow_extended_nucleotides=allow_extended_nucleotides,
                include_info=include_info,
                chunk_size=chunk_size,
                max_variants=max_variants,
                sort_key=sort_key,
                distinct=distinct,
                normalize_contig_names=normalize_contig_names,
                convert_ucsc_contig_names=convert_ucsc_contig_names)
        finally:
            logger.info("Removing temporary file: %s", filename)
            os.unlink(filename)

    # Loading a local file.
    # The file will be opened twice: first to parse the header with pyvcf, then
    # by pandas to read the data.

    # PyVCF reads the metadata immediately and stops at the first line with
    # data. We can close the file after that.
    handle = PyVCFReaderFromPathOrURL(path)
    handle.close()

    ####
    # The following code looks a bit crazy because it's motivated by the
    # desire to preserve UCSC reference names even though the Variant
    # objects we're creating will convert them to EnsemblRelease genomes
    # with different reference names.
    #
    # For example, if a VCF is aligned against 'hg19' then we want to create a
    # variant which has 'hg19' as its genome argument, so that serialization
    # back to VCF will put the correct reference genome in the generated
    # header.
    if genome is None:
        vcf_reader = handle.vcf_reader
        if reference_vcf_key not in vcf_reader.metadata:
            raise ValueError("Unable to infer reference genome for %s" %
                             (vcf_reader.filename, ))
        genome = vcf_reader.metadata[reference_vcf_key]

    genome, genome_was_ucsc = infer_genome(genome)
    if genome_was_ucsc:
        genome = ensembl_to_ucsc_reference_names[genome.reference_name]

    if convert_ucsc_contig_names is None:
        convert_ucsc_contig_names = genome_was_ucsc

    df_iterator = read_vcf_into_dataframe(
        path,
        include_info=include_info,
        sample_names=handle.vcf_reader.samples if include_info else None,
        chunk_size=chunk_size)

    if include_info:

        def sample_info_parser(unparsed_sample_info_strings, format_string):
            """
            Given a format string like "GT:AD:ADP:DP:FS"
            and a list of sample info strings where each entry is like
            "0/1:3,22:T=3,G=22:25:33", return a dict that maps:
            sample name -> field name -> value. Uses pyvcf to parse the fields.
            """
            return pyvcf_calls_to_sample_info_list(
                handle.vcf_reader._parse_samples(unparsed_sample_info_strings,
                                                 format_string, None))
    else:
        sample_info_parser = None

    variant_kwargs = {
        'genome': genome,
        'allow_extended_nucleotides': allow_extended_nucleotides,
        'normalize_contig_names': normalize_contig_names,
        'convert_ucsc_contig_names': convert_ucsc_contig_names,
    }

    variant_collection_kwargs = {'sort_key': sort_key, 'distinct': distinct}

    # TODO: drop chrMT variants from hg19 and warn user about it

    return dataframes_to_variant_collection(
        df_iterator,
        source_path=path,
        info_parser=handle.vcf_reader._parse_info if include_info else None,
        only_passing=only_passing,
        max_variants=max_variants,
        sample_names=handle.vcf_reader.samples if include_info else None,
        sample_info_parser=sample_info_parser,
        variant_kwargs=variant_kwargs,
        variant_collection_kwargs=variant_collection_kwargs)
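
A minimal usage sketch of load_vcf (the file path below is hypothetical; per the docstring, genome may be a reference name string, a pyensembl.Genome object, or an Ensembl release number):

# hypothetical local VCF; genome passed explicitly instead of being inferred
variants = load_vcf(
    "/path/to/somatic.vcf.gz",
    genome="GRCh37",
    only_passing=True,
    include_info=False)  # skip INFO/per-sample parsing for faster loading

# attribute names assumed from the varcode Variant API
for variant in variants:
    print(variant.contig, variant.start, variant.ref, variant.alt)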
Example no. 32
    def column_values_at_locus(self,
                               column_name,
                               feature,
                               contig,
                               position,
                               end=None,
                               strand=None,
                               distinct=False,
                               sorted=False):
        """
        Get the non-null values of a column from the database
        at a particular range of loci
        """

        # TODO: combine with the query method, since they overlap
        # significantly
        require_string(column_name, "column_name", nonempty=True)

        contig = normalize_chromosome(contig)

        require_integer(position, "position")

        if end is None:
            end = position

        require_integer(end, "end")

        if not self.column_exists(feature, column_name):
            raise ValueError("Table %s doesn't have column %s" % (
                feature,
                column_name,
            ))

        if distinct:
            distinct_string = "DISTINCT "
        else:
            distinct_string = ""

        query = """
            SELECT %s%s
            FROM %s
            WHERE seqname = ?
            AND start <= ?
            AND end >= ?

        """ % (distinct_string, column_name, feature)

        query_params = [contig, end, position]

        if strand:
            query += " AND strand = ?"
            query_params.append(strand)

        tuples = self.connection.execute(query, query_params).fetchall()

        # each result is a tuple, so pull out its first element
        results = [t[0] for t in tuples if t[0] is not None]

        if sorted:
            results.sort()
        return results
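    # A usage sketch (not taken from the excerpt above): `db` stands for an
    # instance of the class defining this method, opened over a GTF-derived
    # sqlite3 database with the usual seqname/start/end/strand columns.
    #
    #   gene_names = db.column_values_at_locus(
    #       "gene_name",
    #       feature="gene",
    #       contig="chr1",
    #       position=1000000,
    #       end=1500000,
    #       distinct=True,
    #       sorted=True)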
    def __init__(
            self,
            program_name,
            alleles,
            epitope_lengths,
            parse_output_fn,
            supported_alleles_flag,
            input_fasta_flag,
            length_flag,
            allele_flag,
            tempdir_flag=None,
            extra_flags=[],
            max_file_records=None,
            process_limit=0):
        """
        Parameters
        ----------
        program_name : str
            Name of prediction program to run
            (e.g. "netMHCcons" or "netMHCIIpan")

        alleles : list of str
            MHC alleles

        epitope_lengths : list of int
            Lengths of epitopes for which to generate predictions

        supported_alleles_flag : str
            Flag to pass to the predictor to get a list of supported alleles
            (e.g. "-A", "-list", "-listMHC")

        parse_output_fn : fn
            Takes the stdout string from the predictor and returns a collection
            of BindingPrediction objects

        input_fasta_flag : str
            How to specify the input FASTA file of source sequences (e.g. "-f")

        length_flag : str
            How to specify the desired predicted epitope length (e.g. "-length")

        allele_flag : str
            How to specify the allele we want predictions for (e.g. "-a")

        tempdir_flag : str, optional
            How to specify the predictor's temporary directory (e.g. "-tdir")

        extra_flags : list of str
            Extra flags to pass to the predictor

        max_file_records : int, optional
            Maximum number of sequences per input FASTA file

        process_limit : int, optional
            Maximum number of parallel processes to start
        """
        require_string(program_name, "Predictor program name")
        self.program_name = program_name

        if supported_alleles_flag is not None:
            require_string(supported_alleles_flag, "Supported alleles flag")
        self.supported_alleles_flag = supported_alleles_flag

        require_string(input_fasta_flag, "Input FASTA file flag")
        self.input_fasta_flag = input_fasta_flag

        require_string(allele_flag, "Allele flag")
        self.allele_flag = allele_flag

        require_string(length_flag, "Peptide length flag")
        self.length_flag = length_flag

        if tempdir_flag is not None:
            require_string(tempdir_flag, "Temporary directory flag")
        self.tempdir_flag = tempdir_flag

        self.extra_flags = extra_flags

        if max_file_records is not None:
            require_integer(
                max_file_records,
                "Maximum number of sequences per input file")
        self.max_file_records = max_file_records

        require_integer(process_limit, "Maximum number of processes")
        self.process_limit = process_limit

        self.parse_output_fn = parse_output_fn

        if self.supported_alleles_flag:
            valid_alleles = self._determine_supported_alleles(
                self.program_name,
                self.supported_alleles_flag)
        else:
            # if we're not running the tool to determine supported alleles
            # then at least try running it by itself to determine whether
            # it's present
            try:
                run_command([self.program_name])
            except Exception:
                raise SystemError("Failed to run %s" % self.program_name)
            valid_alleles = None

        try:
            BasePredictor.__init__(
                self,
                alleles,
                epitope_lengths,
                valid_alleles=valid_alleles)
        except UnsupportedAllele as e:
            if self.supported_alleles_flag:
                additional_message = (
                    "\nRun command %s %s to see a list of valid alleles" % (
                        self.program_name,
                        self.supported_alleles_flag))
            else:
                additional_message = ""
            raise UnsupportedAllele(str(e) + additional_message)
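
A sketch of how this constructor might be invoked (the class name BaseCommandlinePredictor, the parser parse_netmhc_stdout, and the netMHC-style flag values are assumptions mirroring the docstring examples, not shown in the excerpt; running it also assumes the external netMHCcons binary is installed):

# hypothetical wrapper construction around a netMHC-style command-line tool
predictor = BaseCommandlinePredictor(
    program_name="netMHCcons",
    alleles=["HLA-A*02:01"],
    epitope_lengths=[9],
    parse_output_fn=parse_netmhc_stdout,  # assumed to return BindingPrediction objects
    supported_alleles_flag="-listMHC",
    input_fasta_flag="-f",
    length_flag="-length",
    allele_flag="-a",
    tempdir_flag="-tdir",
    process_limit=4)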
Example no. 34
def _create_cached_db(
        db_path,
        tables,
        version=1):
    """
    Either create or retrieve sqlite database.

    Parameters
    ----------
    db_path : str
        Path to sqlite3 database file

    tables : iterable of datacache.DatabaseTable
        DatabaseTable objects describing the tables expected in the database

    version : int, optional
        Version acceptable as cached data.

    Returns sqlite3 connection
    """
    require_string(db_path, "db_path")
    require_iterable_of(tables, DatabaseTable)
    require_integer(version, "version")

    # if the database file doesn't already exist and we encounter an error
    # later, delete the file before raising an exception
    delete_on_error = not exists(db_path)

    # if the database already exists, contains all the table
    # names and has the right version, then just return it
    db = Database(db_path)

    # make sure to delete the database file in case anything goes wrong
    # to avoid leaving behind an empty DB
    table_names = [table.name for table in tables]
    try:
        if db.has_tables(table_names) and \
                db.has_version() and \
                db.version() == version:
            logger.info("Found existing table in database %s", db_path)
        else:
            if len(db.table_names()) > 0:
                logger.info("Dropping tables from database %s: %s",
                    db_path,
                    ", ".join(db.table_names()))
                db.drop_all_tables()
            logger.info(
                "Creating database %s containing: %s",
                db_path,
                ", ".join(table_names))
            db.create(tables, version)
    except:
        logger.warning(
            "Failed to create tables %s in database %s",
            table_names,
            db_path)
        db.close()
        if delete_on_error:
            remove(db_path)
        raise
    return db.connection
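
A minimal sketch of calling _create_cached_db (here gene_table stands for a datacache.DatabaseTable built elsewhere; its construction is not shown in this excerpt):

# `gene_table` is assumed to describe a "genes" table
connection = _create_cached_db(
    db_path="/tmp/annotations.db",
    tables=[gene_table],
    version=2)

# the returned object is a plain sqlite3 connection, so normal queries work
cursor = connection.execute("SELECT COUNT(*) FROM genes")
print(cursor.fetchone()[0])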
Example no. 35
def load_vcf_fast(
        path,
        genome=None,
        reference_vcf_key="reference",
        only_passing=True,
        allow_extended_nucleotides=False,
        include_info=True,
        chunk_size=10**5,
        max_variants=None):
    """
    Load reference name and Variant objects from the given VCF filename.

    This is an experimental faster implementation of `load_vcf`. It is
    typically about 2X faster, and with `include_info=False`, about 4X faster.
    If most of the records in the VCF have failed filters (and
    only_passing=True), this function can be orders of magnitude faster than
    `load_vcf`.

    Currently only local files are supported by this function (no http). If you
    call this on an HTTP URL, it will fall back to `load_vcf`.

    Parameters
    ----------

    path : str
        Path to VCF (*.vcf) or compressed VCF (*.vcf.gz).

    genome : {pyensembl.Genome, reference name, Ensembl version int}, optional
        Optionally pass in a PyEnsembl Genome object, name of reference, or
        PyEnsembl release version to specify the reference associated with a VCF
        (otherwise infer reference from VCF using reference_vcf_key)

    reference_vcf_key : str, optional
        Name of metadata field which contains path to reference FASTA
        file (default = 'reference')

    only_passing : boolean, optional
        If True, any entry whose FILTER field is not one of "." or "PASS" is
        dropped.

    allow_extended_nucleotides : boolean, default False
        Allow characters other than A, C, T, G in the ref and alt strings.

    include_info : boolean, default True
        Whether to parse the info column. If you don't need that column, set to
        False for faster parsing.

    chunk_size : int, optional
        Number of records to load in memory at once.

    max_variants : int, optional
        If specified, return only the first max_variants variants.
    """

    require_string(path, "Path or URL to VCF")
    parsed_path = parse_url_or_path(path)

    if parsed_path.scheme and parsed_path.scheme.lower() != "file":
        # pandas.read_table nominally supports HTTP, but it tends to crash on
        # large files and does not support gzip. Switching to the python-based
        # implementation of read_table (with engine="python") helps with some
        # issues but introduces a new set of problems (e.g. the dtype parameter
        # is not accepted). For these reasons, we're currently not attempting
        # to load VCFs over HTTP with pandas, and instead fall back to the pyvcf
        # implementation here.
        return load_vcf(
            path,
            genome=genome,
            reference_vcf_key=reference_vcf_key,
            only_passing=only_passing,
            allow_extended_nucleotides=allow_extended_nucleotides,
            max_variants=max_variants)

    # Loading a local file.
    # The file will be opened twice: first to parse the header with pyvcf, then
    # by pandas to read the data.

    # PyVCF reads the metadata immediately and stops at the first line with
    # data. We can close the file after that.
    handle = PyVCFReaderFromPathOrURL(path)
    handle.close()
    genome = infer_genome_from_vcf(
        genome,
        handle.vcf_reader,
        reference_vcf_key)

    df_iterator = read_vcf_into_dataframe(
        path, include_info=include_info, chunk_size=chunk_size)

    return dataframes_to_variant_collection(
        df_iterator,
        info_parser=handle.vcf_reader._parse_info if include_info else None,
        only_passing=only_passing,
        max_variants=max_variants,
        variant_kwargs={
            'ensembl': genome,
            'allow_extended_nucleotides': allow_extended_nucleotides},
        variant_collection_kwargs={"path": path})