def _create_table(self, table_name, column_types, primary=None, nullable=()):
    """Creates a sqlite3 table from the given metadata.

    Parameters
    ----------
    column_types : list of (str, str) pairs
        First element of each tuple is the column name, second element is
        the sqlite3 type

    primary : str, optional
        Which column is the primary key

    nullable : iterable, optional
        Names of columns which have null values
    """
    require_string(table_name, "table name")
    require_iterable_of(column_types, tuple, name="column_types")
    if primary is not None:
        require_string(primary, "primary")
    require_iterable_of(nullable, str, name="nullable")

    column_decls = []
    for column_name, column_type in column_types:
        decl = "%s %s" % (column_name, column_type)
        if column_name == primary:
            decl += " UNIQUE PRIMARY KEY"
        if column_name not in nullable:
            decl += " NOT NULL"
        column_decls.append(decl)
    column_decl_str = ", ".join(column_decls)
    create_table_sql = "CREATE TABLE %s (%s)" % (table_name, column_decl_str)
    self.execute_sql(create_table_sql)
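# Usage sketch (illustrative, not part of the original module): for the
# hypothetical schema below, _create_table would generate and execute the
# CREATE TABLE statement shown, marking the primary key UNIQUE and every
# column outside `nullable` as NOT NULL.
import sqlite3

connection = sqlite3.connect(":memory:")
connection.execute(
    "CREATE TABLE genes ("
    "id TEXT UNIQUE PRIMARY KEY NOT NULL, "
    "name TEXT NOT NULL, "
    "tpm REAL)")  # "tpm" was listed in `nullable`, so no NOT NULL constraint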
def normalize_chromosome(c):
    try:
        return NORMALIZE_CHROMOSOME_CACHE[c]
    except KeyError:
        pass

    result = c
    if is_integer(result):
        if result == 0:
            raise ValueError("Contig cannot be 0")
        result = str(result)
    else:
        require_string(result, "contig name", nonempty=True)

        # only strip off lowercase chr since some of the non-chromosomal
        # contigs start with "CHR"
        if result.startswith("chr"):
            result = result[3:]

        # standardize mitochondrial genome to be "MT"
        if result == "M":
            result = "MT"
        else:
            # just in case someone is being lazy, capitalize "X" and "Y"
            result = result.upper()

    NORMALIZE_CHROMOSOME_CACHE[c] = result
    return result
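# Usage sketch (illustrative): expected normalizations, assuming the
# module-level NORMALIZE_CHROMOSOME_CACHE dict and is_integer() helper exist.
assert normalize_chromosome(1) == "1"          # integers become strings
assert normalize_chromosome("chrx") == "X"     # lowercase "chr" prefix stripped
assert normalize_chromosome("M") == "MT"       # mitochondrial genome standardized
assert normalize_chromosome("CHR1") == "CHR1"  # uppercase "CHR" left alone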
def __init__(self, path):
    """
    Construct a new wrapper.

    Parameters
    ----------
    path : string or pyvcf Reader instance
        Path or URL to load, or Reader instance.
    """
    self.path = None  # string path, if available.
    self.vcf_reader = None  # vcf_reader. Will always be set.
    self._to_close = None  # object to call close() on when we're done.

    if isinstance(path, pyvcf.Reader):
        self.vcf_reader = path
    else:
        require_string(path, "Path or URL to VCF")
        self.path = path
        parsed_path = parse_url_or_path(path)
        if not parsed_path.scheme or parsed_path.scheme.lower() == 'file':
            self.vcf_reader = pyvcf.Reader(
                filename=parsed_path.path,
                strict_whitespace=True)
        elif parsed_path.scheme.lower() in ("http", "https", "ftp"):
            self._to_close = response = requests.get(path, stream=True)
            response.raise_for_status()  # raise error on 404, etc.
            if path.endswith(".gz"):
                lines = stream_gzip_decompress_lines(
                    response.iter_content())
            else:
                lines = response.iter_lines(decode_unicode=True)
            self.vcf_reader = pyvcf.Reader(
                fsock=lines,
                compressed=False,
                strict_whitespace=True)
        else:
            raise ValueError("Unsupported scheme: %s" % parsed_path.scheme)
def evaluate_expression(expression, bindings, error_value=RAISE):
    typechecks.require_string(expression)

    # Since Python 2 doesn't have a nonlocal keyword, we have to box up the
    # error_value, so we can reassign to it in the ``on_error`` function
    # below.
    error_box = [error_value]
    try:
        # Give some basic modules.
        standard_environment = dict(STANDARD_EVALUATION_ENVIRONMENT)

        # Add our "on_error" hack.
        def on_error(value):
            error_box[0] = value
        standard_environment["on_error"] = on_error

        return eval(expression, standard_environment, bindings)
    except Exception as e:
        if error_box[0] is not RAISE:
            return error_box[0]
        extra = "Error while evaluating: \n\t%s\non:\n%s" % (
            expression, bindings)
        traceback = sys.exc_info()[2]
        raise_(ValueError, str(e) + "\n" + extra, traceback)
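# Usage sketch (illustrative), assuming RAISE and
# STANDARD_EVALUATION_ENVIRONMENT are defined as in this module:
assert evaluate_expression(
    "ref == 'A' and alt == 'T'", {"ref": "A", "alt": "T"}) is True
# Calling on_error(value) inside the expression makes a failed evaluation
# return `value` instead of raising:
assert evaluate_expression("on_error(None) or undefined_name", {}) is None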
def __init__(self, path):
    """
    Construct a new wrapper.

    Parameters
    ----------
    path : string or pyvcf Reader instance
        Path or URL to load, or Reader instance.
    """
    self.path = None  # string path, if available.
    self.vcf_reader = None  # vcf_reader. Will always be set.
    self._to_close = None  # object to call close() on when we're done.

    if isinstance(path, vcf.Reader):
        self.vcf_reader = path
    else:
        require_string(path, "Path or URL to VCF")
        self.path = path
        parsed_path = parse_url_or_path(path)
        if not parsed_path.scheme or parsed_path.scheme.lower() == 'file':
            self.vcf_reader = vcf.Reader(filename=parsed_path.path)
        elif parsed_path.scheme.lower() in ("http", "https", "ftp"):
            self._to_close = response = requests.get(path, stream=True)
            response.raise_for_status()  # raise error on 404, etc.
            if path.endswith(".gz"):
                lines = stream_gzip_decompress_lines(
                    response.iter_content())
            else:
                lines = response.iter_lines(decode_unicode=True)
            self.vcf_reader = vcf.Reader(fsock=lines, compressed=False)
        else:
            raise ValueError("Unsupported scheme: %s" % parsed_path.scheme)
def column_values_at_locus(
        self,
        column_name,
        feature,
        contig,
        position,
        end=None,
        strand=None,
        distinct=False,
        sorted=False):
    """
    Get the non-null values of a column from the database
    at a particular range of loci
    """
    # TODO: combine with the query method, since they overlap
    # significantly
    require_string(column_name, "column_name", nonempty=True)
    contig = normalize_chromosome(contig)
    require_integer(position, "position")
    if end is None:
        end = position
    require_integer(end, "end")

    if not self.column_exists(feature, column_name):
        raise ValueError("Table %s doesn't have column %s" % (
            feature, column_name,))

    if distinct:
        distinct_string = "DISTINCT "
    else:
        distinct_string = ""

    query = """
        SELECT %s%s
        FROM %s
        WHERE seqname = ?
        AND start <= ?
        AND end >= ?
    """ % (distinct_string, column_name, feature)

    query_params = [contig, end, position]

    if strand:
        query += " AND strand = ?"
        query_params.append(strand)

    tuples = self.connection.execute(query, query_params).fetchall()

    # each result is a tuple, so pull out its first element
    results = [t[0] for t in tuples if t[0] is not None]

    if sorted:
        results.sort()
    return results
def dataframe(self, contig=None, feature=None, strand=None):
    """
    Load genome entries as a DataFrame, optionally restricted to
    particular contig or feature type.
    """
    if contig:
        contig = normalize_chromosome(contig)

    if strand:
        strand = normalize_strand(strand)

    if feature is not None:
        require_string(feature, "feature")

    key = (contig, feature, strand)

    if key not in self._dataframes:
        csv_path = self.cached_data_file_path(
            contig=contig,
            feature=feature,
            strand=strand,
            distinct=False)

        def cached_loader_fn():
            # pylint: disable=no-member
            # pylint has trouble with df.seqname and similar
            # statements in this function.
            full_df = self._load_full_dataframe()
            assert len(full_df) > 0, \
                "Dataframe representation of genomic database empty!"

            # rename since we're going to be filtering the entries but
            # may still want to access the full dataset
            df = full_df
            if contig:
                df = df[df.seqname == contig]
                if len(df) == 0:
                    raise ValueError("Contig not found: %s" % (contig,))

            if feature:
                df = df[df.feature == feature]
                if len(df) == 0:
                    # check to make sure feature was somewhere in
                    # the full dataset before returning an empty dataframe
                    features = full_df.feature.unique()
                    if feature not in features:
                        raise ValueError(
                            "Feature not found: %s" % (feature,))

            if strand:
                df = df[df.strand == strand]

            return df

        self._dataframes[key] = cached_dataframe(csv_path, cached_loader_fn)

    return self._dataframes[key]
def from_interbase_coordinates(contig, start, end=None):
    '''
    Given coordinates in 0-based interbase coordinates,
    return a Locus instance.
    '''
    typechecks.require_string(contig)
    typechecks.require_integer(start)
    if end is None:
        end = start + 1
    typechecks.require_integer(end)
    return Locus(contig, start, end)
def from_inclusive_coordinates(contig, start, end=None):
    '''
    Given coordinates in 1-based coordinates that are inclusive on start
    and end, return a Locus instance. Locus instances are always 0-based
    "interbase" coordinates.
    '''
    typechecks.require_string(contig)
    typechecks.require_integer(start)
    if end is None:
        end = start
    typechecks.require_integer(end)
    return Locus(contig, start - 1, end)
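# Worked example (illustrative): the 1-based inclusive range 10..12 covers
# three bases, which is (9, 12) in 0-based interbase coordinates. Assumes
# Locus instances compare by value (e.g. a namedtuple).
assert from_inclusive_coordinates("1", 10, 12) == \
    from_interbase_coordinates("1", 9, 12)
# A single position without an explicit end is a length-1 interval:
assert from_inclusive_coordinates("1", 10) == \
    from_interbase_coordinates("1", 9)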
def _create_cached_db(db_path, tables, version=1):
    """
    Either create or retrieve sqlite database.

    Parameters
    ----------
    db_path : str
        Path to sqlite3 database file

    tables : list
        List of datacache.DatabaseTable objects

    version : int, optional
        Version acceptable as cached data.

    Returns sqlite3 connection
    """
    require_string(db_path, "db_path")
    require_iterable_of(tables, DatabaseTable)
    require_integer(version, "version")

    # if the database file doesn't already exist and we encounter an error
    # later, delete the file before raising an exception
    delete_on_error = not exists(db_path)

    # if the database already exists, contains all the table
    # names and has the right version, then just return it
    db = Database(db_path)

    # make sure to delete the database file in case anything goes wrong
    # to avoid leaving behind an empty DB
    table_names = [table.name for table in tables]
    try:
        if db.has_tables(table_names) and \
                db.has_version() and \
                db.version() == version:
            logger.info("Found existing table in database %s", db_path)
        else:
            if len(db.table_names()) > 0:
                logger.info("Dropping tables from database %s: %s",
                            db_path,
                            ", ".join(db.table_names()))
                db.drop_all_tables()
            logger.info("Creating database %s containing: %s",
                        db_path,
                        ", ".join(table_names))
            db.create(tables, version)
    except:
        logger.warning("Failed to create tables %s in database %s",
                       table_names,
                       db_path)
        db.close()
        if delete_on_error:
            remove(db_path)
        raise
    return db.connection
def from_interbase_coordinates(contig, start, end=None):
    '''
    Given coordinates in 0-based interbase coordinates,
    return a Locus instance.
    '''
    typechecks.require_string(contig)
    typechecks.require_integer(start)
    if end is None:
        end = start + 1
    typechecks.require_integer(end)
    contig = pyensembl.locus.normalize_chromosome(contig)
    return Locus(contig, start, end)
def from_inclusive_coordinates(contig, start, end=None):
    '''
    Given coordinates in 1-based coordinates that are inclusive on start
    and end, return a Locus instance. Locus instances are always 0-based
    "interbase" coordinates.
    '''
    typechecks.require_string(contig)
    typechecks.require_integer(start)
    if end is None:
        end = start
    typechecks.require_integer(end)
    contig = pyensembl.locus.normalize_chromosome(contig)
    return Locus(contig, start - 1, end)
def _create_indices(self, table_name, indices):
    """
    Create multiple indices (each over multiple columns) on a given table.

    Parameters
    ----------
    table_name : str

    indices : iterable of tuples
        Multiple groups of columns, each of which should be indexed.
    """
    require_string(table_name, "table_name")
    require_iterable_of(indices, (tuple, list))
    for index_column_set in indices:
        self._create_index(table_name, index_column_set)
def _fill_table(self, table_name, rows):
    require_string(table_name, "table_name")
    require_iterable_of(rows, tuple, "rows")

    if not self.has_table(table_name):
        raise ValueError(
            "Table '%s' does not exist in database" % (table_name,))
    if len(rows) == 0:
        raise ValueError("Rows must be non-empty sequence")

    first_row = rows[0]
    n_columns = len(first_row)
    if not all(len(row) == n_columns for row in rows):
        raise ValueError("Rows must all have %d values" % n_columns)

    blank_slots = ", ".join("?" for _ in range(n_columns))
    logger.info("Inserting %d rows into table %s", len(rows), table_name)
    sql = "INSERT INTO %s VALUES (%s)" % (table_name, blank_slots)
    self.connection.executemany(sql, rows)
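# Usage sketch (illustrative): the INSERT that _fill_table builds and runs,
# reproduced directly against sqlite3. Table and row contents are
# hypothetical.
import sqlite3

connection = sqlite3.connect(":memory:")
connection.execute("CREATE TABLE genes (id TEXT NOT NULL, name TEXT NOT NULL)")
rows = [("ENSG00000141510", "TP53"), ("ENSG00000157764", "BRAF")]
# two columns -> blank_slots == "?, ?"
connection.executemany("INSERT INTO genes VALUES (?, ?)", rows)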
def _fill_table(self, table_name, rows):
    require_string(table_name, "table_name")
    require_iterable_of(rows, tuple, "rows")

    if not self.has_table(table_name):
        raise ValueError(
            "Table '%s' does not exist in database" % (table_name,))
    if len(rows) == 0:
        raise ValueError("Rows must be non-empty sequence")

    first_row = rows[0]
    n_columns = len(first_row)
    if not all(len(row) == n_columns for row in rows):
        raise ValueError("Rows must all have %d values" % n_columns)

    blank_slots = ", ".join("?" for _ in range(n_columns))
    logging.info("Inserting %d rows into table %s", len(rows), table_name)
    sql = "INSERT INTO %s VALUES (%s)" % (table_name, blank_slots)
    self.connection.executemany(sql, rows)
def normalize_nucleotide_string(
        nucleotides,
        allow_extended_nucleotides=False,
        empty_chars=".-",
        treat_nan_as_empty=True):
    """
    Normalizes a nucleotide string by converting various ways of
    encoding empty strings into "", making all letters upper case,
    and checking to make sure all letters in the string are actually
    nucleotides.

    Parameters
    ----------
    nucleotides : str
        Sequence of nucleotides, e.g. "ACCTG"

    allow_extended_nucleotides : bool
        Allow non-canonical nucleotide characters like 'X' for unknown base

    empty_chars : str
        Characters which encode empty strings, such as "." used in VCF format
        or "-" used in MAF format

    treat_nan_as_empty : bool
        Some MAF files represent deletions/insertions with NaN ref/alt values
    """
    # check for NaN first, since the `in` test below requires a string
    if treat_nan_as_empty and isinstance(nucleotides, float) and \
            np.isnan(nucleotides):
        return ""
    elif nucleotides in empty_chars:
        return ""

    require_string(nucleotides, name="nucleotide string")

    nucleotides = nucleotides.upper()

    if allow_extended_nucleotides:
        valid_nucleotides = EXTENDED_NUCLEOTIDES
    else:
        valid_nucleotides = STANDARD_NUCLEOTIDES

    if not set(nucleotides) <= valid_nucleotides:
        raise ValueError("Invalid character(s) in nucleotide string: %s" % (
            ",".join(set(nucleotides) - valid_nucleotides),))

    return nucleotides
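# Usage sketch (illustrative), assuming STANDARD_NUCLEOTIDES covers
# {"A", "C", "G", "T"}:
assert normalize_nucleotide_string("acgt") == "ACGT"
assert normalize_nucleotide_string(".") == ""            # VCF empty allele
assert normalize_nucleotide_string("-") == ""            # MAF empty allele
assert normalize_nucleotide_string(float("nan")) == ""   # NaN from MAF parsing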
def load_maf_dataframe(path, nrows=None, verbose=False):
    """
    Load the guaranteed columns of a TCGA MAF file into a DataFrame
    """
    require_string(path, "Path to MAF")

    n_basic_columns = len(MAF_COLUMN_NAMES)

    # pylint: disable=no-member
    # pylint gets confused by read_csv
    df = pandas.read_csv(
        path,
        comment="#",
        sep="\t",
        low_memory=False,
        skip_blank_lines=True,
        header=0,
        nrows=nrows)

    if len(df.columns) < n_basic_columns:
        raise ValueError(
            "Too few columns in MAF file %s, expected %d but got %d : %s" % (
                path, n_basic_columns, len(df.columns), df.columns))

    # check each pair of expected/actual column names to make sure they match
    for expected, actual in zip(MAF_COLUMN_NAMES, df.columns):
        if expected != actual:
            # MAFs in the wild have capitalization differences in their
            # column names, normalize them to always use the names above
            if expected.lower() == actual.lower():
                # using DataFrame.rename in Python 2.7.x doesn't seem to
                # work for some files, possibly because Pandas treats
                # unicode vs. str columns as different?
                df[expected] = df[actual]
                del df[actual]
            else:
                raise ValueError("Expected column %s but got %s" % (
                    expected, actual))

    return df
def normalize_nucleotide_string(nucleotides, allow_extended_nucleotides=False):
    """
    Normalizes a nucleotide string by converting various ways of
    encoding empty strings into "", making all letters upper case,
    and checking to make sure all letters in the string are actually
    nucleotides.

    Parameters
    ----------
    nucleotides : str
        Sequence of nucleotides, e.g. "ACCTG"

    allow_extended_nucleotides : bool
        Allow non-canonical nucleotide characters like 'X' for unknown base
    """
    # some MAF files represent deletions/insertions with NaN ref/alt values
    if isinstance(nucleotides, float) and np.isnan(nucleotides):
        return ""

    # VCF files sometimes have '.' ref or alt for insertions and deletions,
    # and MAF files sometimes have '-' ref or alt for insertions and
    # deletions.
    if nucleotides == "." or nucleotides == "-":
        return ""

    typechecks.require_string(nucleotides, "nucleotide string")

    nucleotides = nucleotides.upper()

    if allow_extended_nucleotides:
        valid_nucleotides = EXTENDED_NUCLEOTIDES
    else:
        valid_nucleotides = STANDARD_NUCLEOTIDES

    if not set(nucleotides) <= valid_nucleotides:
        raise ValueError(
            "Invalid character(s) in nucleotide string: %s" % (
                ",".join(set(nucleotides) - valid_nucleotides),))

    return nucleotides
def normalize_nucleotide_string(nucleotides, allow_extended_nucleotides=False):
    """
    Normalizes a nucleotide string by converting various ways of
    encoding empty strings into "", making all letters upper case,
    and checking to make sure all letters in the string are actually
    nucleotides.

    Parameters
    ----------
    nucleotides : str
        Sequence of nucleotides, e.g. "ACCTG"

    allow_extended_nucleotides : bool
        Allow non-canonical nucleotide characters like 'X' for unknown base
    """
    # some MAF files represent deletions/insertions with NaN ref/alt values
    if isinstance(nucleotides, float) and np.isnan(nucleotides):
        return ""

    # VCF files sometimes have '.' ref or alt for insertions and deletions,
    # and MAF files sometimes have '-' ref or alt for insertions and
    # deletions.
    if nucleotides == "." or nucleotides == "-":
        return ""

    typechecks.require_string(nucleotides, "nucleotide string")

    nucleotides = nucleotides.upper()

    if allow_extended_nucleotides:
        valid_nucleotides = EXTENDED_NUCLEOTIDES
    else:
        valid_nucleotides = STANDARD_NUCLEOTIDES

    if not set(nucleotides) <= valid_nucleotides:
        raise ValueError("Invalid character(s) in nucleotide string: %s" %
                         ",".join(set(nucleotides) - valid_nucleotides))

    return nucleotides
def load_vcf_fast(
        path,
        genome=None,
        reference_vcf_key="reference",
        only_passing=True,
        allow_extended_nucleotides=False,
        include_info=True,
        chunk_size=10**5,
        max_variants=None):
    """
    Load reference name and Variant objects from the given VCF filename.

    This is an experimental faster implementation of `load_vcf`. It is
    typically about 2X faster, and with `include_info=False`, about 4X faster.
    If most of the records in the VCF have failed filters (and
    only_passing=True), this function can be orders of magnitude faster
    than `load_vcf`.

    Currently only local files are supported by this function (no http).
    If you call this on an HTTP URL, it will fall back to `load_vcf`.

    Parameters
    ----------
    path : str
        Path to VCF (*.vcf) or compressed VCF (*.vcf.gz).

    genome : {pyensembl.Genome, reference name, Ensembl version int}, optional
        Optionally pass in a PyEnsembl Genome object, name of reference, or
        PyEnsembl release version to specify the reference associated with
        a VCF (otherwise infer reference from VCF using reference_vcf_key)

    reference_vcf_key : str, optional
        Name of metadata field which contains path to reference FASTA
        file (default = 'reference')

    only_passing : boolean, optional
        If true, any entries whose FILTER field is not one of "." or "PASS"
        are dropped.

    allow_extended_nucleotides : boolean, default False
        Allow characters other than A,C,T,G in the ref and alt strings.

    include_info : boolean, default True
        Whether to parse the info column. If you don't need that column,
        set to False for faster parsing.

    chunk_size : int, optional
        Number of records to load in memory at once.

    max_variants : int, optional
        If specified, return only the first max_variants variants.
    """
    require_string(path, "Path or URL to VCF")
    parsed_path = parse_url_or_path(path)
    if parsed_path.scheme and parsed_path.scheme.lower() != "file":
        # pandas.read_table nominally supports HTTP, but it tends to crash on
        # large files and does not support gzip. Switching to the python-based
        # implementation of read_table (with engine="python") helps with some
        # issues but introduces a new set of problems (e.g. the dtype
        # parameter is not accepted). For these reasons, we're currently not
        # attempting to load VCFs over HTTP with pandas, and fall back to the
        # pyvcf implementation here.
        return load_vcf(
            path,
            genome=genome,
            reference_vcf_key=reference_vcf_key,
            only_passing=only_passing,
            allow_extended_nucleotides=allow_extended_nucleotides,
            max_variants=max_variants)

    # Loading a local file.
    # The file will be opened twice: first to parse the header with pyvcf,
    # then by pandas to read the data.

    # PyVCF reads the metadata immediately and stops at the first line with
    # data. We can close the file after that.
    handle = PyVCFReaderFromPathOrURL(path)
    handle.close()
    genome = infer_genome_from_vcf(
        genome,
        handle.vcf_reader,
        reference_vcf_key)

    df_iterator = read_vcf_into_dataframe(
        path,
        include_info=include_info,
        chunk_size=chunk_size)

    return dataframes_to_variant_collection(
        df_iterator,
        info_parser=handle.vcf_reader._parse_info if include_info else None,
        only_passing=only_passing,
        max_variants=max_variants,
        variant_kwargs={
            'ensembl': genome,
            'allow_extended_nucleotides': allow_extended_nucleotides},
        variant_collection_kwargs={"path": path})
def load_vcf(
        path,
        genome=None,
        reference_vcf_key="reference",
        only_passing=True,
        allow_extended_nucleotides=False,
        include_info=True,
        chunk_size=10 ** 5,
        max_variants=None,
        sort_key=variant_ascending_position_sort_key,
        distinct=True):
    """
    Load reference name and Variant objects from the given VCF filename.

    Local files and HTTP(S)/FTP URLs are supported; remote files are
    downloaded to a temporary local file before parsing.

    Parameters
    ----------
    path : str
        Path to VCF (*.vcf) or compressed VCF (*.vcf.gz).

    genome : {pyensembl.Genome, reference name, Ensembl version int}, optional
        Optionally pass in a PyEnsembl Genome object, name of reference, or
        PyEnsembl release version to specify the reference associated with
        a VCF (otherwise infer reference from VCF using reference_vcf_key)

    reference_vcf_key : str, optional
        Name of metadata field which contains path to reference FASTA
        file (default = 'reference')

    only_passing : boolean, optional
        If true, any entries whose FILTER field is not one of "." or "PASS"
        are dropped.

    allow_extended_nucleotides : boolean, default False
        Allow characters other than A,C,T,G in the ref and alt strings.

    include_info : boolean, default True
        Whether to parse the INFO and per-sample columns. If you don't need
        these, set to False for faster parsing.

    chunk_size : int, optional
        Number of records to load in memory at once.

    max_variants : int, optional
        If specified, return only the first max_variants variants.

    sort_key : fn
        Function which maps each element to a sorting criterion.
        Set to None to not sort the variants.

    distinct : boolean, default True
        Don't keep repeated variants
    """
    require_string(path, "Path or URL to VCF")
    parsed_path = parse_url_or_path(path)
    if parsed_path.scheme and parsed_path.scheme.lower() != "file":
        # pandas.read_table nominally supports HTTP, but it tends to crash on
        # large files and does not support gzip. Switching to the python-based
        # implementation of read_table (with engine="python") helps with some
        # issues but introduces a new set of problems (e.g. the dtype
        # parameter is not accepted). For these reasons, we're currently not
        # attempting to load VCFs over HTTP with pandas directly, and instead
        # download it to a temporary file and open that.
        (filename, headers) = urllib.request.urlretrieve(path)
        try:
            # The downloaded file has no file extension, which confuses pyvcf
            # for gziped files in Python 3. We rename it to have the correct
            # file extension.
            new_filename = "%s.%s" % (
                filename, parsed_path.path.split(".")[-1])
            os.rename(filename, new_filename)
            filename = new_filename
            return load_vcf(
                filename,
                genome=genome,
                reference_vcf_key=reference_vcf_key,
                only_passing=only_passing,
                allow_extended_nucleotides=allow_extended_nucleotides,
                include_info=include_info,
                chunk_size=chunk_size,
                max_variants=max_variants,
                sort_key=sort_key,
                distinct=distinct)
        finally:
            logger.info("Removing temporary file: %s", filename)
            os.unlink(filename)

    # Loading a local file.
    # The file will be opened twice: first to parse the header with pyvcf,
    # then by pandas to read the data.

    # PyVCF reads the metadata immediately and stops at the first line with
    # data. We can close the file after that.
    handle = PyVCFReaderFromPathOrURL(path)
    handle.close()
    genome = infer_genome_from_vcf(
        genome,
        handle.vcf_reader,
        reference_vcf_key)

    df_iterator = read_vcf_into_dataframe(
        path,
        include_info=include_info,
        sample_names=handle.vcf_reader.samples if include_info else None,
        chunk_size=chunk_size)

    if include_info:
        def sample_info_parser(unparsed_sample_info_strings, format_string):
            """
            Given a format string like "GT:AD:ADP:DP:FS" and a list of
            sample info strings where each entry is like
            "0/1:3,22:T=3,G=22:25:33", return a dict that maps:
            sample name -> field name -> value. Uses pyvcf to parse the
            fields.
            """
            return pyvcf_calls_to_sample_info_list(
                handle.vcf_reader._parse_samples(
                    unparsed_sample_info_strings, format_string, None))
    else:
        sample_info_parser = None

    return dataframes_to_variant_collection(
        df_iterator,
        source_path=path,
        info_parser=handle.vcf_reader._parse_info if include_info else None,
        only_passing=only_passing,
        max_variants=max_variants,
        sample_names=handle.vcf_reader.samples if include_info else None,
        sample_info_parser=sample_info_parser,
        variant_kwargs={
            'ensembl': genome,
            'allow_extended_nucleotides': allow_extended_nucleotides},
        variant_collection_kwargs={
            'sort_key': sort_key,
            'distinct': distinct})
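# Usage sketch (illustrative; the path is a hypothetical placeholder):
variants = load_vcf(
    "/path/to/somatic.vcf.gz",
    genome="GRCh37",
    only_passing=True,
    include_info=False)  # skip INFO/per-sample parsing for speed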
def load_maf_dataframe(path, nrows=None, raise_on_error=True, encoding=None):
    """
    Load the guaranteed columns of a TCGA MAF file into a DataFrame

    Parameters
    ----------
    path : str
        Path to MAF file

    nrows : int
        Optional limit to number of rows loaded

    raise_on_error : bool
        Raise an exception upon encountering an error or log an error

    encoding : str, optional
        Encoding to use for UTF when reading MAF file.
    """
    require_string(path, "Path to MAF")

    n_basic_columns = len(MAF_COLUMN_NAMES)

    # pylint: disable=no-member
    # pylint gets confused by read_csv
    df = pandas.read_csv(
        path,
        comment="#",
        sep="\t",
        low_memory=False,
        skip_blank_lines=True,
        header=0,
        nrows=nrows,
        encoding=encoding)

    if len(df.columns) < n_basic_columns:
        error_message = (
            "Too few columns in MAF file %s, expected %d but got %d : %s" % (
                path, n_basic_columns, len(df.columns), df.columns))
        if raise_on_error:
            raise ValueError(error_message)
        else:
            logging.warning(error_message)

    # check each pair of expected/actual column names to make sure they match
    for expected, actual in zip(MAF_COLUMN_NAMES, df.columns):
        if expected != actual:
            # MAFs in the wild have capitalization differences in their
            # column names, normalize them to always use the names above
            if expected.lower() == actual.lower():
                # using DataFrame.rename in Python 2.7.x doesn't seem to
                # work for some files, possibly because Pandas treats
                # unicode vs. str columns as different?
                df[expected] = df[actual]
                del df[actual]
            else:
                error_message = (
                    "Expected column %s but got %s" % (expected, actual))
                if raise_on_error:
                    raise ValueError(error_message)
                else:
                    logging.warning(error_message)

    return df
def test_require_string():
    require_string("", nonempty=False)
    with assert_raises(TypeError):
        require_string(0, nonempty=False)
    with assert_raises(TypeError):
        require_string(0, nonempty=True)
    with assert_raises(ValueError):
        require_string("", nonempty=True)
    require_string("1", nonempty=False)
    require_string("1", nonempty=True)
def load_maf_dataframe(path, nrows=None, raise_on_error=True, encoding=None):
    """
    Load the guaranteed columns of a TCGA MAF file into a DataFrame

    Parameters
    ----------
    path : str
        Path to MAF file

    nrows : int
        Optional limit to number of rows loaded

    raise_on_error : bool
        Raise an exception upon encountering an error or log an error

    encoding : str, optional
        Encoding to use for UTF when reading MAF file.
    """
    require_string(path, "Path to MAF")

    n_basic_columns = len(MAF_COLUMN_NAMES)

    # pylint: disable=no-member
    # pylint gets confused by read_csv
    df = pandas.read_csv(
        path,
        comment="#",
        sep="\t",
        low_memory=False,
        skip_blank_lines=True,
        header=0,
        nrows=nrows,
        encoding=encoding)

    if len(df.columns) < n_basic_columns:
        error_message = (
            "Too few columns in MAF file %s, expected %d but got %d : %s" % (
                path, n_basic_columns, len(df.columns), df.columns))
        if raise_on_error:
            raise ValueError(error_message)
        else:
            logging.warning(error_message)

    # check each pair of expected/actual column names to make sure they match
    for expected, actual in zip(MAF_COLUMN_NAMES, df.columns):
        if expected != actual:
            # MAFs in the wild have capitalization differences in their
            # column names, normalize them to always use the names above
            if expected.lower() == actual.lower():
                # using DataFrame.rename in Python 2.7.x doesn't seem to
                # work for some files, possibly because Pandas treats
                # unicode vs. str columns as different?
                df[expected] = df[actual]
                del df[actual]
            else:
                error_message = (
                    "Expected column %s but got %s" % (expected, actual))
                if raise_on_error:
                    raise ValueError(error_message)
                else:
                    logging.warning(error_message)

    return df
def load_vcf(
        path,
        genome=None,
        reference_vcf_key="reference",
        only_passing=True,
        allow_extended_nucleotides=False,
        include_info=True,
        chunk_size=10**5,
        max_variants=None,
        sort_key=variant_ascending_position_sort_key,
        distinct=True):
    """
    Load reference name and Variant objects from the given VCF filename.

    Local files and HTTP(S)/FTP URLs are supported; remote files are
    downloaded to a temporary local file before parsing.

    Parameters
    ----------
    path : str
        Path to VCF (*.vcf) or compressed VCF (*.vcf.gz).

    genome : {pyensembl.Genome, reference name, Ensembl version int}, optional
        Optionally pass in a PyEnsembl Genome object, name of reference, or
        PyEnsembl release version to specify the reference associated with
        a VCF (otherwise infer reference from VCF using reference_vcf_key)

    reference_vcf_key : str, optional
        Name of metadata field which contains path to reference FASTA
        file (default = 'reference')

    only_passing : boolean, optional
        If true, any entries whose FILTER field is not one of "." or "PASS"
        are dropped.

    allow_extended_nucleotides : boolean, default False
        Allow characters other than A,C,T,G in the ref and alt strings.

    include_info : boolean, default True
        Whether to parse the INFO and per-sample columns. If you don't need
        these, set to False for faster parsing.

    chunk_size : int, optional
        Number of records to load in memory at once.

    max_variants : int, optional
        If specified, return only the first max_variants variants.

    sort_key : fn
        Function which maps each element to a sorting criterion.
        Set to None to not sort the variants.

    distinct : boolean, default True
        Don't keep repeated variants
    """
    require_string(path, "Path or URL to VCF")
    parsed_path = parse_url_or_path(path)
    if parsed_path.scheme and parsed_path.scheme.lower() != "file":
        # pandas.read_table nominally supports HTTP, but it tends to crash on
        # large files and does not support gzip. Switching to the python-based
        # implementation of read_table (with engine="python") helps with some
        # issues but introduces a new set of problems (e.g. the dtype
        # parameter is not accepted). For these reasons, we're currently not
        # attempting to load VCFs over HTTP with pandas directly, and instead
        # download it to a temporary file and open that.
        (filename, headers) = urllib.request.urlretrieve(path)
        try:
            # The downloaded file has no file extension, which confuses pyvcf
            # for gziped files in Python 3. We rename it to have the correct
            # file extension.
            new_filename = "%s.%s" % (
                filename, parsed_path.path.split(".")[-1])
            os.rename(filename, new_filename)
            filename = new_filename
            return load_vcf(
                filename,
                genome=genome,
                reference_vcf_key=reference_vcf_key,
                only_passing=only_passing,
                allow_extended_nucleotides=allow_extended_nucleotides,
                include_info=include_info,
                chunk_size=chunk_size,
                max_variants=max_variants,
                sort_key=sort_key,
                distinct=distinct)
        finally:
            logger.info("Removing temporary file: %s", filename)
            os.unlink(filename)

    # Loading a local file.
    # The file will be opened twice: first to parse the header with pyvcf,
    # then by pandas to read the data.

    # PyVCF reads the metadata immediately and stops at the first line with
    # data. We can close the file after that.
    handle = PyVCFReaderFromPathOrURL(path)
    handle.close()
    genome = infer_genome_from_vcf(
        genome,
        handle.vcf_reader,
        reference_vcf_key)

    df_iterator = read_vcf_into_dataframe(
        path,
        include_info=include_info,
        sample_names=handle.vcf_reader.samples if include_info else None,
        chunk_size=chunk_size)

    if include_info:
        def sample_info_parser(unparsed_sample_info_strings, format_string):
            """
            Given a format string like "GT:AD:ADP:DP:FS" and a list of
            sample info strings where each entry is like
            "0/1:3,22:T=3,G=22:25:33", return a dict that maps:
            sample name -> field name -> value. Uses pyvcf to parse the
            fields.
            """
            return pyvcf_calls_to_sample_info_list(
                handle.vcf_reader._parse_samples(
                    unparsed_sample_info_strings, format_string, None))
    else:
        sample_info_parser = None

    return dataframes_to_variant_collection(
        df_iterator,
        source_path=path,
        info_parser=handle.vcf_reader._parse_info if include_info else None,
        only_passing=only_passing,
        max_variants=max_variants,
        sample_names=handle.vcf_reader.samples if include_info else None,
        sample_info_parser=sample_info_parser,
        variant_kwargs={
            'ensembl': genome,
            'allow_extended_nucleotides': allow_extended_nucleotides},
        variant_collection_kwargs={
            'sort_key': sort_key,
            'distinct': distinct})
def dataframe(
        self,
        contig=None,
        feature=None,
        strand=None,
        save_to_disk=False):
    """
    Load genome entries as a DataFrame, optionally restricted to
    particular contig or feature type.
    """
    if contig:
        contig = normalize_chromosome(contig)

    if strand:
        strand = normalize_strand(strand)

    if feature is not None:
        require_string(feature, "feature")

    key = (contig, feature, strand)

    if key not in self._dataframes:

        def _construct_df():
            full_df = self._load_full_dataframe_cached()
            assert len(full_df) > 0, \
                "Dataframe representation of genomic database empty!"

            # rename since we're going to be filtering the entries but
            # may still want to access the full dataset
            df = full_df
            if contig:
                df = df[df["seqname"] == contig]
                if len(df) == 0:
                    raise ValueError("Contig not found: %s" % (contig,))

            if feature:
                df = df[df["feature"] == feature]
                if len(df) == 0:
                    # check to make sure feature was somewhere in
                    # the full dataset before returning an empty dataframe
                    features = full_df["feature"].unique()
                    if feature not in features:
                        raise ValueError(
                            "Feature not found: %s" % (feature,))

            if strand:
                df = df[df["strand"] == strand]

            return df

        if save_to_disk:
            csv_path = self.data_subset_path(
                contig=contig,
                feature=feature,
                strand=strand,
                distinct=False)
            df = self.memory_cache.cached_dataframe(
                csv_path=csv_path,
                compute_fn=_construct_df)
        else:
            df = _construct_df()
        self._dataframes[key] = df

    return self._dataframes[key]
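# Usage sketch (illustrative; `genome_db` is a hypothetical instance of this
# class): fetch exon annotations on the forward strand of chromosome 1 and
# persist the filtered subset to CSV so later calls can skip the filtering.
exons = genome_db.dataframe(
    contig="1", feature="exon", strand="+", save_to_disk=True)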
def __init__(
        self,
        program_name,
        alleles,
        parse_output_fn,
        supported_alleles_flag,
        input_file_flag,
        length_flag,
        allele_flag,
        peptide_mode_flags=["-p"],
        tempdir_flag=None,
        extra_flags=[],
        max_peptides_per_file=10**4,
        process_limit=-1,
        default_peptide_lengths=[9],
        group_peptides_by_length=False,
        min_peptide_length=8,
        max_peptide_length=None):
    """
    Parameters
    ----------
    program_name : str
        Name of prediction program to run
        (e.g. "netMHCcons" or "netMHCIIpan")

    alleles : list of str
        MHC alleles

    supported_alleles_flag : str
        Flag to pass to the predictor to get a list of supported alleles
        (e.g. "-A", "-list", "-listMHC")

    parse_output_fn : fn
        Takes the stdout string from the predictor and returns a collection
        of BindingPrediction objects

    input_file_flag : str
        How to specify the input FASTA file of source sequences (e.g. "-f")

    length_flag : str
        How to specify the desired predicted peptide length (e.g. "-length")

    allele_flag : str
        How to specify the allele we want predictions for (e.g. "-a")

    peptide_mode_flags : list of str
        How to switch from the default FASTA subsequences input mode to
        where peptides are explicitly given one per line of a text file.

    tempdir_flag : str, optional
        How to specify the predictor's temporary directory (e.g. "-tdir")

    extra_flags : list of str
        Extra flags to pass to the predictor

    max_peptides_per_file : int, optional
        Maximum number of lines per file when predicting peptides directly.

    process_limit : int, optional
        Maximum number of parallel processes to start
        (0 for no limit, -1 for use all available processors)

    default_peptide_lengths : list of int, optional
        When making predictions across subsequences of protein sequences,
        what peptide lengths to predict for.

    group_peptides_by_length : bool
        Run commandline predictor on groups of peptides of equal length

    min_peptide_length : int
        Shortest peptide this predictor can handle

    max_peptide_length : int
        Longest peptide this predictor can handle
    """
    require_string(program_name, "Predictor program name")
    self.program_name = program_name

    if supported_alleles_flag is not None:
        require_string(supported_alleles_flag, "Supported alleles flag")
    self.supported_alleles_flag = supported_alleles_flag

    require_string(input_file_flag, "Input file flag")
    self.input_file_flag = input_file_flag

    require_string(length_flag, "Peptide length flag")
    self.length_flag = length_flag

    require_string(allele_flag, "Allele flag")
    self.allele_flag = allele_flag

    require_iterable_of(peptide_mode_flags, string_types)
    self.peptide_mode_flags = peptide_mode_flags

    if tempdir_flag is not None:
        require_string(tempdir_flag, "Temporary directory flag")
    self.tempdir_flag = tempdir_flag

    require_iterable_of(extra_flags, string_types)
    self.extra_flags = extra_flags

    require_integer(
        max_peptides_per_file,
        "Maximum number of lines in a peptides input file")
    self.max_peptides_per_file = max_peptides_per_file

    require_integer(process_limit, "Maximum number of processes")
    self.process_limit = process_limit

    self.parse_output_fn = parse_output_fn

    if isinstance(default_peptide_lengths, int):
        default_peptide_lengths = [default_peptide_lengths]

    self.group_peptides_by_length = group_peptides_by_length

    if self.supported_alleles_flag:
        valid_alleles = self._determine_supported_alleles(
            self.program_name,
            self.supported_alleles_flag)
    else:
        # if we're not running the tool to determine supported alleles
        # then at least try running it by itself to determine if it's
        # present
        try:
            run_command([self.program_name])
        except:
            raise SystemError("Failed to run %s" % self.program_name)
        valid_alleles = None

    try:
        BasePredictor.__init__(
            self,
            alleles=alleles,
            valid_alleles=valid_alleles,
            default_peptide_lengths=default_peptide_lengths,
            min_peptide_length=min_peptide_length,
            max_peptide_length=max_peptide_length)
    except UnsupportedAllele as e:
        if self.supported_alleles_flag:
            additional_message = (
                "\nRun command %s %s to see a list of valid alleles" % (
                    self.program_name,
                    self.supported_alleles_flag))
        else:
            additional_message = ""
        raise UnsupportedAllele(str(e) + additional_message)
def load_vcf(
        path,
        genome=None,
        reference_vcf_key="reference",
        only_passing=True,
        allow_extended_nucleotides=False,
        include_info=True,
        chunk_size=10**5,
        max_variants=None,
        sort_key=variant_ascending_position_sort_key,
        distinct=True,
        normalize_contig_names=True,
        convert_ucsc_contig_names=None):
    """
    Load reference name and Variant objects from the given VCF filename.

    Local files and HTTP(S)/FTP URLs are supported; remote files are
    downloaded to a temporary local file before parsing.

    Parameters
    ----------
    path : str
        Path to VCF (*.vcf) or compressed VCF (*.vcf.gz).

    genome : {pyensembl.Genome, reference name, Ensembl version int}, optional
        Optionally pass in a PyEnsembl Genome object, name of reference, or
        PyEnsembl release version to specify the reference associated with
        a VCF (otherwise infer reference from VCF using reference_vcf_key)

    reference_vcf_key : str, optional
        Name of metadata field which contains path to reference FASTA
        file (default = 'reference')

    only_passing : bool, optional
        If true, any entries whose FILTER field is not one of "." or "PASS"
        are dropped.

    allow_extended_nucleotides : bool, default False
        Allow characters other than A,C,T,G in the ref and alt strings.

    include_info : bool, default True
        Whether to parse the INFO and per-sample columns. If you don't need
        these, set to False for faster parsing.

    chunk_size : int, optional
        Number of records to load in memory at once.

    max_variants : int, optional
        If specified, return only the first max_variants variants.

    sort_key : fn
        Function which maps each element to a sorting criterion.
        Set to None to not sort the variants.

    distinct : bool, default True
        Don't keep repeated variants

    normalize_contig_names : bool, default True
        By default contig names will be normalized by converting integers
        to strings (e.g. 1 -> "1"), and converting any letters after "chr"
        to uppercase (e.g. "chrx" -> "chrX"). If you don't want this
        behavior then pass normalize_contig_names=False.

    convert_ucsc_contig_names : bool
        Convert chromosome names from hg19 (e.g. "chr1") to equivalent names
        for GRCh37 (e.g. "1"). By default this is set to True if the genome
        of the VCF is a UCSC reference and otherwise set to False.
    """
    require_string(path, "Path or URL to VCF")
    parsed_path = parse_url_or_path(path)
    if parsed_path.scheme and parsed_path.scheme.lower() != "file":
        # pandas.read_table nominally supports HTTP, but it tends to crash on
        # large files and does not support gzip. Switching to the python-based
        # implementation of read_table (with engine="python") helps with some
        # issues but introduces a new set of problems (e.g. the dtype
        # parameter is not accepted). For these reasons, we're currently not
        # attempting to load VCFs over HTTP with pandas directly, and instead
        # download it to a temporary file and open that.
        (filename, headers) = urllib.request.urlretrieve(path)
        try:
            # The downloaded file has no file extension, which confuses pyvcf
            # for gziped files in Python 3. We rename it to have the correct
            # file extension.
            new_filename = "%s.%s" % (
                filename, parsed_path.path.split(".")[-1])
            os.rename(filename, new_filename)
            filename = new_filename
            return load_vcf(
                filename,
                genome=genome,
                reference_vcf_key=reference_vcf_key,
                only_passing=only_passing,
                allow_extended_nucleotides=allow_extended_nucleotides,
                include_info=include_info,
                chunk_size=chunk_size,
                max_variants=max_variants,
                sort_key=sort_key,
                distinct=distinct,
                normalize_contig_names=normalize_contig_names,
                convert_ucsc_contig_names=convert_ucsc_contig_names)
        finally:
            logger.info("Removing temporary file: %s", filename)
            os.unlink(filename)

    # Loading a local file.
    # The file will be opened twice: first to parse the header with pyvcf,
    # then by pandas to read the data.

    # PyVCF reads the metadata immediately and stops at the first line with
    # data. We can close the file after that.
    handle = PyVCFReaderFromPathOrURL(path)
    handle.close()

    ####
    # The following code looks a bit crazy because it's motivated by the
    # desire to preserve UCSC reference names even though the Variant
    # objects we're creating will convert them to EnsemblRelease genomes
    # with different reference names.
    #
    # For example, if a VCF is aligned against 'hg19' then we want to create
    # a variant which has 'hg19' as its genome argument, so that
    # serialization back to VCF will put the correct reference genome in the
    # generated header.
    if genome is None:
        vcf_reader = handle.vcf_reader
        if reference_vcf_key not in vcf_reader.metadata:
            raise ValueError("Unable to infer reference genome for %s" % (
                vcf_reader.filename,))
        genome = vcf_reader.metadata[reference_vcf_key]

    genome, genome_was_ucsc = infer_genome(genome)
    if genome_was_ucsc:
        genome = ensembl_to_ucsc_reference_names[genome.reference_name]

    if convert_ucsc_contig_names is None:
        convert_ucsc_contig_names = genome_was_ucsc

    df_iterator = read_vcf_into_dataframe(
        path,
        include_info=include_info,
        sample_names=handle.vcf_reader.samples if include_info else None,
        chunk_size=chunk_size)

    if include_info:
        def sample_info_parser(unparsed_sample_info_strings, format_string):
            """
            Given a format string like "GT:AD:ADP:DP:FS" and a list of
            sample info strings where each entry is like
            "0/1:3,22:T=3,G=22:25:33", return a dict that maps:
            sample name -> field name -> value. Uses pyvcf to parse the
            fields.
            """
            return pyvcf_calls_to_sample_info_list(
                handle.vcf_reader._parse_samples(
                    unparsed_sample_info_strings, format_string, None))
    else:
        sample_info_parser = None

    variant_kwargs = {
        'genome': genome,
        'allow_extended_nucleotides': allow_extended_nucleotides,
        'normalize_contig_names': normalize_contig_names,
        'convert_ucsc_contig_names': convert_ucsc_contig_names,
    }

    variant_collection_kwargs = {'sort_key': sort_key, 'distinct': distinct}

    # TODO: drop chrMT variants from hg19 and warn user about it
    return dataframes_to_variant_collection(
        df_iterator,
        source_path=path,
        info_parser=handle.vcf_reader._parse_info if include_info else None,
        only_passing=only_passing,
        max_variants=max_variants,
        sample_names=handle.vcf_reader.samples if include_info else None,
        sample_info_parser=sample_info_parser,
        variant_kwargs=variant_kwargs,
        variant_collection_kwargs=variant_collection_kwargs)
def column_values_at_locus(self, column_name, feature, contig, position,
                           end=None, strand=None, distinct=False,
                           sorted=False):
    """
    Get the non-null values of a column from the database
    at a particular range of loci
    """
    # TODO: combine with the query method, since they overlap
    # significantly
    require_string(column_name, "column_name", nonempty=True)
    contig = normalize_chromosome(contig)
    require_integer(position, "position")
    if end is None:
        end = position
    require_integer(end, "end")

    if not self.column_exists(feature, column_name):
        raise ValueError("Table %s doesn't have column %s" % (
            feature, column_name,))

    if distinct:
        distinct_string = "DISTINCT "
    else:
        distinct_string = ""

    query = """
        SELECT %s%s
        FROM %s
        WHERE seqname = ?
        AND start <= ?
        AND end >= ?
    """ % (distinct_string, column_name, feature)

    query_params = [contig, end, position]

    if strand:
        query += " AND strand = ?"
        query_params.append(strand)

    tuples = self.connection.execute(query, query_params).fetchall()

    # each result is a tuple, so pull out its first element
    results = [t[0] for t in tuples if t[0] is not None]

    if sorted:
        results.sort()
    return results
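# Usage sketch (illustrative; `db` is a hypothetical instance of this class):
# the WHERE clause `start <= ? AND end >= ?` selects every feature whose
# [start, end] interval overlaps the requested locus.
gene_names = db.column_values_at_locus(
    "gene_name", "gene", "chr1", 1000000, distinct=True, sorted=True)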
def __init__(
        self,
        program_name,
        alleles,
        epitope_lengths,
        parse_output_fn,
        supported_alleles_flag,
        input_fasta_flag,
        length_flag,
        allele_flag,
        tempdir_flag=None,
        extra_flags=[],
        max_file_records=None,
        process_limit=0):
    """
    Parameters
    ----------
    program_name : str
        Name of prediction program to run
        (e.g. "netMHCcons" or "netMHCIIpan")

    alleles : list of str
        MHC alleles

    epitope_lengths : list of int

    supported_alleles_flag : str
        Flag to pass to the predictor to get a list of supported alleles
        (e.g. "-A", "-list", "-listMHC")

    parse_output_fn : fn
        Takes the stdout string from the predictor and returns a collection
        of BindingPrediction objects

    input_fasta_flag : str
        How to specify the input FASTA file of source sequences (e.g. "-f")

    length_flag : str
        How to specify the desired predicted epitope length (e.g. "-length")

    allele_flag : str
        How to specify the allele we want predictions for (e.g. "-a")

    tempdir_flag : str, optional
        How to specify the predictor's temporary directory (e.g. "-tdir")

    extra_flags : list of str
        Extra flags to pass to the predictor

    max_file_records : int, optional
        Maximum number of sequences per input FASTA file

    process_limit : int, optional
        Maximum number of parallel processes to start
    """
    require_string(program_name, "Predictor program name")
    self.program_name = program_name

    if supported_alleles_flag is not None:
        require_string(supported_alleles_flag, "Supported alleles flag")
    self.supported_alleles_flag = supported_alleles_flag

    require_string(input_fasta_flag, "Input FASTA file flag")
    self.input_fasta_flag = input_fasta_flag

    require_string(allele_flag, "Allele flag")
    self.allele_flag = allele_flag

    require_string(length_flag, "Peptide length flag")
    self.length_flag = length_flag

    if tempdir_flag is not None:
        require_string(tempdir_flag, "Temporary directory flag")
    self.tempdir_flag = tempdir_flag

    self.extra_flags = extra_flags

    if max_file_records is not None:
        require_integer(
            max_file_records,
            "Maximum number of sequences per input file")
    self.max_file_records = max_file_records

    require_integer(process_limit, "Maximum number of processes")
    self.process_limit = process_limit

    self.parse_output_fn = parse_output_fn

    if self.supported_alleles_flag:
        valid_alleles = self._determine_supported_alleles(
            self.program_name,
            self.supported_alleles_flag)
    else:
        # if we're not running the tool to determine supported alleles
        # then at least try running it by itself to determine if it's
        # present
        try:
            run_command([self.program_name])
        except:
            raise SystemError("Failed to run %s" % self.program_name)
        valid_alleles = None

    try:
        BasePredictor.__init__(
            self,
            alleles,
            epitope_lengths,
            valid_alleles=valid_alleles)
    except UnsupportedAllele as e:
        if self.supported_alleles_flag:
            additional_message = (
                "\nRun command %s %s to see a list of valid alleles" % (
                    self.program_name,
                    self.supported_alleles_flag))
        else:
            additional_message = ""
        raise UnsupportedAllele(str(e) + additional_message)
def _create_cached_db(
        db_path,
        tables,
        version=1):
    """
    Either create or retrieve sqlite database.

    Parameters
    ----------
    db_path : str
        Path to sqlite3 database file

    tables : list
        List of datacache.DatabaseTable objects

    version : int, optional
        Version acceptable as cached data.

    Returns sqlite3 connection
    """
    require_string(db_path, "db_path")
    require_iterable_of(tables, DatabaseTable)
    require_integer(version, "version")

    # if the database file doesn't already exist and we encounter an error
    # later, delete the file before raising an exception
    delete_on_error = not exists(db_path)

    # if the database already exists, contains all the table
    # names and has the right version, then just return it
    db = Database(db_path)

    # make sure to delete the database file in case anything goes wrong
    # to avoid leaving behind an empty DB
    table_names = [table.name for table in tables]
    try:
        if db.has_tables(table_names) and \
                db.has_version() and \
                db.version() == version:
            logger.info("Found existing table in database %s", db_path)
        else:
            if len(db.table_names()) > 0:
                logger.info("Dropping tables from database %s: %s",
                            db_path,
                            ", ".join(db.table_names()))
                db.drop_all_tables()
            logger.info(
                "Creating database %s containing: %s",
                db_path,
                ", ".join(table_names))
            db.create(tables, version)
    except:
        logger.warning(
            "Failed to create tables %s in database %s",
            table_names,
            db_path)
        db.close()
        if delete_on_error:
            remove(db_path)
        raise
    return db.connection
def load_vcf_fast(
        path,
        genome=None,
        reference_vcf_key="reference",
        only_passing=True,
        allow_extended_nucleotides=False,
        include_info=True,
        chunk_size=10**5,
        max_variants=None):
    """
    Load reference name and Variant objects from the given VCF filename.

    This is an experimental faster implementation of `load_vcf`. It is
    typically about 2X faster, and with `include_info=False`, about 4X faster.
    If most of the records in the VCF have failed filters (and
    only_passing=True), this function can be orders of magnitude faster
    than `load_vcf`.

    Currently only local files are supported by this function (no http).
    If you call this on an HTTP URL, it will fall back to `load_vcf`.

    Parameters
    ----------
    path : str
        Path to VCF (*.vcf) or compressed VCF (*.vcf.gz).

    genome : {pyensembl.Genome, reference name, Ensembl version int}, optional
        Optionally pass in a PyEnsembl Genome object, name of reference, or
        PyEnsembl release version to specify the reference associated with
        a VCF (otherwise infer reference from VCF using reference_vcf_key)

    reference_vcf_key : str, optional
        Name of metadata field which contains path to reference FASTA
        file (default = 'reference')

    only_passing : boolean, optional
        If true, any entries whose FILTER field is not one of "." or "PASS"
        are dropped.

    allow_extended_nucleotides : boolean, default False
        Allow characters other than A,C,T,G in the ref and alt strings.

    include_info : boolean, default True
        Whether to parse the info column. If you don't need that column,
        set to False for faster parsing.

    chunk_size : int, optional
        Number of records to load in memory at once.

    max_variants : int, optional
        If specified, return only the first max_variants variants.
    """
    require_string(path, "Path or URL to VCF")
    parsed_path = parse_url_or_path(path)
    if parsed_path.scheme and parsed_path.scheme.lower() != "file":
        # pandas.read_table nominally supports HTTP, but it tends to crash on
        # large files and does not support gzip. Switching to the python-based
        # implementation of read_table (with engine="python") helps with some
        # issues but introduces a new set of problems (e.g. the dtype
        # parameter is not accepted). For these reasons, we're currently not
        # attempting to load VCFs over HTTP with pandas, and fall back to the
        # pyvcf implementation here.
        return load_vcf(
            path,
            genome=genome,
            reference_vcf_key=reference_vcf_key,
            only_passing=only_passing,
            allow_extended_nucleotides=allow_extended_nucleotides,
            max_variants=max_variants)

    # Loading a local file.
    # The file will be opened twice: first to parse the header with pyvcf,
    # then by pandas to read the data.

    # PyVCF reads the metadata immediately and stops at the first line with
    # data. We can close the file after that.
    handle = PyVCFReaderFromPathOrURL(path)
    handle.close()
    genome = infer_genome_from_vcf(
        genome,
        handle.vcf_reader,
        reference_vcf_key)

    df_iterator = read_vcf_into_dataframe(
        path,
        include_info=include_info,
        chunk_size=chunk_size)

    return dataframes_to_variant_collection(
        df_iterator,
        info_parser=handle.vcf_reader._parse_info if include_info else None,
        only_passing=only_passing,
        max_variants=max_variants,
        variant_kwargs={
            'ensembl': genome,
            'allow_extended_nucleotides': allow_extended_nucleotides},
        variant_collection_kwargs={"path": path})