def column_values_at_locus(
        self,
        column_name,
        feature,
        contig,
        position,
        end=None,
        strand=None,
        distinct=False,
        sorted=False):
    """
    Get the non-null values of a column from the database
    at a particular range of loci
    """
    # TODO: combine with the query method, since they overlap
    # significantly
    require_string(column_name, "column_name", nonempty=True)
    contig = normalize_chromosome(contig)
    require_integer(position, "position")
    if end is None:
        end = position
    require_integer(end, "end")

    if not self.column_exists(feature, column_name):
        raise ValueError("Table %s doesn't have column %s" % (
            feature,
            column_name))

    if distinct:
        distinct_string = "DISTINCT "
    else:
        distinct_string = ""

    query = """
        SELECT %s%s
        FROM %s
        WHERE seqname = ?
        AND start <= ?
        AND end >= ?
    """ % (distinct_string, column_name, feature)

    query_params = [contig, end, position]

    if strand:
        query += " AND strand = ?"
        query_params.append(strand)

    tuples = self.connection.execute(query, query_params).fetchall()

    # each result is a tuple, so pull out its first element
    results = [t[0] for t in tuples if t[0] is not None]

    if sorted:
        results.sort()
    return results
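# Usage sketch (not from the original source): `db` is assumed to be an
# instance of the class defining column_values_at_locus, connected to an
# annotation database with a "gene" table; column and feature names below
# are illustrative. The query returns values for any feature overlapping
# the [position, end] range.
gene_names = db.column_values_at_locus(
    column_name="gene_name",
    feature="gene",
    contig="1",
    position=1000000,
    end=2000000,
    strand="+",
    distinct=True,
    sorted=True)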
def test_require_integer():
    require_integer(0)
    require_integer(10)
    require_integer(-10)
    with assert_raises(TypeError):
        require_integer("")
    with assert_raises(TypeError):
        require_integer(None)
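# Usage sketch (not from the original source): the optional second argument
# seen elsewhere in this file (e.g. require_integer(version, "version")) is
# assumed to be a label for the value being checked, used when reporting
# the error.
require_integer(42)                # passes silently
try:
    require_integer("42", "count")
except TypeError:
    pass                           # strings and None are rejected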
def from_interbase_coordinates(contig, start, end=None):
    '''
    Given coordinates in 0-based interbase coordinates,
    return a Locus instance.
    '''
    typechecks.require_string(contig)
    typechecks.require_integer(start)
    if end is None:
        end = start + 1
    typechecks.require_integer(end)
    return Locus(contig, start, end)
def from_interbase_coordinates(contig, start, end=None):
    '''
    Given coordinates in 0-based interbase coordinates,
    return a Locus instance.
    '''
    typechecks.require_string(contig)
    typechecks.require_integer(start)
    if end is None:
        end = start + 1
    typechecks.require_integer(end)
    contig = pyensembl.locus.normalize_chromosome(contig)
    return Locus(contig, start, end)
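# Worked example (not from the original source): interbase coordinates are
# 0-based with an exclusive end, so the single base at 1-based position 10
# is the interbase interval [9, 10). The second variant above additionally
# normalizes the contig name (e.g. "chr1" -> "1") via pyensembl.
locus = from_interbase_coordinates("chr1", 9)       # end defaults to 9 + 1 = 10
locus = from_interbase_coordinates("chr1", 9, 12)   # covers three bases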
def from_inclusive_coordinates(contig, start, end=None):
    '''
    Given coordinates in 1-based coordinates that are inclusive on start
    and end, return a Locus instance. Locus instances are always 0-based
    "interbase" coordinates.
    '''
    typechecks.require_string(contig)
    typechecks.require_integer(start)
    if end is None:
        end = start
    typechecks.require_integer(end)
    return Locus(contig, start - 1, end)
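# Worked example (not from the original source): the 1-based inclusive
# interval [10, 12] maps to interbase [9, 12); only `start` shifts down by
# one, because the interbase end is already exclusive.
locus = from_inclusive_coordinates("chr1", 10, 12)  # Locus("chr1", 9, 12)
locus = from_inclusive_coordinates("chr1", 10)      # single base: Locus("chr1", 9, 10)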
def _create_cached_db(db_path, tables, version=1):
    """
    Either create or retrieve sqlite database.

    Parameters
    ----------
    db_path : str
        Path to sqlite3 database file

    tables : list of DatabaseTable
        Collection of datacache.DatabaseTable objects

    version : int, optional
        Version acceptable as cached data.

    Returns sqlite3 connection
    """
    require_string(db_path, "db_path")
    require_iterable_of(tables, DatabaseTable)
    require_integer(version, "version")

    # if the database file doesn't already exist and we encounter an error
    # later, delete the file before raising an exception
    delete_on_error = not exists(db_path)

    # if the database already exists, contains all the table
    # names and has the right version, then just return it
    db = Database(db_path)

    # make sure to delete the database file in case anything goes wrong
    # to avoid leaving behind an empty DB
    table_names = [table.name for table in tables]
    try:
        if db.has_tables(table_names) and \
                db.has_version() and \
                db.version() == version:
            logger.info("Found existing table in database %s", db_path)
        else:
            if len(db.table_names()) > 0:
                logger.info("Dropping tables from database %s: %s",
                    db_path,
                    ", ".join(db.table_names()))
                db.drop_all_tables()
            logger.info("Creating database %s containing: %s",
                db_path,
                ", ".join(table_names))
            db.create(tables, version)
    except:
        logger.warning("Failed to create tables %s in database %s",
            table_names, db_path)
        db.close()
        if delete_on_error:
            remove(db_path)
        raise
    return db.connection
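# Usage sketch (not from the original source): `tables` is assumed to be a
# list of datacache.DatabaseTable objects built elsewhere; the path and the
# "gene" table name are hypothetical. On a second call with the same path
# and version, the existing database is returned instead of being rebuilt.
connection = _create_cached_db(
    "/tmp/ensembl_genes.db",
    tables=tables,
    version=2)
row_count = connection.execute("SELECT COUNT(*) FROM gene").fetchone()[0]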
def from_inclusive_coordinates(contig, start, end=None):
    '''
    Given coordinates in 1-based coordinates that are inclusive on start
    and end, return a Locus instance. Locus instances are always 0-based
    "interbase" coordinates.
    '''
    typechecks.require_string(contig)
    typechecks.require_integer(start)
    if end is None:
        end = start
    typechecks.require_integer(end)
    contig = pyensembl.locus.normalize_chromosome(contig)
    return Locus(contig, start - 1, end)
def _finalize_database(self, version):
    """
    Create metadata table for database with version number.

    Parameters
    ----------
    version : int
        Tag created database with user-specified version number
    """
    require_integer(version, "version")
    create_metadata_sql = \
        "CREATE TABLE %s (version INT)" % METADATA_TABLE_NAME
    self.execute_sql(create_metadata_sql)
    insert_version_sql = \
        "INSERT INTO %s VALUES (%s)" % (METADATA_TABLE_NAME, version)
    self.execute_sql(insert_version_sql)
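# Worked example (not from the original source): with version=3 and
# METADATA_TABLE_NAME set to "_metadata" (a hypothetical value; the real
# constant lives elsewhere in the module), the two statements executed
# above would be:
#
#   CREATE TABLE _metadata (version INT)
#   INSERT INTO _metadata VALUES (3)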
def check_padding_around_mutation(given_padding, epitope_lengths):
    """
    If user doesn't provide any padding around the mutation we need
    to at least include enough of the surrounding non-mutated
    residues to construct candidate epitopes of the specified lengths.
    """
    min_required_padding = max(epitope_lengths) - 1
    if not given_padding:
        return min_required_padding
    else:
        require_integer(given_padding, "Padding around mutation")
        if given_padding < min_required_padding:
            raise ValueError(
                "Padding around mutation %d cannot be less than %d "
                "for epitope lengths %s" % (
                    given_padding,
                    min_required_padding,
                    epitope_lengths))
        return given_padding
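# Worked example (not from the original source): for epitope lengths
# [8, 9, 10], min_required_padding is 10 - 1 = 9, since a 10-mer epitope
# overlapping the mutated residue needs up to 9 flanking residues per side.
check_padding_around_mutation(None, [8, 9, 10])  # returns the default, 9
check_padding_around_mutation(12, [8, 9, 10])    # returns 12, valid
check_padding_around_mutation(5, [8, 9, 10])     # raises ValueError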
def __init__(
        self,
        program_name,
        alleles,
        epitope_lengths,
        parse_output_fn,
        supported_alleles_flag,
        input_fasta_flag,
        length_flag,
        allele_flag,
        tempdir_flag=None,
        extra_flags=[],
        max_file_records=None,
        process_limit=0):
    """
    Parameters
    ----------
    program_name : str
        Name of prediction program to run
        (e.g. "netMHCcons" or "netMHCIIpan")

    alleles : list of str
        MHC alleles

    epitope_lengths : list of int

    supported_alleles_flag : str
        Flag to pass to the predictor to get a list of supported alleles
        (e.g. "-A", "-list", "-listMHC")

    parse_output_fn : fn
        Takes the stdout string from the predictor and returns a collection
        of BindingPrediction objects

    input_fasta_flag : str
        How to specify the input FASTA file of source sequences (e.g. "-f")

    length_flag : str
        How to specify the desired predicted epitope length (e.g. "-length")

    allele_flag : str
        How to specify the allele we want predictions for (e.g. "-a")

    tempdir_flag : str, optional
        How to specify the predictor's temporary directory (e.g. "-tdir")

    extra_flags : list of str
        Extra flags to pass to the predictor

    max_file_records : int, optional
        Maximum number of sequences per input FASTA file

    process_limit : int, optional
        Maximum number of parallel processes to start
    """
    require_string(program_name, "Predictor program name")
    self.program_name = program_name

    if supported_alleles_flag is not None:
        require_string(supported_alleles_flag, "Supported alleles flag")
    self.supported_alleles_flag = supported_alleles_flag

    require_string(input_fasta_flag, "Input FASTA file flag")
    self.input_fasta_flag = input_fasta_flag

    require_string(allele_flag, "Allele flag")
    self.allele_flag = allele_flag

    require_string(length_flag, "Peptide length flag")
    self.length_flag = length_flag

    if tempdir_flag is not None:
        require_string(tempdir_flag, "Temporary directory flag")
    self.tempdir_flag = tempdir_flag

    self.extra_flags = extra_flags

    if max_file_records is not None:
        require_integer(
            max_file_records,
            "Maximum number of sequences per input files")
    self.max_file_records = max_file_records

    require_integer(process_limit, "Maximum number of processes")
    self.process_limit = process_limit

    self.parse_output_fn = parse_output_fn

    if self.supported_alleles_flag:
        valid_alleles = self._determine_supported_alleles(
            self.program_name,
            self.supported_alleles_flag)
    else:
        # if we're not running the tool to determine supported alleles
        # then at least try running it by itself to determine if it's
        # present
        try:
            run_command([self.program_name])
        except:
            raise SystemError("Failed to run %s" % self.program_name)
        valid_alleles = None

    try:
        BasePredictor.__init__(
            self,
            alleles,
            epitope_lengths,
            valid_alleles=valid_alleles)
    except UnsupportedAllele as e:
        if self.supported_alleles_flag:
            additional_message = (
                "\nRun command %s %s to see a list of valid alleles" % (
                    self.program_name,
                    self.supported_alleles_flag))
        else:
            additional_message = ""
        raise UnsupportedAllele(str(e) + additional_message)
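# Instantiation sketch (not from the original source): `ExamplePredictor`
# stands in for whatever class defines this __init__, `parse_stdout` for a
# real parsing function returning BindingPrediction objects, and the flag
# values are hypothetical.
predictor = ExamplePredictor(
    program_name="netMHCcons",
    alleles=["HLA-A*02:01"],
    epitope_lengths=[9],
    parse_output_fn=parse_stdout,
    supported_alleles_flag="-listMHC",
    input_fasta_flag="-f",
    length_flag="-length",
    allele_flag="-a")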
def __init__(
        self,
        program_name,
        alleles,
        parse_output_fn,
        supported_alleles_flag,
        input_file_flag,
        length_flag,
        allele_flag,
        peptide_mode_flags=["-p"],
        tempdir_flag=None,
        extra_flags=[],
        max_peptides_per_file=10**4,
        process_limit=-1,
        default_peptide_lengths=[9],
        group_peptides_by_length=False,
        min_peptide_length=8,
        max_peptide_length=None):
    """
    Parameters
    ----------
    program_name : str
        Name of prediction program to run
        (e.g. "netMHCcons" or "netMHCIIpan")

    alleles : list of str
        MHC alleles

    supported_alleles_flag : str
        Flag to pass to the predictor to get a list of supported alleles
        (e.g. "-A", "-list", "-listMHC")

    parse_output_fn : fn
        Takes the stdout string from the predictor and returns a collection
        of BindingPrediction objects

    input_file_flag : str
        How to specify the input FASTA file of source sequences (e.g. "-f")

    length_flag : str
        How to specify the desired predicted peptide length (e.g. "-length")

    allele_flag : str
        How to specify the allele we want predictions for (e.g. "-a")

    peptide_mode_flags : list of str
        How to switch from the default FASTA subsequences input mode to
        where peptides are explicitly given one per line of a text file.

    tempdir_flag : str, optional
        How to specify the predictor's temporary directory (e.g. "-tdir")

    extra_flags : list of str
        Extra flags to pass to the predictor

    max_peptides_per_file : int, optional
        Maximum number of lines per file when predicting peptides directly.

    process_limit : int, optional
        Maximum number of parallel processes to start
        (0 for no limit, -1 for use all available processors)

    default_peptide_lengths : list of int, optional
        When making predictions across subsequences of protein sequences,
        what peptide lengths to predict for.

    group_peptides_by_length : bool
        Run commandline predictor on groups of peptides of equal length

    min_peptide_length : int
        Shortest peptide this predictor can handle

    max_peptide_length : int
        Longest peptide this predictor can handle
    """
    require_string(program_name, "Predictor program name")
    self.program_name = program_name

    if supported_alleles_flag is not None:
        require_string(supported_alleles_flag, "Supported alleles flag")
    self.supported_alleles_flag = supported_alleles_flag

    require_string(input_file_flag, "Input file flag")
    self.input_file_flag = input_file_flag

    require_string(length_flag, "Peptide length flag")
    self.length_flag = length_flag

    require_string(allele_flag, "Allele flag")
    self.allele_flag = allele_flag

    require_iterable_of(peptide_mode_flags, string_types)
    self.peptide_mode_flags = peptide_mode_flags

    if tempdir_flag is not None:
        require_string(tempdir_flag, "Temporary directory flag")
    self.tempdir_flag = tempdir_flag

    require_iterable_of(extra_flags, string_types)
    self.extra_flags = extra_flags

    require_integer(
        max_peptides_per_file,
        "Maximum number of lines in a peptides input file")
    self.max_peptides_per_file = max_peptides_per_file

    require_integer(process_limit, "Maximum number of processes")
    self.process_limit = process_limit

    self.parse_output_fn = parse_output_fn

    if isinstance(default_peptide_lengths, int):
        default_peptide_lengths = [default_peptide_lengths]

    self.group_peptides_by_length = group_peptides_by_length

    if self.supported_alleles_flag:
        valid_alleles = self._determine_supported_alleles(
            self.program_name,
            self.supported_alleles_flag)
    else:
        # if we're not running the tool to determine supported alleles
        # then at least try running it by itself to determine if it's
        # present
        try:
            run_command([self.program_name])
        except:
            raise SystemError("Failed to run %s" % self.program_name)
        valid_alleles = None

    try:
        BasePredictor.__init__(
            self,
            alleles=alleles,
            valid_alleles=valid_alleles,
            default_peptide_lengths=default_peptide_lengths,
            min_peptide_length=min_peptide_length,
            max_peptide_length=max_peptide_length)
    except UnsupportedAllele as e:
        if self.supported_alleles_flag:
            additional_message = (
                "\nRun command %s %s to see a list of valid alleles" % (
                    self.program_name,
                    self.supported_alleles_flag))
        else:
            additional_message = ""
        raise UnsupportedAllele(str(e) + additional_message)
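# Instantiation sketch for the newer signature above (not from the original
# source): default_peptide_lengths and peptide_mode_flags replace the old
# epitope_lengths parameter; class name, parser, and flag values below are
# hypothetical.
predictor = ExamplePredictor(
    program_name="netMHCpan",
    alleles=["HLA-A*02:01"],
    parse_output_fn=parse_stdout,
    supported_alleles_flag="-listMHC",
    input_file_flag="-f",
    length_flag="-l",
    allele_flag="-a",
    default_peptide_lengths=[8, 9, 10])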