def cli(directory, resource, species, ver, verbose):
    """
    Creates a new ensimpl snps database <filename> using Ensembl <version> and species <species>.
    """
    # NOTE(review): presumably registered as a CLI command whose docstring
    # doubles as the help text, so the docstring wording is left as-is.
    configure_logging(verbose)
    LOG = get_logger()

    # Unset/empty selections are forwarded as None; anything else is
    # materialized into a list before being handed to the builder.
    ensembl_versions = list(ver) if ver else None
    ensembl_species = list(species) if species else None

    LOG.info("Creating database...")

    started = time.time()
    create_ensimpl_snps.create(ensembl_versions, ensembl_species,
                               directory, resource)
    finished = time.time()

    LOG.info("Creation time: {}".format(format_time(started, finished)))
def finalize(db, ref):
    """Finalize the database.

    Inserts the reference metadata into ``meta_info``, creates the indices
    listed in ``SQL_INDICES``, runs every query in ``SQL_SELECT_CHECKS``
    (logging the first row returned by each one as a warning) and logs the
    rows returned by the ``SQL_SELECT_FINAL_INFO`` queries, then commits.

    The connection is closed even when one of the statements fails; in that
    case the final commit is skipped and the exception propagates to the
    caller.

    Args:
        db (str): Name of the database file.
        ref (:obj:`ensimpl_snps.create.create_ensimpl.EnsemblReference`):
            Contains information about the Ensembl reference. Its
            ``version``, ``assembly``, ``assembly_patch`` and ``species_id``
            attributes are stored in ``meta_info``.
    """
    start = time.time()
    conn = sqlite3.connect(db)

    try:
        cursor = conn.cursor()

        LOG.info("Finalizing database....")

        sql_meta_insert = 'INSERT INTO meta_info VALUES (null, ?, ?, ?)'
        meta_data = [
            ('version', ref.version, ref.species_id),
            ('assembly', ref.assembly, ref.species_id),
            ('assembly_patch', ref.assembly_patch, ref.species_id),
        ]
        cursor.executemany(sql_meta_insert, meta_data)

        LOG.info('Creating indices...')

        for sql in SQL_INDICES:
            LOG.debug(sql)
            cursor.execute(sql)

        # This cursor is not used again; the loops below open their own.
        cursor.close()

        # Cursors created from here on yield sqlite3.Row objects.
        conn.row_factory = sqlite3.Row

        LOG.info('Checking...')

        for sql in SQL_SELECT_CHECKS:
            LOG.debug(sql)
            cursor = conn.cursor()

            # Only the first row of each check query is reported.
            for row in cursor.execute(sql):
                LOG.info('**** WARNING ****')
                LOG.info(utils.dictify_row(cursor, row))
                break

            cursor.close()

        LOG.info('Information')

        for sql in SQL_SELECT_FINAL_INFO:
            LOG.debug(sql)
            cursor = conn.cursor()

            for row in cursor.execute(sql):
                LOG.info('{}\t{}\t{}'.format(row[0], row[1], row[2]))

            cursor.close()

        conn.commit()
    finally:
        # Release the database handle on success and on failure alike.
        conn.close()

    LOG.info("Finalizing complete: {0}".format(
        utils.format_time(start, time.time())))
def by_region(region, version, species, limit=None):
    """Perform the search by region.

    Args:
        region (str): The region to look for SNPs.
        version (int): The Ensembl version number.
        species (str): The Ensembl species identifier.
        limit (int, optional): Maximum number of SNPs to return. ``None``
            (or a non-positive value) returns all SNPs in the region.

    Returns:
        list: The SNPs in `region`, at most `limit` of them when a positive
        `limit` is given. Each element is another ``list`` with the first
        five columns of the tabix record:

        * chromosome
        * position
        * SNP identifier
        * reference allele
        * alternate allele

        ``None`` is returned when anything goes wrong (including an empty
        `region`); the error is logged rather than raised.
    """
    LOG = utils.get_logger()

    # sqlite3.version / sqlite3.version_info are deprecated and removed in
    # Python 3.14; log the version of the underlying SQLite library instead.
    LOG.debug(sqlite3.sqlite_version_info)
    LOG.debug(sqlite3.sqlite_version)
    LOG.debug('range={}'.format(region))
    LOG.debug('version={}'.format(version))
    LOG.debug('species_id={}'.format(species))
    LOG.debug('limit={}'.format(limit))

    try:
        if not region:
            raise ValueError('no region was passed in')

        new_region = fetch_utils.str_to_region(region)
        tabix_file = fetch_utils.get_tabix_file(version, species)

        # Only a positive limit truncates the result.
        capped = limit is not None and limit > 0

        start_time = time.time()
        snps = []

        tbx = pysam.TabixFile(tabix_file)
        try:
            for row in tbx.fetch('{}'.format(new_region.chromosome),
                                 new_region.start_position,
                                 new_region.end_position,
                                 parser=pysam.asTuple()):
                snps.append(list(row[:5]))

                if capped and len(snps) >= limit:
                    break
        finally:
            # Always release the tabix file handle.
            tbx.close()

        LOG.info('Done: {}'.format(utils.format_time(start_time,
                                                     time.time())))

        return snps
    except Exception as e:
        # Best-effort contract: callers receive None instead of an exception.
        LOG.error('Error: {}'.format(e))
        return None
def parse_config(resource_name):
    """Open a resource (file name, url) and parse it into Ensembl references.

    The first line of the resource is skipped. Every following line is
    decoded as UTF-8, split on tabs and passed positionally to
    :obj:`EnsemblReference`; lines with exactly 10 columns get a trailing
    ``None`` appended first.

    Args:
        resource_name (str): the name of the resource

    Returns:
        dict: :obj:`EnsemblReference` objects keyed first by Ensembl version
        and then by species id, or ``None`` when the resource cannot be
        accessed (``IOError``) or a line cannot be parsed (``TypeError``).
    """
    began = time.time()
    releases = {}
    # Kept outside the loop so the failing line can be logged on a parse error.
    line = ''

    try:
        with utils.open_resource(resource_name) as fd:
            reader = io.BufferedReader(fd)

            # skip first line
            line = reader.readline()

            # parse each line and create an EnsemblReference
            for line in reader:
                line = str(line, 'utf-8')
                fields = line.strip().split('\t')

                # Pad 10-column rows with None for the missing last column.
                if len(fields) == 10:
                    fields.append(None)

                reference = EnsemblReference(*fields)
                by_species = releases.setdefault(reference.version, {})
                by_species[reference.species_id] = reference

        elapsed = utils.format_time(began, time.time())
        LOG.info('Config parsed in {}'.format(elapsed))
    except IOError as io_error:
        LOG.error('Unable to access resource: {}'.format(resource_name))
        LOG.debug(io_error)
        return None
    except TypeError as type_error:
        LOG.error('Unable to parse resource: {}'.format(resource_name))
        LOG.debug(type_error)
        LOG.debug('Error on the following:')
        LOG.debug(line)
        return None

    return releases
def insert_snps(db, snps):
    """Insert snps into the database.

    All rows are inserted with a single ``executemany`` call and committed
    together. The connection is closed even when the insert fails; in that
    case nothing is committed and the exception propagates to the caller.

    Args:
        db (str): Name of the database file.
        snps (list): A ``list`` of snps; each element supplies the five
            values of one ``snps`` row.
    """
    LOG.info('Inserting snps into database: {}'.format(db))

    start = time.time()
    sql_snps_insert = ('INSERT INTO snps '
                       'VALUES (?, ?, ?, ?, ?)')

    conn = sqlite3.connect(db)
    try:
        cursor = conn.cursor()

        LOG.debug('Inserting {:,} snps...'.format(len(snps)))
        cursor.executemany(sql_snps_insert, snps)

        cursor.close()
        conn.commit()
    finally:
        # Release the database handle on success and on failure alike.
        conn.close()

    LOG.info('SNPs inserted in: {}'.format(
        utils.format_time(start, time.time())))
def initialize(db):
    """Initialize the ensimpl_snps database.

    Executes every statement in ``SQL_CREATE_TABLES`` and commits. The
    connection is closed even when a statement fails; in that case the
    final commit is skipped and the exception propagates to the caller.

    Args:
        db (str): Full path to the database file.
    """
    LOG.info('Initializing database: {}'.format(db))

    start = time.time()

    conn = sqlite3.connect(db)
    try:
        cursor = conn.cursor()

        LOG.info('Generating tables...')

        for sql in SQL_CREATE_TABLES:
            LOG.debug(sql)
            cursor.execute(sql)

        cursor.close()
        conn.commit()
    finally:
        # Release the database handle on success and on failure alike.
        conn.close()

    LOG.info('Database initialized in: {}'.format(
        utils.format_time(start, time.time())))
def cli(term, ver, exact, display, max, species, verbose):
    """
    Search ensimpl database <filename> for <term>
    """
    # NOTE(review): presumably registered as a CLI command whose docstring
    # doubles as the help text, so the docstring wording is left as-is.
    configure_logging(verbose)
    LOG = get_logger()
    LOG.info("Search database...")

    # A negative ``max`` is handed to the search as None.
    if max >= 0:
        maximum = max
    else:
        maximum = None

    try:
        tstart = time.time()
        result = search_ensimpl.search(term, ver, species, exact, maximum)
        tend = time.time()

        LOG.debug("Num Results: {}".format(result.num_results))

        if len(result.matches) == 0:
            print("No results found")
            sys.exit()

        headers = [
            "ID", "SYMBOL", "IDS", "POSITION", "MATCH_REASON", "MATCH_VALUE"
        ]

        # 'tab' and 'csv' stream rows as they are built; 'json' and the
        # default table layout collect rows and print once at the end.
        delimited = display in ('tab', 'csv')
        collected = []

        if delimited:
            delim = '\t' if display == 'tab' else ','
            print(delim.join(headers))

        for shown, match in enumerate(result.matches, start=1):
            fields = [match.ensembl_gene_id, match.symbol]

            if match.external_ids:
                fields.append('||'.join(
                    '{}/{}'.format(ids['db'], ids['db_id'])
                    for ids in match.external_ids))
            else:
                fields.append('')

            fields.append("{}:{}-{}".format(match.chromosome,
                                            match.position_start,
                                            match.position_end))
            fields.append(match.match_reason)
            fields.append(match.match_value)

            if delimited:
                print(delim.join(map(str, fields)))
            elif display == 'json':
                collected.append(dict(zip(headers, fields)))
            else:
                collected.append(fields)

            # Stop once ``max`` rows were emitted (only when ``max`` > 0).
            if shown >= max > 0:
                break

        if not delimited:
            if display == 'json':
                print(json.dumps({'data': collected}, indent=4))
            else:
                print(tabulate(collected, headers))

        LOG.info("Search time: {}".format(format_time(tstart, tend)))

    except Exception as e:
        LOG.error('Error: {}'.format(e))
def by_ids(ids, version, species):
    """Perform the search for ids.

    The ids are loaded into a temporary table which is then used to select
    the matching rows from ``snps``, ordered by chromosome and position.

    Args:
        ids (list): A ``list`` of ids to look for.
        version (int): The Ensembl version.
        species (str): The Ensembl species identifier.

    Returns:
        dict: A ``dict`` with the keys ``snps`` (a ``list`` of
        ``[chrom, pos, snp_id, ref, alt]`` lists) and ``snps_not_found``
        (the requested ids that matched no row, in input order).

        ``None`` is returned when anything goes wrong (including an empty
        `ids`); the error is logged rather than raised.
    """
    LOG = utils.get_logger()

    # Log only a preview of the ids (first 10), never the full list.
    LOG.debug('ids={} ...'.format(ids[:10] if ids else ids))
    LOG.debug('version={}'.format(version))
    LOG.debug('species={}'.format(species))

    conn = None

    try:
        # Validate before touching the database so no connection is opened
        # for a request that cannot succeed.
        if not ids:
            raise ValueError('no ids were passed in')

        conn = fetch_utils.connect_to_database(version, species)
        conn.row_factory = sqlite3.Row
        cursor = conn.cursor()

        temp_table = 'lookup_ids_{}'.format(utils.create_random_string())

        # create a temp table and insert into
        SQL_TEMP = ('CREATE TEMPORARY TABLE {} ( '
                    'query_id TEXT, '
                    'PRIMARY KEY (query_id) '
                    ');').format(temp_table)

        cursor.execute(SQL_TEMP)

        SQL_TEMP = 'INSERT INTO {} VALUES (?);'.format(temp_table)
        query_ids = [(_, ) for _ in ids]
        cursor.executemany(SQL_TEMP, query_ids)

        SQL_QUERY = ('SELECT s.* '
                     '  FROM snps s '
                     ' WHERE s.snp_id IN (SELECT distinct query_id FROM {}) '
                     ' ORDER BY s.chrom, s.pos').format(temp_table)

        LOG.info('Query: {}'.format(SQL_QUERY))

        start_time = time.time()

        snps = []
        snps_found = set()

        for row in cursor.execute(SQL_QUERY):
            snps.append([
                row['chrom'], row['pos'], row['snp_id'],
                row['ref'], row['alt']
            ])
            snps_found.add(row['snp_id'])

        LOG.info('Done: {}'.format(utils.format_time(start_time,
                                                     time.time())))

        cursor.close()

        snps_not_found = [x for x in ids if x not in snps_found]

        return {'snps': snps, 'snps_not_found': snps_not_found}
    except Exception as e:
        # Best-effort contract: callers receive None instead of an exception.
        LOG.error('Error: {}'.format(e))
        return None
    finally:
        # Close the connection on every path, including errors.
        if conn is not None:
            conn.close()