Esempio n. 1
0
def cli(directory, resource, species, ver, verbose):
    """
    Creates a new ensimpl snps database <filename> using Ensembl <version> and species <species>.
    """
    configure_logging(verbose)
    LOG = get_logger()

    # Empty or missing selections collapse to None; anything else is
    # materialised as a list before being handed to the creator.
    ensembl_versions = list(ver) if ver else None
    ensembl_species = list(species) if species else None

    LOG.info("Creating database...")

    started = time.time()
    create_ensimpl_snps.create(
        ensembl_versions,
        ensembl_species,
        directory,
        resource,
    )
    finished = time.time()

    elapsed = format_time(started, finished)
    LOG.info("Creation time: {}".format(elapsed))
Esempio n. 2
0
def finalize(db, ref):
    """Finalize the database.

    Stores the reference's ``version``, ``assembly`` and ``assembly_patch``
    in ``meta_info``, creates every index listed in ``SQL_INDICES``, then
    runs the ``SQL_SELECT_CHECKS`` and ``SQL_SELECT_FINAL_INFO`` queries and
    logs their results.  All changes are committed at the end; the
    connection is closed even when a statement fails.

    Args:
        db (str): Name of the database file.

        ref (:obj:`ensimpl_snps.create.create_ensimpl.EnsemblReference`):
            Contains information about the Ensembl reference.
    """
    start = time.time()
    conn = sqlite3.connect(db)

    try:
        LOG.info("Finalizing database....")

        sql_meta_insert = 'INSERT INTO meta_info VALUES (null, ?, ?, ?)'

        meta_data = [
            ('version', ref.version, ref.species_id),
            ('assembly', ref.assembly, ref.species_id),
            ('assembly_patch', ref.assembly_patch, ref.species_id),
        ]

        cursor = conn.cursor()
        cursor.executemany(sql_meta_insert, meta_data)

        LOG.info('Creating indices...')
        for sql in SQL_INDICES:
            LOG.debug(sql)
            cursor.execute(sql)

        cursor.close()

        # Cursors created from here on yield sqlite3.Row objects.
        conn.row_factory = sqlite3.Row

        LOG.info('Checking...')
        for sql in SQL_SELECT_CHECKS:
            LOG.debug(sql)
            cursor = conn.cursor()

            # Only the first offending row of each check is reported.
            row = cursor.execute(sql).fetchone()
            if row is not None:
                LOG.info('**** WARNING ****')
                LOG.info(utils.dictify_row(cursor, row))

            cursor.close()

        LOG.info('Information')
        for sql in SQL_SELECT_FINAL_INFO:
            LOG.debug(sql)
            cursor = conn.cursor()

            for row in cursor.execute(sql):
                LOG.info('{}\t{}\t{}'.format(row[0], row[1], row[2]))

            cursor.close()

        conn.commit()
    finally:
        # Release the database handle on both the success and error paths.
        conn.close()

    LOG.info("Finalizing complete: {0}".format(
        utils.format_time(start, time.time())))
Esempio n. 3
0
def by_region(region, version, species, limit=None):
    """Perform the search by region.

    Args:
        region (str): The region to look for SNPs.
        version (int): The Ensembl version number.
        species (str): The Ensembl species identifier.
        limit (int, optional): Maximum number of SNPs to return, ``None``
            (or a negative value) for all.

    Returns:
        list: The SNPs in `region`, at most `limit` of them.  Each element
        is another ``list`` holding the first five columns of the matching
        tabix row:
            * chromosome
            * position
            * SNP identifier
            * reference allele
            * alternate allele

        ``None`` is returned when the lookup fails for any reason
        (including an empty `region`); the error is logged, not raised.
    """
    LOG = utils.get_logger()

    # ``sqlite3.version``/``sqlite3.version_info`` are deprecated and removed
    # in Python 3.14; log the SQLite library version instead.
    LOG.debug(sqlite3.sqlite_version_info)
    LOG.debug(sqlite3.sqlite_version)

    LOG.debug('range={}'.format(region))
    LOG.debug('version={}'.format(version))
    LOG.debug('species_id={}'.format(species))
    LOG.debug('limit={}'.format(limit))

    try:

        if not region:
            raise ValueError('no region was passed in')

        new_region = fetch_utils.str_to_region(region)

        # A negative limit means "no limit", same as None.
        max_rows = limit if limit is not None and limit >= 0 else None

        tabix_file = fetch_utils.get_tabix_file(version, species)
        tbx = pysam.TabixFile(tabix_file)

        try:
            start_time = time.time()

            snps = []
            for row in tbx.fetch('{}'.format(new_region.chromosome),
                                 new_region.start_position,
                                 new_region.end_position,
                                 parser=pysam.asTuple()):
                # Checked before appending so that limit=0 yields no rows.
                if max_rows is not None and len(snps) >= max_rows:
                    break
                snps.append(list(row[:5]))
        finally:
            # Always release the tabix file handle.
            tbx.close()

        LOG.info('Done: {}'.format(utils.format_time(start_time, time.time())))

        return snps
    except Exception as e:
        LOG.error('Error: {}'.format(e))
        return None
def parse_config(resource_name):
    """Open a resource (file name, url) and parse it into Ensembl references.

    The first line of the resource is skipped.  Every following line is
    decoded as UTF-8, split on tabs and passed to :obj:`EnsemblReference`;
    a line with exactly ten fields gets a trailing ``None`` appended first.

    Args:
        resource_name (str): the name of the resource

    Returns:
        dict: keyed by Ensembl version; each value is a ``dict`` mapping a
            species id to its :obj:`EnsemblReference`.  ``None`` when the
            resource cannot be accessed (``IOError``) or parsed
            (``TypeError``).
    """
    started = time.time()
    releases = {}
    current = ''  # last line seen, reported if parsing fails

    try:
        with utils.open_resource(resource_name) as fd:
            reader = io.BufferedReader(fd)
            current = reader.readline()  # skip first line

            # one EnsemblReference per remaining line
            for current in reader:
                current = str(current, 'utf-8')
                fields = current.strip().split('\t')
                if len(fields) == 10:
                    fields.append(None)
                reference = EnsemblReference(*fields)
                by_species = releases.setdefault(reference.version, {})
                by_species[reference.species_id] = reference

        elapsed = utils.format_time(started, time.time())
        LOG.info('Config parsed in {}'.format(elapsed))

    except IOError as io_error:
        LOG.error('Unable to access resource: {}'.format(resource_name))
        LOG.debug(io_error)
        return None
    except TypeError as type_error:
        LOG.error('Unable to parse resource: {}'.format(resource_name))
        LOG.debug(type_error)
        LOG.debug('Error on the following:')
        LOG.debug(current)
        return None

    return releases
Esempio n. 5
0
def insert_snps(db, snps):
    """Insert snps into the database.

    All rows are inserted with a single ``executemany`` call and committed
    together.  The connection is closed even when the insert fails, in
    which case nothing is committed and the exception propagates.

    Args:
        db (str): Name of the database file.
        snps (list): A ``list`` of snps; each element supplies the five
            values of one row of the ``snps`` table.
    """
    LOG.info('Inserting snps into database: {}'.format(db))

    start = time.time()
    conn = sqlite3.connect(db)

    sql_snps_insert = 'INSERT INTO snps VALUES (?, ?, ?, ?, ?)'

    try:
        cursor = conn.cursor()
        LOG.debug('Inserting {:,} snps...'.format(len(snps)))
        cursor.executemany(sql_snps_insert, snps)
        cursor.close()
        conn.commit()
    finally:
        # Release the database handle on both the success and error paths.
        conn.close()

    LOG.info('SNPs inserted in: {}'.format(
        utils.format_time(start, time.time())))
Esempio n. 6
0
def initialize(db):
    """Initialize the ensimpl_snps database.

    Executes every statement in ``SQL_CREATE_TABLES`` against `db` and
    commits.  The connection is closed even when a statement fails, in
    which case nothing is committed and the exception propagates.

    Args:
        db (str): Full path to the database file.
    """
    LOG.info('Initializing database: {}'.format(db))

    start = time.time()
    conn = sqlite3.connect(db)

    try:
        cursor = conn.cursor()

        LOG.info('Generating tables...')
        for sql in SQL_CREATE_TABLES:
            LOG.debug(sql)
            cursor.execute(sql)

        cursor.close()
        conn.commit()
    finally:
        # Release the database handle on both the success and error paths.
        conn.close()

    LOG.info('Database initialized in: {}'.format(
        utils.format_time(start, time.time())))
Esempio n. 7
0
def cli(term, ver, exact, display, max, species, verbose):
    """
    Search ensimpl database <filename> for <term>
    """
    configure_logging(verbose)
    LOG = get_logger()
    LOG.info("Search database...")

    # A negative maximum is passed to the search as "no limit".
    if max >= 0:
        maximum = max
    else:
        maximum = None

    try:
        tstart = time.time()
        result = search_ensimpl.search(term, ver, species, exact, maximum)
        tend = time.time()

        LOG.debug("Num Results: {}".format(result.num_results))

        if len(result.matches) == 0:
            print("No results found")
            sys.exit()

        headers = [
            "ID", "SYMBOL", "IDS", "POSITION", "MATCH_REASON", "MATCH_VALUE"
        ]
        delimited = display in ('tab', 'csv')
        as_json = display == 'json'
        rows = []  # collected only for the json and table outputs

        if delimited:
            delim = '\t' if display == 'tab' else ','
            print(delim.join(headers))

        for shown, match in enumerate(result.matches, start=1):
            fields = [match.ensembl_gene_id, match.symbol]

            # External ids are rendered as "db/db_id" joined by "||".
            if match.external_ids:
                fields.append('||'.join(
                    '{}/{}'.format(ext['db'], ext['db_id'])
                    for ext in match.external_ids))
            else:
                fields.append('')

            fields.append("{}:{}-{}".format(match.chromosome,
                                            match.position_start,
                                            match.position_end))
            fields.append(match.match_reason)
            fields.append(match.match_value)

            # Delimited output is streamed row by row; the others are
            # accumulated and printed once after the loop.
            if delimited:
                print(delim.join(map(str, fields)))
            elif as_json:
                rows.append(dict(zip(headers, fields)))
            else:
                rows.append(fields)

            # Stop once a positive maximum has been reached.
            if 0 < max <= shown:
                break

        if as_json:
            print(json.dumps({'data': rows}, indent=4))
        elif not delimited:
            print(tabulate(rows, headers))

        LOG.info("Search time: {}".format(format_time(tstart, tend)))

    except Exception as e:
        LOG.error('Error: {}'.format(e))
Esempio n. 8
0
def by_ids(ids, version, species):
    """Perform the search for ids.

    The ids are loaded into a temporary table which is then matched
    against ``snps.snp_id``; results are ordered by chromosome and
    position.

    Args:
        ids (list): A ``list`` of ids to look for.
        version (int): The Ensembl version.
        species (str): The Ensembl species identifier.

    Returns:
        dict: A ``dict`` with the keys ``snps`` (a ``list`` of
        ``[chrom, pos, snp_id, ref, alt]`` lists) and ``snps_not_found``
        (the requested ids with no matching row).

        ``None`` is returned when the lookup fails for any reason
        (including empty `ids`); the error is logged, not raised.
    """
    LOG = utils.get_logger()
    # Log at most the first ten ids to keep the debug output readable.
    LOG.debug('ids={} ...'.format(ids[:10] if ids else ids))
    LOG.debug('version={}'.format(version))
    LOG.debug('species={}'.format(species))

    conn = None

    try:
        # Validate before connecting so nothing is opened for bad input.
        if not ids:
            raise ValueError('no ids were passed in')

        conn = fetch_utils.connect_to_database(version, species)
        conn.row_factory = sqlite3.Row
        cursor = conn.cursor()

        temp_table = 'lookup_ids_{}'.format(utils.create_random_string())

        # create a temp table and insert into
        SQL_TEMP = ('CREATE TEMPORARY TABLE {} ( '
                    'query_id TEXT, '
                    'PRIMARY KEY (query_id) '
                    ');').format(temp_table)

        cursor.execute(SQL_TEMP)

        SQL_TEMP = 'INSERT INTO {} VALUES (?);'.format(temp_table)
        query_ids = [(query_id, ) for query_id in ids]
        cursor.executemany(SQL_TEMP, query_ids)

        SQL_QUERY = ('SELECT s.* '
                     '  FROM snps s '
                     ' WHERE s.snp_id IN (SELECT distinct query_id FROM {}) '
                     ' ORDER BY s.chrom, s.pos').format(temp_table)

        LOG.info('Query: {}'.format(SQL_QUERY))

        start_time = time.time()

        snps = [
            [row['chrom'], row['pos'], row['snp_id'], row['ref'], row['alt']]
            for row in cursor.execute(SQL_QUERY)
        ]

        LOG.info('Done: {}'.format(utils.format_time(start_time, time.time())))

        cursor.close()

        # The snp id is the third column of every result row.
        snps_found = {snp[2] for snp in snps}
        snps_not_found = [x for x in ids if x not in snps_found]

        return {'snps': snps, 'snps_not_found': snps_not_found}

    except Exception as e:
        LOG.error('Error: {}'.format(e))
        return None

    finally:
        # Close the connection on every path.
        if conn is not None:
            conn.close()