Ejemplo n.º 1
0
def _list_mapping_tables():
    """Find the MAP_* tables and pick one table per mapping name.

    Runs "SHOW TABLES", keeps the tables whose names start with
    "MAP_", and builds a dictionary from the mapping name to the table
    name.  The mapping name is the table name without the "MAP_"
    prefix and without a trailing "_<6 digits>" stamp, e.g.

        MAP_ENTREZ_ID_111223        ->  ENTREZ_ID
        MAP_GENBANK_111223          ->  GENBANK
        MAP_AFFY_HG_U133A_2___na32  ->  AFFY_HG_U133A_2___na32

    Tables are visited in sorted order, so if multiple versions of a
    table exist, the one that sorts last (the latest date) is kept.

    """
    import config
    import dblib

    rows = dblib.query("SHOW TABLES", config.gm_user, config.gm_passwd,
                       config.gm_db, config.gm_host, config.gm_port,
                       config.gm_socket)
    # The table name is the first column of each row.  Sorting orders
    # the tables by name, then by date.
    mapping_tables = sorted(
        row[0] for row in rows if row[0].startswith("MAP_"))

    name2tablename = {}  # ENTREZ_ID -> MAP_ENTREZ_ID_111223
    for tablename in mapping_tables:
        name = tablename[4:]  # drop the "MAP_" prefix

        # If the last 6 characters are an integer, then remove them
        # together with the "_" in front.  Names too short to hold
        # "_" plus 6 digits are left alone instead of failing with an
        # IndexError on the separator check.
        if len(name) > 6:
            try:
                int(name[-6:])
            except ValueError:
                pass
            else:
                assert name[-7] == "_"
                name = name[:-7]

        # later ones will overwrite earlier ones
        name2tablename[name] = tablename
    # NOTE(review): no return statement is visible in this excerpt, so
    # name2tablename is built and then dropped.  Callers iterate over
    # the result as table names -- confirm what should be returned.
Ejemplo n.º 2
0
def _find_entrez_gene_table():
    """Return the name of the most recent "entrez_gene_*" table.

    Lists the tables in the database, keeps the ones whose names start
    with "entrez_gene_", and returns the name that sorts last.  An
    assertion fails if no such table exists.

    """
    import config
    import dblib

    rows = dblib.query(
        "SHOW TABLES", config.gm_user, config.gm_passwd, config.gm_db,
        config.gm_host, config.gm_port, config.gm_socket)

    # The table name is the first column of each row.
    table_names = [row[0] for row in rows]
    candidates = []
    for table_name in table_names:
        if table_name.startswith("entrez_gene_"):
            candidates.append(table_name)

    candidates = sorted(candidates)
    assert candidates
    return candidates[-1]
Ejemplo n.º 3
0
def _lookup_gene(id):
    """Fetch the details of one gene from the newest Entrez Gene table.

    id should be the Entrez ID of a gene, given as an integer (it is
    passed through int() before being used in the query).

    Return id, symbol, name, tax_id, organism.  id is an integer.  The
    other values are returned as they come back from the database; if
    the gene is not found, they are all empty strings.  If the query
    yields several rows, the values of the last row are returned.

    """
    import config
    import dblib

    id = int(id)
    gene_table = _find_entrez_gene_table()
    query = (
        "SELECT symbol, name, tax_id, organism FROM %s WHERE gene_id=%s"
        % (gene_table, id))
    rows = dblib.query(
        query, config.gm_user, config.gm_passwd, config.gm_db,
        config.gm_host, config.gm_port, config.gm_socket)

    # Defaults used when the query returns no rows.
    symbol, name, tax_id, organism = "", "", "", ""
    for row in rows:
        symbol, name, tax_id, organism = row
    return id, symbol, name, tax_id, organism
Ejemplo n.º 4
0
def find_many_genes_detailed(genes, tax_id=None):
    """Look up many gene names at once and describe every matching gene.

    genes is a sequence of gene names (strings).  It is traversed more
    than once, so it should be a list or tuple rather than a one-shot
    iterator.  If tax_id is given (and is not zero or empty), only hits
    from that tax_id are returned.

    Return a sorted list of unique tuples (gene_id, symbol, name,
    tax_id, organism, name_from_query, source_db, name_in_db).  May not
    be in the same order as the query.  An empty query returns an empty
    list without contacting the database.

    gene_id is an integer.  symbol, name, tax_id and organism are the
    values reported by _lookup_gene for that gene_id.

    Names are matched case-insensitively.  For each name in the query,
    if some hits match the case exactly, only those are kept; then, if
    some of the remaining hits are flagged official, only the official
    ones are kept.

    """
    # Nothing to look up.  This also avoids building a query with an
    # empty "in ()" list.
    if not genes:
        return []

    import config
    import dblib

    genes_str = dblib.format_list(genes)

    # List of (gene_id, source_db, name_in_db, is_official).
    results = []
    # Switch between the single master table and the per-source MAP_*
    # tables.  The master table is the one currently in use.
    use_master_map = True
    if use_master_map:
        columns = "gene_id, symbol, official, source_db"
        q = "SELECT %s FROM %s WHERE symbol in (%s);" % (columns, "MASTERMAP",
                                                         genes_str)
        rows = dblib.query(q, config.gm_user, config.gm_passwd, config.gm_db,
                           config.gm_host, config.gm_port, config.gm_socket)
        for row in rows:
            gene_id, name_in_db, is_official, source_db = row
            results.append((int(gene_id), source_db, name_in_db, is_official))
    else:
        # Query each mapping table separately.  The table name stands
        # in for source_db.
        for table_name in _list_mapping_tables():
            columns = "gene_id, symbol, official"
            q = "SELECT %s FROM %s WHERE symbol in (%s);" % (
                columns, table_name, genes_str)
            rows = dblib.query(
                q, config.gm_user, config.gm_passwd, config.gm_db,
                config.gm_host, config.gm_port, config.gm_socket)
            for row in rows:
                gene_id, name_in_db, is_official = row
                results.append(
                    (int(gene_id), table_name, name_in_db, is_official))

    # Index the database rows by upper-cased name, so that each name in
    # the query is matched with one dictionary lookup instead of being
    # compared against every row.
    results_by_name = {}
    for hit in results:
        results_by_name.setdefault(hit[2].upper(), []).append(hit)

    # Collect all possible hits, organized by the names in the query.
    # name_from_query -> list of (gene_id, source_db, name_in_db, is_official)
    hits_by_query = {}
    for name_from_query in genes:
        matches = results_by_name.get(name_from_query.upper())
        if matches:
            hits_by_query.setdefault(name_from_query, []).extend(matches)

    gene_info = {}  # gene_id -> _lookup_gene(gene_id), looked up only once
    clean = set()   # a set, so there are no duplicates
    for name_from_query, hits in hits_by_query.items():
        # If some of the cases match exactly, then use the exact
        # matches.  Do this before the official hits, because this can
        # give clues to the organism.
        exact_hits = [hit for hit in hits if hit[2] == name_from_query]
        if exact_hits:
            hits = exact_hits

        # If some of these hits are official, and others aren't, then
        # only keep the official ones.
        official_hits = [hit for hit in hits if hit[3]]
        if official_hits:
            hits = official_hits

        for gene_id, source_db, name_in_db, is_official in hits:
            if gene_id not in gene_info:
                gene_info[gene_id] = _lookup_gene(gene_id)
            id_, symbol, name, tax_id_, organism = gene_info[gene_id]
            clean.add((gene_id, symbol, name, tax_id_, organism,
                       name_from_query, source_db, name_in_db))
    clean = sorted(clean)

    # If tax_id is given, then only return hits from that tax_id.
    if tax_id:
        tax_id = int(tax_id)
        clean = [hit for hit in clean if hit[3] == tax_id]

    return clean
Ejemplo n.º 5
0
    # If name looks like a gene_id, see if it's a discontinued gene ID.
    is_gene_id = True
    try:
        int(name)
    except ValueError, x:
        is_gene_id = False

    # Need to clean up name for security reasons.
    columns = "old_gene_id, old_symbol, gene_id, tax_id"
    q = "SELECT %s FROM %s WHERE old_symbol='%s';" % (columns, "DISCONTINUED",
                                                      name)
    if is_gene_id:
        q = "SELECT %s FROM %s WHERE old_gene_id=%s;" % (columns,
                                                         "DISCONTINUED", name)
    x = dblib.query(q, config.gm_user, config.gm_passwd, config.gm_db,
                    config.gm_host, config.gm_port, config.gm_socket)
    gene_ids = []
    for x in x:
        old_gene_id, old_symbol, gene_id, tax_id_ = x
        if tax_id is not None and int(tax_id) != int(tax_id_):
            continue
        if gene_id not in gene_ids:
            gene_ids.append(gene_id)
    if not gene_ids:
        return None
    assert len(gene_ids) == 1, "Multiple discontinued for: %s" % name
    gene_id = gene_ids[0]
    return int(gene_id)


def _lookup_gene(id):