Beispiel #1
0
def buildIdentifierMap(query_species):
    '''build a map from external identifiers to ENSEMBL gene ids.

    The biomart dataset ``<query_species>_gene_ensembl`` is queried
    for the columns ``ensembl_gene_id``, ``entrezgene``, ``hgnc_id``
    and ``hgnc_symbol``. Every non-empty value found in one of the
    last three columns becomes a key of the result.

    Rows whose ``ensembl_gene_id`` starts with ``LRG_`` are ignored,
    as are values equal to ``"NA"`` or the empty string.

    Arguments
    ---------
    query_species : string
        Species prefix used to build the biomart dataset name.

    Returns
    -------
    map_identifiers : dict
        Maps each external identifier (converted to a string) to a
        list of ``(ensembl_gene_id, column)`` tuples, where ``column``
        is the name of the biomart column the identifier came from.
        The order within each list is not defined.
    '''
    columns = ('ensembl_gene_id',
               'entrezgene',
               'hgnc_id',
               'hgnc_symbol')

    data = Biomart.biomart_iterator(
        columns,
        dataset="%s_gene_ensembl" % query_species)

    # sets de-duplicate repeated (gene, column) pairs for one identifier
    map_identifiers = collections.defaultdict(set)
    for row in data:
        ensid = row['ensembl_gene_id']

        # The gene id is constant across the columns of a row, so the
        # LRG_ filter is applied once per row rather than per column.
        if ensid.startswith('LRG_'):
            continue

        for column in columns[1:]:
            xid = str(row[column])
            # "NA" and "" are skipped (presumably missing values)
            if xid in ("NA", ""):
                continue

            map_identifiers[xid].add((ensid, column))

    # convert to a plain dict of lists
    return {xid: list(hits) for xid, hits in map_identifiers.items()}
Beispiel #2
0
def buildIdentifierMap(query_species):
    '''map external gene identifiers onto ENSEMBL gene identifiers.

    Queries the biomart dataset ``<query_species>_gene_ensembl`` for
    ``ensembl_gene_id``, ``entrezgene``, ``hgnc_id`` and
    ``hgnc_symbol`` and indexes the result by the values of the three
    non-ENSEMBL columns.

    Values equal to ``"NA"`` or the empty string are skipped, and so
    are all rows whose ``ensembl_gene_id`` starts with ``LRG_``.

    Arguments
    ---------
    query_species : string
        Species prefix used to build the biomart dataset name.

    Returns
    -------
    map_identifiers : dict
        Keys are external identifiers as strings; values are lists of
        ``(ensembl_gene_id, column)`` tuples in no particular order,
        ``column`` being the biomart column that held the identifier.
    '''
    columns = ('ensembl_gene_id',
               'entrezgene',
               'hgnc_id',
               'hgnc_symbol')
    external_columns = columns[1:]

    data = Biomart.biomart_iterator(
        columns,
        dataset="%s_gene_ensembl" % query_species)

    map_identifiers = collections.defaultdict(set)
    for row in data:
        ensid = row['ensembl_gene_id']

        # Decided once per row: the check does not depend on the
        # column, so there is no need to repeat it in the inner loop.
        if ensid.startswith('LRG_'):
            continue

        for column in external_columns:
            xid = str(row[column])
            # "NA" and "" are skipped (presumably missing values)
            if xid in ("NA", ""):
                continue

            map_identifiers[xid].add((ensid, column))

    # the sets only served to remove duplicates; hand back lists
    return {xid: list(hits) for xid, hits in map_identifiers.items()}
Beispiel #3
0
def buildOrthologyMap(query_species,
                      target_species,
                      filter_type="ortholog_one2one"):
    '''build map of genes in query species to those in target species.

    The biomart dataset ``<query_species>_gene_ensembl`` is queried
    for the homolog columns of `target_species`. Only rows whose
    orthology type equals `filter_type` contribute to the map.

    Arguments
    ---------
    query_species : string
        Species prefix used to build the biomart dataset name.
    target_species : string
        Species prefix used to build the homolog column names.
    filter_type : string
        Orthology type a row must have in order to be kept.

    Returns
    -------
    map_query2target : dict
        Maps ``ensembl_gene_id`` of the query species to the homolog
        gene id of the target species. If a query gene occurs in
        several accepted rows, the last row wins.
    '''
    gene_key = 'ensembl_gene_id'
    homolog_key = '%s_homolog_ensembl_gene' % target_species
    type_key = '%s_homolog_orthology_type' % target_species
    confidence_key = '%s_homolog_orthology_confidence' % target_species

    records = Biomart.biomart_iterator(
        (gene_key, homolog_key, type_key, confidence_key),
        dataset="%s_gene_ensembl" % query_species)

    map_query2target = {}
    for record in records:
        if record[type_key] != filter_type:
            continue
        query_gene = record[gene_key]
        map_query2target[query_gene] = record[homolog_key]

    return map_query2target
Beispiel #4
0
def buildOrthologyMap(query_species,
                      target_species,
                      filter_type="ortholog_one2one"):
    '''map genes of the query species to their target species homologs.

    Data are fetched from the biomart dataset
    ``<query_species>_gene_ensembl``; rows are kept only if their
    orthology type for `target_species` equals `filter_type`.

    Arguments
    ---------
    query_species : string
        Species prefix used to build the biomart dataset name.
    target_species : string
        Species prefix used to build the homolog column names.
    filter_type : string
        Orthology type a row must have in order to be kept.

    Returns
    -------
    map_query2target : dict
        ``ensembl_gene_id`` of the query species mapped to the homolog
        gene id in the target species. Later rows overwrite earlier
        ones that share the same query gene.
    '''
    target_gene_column = '%s_homolog_ensembl_gene' % target_species
    orthology_type_column = '%s_homolog_orthology_type' % target_species

    columns = (
        'ensembl_gene_id',
        target_gene_column,
        orthology_type_column,
        '%s_homolog_orthology_confidence' % target_species,
    )

    rows = Biomart.biomart_iterator(
        columns,
        dataset="%s_gene_ensembl" % query_species)

    # lazily yield (query gene, target gene) pairs of accepted rows
    accepted_pairs = (
        (row['ensembl_gene_id'], row[target_gene_column])
        for row in rows
        if row[orthology_type_column] == filter_type
    )

    return dict(accepted_pairs)
Beispiel #5
0
def importKEGGAssignments(outfile, mart, host, biomart_dataset):
    '''import the KEGG annotations from the R KEGG.db annotations
    package.

    .. note::

        Since KEGG is no longer publicly available, this is not
        up-to-date and may be removed from bioconductor in future
        releases.

    The table written to outfile has the following columns:
    ``ontology``, ``gene_id``, ``kegg_ID``, ``kegg_name``,
    ``evidence``. ``ontology`` is always ``kegg`` and ``evidence``
    is always ``NA``.

    A warning is issued if `biomart_dataset` does not start with one
    of ``rnorvegicus``, ``scerevisiae``, ``hsapiens`` or
    ``mmusculus``.

    Arguments
    ---------
    outfile : string
        Output filename, written as tab-separated values.
    mart : string
        Name of the biomart
    host : string
        Host name of the biomart server
    biomart_dataset : string
        Biomart dataset

    '''

    if not re.match("rnorvegicus|scerevisiae|hsapiens|mmusculus",
                    biomart_dataset):
        E.warn("KEGG.db doesn't map Entrez ids for %s, %s will"
               " likely be empty" % (biomart_dataset, outfile))

    R.library("KEGG.db")

    E.info("getting entrez to ensembl mapping ...")
    entrez2ensembl = Biomart.biomart_iterator(
        ("ensembl_gene_id", "entrezgene"),
        biomart=mart,
        dataset=biomart_dataset,
        host=host,
        path="/biomart/martservice")

    # NOTE(review): the lookup further down uses integer keys, so this
    # assumes biomart_iterator yields integer entrezgene values - confirm.
    entrez2ensembl = dict((x['entrezgene'],
                           x['ensembl_gene_id'])
                          for x in entrez2ensembl)

    E.info("Done")

    E.info("getting entrez to kegg mapping ... ")
    entrez2path = R('as.list(KEGGEXTID2PATHID)')
    E.info("Done")

    E.info("Getting KEGG names")
    pathnames = R('as.list(KEGGPATHID2NAME)')
    pathid2name = dict(zip(pathnames.names, R.unlist(pathnames)))
    E.info("Done")

    # compiled once; extracts the numeric part of a pathway identifier
    pathway_rx = re.compile("[a-z]+([0-9]+)")

    outf = IOTools.openFile(outfile, "w")
    # try/finally guarantees the file is flushed and closed even if a
    # pathway fails to parse or has no name.
    try:
        outf.write("ontology\tgene_id\tkegg_ID\tkegg_name\tevidence\n")

        # rx2 did not work in rpy2 2.4.2 - workaround uses
        # absolute indices
        for gene_column, gene in enumerate(entrez2path.names):

            # skip identifiers that are not plain integers
            try:
                gene = int(gene)
            except ValueError:
                continue

            # skip genes without an ENSEMBL counterpart
            if gene not in entrez2ensembl:
                continue
            ensid = entrez2ensembl[gene]

            for pathway in entrez2path[gene_column]:
                pathid = pathway_rx.match(pathway).groups()[0]
                pathname = pathid2name[pathid]
                outf.write(
                    "\t".join(["kegg", ensid, str(pathway),
                               pathname, "NA"]) + "\n")
    finally:
        outf.close()
Beispiel #6
0
def importKEGGAssignments(outfile, mart, host, biomart_dataset):
    '''import the KEGG annotations from the R KEGG.db annotations
    package.

    .. note::

        Since KEGG is no longer publicly available, this is not
        up-to-date and may be removed from bioconductor in future
        releases.

    The table written to outfile has the following columns:
    ``ontology``, ``gene_id``, ``kegg_ID``, ``kegg_name``,
    ``evidence``. The first column is always ``kegg`` and the last
    one is always ``NA``.

    Genes are only written if their name in the KEGG mapping parses
    as an integer and is present in the entrez to ensembl mapping
    obtained from biomart.

    Arguments
    ---------
    outfile : string
        Output filename in :term:`tsv` format.
    mart : string
        Name of the biomart
    host : string
        Host name of the biomart server
    biomart_dataset : string
        Biomart dataset

    '''

    if not re.match("rnorvegicus|scerevisiae|hsapiens|mmusculus",
                    biomart_dataset):
        E.warn("KEGG.db doesn't map Entrez ids for %s, %s will"
               " likely be empty" % (biomart_dataset, outfile))

    R.library("KEGG.db")

    E.info("getting entrez to ensembl mapping ...")

    # Generates an iterator containing the data from biomart
    entrez2ensembl = Biomart.biomart_iterator(
        ("ensembl_gene_id", "entrezgene"),
        biomart=mart,
        dataset=biomart_dataset,
        host=host,
        path="/biomart/martservice")

    # NOTE(review): keys are compared against int(gene) below, which
    # presumes entrezgene values arrive as integers - confirm.
    entrez2ensembl = dict((x['entrezgene'],
                           x['ensembl_gene_id'])
                          for x in entrez2ensembl)

    E.info("Done")

    E.info("getting entrez to kegg mapping ... ")
    entrez2path = R('as.list(KEGGEXTID2PATHID)')
    E.info("Done")

    E.info("Getting KEGG names")
    pathnames = R('as.list(KEGGPATHID2NAME)')
    # dict() consumes the zip directly; no intermediate list is needed
    pathid2name = dict(zip(pathnames.names, R.unlist(pathnames)))
    E.info("Done")

    # hoisted out of the loop: pulls the digits out of a pathway id
    pathway_rx = re.compile("[a-z]+([0-9]+)")

    outf = IOTools.openFile(outfile, "w")
    # Close the output file on every exit path so that buffered rows
    # are flushed and the handle is not leaked on errors.
    try:
        outf.write("ontology\tgene_id\tkegg_ID\tkegg_name\tevidence\n")

        # rx2 did not work in rpy2 2.4.2 - workaround uses
        # absolute indices
        for gene_column, gene in enumerate(entrez2path.names):

            # names that are not integers cannot be looked up
            try:
                gene = int(gene)
            except ValueError:
                continue

            # no ENSEMBL gene known for this entrez id
            if gene not in entrez2ensembl:
                continue
            ensid = entrez2ensembl[gene]

            for pathway in entrez2path[gene_column]:
                pathid = pathway_rx.match(pathway).groups()[0]
                pathname = pathid2name[pathid]
                outf.write(
                    "\t".join(["kegg", ensid, str(pathway),
                               pathname, "NA"]) + "\n")
    finally:
        outf.close()