Exemple #1
0
def biomart_iterator(columns,
                     biomart="ensembl",
                     dataset="hsapiens_gene_ensembl",
                     host='www.biomart.org'):
    '''query biomart through the R ``biomaRt`` package and iterate
    over the rows of the result.

    *columns* is a list of the biomart fields (attributes) to obtain.

    *biomart*, *dataset* and *host* are passed to ``useMart`` and
    select the database, dataset and server to query.

    Yields one dictionary per row, mapping each entry of *columns*
    to the value in that row.
    '''

    R.library("biomaRt")

    connection = R.useMart(biomart=biomart, dataset=dataset, host=host)
    attributes = rpy2.robjects.vectors.StrVector(columns)
    table = R.getBM(attributes=attributes, mart=connection)

    # ``table`` is a dataframe; ``rx(name)`` returns a one-column
    # dataframe, and ``[0]`` extracts that column as a vector.
    column_vectors = [table.rx(name)[0] for name in columns]

    # Transpose the column vectors into rows.
    for row in zip(*column_vectors):
        yield dict(zip(columns, row))
Exemple #2
0
def importFromBiomart(outfile,
                      columns,
                      biomart="ensembl",
                      dataset="hsapiens_gene_ensembl",
                      host='www.biomart.org'):
    '''download a dataset from biomart and write it to *outfile* as a
    tab-separated table.

    *columns* is a dictionary mapping biomart columns to the column
    names used in the header of the output table. *biomart*,
    *dataset* and *host* are passed to ``useMart`` and denote the
    database, dataset and server to get the data from.

    The header line and every row are written tab-separated; values
    are converted with ``str``.
    '''

    R.library("biomaRt")

    # Materialise the keys once: on Python 3 ``dict.keys()`` is a view,
    # not a list, and the same ordered sequence is needed for the
    # query, the header and the row extraction.
    keys = list(columns.keys())

    mart = R.useMart(biomart=biomart, dataset=dataset, host=host)
    result = R.getBM(attributes=keys, mart=mart)

    # Open the file only after the query succeeded, and make sure it
    # is closed even if writing a row fails.
    outf = IOTools.openFile(outfile, "w")
    try:
        outf.write("\t".join([columns[x] for x in keys]) + "\n")

        for data in zip(*[result[x] for x in keys]):
            outf.write("\t".join(map(str, data)) + "\n")
    finally:
        outf.close()
def biomart_iterator(columns,
                     biomart="ensembl",
                     dataset="hsapiens_gene_ensembl",
                     host='www.biomart.org'):
    '''iterate over the rows returned by a biomart query issued via
    the R ``biomaRt`` package.

    *columns* is a list with the fields to obtain.

    *biomart*, *dataset* and *host* are forwarded to ``useMart`` and
    denote the database, dataset and server to get the data from.

    Each yielded item is a dictionary keyed by the entries of
    *columns*.
    '''

    R.library("biomaRt")

    mart = R.useMart(biomart=biomart, dataset=dataset, host=host)
    wanted = rpy2.robjects.vectors.StrVector(columns)
    frame = R.getBM(attributes=wanted, mart=mart)

    # The query result is a dataframe. ``rx(field)`` gives a dataframe
    # holding just that field, whose first element is the vector of
    # values.
    vectors = []
    for field in columns:
        vectors.append(frame.rx(field)[0])

    for values in zip(*vectors):
        yield dict(zip(columns, values))
Exemple #4
0
def get_exons(mart):
    """Queries a Mart object to find all exons of its dataset attribute.

    Forms a specific getBM query that is sent to the BioMart API to
    retrieve information about the exons (and their exonic coordinates)
    of a specific Dataset. The output is then transformed via the GRanges
    Bioconductor package and seqnames converted to UCSC standard.

    Args:
        mart: an rpy2-converted biomaRt Mart object.

    Returns:
        An rpy2 DataFrame containing a table of relevant exon information.
        DataFrame column headers are:
        ["seqnames", "start", "end", "width", "strand"]
    """
    exons = R.getBM(
        attributes=StrVector(("chromosome_name", "exon_chrom_start",
                              "exon_chrom_end", "strand")),
        mart=mart)

    # Translate the strand column element by element. Comparing the whole
    # R vector against a string yields a single Python value, so every
    # exon would end up with the same strand. A value of 1 maps to '+',
    # anything else to '-'.
    strand = StrVector(['+' if value == 1 else '-'
                        for value in exons.rx2('strand')])

    exons_ranges = R.GRanges(
        seqnames=exons.rx2('chromosome_name'),
        ranges=R.IRanges(start=exons.rx2('exon_chrom_start'),
                         end=exons.rx2('exon_chrom_end')),
        strand=strand)

    # R replacement functions have to be fetched by their backquoted name
    # to be callable from Python, see
    # https://stackoverflow.com/questions/38806898/
    set_method = R("`seqlevelsStyle<-`")
    exons_ranges = set_method(exons_ranges, "UCSC")

    as_data_frame = R("function(x) as.data.frame(x)")
    exons_ranges_df = as_data_frame(exons_ranges)

    return exons_ranges_df
Exemple #5
0
def get_genes(mart):
    """Queries a Mart object to find all genes of its dataset attribute.

    Forms a specific getBM query that is sent to the BioMart API to
    retrieve information about the genes of a specific Dataset. This
    output is then converted from an rpy2 DataFrame to a pandas
    DataFrame.

    Args:
        mart: an rpy2-converted biomaRt Mart object.

    Returns:
        A pandas DataFrame containing a table of relevant gene information.
        DataFrame column headers are:
        ["gene_name", "chromosome_name", "start_position", "end_position"]
    """
    genes = R.getBM(
        attributes=StrVector(("external_gene_name", "chromosome_name",
                              "start_position", "end_position")),
        mart=mart)

    # rpy2 2.x exposes the R -> pandas conversion as ``ri2py``; rpy2 3.x
    # renamed it to ``rpy2py``. Use whichever the installed version has.
    to_pandas = getattr(pandas2ri, "ri2py", None)
    if to_pandas is None:
        to_pandas = pandas2ri.rpy2py

    genes_df = to_pandas(genes)
    genes_df.rename(columns={'external_gene_name': 'gene_name'}, inplace=True)

    return genes_df
Exemple #6
0
def importKEGGAssignments(outfile, mart, host, biomart_dataset):
    '''import the KEGG annotations from the R KEGG.db annotations
    package and write them to *outfile* as a tab-separated table.

    Note that since KEGG is no longer publicly available, this is not
    up-to-date and may be removed from bioconductor in future
    releases.

    *mart*, *host* and *biomart_dataset* are passed to ``useMart`` to
    obtain the entrez to ensembl gene identifier mapping.

    The output has the columns ``ontology``, ``gene_id``, ``kegg_ID``,
    ``kegg_name`` and ``evidence``. Entries whose name is not an
    integer, or that have no ensembl identifier, are skipped.
    '''

    R.library("KEGG.db")
    R.library("biomaRt")

    E.info("getting entrez to ensembl mapping ...")
    mart = R.useMart(biomart=mart,
                     host=host,
                     path="/biomart/martservice",
                     dataset=biomart_dataset)

    entrez2ensembl = R.getBM(attributes=ro.StrVector(
        ["ensembl_gene_id", "entrezgene"]),
                             mart=mart)

    entrez = entrez2ensembl.rx2("entrezgene")
    ensembl = entrez2ensembl.rx2("ensembl_gene_id")
    entrez2ensembl = dict(zip(entrez, ensembl))

    E.info("Done")

    E.info("getting entrez to kegg mapping ... ")
    entrez2path = R('as.list(KEGGEXTID2PATHID)')
    E.info("Done")

    E.info("Getting KEGG names")
    pathnames = R('as.list(KEGGPATHID2NAME)')
    pathid2name = dict(zip(pathnames.names, R.unlist(pathnames)))
    E.info("Done")

    # Pathway identifiers are a lower-case prefix followed by the
    # numeric id used as key in pathid2name.
    pathid_pattern = re.compile(r"[a-z]+([0-9]+)")

    outf = IOTools.openFile(outfile, "w")
    # Close the file explicitly so that buffered rows are flushed to
    # disk, even if a lookup below fails.
    try:
        outf.write("ontology\tgene_id\tkegg_ID\tkegg_name\tevidence\n")

        for gene in entrez2path.names:

            # The mapping built above is keyed by integer identifiers.
            try:
                gene = int(gene)
            except ValueError:
                continue

            if gene not in entrez2ensembl:
                continue
            ensid = entrez2ensembl[gene]

            for pathway in entrez2path.rx2(str(gene)):
                pathid = pathid_pattern.match(pathway).groups()[0]
                pathname = pathid2name[pathid]
                outf.write("\t".join(["kegg", ensid,
                                      str(pathway), pathname, "NA"]) + "\n")
    finally:
        outf.close()
Exemple #7
0
def importKEGGAssignments(outfile, mart, host, biomart_dataset):
    '''write the KEGG pathway annotations from the R KEGG.db
    annotations package to *outfile* as a tab-separated table.

    Note that since KEGG is no longer publicly available, this is not
    up-to-date and may be removed from bioconductor in future
    releases.

    *mart*, *host* and *biomart_dataset* select the biomart used to
    translate entrez gene identifiers into ensembl gene identifiers.

    One line is written per gene and pathway with the columns
    ``ontology``, ``gene_id``, ``kegg_ID``, ``kegg_name`` and
    ``evidence``. Genes with a non-integer name or without an ensembl
    identifier are skipped.
    '''

    R.library("KEGG.db")
    R.library("biomaRt")

    E.info("getting entrez to ensembl mapping ...")
    mart = R.useMart(biomart=mart,
                     host=host,
                     path="/biomart/martservice",
                     dataset=biomart_dataset)

    entrez2ensembl = R.getBM(
        attributes=ro.StrVector(["ensembl_gene_id", "entrezgene"]),
        mart=mart)

    entrez = entrez2ensembl.rx2("entrezgene")
    ensembl = entrez2ensembl.rx2("ensembl_gene_id")
    entrez2ensembl = dict(zip(entrez, ensembl))

    E.info("Done")

    E.info("getting entrez to kegg mapping ... ")
    entrez2path = R('as.list(KEGGEXTID2PATHID)')
    E.info("Done")

    E.info("Getting KEGG names")
    pathnames = R('as.list(KEGGPATHID2NAME)')
    pathid2name = dict(zip(pathnames.names, R.unlist(pathnames)))
    E.info("Done")

    # Compiled once; extracts the numeric part of a pathway identifier.
    extract_pathid = re.compile(r"[a-z]+([0-9]+)").match

    outf = IOTools.openFile(outfile, "w")
    try:
        outf.write("ontology\tgene_id\tkegg_ID\tkegg_name\tevidence\n")

        for gene in entrez2path.names:

            # entrez2ensembl is keyed by integers; names that are not
            # integers cannot be looked up.
            try:
                gene = int(gene)
            except ValueError:
                continue

            if gene not in entrez2ensembl:
                continue
            ensid = entrez2ensembl[gene]

            for pathway in entrez2path.rx2(str(gene)):
                pathid = extract_pathid(pathway).groups()[0]
                pathname = pathid2name[pathid]
                outf.write(
                    "\t".join(["kegg", ensid, str(pathway), pathname, "NA"]) +
                    "\n")
    finally:
        # The file was previously left open, which risks losing
        # buffered output.
        outf.close()
def convert_hgnc2ensembl(hgnc_id):
    """Look up the ensembl gene id for a HGNC symbol via biomaRt.

    Queries the module-level mart with ``hgnc_symbol`` as filter and
    returns the first ``ensembl_gene_id`` of the result. If no value
    can be extracted, a message is printed and ``None`` is returned.
    """

    # NOTE(review): presumably sets up the module-level __mart used
    # below - confirm against init_biomaRt.
    init_biomaRt()

    v = R.c(hgnc_id)
    res = R.getBM(attributes=R.c("ensembl_gene_id"), filters="hgnc_symbol", values=v, mart=__mart)

    try:
        return R.get("ensembl_gene_id", res)[0]
    except Exception:
        # A single parenthesised argument prints identically on
        # Python 2 and Python 3.
        print('Error convert_hgnc2ensembl: ' + str(hgnc_id) + ' not found in database')
        return None
def convert_list_ensembl2hgnc(ensembl_id_list):
    """Look up the HGNC symbols for a list of ensembl gene ids.

    Queries the module-level mart with ``ensembl_gene_id`` as filter
    and returns the ``hgnc_symbol`` column of the result. If the
    column cannot be extracted, a message is printed and ``None`` is
    returned.
    """

    # NOTE(review): presumably sets up the module-level __mart used
    # below - confirm against init_biomaRt.
    init_biomaRt()

    v = R.c(ensembl_id_list)
    res = R.getBM(attributes=R.c("hgnc_symbol"), filters="ensembl_gene_id", values=v, mart=__mart)

    try:
        return R.get("hgnc_symbol", res)
    except Exception:
        # Report the argument actually passed in; the handler used to
        # reference an undefined name and raised NameError itself.
        print('Error convert_ensembl2hgnc: ' + str(ensembl_id_list) + ' not found in database')
        return None
Exemple #10
0
def biomart_iterator(columns,
                     biomart="ensembl",
                     dataset="hsapiens_gene_ensembl",
                     host='www.biomart.org',
                     path="/biomart/martservice",
                     filters=None,
                     values=None,
                     archive=False):
    '''query biomart through the R ``biomaRt`` package and iterate
    over the rows of the result.

    *columns* is a list with the fields to obtain.

    *biomart*, *dataset*, *host*, *path* and *archive* are passed to
    ``useMart`` and denote the database and dataset to get the data
    from.

    *filters* is an optional list of filter names and *values* the
    corresponding filter values; both are passed to ``getBM``. An
    empty string is passed for whichever is not given.

    returns an iterator over rows. Each row is a dictionary mapping
    the entries of *columns* to values.
    '''

    R.library("biomaRt")

    mart = R.useMart(biomart=biomart,
                     dataset=dataset,
                     host=host,
                     path=path,
                     archive=archive)

    if filters is not None:
        # StrVector is a Python class in rpy2, so it is taken from
        # rpy2.robjects.vectors (as for ``attributes`` below) rather
        # than looked up on the R handle.
        filter_names = rpy2.robjects.vectors.StrVector(filters)
    else:
        filter_names = ""

    if values is not None:
        filter_values = values
    else:
        filter_values = ""

    # result is a dataframe
    result = R.getBM(
        attributes=rpy2.robjects.vectors.StrVector(columns),
        filters=filter_names,
        values=filter_values,
        mart=mart)

    # access via result.rx was broken in rpy2 2.4.2, thus try
    # numeric access. This relies on the columns being returned in the
    # order requested, which is verified here.
    assert tuple(result.colnames) == tuple(columns),\
        "colnames in dataframe: %s different from expected: %s" % \
        (str(tuple(result.colnames)), tuple(columns))

    for data in zip(*[result[x] for x in range(len(columns))]):
        yield dict(zip(columns, data))
Exemple #11
0
def importFromBiomart(outfile,
                      columns,
                      biomart="ensembl",
                      dataset="hsapiens_gene_ensembl",
                      host='www.biomart.org'):
    '''download a dataset from biomart and output as a
    tab-separated table.

    Arguments
    ---------
    outfile : string
       Filename of output file
    columns : dict
       Dictionary mapping biomart columns to columns in the output table.
    biomart : string
       Biomart name
    dataset : string
       Biomart dataset
    host : string
       Biomart host
    '''

    R.library("biomaRt")

    # Materialise the keys once: on Python 3 ``dict.keys()`` is a view,
    # not a list, and the same ordered sequence is needed for the
    # query, the header and the row extraction.
    keys = list(columns.keys())

    # The default value for host in the biomaRt package is
    # www.biomart.org but for some reason R errors if you specify
    # host manually but then use the default - but it is fine if
    # host is anything valid apart from www.biomart.org.  So I have
    # changed this to only specify a value if the value you
    # are specifying is different to the default KB

    if host == 'www.biomart.org':
        mart = R.useMart(biomart=biomart, dataset=dataset)
    else:
        mart = R.useMart(biomart=biomart, dataset=dataset, host=host)

    result = R.getBM(attributes=keys, mart=mart)

    # Open the file only after the query succeeded, and make sure it
    # is closed even if writing a row fails.
    outf = IOTools.openFile(outfile, "w")
    try:
        outf.write("\t".join([columns[x] for x in keys]) + "\n")

        for data in zip(*[result[x] for x in keys]):
            outf.write("\t".join(map(str, data)) + "\n")
    finally:
        outf.close()
Exemple #12
0
def importFromBiomart(outfile,
                      columns,
                      biomart="ensembl",
                      dataset="hsapiens_gene_ensembl",
                      host='www.biomart.org'):
    '''download a dataset from biomart and output as a
    tab-separated table.

    Arguments
    ---------
    outfile : string
       Filename of output file
    columns : dict
       Dictionary mapping biomart columns to columns in the output table.
    biomart : string
       Biomart name
    dataset : string
       Biomart dataset
    host : string
       Biomart host
    '''

    R.library("biomaRt")

    keys = list(columns.keys())

    # The default value for host in the biomaRt package is
    # www.biomart.org but for some reason R errors if you specify
    # host manually but then use the default - but it is fine if
    # host is anything valid apart from www.biomart.org.  So I have
    # changed this to only specify a value if the value you
    # are specifying is different to the default KB

    if host == 'www.biomart.org':
        mart = R.useMart(biomart=biomart, dataset=dataset)
    else:
        mart = R.useMart(biomart=biomart, dataset=dataset, host=host)

    result = R.getBM(attributes=keys, mart=mart)

    outf = IOTools.openFile(outfile, "w")
    # Guarantee the handle is released (and buffered rows flushed)
    # even when writing fails part-way through.
    try:
        outf.write("\t".join([columns[x] for x in keys]) + "\n")

        for data in zip(*[result[x] for x in keys]):
            outf.write("\t".join(map(str, data)) + "\n")
    finally:
        outf.close()
Exemple #13
0
def biomart_iterator(columns,
                     biomart="ensembl",
                     dataset="hsapiens_gene_ensembl",
                     host='www.biomart.org',
                     path="/biomart/martservice",
                     filters=None,
                     values=None,
                     archive=False):
    '''query biomart through the R ``biomaRt`` package and iterate
    over the rows of the result.

    Arguments
    ---------
    columns : list
       List of fields to obtain.
    biomart : string
       Biomart name
    dataset : string
       Biomart dataset
    host : string
       Biomart host
    path : string
       Passed as ``path`` to ``useMart``
    filters : list
       List of filters to use
    values : list
       Values of the filters
    archive : bool
       If True, use archived version

    Returns
    -------
    iterator
       Iterator over rows in biomart database. Each row
       is a dictionary mapping column names to values.

    '''

    R.library("biomaRt")

    # The default value for host in the biomaRt package is
    # www.biomart.org, but R errors if that default is passed
    # explicitly, while any other valid host is fine. The host is
    # therefore only handed over when it differs from the default.
    mart_options = dict(biomart=biomart,
                        dataset=dataset,
                        path=path,
                        archive=archive)
    if host != 'www.biomart.org':
        mart_options["host"] = host
    mart = R.useMart(**mart_options)

    # An empty string stands in for filters/values that were not given.
    filter_names = "" if filters is None \
        else rpy2.robjects.vectors.StrVector(filters)
    filter_values = "" if values is None else values

    # the query result is a dataframe
    attributes = rpy2.robjects.vectors.StrVector(columns)
    result = R.getBM(attributes=attributes,
                     filters=filter_names,
                     values=filter_values,
                     mart=mart)

    # access via result.rx was broken in rpy2 2.4.2, so columns are
    # fetched by position; check that the order is as requested.
    assert tuple(result.colnames) == tuple(columns),\
        "colnames in dataframe: %s different from expected: %s" % \
        (str(tuple(result.colnames)), tuple(columns))

    column_vectors = [result[position] for position in range(len(columns))]
    for row in zip(*column_vectors):
        yield dict(zip(columns, row))
Exemple #14
0
def return_bm_df(atts, mart):
    """Run a ``getBM`` query for the attributes *atts* on *mart* and
    return the result converted by ``r2pd_dataframe``."""
    attribute_names = StrVector(list(atts))
    query_result = R.getBM(attributes=attribute_names, mart=mart)
    return r2pd_dataframe(query_result)
Exemple #15
0
def biomart_iterator(columns,
                     biomart="ensembl",
                     dataset="hsapiens_gene_ensembl",
                     host='www.biomart.org',
                     path="/biomart/martservice",
                     filters=None,
                     values=None,
                     archive=False):
    '''iterate over the rows returned by a biomart query issued via
    the R ``biomaRt`` package.

    Arguments
    ---------
    columns : list
       List of fields to obtain.
    biomart : string
       Biomart name
    dataset : string
       Biomart dataset
    host : string
       Biomart host
    path : string
       Passed as ``path`` to ``useMart``
    filters : list
       List of filters to use
    values : list
       Values of the filters
    archive : bool
       If True, use archived version

    Returns
    -------
    iterator
       Iterator over rows in biomart database. Each row
       is a dictionary mapping column names to values.

    '''

    R.library("biomaRt")

    # biomaRt defaults to www.biomart.org, yet R raises an error when
    # that very value is given explicitly; every other valid host
    # works. Hence host is only passed on when it is not the default.
    if host != 'www.biomart.org':
        mart = R.useMart(biomart=biomart, dataset=dataset, host=host,
                         path=path, archive=archive)
    else:
        mart = R.useMart(biomart=biomart, dataset=dataset, path=path,
                         archive=archive)

    # Empty strings are used when no filters or values are supplied.
    filter_names = ""
    if filters is not None:
        filter_names = rpy2.robjects.vectors.StrVector(filters)

    filter_values = ""
    if values is not None:
        filter_values = values

    # result is a dataframe
    result = R.getBM(
        attributes=rpy2.robjects.vectors.StrVector(columns),
        filters=filter_names,
        values=filter_values,
        mart=mart)

    # access via result.rx was broken in rpy2 2.4.2, thus columns are
    # read by index, which requires them to be in the requested order.
    assert tuple(result.colnames) == tuple(columns),\
        "colnames in dataframe: %s different from expected: %s" % \
        (str(tuple(result.colnames)), tuple(columns))

    by_column = [result[index] for index in range(len(columns))]
    for record in zip(*by_column):
        yield dict(zip(columns, record))
Exemple #16
0
def make_lookup(mart_name, dataset, attributes, filters=None, values=None,
                unique_rows=True):
    """
    Query biomaRt and return the result as a pandas.DataFrame indexed by
    the first attribute in `attributes`.

    `mart_name` and `dataset` select the mart and dataset; `attributes` is
    the list of attributes to retrieve and `unique_rows` is passed to
    getBM as `uniqueRows`.

    In R, filters is a character vector, and values is either a single
    character vector (if only one filter provided) or a list of character
    vectors.  Here, `filters` and `values` can be given the same way, as
    parallel lists:

    >>> mart_name = 'ensembl'
    >>> dataset = 'dmelanogaster_gene_ensembl'
    >>> filters = ['flybase_gene_id', 'chromosome_name']
    >>> attributes = ['flybase_gene_id', 'flybasename_gene', 'chromosome_name']
    >>> values = [['FBgn0031208', 'FBgn0002121', 'FBgn0031209', 'FBgn0051973'], ['2L']]
    >>> df = make_lookup(
    ... mart_name=mart_name,
    ... dataset=dataset,
    ... attributes=attributes,
    ... filters=filters,
    ... values=values)

    Alternatively, `filters` can be a dictionary of filters: values, in
    which case `values` must not be provided separately:

    >>> filters = {
    ... 'flybase_gene_id': ['FBgn0031208', 'FBgn0002121', 'FBgn0031209', 'FBgn0051973'],
    ... 'chromosome_name': ['2L']}

    >>> df2 = make_lookup(
    ... mart_name=mart_name,
    ... dataset=dataset,
    ... attributes=attributes,
    ... filters=filters)

    Confirm that both methods yield identical results:

    >>> assert np.all(df.values == df2.values)

    Check results:

    >>> df.head()
                    flybasename_gene chromosome_name
    flybase_gene_id                                 
    FBgn0002121               l(2)gl              2L
    FBgn0031208              CG11023              2L
    FBgn0031209                Ir21a              2L
    FBgn0051973                 Cda5              2L
    <BLANKLINE>
    [4 rows x 2 columns]


    Indexing by gene ID (or whatever was the first attribute provided):

    >>> df.loc['FBgn0031209']
    flybasename_gene    Ir21a
    chromosome_name        2L
    Name: FBgn0031209, dtype: object



    Extracting data:

    >>> df.loc['FBgn0031209']['flybasename_gene']
    'Ir21a'

    Or get all names:

    >>> df['flybasename_gene']
    flybase_gene_id
    FBgn0002121         l(2)gl
    FBgn0031208        CG11023
    FBgn0031209          Ir21a
    FBgn0051973           Cda5
    Name: flybasename_gene, dtype: object

    A ValueError is raised if `values` is given together with a
    dictionary `filters`, if `values` is given without `filters`, if the
    lengths of `filters` and `values` differ, or for any other
    combination of the two.
    """

    def _as_r_filters(mapping):
        """Convert a dictionary of filters: values into a StrVector of
        filter names and a ListVector of their values."""
        # The names are taken once and reused as tags so that filters and
        # values are guaranteed to be in the same positional order.
        names = robjects.StrVector(list(mapping.keys()))
        tagged = rpy2.rlike.container.TaggedList(
            mapping.values(),
            tags=list(mapping.keys())
        )
        return names, robjects.ListVector(tagged)

    mart = r.useDataset(dataset, mart=r.useMart(mart_name))

    query = {
        'attributes': robjects.StrVector(attributes),
        'uniqueRows': unique_rows,
        'mart': mart,
    }

    # Normalise the accepted filter specifications into one dictionary;
    # None means the query is run without filters.
    filter_value_dict = None

    if isinstance(filters, dict):
        if values is not None:
            raise ValueError("`values` are already specified in the "
                             "`filters` dictionary")
        filter_value_dict = filters

    elif filters is None:
        if values is not None:
            raise ValueError("`filters` must be specified if `values` "
                             "is specified; alternatively use a dictionary "
                             " for `filters`")

    elif filters and values:
        # A flat list of values belongs to a single filter; wrap it so
        # that values is always a list of lists.
        if not isinstance(values[0], (list, tuple)):
            values = [values]

        # At this point filters is assumed to be a list or tuple.
        if len(filters) != len(values):
            raise ValueError('Length of `filters` and `values` must match')

        filter_value_dict = dict(zip(filters, values))

    else:
        raise ValueError('unhandled case')

    if filter_value_dict is not None:
        query['filters'], query['values'] = _as_r_filters(filter_value_dict)

    return rpy2_to_pandas(r.getBM(**query), index_col=0)