Example #1
0
def importFromBiomart(outfile,
                      columns,
                      biomart="ensembl",
                      dataset="hsapiens_gene_ensembl",
                      host='www.biomart.org'):
    '''download a dataset from biomart and output as a
    tab-separated table.

    The first line written to `outfile` is a header holding the output
    column names; each following line holds one row of the query
    result, with every value converted via ``str``.

    Arguments
    ---------
    outfile : string
       Filename of output file
    columns : dict
       Dictionary mapping biomart columns to columns in the output table.
    biomart : string
       Biomart name
    dataset : string
       Biomart dataset
    host : string
       Biomart host
    '''

    R.library("biomaRt")

    # fix the column order once; it is used for the query, the header
    # and the data rows.
    keys = list(columns.keys())

    # The default value for host in the biomaRt package is
    # www.biomart.org but for some reason R errors if you specify
    # host manually but then use the default - but it is fine if
    # host is anything valid apart from www.biomart.org.  So I have
    # changed this to only specify a value if the value you
    # are specifying is different to the default KB

    if host == 'www.biomart.org':
        mart = R.useMart(biomart=biomart, dataset=dataset)
    else:
        mart = R.useMart(biomart=biomart, dataset=dataset, host=host)

    result = R.getBM(attributes=keys, mart=mart)

    outf = IOTools.openFile(outfile, "w")
    # close the handle even if writing a row fails, so that the file
    # is not left open on an error.
    try:
        outf.write("\t".join([columns[x] for x in keys]) + "\n")

        for data in zip(*[result[x] for x in keys]):
            outf.write("\t".join(map(str, data)) + "\n")
    finally:
        outf.close()
Example #2
0
def importFromBiomart(outfile,
                      columns,
                      biomart="ensembl",
                      dataset="hsapiens_gene_ensembl",
                      host='www.biomart.org'):
    '''download a dataset from biomart and output as a
    tab-separated table.

    The first line written to `outfile` is a header holding the output
    column names; each following line holds one row of the query
    result, with every value converted via ``str``.

    Arguments
    ---------
    outfile : string
       Filename of output file
    columns : dict
       Dictionary mapping biomart columns to columns in the output table.
    biomart : string
       Biomart name
    dataset : string
       Biomart dataset
    host : string
       Biomart host
    '''

    R.library("biomaRt")

    # keys() is a view object on Python 3; build a real list so that a
    # plain sequence is handed to getBM and the column order is fixed
    # for the query, the header and the data rows.
    keys = list(columns.keys())

    # The default value for host in the biomaRt package is
    # www.biomart.org but for some reason R errors if you specify
    # host manually but then use the default - but it is fine if
    # host is anything valid apart from www.biomart.org.  So I have
    # changed this to only specify a value if the value you
    # are specifying is different to the default KB

    if host == 'www.biomart.org':
        mart = R.useMart(biomart=biomart, dataset=dataset)
    else:
        mart = R.useMart(biomart=biomart, dataset=dataset, host=host)

    result = R.getBM(attributes=keys, mart=mart)

    header = "\t".join([columns[x] for x in keys]) + "\n"

    outf = IOTools.openFile(outfile, "w")
    # make sure the handle is closed even when a write fails.
    try:
        outf.write(header)

        for data in zip(*[result[x] for x in keys]):
            outf.write("\t".join(map(str, data)) + "\n")
    finally:
        outf.close()
Example #3
0
def importFromBiomart(outfile,
                      columns,
                      biomart="ensembl",
                      dataset="hsapiens_gene_ensembl",
                      host='www.biomart.org'):
    '''download a dataset from biomart and output as a
    tab-separated table.

    *columns* is a dictionary mapping biomart columns to columns in
    the output table. *biomart* and *dataset* denote the database and
    dataset to get the data from. *host* is passed on to ``useMart``.

    The first line of *outfile* holds the output column names; every
    following line holds one row of the query result, with each value
    converted via ``str``.

    '''

    R.library("biomaRt")

    # keys() is a view object on Python 3; build a real list so that a
    # plain sequence is handed to getBM and the column order is fixed
    # for the query, the header and the data rows.
    keys = list(columns.keys())

    mart = R.useMart(biomart=biomart, dataset=dataset, host=host)
    result = R.getBM(attributes=keys, mart=mart)

    outf = IOTools.openFile(outfile, "w")
    # close the handle even if writing fails part-way through.
    try:
        outf.write("\t".join([columns[x] for x in keys]) + "\n")

        for data in zip(*[result[x] for x in keys]):
            outf.write("\t".join(map(str, data)) + "\n")
    finally:
        outf.close()
def biomart_iterator(columns,
                     biomart="ensembl",
                     dataset="hsapiens_gene_ensembl",
                     host='www.biomart.org'):
    '''query biomart and iterate over the rows of the result.

    *columns* is a list with the fields to obtain.

    *biomart* and *dataset* denote the database and dataset to get
    the data from, *host* the server passed to ``useMart``.

    This is a generator: nothing is queried until the first row is
    requested. Each row is yielded as a dictionary mapping the names
    in *columns* to the values of that row.
    '''

    R.library("biomaRt")

    connection = R.useMart(biomart=biomart, dataset=dataset, host=host)
    wanted = rpy2.robjects.vectors.StrVector(columns)
    table = R.getBM(attributes=wanted, mart=connection)

    # table is a dataframe; table.rx(name) is again a dataframe and
    # its first element is the column vector.
    vectors = []
    for name in columns:
        vectors.append(table.rx(name)[0])

    # transpose the column vectors into rows
    for row in zip(*vectors):
        yield dict(zip(columns, row))
Example #5
0
def biomart_iterator(columns,
                     biomart="ensembl",
                     dataset="hsapiens_gene_ensembl",
                     host='www.biomart.org'):
    '''iterate over the rows of a biomart query.

    *columns* is a list with the fields to obtain.

    *biomart*, *dataset* and *host* select the database, the dataset
    and the server that are passed to ``useMart``.

    The function is a generator; the query is only run once the
    first row is requested. Every row is a dictionary with the
    entries of *columns* as keys.
    '''

    R.library("biomaRt")

    mart = R.useMart(biomart=biomart, dataset=dataset, host=host)
    attributes = rpy2.robjects.vectors.StrVector(columns)
    frame = R.getBM(attributes=attributes, mart=mart)

    # frame.rx(field) yields a one-column dataframe, [0] extracts the
    # vector stored in it.
    per_column = [frame.rx(field)[0] for field in columns]

    for values in zip(*per_column):
        record = dict(zip(columns, values))
        yield record
Example #6
0
def importKEGGAssignments(outfile, mart, host, biomart_dataset):
    ''' import the KEGG annotations from the R KEGG.db
    annotations package. Note that since KEGG is no longer
    publicly available, this is not up-to-date and may be removed
    from bioconductor in future releases.

    Entrez gene ids are mapped to ensembl gene ids through biomart;
    KEGG.db entries that are not integer entrez ids, or that have no
    ensembl gene in the biomart mapping, are skipped.

    Arguments
    ---------
    outfile : string
       Filename of the output file. A tab-separated table with the
       columns ontology, gene_id, kegg_ID, kegg_name and evidence
       is written to it.
    mart : string
       Name of the biomart, passed as ``biomart`` to ``useMart``.
    host : string
       Biomart host.
    biomart_dataset : string
       Biomart dataset.
    '''

    R.library("KEGG.db")
    R.library("biomaRt")

    E.info("getting entrez to ensembl mapping ...")
    mart = R.useMart(biomart=mart,
                     host=host,
                     path="/biomart/martservice",
                     dataset=biomart_dataset)

    entrez2ensembl = R.getBM(attributes=ro.StrVector(
        ["ensembl_gene_id", "entrezgene"]),
                             mart=mart)

    entrez = entrez2ensembl.rx2("entrezgene")
    ensembl = entrez2ensembl.rx2("ensembl_gene_id")
    entrez2ensembl = dict(zip(entrez, ensembl))

    E.info("Done")

    E.info("getting entrez to kegg mapping ... ")
    entrez2path = R('as.list(KEGGEXTID2PATHID)')
    E.info("Done")

    E.info("Getting KEGG names")
    pathnames = R('as.list(KEGGPATHID2NAME)')
    pathid2name = dict(zip(pathnames.names, R.unlist(pathnames)))
    E.info("Done")

    # pathway ids are letters followed by digits; only the digits are
    # used as key into pathid2name. Compiled once outside the loop.
    pathid_regex = re.compile(r"[a-z]+([0-9]+)")

    outf = IOTools.openFile(outfile, "w")
    # the handle used to be left open; close it explicitly so that all
    # rows are flushed, also when an error interrupts the loop.
    try:
        outf.write("ontology\tgene_id\tkegg_ID\tkegg_name\tevidence\n")

        for gene in entrez2path.names:

            # only integer (entrez) ids can be looked up in the mapping
            try:
                gene = int(gene)
            except ValueError:
                continue

            if gene not in entrez2ensembl:
                continue
            ensid = entrez2ensembl[gene]

            for pathway in entrez2path.rx2(str(gene)):
                pathid = pathid_regex.match(pathway).groups()[0]
                pathname = pathid2name[pathid]
                outf.write("\t".join(["kegg", ensid,
                                      str(pathway), pathname, "NA"]) + "\n")
    finally:
        outf.close()
Example #7
0
def importKEGGAssignments(outfile, mart, host, biomart_dataset):
    ''' import the KEGG annotations from the R KEGG.db
    annotations package. Note that since KEGG is no longer
    publicly available, this is not up-to-date and may be removed
    from bioconductor in future releases.

    Entrez gene ids are mapped to ensembl gene ids through biomart;
    KEGG.db entries that are not integer entrez ids, or that have no
    ensembl gene in the biomart mapping, are skipped.

    Arguments
    ---------
    outfile : string
       Filename of the output file. A tab-separated table with the
       columns ontology, gene_id, kegg_ID, kegg_name and evidence
       is written to it.
    mart : string
       Name of the biomart, passed as ``biomart`` to ``useMart``.
    host : string
       Biomart host.
    biomart_dataset : string
       Biomart dataset.
    '''

    R.library("KEGG.db")
    R.library("biomaRt")

    E.info("getting entrez to ensembl mapping ...")
    mart = R.useMart(biomart=mart,
                     host=host,
                     path="/biomart/martservice",
                     dataset=biomart_dataset)

    entrez2ensembl = R.getBM(attributes=ro.StrVector(["ensembl_gene_id", "entrezgene"]),
                             mart=mart)

    entrez = entrez2ensembl.rx2("entrezgene")
    ensembl = entrez2ensembl.rx2("ensembl_gene_id")
    entrez2ensembl = dict(zip(entrez, ensembl))

    E.info("Done")

    E.info("getting entrez to kegg mapping ... ")
    entrez2path = R('as.list(KEGGEXTID2PATHID)')
    E.info("Done")

    E.info("Getting KEGG names")
    pathnames = R('as.list(KEGGPATHID2NAME)')
    pathid2name = dict(zip(pathnames.names, R.unlist(pathnames)))
    E.info("Done")

    # pathway ids are letters followed by digits; only the digits are
    # used as key into pathid2name. Compiled once outside the loop.
    pathid_regex = re.compile(r"[a-z]+([0-9]+)")

    outf = IOTools.openFile(outfile, "w")
    # the handle used to be left open; close it explicitly so that all
    # rows are flushed, also when an error interrupts the loop.
    try:
        outf.write("ontology\tgene_id\tkegg_ID\tkegg_name\tevidence\n")

        for gene in entrez2path.names:

            # only integer (entrez) ids can be looked up in the mapping
            try:
                gene = int(gene)
            except ValueError:
                continue

            if gene not in entrez2ensembl:
                continue
            ensid = entrez2ensembl[gene]

            for pathway in entrez2path.rx2(str(gene)):
                pathid = pathid_regex.match(pathway).groups()[0]
                pathname = pathid2name[pathid]
                outf.write(
                    "\t".join(["kegg", ensid, str(pathway), pathname, "NA"]) + "\n")
    finally:
        outf.close()
Example #8
0
def biomart_iterator(columns,
                     biomart="ensembl",
                     dataset="hsapiens_gene_ensembl",
                     host='www.biomart.org',
                     path="/biomart/martservice",
                     filters=None,
                     values=None,
                     archive=False):
    '''query biomart and iterate over the rows of the result.

    *columns* is a list with field to obtain.

    *biomart* and *dataset* denote the
    database and dataset to get the data from. *host*, *path* and
    *archive* are passed on to ``useMart``.

    *filters* is an optional list of filter names and *values* the
    corresponding filter values; an empty string is passed to
    ``getBM`` for whichever of the two is not given.

    returns a iterator over rows. Each row is a dictionary mapping
    the names in *columns* to the values of that row.
    '''

    R.library("biomaRt")

    mart = R.useMart(biomart=biomart,
                     dataset=dataset,
                     host=host,
                     path=path,
                     archive=archive)

    if filters is not None:
        # StrVector is an rpy2 class, not an attribute of the R
        # interface object; use the same constructor as for columns.
        filter_names = rpy2.robjects.vectors.StrVector(filters)
    else:
        filter_names = ""

    if values is not None:
        filter_values = values
    else:
        filter_values = ""

    # result is a dataframe
    result = R.getBM(
        attributes=rpy2.robjects.vectors.StrVector(columns),
        filters=filter_names,
        values=filter_values,
        mart=mart)

    # access via result.rx was broken in rpy2 2.4.2, thus try
    # numeric access
    assert tuple(result.colnames) == tuple(columns),\
        "colnames in dataframe: %s different from expected: %s" % \
        (str(tuple(result.colnames)), tuple(columns))

    for data in zip(*[result[x] for x in range(len(columns))]):
        yield dict(zip(columns, data))
Example #9
0
def list_filters(mart_name, dataset):
    """
    Return the filters available for a dataset of a mart, converted
    with `rpy2_to_pandas`.

    >>> mart_name = 'ensembl'
    >>> dataset = 'dmelanogaster_gene_ensembl'
    >>> list_filters(mart_name, dataset)[:3]
                  name      description
    0  chromosome_name  Chromosome name
    1            start  Gene Start (bp)
    2              end    Gene End (bp)
    <BLANKLINE>
    [3 rows x 2 columns]
    """
    mart = r.useMart(mart_name)
    selected = r.useDataset(dataset, mart=mart)
    available = r.listFilters(selected)
    return rpy2_to_pandas(available)
Example #10
0
def list_attributes(mart_name, dataset):
    """
    Return the attributes available for a dataset of a mart, converted
    with `rpy2_to_pandas`.

    >>> mart_name = 'ensembl'
    >>> dataset = 'dmelanogaster_gene_ensembl'
    >>> list_attributes(mart_name, dataset)[:3]
                        name            description
    0        ensembl_gene_id        Ensembl Gene ID
    1  ensembl_transcript_id  Ensembl Transcript ID
    2     ensembl_peptide_id     Ensembl Protein ID
    <BLANKLINE>
    [3 rows x 2 columns]
    """
    mart = r.useMart(mart_name)
    selected = r.useDataset(dataset, mart=mart)
    available = r.listAttributes(selected)
    return rpy2_to_pandas(available)
Example #11
0
def list_datasets(mart_name, verbose=False):
    """
    Return the datasets offered by a mart, converted with
    `rpy2_to_pandas`. `verbose` is handed through to `listDatasets`.

    >>> list_datasets('ensembl').ix[6:7]
                      dataset                          description  \\
    6  csavignyi_gene_ensembl       Ciona savignyi genes (CSAV2.0)   
    7     fcatus_gene_ensembl  Felis catus genes (Felis_catus_6.2)   
    <BLANKLINE>
               version  
    6          CSAV2.0  
    7  Felis_catus_6.2  
    <BLANKLINE>
    [2 rows x 3 columns]
    """
    mart = r.useMart(mart_name)
    datasets = r.listDatasets(mart=mart, verbose=verbose)
    return rpy2_to_pandas(datasets)
def init_biomaRt():
    """Load the biomaRt R package and connect to the ensembl mart once.

    Does nothing if the module-level ``__biomaRt`` is already set.
    Otherwise biomaRt is imported; if that fails, an installation via
    Bioconductor is attempted and the import is retried. On success
    the imported package is stored in ``__biomaRt`` and a mart for the
    dataset named by ``__mart_dataset`` is stored in ``__mart``.

    If the installation attempt fails as well, the manual installation
    hint is printed and the error of that attempt is re-raised.
    """

    global __biomaRt
    global __mart

    # already initialised, nothing to do
    if __biomaRt is not None:
        return

    # single-argument print(...) calls behave the same on Python 2
    # and Python 3.
    try:
        print("Importing biomaRt ...")
        biomaRt = importr("biomaRt")
    except Exception:
        print("It looks like biomaRt is not installed. Trying to install biomaRt via "
              "Bioconductor...")
        try:
            R.source("http://bioconductor.org/biocLite.R")
            R.biocLite("biomaRt")
            biomaRt = importr("biomaRt")
        except Exception:
            print("Problem installing biomaRt from Bioconductor!")
            print("Please install manually from: "
                  "http://www.bioconductor.org/packages/release/bioc/html/biomaRt.html")
            # without the package there is nothing to store below;
            # surface the real error instead of failing later on an
            # unbound local name.
            raise

    __biomaRt = biomaRt
    __mart = R.useMart(biomart="ensembl", dataset=__mart_dataset)
Example #13
0
def biomart_iterator(columns,
                     biomart="ensembl",
                     dataset="hsapiens_gene_ensembl",
                     host='www.biomart.org',
                     path="/biomart/martservice",
                     filters=None,
                     values=None,
                     archive=False):
    '''query biomart and iterate over the rows of the result.

    This is a generator; the query is only run once the first row
    is requested.

    Arguments
    ---------
    columns : list
       List of fields to obtain.
    biomart : string
       Biomart name
    dataset : string
       Biomart dataset
    host : string
       Biomart host
    path : string
       Path passed on to ``useMart``
    filters : list
       List of filter to use
    values : list
       Values of the filters
    archive : bool
       If True, use archived version

    Returns
    -------
    iterator
       Iterator over rows in biomart database. Each row
       is dictionary mapping column names to values.

    '''

    R.library("biomaRt")

    # The default value for host in the biomaRt package is
    # www.biomart.org but for some reason R errors if you specify
    # host manually but then use the default - but it is fine if
    # host is anything valid apart from www.biomart.org.  So host is
    # only handed to useMart when it differs from that default (KB).
    mart_options = dict(biomart=biomart,
                        dataset=dataset,
                        path=path,
                        archive=archive)
    if host != 'www.biomart.org':
        mart_options['host'] = host
    mart = R.useMart(**mart_options)

    # an empty string stands in for a missing filter name / value
    filter_names = "" if filters is None \
        else rpy2.robjects.vectors.StrVector(filters)
    filter_values = "" if values is None else values

    # result is a dataframe
    attributes = rpy2.robjects.vectors.StrVector(columns)
    result = R.getBM(attributes=attributes,
                     filters=filter_names,
                     values=filter_values,
                     mart=mart)

    # access via result.rx was broken in rpy2 2.4.2, thus the columns
    # are accessed by position. Make sure that the positions agree
    # with the requested column names.
    observed = tuple(result.colnames)
    expected = tuple(columns)
    assert observed == expected,\
        "colnames in dataframe: %s different from expected: %s" % \
        (str(observed), expected)

    column_vectors = []
    for position in range(len(columns)):
        column_vectors.append(result[position])

    for row in zip(*column_vectors):
        yield dict(zip(columns, row))
Example #14
0
def get_fungi_mart(ds):
    """Return the result of ``R.useMart`` for dataset `ds` of the
    'fungi_mart' biomart on the fungi.ensembl.org host."""
    mart_options = {
        "biomart": 'fungi_mart',
        "host": "https://fungi.ensembl.org/",
        "dataset": ds,
    }
    return R.useMart(**mart_options)
Example #15
0
def biomart_iterator(columns,
                     biomart="ensembl",
                     dataset="hsapiens_gene_ensembl",
                     host='www.biomart.org',
                     path="/biomart/martservice",
                     filters=None,
                     values=None,
                     archive=False):
    '''iterate over the rows returned by a biomart query.

    The function is a generator, so biomart is only contacted once
    the first row is requested.

    Arguments
    ---------
    columns : list
       List of fields to obtain.
    biomart : string
       Biomart name
    dataset : string
       Biomart dataset
    host : string
       Biomart host
    path : string
       Path passed on to ``useMart``
    filters : list
       List of filter to use
    values : list
       Values of the filters
    archive : bool
       If True, use archived version

    Returns
    -------
    iterator
       Iterator over rows in biomart database. Each row
       is dictionary mapping column names to values.

    '''

    R.library("biomaRt")

    # The default value for host in the biomaRt package is
    # www.biomart.org but for some reason R errors if you specify
    # host manually but then use the default - but it is fine if
    # host is anything valid apart from www.biomart.org.  So host is
    # only specified if it is different to the default (KB).
    host_is_default = host == 'www.biomart.org'
    if not host_is_default:
        mart = R.useMart(biomart=biomart, dataset=dataset, host=host,
                         path=path, archive=archive)
    else:
        mart = R.useMart(biomart=biomart, dataset=dataset, path=path,
                         archive=archive)

    # empty strings are passed when no filters / values are given
    filter_names = ""
    if filters is not None:
        filter_names = rpy2.robjects.vectors.StrVector(filters)

    filter_values = ""
    if values is not None:
        filter_values = values

    # result is a dataframe
    query = dict(
        attributes=rpy2.robjects.vectors.StrVector(columns),
        filters=filter_names,
        values=filter_values,
        mart=mart)
    result = R.getBM(**query)

    # access via result.rx was broken in rpy2 2.4.2, thus try
    # numeric access; check first that the column order is as requested
    assert tuple(result.colnames) == tuple(columns),\
        "colnames in dataframe: %s different from expected: %s" % \
        (str(tuple(result.colnames)), tuple(columns))

    by_position = [result[index] for index in range(len(columns))]
    for record in zip(*by_position):
        yield dict(zip(columns, record))
Example #16
0
def make_lookup(mart_name, dataset, attributes, filters=None, values=None,
                unique_rows=True):
    """
    Query `dataset` of mart `mart_name` for the given list of
    `attributes` and return the result converted with `rpy2_to_pandas`,
    using the first column as index.

    In R, filters is a character vector, and values is either a single
    character vector (if only one filter provided) or a list of character
    vectors.

    Here, `filters` may either be a list of filter names accompanied by
    `values`, or a dictionary mapping each filter name to its values.
    A ValueError is raised if `values` is given together with a
    dictionary, if `values` is given without `filters`, if the lengths
    of `filters` and `values` differ, or if the combination of the two
    is not handled (e.g. `filters` without `values`).


    >>> mart_name = 'ensembl'
    >>> dataset = 'dmelanogaster_gene_ensembl'
    >>> filters = ['flybase_gene_id', 'chromosome_name']
    >>> attributes = ['flybase_gene_id', 'flybasename_gene', 'chromosome_name']
    >>> values = [['FBgn0031208', 'FBgn0002121', 'FBgn0031209', 'FBgn0051973'], ['2L']]
    >>> df = make_lookup(
    ... mart_name=mart_name,
    ... dataset=dataset,
    ... attributes=attributes,
    ... filters=filters,
    ... values=values)

    Alternatively, make a dictionary of filters: values, in which case you
    don't need to provide `values` separately:

    >>> filters = {
    ... 'flybase_gene_id': ['FBgn0031208', 'FBgn0002121', 'FBgn0031209', 'FBgn0051973'],
    ... 'chromosome_name': ['2L']}

    >>> df2 = make_lookup(
    ... mart_name=mart_name,
    ... dataset=dataset,
    ... attributes=attributes,
    ... filters=filters)

    Confirm that both methods yield identical results:

    >>> assert np.all(df.values == df2.values)

    Check results:

    >>> df.head()
                    flybasename_gene chromosome_name
    flybase_gene_id                                 
    FBgn0002121               l(2)gl              2L
    FBgn0031208              CG11023              2L
    FBgn0031209                Ir21a              2L
    FBgn0051973                 Cda5              2L
    <BLANKLINE>
    [4 rows x 2 columns]


    Indexing by gene ID (or whatever was the first attribute provided):

    >>> df.ix['FBgn0031209']
    flybasename_gene    Ir21a
    chromosome_name        2L
    Name: FBgn0031209, dtype: object



    Extracting data:

    >>> df.ix['FBgn0031209']['flybasename_gene']
    'Ir21a'

    Or get all names:

    >>> df['flybasename_gene']
    flybase_gene_id
    FBgn0002121         l(2)gl
    FBgn0031208        CG11023
    FBgn0031209          Ir21a
    FBgn0051973           Cda5
    Name: flybasename_gene, dtype: object

    """
    def _as_r_filters(mapping):
        """`mapping` is a dictionary of filters: values.  Returns a
        StrVector of the filter names and a ListVector of the values,
        tagged with the filter names."""
        # A ListVector could be built from the dict directly; going
        # through a TaggedList keeps names and values in the same
        # positional order.
        names = list(mapping.keys())
        r_names = robjects.StrVector(names)
        tagged = rpy2.rlike.container.TaggedList(mapping.values(), tags=names)
        return r_names, robjects.ListVector(tagged)

    mart = r.useDataset(dataset, mart=r.useMart(mart_name))

    query = {
        'attributes': robjects.StrVector(attributes),
        'uniqueRows': unique_rows,
        'mart': mart,
    }

    # filter name -> values; stays None when the query is unfiltered
    filter_value_dict = None

    if isinstance(filters, dict):
        if values is not None:
            raise ValueError("`values` are already specified in the "
                             "`filters` dictionary")
        filter_value_dict = filters

    elif filters is None:
        if values is not None:
            raise ValueError("`filters` must be specified if `values` "
                             "is specified; alternatively use a dictionary "
                             " for `filters`")

    elif filters and values:
        # a flat list of values belongs to a single filter; wrap it so
        # that values is always a list of lists
        if not isinstance(values[0], (list, tuple)):
            values = [values]

        # from here on filters is assumed to be a list or tuple
        if len(filters) != len(values):
            raise ValueError('Length of `filters` and `values` must match')

        filter_value_dict = dict(zip(filters, values))

    else:
        raise ValueError('unhandled case')

    if filter_value_dict is not None:
        query['filters'], query['values'] = _as_r_filters(filter_value_dict)

    results = r.getBM(**query)
    return rpy2_to_pandas(results, index_col=0)