def importFromBiomart(outfile,
                      columns,
                      biomart="ensembl",
                      dataset="hsapiens_gene_ensembl",
                      host='www.biomart.org'):
    '''download a dataset from biomart and output as a tab-separated table.

    The header line of the table holds the output column names (the
    values of `columns`); each following line holds one row of the
    biomart result, with every value converted via ``str``.

    Arguments
    ---------
    outfile : string
        Filename of output file
    columns : dict
        Dictionary mapping biomart columns to columns in the output
        table.
    biomart : string
        Biomart name
    dataset : string
        Biomart dataset
    host : string
        Biomart host
    '''

    R.library("biomaRt")

    # Fix the column order once; it is used for the query, the header
    # and the row extraction, which must all agree.
    keys = list(columns.keys())

    # The default value for host in the biomaRt package is
    # www.biomart.org but for some reason R errors if you specify
    # host manually but then use the default - but it is fine if
    # host is anything valid apart from www.biomart.org.  So I have
    # changed this to only specify a value if the value you
    # are specifying is different to the default KB
    if host == 'www.biomart.org':
        mart = R.useMart(biomart=biomart, dataset=dataset)
    else:
        mart = R.useMart(biomart=biomart, dataset=dataset, host=host)

    result = R.getBM(attributes=keys, mart=mart)

    outf = IOTools.openFile(outfile, "w")
    # try/finally so the handle is released even if a write (or the
    # column lookup in `result`) fails half-way through.
    try:
        outf.write("\t".join([columns[x] for x in keys]) + "\n")

        # transpose the per-column vectors into rows
        for data in zip(*[result[x] for x in keys]):
            outf.write("\t".join(map(str, data)) + "\n")
    finally:
        outf.close()
def importFromBiomart(outfile,
                      columns,
                      biomart="ensembl",
                      dataset="hsapiens_gene_ensembl",
                      host='www.biomart.org'):
    '''download a dataset from biomart and output as a tab-separated table.

    The header line of the table holds the output column names (the
    values of `columns`); each following line holds one row of the
    biomart result, with every value converted via ``str``.

    Arguments
    ---------
    outfile : string
        Filename of output file
    columns : dict
        Dictionary mapping biomart columns to columns in the output
        table.
    biomart : string
        Biomart name
    dataset : string
        Biomart dataset
    host : string
        Biomart host
    '''

    R.library("biomaRt")

    # Materialise the keys as a list: on Python 3 ``dict.keys()`` is a
    # view object, and the same ordered sequence is needed for the R
    # query, the header line and the row extraction.
    keys = list(columns)

    # The default value for host in the biomaRt package is
    # www.biomart.org but for some reason R errors if you specify
    # host manually but then use the default - but it is fine if
    # host is anything valid apart from www.biomart.org.  So I have
    # changed this to only specify a value if the value you
    # are specifying is different to the default KB
    if host == 'www.biomart.org':
        mart = R.useMart(biomart=biomart, dataset=dataset)
    else:
        mart = R.useMart(biomart=biomart, dataset=dataset, host=host)

    result = R.getBM(attributes=keys, mart=mart)

    outf = IOTools.openFile(outfile, "w")
    # try/finally so the handle is always released, even when writing
    # fails part-way.
    try:
        header = [columns[key] for key in keys]
        outf.write("\t".join(header) + "\n")

        # transpose the per-column vectors into rows
        for data in zip(*[result[key] for key in keys]):
            outf.write("\t".join(map(str, data)) + "\n")
    finally:
        outf.close()
def importFromBiomart(outfile,
                      columns,
                      biomart="ensembl",
                      dataset="hsapiens_gene_ensembl",
                      host='www.biomart.org'):
    '''download a dataset from biomart and output as a tab-separated table.

    *outfile* is the filename of the output table.

    *columns* is a dictionary mapping biomart columns to columns in
    the output tables.

    *biomart* and *dataset* denote the database and dataset to get
    the data from. *host* is passed on to ``useMart`` unchanged.
    '''

    R.library("biomaRt")

    # Materialise the keys as a list: on Python 3 ``dict.keys()`` is a
    # view object, and the same ordered sequence is needed for the R
    # query, the header line and the row extraction.
    keys = list(columns)

    mart = R.useMart(biomart=biomart, dataset=dataset, host=host)
    result = R.getBM(attributes=keys, mart=mart)

    outf = IOTools.openFile(outfile, "w")
    # try/finally so the handle is always released, even when writing
    # fails part-way.
    try:
        outf.write("\t".join([columns[x] for x in keys]) + "\n")

        # transpose the per-column vectors into rows
        for data in zip(*[result[x] for x in keys]):
            outf.write("\t".join(map(str, data)) + "\n")
    finally:
        outf.close()
def biomart_iterator(columns,
                     biomart="ensembl",
                     dataset="hsapiens_gene_ensembl",
                     host='www.biomart.org'):
    '''query biomart and iterate over the rows of the result.

    *columns* is a list with the fields to obtain.

    *biomart*, *dataset* and *host* select the database, dataset and
    server to get the data from.

    This is a generator: each item is a dictionary mapping the names
    in *columns* to the values of one row.
    '''

    R.library("biomaRt")

    mart = R.useMart(biomart=biomart, dataset=dataset, host=host)
    attributes = rpy2.robjects.vectors.StrVector(columns)
    result = R.getBM(attributes=attributes, mart=mart)

    # result is a dataframe; rx() returns a dataframe as well, so
    # rx(name)[0] is needed to get at the column vector itself.
    column_vectors = [result.rx(name)[0] for name in columns]

    # transpose columns into rows
    for row in zip(*column_vectors):
        yield dict(zip(columns, row))
def biomart_iterator(columns,
                     biomart="ensembl",
                     dataset="hsapiens_gene_ensembl",
                     host='www.biomart.org'):
    '''fetch *columns* from a biomart dataset and yield the rows.

    *columns* is a list with the fields to obtain.

    *biomart* and *dataset* denote the database and dataset to get
    the data from; *host* is the server handed to ``useMart``.

    Yields one dictionary per row, keyed by the entries of *columns*.
    '''

    R.library("biomaRt")

    connection = R.useMart(biomart=biomart, dataset=dataset, host=host)
    result = R.getBM(
        attributes=rpy2.robjects.vectors.StrVector(columns),
        mart=connection)

    # result is a dataframe.
    # rx returns a dataframe, rx()[0] returns the vector inside it.
    per_column = []
    for field in columns:
        per_column.append(result.rx(field)[0])

    for values in zip(*per_column):
        yield dict(zip(columns, values))
def importKEGGAssignments(outfile, mart, host, biomart_dataset):
    '''import the KEGG annotations from the R KEGG.db annotations package.

    Note that since KEGG is no longer publicly available, this is not
    up-to-date and may be removed from bioconductor in future
    releases.

    A tab-separated table with the columns ``ontology``, ``gene_id``,
    ``kegg_ID``, ``kegg_name`` and ``evidence`` is written to
    `outfile`. Only genes whose KEGG external id parses as an integer
    and is present in the entrez-to-ensembl mapping are output.

    Arguments
    ---------
    outfile : string
        Filename of output file
    mart : string
        Biomart name, passed to ``useMart`` as ``biomart``.
    host : string
        Biomart host
    biomart_dataset : string
        Biomart dataset
    '''

    R.library("KEGG.db")
    R.library("biomaRt")

    E.info("getting entrez to ensembl mapping ...")
    mart = R.useMart(biomart=mart,
                     host=host,
                     path="/biomart/martservice",
                     dataset=biomart_dataset)

    entrez2ensembl = R.getBM(
        attributes=ro.StrVector(["ensembl_gene_id", "entrezgene"]),
        mart=mart)

    entrez = entrez2ensembl.rx2("entrezgene")
    ensembl = entrez2ensembl.rx2("ensembl_gene_id")
    entrez2ensembl = dict(zip(entrez, ensembl))
    E.info("Done")

    E.info("getting entrez to kegg mapping ... ")
    entrez2path = R('as.list(KEGGEXTID2PATHID)')
    E.info("Done")

    E.info("Getting KEGG names")
    pathnames = R('as.list(KEGGPATHID2NAME)')
    pathid2name = dict(zip(pathnames.names, R.unlist(pathnames)))
    E.info("Done")

    outf = IOTools.openFile(outfile, "w")
    # The original never closed the file; try/finally guarantees the
    # output is flushed and released, also when a lookup below fails.
    try:
        outf.write("ontology\tgene_id\tkegg_ID\tkegg_name\tevidence\n")

        for gene in entrez2path.names:

            # skip identifiers that are not integers
            try:
                gene = int(gene)
            except ValueError:
                continue

            # skip genes without an ensembl identifier
            if gene in entrez2ensembl:
                ensid = entrez2ensembl[gene]
            else:
                continue

            for pathway in entrez2path.rx2(str(gene)):
                # the numeric part of the pathway id is the key into
                # the id-to-name table
                pathid = re.match("[a-z]+([0-9]+)", pathway).groups()[0]
                pathname = pathid2name[pathid]
                outf.write(
                    "\t".join(["kegg", ensid, str(pathway),
                               pathname, "NA"]) + "\n")
    finally:
        outf.close()
def importKEGGAssignments(outfile, mart, host, biomart_dataset):
    '''import the KEGG annotations from the R KEGG.db annotations package.

    Note that since KEGG is no longer publicly available, this is not
    up-to-date and may be removed from bioconductor in future
    releases.

    A tab-separated table with the columns ``ontology``, ``gene_id``,
    ``kegg_ID``, ``kegg_name`` and ``evidence`` is written to
    `outfile`. Only genes whose KEGG external id parses as an integer
    and is present in the entrez-to-ensembl mapping are output.

    Arguments
    ---------
    outfile : string
        Filename of output file
    mart : string
        Biomart name, passed to ``useMart`` as ``biomart``.
    host : string
        Biomart host
    biomart_dataset : string
        Biomart dataset
    '''

    R.library("KEGG.db")
    R.library("biomaRt")

    E.info("getting entrez to ensembl mapping ...")
    mart = R.useMart(biomart=mart,
                     host=host,
                     path="/biomart/martservice",
                     dataset=biomart_dataset)

    entrez2ensembl = R.getBM(
        attributes=ro.StrVector(["ensembl_gene_id", "entrezgene"]),
        mart=mart)

    entrez = entrez2ensembl.rx2("entrezgene")
    ensembl = entrez2ensembl.rx2("ensembl_gene_id")
    entrez2ensembl = dict(zip(entrez, ensembl))
    E.info("Done")

    E.info("getting entrez to kegg mapping ... ")
    entrez2path = R('as.list(KEGGEXTID2PATHID)')
    E.info("Done")

    E.info("Getting KEGG names")
    pathnames = R('as.list(KEGGPATHID2NAME)')
    pathid2name = dict(zip(pathnames.names, R.unlist(pathnames)))
    E.info("Done")

    # compiled once instead of once per pathway
    pathway_pattern = re.compile("[a-z]+([0-9]+)")

    outf = IOTools.openFile(outfile, "w")
    # The original never closed the file; try/finally guarantees the
    # output is flushed and released, also when a lookup below fails.
    try:
        outf.write("ontology\tgene_id\tkegg_ID\tkegg_name\tevidence\n")

        for gene in entrez2path.names:

            # skip identifiers that are not integers
            try:
                gene = int(gene)
            except ValueError:
                continue

            # skip genes without an ensembl identifier
            if gene not in entrez2ensembl:
                continue
            ensid = entrez2ensembl[gene]

            for pathway in entrez2path.rx2(str(gene)):
                # the numeric part of the pathway id is the key into
                # the id-to-name table
                pathid = pathway_pattern.match(pathway).groups()[0]
                pathname = pathid2name[pathid]
                outf.write(
                    "\t".join(["kegg", ensid, str(pathway),
                               pathname, "NA"]) + "\n")
    finally:
        outf.close()
def biomart_iterator(columns,
                     biomart="ensembl",
                     dataset="hsapiens_gene_ensembl",
                     host='www.biomart.org',
                     path="/biomart/martservice",
                     filters=None,
                     values=None,
                     archive=False):
    '''download a dataset from biomart and iterate over its rows.

    *columns* is a list with the fields to obtain.

    *biomart* and *dataset* denote the database and dataset to get
    the data from. *host*, *path* and *archive* are passed on to
    ``useMart``.

    *filters* is an optional list of filter names and *values* the
    corresponding filter values; when omitted, empty strings are
    passed to ``getBM``.

    returns an iterator over rows. Each row is a dictionary mapping
    the names in *columns* to values.
    '''

    R.library("biomaRt")

    mart = R.useMart(biomart=biomart,
                     dataset=dataset,
                     host=host,
                     path=path,
                     archive=archive)

    if filters is not None:
        # StrVector is the rpy2 vector class; `R` is used throughout
        # this function as the interface for calling R functions, so
        # `R.StrVector` would not refer to that class.
        filter_names = rpy2.robjects.vectors.StrVector(filters)
    else:
        filter_names = ""

    if values is not None:
        filter_values = values
    else:
        filter_values = ""

    # result is a dataframe
    result = R.getBM(
        attributes=rpy2.robjects.vectors.StrVector(columns),
        filters=filter_names,
        values=filter_values,
        mart=mart)

    # access via result.rx was broken in rpy2 2.4.2, thus try
    # numeric access. Positional access is only valid if the columns
    # come back in the requested order, hence the check.
    assert tuple(result.colnames) == tuple(columns),\
        "colnames in dataframe: %s different from expected: %s" % \
        (str(tuple(result.colnames)), tuple(columns))

    for data in zip(*[result[x] for x in range(len(columns))]):
        yield dict(zip(columns, data))
def list_filters(mart_name, dataset):
    """
    Return the filters available for a mart name and dataset, as
    converted by `rpy2_to_pandas`.

    >>> mart_name = 'ensembl'
    >>> dataset = 'dmelanogaster_gene_ensembl'
    >>> list_filters(mart_name, dataset)[:3]
                  name      description
    0  chromosome_name  Chromosome name
    1            start  Gene Start (bp)
    2              end    Gene End (bp)
    <BLANKLINE>
    [3 rows x 2 columns]
    """
    # connect to the mart first, then select the dataset on it
    mart = r.useMart(mart_name)
    selected = r.useDataset(dataset, mart=mart)
    filters = r.listFilters(selected)
    return rpy2_to_pandas(filters)
def list_attributes(mart_name, dataset):
    """
    Returns a pandas.DataFrame listing the attributes available for a
    mart name and dataset.

    >>> mart_name = 'ensembl'
    >>> dataset = 'dmelanogaster_gene_ensembl'
    >>> list_attributes(mart_name, dataset)[:3]
                        name            description
    0        ensembl_gene_id        Ensembl Gene ID
    1  ensembl_transcript_id  Ensembl Transcript ID
    2     ensembl_peptide_id     Ensembl Protein ID
    <BLANKLINE>
    [3 rows x 2 columns]
    """
    # connect to the mart first, then select the dataset on it
    mart = r.useMart(mart_name)
    selected = r.useDataset(dataset, mart=mart)
    attributes = r.listAttributes(selected)
    return rpy2_to_pandas(attributes)
def list_datasets(mart_name, verbose=False):
    """
    Returns a pandas.DataFrame listing the datasets in mart name.

    `verbose` is handed through to R's ``listDatasets``.

    >>> list_datasets('ensembl').ix[6:7]
                      dataset                          description  \\
    6  csavignyi_gene_ensembl       Ciona savignyi genes (CSAV2.0)
    7     fcatus_gene_ensembl  Felis catus genes (Felis_catus_6.2)
    <BLANKLINE>
               version
    6          CSAV2.0
    7  Felis_catus_6.2
    <BLANKLINE>
    [2 rows x 3 columns]
    """
    mart = r.useMart(mart_name)
    datasets = r.listDatasets(mart=mart, verbose=verbose)
    return rpy2_to_pandas(datasets)
def init_biomaRt():
    """Import biomaRt once and set up the module-level mart.

    Does nothing if the module-level ``__biomaRt`` is already set.
    Otherwise biomaRt is imported via ``importr``; if that fails, an
    installation through Bioconductor's ``biocLite`` is attempted and
    the import is retried. On success ``__biomaRt`` holds the imported
    package and ``__mart`` an "ensembl" mart for ``__mart_dataset``.

    Raises
    ------
    Exception
        Whatever the installation / second import raised, after
        printing instructions for a manual install.
    """
    global __biomaRt
    global __mart

    # already initialised - nothing to do
    if __biomaRt is not None:
        return

    # print(...) with a single argument behaves the same on
    # Python 2 and Python 3.
    try:
        print("Importing biomaRt ...")
        biomaRt = importr("biomaRt")
    except Exception:
        print("It looks like biomaRt is not installed. Trying to install "
              "biomaRt via Bioconductor...")
        try:
            R.source("http://bioconductor.org/biocLite.R")
            R.biocLite("biomaRt")
            biomaRt = importr("biomaRt")
        except Exception:
            print("Problem installing biomaRt from Bioconductor!")
            print("Please install manually from: "
                  "http://www.bioconductor.org/packages/release/bioc/html/biomaRt.html")
            # Without this the code fell through to the assignment
            # below with `biomaRt` unbound and died with an unrelated
            # UnboundLocalError; surface the real failure instead.
            raise

    __biomaRt = biomaRt
    __mart = R.useMart(biomart="ensembl", dataset=__mart_dataset)
def biomart_iterator(columns,
                     biomart="ensembl",
                     dataset="hsapiens_gene_ensembl",
                     host='www.biomart.org',
                     path="/biomart/martservice",
                     filters=None,
                     values=None,
                     archive=False):
    '''query a biomart dataset and iterate over the resulting rows.

    Arguments
    ---------
    columns : list
        List of fields to obtain.
    biomart : string
        Biomart name
    dataset : string
        Biomart dataset
    host : string
        Biomart host
    path : string
        Path handed to ``useMart``.
    filters : list
        List of filter to use
    values : list
        Values of the filters
    archive : bool
        If True, use archived version

    Returns
    -------
    iterator
       Iterator over rows in biomart database. Each row is a
       dictionary mapping column names to values.

    '''

    R.library("biomaRt")

    mart_options = dict(biomart=biomart,
                        dataset=dataset,
                        path=path,
                        archive=archive)

    # The default value for host in the biomaRt package is
    # www.biomart.org but for some reason R errors if you specify
    # host manually but then use the default - but it is fine if
    # host is anything valid apart from www.biomart.org.  So host is
    # only specified if it is different to the default KB
    if host != 'www.biomart.org':
        mart_options["host"] = host
    mart = R.useMart(**mart_options)

    # without filters/values, empty strings are sent to getBM
    filter_names = "" if filters is None else \
        rpy2.robjects.vectors.StrVector(filters)
    filter_values = "" if values is None else values

    # result is a dataframe
    attributes = rpy2.robjects.vectors.StrVector(columns)
    result = R.getBM(attributes=attributes,
                     filters=filter_names,
                     values=filter_values,
                     mart=mart)

    # access via result.rx was broken in rpy2 2.4.2, thus try
    # numeric access
    assert tuple(result.colnames) == tuple(columns),\
        "colnames in dataframe: %s different from expected: %s" % \
        (str(tuple(result.colnames)), tuple(columns))

    column_vectors = [result[index] for index in range(len(columns))]
    for row in zip(*column_vectors):
        yield dict(zip(columns, row))
def get_fungi_mart(ds):
    """Return a mart for dataset `ds` of the 'fungi_mart' biomart,
    using https://fungi.ensembl.org/ as host."""
    fungi_host = "https://fungi.ensembl.org/"
    return R.useMart(biomart='fungi_mart', host=fungi_host, dataset=ds)
def biomart_iterator(columns,
                     biomart="ensembl",
                     dataset="hsapiens_gene_ensembl",
                     host='www.biomart.org',
                     path="/biomart/martservice",
                     filters=None,
                     values=None,
                     archive=False):
    '''retrieve fields from a biomart dataset, one row at a time.

    Arguments
    ---------
    columns : list
        List of fields to obtain.
    biomart : string
        Biomart name
    dataset : string
        Biomart dataset
    host : string
        Biomart host
    path : string
        Path handed to ``useMart``.
    filters : list
        List of filter to use
    values : list
        Values of the filters
    archive : bool
        If True, use archived version

    Returns
    -------
    iterator
       Iterator over rows in biomart database. Each row is a
       dictionary mapping column names to values.

    '''

    R.library("biomaRt")

    # The default value for host in the biomaRt package is
    # www.biomart.org but for some reason R errors if you specify
    # host manually but then use the default - but it is fine if
    # host is anything valid apart from www.biomart.org.  So a value
    # is only specified if it is different to the default KB
    use_default_host = (host == 'www.biomart.org')
    if not use_default_host:
        mart = R.useMart(biomart=biomart,
                         dataset=dataset,
                         host=host,
                         path=path,
                         archive=archive)
    else:
        mart = R.useMart(biomart=biomart,
                         dataset=dataset,
                         path=path,
                         archive=archive)

    # empty strings stand in for "no filter"
    filter_names = ""
    if filters is not None:
        filter_names = rpy2.robjects.vectors.StrVector(filters)

    filter_values = ""
    if values is not None:
        filter_values = values

    # result is a dataframe
    result = R.getBM(
        attributes=rpy2.robjects.vectors.StrVector(columns),
        filters=filter_names,
        values=filter_values,
        mart=mart)

    # access via result.rx was broken in rpy2 2.4.2, thus try
    # numeric access
    assert tuple(result.colnames) == tuple(columns),\
        "colnames in dataframe: %s different from expected: %s" % \
        (str(tuple(result.colnames)), tuple(columns))

    ncolumns = len(columns)
    for record in zip(*[result[pos] for pos in range(ncolumns)]):
        yield dict(zip(columns, record))
def make_lookup(mart_name, dataset, attributes, filters=None, values=None,
                unique_rows=True):
    """
    Given a mart name, dataset name, and a list of attributes, return
    a pandas.DataFrame indexed by the first attribute in the list provided.

    In R, filters is a character vector, and values is either a single
    character vector (if only one filter provided) or a list of character
    vectors.

    This function allows `filters` to be a dictionary where keys are filters
    and values are...values.

    The `filters` / `values` combination is validated before any connection
    to the mart is made, so invalid arguments raise immediately.

    >>> mart_name = 'ensembl'
    >>> dataset = 'dmelanogaster_gene_ensembl'
    >>> filters = ['flybase_gene_id', 'chromosome_name']
    >>> attributes = ['flybase_gene_id', 'flybasename_gene', 'chromosome_name']
    >>> values = [['FBgn0031208', 'FBgn0002121', 'FBgn0031209', 'FBgn0051973'], ['2L']]
    >>> df = make_lookup(
    ...     mart_name=mart_name,
    ...     dataset=dataset,
    ...     attributes=attributes,
    ...     filters=filters,
    ...     values=values)

    Alternatively, make a dictionary of filters: values, in which case you
    don't need to provide `values` separately:

    >>> filters = {
    ...     'flybase_gene_id': ['FBgn0031208', 'FBgn0002121', 'FBgn0031209', 'FBgn0051973'],
    ...     'chromosome_name': ['2L']}
    >>> df2 = make_lookup(
    ...     mart_name=mart_name,
    ...     dataset=dataset,
    ...     attributes=attributes,
    ...     filters=filters)

    Confirm that both methods yield identical results:

    >>> assert np.all(df.values == df2.values)

    Check results:

    >>> df.head()
                    flybasename_gene chromosome_name
    flybase_gene_id
    FBgn0002121               l(2)gl              2L
    FBgn0031208              CG11023              2L
    FBgn0031209                Ir21a              2L
    FBgn0051973                 Cda5              2L
    <BLANKLINE>
    [4 rows x 2 columns]

    Indexing by gene ID (or whatever was the first attribute provided):

    >>> df.ix['FBgn0031209']
    flybasename_gene    Ir21a
    chromosome_name        2L
    Name: FBgn0031209, dtype: object

    Extracting data:

    >>> df.ix['FBgn0031209']['flybasename_gene']
    'Ir21a'

    Or get all names:

    >>> df['flybasename_gene']
    flybase_gene_id
    FBgn0002121     l(2)gl
    FBgn0031208    CG11023
    FBgn0031209      Ir21a
    FBgn0051973       Cda5
    Name: flybasename_gene, dtype: object

    Raises
    ------
    ValueError
        If `values` is given together with a `filters` dictionary, if
        `values` is given without `filters`, if the lengths of `filters`
        and `values` differ, or if `filters` is given as a sequence
        without `values`.
    """

    def _filter_and_values_to_RList(d):
        """`d` is a dictionary of filters: values.

        Returns a StrVector and a ListVector of StrVectors"""
        # Could use ListVector directly with the dict, but want to guarantee
        # positional order of filters and values
        f = robjects.StrVector(list(d.keys()))
        v = robjects.ListVector(
            rpy2.rlike.container.TaggedList(
                list(d.values()),
                tags=list(d.keys())
            )
        )
        return f, v

    # --- validate and normalise the arguments first (no R calls yet) ---
    if isinstance(filters, dict):
        if values is not None:
            raise ValueError("`values` are already specified in the "
                             "`filters` dictionary")
        filter_value_dict = filters
    elif filters is None:
        if values is not None:
            raise ValueError("`filters` must be specified if `values` "
                             "is specified; alternatively use a dictionary "
                             " for `filters`")
        # no filtering requested
        filter_value_dict = None
    elif filters and values:
        # values needs to be a list of lists; convert it to one if it's not
        # already
        if not isinstance(values[0], (list, tuple)):
            values = [values]

        # If we got here, then assume filters is a list or tuple
        if len(filters) != len(values):
            raise ValueError('Length of `filters` and `values` must match')
        filter_value_dict = dict(zip(filters, values))
    else:
        raise ValueError('unhandled case')

    # --- only now talk to the mart ---
    mart = r.useDataset(dataset, mart=r.useMart(mart_name))
    kwargs = dict(
        attributes=robjects.StrVector(attributes),
        uniqueRows=unique_rows,
        mart=mart
    )

    if filter_value_dict is not None:
        _filters, _values = _filter_and_values_to_RList(filter_value_dict)
        kwargs['filters'] = _filters
        kwargs['values'] = _values

    results = r.getBM(**kwargs)
    return rpy2_to_pandas(results, index_col=0)