def biomart_iterator(columns,
                     biomart="ensembl",
                     dataset="hsapiens_gene_ensembl",
                     host='www.biomart.org'):
    '''Query biomart and iterate over the rows that come back.

    *columns* is a list of the biomart fields to retrieve.

    *biomart* and *dataset* select the database and dataset to query;
    *host* is the biomart server handed to ``useMart``.

    Yields one dictionary per row, mapping every entry of *columns*
    to the value found in that row.
    '''
    R.library("biomaRt")
    connection = R.useMart(biomart=biomart, dataset=dataset, host=host)
    attributes = rpy2.robjects.vectors.StrVector(columns)
    table = R.getBM(attributes=attributes, mart=connection)

    # ``table`` is a dataframe; ``rx(name)`` is again a dataframe, and
    # its first element is the column vector itself.
    column_vectors = [table.rx(name)[0] for name in columns]
    for row in zip(*column_vectors):
        yield dict(zip(columns, row))
def importFromBiomart(outfile,
                      columns,
                      biomart="ensembl",
                      dataset="hsapiens_gene_ensembl",
                      host='www.biomart.org'):
    '''download a dataset from biomart and output as a
    tab-separated table.

    *outfile* is the name of the file to write.

    *columns* is a dictionary mapping biomart columns to columns in
    the output table.

    *biomart* and *dataset* denote the database and dataset to get
    the data from; *host* is the biomart server passed to ``useMart``.
    '''
    R.library("biomaRt")

    # Take the keys once as a real list: the same order is then used for
    # the query, the header and the rows, and R receives a plain
    # sequence instead of a dictionary view.
    keys = list(columns.keys())

    mart = R.useMart(biomart=biomart, dataset=dataset, host=host)
    result = R.getBM(attributes=keys, mart=mart)

    outf = IOTools.openFile(outfile, "w")
    try:
        outf.write("\t".join([columns[x] for x in keys]) + "\n")
        # transpose the per-column vectors into rows
        for data in zip(*[result[x] for x in keys]):
            outf.write("\t".join(map(str, data)) + "\n")
    finally:
        # release the handle even when a write or conversion fails
        outf.close()
def biomart_iterator(columns,
                     biomart="ensembl",
                     dataset="hsapiens_gene_ensembl",
                     host='www.biomart.org'):
    '''Fetch the requested fields from biomart, one row at a time.

    *columns* is a list of the biomart fields to obtain.

    *biomart* and *dataset* denote the database and dataset to get
    the data from; *host* is the server given to ``useMart``.

    This is a generator: each item is a dictionary mapping the names
    in *columns* to the values of one row.
    '''
    R.library("biomaRt")
    mart = R.useMart(biomart=biomart, dataset=dataset, host=host)
    wanted = rpy2.robjects.vectors.StrVector(columns)
    frame = R.getBM(attributes=wanted, mart=mart)

    # frame is a dataframe, frame.rx(field) is a dataframe as well,
    # and frame.rx(field)[0] is the vector holding that column.
    vectors = []
    for field in columns:
        vectors.append(frame.rx(field)[0])

    for values in zip(*vectors):
        yield dict(zip(columns, values))
def get_exons(mart):
    """Queries a Mart object to find all exons of its dataset attribute.

    Forms a specific getBM query that is sent to the BioMart API to
    retrieve information about the exons (and their exonic coordinates)
    of a specific Dataset. The output is then transformed via the
    GRanges Bioconductor package and seqnames converted to UCSC standard.

    Args:
        mart: an rpy2-converted biomaRt Mart object.

    Returns:
        An rpy2 DataFrame containing a table of relevant exon information.
        DataFrame column headers are:
            ["seqnames", "start", "end", "width", "strand"]
    """
    exons = R.getBM(
        attributes=StrVector(("chromosome_name", "exon_chrom_start",
                              "exon_chrom_end", "strand")),
        mart=mart)

    # Comparing the whole strand column to a string is one Python-level
    # comparison that is always False, so every exon used to be labelled
    # '-'. Translate row by row instead.
    # NOTE(review): assumes BioMart encodes the forward strand as the
    # integer 1 and the reverse strand as -1 -- confirm for the mart used.
    strand = StrVector(
        ['+' if value == 1 else '-' for value in exons.rx2('strand')])

    exons_ranges = R.GRanges(
        seqnames=exons.rx2('chromosome_name'),
        ranges=R.IRanges(start=exons.rx2('exon_chrom_start'),
                         end=exons.rx2('exon_chrom_end')),
        strand=strand)

    # Replacement functions such as `seqlevelsStyle<-` have to be looked
    # up by their backquoted name to be callable from rpy2, see
    # https://stackoverflow.com/questions/38806898/
    set_method = R("`seqlevelsStyle<-`")
    exons_ranges = set_method(exons_ranges, "UCSC")

    as_data_frame = R("function(x) as.data.frame(x)")
    exons_ranges_df = as_data_frame(exons_ranges)
    return exons_ranges_df
def get_genes(mart):
    """Fetch the gene table of a Mart's dataset as a pandas DataFrame.

    Sends a getBM query for the gene name and genomic position
    attributes to the BioMart API, converts the rpy2 DataFrame that
    comes back into a pandas DataFrame and renames the gene name column.

    Args:
        mart: an rpy2-converted biomaRt Mart object.

    Returns:
        A pandas DataFrame containing a table of relevant gene
        information. DataFrame column headers are:
            ["gene_name", "chromosome_name", "start_position",
             "end_position"]
    """
    wanted = StrVector((
        "external_gene_name",
        "chromosome_name",
        "start_position",
        "end_position",
    ))
    r_frame = R.getBM(attributes=wanted, mart=mart)

    genes_df = pandas2ri.ri2py(r_frame)
    new_names = {'external_gene_name': 'gene_name'}
    genes_df.rename(columns=new_names, inplace=True)
    return genes_df
def importKEGGAssignments(outfile, mart, host, biomart_dataset):
    '''import the KEGG annotations from the R KEGG.db annotations
    package.

    Note that since KEGG is no longer publicly available, this is not
    up-to-date and may be removed from bioconductor in future releases.

    *outfile* is the name of the tab-separated file to write.

    *mart*, *host* and *biomart_dataset* are passed to ``useMart`` as
    the biomart name, server and dataset used to map entrez gene ids
    to ensembl gene ids.

    Entrez ids that are not integers or have no ensembl mapping are
    skipped.
    '''
    R.library("KEGG.db")
    R.library("biomaRt")

    E.info("getting entrez to ensembl mapping ...")
    mart = R.useMart(biomart=mart,
                     host=host,
                     path="/biomart/martservice",
                     dataset=biomart_dataset)

    entrez2ensembl = R.getBM(
        attributes=ro.StrVector(["ensembl_gene_id", "entrezgene"]),
        mart=mart)

    entrez = entrez2ensembl.rx2("entrezgene")
    ensembl = entrez2ensembl.rx2("ensembl_gene_id")
    entrez2ensembl = dict(zip(entrez, ensembl))
    E.info("Done")

    E.info("getting entrez to kegg mapping ... ")
    entrez2path = R('as.list(KEGGEXTID2PATHID)')
    E.info("Done")

    E.info("Getting KEGG names")
    pathnames = R('as.list(KEGGPATHID2NAME)')
    pathid2name = dict(zip(pathnames.names, R.unlist(pathnames)))
    E.info("Done")

    # pathway ids are lower-case letters followed by the numeric id
    # that keys pathid2name
    pathway_pattern = re.compile(r"[a-z]+([0-9]+)")

    outf = IOTools.openFile(outfile, "w")
    try:
        outf.write("ontology\tgene_id\tkegg_ID\tkegg_name\tevidence\n")

        for gene in entrez2path.names:
            # the mapping dictionary is keyed on integers
            try:
                gene = int(gene)
            except ValueError:
                continue

            try:
                ensid = entrez2ensembl[gene]
            except KeyError:
                continue

            for pathway in entrez2path.rx2(str(gene)):
                pathid = pathway_pattern.match(pathway).groups()[0]
                pathname = pathid2name[pathid]
                outf.write(
                    "\t".join(["kegg", ensid, str(pathway),
                               pathname, "NA"]) + "\n")
    finally:
        # the handle was previously never closed
        outf.close()
def importKEGGAssignments(outfile, mart, host, biomart_dataset):
    '''import the KEGG annotations from the R KEGG.db annotations
    package.

    Note that since KEGG is no longer publicly available, this is not
    up-to-date and may be removed from bioconductor in future releases.

    *outfile* is the name of the tab-separated file to write.

    *mart*, *host* and *biomart_dataset* are passed to ``useMart`` as
    the biomart name, server and dataset used to map entrez gene ids
    to ensembl gene ids.

    Entrez ids that are not integers or have no ensembl mapping are
    skipped.
    '''
    R.library("KEGG.db")
    R.library("biomaRt")

    E.info("getting entrez to ensembl mapping ...")
    mart = R.useMart(biomart=mart,
                     host=host,
                     path="/biomart/martservice",
                     dataset=biomart_dataset)

    entrez2ensembl = R.getBM(
        attributes=ro.StrVector(["ensembl_gene_id", "entrezgene"]),
        mart=mart)

    entrez = entrez2ensembl.rx2("entrezgene")
    ensembl = entrez2ensembl.rx2("ensembl_gene_id")
    entrez2ensembl = dict(zip(entrez, ensembl))
    E.info("Done")

    E.info("getting entrez to kegg mapping ... ")
    entrez2path = R('as.list(KEGGEXTID2PATHID)')
    E.info("Done")

    E.info("Getting KEGG names")
    pathnames = R('as.list(KEGGPATHID2NAME)')
    pathid2name = dict(zip(pathnames.names, R.unlist(pathnames)))
    E.info("Done")

    # pathway ids are lower-case letters followed by the numeric id
    # that keys pathid2name
    pathway_pattern = re.compile(r"[a-z]+([0-9]+)")

    outf = IOTools.openFile(outfile, "w")
    try:
        outf.write("ontology\tgene_id\tkegg_ID\tkegg_name\tevidence\n")

        for gene in entrez2path.names:
            # the mapping dictionary is keyed on integers
            try:
                gene = int(gene)
            except ValueError:
                continue

            try:
                ensid = entrez2ensembl[gene]
            except KeyError:
                continue

            for pathway in entrez2path.rx2(str(gene)):
                pathid = pathway_pattern.match(pathway).groups()[0]
                pathname = pathid2name[pathid]
                outf.write(
                    "\t".join(["kegg", ensid, str(pathway),
                               pathname, "NA"]) + "\n")
    finally:
        # the handle was previously never closed
        outf.close()
def convert_hgnc2ensembl(hgnc_id):
    """Return the first Ensembl gene id that biomart reports for *hgnc_id*.

    Runs a getBM query filtered on ``hgnc_symbol`` against the
    module-level mart.

    NOTE(review): init_biomaRt() presumably sets up that module-level
    mart -- confirm against its definition.

    Returns None, after printing a message, when the id cannot be
    extracted from the query result.
    """
    init_biomaRt()
    v = R.c(hgnc_id)
    res = R.getBM(attributes=R.c("ensembl_gene_id"),
                  filters="hgnc_symbol",
                  values=v,
                  mart=__mart)
    try:
        return R.get("ensembl_gene_id", res)[0]
    except Exception:
        # Function-call form of print: valid on Python 2 and Python 3.
        print('Error convert_hgnc2ensembl: ' + str(hgnc_id) +
              ' not found in database')
        return None
def convert_list_ensembl2hgnc(ensembl_id_list):
    """Return the HGNC symbols that biomart reports for *ensembl_id_list*.

    Runs a getBM query filtered on ``ensembl_gene_id`` against the
    module-level mart and returns the ``hgnc_symbol`` column of the
    result.

    NOTE(review): init_biomaRt() presumably sets up that module-level
    mart -- confirm against its definition.

    Returns None, after printing a message, when the column cannot be
    extracted from the query result.
    """
    init_biomaRt()
    v = R.c(ensembl_id_list)
    res = R.getBM(attributes=R.c("hgnc_symbol"),
                  filters="ensembl_gene_id",
                  values=v,
                  mart=__mart)
    try:
        return R.get("hgnc_symbol", res)
    except Exception:
        # The message used to reference the undefined name `ensembl_id`,
        # so the error path itself raised NameError.
        print('Error convert_ensembl2hgnc: ' + str(ensembl_id_list) +
              ' not found in database')
        return None
def biomart_iterator(columns,
                     biomart="ensembl",
                     dataset="hsapiens_gene_ensembl",
                     host='www.biomart.org',
                     path="/biomart/martservice",
                     filters=None,
                     values=None,
                     archive=False):
    '''download a dataset from biomart and iterate over its rows.

    *columns* is a list with the fields to obtain.

    *biomart* and *dataset* denote the database and dataset to get
    the data from; *host*, *path* and *archive* are passed on to
    ``useMart``.

    *filters* is an optional list of filter names and *values* the
    corresponding filter values; an empty string is sent to ``getBM``
    for whichever of the two is not given.

    returns an iterator over rows. Each row is a dictionary mapping
    column names to values.
    '''
    R.library("biomaRt")

    mart = R.useMart(biomart=biomart,
                     dataset=dataset,
                     host=host,
                     path=path,
                     archive=archive)

    if filters is not None:
        # StrVector is an rpy2 class, not an R function, so it cannot be
        # looked up through the R interface object.
        filter_names = rpy2.robjects.vectors.StrVector(filters)
    else:
        filter_names = ""

    if values is not None:
        filter_values = values
    else:
        filter_values = ""

    # result is a dataframe
    result = R.getBM(
        attributes=rpy2.robjects.vectors.StrVector(columns),
        filters=filter_names,
        values=filter_values,
        mart=mart)

    # access via result.rx was broken in rpy2 2.4.2, thus try
    # numeric access
    assert tuple(result.colnames) == tuple(columns),\
        "colnames in dataframe: %s different from expected: %s" % \
        (str(tuple(result.colnames)), tuple(columns))

    for data in zip(*[result[x] for x in range(len(columns))]):
        yield dict(zip(columns, data))
def importFromBiomart(outfile,
                      columns,
                      biomart="ensembl",
                      dataset="hsapiens_gene_ensembl",
                      host='www.biomart.org'):
    '''download a dataset from biomart and output as a
    tab-separated table.

    Arguments
    ---------
    outfile : string
        Filename of output file
    columns : dict
        Dictionary mapping biomart columns to columns in
        the output table.
    biomart : string
        Biomart name
    dataset : string
        Biomart dataset
    host : string
        Biomart host
    '''
    R.library("biomaRt")

    # Take the keys once as a real list: the same order is then used for
    # the query, the header and the rows, and R receives a plain
    # sequence instead of a dictionary view.
    keys = list(columns.keys())

    # The default value for host in the biomaRt package is
    # www.biomart.org but for some reason R errors if you specify
    # host manually but then use the default - but it is fine if
    # host is anything valid apart from www.biomart.org.  So I have
    # changed this to only specify a value if the value you
    # are specifying is different to the default KB
    if host == 'www.biomart.org':
        mart = R.useMart(biomart=biomart, dataset=dataset)
    else:
        mart = R.useMart(biomart=biomart, dataset=dataset, host=host)

    result = R.getBM(attributes=keys, mart=mart)

    outf = IOTools.openFile(outfile, "w")
    try:
        outf.write("\t".join([columns[x] for x in keys]) + "\n")
        # transpose the per-column vectors into rows
        for data in zip(*[result[x] for x in keys]):
            outf.write("\t".join(map(str, data)) + "\n")
    finally:
        # release the handle even when a write or conversion fails
        outf.close()
def importFromBiomart(outfile,
                      columns,
                      biomart="ensembl",
                      dataset="hsapiens_gene_ensembl",
                      host='www.biomart.org'):
    '''download a dataset from biomart and output as a
    tab-separated table.

    Arguments
    ---------
    outfile : string
        Filename of output file
    columns : dict
        Dictionary mapping biomart columns to columns in
        the output table.
    biomart : string
        Biomart name
    dataset : string
        Biomart dataset
    host : string
        Biomart host
    '''
    R.library("biomaRt")

    keys = list(columns.keys())

    # The default value for host in the biomaRt package is
    # www.biomart.org but for some reason R errors if you specify
    # host manually but then use the default - but it is fine if
    # host is anything valid apart from www.biomart.org.  So I have
    # changed this to only specify a value if the value you
    # are specifying is different to the default KB
    if host == 'www.biomart.org':
        mart = R.useMart(biomart=biomart, dataset=dataset)
    else:
        mart = R.useMart(biomart=biomart, dataset=dataset, host=host)

    result = R.getBM(attributes=keys, mart=mart)

    outf = IOTools.openFile(outfile, "w")
    try:
        outf.write("\t".join([columns[x] for x in keys]) + "\n")
        # transpose the per-column vectors into rows
        for data in zip(*[result[x] for x in keys]):
            outf.write("\t".join(map(str, data)) + "\n")
    finally:
        # release the handle even when a write or conversion fails
        outf.close()
def biomart_iterator(columns,
                     biomart="ensembl",
                     dataset="hsapiens_gene_ensembl",
                     host='www.biomart.org',
                     path="/biomart/martservice",
                     filters=None,
                     values=None,
                     archive=False):
    '''Query biomart and iterate over the rows of the result.

    Arguments
    ---------
    columns : list
        List of fields to obtain.
    biomart : string
        Biomart name
    dataset : string
        Biomart dataset
    host : string
        Biomart host
    path : string
        Path handed to ``useMart``
    filters : list
        List of filters to use
    values : list
        Values of the filters
    archive : bool
        If True, use archived version

    Returns
    -------
    iterator
        Iterator over rows in biomart database. Each row is a
        dictionary mapping column names to values.
    '''
    R.library("biomaRt")

    # biomaRt's own default host is www.biomart.org, and for some
    # reason R errors when that default is passed in explicitly, while
    # any other valid host works. The host is therefore only forwarded
    # when it differs from the default.
    mart_args = dict(biomart=biomart,
                     dataset=dataset,
                     path=path,
                     archive=archive)
    if host != 'www.biomart.org':
        mart_args["host"] = host
    mart = R.useMart(**mart_args)

    # getBM receives an empty string for whatever was not supplied
    filter_names = ""
    if filters is not None:
        filter_names = rpy2.robjects.vectors.StrVector(filters)
    filter_values = "" if values is None else values

    # the query returns a dataframe
    attributes = rpy2.robjects.vectors.StrVector(columns)
    result = R.getBM(attributes=attributes,
                     filters=filter_names,
                     values=filter_values,
                     mart=mart)

    # result.rx was broken in rpy2 2.4.2, so columns are read by
    # position; first make sure the positions match the request.
    observed = tuple(result.colnames)
    expected = tuple(columns)
    assert observed == expected, \
        "colnames in dataframe: %s different from expected: %s" % \
        (str(observed), expected)

    column_vectors = [result[position] for position in range(len(columns))]
    for row in zip(*column_vectors):
        yield dict(zip(columns, row))
def return_bm_df(atts, mart):
    """Run a getBM query for the attributes *atts* on *mart*.

    The R result is passed through ``r2pd_dataframe`` and whatever that
    helper produces is returned.
    """
    attribute_names = list(atts)
    query_result = R.getBM(attributes=StrVector(attribute_names), mart=mart)
    return r2pd_dataframe(query_result)
def biomart_iterator(columns,
                     biomart="ensembl",
                     dataset="hsapiens_gene_ensembl",
                     host='www.biomart.org',
                     path="/biomart/martservice",
                     filters=None,
                     values=None,
                     archive=False):
    '''Retrieve fields from biomart and yield them row by row.

    Arguments
    ---------
    columns : list
        List of fields to obtain.
    biomart : string
        Biomart name
    dataset : string
        Biomart dataset
    host : string
        Biomart host
    path : string
        Path handed to ``useMart``
    filters : list
        List of filters to use
    values : list
        Values of the filters
    archive : bool
        If True, use archived version

    Returns
    -------
    iterator
        Iterator over rows in biomart database. Each row is a
        dictionary mapping column names to values.
    '''
    R.library("biomaRt")

    # R errors when biomaRt's default host (www.biomart.org) is given
    # explicitly, although any other valid host is accepted. Leave the
    # argument out in that case so that R falls back on its default.
    using_default_host = host == 'www.biomart.org'
    if not using_default_host:
        mart = R.useMart(biomart=biomart,
                         dataset=dataset,
                         host=host,
                         path=path,
                         archive=archive)
    else:
        mart = R.useMart(biomart=biomart,
                         dataset=dataset,
                         path=path,
                         archive=archive)

    # missing filters / values are sent to getBM as empty strings
    if filters is None:
        filter_names = ""
    else:
        filter_names = rpy2.robjects.vectors.StrVector(filters)
    filter_values = values if values is not None else ""

    # frame is a dataframe
    requested = rpy2.robjects.vectors.StrVector(columns)
    frame = R.getBM(attributes=requested,
                    filters=filter_names,
                    values=filter_values,
                    mart=mart)

    # access via frame.rx was broken in rpy2 2.4.2, hence positional
    # access, guarded by a check that the column order is as requested
    assert tuple(frame.colnames) == tuple(columns), \
        "colnames in dataframe: %s different from expected: %s" % \
        (str(tuple(frame.colnames)), tuple(columns))

    ncolumns = len(columns)
    per_column = [frame[index] for index in range(ncolumns)]
    for record in zip(*per_column):
        yield dict(zip(columns, record))
def make_lookup(mart_name, dataset, attributes, filters=None, values=None,
                unique_rows=True):
    """
    Query a biomart dataset and return the result as a pandas.DataFrame
    indexed by the first attribute in `attributes`.

    On the R side, filters is a character vector and values is either a
    single character vector (one filter) or a list of character vectors.
    Here `filters` may be a list/tuple accompanied by `values`, or a
    dictionary whose keys are the filters and whose values are the
    filter values.

    >>> mart_name = 'ensembl'
    >>> dataset = 'dmelanogaster_gene_ensembl'
    >>> filters = ['flybase_gene_id', 'chromosome_name']
    >>> attributes = ['flybase_gene_id', 'flybasename_gene', 'chromosome_name']
    >>> values = [['FBgn0031208', 'FBgn0002121', 'FBgn0031209', 'FBgn0051973'], ['2L']]
    >>> df = make_lookup(
    ...     mart_name=mart_name,
    ...     dataset=dataset,
    ...     attributes=attributes,
    ...     filters=filters,
    ...     values=values)

    Alternatively, make a dictionary of filters: values, in which case
    you don't need to provide `values` separately:

    >>> filters = {
    ...     'flybase_gene_id': ['FBgn0031208', 'FBgn0002121', 'FBgn0031209', 'FBgn0051973'],
    ...     'chromosome_name': ['2L']}
    >>> df2 = make_lookup(
    ...     mart_name=mart_name,
    ...     dataset=dataset,
    ...     attributes=attributes,
    ...     filters=filters)

    Confirm that both methods yield identical results:

    >>> assert np.all(df.values == df2.values)

    Check results:

    >>> df.head()
                    flybasename_gene chromosome_name
    flybase_gene_id
    FBgn0002121               l(2)gl              2L
    FBgn0031208              CG11023              2L
    FBgn0031209                Ir21a              2L
    FBgn0051973                 Cda5              2L
    <BLANKLINE>
    [4 rows x 2 columns]

    Indexing by gene ID (or whatever was the first attribute provided):

    >>> df.ix['FBgn0031209']
    flybasename_gene    Ir21a
    chromosome_name        2L
    Name: FBgn0031209, dtype: object

    Extracting data:

    >>> df.ix['FBgn0031209']['flybasename_gene']
    'Ir21a'

    Or get all names:

    >>> df['flybasename_gene']
    flybase_gene_id
    FBgn0002121         l(2)gl
    FBgn0031208        CG11023
    FBgn0031209          Ir21a
    FBgn0051973           Cda5
    Name: flybasename_gene, dtype: object
    """
    mart = r.useDataset(dataset, mart=r.useMart(mart_name))
    attributes = robjects.StrVector(attributes)

    query = {
        'attributes': attributes,
        'uniqueRows': unique_rows,
        'mart': mart,
    }

    def _as_r_filters(mapping):
        """`mapping` is a dictionary of filters: values.

        Returns a StrVector of the filter names and a ListVector of
        their values."""
        # A ListVector could be built from the dict directly; going
        # through a TaggedList keeps names and values in the same
        # positional order.
        names = list(mapping.keys())
        tagged = rpy2.rlike.container.TaggedList(mapping.values(), tags=names)
        return robjects.StrVector(names), robjects.ListVector(tagged)

    # Work out which filter -> values mapping applies, if any.
    filter_map = None
    if isinstance(filters, dict):
        if values is not None:
            raise ValueError("`values` are already specified in the "
                             "`filters` dictionary")
        filter_map = filters
    elif filters is None:
        if values is not None:
            raise ValueError("`filters` must be specified if `values` "
                             "is specified; alternatively use a dictionary "
                             " for `filters`")
    elif filters and values:
        # a flat list of values is taken as the values of a single filter
        if not isinstance(values[0], (list, tuple)):
            values = [values]

        # from here on filters is assumed to be a list or tuple
        if len(filters) != len(values):
            raise ValueError('Length of `filters` and `values` must match')
        filter_map = dict(zip(filters, values))
    else:
        raise ValueError('unhandled case')

    if filter_map is not None:
        r_filters, r_values = _as_r_filters(filter_map)
        query['filters'] = r_filters
        query['values'] = r_values

    results = r.getBM(**query)
    return rpy2_to_pandas(results, index_col=0)