def get_datasets(mart):
    """List the datasets available in a BioMart mart.

    :param mart: name of the mart to list datasets for
    :return: DataFrame with columns "Name" and "Description"
    """
    bm = BioMart(verbose=False)
    # Raw TSV payload; column 1 is the dataset name, column 2 its description.
    datasets = bm.datasets(mart, raw=True)
    # pd.read_table is deprecated; read_csv(sep="\t") is the supported form.
    return pd.read_csv(
        StringIO(datasets),
        sep="\t",
        header=None,
        usecols=[1, 2],
        names=["Name", "Description"],
    )
def get_attributes(dataset):
    """Fetch the attributes of *dataset* as a two-column DataFrame."""
    mart = BioMart(verbose=False)
    rows = []
    for name, details in mart.attributes(dataset).items():
        # details[0] holds the human-readable description of the attribute.
        rows.append({"Attribute": name, "Description": details[0]})
    return pd.DataFrame.from_dict(rows)
def query_biomart(self, dataset, attributes, host="www.ensembl.org", cache=True, save_filename=None):
    """Query BioMart for *attributes* of *dataset* and return a DataFrame.

    When *cache* is true the result is handed to ``self.cache_dataset``.
    """
    server = BioMart(host=host)
    server.new_query()
    server.add_dataset_to_xml(dataset)
    # One XML <Attribute> element per requested attribute.
    for attribute in attributes:
        server.add_attribute_to_xml(attribute)
    query = server.get_xml()
    print("Querying {} from {} with attributes {}...".format(
        dataset, host, attributes))
    raw = server.query(query)
    frame = pd.read_csv(
        StringIO(raw), header=None, names=attributes, sep="\t", low_memory=True
    )
    if cache:
        self.cache_dataset(dataset, frame, save_filename)
    return frame
def test_biomart_constructor():
    """Smoke-test BioMart construction and host assignment."""
    s = BioMart()
    # Best-effort calls: the public registry may be unreachable in CI, so
    # failures are tolerated — but only Exception subclasses are swallowed;
    # a bare ``except:`` would also trap KeyboardInterrupt/SystemExit.
    try:
        s.registry()
    except Exception:
        pass
    try:
        s.host = "dummy"
    except Exception:
        pass
    s.host = "www.ensembl.org"
def run(self, inputs, outputs):
    """Run the analysis.

    Joins the input microarray expressions, maps probe IDs to Ensembl gene
    IDs (from a custom mapping file or an ENSEMBL BioMart lookup), and
    writes the mapped expression tables to *outputs*.
    """
    # All samples must share species / normalization / platform / GEO id;
    # each one is compared against the first sample.
    # NOTE(review): self.error presumably aborts execution (process-framework
    # convention) — confirm, since code below relies on not continuing.
    for exp in inputs.expressions:
        if exp.output.species != inputs.expressions[0].output.species:
            self.error(
                "Input samples are of different Species: "
                f"{exp.output.species} and {inputs.expressions[0].output.species}."
            )
        if exp.output.exp_type != inputs.expressions[0].output.exp_type:
            self.error(
                "Input samples have different Normalization types: "
                f"{exp.output.exp_type} and {inputs.expressions[0].output.exp_type}."
            )
        if exp.output.platform != inputs.expressions[0].output.platform:
            self.error(
                "Input samples have different Microarray platform types: "
                f"{exp.output.platform} and {inputs.expressions[0].output.platform}."
            )
        if exp.output.platform_id != inputs.expressions[0].output.platform_id:
            self.error(
                "Input samples have different GEO platform IDs: "
                f"{exp.output.platform_id} and {inputs.expressions[0].output.platform_id}."
            )
    # Metadata is uniform after validation, so take it from the first sample.
    species = inputs.expressions[0].output.species
    platform = inputs.expressions[0].output.platform
    platform_id = inputs.expressions[0].output.platform_id
    joined_expressions = join_expressions(inputs.expressions)
    # Index of the joined table holds the probe IDs.
    probe_ids = joined_expressions.index.unique()
    if inputs.mapping_file:
        # --- Custom probe -> Ensembl mapping supplied by the user. ---
        mapping_file = inputs.mapping_file.import_file(
            imported_format="compressed")
        # Path.stem drops only the last suffix (e.g. foo.tsv.gz -> foo.tsv),
        # so the inner extension is what gets checked here.
        stem = Path(mapping_file).stem
        supported_extensions = (".tab", ".tsv", ".txt")
        if not stem.endswith(supported_extensions):
            self.error(
                "Mapping file has unsupported file name extension. "
                f"The supported extensions are {supported_extensions}.")
        mapping = pd.read_csv(
            mapping_file,
            sep="\t",
            header=0,
            names=["ensembl_id", "probe"],
            dtype=str,
        )
        mapping = mapping.drop_duplicates()
        # Source and build cannot be inferred from a custom file; both are
        # required inputs in this branch.
        if inputs.source:
            source = inputs.source
        else:
            self.error(
                "Custom probe id mapping file was provided but no source was selected."
            )
        if inputs.build:
            build = inputs.build
        else:
            self.error(
                "Custom probe id mapping file was provided but genome build was not defined."
            )
        probe_mapping = "Custom"
    else:
        # --- Derive the mapping from ENSEMBL BioMart via the GEO platform. ---
        if not platform_id:
            self.error(
                "Custom mapping file should be provided when samples do not have a GEO platform defined"
            )
        if platform_id not in PLATFORM_MAP:
            self.error(f"GEO platform {platform_id} is not supported.")
        # e.g. "Homo sapiens" -> "hsapiens_gene_ensembl".
        species_low = species.lower()
        dataset = f"{species_low[0]}{species_low.split(' ')[1]}_gene_ensembl"
        probe_mapping = PLATFORM_MAP[platform_id]
        try:
            b = BioMart()
        except IOError:
            raise Exception(
                "None of the ENSEMBL Biomart hosts is reachable.")
        except Exception as e:
            raise Exception(f"Unexpected biomart error: {e}")
        b.add_dataset_to_xml(dataset)
        b.add_attribute_to_xml("ensembl_gene_id")
        b.add_attribute_to_xml(probe_mapping)  # type of microarray
        # Restrict the query to the probes actually present in the samples.
        b.add_filter_to_xml(probe_mapping, ",".join(probe_ids))
        xml_query = b.get_xml()
        res = b.query(xml_query)
        mapping = pd.read_csv(
            StringIO(res),
            sep="\t",
            header=None,
            names=["ensembl_id", "probe"],
            dtype=str,
        )
        mapping = mapping.drop_duplicates()
        mapping_file = f"{platform}_mapping.tsv"
        mapping.to_csv(mapping_file, sep="\t", index=False)
        # NOTE(review): assumes bioservices get_datasets() returns a frame
        # with "name"/"description" columns — confirm against the installed
        # bioservices version.
        dataset_names = b.get_datasets("ENSEMBL_MART_ENSEMBL")
        display_name = dataset_names.loc[
            dataset_names["name"] == dataset]["description"].to_string()
        # Typical display name would be Human genes (GRCh38.p13);
        # the genome build is the parenthesized part.
        build = re.search("\((.+?)\)", display_name).group(1)
        source = "ENSEMBL"
    # keep=False drops ALL rows for probes mapping to multiple genes.
    mapping = mapping.drop_duplicates(subset=["probe"], keep=False)
    data = joined_expressions.loc[mapping["probe"]]
    data["ensembl_id"] = mapping["ensembl_id"].tolist()
    data = data.reset_index()
    # For Ensembl IDs with multiple probe IDs retain the one with highest expression.
    data["mean"] = data.loc[:, data.columns.difference(
        ["probe", "ensembl_id"])].mean(axis=1)
    idx_max = data.groupby(["ensembl_id"])["mean"].idxmax()
    data = data.loc[idx_max].set_index("ensembl_id")
    data = data.drop(columns=["probe", "mean"])
    data.index.name = "Gene"
    mapped_file = "mapped_expressions.tsv.gz"
    data.to_csv(mapped_file, sep="\t", index=True, compression="gzip")
    # One single-column expression file + downstream process per sample.
    # NOTE(review): assumes data.columns order matches inputs.expressions
    # order (as produced by join_expressions) — confirm.
    for column, exp in zip(data.columns, inputs.expressions):
        mapped_column = f"{column}_mapped_exp.tsv.gz"
        data.to_csv(
            mapped_column,
            sep="\t",
            index=True,
            columns=[column],
            header=["Expression"],
            index_label="Gene",
            compression="gzip",
        )
        self.run_process(
            "mapped-microarray-expression",
            {
                "exp_unmapped": exp.id,
                "exp": mapped_column,
                "source": source,
                "build": build,
                "probe_mapping": probe_mapping,
            },
        )
    outputs.mapped_exp = mapped_file
    outputs.mapping = mapping_file
    outputs.probe_mapping = probe_mapping
    outputs.platform = platform
    if platform_id:
        outputs.platform_id = platform_id
def _test_reactome_example():
    # this is not working anymore...
    server = BioMart()
    server.lookfor("reactome")
    # REACTOME exposes: interaction, complex, reaction, pathway
    server.datasets("REACTOME")
    server.new_query()
    server.add_dataset_to_xml("pathway")
    server.add_filter_to_xml("species_selection", "H**o sapiens")
    for attribute in ("pathway_db_id", "_displayname"):
        server.add_attribute_to_xml(attribute)
    query_xml = server.get_xml()
    response = server.query(query_xml)
def test_general(self):
    # Exercise construction against an explicit (non-default) host.
    server = BioMart(host="www.ensembl.org")
def id_map_ensembl(to_annotation, version, species, psm_protein_id):
    '''
    :param to_annotation: target identifier annotation (i.e. uniprot_swissprot)
    :param version: Database version
    :param species: Full species name
    :param psm_protein_id: list of IDs to be converted
    :return: BioMart results, one record per line
    '''
    # Plants live on a separate Ensembl mart and need adjusted handling.
    if species == "arabidopsis_thaliana":
        return id_map_ensembl_plants(to_annotation, version, species,
                                     psm_protein_id)
    # create connection
    query_string = _id_in_xml_query_(psm_protein_id)
    version = _get_ensembl_archive_(version, species)
    dataset = _get_ensembl_dataset_(species)
    biomart = BioMart(host=version)
    # add filters
    biomart.add_dataset_to_xml(dataset)
    biomart.add_filter_to_xml(to_annotation, query_string)
    # add attributes
    biomart.add_attribute_to_xml("ensembl_transcript_id")
    biomart.add_attribute_to_xml("transcript_length")
    biomart.add_attribute_to_xml("uniprot_sptrembl")
    # execute query (a previously unused attributes() round-trip was removed)
    xml_query = biomart.get_xml()
    result = biomart.query(xml_query)
    return result.split("\n")
def id_map_ensembl_plants(to_annotation, version, species, psm_protein_id):
    """Map IDs via the Ensembl Plants mart, recomputing transcript length."""
    # connection to the archive matching the requested version
    id_filter = _id_in_xml_query_(psm_protein_id)
    host = _get_ensembl_archive_(version, species)
    mart_dataset = _get_ensembl_dataset_(species)
    connection = BioMart(host=host)
    # filter on the accession form of the requested annotation
    connection.add_dataset_to_xml(mart_dataset)
    connection.add_filter_to_xml(to_annotation + "_accession", id_filter)
    # attributes (order defines the output columns)
    for attribute in ("ensembl_transcript_id", "transcript_start",
                      "uniprot_swissprot_accession", "transcript_end"):
        connection.add_attribute_to_xml(attribute)
    # run the query against the plants schema instead of the default one
    query = connection.get_xml().replace(
        'virtualSchemaName = "default"',
        'virtualSchemaName = "plants_mart_30"')
    converted = []
    for line in connection.query(query).split("\n"):
        fields = line.split("\t")
        if len(fields) == 4:
            # transcript length = end - start + 1
            span = int(fields[3]) - int(fields[1]) + 1
            converted.append(fields[0] + "\t" + str(span) + "\t" + fields[2])
    return converted
def id_map_ensembl_plants(to_annotation, version, species, psm_protein_id):
    '''
    :param to_annotation: to which annotation
    :param version: ensembl version
    :param species: species
    :param psm_protein_id: list of protein IDs
    :return: list of protein ID's converted to ENSEMBL
    '''
    # connection
    query_string = _id_in_xml_query_(psm_protein_id)
    archive_host = _get_ensembl_archive_(version, species)
    ensembl_dataset = _get_ensembl_dataset_(species)
    mart = BioMart(host=archive_host)
    # filters and attributes
    mart.add_dataset_to_xml(ensembl_dataset)
    mart.add_filter_to_xml(to_annotation + "_accession", query_string)
    mart.add_attribute_to_xml("ensembl_transcript_id")
    mart.add_attribute_to_xml("transcript_start")
    mart.add_attribute_to_xml("uniprot_swissprot_accession")
    mart.add_attribute_to_xml("transcript_end")
    # point the query at the plants mart instead of the default schema
    plant_query = mart.get_xml().replace(
        'virtualSchemaName = "default"',
        'virtualSchemaName = "plants_mart_30"')
    # parse results, replacing start/end columns by the transcript length
    mapped = []
    for record in mart.query(plant_query).split("\n"):
        columns = record.split("\t")
        if len(columns) != 4:
            continue
        transcript_length = int(columns[3]) - int(columns[1]) + 1
        mapped.append("\t".join([columns[0], str(transcript_length), columns[2]]))
    return mapped
def retrieve_data_from_biomart(version, species, transcript_id, three_frame_translation):
    '''
    :param version: Database version
    :param species: Full species name
    :param transcript_id: list of transcript IDs
    :param three_frame_translation: "Y" to fetch cDNA (for 3-frame
        translation); any other value fetches the coding sequence
    :return: BioMart results, one record per line
    '''
    # create connection
    tr_query = _id_in_xml_query_(transcript_id)
    version = _get_ensembl_archive_(version, species)
    dataset = _get_ensembl_dataset_(species)
    biomart = BioMart(host=version)
    # add filters
    biomart.add_dataset_to_xml(dataset)
    biomart.add_filter_to_xml("ensembl_transcript_id", tr_query)
    # add attributes
    biomart.add_attribute_to_xml('ensembl_transcript_id')
    biomart.add_attribute_to_xml("chromosome_name")
    biomart.add_attribute_to_xml("strand")
    if three_frame_translation == "Y":
        biomart.add_attribute_to_xml("cdna")
    else:
        biomart.add_attribute_to_xml("coding")
    # execute query (a previously unused attributes() round-trip was removed)
    xml_query = biomart.get_xml()
    # create bypass for plants database
    if species == "arabidopsis_thaliana":
        xml_query = xml_query.replace(
            'virtualSchemaName = "default"',
            'virtualSchemaName = "plants_mart_30"')
    result = biomart.query(xml_query)
    return result.split("\n")
def setup_class(klass):
    # Share one quiet BioMart client across all tests in this class.
    klass.s = BioMart(verbose=False)
def id_map_ensembl(to_annotation, version, species, psm_protein_id):
    '''
    :param to_annotation: target identifier annotation (i.e. uniprot_swissprot)
    :param version: Database version
    :param species: Full species name
    :param psm_protein_id: list of IDs to be converted
    :return: BioMart results as a list of [id, length, annotation] rows
    '''
    # If species is in plantsDB, execute plants adjusted function
    if species == "arabidopsis_thaliana":
        return id_map_ensembl_plants(to_annotation, version, species,
                                     psm_protein_id)
    # adjust UniProt xml annotation for BioMart version >87
    if int(version) > 87 and "uniprot" in to_annotation:
        to_annotation = to_annotation.replace('_', '')
    # create connection
    query_string = _id_in_xml_query_(psm_protein_id)
    version = _get_ensembl_archive_(version, species)
    dataset = _get_ensembl_dataset_(species)
    biomart = BioMart(host=version)
    # add filters
    biomart.add_dataset_to_xml(dataset)
    biomart.add_filter_to_xml(to_annotation, query_string)
    # add attributes
    biomart.add_attribute_to_xml("ensembl_transcript_id")
    biomart.add_attribute_to_xml("transcript_start")
    biomart.add_attribute_to_xml("transcript_end")
    biomart.add_attribute_to_xml(to_annotation)
    # execute query (a previously unused attributes() round-trip was removed)
    xml_query = biomart.get_xml()
    tmp_result = biomart.query(xml_query)
    if len(tmp_result) == 1:
        # The Python 2 `print` statement was a SyntaxError on Python 3;
        # the parenthesized call form behaves identically on both.
        print("ERROR: could not convert ID's trough BioMart, "
              "Please check whether Ensembl version/species were correctly supplied")
    tmp_result = tmp_result.split("\n")
    result = []
    if tmp_result != []:
        for i in tmp_result:
            i = i.split("\t")
            if i[0] != "":
                # replace start/end columns by the length (end - start)
                result.append([i[0], (int(i[2]) - int(i[1])), i[3]])
            else:
                result.append(i)
    return result
def setup_class(klass):
    # A concrete host is pinned on purpose: letting BioMart autodetect a
    # host is very slow and the fallback mirrors are often down.
    klass.s = BioMart(host='www.ensembl.org', verbose=False)
    klass.mart_test = 'ENSEMBL_MART_ENSEMBL'
def get_bm(intype, outtype, dataset, mart):
    """Queries biomart for data.

    Gets the whole map between INTYPE <-> OUTTYPE and caches it so that
    disk based lookups are used afterwards.

    :param intype: source identifier attribute
    :param outtype: target identifier attribute
    :param dataset: BioMart dataset to query
    :param mart: mart name (used only for the cache filename)
    :return: DataFrame with columns *intype* and *outtype*
    """
    bm = BioMart(verbose=False)
    bm.new_query()
    bm.add_dataset_to_xml(dataset)
    bm.add_attribute_to_xml(intype)
    bm.add_attribute_to_xml(outtype)
    xml_query = bm.get_xml()
    results = bm.query(xml_query)
    # pd.read_table is deprecated; read_csv(sep="\t") is the supported form.
    map_df = pd.read_csv(StringIO(results), sep="\t", header=None,
                         names=[intype, outtype])
    outfile = _get_data_output_filename(intype, outtype, dataset, mart,
                                        default_cache_path=default_cache_path)
    # Persist the full mapping so later lookups can be served from disk.
    map_df.to_csv(outfile, sep="\t", index=False)
    return map_df
def test_general(): s = BioMart() #s.registry() s.datasets("ensembl") s.version("ensembl") s.attributes("oanatinus_gene_ensembl") s.filters("oanatinus_gene_ensembl") s.configuration("oanatinus_gene_ensembl") xmlq = """<?xml version="1.0" encoding="UTF-8"?> <!DOCTYPE Query> <Query virtualSchemaName = "default" formatter = "TSV" header = "0" uniqueRows = "0" count = "" datasetConfigVersion = "0.6" > <Dataset name = "pathway" interface = "default" > <Filter name = "referencepeptidesequence_uniprot_id_list" value = "P43403"/> <Attribute name = "stableidentifier_identifier" /> <Attribute name = "pathway_db_id" /> </Dataset> </Query> """ s.query(s._xml_example) # build own xml using the proper functions s.add_dataset_to_xml("protein") s.get_xml()
def biomart():
    """Fixture: BioMart client pinned to the Ensembl host, with the mart
    name to exercise attached as ``mart_test``."""
    client = BioMart(host='www.ensembl.org', verbose=False)
    client.mart_test = 'ENSEMBL_MART_ENSEMBL'
    return client
def _test_reactome_example():
    # this is not working anymore...
    server = BioMart("reactome.org")
    server.lookfor("reactome")
    # REACTOME exposes: interaction, complex, reaction, pathway
    server.datasets("REACTOME")
    server.new_query()
    server.add_dataset_to_xml("pathway")
    server.add_filter_to_xml("species_selection", "H**o sapiens")
    for attribute in ("pathway_db_id", "_displayname"):
        server.add_attribute_to_xml(attribute)
    query_xml = server.get_xml()
    response = server.query(query_xml)
def retrieve_data_from_biomart(version, species, transcript_id,
                               three_frame_translation):
    '''
    :param version: Database version
    :param species: Full species name
    :param transcript_id: list of transcript IDs
    :return: BioMart results
    '''
    # connection to the archive matching the requested Ensembl version
    id_filter = _id_in_xml_query_(transcript_id)
    archive = _get_ensembl_archive_(version, species)
    mart_dataset = _get_ensembl_dataset_(species)
    server = BioMart(host=archive)
    server.add_dataset_to_xml(mart_dataset)
    server.add_filter_to_xml("ensembl_transcript_id", id_filter)
    # sequence attribute: cDNA for three-frame translation, CDS otherwise
    wanted = ["ensembl_transcript_id", "chromosome_name", "strand"]
    wanted.append("cdna" if three_frame_translation == "Y" else "coding")
    for name in wanted:
        server.add_attribute_to_xml(name)
    attributes = server.attributes(mart_dataset)
    query = server.get_xml()
    # create bypass for plants database
    if species == "arabidopsis_thaliana":
        query = query.replace('virtualSchemaName = "default"',
                              'virtualSchemaName = "plants_mart_30"')
    return server.query(query).split("\n")