def test_general():
    """Smoke-test the basic BioMart calls.

    The test contains no assertions: it passes as long as none of the
    calls below raise.
    """
    s = BioMart()
    # NOTE(review): a call to s.registry() was commented out here in the
    # original; the reason is not recorded -- confirm before re-enabling.
    s.datasets("ensembl")
    s.version("ensembl")
    s.attributes("oanatinus_gene_ensembl")
    s.filters("oanatinus_gene_ensembl")
    s.configuration("oanatinus_gene_ensembl")

    # Run the example query that the service object carries itself.
    s.query(s._xml_example)

    # build own xml using the proper functions
    s.add_dataset_to_xml("protein")
    s.get_xml()
def get_attributes(dataset):
    '''
    Tabulate the attributes that BioMart reports for a dataset.

    :param dataset: dataset name handed to ``BioMart.attributes``
    :return: pandas DataFrame with one row per attribute and the columns
             "Attribute" (the attribute key) and "Description" (the first
             element of the value stored under that key)
    '''
    mart = BioMart(verbose=False)

    rows = []
    for name, details in mart.attributes(dataset).items():
        rows.append({"Attribute": name, "Description": details[0]})

    return pd.DataFrame.from_dict(rows)
def id_map_ensembl(to_annotation, version, species, psm_protein_id):
    '''
    Map protein IDs to Ensembl transcripts through BioMart.

    :param to_annotation: target identifier annotation (i.e. uniprot_swissprot);
                          used both as the query filter and as a returned attribute
    :param version: Database version (must be convertible with ``int``)
    :param species: Full species name
    :param psm_protein_id: list of IDs to be converted
    :return: for "arabidopsis_thaliana", whatever ``id_map_ensembl_plants``
             returns. Otherwise a list with one entry per line of the BioMart
             response: ``[transcript_id, transcript_end - transcript_start,
             annotation_id]`` for lines whose first column is non-empty, and
             the raw split fields for the remaining lines.
    :raises IndexError, ValueError: if a line with a non-empty first column
             does not carry four tab-separated columns with integer
             start/end values
    '''
    # If species is in plantsDB, execute plants adjusted function
    if species == "arabidopsis_thaliana":
        return id_map_ensembl_plants(to_annotation, version, species, psm_protein_id)

    # adjust UniProt xml annotation for BioMart version >87
    # (the underscores are stripped from the annotation name)
    if int(version) > 87 and "uniprot" in to_annotation:
        to_annotation = to_annotation.replace('_', '')

    # create connection
    query_string = _id_in_xml_query_(psm_protein_id)
    archive_host = _get_ensembl_archive_(version, species)
    dataset = _get_ensembl_dataset_(species)
    biomart = BioMart(host=archive_host)

    # add filters
    biomart.add_dataset_to_xml(dataset)
    biomart.add_filter_to_xml(to_annotation, query_string)

    # add attributes; their order fixes the column order parsed below
    for attribute in ("ensembl_transcript_id", "transcript_start",
                      "transcript_end", to_annotation):
        biomart.add_attribute_to_xml(attribute)

    # The return value is not used here. The call is kept because it may
    # have side effects (e.g. dataset validation) -- NOTE(review): confirm
    # before removing.
    biomart.attributes(dataset)

    # execute query
    xml_query = biomart.get_xml()
    tmp_result = biomart.query(xml_query)
    if len(tmp_result) == 1:
        # Single-argument print(...) behaves identically on Python 2 and 3.
        print("ERROR: could not convert ID's trough BioMart, "
              "Please check whether Ensembl version/species were correctly supplied")

    # str.split always yields at least one element, so no emptiness check
    # is needed before iterating.
    result = []
    for line in tmp_result.split("\n"):
        fields = line.split("\t")
        if fields[0] != "":
            transcript_span = int(fields[2]) - int(fields[1])
            result.append([fields[0], transcript_span, fields[3]])
        else:
            # lines with an empty first column are passed through unparsed
            result.append(fields)
    return result
def retrieve_data_from_biomart(version, species, transcript_id, three_frame_translation):
    '''
    Query BioMart for chromosome, strand and sequence columns of transcripts.

    :param version: Database version
    :param species: Full species name
    :param transcript_id: list of transcript IDs
    :param three_frame_translation: "Y" requests the "cdna" attribute,
                                    any other value requests "coding"
    :return: BioMart results, i.e. the query response split on "\\n"
    '''
    # create connection
    transcript_filter = _id_in_xml_query_(transcript_id)
    archive_host = _get_ensembl_archive_(version, species)
    dataset = _get_ensembl_dataset_(species)
    biomart = BioMart(host=archive_host)

    # add filters
    biomart.add_dataset_to_xml(dataset)
    biomart.add_filter_to_xml("ensembl_transcript_id", transcript_filter)

    # add attributes, in the order the columns are requested
    sequence_attribute = "cdna" if three_frame_translation == "Y" else "coding"
    for attribute in ('ensembl_transcript_id', "chromosome_name", "strand",
                      sequence_attribute):
        biomart.add_attribute_to_xml(attribute)

    # The result of this call is not used in this function.
    biomart.attributes(dataset)

    # execute query
    xml_query = biomart.get_xml()
    if species == "arabidopsis_thaliana":
        # create bypass for plants database: swap the virtual schema name
        xml_query = xml_query.replace('virtualSchemaName = "default"',
                                      'virtualSchemaName = "plants_mart_30"')
    return biomart.query(xml_query).split("\n")
def retrieve_data_from_biomart(version,species,transcript_id,three_frame_translation):
    '''
    Fetch per-transcript rows (ID, chromosome, strand, sequence) from BioMart.

    :param version: Database version
    :param species: Full species name
    :param transcript_id: list of transcript IDs
    :param three_frame_translation: when equal to "Y" the "cdna" attribute
                                    is requested, otherwise "coding"
    :return: BioMart results: the response of the query, split on "\\n"
    '''
    # create connection
    id_filter = _id_in_xml_query_(transcript_id)
    host = _get_ensembl_archive_(version, species)
    dataset = _get_ensembl_dataset_(species)
    biomart = BioMart(host=host)

    # add filters
    biomart.add_dataset_to_xml(dataset)
    biomart.add_filter_to_xml("ensembl_transcript_id", id_filter)

    # add attributes
    requested = ['ensembl_transcript_id', "chromosome_name", "strand"]
    if three_frame_translation == "Y":
        requested.append("cdna")
    else:
        requested.append("coding")
    for name in requested:
        biomart.add_attribute_to_xml(name)

    # Return value is discarded; it is not needed to build the query below.
    biomart.attributes(dataset)

    # execute query
    query = biomart.get_xml()
    plants_species = species == "arabidopsis_thaliana"
    if plants_species:
        # create bypass for plants database
        query = query.replace(
            'virtualSchemaName = "default"',
            'virtualSchemaName = "plants_mart_30"',
        )
    response = biomart.query(query)
    lines = response.split("\n")
    return lines
def id_map_ensembl(to_annotation,version,species,psm_protein_id):
    '''
    Look up Ensembl transcripts for a list of protein IDs through BioMart.

    :param to_annotation: target identifier annotation (i.e. uniprot_swissprot),
                          used as the name of the query filter
    :param version: Database version
    :param species: Full species name
    :param psm_protein_id: list of IDs to be converted
    :return: BioMart results: for "arabidopsis_thaliana" the return value of
             ``id_map_ensembl_plants``, otherwise the query response split
             on "\\n"
    '''
    # If species is in plantsDB, execute plants adjusted function
    if species == "arabidopsis_thaliana":
        return id_map_ensembl_plants(to_annotation, version, species, psm_protein_id)

    # create connection
    id_filter = _id_in_xml_query_(psm_protein_id)
    archive_host = _get_ensembl_archive_(version, species)
    dataset = _get_ensembl_dataset_(species)
    biomart = BioMart(host=archive_host)

    # add filters
    biomart.add_dataset_to_xml(dataset)
    biomart.add_filter_to_xml(to_annotation, id_filter)

    # add attributes, in the order the columns are requested
    for attribute in ("ensembl_transcript_id", "transcript_length",
                      "uniprot_sptrembl"):
        biomart.add_attribute_to_xml(attribute)

    # The result of this call is not used in this function.
    biomart.attributes(dataset)

    # execute query
    return biomart.query(biomart.get_xml()).split("\n")