Example #1
def get_datasets(mart):

    bm = BioMart(verbose=False)
    datasets = bm.datasets(mart, raw=True)

    return pd.read_table(StringIO(datasets), header=None, usecols=[1, 2],
                         names = ["Name", "Description"])
Example #2
def get_attributes(dataset):

    bm = BioMart(verbose=False)
    attributes = bm.attributes(dataset)
    attr_dicts = [{"Attribute": k, "Description": v[0]}
                  for k, v in attributes.items()]
    return pd.DataFrame.from_dict(attr_dicts)
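
A hedged call for the helper above; the dataset name is the standard Ensembl human genes dataset and is not defined in the snippet itself:

# list the attributes exposed by the human genes dataset
human_attrs = get_attributes("hsapiens_gene_ensembl")
print(human_attrs.head())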
Example #3
    def query_biomart(self,
                      dataset,
                      attributes,
                      host="www.ensembl.org",
                      cache=True,
                      save_filename=None):
        bm = BioMart(host=host)
        bm.new_query()
        bm.add_dataset_to_xml(dataset)
        for at in attributes:
            bm.add_attribute_to_xml(at)
        xml_query = bm.get_xml()

        print("Querying {} from {} with attributes {}...".format(
            dataset, host, attributes))
        results = bm.query(xml_query)
        df = pd.read_csv(StringIO(results),
                         header=None,
                         names=attributes,
                         sep="\t",
                         low_memory=True)

        if cache:
            self.cache_dataset(dataset, df, save_filename)
        return df
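
A minimal usage sketch for the method above; `obj` is a stand-in for whatever object carries this method and its cache_dataset helper in the original project, and the attribute names are standard Ensembl BioMart attributes:

df = obj.query_biomart(                      # obj is hypothetical
    dataset="hsapiens_gene_ensembl",
    attributes=["ensembl_gene_id", "external_gene_name"],
    cache=False,                             # skip obj.cache_dataset(...)
)
print(df.head())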
Example #4
def test_biomart_constructor():
    s = BioMart()
    try:
        s.registry()
    except:
        pass
    try:
        s.host = "dummy"
    except:
        pass
    s.host = "www.ensembl.org"
Example #6
    def run(self, inputs, outputs):
        """Run the analysis."""

        for exp in inputs.expressions:
            if exp.output.species != inputs.expressions[0].output.species:
                self.error(
                    "Input samples are of different Species: "
                    f"{exp.output.species} and {inputs.expressions[0].output.species}."
                )
            if exp.output.exp_type != inputs.expressions[0].output.exp_type:
                self.error(
                    "Input samples have different Normalization types: "
                    f"{exp.output.exp_type} and {inputs.expressions[0].output.exp_type}."
                )
            if exp.output.platform != inputs.expressions[0].output.platform:
                self.error(
                    "Input samples have different Microarray platform types: "
                    f"{exp.output.platform} and {inputs.expressions[0].output.platform}."
                )
            if exp.output.platform_id != inputs.expressions[0].output.platform_id:
                self.error(
                    "Input samples have different GEO platform IDs: "
                    f"{exp.output.platform_id} and {inputs.expressions[0].output.platform_id}."
                )

        species = inputs.expressions[0].output.species
        platform = inputs.expressions[0].output.platform
        platform_id = inputs.expressions[0].output.platform_id

        joined_expressions = join_expressions(inputs.expressions)
        probe_ids = joined_expressions.index.unique()

        if inputs.mapping_file:
            mapping_file = inputs.mapping_file.import_file(
                imported_format="compressed")
            stem = Path(mapping_file).stem
            supported_extensions = (".tab", ".tsv", ".txt")
            if not stem.endswith(supported_extensions):
                self.error(
                    "Mapping file has unsupported file name extension. "
                    f"The supported extensions are {supported_extensions}.")
            mapping = pd.read_csv(
                mapping_file,
                sep="\t",
                header=0,
                names=["ensembl_id", "probe"],
                dtype=str,
            )
            mapping = mapping.drop_duplicates()

            if inputs.source:
                source = inputs.source
            else:
                self.error(
                    "Custom probe id mapping file was provided but no source was selected."
                )
            if inputs.build:
                build = inputs.build
            else:
                self.error(
                    "Custom probe id mapping file was provided but genome build was not defined."
                )
            probe_mapping = "Custom"
        else:
            if not platform_id:
                self.error(
                    "Custom mapping file should be provided when samples do not have a GEO platform defined"
                )
            if platform_id not in PLATFORM_MAP:
                self.error(f"GEO platform {platform_id} is not supported.")

            species_low = species.lower()
            dataset = f"{species_low[0]}{species_low.split(' ')[1]}_gene_ensembl"
            probe_mapping = PLATFORM_MAP[platform_id]

            try:
                b = BioMart()
            except IOError:
                raise Exception(
                    "None of the ENSEMBL Biomart hosts is reachable.")
            except Exception as e:
                raise Exception(f"Unexpected biomart error: {e}")

            b.add_dataset_to_xml(dataset)
            b.add_attribute_to_xml("ensembl_gene_id")
            b.add_attribute_to_xml(probe_mapping)  # type of microarray
            b.add_filter_to_xml(probe_mapping, ",".join(probe_ids))
            xml_query = b.get_xml()
            res = b.query(xml_query)

            mapping = pd.read_csv(
                StringIO(res),
                sep="\t",
                header=None,
                names=["ensembl_id", "probe"],
                dtype=str,
            )
            mapping = mapping.drop_duplicates()
            mapping_file = f"{platform}_mapping.tsv"
            mapping.to_csv(mapping_file, sep="\t", index=False)

            dataset_names = b.get_datasets("ENSEMBL_MART_ENSEMBL")
            display_name = dataset_names.loc[
                dataset_names["name"] == dataset]["description"].to_string()
            # Typical display name would be Human genes (GRCh38.p13)
            build = re.search(r"\((.+?)\)", display_name).group(1)
            source = "ENSEMBL"

        mapping = mapping.drop_duplicates(subset=["probe"], keep=False)

        data = joined_expressions.loc[mapping["probe"]]
        data["ensembl_id"] = mapping["ensembl_id"].tolist()
        data = data.reset_index()

        # For Ensembl IDs with multiple probe IDs retain the one with highest expression.
        data["mean"] = data.loc[:,
                                data.columns.
                                difference(["probe", "ensembl_id"])].mean(
                                    axis=1)
        idx_max = data.groupby(["ensembl_id"])["mean"].idxmax()
        data = data.loc[idx_max].set_index("ensembl_id")

        data = data.drop(columns=["probe", "mean"])
        data.index.name = "Gene"

        mapped_file = "mapped_expressions.tsv.gz"
        data.to_csv(mapped_file, sep="\t", index=True, compression="gzip")
        for column, exp in zip(data.columns, inputs.expressions):
            mapped_column = f"{column}_mapped_exp.tsv.gz"
            data.to_csv(
                mapped_column,
                sep="\t",
                index=True,
                columns=[column],
                header=["Expression"],
                index_label="Gene",
                compression="gzip",
            )
            self.run_process(
                "mapped-microarray-expression",
                {
                    "exp_unmapped": exp.id,
                    "exp": mapped_column,
                    "source": source,
                    "build": build,
                    "probe_mapping": probe_mapping,
                },
            )

        outputs.mapped_exp = mapped_file
        outputs.mapping = mapping_file
        outputs.probe_mapping = probe_mapping
        outputs.platform = platform
        if platform_id:
            outputs.platform_id = platform_id
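
The probe-to-gene collapsing step in the middle of this process (for each Ensembl gene ID, keep the probe with the highest mean expression across samples) can be shown in isolation; this is a standalone pandas sketch with invented values, not part of the original pipeline:

import pandas as pd

data = pd.DataFrame({
    "probe": ["p1", "p2", "p3"],
    "ensembl_id": ["ENSG01", "ENSG01", "ENSG02"],
    "sample_a": [1.0, 5.0, 2.0],
    "sample_b": [2.0, 6.0, 3.0],
})
# mean expression per probe over the sample columns only
data["mean"] = data.loc[:, data.columns.difference(["probe", "ensembl_id"])].mean(axis=1)
# for each gene keep the row (probe) with the highest mean
idx_max = data.groupby(["ensembl_id"])["mean"].idxmax()
data = data.loc[idx_max].set_index("ensembl_id").drop(columns=["probe", "mean"])
# ENSG01 keeps probe p2 (mean 5.5 > 1.5); ENSG02 keeps its single probe p3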
Example #7
def _test_reactome_example():
    # this is not working anymore...
    s = BioMart()
    s.lookfor("reactome")
    s.datasets("REACTOME")
    #['interaction', 'complex', 'reaction', 'pathway']
    s.new_query()
    s.add_dataset_to_xml("pathway")
    s.add_filter_to_xml("species_selection", "H**o sapiens")
    s.add_attribute_to_xml("pathway_db_id")
    s.add_attribute_to_xml("_displayname")
    xmlq = s.get_xml()
    res = s.query(xmlq)
Example #8
    def test_general(self):
        # test another host
        s = BioMart(host="www.ensembl.org")
Example #9
def id_map_ensembl(to_annotation,version,species,psm_protein_id):
    '''
    :param to_annotation: target identifier annotation (e.g. uniprot_swissprot)
    :param version: Database version
    :param species: Full species name
    :param psm_protein_id: list of IDs to be converted
    :return: BioMart results
    '''

    # If species is in plantsDB, execute plants adjusted function
    if species=="arabidopsis_thaliana":
        result=id_map_ensembl_plants(to_annotation,version,species,psm_protein_id)
        return result
    else:

        #create connection
        query_string=_id_in_xml_query_(psm_protein_id)
        version=_get_ensembl_archive_(version,species)
        dataset=_get_ensembl_dataset_(species)
        biomart = BioMart(host=version)

        #add filters
        biomart.add_dataset_to_xml(dataset)
        biomart.add_filter_to_xml(to_annotation,query_string)

        #add attributes
        biomart.add_attribute_to_xml("ensembl_transcript_id")
        biomart.add_attribute_to_xml("transcript_length")
        biomart.add_attribute_to_xml("uniprot_sptrembl")
        attributes=biomart.attributes(dataset)

        #execute query
        xml_query=biomart.get_xml()
        result=biomart.query(xml_query)
        result=result.split("\n")

        return result
Example #10
def id_map_ensembl_plants(to_annotation,version,species,psm_protein_id):

    #create connection
    query_string=_id_in_xml_query_(psm_protein_id)
    version=_get_ensembl_archive_(version,species)
    dataset=_get_ensembl_dataset_(species)
    biomart = BioMart(host=version)

    #add filters
    biomart.add_dataset_to_xml(dataset)
    biomart.add_filter_to_xml(to_annotation+"_accession",query_string)

    #add attributes
    biomart.add_attribute_to_xml("ensembl_transcript_id")
    biomart.add_attribute_to_xml("transcript_start")
    biomart.add_attribute_to_xml("uniprot_swissprot_accession")
    biomart.add_attribute_to_xml("transcript_end")

    #execute query
    xml_query=biomart.get_xml()
    xml_query=xml_query.replace('virtualSchemaName = "default"','virtualSchemaName = "plants_mart_30"')

    #parse results and adjust length
    temp_result=biomart.query(xml_query).split("\n")
    result=[]
    for row in temp_result:
        items=row.split("\t")
        # print row
        if len(items)==4:
            length=int(items[3])-int(items[1])+1
            result.append(items[0]+"\t"+str(length)+"\t"+items[2])
    return result
Example #11
def id_map_ensembl_plants(to_annotation, version, species, psm_protein_id):
    '''
    :param to_annotation: target identifier annotation
    :param version: Ensembl version
    :param species: full species name
    :param psm_protein_id: list of protein IDs
    :return: list of protein IDs converted to ENSEMBL
    '''
    #create connection
    query_string = _id_in_xml_query_(psm_protein_id)
    version = _get_ensembl_archive_(version, species)
    dataset = _get_ensembl_dataset_(species)
    biomart = BioMart(host=version)

    #add filters
    biomart.add_dataset_to_xml(dataset)
    biomart.add_filter_to_xml(to_annotation + "_accession", query_string)

    #add attributes
    biomart.add_attribute_to_xml("ensembl_transcript_id")
    biomart.add_attribute_to_xml("transcript_start")
    biomart.add_attribute_to_xml("uniprot_swissprot_accession")
    biomart.add_attribute_to_xml("transcript_end")

    #execute query
    xml_query = biomart.get_xml()
    xml_query = xml_query.replace('virtualSchemaName = "default"',
                                  'virtualSchemaName = "plants_mart_30"')

    #parse results and adjust length
    temp_result = biomart.query(xml_query).split("\n")
    result = []
    for row in temp_result:
        items = row.split("\t")
        # print row
        if len(items) == 4:
            length = int(items[3]) - int(items[1]) + 1
            result.append(items[0] + "\t" + str(length) + "\t" + items[2])
    return result
Example #12
def retrieve_data_from_biomart(version,species,transcript_id,three_frame_translation):
    '''
    :param version: Database version
    :param species: Full species name
    :param transcript_id: list of transcript IDs
    :return: BioMart results
    '''

    #create connection
    tr_query=_id_in_xml_query_(transcript_id)
    version=_get_ensembl_archive_(version,species)

    dataset=_get_ensembl_dataset_(species)
    biomart = BioMart(host=version)

    #add filters
    biomart.add_dataset_to_xml(dataset)
    biomart.add_filter_to_xml("ensembl_transcript_id",tr_query)

    #add attributes
    biomart.add_attribute_to_xml('ensembl_transcript_id')
    biomart.add_attribute_to_xml("chromosome_name")
    biomart.add_attribute_to_xml("strand")
    if three_frame_translation=="Y":
        biomart.add_attribute_to_xml("cdna")
    else:
        biomart.add_attribute_to_xml("coding")
    attributes=biomart.attributes(dataset)

    #execute query
    xml_query=biomart.get_xml()

    # create bypass for plants database
    if species=="arabidopsis_thaliana":
        xml_query=xml_query.replace('virtualSchemaName = "default"','virtualSchemaName = "plants_mart_30"')

    result=biomart.query(xml_query)
    result=result.split("\n")

    return result
Example #13
    def setup_class(klass):
        klass.s = BioMart(verbose=False)
Example #14
def id_map_ensembl(to_annotation, version, species, psm_protein_id):
    '''
    :param to_annotation: target identifier annotation (e.g. uniprot_swissprot)
    :param version: Database version
    :param species: Full species name
    :param psm_protein_id: list of IDs to be converted
    :return: BioMart results
    '''
    # If species is in plantsDB, execute plants adjusted function
    if species == "arabidopsis_thaliana":
        result = id_map_ensembl_plants(to_annotation, version, species,
                                       psm_protein_id)
        return result
    else:
        #adjust UniProt xml annotation for BioMart version >87
        if int(version) > 87 and "uniprot" in to_annotation:
            to_annotation = to_annotation.replace('_', '')
        #create connection
        query_string = _id_in_xml_query_(psm_protein_id)
        version = _get_ensembl_archive_(version, species)
        dataset = _get_ensembl_dataset_(species)
        biomart = BioMart(host=version)

        #add filters
        biomart.add_dataset_to_xml(dataset)
        biomart.add_filter_to_xml(to_annotation, query_string)

        #add attributes
        biomart.add_attribute_to_xml("ensembl_transcript_id")
        biomart.add_attribute_to_xml("transcript_start")
        biomart.add_attribute_to_xml("transcript_end")
        biomart.add_attribute_to_xml(to_annotation)
        attributes = biomart.attributes(dataset)

        #execute query
        xml_query = biomart.get_xml()
        tmp_result = biomart.query(xml_query)
        if len(tmp_result) == 1:
            print "ERROR: could not convert ID's trough BioMart, " \
                  "Please check whether Ensembl version/species were correctly supplied"
        tmp_result = tmp_result.split("\n")
        result = []

        if tmp_result != []:
            for i in tmp_result:
                i = i.split("\t")
                if i[0] != "":
                    result.append([i[0], (int(i[2]) - int(i[1])), i[3]])
                else:
                    result.append(i)
        return result
Example #15
    def setup_class(klass):
        # ideally we should not provide a host to be more generic
        # but this takes lots of time or is simply down.
        klass.s = BioMart(host='www.ensembl.org', verbose=False)
        klass.mart_test = 'ENSEMBL_MART_ENSEMBL'
Example #16
def get_bm(intype, outtype, dataset, mart):

    """Queries biomart for data.
    Gets the whole map between INTYPE <-> OUTTYPE and caches it so that disk
    based lookups are used afterwards."""

    bm = BioMart(verbose=False)

    bm.new_query()

    bm.add_dataset_to_xml(dataset)

    bm.add_attribute_to_xml(intype)
    bm.add_attribute_to_xml(outtype)

    xml_query = bm.get_xml()

    results = bm.query(xml_query)

    map_df = pd.read_table(StringIO(results), header=None, names=[intype,
                                                                  outtype])

    outfile = _get_data_output_filename(intype, outtype, dataset, mart,
                                        default_cache_path=default_cache_path)

    map_df.to_csv(outfile, sep="\t", index=False)

    return map_df
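
A hedged call for get_bm; the attribute and dataset names below are standard Ensembl BioMart identifiers, and _get_data_output_filename/default_cache_path are assumed to be defined elsewhere in the original module:

# map Ensembl gene IDs to gene symbols for the human dataset
id_to_symbol = get_bm("ensembl_gene_id", "external_gene_name",
                      "hsapiens_gene_ensembl", "ENSEMBL_MART_ENSEMBL")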
Example #17
def test_general():
    s = BioMart()
    #s.registry()
    s.datasets("ensembl")
    s.version("ensembl")
    s.attributes("oanatinus_gene_ensembl")
    s.filters("oanatinus_gene_ensembl")
    s.configuration("oanatinus_gene_ensembl")

    xmlq = """<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE Query>
<Query  virtualSchemaName = "default" formatter = "TSV" header = "0" uniqueRows = "0" count = "" datasetConfigVersion = "0.6" >
                        
        <Dataset name = "pathway" interface = "default" >
                <Filter name = "referencepeptidesequence_uniprot_id_list" value = "P43403"/>
                <Attribute name = "stableidentifier_identifier" />
                <Attribute name = "pathway_db_id" />
        </Dataset>
</Query>
"""
    s.query(s._xml_example)


    # build own xml using the proper functions
    s.add_dataset_to_xml("protein")
    s.get_xml()
Example #18
def biomart():
    biomart = BioMart(host='www.ensembl.org', verbose=False)
    biomart.mart_test = 'ENSEMBL_MART_ENSEMBL'
    return biomart
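
The function above is presumably registered as a pytest fixture (the decorator is not shown in the snippet); a hypothetical test using it might look like this, where the dataset name is the standard Ensembl human genes dataset:

def test_list_datasets(biomart):
    # biomart fixture assumed to carry the extra mart_test attribute set above
    names = biomart.datasets(biomart.mart_test)
    assert "hsapiens_gene_ensembl" in names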
Example #20
def _test_reactome_example():
    # this is not working anymore...
    s = BioMart("reactome.org")
    s.lookfor("reactome")
    s.datasets("REACTOME")
    #['interaction', 'complex', 'reaction', 'pathway']
    s.new_query()
    s.add_dataset_to_xml("pathway")
    s.add_filter_to_xml("species_selection", "H**o sapiens")
    s.add_attribute_to_xml("pathway_db_id")
    s.add_attribute_to_xml("_displayname")
    xmlq = s.get_xml()
    res = s.query(xmlq)
Example #21
def retrieve_data_from_biomart(version, species, transcript_id,
                               three_frame_translation):
    '''
    :param version: Database version
    :param species: Full species name
    :param transcript_id: list of transcript IDs
    :return: BioMart results
    '''

    #create connection
    tr_query = _id_in_xml_query_(transcript_id)
    version = _get_ensembl_archive_(version, species)

    dataset = _get_ensembl_dataset_(species)
    biomart = BioMart(host=version)

    #add filters
    biomart.add_dataset_to_xml(dataset)
    biomart.add_filter_to_xml("ensembl_transcript_id", tr_query)

    #add attributes
    biomart.add_attribute_to_xml('ensembl_transcript_id')
    biomart.add_attribute_to_xml("chromosome_name")
    biomart.add_attribute_to_xml("strand")
    if three_frame_translation == "Y":
        biomart.add_attribute_to_xml("cdna")
    else:
        biomart.add_attribute_to_xml("coding")
    attributes = biomart.attributes(dataset)

    #execute query
    xml_query = biomart.get_xml()

    # create bypass for plants database
    if species == "arabidopsis_thaliana":
        xml_query = xml_query.replace('virtualSchemaName = "default"',
                                      'virtualSchemaName = "plants_mart_30"')

    result = biomart.query(xml_query)
    result = result.split("\n")

    return result