def id_map_ensembl_plants(to_annotation,version,species,psm_protein_id):
    """Map protein accessions to Ensembl transcript IDs via the plants BioMart.

    :param to_annotation: source annotation prefix (e.g. uniprot_swissprot)
    :param version: Ensembl database version
    :param species: full species name
    :param psm_protein_id: list of protein IDs to convert
    :return: list of "transcript_id<TAB>length<TAB>accession" strings
    """
    # Resolve archive host and dataset, then open the BioMart connection.
    query_string = _id_in_xml_query_(psm_protein_id)
    host = _get_ensembl_archive_(version, species)
    dataset = _get_ensembl_dataset_(species)
    biomart = BioMart(host=host)

    # Filter on the accession field of the requested annotation.
    biomart.add_dataset_to_xml(dataset)
    biomart.add_filter_to_xml(to_annotation + "_accession", query_string)

    # Attributes returned per row (in order): id, start, accession, end.
    for attribute in ("ensembl_transcript_id",
                      "transcript_start",
                      "uniprot_swissprot_accession",
                      "transcript_end"):
        biomart.add_attribute_to_xml(attribute)

    # Redirect the generated query from the default schema to the plants mart.
    xml_query = biomart.get_xml().replace(
        'virtualSchemaName = "default"',
        'virtualSchemaName = "plants_mart_30"')

    # Keep only complete 4-column rows and fold start/end into a length.
    result = []
    for row in biomart.query(xml_query).split("\n"):
        fields = row.split("\t")
        if len(fields) == 4:
            transcript_length = int(fields[3]) - int(fields[1]) + 1
            result.append("\t".join([fields[0], str(transcript_length), fields[2]]))
    return result
Example #2
0
def _test_reactome_example():
    """Query the Reactome BioMart for pathway IDs and display names.

    NOTE(review): the original comment says this service is no longer
    working; the function is kept for reference only.
    """
    s = BioMart("reactome.org")
    s.lookfor("reactome")
    s.datasets("REACTOME")
    # available datasets: ['interaction', 'complex', 'reaction', 'pathway']
    s.new_query()
    s.add_dataset_to_xml("pathway")
    # Fix: the species filter value was garbled to "H**o sapiens";
    # Reactome expects the full binomial species name.
    s.add_filter_to_xml("species_selection", "Homo sapiens")
    s.add_attribute_to_xml("pathway_db_id")
    s.add_attribute_to_xml("_displayname")
    xmlq = s.get_xml()
    res = s.query(xmlq)
def _test_reactome_example():
    """Query the Reactome BioMart (default host) for pathway IDs and names.

    NOTE(review): the original comment says this service is no longer
    working; the function is kept for reference only.
    """
    s = BioMart()
    s.lookfor("reactome")
    s.datasets("REACTOME")
    # available datasets: ['interaction', 'complex', 'reaction', 'pathway']
    s.new_query()
    s.add_dataset_to_xml("pathway")
    # Fix: the species filter value was garbled to "H**o sapiens";
    # Reactome expects the full binomial species name.
    s.add_filter_to_xml("species_selection", "Homo sapiens")
    s.add_attribute_to_xml("pathway_db_id")
    s.add_attribute_to_xml("_displayname")
    xmlq = s.get_xml()
    res = s.query(xmlq)
def id_map_ensembl(to_annotation, version, species, psm_protein_id):
    '''
    :param to_annotation: target identifier annotation (i.e. uniprot_swissprot)
    :param version: Database version
    :param species: Full species name
    :param psm_protein_id: list of IDs to be converted
    :return: BioMart results
    '''
    # If species is in plantsDB, execute plants adjusted function
    if species == "arabidopsis_thaliana":
        return id_map_ensembl_plants(to_annotation, version, species,
                                     psm_protein_id)

    # adjust UniProt xml annotation for BioMart version >87
    # (newer marts drop the underscore, e.g. uniprot_swissprot -> uniprotswissprot)
    if int(version) > 87 and "uniprot" in to_annotation:
        to_annotation = to_annotation.replace('_', '')

    # create connection
    query_string = _id_in_xml_query_(psm_protein_id)
    version = _get_ensembl_archive_(version, species)
    dataset = _get_ensembl_dataset_(species)
    biomart = BioMart(host=version)

    # add filters
    biomart.add_dataset_to_xml(dataset)
    biomart.add_filter_to_xml(to_annotation, query_string)

    # add attributes (transcript id, start/end and the target annotation)
    biomart.add_attribute_to_xml("ensembl_transcript_id")
    biomart.add_attribute_to_xml("transcript_start")
    biomart.add_attribute_to_xml("transcript_end")
    biomart.add_attribute_to_xml(to_annotation)
    # Fix: removed unused `attributes = biomart.attributes(dataset)` —
    # the result was never read and cost an extra network round trip.

    # execute query
    xml_query = biomart.get_xml()
    tmp_result = biomart.query(xml_query)
    if len(tmp_result) == 1:
        # An (essentially) empty response means the conversion failed.
        # Fix: print() call form so the module parses under Python 2 and 3;
        # also fixed the "trough" typo in the user-facing message.
        print("ERROR: could not convert ID's through BioMart, "
              "Please check whether Ensembl version/species were correctly supplied")

    # Fix: dropped the dead `if tmp_result != []` guard — str.split()
    # always returns a non-empty list, so the branch was unconditional.
    result = []
    for row in tmp_result.split("\n"):
        fields = row.split("\t")
        if fields[0] != "":
            # NOTE(review): unlike the plants variant this length has no
            # +1 (end - start only) — confirm whether that is intended.
            result.append([fields[0], (int(fields[2]) - int(fields[1])), fields[3]])
        else:
            # Preserve original behavior: rows with an empty first field
            # are appended as the raw split list.
            result.append(fields)
    return result
def retrieve_data_from_biomart(version, species, transcript_id,
                               three_frame_translation):
    '''
    :param version: Database version
    :param species: Full species name
    :param transcript_id: list of transcript IDs
    :return: BioMart results
    '''
    # Build the ID filter and connect to the matching Ensembl archive.
    tr_query = _id_in_xml_query_(transcript_id)
    host = _get_ensembl_archive_(version, species)
    dataset = _get_ensembl_dataset_(species)
    biomart = BioMart(host=host)

    # Restrict the query to the requested transcripts.
    biomart.add_dataset_to_xml(dataset)
    biomart.add_filter_to_xml("ensembl_transcript_id", tr_query)

    # Request id/location attributes plus the sequence type: full cDNA
    # when a three-frame translation was requested, coding sequence otherwise.
    sequence_attribute = "cdna" if three_frame_translation == "Y" else "coding"
    for attribute in ('ensembl_transcript_id', "chromosome_name",
                      "strand", sequence_attribute):
        biomart.add_attribute_to_xml(attribute)
    attributes = biomart.attributes(dataset)

    xml_query = biomart.get_xml()

    # create bypass for plants database
    if species == "arabidopsis_thaliana":
        xml_query = xml_query.replace('virtualSchemaName = "default"',
                                      'virtualSchemaName = "plants_mart_30"')

    return biomart.query(xml_query).split("\n")
def retrieve_data_from_biomart(version,species,transcript_id,three_frame_translation):
    '''
    :param version: Database version
    :param species: Full species name
    :param transcript_id: list of transcript IDs
    :return: BioMart results
    '''
    # Resolve the archive host and dataset for this species/version.
    xml_id_filter = _id_in_xml_query_(transcript_id)
    archive_host = _get_ensembl_archive_(version, species)
    mart_dataset = _get_ensembl_dataset_(species)
    connection = BioMart(host=archive_host)

    # Filter on the supplied transcript identifiers.
    connection.add_dataset_to_xml(mart_dataset)
    connection.add_filter_to_xml("ensembl_transcript_id", xml_id_filter)

    # Always fetch id/chromosome/strand; the sequence attribute depends on
    # whether a three-frame translation was requested.
    connection.add_attribute_to_xml('ensembl_transcript_id')
    connection.add_attribute_to_xml("chromosome_name")
    connection.add_attribute_to_xml("strand")
    if three_frame_translation == "Y":
        connection.add_attribute_to_xml("cdna")
    else:
        connection.add_attribute_to_xml("coding")
    attributes = connection.attributes(mart_dataset)

    query_xml = connection.get_xml()

    # Arabidopsis lives in the plants mart, not the default schema.
    if species == "arabidopsis_thaliana":
        query_xml = query_xml.replace('virtualSchemaName = "default"',
                                      'virtualSchemaName = "plants_mart_30"')

    raw_result = connection.query(query_xml)
    return raw_result.split("\n")
def id_map_ensembl_plants(to_annotation, version, species, psm_protein_id):
    '''
    :param to_annotation: to which annotation
    :param version: ensembl version
    :param species: species
    :param psm_protein_id: list of protein IDs
    :return: list of protein ID's converted to ENSEMBL
    '''
    # Connect to the archive that matches the requested Ensembl version.
    id_query = _id_in_xml_query_(psm_protein_id)
    archive = _get_ensembl_archive_(version, species)
    mart_dataset = _get_ensembl_dataset_(species)
    mart = BioMart(host=archive)

    # Filter on the "<annotation>_accession" field of the source annotation.
    mart.add_dataset_to_xml(mart_dataset)
    mart.add_filter_to_xml("%s_accession" % to_annotation, id_query)

    # Requested columns (in result order): id, start, accession, end.
    mart.add_attribute_to_xml("ensembl_transcript_id")
    mart.add_attribute_to_xml("transcript_start")
    mart.add_attribute_to_xml("uniprot_swissprot_accession")
    mart.add_attribute_to_xml("transcript_end")

    # Point the generated query at the plants mart schema.
    query_xml = mart.get_xml()
    query_xml = query_xml.replace('virtualSchemaName = "default"',
                                  'virtualSchemaName = "plants_mart_30"')

    # Collapse start/end into a transcript length, skipping malformed rows.
    converted = []
    for line in mart.query(query_xml).split("\n"):
        columns = line.split("\t")
        if len(columns) != 4:
            continue
        span = int(columns[3]) - int(columns[1]) + 1
        converted.append(columns[0] + "\t" + str(span) + "\t" + columns[2])
    return converted
def id_map_ensembl(to_annotation,version,species,psm_protein_id):
    '''
    :param to_annotation: target identifier annotation (i.e. uniprot_swissprot)
    :param version: Database version
    :param species: Full species name
    :param psm_protein_id: list of IDs to be converted
    :return: BioMart results
    '''
    # Plants species live in a separate mart and need the adjusted routine.
    if species == "arabidopsis_thaliana":
        return id_map_ensembl_plants(to_annotation, version, species,
                                     psm_protein_id)

    # create connection
    id_query = _id_in_xml_query_(psm_protein_id)
    archive = _get_ensembl_archive_(version, species)
    mart_dataset = _get_ensembl_dataset_(species)
    mart = BioMart(host=archive)

    # add filters
    mart.add_dataset_to_xml(mart_dataset)
    mart.add_filter_to_xml(to_annotation, id_query)

    # attributes: transcript id, its length and the TrEMBL accession
    mart.add_attribute_to_xml("ensembl_transcript_id")
    mart.add_attribute_to_xml("transcript_length")
    mart.add_attribute_to_xml("uniprot_sptrembl")
    attributes = mart.attributes(mart_dataset)

    # execute query and split the response into lines
    xml_query = mart.get_xml()
    return mart.query(xml_query).split("\n")
Example #9
0
    def run(self, inputs, outputs):
        """Run the analysis.

        Joins the input microarray expression matrices, maps probe IDs to
        Ensembl gene IDs (from a user-supplied mapping file or a BioMart
        query) and spawns one ``mapped-microarray-expression`` process per
        input sample.
        """

        # All inputs must agree on species, normalization, platform and GEO
        # platform ID; compare each sample against the first and abort on
        # any mismatch.
        for exp in inputs.expressions:
            if exp.output.species != inputs.expressions[0].output.species:
                self.error(
                    "Input samples are of different Species: "
                    f"{exp.output.species} and {inputs.expressions[0].output.species}."
                )
            if exp.output.exp_type != inputs.expressions[0].output.exp_type:
                self.error(
                    "Input samples have different Normalization types: "
                    f"{exp.output.exp_type} and {inputs.expressions[0].output.exp_type}."
                )
            if exp.output.platform != inputs.expressions[0].output.platform:
                self.error(
                    "Input samples have different Microarray platform types: "
                    f"{exp.output.platform} and {inputs.expressions[0].output.platform}."
                )
            if exp.output.platform_id != inputs.expressions[
                    0].output.platform_id:
                self.error(
                    "Input samples have different GEO platform IDs: "
                    f"{exp.output.platform_id} and {inputs.expressions[0].output.platform_id}."
                )

        # Homogeneity verified above, so the first sample's metadata stands
        # in for all of them.
        species = inputs.expressions[0].output.species
        platform = inputs.expressions[0].output.platform
        platform_id = inputs.expressions[0].output.platform_id

        # Combine all samples into one probe-indexed expression matrix.
        joined_expressions = join_expressions(inputs.expressions)
        probe_ids = joined_expressions.index.unique()

        if inputs.mapping_file:
            # --- Custom probe-to-Ensembl mapping supplied by the user ---
            mapping_file = inputs.mapping_file.import_file(
                imported_format="compressed")
            # Path.stem strips only the outer (compression) suffix, so for
            # "x.tab.gz" this checks the inner ".tab" extension.
            stem = Path(mapping_file).stem
            supported_extensions = (".tab", ".tsv", ".txt")
            if not stem.endswith(supported_extensions):
                self.error(
                    "Mapping file has unsupported file name extension. "
                    f"The supported extensions are {supported_extensions}.")
            # Two-column tab-separated file: ensembl_id <TAB> probe.
            mapping = pd.read_csv(
                mapping_file,
                sep="\t",
                header=0,
                names=["ensembl_id", "probe"],
                dtype=str,
            )
            mapping = mapping.drop_duplicates()

            # A custom mapping carries no provenance, so source and build
            # must be given explicitly.
            if inputs.source:
                source = inputs.source
            else:
                self.error(
                    "Custom probe id mapping file was provided but no source was selected."
                )
            if inputs.build:
                build = inputs.build
            else:
                self.error(
                    "Custom probe id mapping file was provided but genome build was not defined."
                )
            probe_mapping = "Custom"
        else:
            # --- Build the mapping automatically via ENSEMBL BioMart ---
            if not platform_id:
                self.error(
                    "Custom mapping file should be provided when samples do not have a GEO platform defined"
                )
            if platform_id not in PLATFORM_MAP:
                self.error(f"GEO platform {platform_id} is not supported.")

            # Derive the BioMart dataset name from the species, e.g.
            # "Homo sapiens" -> "hsapiens_gene_ensembl".
            species_low = species.lower()
            dataset = f"{species_low[0]}{species_low.split(' ')[1]}_gene_ensembl"
            probe_mapping = PLATFORM_MAP[platform_id]

            try:
                b = BioMart()
            except IOError:
                raise Exception(
                    "None of the ENSEMBL Biomart hosts is reachable.")
            except Exception as e:
                raise Exception(f"Unexpected biomart error: {e}")

            # Query gene id + probe id pairs, filtered to the probes that
            # actually occur in the joined expression matrix.
            b.add_dataset_to_xml(dataset)
            b.add_attribute_to_xml("ensembl_gene_id")
            b.add_attribute_to_xml(probe_mapping)  # type of microarray
            b.add_filter_to_xml(probe_mapping, ",".join(probe_ids))
            xml_query = b.get_xml()
            res = b.query(xml_query)

            # Parse the tab-separated BioMart response into the same
            # two-column shape as a custom mapping file.
            mapping = pd.read_csv(
                StringIO(res),
                sep="\t",
                header=None,
                names=["ensembl_id", "probe"],
                dtype=str,
            )
            mapping = mapping.drop_duplicates()
            mapping_file = f"{platform}_mapping.tsv"
            mapping.to_csv(mapping_file, sep="\t", index=False)

            # Recover the genome build from the dataset's display name.
            dataset_names = b.get_datasets("ENSEMBL_MART_ENSEMBL")
            display_name = dataset_names.loc[
                dataset_names["name"] == dataset]["description"].to_string()
            # Typical display name would be Human genes (GRCh38.p13)
            build = re.search("\((.+?)\)", display_name).group(1)
            source = "ENSEMBL"

        # keep=False drops every probe that maps to more than one gene,
        # removing ambiguous probes entirely.
        mapping = mapping.drop_duplicates(subset=["probe"], keep=False)

        # NOTE(review): this assignment assumes the rows selected by
        # .loc[mapping["probe"]] align 1:1, in order, with mapping's rows —
        # confirm probes are unique in joined_expressions.
        data = joined_expressions.loc[mapping["probe"]]
        data["ensembl_id"] = mapping["ensembl_id"].tolist()
        data = data.reset_index()

        # For Ensembl IDs with multiple probe IDs retain the one with highest expression.
        data["mean"] = data.loc[:,
                                data.columns.
                                difference(["probe", "ensembl_id"])].mean(
                                    axis=1)
        idx_max = data.groupby(["ensembl_id"])["mean"].idxmax()
        data = data.loc[idx_max].set_index("ensembl_id")

        data = data.drop(columns=["probe", "mean"])
        data.index.name = "Gene"

        # Persist the full mapped matrix, then one single-column file per
        # sample, handed to a downstream process together with provenance.
        mapped_file = "mapped_expressions.tsv.gz"
        data.to_csv(mapped_file, sep="\t", index=True, compression="gzip")
        for column, exp in zip(data.columns, inputs.expressions):
            mapped_column = f"{column}_mapped_exp.tsv.gz"
            data.to_csv(
                mapped_column,
                sep="\t",
                index=True,
                columns=[column],
                header=["Expression"],
                index_label="Gene",
                compression="gzip",
            )
            self.run_process(
                "mapped-microarray-expression",
                {
                    "exp_unmapped": exp.id,
                    "exp": mapped_column,
                    "source": source,
                    "build": build,
                    "probe_mapping": probe_mapping,
                },
            )

        outputs.mapped_exp = mapped_file
        outputs.mapping = mapping_file
        outputs.probe_mapping = probe_mapping
        outputs.platform = platform
        if platform_id:
            outputs.platform_id = platform_id