Beispiel #1
0
def parse_sabio_rk_for_model(model: cobra.Model, json_output_path: str,
                             bigg_id_name_mapping_path: str) -> None:
    """Retrieves kcats from SABIO-RK for the given model and stores them as a JSON in the given path.

    Algorithm
    ----------
    The EC numbers of all reactions of the model are read out of the reactions'
    "ec-code" annotation. For the unique EC numbers, kcats are retrieved with
    sabio_rk.get_ec_number_kcats_wildcard_search (the SABIO-RK REST API is, as of
    2019/30/04, explained under
    http://sabiork.h-its.org/layouts/content/docuRESTfulWeb/RESTWebserviceIntro.gsp),
    and the result of this search is written as JSON.

    Arguments
    ----------
    * model: cobra.Model ~ The model for which kcats shall be retrieved from SABIO-RK.
    * json_output_path: str ~ The path of the JSON that shall be created
    * bigg_id_name_mapping_path: str ~ The path which is handed over unchanged to
      sabio_rk.get_ec_number_kcats_wildcard_search (presumably the BIGG ID<->name mapping
      JSON created by parse_bigg_metabolites_file - to be confirmed).

    Output
    ----------
    * A JSON in the given project folder with the following structure:
    <pre>
        {
            "$EC_NUMBER_OR_KEGG_REACTION_ID": {
                "$SUBSTRATE_WITH_BIGG_ID_1": {
                    "$ORGANISM_1": [
                        $kcat_1,
                        (...)
                        $kcat_n,
                    ]
                },
                (...),
                "REST": {
                    "$ORGANISM_1": [
                        $kcat_1,
                        (...)
                        $kcat_n,
                    ]
                }
            }
            (...),
        }
    </pre>
    'REST' stands for a substrate without found BIGG ID.
    """
    # GET LIST OF EC NUMBERS
    ec_numbers_list: List[str] = []
    for reaction in model.reactions:
        # Reactions without EC number annotation cannot be searched
        if "ec-code" not in reaction.annotation:
            continue
        ec_codes = reaction.annotation["ec-code"]
        # A single EC number is stored as plain string, multiple ones as list
        if isinstance(ec_codes, str):
            ec_codes = [ec_codes]
        ec_numbers_list.extend(ec_codes)
    # Remove duplicates; sorting makes the order of the search reproducible
    # between runs (the iteration order of a set of strings is not).
    ec_numbers_list = sorted(set(ec_numbers_list))

    # GET KCATS FOR EC NUMBERS
    ec_number_kcat_mapping = sabio_rk.get_ec_number_kcats_wildcard_search(
        ec_numbers_list, bigg_id_name_mapping_path)

    json_write(json_output_path, ec_number_kcat_mapping)
Beispiel #2
0
def parse_bigg_metabolites_file(bigg_metabolites_file_path: str, json_output_folder: str) -> None:
    """Parses a BIGG metabolites text file and stores a name<->BIGG ID mapping as JSON.

    As of 29/04/2019, a BIGG metabolites list of all BIGG-included metabolites
    is retrievable under http://bigg.ucsd.edu/data_access

    The file is read as tab-separated text. From every line, the second column
    is used as BIGG ID and the third column (in lowercase) as name. Lines with
    fewer than three columns (e.g., empty lines or lines without a name) are
    skipped completely.

    Arguments
    ----------
    * bigg_metabolites_file_path: str ~ The file path to the BIGG metabolites file.
      The usual file name (which has to be included too in this argument) is
      bigg_models_metabolites.txt
    * json_output_folder: str ~ The folder in which the JSON including the parsed BIGG
      metabolites file data is stored with the name 'bigg_id_name_mapping.json'

    Output
    ----------
    * Nothing is returned. A JSON file with the name 'bigg_id_name_mapping.json' is
      written in the given output folder, with the following structure:
    <pre>
     {
         "$CHEMICAL_OR_USUAL_NAME_IN_LOWERCASE": "$BIGG_ID",
         (...),
         "$BIGG_ID": "$BIGG_ID",
         (...),
     }
    </pre>
    The BIGG ID <-> BIGG ID mapping is done for models which already use the BIGG IDs.
    """
    # Standardize output folder
    json_output_folder = standardize_folder(json_output_folder)

    # Mapping variable which will store the name<->BIGG ID and
    # the BIGG ID<->BIGG ID associations
    bigg_id_name_mapping: Dict[str, str] = {}
    # Go through each BIGG metabolites file line (which is a tab-separated file)
    # and retrieve the BIGG ID and the name (if there is a name for the given BIGG
    # ID)
    with open(bigg_metabolites_file_path, "r") as f:
        for raw_line in f:
            columns = raw_line.replace("\n", "").split("\t")
            # Without a BIGG ID column and a name column, no mapping can be
            # created. This also covers blank lines, which would otherwise
            # cause an IndexError while reading the BIGG ID column.
            if len(columns) < 3:
                continue
            bigg_id = columns[1]
            name = columns[2].lower()

            bigg_id_name_mapping[name] = bigg_id
            bigg_id_name_mapping[bigg_id] = bigg_id

    # Write the JSON in the given folder
    json_write(json_output_folder + "bigg_id_name_mapping.json", bigg_id_name_mapping)
Beispiel #3
0
def parse_brenda_json_for_model(sbml_path: str, brenda_json_path: str,
                                output_json_path: str) -> None:
    """Reads out a BRENDA JSON file created with parse_brenda_textfile and creates a model-specific JSON.

    For every EC number which occurs in the "ec-code" annotation of the model's reactions,
    the entry of the BRENDA JSON is looked up with _get_transfer_ec_number_entry.
    If there is no entry for the EC number, or if the looked up entry contains the key
    "ERROR", the entries of all database EC numbers which fit to the searched EC number
    according to is_fitting_ec_numbers are combined instead. The wildcard levels 1 to 4 are
    tried in this order, and the first level with at least one fitting entry is used.

    Arguments
    ----------
    * sbml_path: str ~ The path of the SBML model of which a specific BRENDA JSON kcat database
      shall be created
    * brenda_json_path: str ~ The full path to the BRENDA JSON created with parse_brenda_textfile.
    * output_json_path: str ~ The full path to the newly created JSON.

    Output
    ----------
    Nothing is returned. A JSON is written to output_json_path, with the following structure:
    <pre>
    {
        '$EC_NUMBER': {
            '$BIGG_ID_METABOLITE': {
                '$ORGANISM': [
                    kcat_list: float
                ],
                (...)
            },
            (...),
            'WILDCARD': $TRUE_IF_ENTRY_WAS_COMBINED_FROM_FITTING_EC_NUMBERS
        },
        (...)
    }
    </pre>
    """
    model: cobra.Model = cobra.io.read_sbml_model(sbml_path)

    # Get EC numbers of the model's reactions
    ec_numbers_of_model: List[str] = []
    for reaction in model.reactions:
        if "ec-code" not in reaction.annotation:
            continue

        ec_numbers_of_reaction = reaction.annotation["ec-code"]
        # A single EC number is stored as plain string, multiple ones as list
        if isinstance(ec_numbers_of_reaction, str):
            ec_numbers_of_reaction = [ec_numbers_of_reaction]
        ec_numbers_of_model += ec_numbers_of_reaction
    # Remove duplicates; sorting makes the key order of the written JSON
    # reproducible between runs (the iteration order of a string set is not).
    ec_numbers_of_model = sorted(set(ec_numbers_of_model))

    # Get EC number entries for each EC number of the model
    brenda_kcat_database_original = json_load(brenda_json_path)
    brenda_kcat_database_for_model: Dict[str, Any] = {}
    for ec_number in ec_numbers_of_model:
        # Direct hit: Use the database entry if it can be resolved without error
        if ec_number in brenda_kcat_database_original:
            ec_number_entry = _get_transfer_ec_number_entry(
                ec_number, brenda_kcat_database_original)
            if "ERROR" not in ec_number_entry:
                ec_number_entry["WILDCARD"] = False
                brenda_kcat_database_for_model[ec_number] = ec_number_entry
                continue

        # No usable direct hit: Collect the entries of all fitting database
        # EC numbers, starting with the lowest wildcard level, and stop at the
        # first wildcard level for which at least one entry was found.
        eligible_ec_number_entries: List[Dict[str, Any]] = []
        for wildcard_level in range(1, 5):
            # The key list is copied before the iteration, as in the original code
            for database_ec_number in list(
                    brenda_kcat_database_original.keys()):
                if not is_fitting_ec_numbers(ec_number, database_ec_number,
                                             wildcard_level):
                    continue
                database_ec_number_entry = _get_transfer_ec_number_entry(
                    database_ec_number, brenda_kcat_database_original)
                if "ERROR" not in database_ec_number_entry:
                    eligible_ec_number_entries.append(
                        database_ec_number_entry)
            if eligible_ec_number_entries:
                break

        # Combine all eligible entries into one entry for the searched EC number
        ec_number_entry = {}
        for eligible_ec_number_entry in eligible_ec_number_entries:
            for metabolite_key, metabolite_entry in eligible_ec_number_entry.items():
                if metabolite_key not in ec_number_entry:
                    ec_number_entry[metabolite_key] = metabolite_entry
                else:
                    # NOTE(review): If the same organism occurs for this metabolite
                    # in several eligible entries, the kcat list of the later entry
                    # replaces the earlier one (the lists are not concatenated) -
                    # confirm that this is intended.
                    ec_number_entry[metabolite_key] = {
                        **ec_number_entry[metabolite_key],
                        **metabolite_entry
                    }
        ec_number_entry["WILDCARD"] = True
        brenda_kcat_database_for_model[ec_number] = ec_number_entry

    json_write(output_json_path, brenda_kcat_database_for_model)
Beispiel #4
0
# Script section: Loads an enzyme-constrained model from SBML, writes the flux
# control files for all scenarios, determines the differential reactions of the
# scenarios, prints them and stores them as JSON.
# flux_control_folder, project_name and ec_model_scenarios_for_optimization are
# not defined in this section; they have to be defined before it is executed.

# Read SBML model
print("Reading SBML model...")
original_thermogecko_sbml_path: str = "ec_model_2019_06_25_output_optimization/iJO1366_sMOMENT_2019_06_25_STANDARD_EXCHANGE_SCENARIO_MANUAL_CHANGES.xml"
# The second argument (.095) is presumably the protein pool bound, as the same
# value is set as upper bound below - TODO confirm against set_up_ec_model_with_sbml
model: cobra.Model = set_up_ec_model_with_sbml(original_thermogecko_sbml_path,
                                               .095)

# Set protein bound
# (the upper bound of the reaction with the ID "ER_pool_TG_" is set explicitly)
model.reactions.get_by_id("ER_pool_TG_").upper_bound = .095

# Get flux controlling proteins
print("Getting flux control files...")
reaction_flux_control_by_scenario(model, flux_control_folder, project_name,
                                  ec_model_scenarios_for_optimization)

# Get differential proteins
# The threshold expression evaluates to 0.0001. The second returned value of
# get_differential_reactions is not used in this section.
print("Getting differential reactions (Growth)...")
unique_differential_reactions_of_scenarios, _ = \
    get_differential_reactions(list(ec_model_scenarios_for_optimization.keys()), flux_control_folder, project_name,
                               ec_model_scenarios_for_optimization,
                               threshold=(.1) / 1000, print_result=True)

# Get unique reactions in MATLAB style
# For each scenario, a "% <scenario>" header line is printed, followed by one
# line of the form "R_<reaction>", (with quotation marks and trailing comma)
# for each of its unique differential reactions.
for scenario_key in unique_differential_reactions_of_scenarios.keys():
    print(f"% {scenario_key}")
    unique_reactions = unique_differential_reactions_of_scenarios[scenario_key]
    for unique_reaction in unique_reactions:
        print(f'"R_{unique_reaction}",')
# Store the unique differential reactions of all scenarios as JSON
json_write(
    "ec_model_2019_06_25_output_optimization/iJO1366_sMOMENT_2019_06_25_STANDARD_EXCHANGE_SCENARIO_MANUAL_CHANGES_unique_differential_reactions_of_scenarios.json",
    unique_differential_reactions_of_scenarios)
Beispiel #5
0

# Get directionality data
# For every reaction with a gene rule, the sign of its pFBA flux is used as the
# reaction's direction ("reverse" for a negative flux, "forward" otherwise), and
# this direction is stored for every gene of the gene rule which occurs in
# gene_id_data_mapping (which has to be defined before this section is executed).
model = cobra.io.read_sbml_model("ec_model_2019_06_25_input/iJO1366.xml")
pfba_solution = cobra.flux_analysis.pfba(model)
for reaction in model.reactions:
    gene_reaction_rule = reaction.gene_reaction_rule
    if gene_reaction_rule == "":
        continue

    # Split the gene rule into its gene names. Parentheses are treated as
    # separators too: Without that, a rule such as "(a and b) or c" would yield
    # the names "(a" and "b)", which never match a key of gene_id_data_mapping,
    # so that grouped genes would silently get no direction.
    gene_names = [
        token
        for token in gene_reaction_rule.replace("(", " ").replace(")", " ").split()
        if token not in ("and", "or")
    ]

    has_negative_flux = pfba_solution.fluxes[reaction.id] < 0
    direction = "reverse" if has_negative_flux else "forward"
    for gene_name in gene_names:
        # Only genes for which data already exists get a direction entry
        if gene_name not in gene_id_data_mapping:
            continue

        if "direction" not in gene_id_data_mapping[gene_name]:
            gene_id_data_mapping[gene_name]["direction"] = {}

        gene_id_data_mapping[gene_name]["direction"][reaction.id] = direction

# Write JSON
json_write("ec_model_2019_06_25_input_keff_paper/gene_id_data_mapping.json", gene_id_data_mapping)
Beispiel #6
0
def get_reactions_kcat_mapping(sbml_path: str, project_folder: str,
                               project_name: str, organism: str,
                               kcat_database_path: str,
                               protein_kcat_database_path: str,
                               type_of_kcat_selection: str = "mean") -> None:
    """Creates a JSON with a reaction<->kcat mapping for the given model.

    The selection of kcats is depending on the affected metabolites of the reaction direction (one
    kcat is given for each the forward and reverse direction), and on the organism (the kcats
    from the taxonomically nearest organism are preferred). The actual selection is performed by
    the helper functions _get_searched_metabolites and _get_kcat.

    Reactions without an "ec-code" annotation get their kcats from the protein-dependent
    database only (math.nan is stored if no kcat could be assigned). Reactions with an
    "ec-code" annotation for which no kcat entry can be collected from the EC number
    database are not included in the output at all.

    Arguments
    ----------
    * sbml_path: str ~ The SBML path to the model
    * project_folder: str ~ The folder in which the model data files are stored
    * project_name: str ~ The name of the used project
    * organism: str ~ The organism's name
    * kcat_database_path: str ~ A path to an already created EC number<->kcats database
    * protein_kcat_database_path: str ~ A path to the custom protein<->kcat database. If it
      is the string "none", no protein<->kcat database is loaded.
    * type_of_kcat_selection: str ~ Can be "mean", "median" or "random". Refers to the selection of found kcats of a reaction.
                                   Is "mean" by default.

    Output
    ----------
    Nothing is returned. A JSON is written in the given project folder with the name
    $project_name+'_reactions_kcat_mapping_combined.json' and the following structure:
    <pre>
    {
        "$REACTION_NAME": {
            "forward": $forward_kcat,
            "reverse": $reverse_kcat
        },
        (...)
    }
    </pre>
    """
    # Standardize project folder
    project_folder = standardize_folder(project_folder)
    # Set the path for the output JSON
    basepath = project_folder + project_name
    # Load the combined, EC-number-dependent kcat database
    kcat_database = json_load(kcat_database_path)
    # If given, load the protein-dependent kcat database
    if protein_kcat_database_path != "none":
        protein_kcat_database = json_load(protein_kcat_database_path)
    else:
        protein_kcat_database = {}

    # Load the given stoichiometric model
    model = cobra.io.read_sbml_model(sbml_path)

    # Set-up dictionary which will be the content of the output JSON
    reactions_kcat_mapping: Dict[str, Dict[str, float]] = {}
    # Go through each reaction in order to assign kcats for it
    for reaction in model.reactions:
        # If no EC number is given in the reaction's annotations,
        # the protein-dependent database is read out in order to
        # find a kcat. This only works if at least one of the assigned
        # enzymes of the reaction's gene rule has a kcat in the
        # protein-dependent database.
        if "ec-code" not in reaction.annotation:
            # 0 means that no kcat can be assigned
            forward_kcat: Any = 0
            reverse_kcat: Any = 0

            if protein_kcat_database != {}:
                # Retrieve the kcats from the protein-dependent database
                forward_kcat = _get_kcat_from_protein_kcat_database(
                    "forward", reaction, protein_kcat_database)
                reverse_kcat = _get_kcat_from_protein_kcat_database(
                    "reverse", reaction, protein_kcat_database)

            # If no kcat could be assigned, set the kcat to math.nan
            # which indicates this case
            if forward_kcat == 0.0:
                forward_kcat = math.nan
            if reverse_kcat == 0.0:
                reverse_kcat = math.nan

            # Add the retrieved forward and reverse kcats to the reaction<->kcat mapping dictionary
            reactions_kcat_mapping[reaction.id] = {
                "forward": forward_kcat,
                "reverse": reverse_kcat,
            }

            # Print the assigned kcats
            _print_assigned_kcats(reaction.id, forward_kcat, reverse_kcat)
            continue

        # Retrieve the reaction's associated EC numbers
        reaction_ids = reaction.annotation["ec-code"]
        # If only one EC number is given, set the EC number string to
        # a list in order to make it work with the following code lines
        if isinstance(reaction_ids, str):
            reaction_ids = [reaction_ids]
        # Get all EC numbers which do not contain a - wildcard, such as
        # in 2.1.1.-
        # These wildcarded EC numbers are in general too permissive in order
        # to get useful kcats. They are only used if there is no other EC number.
        eligible_reaction_ids = [x for x in reaction_ids if "-" not in x]
        if not eligible_reaction_ids:
            eligible_reaction_ids = list(reaction_ids)

        # Create a 'complete entry' from all eligible (i.e., non-wildcarded)
        # EC numbers. This complete entry contains - for every organism
        # and substrate given in the EC number kcat entries - all kcats
        # of all eligible EC numbers. In addition, the pseudo-substrate
        # "ALL" is added which contains all organisms. "ALL" is used
        # later if no fitting substrate can be found.
        complete_entry: Dict[str, Any] = {"ALL": {}}
        for reaction_id in eligible_reaction_ids:
            # If the EC number could not be found in the given EC number<->kcat
            # database, print it and proceed with the next eligible EC number
            if reaction_id not in kcat_database:
                print(f"INFO: No entry for EC number {reaction_id}")
                print("")
                continue
            # Otherwise, get the EC number's entry from the given database
            reaction_id_entry = kcat_database[reaction_id]
            # Exclude all kcat entries which come from a wildcard search
            # with *
            if reaction_id_entry["WILDCARD"]:
                continue
            # Go through each metabolite in the EC number<->kcat database entry
            for metabolite_key, species_entries in reaction_id_entry.items():
                # Ignore the keys which show additional information
                # about the nature of the kcat data
                if metabolite_key in ("WILDCARD", "SOURCE", "TRANSFER"):
                    continue
                # Add the metabolite to the complete entry if it does not already occur
                metabolite_entry = complete_entry.setdefault(metabolite_key, {})
                for species_key, kcats in species_entries.items():
                    # Add the kcats of the currently analyzed EC number to the
                    # current species of the current metabolite, and of "ALL".
                    # The lists of the complete entry are newly created here, so
                    # that the lists of the loaded database are never changed.
                    metabolite_entry.setdefault(species_key, []).extend(kcats)
                    complete_entry["ALL"].setdefault(species_key, []).extend(kcats)

        # If no entries with kcats could be found for any of the eligible EC numbers,
        # continue with the next reaction (which means that this reaction does not
        # occur in the output JSON).
        if not complete_entry["ALL"]:
            continue

        # Get the BIGG IDs of the educts and products using the SBML's BIGG ID annotation
        educt_bigg_ids: List[str] = [
            reactant.annotation["bigg.metabolite"]
            for reactant in reaction.reactants
            if "bigg.metabolite" in reactant.annotation
        ]
        product_bigg_ids: List[str] = [
            product.annotation["bigg.metabolite"]
            for product in reaction.products
            if "bigg.metabolite" in product.annotation
        ]
        # If no BIGG IDs could be found in the SBML, add the pseudo-metabolite "X"
        # which indicates that "ALL" should be used later.
        if not educt_bigg_ids:
            educt_bigg_ids = ["X"]
        if not product_bigg_ids:
            product_bigg_ids = ["X"]

        # Get the metabolites which are used in the subsequent forward kcat search
        searched_educts = _get_searched_metabolites(complete_entry,
                                                    educt_bigg_ids)
        # Get the forward kcat depending on the educts and the organism
        forward_kcat = _get_kcat(searched_educts, complete_entry, organism,
                                 "forward", reaction, protein_kcat_database,
                                 type_of_kcat_selection)

        # Get the metabolites which are used in the subsequent reverse kcat search
        searched_products = _get_searched_metabolites(complete_entry,
                                                      product_bigg_ids)
        # Get the reverse kcat depending on the products and the organism
        reverse_kcat = _get_kcat(searched_products, complete_entry, organism,
                                 "reverse", reaction, protein_kcat_database,
                                 type_of_kcat_selection)

        # Set the found out kcats in the reactions<->kcat mapping
        reactions_kcat_mapping[reaction.id] = {
            "forward": forward_kcat,
            "reverse": reverse_kcat,
        }

        # Display the found out kcats for this reaction
        _print_assigned_kcats(reaction.id, forward_kcat, reverse_kcat)

    # Export the kcat mapping results as JSON
    json_write(basepath + "_reactions_kcat_mapping_combined.json",
               reactions_kcat_mapping)
Beispiel #7
0
def get_protein_mass_mapping(model: cobra.Model, project_folder: str,
                             project_name: str) -> None:
    """Creates a JSON with a mapping of protein IDs as keys, and as values the protein mass.

    The protein masses are calculated using the amino acid sequence from UniProt (retrieved using
    UniProt's REST API). Already retrieved masses are cached as pickled floats in the folder
    './_cache/uniprot/' (one file per UniProt ID) and are not searched again.

    NOTE(review): The original documentation states that the masses are given in kDa. The value
    of biopython's ProteinAnalysis.molecular_weight() is stored without any conversion here -
    please confirm the unit against the biopython documentation.

    Arguments
    ----------
    * model: cobra.Model ~ The model in the cobrapy format
    * project_folder: str ~ The folder in which the JSON shall be created
    * project_name: str ~ The beginning of the JSON's file name

    Output
    ----------
    Nothing is returned. A JSON file is written with the path
    project_folder+project_name+'_protein_id_mass_mapping.json'
    and the following structure:
    <pre>
    {
        "$PROTEIN_ID": $PROTEIN_MASS,
        (...),
    }
    </pre>
    """
    # Standardize project folder
    project_folder = standardize_folder(project_folder)

    # The beginning of the created JSON's path
    basepath: str = project_folder + project_name

    # GET UNIPROT ID - PROTEIN MAPPING
    uniprot_id_protein_id_mapping: Dict[str, List[str]] = {}
    for gene in model.genes:
        # Without a UniProt ID, no mass mapping can be found
        if "uniprot" not in gene.annotation:
            continue
        uniprot_id = gene.annotation["uniprot"]
        uniprot_id_protein_id_mapping.setdefault(uniprot_id, []).append(gene.id)

    # GET UNIPROT ID<->PROTEIN MASS MAPPING
    uniprot_id_protein_mass_mapping: Dict[str, float] = {}
    # The cache stores UniProt masses for already searched
    # UniProt IDs (each file in the cache folder has the name
    # of the corresponding UniProt ID). This prevents searching
    # UniProt for already found protein masses.
    cache_basepath = "./_cache/uniprot/"
    ensure_folder_existence("./_cache/")
    ensure_folder_existence(cache_basepath)
    cache_files = get_files(cache_basepath)
    # Go through each batch of UniProt IDs (multiple UniProt IDs
    # are searched at once in order to save an amount of UniProt API calls)
    # and retrieve the amino acid sequences and using these sequences, their
    # masses.
    uniprot_ids = list(uniprot_id_protein_id_mapping.keys())
    batch_size = 5
    for batch_start in range(0, len(uniprot_ids), batch_size):
        # Create the batch with all UniProt IDs
        prebatch = uniprot_ids[batch_start:batch_start + batch_size]
        batch = []
        # Remove all IDs which are present in the cache (i.e.,
        # which were searched for already).
        # The cache consists of pickled protein mass floats, each
        # one in a file with the name of the associated UniProt ID.
        # The cache files are written by this function itself; they must
        # not be replaced by files from an untrusted source.
        for uniprot_id in prebatch:
            if uniprot_id not in cache_files:
                batch.append(uniprot_id)
            else:
                cache_filepath = cache_basepath + uniprot_id
                uniprot_id_protein_mass_mapping[uniprot_id] = pickle_load(
                    cache_filepath)
                print(uniprot_id + ":",
                      uniprot_id_protein_mass_mapping[uniprot_id])

        # If all IDs could be found in the cache, continue with the next batch.
        if not batch:
            continue

        # Create the UniProt query for the batch
        # With 'OR', all given IDs are searched, and subsequently in this script,
        # the right associated masses are being picked.
        # NOTE(review): This is the URL scheme of the UniProt website as used when
        # this code was written - verify that this endpoint is still served.
        query = " OR ".join(batch)
        uniprot_query_url = f"https://www.uniprot.org/uniprot/?query={query}&format=tab&columns=id,sequence"

        # Call UniProt's API
        uniprot_data = requests.get(uniprot_query_url).text.split("\n")
        # Wait in order to cool down their server
        time.sleep(2.0)

        # Read out the API-returned lines (the first line is skipped as header)
        for line in uniprot_data[1:]:
            if line == "":
                continue
            columns = line.split("\t")
            uniprot_id = columns[0]
            sequence = columns[1]
            # Get the protein mass using biopython's associated function for amino acid sequences
            mass = ProteinAnalysis(sequence,
                                   monoisotopic=False).molecular_weight()
            uniprot_id_protein_mass_mapping[uniprot_id] = float(mass)

        # Create the pickled cache files for the searched protein masses
        for uniprot_id in batch:
            # The result does not necessarily contain every searched ID.
            # Such an ID has no mass which could be cached, and it must not
            # abort the whole run with a KeyError.
            if uniprot_id not in uniprot_id_protein_mass_mapping:
                print(f"INFO: No UniProt result for {uniprot_id}")
                continue
            cache_filepath = cache_basepath + uniprot_id
            pickle_write(cache_filepath,
                         uniprot_id_protein_mass_mapping[uniprot_id])

    # Create the final protein ID <-> mass mapping
    protein_id_mass_mapping: Dict[str, float] = {}
    for uniprot_id, protein_mass in uniprot_id_protein_mass_mapping.items():
        # The search result can contain UniProt IDs which do not belong to
        # any gene of the model; these are reported and skipped.
        try:
            protein_ids = uniprot_id_protein_id_mapping[uniprot_id]
        except KeyError:
            print(f"No mass found for {uniprot_id}!")
            continue
        for protein_id in protein_ids:
            protein_id_mass_mapping[protein_id] = protein_mass

    # Write protein mass list JSON
    json_write(basepath + "_protein_id_mass_mapping.json",
               protein_id_mass_mapping)
Beispiel #8
0
def get_id_associated_kcats(searched_ids: List[str], id_type: str,
                            bigg_id_name_mapping_path: str, batch_size: int = 5) -> Dict[str, Any]:
    """Returns a dictionary with SABIO-RK kcat data for the given EC numbers or KEGG IDs.

    This function calls the SABIO-RK API.

    Input
    ----------
    * searched_ids: List[str] ~ The list of searched IDs
    * id_type: str ~ Must be either 'EC' or 'KEGG', depending on whether you are looking for kcats for EC numbers
      or KEGG IDs.
    * batch_size: int = 5 ~ The SABIO-RK API search batching number (i.e., with satch_size=5 five IDs are searched at once)

    Output
    ----------
    A dictionary with the following content:
    <pre>
    {
        "$EC_NUMBER_OR_KEGG_REACTION_ID": {
            "$SUBSTRATE_WITH_BIGG_ID_1": {
                "$ORGANISM_1": [
                    $kcat_1,
                    (...)
                    $kcat_n,
                ]
            },
            (...),
            "REST": {
                "$ORGANISM_1": [
                    $kcat_1,
                    (...)
                    $kcat_n,
                ]
            }
        }
        (...),
    }
    </pre>
    'REST' stands for a substrate without found BIGG ID.
    """
    # Set-up the cache if it does not exist yet \o/
    cache_basepath = "./_cache/sabio_rk_total/"
    ensure_folder_existence("./_cache/")
    ensure_folder_existence(cache_basepath)
    cache_files = get_files(cache_basepath)
    # Load the given BIGG ID<->metabolite common name mapping
    bigg_id_name_mapping = json_load(bigg_id_name_mapping_path)
    # In order to save search time, use the seat (i.e., a list where
    # every member occurs only once) of the given searched IDs
    searched_ids = list(set(searched_ids))
    
    # Set the given ID name to the name which SABIO-RK uses for them
    if id_type == "EC":
        id_name = "ECNumber"
    elif id_type == "KEGG":
        id_name = "KeggReactionID"

    # Depending on the wildcard level which is serched, either
    # the output or the wildcard output will be used as output
    # These central dictionaries will contain the ID<->kcat mapping
    output = {}
    wildcard_output = {}
    # We use batched searched in order to save search time :D
    batch_start = 0
    # Loop while not all IDs were searched \o/
    while batch_start < len(searched_ids):
        # Get the batch for the search :-)
        batch = searched_ids[batch_start: batch_start + batch_size]
        # The query dicts contain a list of dictionaries which contain
        # the data for a SABIO-RK search entry
        query_dicts: List[Dict[str, str]] = []
        # Go through each single EC number in the search bath
        for ec_number in batch:
            # Create the cache filename
            cache_filename = ec_number.replace(".", "_").replace("*", "W") + ".json"
            # If the EC number is already searched, i.e. it can be found in the cache,
            # take the results from there in order to save much search time :D
            if cache_filename in cache_files:
                cache_filepath = cache_basepath + cache_filename
                output[ec_number] = json_load(cache_filepath)
                print(f"Loading {cache_filename}...")
            # Otherwise, create an actual SABIO-RK API search query
            else:
                query_dicts.append({id_name: ec_number, "Parametertype": "kcat", "EnzymeType": "wildtype"})
        # If not all of the searched IDs are present in the cache...
        if len(query_dicts) > 0:
            # ...use SABIO-RK's API :D
            print(f"Performing query {query_dicts}...")
            result = sabio_rk_query_get_csv_lines(query_dicts)

            # If there was an error with the SABIO-RK result (i.e., no result found or an invalid given ID),
            # continue with the next batch
            if result == "NO_RESULT":
                batch_start += batch_size
                continue
        # ...otherwise set the query result to nothing
        else:
            result = []

        # Loop through every SABIO-RK API query call result :D
        temp_ec_numbers_found_in_search = []
        result = _extract_kcat_lines(result)
        for row in result:
            # Get the unit of the parameter
            unit = row["parameter.unit"]
            # If it is a weird unusable unit, do not use this result and continue with the next result \o/
            if unit not in list(UNIT_MULTIPLIER.keys()):  # e.g. (s^-1)*(mg^-1)
                continue

            # Get the serached ID
            ec_number = row[id_name]
            # Generate a lowercarse and semicolon seperated list of substrates
            substrates_names = row["Substrate"]
            substrates_list = [x.lower() for x in substrates_names.replace("+", "").split(";")]
            substrates_list = sorted(substrates_list)
            # Convert the substrates name list into a BIGG ID list (only works
            # if there is a name<->BIGG ID mapping present for each substrate)
            bigg_ig_substrates_list = []
            for substrate in substrates_list:
                if substrate in bigg_id_name_mapping.keys():
                    bigg_id = bigg_id_name_mapping[substrate]
                    bigg_ig_substrates_list.append(bigg_id)
                # If one of the substrates cannot be found, use the pseudometabolite "REST"
                # and break :O
                else:
                    bigg_ig_substrates_list = ["REST"]
                    break
            # Set the substrate list to a semicolon-connected string
            substrate = ";".join(bigg_ig_substrates_list)
            # Get the result's organism :D
            species = row["Organism"]
            # Get the kcat and set
            # it to 1/s for consistent behaviour :D
            raw_kcat = float(row["parameter.startValue"])  # Without unit correction
            kcat = raw_kcat * UNIT_MULTIPLIER[unit]  # With unit correction 🎉

            # Add the result to the output for the given EC number, substrate and species
            if ec_number not in output.keys():
                output[ec_number] = {}
            if substrate not in output[ec_number].keys():
                output[ec_number][substrate] = {}
            if species not in output[ec_number][substrate].keys():
                output[ec_number][substrate][species] = []
            output[ec_number][substrate][species].append(kcat)

            # Since we found a result, add the EC number :D
            temp_ec_numbers_found_in_search.append(ec_number)

        # Create cache files for all newly found EC numbers which were not present
        # in the cache
        temp_ec_numbers_found_in_search = list(set(temp_ec_numbers_found_in_search))
        for ec_number in temp_ec_numbers_found_in_search:
            cache_filename = ec_number.replace(".", "_") + ".json"
            if cache_filename not in cache_files:
                json_write(cache_basepath + cache_filename, output[ec_number])

        # Get all wildcarded searched EC numbers...
        wildcarded_searched_ec_numbers = [x for x in batch if "*" in x]
        # ...and loop through them in order to create a result for the EC numbers
        # which fit into the wildcard (i.e 1.1.1.123 in 1.1.1.*) :D
        for wildcarded_ec_number in wildcarded_searched_ec_numbers:
            # Set the cache name for the wildcarded EC number
            cache_filename = wildcarded_ec_number.replace(".", "_").replace("*", "W") + ".json"
            # If the wildcarded EC number cannot be found in the cache, search for
            # fitting EC numbers, and combine their entries into a huge entry for the
            # wildcarded EC number
            if cache_filename not in cache_files:
                fitting_ec_numbers = []
                for found_ec_number in temp_ec_numbers_found_in_search:
                    if is_fitting_ec_numbers(wildcarded_ec_number, found_ec_number, wildcarded_ec_number.count("*")):
                        fitting_ec_numbers.append(found_ec_number)

                # Combine the EC number entries of fitting EC numbers :D
                wildcarded_ec_number_dict: Dict[str, Any] = {}
                for fitting_ec_number in fitting_ec_numbers:
                    fitting_ec_number_result = output[fitting_ec_number]
                    for metabolite_key in fitting_ec_number_result.keys():
                        if metabolite_key not in wildcarded_ec_number_dict.keys():
                            wildcarded_ec_number_dict[metabolite_key] = fitting_ec_number_result[metabolite_key]
                        else:
                            for organism_key in fitting_ec_number_result[metabolite_key].keys():
                                if organism_key not in wildcarded_ec_number_dict[metabolite_key].keys():
                                    wildcarded_ec_number_dict[metabolite_key][organism_key] =\
                                        copy.deepcopy(fitting_ec_number_result[metabolite_key][organism_key])
                                else:
                                    wildcarded_ec_number_dict[metabolite_key][organism_key] +=\
                                        copy.deepcopy(fitting_ec_number_result[metabolite_key][organism_key])
                                wildcarded_ec_number_dict[metabolite_key][organism_key] =\
                                    list(set(wildcarded_ec_number_dict[metabolite_key][organism_key]))
                # Create cache files for the searched wildcarded EC numbers \o/
                if wildcarded_ec_number_dict != {}:
                    json_write(cache_basepath + cache_filename, wildcarded_ec_number_dict)
                    wildcard_output[wildcarded_ec_number] = wildcarded_ec_number_dict
            # If the wildcarded EC number is in the cache, load the cache file :D
            else:
                wildcard_output[wildcarded_ec_number] = json_load(cache_basepath + cache_filename)
                print(f"Loading {cache_filename}...")

        # Continue with the next searched ID batch :D
        batch_start += batch_size

    # If the wildcard level is greater than 0, set the wildcard output as output
    if len(wildcard_output.keys()) > 0:
        output = wildcard_output

    return output
Beispiel #9
0
def parse_brenda_textfile(brenda_textfile_path: str,
                          bigg_metabolites_json_folder: str,
                          json_output_path: str) -> None:
    """Goes through a BRENDA database textfile and converts it into a machine-readable JSON.

    The JSON includes kcats for found organisms and substrates.
    As of 29/04/2019, the BRENDA database can be downloaded as textfile under
    https://www.brenda-enzymes.org/download_brenda_without_registration.php

    The BRENDA database is not in a completely standardized format, so that this function
    contains many convoluted checks and circumventions of non-standardized data.

    kcats from mutated enzymes are excluded.

    Algorithm
    ----------
    1. The text file is scanned line by line. An 'ID<TAB>' line starts a new EC number,
       a 'PROTEIN' header starts the organism section and a 'TURNOVER_NUMBER' header
       starts the kcat section. An empty line ends the current section. Within a section,
       a line starting with 'PR' (organisms) or 'TN' (kcats) starts a new entry and every
       other line is appended to the current entry as a continuation line.
    2. For each EC number, the organism entries are converted into a mapping from
       BRENDA protein reference number to organism name, and each kcat entry is
       stored under its substrate and organism.

    Arguments
    ----------
    * brenda_textfile_path: str ~ The BRENDA database text file path
    * bigg_metabolites_json_folder: str ~ The folder in which the BIGG metabolites
      database is stored (it has to have the name 'bigg_id_name_mapping.json').
    * json_output_path: str ~ The path of the JSON that shall be created

    Output
    ----------
    * A JSON containing the BRENDA textfile kcat data in a machine-readable format:
    <pre>
        {
            "$EC_NUMBER": {
                "$SUBSTRATE_WITH_BIGG_ID_1": {
                    "$ORGANISM_1": [
                        $kcat_1,
                        (...)
                        $kcat_n,
                    ]
                },
                (...),
                "REST": {
                    "$ORGANISM_1": [
                        $kcat_1,
                        (...)
                        $kcat_n,
                    ]
                }
            }
            (...),
        }
    </pre>
    'REST' stands for a substrate without found BIGG ID.
    An EC number which was transferred to another EC number only contains the
    key "TRANSFER" with the (lowercased) target EC number as value; if the transfer
    target cannot be read, the entry stays empty and a warning is printed.

    Raises
    ----------
    * KeyError if a kcat entry references a protein number without organism entry.
    * ValueError if a kcat value cannot be converted into a number.
    """
    # Standardize the folder path (the file name is appended directly below)
    bigg_metabolites_json_folder = standardize_folder(
        bigg_metabolites_json_folder)

    # Load BIGG ID <-> metabolite name mapping :D
    bigg_id_name_mapping: Dict[str, str] = json_load(
        bigg_metabolites_json_folder + "bigg_id_name_mapping.json")

    # Load BRENDA textfile as list of strings without newlines :D
    with open(brenda_textfile_path, "r", encoding="utf-8") as f:
        lines = [x.replace("\n", "") for x in f.readlines()]

    # Go through each line and collect the organism lines and kcat lines for each EC number
    in_turnover_numbers = False
    in_organism_reference = False
    ec_number_kcat_lines_mapping: Dict[str, List[str]] = {}
    ec_number_organism_lines_mapping: Dict[str, List[str]] = {}
    current_ec_number = ""
    organism_lines: List[str] = []
    kcat_lines: List[str] = []
    i = 0
    while i < len(lines):
        line = lines[i]
        if line.startswith("ID\t"):
            # A new EC number begins: store what was collected for the previous one
            if current_ec_number != "":
                ec_number_organism_lines_mapping[
                    current_ec_number] = organism_lines
                ec_number_kcat_lines_mapping[current_ec_number] = kcat_lines
            current_ec_number = line.replace("ID\t", "").replace(" ()", "")
            organism_lines = []
            kcat_lines = []

        if len(line) == 0:
            in_turnover_numbers = False
            in_organism_reference = False
        elif line.startswith("PROTEIN"):
            # Skip the section header itself and continue with its first content line
            in_organism_reference = True
            i += 1
            if i >= len(lines):
                # The header is the last line of the file, there is nothing to read
                break
            line = lines[i]
        elif line.startswith("TURNOVER_NUMBER"):
            in_turnover_numbers = True
            i += 1
            if i >= len(lines):
                break
            line = lines[i]

        if in_organism_reference:
            if line.startswith("PR"):
                organism_lines.append("")
            # A continuation line can only be attached if an entry was started before
            if organism_lines:
                if len(organism_lines[-1]) > 0:
                    organism_lines[-1] += " "
                organism_lines[-1] += " " + line

        elif in_turnover_numbers:
            if line.startswith("TN"):
                kcat_lines.append("")
            if kcat_lines:
                if len(kcat_lines[-1]) > 0:
                    kcat_lines[-1] += " "
                kcat_lines[-1] += line

        # The line may have been replaced by the line after a section header,
        # so the end-of-section check has to be repeated here
        if len(line) == 0:
            in_turnover_numbers = False
            in_organism_reference = False

        i += 1

    # Store the last EC number, too: there is no following 'ID' line which
    # would trigger the storing inside the loop
    if current_ec_number != "":
        ec_number_organism_lines_mapping[current_ec_number] = organism_lines
        ec_number_kcat_lines_mapping[current_ec_number] = kcat_lines

    # Markers of database accession hints which may follow an organism name
    database_markers = ("swissprot", "uniprot", "genbank", "trembl")

    # Create the BRENDA database dictionary using the collected kcat and organism lines
    # of each EC number :D
    brenda_kcat_database: Dict[str, Any] = {}
    for ec_number in ec_number_kcat_lines_mapping:
        if "(transferred to " in ec_number:
            actual_ec_number = ec_number.split(" (transferred")[0]
            brenda_kcat_database[actual_ec_number] = {}
            try:
                brenda_kcat_database[actual_ec_number]["TRANSFER"] = \
                    ec_number.lower().replace("  ", " ").split("(transferred to ec")[1].replace(")", "").lstrip()
            except IndexError:
                # Some transfers go to general subgroups instead of single EC numbers so that
                # no kcat database can be built from it D:
                print(f"WARNING: BRENDA text file line {ec_number} is not interpretable!")
            continue

        brenda_kcat_database[ec_number] = {}

        # Map each protein reference number (the text between the first two '#')
        # to the organism name which follows it
        reference_number_organism_mapping = {}
        for organism_line in ec_number_organism_lines_mapping[ec_number]:
            reference_number = organism_line.split("#")[1]
            name_parts = [
                x for x in organism_line.split("# ")[1].split(" ")
                if len(x) > 0
            ]

            # 'end' is the exclusive index of the last word of the organism name
            end = 1
            for part in name_parts:
                lowered_part = part.lower()
                # Some organism names contain their SwissProt or UniProt ID,
                # since we don't need them they are excluded (together with the
                # word directly in front of the marker)
                if (lowered_part == "and") or \
                   any(marker in lowered_part for marker in database_markers):
                    end -= 2
                    break

                # The name ends where the bracketed part of the line begins
                if ("<" in part) or ("(" in part):
                    end -= 1
                    break

                end += 1
            reference_number_organism_mapping[reference_number] = \
                " ".join(name_parts[:end])

        for kcat_line in ec_number_kcat_lines_mapping[ec_number]:
            # Exclude kcats of mutated/changed proteins since
            # they may not have a biological relevance
            lowered_kcat_line = kcat_line.lower()
            if ("mutant" in lowered_kcat_line) or ("mutated" in lowered_kcat_line):
                continue
            hash_separated_parts = kcat_line.split("#")
            # Only the first reference number of a comma-separated list is used
            reference_number = hash_separated_parts[1].split(",")[0]
            organism = reference_number_organism_mapping[reference_number]
            kcat_str = hash_separated_parts[2].split("{")[0].strip()
            # The value may be a range ('a-b'), in this case the maximum is used
            # NOTE(review): a leading '-' is dropped by this split, so a negative
            # value is read as positive - confirm that this is intended
            kcat = max([float(x) for x in kcat_str.split("-") if len(x) > 0])
            # The substrate name stands between the first '{' and the following '}'
            substrate_name = kcat_line.split("{")[1].split("}")[0].lower()
            substrate = bigg_id_name_mapping.get(substrate_name, "REST")

            brenda_kcat_database[ec_number] \
                .setdefault(substrate, {}) \
                .setdefault(organism, []) \
                .append(kcat)

    # Write final BRENDA kcat database :D
    json_write(json_output_path, brenda_kcat_database)
Beispiel #10
0
def create_combined_kcat_database(sabio_rk_kcat_database_path: str, brenda_kcat_database_path: str, output_path: str) -> None:
    """Merges a SABIO-RK and a BRENDA kcat database JSON into one JSON without wildcard-only entries.

    Every EC number of the BRENDA database is looked up in both databases
    (it therefore has to exist in the SABIO-RK database, too). Depending on the
    "WILDCARD" flags of the two entries, the EC number is
    * skipped if both entries stem from a wildcard search,
    * taken from the single database whose entry is not from a wildcard search, or
    * merged from both databases if neither entry is from a wildcard search. In this
      case, a substrate found in both databases gets the keys of both substrate entries;
      for a key occurring in both, the BRENDA value replaces the SABIO-RK value.

    Arguments
    ----------
    * sabio_rk_kcat_database_path: str ~ The path to the SABIO-RK kcat database JSON
    * brenda_kcat_database_path: str ~ The path to the BRENDA kcat database JSON
    * output_path: str ~ The output path (with filename) of the generated combined kcat database JSON

    Output
    ----------
    * A JSON with the following format:
    <pre>
    {
        '$EC_NUMBER': {
            '$BIGG_IDS_OF_SUBSTRATES': {
                (... the substrate entry as given in the source database(s))
            },
            (...),
            'SOURCE': 'SABIO-RK' or 'BRENDA' or 'BRENDA and SABIO-RK',
            'WILDCARD': false
        },
        (...)
    }
    </pre>
    """
    sabio_rk_database = json_load(sabio_rk_kcat_database_path)
    brenda_database = json_load(brenda_kcat_database_path)

    combined_database: Dict[str, Dict[str, Any]] = {}
    # The BRENDA keys define which EC numbers are looked at
    for ec_number, brenda_ec_entry in brenda_database.items():
        sabio_rk_ec_entry = sabio_rk_database[ec_number]
        sabio_rk_is_wildcard: bool = sabio_rk_ec_entry["WILDCARD"]
        brenda_is_wildcard: bool = brenda_ec_entry["WILDCARD"]

        # Two wildcard results -> nothing usable for this EC number
        if sabio_rk_is_wildcard and brenda_is_wildcard:
            continue

        merged_entry: Dict[str, Any]
        if sabio_rk_is_wildcard:
            # Only BRENDA has a non-wildcard entry, so it is taken over as it is
            merged_entry = brenda_ec_entry
            merged_entry["WILDCARD"] = False
            merged_entry["SOURCE"] = "BRENDA"
        elif brenda_is_wildcard:
            # Only SABIO-RK has a non-wildcard entry, so it is taken over as it is
            merged_entry = sabio_rk_ec_entry
            merged_entry["WILDCARD"] = False
            merged_entry["SOURCE"] = "SABIO-RK"
        else:
            # Both entries are non-wildcard ones: unite them substrate by substrate
            merged_entry = {}
            sabio_rk_keys = list(sabio_rk_ec_entry.keys())
            brenda_keys = list(brenda_ec_entry.keys())
            for metabolite in set(sabio_rk_keys + brenda_keys):
                # The flag is no substrate, it is set separately below
                if metabolite == "WILDCARD":
                    continue
                if metabolite not in brenda_ec_entry:
                    merged_entry[metabolite] = sabio_rk_ec_entry[metabolite]
                elif metabolite not in sabio_rk_ec_entry:
                    merged_entry[metabolite] = brenda_ec_entry[metabolite]
                else:
                    # The later unpacked BRENDA entry wins for keys present in both
                    merged_entry[metabolite] = {
                        **sabio_rk_ec_entry[metabolite],
                        **brenda_ec_entry[metabolite],
                    }
            merged_entry["WILDCARD"] = sabio_rk_is_wildcard
            merged_entry["SOURCE"] = "BRENDA and SABIO-RK"

        combined_database[ec_number] = merged_entry

    json_write(output_path, combined_database)