Example #1
0
def reaction_flux_control_by_scenario(model: cobra.Model, output_folder: str,
                                      project_name: str, scenarios):
    """Creates reaction flux control files for the given model in the given folder, and depending on the given scenarios.

    Definition of 'reaction flux control file':
    A reaction flux control file contains a list of reactions and the relative change of the objective
    solution if the protein constraint of the reaction is removed.

    Arguments
    ----------
    * model: cobra.Model ~ The metabolic model for which the reaction flux control file shall be created.
    * output_folder: str ~ The folder in which the reaction flux control files shall be created
    * project_name: str ~ The name of the current project
    * scenarios ~ The scenarios for which the reaction control shall be calculated; these scenarios are
      applied on the model, one by one

    Output
    ----------
    Reaction flux control files in the given folder (see _reaction_flux_control()'s comment for more)
    """
    # Standardize output folder
    output_folder = standardize_folder(output_folder)

    # Go through each given scenario :D
    for scenario_key in scenarios.keys():
        scenario = scenarios[scenario_key]
        objective = scenario["target"]["reaction"]
        with model:
            model = apply_scenario_on_model(model, scenario)
            _reaction_flux_control(model, output_folder, project_name,
                                   scenario_key, objective)
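
# Hedged usage sketch (not part of the original code, assuming the module's existing imports such as cobra):
# the exact scenario schema is defined by apply_scenario_on_model(); only the "target"->"reaction" key is
# visible in this function, so the dictionary below and all file paths are hypothetical illustrations.
example_model = cobra.io.read_sbml_model("my_protein_constrained_model.xml")  # hypothetical path
example_scenarios = {
    "example_scenario": {
        "target": {"reaction": "BIOMASS_reaction"},  # hypothetical objective reaction ID
    },
}
reaction_flux_control_by_scenario(example_model, "./flux_control_output/",
                                  "my_project", example_scenarios)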
Example #2
0
def parse_bigg_metabolites_file(bigg_metabolites_file_path: str, json_output_folder: str) -> None:
    """Parses a BIGG metabolites text file and returns a dictionary for this file.

    As of 29/04/2019, a BIGG metabolites list of all BIGG-included metabolites
    is retrievable under http://bigg.ucsd.edu/data_access

    Arguments
    ----------
    * bigg_metabolites_file_path: str ~ The file path to the BIGG metabolites file.
      The usual file name (which has to be included too in this argument) is
      bigg_models_metabolites.txt
    * json_output_folder: str ~ The folder in which the JSON including the parsed BIGG
      metabolites file data is stored with the name 'bigg_id_name_mapping.json'

    Output
    ----------
    * A JSON file with the name 'bigg_id_name_mapping.json' in the given output folder,
      with the following structure:
    <pre>
     {
         "$BIGG_ID": "$CHEMICAL_OR_USUAL_NAME",
         (...),
         "$BIGG_ID": "$BIGG_ID",
         (...),
     }
    </pre>
    The BIGG ID <-> BIGG ID mapping is done for models which already use the BIGG IDs.
    """
    # Standardize output folder
    json_output_folder = standardize_folder(json_output_folder)

    # Open the BIGG metabolites file as string list, and remove all newlines
    with open(bigg_metabolites_file_path, "r") as f:
        lines = f.readlines()
    lines = [x.replace("\n", "") for x in lines if len(x) > 0]

    # Mapping variable which will store the name<->BIGG ID (and BIGG ID<->BIGG ID) mapping
    bigg_id_name_mapping = {}
    # Go through each BIGG metabolites file line (which is a tab-separated file)
    # and retrieve the BIGG ID and the name (if there is a name for the given BIGG
    # ID)
    for line in lines:
        bigg_id = line.split("\t")[1]
        # Exception to check if there is no name :O
        try:
            name = line.split("\t")[2].lower()
        except Exception:
            continue

        bigg_id_name_mapping[name] = bigg_id
        bigg_id_name_mapping[bigg_id] = bigg_id

    # Write the JSON in the given folder :D
    json_write(json_output_folder+"bigg_id_name_mapping.json", bigg_id_name_mapping)
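
# Hedged usage sketch (file paths are hypothetical): parse the downloaded BIGG metabolites
# list and write 'bigg_id_name_mapping.json' into the chosen output folder.
parse_bigg_metabolites_file("./downloads/bigg_models_metabolites.txt", "./database/")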
Example #3
0
def get_initial_spreadsheets(model: cobra.Model, project_folder: str, project_name: str) -> None:
    """Creates a number of initially needed XLSX spreadsheets in the given folder.

    Output
    ----------
    The following spreadsheets are going to be created (all file names start with project_name+"_"):
    * reactions.xlsx ~ A list of all KEGG IDs for each reaction of the model, the user can then select
      one of the given KEGG IDs. It is checked (using the KEGG REST API) if a reaction does not occur
      in any KEGG pathway, and such reactions are automatically marked as not included.
    * metabolites.xlsx ~ A list of all KEGG IDs for each metabolite of the model, the user can then select
      one of the given KEGG IDs. It is checked (using the KEGG REST API) if a metabolite does not occur
      in any KEGG reaction, and such metabolites are automatically marked as not included.
    * compartments.xlsx ~ A list of the model's compartments, which allows the user to set
      a pH and ionic strength value for each of them. This data is used for the calculation of
      thermodynamic data.
    * metabolite_concentrations.xlsx ~ Allows the user to set default maximal and minimal metabolite
      concentrations (used in ThermoFBA or OptMDFPathway in CellNetAnalyzer) for each of the
      model's metabolites.
    * protein_data.xlsx ~ Allows the user to set the total protein pool as well as single protein concentrations
      of the model's proteins.
    * enzyme_stoichiometries.xlsx ~ Allows the user to set the internal stoichiometries for each enzyme of a given reaction.

    Arguments
    ----------
    * model: cobra.Model ~ The cobrapy model for which the initial spreadsheets will be created.
    * project_folder: str ~ In this folder, all XLSX files will be stored.
    * project_name: str ~ This project name is added before each XLSX file name.
    """
    # Standardize project folder
    project_folder = standardize_folder(project_folder)

    # Get basepath for all spreadsheets
    basepath = project_folder + project_name

    # GET REACTION KEGG IDS AND AMBIGUITIES
    reaction_id_kegg_id_mapping: Dict[str, List[str]] = {}
    reaction_id_reaction_name_mapping: Dict[str, str] = {}
    reaction_id_eligible_ids_mapping: Dict[str, str] = {}
    for reaction in model.reactions:
        if "kegg.reaction" not in reaction.annotation.keys():
            print(f"INFO: Reaction {reaction.id} does not have a KEGG ID annotation")
            continue
        kegg_ids = reaction.annotation["kegg.reaction"]
        if type(kegg_ids) is str:  # Single ID :O
            kegg_ids = [kegg_ids]
        else:  # Multiple IDs :O
            entries = kegg_rest_get_batch(kegg_ids, batch_size=len(kegg_ids))
            i = 0
            eligible_ids = []
            for entry in entries:
                entry_as_str = "".join(entry)
                if "PATHWAY" in entry_as_str:
                    eligible_ids.append(kegg_ids[i])
                i += 1
            if len(eligible_ids) == 1:
                reaction_id_eligible_ids_mapping[reaction.id] = eligible_ids[0]

        reaction_id_kegg_id_mapping[reaction.id] = kegg_ids
        reaction_id_reaction_name_mapping[reaction.id] = reaction.name

    # GET METABOLITE KEGG IDS AND AMBIGUITIES
    metabolite_id_kegg_id_mapping: Dict[str, List[str]] = {}
    metabolite_id_metabolite_name_mapping: Dict[str, str] = {}
    metabolite_name_eligible_ids_mapping: Dict[str, str] = {}
    searched_metabolites: List[str] = []
    for metabolite in model.metabolites:
        if "kegg.compound" not in metabolite.annotation.keys():
            print(f"INFO: Metabolite {metabolite.id} does not have a KEGG ID annotation")
            continue
        kegg_ids = metabolite.annotation["kegg.compound"]
        if type(kegg_ids) is str:  # Single ID :O
            kegg_ids = [kegg_ids]
        else:  # Multiple IDs :O
            if metabolite.name not in searched_metabolites:
                entries = kegg_rest_get_batch(kegg_ids, batch_size=len(kegg_ids))
                i = 0
                eligible_ids = []
                for entry in entries:
                    entry_as_str = "".join(entry)
                    if "REACTION" in entry_as_str:
                        eligible_ids.append(kegg_ids[i])
                    i += 1
                if len(eligible_ids) == 1:
                    metabolite_name_eligible_ids_mapping[metabolite.name] = eligible_ids[0]
                searched_metabolites.append(metabolite.name)
        metabolite_id_kegg_id_mapping[metabolite.id] = kegg_ids
        metabolite_id_metabolite_name_mapping[metabolite.id] = metabolite.name

    # Reactions <-> KEGG ID mapping XLSX :D
    workbook = xlsxwriter.Workbook(basepath+"_reactions.xlsx")
    worksheet = workbook.add_worksheet("Reaction IDs")

    yellow = workbook.add_format()
    yellow.set_bg_color("#FFFF00")
    blue = workbook.add_format()
    blue.set_bg_color("#FF00FF")

    row = 0
    for key in reaction_id_reaction_name_mapping.keys():
        worksheet.write(row, 0, key)
        worksheet.write(row, 1, reaction_id_reaction_name_mapping[key])
        column = 2
        kegg_ids = reaction_id_kegg_id_mapping[key]
        for kegg_id in kegg_ids:
            if len(kegg_ids) == 1:
                worksheet.write_url(row, column, "https://www.kegg.jp/dbget-bin/www_bget?rn:"+kegg_id)
                worksheet.write(row, column+1, kegg_id)
                worksheet.write(row, column+2, "Yes")
            else:
                if key in reaction_id_eligible_ids_mapping.keys():
                    if kegg_id == reaction_id_eligible_ids_mapping[key]:
                        worksheet.write_url(row, column, "https://www.kegg.jp/dbget-bin/www_bget?rn:"+kegg_id, blue)
                        worksheet.write(row, column+1, kegg_id, blue)
                        worksheet.write(row, column+2, "Yes")
                    else:
                        worksheet.write_url(row, column, "https://www.kegg.jp/dbget-bin/www_bget?rn:"+kegg_id, blue)
                        worksheet.write(row, column+1, kegg_id, blue)
                else:
                    worksheet.write_url(row, column, "https://www.kegg.jp/dbget-bin/www_bget?rn:"+kegg_id, yellow)
                    worksheet.write(row, column+1, kegg_id, yellow)
            column += 3
        row += 1
    workbook.close()

    # Metabolites<->KEGG ID mapping XLSX :D
    workbook = xlsxwriter.Workbook(basepath+"_metabolites.xlsx")
    worksheet = workbook.add_worksheet("Metabolite IDs")

    yellow = workbook.add_format()
    yellow.set_bg_color("#FFFF00")
    blue = workbook.add_format()
    blue.set_bg_color("#FF00FF")

    row = 0
    for key in metabolite_id_metabolite_name_mapping.keys():
        metabolite_name = metabolite_id_metabolite_name_mapping[key]
        worksheet.write(row, 0, key)
        worksheet.write(row, 1, metabolite_name)
        column = 2
        kegg_ids = metabolite_id_kegg_id_mapping[key]
        for kegg_id in kegg_ids:
            if len(kegg_ids) == 1:
                worksheet.write_url(row, column, "https://www.kegg.jp/dbget-bin/www_bget?cpd:"+kegg_id)
                worksheet.write(row, column+1, kegg_id)
                worksheet.write(row, column+2, "Yes")
            else:
                if metabolite_name in metabolite_name_eligible_ids_mapping.keys():
                    if kegg_id == metabolite_name_eligible_ids_mapping[metabolite_name]:
                        worksheet.write_url(row, column, "https://www.kegg.jp/dbget-bin/www_bget?cpd:"+kegg_id, blue)
                        worksheet.write(row, column+1, kegg_id, blue)
                        worksheet.write(row, column+2, "Yes")
                    else:
                        worksheet.write_url(row, column, "https://www.kegg.jp/dbget-bin/www_bget?cpd:"+kegg_id, blue)
                        worksheet.write(row, column+1, kegg_id, blue)
                else:
                    worksheet.write_url(row, column, "https://www.kegg.jp/dbget-bin/www_bget?cpd:"+kegg_id, yellow)
                    worksheet.write(row, column+1, kegg_id, yellow)
            column += 3
        row += 1
    workbook.close()

    # Compartment data XLSX :D
    workbook = xlsxwriter.Workbook(basepath+"_compartments.xlsx")
    worksheet = workbook.add_worksheet("Compartment pH")

    row = 0
    for compartment_key in model.compartments.keys():
        compartment_name = model.compartments[compartment_key]
        worksheet.write(row, 0, compartment_key)
        worksheet.write(row, 1, compartment_name)
        worksheet.write(row, 2, "NA")
        worksheet.write(row, 3, "NA")
        row += 1
    workbook.close()

    # Protein data XLSX :D
    print("NOTE: "+project_name+"_protein_data.xlsx has as default value for the enzyme pool P 0.095 mmol/gDW.")
    print("Please adjust the value accordingly for your model!")
    workbook = xlsxwriter.Workbook(basepath+"_protein_data.xlsx")
    worksheet = workbook.add_worksheet("Total protein data")
    worksheet.write(0, 0, "Total protein content [g/gDW]:")
    worksheet.write(0, 1, .095)
    worksheet.write(1, 0, "Fraction of masses of model-included enzymes in comparison to all enzymes (0.0 to 1.0):")
    worksheet.write(1, 1, 1.0)
    worksheet.write(2, 0, "Average saturation level (0.0 to 1.0):")
    worksheet.write(2, 1, 1.0)
    worksheet2 = workbook.add_worksheet("Single protein data")
    worksheet2.write(0, 0, "Protein ID (as in SBML model)")
    worksheet2.write(0, 1, "Protein concentration [mmol/gDW]")
    workbook.close()

    # Metabolite concentrations XLSX :D
    workbook = xlsxwriter.Workbook(basepath+"_metabolite_concentrations.xlsx")
    worksheet = workbook.add_worksheet("Default data")
    worksheet.write(0, 0, "Default minimal metabolite concentration [M]:")
    worksheet.write(1, 0, "Default maximal metabolite concentration [M]:")
    worksheet2 = workbook.add_worksheet("Single metabolite data")
    worksheet2.write(0, 0, "Metabolite ID (as in SBML model)")
    worksheet2.write(0, 1, "Minimal metabolite concentration [M]")
    worksheet2.write(0, 2, "Maximal metabolite concentration [M]")
    workbook.close()

    # Enzyme stoichiometry XLSX :D
    # Get gene rule <-> Reaction ID mapping
    reaction_id_gene_rules_mapping = {}
    for reaction in model.reactions:
        listed_gene_rules = _gene_rule_as_list(reaction.gene_reaction_rule)
        if listed_gene_rules != ['']:
            reaction_id_gene_rules_mapping[reaction.id] = listed_gene_rules

    # Write XLSX
    workbook = xlsxwriter.Workbook(basepath+"_enzyme_stoichiometries.xlsx")
    # Gene stoichiometry worksheets
    worksheet_stoichiometry = workbook.add_worksheet("Stoichiometries of complexes")
    line = 0
    for reaction_id in reaction_id_gene_rules_mapping.keys():
        gene_rule = reaction_id_gene_rules_mapping[reaction_id]
        if gene_rule == [""]:
            continue
        worksheet_stoichiometry.write(line, 0, reaction_id)
        row = 1
        for or_part in gene_rule:
            worksheet_stoichiometry.write(line, row, str(or_part))
            if type(or_part) is str:
                default_stoichiometry = "1"
            else:
                default_stoichiometry = ";".join(["1" for _ in range(len(or_part))])
            worksheet_stoichiometry.write(line, row+1, default_stoichiometry)
            row += 2
        line += 1

    workbook.close()
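
# Hedged usage sketch (paths and project name are hypothetical): this creates
# <project_folder>/<project_name>_reactions.xlsx, _metabolites.xlsx, _compartments.xlsx,
# _protein_data.xlsx, _metabolite_concentrations.xlsx and _enzyme_stoichiometries.xlsx.
example_model = cobra.io.read_sbml_model("my_model.xml")  # hypothetical SBML path
get_initial_spreadsheets(example_model, "./my_project_folder/", "my_project")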
Example #4
0
def create_gecko_model_reaction_wise(model: cobra.Model, output_sbml_name: str,
                                     project_folder: str, project_name: str, excluded_reactions: List[str]) -> cobra.Model:
    """Creates a GECKO model as described in

    <i>
    Sánchez, B. J., Zhang, C., Nilsson, A., Lahtvee, P. J., Kerkhoven, E. J., & Nielsen, J. (2017).
    Improving the phenotype predictions of a yeast genome‐scale metabolic model by incorporating enzymatic
    constraints. Molecular systems biology, 13(8).
    </i>

    Arguments
    ----------
    * model: cobra.Model ~ A cobra Model representation of the metabolic network. This model will
      be changed using cobrapy functions in order to add the proteomic constraints.
    * output_sbml_name: str ~ The base name of the created SBML.
    * project_folder: str ~ The folder in which the spreadsheets and JSONs with the model's supplemental
      data can be found.
    * project_name: str ~ The sMOMENTed model creation's name, which will be added at the beginning
      of the created SBML's name.
    * excluded_reactions: List[str] ~ A string list of reaction IDs (the 'reverse' and 'forward'
      name additions must not be added, i.e. for 'ACALD_forward' just 'ACALD' has to be given) to
      which no kcat shall be added. Typically used for gas exchange reactions such as 'CO2tex'.
    """
    # Standardize project folder
    project_folder = standardize_folder(project_folder)

    # This base path is the location where the generated files will be stored
    basepath: str = project_folder + project_name

    # LOAD PROTEIN ID<->MASS MAPPING JSON
    protein_id_mass_mapping: Dict[str, float] = json_load(basepath+"_protein_id_mass_mapping.json")

    # LOAD XLSX WITH PROTEIN DATA
    # Load protein data XLSX
    protein_id_concentration_mapping, p_total, unmeasured_protein_fraction, mean_saturation = \
        read_protein_data_xlsx(basepath)
    # Read enzyme kinetics xlsx
    reaction_id_gene_rules_mapping, reaction_id_gene_rules_protein_stoichiometry_mapping = \
        read_enzyme_stoichiometries_xlsx(basepath)

    # Read reaction <-> kcat mapping :D
    reactions_kcat_mapping_database = json_load(basepath + "_reactions_kcat_mapping_combined.json")
    all_kcats = [x["forward"] for x in reactions_kcat_mapping_database.values()] + \
                [x["reverse"] for x in reactions_kcat_mapping_database.values()]
    all_kcats = [x for x in all_kcats if not math.isnan(x)]
    default_kcat = statistics.median(all_kcats)
    print(f"Default kcat is: {default_kcat}")

    # GECKO :D #
    # This ID addition will be added to all reactions which are modified by this method
    id_addition = "_TG_"
    # Calculate p_measured
    p_measured = get_p_measured(protein_id_concentration_mapping, protein_id_mass_mapping)
    # Make model irreversible
    model = get_irreversible_model(model, id_addition)
    # Add prot_pool reaction
    model, prot_pool_metabolite = add_prot_pool_reaction(model, id_addition, p_total, p_measured,
                                                         unmeasured_protein_fraction, mean_saturation)

    # Add enzyme source reaction for every unmeasured protein
    for protein_id in list(protein_id_mass_mapping.keys()):
        if protein_id in list(protein_id_concentration_mapping.keys()):  # Measured
            eu = cobra.Reaction(id=id_addition+"EU_"+protein_id,
                                name=f"Enzyme usage reaction of measured protein {protein_id}",
                                subsystem="AutoPACMEN")
            enzyme = cobra.Metabolite(id=protein_id+"_met",
                                      name=f"Protein {protein_id}",
                                      compartment="AutoPACMEN")

            eu.add_metabolites({enzyme: 1.0})
            eu.lower_bound = 0
            eu.upper_bound = protein_id_concentration_mapping[protein_id]
            model.add_reactions([eu])
        else:  # Unmeasured
            er = cobra.Reaction(id=id_addition+"ER_"+protein_id,
                                name=f"Enzyme usage reaction of unmeasured protein {protein_id}",
                                subsystem="AutoPACMEN")
            enzyme = cobra.Metabolite(id=protein_id+"_met",
                                      name=f"Protein {protein_id}",
                                      compartment="AutoPACMEN")
            molecular_weight = protein_id_mass_mapping[protein_id] / 1000  # Mapping is in Da, GECKO uses kDa (g/mmol)
            er.add_metabolites({prot_pool_metabolite: -molecular_weight,
                                enzyme: 1})
            er.lower_bound = 0
            er.upper_bound = 1000.0
            model.add_reactions([er])

    # Add enzymes to reactions
    current_arm_reaction = 1
    model_reaction_ids = [x.id for x in model.reactions]
    for model_reaction_id in model_reaction_ids:
        reaction = model.reactions.get_by_id(model_reaction_id)
        splitted_id = reaction.id.split(id_addition)

        # If there is nothing before the ID addition (i.e., the reaction was added by this method), ignore it
        if splitted_id[0] == "":
            continue
        # Take the reaction ID from the first part of the split
        reaction_id = splitted_id[0]
        # If the reaction has no associated enzyme stoichiometries, ignore it
        if reaction_id not in list(reaction_id_gene_rules_mapping.keys()):
            continue
        # If the reaction has no gene rule, ignore it
        gene_rule = reaction_id_gene_rules_mapping[reaction_id]
        if gene_rule == [""]:
            continue
        # If the reaction is manually excluded, ignore it
        if reaction_id in excluded_reactions:
            continue

        all_available = True
        for enzyme in gene_rule:
            if type(enzyme) == str:
                try:
                    model.metabolites.get_by_id(enzyme+"_met")
                except Exception:
                    all_available = False
                    break
            else:
                for enzyme_id in enzyme:
                    try:
                        model.metabolites.get_by_id(enzyme_id+"_met")
                    except Exception:
                        all_available = False
                        break
        if not all_available:
            continue

        # Retrieve the reaction's forward and reverse kcats from the given reaction<->kcat database
        if reaction_id in reactions_kcat_mapping_database.keys():
            forward_kcat = reactions_kcat_mapping_database[reaction_id]["forward"]
            reverse_kcat = reactions_kcat_mapping_database[reaction_id]["reverse"]
        # If the reaction is not in the database, set the default kcat
        else:
            forward_kcat = default_kcat
            reverse_kcat = default_kcat

        # If the given reaction<->kcat database contains math.nan as the reaction's kcat,
        # set the default kcat as math.nan means that no kcat could be found.
        if math.isnan(forward_kcat):
            forward_kcat = default_kcat
        if math.isnan(reverse_kcat):
            reverse_kcat = default_kcat

        # Add the given forward or reverse kcat if the reaction was
        # split due to its reversibility.
        # If the reaction is not split, add the forward kcat (this
        # is the only possible direction for non-split, i.e. non-reversible,
        # reactions)
        if model_reaction_id.endswith(id_addition + "forward"):
            reaction_kcat = forward_kcat
        elif model_reaction_id.endswith(id_addition + "reverse"):
            reaction_kcat = reverse_kcat
        else:
            reaction_kcat = forward_kcat

        # Add arm reaction if isozymes occur
        if len(gene_rule) > 1:  # Isozymes occur :O
            arm_reaction_id = id_addition+f"arm_reaction_{current_arm_reaction}"
            arm_reaction_name = f"Arm reaction no. {current_arm_reaction} for gene rule {str(gene_rule)}"
            arm_reaction = cobra.Reaction(id=arm_reaction_id,
                                          name=arm_reaction_name,
                                          subsystem="AutoPACMEN")

            arm_reaction_metabolites = {}
            for metabolite in list(reaction.metabolites.keys()):
                stoichiometry = reaction.metabolites[metabolite]
                if stoichiometry < 0:  # Educt
                    arm_reaction_metabolites[metabolite] = stoichiometry
                    reaction.add_metabolites({metabolite: -stoichiometry})

            im_id = f"im_{current_arm_reaction}"
            im_name = f"Intermediate metabolite of arm reaction {current_arm_reaction}"
            intermediate_metabolite = cobra.Metabolite(id=im_id,
                                                       name=im_name,
                                                       compartment="AutoPACMEN")
            arm_reaction_metabolites[intermediate_metabolite] = 1
            arm_reaction.add_metabolites(arm_reaction_metabolites)
            reaction.add_metabolites({intermediate_metabolite: -1})

            arm_reaction.lower_bound = 0
            arm_reaction.upper_bound = reaction.upper_bound

            model.add_reactions([arm_reaction])

            current_arm_reaction += 1

        # Add reactions depending on isozyme complex presence
        new_reactions = []
        i = 1
        for isozyme_id in gene_rule:
            new_reaction = copy.deepcopy(reaction)
            new_reaction.id = new_reaction.id + id_addition + str(i)
            protein_ids = []
            if type(isozyme_id) is str:  # No complex :O
                protein = model.metabolites.get_by_id(isozyme_id+"_met")
                reaction_id = reaction_id.split("_TG_")[0]

                stoichiometry = reaction_id_gene_rules_protein_stoichiometry_mapping[reaction_id][isozyme_id][isozyme_id]
                stoichiometry /= (reaction_kcat * 3600)
                stoichiometry *= -1

                metabolites = {}
                metabolites[protein] = stoichiometry
                protein_ids.append(isozyme_id)
                new_reaction.add_metabolites(metabolites)
            else:  # Complex :O
                metabolites = {}
                isozyme_id = tuple(isozyme_id)

                for single_id in isozyme_id:
                    protein = model.metabolites.get_by_id(single_id+"_met")
                    reaction_id = reaction_id.split("_TG_")[0]

                    stoichiometry = reaction_id_gene_rules_protein_stoichiometry_mapping[reaction_id][isozyme_id][single_id]
                    stoichiometry /= (reaction_kcat * 3600)
                    stoichiometry *= -1

                    metabolites[protein] = stoichiometry
                    protein_ids.append(single_id)
                new_reaction.add_metabolites(metabolites)

            gene_reaction_rule = " and ".join(protein_ids)
            new_reaction.gene_reaction_rule = gene_reaction_rule
            new_reactions.append(new_reaction)
            i += 1
        model.add_reactions(new_reactions)
        model.remove_reactions([reaction])

    cobra.io.write_sbml_model(model, project_folder+output_sbml_name)

    return model
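
# Hedged usage sketch (all names are hypothetical); the project folder is assumed to already
# contain the files produced by the earlier steps (_protein_id_mass_mapping.json,
# _reactions_kcat_mapping_combined.json, _protein_data.xlsx and _enzyme_stoichiometries.xlsx).
example_model = cobra.io.read_sbml_model("my_model.xml")  # hypothetical path
gecko_model = create_gecko_model_reaction_wise(example_model, "my_model_GECKO.xml",
                                               "./my_project_folder/", "my_project",
                                               excluded_reactions=["CO2tex"])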
Example #5
0
def get_reactions_kcat_mapping(sbml_path: str, project_folder: str,
                               project_name: str, organism: str,
                               kcat_database_path: str,
                               protein_kcat_database_path: str,
                               type_of_kcat_selection: str) -> None:
    """Returns a reaction<->kcat mapping for the given model :D

    The selection of kcats depends on the affected metabolites of the reaction direction (one
    kcat is given for each of the forward and reverse directions), and on the organism (kcats
    from the taxonomically nearest organism are preferred).

    Arguments
    ----------
    * sbml_path: str ~ The SBML path to the model
    * project_folder: str ~ The folder in which the model data files are stored
    * project_name: str ~ The name of the used project
    * organism: str ~ The organism's name
    * kcat_database_path: str ~ A path to an already created EC number<->kcats database
    * protein_kcat_database_path: str ~ A path to the custom protein<->kcat database
    * type_of_kcat_selection: str ~ Can be "mean", "median" or "random". Refers to the selection of found kcats of a reaction.
                                    Is "mean" by default.

    Output
    ----------
    A JSON in the given project folder with the name $project_name+'_reactions_kcat_mapping_combined.json' and
    the following structure:
    <pre>
    {
        "$REACTION_NAME": {
            "forward": $forward_kcat,
            "reverse": $reverse_kcat
        },
        (...)
    }
    </pre>
    """
    # Standardize project folder
    project_folder = standardize_folder(project_folder)
    # Set the path for the output JSON
    basepath = project_folder + project_name
    # Load the combined, EC-number-dependent kcat database :D
    kcat_database = json_load(kcat_database_path)
    # If given, load the protein-dependent kcat database :D
    if protein_kcat_database_path != "none":
        protein_kcat_database = json_load(protein_kcat_database_path)
    else:
        protein_kcat_database = {}

    # Load the given stoichiometric model
    model = cobra.io.read_sbml_model(sbml_path)

    # Set-up dictionary which will be the content of the output JSON
    reactions_kcat_mapping: Dict[str, Dict[str, float]] = {}
    # Go through each reaction in order to assign kcats for it :D
    for reaction in model.reactions:
        # If no EC number is given in the reaction's annotations,
        # the protein-dependent database is read out in order to
        # find a kcat. This only works if at least one of the assigned
        # enzymes of the reaction's gene rule has a kcat in the
        # protein-dependent database.
        if "ec-code" not in reaction.annotation.keys():
            # 0 means that no kcat can be assigned
            forward_kcat: Any = 0
            reverse_kcat: Any = 0

            if protein_kcat_database != {}:
                # Retrieve the kcats from the protein-dependent database :D
                forward_kcat = _get_kcat_from_protein_kcat_database(
                    "forward", reaction, protein_kcat_database)
                reverse_kcat = _get_kcat_from_protein_kcat_database(
                    "reverse", reaction, protein_kcat_database)

            # If no kcat could be assigned, set the kcat to math.nan
            # which indicates this case
            if forward_kcat == 0.0:
                forward_kcat = math.nan
            if reverse_kcat == 0.0:
                reverse_kcat = math.nan

            # Add the retrieved forward and reverse kcats to the reaction<->kcat mapping dictionary :D
            reactions_kcat_mapping[reaction.id] = {}
            reactions_kcat_mapping[reaction.id]["forward"] = forward_kcat
            reactions_kcat_mapping[reaction.id]["reverse"] = reverse_kcat

            # Print the assigned kcats
            _print_assigned_kcats(reaction.id, forward_kcat, reverse_kcat)
            continue

        # Retrieve the reaction's associated EC numbers
        reaction_ids = reaction.annotation["ec-code"]
        # If only one EC number is given, set the EC number string to
        # a list in order to make it work with the following code lines
        if type(reaction_ids) is str:
            reaction_ids = [reaction_ids]
        # Get all EC numbers which do not contain a "-" wildcard, such as
        # in 2.1.1.-
        # These wildcarded EC numbers are generally too unspecific to
        # yield useful kcats
        eligible_reaction_ids = [x for x in reaction_ids if "-" not in x]
        if len(eligible_reaction_ids) == 0:
            eligible_reaction_ids = [x for x in reaction_ids]

        # Create a 'complete entry' from all eligible (i.e., non-wildcarded)
        # EC numbers. This complete entry contains - for every organism
        # and substrate given in the EC number kcat entries - all kcats
        # of all eligible EC numbers. In addition, the pseudo-substrate
        # "ALL" is added which contains all organisms. "ALL" is used
        # later if no fitting substrate can be found.
        complete_entry: Dict[str, Any] = {}
        complete_entry["ALL"] = {}
        # Go through each reaction ID :D
        for reaction_id in eligible_reaction_ids:
            # If the EC number could not be found in the given EC number<->kcat
            # database, print it and proceed with the next eligible EC number
            if reaction_id not in kcat_database.keys():
                print(f"INFO: No entry for EC number {reaction_id}")
                print("")
                continue
            # Otherwise, get the reaction ID entry from the given database :D
            reaction_id_entry = kcat_database[reaction_id]
            # Exclude all kcat entries which come from a wildcard search
            # with *
            if reaction_id_entry["WILDCARD"]:
                continue
            # Go through each metabolite in the EC number<->kcat database entries
            for metabolite_key in reaction_id_entry.keys():
                # Ignore the keys which show additional information
                # about the nature of the kcat data
                if metabolite_key in ("WILDCARD", "SOURCE", "TRANSFER"):
                    continue
                # Add the metabolite to the complete entry if it does not already occur
                if metabolite_key not in complete_entry:
                    complete_entry[metabolite_key] = {}
                # Go through each species in the currently analyzed EC number
                for species_key in reaction_id_entry[metabolite_key]:
                    # Add the species to the metabolite entry if it does not already occur
                    if species_key not in complete_entry[metabolite_key]:
                        complete_entry[metabolite_key][species_key] = []
                    # ...and do the same for the pseudo-metabolite "ALL"
                    if species_key not in complete_entry["ALL"].keys():
                        complete_entry["ALL"][species_key] = []
                    # Add the list of kcats of the currently analyzed EC number to the current species
                    # and the current metabolite, and for "ALL"
                    complete_entry[metabolite_key][
                        species_key] += reaction_id_entry[metabolite_key][
                            species_key]
                    complete_entry["ALL"][species_key] += reaction_id_entry[
                        metabolite_key][species_key]

        # If no entries with kcats could be found for any of the eligible EC numbers, continue with the next reaction.
        if complete_entry["ALL"] == {}:
            continue

        # Get the BIGG IDs of the educts and products using the SBML's BIGG ID annotation
        educt_bigg_ids: List[str] = []
        for reactant in reaction.reactants:
            if "bigg.metabolite" in reactant.annotation.keys():
                educt_bigg_ids.append(reactant.annotation["bigg.metabolite"])
        product_bigg_ids: List[str] = []
        for product in reaction.products:
            if "bigg.metabolite" in product.annotation.keys():
                product_bigg_ids.append(product.annotation["bigg.metabolite"])
        # If no BIGG IDs could be found in the SBML, add the pseudo-metabolite "X"
        # which indicates that "ALL" should be used later.
        if len(educt_bigg_ids) == 0:
            educt_bigg_ids = ["X"]
        if len(product_bigg_ids) == 0:
            product_bigg_ids = ["X"]

        # Get the metabolites which are used in the subsequent forward kcat search
        searched_educts = _get_searched_metabolites(complete_entry,
                                                    educt_bigg_ids)
        # Get the forward kcat depending on the educts and the organism
        forward_kcat = _get_kcat(searched_educts, complete_entry, organism,
                                 "forward", reaction, protein_kcat_database,
                                 type_of_kcat_selection)

        # Get the metabolites which are used in the subsequent reverse kcat search
        searched_products = _get_searched_metabolites(complete_entry,
                                                      product_bigg_ids)
        # Get the reverse kcat depending on the products and the organism
        reverse_kcat = _get_kcat(searched_products, complete_entry, organism,
                                 "reverse", reaction, protein_kcat_database,
                                 type_of_kcat_selection)

        # Set the found kcats in the reactions<->kcat mapping :D
        reactions_kcat_mapping[reaction.id] = {}
        reactions_kcat_mapping[reaction.id]["forward"] = forward_kcat
        reactions_kcat_mapping[reaction.id]["reverse"] = reverse_kcat

        # Display the found kcats for this reaction \o/
        _print_assigned_kcats(reaction.id, forward_kcat, reverse_kcat)

    # Export the kcat mapping results as JSON :D
    json_write(basepath + "_reactions_kcat_mapping_combined.json",
               reactions_kcat_mapping)
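
# Hedged usage sketch (paths and organism are hypothetical); "none" is passed for the
# protein kcat database path if no custom protein<->kcat data is available, as handled above.
get_reactions_kcat_mapping("my_model.xml", "./my_project_folder/", "my_project",
                           "Escherichia coli", "./database/kcat_database_combined.json",
                           "none", "mean")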
Example #6
0
def get_protein_mass_mapping(model: cobra.Model, project_folder: str,
                             project_name: str) -> None:
    """Returns a JSON with a mapping of protein IDs as keys, and as values the protein mass in kDa.

    The protein masses are calculated using the amino acid sequence from UniProt (retrieved using
    UniProt's REST API).

    Arguments
    ----------
    * model: cobra.Model ~ The model in the cobrapy format
    * project_folder: str ~ The folder in which the JSON shall be created
    * project_name: str ~ The beginning of the JSON's file name

    Output
    ----------
    A JSON file with the path project_folder+project_name+'_protein_id_mass_mapping.json'
    and the following structure:
    <pre>
    {
        "$PROTEIN_ID": $PROTEIN_MASS_IN_KDA,
        (...),
    }
    </pre>
    """
    # Standardize project folder
    project_folder = standardize_folder(project_folder)

    # The beginning of the created JSON's path :D
    basepath: str = project_folder + project_name

    # GET UNIPROT ID - PROTEIN MAPPING
    uniprot_id_protein_id_mapping: Dict[str, List[str]] = {}
    for gene in model.genes:
        # Without a UniProt ID, no mass mapping can be found
        if "uniprot" not in gene.annotation:
            continue
        uniprot_id = gene.annotation["uniprot"]
        if uniprot_id in uniprot_id_protein_id_mapping.keys():
            uniprot_id_protein_id_mapping[uniprot_id].append(gene.id)
        else:
            uniprot_id_protein_id_mapping[uniprot_id] = [gene.id]

    # GET UNIPROT ID<->PROTEIN MASS MAPPING
    uniprot_id_protein_mass_mapping: Dict[str, float] = {}
    # The cache stores UniProt masses for already searched
    # UniProt IDs (each file in the cache folder has the name
    # of the corresponding UniProt ID). This prevents searching
    # UniProt for already found protein masses. :-)
    cache_basepath = "./_cache/uniprot/"
    ensure_folder_existence("./_cache/")
    ensure_folder_existence(cache_basepath)
    cache_files = get_files(cache_basepath)
    # Go through each batch of UniProt IDs (multiple UniProt IDs
    # are searched at once in order to reduce the number of UniProt API calls)
    # and retrieve the amino acid sequences and, using these sequences, their
    # masses.
    uniprot_ids = list(uniprot_id_protein_id_mapping.keys())
    batch_size = 5
    batch_start = 0
    while batch_start < len(uniprot_ids):
        # Create the batch with all UniProt IDs
        prebatch = uniprot_ids[batch_start:batch_start + batch_size]
        batch = []
        # Remove all IDs which are present in the cache (i.e.,
        # which were searched for already).
        # The cache consists of pickled protein mass floats, each
        # one in a file with the name of the associated protein.
        for uniprot_id in prebatch:
            if uniprot_id not in cache_files:
                batch.append(uniprot_id)
            else:
                cache_filepath = cache_basepath + uniprot_id
                uniprot_id_protein_mass_mapping[uniprot_id] = pickle_load(
                    cache_filepath)
                print(uniprot_id + ":",
                      uniprot_id_protein_mass_mapping[uniprot_id])

        # If all IDs could be found in the cache, continue with the next batch.
        if len(batch) == 0:
            batch_start += batch_size
            continue

        # Create the UniProt query for the batch
        # With 'OR', all given IDs are searched, and subsequently in this script,
        # the right associated masses are being picked.
        query = " OR ".join(batch)
        uniprot_query_url = f"https://www.uniprot.org/uniprot/?query={query}&format=tab&columns=id,sequence"

        # Call UniProt's API :-)
        uniprot_data = requests.get(uniprot_query_url).text.split("\n")
        # Wait in order to cool down their server :-)
        time.sleep(2.0)

        # Read out the API-returned lines
        for line in uniprot_data[1:]:
            if line == "":
                continue
            uniprot_id = line.split("\t")[0]
            sequence = line.split("\t")[1]
            # Get the protein mass using biopython's associated function for amino acid sequences
            mass = ProteinAnalysis(sequence,
                                   monoisotopic=False).molecular_weight()
            uniprot_id_protein_mass_mapping[uniprot_id] = float(mass)

        # Create the pickled cache files for the searched protein masses
        for uniprot_id in batch:
            cache_filepath = cache_basepath + uniprot_id
            pickle_write(cache_filepath,
                         uniprot_id_protein_mass_mapping[uniprot_id])

        # Continue with the next batch :D
        batch_start += batch_size

    # Create the final protein ID <-> mass mapping
    protein_id_mass_mapping: Dict[str, float] = {}
    for uniprot_id in list(uniprot_id_protein_mass_mapping.keys()):
        try:
            protein_ids = uniprot_id_protein_id_mapping[uniprot_id]
        except Exception:
            print(f"No mass found for {uniprot_id}!")
            continue
        for protein_id in protein_ids:
            protein_id_mass_mapping[
                protein_id] = uniprot_id_protein_mass_mapping[uniprot_id]

    # Write protein mass list JSON :D
    json_write(basepath + "_protein_id_mass_mapping.json",
               protein_id_mass_mapping)
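
# Hedged usage sketch (paths are hypothetical); the model's genes need 'uniprot' annotations,
# and retrieved masses are cached as pickle files under ./_cache/uniprot/.
example_model = cobra.io.read_sbml_model("my_model.xml")  # hypothetical path
get_protein_mass_mapping(example_model, "./my_project_folder/", "my_project")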
Example #7
0
def create_smoment_model_reaction_wise(
        model: cobra.Model,
        output_sbml_name: str,
        project_folder: str,
        project_name: str,
        excluded_reactions: List[str],
        type_of_default_kcat_selection: str = "median") -> None:
    """Adds proteomic constraints according to sMOMENT to the given stoichiometric model and stores it as SBML.

    Arguments
    ----------

    * model: cobra.Model ~ A cobra Model representation of the metabolic network. This model will
      be changed using cobrapy functions in order to add the proteomic constraints.
    * output_sbml_name: str ~ The base name of the created SBML.
    * project_folder: str ~ The folder in which the spreadsheets and JSONs with the model's supplemental
      data can be found.
    * project_name: str ~ The sMOMENTed model creation's name, which will be added at the beginning
      of the created SBML's name.
    * excluded_reactions: List[str] ~ A string list of reaction IDs (the 'reverse' and 'forward'
      name additions must not be added, i.e. for 'ACALD_forward' just 'ACALD' has to be given) to
      which no kcat shall be added. Typically used for gas exchange reactions such as 'CO2tex'.
    * type_of_default_kcat_selection: str ~ The type of selection of default kcat values. Can be "mean",
      "median" or "random". Is "median" by default.

    Output
    ----------
    An SBML in the given folder with the given name, which describes the given stoichiometric model
    enhanced by the protein constraints introduced by this function.
    """
    # Standardize project folder
    project_folder = standardize_folder(project_folder)

    # Set folder path for newly created SBML and name for the reaction ID addition (added at the end,
    # and used in order to have a programmatically convenient way to separate additions such as 'reverse'
    # from the 'actual' reaction ID).
    basepath: str = project_folder + project_name
    id_addition: str = "_TG_"

    # LOAD PROTEIN ID<->MASS MAPPING JSON
    protein_id_mass_mapping: Dict[str, float] = json_load(
        basepath + "_protein_id_mass_mapping.json")

    # Load protein data XLSX
    protein_id_concentration_mapping, p_total, unmeasured_protein_fraction, mean_saturation = \
        read_protein_data_xlsx(basepath)

    # Read enzyme stoichiometries xlsx
    reaction_id_gene_rules_mapping, reaction_id_gene_rules_protein_stoichiometry_mapping = \
        read_enzyme_stoichiometries_xlsx(basepath)

    # Calculate p_measured
    p_measured = get_p_measured(protein_id_concentration_mapping,
                                protein_id_mass_mapping)

    # Split reactions with measured enzymes
    model, reaction_id_gene_rules_mapping, reaction_id_gene_rules_protein_stoichiometry_mapping = \
        get_model_with_separated_measured_enzyme_reactions(model,
                                                           protein_id_concentration_mapping,
                                                           reaction_id_gene_rules_mapping,
                                                           reaction_id_gene_rules_protein_stoichiometry_mapping,
                                                           excluded_reactions,
                                                           protein_id_mass_mapping)

    # Make model irreversible, separating all reversible reactions to which a gene rule is given
    # in order to save some reactions.
    model = get_irreversible_model(model, id_addition)

    # Add prot_pool reaction according to the given protein pool values
    model, prot_pool_metabolite = add_prot_pool_reaction(
        model, id_addition, p_total, p_measured, unmeasured_protein_fraction,
        mean_saturation)

    # Read reaction <-> kcat mapping :-)
    reactions_kcat_mapping_database = json_load(
        basepath + "_reactions_kcat_mapping_combined.json")

    # sMOMENT :D
    # Get all kcats which are not math.nan and calculate the median of them, which will be used as default kcat
    all_kcats = [x["forward"] for x in reactions_kcat_mapping_database.values()] + \
                [x["reverse"] for x in reactions_kcat_mapping_database.values()]
    all_kcats = [x for x in all_kcats if not math.isnan(x)]

    if type_of_default_kcat_selection == "median":
        default_kcat = statistics.median(all_kcats)
    elif type_of_default_kcat_selection == "mean":
        default_kcat = statistics.mean(all_kcats)
    elif type_of_default_kcat_selection == "random":
        default_kcat = random.choice(all_kcats)
    else:
        print(
            'ERROR: Argument type_of_default_kcat_selection must be either "median", "mean" or "random".'
        )
        sys.exit(-1)

    print(f"Default kcat is: {default_kcat}")

    # Get all reaction IDs of the given model
    model_reaction_ids = [x.id for x in model.reactions]

    # Add measured enzyme pseudo-metabolites and pseudo-reactions
    for protein_id in protein_id_concentration_mapping.keys():
        new_metabolite = cobra.Metabolite(
            id="ENZYME_" + protein_id,
            name="Pseudo-metabolite of protein " + protein_id,
            compartment="sMOMENT")
        max_protein_concentration = protein_id_concentration_mapping[
            protein_id]
        new_reaction = cobra.Reaction(
            id="ENZYME_DELIVERY_" + protein_id,
            name="Delivery reaction of pseudo-metabolite " + protein_id,
            lower_bound=0,
            upper_bound=max_protein_concentration)
        new_reaction.add_metabolites({new_metabolite: 1})
        model.add_reactions([new_reaction])

    # Main loop :D, add enzyme constraints to reactions \o/
    for model_reaction_id in model_reaction_ids:
        # Get the reaction and split the ID at the ID addition
        reaction = model.reactions.get_by_id(model_reaction_id)
        splitted_id = reaction.id.split(id_addition)

        # If there is nothing before the ID addition (i.e., the reaction was added by this method), ignore it
        if splitted_id[0] == "":
            continue
        # Take the reaction ID from the first part of the split
        reaction_id = splitted_id[0]
        # Remove GPRSPLIT name addition from reactions with measured protein concentrations
        if "_GPRSPLIT_" in reaction_id:
            reaction_id = reaction_id.split("_GPRSPLIT_")[0]

        # If the reaction has no associated enzyme stoichiometries, ignore it
        if reaction_id not in list(reaction_id_gene_rules_mapping.keys()):
            continue
        # If the reaction has no gene rule, ignore it
        gene_rule = reaction_id_gene_rules_mapping[reaction_id]
        if gene_rule == [""]:
            continue
        # If the reaction is manually excluded, ignore it
        if reaction_id in excluded_reactions:
            continue

        # Check if all proteins in the reaction's gene rule have a found mass
        # This is not the case for e.g. spontaneous reactions which often get the pseudo-enzyme 's0001'
        all_available = True
        for enzyme in gene_rule:
            if type(enzyme) == str:
                if enzyme not in list(protein_id_mass_mapping.keys()):
                    print(enzyme)
                    all_available = False
                    break
            else:
                for enzyme_id in enzyme:
                    if enzyme_id not in list(protein_id_mass_mapping.keys()):
                        all_available = False
                        break
        # If not all of the mass-checked enzymes have a found mass, ignore this reaction
        if not all_available:
            continue

        # Retrieve the reaction's forward and reverse kcats from the given reaction<->kcat database
        if reaction_id in reactions_kcat_mapping_database.keys():
            forward_kcat = reactions_kcat_mapping_database[reaction_id][
                "forward"]
            reverse_kcat = reactions_kcat_mapping_database[reaction_id][
                "reverse"]
        # If the reaction is not in the database, set the default kcat
        else:
            forward_kcat = default_kcat
            reverse_kcat = default_kcat

        # If the given reaction<->kcat database contains math.nan as the reaction's kcat,
        # set the default kcat as math.nan means that no kcat could be found.
        if math.isnan(forward_kcat):
            forward_kcat = default_kcat
        if math.isnan(reverse_kcat):
            reverse_kcat = default_kcat

        # Add the given forward or reverse kcat if the reaction was
        # split due to its reversibility.
        # If the reaction is not split, add the forward kcat (this
        # is the only possible direction for non-split, i.e. non-reversible,
        # reactions)
        if model_reaction_id.endswith(id_addition + "forward"):
            reaction_kcat = forward_kcat
        elif model_reaction_id.endswith(id_addition + "reverse"):
            reaction_kcat = reverse_kcat
        else:
            reaction_kcat = forward_kcat

        # Add protein pool pseudo-metabolite depending on isozyme complex presence
        # List of selectable MW/kcat stoichiometries (the one with the lowest protein cost will be chosen)
        stoichiometries: List[float] = []
        # List of enzyme names and stoichiometries (semicolon-separated) for a console report
        stoichiometry_enzyme_name_list: List[str] = []
        for isozyme_id in gene_rule:
            # If it's not a complex :O...
            if type(isozyme_id) is str:
                # ...get the reaction ID without the additions...
                reaction_id = reaction_id.split("_TG_")[0]

                # ...get the number of units for this protein...
                number_units = reaction_id_gene_rules_protein_stoichiometry_mapping[
                    reaction_id][isozyme_id][isozyme_id]
                stoichiometry = number_units
                # ...and determine the protein pool stoichiometry by
                # 1) Multiplying the number of units for this protein with its mass (converted from Da to kDa, i.e. g/mmol, since the reaction
                #    flux is defined for mmol/(gDW*h) and not mol/(gDW*h))
                stoichiometry *= (protein_id_mass_mapping[isozyme_id] / 1000)
                # 2) Dividing it with the reaction's kcat (converted from 1/s to 1/h)
                stoichiometry /= (reaction_kcat * 3600)
                # 3) Setting the right direction (educt)
                stoichiometry *= -1
                stoichiometries.append(stoichiometry)
                stoichiometry_enzyme_name_list.append(isozyme_id + ";" +
                                                      str(number_units))

                # Add proteomics constraints
                if isozyme_id in protein_id_concentration_mapping.keys():
                    enzyme_pseudo_metabolite = model.metabolites.get_by_id(
                        "ENZYME_" + isozyme_id)
                    stoichiometry = reaction_id_gene_rules_protein_stoichiometry_mapping[
                        reaction_id][isozyme_id][isozyme_id]
                    stoichiometry *= 1 / (reaction_kcat * 3600)
                    stoichiometry *= -1
                    reaction.add_metabolites(
                        {enzyme_pseudo_metabolite: stoichiometry})
            # If it is a complex :O...
            else:
                # ...convert the complex IDs to a hashable tuple (used for the stoichiometry selection)...
                isozyme_id = tuple(isozyme_id)
                stoichiometry = 0

                # ...go through each single ID of the complex...
                stoichiometry_enzyme_name_list.append("")
                for single_id in isozyme_id:
                    # ...get the reaction ID without additions...
                    reaction_id = reaction_id.split("_TG_")[0]

                    # ...get the number of units for this protein...
                    number_units = reaction_id_gene_rules_protein_stoichiometry_mapping[
                        reaction_id][isozyme_id][single_id]
                    single_stoichiometry = number_units
                    # ...and determine the protein pool stoichiometry addition by
                    # 1) Multiplying the number of units for this protein with its mass (converted from Da to kDa, i.e. g/mmol)
                    single_stoichiometry *= (
                        protein_id_mass_mapping[single_id] / 1000)
                    # 2) Dividing it with the reaction's kcat (converted from 1/s to 1/h)
                    single_stoichiometry /= (reaction_kcat * 3600)
                    # 3) Setting the right direction (educt)
                    single_stoichiometry *= -1
                    # 4) and add it to the complex's stoichiometry
                    stoichiometry += single_stoichiometry
                    # Add name of current single ID
                    stoichiometry_enzyme_name_list[
                        -1] += single_id + ";" + str(number_units) + " "
                stoichiometry_enzyme_name_list[
                    -1] = stoichiometry_enzyme_name_list[-1].rstrip()
                # Add to list of stoichiometries
                stoichiometries.append(stoichiometry)

                # Add proteomics constraints
                for single_id in isozyme_id:
                    if single_id in protein_id_concentration_mapping.keys():
                        enzyme_pseudo_metabolite = model.metabolites.get_by_id(
                            "ENZYME_" + single_id)
                        stoichiometry = reaction_id_gene_rules_protein_stoichiometry_mapping[
                            reaction_id][isozyme_id][single_id]
                        stoichiometry *= 1 / (reaction_kcat * 3600)
                        stoichiometry *= -1
                        reaction.add_metabolites(
                            {enzyme_pseudo_metabolite: stoichiometry})

        # Take the maximal stoichiometry (i.e., the one with the least cost since this one will usually be preferred
        # anyway in an FBA).
        metabolites = {}
        max_stoichiometry = max(stoichiometries)
        metabolites[prot_pool_metabolite] = max_stoichiometry
        reaction.add_metabolites(metabolites)
        selected_enzyme = stoichiometry_enzyme_name_list[stoichiometries.index(
            max_stoichiometry)]

        # Print report of selected kcat and molecular weight for this reaction
        print("Reaction: ", model_reaction_id)
        print("Selected kcat: ", reaction_kcat)
        print("Selected molecular weight (kDa): ", end="")
        if " " in selected_enzyme:  # Multiple enzymes
            mass_sum = .0
            for single_enzyme in selected_enzyme.split(" "):
                enzyme_name = single_enzyme.split(";")[0]
                enzyme_unit_number = float(single_enzyme.split(";")[1])
                mass_sum += protein_id_mass_mapping[
                    enzyme_name] * enzyme_unit_number
            print(mass_sum)
        else:  # Single enzyme
            enzyme_name = selected_enzyme.split(";")[0]
            enzyme_unit_number = float(selected_enzyme.split(";")[1])
            print(protein_id_mass_mapping[enzyme_name] * enzyme_unit_number)

    # Output as SBML (without constraints due to cobrapy limitations)
    cobra.io.write_sbml_model(model, project_folder + output_sbml_name)
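
A minimal standalone sketch of the protein pool coefficient computed in steps 1) to 4) above, assuming (as the conversions in the code suggest) that protein masses are given in Da and kcats in 1/s; the function name is illustrative and not part of the original module:

def protein_pool_coefficient_sketch(number_units: float,
                                    protein_mass_da: float,
                                    kcat_per_s: float) -> float:
    """Illustrative only: stoichiometry of the protein pool pseudo-metabolite for one protein of a complex."""
    mass_kda = protein_mass_da / 1000   # Da -> kDa
    kcat_per_h = kcat_per_s * 3600      # 1/s -> 1/h
    return -number_units * mass_kda / kcat_per_h  # negative coefficient = consumed (educt)

# Example: 2 units of a 50,000 Da protein with kcat = 100 1/s
# -> -2 * 50 / 360000 ≈ -0.000278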
Example #8
0
def parse_brenda_textfile(brenda_textfile_path: str,
                          bigg_metabolites_json_folder: str,
                          json_output_path: str) -> None:
    """Goes through a BRENDA database textfile and converts it into a machine-readable JSON.

    The JSON includes kcats for found organisms and substrates.
    As of 29/04/2019, the BRENDA database can be downloaded as textfile under
    https://www.brenda-enzymes.org/download_brenda_without_registration.php

    The BRENDA database is not in a completely standardized format, so this function
    contains many convoluted checks and workarounds for non-standardized data.

    kcats from mutated enzymes are excluded.

    Arguments
    ----------
    * brenda_textfile_path: str ~ The BRENDA database text file path
    * bigg_metabolites_json_folder: str ~ The folder in which the BIGG metabolites
      database is stored (it has to have the name 'bigg_id_name_mapping.json').
    * json_output_path: str ~ The path of the JSON that shall be created

    Output
    ----------
    * A JSON containing the BRENDA textfile kcat data in a machine-readable format:
    <pre>
        {
            "$EC_NUMBER": {
                "$SUBSTRATE_WITH_BIGG_ID_1": {
                    "$ORGANISM_1": [
                        $kcat_1,
                        (...)
                        $kcat_n,
                    ]
                },
                (...),
                "REST": {
                    "$ORGANISM_1": [
                        $kcat_1,
                        (...)
                        $kcat_n,
                    ]
                }
            }
            (...),
        }
    </pre>
    'REST' stands for a substrate for which no BIGG ID could be found.
    (A short usage sketch follows after this function's code.)
    """
    # Standardize output folder
    bigg_metabolites_json_folder = standardize_folder(
        bigg_metabolites_json_folder)

    # Load BIGG ID <-> metabolite name mapping :D
    bigg_id_name_mapping: Dict[str,
                               str] = json_load(bigg_metabolites_json_folder +
                                                "bigg_id_name_mapping.json")

    # Load BRENDA textfile as list of strings without newlines :D
    with open(brenda_textfile_path, "r", encoding="utf-8") as f:
        lines = f.readlines()
    lines = [x.replace("\n", "") for x in lines]

    # Go through each line and collect the organism lines and kcat lines for each EC number
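    # The relevant sections of the BRENDA text file look roughly like this
    # (illustrative, shortened excerpt, not verbatim BRENDA content; the ID line
    # is tab-separated):
    #   ID   1.1.1.1
    #   PROTEIN
    #   PR   #1# Gallus gallus <10>
    #   TURNOVER_NUMBER
    #   TN   #1# 0.53 {NAD+} <10>
    # "#N#" is the protein/organism reference number, "{...}" the substrate and
    # "<...>" the literature reference.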
    in_turnover_numbers = False
    in_organism_reference = False
    ec_number_kcat_lines_mapping: Dict[str, List[str]] = {}
    ec_number_organsism_lines_mapping: Dict[str, List[str]] = {}
    current_ec_number = ""
    organism_lines: List[str] = []
    kcat_lines: List[str] = []
    i = 0
    while i < len(lines):
        line = lines[i]
        if line.startswith("ID\t"):
            if current_ec_number != "":
                ec_number_organsism_lines_mapping[
                    current_ec_number] = organism_lines
                ec_number_kcat_lines_mapping[current_ec_number] = kcat_lines
            current_ec_number = line.replace("ID\t", "").replace(" ()", "")
            organism_lines = []
            kcat_lines = []

        if len(line) == 0:
            in_turnover_numbers = False
            in_organism_reference = False
        elif line.startswith("PROTEIN"):
            in_organism_reference = True
            i += 1
            line = lines[i]
        elif line.startswith("TURNOVER_NUMBER"):
            in_turnover_numbers = True
            i += 1
            line = lines[i]

        if in_organism_reference:
            if line.startswith("PR"):
                organism_lines.append("")
            if len(organism_lines[-1]) > 0:
                organism_lines[-1] += " "
            organism_lines[-1] += " " + line

        elif in_turnover_numbers:
            if line.startswith("TN"):
                kcat_lines.append("")
            if len(kcat_lines[-1]) > 0:
                kcat_lines[-1] += " "
            kcat_lines[-1] += line

        if len(line) == 0:
            in_turnover_numbers = False
            in_organism_reference = False

        i += 1

    # Create the BRENDA database dictionary using the collected kcat and organism lines
    # of each EC number :D
    ec_numbers = list(ec_number_kcat_lines_mapping.keys())
    brenda_kcat_database: Dict[str, Any] = {}
    for ec_number in ec_numbers:
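        # Some ID entries only mark an EC number as transferred, e.g. (illustrative)
        # "1.1.1.5 (transferred to EC 1.1.1.303)"; such entries get a "TRANSFER"
        # note instead of kcat data.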
        if "(transferred to " in ec_number:
            actual_ec_number = ec_number.split(" (transferred")[0]
            try:
                brenda_kcat_database[actual_ec_number] = {}
                brenda_kcat_database[actual_ec_number]["TRANSFER"] = \
                    ec_number.lower().replace("  ", " ").split("(transferred to ec")[1].replace(")", "").lstrip()
            except Exception:
                # Some transfers go to general subgroups instead of single EC numbers so that
                # no kcat database can be built from it D:
                print("WARNING: BRENDA text file line " + ec_number +
                      " is not interpretable!")
            continue

        brenda_kcat_database[ec_number] = {}

        reference_number_organism_mapping = {}
        organism_lines = ec_number_organsism_lines_mapping[ec_number]
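        # An organism line looks roughly like (illustrative):
        #   "PR #2# Escherichia coli P00350 SwissProt <3>"
        # -> reference number "2", organism name "Escherichia coli" (the accession,
        # the database tag and the literature reference are dropped below).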
        for organism_line in organism_lines:
            reference_number = organism_line.split("#")[1]
            organism_line_split_first_part = organism_line.split("# ")[1]
            organism_line_split = organism_line_split_first_part.split(" ")
            organism_line_split = [
                x for x in organism_line_split if len(x) > 0
            ]

            end = 1
            for part in organism_line_split:
                # Some organism names contain their SwissProt or UniProt ID;
                # since we don't need them, they are excluded
                if ("swissprot" in part.lower()) or \
                 (part.lower() == "and") or \
                 ("uniprot" in part.lower()) or \
                 ("genbank" in part.lower()) or \
                 ("trembl" in part.lower()):
                    end -= 2
                    break

                if ("<" in part) or ("(" in part):
                    end -= 1
                    break

                end += 1
            organism_name = " ".join(organism_line_split[:end])
            reference_number_organism_mapping[reference_number] = organism_name

        kcat_lines = ec_number_kcat_lines_mapping[ec_number]
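        # A kcat line looks roughly like (illustrative):
        #   "TN #2# 0.53 {NADH} <3>"
        # -> reference number "2", kcat 0.53 (for ranges such as "0.4-0.53" the
        # maximum is taken) and substrate "nadh".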
        for kcat_line in kcat_lines:
            # Exclude kcats of mutated/changed proteins since
            # they may not have a biological relevance
            if ("mutant" in kcat_line.lower()) or ("mutated"
                                                   in kcat_line.lower()):
                continue
            reference_number = kcat_line.split("#")[1].split(",")[0]
            organism = reference_number_organism_mapping[reference_number]
            kcat_str = "".join(
                kcat_line.split("#")[2]).split("{")[0].lstrip().rstrip()
            kcat = max([float(x) for x in kcat_str.split("-") if len(x) > 0])
            substrate = "".join(kcat_line.split("{")[1]).split("}")[0]

            substrate = substrate.lower()
            if substrate in bigg_id_name_mapping.keys():
                substrate = bigg_id_name_mapping[substrate]
            else:
                substrate = "REST"

            if substrate not in brenda_kcat_database[ec_number].keys():
                brenda_kcat_database[ec_number][substrate] = {}
            if organism not in brenda_kcat_database[ec_number][substrate].keys(
            ):
                brenda_kcat_database[ec_number][substrate][organism] = []
            brenda_kcat_database[ec_number][substrate][organism].append(kcat)

    # Write final BRENDA kcat database :D
    json_write(json_output_path, brenda_kcat_database)
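
A minimal usage sketch, assuming 'bigg_id_name_mapping.json' has already been created in the given folder and using the module's own json_load helper; all paths are placeholders:

# Hypothetical paths; adjust to the local setup
parse_brenda_textfile(
    brenda_textfile_path="brenda_download.txt",
    bigg_metabolites_json_folder="./bigg_metabolites/",
    json_output_path="./brenda_kcat_database.json",
)

# Inspect a few entries of the resulting kcat database
kcat_database = json_load("./brenda_kcat_database.json")
for ec_number, substrates in list(kcat_database.items())[:5]:
    print(ec_number, "->", list(substrates.keys()))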