def _setType(types, current_id, padmet): """ For id current_id, create relation 'is_a_class' to each id in types add new nodes in padmet and relation in list_of_relation Parameters ---------- types: list list of classe id associated to id 'current_id' current_id: str element id to link to all classe id in types padmet: padmet.PadmetRef padmet instance """ for subClass_id in types: #Type allow the hierachization of the current #XX is_a_class type #if type is not already in padmet, create a new class node (subClass_node) #subClass_node id == type #create a relation xx is_a_class type try: subClass_node = padmet.dicOfNode[subClass_id] except KeyError: subClass_node = Node("class", subClass_id) padmet.dicOfNode[subClass_id] = subClass_node is_a_class_rlt = Relation(current_id, "is_a_class", subClass_id) list_of_relation.append(is_a_class_rlt)
def _setSyns(syns, current_id, padmet): """ For id current_id, create relation 'has_name' to a node name 'current_id'_names store a list of synonymous from syns list in the node name 'current_id'_names add new node in padmet and relation in list_of_relation Parameters ---------- syns: list list of synonymous of current_id current_id: str element id to link to node name 'current_id'_names padmet: padmet.PadmetRef padmet instance """ name_id = current_id+"_names" try: name_node = padmet.dicOfNode[name_id] except KeyError: #create node name name_node = Node("name", name_id, {"LABEL":[]}) padmet.dicOfNode[name_id] = name_node has_name_rlt = Relation(current_id, "has_name", name_id) list_of_relation.append(has_name_rlt) [name_node.misc["LABEL"].append(syn) for syn in syns if syn not in name_node.misc["LABEL"]]
def _setXrefs(xrefs, current_id, padmet): """ For id current_id, create relation 'has_xref' to a node xref 'current_id'_xrefs store a list of external reference from xrefs list in the node xref 'current_id'_xrefs parse each external reference with regex_xref add new node in padmet and relation in list_of_relation Parameters ---------- xrefs: list list of external reference of current_id current_id: str element id to link to node xref 'current_id'_xref padmet: padmet.PadmetRef padmet instance """ xref_id = current_id+"_xrefs" try: xref_node = padmet.dicOfNode[xref_id] except KeyError: #create node xref xref_node = Node("xref", xref_id) padmet.dicOfNode[xref_id] = xref_node has_xref_rlt = Relation(current_id, "has_xref", xref_id) list_of_relation.append(has_xref_rlt) for xref in xrefs: #an xref is like: (REFSEQ "NP_417401" NIL NIL NIL NIL NIL) #in this example DB = REFSEQ and ID = NP_417401 #update node xref, with in misc k = DB and v = [ID] #node id is created by incrementing meta_max_id xref_search = regex_xref.search(xref) if xref_search is not None: xref_dict = xref_search.groupdict() db = xref_dict["DB"] _id = xref_dict["ID"] else: db = "GO-TERMS" _id = xref if db in list(xref_node.misc.keys()) and _id not in xref_node.misc[db]: xref_node.misc[db].append(_id) else: xref_node.misc[db] = [_id]
def compounds_parser(filePath, padmet, verbose = False): """ Parameters ---------- filePath: str path to compounds.dat padmet: padmet.PadmetRef padmet instance verbose: bool if True print information """ dict_data = {} with open(filePath, 'r', encoding='windows-1252') as f: data = (line for line in f.read().splitlines() if not line.startswith("#") and not line == "//") for line in data: try: #if len of value is 0 then ValueError raised attrib, value = line.split(" - ") #delete all tags value = re.sub(regex_purge,"",value) if attrib == "UNIQUE-ID": current_id = value dict_data[current_id] = {} if attrib in ["COMMON-NAME","INCHI-KEY","MOLECULAR-WEIGHT","SMILES",\ "TYPES", "SYNONYMS", "DBLINKS"]: try: dict_data[current_id][attrib].append(value) except KeyError: dict_data[current_id][attrib] = [value] except ValueError: pass count = 0 nb_cpds = str(len(list(dict_data.keys()))) for compound_id, dict_values in dict_data.items(): count += 1 if verbose: print("\r%s/%s: %s" %(count, nb_cpds, compound_id),end="", flush=True) #print(compound_id) compound_node = Node("compound", compound_id) padmet.dicOfNode[compound_id] = compound_node try: compound_node.misc["COMMON-NAME"] = dict_values["COMMON-NAME"] except KeyError: pass try: compound_node.misc["INCHI-KEY"] = dict_values["INCHI-KEY"] except KeyError: pass try: compound_node.misc["MOLECULAR-WEIGHT"] = dict_values["MOLECULAR-WEIGHT"] except KeyError: pass try: compound_node.misc["SMILES"] = dict_values["SMILES"] except KeyError: pass try: types = dict_values["TYPES"] _setType(types,compound_id, padmet) except KeyError: pass try: syns = dict_values["SYNONYMS"] _setSyns(syns, compound_id, padmet) except KeyError: pass try: xrefs = dict_values["DBLINKS"] _setXrefs(xrefs, compound_id, padmet) except KeyError: pass if verbose: print("")
def genes_parser(filePath, padmet, verbose = False): """ Parameters ---------- filePath: str path to genes.dat padmet: padmet.PadmetRef padmet instance verbose: bool if True print information """ dict_data = {} #k='ACCESSION-1', v ='PRODUCT' with open(filePath, 'r', encoding='windows-1252') as f: data = (line for line in f.read().splitlines() if not line.startswith("#") and not line == "//") for line in data: try: #if len of value is 0 then ValueError raised attrib, value = line.split(" - ") #delete all tags value = re.sub(regex_purge,"",value) if attrib == "UNIQUE-ID": current_id = value dict_data[current_id] = {} if attrib in ["COMMON-NAME","ACCESSION-1","CENTISOME-POSITION","LEFT-END-POSITION","RIGHT-END-POSITION","SYNONYMS","TRANSCRIPTION-DIRECTION","PRODUCT"]: try: dict_data[current_id][attrib].append(value) except KeyError: dict_data[current_id][attrib] = [value] except ValueError: pass count = 0 #nb_genes = str(len(list(dict_data.keys()))) map_gene_ids = {} for current_id, dict_values in dict_data.items(): try: gene_id = dict_values.get("ACCESSION-1",[current_id])[0] map_gene_ids[current_id] = gene_id count += 1 """ if verbose: print("\r%s/%s: %s" %(count, nb_genes, gene_id), end="", flush=True) #print(current_id, gene_id) """ gene_node = Node("gene", gene_id) padmet.dicOfNode[gene_id] = gene_node try: if dict_values["COMMON-NAME"][0] != gene_id: gene_node.misc["COMMON-NAME"] = dict_values["COMMON-NAME"] except KeyError: pass try: if dict_values["TRANSCRIPTION-DIRECTION"][0] == "-": gene_node.misc["TRANSCRIPTION-DIRECTION"] = ["NEGATIVE"] elif dict_values["TRANSCRIPTION-DIRECTION"][0] == "+": gene_node.misc["TRANSCRIPTION-DIRECTION"] = ["POSITIVE"] except KeyError: pass try: gene_node.misc["CENTISOME-POSITION"] = dict_values["CENTISOME-POSITION"] except KeyError: pass try: gene_node.misc["LEFT-END-POSITION"] = dict_values["LEFT-END-POSITION"] except KeyError: pass try: gene_node.misc["RIGHT-END-POSITION"] = dict_values["RIGHT-END-POSITION"] except KeyError: pass try: gene_node.misc["RIGHT-END-POSITION"] = dict_values["RIGHT-END-POSITION"] except KeyError: pass try: syns = dict_values["SYNONYMS"] _setSyns(syns, gene_id, padmet) except KeyError: pass except KeyError: pass if verbose: print("") return map_gene_ids
def pathways_parser(filePath, padmet, verbose = False): """ Parameters ---------- filePath: str path to pathways.dat padmet: padmet.PadmetRef padmet instance verbose: bool if True print information """ dict_data = {} with open(filePath, 'r', encoding='windows-1252') as f: data = (line for line in f.read().splitlines() if not line.startswith("#") and not line == "//") for line in data: try: #if len of value is 0 then ValueError raised attrib, value = line.split(" - ") #delete all tags value = re.sub(regex_purge,"",value) if attrib == "UNIQUE-ID": current_id = value dict_data[current_id] = {} if attrib in ["COMMON-NAME", "TAXONOMIC-RANGE",\ "TYPES", "SYNONYMS", "DBLINKS", "IN-PATHWAY", "REACTION-LIST"]: try: dict_data[current_id][attrib].append(value) except KeyError: dict_data[current_id][attrib] = [value] except ValueError: pass count = 0 nb_pathways = str(len(list(dict_data.keys()))) for pathway_id, dict_values in dict_data.items(): count += 1 if verbose: print("\r%s/%s" %(count, nb_pathways), end="", flush=True) #print(pathway_id) pathway_node = Node("pathway", pathway_id) padmet.dicOfNode[pathway_id] = pathway_node try: pathway_node.misc["COMMON-NAME"] = dict_values["COMMON-NAME"] except KeyError: pass try: pathway_node.misc["TAXONOMIC-RANGE"] = dict_values["TAXONOMIC-RANGE"] except KeyError: pass try: types = dict_values["TYPES"] _setType(types, pathway_id, padmet) except KeyError: pass try: syns = dict_values["SYNONYMS"] _setSyns(syns, pathway_id, padmet) except KeyError: pass try: xrefs = dict_values["DBLINKS"] _setXrefs(xrefs, pathway_id, padmet) except KeyError: pass try: subPathways = dict_values["IN-PATHWAY"] for subPathway in subPathways: #add the hierachization info, current pathway is_in_pathway subpathway is_in_pathway_rlt = Relation(pathway_id, "is_in_pathway", subPathway) list_of_relation.append(is_in_pathway_rlt) except KeyError: pass try: subNodes = dict_values["REACTION-LIST"] for subNode in subNodes: #add the hierachization info, Reaction/pathway is_in_pathway current pathway is_in_pathway_rlt = Relation(subNode, "is_in_pathway", pathway_id) list_of_relation.append(is_in_pathway_rlt) except KeyError: pass if verbose: print("")
def reactions_parser(filePath, padmet, extract_gene, source, verbose = False): """ from reaction.dat: get for each reaction, the UNIQUE-ID, COMMON-NAME, TYPES, SYNONYMS, DBLINKS Create a reaction node with node.id = UNIQUE-ID, node.misc = {COMMON-NAME:[COMMON-NAMES]} - For each types: A type is in fact a class. this information is stocked in padmet as: is_a_class relation btw a node and a class_node check if the type is already in the padmet if not create a new class_node (var: subClass) with subClass_node.id = type Create a relation current node is_a_class type - For each Synonyms: this information is stocked in padmet as: has_name relation btw a node and a name_node create a new name_node with name_node.id = reaction_id+"_names" name_node.misc = {LABEL:[synonyms]} Create a relation current node has_name name_node.id - For each DBLINKS: DBLINKS is parsed with regex_xref to get the db and the id this information is stocked in padmet as: has_xref relation btw a node and a xref_node create a new xref_node with xref_node.id = reaction_id+"_xrefs" and xref_node.misc = {db:[id]} Create a relation current node has_xref xref_node.id Parameters ---------- filePath: str path to reactions.dat padmet: padmet.PadmetRef padmet instance verbose: bool if True print information """ dict_data = {} with open(filePath, 'r', encoding='windows-1252') as f: data = [line for line in f.read().splitlines() if not line.startswith("#") and not line == "//"] index = -1 for line in data: index += 1 try: #if len of value is 0 then ValueError raised attrib, value = line.split(" - ") #delete all tags value = re.sub(regex_purge,"",value) if attrib == "UNIQUE-ID": current_id = value dict_data[current_id] = {} if attrib in ["COMMON-NAME", "EC-NUMBER", "REACTION-DIRECTION",\ "TYPES", "SYNONYMS", "DBLINKS"]: try: dict_data[current_id][attrib].append(value) except KeyError: dict_data[current_id][attrib] = [value] elif attrib in ["LEFT", "RIGHT"]: #set default values compartment = def_compart_in stoichiometry = "1" #check if information about stoechiometry and compartment in line + 1 and line + 2 try: first_next_line = data[index+1] #last line of the file, just add the LEFT/RIGHT except IndexError: first_next_line = "" try: second_next_line = data[index+2] except IndexError: second_next_line = "" if first_next_line.startswith("^COEFFICIENT"): stoichiometry = first_next_line.split(" - ")[1] if second_next_line.startswith("^COMPARTMENT"): compartment = second_next_line.split(" - ")[1] if first_next_line.startswith("^COMPARTMENT"): compartment = first_next_line.split(" - ")[1] if second_next_line.startswith("^COEFFICIENT"): stoichiometry = second_next_line.split(" - ")[1] #delete all tags compartment = re.sub(regex_purge, "", compartment) stoichiometry = re.sub(regex_purge, "", stoichiometry) try: dict_data[current_id][attrib].append((value, stoichiometry, compartment)) except KeyError: dict_data[current_id][attrib] = [(value, stoichiometry, compartment)] except ValueError: pass count = 0 nb_rxn = str(len(list(dict_data.keys()))) for rxn_id, dict_values in dict_data.items(): if "LEFT" in list(dict_values.keys()) or "RIGHT" in list(dict_values.keys()): count += 1 if verbose: print("\r%s/%s: %s" %(count, nb_rxn, rxn_id), end="", flush=True) #print(rxn_id) rxn_node = Node("reaction", rxn_id) padmet.dicOfNode[rxn_id] = rxn_node try: rxn_node.misc["COMMON-NAME"] = dict_values["COMMON-NAME"] except KeyError: pass try: rxn_node.misc["EC-NUMBER"] = dict_values["EC-NUMBER"] except KeyError: pass try: rxn_dir = dict_values["REACTION-DIRECTION"][0] if rxn_dir == "REVERSIBLE": rxn_node.misc["DIRECTION"] = ["REVERSIBLE"] elif "LEFT-TO-RIGHT" in rxn_dir: #if:LEFT-TO-RIGHT, IRREVERSIBLE-LEFT-TO-RIGHT, PHYSIOL-RIGHT-TO-LEFT rxn_node.misc["DIRECTION"] = ["LEFT-TO-RIGHT"] elif "RIGHT-TO-LEFT" in rxn_dir: #Temporarily set direaction as RIGHT-TO-LEFT #then, RIGHT' metabolites will be LEFT and LEFT -> RIGHT #To finish set back DIRECTION to LEFT-TO-RIGHT rxn_node.misc["DIRECTION"] = ["RIGHT-TO-LEFT"] except KeyError: rxn_node.misc["DIRECTION"] = ["REVERSIBLE"] """ try: rxn_node.misc["COMPARTMENT"] = dict_values["RXN-LOCATIONS"] except KeyError: pass """ try: types = dict_values["TYPES"] _setType(types, rxn_id, padmet) except KeyError: pass try: syns = dict_values["SYNONYMS"] _setSyns(syns, rxn_id, padmet) except KeyError: pass try: xrefs = dict_values["DBLINKS"] _setXrefs(xrefs, rxn_id, padmet) except KeyError: pass if extract_gene: reconstructionData_id = rxn_id+"_reconstructionData_"+source if reconstructionData_id in list(padmet.dicOfNode.keys()) and verbose: print("Warning: The reaction %s seems to be already added from the same source %s" %(rxn_id, source)) reconstructionData = {"SOURCE":[source],"TOOL":["PATHWAYTOOLS"],"CATEGORY":["ANNOTATION"]} reconstructionData_rlt = Relation(rxn_id,"has_reconstructionData",reconstructionData_id) padmet.dicOfNode[reconstructionData_id] = Node("reconstructionData", reconstructionData_id, reconstructionData) list_of_relation.append(reconstructionData_rlt) try: reactants_data = dict_values["LEFT"] for reactant_id, stoichiometry, compartment in reactants_data: if compartment == "CCO-OUT": compartment = def_compart_out else: compartment = def_compart_in try: reactant_node = padmet.dicOfNode[reactant_id] except KeyError: reactant_node = Node("compound", reactant_id) padmet.dicOfNode[reactant_id] = reactant_node #if the reaction direction was set to RIGHT-TO-LEFT, then this compound is in fact a product if rxn_node.misc["DIRECTION"][0] == "RIGHT-TO-LEFT": produces_rlt = Relation(rxn_id, "produces", reactant_id, {"STOICHIOMETRY": [stoichiometry], "COMPARTMENT": [compartment]}) list_of_relation.append(produces_rlt) else: consumes_rlt = Relation(rxn_id, "consumes", reactant_id, {"STOICHIOMETRY": [stoichiometry], "COMPARTMENT": [compartment]}) list_of_relation.append(consumes_rlt) except KeyError: pass try: products_data = dict_values["RIGHT"] for product_id, stoichiometry, compartment in products_data: if compartment == "CCO-OUT": compartment = def_compart_out else: compartment = def_compart_in try: product_node = padmet.dicOfNode[product_id] except KeyError: product_node = Node("compound", product_id) padmet.dicOfNode[product_id] = product_node #if the reaction direction was set to RIGHT-TO-LEFT, then this compound is in fact a reactant if rxn_node.misc["DIRECTION"][0] == "RIGHT-TO-LEFT": consumes_rlt = Relation(rxn_id, "consumes", product_id, {"STOICHIOMETRY": [stoichiometry], "COMPARTMENT": [compartment]}) list_of_relation.append(consumes_rlt) else: produces_rlt = Relation(rxn_id, "produces", product_id, {"STOICHIOMETRY": [stoichiometry], "COMPARTMENT": [compartment]}) list_of_relation.append(produces_rlt) except KeyError: pass if rxn_node.misc["DIRECTION"][0] == "RIGHT-TO-LEFT": rxn_node.misc["DIRECTION"] = ["LEFT-TO-RIGHT"] if verbose: print("")
def classes_parser(filePath, padmet, verbose = False): """ from class.dat: get for each class, the UNIQUE-ID, COMMON-NAME, TYPES, SYNONYMS, DBLINKS Create a class node with node.id = UNIQUE-ID, node.misc = {COMMON-NAME:[COMMON-NAMES]} - For each types: A type is in fact a class. this information is stocked in padmet as: is_a_class relation btw a node and a class_node check if the type is already in the padmet if not create a new class_node (var: subClass) with subClass_node.id = type Create a relation current node is_a_class type - For each Synonyms: this information is stocked in padmet as: has_name relation btw a node and a name_node create a new name_node with name_node.id = class_id+"_names" and name_node.misc = {LABEL:[synonyms]} Create a relation current node has_name name_node.id - For each DBLINKS: DBLINKS is parsed with regex_xref to get the db and the id this information is stocked in padmet as: has_xref relation btw a node and a xref_node create a new xref_node with xref_node.id = class_id+"_xrefs" and xref_node.misc = {db:[id]} Create a relation current node has_xref xref_node.id Parameters ---------- filePath: str path to classes.dat padmet: padmet.PadmetRef padmet instance verbose: bool if True print information """ dict_data = {} with open(filePath, 'r', encoding='windows-1252') as f: data = (line for line in f.read().splitlines() if not line.startswith("#") and not line == "//") for line in data: try: #if len of value is 0 then ValueError raised attrib, value = line.split(" - ") #delete all tags value = re.sub(regex_purge,"",value) if attrib == "UNIQUE-ID": current_id = value dict_data[current_id] = {} if attrib in ["COMMON-NAME",\ "TYPES", "SYNONYMS", "DBLINKS"]: try: dict_data[current_id][attrib].append(value) except KeyError: dict_data[current_id][attrib] = [value] except ValueError: pass count = 0 nb_classes = str(len(list(dict_data.keys()))) for class_id, dict_values in dict_data.items(): count += 1 if verbose: print("\r%s/%s" %(count, nb_classes), end="", flush=True) #print(class_id) class_node = Node("class", class_id) padmet.dicOfNode[class_id] = class_node try: class_node.misc["COMMON-NAME"] = dict_values["COMMON-NAME"] except KeyError: pass try: types = dict_values["TYPES"] _setType(types, class_id, padmet) except KeyError: pass try: syns = dict_values["SYNONYMS"] _setSyns(syns, class_id, padmet) except KeyError: pass try: xrefs = dict_values["DBLINKS"] _setXrefs(xrefs, class_id, padmet) except KeyError: pass if verbose: print("")
def from_pgdb_to_padmet(pgdb_folder, db='NA', version='NA', source='GENOME', extract_gene=False, no_orphan=False, enhanced_db=False, padmetRef_file=None, verbose=False): """ Parameters ---------- pgdb_folder: str path to pgdb db: str pgdb name, default is 'NA' version: str pgdb version, default is 'NA' source: str tag reactions for traceability, default is 'GENOME' extract_gene: bool if true extract genes information no_orphan: bool if true, remove reactions without genes associated enhanced_db: bool if true, read metabolix-reactions.xml sbml file and add information in final padmet padmetRef_file: str path to padmetRef corresponding to metacyc in padmet format verbose: bool if True print information Returns ------- padmet.padmetRef: padmet instance with pgdb within pgdb data """ global regex_purge, regex_xref, list_of_relation, def_compart_in, def_compart_out regex_purge = re.compile("<.*?>|\|") regex_xref = re.compile('^\((?P<DB>\S*)\s*"(?P<ID>\S*)"') list_of_relation = [] def_compart_in = "c" def_compart_out = "e" #parsing args source = source.upper() classes_file, compounds_file, proteins_file, reactions_file, enzrxns_file, pathways_file = \ [os.path.join(pgdb_folder,_file) for _file in ["classes.dat", "compounds.dat", "proteins.dat", "reactions.dat", "enzrxns.dat", "pathways.dat"]] if enhanced_db: metabolic_reactions = os.path.join(pgdb_folder,"metabolic-reactions.xml") else: metabolic_reactions = None if extract_gene: genes_file = os.path.join(pgdb_folder,"genes.dat") else: genes_file = None now = datetime.now() today_date = now.strftime("%Y-%m-%d") if padmetRef_file: padmet = PadmetSpec() padmetRef = PadmetRef(padmetRef_file) version = padmetRef.info["DB_info"]["version"] db = padmetRef.info["DB_info"]["DB"] dbNotes = {"PADMET":{"creation":today_date,"version":"2.6"},"DB_info":{"DB":db,"version":version}} padmet.setInfo(dbNotes) padmet.setPolicy(padmetRef) with open(reactions_file, 'r') as f: rxns_id = [line.split(" - ")[1] for line in f.read().splitlines() if line.startswith("UNIQUE-ID")] count = 0 for rxn_id in rxns_id: count += 1 if verbose: print("%s/%s Copy %s" %(count, len(rxns_id), rxn_id)) try: padmet.copyNode(padmetRef, rxn_id) reconstructionData_id = rxn_id+"_reconstructionData_"+source if reconstructionData_id in list(padmet.dicOfNode.keys()) and verbose: print("Warning: The reaction %s seems to be already added from the same source %s" %(rxn_id, source)) reconstructionData = {"SOURCE":[source],"TOOL":["PATHWAYTOOLS"],"CATEGORY":["ANNOTATION"]} reconstructionData_rlt = Relation(rxn_id,"has_reconstructionData",reconstructionData_id) padmet.dicOfNode[reconstructionData_id] = Node("reconstructionData", reconstructionData_id, reconstructionData) padmet._addRelation(reconstructionData_rlt) except TypeError: print("%s not in padmetRef" %(rxn_id)) if extract_gene: if verbose: print("parsing genes") map_gene_ids = genes_parser(genes_file, padmet, verbose) if verbose: print("parsing proteins") dict_protein_gene_id = proteins_parser(proteins_file, padmet, verbose) mapped_dict_protein_gene_id = map_gene_id(dict_protein_gene_id, map_gene_ids) if verbose: print("parsing association enzrxns") enzrxns_parser(enzrxns_file, padmet, mapped_dict_protein_gene_id, source, verbose) else: POLICY_IN_ARRAY = [['class','is_a_class','class'], ['class','has_name','name'], ['class','has_xref','xref'], ['class','has_suppData','suppData'], ['compound','is_a_class','class'], ['compound','has_name','name'], ['compound','has_xref','xref'], ['compound','has_suppData','suppData'], ['gene','is_a_class','class'], ['gene','has_name','name'], ['gene','has_xref','xref'], ['gene','has_suppData','suppData'], ['gene','codes_for','protein'], ['pathway','is_a_class','class'], ['pathway','has_name','name'], ['pathway','has_xref','xref'], ['pathway','is_in_pathway','pathway'], ['protein','is_a_class','class'], ['protein','has_name','name'], ['protein','has_xref','xref'], ['protein','has_suppData','suppData'], ['protein','catalyses','reaction'], ['protein','is_in_species','class'], ['reaction','is_a_class','class'], ['reaction','has_name','name'], ['reaction','has_xref','xref'], ['reaction','has_suppData','suppData'], ['reaction','has_reconstructionData','reconstructionData'], ['reaction','is_in_pathway','pathway'], ['reaction','consumes','class','STOICHIOMETRY','X','COMPARTMENT','Y'], ['reaction','produces','class','STOICHIOMETRY','X','COMPARTMENT','Y'], ['reaction','consumes','compound','STOICHIOMETRY','X','COMPARTMENT','Y'], ['reaction','produces','compound','STOICHIOMETRY','X','COMPARTMENT','Y'], ['reaction','consumes','protein','STOICHIOMETRY','X','COMPARTMENT','Y'], ['reaction','produces','protein','STOICHIOMETRY','X','COMPARTMENT','Y'], ['reaction','is_linked_to','gene','SOURCE:ASSIGNMENT','X:Y']] dbNotes = {"PADMET":{"creation":today_date,"version":"2.6"},"DB_info":{"DB":db,"version":version}} padmet = PadmetRef() if verbose: print("setting policy") padmet.setPolicy(POLICY_IN_ARRAY) if verbose: print("setting dbInfo") padmet.setInfo(dbNotes) if verbose: print("parsing classes") classes_parser(classes_file, padmet, verbose) if verbose: print("parsing compounds") compounds_parser(compounds_file, padmet, verbose) if verbose: print("parsing reactions") reactions_parser(reactions_file, padmet, extract_gene, source, verbose) if verbose: print("parsing pathways") pathways_parser(pathways_file, padmet, verbose) if extract_gene: if verbose: print("parsing genes") map_gene_ids = genes_parser(genes_file, padmet, verbose) if verbose: print("parsing proteins") dict_protein_gene_id = proteins_parser(proteins_file, padmet, verbose) mapped_dict_protein_gene_id = map_gene_id(dict_protein_gene_id, map_gene_ids) if verbose: print("parsing association enzrxns") enzrxns_parser(enzrxns_file, padmet, mapped_dict_protein_gene_id, source, verbose) if metabolic_reactions is not None: if verbose: print("enhancing db from metabolic-reactions.xml") padmet = enhance_db(metabolic_reactions, padmet, extract_gene, verbose) for rlt in list_of_relation: try: padmet.dicOfRelationIn[rlt.id_in].append(rlt) except KeyError: padmet.dicOfRelationIn[rlt.id_in] = [rlt] try: padmet.dicOfRelationOut[rlt.id_out].append(rlt) except KeyError: padmet.dicOfRelationOut[rlt.id_out] = [rlt] if extract_gene and no_orphan: all_reactions = [node for node in list(padmet.dicOfNode.values()) if node.type == "reaction"] rxn_to_del = [r for r in all_reactions if not any([rlt for rlt in padmet.dicOfRelationIn[r.id] if rlt.type == "is_linked_to"])] for rxn in rxn_to_del: padmet.delNode(rxn.id) if verbose: print("%s/%s orphan reactions (without gene association) deleted" %(len(rxn_to_del), len(all_reactions))) all_genes_linked = set([rlt.id_out for rlt in padmet.getAllRelation() if rlt.type == "is_linked_to"]) all_genes = set([node.id for node in list(padmet.dicOfNode.values()) if node.type == "gene"]) count = 0 for gene_id in [g for g in all_genes if g not in all_genes_linked]: count += 1 #if verbose: print("Removing gene without gene assoc %s" %gene_id) padmet.dicOfNode.pop(gene_id) if verbose: print("%s/%s orphan genes (not linked to any reactions) deleted" %(count, len(all_genes))) rxns = [node.id for node in list(padmet.dicOfNode.values()) if node.type == "reaction"] for rxn_id in rxns: cp_rlts = set([rlt.type for rlt in padmet.dicOfRelationIn[rxn_id] if rlt.type in ["consumes","produces"]]) if len(cp_rlts) == 1: print("rxn only consume or produce, transport ???: %s" %rxn_id) padmet.delNode(rxn_id) return padmet
def enhance_db(metabolic_reactions, padmet, with_genes, verbose = False): """ Parse sbml metabolic_reactions and add reactions in padmet if with_genes: add also genes information Parameters ---------- metabolic_reactions: str path to sbml metabolic-reactions.xml padmet: padmet.PadmetRef padmet instance with_genes: bool if true alos add genes information. Returns ------- padmet.padmetRef: padmet instance with pgdb within pgdb + metabolic-reactions.xml data """ print("loading sbml file: %s" %metabolic_reactions) reader = libsbml.SBMLReader() document = reader.readSBML(metabolic_reactions) for i in range(document.getNumErrors()): print(document.getError(i).getMessage()) model = document.getModel() listOfReactions = model.getListOfReactions() #recovere the reactions that are not in the basic metacyc but in the sbml file #use the reactions_name instead of ids because the ids are encoded, the name is the non-encoded version of the id padmet_reactions_id = set([node.id for node in list(padmet.dicOfNode.values()) if node.type == "reaction"]) reaction_to_add = [reaction for reaction in listOfReactions if reaction.getName() not in padmet_reactions_id] count = 0 if verbose: print(str(len(reaction_to_add))+" reactions to add") for reactionSBML in reaction_to_add: count += 1 reaction_id = reactionSBML.getName() if verbose: print(str(count)+"/"+str(len(reaction_to_add))+"\t"+reaction_id) if reactionSBML.getReversible(): reaction_dir = "REVERSIBLE" else: reaction_dir = "LEFT-TO-RIGHT" try: reaction_node = padmet.dicOfNode[reaction_id] except KeyError: reaction_node = Node("reaction", reaction_id, {"DIRECTION": [reaction_dir]}) padmet.dicOfNode[reaction_id] = reaction_node reactants = reactionSBML.getListOfReactants() for reactant in reactants: #convert ids reactant_id, _type, reactant_compart = sbmlPlugin.convert_from_coded_id(reactant.getSpecies()) if reactant_id not in list(padmet.dicOfNode.keys()): reactant_node = Node("compound",reactant_id) padmet.dicOfNode[reaction_id] = reactant_node reactant_stoich = reactant.getStoichiometry() consumes_rlt = Relation(reaction_id,"consumes",reactant_id, {"STOICHIOMETRY":[reactant_stoich], "COMPARTMENT": [reactant_compart]}) list_of_relation.append(consumes_rlt) products = reactionSBML.getListOfProducts() for product in products: product_id, _type, product_compart = sbmlPlugin.convert_from_coded_id(product.getSpecies()) if product_id not in list(padmet.dicOfNode.keys()): product_node = Node("compound",product_id) padmet.dicOfNode[product_id] = product_node product_stoich = product.getStoichiometry() produces_rlt = Relation(reaction_id,"produces",product_id,{"STOICHIOMETRY": [product_stoich], "COMPARTMENT": [product_compart]}) list_of_relation.append(produces_rlt) if with_genes: notes = sbmlPlugin.parseNotes(reactionSBML) if "GENE_ASSOCIATION" in list(notes.keys()): #Using sbmlPlugin to recover all genes associated to the reaction listOfGenes = sbmlPlugin.parseGeneAssoc(notes["GENE_ASSOCIATION"][0]) if len(listOfGenes) != 0: for gene in listOfGenes: try: #check if gene already in the padmet padmet.dicOfNode[gene] except TypeError: gene_node = Node("gene",gene) padmet.dicOfNode[gene] = gene_node is_linked_rlt = Relation(reaction_id, "is_linked_to", gene) list_of_relation.append(is_linked_rlt) return padmet