def test_compare_padmet_cli():
    """Run ``padmet compare_padmet`` on two PGDB-derived padmets and check its TSV outputs.

    Two padmet files are built from ``test_data/pgdb``; a different reaction
    is deleted from each (``ACYLCOASYN-RXN`` from the first,
    ``ACYLCOADEHYDROG-RXN`` from the second).  The comparison is written to
    the ``output`` folder and its genes, reactions, pathways and metabolites
    tables are checked against the module-level ``FABO_*`` constants.

    Every file and folder created by the test is removed in a ``finally``
    clause so a failing assertion does not leave artefacts behind.
    """
    try:
        _build_padmet_without_reaction('ACYLCOASYN-RXN', 'fabo_1.padmet')
        _build_padmet_without_reaction('ACYLCOADEHYDROG-RXN', 'fabo_2.padmet')

        subprocess.call([
            'padmet', 'compare_padmet',
            '--padmet', 'fabo_1.padmet,fabo_2.padmet',
            '--output', 'output',
        ])

        # Genes: no gene was removed, so both padmets must hold all of them.
        genes_fabo_1, genes_fabo_2 = _read_presence_columns('output/genes.tsv')
        assert set(FABO_GENES).issubset(set(genes_fabo_1))
        assert set(FABO_GENES).issubset(set(genes_fabo_2))

        # Reactions: each padmet must hold everything except its deleted one.
        reactions_fabo_1, reactions_fabo_2 = _read_presence_columns('output/reactions.tsv')
        expected_fabo_1_rxns = [rxn for rxn in FABO_RXNS if rxn != 'ACYLCOASYN-RXN']
        expected_fabo_2_rxns = [rxn for rxn in FABO_RXNS if rxn != 'ACYLCOADEHYDROG-RXN']
        assert set(expected_fabo_1_rxns).issubset(set(reactions_fabo_1))
        assert set(expected_fabo_2_rxns).issubset(set(reactions_fabo_2))

        # Pathways: columns 3 and 4 hold the ';'-separated reactions associated
        # with the pathway in each padmet.
        pathways = []
        # Initialised so an empty table produces an assertion failure rather
        # than a NameError.
        pwy_reactions_fabo_1 = []
        pwy_reactions_fabo_2 = []
        with open('output/pathways.tsv', 'r') as pathways_file:
            for row in csv.reader(pathways_file, delimiter='\t'):
                if row[0] != 'pathway':
                    pathways.append(row[0])
                if row[3] != 'fabo_1_rxn_assoc (sep=;)':
                    pwy_reactions_fabo_1 = row[3].split(';')
                if row[4] != 'fabo_2_rxn_assoc (sep=;)':
                    pwy_reactions_fabo_2 = row[4].split(';')
        # The same pathway list is expected for both padmets.
        assert pathways == ['FAO-PWY']
        assert set(expected_fabo_1_rxns).issubset(set(pwy_reactions_fabo_1))
        assert set(expected_fabo_2_rxns).issubset(set(pwy_reactions_fabo_2))

        # Metabolites.
        # NOTE(review): the previous per-column filters had the form
        # `x != '<header>' or x != ''`, which is always true (and the
        # 'fabo_2_rxn_produce' check re-tested column 2).  Their only real
        # effect was to collect every non-header metabolite for both padmets,
        # which is what is done here.  Tightening this to "metabolite is
        # consumed or produced in that padmet" requires confirming the column
        # layout of metabolites.tsv and the expected compound sets.
        metabolites = set()
        with open('output/metabolites.tsv', 'r') as metabolites_file:
            for row in csv.reader(metabolites_file, delimiter='\t'):
                if row[0] != 'metabolite':
                    metabolites.add(row[0])
        assert set(FABO_CPDS).issubset(metabolites)
    finally:
        # Best-effort cleanup: only remove what was actually created so a
        # failure in an early step is not masked by a FileNotFoundError here.
        for leftover in ('test.padmet', 'fabo_1.padmet', 'fabo_2.padmet'):
            if os.path.exists(leftover):
                os.remove(leftover)
        shutil.rmtree('output', ignore_errors=True)


def _build_padmet_without_reaction(reaction_id, padmet_path):
    """Create a padmet from the test PGDB, delete ``reaction_id`` and save it to ``padmet_path``."""
    subprocess.call([
        'padmet', 'pgdb_to_padmet',
        '--pgdb', 'test_data/pgdb',
        '--output', 'test.padmet',
        '--extract-gene',
    ])
    padmet_spec = PadmetSpec('test.padmet')
    # The intermediate file is only needed to load the object.
    os.remove('test.padmet')
    padmet_spec.delNode(reaction_id)
    padmet_spec.generateFile(padmet_path)


def _read_presence_columns(tsv_path):
    """Return the ids flagged '1' in column 1 and in column 2 of a tab-separated table."""
    in_first, in_second = [], []
    with open(tsv_path, 'r') as tsv_file:
        for row in csv.reader(tsv_file, delimiter='\t'):
            if row[1] == '1':
                in_first.append(row[0])
            if row[2] == '1':
                in_second.append(row[0])
    return in_first, in_second
def from_pgdb_to_padmet(pgdb_folder, db='NA', version='NA', source='GENOME', extract_gene=False, no_orphan=False, enhanced_db=False, padmetRef_file=None, verbose=False):
    """
    Build a padmet object from the .dat files of a PGDB folder.

    When ``padmetRef_file`` is given, a PadmetSpec is created and every
    reaction whose UNIQUE-ID is listed in reactions.dat is copied from the
    reference; ``db`` and ``version`` are then taken from the reference.
    Otherwise a PadmetRef is built by parsing classes, compounds, reactions
    and pathways (and optionally genes/proteins/enzrxns) from the PGDB files.

    Whatever the options, reactions that only have 'consumes' relations or
    only have 'produces' relations are deleted before returning.

    Parameters
    ----------
    pgdb_folder: str
        path to pgdb
    db: str
        pgdb name, default is 'NA'
    version: str
        pgdb version, default is 'NA'
    source: str
        tag reactions for traceability, default is 'GENOME' (upper-cased)
    extract_gene: bool
        if true extract genes information
    no_orphan: bool
        if true (and extract_gene is true), remove reactions without genes
        associated, then genes not linked to any reaction
    enhanced_db: bool
        if true, read metabolic-reactions.xml sbml file and add information in final padmet
    padmetRef_file: str
        path to padmetRef corresponding to metacyc in padmet format
    verbose: bool
        if True print information

    Returns
    -------
    PadmetSpec or PadmetRef:
        PadmetSpec when padmetRef_file is given, PadmetRef otherwise
    """
    # These module-level names are (re)initialised on every call.
    # NOTE(review): presumably they are read by the parser functions called
    # below - confirm against the rest of the module.
    global regex_purge, regex_xref, list_of_relation, def_compart_in, def_compart_out
    # Raw strings: same patterns as before, without invalid escape sequences.
    regex_purge = re.compile(r"<.*?>|\|")
    regex_xref = re.compile(r'^\((?P<DB>\S*)\s*"(?P<ID>\S*)"')
    list_of_relation = []
    def_compart_in = "c"
    def_compart_out = "e"

    # parsing args
    source = source.upper()
    classes_file, compounds_file, proteins_file, reactions_file, enzrxns_file, pathways_file = [
        os.path.join(pgdb_folder, _file)
        for _file in ["classes.dat", "compounds.dat", "proteins.dat", "reactions.dat", "enzrxns.dat", "pathways.dat"]
    ]
    metabolic_reactions = os.path.join(pgdb_folder, "metabolic-reactions.xml") if enhanced_db else None
    genes_file = os.path.join(pgdb_folder, "genes.dat") if extract_gene else None

    today_date = datetime.now().strftime("%Y-%m-%d")

    if padmetRef_file:
        padmet = PadmetSpec()
        padmetRef = PadmetRef(padmetRef_file)
        # The reference database identity overrides the db/version arguments.
        version = padmetRef.info["DB_info"]["version"]
        db = padmetRef.info["DB_info"]["DB"]
        dbNotes = {"PADMET": {"creation": today_date, "version": "2.6"}, "DB_info": {"DB": db, "version": version}}
        padmet.setInfo(dbNotes)
        padmet.setPolicy(padmetRef)

        with open(reactions_file, 'r') as f:
            rxns_id = [line.split(" - ")[1] for line in f.read().splitlines() if line.startswith("UNIQUE-ID")]
        for count, rxn_id in enumerate(rxns_id, start=1):
            if verbose: print("%s/%s Copy %s" %(count, len(rxns_id), rxn_id))
            # A TypeError anywhere in this body is reported as the reaction
            # being absent from the reference (scope kept as it was).
            try:
                padmet.copyNode(padmetRef, rxn_id)
                reconstructionData_id = rxn_id+"_reconstructionData_"+source
                if reconstructionData_id in padmet.dicOfNode and verbose:
                    print("Warning: The reaction %s seems to be already added from the same source %s" %(rxn_id, source))
                reconstructionData = {"SOURCE": [source], "TOOL": ["PATHWAYTOOLS"], "CATEGORY": ["ANNOTATION"]}
                reconstructionData_rlt = Relation(rxn_id, "has_reconstructionData", reconstructionData_id)
                padmet.dicOfNode[reconstructionData_id] = Node("reconstructionData", reconstructionData_id, reconstructionData)
                padmet._addRelation(reconstructionData_rlt)
            except TypeError:
                print("%s not in padmetRef" %(rxn_id))

        if extract_gene:
            _parse_gene_associations(genes_file, proteins_file, enzrxns_file, padmet, source, verbose)
    else:
        POLICY_IN_ARRAY = [
            ['class','is_a_class','class'], ['class','has_name','name'], ['class','has_xref','xref'], ['class','has_suppData','suppData'],
            ['compound','is_a_class','class'], ['compound','has_name','name'], ['compound','has_xref','xref'], ['compound','has_suppData','suppData'],
            ['gene','is_a_class','class'], ['gene','has_name','name'], ['gene','has_xref','xref'], ['gene','has_suppData','suppData'], ['gene','codes_for','protein'],
            ['pathway','is_a_class','class'], ['pathway','has_name','name'], ['pathway','has_xref','xref'], ['pathway','is_in_pathway','pathway'],
            ['protein','is_a_class','class'], ['protein','has_name','name'], ['protein','has_xref','xref'], ['protein','has_suppData','suppData'], ['protein','catalyses','reaction'],
            ['protein','is_in_species','class'],
            ['reaction','is_a_class','class'], ['reaction','has_name','name'], ['reaction','has_xref','xref'], ['reaction','has_suppData','suppData'], ['reaction','has_reconstructionData','reconstructionData'], ['reaction','is_in_pathway','pathway'],
            ['reaction','consumes','class','STOICHIOMETRY','X','COMPARTMENT','Y'], ['reaction','produces','class','STOICHIOMETRY','X','COMPARTMENT','Y'],
            ['reaction','consumes','compound','STOICHIOMETRY','X','COMPARTMENT','Y'], ['reaction','produces','compound','STOICHIOMETRY','X','COMPARTMENT','Y'],
            ['reaction','consumes','protein','STOICHIOMETRY','X','COMPARTMENT','Y'], ['reaction','produces','protein','STOICHIOMETRY','X','COMPARTMENT','Y'],
            ['reaction','is_linked_to','gene','SOURCE:ASSIGNMENT','X:Y'],
        ]
        dbNotes = {"PADMET": {"creation": today_date, "version": "2.6"}, "DB_info": {"DB": db, "version": version}}
        padmet = PadmetRef()
        if verbose: print("setting policy")
        padmet.setPolicy(POLICY_IN_ARRAY)
        if verbose: print("setting dbInfo")
        padmet.setInfo(dbNotes)

        if verbose: print("parsing classes")
        classes_parser(classes_file, padmet, verbose)

        if verbose: print("parsing compounds")
        compounds_parser(compounds_file, padmet, verbose)

        if verbose: print("parsing reactions")
        reactions_parser(reactions_file, padmet, extract_gene, source, verbose)

        if verbose: print("parsing pathways")
        pathways_parser(pathways_file, padmet, verbose)

        if extract_gene:
            _parse_gene_associations(genes_file, proteins_file, enzrxns_file, padmet, source, verbose)

        # NOTE(review): the enhancement step is applied only when building
        # from the PGDB files (no padmetRef) - confirm this nesting is intended.
        if metabolic_reactions is not None:
            if verbose: print("enhancing db from metabolic-reactions.xml")
            padmet = enhance_db(metabolic_reactions, padmet, extract_gene, verbose)

    # Index the relations accumulated in the global list by both endpoints.
    for rlt in list_of_relation:
        padmet.dicOfRelationIn.setdefault(rlt.id_in, []).append(rlt)
        padmet.dicOfRelationOut.setdefault(rlt.id_out, []).append(rlt)

    if extract_gene and no_orphan:
        all_reactions = [node for node in padmet.dicOfNode.values() if node.type == "reaction"]
        # A reaction without any incoming-relation entry is treated as having
        # no gene association instead of raising KeyError.
        rxn_to_del = [
            r for r in all_reactions
            if not any(rlt.type == "is_linked_to" for rlt in padmet.dicOfRelationIn.get(r.id, []))
        ]
        for rxn in rxn_to_del:
            padmet.delNode(rxn.id)
        if verbose: print("%s/%s orphan reactions (without gene association) deleted" %(len(rxn_to_del), len(all_reactions)))

        # Computed after the reaction deletion so genes that were only linked
        # to removed reactions are dropped as well.
        all_genes_linked = {rlt.id_out for rlt in padmet.getAllRelation() if rlt.type == "is_linked_to"}
        all_genes = {node.id for node in padmet.dicOfNode.values() if node.type == "gene"}
        orphan_genes = all_genes - all_genes_linked
        for gene_id in orphan_genes:
            padmet.dicOfNode.pop(gene_id)
        if verbose: print("%s/%s orphan genes (not linked to any reactions) deleted" %(len(orphan_genes), len(all_genes)))

    # Snapshot the ids first: nodes are deleted while looping.
    rxns = [node.id for node in padmet.dicOfNode.values() if node.type == "reaction"]
    for rxn_id in rxns:
        cp_rlts = {rlt.type for rlt in padmet.dicOfRelationIn.get(rxn_id, []) if rlt.type in ("consumes", "produces")}
        # Exactly one of the two relation types present: one-sided reaction.
        if len(cp_rlts) == 1:
            print("rxn only consume or produce, transport ???: %s" %rxn_id)
            padmet.delNode(rxn_id)

    return padmet


def _parse_gene_associations(genes_file, proteins_file, enzrxns_file, padmet, source, verbose):
    """Run the genes, proteins and enzrxns parsers, in that order, on ``padmet``."""
    if verbose: print("parsing genes")
    map_gene_ids = genes_parser(genes_file, padmet, verbose)

    if verbose: print("parsing proteins")
    dict_protein_gene_id = proteins_parser(proteins_file, padmet, verbose)
    mapped_dict_protein_gene_id = map_gene_id(dict_protein_gene_id, map_gene_ids)

    if verbose: print("parsing association enzrxns")
    enzrxns_parser(enzrxns_file, padmet, mapped_dict_protein_gene_id, source, verbose)