def pgdb_to_sbml(pgdb_dir, output_dir, noorphan_bool, padmet_bool, sbml_level, cpu):
    """Turn Pathway Tools PGDBs into SBML2 files using Padmet

    Args:
        pgdb_dir (str): PGDB directory
        output_dir (str): results directory
        noorphan_bool (bool): ignores orphan reactions if True
        padmet_bool (bool): also create padmet files if True
        sbml_level (int): SBML level
        cpu (int): number of CPU for multi-process

    Returns:
        sbml_dir (str): SBML directory if successful
    """
    logger.info('######### Creating SBML files #########')
    sbml_dir = os.path.join(output_dir, 'sbml')
    if padmet_bool:
        padmet_dir = os.path.join(output_dir, 'padmet')
        if not utils.is_valid_dir(padmet_dir):
            logger.critical('Impossible to access/create output directory')
            sys.exit(1)
    if not utils.is_valid_dir(sbml_dir):
        logger.critical('Impossible to access/create output directory')
        sys.exit(1)

    pgdb_to_sbml_pool = Pool(processes=cpu)

    multiprocess_data = []
    for species in os.listdir(pgdb_dir):
        pgdb_species_path = os.path.join(pgdb_dir, species)
        sbml_species_path = os.path.join(sbml_dir, species + '.sbml')
        if padmet_bool:
            # Bug fix: only build the padmet path when padmet output is
            # requested — padmet_dir is undefined otherwise, which previously
            # raised a NameError whenever padmet_bool was False.
            padmet_species_path = os.path.join(padmet_dir, species + '.padmet')
            multiprocess_data.append([
                pgdb_species_path, sbml_species_path, sbml_level,
                noorphan_bool, padmet_species_path
            ])
        else:
            multiprocess_data.append([
                pgdb_species_path, sbml_species_path, sbml_level,
                noorphan_bool, padmet_bool
            ])
    sbml_checks = pgdb_to_sbml_pool.map(run_pgdb_to_sbml, multiprocess_data)

    pgdb_to_sbml_pool.close()
    pgdb_to_sbml_pool.join()

    # Every worker must report success; otherwise abort the whole run.
    if all(sbml_checks):
        return sbml_dir
    else:
        logger.critical('Error during padmet/sbml creation.')
        sys.exit(1)
def instance_community(sbml_dir, seeds, output_dir, targets_file=None, host_mn=None):
    """Create ASP instance for community analysis.

    Args:
        sbml_dir (str): directory of symbionts SBML files
        seeds (str): seeds SBML file
        output_dir (str): directory for results
        targets_file (str): targets file
        host_mn (str): metabolic network file for host

    Returns:
        str: instance filepath
    """
    logger.info(
        "######### Creating metabolic instance for the whole community #########"
    )
    miscoto_dir = os.path.join(output_dir, 'community_analysis')
    if not utils.is_valid_dir(miscoto_dir):
        logger.critical("Impossible to access/create output directory")
        sys.exit(1)

    # Create a named tempfile in the results directory. run_instance only
    # needs the path, so close the low-level descriptor returned by mkstemp
    # right away — previously it was leaked.
    fd, outputfile = tempfile.mkstemp(suffix='.lp', prefix='miscoto_',
                                      dir=miscoto_dir)
    os.close(fd)

    instance_filepath = run_instance(
        bacteria_dir=sbml_dir,
        seeds_file=seeds,
        host_file=host_mn,
        targets_file=targets_file,
        output=outputfile)

    logger.info("Created instance in " + instance_filepath)
    return instance_filepath
def comm_scope_run(instance, output_dir, host_mn=None):
    """Run Miscoto_scope and analyse community metabolic capabilities

    Args:
        instance (str): instance filepath
        output_dir (str): directory for results
        host_mn (str): metabolic network file for host

    Returns:
        set: microbiota scope
    """
    results_dir = os.path.join(output_dir, 'community_analysis')
    scopes_json = os.path.join(results_dir, 'comm_scopes.json')
    if not utils.is_valid_dir(results_dir):
        logger.critical('Impossible to access/create output directory')
        sys.exit(1)

    scopes = run_scopes(lp_instance_file=instance)

    # Without a host, drop every host-related entry from the miscoto output
    # so the JSON only describes the community itself.
    if host_mn is None:
        for host_key in ('host_prodtargets', 'host_unprodtargets',
                         'host_scope', 'comhost_scope'):
            del scopes[host_key]

    with open(scopes_json, 'w') as json_out:
        json.dump(scopes, json_out, indent=4)

    logger.info('Community scopes for all metabolic networks available in ' +
                scopes_json)
    return set(scopes['com_scope'])
def addedvalue(iscope_rm, cscope_rm, out_dir):
    """Compute the added value of considering interaction with microbiota
    metabolism rather than individual metabolisms.

    Args:
        iscope_rm (set): union of metabolites in all individual scopes
        cscope_rm (set): metabolites reachable by community/microbiota
        out_dir (str): results directory

    Returns:
        set: set of metabolites that can only be reached by a community
    """
    # Community targets = what can be produced only if cooperation occurs between species
    newtargets = cscope_rm - iscope_rm
    logger.info("\nAdded value of cooperation over individual metabolism: " +
                str(len(newtargets)) + " newly reachable metabolites: \n")
    logger.info('\n'.join(newtargets))
    logger.info("\n")
    miscoto_dir = os.path.join(out_dir, "community_analysis")
    if not utils.is_valid_dir(miscoto_dir):
        logger.critical("Impossible to access/create output directory")
        sys.exit(1)

    addedvalue_json_path = os.path.join(miscoto_dir, "addedvalue.json")
    dict_av = {"addedvalue": list(newtargets)}
    with open(addedvalue_json_path, 'w') as dumpfile:
        # default= is a safety net for metabolite objects that are not
        # directly JSON-serializable.
        json.dump(dict_av, dumpfile, indent=4, default=lambda x: x.__dict__)
    logger.info(
        f"Added-value of cooperation written in {addedvalue_json_path}")
    return newtargets
def check_sbml(inpt, outdir, folder=True):
    """Check whether one or several SBML level 3 files are in directory.
    If yes, convert them into a new directory and copy the SBML files that
    are correct into this same directory.

    Args:
        inpt (str): SBML files directory (or a single SBML file when
            ``folder`` is False)
        outdir (str): Results directory
        folder (bool): Defaults to True. Changes function behavior depending
            on whether the input is a file or a folder

    Returns:
        str: filepath of file or directory, same as input if all SBMLs are level2
    """
    if not folder:
        # Single-file mode: validate, then convert only if needed.
        if not utils.is_valid_file(inpt):
            logger.critical(inpt + " is not a correct filepath")
            sys.exit(1)
        if sbml_management.get_sbml_level(inpt) != 2:
            newsbml = outdir + '/' + utils.get_basename(inpt) + "_lvl2.sbml"
            logger.warning(inpt + " was not in a suitable level for analysis. A converted file is created in " + newsbml)
            sbml_management.transform_sbml_lvl(inpt, newsbml, 2)
            return newsbml
        return inpt

    # Folder mode: inspect every .xml/.sbml file in the directory.
    candidate_files = [
        fname for fname in os.listdir(inpt)
        if os.path.isfile(os.path.join(inpt, fname)) and utils.get_extension(
            os.path.join(inpt, fname)).lower() in ["xml", "sbml"]
    ]
    sbml_levels = {
        fname: sbml_management.get_sbml_level(os.path.join(inpt, fname))
        for fname in candidate_files
    }
    if all(level == 2 for level in sbml_levels.values()):
        # Everything already level 2: nothing to convert.
        return inpt

    sbml_dir = outdir + "/new_sbml/"
    logger.warning(
        "At least one SBML has not a suitable level for the tools. They will be transformed and created in "
        + sbml_dir + ". The others will be copied in this directory")
    if not utils.is_valid_dir(sbml_dir):
        logger.critical("Impossible to write in output directory")
        sys.exit(1)
    for fname in candidate_files:
        if sbml_levels[fname] != 2:
            # Create a level 2 version of this SBML in sbml_dir.
            sbml_management.transform_sbml_lvl(
                os.path.join(inpt, fname), os.path.join(sbml_dir, fname), 2)
        else:
            # Already level 2: copy the original unchanged.
            copyfile(os.path.join(inpt, fname), os.path.join(sbml_dir, fname))
    return sbml_dir
def indiv_scope_run(sbml_dir, seeds, output_dir):
    """Run Menetools and analyse individual metabolic capabilities.

    Args:
        sbml_dir (str): directory of SBML files
        seeds (str): SBML seeds file
        output_dir (str): directory for results

    Returns:
        str: output file for Menetools analysis
    """
    logger.info('######### Running individual metabolic scopes #########')
    menetools_dir = os.path.join(output_dir, 'indiv_scopes')
    indiv_scopes_path = os.path.join(menetools_dir, 'indiv_scopes.json')
    if not utils.is_valid_dir(menetools_dir):
        logger.critical('Impossible to access/create output directory')
        sys.exit(1)

    all_files = [
        f for f in os.listdir(sbml_dir)
        if os.path.isfile(os.path.join(sbml_dir, f)) and utils.get_extension(
            os.path.join(sbml_dir, f)).lower() in ['xml', 'sbml']
    ]
    all_scopes = {}
    for f in all_files:
        bname = utils.get_basename(f)
        try:
            all_scopes[bname] = run_menescope(
                draft_sbml=os.path.join(sbml_dir, f), seeds_sbml=seeds)
        # BaseException (not Exception) on purpose: menetools may raise
        # SystemExit, which the original bare except also caught.
        except BaseException:
            traceback_str = traceback.format_exc()
            # Don't print the traceback if the error is linked to SystemExit
            # as the error has been handled by menetools.
            if 'SystemExit: 1' not in traceback_str:
                logger.critical(traceback_str)
            # Bug fix: the filename was previously embedded literally inside
            # the message string instead of being concatenated.
            logger.critical(
                '---------------Something went wrong running Menetools on '
                + f + '---------------')
            sys.exit(1)

    with open(indiv_scopes_path, 'w') as dumpfile:
        json.dump(all_scopes, dumpfile, indent=4)

    return indiv_scopes_path
def enumeration_analysis(sbml_folder, target_folder_file, seed_file, output_dir, host_file=None):
    """Run miscoto enumeration on input data

    Args:
        sbml_folder (str): sbml directory
        target_folder_file (str): targets file or folder containing multiple sbmls
        seed_file (str): seeds file
        output_dir (str): results directory
        host_file (str): metabolic network file for host

    Returns:
        str: path to the directory holding one enumeration JSON per target
            (note: the per-target result dict built here is not returned)
    """
    starttime = time.time()

    target_paths = utils.file_or_folder(target_folder_file)

    output_jsons = os.path.join(output_dir, 'json')
    if not utils.is_valid_dir(output_jsons):
        logger.critical("Impossible to access/create output directory")
        sys.exit(1)

    miscoto_jsons = {}
    for target_path in target_paths:
        logger.info('######### Enumeration of solution for: ' +
                    target_path + ' #########')
        target_pathname = target_paths[target_path]
        output_json = os.path.join(output_jsons, target_path + '.json')
        # Resume support: skip any target whose enumeration JSON already exists.
        if os.path.exists(output_json):
            logger.info('######### Enumeration has already been done for ' +
                        target_path +
                        ', it will not be launched again. #########')
        else:
            miscoto_json = enumeration(sbml_folder, target_pathname,
                                       seed_file, output_json, host_file)
            miscoto_jsons[target_path] = miscoto_json
    logger.info("--- Enumeration runtime %.2f seconds ---\n" %
                (time.time() - starttime))

    return output_jsons
def comm_scope_run(instance, output_dir):
    """Run Miscoto_scope and analyse the community metabolic capabilities.

    Args:
        instance (str): instance filepath
        output_dir (str): directory for results

    Returns:
        set: microbiota scope
    """
    analysis_dir = output_dir + "/community_analysis"
    if not utils.is_valid_dir(analysis_dir):
        logger.critical("Impossible to access/create output directory")
        sys.exit(1)

    scopes = run_scopes(instance)
    scopes_json = analysis_dir + "/comm_scopes.json"
    with open(scopes_json, 'w') as out_file:
        json.dump(scopes, out_file, indent=4)

    logger.info("Community scopes for all metabolic networks available in " +
                scopes_json)
    return set(scopes['com_scope'])
def compute_mincom(instancefile, output_dir):
    """Run minimal community selection and analysis.

    Args:
        instancefile (str): filepath to instance file
        output_dir (str): directory with results

    Returns:
        dict: results of miscoto_mincom analysis
    """
    community_dir = output_dir + "/community_analysis"
    if not utils.is_valid_dir(community_dir):
        logger.critical("Impossible to access/create output directory")
        sys.exit(1)
    # Request one optimal solution plus the union and the intersection of
    # all minimal communities from miscoto.
    return run_mincom(option="soup",
                      lp_instance_file=instancefile,
                      optsol=True,
                      union=True,
                      intersection=True)
def create_gml(json_paths, target_paths, output_dir, taxon_file=None):
    """Create solution graph from miscoto output and compute stats

    For each target category, loads the miscoto enumeration JSON, writes
    keystone-species statistics, builds a co-occurrence graph of the
    enumerated communities (edge weight = number of solutions where the two
    species co-occur) and saves it as GML.

    Args:
        json_paths (str): {target: path_to_corresponding_json}
        target_paths (str): {target: path_to_corresponding_sbml}
        output_dir (str): results directory
        taxon_file (str): mpwt taxon file for species in sbml folder
    """
    miscoto_stat_output = output_dir + "/" + "miscoto_stats.txt"
    key_species_stats_output = output_dir + "/" + "keystone_species_stats.tsv"
    key_species_supdata_output = output_dir + "/" + "keystone_species_supdata.tsv"
    gml_output = output_dir + "/" + "gml" + "/"
    if not utils.is_valid_dir(gml_output):
        logger.critical("Impossible to access/create output directory")
        sys.exit(1)

    # Per-target-category counters collected while walking the JSON files.
    len_min_sol = {}
    len_union = {}
    len_intersection = {}
    len_solution = {}
    len_target = {}

    target_categories = {}
    for target in target_paths:
        target_categories[target] = sbml_management.get_compounds(
            target_paths[target])

    # With a taxon file, species are displayed by their phylum-derived names.
    if taxon_file:
        phylum_named_species, all_phylums = get_phylum(taxon_file)
    else:
        phylum_named_species = None
        all_phylums = None

    with open(key_species_stats_output, "w") as key_stats_file, open(
            key_species_supdata_output,
            "w") as key_sup_file, open(miscoto_stat_output,
                                       "w") as stats_output:
        keystone_stats_writer = csv.writer(key_stats_file, delimiter="\t")
        if all_phylums:
            keystone_stats_writer.writerow([
                "target_categories", "keystones_group", *sorted(all_phylums),
                "Sum"
            ])
        else:
            keystone_stats_writer.writerow(
                ["target_categories", "keystones_group", "data", "Sum"])
        keystone_sup_writer = csv.writer(key_sup_file, delimiter="\t")
        for target_category in target_categories:
            with open(json_paths[target_category]) as json_data:
                dicti = json.load(json_data)
            create_stat_species(target_category, dicti, keystone_stats_writer,
                                keystone_sup_writer, phylum_named_species,
                                all_phylums)
            G = nx.Graph()
            added_node = []
            # Edge weight bookkeeping: key is the sorted pair joined by "_".
            species_weight = {}
            if dicti["still_unprod"] != []:
                print("ERROR ", dicti["still_unprod"], " is unproducible")
            len_target[target_category] = len(dicti["newly_prod"]) + len(
                dicti["still_unprod"])
            len_min_sol[target_category] = len(dicti["bacteria"])
            len_union[target_category] = len(dicti["union_bacteria"])
            len_intersection[target_category] = len(dicti["inter_bacteria"])
            len_solution[target_category] = len(dicti["enum_bacteria"])
            # Every pair of species co-occurring in an enumerated community
            # becomes an edge; re-adding an edge updates its weight.
            for sol in dicti["enum_bacteria"]:
                for species_1, species_2 in combinations(
                        dicti["enum_bacteria"][sol], 2):
                    if species_1 not in added_node:
                        if taxon_file:
                            G.add_node(phylum_named_species[species_1])
                        else:
                            G.add_node(species_1)
                        added_node.append(species_1)
                    if species_2 not in added_node:
                        if taxon_file:
                            G.add_node(phylum_named_species[species_2])
                        else:
                            G.add_node(species_2)
                        added_node.append(species_2)
                    combination_species = "_".join(
                        sorted([species_1, species_2]))
                    if combination_species not in species_weight:
                        species_weight[combination_species] = 1
                    else:
                        species_weight[combination_species] += 1
                    if taxon_file:
                        G.add_edge(phylum_named_species[species_1],
                                   phylum_named_species[species_2],
                                   weight=species_weight[combination_species])
                    else:
                        G.add_edge(species_1,
                                   species_2,
                                   weight=species_weight[combination_species])

            # NOTE(review): the header row is re-written for every target
            # category since this runs inside the loop — confirm whether
            # per-category headers in miscoto_stats.txt are intended.
            statswriter = csv.writer(stats_output, delimiter="\t")
            statswriter.writerow([
                "categories", "nb_target", "size_min_sol", "size_union",
                "size_intersection", "size_enum"
            ])
            statswriter.writerow([
                target_category,
                str(len_target[target_category]),
                str(len_min_sol[target_category]),
                str(len_union[target_category]),
                str(len_intersection[target_category]),
                str(len_solution[target_category])
            ])
            logger.info('######### Graph of ' + target_category +
                        ' #########')
            logger.info('Number of nodes: ' + str(G.number_of_nodes()))
            logger.info('Number of edges: ' + str(G.number_of_edges()))
            # gml_output already ends with "/" so the path contains "//";
            # harmless on POSIX filesystems.
            nx.write_gml(G, gml_output + "/" + target_category + ".gml")
def main(): """Run programm """ start_time = time.time() parser = argparse.ArgumentParser( "m2m_analysis", description=MESSAGE + " For specific help on each subcommand use: m2m_analysis {cmd} --help", epilog=REQUIRES, formatter_class=argparse.RawTextHelpFormatter ) parser.add_argument( "-v", "--version", action="version", version="%(prog)s " + VERSION + "\n" + LICENSE) # parent parser parent_parser_q = argparse.ArgumentParser(add_help=False) parent_parser_q.add_argument( "-q", "--quiet", dest="quiet", help="quiet mode", required=False, action="store_true", default=None, ) parent_parser_c = argparse.ArgumentParser(add_help=False) parent_parser_c.add_argument( "-c", "--cpu", help="cpu number for multi-process", required=False, type=int, default=1) parent_parser_o = argparse.ArgumentParser(add_help=False) parent_parser_o.add_argument( "-o", "--out", dest="out", required=True, help="output directory path", metavar="OUPUT_DIR") parent_parser_s = argparse.ArgumentParser(add_help=False) parent_parser_s.add_argument( "-s", "--seeds", help="seeds (growth medium) for metabolic analysis", required=True) parent_parser_n = argparse.ArgumentParser(add_help=False) parent_parser_n.add_argument( "-n", "--networksdir", metavar="NETWORKS_DIR", help="Metabolic networks directory", required=True) parent_parser_t = argparse.ArgumentParser(add_help=False) parent_parser_t.add_argument( "-t", "--targets", metavar="TARGETS_DIR_OR_FILE", help="Folder containg sbml targets or single sbml file for metabolic analysis", required=True) parent_parser_m = argparse.ArgumentParser(add_help=False) parent_parser_m.add_argument( "-m", "--modelhost", help="Host metabolic model for community analysis", required=False, default=None) parent_parser_taxon = argparse.ArgumentParser(add_help=False) parent_parser_taxon.add_argument( "--taxon", help="Mpwt taxon file", required=False, default=None) parent_parser_j = argparse.ArgumentParser(add_help=False) parent_parser_j.add_argument( "-j", "--json", 
metavar="JSON_DIR_OR_FILE", help="Folder containing JSON file of single JSON file containing miscoto enumeration results", required=False, type = str) parent_parser_g = argparse.ArgumentParser(add_help=False) parent_parser_g.add_argument( "-g", "--graph", metavar="GML_DIR_OR_FILE", help="Folder containing Graph GML file or single GML file", required=True) parent_parser_jar = argparse.ArgumentParser(add_help=False) parent_parser_jar.add_argument( "--oog", help="OOG jar file for powergraph", required=True, type=str) # subparsers subparsers = parser.add_subparsers( title='subcommands', description='valid subcommands:', dest="cmd") enum_parser = subparsers.add_parser( "enum", help="enumeration using miscoto", parents=[ parent_parser_s, parent_parser_n, parent_parser_t, parent_parser_m, parent_parser_o, parent_parser_q ], description= "Run miscoto enumeration on sbml species with seeds and targets" ) stat_parser = subparsers.add_parser( "stats", help="statistics on keystone species", parents=[ parent_parser_j, parent_parser_o, parent_parser_taxon, parent_parser_q ], description= "Compute statistics on keystone species in the community" ) graph_parser = subparsers.add_parser( "graph", help="graph creation with enumeration solution", parents=[ parent_parser_j, parent_parser_o, parent_parser_t, parent_parser_taxon, parent_parser_q ], description="Create the solution graph using the JSON from miscoto enumeration") powergraph_parser = subparsers.add_parser( "powergraph", help="powergraph creation and visualization", parents=[ parent_parser_g, parent_parser_o, parent_parser_jar, parent_parser_q ], description= "Compress the GMl graph of solution and create a powergraph (bbl) and a svg of the graph" ) wkf_parser = subparsers.add_parser( "workflow", help="whole workflow", parents=[ parent_parser_s, parent_parser_n, parent_parser_t, parent_parser_m, parent_parser_o, parent_parser_jar, parent_parser_taxon, parent_parser_q ], description= "Run the whole workflow: miscoto 
enumeration, statistics on keystone species, graph on solution and powergraph creation" ) args = parser.parse_args() # If no argument print the help. if len(sys.argv) == 1: parser.print_help() sys.exit(1) # set up the logger if args.quiet: root_logger.setLevel(logging.CRITICAL) else: root_logger.setLevel(logging.INFO) # test writing in out_directory if a subcommand is given else print version and help if args.cmd: if not utils.is_valid_dir(args.out): logger.critical("Impossible to access/create output directory") sys.exit(1) else: logger.info("m2m_analysis " + VERSION + "\n" + LICENSE) parser.print_help() sys.exit() #if modelhost is given as an arg: check the SBML level and turn it into 2 if needed if args.cmd in ["workflow", "enum"]: if not os.path.isdir(args.networksdir): logger.critical(args.networksdir + " is not a correct directory path") sys.exit(1) network_dir = check_sbml(args.networksdir, args.out) if not utils.is_valid_file(args.seeds): logger.critical(args.seeds + " is not a correct filepath") sys.exit(1) if not utils.is_valid_file(args.targets) and not utils.is_valid_dir(args.targets): logger.critical(args.targets + " is not a correct filepath") sys.exit(1) if args.modelhost: new_arg_modelhost = check_sbml(args.modelhost, args.out, folder=False) else: new_arg_modelhost = None # deal with given subcommand if args.cmd == "workflow": main_analysis_workflow(network_dir, args.targets, args.seeds, args.out, args.taxon, args.oog, new_arg_modelhost) elif args.cmd == "enum": main_enumeration(network_dir, args.targets, args.seeds, args.out, new_arg_modelhost) elif args.cmd == "stats": main_stat(args.json, args.out, args.taxon) elif args.cmd == "graph": main_graph(args.json, args.targets, args.out, args.taxon) elif args.cmd == "powergraph": main_powergraph(args.graph, args.out, args.oog) logger.info("--- Total runtime %.2f seconds ---" % (time.time() - start_time))
def mincom(instance_w_targets, out_dir):
    """Compute minimal community selection and show analyses.

    Runs miscoto mincom on the instance, then logs: producible and
    unproducible targets, one minimal community, the union (key species),
    the intersection (essential symbionts) and the difference between the
    two (alternative symbionts).

    Args:
        instance_w_targets (str): ASP instance filepath
        out_dir (str): results directory
    """
    starttime = time.time()
    miscoto_dir = os.path.join(out_dir, 'community_analysis')
    miscoto_mincom_path = os.path.join(miscoto_dir, 'mincom.json')
    if not utils.is_valid_dir(miscoto_dir):
        logger.critical('Impossible to access/create output directory')
        sys.exit(1)
    # Compute community selection
    logger.info('Running minimal community selection')
    all_results = compute_mincom(instance_w_targets, miscoto_dir)
    # Convert every value (sets from miscoto) to a list in place, mainly so
    # the results can be joined/serialised below.
    for key in all_results:
        all_results[key] = list(all_results[key])
    producible_targets = all_results['producible']
    unproducible_targets = all_results['still_unprod']
    logger.info('\nIn the initial and minimal communities ' +
                str(len(producible_targets)) +
                ' targets are producible and ' +
                str(len(unproducible_targets)) + ' remain unproducible.')
    logger.info('\n' + str(len(producible_targets)) + ' producible targets:')
    logger.info('\n'.join(producible_targets))
    logger.info('\n' + str(len(unproducible_targets)) +
                ' still unproducible targets:')
    logger.info('\n'.join(unproducible_targets))
    logger.info(
        f'\nMinimal communities are available in {miscoto_mincom_path} \n')
    # Give one solution
    one_sol_bact = []
    for bact in all_results['bacteria']:
        one_sol_bact.append(bact)
    logger.info('######### One minimal community #########')
    logger.info(
        '# One minimal community enabling the producibility of the target metabolites given as inputs'
    )
    logger.info('Minimal number of bacteria in communities => ' +
                str(len(one_sol_bact)) + '\n')
    logger.info("\n".join(one_sol_bact))
    # Give union of solutions
    union = all_results['union_bacteria']
    logger.info(
        '######### Key species: Union of minimal communities #########')
    logger.info(
        '# Bacteria occurring in at least one minimal community enabling the producibility of the target metabolites given as inputs'
    )
    logger.info('Number of key species => ' + str(len(union)) + "\n")
    logger.info("\n".join(union))
    # Give intersection of solutions
    intersection = all_results['inter_bacteria']
    logger.info(
        '######### Essential symbionts: Intersection of minimal communities #########'
    )
    logger.info(
        '# Bacteria occurring in ALL minimal communities enabling the producibility of the target metabolites given as inputs'
    )
    logger.info('Number of essential symbionts => ' +
                str(len(intersection)) + "\n")
    logger.info("\n".join(intersection))
    # Give key species, essential and alternative symbionts
    alternative_symbionts = list(set(union) - set(intersection))
    logger.info(
        '######### Alternative symbionts: Difference between Union and Intersection #########'
    )
    logger.info(
        '# Bacteria occurring in at least one minimal community but not all minimal communities enabling the producibility of the target metabolites given as inputs'
    )
    logger.info('Number of alternative symbionts => ' +
                str(len(alternative_symbionts)) + '\n')
    logger.info('\n'.join(alternative_symbionts))
    logger.info('\n--- Mincom runtime %.2f seconds ---\n' %
                (time.time() - starttime))
def main(): """Run programm """ start_time = time.time() parser = argparse.ArgumentParser( "m2m_analysis", description=MESSAGE + " For specific help on each subcommand use: m2m_analysis {cmd} --help", epilog=REQUIRES, formatter_class=argparse.RawTextHelpFormatter) parser.add_argument("-v", "--version", action="version", version="%(prog)s " + VERSION + "\n" + LICENSE) # parent parser parent_parser_q = argparse.ArgumentParser(add_help=False) parent_parser_q.add_argument( "-q", "--quiet", dest="quiet", help="quiet mode", required=False, action="store_true", default=None, ) parent_parser_c = argparse.ArgumentParser(add_help=False) parent_parser_c.add_argument("-c", "--cpu", help="cpu number for multi-process", required=False, type=int, default=1) parent_parser_o = argparse.ArgumentParser(add_help=False) parent_parser_o.add_argument("-o", "--out", dest="out", required=True, help="output directory path", metavar="OUPUT_DIR") parent_parser_s = argparse.ArgumentParser(add_help=False) parent_parser_s.add_argument( "-s", "--seeds", help="seeds (growth medium) for metabolic analysis", required=True) parent_parser_n = argparse.ArgumentParser(add_help=False) parent_parser_n.add_argument("-n", "--networksdir", metavar="NETWORKS_DIR", help="Metabolic networks directory", required=True) parent_parser_t = argparse.ArgumentParser(add_help=False) parent_parser_t.add_argument( "-t", "--targets", metavar="TARGETS_DIR_OR_FILE", help= "Folder containg sbml targets or single sbml file for metabolic analysis", required=True) parent_parser_m = argparse.ArgumentParser(add_help=False) parent_parser_m.add_argument( "-m", "--modelhost", help="Host metabolic model for community analysis", required=False, default=None) parent_parser_taxon = argparse.ArgumentParser(add_help=False) parent_parser_taxon.add_argument("--taxon", help="Mpwt taxon file", required=False, default=None) parent_parser_j = argparse.ArgumentParser(add_help=False) parent_parser_j.add_argument( "-j", "--json", 
metavar="JSON_DIR_OR_FILE", help= "Folder containing JSON files of single JSON file containing miscoto enumeration results", required=True, type=str) parent_parser_g = argparse.ArgumentParser(add_help=False) parent_parser_g.add_argument( "-g", "--gml", metavar="GML_DIR_OR_FILE", help= "Folder containing GML files of single GML file containing m2m_analysis graph results", required=True) parent_parser_jar = argparse.ArgumentParser(add_help=False) parent_parser_jar.add_argument( "--oog", help= "OOG jar file for powergraph svg creation using Power Graph Command Line Tool", required=False, type=str) parent_parser_level = argparse.ArgumentParser(add_help=False) parent_parser_level.add_argument( "--level", help= "Taxonomy level, must be: phylum, class, order, family, genus or species. By default, it is phylum.", required=False, type=str) # subparsers subparsers = parser.add_subparsers(title='subcommands', description='valid subcommands:', dest="cmd") enum_parser = subparsers.add_parser( "enum", help="enumeration using miscoto", parents=[ parent_parser_s, parent_parser_n, parent_parser_t, parent_parser_m, parent_parser_o, parent_parser_q ], description= "Run miscoto enumeration on sbml species with seeds and targets") graph_parser = subparsers.add_parser( "graph", help="graph creation with enumeration solution", parents=[ parent_parser_j, parent_parser_o, parent_parser_t, parent_parser_taxon, parent_parser_q, parent_parser_level ], description= "Create the solution graph using the JSON from miscoto enumeration") powergraph_parser = subparsers.add_parser( "powergraph", help="powergraph creation and visualization", parents=[ parent_parser_g, parent_parser_jar, parent_parser_q, parent_parser_taxon, parent_parser_level, parent_parser_o ], description= "Compress the GMl graph of solution and create a powergraph (bbl), a website format of the powergraph and a svg of the graph (if you use the --oog option)" ) wkf_parser = subparsers.add_parser( "workflow", help="whole workflow", 
parents=[ parent_parser_s, parent_parser_n, parent_parser_t, parent_parser_m, parent_parser_o, parent_parser_jar, parent_parser_taxon, parent_parser_q, parent_parser_level ], description= "Run the whole workflow: miscoto enumeration, graph on solution and powergraph creation" ) args = parser.parse_args() # If no argument print the help. if len(sys.argv) == 1: parser.print_help() sys.exit(1) # set up the logger if args.quiet: logger.setLevel(logging.CRITICAL) else: logger.setLevel(logging.INFO) # test writing in out_directory if a subcommand is given else print version and help if args.cmd: if not utils.is_valid_dir(args.out): logger.critical("Impossible to access/create output directory") sys.exit(1) else: logger.info("m2m_analysis " + VERSION + "\n" + LICENSE) parser.print_help() sys.exit() # add logger in file formatter = logging.Formatter('%(message)s') log_file_path = os.path.join(args.out, f'm2m_analysis_{args.cmd}.log') file_handler = logging.FileHandler(log_file_path, 'w+') file_handler.setLevel(logging.INFO) file_handler.setFormatter(formatter) logger.addHandler(file_handler) # set up the default console logger console_handler = logging.StreamHandler(sys.stdout) console_handler.setLevel(logging.INFO) console_handler.setFormatter(formatter) if args.quiet: console_handler.setLevel(logging.WARNING) logger.addHandler(console_handler) # Check Oog.jar file if args.cmd in ["workflow", "powergraph"]: if args.oog: check_oog_jar_file(args.oog) #if modelhost is given as an arg: check the SBML level and turn it into 2 if needed if args.cmd in ["workflow", "enum"]: if not os.path.isdir(args.networksdir): logger.critical(args.networksdir + " is not a correct directory path") sys.exit(1) network_dir = args.networksdir if not utils.is_valid_file(args.seeds): logger.critical(args.seeds + " is not a correct filepath") sys.exit(1) if not utils.is_valid_file(args.targets) and not utils.is_valid_dir( args.targets): logger.critical(args.targets + " is not a correct filepath") 
sys.exit(1) if args.modelhost: new_arg_modelhost = args.modelhost else: new_arg_modelhost = None if args.cmd in ["workflow", "graph", "powergraph"]: if args.level: if args.level not in [ 'phylum', 'class', 'order', 'family', 'genus', 'species' ]: logger.critical( "Error with --level arugment, it must be one among: phylum, class, order, family, genus or species" ) sys.exit(1) if args.level is None: args.level = 'phylum' # deal with given subcommand if args.cmd == "workflow": main_analysis_workflow(network_dir, args.targets, args.seeds, args.out, args.taxon, args.oog, new_arg_modelhost, args.level) elif args.cmd == "enum": main_enumeration(network_dir, args.targets, args.seeds, args.out, new_arg_modelhost) elif args.cmd == "graph": main_graph(args.json, args.targets, args.out, args.taxon, args.level) elif args.cmd == "powergraph": main_powergraph(args.gml, args.out, args.oog, args.taxon, args.level) logger.info("--- Total runtime %.2f seconds ---" % (time.time() - start_time))
def genomes_to_pgdb(genomes_dir, output_dir, cpu, clean):
    """Run Pathway Tools on each genome of the repository

    Checks that Pathway Tools, blastp and ~/.ncbirc are available, optionally
    cleans previously built PGDBs, then runs mpwt's multiprocess_pwt over all
    genomes. Already-built PGDBs are reused unless --clean is given.

    Args:
        genomes_dir (str): genome repository
        output_dir (str): output repository
        cpu (int): number of CPUs to use
        clean (bool): delete PGDBs in ptools-local coresponding to the input data

    Returns:
        pgdb_dir (str): pgdb repository
    """
    logger.info(
        "######### Running metabolic network reconstruction with Pathway Tools #########"
    )
    if not os.path.isdir(genomes_dir):
        logger.critical("Genomes directory path does not exist.")
        sys.exit(1)

    pgdb_dir = output_dir + "/pgdb"
    log_dir = output_dir + "/pgdb_log"
    if not utils.is_valid_dir(pgdb_dir):
        logger.critical("Impossible to access/create output directory")
        sys.exit(1)

    # External tool prerequisites: Pathway Tools, blastp, and the ~/.ncbirc
    # config file that blast needs.
    if not utils.check_program("pathway-tools"):
        logger.critical(
            "Pathway Tools is not in the PATH, please fix it before using the program"
        )
        sys.exit(1)

    if not utils.check_program("blastp"):
        logger.critical(
            "blastp is not in the PATH, please fix it before using the program"
        )
        sys.exit(1)

    if not utils.is_valid_file(os.path.expanduser("~") + "/.ncbirc"):
        logger.critical(
            "No ~/.ncbirc file, please fix it before using the program")
        sys.exit(1)

    # Expected ptools-local PGDB names: lower-cased genome dir name + 'cyc'.
    genomes_pgdbs = [
        genome_dir.lower() + 'cyc' for genome_dir in os.listdir(genomes_dir)
    ]
    if clean:
        remove_pgdbs(to_delete_pgdbs=genomes_pgdbs, number_cpu=cpu)
        cleaning_input(genomes_dir, verbose=False)

    # Check whether PGDBs are already created. If yes and not --clean, pursue without running ptools again
    # NOTE(review): the comprehension variable deliberately (?) shadows the
    # outer pgdb_dir; os.listdir(pgdb_dir) is evaluated first with the outer
    # value, so this works, but it is fragile — consider renaming.
    pgdb_dirs = [pgdb_dir.lower() + 'cyc' for pgdb_dir in os.listdir(pgdb_dir)]
    if set(pgdb_dirs) == set(genomes_pgdbs):
        logger.warning(
            "PGDBs are already created and will be used. To overrun them, run m2m with --clean option"
        )
        return pgdb_dir

    # taxon_file is passed to mpwt as a flag (True) when a taxon_id.tsv sits
    # in the genomes directory — presumably mpwt then locates the file
    # itself; confirm against mpwt's multiprocess_pwt API.
    taxon_file = None
    if 'taxon_id.tsv' in set(next(os.walk(genomes_dir))[2]):
        taxon_file = True

    multiprocess_pwt(genomes_dir,
                     pgdb_dir,
                     patho_inference=True,
                     patho_hole_filler=False,
                     patho_operon_predictor=False,
                     no_download_articles=False,
                     dat_creation=True,
                     dat_extraction=True,
                     size_reduction=False,
                     number_cpu=cpu,
                     taxon_file=taxon_file,
                     patho_log=log_dir,
                     verbose=False)

    # One PGDB output per input genome is expected; anything else means a
    # Pathway Tools failure.
    if len(os.listdir(pgdb_dir)) != len(os.listdir(genomes_dir)):
        if os.path.exists(log_dir + "/log_error.txt"):
            logger.critical(
                "Something went wrong running Pathway Tools. See the log file in "
                + log_dir + "/log_error.txt")
        else:
            logger.critical("Something went wrong running Pathway Tools.")
        sys.exit(1)

    return (pgdb_dir)
def indiv_scope_run(sbml_dir, seeds, output_dir, cpu_number=1):
    """Run Menetools and analyse individual metabolic capabilities.

    Args:
        sbml_dir (str): directory of SBML files
        seeds (str): SBML seeds file
        output_dir (str): directory for results
        cpu_number (int): number of CPU to use for multiprocessing

    Returns:
        str: path to output file for scope from Menetools analysis
    """
    logger.info('######### Running individual metabolic scopes #########')
    menetools_dir = os.path.join(output_dir, 'indiv_scopes')
    scopes_json_path = os.path.join(menetools_dir, 'indiv_scopes.json')
    produced_seeds_json_path = os.path.join(menetools_dir, 'indiv_produced_seeds.json')
    if not utils.is_valid_dir(menetools_dir):
        logger.critical('Impossible to access/create output directory')
        sys.exit(1)

    # Keep only plain files carrying an SBML-like extension.
    sbml_files = [
        name for name in os.listdir(sbml_dir)
        if os.path.isfile(os.path.join(sbml_dir, name))
        and utils.get_extension(os.path.join(sbml_dir, name)).lower() in ['xml', 'sbml']
    ]

    # One (sbml_path, basename, seeds) job per input network.
    jobs = [(os.path.join(sbml_dir, name), utils.get_basename(name), seeds)
            for name in sbml_files]

    scopes_by_species = {}
    produced_seeds_by_species = {}
    pool = Pool(cpu_number)
    outputs = pool.starmap(indiv_scope_on_species, jobs)
    for output in outputs:
        # Each worker result is (error_flag, basename, menescope_results).
        if output[0] is True:
            logger.critical(
                '------------An error occurred during M2M run of Menetools, M2M will stop-------------'
            )
            pool.close()
            pool.join()
            sys.exit(1)
        species_name = output[1]
        menescope_data = output[2]
        scopes_by_species[species_name] = menescope_data['scope']
        produced_seeds_by_species[species_name] = menescope_data['produced_seeds']
    pool.close()
    pool.join()

    with open(scopes_json_path, 'w') as dumpfile:
        json.dump(scopes_by_species, dumpfile, indent=4)

    with open(produced_seeds_json_path, 'w') as dumpfile:
        json.dump(produced_seeds_by_species, dumpfile, indent=4)

    return scopes_json_path
def powergraph_analysis(gml_input_file_folder, output_folder, oog_jar=None, taxon_file=None, taxonomy_level="phylum"):
    """Run the graph compression and picture creation

    Compresses each solution-graph GML file into a power graph (.bbl),
    renders it as interactive HTML, and (optionally, with ``oog_jar``)
    as SVG. With ``taxon_file``, a second taxon-coloured version of each
    visualisation is produced.

    Args:
        gml_input_file_folder (str): path to the gml folder or the gml file
        output_folder (str): path to the output folder
        oog_jar (str): path to OOG jar file
        taxon_file (str): mpwt taxon file for species
        taxonomy_level (str): taxonomy level, must be: phylum, class, order, family, genus or species.
    """
    starttime = time.time()

    # Mapping {target_name: gml_path}, from either a single file or a folder.
    gml_paths = utils.file_or_folder(gml_input_file_folder)

    bbl_path = os.path.join(output_folder, 'bbl')
    svg_path = os.path.join(output_folder, 'svg')
    html_output = os.path.join(output_folder, 'html')

    if not utils.is_valid_dir(bbl_path):
        logger.critical("Impossible to access/create output directory " + bbl_path)
        sys.exit(1)
    # The svg folder is only needed when an OOG jar is available for rendering.
    if oog_jar:
        if not utils.is_valid_dir(svg_path):
            logger.critical("Impossible to access/create output directory " + svg_path)
            sys.exit(1)
    if not utils.is_valid_dir(html_output):
        logger.critical("Impossible to access/create output directory " + html_output)
        sys.exit(1)

    # 26 colours from Alphabet project (minus white):
    # https://en.wikipedia.org/wiki/Help:Distinguishable_colors
    alphabet_project_distinct_hex_colors = ["#F0A3FF", "#0075DC", "#993F00",
    "#4C005C", "#191919", "#005C31", "#2BCE48", "#FFCC99", "#808080",
    "#94FFB5", "#8F7C00", "#9DCC00", "#C20088", "#003380", "#FFA405",
    "#FFA8BB", "#426600", "#FF0010", "#5EF1F2", "#00998F", "#E0FF66",
    "#740AFF", "#990000", "#FFFF80", "#FFFF00", "#FF5005"]

    # 269 colors from:
    # https://graphicdesign.stackexchange.com/a/3815
    hex_colors = ["#000000","#FFFF00","#1CE6FF","#FF34FF", "#FF4A46","#008941","#006FA6","#A30059","#FFDBE5","#7A4900",
    "#0000A6","#63FFAC","#B79762","#004D43","#8FB0FF","#997D87",
    "#5A0007","#809693","#FEFFE6","#1B4400","#4FC601","#3B5DFF",
    "#4A3B53","#FF2F80","#61615A","#BA0900","#6B7900","#00C2A0",
    "#FFAA92","#FF90C9","#B903AA","#D16100","#DDEFFF","#000035",
    "#7B4F4B","#A1C299","#300018","#0AA6D8","#013349","#00846F",
    "#372101","#FFB500","#C2FFED","#A079BF","#CC0744","#C0B9B2",
    "#C2FF99","#001E09","#00489C","#6F0062","#0CBD66","#EEC3FF",
    "#456D75","#B77B68","#7A87A1","#788D66","#885578","#FAD09F",
    "#FF8A9A","#D157A0","#BEC459","#456648","#0086ED","#886F4C",
    "#34362D","#B4A8BD","#00A6AA","#452C2C","#636375","#A3C8C9",
    "#FF913F","#938A81","#575329","#00FECF","#B05B6F","#8CD0FF",
    "#3B9700","#04F757","#C8A1A1","#1E6E00","#7900D7","#A77500",
    "#6367A9","#A05837","#6B002C","#772600","#D790FF","#9B9700",
    "#549E79","#FFF69F","#201625","#72418F","#BC23FF","#99ADC0",
    "#3A2465","#922329","#5B4534","#FDE8DC","#404E55","#0089A3",
    "#CB7E98","#A4E804","#324E72","#6A3A4C","#83AB58","#001C1E",
    "#D1F7CE","#004B28","#C8D0F6","#A3A489","#806C66","#222800",
    "#BF5650","#E83000","#66796D","#DA007C","#FF1A59","#8ADBB4",
    "#1E0200","#5B4E51","#C895C5","#320033","#FF6832","#66E1D3",
    "#CFCDAC","#D0AC94","#7ED379","#012C58","#7A7BFF","#D68E01",
    "#353339","#78AFA1","#FEB2C6","#75797C","#837393","#943A4D",
    "#B5F4FF","#D2DCD5","#9556BD","#6A714A","#001325","#02525F",
    "#0AA3F7","#E98176","#DBD5DD","#5EBCD1","#3D4F44","#7E6405",
    "#02684E","#962B75","#8D8546","#9695C5","#E773CE","#D86A78",
    "#3E89BE","#CA834E","#518A87","#5B113C","#55813B","#E704C4",
    "#00005F","#A97399","#4B8160","#59738A","#FF5DA7","#F7C9BF",
    "#643127","#513A01","#6B94AA","#51A058","#A45B02","#1D1702",
    "#E20027","#E7AB63","#4C6001","#9C6966","#64547B","#97979E",
    "#006A66","#391406","#F4D749","#0045D2","#006C31","#DDB6D0",
    "#7C6571","#9FB2A4","#00D891","#15A08A","#BC65E9","#FFFFFE",
    "#C6DC99","#203B3C","#671190","#6B3A64","#F5E1FF","#FFA0F2",
    "#CCAA35","#374527","#8BB400","#797868","#C6005A","#3B000A",
    "#C86240","#29607C","#402334","#7D5A44","#CCB87C","#B88183",
    "#AA5199","#B5D6C3","#A38469","#9F94F0","#A74571","#B894A6",
    "#71BB8C","#00B433","#789EC9","#6D80BA","#953F00","#5EFF03",
    "#E4FFFC","#1BE177","#BCB1E5","#76912F","#003109","#0060CD",
    "#D20096","#895563","#29201D","#5B3213","#A76F42","#89412E",
    "#1A3A2A","#494B5A","#A88C85","#F4ABAA","#A3F3AB","#00C6C8",
    "#EA8B66","#958A9F","#BDC9D2","#9FA064","#BE4700","#658188",
    "#83A485","#453C23","#47675D","#3A3F00","#061203","#DFFB71",
    "#868E7E","#98D058","#6C8F7D","#D7BFC2","#3C3E6E","#D83D66",
    "#2F5D9B","#6C5E46","#D25B88","#5B656C","#00B57F","#545C46",
    "#866097","#365D25","#252F99","#00CCFF","#674E60","#FC009C",
    "#92896B"]

    if taxon_file:
        taxonomy_output_file = os.path.join(output_folder, 'taxonomy_species.tsv')
        tree_output_file = os.path.join(output_folder, 'taxon_tree.txt')
        # The taxonomy extraction is cached: only run it if the tsv is absent.
        if not os.path.exists(taxonomy_output_file):
            extract_taxa(taxon_file, taxonomy_output_file, tree_output_file, taxonomy_level)
        taxon_species, all_taxons = get_taxon(taxonomy_output_file)
        # Assign one colour per taxon: the 26 maximally-distinguishable
        # colours when they suffice, otherwise the 269-colour palette.
        # NOTE(review): used_colors[index] raises IndexError if there are
        # more taxa than palette entries — confirm upstream bound.
        taxon_colors = {}
        if len(all_taxons) <= 26:
            used_colors = alphabet_project_distinct_hex_colors
        else:
            used_colors = hex_colors
        for index, taxon in enumerate(all_taxons):
            taxon_colors[taxon] = used_colors[index]

    for target_name in gml_paths:
        bbl_output = os.path.join(bbl_path, target_name + '.bbl')
        svg_file = os.path.join(svg_path, target_name + '.bbl.svg')
        html_target = os.path.join(html_output, target_name)
        if not utils.is_valid_dir(html_target):
            logger.critical("Impossible to access/create output directory " + html_target)
            sys.exit(1)

        # Compress gml file into a bbl file with PowerGrASP.
        gml_input_path = gml_paths[target_name]
        logger.info('######### Graph compression: ' + target_name + ' #########')
        compression(gml_input_path, bbl_output)
        logger.info('######### PowerGraph visualization: ' + target_name + ' #########')

        # Read gml file with networkx and extract the essential and
        # alternative symbionts using the note of each node (organism).
        graph = nx.read_gml(gml_input_path)
        essentials = [organism for organism in graph.nodes if graph.nodes[organism]['note'] == 'ES']
        alternatives = [organism for organism in graph.nodes if graph.nodes[organism]['note'] == 'AS']

        if taxon_file:
            # Node names are expected to be '<taxon>__<species>': the prefix
            # must match at least one known taxon, otherwise the gml was
            # built at a different taxonomy level than the tsv.
            key_species = essentials + alternatives
            taxon_key_species = [organism.split('__')[0] for organism in key_species]
            if len(set(all_taxons).intersection(set(taxon_key_species))) == 0:
                logger.critical('Difference of taxonomy level between gml file ('+gml_input_path+') compared to '+taxonomy_output_file+'.')
                sys.exit(1)

        bbl_to_html(bbl_output, html_target)
        if taxon_file:
            # Duplicate the html folder so one copy can be recoloured by taxon.
            if os.path.exists(html_target +'_taxon'):
                shutil.rmtree(html_target +'_taxon')
            shutil.copytree(html_target, html_target +'_taxon')
            update_js_taxonomy(html_target +'_taxon', taxon_colors)
            output_html_merged = os.path.join(html_output, target_name + '_powergraph_taxon.html')
            merge_html_css_js(html_target +'_taxon', output_html_merged)
        update_js(html_target, essentials, alternatives)
        output_html_merged = os.path.join(html_output, target_name + '_powergraph.html')
        merge_html_css_js(html_target, output_html_merged)

        if oog_jar:
            # Remove any stale SVG before re-rendering with OOG.
            svg_file = os.path.join(svg_path, target_name + '.bbl.svg')
            if os.path.exists(svg_file):
                os.remove(svg_file)
            bbl_to_svg(oog_jar, bbl_output, svg_path)
            if taxon_file:
                # Taxon-coloured SVG is produced from a copy of the fresh SVG.
                taxonomy_svg_file = os.path.join(svg_path, target_name + '_taxon.bbl.svg')
                if os.path.exists(taxonomy_svg_file):
                    os.remove(taxonomy_svg_file)
                shutil.copyfile(svg_file, taxonomy_svg_file)
                update_svg_taxonomy(taxonomy_svg_file, taxon_colors)
            update_svg(svg_file, essentials, alternatives)

    logger.info("--- Powergraph runtime %.2f seconds ---\n" % (time.time() - starttime))
def main():
    """Run the m2m command-line program.

    Parses the command line, sets up file and console logging, then
    dispatches to the entry point matching the chosen subcommand.
    """
    start_time = time.time()
    parser = argparse.ArgumentParser(
        "m2m",
        description=MESSAGE + " For specific help on each subcommand use: m2m {cmd} --help",
        epilog=REQUIRES)
    parser.add_argument("-v", "--version",
                        action="version",
                        version="%(prog)s " + VERSION + "\n" + LICENSE)
    # Parent parsers: each one holds a single reusable option that the
    # subcommands below compose via `parents=[...]`.
    parent_parser_q = argparse.ArgumentParser(add_help=False)
    parent_parser_q.add_argument(
        "-q", "--quiet",
        dest="quiet",
        help="quiet mode",
        required=False,
        action="store_true",
        default=None,
    )
    parent_parser_c = argparse.ArgumentParser(add_help=False)
    parent_parser_c.add_argument("-c", "--cpu",
                                 help="cpu number for multi-process",
                                 required=False,
                                 type=int,
                                 default=1)
    parent_parser_o = argparse.ArgumentParser(add_help=False)
    # FIX: metavar typo "OUPUT_DIR" -> "OUTPUT_DIR" (help display only).
    parent_parser_o.add_argument("-o", "--out",
                                 dest="out",
                                 required=True,
                                 help="output directory path",
                                 metavar="OUTPUT_DIR")
    parent_parser_no = argparse.ArgumentParser(add_help=False)
    parent_parser_no.add_argument(
        "--noorphan",
        help="use this option to ignore reactions without gene or protein association",
        required=False,
        action="store_true",
        default=False,
    )
    parent_parser_s = argparse.ArgumentParser(add_help=False)
    parent_parser_s.add_argument(
        "-s", "--seeds",
        help="seeds (growth medium) for metabolic analysis",
        required=True)
    parent_parser_n = argparse.ArgumentParser(add_help=False)
    parent_parser_n.add_argument("-n", "--networksdir",
                                 metavar="NETWORKS_DIR",
                                 help="metabolic networks directory",
                                 required=True)
    parent_parser_g = argparse.ArgumentParser(add_help=False)
    parent_parser_g.add_argument("-g", "--genomes",
                                 help="annotated genomes directory",
                                 required=True)
    parent_parser_cl = argparse.ArgumentParser(add_help=False)
    parent_parser_cl.add_argument("--clean",
                                  help="clean PGDBs if already present",
                                  required=False,
                                  action="store_true",
                                  default=None)
    parent_parser_m = argparse.ArgumentParser(add_help=False)
    parent_parser_m.add_argument(
        "-m", "--modelhost",
        help="host metabolic model for community analysis",
        required=False,
        default=None)
    parent_parser_l = argparse.ArgumentParser(add_help=False)
    parent_parser_l.add_argument("-l", "--level",
                                 help="Level for SBML creation, 2 or 3",
                                 required=False,
                                 type=int,
                                 choices=[2, 3],
                                 default=2)
    parent_parser_p = argparse.ArgumentParser(add_help=False)
    parent_parser_p.add_argument("-p", "--padmet",
                                 help="create padmet files",
                                 required=False,
                                 action="store_true",
                                 default=None)
    parent_parser_t_required = argparse.ArgumentParser(add_help=False)
    parent_parser_t_required.add_argument(
        "-t", "--targets",
        help="targets for metabolic analysis",
        required=True)
    parent_parser_t_optional = argparse.ArgumentParser(add_help=False)
    parent_parser_t_optional.add_argument(
        "-t", "--targets",
        help="Optional targets for metabolic analysis, if not used metage2metabo will use the addedvalue of the community",
        required=False)
    # subparsers
    subparsers = parser.add_subparsers(title='subcommands',
                                       description='valid subcommands:',
                                       dest="cmd")
    ptools_parser = subparsers.add_parser(
        "recon",
        help="metabolic network reconstruction",
        parents=[
            parent_parser_g, parent_parser_o, parent_parser_c, parent_parser_q,
            parent_parser_l, parent_parser_no, parent_parser_p, parent_parser_cl
        ],
        description="Run metabolic network reconstruction for each annotated genome of the input directory, using Pathway Tools"
    )
    indivscope_parser = subparsers.add_parser(
        "iscope",
        help="individual scope computation",
        parents=[
            parent_parser_n, parent_parser_s, parent_parser_o, parent_parser_q
        ],
        description="Compute individual scopes (reachable metabolites from seeds) for each metabolic network of the input directory"
    )
    comscope_parser = subparsers.add_parser(
        "cscope",
        help="community scope computation",
        parents=[
            parent_parser_n, parent_parser_s, parent_parser_o, parent_parser_m,
            parent_parser_q, parent_parser_t_optional
        ],
        description="Compute the community scope of all metabolic networks")
    added_value_parser = subparsers.add_parser(
        "addedvalue",
        help="added value of microbiota's metabolism over individual's",
        parents=[
            parent_parser_n, parent_parser_s, parent_parser_o, parent_parser_m,
            parent_parser_q
        ],
        description="Compute metabolites that are reachable by the community/microbiota and not by individual organisms"
    )
    # FIX: help typo "communtity" -> "community".
    mincom_parser = subparsers.add_parser(
        "mincom",
        help="minimal community selection",
        parents=[
            parent_parser_n, parent_parser_s, parent_parser_o, parent_parser_m,
            parent_parser_q, parent_parser_t_required
        ],
        description="Select minimal-size community to make reachable a set of metabolites")
    # FIX: description typo "starting for" -> "starting from".
    seeds_parser = subparsers.add_parser(
        "seeds",
        help="creation of seeds SBML file",
        parents=[parent_parser_o, parent_parser_q],
        description="Create a SBML file starting from a simple text file with metabolic compounds identifiers"
    )
    seeds_parser.add_argument(
        "--metabolites",
        help='metabolites file: one per line, encoded (XXX as in <species id="XXXX" .../> of SBML files)',
        required=True)
    wkf_parser = subparsers.add_parser(
        "workflow",
        help="whole workflow",
        parents=[
            parent_parser_g, parent_parser_s, parent_parser_m, parent_parser_o,
            parent_parser_c, parent_parser_q, parent_parser_no, parent_parser_p,
            parent_parser_t_optional, parent_parser_cl
        ],
        description="Run the whole workflow: metabolic network reconstruction, individual and community scope analysis and community selection"
    )
    metacom_parser = subparsers.add_parser(
        "metacom",
        help="whole metabolism community analysis",
        parents=[
            parent_parser_n, parent_parser_s, parent_parser_m, parent_parser_o,
            parent_parser_t_optional, parent_parser_q
        ],
        description="Run the whole metabolism community analysis: individual and community scope analysis and community selection"
    )
    test_parser = subparsers.add_parser(
        "test",
        help="test on sample data from rumen experiments",
        parents=[parent_parser_q, parent_parser_c, parent_parser_o],
        description="Test the whole workflow on a data sample")

    args = parser.parse_args()

    # If no argument print the help.
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(1)

    # test writing in out_directory if a subcommand is given else print version and help
    if args.cmd:
        if not utils.is_valid_dir(args.out):
            logger.critical("Impossible to access/create output directory")
            sys.exit(1)
    else:
        logger.info("m2m " + VERSION + "\n" + LICENSE)
        parser.print_help()
        sys.exit()

    # logger = logging.getLogger() #TODO: get rid of it once mpwt's logger is fixed
    # logger.setLevel(logging.DEBUG) #TODO: get rid of it once mpwt's logger is fixed

    # add logger in file
    formatter = logging.Formatter('%(message)s')
    log_file_path = os.path.join(args.out, f'm2m_{args.cmd}.log')
    file_handler = logging.FileHandler(log_file_path, 'w+')
    file_handler.setLevel(logging.INFO)
    file_handler.setFormatter(formatter)
    logger.addHandler(file_handler)
    # set up the default console logger
    console_handler = logging.StreamHandler(sys.stdout)
    console_handler.setLevel(logging.INFO)
    console_handler.setFormatter(formatter)
    if args.quiet:
        console_handler.setLevel(logging.WARNING)
    logger.addHandler(console_handler)

    # if modelhost is given as an arg: check the SBML level and turn it into 2 if needed
    # FIX: typos in user-facing warning ("an host" -> "a host", "hsot" -> "host").
    if args.cmd in ["workflow", "metacom", "mincom", "cscope", "addedvalue"
                    ] and args.modelhost:
        new_arg_modelhost = args.modelhost
        logger.warning(
            f"\n A metabolic model is given for a host. The metabolite producibility of the community will display metabolites that can be produced by the host or the microbiome, *not including* the metabolites that the host can produce by itself. If this is not what you want to do, you can consider placing the host in the same directory as the symbionts, which will lead to a complete community scope. \n"
        )
    else:
        new_arg_modelhost = None

    if "seeds" in args and args.seeds is not None:
        if not utils.is_valid_file(args.seeds):
            logger.critical(args.seeds + " is not a correct filepath")
            sys.exit(1)

    # deal with given subcommand
    if args.cmd == "workflow":
        main_workflow(args.genomes, args.out, args.cpu, args.clean, args.seeds,
                      args.noorphan, args.padmet, new_arg_modelhost,
                      args.targets)
    elif args.cmd in ["iscope", "cscope", "addedvalue", "mincom", "metacom"]:
        if not os.path.isdir(args.networksdir):
            logger.critical(args.networksdir + " is not a correct directory path")
            sys.exit(1)
        network_dir = args.networksdir
        if "targets" in args and args.targets is not None:
            if not utils.is_valid_file(args.targets):
                logger.critical(args.targets + " is not a correct filepath")
                sys.exit(1)
            # test if some targets are seeds
            itsct_seeds_targets = sbml_management.compare_seeds_and_targets(
                args.seeds, args.targets)
            if itsct_seeds_targets != set():
                logger.warning(
                    f"\nWARNING: compounds {*list(itsct_seeds_targets),} are both in seeds and targets. Since they are in seeds, they will be in each organism's individual producibility scope (iscope), but not appear in the community scope (cscope). To be certain that they are produced (through an activable reaction and not just because they are seeds), check the output file: producibility_targets.json.\n"
                )
        if args.cmd == "iscope":
            main_iscope(network_dir, args.seeds, args.out)
        elif args.cmd == "cscope":
            main_cscope(network_dir, args.seeds, args.out, args.targets,
                        new_arg_modelhost)
        elif args.cmd == "addedvalue":
            main_added_value(network_dir, args.seeds, args.out,
                             new_arg_modelhost)
        elif args.cmd == "mincom":
            main_mincom(network_dir, args.seeds, args.out, args.targets,
                        new_arg_modelhost)
        elif args.cmd == "metacom":
            main_metacom(network_dir, args.out, args.seeds, new_arg_modelhost,
                         args.targets)
    elif args.cmd == "recon":
        main_recon(args.genomes, args.out, args.noorphan, args.padmet,
                   args.level, args.cpu, args.clean)
    elif args.cmd == "seeds":
        if not utils.is_valid_file(args.metabolites):
            logger.critical(args.metabolites + " is not a correct filepath")
            sys.exit(1)
        else:
            main_seeds(args.metabolites, args.out)
    elif args.cmd == 'test':
        main_test(args.out, args.cpu)

    logger.info("--- Total runtime %.2f seconds ---" % (time.time() - start_time))
    logger.warning(f'--- Logs written in {log_file_path} ---')
def genomes_to_pgdb(genomes_dir, output_dir, cpu, clean, use_pwt_xml):
    """Run Pathway Tools on each genome of the repository

    Args:
        genomes_dir (str): genome repository
        output_dir (str): output repository
        cpu (int): number of CPUs to use
        clean (bool): delete PGDBs in ptools-local coresponding to the input data
        use_pwt_xml (bool): use Pathway Tools XML instead of creating them with padmet

    Returns:
        pgdb_dir (str): pgdb repository
    """
    logger.info(
        "######### Running metabolic network reconstruction with Pathway Tools #########"
    )
    if not os.path.isdir(genomes_dir):
        logger.critical("Genomes directory path does not exist.")
        sys.exit(1)

    pgdb_dir = os.path.join(output_dir, 'pgdb')
    log_dir = os.path.join(output_dir, 'pgdb_log')
    ncbirc_path = os.path.join(os.path.expanduser('~'), '.ncbirc')
    log_path = os.path.join(log_dir, 'log_error.txt')

    if not utils.is_valid_dir(pgdb_dir):
        logger.critical('Impossible to access/create output directory')
        sys.exit(1)

    # External prerequisites: Pathway Tools, BLAST and a ~/.ncbirc file.
    if not utils.check_program('pathway-tools'):
        logger.critical(
            'Pathway Tools is not in the PATH, please fix it before using the program'
        )
        sys.exit(1)
    if not utils.check_program("blastp"):
        logger.critical(
            'blastp is not in the PATH, please fix it before using the program'
        )
        sys.exit(1)
    if not utils.is_valid_file(ncbirc_path):
        logger.critical(
            f'No {ncbirc_path} file, please fix it before using the program'
        )
        sys.exit(1)

    expected_pgdb_ids = [species.lower() + 'cyc' for species in os.listdir(genomes_dir)]
    if clean:
        remove_pgdbs(to_delete_pgdbs=expected_pgdb_ids, number_cpu=cpu)
        cleaning_input(genomes_dir, verbose=False)

    # If every expected PGDB already exists (and --clean was not given),
    # reuse them instead of running Pathway Tools again.
    current_pgdb_ids = [species.lower() + 'cyc' for species in os.listdir(pgdb_dir)]
    if set(current_pgdb_ids) == set(expected_pgdb_ids):
        logger.warning("PGDBs are already created and will be used. To overrun them, run m2m with --clean option")
        return pgdb_dir

    # A taxon_id.tsv file next to the genome folders enables mpwt's taxon handling.
    taxon_file = True if 'taxon_id.tsv' in set(next(os.walk(genomes_dir))[2]) else None

    # Either extract Pathway Tools XML files, or flat .dat files for padmet.
    move_xml = True if use_pwt_xml else False
    move_dat = not move_xml

    multiprocess_pwt(genomes_dir, pgdb_dir,
                     patho_inference=True,
                     patho_hole_filler=False,
                     patho_operon_predictor=False,
                     no_download_articles=False,
                     flat_creation=True,
                     dat_extraction=move_dat,
                     xml_extraction=move_xml,
                     owl_extraction=False,
                     col_extraction=False,
                     size_reduction=False,
                     number_cpu=cpu,
                     taxon_file=taxon_file,
                     patho_log=log_dir,
                     verbose=False)

    # One genome folder must have produced exactly one PGDB output:
    # a directory of .dat files, or a single XML file when use_pwt_xml.
    nb_genomes_dir = sum(1 for entry in os.listdir(genomes_dir)
                         if os.path.isdir(os.path.join(genomes_dir, entry)))
    if use_pwt_xml:
        nb_pgdb_dir = sum(1 for entry in os.listdir(pgdb_dir)
                          if os.path.isfile(os.path.join(pgdb_dir, entry)))
    else:
        nb_pgdb_dir = sum(1 for entry in os.listdir(pgdb_dir)
                          if os.path.isdir(os.path.join(pgdb_dir, entry)))

    if nb_pgdb_dir != nb_genomes_dir:
        if os.path.exists(log_path):
            logger.critical("Something went wrong running Pathway Tools. See the log file in " + log_path)
        else:
            logger.critical("Something went wrong running Pathway Tools.")
        sys.exit(1)

    return pgdb_dir
def mincom(instance_w_targets, out_dir):
    """Compute minimal community selection and show analyses.

    Runs miscoto on the ASP instance, dumps the raw results to
    community_analysis/mincom.json, and logs one minimal community, the
    union (keystone species), the intersection (essential symbionts) and
    their difference (alternative symbionts).

    Args:
        instance_w_targets (str): ASP instance filepath
        out_dir (str): results directory
    """
    starttime = time.time()
    miscoto_dir = os.path.join(out_dir, "community_analysis")
    mincom_json_path = os.path.join(miscoto_dir, "mincom.json")
    if not utils.is_valid_dir(miscoto_dir):
        logger.critical("Impossible to access/create output directory")
        sys.exit(1)
    # Compute community selection
    logger.info("Running minimal community selection")
    all_results = compute_mincom(instance_w_targets, out_dir)
    # miscoto returns sets, which are not JSON serialisable: convert to lists.
    for key in all_results:
        all_results[key] = list(all_results[key])
    with open(mincom_json_path, 'w') as dumpfile:
        json.dump(all_results, dumpfile, indent=4, default=lambda x: x.__dict__)
    # BUG FIX: this message previously pointed to comm_scopes.json although
    # the file written just above is mincom.json.
    logger.info("Community selection results for all metabolic networks available in " +
                mincom_json_path)
    # Give one solution
    one_sol_bact = list(all_results['bacteria'])
    logger.info('######### One minimal community #########')
    logger.info(
        "# One minimal community enabling the producibility of the target metabolites given as inputs"
    )
    logger.info("Minimal number of bacteria in communities = " +
                str(len(one_sol_bact)))
    logger.info("\n".join(one_sol_bact))
    # Give union of solutions
    union = all_results['union_bacteria']
    logger.info(
        '######### Keystone species: Union of minimal communities #########')
    logger.info(
        "# Bacteria occurring in at least one minimal community enabling the producibility of the target metabolites given as inputs"
    )
    logger.info("Keystone species = " + str(len(union)))
    logger.info("\n".join(union))
    # Give intersection of solutions
    intersection = all_results['inter_bacteria']
    logger.info(
        '######### Essential symbionts: Intersection of minimal communities #########'
    )
    logger.info(
        "# Bacteria occurring in ALL minimal communities enabling the producibility of the target metabolites given as inputs"
    )
    logger.info("Essential symbionts = " + str(len(intersection)))
    logger.info("\n".join(intersection))
    # Give keystones, essential and alternative symbionts
    alternative_symbionts = list(set(union) - set(intersection))
    logger.info(
        '######### Alternative symbionts: Difference between Union and Intersection #########'
    )
    logger.info(
        "# Bacteria occurring in at least one minimal community but not all minimal communities enabling the producibility of the target metabolites given as inputs"
    )
    logger.info("Alternative symbionts = " + str(len(alternative_symbionts)))
    logger.info("\n".join(alternative_symbionts))
    logger.info("--- Mincom runtime %.2f seconds ---\n" %
                (time.time() - starttime))
def create_gml(json_paths, target_paths, output_dir, taxon_file=None):
    """Create solution graph from miscoto output and compute stats

    For each target category, builds a networkx graph whose nodes are the
    key species (essential 'ES' / alternative 'AS' symbionts) and whose
    edge weights count co-occurrences in enumerated minimal communities,
    then writes per-category GML files plus global stats (tsv/json).

    Args:
        json_paths (str): {target: path_to_corresponding_json}
        target_paths (str): {target: path_to_corresponding_sbml}
        output_dir (str): results directory
        taxon_file (str): mpwt taxon file for species in sbml folder
    """
    miscoto_stat_output = os.path.join(output_dir, 'miscoto_stats.txt')
    key_species_stats_output = os.path.join(output_dir, 'key_species_stats.tsv')
    key_species_json = os.path.join(output_dir, 'key_species.json')

    gml_output = os.path.join(output_dir, 'gml')
    if not utils.is_valid_dir(gml_output):
        logger.critical('Impossible to access/create output directory')
        sys.exit(1)

    len_min_sol = {}
    len_union = {}
    len_intersection = {}
    len_solution = {}
    len_target = {}

    target_categories = {}
    for target in target_paths:
        target_categories[target] = sbml_management.get_compounds(
            target_paths[target])

    if taxon_file:
        taxon_named_species, all_taxons = get_taxon(taxon_file)
    else:
        taxon_named_species = None
        all_taxons = None

    key_species_data = {}
    miscoto_stat_output_datas = []
    for target_category in target_categories:
        key_species_data[target_category] = {}
        key_species_data[target_category]['essential_symbionts'] = {}
        key_species_data[target_category]['alternative_symbionts'] = {}
        target_output_gml_path = os.path.join(gml_output, target_category + '.gml')
        with open(json_paths[target_category]) as json_data:
            dicti = json.load(json_data)
        G = nx.Graph()
        added_node = []
        species_weight = {}
        if dicti['still_unprod'] != []:
            # BUG FIX: logger.warning was called with extra positional
            # arguments but no %s placeholder in the message, which made the
            # logging module fail to format the record instead of reporting
            # the unproducible targets.
            logger.warning('ERROR %s is unproducible', dicti['still_unprod'])
        len_target[target_category] = len(dicti['newly_prod']) + len(
            dicti['still_unprod'])
        len_min_sol[target_category] = len(dicti['bacteria'])
        len_union[target_category] = len(dicti['union_bacteria'])
        len_intersection[target_category] = len(dicti['inter_bacteria'])
        # ES = in every minimal community (intersection), AS = in some only.
        key_species_types = {
            organism: 'ES' if organism in dicti['inter_bacteria'] else 'AS'
            for organism in dicti['union_bacteria']
        }

        # Bucket key species per taxon (or under 'data' when no taxonomy).
        if taxon_file:
            for taxon in all_taxons:
                key_species_data[target_category]['essential_symbionts'][taxon] = [
                    organism for organism in key_species_types
                    if key_species_types[organism] == 'ES'
                    and taxon_named_species[organism].split('__')[0] == taxon
                ]
                key_species_data[target_category]['alternative_symbionts'][taxon] = [
                    organism for organism in key_species_types
                    if key_species_types[organism] == 'AS'
                    and taxon_named_species[organism].split('__')[0] == taxon
                ]
        else:
            key_species_data[target_category]['essential_symbionts']['data'] = [
                organism for organism in key_species_types
                if key_species_types[organism] == 'ES'
            ]
            key_species_data[target_category]['alternative_symbionts']['data'] = [
                organism for organism in key_species_types
                if key_species_types[organism] == 'AS'
            ]

        len_solution[target_category] = len(dicti['enum_bacteria'])
        # Every pair of species co-occurring in an enumerated community adds
        # (or reinforces) an edge; singleton communities only add the node.
        for sol in dicti['enum_bacteria']:
            if len(dicti['enum_bacteria'][sol]) > 1:
                for species_1, species_2 in combinations(
                        dicti['enum_bacteria'][sol], 2):
                    if species_1 not in added_node:
                        if taxon_file:
                            G.add_node(taxon_named_species[species_1],
                                       note=key_species_types[species_1])
                        else:
                            G.add_node(species_1,
                                       note=key_species_types[species_1])
                        added_node.append(species_1)
                    if species_2 not in added_node:
                        if taxon_file:
                            G.add_node(taxon_named_species[species_2],
                                       note=key_species_types[species_2])
                        else:
                            G.add_node(species_2,
                                       note=key_species_types[species_2])
                        added_node.append(species_2)
                    combination_species = '_'.join(
                        sorted([species_1, species_2]))
                    if combination_species not in species_weight:
                        species_weight[combination_species] = 1
                    else:
                        species_weight[combination_species] += 1
                    if taxon_file:
                        G.add_edge(taxon_named_species[species_1],
                                   taxon_named_species[species_2],
                                   weight=species_weight[combination_species])
                    else:
                        G.add_edge(species_1, species_2,
                                   weight=species_weight[combination_species])
            elif len(dicti['enum_bacteria'][sol]) == 1:
                species_1 = dicti['enum_bacteria'][sol][0]
                if species_1 not in added_node:
                    if taxon_file:
                        G.add_node(taxon_named_species[species_1],
                                   note=key_species_types[species_1])
                    else:
                        G.add_node(species_1,
                                   note=key_species_types[species_1])
                    added_node.append(species_1)

        # Check if all the nodes of G are not isolates.
        # FIX: typos in the user-facing message ("This lead to powergrasp
        # creating san empty powergraph").
        if len(G.nodes) == nx.number_of_isolates(G):
            logger.critical(
                r'/!\ Warning: All the nodes of the solution graph are isolated (they are not connected to other nodes). This leads to powergrasp creating an empty powergraph.'
            )
            logger.critical(
                'So m2m_analysis stops at the solution graph step.')
            sys.exit(1)

        miscoto_stat_output_datas.append([
            target_category,
            str(len_target[target_category]),
            str(len_min_sol[target_category]),
            str(len_union[target_category]),
            str(len_intersection[target_category]),
            str(len_solution[target_category])
        ])
        logger.info('######### Graph of ' + target_category + ' #########')
        logger.info('Number of nodes: ' + str(G.number_of_nodes()))
        logger.info('Number of edges: ' + str(G.number_of_edges()))
        nx.write_gml(G, target_output_gml_path)

    with open(miscoto_stat_output, 'w') as stats_output:
        statswriter = csv.writer(stats_output, delimiter="\t")
        statswriter.writerow([
            'categories', 'nb_target', 'size_min_sol', 'size_union',
            'size_intersection', 'size_enum'
        ])
        for miscoto_stat_output_data in miscoto_stat_output_datas:
            statswriter.writerow(miscoto_stat_output_data)

    with open(key_species_json, 'w') as json_output:
        json.dump(key_species_data, json_output, indent=4)

    with open(key_species_stats_output, 'w') as key_stat_output:
        key_stats_writer = csv.writer(key_stat_output, delimiter='\t')
        if all_taxons:
            key_stats_writer.writerow(
                ['target_categories', 'key_group', *sorted(all_taxons), 'Sum'])
        else:
            key_stats_writer.writerow(
                ['target_categories', 'key_group', 'data', 'Sum'])
        for target in key_species_data:
            if all_taxons:
                essential_counts = [
                    len(key_species_data[target]['essential_symbionts'][taxon])
                    for taxon in sorted(all_taxons)
                ]
                alternative_counts = [
                    len(key_species_data[target]['alternative_symbionts'][taxon])
                    for taxon in sorted(all_taxons)
                ]
            else:
                essential_counts = [
                    len(key_species_data[target]['essential_symbionts']['data'])
                ]
                alternative_counts = [
                    len(key_species_data[target]['alternative_symbionts']['data'])
                ]
            # key species = essential + alternative, summed per column.
            key_counts = list(map(add, essential_counts, alternative_counts))
            key_stats_writer.writerow(
                [target, 'key_species', *key_counts, sum(key_counts)])
            key_stats_writer.writerow([
                target, 'essential_symbionts', *essential_counts,
                sum(essential_counts)
            ])
            key_stats_writer.writerow([
                target, 'alternative_symbionts', *alternative_counts,
                sum(alternative_counts)
            ])