def run_parsing_step(msas, library, scheduler_mode, parse_run_output_dir, cores, op):
    """ Run raxml-ng --parse on each MSA to check it is valid and to get its MSA dimensions

    msas: dict mapping MSA name -> MSA object
    library: path to the raxml-ng binary/library handed to the scheduler
    scheduler_mode: scheduler implementation identifier
    parse_run_output_dir: directory receiving the command file and per-MSA results
    cores: total number of cores given to the scheduler
    op: parsed ParGenes options
    """
    parse_commands_file = os.path.join(parse_run_output_dir, "parse_command.txt")
    parse_run_results = os.path.join(parse_run_output_dir, "results")
    commons.makedirs(parse_run_results)
    # (removed an unused fasta_chuncks accumulator that was never read)
    with open(parse_commands_file, "w") as writer:
        # one single-core raxml-ng --parse command per MSA
        for name, msa in msas.items():
            fasta_output_dir = os.path.join(parse_run_results, name)
            commons.makedirs(fasta_output_dir)
            if (op.use_modeltest):
                if (not msa.has_model()):
                    # set a fake model to make raxml parsing happy
                    # it will be replaced after the modeltest run
                    if (op.datatype == "aa"):
                        msa.set_model("WAG")
                    else:
                        msa.set_model("GTR")
            writer.write("parse_" + name + " 1 1 ")
            writer.write(" --parse ")
            writer.write(" --log DEBUG ")
            writer.write(
                " --msa " + msa.path + " " + msa.get_raxml_arguments_str())
            writer.write(" --prefix " + os.path.join(fasta_output_dir, name))
            writer.write(" --threads 1 ")
            writer.write("\n")
    scheduler.run_mpi_scheduler(library, scheduler_mode, parse_commands_file,
                                parse_run_output_dir, cores, op)
def run(output_dir, library, scheduler_mode, run_path, cores, op):
    """Schedule raxml --support jobs (one per MSA) through the MPI scheduler.

    For each ML tree that has a non-empty concatenated bootstrap file, a
    command computing bootstrap support values is appended to the command
    file, which is then handed to the scheduler.
    """
    ml_trees_dir = os.path.join(output_dir, "mlsearch_run", "results")
    concatenated_dir = os.path.join(output_dir, "concatenated_bootstraps")
    commands_file = os.path.join(run_path, "supports_commands.txt")
    commons.makedirs(run_path)
    support_results = os.path.join(run_path, "results")
    commons.makedirs(support_results)
    logger.info("Writing supports commands in " + commands_file)
    with open(commands_file, "w") as writer:
        for fasta in os.listdir(ml_trees_dir):
            best_tree = os.path.join(ml_trees_dir, fasta,
                                     fasta + ".raxml.bestTree")
            bootstrap_trees = os.path.join(concatenated_dir, fasta + ".bs")
            # skip MSAs with no (or an empty) concatenated bootstrap file
            missing = not os.path.exists(bootstrap_trees)
            if missing or os.stat(bootstrap_trees).st_size == 0:
                continue
            command_parts = [
                "support_" + fasta + " 1 1",
                " --support",
                " --tree " + best_tree,
                " --bs-trees " + bootstrap_trees,
                " --threads 1",
                " --prefix " + os.path.join(support_results,
                                            fasta + ".support"),
                "\n",
            ]
            writer.write("".join(command_parts))
    scheduler.run_mpi_scheduler(library, scheduler_mode, commands_file,
                                run_path, cores, op)
def run(msas, output_dir, library, run_path, op):
    """ Use the MPI scheduler to run modeltest on all the MSAs"""
    cores = op.cores
    # NOTE(review): the run_path parameter is immediately overwritten here —
    # callers' argument only matters if it differs from this derived path;
    # confirm before relying on the parameter.
    run_path = os.path.join(output_dir, "modeltest_run")
    commands_file = os.path.join(run_path, "modeltest_command.txt")
    modeltest_results = os.path.join(run_path, "results")
    commons.makedirs(modeltest_results)
    # NOTE(review): this validation exits after the results directory was
    # already created; moving it first would avoid the side effect.
    if (op.modeltest_cores < 4):
        print(
            "[Error] The number of cores per modeltest job should at least be 4"
        )
        sys.exit(1)
    with open(commands_file, "w") as writer:
        # one modeltest-ng command per valid MSA; the second field
        # (taxa * per_taxon_clv_size) is the job's size hint for the scheduler
        for name, msa in msas.items():
            if (not msa.valid):
                continue
            modeltest_fasta_output_dir = os.path.join(modeltest_results, name)
            commons.makedirs(modeltest_fasta_output_dir)
            writer.write("modeltest_" + name + " ")
            writer.write(
                str(op.modeltest_cores) + " " +
                str(msa.taxa * msa.per_taxon_clv_size))
            writer.write(" -i ")
            writer.write(msa.path)
            writer.write(" -t mp ")
            writer.write(" -o " + os.path.join(modeltest_results, name, name))
            writer.write(" " + msa.modeltest_arguments + " ")
            writer.write("\n")
    scheduler.run_mpi_scheduler(library, op.scheduler, commands_file,
                                run_path, cores, op)
def concatenate_bootstraps(output_dir, cores):
    """Concatenate per-MSA bootstrap files concurrently on a single node.

    Submits one concatenate_bootstrap_msa task per entry found in the
    bootstraps directory; at most min(16, cores) worker threads run at once.
    """
    concatenated_dir = os.path.join(output_dir, "concatenated_bootstraps")
    commons.makedirs(concatenated_dir)
    bootstraps_dir = os.path.join(output_dir, "mlsearch_run", "bootstraps")
    worker_count = min(16, int(cores))
    with concurrent.futures.ThreadPoolExecutor(
            max_workers=worker_count) as executor:
        for entry in os.listdir(bootstraps_dir):
            executor.submit(concatenate_bootstrap_msa, bootstraps_dir,
                            concatenated_dir, entry)
def concatenate_bootstraps(output_dir, cores):
    """Concatenate per-MSA bootstrap files concurrently on a single node.

    Dispatches one concatenate_bootstrap_msa task per bootstrap directory
    entry to a process pool capped at min(cpu_count, cores) workers, then
    drains the pool before returning.
    """
    concatenated_dir = os.path.join(output_dir, "concatenated_bootstraps")
    commons.makedirs(concatenated_dir)
    bootstraps_dir = os.path.join(output_dir, "mlsearch_run", "bootstraps")
    process_count = min(multiprocessing.cpu_count(), int(cores))
    # close()+join() (not a `with` block) so every queued task completes;
    # Pool.__exit__ would terminate the workers instead.
    pool = multiprocessing.Pool(processes=process_count)
    for entry in os.listdir(bootstraps_dir):
        task_args = (bootstraps_dir, concatenated_dir, entry)
        pool.apply_async(concatenate_bootstrap_msa, task_args)
    pool.close()
    pool.join()
def run(msas, random_trees, parsimony_trees, bootstraps, library,
        scheduler_mode, run_path, cores, op):
    """ Use the MPI scheduler_mode to run raxml-ng on all the dataset. Also schedules the bootstraps runs"""
    commands_file = os.path.join(run_path, "mlsearch_command.txt")
    mlsearch_run_results = os.path.join(run_path, "results")
    mlsearch_run_bootstraps = os.path.join(run_path, "bootstraps")
    commons.makedirs(mlsearch_run_results)
    # total number of ML searches per MSA (random + parsimony starting trees)
    starting_trees = random_trees + parsimony_trees
    if (bootstraps != 0):
        commons.makedirs(mlsearch_run_bootstraps)
    with open(commands_file, "w") as writer:
        for name, msa in msas.items():
            if (not msa.valid):
                continue
            # prefer the binary MSA produced by the parsing step, fall back
            # to the raw path when it was not generated
            msa_path = msa.binary_path
            if (msa_path == "" or msa_path == None):
                msa_path = msa.path
            # scheduler size hint; 1 disables size-based sorting for this MSA
            msa_size = 1
            if (not msa.flag_disable_sorting):
                msa_size = msa.taxa * msa.per_taxon_clv_size
            mlsearch_fasta_output_dir = os.path.join(mlsearch_run_results, name)
            commons.makedirs(mlsearch_fasta_output_dir)
            # one ML-search command per starting tree; with several starting
            # trees each run gets its own multiple_runs/<index> prefix
            for starting_tree in range(0, starting_trees):
                if (starting_trees > 1):
                    prefix = os.path.join(mlsearch_fasta_output_dir,
                                          "multiple_runs", str(starting_tree))
                    commons.makedirs(prefix)
                    prefix = os.path.join(prefix, name)
                else:
                    prefix = os.path.join(mlsearch_fasta_output_dir, name)
                writer.write("mlsearch_" + name + "_" +
                             str(starting_tree) + " ")
                writer.write(str(msa.cores) + " " + str(msa_size))
                writer.write(" --msa " + msa_path + " " +
                             msa.get_raxml_arguments_str())
                writer.write(" --prefix " + prefix)
                writer.write(" --threads 1 ")
                # indices past the random-tree count use parsimony starting trees
                if (starting_tree >= random_trees):
                    writer.write(" --tree pars ")
                writer.write(" --seed " + str(starting_tree + op.seed + 1) + " ")
                writer.write("\n")
            # NOTE(review): the bootstrap output directory is created even when
            # bootstraps == 0 (the loop below is then empty) — confirm intended.
            bs_output_dir = os.path.join(mlsearch_run_bootstraps, name)
            commons.makedirs(bs_output_dir)
            # bootstraps are batched in chunks of 10 replicates per command
            chunk_size = 10
            for current_bs in range(0, (bootstraps - 1) // chunk_size + 1):
                bsbase = name + "_bs" + str(current_bs)
                # last chunk may carry fewer than chunk_size replicates
                bs_number = min(chunk_size, bootstraps - current_bs * chunk_size)
                writer.write(bsbase + " ")
                writer.write(str(max(1, msa.cores // 2)) + " " +
                             str(msa_size * chunk_size))
                writer.write(" --bootstrap")
                writer.write(" --msa " + msa_path + " " +
                             msa.get_raxml_arguments_str())
                writer.write(" --prefix " + os.path.join(bs_output_dir, bsbase))
                writer.write(" --threads 1 ")
                writer.write(" --seed " + str(current_bs + op.seed + 1))
                writer.write(" --bs-trees " + str(bs_number))
                writer.write("\n")
    scheduler.run_mpi_scheduler(library, scheduler_mode, commands_file,
                                run_path, cores, op)
def compute_constrain(msas, samples, raxml_library, scheduler_mode, run_path,
                      cores, op):
    """Build a strict-consensus constraint tree for each valid MSA.

    Two scheduler passes: first raxml-ng --start generates `samples`
    parsimony starting trees per MSA, then raxml-ng --consense STRICT
    condenses them into a strict consensus tree.

    msas: dict mapping MSA name -> MSA object
    samples: number of parsimony trees to sample per MSA
    raxml_library: path to the raxml-ng binary/library
    scheduler_mode: scheduler implementation identifier (e.g. "fork")
    run_path: base directory for the parsimony and consensus sub-runs
    cores: total number of cores given to the scheduler
    op: parsed ParGenes options
    """
    # --- pass 1: parsimony starting trees ---
    parsi_run_path = os.path.join(run_path, "parsimony")
    parsi_commands_file = os.path.join(parsi_run_path, "command.txt")
    parsi_results = os.path.join(parsi_run_path, "results")
    commons.makedirs(parsi_results)
    with open(parsi_commands_file, "w") as writer:
        for name, msa in msas.items():
            prefix = os.path.join(parsi_results, name)
            if (not msa.valid):
                continue
            # prefer the binary MSA when available (is None instead of == None)
            msa_path = msa.binary_path
            if (msa_path == "" or msa_path is None):
                msa_path = msa.path
            # (removed an unused msa_size local)
            writer.write("parsi_" + name + " 1 1")
            writer.write(" --msa " + msa_path + " ")
            writer.write(" --prefix " + os.path.abspath(prefix))
            if (scheduler_mode != "fork"):
                writer.write(" --threads 1 ")
            writer.write(" --tree pars{" + str(samples) + "} ")
            writer.write(" --start")
            writer.write(" --model " + msa.get_model())
            writer.write("\n")
    scheduler.run_scheduler(raxml_library, scheduler_mode, "--threads",
                            parsi_commands_file, parsi_run_path, cores, op)
    # --- pass 2: strict consensus over the sampled trees ---
    consensus_run_path = os.path.join(run_path, "consensus")
    consensus_commands_file = os.path.join(consensus_run_path, "command.txt")
    consensus_results = os.path.join(consensus_run_path, "results")
    commons.makedirs(consensus_results)
    with open(consensus_commands_file, "w") as writer:
        for name, msa in msas.items():
            prefix = os.path.join(consensus_results, name)
            if (not msa.valid):
                continue
            trees = os.path.join(parsi_results, name + ".raxml.startTree")
            writer.write("consensus_" + name + " 1 1")
            writer.write(" --prefix " + os.path.abspath(prefix))
            if (scheduler_mode != "fork"):
                writer.write(" --threads 1 ")
            writer.write(" --tree " + trees)
            writer.write(" --consense STRICT")
            writer.write("\n")
    scheduler.run_scheduler(raxml_library, scheduler_mode, "--threads",
                            consensus_commands_file, consensus_run_path,
                            cores, op)
def run_parsing_step(msas, library, scheduler_mode, parse_run_output_dir, cores, op):
    """ Run raxml-ng --parse on each MSA to check it is valid and to get its MSA dimensions

    msas: dict mapping MSA name -> MSA object
    library: path to the raxml-ng binary/library handed to the scheduler
    scheduler_mode: scheduler implementation identifier
    parse_run_output_dir: directory receiving the command file and per-MSA results
    cores: total number of cores given to the scheduler
    op: parsed ParGenes options
    """
    parse_commands_file = os.path.join(parse_run_output_dir, "parse_command.txt")
    parse_run_results = os.path.join(parse_run_output_dir, "results")
    commons.makedirs(parse_run_results)
    # (removed an unused fasta_chuncks accumulator that was never read)
    with open(parse_commands_file, "w") as writer:
        # one single-core raxml-ng --parse command per MSA
        for name, msa in msas.items():
            fasta_output_dir = os.path.join(parse_run_results, name)
            commons.makedirs(fasta_output_dir)
            writer.write("parse_" + name + " 1 1 ")
            writer.write(" --parse ")
            writer.write(" --log DEBUG ")
            writer.write(" --msa " + msa.path + " " + msa.raxml_arguments)
            writer.write(" --prefix " + os.path.join(fasta_output_dir, name))
            writer.write(" --threads 1 ")
            writer.write("\n")
    scheduler.run_mpi_scheduler(library, scheduler_mode, parse_commands_file,
                                parse_run_output_dir, cores, op)
def main_raxml_runner(args, op):
    """ Run pargenes from the parsed arguments op """
    # NOTE(review): `start` looks unused here; timing appears to be handled by
    # logger.timed_log — confirm before removing.
    start = time.time()
    output_dir = op.output_dir
    # checkpoint_index gates each pipeline stage below so a --continue run
    # resumes after the last completed stage
    checkpoint_index = checkpoint.read_checkpoint(output_dir)
    if (os.path.exists(output_dir) and not op.do_continue):
        logger.info(
            "[Error] The output directory " + output_dir +
            " already exists. Please use another output directory or run with --continue."
        )
        sys.exit(1)
    commons.makedirs(output_dir)
    logger.init_logger(op.output_dir)
    print_header(args)
    msas = None
    logger.timed_log("end of MSAs initializations")
    scriptdir = os.path.dirname(os.path.realpath(__file__))
    modeltest_run_path = os.path.join(output_dir, "modeltest_run")
    raxml_run_path = os.path.join(output_dir, "mlsearch_run")
    binaries_dir = os.path.join(scriptdir, "..", "pargenes_binaries")
    print("Binaries directory: " + binaries_dir)
    # the "split" scheduler loads shared objects; other modes run plain binaries
    if (op.scheduler != "split"):
        raxml_library = os.path.join(binaries_dir, "raxml-ng")
        modeltest_library = os.path.join(binaries_dir, "modeltest-ng")
    else:
        raxml_library = os.path.join(binaries_dir, "raxml-ng-mpi.so")
        modeltest_library = os.path.join(binaries_dir, "modeltest-ng-mpi.so")
    astral_jar = os.path.join(binaries_dir, "astral.jar")
    # user-supplied binaries override the bundled ones
    if (len(op.raxml_binary) > 1):
        raxml_library = op.raxml_binary
    if (len(op.modeltest_binary) > 1):
        modeltest_library = op.modeltest_binary
    if (len(op.astral_jar) > 1):
        astral_jar = op.astral_jar
    astral_jar = os.path.abspath(astral_jar)
    # stage 1: parse and validate the MSAs
    if (checkpoint_index < 1):
        msas = commons.init_msas(op)
        raxml.run_parsing_step(msas, raxml_library, op.scheduler,
                               os.path.join(output_dir, "parse_run"),
                               op.cores, op)
        raxml.analyse_parsed_msas(msas, op)
        checkpoint.write_checkpoint(output_dir, 1)
        logger.timed_log("end of parsing mpi-scheduler run")
    else:
        msas = raxml.load_msas(op)
    if (op.dry_run):
        logger.info("End of the dry run. Exiting")
        return 0
    logger.timed_log("end of anlysing parsing results")
    # stage 2 (optional): model selection with modeltest-ng
    if (op.use_modeltest):
        if (checkpoint_index < 2):
            modeltest.run(msas, output_dir, modeltest_library,
                          modeltest_run_path, op)
            logger.timed_log("end of modeltest mpi-scheduler run")
            modeltest.parse_modeltest_results(op.modeltest_criteria, msas,
                                              output_dir)
            logger.timed_log("end of parsing modeltest results")
            # then recompute the binary MSA files to put the correct model, and reevaluate the MSA sizes with the new models
            shutil.move(os.path.join(output_dir, "parse_run"),
                        os.path.join(output_dir, "old_parse_run"))
            raxml.run_parsing_step(msas, raxml_library, op.scheduler,
                                   os.path.join(output_dir, "parse_run"),
                                   op.cores, op)
            raxml.analyse_parsed_msas(msas, op)
            logger.timed_log("end of the second parsing step")
            checkpoint.write_checkpoint(output_dir, 2)
    # stage 3: ML searches (and bootstrap replicates, if requested)
    if (checkpoint_index < 3):
        raxml.run(msas, op.random_starting_trees, op.parsimony_starting_trees,
                  op.bootstraps, raxml_library, op.scheduler, raxml_run_path,
                  op.cores, op)
        logger.timed_log("end of mlsearch mpi-scheduler run")
        checkpoint.write_checkpoint(output_dir, 3)
    # stage 4: pick the best ML tree when several searches were run per MSA
    if (op.random_starting_trees + op.parsimony_starting_trees > 1):
        if (checkpoint_index < 4):
            raxml.select_best_ml_tree(msas, op)
            logger.timed_log("end of selecting the best ML tree")
            checkpoint.write_checkpoint(output_dir, 4)
    # stages 5-6: bootstrap concatenation and support-value computation
    if (op.bootstraps != 0):
        if (checkpoint_index < 5):
            bootstraps.concatenate_bootstraps(output_dir, min(16, op.cores))
            logger.timed_log("end of bootstraps concatenation")
            checkpoint.write_checkpoint(output_dir, 5)
        starting_trees = op.random_starting_trees + op.parsimony_starting_trees
        if (checkpoint_index < 6 and starting_trees > 0):
            bootstraps.run(msas, output_dir, raxml_library, op.scheduler,
                           os.path.join(output_dir, "supports_run"),
                           op.cores, op)
            logger.timed_log("end of supports mpi-scheduler run")
            checkpoint.write_checkpoint(output_dir, 6)
    # stage 7 (optional): species-tree estimation with ASTRAL
    if (op.use_astral):
        if (checkpoint_index < 7):
            astral.run_astral_pargenes(astral_jar, op)
            checkpoint.write_checkpoint(output_dir, 7)
    # fail the whole run if no MSA at all could be analyzed
    all_invalid = True
    for name, msa in msas.items():
        if (msa.valid):
            all_invalid = False
    if (all_invalid):
        print("[Error] ParGenes failed to analyze all MSAs.")
        report.report_and_exit(op.output_dir, 1)
    print_stats(op)
    return 0
def main_raxml_runner(op):
    """ Run pargenes from the parsed arguments op """
    start = time.time()
    output_dir = op.output_dir
    # checkpoint_index gates each pipeline stage so a --continue run resumes
    # after the last completed stage
    checkpoint_index = checkpoint.read_checkpoint(output_dir)
    print("Checkpoint: " + str(checkpoint_index))
    if (os.path.exists(output_dir) and not op.do_continue):
        print(
            "[Error] The output directory " + output_dir +
            " already exists. Please use another output directory or run with --continue."
        )
        sys.exit(1)
    commons.makedirs(output_dir)
    logs = commons.get_log_file(output_dir, "pargenes_logs")
    print("Redirecting logs to " + logs)
    # NOTE(review): the log file handle is never closed and sys.stdout is
    # replaced globally — consider a managed redirect.
    sys.stdout = open(logs, "w")
    print_header()
    msas = commons.init_msas(op)
    timed_print(start, "end of MSAs initializations")
    scriptdir = os.path.dirname(os.path.realpath(__file__))
    modeltest_run_path = os.path.join(output_dir, "modeltest_run")
    raxml_run_path = os.path.join(output_dir, "mlsearch_run")
    # "onecore" runs plain binaries; other modes load MPI shared objects
    if (op.scheduler == "onecore"):
        raxml_library = os.path.join(scriptdir, "..", "raxml-ng", "bin",
                                     "raxml-ng")
        modeltest_library = os.path.join(scriptdir, "..", "modeltest", "bin",
                                         "modeltest-ng")
    else:
        raxml_library = os.path.join(scriptdir, "..", "raxml-ng", "bin",
                                     "raxml-ng-mpi.so")
        modeltest_library = os.path.join(scriptdir, "..", "modeltest", "build",
                                         "src", "modeltest-ng-mpi.so")
    # stage 1: parse and validate the MSAs
    if (checkpoint_index < 1):
        raxml.run_parsing_step(msas, raxml_library, op.scheduler,
                               os.path.join(output_dir, "parse_run"),
                               op.cores, op)
        checkpoint.write_checkpoint(output_dir, 1)
        timed_print(start, "end of parsing mpi-scheduler run")
    raxml.analyse_parsed_msas(msas, op, output_dir)
    if (op.dry_run):
        print("End of the dry run. Exiting")
        return 0
    timed_print(start, "end of anlysing parsing results")
    # stage 2 (optional): model selection with modeltest-ng
    if (op.use_modeltest):
        if (checkpoint_index < 2):
            modeltest.run(msas, output_dir, modeltest_library,
                          modeltest_run_path, op)
            timed_print(start, "end of modeltest mpi-scheduler run")
            modeltest.parse_modeltest_results(op.modeltest_criteria, msas,
                                              output_dir)
            timed_print(start, "end of parsing modeltest results")
            # then recompute the binary MSA files to put the correct model, and reevaluate the MSA sizes with the new models
            shutil.move(os.path.join(output_dir, "parse_run"),
                        os.path.join(output_dir, "old_parse_run"))
            raxml.run_parsing_step(msas, raxml_library, op.scheduler,
                                   os.path.join(output_dir, "parse_run"),
                                   op.cores, op)
            raxml.analyse_parsed_msas(msas, op, output_dir)
            timed_print(start, "end of the second parsing step")
            checkpoint.write_checkpoint(output_dir, 2)
    # stage 3: ML searches (and bootstrap replicates, if requested)
    if (checkpoint_index < 3):
        raxml.run(msas, op.random_starting_trees, op.parsimony_starting_trees,
                  op.bootstraps, raxml_library, op.scheduler, raxml_run_path,
                  op.cores, op)
        timed_print(start, "end of mlsearch mpi-scheduler run")
        checkpoint.write_checkpoint(output_dir, 3)
    # stage 4: pick the best ML tree when several searches were run per MSA
    if (op.random_starting_trees + op.parsimony_starting_trees > 1):
        if (checkpoint_index < 4):
            raxml.select_best_ml_tree(msas, op)
            timed_print(start, "end of selecting the best ML tree")
            checkpoint.write_checkpoint(output_dir, 4)
    # stages 5-6: bootstrap concatenation and support-value computation
    if (op.bootstraps != 0):
        if (checkpoint_index < 5):
            bootstraps.concatenate_bootstraps(output_dir, min(16, op.cores))
            timed_print(start, "end of bootstraps concatenation")
            checkpoint.write_checkpoint(output_dir, 5)
        if (checkpoint_index < 6):
            bootstraps.run(output_dir, raxml_library, op.scheduler,
                           os.path.join(output_dir, "supports_run"),
                           op.cores, op)
            timed_print(start, "end of supports mpi-scheduler run")
            checkpoint.write_checkpoint(output_dir, 6)
    return 0