Example #1
0
def run_parsing_step(msas, library, scheduler_mode, parse_run_output_dir, cores, op):
  """ Run raxml-ng --parse on each MSA to check that it is valid and
  to get its dimensions.

  One command line per MSA is written to "parse_command.txt" inside
  parse_run_output_dir, then the whole file is handed to the MPI scheduler.

  msas: dict mapping an MSA name to its MSA object
  library: raxml-ng binary/library forwarded to the scheduler
  scheduler_mode: scheduler implementation forwarded to the scheduler
  parse_run_output_dir: directory receiving the command file and "results"
  cores: number of cores forwarded to the scheduler
  op: parsed options (use_modeltest and datatype are read here)
  """
  parse_commands_file = os.path.join(parse_run_output_dir, "parse_command.txt")
  parse_run_results = os.path.join(parse_run_output_dir, "results")
  commons.makedirs(parse_run_results)
  with open(parse_commands_file, "w") as writer:
    for name, msa in msas.items():
      fasta_output_dir = os.path.join(parse_run_results, name)
      commons.makedirs(fasta_output_dir)
      if op.use_modeltest and not msa.has_model():
        # Set a placeholder model so that raxml-ng accepts the MSA;
        # it will be replaced after the modeltest run.
        msa.set_model("WAG" if op.datatype == "aa" else "GTR")
      # Scheduler line format: <job name> <cores> <cost> <raxml-ng arguments>
      command = [
        "parse_" + name + " 1 1 ",
        " --parse ",
        " --log DEBUG ",
        " --msa " + msa.path + " " + msa.get_raxml_arguments_str(),
        " --prefix " + os.path.join(fasta_output_dir, name),
        " --threads 1 ",
        "\n",
      ]
      writer.write("".join(command))
  scheduler.run_mpi_scheduler(library, scheduler_mode, parse_commands_file, parse_run_output_dir, cores, op)
Example #2
0
def run(output_dir, library, scheduler_mode, run_path, cores, op):
    """ Use the MPI scheduler to run raxml --support on all the MSAs.

    For every entry of <output_dir>/mlsearch_run/results that has a
    non-empty concatenated bootstrap file, one command is written to
    "supports_commands.txt" in run_path. The command maps the bootstrap
    trees onto the best ML tree to build a tree with support values.
    Entries whose bootstrap file is missing or empty are skipped.
    """
    results_dir = os.path.join(run_path, "results")
    commands_path = os.path.join(run_path, "supports_commands.txt")
    best_trees_root = os.path.join(output_dir, "mlsearch_run", "results")
    bootstraps_root = os.path.join(output_dir, "concatenated_bootstraps")
    commons.makedirs(run_path)
    commons.makedirs(results_dir)
    logger.info("Writing supports commands in " + commands_path)
    with open(commands_path, "w") as out:
        for msa_name in os.listdir(best_trees_root):
            bs_file = os.path.join(bootstraps_root, msa_name + ".bs")
            # Only schedule MSAs that actually have bootstrap trees
            if os.path.exists(bs_file) and os.stat(bs_file).st_size != 0:
                best_tree = os.path.join(best_trees_root, msa_name,
                                         msa_name + ".raxml.bestTree")
                support_prefix = os.path.join(results_dir,
                                              msa_name + ".support")
                command = [
                    "support_" + msa_name + " 1 1",
                    " --support",
                    " --tree " + best_tree,
                    " --bs-trees " + bs_file,
                    " --threads 1",
                    " --prefix " + support_prefix,
                    "\n",
                ]
                out.write("".join(command))
    scheduler.run_mpi_scheduler(library, scheduler_mode, commands_path,
                                run_path, cores, op)
Example #3
0
def run(msas, output_dir, library, run_path, op):
    """ Use the MPI scheduler to run modeltest on all the valid MSAs.

    Writes one command per valid MSA to "modeltest_command.txt" and hands
    the file to the scheduler. Exits with status 1 when fewer than 4 cores
    per modeltest job are requested.
    """
    total_cores = op.cores
    # The run_path argument is overridden: the run directory is always
    # <output_dir>/modeltest_run
    run_path = os.path.join(output_dir, "modeltest_run")
    results_dir = os.path.join(run_path, "results")
    commands_path = os.path.join(run_path, "modeltest_command.txt")
    commons.makedirs(results_dir)
    if op.modeltest_cores < 4:
        print(
            "[Error] The number of cores per modeltest job should at least be 4"
        )
        sys.exit(1)
    with open(commands_path, "w") as out:
        for name, msa in msas.items():
            if msa.valid:
                job_dir = os.path.join(results_dir, name)
                commons.makedirs(job_dir)
                # Job cost given to the scheduler for this MSA
                job_cost = msa.taxa * msa.per_taxon_clv_size
                command = [
                    "modeltest_" + name + " ",
                    str(op.modeltest_cores) + " " + str(job_cost),
                    " -i ",
                    msa.path,
                    " -t mp ",
                    " -o " + os.path.join(job_dir, name),
                    " " + msa.modeltest_arguments + " ",
                    "\n",
                ]
                out.write("".join(command))
    scheduler.run_mpi_scheduler(library, op.scheduler, commands_path, run_path,
                                total_cores, op)
Example #4
0
def concatenate_bootstraps(output_dir, cores):
    """ Run concatenate_bootstrap_msa on every MSA found in the bootstraps
    directory, using a pool of at most 16 threads (on one single node).

    Returns once all submitted jobs have finished.
    """
    target_dir = os.path.join(output_dir, "concatenated_bootstraps")
    commons.makedirs(target_dir)
    source_dir = os.path.join(output_dir, "mlsearch_run", "bootstraps")
    worker_count = min(16, int(cores))
    executor = concurrent.futures.ThreadPoolExecutor(max_workers=worker_count)
    # Leaving the with-block waits for every submitted job.
    # NOTE(review): the returned futures are discarded, so an exception
    # raised inside a job is never re-raised here.
    with executor:
        for msa_name in os.listdir(source_dir):
            executor.submit(concatenate_bootstrap_msa, source_dir, target_dir,
                            msa_name)
Example #5
0
def concatenate_bootstraps(output_dir, cores):
  """ Run concatenate_bootstrap_msa on every MSA found in the bootstraps
  directory, using a pool of worker processes (on one single node).

  The pool size is the smaller of the machine's CPU count and cores.
  """
  target_dir = os.path.join(output_dir, "concatenated_bootstraps")
  commons.makedirs(target_dir)
  source_dir = os.path.join(output_dir, "mlsearch_run", "bootstraps")
  worker_count = min(multiprocessing.cpu_count(), int(cores))
  pool = multiprocessing.Pool(processes=worker_count)
  # NOTE(review): the async results are discarded, so an exception raised
  # inside a job is never re-raised here.
  for msa_name in os.listdir(source_dir):
    job_args = (source_dir, target_dir, msa_name)
    pool.apply_async(concatenate_bootstrap_msa, job_args)
  # No more jobs: wait for all the workers to finish
  pool.close()
  pool.join()
Example #6
0
def run(msas, random_trees, parsimony_trees, bootstraps, library, scheduler_mode, run_path, cores, op):
  """ Use the MPI scheduler to run raxml-ng on all the valid MSAs.

  For each valid MSA, one ML search command is written per starting tree
  (random trees first, then parsimony trees), followed by the bootstrap
  commands, grouped in chunks of 10 bootstrap trees per job.
  All the commands go to "mlsearch_command.txt" in run_path.
  """
  commands_file = os.path.join(run_path, "mlsearch_command.txt")
  results_root = os.path.join(run_path, "results")
  bootstraps_root = os.path.join(run_path, "bootstraps")
  commons.makedirs(results_root)
  total_starting_trees = random_trees + parsimony_trees
  if bootstraps != 0:
    commons.makedirs(bootstraps_root)
  bs_chunk_size = 10
  with open(commands_file, "w") as out:
    for name, msa in msas.items():
      if not msa.valid:
        continue
      # Use the binary MSA when there is one, else the original file
      alignment = msa.binary_path
      if alignment in ("", None):
        alignment = msa.path
      # Job cost given to the scheduler (constant when sorting is disabled)
      job_cost = 1 if msa.flag_disable_sorting else msa.taxa * msa.per_taxon_clv_size
      search_dir = os.path.join(results_root, name)
      commons.makedirs(search_dir)
      for tree_index in range(0, total_starting_trees):
        if total_starting_trees > 1:
          # Several searches: each one gets its own sub-directory
          run_dir = os.path.join(search_dir, "multiple_runs", str(tree_index))
          commons.makedirs(run_dir)
          prefix = os.path.join(run_dir, name)
        else:
          prefix = os.path.join(search_dir, name)
        search_command = [
          "mlsearch_" + name + "_" + str(tree_index) + " ",
          str(msa.cores) + " " + str(job_cost),
          " --msa " + alignment + " " + msa.get_raxml_arguments_str(),
          " --prefix " + prefix,
          " --threads 1 ",
        ]
        # Indices past the random trees start from a parsimony tree
        if tree_index >= random_trees:
          search_command.append(" --tree pars ")
        search_command.append(" --seed " + str(tree_index + op.seed + 1) + " ")
        search_command.append("\n")
        out.write("".join(search_command))
      bs_dir = os.path.join(bootstraps_root, name)
      commons.makedirs(bs_dir)
      # Number of chunks needed to cover all the bootstraps (0 if none)
      chunk_count = (bootstraps - 1) // bs_chunk_size + 1
      for chunk_index in range(0, chunk_count):
        chunk_name = name + "_bs" + str(chunk_index)
        # The last chunk may hold fewer than bs_chunk_size trees
        trees_in_chunk = min(bs_chunk_size, bootstraps - chunk_index * bs_chunk_size)
        bs_command = [
          chunk_name + " ",
          str(max(1, msa.cores // 2)) + " " + str(job_cost * bs_chunk_size),
          " --bootstrap",
          " --msa " + alignment + " " + msa.get_raxml_arguments_str(),
          " --prefix " + os.path.join(bs_dir, chunk_name),
          " --threads 1 ",
          " --seed " + str(chunk_index + op.seed + 1),
          " --bs-trees " + str(trees_in_chunk),
          "\n",
        ]
        out.write("".join(bs_command))
  scheduler.run_mpi_scheduler(library, scheduler_mode, commands_file, run_path, cores, op)
Example #7
0
def compute_constrain(msas, samples, raxml_library, scheduler_mode, run_path,
                      cores, op):
    """ Schedule two raxml-ng runs for each valid MSA: parsimony starting
    trees, then a strict consensus of those trees.

    Step 1 writes <run_path>/parsimony/command.txt, with one job per valid
    MSA that requests `samples` parsimony trees (--tree pars{samples}
    --start), and runs it through the scheduler.
    Step 2 writes <run_path>/consensus/command.txt, with one job per valid
    MSA that applies --consense STRICT to the ".raxml.startTree" file
    expected from step 1, and runs it through the scheduler.

    msas: dict mapping an MSA name to its MSA object
    samples: number of parsimony trees requested per MSA
    raxml_library: raxml-ng binary/library forwarded to the scheduler
    scheduler_mode: scheduler implementation; "--threads 1" is only
        written into the commands when it is not "fork"
    run_path: parent directory of the "parsimony" and "consensus" runs
    cores: number of cores forwarded to the scheduler
    op: parsed options forwarded to the scheduler
    """
    parsi_run_path = os.path.join(run_path, "parsimony")
    parsi_commands_file = os.path.join(parsi_run_path, "command.txt")
    parsi_results = os.path.join(parsi_run_path, "results")
    commons.makedirs(parsi_results)
    with open(parsi_commands_file, "w") as writer:
        for name, msa in msas.items():
            if not msa.valid:
                continue
            # Use the binary MSA when there is one, else the original file
            msa_path = msa.binary_path
            if msa_path is None or msa_path == "":
                msa_path = msa.path
            prefix = os.path.join(parsi_results, name)
            writer.write("parsi_" + name + " 1 1")
            writer.write(" --msa " + msa_path + " ")
            writer.write(" --prefix " + os.path.abspath(prefix))
            if scheduler_mode != "fork":
                writer.write(" --threads 1 ")
            writer.write(" --tree pars{" + str(samples) + "} ")
            writer.write(" --start")
            writer.write(" --model " + msa.get_model())
            writer.write("\n")
    scheduler.run_scheduler(raxml_library, scheduler_mode, "--threads",
                            parsi_commands_file, parsi_run_path, cores, op)

    consensus_run_path = os.path.join(run_path, "consensus")
    consensus_commands_file = os.path.join(consensus_run_path, "command.txt")
    consensus_results = os.path.join(consensus_run_path, "results")
    commons.makedirs(consensus_results)
    with open(consensus_commands_file, "w") as writer:
        for name, msa in msas.items():
            if not msa.valid:
                continue
            prefix = os.path.join(consensus_results, name)
            # Starting trees expected from the parsimony step above
            trees = os.path.join(parsi_results, name + ".raxml.startTree")
            writer.write("consensus_" + name + " 1 1")
            writer.write(" --prefix " + os.path.abspath(prefix))
            if scheduler_mode != "fork":
                writer.write(" --threads 1 ")
            writer.write(" --tree " + trees)
            writer.write(" --consense STRICT")
            writer.write("\n")
    scheduler.run_scheduler(raxml_library, scheduler_mode, "--threads",
                            consensus_commands_file, consensus_run_path, cores,
                            op)
Example #8
0
def run_parsing_step(msas, library, scheduler_mode, parse_run_output_dir,
                     cores, op):
    """ Run raxml-ng --parse on each MSA to check that it is valid and
    to get its dimensions.

    One command line per MSA is written to "parse_command.txt" inside
    parse_run_output_dir, then the whole file is handed to the MPI scheduler.

    msas: dict mapping an MSA name to its MSA object
    library: raxml-ng binary/library forwarded to the scheduler
    scheduler_mode: scheduler implementation forwarded to the scheduler
    parse_run_output_dir: directory receiving the command file and "results"
    cores: number of cores forwarded to the scheduler
    op: parsed options forwarded to the scheduler
    """
    parse_commands_file = os.path.join(parse_run_output_dir,
                                       "parse_command.txt")
    parse_run_results = os.path.join(parse_run_output_dir, "results")
    commons.makedirs(parse_run_results)
    with open(parse_commands_file, "w") as writer:
        for name, msa in msas.items():
            fasta_output_dir = os.path.join(parse_run_results, name)
            commons.makedirs(fasta_output_dir)
            # Scheduler line format: <job name> <cores> <cost> <arguments>
            command = [
                "parse_" + name + " 1 1 ",
                " --parse ",
                " --log DEBUG ",
                " --msa " + msa.path + " " + msa.raxml_arguments,
                " --prefix " + os.path.join(fasta_output_dir, name),
                " --threads 1 ",
                "\n",
            ]
            writer.write("".join(command))
    scheduler.run_mpi_scheduler(library, scheduler_mode, parse_commands_file,
                                parse_run_output_dir, cores, op)
Example #9
0
def main_raxml_runner(args, op):
    """ Run pargenes from the parsed arguments op.

    The pipeline is split into checkpointed steps. A checkpoint index is
    read once at startup and a step only runs when that index is lower than
    the step number:
      1: MSA parsing            2: modeltest + second parsing
      3: ML search              4: best ML tree selection
      5: bootstrap concatenation  6: support values
      7: astral
    Steps 2 and 4-7 additionally depend on the options in op.

    args is only forwarded to print_header.
    Exits with status 1 if the output directory already exists and
    op.do_continue is not set. Returns 0 otherwise (including dry runs).
    """
    # NOTE(review): start is assigned but never read in this function
    start = time.time()
    output_dir = op.output_dir
    # The checkpoint is read before checking that output_dir may be reused
    checkpoint_index = checkpoint.read_checkpoint(output_dir)
    if (os.path.exists(output_dir) and not op.do_continue):
        # NOTE(review): logger is used here before logger.init_logger below
        logger.info(
            "[Error] The output directory " + output_dir +
            " already exists. Please use another output directory or run with --continue."
        )
        sys.exit(1)
    commons.makedirs(output_dir)
    logger.init_logger(op.output_dir)
    print_header(args)
    msas = None
    logger.timed_log("end of MSAs initializations")
    scriptdir = os.path.dirname(os.path.realpath(__file__))
    modeltest_run_path = os.path.join(output_dir, "modeltest_run")
    raxml_run_path = os.path.join(output_dir, "mlsearch_run")
    # Default binaries live in ../pargenes_binaries relative to this script
    binaries_dir = os.path.join(scriptdir, "..", "pargenes_binaries")
    print("Binaries directory: " + binaries_dir)
    # The "split" scheduler uses the MPI shared libraries, the other
    # schedulers use the plain executables
    if (op.scheduler != "split"):
        raxml_library = os.path.join(binaries_dir, "raxml-ng")
        modeltest_library = os.path.join(binaries_dir, "modeltest-ng")
    else:
        raxml_library = os.path.join(binaries_dir, "raxml-ng-mpi.so")
        modeltest_library = os.path.join(binaries_dir, "modeltest-ng-mpi.so")
    astral_jar = os.path.join(binaries_dir, "astral.jar")
    # User-provided paths override the defaults (only when longer than one
    # character)
    if (len(op.raxml_binary) > 1):
        raxml_library = op.raxml_binary
    if (len(op.modeltest_binary) > 1):
        modeltest_library = op.modeltest_binary
    if (len(op.astral_jar) > 1):
        astral_jar = op.astral_jar
    astral_jar = os.path.abspath(astral_jar)
    # Step 1: parse the MSAs, or reload them when the step was already done
    if (checkpoint_index < 1):
        msas = commons.init_msas(op)
        raxml.run_parsing_step(msas, raxml_library, op.scheduler,
                               os.path.join(output_dir, "parse_run"), op.cores,
                               op)
        raxml.analyse_parsed_msas(msas, op)
        checkpoint.write_checkpoint(output_dir, 1)
        logger.timed_log("end of parsing mpi-scheduler run")
    else:
        msas = raxml.load_msas(op)
    if (op.dry_run):
        logger.info("End of the dry run. Exiting")
        return 0
    logger.timed_log("end of anlysing parsing results")
    # Step 2: model selection followed by a second parsing step
    if (op.use_modeltest):
        if (checkpoint_index < 2):
            modeltest.run(msas, output_dir, modeltest_library,
                          modeltest_run_path, op)
            logger.timed_log("end of modeltest mpi-scheduler run")
            modeltest.parse_modeltest_results(op.modeltest_criteria, msas,
                                              output_dir)
            logger.timed_log("end of parsing  modeltest results")
            # then recompute the binary MSA files to put the correct model, and reevaluate the MSA sizes with the new models
            shutil.move(os.path.join(output_dir, "parse_run"),
                        os.path.join(output_dir, "old_parse_run"))
            raxml.run_parsing_step(msas, raxml_library, op.scheduler,
                                   os.path.join(output_dir, "parse_run"),
                                   op.cores, op)
            raxml.analyse_parsed_msas(msas, op)
            logger.timed_log("end of the second parsing step")
            checkpoint.write_checkpoint(output_dir, 2)
    # Step 3: ML searches (the bootstrap jobs are scheduled in the same run)
    if (checkpoint_index < 3):
        raxml.run(msas, op.random_starting_trees, op.parsimony_starting_trees,
                  op.bootstraps, raxml_library, op.scheduler, raxml_run_path,
                  op.cores, op)
        logger.timed_log("end of mlsearch mpi-scheduler run")
        checkpoint.write_checkpoint(output_dir, 3)
    # Step 4: only needed when several starting trees were used
    if (op.random_starting_trees + op.parsimony_starting_trees > 1):
        if (checkpoint_index < 4):
            raxml.select_best_ml_tree(msas, op)
            logger.timed_log("end of selecting the best ML tree")
            checkpoint.write_checkpoint(output_dir, 4)
    # Steps 5 and 6: bootstrap concatenation and support values
    if (op.bootstraps != 0):
        if (checkpoint_index < 5):
            bootstraps.concatenate_bootstraps(output_dir, min(16, op.cores))
            logger.timed_log("end of bootstraps concatenation")
            checkpoint.write_checkpoint(output_dir, 5)
        starting_trees = op.random_starting_trees + op.parsimony_starting_trees
        # The supports step is skipped when there is no starting tree
        if (checkpoint_index < 6 and starting_trees > 0):
            bootstraps.run(msas, output_dir, raxml_library, op.scheduler,
                           os.path.join(output_dir, "supports_run"), op.cores,
                           op)
            logger.timed_log("end of supports mpi-scheduler run")
            checkpoint.write_checkpoint(output_dir, 6)
    # Step 7: astral
    if (op.use_astral):
        if (checkpoint_index < 7):
            astral.run_astral_pargenes(astral_jar, op)
            checkpoint.write_checkpoint(output_dir, 7)
    # Report an error when no MSA is valid (also the case when msas is empty)
    all_invalid = True
    for name, msa in msas.items():
        if (msa.valid):
            all_invalid = False
    if (all_invalid):
        print("[Error] ParGenes failed to analyze all MSAs.")
        # presumably terminates the process with exit code 1 -- TODO confirm
        report.report_and_exit(op.output_dir, 1)
    print_stats(op)
    return 0
Example #10
0
def main_raxml_runner(op):
    """ Run pargenes from the parsed arguments op.

    The pipeline is split into checkpointed steps. A checkpoint index is
    read once at startup and a step only runs when that index is lower than
    the step number:
      1: MSA parsing            2: modeltest + second parsing
      3: ML search              4: best ML tree selection
      5: bootstrap concatenation  6: support values
    Steps 2 and 4-6 additionally depend on the options in op.

    Exits with status 1 if the output directory already exists and
    op.do_continue is not set. Returns 0 otherwise (including dry runs).
    """
    # Reference time passed to every timed_print call below
    start = time.time()
    output_dir = op.output_dir
    # The checkpoint is read before checking that output_dir may be reused
    checkpoint_index = checkpoint.read_checkpoint(output_dir)
    print("Checkpoint: " + str(checkpoint_index))
    if (os.path.exists(output_dir) and not op.do_continue):
        print(
            "[Error] The output directory " + output_dir +
            " already exists. Please use another output directory or run with --continue."
        )
        sys.exit(1)
    commons.makedirs(output_dir)
    logs = commons.get_log_file(output_dir, "pargenes_logs")
    print("Redirecting logs to " + logs)
    # From here on, every print goes to the log file.
    # NOTE(review): the file is never closed and sys.stdout is never
    # restored in this function
    sys.stdout = open(logs, "w")
    print_header()
    msas = commons.init_msas(op)
    timed_print(start, "end of MSAs initializations")
    scriptdir = os.path.dirname(os.path.realpath(__file__))
    modeltest_run_path = os.path.join(output_dir, "modeltest_run")
    raxml_run_path = os.path.join(output_dir, "mlsearch_run")
    # The "onecore" scheduler uses the plain executables, the other
    # schedulers use the MPI shared libraries
    if (op.scheduler == "onecore"):
        raxml_library = os.path.join(scriptdir, "..", "raxml-ng", "bin",
                                     "raxml-ng")
        modeltest_library = os.path.join(scriptdir, "..", "modeltest", "bin",
                                         "modeltest-ng")
    else:
        raxml_library = os.path.join(scriptdir, "..", "raxml-ng", "bin",
                                     "raxml-ng-mpi.so")
        modeltest_library = os.path.join(scriptdir, "..", "modeltest", "build",
                                         "src", "modeltest-ng-mpi.so")
    # Step 1: parse the MSAs
    if (checkpoint_index < 1):
        raxml.run_parsing_step(msas, raxml_library, op.scheduler,
                               os.path.join(output_dir, "parse_run"), op.cores,
                               op)
        checkpoint.write_checkpoint(output_dir, 1)
        timed_print(start, "end of parsing mpi-scheduler run")
    # The parsing results are analysed on every run, even after a restart
    raxml.analyse_parsed_msas(msas, op, output_dir)
    if (op.dry_run):
        print("End of the dry run. Exiting")
        return 0
    timed_print(start, "end of anlysing parsing results")
    # Step 2: model selection followed by a second parsing step
    if (op.use_modeltest):
        if (checkpoint_index < 2):
            modeltest.run(msas, output_dir, modeltest_library,
                          modeltest_run_path, op)
            timed_print(start, "end of modeltest mpi-scheduler run")
            modeltest.parse_modeltest_results(op.modeltest_criteria, msas,
                                              output_dir)
            timed_print(start, "end of parsing  modeltest results")
            # then recompute the binary MSA files to put the correct model, and reevaluate the MSA sizes with the new models
            shutil.move(os.path.join(output_dir, "parse_run"),
                        os.path.join(output_dir, "old_parse_run"))
            raxml.run_parsing_step(msas, raxml_library, op.scheduler,
                                   os.path.join(output_dir, "parse_run"),
                                   op.cores, op)
            raxml.analyse_parsed_msas(msas, op, output_dir)
            timed_print(start, "end of the second parsing step")
            checkpoint.write_checkpoint(output_dir, 2)
    # Step 3: ML searches (the bootstrap jobs are scheduled in the same run)
    if (checkpoint_index < 3):
        raxml.run(msas, op.random_starting_trees, op.parsimony_starting_trees,
                  op.bootstraps, raxml_library, op.scheduler, raxml_run_path,
                  op.cores, op)
        timed_print(start, "end of mlsearch mpi-scheduler run")
        checkpoint.write_checkpoint(output_dir, 3)
    # Step 4: only needed when several starting trees were used
    if (op.random_starting_trees + op.parsimony_starting_trees > 1):
        if (checkpoint_index < 4):
            raxml.select_best_ml_tree(msas, op)
            timed_print(start, "end of selecting the best ML tree")
            checkpoint.write_checkpoint(output_dir, 4)
    # Steps 5 and 6: bootstrap concatenation and support values
    if (op.bootstraps != 0):
        if (checkpoint_index < 5):
            bootstraps.concatenate_bootstraps(output_dir, min(16, op.cores))
            timed_print(start, "end of bootstraps concatenation")
            checkpoint.write_checkpoint(output_dir, 5)
        if (checkpoint_index < 6):
            bootstraps.run(output_dir, raxml_library, op.scheduler,
                           os.path.join(output_dir, "supports_run"), op.cores,
                           op)
            timed_print(start, "end of supports mpi-scheduler run")
            checkpoint.write_checkpoint(output_dir, 6)
    return 0