def __init__(self, threads: int = 1, model='GTRCAT', bootstrap=0, verbose=False, additional_args=None):
    """Initialises the object and assembles the base rapidnj command.

    Args:
        threads: Number of threads; passed to rapidnj after the ``-c`` flag.
            Defaults to 1 (the previous default was the ``int`` type itself,
            which rendered as ``<class 'int'>`` on the command line).
        model: Substitution model name. ``'JC'`` is passed as ``jc`` and
            ``'K2P'`` as ``kim``; any other value is passed through unchanged
            after ``-a``.
        bootstrap: Stored on the instance; not used when building the base
            command here.
        verbose: Stored on the instance.
        additional_args: Optional extra argument string, appended to the
            command as a single element when not None.

    Raises:
        SystemExit: If no ``rapidnj`` executable is found by ``utils.which``.
    """
    self.verbose = verbose
    self.threads = threads
    self.tree_prefix = ""
    self.tree_suffix = ".tre"
    self.alignment_suffix = ".snp_sites.aln"
    self.model = model
    self.additional_args = additional_args
    self.bootstrap = bootstrap

    # Construct command
    self.executable = "rapidnj"
    if utils.which(self.executable) is None:
        sys.exit("No usable version of rapidnj could be found.")

    # Reproducibility
    self.version = "unspecified"
    self.citation = "https://doi.org/10.1007/978-3-540-87361-7_10"

    command = [self.executable]
    # Fixed flags used on every invocation. Some elements contain a space
    # ("-i fa"); presumably the list is later joined into one shell string
    # - verify against the caller before switching to an argv-style call.
    command.extend(["-i fa", "-t d", "-n"])
    command.extend(["-c", str(self.threads)])

    # Translate the model names that have a different rapidnj spelling;
    # anything else is handed to rapidnj verbatim.
    model_aliases = {'JC': 'jc', 'K2P': 'kim'}
    command.extend(["-a", model_aliases.get(self.model, self.model)])

    # Additional arguments
    if self.additional_args is not None:
        command.extend([self.additional_args])
    self.base_command = command
def __init__(self, threads: int, model: str, bootstrap=0, internal_node_prefix="", verbose=False,
             additional_args=None):
    """Initialises the object and assembles the base iqtree command.

    Args:
        threads: Number of threads; passed to iqtree after the ``-nt`` flag.
        model: Substitution model name. ``'GTRGAMMA'`` is passed as
            ``GTR+G4``; every other value (including ``'JC'``, ``'K2P'``,
            ``'HKY'`` and ``'GTR'``) is passed through unchanged after ``-m``.
        bootstrap: Stored on the instance; not used when building the base
            command here.
        internal_node_prefix: Stored on the instance.
        verbose: Stored on the instance.
        additional_args: Optional extra argument string, appended to the
            command as a single element when not None.

    Raises:
        SystemExit: If no ``iqtree`` executable is found by ``utils.which``.
    """
    self.verbose = verbose
    self.threads = threads
    self.model = model
    self.tree_prefix = ""
    self.tree_suffix = ".treefile"
    self.asr_prefix = ""
    self.asr_suffix = ".state"
    self.asr_tree_prefix = ""
    self.asr_tree_suffix = ".treefile"
    self.alignment_suffix = ".phylip"
    self.internal_node_prefix = internal_node_prefix
    self.bootstrap = bootstrap
    self.additional_args = additional_args

    # Construct base command
    self.executable = "iqtree"
    if utils.which(self.executable) is None:
        sys.exit("No usable version of IQTree could be found.")
    command = [self.executable]

    # Reproducibility (the version is only queried once the executable is known to exist)
    self.version = self.get_version(self.executable)
    self.citation = "https://doi.org/10.1093/molbev/msaa015"

    # Set parallelisation
    command.extend(["-nt", str(self.threads)])

    # Add flags
    command.extend(["-safe"])

    # Only GTRGAMMA needs translating; all other model names are handed to
    # iqtree exactly as given.
    model_aliases = {'GTRGAMMA': 'GTR+G4'}
    command.extend(["-m", model_aliases.get(self.model, self.model)])

    # Additional arguments
    if self.additional_args is not None:
        command.extend([self.additional_args])
    self.base_command = command
def __init__(self, threads: int, internal_node_prefix="", verbose=False):
    """Store the run options and the fixed IQTree parameter lists.

    Args:
        threads: Stored on the instance.
        internal_node_prefix: Stored on the instance.
        verbose: Stored on the instance.

    Raises:
        SystemExit: If no ``iqtree`` executable is found by ``utils.which``.
    """
    self.verbose, self.threads = verbose, threads

    # File-naming conventions: no prefixes, tree files end in ".treefile"
    # and ancestral state files in ".state"
    self.tree_prefix = ""
    self.tree_suffix = ".treefile"
    self.asr_prefix = ""
    self.asr_suffix = ".state"
    self.asr_tree_prefix = ""
    self.asr_tree_suffix = ".treefile"
    self.internal_node_prefix = internal_node_prefix

    # Refuse to continue without the external program
    self.executable = "iqtree"
    located_executable = utils.which(self.executable)
    if located_executable is None:
        sys.exit("No usable version of IQTree could be found.")

    # Each parameter set is kept as a one-element list holding the flag string
    self.tree_building_parameters = ["-safe -m GTR+G4"]
    self.internal_sequence_reconstruction_parameters = ["-safe -asr -m GTR+G4"]
def test_is_executable(self):
    """utils.is_executable accepts the path found for `ls` and rejects an unknown name."""
    ls_location = utils.which('ls')
    known_is_executable = utils.is_executable(ls_location)
    assert known_is_executable

    unknown_is_executable = utils.is_executable('non_existent_program')
    assert not unknown_is_executable
def test_which(self):
    """utils.which finds `ls` (with or without parameters) and returns None for an unknown name."""
    # The location of ls varies depending on OS, so only the end of the path
    # is checked. The second query carries parameters, which must be stripped.
    for query in ('ls', 'ls -alrt'):
        assert re.match('.*/ls$', utils.which(query)) is not None

    assert utils.which('non_existent_program') is None
def parse_and_run(input_args, program_description=""):
    """Main function of the Gubbins program.

    Runs the whole pipeline: locates the Gubbins executable, instantiates the
    tree builder / ancestral sequence reconstructor, validates and filters the
    input, then iterates (build tree -> re-root -> reconstruct ancestral
    sequences -> reinsert gaps -> detect recombinations -> check convergence)
    until convergence or ``input_args.iterations`` is reached, and finally
    renames the output files and removes intermediate files.

    Args:
        input_args: Parsed options object. The attributes read here are
            ``starting_tree``, ``tree_builder``, ``verbose``, ``threads``,
            ``raxml_model``, ``alignment_filename``, ``use_time_stamp``,
            ``no_cleanup``, ``filter_percentage``,
            ``remove_identical_sequences``, ``iterations``, ``outgroup``,
            ``min_snps``, ``min_window_size``, ``max_window_size``,
            ``converge_method`` and ``prefix``. Note that
            ``alignment_filename``, ``starting_tree`` and ``prefix`` are
            overwritten on this object during the run.
        program_description: Text printed below the welcome banner.

    Raises:
        SystemExit: On any missing dependency, invalid input, leftover
            intermediate files, or failure of an external command.
    """
    start_time = time.time()
    current_directory = os.getcwd()
    printer = utils.VerbosePrinter(True, "\n")

    # Check if the Gubbins C-program is available. If so, print a welcome message. Otherwise exit.
    gubbins_exec = 'gubbins'
    if utils.which(gubbins_exec) is None:
        # Check if the Gubbins C-program is available in its source directory (for tests/Travis)
        gubbins_bundled_exec = os.path.abspath(os.path.join(current_directory, '../src/gubbins'))
        if utils.which(gubbins_bundled_exec) is None:
            sys.exit(gubbins_exec + " is not in your path")
        else:
            gubbins_exec = utils.replace_executable(gubbins_exec, gubbins_bundled_exec)
    # The version is only cosmetic (banner), so the lookup is best-effort: neither an
    # unparsable name (e.g. a bundled executable path) nor a distribution that is not
    # installed may abort the run.
    program_version = ""
    try:
        program_version = str(pkg_resources.get_distribution(gubbins_exec).version)
    except (pkg_resources.RequirementParseError, pkg_resources.DistributionNotFound):
        pass
    printer.print(["\n--- Gubbins " + program_version + " ---\n", program_description])

    # Initialize tree builder and ancestral sequence reconstructor; check if all required dependencies are available
    printer.print("\nChecking dependencies...")
    current_tree_name = input_args.starting_tree
    tree_file_names = []
    internal_node_label_prefix = "internal_"
    if input_args.tree_builder == "fasttree" or input_args.tree_builder == "hybrid":
        tree_builder = FastTree(input_args.verbose)
        sequence_reconstructor = RAxML(input_args.threads, input_args.raxml_model, internal_node_label_prefix,
                                       input_args.verbose)
        alignment_suffix = ".snp_sites.aln"
    elif input_args.tree_builder == "raxml":
        tree_builder = RAxML(input_args.threads, input_args.raxml_model, internal_node_label_prefix,
                             input_args.verbose)
        sequence_reconstructor = tree_builder
        alignment_suffix = ".phylip"
    else:
        tree_builder = IQTree(input_args.threads, internal_node_label_prefix, input_args.verbose)
        sequence_reconstructor = tree_builder
        alignment_suffix = ".phylip"
    printer.print("...done. Run time: {:.2f} s".format(time.time() - start_time))

    # Check if the input files exist and have the right format
    printer.print("\nChecking input files...")
    if not os.path.exists(input_args.alignment_filename) \
            or not ValidateFastaAlignment(input_args.alignment_filename).is_input_fasta_file_valid():
        sys.exit("The input alignment file does not exist or has an invalid format")
    if input_args.starting_tree is not None and input_args.starting_tree != "" \
            and (not os.path.exists(input_args.starting_tree)
                 or not is_starting_tree_valid(input_args.starting_tree)):
        sys.exit("The starting tree does not exist or has an invalid format")
    if input_args.starting_tree is not None and input_args.starting_tree != "" \
            and not do_the_names_match_the_fasta_file(input_args.starting_tree, input_args.alignment_filename):
        sys.exit("The names in the starting tree do not match the names in the alignment file")
    if number_of_sequences_in_alignment(input_args.alignment_filename) < 3:
        sys.exit("3 or more sequences are required.")

    # Check - and potentially correct - further input parameters
    check_and_fix_window_size(input_args)

    # Get the base filename
    base_filename = os.path.split(input_args.alignment_filename)[1]
    basename = os.path.splitext(base_filename)[0]
    if input_args.use_time_stamp:
        time_stamp = str(int(time.time()))
        basename = basename + "." + time_stamp
    snp_alignment_filename = base_filename + ".snp_sites.aln"
    gaps_alignment_filename = base_filename + ".gaps.snp_sites.aln"
    gaps_vcf_filename = base_filename + ".gaps.vcf"
    joint_sequences_filename = base_filename + ".seq.joint.aln"

    # Check if intermediate files from a previous run exist
    intermediate_files = [basename + ".iteration_"]
    if not input_args.no_cleanup:
        utils.delete_files(".", intermediate_files, "", input_args.verbose)
    if utils.do_files_exist(".", intermediate_files, "", input_args.verbose):
        sys.exit("Intermediate files from a previous run exist. Please rerun without the --no_cleanup option "
                 "to automatically delete them or with the --use_time_stamp to add a unique prefix.")
    printer.print("...done. Run time: {:.2f} s".format(time.time() - start_time))

    # Filter the input alignment and save as temporary alignment file
    printer.print("\nFiltering input alignment...")
    temp_working_dir = tempfile.mkdtemp(dir=os.getcwd())
    temp_alignment_filename = temp_working_dir + "/" + base_filename
    pre_process_fasta = PreProcessFasta(input_args.alignment_filename, input_args.verbose,
                                        input_args.filter_percentage)
    taxa_removed = pre_process_fasta.remove_duplicate_sequences_and_sequences_missing_too_much_data(
        temp_alignment_filename, input_args.remove_identical_sequences)
    input_args.alignment_filename = temp_alignment_filename

    # If a starting tree has been provided make sure that taxa filtered out in the previous step are removed from it
    if input_args.starting_tree:
        tree_base_filename = os.path.split(input_args.starting_tree)[1]
        temp_starting_tree = temp_working_dir + '/' + tree_base_filename
        filter_out_removed_taxa_from_tree(input_args.starting_tree, temp_starting_tree, taxa_removed)
        input_args.starting_tree = temp_starting_tree
    printer.print("...done. Run time: {:.2f} s".format(time.time() - start_time))

    # Find all SNP sites with Gubbins
    # NOTE(review): the command is a plain string run through the shell, so an alignment
    # path containing spaces or shell metacharacters will not survive - consider quoting.
    gubbins_command = " ".join([gubbins_exec, input_args.alignment_filename])
    printer.print(["\nRunning Gubbins to detect SNPs...", gubbins_command])
    try:
        subprocess.check_call(gubbins_command, shell=True)
    except subprocess.SubprocessError:
        sys.exit("Gubbins crashed, please ensure you have enough free memory")
    printer.print("...done. Run time: {:.2f} s".format(time.time() - start_time))
    reconvert_fasta_file(snp_alignment_filename, snp_alignment_filename)
    reconvert_fasta_file(gaps_alignment_filename, base_filename + ".start")

    # Start the main loop
    printer.print("\nEntering the main loop.")
    for i in range(1, input_args.iterations+1):
        printer.print("\n*** Iteration " + str(i) + " ***")

        # 1.1. Construct the tree-building command depending on the iteration and employed options
        if i == 2 and input_args.tree_builder == "hybrid":
            # Switch to RAxML
            tree_builder = sequence_reconstructor
            alignment_suffix = ".phylip"

        if i == 1:
            previous_tree_name = input_args.starting_tree
            alignment_filename = base_filename + alignment_suffix
        else:
            previous_tree_name = current_tree_name
            alignment_filename = previous_tree_name + alignment_suffix

        current_basename = basename + ".iteration_" + str(i)
        current_tree_name = current_basename + ".tre"
        if previous_tree_name:
            tree_building_command = tree_builder.tree_building_command(
                os.path.abspath(alignment_filename), os.path.abspath(previous_tree_name), current_basename)
        else:
            tree_building_command = tree_builder.tree_building_command(
                os.path.abspath(alignment_filename), "", current_basename)
        built_tree = temp_working_dir + "/" + tree_builder.tree_prefix + current_basename + tree_builder.tree_suffix

        # 1.2. Construct the phylogenetic tree
        # Truthiness (not "is not None") so that an empty starting-tree name is treated as
        # "no starting tree", exactly as in the validation and filtering steps above.
        if input_args.starting_tree and i == 1:
            printer.print("\nCopying the starting tree...")
            shutil.copyfile(input_args.starting_tree, current_tree_name)
        else:
            printer.print(["\nConstructing the phylogenetic tree with " + tree_builder.executable + "...",
                           tree_building_command])
            # The tree builder is run from inside the temporary directory
            os.chdir(temp_working_dir)
            try:
                subprocess.check_call(tree_building_command, shell=True)
            except subprocess.SubprocessError:
                sys.exit("Failed while building the tree.")
            os.chdir(current_directory)
            shutil.copyfile(built_tree, current_tree_name)
        printer.print("...done. Run time: {:.2f} s".format(time.time() - start_time))

        # 2. Re-root the tree
        reroot_tree(str(current_tree_name), input_args.outgroup)
        temp_rooted_tree = temp_working_dir + "/" + current_tree_name + ".rooted"
        if input_args.tree_builder == "iqtree":
            shutil.copyfile(current_tree_name, temp_rooted_tree)
        else:
            root_tree(current_tree_name, temp_rooted_tree)

        # 3.1. Construct the command for ancestral state reconstruction depending on the iteration and employed options
        ancestral_sequence_basename = current_basename + ".internal"
        sequence_reconstruction_command = sequence_reconstructor.internal_sequence_reconstruction_command(
            os.path.abspath(base_filename + alignment_suffix), os.path.abspath(temp_rooted_tree),
            ancestral_sequence_basename)
        raw_internal_sequence_filename \
            = temp_working_dir + "/" + sequence_reconstructor.asr_prefix \
            + ancestral_sequence_basename + sequence_reconstructor.asr_suffix
        processed_internal_sequence_filename = temp_working_dir + "/" + ancestral_sequence_basename + ".aln"
        raw_internal_rooted_tree_filename \
            = temp_working_dir + "/" + sequence_reconstructor.asr_tree_prefix \
            + ancestral_sequence_basename + sequence_reconstructor.asr_tree_suffix

        # 3.2. Reconstruct the ancestral sequence
        printer.print(["\nReconstructing ancestral sequences with " + sequence_reconstructor.executable + "...",
                       sequence_reconstruction_command])
        os.chdir(temp_working_dir)
        try:
            subprocess.check_call(sequence_reconstruction_command, shell=True)
        except subprocess.SubprocessError:
            sys.exit("Failed while reconstructing the ancestral sequences.")
        os.chdir(current_directory)

        # 3.3. Join ancestral sequences with given sequences
        current_tree_name_with_internal_nodes = current_tree_name + ".internal"
        sequence_reconstructor.convert_raw_ancestral_states_to_fasta(raw_internal_sequence_filename,
                                                                     processed_internal_sequence_filename)
        concatenate_fasta_files([snp_alignment_filename, processed_internal_sequence_filename],
                                joint_sequences_filename)
        transfer_internal_node_labels_to_tree(raw_internal_rooted_tree_filename, temp_rooted_tree,
                                              current_tree_name_with_internal_nodes, sequence_reconstructor)
        printer.print("...done. Run time: {:.2f} s".format(time.time() - start_time))

        # 4. Reinsert gaps (cp15 note: something is wonky here, the process is at the very least terribly inefficient)
        printer.print("\nReinserting gaps into the alignment...")
        shutil.copyfile(base_filename + ".start", gaps_alignment_filename)
        reinsert_gaps_into_fasta_file(joint_sequences_filename, gaps_vcf_filename, gaps_alignment_filename)
        if not os.path.exists(gaps_alignment_filename) \
                or not ValidateFastaAlignment(gaps_alignment_filename).is_input_fasta_file_valid():
            sys.exit("There is a problem with your FASTA file after running internal sequence reconstruction. "
                     "Please check this intermediate file is valid: " + gaps_alignment_filename)
        printer.print("...done. Run time: {:.2f} s".format(time.time() - start_time))

        # 5. Detect recombination sites with Gubbins (cp15 note: copy file with internal nodes back and forth to
        # ensure all created files have the desired name structure and to avoid fiddling with the Gubbins C program)
        shutil.copyfile(current_tree_name_with_internal_nodes, current_tree_name)
        gubbins_command = create_gubbins_command(
            gubbins_exec, gaps_alignment_filename, gaps_vcf_filename, current_tree_name,
            input_args.alignment_filename, input_args.min_snps, input_args.min_window_size,
            input_args.max_window_size)
        printer.print(["\nRunning Gubbins to detect recombinations...", gubbins_command])
        try:
            subprocess.check_call(gubbins_command, shell=True)
        except subprocess.SubprocessError:
            sys.exit("Failed while running Gubbins. Please ensure you have enough free memory")
        printer.print("...done. Run time: {:.2f} s".format(time.time() - start_time))
        shutil.copyfile(current_tree_name, current_tree_name_with_internal_nodes)

        # 6. Check for convergence
        printer.print("\nChecking for convergence...")
        remove_internal_node_labels_from_tree(current_tree_name_with_internal_nodes, current_tree_name)
        tree_file_names.append(current_tree_name)
        if i > 1:
            if input_args.converge_method == 'recombination':
                current_recomb_file, previous_recomb_files = get_recombination_files(tree_file_names)
                if have_recombinations_been_seen_before(current_recomb_file, previous_recomb_files):
                    printer.print("Convergence after " + str(i) + " iterations: Recombinations observed before.")
                    break
            else:
                if has_tree_been_seen_before(tree_file_names, input_args.converge_method):
                    printer.print("Convergence after " + str(i) + " iterations: Tree observed before.")
                    break
        printer.print("...done. Run time: {:.2f} s".format(time.time() - start_time))
    else:
        # for/else: only reached when the loop finished without a convergence "break"
        printer.print("Maximum number of iterations (" + str(input_args.iterations) + ") reached.")
    printer.print("\nExiting the main loop.")

    # Create the final output
    printer.print("\nCreating the final output...")
    if input_args.prefix is None:
        input_args.prefix = basename
    output_filenames_to_final_filenames = translation_of_filenames_to_final_filenames(
        current_tree_name, input_args.prefix)
    utils.rename_files(output_filenames_to_final_filenames)

    # Cleanup intermediate files
    if not input_args.no_cleanup:
        shutil.rmtree(temp_working_dir)
        utils.delete_files(".", tree_file_names[:-1], intermediate_files_regex(), input_args.verbose)
        utils.delete_files(".", [base_filename], starting_files_regex(), input_args.verbose)
    printer.print("...finished. Total run time: {:.2f} s".format(time.time() - start_time))