def test_run_raxml_ancestor_reconstruction(self):
    # Run the ancestral reconstruction on the small fixture alignment and
    # unrooted tree, then check that the requested output alignment exists.
    raxml_seq_recon = RAxMLSequenceReconstruction(
        'gubbins/tests/data/raxml_sequence_reconstruction/input_alignment.fasta',
        'gubbins/tests/data/raxml_sequence_reconstruction/unrooted_tree.newick',
        'outputfile',
        'output_tree',
        RAxMLExecutable(1).internal_sequence_reconstruction_command(),
        False,
    )
    raxml_seq_recon.reconstruct_ancestor_sequences()

    # Use the TestCase assertion rather than a bare ``assert`` so the check
    # is not stripped when Python runs with -O, and to match sibling tests.
    self.assertTrue(os.path.exists('outputfile'))
def test_more_complex_tree(self):
    # Run the ancestral reconstruction on the multiple-recombinations fixture
    # and check that both the output alignment and the output tree exist.
    raxml_seq_recon = RAxMLSequenceReconstruction(
        'gubbins/tests/data/multiple_recombinations.aln',
        'gubbins/tests/data/expected_RAxML_result.multiple_recombinations.iteration_5.output_tree',
        'output_alignment',
        'output_tree',
        RAxMLExecutable(1).internal_sequence_reconstruction_command(),
        False,
    )
    raxml_seq_recon.reconstruct_ancestor_sequences()

    # TestCase assertions survive ``python -O`` (bare asserts do not) and
    # match the style of the other tests in this class.
    self.assertTrue(os.path.exists('output_alignment'))
    self.assertTrue(os.path.exists('output_tree'))
def parse_and_run(self):
    """Run the iterative tree-building / Gubbins pipeline configured by ``self.args``.

    The method, in order:

    1. Resolves the external executables and validates the options, the
       input alignment and (if given) the starting tree.
    2. Writes a filtered copy of the alignment into a temporary working
       directory created under the current directory and repoints
       ``self.args.alignment_filename`` (and ``self.args.starting_tree``)
       at the filtered versions.
    3. Runs the gubbins executable once on the filtered alignment.
    4. For each of up to ``self.args.iterations`` iterations: builds (or, on
       iteration 1 with a starting tree, copies) a tree, re-roots it,
       reconstructs ancestral sequences, reinserts gaps and runs the
       per-iteration gubbins command. From iteration 3 onwards the loop
       stops early when the configured convergence test succeeds.
    5. Unless ``--no_cleanup`` is set, deletes intermediate files and the
       temporary directory, then renames the last iteration's outputs to
       the ``self.args.prefix`` names.

    Side effects: mutates ``self.args.alignment_filename``,
    ``self.args.starting_tree`` and ``self.args.prefix`` (when ``None``);
    creates, renames and deletes files in the current directory.

    Raises:
        SystemExit: via ``sys.exit`` on any validation or execution failure.
    """
    # Default parameters
    raxml_executable_obj = RAxMLExecutable(self.args.threads, self.args.raxml_model, self.args.verbose)
    fasttree_executables = ["FastTree", "fasttree"]
    FASTTREE_EXEC = GubbinsCommon.choose_executable(fasttree_executables)
    FASTTREE_PARAMS = "-nosupport -gtr -gamma -nt"
    GUBBINS_EXEC = "gubbins"
    GUBBINS_BUNDLED_EXEC = "../src/gubbins"

    # Check that all the external executable dependencies are available.
    if GubbinsCommon.which(GUBBINS_EXEC) is None:
        GUBBINS_EXEC = GubbinsCommon.use_bundled_exec(GUBBINS_EXEC, GUBBINS_BUNDLED_EXEC)
        if GubbinsCommon.which(GUBBINS_EXEC) is None:
            sys.exit(GUBBINS_EXEC + " is not in your path")

    if self.args.tree_builder == "fasttree" or self.args.tree_builder == "hybrid":
        if GubbinsCommon.which(FASTTREE_EXEC) is None:
            sys.exit("FastTree is not in your path")

    if self.args.converge_method not in ["weighted_robinson_foulds", "robinson_foulds", "recombination"]:
        sys.exit(
            "Please choose weighted_robinson_foulds, robinson_foulds or recombination for the --converge_method option"
        )

    if (
        GubbinsCommon.does_file_exist(self.args.alignment_filename, "Alignment File") == 0
        or not ValidateFastaAlignment(self.args.alignment_filename).is_input_fasta_file_valid()
    ):
        sys.exit("There is a problem with your input fasta file so nothing can be done until you fix it")

    has_starting_tree = self.args.starting_tree is not None and self.args.starting_tree != ""

    # NOTE(review): the result of is_input_starting_tree_valid() is used here
    # as a *failure* condition (truthy -> exit), unlike the neighbouring
    # helpers which are compared with ``== 0``. If that helper returns a
    # truthy value for a valid tree this test is inverted - confirm against
    # the helper's definition before changing it.
    if has_starting_tree and (
        GubbinsCommon.does_file_exist(self.args.starting_tree, "Starting Tree") == 0
        or GubbinsCommon.is_input_starting_tree_valid(self.args.starting_tree)
    ):
        sys.exit("The starting tree is invalid")

    if (
        has_starting_tree
        and GubbinsCommon.do_the_names_match_the_fasta_file(self.args.starting_tree, self.args.alignment_filename)
        == 0
    ):
        sys.exit("The names in the starting tree dont match the names in the fasta file")

    GubbinsCommon.check_and_fix_window_size(self)

    # Optional timestamp component used in the RAxML-style file names.
    current_time = ""
    if self.args.use_time_stamp > 0:
        current_time = str(int(time.time())) + "."
        if self.args.verbose > 0:
            print(current_time)

    # Get the base filename. The filtered copy below keeps the same file
    # name, so these values stay valid after alignment_filename is repointed.
    base_filename = os.path.split(self.args.alignment_filename)[1]
    base_filename_without_ext = os.path.splitext(base_filename)[0]
    starting_base_filename = base_filename

    # Put a filtered copy into a temp directory and work from that.
    temp_working_dir = tempfile.mkdtemp(dir=os.getcwd())
    filtered_alignment = temp_working_dir + "/" + starting_base_filename
    pre_process_fasta = PreProcessFasta(
        self.args.alignment_filename, self.args.verbose, self.args.filter_percentage
    )
    taxa_removed = pre_process_fasta.remove_duplicate_sequences_and_sequences_missing_too_much_data(
        filtered_alignment, self.args.remove_identical_sequences
    )
    self.args.alignment_filename = filtered_alignment

    # If a starting tree has been provided make sure that taxa filtered out
    # in the previous step are removed from the tree.
    self.args.starting_tree = GubbinsCommon.filter_out_removed_taxa_from_tree_and_return_new_file(
        self.args.starting_tree, temp_working_dir, taxa_removed
    )

    if len(base_filename) > 115:
        sys.exit(
            "Your filename is too long for RAxML at "
            + str(len(base_filename))
            + " characters, please shorten it to less than 115 characters"
        )

    # Find all snp sites.
    if self.args.verbose > 0:
        print(GUBBINS_EXEC + " " + self.args.alignment_filename)
    try:
        subprocess.check_call([GUBBINS_EXEC, self.args.alignment_filename])
    except Exception:
        # ``except Exception`` (not a bare except) so Ctrl-C and SystemExit
        # are not misreported as a Gubbins crash.
        sys.exit("Gubbins crashed, please ensure you have enough free memory")
    if self.args.verbose > 0:
        print(int(time.time()))

    GubbinsCommon.reconvert_fasta_file(
        starting_base_filename + ".gaps.snp_sites.aln", starting_base_filename + ".start"
    )

    number_of_sequences = GubbinsCommon.number_of_sequences_in_alignment(self.args.alignment_filename)
    # NOTE(review): the guard rejects fewer than 3 sequences while the
    # message asks for 4 or more - confirm which threshold is intended.
    if number_of_sequences < 3:
        sys.exit("4 or more sequences are required.")

    tree_file_names = []
    tree_building_command = ""
    gubbins_command = ""
    previous_tree_name = ""
    current_tree_name = ""
    max_iteration = 1
    cleanup_enabled = self.args.no_cleanup == 0 or self.args.no_cleanup is None

    raxml_files_to_delete = GubbinsCommon.raxml_regex_for_file_deletions(
        base_filename_without_ext, current_time, starting_base_filename, self.args.iterations
    )
    # Cleanup RAxML intermediate files left by a previous run.
    if cleanup_enabled:
        GubbinsCommon.delete_files_based_on_list_of_regexes(".", raxml_files_to_delete, self.args.verbose)
    if GubbinsCommon.check_file_exist_based_on_list_of_regexes(".", raxml_files_to_delete, self.args.verbose) == 1:
        sys.exit(
            "Intermediate files from a previous run exist. Please rerun without the --no_cleanup option to automatically delete them or with the --use_time_stamp to add a unique prefix."
        )

    def _fasttree_gubbins_command(iteration):
        # Gubbins command line for an iteration whose tree came from FastTree.
        return GubbinsCommon.fasttree_gubbins_command(
            base_filename,
            starting_base_filename + ".gaps",
            iteration,
            self.args.alignment_filename,
            GUBBINS_EXEC,
            self.args.min_snps,
            self.args.alignment_filename,
            self.args.min_window_size,
            self.args.max_window_size,
        )

    def _raxml_iteration(iteration, previous_tree):
        # Current tree name, tree-building command and gubbins command for an
        # iteration whose tree is built by RAxML from ``previous_tree``.
        current_tree = GubbinsCommon.raxml_current_tree_name(base_filename_without_ext, current_time, iteration)
        build_command = GubbinsCommon.raxml_tree_building_command(
            iteration,
            base_filename_without_ext,
            base_filename,
            current_time,
            raxml_executable_obj.tree_building_command(),
            previous_tree,
            self.args.verbose,
        )
        run_command = GubbinsCommon.raxml_gubbins_command(
            base_filename_without_ext,
            starting_base_filename + ".gaps",
            current_time,
            iteration,
            self.args.alignment_filename,
            GUBBINS_EXEC,
            self.args.min_snps,
            self.args.alignment_filename,
            self.args.min_window_size,
            self.args.max_window_size,
        )
        return current_tree, build_command, run_command

    for i in range(1, self.args.iterations + 1):
        max_iteration += 1

        if self.args.tree_builder == "hybrid":
            if i == 1:
                # Hybrid mode: the first iteration uses FastTree.
                previous_tree_name = GubbinsCommon.fasttree_previous_tree_name(base_filename, i)
                current_tree_name = GubbinsCommon.fasttree_current_tree_name(base_filename, i)
                tree_building_command = GubbinsCommon.fasttree_tree_building_command(
                    i,
                    self.args.starting_tree,
                    current_tree_name,
                    base_filename,
                    previous_tree_name,
                    FASTTREE_EXEC,
                    FASTTREE_PARAMS,
                    base_filename,
                )
                gubbins_command = _fasttree_gubbins_command(i)
            else:
                if i == 2:
                    # First RAxML iteration starts from the FastTree tree
                    # produced in iteration 1.
                    previous_tree_name = current_tree_name
                else:
                    previous_tree_name = GubbinsCommon.raxml_previous_tree_name(
                        base_filename_without_ext, base_filename, current_time, i
                    )
                current_tree_name, tree_building_command, gubbins_command = _raxml_iteration(
                    i, previous_tree_name
                )
        elif self.args.tree_builder == "raxml":
            previous_tree_name = GubbinsCommon.raxml_previous_tree_name(
                base_filename_without_ext, base_filename, current_time, i
            )
            current_tree_name, tree_building_command, gubbins_command = _raxml_iteration(i, previous_tree_name)
        elif self.args.tree_builder == "fasttree":
            previous_tree_name = GubbinsCommon.fasttree_previous_tree_name(base_filename, i)
            if i == 1:
                previous_tree_name = base_filename
            current_tree_name = GubbinsCommon.fasttree_current_tree_name(base_filename, i)
            tree_building_command = GubbinsCommon.fasttree_tree_building_command(
                i,
                self.args.starting_tree,
                current_tree_name,
                previous_tree_name,
                previous_tree_name,
                FASTTREE_EXEC,
                FASTTREE_PARAMS,
                base_filename,
            )
            gubbins_command = _fasttree_gubbins_command(i)

        if self.args.verbose > 0:
            print(tree_building_command)

        if self.args.starting_tree is not None and i == 1:
            # A user-supplied starting tree replaces the first tree build.
            shutil.copyfile(self.args.starting_tree, current_tree_name)
        else:
            try:
                subprocess.check_call(tree_building_command, shell=True)
            except Exception:
                sys.exit("Failed while building the tree.")
        if self.args.verbose > 0:
            print(int(time.time()))

        GubbinsCommon.reroot_tree(str(current_tree_name), self.args.outgroup)

        try:
            raxml_seq_recon = RAxMLSequenceReconstruction(
                starting_base_filename + ".snp_sites.aln",
                current_tree_name,
                starting_base_filename + ".seq.joint.txt",
                current_tree_name,
                raxml_executable_obj.internal_sequence_reconstruction_command(),
                self.args.verbose,
            )
            raxml_seq_recon.reconstruct_ancestor_sequences()
        except Exception:
            sys.exit("Failed while running RAxML internal sequence reconstruction")

        # Restore the saved starting alignment before gaps are reinserted.
        shutil.copyfile(starting_base_filename + ".start", starting_base_filename + ".gaps.snp_sites.aln")
        GubbinsCommon.reinsert_gaps_into_fasta_file(
            starting_base_filename + ".seq.joint.txt",
            starting_base_filename + ".gaps.vcf",
            starting_base_filename + ".gaps.snp_sites.aln",
        )

        if (
            GubbinsCommon.does_file_exist(starting_base_filename + ".gaps.snp_sites.aln", "Alignment File") == 0
            or not ValidateFastaAlignment(
                starting_base_filename + ".gaps.snp_sites.aln"
            ).is_input_fasta_file_valid()
        ):
            sys.exit(
                "There is a problem with your FASTA file after running RAxML internal sequence reconstruction. Please check this intermediate file is valid: "
                + str(starting_base_filename)
                + ".gaps.snp_sites.aln"
            )

        if self.args.verbose > 0:
            print(int(time.time()))
        if self.args.verbose > 0:
            print(gubbins_command)

        try:
            subprocess.check_call(gubbins_command, shell=True)
        except Exception:
            sys.exit("Failed while running Gubbins. Please ensure you have enough free memory")
        if self.args.verbose > 0:
            print(int(time.time()))

        tree_file_names.append(current_tree_name)

        # Convergence is only tested from the third iteration onwards.
        if i > 2:
            if self.args.converge_method == "recombination":
                current_recomb_file, previous_recomb_files = GubbinsCommon.get_recombination_files(
                    base_filename_without_ext,
                    current_time,
                    max_iteration - 1,
                    starting_base_filename,
                    self.args.tree_builder,
                )
                if GubbinsCommon.have_recombinations_been_seen_before(current_recomb_file, previous_recomb_files):
                    if self.args.verbose > 0:
                        print("Recombinations observed before so stopping: " + str(current_tree_name))
                    break
            else:
                if GubbinsCommon.has_tree_been_seen_before(tree_file_names, self.args.converge_method):
                    if self.args.verbose > 0:
                        print("Tree observed before so stopping: " + str(current_tree_name))
                    break

    # Cleanup intermediate files. max_iteration was incremented once per
    # iteration started, so max_iteration - 1 is the last iteration run.
    if cleanup_enabled:
        max_intermediate_iteration = max_iteration - 1
        raxml_files_to_delete = GubbinsCommon.raxml_regex_for_file_deletions(
            base_filename_without_ext, current_time, starting_base_filename, max_intermediate_iteration
        )
        GubbinsCommon.delete_files_based_on_list_of_regexes(".", raxml_files_to_delete, self.args.verbose)
        fasttree_files_to_delete = GubbinsCommon.fasttree_regex_for_file_deletions(
            starting_base_filename, max_intermediate_iteration
        )
        GubbinsCommon.delete_files_based_on_list_of_regexes(".", fasttree_files_to_delete, self.args.verbose)
        shutil.rmtree(temp_working_dir)
        GubbinsCommon.delete_files_based_on_list_of_regexes(
            ".", [GubbinsCommon.starting_files_regex("^" + starting_base_filename), "^log.txt"], self.args.verbose
        )

    if self.args.prefix is None:
        self.args.prefix = base_filename_without_ext

    # Map the last iteration's output files onto the final, prefix-based names.
    if self.args.tree_builder == "hybrid" or self.args.tree_builder == "raxml":
        output_filenames_to_final_filenames = GubbinsCommon.translation_of_raxml_filenames_to_final_filenames(
            base_filename_without_ext, current_time, max_iteration - 1, self.args.prefix
        )
    else:
        output_filenames_to_final_filenames = GubbinsCommon.translation_of_fasttree_filenames_to_final_filenames(
            starting_base_filename, max_iteration - 1, self.args.prefix
        )
    GubbinsCommon.rename_files(output_filenames_to_final_filenames)

    # Write a label-free copy of the final tree, then move it over the original.
    final_tree = str(self.args.prefix) + ".final_tree.tre"
    unlabelled_tree = str(self.args.prefix) + ".no_internal_labels.final_tree.tre"
    GubbinsCommon.remove_internal_node_labels_from_tree(final_tree, unlabelled_tree)
    shutil.move(unlabelled_tree, final_tree)
def test_transfer_internal_labels(self):
    # Transfer internal node names from the source tree onto the destination
    # tree and compare the written result with the expected fixture.
    raxml_seq_recon = RAxMLSequenceReconstruction('', '', '', 'output_tree', '', False)
    raxml_seq_recon.transfer_internal_names_to_tree(
        'gubbins/tests/data/source_tree.tre',
        'gubbins/tests/data/destination_tree.tre',
        'renamed_output_tree',
    )

    # TestCase assertion instead of a bare ``assert`` (which is removed under
    # ``python -O``), consistent with the comparison that follows.
    self.assertTrue(os.path.exists('renamed_output_tree'))
    self.assertTrue(
        filecmp.cmp('renamed_output_tree', 'gubbins/tests/data/expected_renamed_output_tree', shallow=False)
    )
def test_add_labels_to_tree(self):
    # Root the unrooted fixture tree into the object's temporary rooted-tree
    # path, reload it and compare its Newick serialisation.
    unrooted_fixture = 'gubbins/tests/data/raxml_sequence_reconstruction/unrooted_tree.newick'
    expected_newick = "((B:0.1,(C:0.1,(D:0.1,E:0.1))),(A:0.1,F:0.1):0.0);\n"

    raxml_seq_recon = RAxMLSequenceReconstruction('', '', '', '', '', False)
    raxml_seq_recon.root_tree(unrooted_fixture, raxml_seq_recon.temp_rooted_tree)

    tree = dendropy.Tree.get_from_path(
        raxml_seq_recon.temp_rooted_tree,
        'newick',
        preserve_underscores=True,
    )
    actual_newick = tree.as_string(schema='newick')
    self.assertEqual(expected_newick, actual_newick)
def test_merging_fasta_files(self):
    # Combine two fixture FASTA files and compare the result byte-for-byte
    # with the expected combined file.
    fixture_dir = 'gubbins/tests/data/raxml_sequence_reconstruction/'
    combined_path = 'combined.fasta'

    raxml_seq_recon = RAxMLSequenceReconstruction('', '', '', '', '', False)
    raxml_seq_recon.combine_fastas(
        fixture_dir + '1.fasta',
        fixture_dir + '2.fasta',
        combined_path,
    )

    files_match = filecmp.cmp(
        combined_path,
        fixture_dir + 'expected_combined_1_2.fasta',
        shallow=False,
    )
    self.assertTrue(files_match)
def test_convert_raw_ancestral_file_to_fasta(self):
    # Convert the raw marginal ancestral states fixture to FASTA and compare
    # the written file byte-for-byte with the expected FASTA fixture.
    fixture_dir = 'gubbins/tests/data/raxml_sequence_reconstruction/'
    converted_path = 'outputfile'

    raxml_seq_recon = RAxMLSequenceReconstruction('', '', '', '', '', False)
    raxml_seq_recon.convert_raw_ancestral_states_to_fasta(
        fixture_dir + 'raw_marginalAncestralStates.phylip',
        converted_path,
    )

    files_match = filecmp.cmp(
        converted_path,
        fixture_dir + 'expected_marginalAncestralStates.fasta',
        shallow=False,
    )
    self.assertTrue(files_match)
def test_root_input_tree(self):
    # Root the unrooted fixture tree into the object's temporary rooted-tree
    # path and compare the written file with the expected rooted tree.
    raxml_seq_recon = RAxMLSequenceReconstruction(
        'abc',
        'gubbins/tests/data/raxml_sequence_reconstruction/unrooted_tree.newick',
        'abc',
        'abc',
        '',
        False,
    )
    # The return value of root_tree() was previously bound to an unused
    # local; the test only inspects the file it writes.
    raxml_seq_recon.root_tree(
        'gubbins/tests/data/raxml_sequence_reconstruction/unrooted_tree.newick',
        raxml_seq_recon.temp_rooted_tree,
    )
    self.assertTrue(
        filecmp.cmp(
            str(raxml_seq_recon.temp_rooted_tree),
            'gubbins/tests/data/raxml_sequence_reconstruction/expected_rooted_tree.newick',
            shallow=False,
        )
    )
def test_working_directory_construction(self):
    # Constructing the object should leave its working directory on disk.
    raxml_seq_recon = RAxMLSequenceReconstruction('', '', '', '', '', False)
    working_dir_exists = os.path.exists(raxml_seq_recon.working_dir)
    self.assertTrue(working_dir_exists)
def test_ancestor_raxml_command_verbose(self):
    # With verbose enabled, the reconstruction command should be the base
    # RAxML command followed by the -s/-t/-n arguments and a trailing space.
    raxml_seq_recon = RAxMLSequenceReconstruction(
        'input_alignment.fasta',
        'input_tree',
        'output_alignment_filename',
        'output_tree',
        'raxmlHPC -f A -p 1 -m GTRGAMMA',
        verbose=True,
    )

    actual_command = raxml_seq_recon.raxml_reconstruction_command(
        raxml_seq_recon.working_dir + '/rooted_tree.newick'
    )
    expected_command = (
        'raxmlHPC -f A -p 1 -m GTRGAMMA -s '
        + raxml_seq_recon.input_alignment_filename
        + ' -t '
        + raxml_seq_recon.working_dir
        + '/rooted_tree.newick -n internal '
    )
    self.assertEqual(actual_command, expected_command)