コード例 #1
0
	def test_run_raxml_ancestor_reconstruction(self):
		# Running the reconstruction on the sample alignment and unrooted tree
		# must leave the requested output alignment file on disk.
		alignment_path = 'gubbins/tests/data/raxml_sequence_reconstruction/input_alignment.fasta'
		tree_path = 'gubbins/tests/data/raxml_sequence_reconstruction/unrooted_tree.newick'
		reconstruction_command = RAxMLExecutable(1).internal_sequence_reconstruction_command()

		reconstructor = RAxMLSequenceReconstruction(
			alignment_path, tree_path, 'outputfile', 'output_tree', reconstruction_command, False)
		reconstructor.reconstruct_ancestor_sequences()

		assert os.path.exists('outputfile')
コード例 #2
0
	def test_more_complex_tree(self):
		# Reconstruction from the multiple-recombinations fixture must produce
		# both the output alignment and the output tree.
		alignment_path = 'gubbins/tests/data/multiple_recombinations.aln'
		tree_path = 'gubbins/tests/data/expected_RAxML_result.multiple_recombinations.iteration_5.output_tree'
		reconstruction_command = RAxMLExecutable(1).internal_sequence_reconstruction_command()

		reconstructor = RAxMLSequenceReconstruction(
			alignment_path, tree_path, 'output_alignment', 'output_tree', reconstruction_command, False)
		reconstructor.reconstruct_ancestor_sequences()

		for produced_file in ('output_alignment', 'output_tree'):
			assert os.path.exists(produced_file)
コード例 #3
0
ファイル: common.py プロジェクト: sanger-pathogens/gubbins
    def parse_and_run(self):
        """Run the whole Gubbins workflow described by ``self.args``.

        The method proceeds in this order:

        1. Check that the external executables are reachable and that the
           command-line options, the input alignment and the optional
           starting tree are acceptable.
        2. Write a filtered copy of the alignment into a fresh temporary
           directory under the current working directory and work from it.
        3. Run the gubbins executable once on that alignment to find SNP sites.
        4. For up to ``self.args.iterations`` rounds: build (or copy) a tree,
           reroot it, reconstruct ancestral sequences, run gubbins, and stop
           early once the chosen convergence test fires.
        5. Unless ``--no_cleanup`` was given, delete intermediate files, then
           rename the last iteration's outputs to their final names.

        Side effects: mutates ``self.args.alignment_filename``,
        ``self.args.starting_tree`` and (when unset) ``self.args.prefix``;
        creates and deletes files in the current working directory.

        Every failure is reported by terminating the process through
        ``sys.exit`` with an explanatory message.
        """

        def report(message):
            # Progress output is only emitted in verbose mode.
            if self.args.verbose > 0:
                print(message)

        # Default parameters
        raxml_executable_obj = RAxMLExecutable(self.args.threads, self.args.raxml_model, self.args.verbose)

        fasttree_executables = ["FastTree", "fasttree"]
        FASTTREE_EXEC = GubbinsCommon.choose_executable(fasttree_executables)

        FASTTREE_PARAMS = "-nosupport -gtr -gamma -nt"
        GUBBINS_EXEC = "gubbins"

        GUBBINS_BUNDLED_EXEC = "../src/gubbins"

        # check that all the external executable dependencies are available
        if GubbinsCommon.which(GUBBINS_EXEC) is None:
            GUBBINS_EXEC = GubbinsCommon.use_bundled_exec(GUBBINS_EXEC, GUBBINS_BUNDLED_EXEC)
            if GubbinsCommon.which(GUBBINS_EXEC) is None:
                sys.exit(GUBBINS_EXEC + " is not in your path")

        if self.args.tree_builder in ("fasttree", "hybrid"):
            if GubbinsCommon.which(FASTTREE_EXEC) is None:
                sys.exit("FastTree is not in your path")

        if self.args.converge_method not in ["weighted_robinson_foulds", "robinson_foulds", "recombination"]:
            sys.exit(
                "Please choose weighted_robinson_foulds, robinson_foulds or recombination for the --converge_method option"
            )

        if (
            GubbinsCommon.does_file_exist(self.args.alignment_filename, "Alignment File") == 0
            or not ValidateFastaAlignment(self.args.alignment_filename).is_input_fasta_file_valid()
        ):
            sys.exit("There is a problem with your input fasta file so nothing can be done until you fix it")

        starting_tree_given = self.args.starting_tree is not None and self.args.starting_tree != ""

        # The tree is rejected when the file is missing or the validity check
        # comes back falsy. The original exited when the check was truthy,
        # i.e. it rejected trees reported as valid.
        # NOTE(review): assumes is_input_starting_tree_valid returns a truthy
        # value for a valid tree, as its name and the sibling checks suggest.
        if starting_tree_given and (
            GubbinsCommon.does_file_exist(self.args.starting_tree, "Starting Tree") == 0
            or not GubbinsCommon.is_input_starting_tree_valid(self.args.starting_tree)
        ):
            sys.exit("The starting tree is invalid")

        if (
            starting_tree_given
            and GubbinsCommon.do_the_names_match_the_fasta_file(self.args.starting_tree, self.args.alignment_filename)
            == 0
        ):
            sys.exit("The names in the starting tree dont match the names in the fasta file")

        GubbinsCommon.check_and_fix_window_size(self)

        # Optional unique prefix component so that repeated runs do not clash.
        current_time = ""
        if self.args.use_time_stamp > 0:
            current_time = str(int(time.time())) + "."
            report(current_time)

        # get the base filename
        starting_base_filename = os.path.basename(self.args.alignment_filename)

        # put a filtered copy into a temp directory and work from that
        temp_working_dir = tempfile.mkdtemp(dir=os.getcwd())
        filtered_alignment = temp_working_dir + "/" + starting_base_filename

        pre_process_fasta = PreProcessFasta(
            self.args.alignment_filename, self.args.verbose, self.args.filter_percentage
        )
        taxa_removed = pre_process_fasta.remove_duplicate_sequences_and_sequences_missing_too_much_data(
            filtered_alignment, self.args.remove_identical_sequences
        )

        self.args.alignment_filename = filtered_alignment

        # If a starting tree has been provided make sure that taxa filtered out in the previous step are removed from the tree
        self.args.starting_tree = GubbinsCommon.filter_out_removed_taxa_from_tree_and_return_new_file(
            self.args.starting_tree, temp_working_dir, taxa_removed
        )

        # get the base filename again, now from the filtered copy
        base_filename = os.path.basename(self.args.alignment_filename)
        base_filename_without_ext = os.path.splitext(base_filename)[0]
        starting_base_filename = base_filename

        if len(base_filename) > 115:
            sys.exit(
                "Your filename is too long for RAxML at "
                + str(len(base_filename))
                + " characters, please shorten it to less than 115 characters"
            )

        # find all snp sites
        report(GUBBINS_EXEC + " " + self.args.alignment_filename)
        try:
            subprocess.check_call([GUBBINS_EXEC, self.args.alignment_filename])
        except Exception:
            # `Exception` (not a bare except) so Ctrl-C and nested sys.exit
            # are not misreported as a crash.
            sys.exit("Gubbins crashed, please ensure you have enough free memory")

        report(int(time.time()))

        GubbinsCommon.reconvert_fasta_file(
            starting_base_filename + ".gaps.snp_sites.aln", starting_base_filename + ".start"
        )

        # The threshold now matches the message: the original tested `< 3`
        # and therefore let exactly three sequences through.
        number_of_sequences = GubbinsCommon.number_of_sequences_in_alignment(self.args.alignment_filename)
        if number_of_sequences < 4:
            sys.exit("4 or more sequences are required.")

        tree_file_names = []

        tree_building_command = ""
        gubbins_command = ""
        previous_tree_name = ""
        current_tree_name = ""
        # Always one ahead of the last iteration started; `max_iteration - 1`
        # is therefore the number of the last iteration that ran.
        max_iteration = 1

        raxml_files_to_delete = GubbinsCommon.raxml_regex_for_file_deletions(
            base_filename_without_ext, current_time, starting_base_filename, self.args.iterations
        )
        # cleanup RAxML intermediate files
        if self.args.no_cleanup == 0 or self.args.no_cleanup is None:
            GubbinsCommon.delete_files_based_on_list_of_regexes(".", raxml_files_to_delete, self.args.verbose)

        if GubbinsCommon.check_file_exist_based_on_list_of_regexes(".", raxml_files_to_delete, self.args.verbose) == 1:
            sys.exit(
                "Intermediate files from a previous run exist. Please rerun without the --no_cleanup option to automatically delete them or with the --use_time_stamp to add a unique prefix."
            )

        tree_builder = self.args.tree_builder

        for i in range(1, self.args.iterations + 1):
            max_iteration += 1

            # A hybrid run uses FastTree for iteration 1 and RAxML afterwards.
            hybrid_first_pass = tree_builder == "hybrid" and i == 1

            if hybrid_first_pass or tree_builder == "fasttree":
                previous_tree_name = GubbinsCommon.fasttree_previous_tree_name(base_filename, i)
                if hybrid_first_pass:
                    fourth_argument = base_filename
                else:
                    if i == 1:
                        previous_tree_name = base_filename
                    # The pure FastTree path passes the previous name twice.
                    fourth_argument = previous_tree_name
                current_tree_name = GubbinsCommon.fasttree_current_tree_name(base_filename, i)

                tree_building_command = GubbinsCommon.fasttree_tree_building_command(
                    i,
                    self.args.starting_tree,
                    current_tree_name,
                    fourth_argument,
                    previous_tree_name,
                    FASTTREE_EXEC,
                    FASTTREE_PARAMS,
                    base_filename,
                )
                gubbins_command = GubbinsCommon.fasttree_gubbins_command(
                    base_filename,
                    starting_base_filename + ".gaps",
                    i,
                    self.args.alignment_filename,
                    GUBBINS_EXEC,
                    self.args.min_snps,
                    self.args.alignment_filename,
                    self.args.min_window_size,
                    self.args.max_window_size,
                )

            elif tree_builder in ("hybrid", "raxml"):
                if tree_builder == "hybrid" and i == 2:
                    # The first RAxML round of a hybrid run continues from the
                    # tree produced by FastTree in iteration 1.
                    previous_tree_name = current_tree_name
                else:
                    previous_tree_name = GubbinsCommon.raxml_previous_tree_name(
                        base_filename_without_ext, base_filename, current_time, i
                    )
                current_tree_name = GubbinsCommon.raxml_current_tree_name(base_filename_without_ext, current_time, i)

                tree_building_command = GubbinsCommon.raxml_tree_building_command(
                    i,
                    base_filename_without_ext,
                    base_filename,
                    current_time,
                    raxml_executable_obj.tree_building_command(),
                    previous_tree_name,
                    self.args.verbose,
                )
                gubbins_command = GubbinsCommon.raxml_gubbins_command(
                    base_filename_without_ext,
                    starting_base_filename + ".gaps",
                    current_time,
                    i,
                    self.args.alignment_filename,
                    GUBBINS_EXEC,
                    self.args.min_snps,
                    self.args.alignment_filename,
                    self.args.min_window_size,
                    self.args.max_window_size,
                )

            report(tree_building_command)

            if self.args.starting_tree is not None and i == 1:
                # A user-supplied tree replaces the first tree-building step.
                shutil.copyfile(self.args.starting_tree, current_tree_name)
            else:
                try:
                    # NOTE(review): the command is a shell string assembled
                    # from file names; names with shell metacharacters would
                    # be interpreted by the shell.
                    subprocess.check_call(tree_building_command, shell=True)
                except Exception:
                    sys.exit("Failed while building the tree.")

            report(int(time.time()))

            GubbinsCommon.reroot_tree(str(current_tree_name), self.args.outgroup)

            try:
                raxml_seq_recon = RAxMLSequenceReconstruction(
                    starting_base_filename + ".snp_sites.aln",
                    current_tree_name,
                    starting_base_filename + ".seq.joint.txt",
                    current_tree_name,
                    raxml_executable_obj.internal_sequence_reconstruction_command(),
                    self.args.verbose,
                )
                raxml_seq_recon.reconstruct_ancestor_sequences()
            except Exception:
                sys.exit("Failed while running RAxML internal sequence reconstruction")

            # Restore the pristine alignment saved before the loop, then merge
            # the reconstructed sequences into it.
            shutil.copyfile(starting_base_filename + ".start", starting_base_filename + ".gaps.snp_sites.aln")
            GubbinsCommon.reinsert_gaps_into_fasta_file(
                starting_base_filename + ".seq.joint.txt",
                starting_base_filename + ".gaps.vcf",
                starting_base_filename + ".gaps.snp_sites.aln",
            )

            if (
                GubbinsCommon.does_file_exist(starting_base_filename + ".gaps.snp_sites.aln", "Alignment File") == 0
                or not ValidateFastaAlignment(
                    starting_base_filename + ".gaps.snp_sites.aln"
                ).is_input_fasta_file_valid()
            ):
                sys.exit(
                    "There is a problem with your FASTA file after running RAxML internal sequence reconstruction. Please check this intermediate file is valid: "
                    + str(starting_base_filename)
                    + ".gaps.snp_sites.aln"
                )

            report(int(time.time()))

            report(gubbins_command)
            try:
                subprocess.check_call(gubbins_command, shell=True)
            except Exception:
                sys.exit("Failed while running Gubbins. Please ensure you have enough free memory")
            report(int(time.time()))

            tree_file_names.append(current_tree_name)

            # Convergence is only tested from the third iteration onwards.
            if i > 2:
                if self.args.converge_method == "recombination":
                    current_recomb_file, previous_recomb_files = GubbinsCommon.get_recombination_files(
                        base_filename_without_ext,
                        current_time,
                        max_iteration - 1,
                        starting_base_filename,
                        self.args.tree_builder,
                    )

                    if GubbinsCommon.have_recombinations_been_seen_before(current_recomb_file, previous_recomb_files):
                        report("Recombinations observed before so stopping: " + str(current_tree_name))
                        break
                else:
                    if GubbinsCommon.has_tree_been_seen_before(tree_file_names, self.args.converge_method):
                        report("Tree observed before so stopping: " + str(current_tree_name))
                        break

        # cleanup intermediate files
        if self.args.no_cleanup == 0 or self.args.no_cleanup is None:
            max_intermediate_iteration = max_iteration - 1

            raxml_files_to_delete = GubbinsCommon.raxml_regex_for_file_deletions(
                base_filename_without_ext, current_time, starting_base_filename, max_intermediate_iteration
            )
            GubbinsCommon.delete_files_based_on_list_of_regexes(".", raxml_files_to_delete, self.args.verbose)

            fasttree_files_to_delete = GubbinsCommon.fasttree_regex_for_file_deletions(
                starting_base_filename, max_intermediate_iteration
            )
            GubbinsCommon.delete_files_based_on_list_of_regexes(".", fasttree_files_to_delete, self.args.verbose)
            shutil.rmtree(temp_working_dir)

            GubbinsCommon.delete_files_based_on_list_of_regexes(
                ".", [GubbinsCommon.starting_files_regex("^" + starting_base_filename), "^log.txt"], self.args.verbose
            )

        # Rename the outputs of the last iteration to their final names.
        if self.args.prefix is None:
            self.args.prefix = base_filename_without_ext
        if self.args.tree_builder == "hybrid" or self.args.tree_builder == "raxml":
            output_filenames_to_final_filenames = GubbinsCommon.translation_of_raxml_filenames_to_final_filenames(
                base_filename_without_ext, current_time, max_iteration - 1, self.args.prefix
            )
        else:
            output_filenames_to_final_filenames = GubbinsCommon.translation_of_fasttree_filenames_to_final_filenames(
                starting_base_filename, max_iteration - 1, self.args.prefix
            )
        GubbinsCommon.rename_files(output_filenames_to_final_filenames)

        # Strip internal node labels by writing to a side file and moving it
        # over the final tree.
        final_tree = str(self.args.prefix) + ".final_tree.tre"
        unlabelled_tree = str(self.args.prefix) + ".no_internal_labels.final_tree.tre"
        GubbinsCommon.remove_internal_node_labels_from_tree(final_tree, unlabelled_tree)
        shutil.move(unlabelled_tree, final_tree)
コード例 #4
0
	def test_transfer_internal_labels(self):
		# Internal node names copied from the source tree onto the destination
		# tree must yield exactly the expected renamed tree file.
		renamed_tree = 'renamed_output_tree'
		reconstructor = RAxMLSequenceReconstruction('', '', '', 'output_tree', '', False)
		reconstructor.transfer_internal_names_to_tree(
			'gubbins/tests/data/source_tree.tre',
			'gubbins/tests/data/destination_tree.tre',
			renamed_tree)

		assert os.path.exists(renamed_tree)
		matches_expected = filecmp.cmp(renamed_tree, 'gubbins/tests/data/expected_renamed_output_tree', shallow = False)
		self.assertTrue(matches_expected)
コード例 #5
0
	def test_add_labels_to_tree(self):
		# Rooting the unrooted fixture and reading the result back must give
		# the expected newick string.
		unrooted_path = 'gubbins/tests/data/raxml_sequence_reconstruction/unrooted_tree.newick'
		reconstructor = RAxMLSequenceReconstruction('', '', '', '', '', False)
		reconstructor.root_tree(unrooted_path, reconstructor.temp_rooted_tree)

		rooted = dendropy.Tree.get_from_path(reconstructor.temp_rooted_tree, 'newick', preserve_underscores=True)
		expected_newick = "((B:0.1,(C:0.1,(D:0.1,E:0.1))),(A:0.1,F:0.1):0.0);\n"
		self.assertEqual(expected_newick, rooted.as_string(schema='newick'))
コード例 #6
0
	def test_merging_fasta_files(self):
		# Combining the two fixture FASTA files must reproduce the expected
		# combined file byte for byte.
		first_fasta = 'gubbins/tests/data/raxml_sequence_reconstruction/1.fasta'
		second_fasta = 'gubbins/tests/data/raxml_sequence_reconstruction/2.fasta'
		expected_fasta = 'gubbins/tests/data/raxml_sequence_reconstruction/expected_combined_1_2.fasta'

		reconstructor = RAxMLSequenceReconstruction('', '', '', '', '', False)
		reconstructor.combine_fastas(first_fasta, second_fasta, 'combined.fasta')

		identical = filecmp.cmp('combined.fasta', expected_fasta, shallow = False)
		self.assertTrue(identical)
コード例 #7
0
	def test_convert_raw_ancestral_file_to_fasta(self):
		# Converting the raw ancestral-states fixture must produce a file
		# identical to the expected FASTA fixture.
		raw_states = 'gubbins/tests/data/raxml_sequence_reconstruction/raw_marginalAncestralStates.phylip'
		expected_fasta = 'gubbins/tests/data/raxml_sequence_reconstruction/expected_marginalAncestralStates.fasta'

		reconstructor = RAxMLSequenceReconstruction('', '', '', '', '', False)
		reconstructor.convert_raw_ancestral_states_to_fasta(raw_states, 'outputfile')

		identical = filecmp.cmp('outputfile', expected_fasta, shallow = False)
		self.assertTrue(identical)
コード例 #8
0
	def test_root_input_tree(self):
		# Rooting the unrooted fixture must write a tree file identical to the
		# expected rooted tree.
		unrooted_path = 'gubbins/tests/data/raxml_sequence_reconstruction/unrooted_tree.newick'
		expected_path = 'gubbins/tests/data/raxml_sequence_reconstruction/expected_rooted_tree.newick'

		reconstructor = RAxMLSequenceReconstruction('abc', unrooted_path, 'abc', 'abc', '', False)
		reconstructor.root_tree(unrooted_path, reconstructor.temp_rooted_tree)

		identical = filecmp.cmp(str(reconstructor.temp_rooted_tree), expected_path, shallow = False)
		self.assertTrue(identical)
コード例 #9
0
	def test_working_directory_construction(self):
		# Constructing the object must leave its working directory on disk.
		reconstructor = RAxMLSequenceReconstruction('', '', '', '', '', False)
		working_dir_exists = os.path.exists(reconstructor.working_dir)
		self.assertTrue(working_dir_exists)
コード例 #10
0
	def test_ancestor_raxml_command_verbose(self):
		# In verbose mode the reconstruction command is the base command plus
		# the alignment (-s), the rooted tree (-t) and the run name (-n).
		reconstructor = RAxMLSequenceReconstruction(
			'input_alignment.fasta',
			'input_tree',
			'output_alignment_filename',
			'output_tree',
			'raxmlHPC -f A -p 1 -m GTRGAMMA',
			verbose = True)

		rooted_tree_path = reconstructor.working_dir+'/rooted_tree.newick'
		actual_command = reconstructor.raxml_reconstruction_command(rooted_tree_path)
		expected_command = 'raxmlHPC -f A -p 1 -m GTRGAMMA  -s '+reconstructor.input_alignment_filename+' -t ' + reconstructor.working_dir+'/rooted_tree.newick -n internal '
		self.assertEqual(actual_command, expected_command)