コード例 #1
0
ファイル: MHRunner.py プロジェクト: lukakolar/Python-SCITE
    def calculate_initial_scores(self, mutation_matrix, mask_matrix,
                                 log_scores, beta_move_probability,
                                 beta_params):
        """Score the starting mutation tree and the starting beta, then refresh the combined score.

        Sets ``self.tree_log_score``, ``self.fast_matrix`` and ``self.beta_log_score`` and finally calls
        ``self.update_combined_log_score()``.

        Args:
            mutation_matrix (np.ndarray): Mutation matrix (D matrix).
            mask_matrix (np.ndarray): Matrix of 'count of probabilities' of type P(D_ij = x|E_ij = 0) and
                P(D_ij = x|E_ij = 1) from which fast_matrix can be obtained if log scores matrix is known.
            log_scores (np.ndarray): Log scores matrix corresponding to error probabilities.
            beta_move_probability (float): Probability of choosing beta move instead of tree move.
            beta_params (list): Parameters of beta distribution for beta.
        """
        # Evaluate the current tree from scratch to get its log score together with the fast matrix.
        # The trailing 1 sits where other callers pass the best tree log score - presumably it acts as a
        # "no best score yet" placeholder; TODO confirm against ScoreUtils.get_fast_tree_score.
        initial_evaluation = ScoreUtils.get_fast_tree_score(
            mutation_matrix, mask_matrix, self.parent_vector, self.dft, log_scores, 1)
        self.tree_log_score, self.fast_matrix = initial_evaluation

        # Beta only contributes to the score when beta moves can actually be proposed
        if beta_move_probability != 0.0:
            self.beta_log_score = ScoreUtils.get_beta_score(self.beta, beta_params)
        else:
            self.beta_log_score = 0.0

        self.update_combined_log_score()
コード例 #2
0
ファイル: Tests.py プロジェクト: lukakolar/Python-SCITE
    def scenario_test_get_partial_tree_score_prune_and_reattach(self, scenario):
        """Compare the incremental prune-and-reattach score with a full evaluation of the same tree.

        Args:
            scenario: Object exposing ``move_parameters`` (node to move, new parent, old parent) and
                ``fail_message`` (used as the assertion message).
        """
        tree = get_mock_tree_parent_vector()

        # Random observations with values 0..3: one row per parent vector entry, 1000 columns
        observations = np.random.randint(0, 4, (tree.shape[0], 1000), dtype=np.int32)
        mask = ScoreUtils.get_mask_matrix(observations)

        raw_log_scores = [-7.33943332981645e-05, -1.53502641492029, -9.7145214530235, -0.563699113373059,
                          -11.2512044842885, -1.53502641492029, 0, 0]
        log_scores = np.asarray(raw_log_scores).reshape(4, 2)

        # Representations of the tree *before* the move
        dft, ancestor_matrix = get_other_tree_representations(tree)
        _, fast_matrix = ScoreUtils.get_fast_tree_score(observations, mask, tree, dft, log_scores, 1)

        # The old parent is part of the scenario tuple but is not needed here
        node_to_move, new_parent, _old_parent = scenario.move_parameters

        # Apply the move; from here on ``tree`` is the proposed tree
        MoveUtils.prune_and_reattach_parent_vector_in_place(tree, node_to_move, new_parent)
        proposed_dft = TreeUtils.get_depth_first_traversal(tree)

        # Incremental evaluation, fed with the pre-move ancestor matrix and fast matrix
        partial_score, partial_fast_matrix = ScoreUtils.get_partial_tree_score_prune_and_reattach(
            observations, mask, tree, proposed_dft, ancestor_matrix, log_scores, fast_matrix, 1, node_to_move)

        # Reference: evaluation of the proposed tree from scratch
        full_score, full_fast_matrix = ScoreUtils.get_fast_tree_score(
            observations, mask, tree, proposed_dft, log_scores, 1)

        # Both the fast matrices and the scores have to agree
        self.assertTrue(check_max_variation_matrix(full_fast_matrix, partial_fast_matrix, 1e-15),
                        msg=scenario.fail_message)
        self.assertTrue(np.abs(partial_score - full_score) < 1e-15, msg=scenario.fail_message)
コード例 #3
0
ファイル: Tests.py プロジェクト: lukakolar/Python-SCITE
    def test_get_accurate_tree_score(self):
        """Check the accurate tree score of a fixed tree on datasets/dataHou78.csv against a stored value."""
        observations = FileUtils.read_mutation_matrix("datasets/dataHou78.csv")
        mask = ScoreUtils.get_mask_matrix(observations)

        raw_log_scores = [-7.33943332981645e-05, -1.53502641492029, -9.7145214530235, -0.563699113373059,
                          -11.2512044842885, -1.53502641492029, 0, 0]
        log_scores = np.asarray(raw_log_scores).reshape(4, 2)

        # Fixed tree, given as a parent vector
        parents = [9, 13, 12, 31, 5, 76, 37, 38, 1, 36, 1, 29, 67, 78, 67, 23, 14, 63, 70, 61, 53, 11,
                   10, 69, 43, 7, 48, 29, 65, 43, 55, 21, 1, 35, 13, 14, 11, 5, 59, 46, 5, 61, 14, 71,
                   21, 7, 6, 55, 44, 33, 48, 26, 24, 78, 10, 8, 23, 77, 13, 10, 35, 57, 46, 73, 58, 8,
                   2, 23, 32, 61, 53, 67, 60, 48, 58, 11, 3, 38]
        tree = np.asarray(parents)
        traversal = TreeUtils.get_depth_first_traversal(tree)

        # Only the score is checked; the second return value is ignored
        result = ScoreUtils.get_accurate_tree_score(mask, tree, traversal, log_scores)
        tree_score, _ = result

        self.assertAlmostEqual(tree_score, -7478.72872196288)
コード例 #4
0
ファイル: MHRunner.py プロジェクト: lukakolar/Python-SCITE
    def propose_new_beta(self, mutation_matrix, mask_matrix, log_scores,
                         best_tree_log_score, jump_stdev, beta_params):
        """Propose a beta move in the Metropolis-Hastings algorithm and accept or reject it.

        Args:
            mutation_matrix (np.ndarray): Mutation matrix (D matrix).
            mask_matrix (np.ndarray): Matrix of 'count of probabilities' of type P(D_ij = x|E_ij = 0) and
                P(D_ij = x|E_ij = 1) from which fast_matrix can be obtained if log scores matrix is known.
            log_scores (np.ndarray): Log scores matrix corresponding to error probabilities.
            best_tree_log_score (float): Best tree log score so far.
            jump_stdev (float): Beta jump normal random variable standard deviation.
            beta_params (list): Beta distribution parameters for beta.

        Returns:
            np.ndarray: Log scores matrix to continue with - the matrix computed for the proposed beta when the
                move is accepted, otherwise the ``log_scores`` argument itself.
        """
        # Perturb the current beta with zero-mean normal noise
        jump = np.random.normal(0, jump_stdev)

        # Mirror the value if it does not lie on interval [0, 1]
        candidate_beta = MiscUtils.get_mirrored_beta(self.beta + jump)

        # Score of the candidate beta itself
        candidate_beta_log_score = ScoreUtils.get_beta_score(candidate_beta, beta_params)

        # Log scores matrix that corresponds to the candidate beta
        candidate_log_scores = ScoreUtils.get_updated_log_scores_matrix(log_scores, candidate_beta)

        # Score of the (unchanged) mutation tree under the candidate beta
        candidate_tree_log_score, candidate_fast_matrix = ScoreUtils.get_fast_tree_score(
            mutation_matrix, mask_matrix, self.parent_vector, self.dft, candidate_log_scores, best_tree_log_score)

        # The acceptance decision compares combined (beta + tree) log scores
        candidate_combined_log_score = candidate_beta_log_score + candidate_tree_log_score
        if not ScoreUtils.check_accept_move(candidate_combined_log_score, self.combined_log_score):
            # Rejected: the state is untouched and the caller keeps its log scores
            return log_scores

        # Accepted: commit the candidate beta together with its scores and fast matrix
        self.update_beta(candidate_beta, candidate_beta_log_score,
                         candidate_tree_log_score, candidate_fast_matrix)
        return candidate_log_scores
コード例 #5
0
ファイル: MHRunner.py プロジェクト: lukakolar/Python-SCITE
    def tree_move_prune_and_reattach(self, mutation_matrix, mask_matrix,
                                     log_scores, best_tree_log_score):
        """Propose a prune and reattach tree move in the Metropolis-Hastings algorithm and accept or reject it.

        ``self.parent_vector`` is modified in place to hold the proposed tree while it is being scored. On
        rejection the parent vector is reverted; on acceptance the dft, fast matrix, tree score and ancestor
        matrix are brought in line with it.

        Args:
            mutation_matrix (np.ndarray): Mutation matrix (D matrix).
            mask_matrix (np.ndarray): Matrix of 'count of probabilities' of type P(D_ij = x|E_ij = 0) and
                P(D_ij = x|E_ij = 1) from which fast_matrix can be obtained if log scores matrix is known.
            log_scores (np.ndarray): Log scores matrix corresponding to error probabilities.
            best_tree_log_score (float): Best tree log score so far.
        """
        # Pick which node is moved and where it is reattached
        move_params = MoveUtils.get_move_params_prune_and_reattach(
            self.num_nodes, self.parent_vector, self.ancestor_matrix)
        node_to_move, new_parent, old_parent = move_params

        # From here on self.parent_vector represents the proposed tree
        MoveUtils.prune_and_reattach_parent_vector_in_place(self.parent_vector, node_to_move, new_parent)

        # Depth first traversal of the proposed tree
        candidate_dft = TreeUtils.get_depth_first_traversal(self.parent_vector)

        # Score the proposed tree; the ancestor matrix passed in still describes the tree before the move
        candidate_tree_log_score, candidate_fast_matrix = ScoreUtils.get_partial_tree_score_prune_and_reattach(
            mutation_matrix, mask_matrix, self.parent_vector, candidate_dft,
            self.ancestor_matrix, log_scores, self.fast_matrix,
            best_tree_log_score, node_to_move)

        if not ScoreUtils.check_accept_move(candidate_tree_log_score, self.tree_log_score):
            # Rejected: put the moved node back under its old parent
            MoveUtils.prune_and_reattach_parent_vector_revert_in_place(
                self.parent_vector, node_to_move, old_parent)
            return

        # Accepted: keep the proposed parent vector and update everything derived from it
        self.dft = candidate_dft
        self.fast_matrix = candidate_fast_matrix
        self.update_tree_score(candidate_tree_log_score)
        MoveUtils.prune_and_reattach_ancestor_matrix_in_place(
            self.ancestor_matrix, node_to_move, new_parent, old_parent)
コード例 #6
0
ファイル: MHRunner.py プロジェクト: lukakolar/Python-SCITE
    def tree_move_swap_subtrees(self, mutation_matrix, mask_matrix, log_scores,
                                best_tree_log_score):
        """Propose a swap subtrees tree move in the Metropolis-Hastings algorithm and accept or reject it.

        The proposed parent vector is kept in a separate variable, so nothing is reverted when the move is
        rejected. On acceptance the parent vector, dft, fast matrix, tree score and ancestor matrix are
        replaced.

        Args:
            mutation_matrix (np.ndarray): Mutation matrix (D matrix).
            mask_matrix (np.ndarray): Matrix of 'count of probabilities' of type P(D_ij = x|E_ij = 0) and
                P(D_ij = x|E_ij = 1) from which fast_matrix can be obtained if log scores matrix is known.
            log_scores (np.ndarray): Log scores matrix corresponding to error probabilities.
            best_tree_log_score (float): Best tree log score so far.
        """
        # Pick the two subtrees; nbh is the neighbourhood correction used in the acceptance check
        move_params = MoveUtils.get_move_params_swap_subtrees(self.num_nodes, self.ancestor_matrix)
        above_node, below_node, same_lineage, new_parent, nbh = move_params

        # Parent vector of the proposed tree
        candidate_parent_vector = MoveUtils.swap_subtrees_parent_vector(
            self.parent_vector, above_node, below_node, same_lineage, new_parent)

        # Depth first traversal of the proposed tree
        candidate_dft = TreeUtils.get_depth_first_traversal(candidate_parent_vector)

        # Score the proposed tree; the ancestor matrix passed in still describes the current tree
        candidate_tree_log_score, candidate_fast_matrix = ScoreUtils.get_partial_tree_score_swap_subtrees(
            mutation_matrix, mask_matrix, candidate_parent_vector, candidate_dft,
            self.ancestor_matrix, log_scores, self.fast_matrix,
            best_tree_log_score, above_node, below_node, same_lineage)

        move_accepted = ScoreUtils.check_accept_move(candidate_tree_log_score,
                                                     self.tree_log_score,
                                                     neighbourhood_correction=nbh)
        if not move_accepted:
            # Rejected: the current state was never touched
            return

        # Accepted: switch to the proposed tree and rebuild its ancestor matrix
        self.parent_vector = candidate_parent_vector
        self.dft = candidate_dft
        self.fast_matrix = candidate_fast_matrix
        self.update_tree_score(candidate_tree_log_score)
        self.ancestor_matrix = TreeUtils.get_ancestor_matrix(candidate_parent_vector, candidate_dft)
コード例 #7
0
ファイル: Tests.py プロジェクト: lukakolar/Python-SCITE
    def test_get_attachment_matrix(self):
        """Check the attachment matrix computed for a fixed tree on datasets/dataXu.csv against stored pairs."""
        # Attachment to MAP tree with fixed beta obtained from dataXu.csv
        beta = 0.198
        log_scores = ScoreUtils.get_log_scores_matrix(beta, 2.67E-5, 0, 0)
        observations = FileUtils.read_mutation_matrix("datasets/dataXu.csv")
        mask = ScoreUtils.get_mask_matrix(observations)

        parents = [5, 35, 0, 10, 2, 27, 9, 32, 11, 25, 17, 33, 15, 23, 34, 3, 14, 1, 13, 24, 19, 28,
                   16, 30, 7, 8, 22, 6, 31, 26, 12, 20, 29, 18, 4]
        tree = np.asarray(parents, dtype=np.int32)
        traversal = TreeUtils.get_depth_first_traversal(tree)

        computed = ScoreUtils.get_attachment_matrix(mask, tree, traversal, beta, log_scores)

        # Expected result as a flat list; consecutive values form one row of the two-column matrix
        expected_flat = [6, 8, 7, 2, 8, 8, 9, 8, 14, 13, 16, 13, 19, 2, 19, 12, 20, 2, 20, 10, 20, 12,
                         21, 7, 21, 9, 21, 15, 21, 16, 22, 13, 24, 2, 25, 8, 26, 11, 28, 0, 28, 1, 28,
                         3, 28, 5, 28, 6, 28, 14, 29, 11, 31, 3, 31, 4, 32, 11]
        num_rows = len(expected_flat) // 2
        expected = np.asarray(expected_flat, dtype=np.int32).reshape(num_rows, 2)

        # Element-wise comparison; every entry has to match
        self.assertTrue(check_each_element_matrix(computed == expected))
コード例 #8
0
ファイル: MHRunner.py プロジェクト: lukakolar/Python-SCITE
    def __init__(self, mutation_matrix_filename):
        """Load the mutation matrix and derive the data that depends only on it.

        Throughout the class beta means the probability of false negative, i.e. the probability of not
        observing a mutation even though the mutation is present. Beta consists of probabilities
        P(D_ij = 0|E_ij = 1) and P(D_ij = 2|E_ij = 1). All trees are mutation trees.

        Args:
            mutation_matrix_filename (str): Path to file that contains the data. File should contain numbers
                0, 1, 2, 3; numbers must be separated with spaces. Each line (row) represents a mutation and each
                column represents a cell (mutation profile).
        """
        # Mutation matrix as read from the file
        loaded_matrix = FileUtils.read_mutation_matrix(mutation_matrix_filename)
        self.mutation_matrix = loaded_matrix

        # Mask matrix derived from the mutation matrix
        self.mask_matrix = ScoreUtils.get_mask_matrix(loaded_matrix)

        # Dimensions: rows are mutations (tree nodes), columns are cells
        row_count, column_count = loaded_matrix.shape
        self.num_nodes = row_count
        self.num_cells = column_count
コード例 #9
0
ファイル: MHRunner.py プロジェクト: lukakolar/Python-SCITE
    def run_mh(self,
               num_repetitions,
               chain_length,
               d0e1_probability,
               d1e0_probability,
               d2e0_probability=0.0,
               d2e1_probability=0.0,
               burn_in_proportion=0.25,
               sampling_enabled=False,
               beta_move_probability=0.0,
               mh_jump_scaling=10.0,
               beta_prior_stdev=0.1,
               gene_names_filename=None,
               store_best_trees=True,
               max_best_trees_stored=100,
               add_cells_to_best_trees=True,
               prune_reattach_probability=0.55,
               swap_node_labels_probability=0.4,
               swap_subtrees_probability=0.05,
               remove_and_insert_probability=0.0,
               silent=False,
               output_name=""):
        """Run Metropolis-Hastings algorithm with the data provided in the initialization.

        Each repetition starts from a freshly initialized tree and beta and performs chain_length steps; in every
        step either a beta move or a tree move is proposed. Progress is printed to the console. Nothing is
        returned; results are written to files:

        - if sampling_enabled, posterior samples are saved to posterior_samples/trees[_<output_name>].npy and
          posterior_samples/betas[_<output_name>].npy,
        - if store_best_trees, best trees are written to best_trees/map[_<output_name>]_<i>.gv.

        Args:
            num_repetitions (int): Number of repetitions of the Metropolis-Hastings algorithm.
            chain_length (int): Length of chain (number of steps) in each repetition.
            d0e1_probability (float): Probability P(D_ij = 0|E_ij = 1).
            d1e0_probability (float): Probability P(D_ij = 1|E_ij = 0).
            d2e0_probability (float): Probability P(D_ij = 2|E_ij = 0).
            d2e1_probability (float): Probability P(D_ij = 2|E_ij = 1).
            burn_in_proportion (float): Proportion of initial steps that are deemed burn in phase.
            sampling_enabled (bool): Sampling of the posterior distribution. If enabled, in each step after the
                burn in phase, current tree and beta are stored.
            beta_move_probability (float): Probability of choosing beta move instead of a tree move.
            mh_jump_scaling (float): Scaling of beta jump standard deviation in relation to beta prior standard
                deviation (beta jump standard deviation = beta prior standard deviation / mh_jump_scaling).
            beta_prior_stdev (float): Beta standard deviation prior.
            gene_names_filename (str): Path to file that contains gene names of genes in the mutation matrix. Each name
                must be in its own line in file. Order of names should correspond to the order of genes in the mutation
                matrix.
            store_best_trees (bool): Store best MAP trees encountered during the algorithm execution. Trees are written
                to separate files in Graphviz format and are stored in folder best_trees.
            max_best_trees_stored (int): Maximum number of best trees stored to folder best_trees if store_best_trees
                is set to True.
            add_cells_to_best_trees (bool): Find best attachment of all cells when the algorithm terminates.
            prune_reattach_probability (float): Probability of choosing prune and reattach tree move when tree move is
                used.
            swap_node_labels_probability (float): Probability of choosing swap node labels tree move when tree move is
                used.
            swap_subtrees_probability (float): Probability of choosing swap subtrees tree move when tree move is used.
            remove_and_insert_probability (float): Probability of choosing remove and insert tree move when tree move is
                used. In this method the value is only passed through check_move_probabilities; it is not forwarded
                to propose_new_tree.
            silent (bool): If True, output less intermediate information.
            output_name (str): String used for naming best trees and posterior samples files.
        """
        # Time spent in optimal states after the burn in phase
        # NOTE(review): this counter is shared by all repetitions - it is only reset when a new best combined score
        # is found, not at the start of a repetition
        time_in_optimal_states_after_burn_in = 0

        # Number of steps that are in the burn-in phase and number of steps that are not in the burn-in phase
        num_burn_in_steps = int(burn_in_proportion * chain_length)
        num_non_burn_in_steps = chain_length - num_burn_in_steps

        # Reserve space for posterior distributions
        # One row/entry per non-burn-in step of every repetition
        posterior_trees = None
        posterior_betas = None
        if sampling_enabled:
            num_posterior_samples = num_repetitions * num_non_burn_in_steps
            posterior_trees = np.empty((num_posterior_samples, self.num_nodes),
                                       dtype=np.int32)
            posterior_betas = np.empty((num_posterior_samples, ),
                                       dtype=np.float64)

        # Check that provided error probabilities are valid
        self.check_error_probabilities(d0e1_probability, d1e0_probability,
                                       d2e0_probability, d2e1_probability)

        # Check that provided move probabilities are valid or correct them if they are not
        prune_reattach_probability, swap_node_labels_probability, swap_subtrees_probability, \
            remove_and_insert_probability, beta_move_probability = \
            self.check_move_probabilities(prune_reattach_probability,
                                          swap_node_labels_probability,
                                          swap_subtrees_probability,
                                          remove_and_insert_probability,
                                          beta_move_probability)

        # Estimated beta (false negative probability) mean and standard deviation
        beta_prior_mean = float(d0e1_probability + d2e1_probability)
        beta_prior_stdev = float(beta_prior_stdev)

        # Calculation of beta distribution parameters for beta
        beta_params = ScoreUtils.calculate_beta_distribution_parameters(
            beta_prior_mean, beta_prior_stdev)

        # Scaling of beta jump standard deviation in relation to beta prior standard deviation
        jump_stdev = beta_prior_stdev / float(mh_jump_scaling)

        # Best results
        # Value 1 means that best score is not yet present (equivalent of None)
        # Integer default value is used so that variables can be used in @njit(cache=True) annotated functions
        best_tree_log_score = 1
        best_combined_log_score = 1
        best_beta = beta_prior_mean

        # Define a variable here so that attachment matrix can be calculated
        # (it stays None if no repetition is executed)
        log_scores = None

        # Initialize object in which best trees and betas are stored
        best_results = BestResults(max_best_trees_stored, self.num_nodes)

        for repetition in range(num_repetitions):
            # Print information to console regarding current repetition
            print("Repetition: ", repetition + 1)
            if not silent:
                print("{:>25} {:>25} {:>25} {:>25}".format(
                    "num steps", "best_tree_log_score", "best_beta",
                    "best_combined_log_score"))

            # Initialize a tabras
            # Mutation tree is initialized as a random mutation tree
            # Beta is initialized as beta_prior_mean
            tabras = TreeAndBetaRepresentationsAndScores(
                self.num_nodes, beta_prior_mean)

            # Construct log scores matrix
            # Each entry in the log scores matrix corresponds to one of the log probabilities P(D_ij = x|E_ij = y)
            log_scores = ScoreUtils.get_log_scores_matrix(
                d0e1_probability, d1e0_probability, d2e0_probability,
                d2e1_probability)

            # Evaluate initialized tabras
            tabras.calculate_initial_scores(self.mutation_matrix,
                                            self.mask_matrix, log_scores,
                                            beta_move_probability, beta_params)

            for step in range(chain_length):
                # Output intermediate information (at step 1 and then every 10000 steps)
                if not silent and (step == 1
                                   or step > 0 and step % 10000 == 0):
                    print("{:>25} {:>25.15f} {:>25.15f} {:>25.15f}".format(
                        str(step), best_tree_log_score, best_beta,
                        best_combined_log_score))

                # Beta move is chosen
                # A random number is drawn only when beta moves are enabled
                if beta_move_probability > 0 and np.random.random(
                ) < beta_move_probability:
                    # propose_new_beta returns the log scores matrix to continue with
                    log_scores = tabras.propose_new_beta(
                        self.mutation_matrix, self.mask_matrix, log_scores,
                        best_tree_log_score, jump_stdev, beta_params)

                # Tree move is chosen
                else:
                    tabras.propose_new_tree(self.mutation_matrix,
                                            self.mask_matrix, log_scores,
                                            best_tree_log_score,
                                            prune_reattach_probability,
                                            swap_node_labels_probability,
                                            swap_subtrees_probability)

                # Update optimal trees if current tree is optimal (at least currently)
                # best_combined_log_score still holds the value from before this step at this point
                if store_best_trees:
                    best_results.update_results(tabras.parent_vector,
                                                tabras.beta,
                                                tabras.combined_log_score,
                                                best_combined_log_score)

                # Store tree and beta for future sampling from posterior distribution
                if sampling_enabled and step >= num_burn_in_steps:
                    posterior_index = repetition * num_non_burn_in_steps + step - num_burn_in_steps
                    posterior_trees[posterior_index, :] = tabras.parent_vector
                    posterior_betas[posterior_index] = tabras.beta

                # Update log scores if current tree and beta are the best until now
                if best_combined_log_score == 1 or tabras.combined_log_score > best_combined_log_score:
                    time_in_optimal_states_after_burn_in = 0
                    best_tree_log_score = tabras.tree_log_score
                    best_combined_log_score = tabras.combined_log_score
                    best_beta = tabras.beta

                # Count steps after the burn in phase in which the chain sits at the best combined score
                if tabras.combined_log_score == best_combined_log_score and step >= num_burn_in_steps:
                    time_in_optimal_states_after_burn_in += 1

            # Summary line of the finished repetition
            print("{:>25} {:>25.15f} {:>25.15f} {:>25.15f}".format(
                chain_length, best_tree_log_score, best_beta,
                best_combined_log_score))

        print("Number of steps in optimal states after burn-in: {0}".format(
            time_in_optimal_states_after_burn_in))

        # Optional suffix for the names of output files
        if output_name != "":
            added_string = "_" + output_name
        else:
            added_string = ""

        # Store samples to a file
        # NOTE(review): folder posterior_samples is not created here - presumably it must already exist; confirm
        if sampling_enabled:
            np.save("posterior_samples/trees" + added_string + ".npy",
                    posterior_trees)
            np.save("posterior_samples/betas" + added_string + ".npy",
                    posterior_betas)

        if store_best_trees:
            # Read gene names
            gene_names = FileUtils.get_gene_names(gene_names_filename,
                                                  self.num_nodes)

            # Write best trees to files
            # From here on best_beta no longer holds the single best beta but the betas returned by
            # get_best_results (indexed per best tree below)
            best_trees, best_beta = best_results.get_best_results()
            num_best_results = min(best_trees.shape[0], max_best_trees_stored)

            # NOTE(review): folder best_trees is not created here - presumably it must already exist or is created
            # by FileUtils.output_graph_viz_file; confirm
            for i in range(num_best_results):
                output_filename = "best_trees/map" + added_string + "_" + str(
                    i) + ".gv"
                attachment_matrix = None

                if add_cells_to_best_trees:
                    dft = TreeUtils.get_depth_first_traversal(best_trees[i])
                    # log_scores is the matrix left behind by the last repetition
                    attachment_matrix = ScoreUtils.get_attachment_matrix(
                        self.mask_matrix, best_trees[i], dft, best_beta[i],
                        log_scores)

                FileUtils.output_graph_viz_file(output_filename,
                                                best_trees[i, :], gene_names,
                                                add_cells_to_best_trees,
                                                attachment_matrix)