def calculate_initial_scores(self, mutation_matrix, mask_matrix, log_scores,
                             beta_move_probability, beta_params):
    """Score the starting mutation tree and the starting beta.

    Stores the tree log score, the fast matrix and the beta log score on the
    instance and then refreshes the combined log score.

    Args:
        mutation_matrix (np.ndarray): Mutation matrix (D matrix).
        mask_matrix (np.ndarray): Matrix of 'count of probabilities' of type
            P(D_ij = x|E_ij = 0) and P(D_ij = x|E_ij = 1) from which
            fast_matrix can be obtained if log scores matrix is known.
        log_scores (np.ndarray): Log scores matrix corresponding to error
            probabilities.
        beta_move_probability (float): Probability of choosing beta move
            instead of tree move. A value of 0.0 disables beta scoring.
        beta_params (list): Parameters of beta distribution for beta.
    """
    # The trailing 1 occupies the best-tree-score argument; as in the original
    # note, this is what requests the accurate evaluation.
    initial_tree_score, initial_fast_matrix = ScoreUtils.get_fast_tree_score(
        mutation_matrix, mask_matrix, self.parent_vector, self.dft,
        log_scores, 1)
    self.tree_log_score = initial_tree_score
    self.fast_matrix = initial_fast_matrix

    # Beta contributes to the score only when beta moves can actually occur.
    self.beta_log_score = (
        ScoreUtils.get_beta_score(self.beta, beta_params)
        if beta_move_probability != 0.0
        else 0.0
    )

    self.update_combined_log_score()
def scenario_test_get_partial_tree_score_prune_and_reattach(self, scenario):
    """Compare the partial prune-and-reattach score with a full re-evaluation.

    A random mutation matrix is scored on the mock tree, the move described by
    ``scenario.move_parameters`` is applied to the parent vector in place, and
    the partial score / fast matrix must agree with a full evaluation of the
    moved tree to within 1e-15.

    Args:
        scenario: Object providing ``move_parameters`` (node to move, new
            parent, old parent) and ``fail_message``.
    """
    tolerance = 1e-15

    tree = get_mock_tree_parent_vector()
    data = np.random.randint(0, 4, (tree.shape[0], 1000), dtype=np.int32)
    mask = ScoreUtils.get_mask_matrix(data)
    flat_log_scores = [-7.33943332981645e-05, -1.53502641492029,
                       -9.7145214530235, -0.563699113373059,
                       -11.2512044842885, -1.53502641492029,
                       0, 0]
    log_scores = np.asarray(flat_log_scores).reshape(4, 2)

    # Representations and fast matrix of the tree *before* the move.
    dft_before, ancestors_before = get_other_tree_representations(tree)
    _, fast_before = ScoreUtils.get_fast_tree_score(
        data, mask, tree, dft_before, log_scores, 1)

    # Apply the move; from here on ``tree`` is the proposed tree.
    moved_node, target_parent, _previous_parent = scenario.move_parameters
    MoveUtils.prune_and_reattach_parent_vector_in_place(
        tree, moved_node, target_parent)
    dft_after = TreeUtils.get_depth_first_traversal(tree)

    partial_score, partial_fast = \
        ScoreUtils.get_partial_tree_score_prune_and_reattach(
            data, mask, tree, dft_after, ancestors_before, log_scores,
            fast_before, 1, moved_node)
    full_score, full_fast = ScoreUtils.get_fast_tree_score(
        data, mask, tree, dft_after, log_scores, 1)

    self.assertTrue(
        check_max_variation_matrix(full_fast, partial_fast, tolerance)
        and np.abs(partial_score - full_score) < tolerance,
        msg=scenario.fail_message)
def test_get_accurate_tree_score(self):
    """Accurate tree score of a fixed tree on dataHou78 matches the reference."""
    expected_score = -7478.72872196288

    data = FileUtils.read_mutation_matrix("datasets/dataHou78.csv")
    mask = ScoreUtils.get_mask_matrix(data)

    flat_log_scores = [-7.33943332981645e-05, -1.53502641492029,
                       -9.7145214530235, -0.563699113373059,
                       -11.2512044842885, -1.53502641492029,
                       0, 0]
    log_scores = np.asarray(flat_log_scores).reshape(4, 2)

    # Fixed tree, given as a parent vector.
    tree = np.asarray([
        9, 13, 12, 31, 5, 76, 37, 38, 1, 36, 1, 29, 67, 78, 67, 23, 14, 63,
        70, 61, 53, 11, 10, 69, 43, 7, 48, 29, 65, 43, 55, 21, 1, 35, 13, 14,
        11, 5, 59, 46, 5, 61, 14, 71, 21, 7, 6, 55, 44, 33, 48, 26, 24, 78,
        10, 8, 23, 77, 13, 10, 35, 57, 46, 73, 58, 8, 2, 23, 32, 61, 53, 67,
        60, 48, 58, 11, 3, 38])
    traversal = TreeUtils.get_depth_first_traversal(tree)

    actual_score, _ = ScoreUtils.get_accurate_tree_score(
        mask, tree, traversal, log_scores)

    self.assertAlmostEqual(actual_score, expected_score)
def propose_new_beta(self, mutation_matrix, mask_matrix, log_scores,
                     best_tree_log_score, jump_stdev, beta_params):
    """Propose a beta move for Metropolis-Hastings and accept or reject it.

    Args:
        mutation_matrix (np.ndarray): Mutation matrix (D matrix).
        mask_matrix (np.ndarray): Matrix of 'count of probabilities' of type
            P(D_ij = x|E_ij = 0) and P(D_ij = x|E_ij = 1) from which
            fast_matrix can be obtained if log scores matrix is known.
        log_scores (np.ndarray): Log scores matrix corresponding to error
            probabilities.
        best_tree_log_score (float): Best tree log score so far.
        jump_stdev (float): Beta jump normal random variable standard
            deviation.
        beta_params (list): Beta distribution parameters for beta.

    Returns:
        np.ndarray: The log scores matrix for the proposed beta when the move
        is accepted, otherwise the ``log_scores`` argument.
    """
    # Gaussian step from the current beta, mirrored back into [0, 1].
    jump = np.random.normal(0, jump_stdev)
    candidate_beta = MiscUtils.get_mirrored_beta(self.beta + jump)

    # Score the candidate beta and the current tree under that beta.
    candidate_beta_score = ScoreUtils.get_beta_score(
        candidate_beta, beta_params)
    candidate_log_scores = ScoreUtils.get_updated_log_scores_matrix(
        log_scores, candidate_beta)
    candidate_tree_score, candidate_fast_matrix = \
        ScoreUtils.get_fast_tree_score(
            mutation_matrix, mask_matrix, self.parent_vector, self.dft,
            candidate_log_scores, best_tree_log_score)

    accepted = ScoreUtils.check_accept_move(
        candidate_beta_score + candidate_tree_score,
        self.combined_log_score)
    if not accepted:
        # Rejected: the caller keeps the log scores it already had.
        return log_scores

    self.update_beta(candidate_beta, candidate_beta_score,
                     candidate_tree_score, candidate_fast_matrix)
    return candidate_log_scores
def tree_move_prune_and_reattach(self, mutation_matrix, mask_matrix,
                                 log_scores, best_tree_log_score):
    """Propose a prune and reattach tree move and accept or reject it.

    ``self.parent_vector`` is changed in place to the proposed tree before
    scoring; it is restored if the move is rejected.

    Args:
        mutation_matrix (np.ndarray): Mutation matrix (D matrix).
        mask_matrix (np.ndarray): Matrix of 'count of probabilities' of type
            P(D_ij = x|E_ij = 0) and P(D_ij = x|E_ij = 1) from which
            fast_matrix can be obtained if log scores matrix is known.
        log_scores (np.ndarray): Log scores matrix corresponding to error
            probabilities.
        best_tree_log_score (float): Best tree log score so far.
    """
    move = MoveUtils.get_move_params_prune_and_reattach(
        self.num_nodes, self.parent_vector, self.ancestor_matrix)
    node_to_move, new_parent, old_parent = move

    # From here on self.parent_vector holds the *proposed* tree.
    MoveUtils.prune_and_reattach_parent_vector_in_place(
        self.parent_vector, node_to_move, new_parent)
    candidate_dft = TreeUtils.get_depth_first_traversal(self.parent_vector)

    # The ancestor matrix passed here is still the one of the current tree.
    candidate_score, candidate_fast_matrix = \
        ScoreUtils.get_partial_tree_score_prune_and_reattach(
            mutation_matrix, mask_matrix, self.parent_vector, candidate_dft,
            self.ancestor_matrix, log_scores, self.fast_matrix,
            best_tree_log_score, node_to_move)

    if not ScoreUtils.check_accept_move(candidate_score,
                                        self.tree_log_score):
        # Rejected: undo the in-place change to the parent vector.
        MoveUtils.prune_and_reattach_parent_vector_revert_in_place(
            self.parent_vector, node_to_move, old_parent)
        return

    # Accepted: adopt the proposal and bring the ancestor matrix in line.
    self.dft = candidate_dft
    self.fast_matrix = candidate_fast_matrix
    self.update_tree_score(candidate_score)
    MoveUtils.prune_and_reattach_ancestor_matrix_in_place(
        self.ancestor_matrix, node_to_move, new_parent, old_parent)
def tree_move_swap_subtrees(self, mutation_matrix, mask_matrix, log_scores,
                            best_tree_log_score):
    """Propose a swap subtrees tree move and accept or reject it.

    The proposal is built as a separate parent vector, so nothing on the
    instance changes unless the move is accepted.

    Args:
        mutation_matrix (np.ndarray): Mutation matrix (D matrix).
        mask_matrix (np.ndarray): Matrix of 'count of probabilities' of type
            P(D_ij = x|E_ij = 0) and P(D_ij = x|E_ij = 1) from which
            fast_matrix can be obtained if log scores matrix is known.
        log_scores (np.ndarray): Log scores matrix corresponding to error
            probabilities.
        best_tree_log_score (float): Best tree log score so far.
    """
    move = MoveUtils.get_move_params_swap_subtrees(
        self.num_nodes, self.ancestor_matrix)
    above_node, below_node, same_lineage, new_parent, nbh = move

    # Build the proposed tree and its depth-first traversal.
    candidate_parents = MoveUtils.swap_subtrees_parent_vector(
        self.parent_vector, above_node, below_node, same_lineage, new_parent)
    candidate_dft = TreeUtils.get_depth_first_traversal(candidate_parents)

    candidate_score, candidate_fast_matrix = \
        ScoreUtils.get_partial_tree_score_swap_subtrees(
            mutation_matrix, mask_matrix, candidate_parents, candidate_dft,
            self.ancestor_matrix, log_scores, self.fast_matrix,
            best_tree_log_score, above_node, below_node, same_lineage)

    # nbh is forwarded as the neighbourhood correction of the acceptance test.
    accepted = ScoreUtils.check_accept_move(
        candidate_score, self.tree_log_score, neighbourhood_correction=nbh)
    if not accepted:
        return

    self.parent_vector = candidate_parents
    self.dft = candidate_dft
    self.fast_matrix = candidate_fast_matrix
    self.update_tree_score(candidate_score)
    self.ancestor_matrix = TreeUtils.get_ancestor_matrix(
        candidate_parents, candidate_dft)
def test_get_attachment_matrix(self):
    """Attachment matrix for the dataXu MAP tree matches the stored pairs."""
    # Attachment to MAP tree with fixed beta obtained from dataXu.csv
    beta = 0.198
    log_scores = ScoreUtils.get_log_scores_matrix(beta, 2.67E-5, 0, 0)

    data = FileUtils.read_mutation_matrix("datasets/dataXu.csv")
    mask = ScoreUtils.get_mask_matrix(data)

    tree = np.asarray(
        [5, 35, 0, 10, 2, 27, 9, 32, 11, 25, 17, 33, 15, 23, 34, 3, 14, 1,
         13, 24, 19, 28, 16, 30, 7, 8, 22, 6, 31, 26, 12, 20, 29, 18, 4],
        dtype=np.int32)
    traversal = TreeUtils.get_depth_first_traversal(tree)

    actual = ScoreUtils.get_attachment_matrix(
        mask, tree, traversal, beta, log_scores)

    # Expected matrix stored flat; consecutive values form one two-column row.
    expected_flat = [6, 8, 7, 2, 8, 8, 9, 8, 14, 13, 16, 13, 19, 2, 19, 12,
                     20, 2, 20, 10, 20, 12, 21, 7, 21, 9, 21, 15, 21, 16,
                     22, 13, 24, 2, 25, 8, 26, 11, 28, 0, 28, 1, 28, 3,
                     28, 5, 28, 6, 28, 14, 29, 11, 31, 3, 31, 4, 32, 11]
    num_rows = len(expected_flat) // 2
    expected = np.asarray(expected_flat, dtype=np.int32).reshape(num_rows, 2)

    self.assertTrue(check_each_element_matrix(actual == expected))
def __init__(self, mutation_matrix_filename):
    """Load the mutation data and derive what the sampler needs from it.

    Throughout the class beta means the probability of false negative, i.e.
    the probability of not observing a mutation even though the mutation is
    present. Beta consists of probabilities P(D_ij = 0|E_ij = 1) and
    P(D_ij = 2|E_ij = 1). All trees are mutation trees.

    Args:
        mutation_matrix_filename (str): Path to file that contains the data.
            File should contain numbers 0, 1, 2, 3; numbers must be separated
            with spaces. Each line (row) represents a mutation and each
            column represents a cell (mutation profile).
    """
    # Mutation matrix read from file, and the mask matrix built from it.
    self.mutation_matrix = FileUtils.read_mutation_matrix(
        mutation_matrix_filename)
    self.mask_matrix = ScoreUtils.get_mask_matrix(self.mutation_matrix)

    # Rows give the number of tree nodes, columns the number of cells.
    row_count, column_count = self.mutation_matrix.shape
    self.num_nodes = row_count
    self.num_cells = column_count
def run_mh(self, num_repetitions, chain_length, d0e1_probability,
           d1e0_probability, d2e0_probability=0.0, d2e1_probability=0.0,
           burn_in_proportion=0.25, sampling_enabled=False,
           beta_move_probability=0.0, mh_jump_scaling=10.0,
           beta_prior_stdev=0.1, gene_names_filename=None,
           store_best_trees=True, max_best_trees_stored=100,
           add_cells_to_best_trees=True, prune_reattach_probability=0.55,
           swap_node_labels_probability=0.4, swap_subtrees_probability=0.05,
           remove_and_insert_probability=0.0, silent=False, output_name=""):
    """Run Metropolis-Hastings algorithm with the data provided in the initialization.

    Progress is printed to the console. Posterior samples (if enabled) are
    saved under ``posterior_samples/`` and best trees (if enabled) under
    ``best_trees/``; both directories are created when missing.

    Args:
        num_repetitions (int): Number of repetitions of the
            Metropolis-Hastings algorithm.
        chain_length (int): Length of chain (number of steps) in each
            repetition.
        d0e1_probability (float): Probability P(D_ij = 0|E_ij = 1).
        d1e0_probability (float): Probability P(D_ij = 1|E_ij = 0).
        d2e0_probability (float): Probability P(D_ij = 2|E_ij = 0).
        d2e1_probability (float): Probability P(D_ij = 2|E_ij = 1).
        burn_in_proportion (float): Proportion of initial steps that are
            deemed burn in phase.
        sampling_enabled (bool): Sampling of the posterior distribution. If
            enabled, current tree and beta are stored in each step after the
            burn-in phase.
        beta_move_probability (float): Probability of choosing beta move
            instead of a tree move.
        mh_jump_scaling (float): Scaling of beta jump standard deviation in
            relation to beta prior standard deviation (beta jump standard
            deviation = beta prior standard deviation / mh_jump_scaling).
        beta_prior_stdev (float): Beta standard deviation prior.
        gene_names_filename (str): Path to file that contains gene names of
            genes in the mutation matrix. Each name must be in its own line
            in file. Order of names should correspond to the order of genes
            in the mutation matrix.
        store_best_trees (bool): Store best MAP trees encountered during the
            algorithm execution. Trees are written to separate files in
            Graphviz format and are stored in folder best_trees.
        max_best_trees_stored (int): Maximum number of best trees stored to
            folder best_trees if store_best_trees is set to True.
        add_cells_to_best_trees (bool): Find best attachment of all cells
            when the algorithm terminates.
        prune_reattach_probability (float): Probability of choosing prune and
            reattach tree move when tree move is used.
        swap_node_labels_probability (float): Probability of choosing swap
            node labels tree move when tree move is used.
        swap_subtrees_probability (float): Probability of choosing swap
            subtrees tree move when tree move is used.
        remove_and_insert_probability (float): Probability of choosing remove
            and insert tree move when tree move is used. It is validated here
            but not forwarded to the tree proposal in this method.
        silent (bool): If True, output less intermediate information.
        output_name (str): String used for naming best trees and posterior
            samples files.
    """
    # Local import: only needed to make sure the output folders exist.
    import os

    header_format = "{:>25} {:>25} {:>25} {:>25}"
    row_format = "{:>25} {:>25.15f} {:>25.15f} {:>25.15f}"

    # Time spent in optimal states after the burn in phase
    time_in_optimal_states_after_burn_in = 0

    # Number of steps inside and outside of the burn-in phase
    num_burn_in_steps = int(burn_in_proportion * chain_length)
    num_non_burn_in_steps = chain_length - num_burn_in_steps

    # Reserve space for posterior distributions
    posterior_trees = None
    posterior_betas = None
    if sampling_enabled:
        num_posterior_samples = num_repetitions * num_non_burn_in_steps
        posterior_trees = np.empty((num_posterior_samples, self.num_nodes),
                                   dtype=np.int32)
        posterior_betas = np.empty((num_posterior_samples, ),
                                   dtype=np.float64)

    # Check that provided error probabilities are valid
    self.check_error_probabilities(d0e1_probability, d1e0_probability,
                                   d2e0_probability, d2e1_probability)

    # Check that provided move probabilities are valid or correct them if
    # they are not
    prune_reattach_probability, swap_node_labels_probability, \
        swap_subtrees_probability, remove_and_insert_probability, \
        beta_move_probability = self.check_move_probabilities(
            prune_reattach_probability, swap_node_labels_probability,
            swap_subtrees_probability, remove_and_insert_probability,
            beta_move_probability)

    # Estimated beta (false negative probability) mean and standard deviation
    beta_prior_mean = float(d0e1_probability + d2e1_probability)
    beta_prior_stdev = float(beta_prior_stdev)

    # Calculation of beta distribution parameters for beta
    beta_params = ScoreUtils.calculate_beta_distribution_parameters(
        beta_prior_mean, beta_prior_stdev)

    # Scaling of beta jump standard deviation in relation to beta prior
    # standard deviation
    jump_stdev = beta_prior_stdev / float(mh_jump_scaling)

    # Best results
    # Value 1 means that best score is not yet present (equivalent of None)
    # Integer default value is used so that variables can be used in
    # @njit(cache=True) annotated functions
    best_tree_log_score = 1
    best_combined_log_score = 1
    best_beta = beta_prior_mean

    # Defined here so that the attachment matrix can be calculated after the
    # repetitions have finished
    log_scores = None

    # Initialize object in which best trees and betas are stored
    best_results = BestResults(max_best_trees_stored, self.num_nodes)

    for repetition in range(num_repetitions):
        # Print information to console regarding current repetition
        print("Repetition: ", repetition + 1)
        if not silent:
            print(header_format.format(
                "num steps", "best_tree_log_score", "best_beta",
                "best_combined_log_score"))

        # Initialize a tabras
        # Mutation tree is initialized as a random mutation tree
        # Beta is initialized as beta_prior_mean
        tabras = TreeAndBetaRepresentationsAndScores(
            self.num_nodes, beta_prior_mean)

        # Construct log scores matrix
        # Each entry in the log scores matrix corresponds to one of the log
        # probabilities P(D_ij = x|E_ij = y)
        log_scores = ScoreUtils.get_log_scores_matrix(
            d0e1_probability, d1e0_probability, d2e0_probability,
            d2e1_probability)

        # Evaluate initialized tabras
        tabras.calculate_initial_scores(
            self.mutation_matrix, self.mask_matrix, log_scores,
            beta_move_probability, beta_params)

        for step in range(chain_length):
            # Output intermediate information
            if not silent and (step == 1 or step > 0 and step % 10000 == 0):
                print(row_format.format(
                    str(step), best_tree_log_score, best_beta,
                    best_combined_log_score))

            # Beta move is chosen (a random number is drawn only when beta
            # moves are enabled)
            if beta_move_probability > 0 and \
                    np.random.random() < beta_move_probability:
                log_scores = tabras.propose_new_beta(
                    self.mutation_matrix, self.mask_matrix, log_scores,
                    best_tree_log_score, jump_stdev, beta_params)
            # Tree move is chosen
            else:
                tabras.propose_new_tree(
                    self.mutation_matrix, self.mask_matrix, log_scores,
                    best_tree_log_score, prune_reattach_probability,
                    swap_node_labels_probability, swap_subtrees_probability)

            # Update optimal trees if current tree is optimal (at least
            # currently)
            if store_best_trees:
                best_results.update_results(
                    tabras.parent_vector, tabras.beta,
                    tabras.combined_log_score, best_combined_log_score)

            # Store tree and beta for future sampling from posterior
            # distribution
            if sampling_enabled and step >= num_burn_in_steps:
                posterior_index = repetition * num_non_burn_in_steps \
                    + step - num_burn_in_steps
                posterior_trees[posterior_index, :] = tabras.parent_vector
                posterior_betas[posterior_index] = tabras.beta

            # Update log scores if current tree and beta are the best until
            # now; a new optimum restarts the optimal-state counter
            if best_combined_log_score == 1 or \
                    tabras.combined_log_score > best_combined_log_score:
                time_in_optimal_states_after_burn_in = 0
                best_tree_log_score = tabras.tree_log_score
                best_combined_log_score = tabras.combined_log_score
                best_beta = tabras.beta

            if tabras.combined_log_score == best_combined_log_score and \
                    step >= num_burn_in_steps:
                time_in_optimal_states_after_burn_in += 1

        # Closing row of the progress table for this repetition
        print(row_format.format(
            chain_length, best_tree_log_score, best_beta,
            best_combined_log_score))

    print("Number of steps in optimal states after burn-in: {0}".format(
        time_in_optimal_states_after_burn_in))

    added_string = "_" + output_name if output_name != "" else ""

    # Store samples to a file
    if sampling_enabled:
        # np.save does not create missing folders; without this a missing
        # folder would discard the samples after the whole chain has run.
        os.makedirs("posterior_samples", exist_ok=True)
        np.save("posterior_samples/trees" + added_string + ".npy",
                posterior_trees)
        np.save("posterior_samples/betas" + added_string + ".npy",
                posterior_betas)

    if store_best_trees:
        # Read gene names
        gene_names = FileUtils.get_gene_names(gene_names_filename,
                                              self.num_nodes)

        # Write best trees to files. The betas get their own name so that the
        # scalar best_beta is not rebound to an array.
        best_trees, best_betas = best_results.get_best_results()
        num_best_results = min(best_trees.shape[0], max_best_trees_stored)

        # Make sure the output folder exists before any tree is written.
        os.makedirs("best_trees", exist_ok=True)

        for i in range(num_best_results):
            output_filename = "best_trees/map" + added_string + "_" + str(
                i) + ".gv"

            attachment_matrix = None
            if add_cells_to_best_trees:
                dft = TreeUtils.get_depth_first_traversal(best_trees[i])
                # NOTE(review): log_scores is the matrix left by the last
                # chain step; presumably get_attachment_matrix adjusts it to
                # the beta passed alongside - confirm.
                attachment_matrix = ScoreUtils.get_attachment_matrix(
                    self.mask_matrix, best_trees[i], dft, best_betas[i],
                    log_scores)

            FileUtils.output_graph_viz_file(
                output_filename, best_trees[i, :], gene_names,
                add_cells_to_best_trees, attachment_matrix)