def reconstruct_discrete_traits(tree, traits, missing_data='?', pc=1.0, sampling_bias_correction=None,
                                weights=None, verbose=0, iterations=5):
    """take a set of discrete states associated with tips of a tree
    and reconstruct their ancestral states along with a GTR model that
    approximately maximizes the likelihood of the states on the tree.

    Parameters
    ----------
    tree : str, Bio.Phylo.Tree
        name of tree file or Biopython tree object
    traits : dict
        dictionary linking tips to traits
    missing_data : str, optional
        string indicating missing data
    pc : float, optional
        number of pseudo-counts to be used during GTR inference, default 1.0
    sampling_bias_correction : float, optional
        factor to inflate overall switching rate by to counteract sampling bias
    weights : str or dict, optional
        either the name of a file with equilibrium frequencies (read with
        pandas.read_csv; tab separated if the name ends in 'tsv', comma
        separated otherwise; first column is the state, second column the
        weight) or a dictionary mapping states to weights
    verbose : int, optional
        level of verbosity in output
    iterations : int, optional
        number of times non-linear optimization of overall rate and
        transmission estimation are iterated

    Returns
    -------
    tuple
        ``(treeanc, letter_to_state, reverse_alphabet)``: the treeanc object,
        the map from alphabet character to state and the map from state to
        alphabet character. ``(None, None, None)`` is returned if fewer than
        two states (not counting missing data) are found.

    Raises
    ------
    ValueError
        if the weights file could not be loaded
    TreeTimeError
        if more than half of the observed states have no weight, or
        if ancestral reconstruction errors out
    """
    # NOTE(review): this file contains a second, later definition of
    # reconstruct_discrete_traits that replaces this one at import time --
    # confirm which variant is meant to be live.

    ###########################################################################
    ### make a single character alphabet that maps to discrete states
    ###########################################################################

    unique_states = set(traits.values())
    n_observed_states = len(unique_states)

    # load weights from file and convert to dict if supplied as string
    if isinstance(weights, str):
        try:
            tmp_weights = pd.read_csv(weights, sep='\t' if weights.endswith('tsv') else ',',
                                      skipinitialspace=True)
            # explicit positional column access; rows with a NaN weight are skipped
            weight_dict = {state: w
                           for state, w in zip(tmp_weights.iloc[:, 0], tmp_weights.iloc[:, 1])
                           if not np.isnan(w)}
        except Exception as err:
            raise ValueError("Loading of weights file '%s' failed!"%weights) from err
    elif isinstance(weights, dict):
        weight_dict = weights
    else:
        weight_dict = None

    # add weights to unique states for alphabet construction
    if weight_dict is not None:
        unique_states.update(weight_dict.keys())
        missing_weights = [c for c in unique_states if c not in weight_dict and c != missing_data]
        if missing_weights:
            print("Missing weights for values: " + ", ".join(str(c) for c in missing_weights))

        if len(missing_weights)>0.5*n_observed_states:
            print("More than half of discrete states missing from the weights file")
            print("Weights read from file are:", weights)
            raise TreeTimeError("More than half of discrete states missing from the weights file")

    # Drop the missing-data marker BEFORE assigning letters: enumerating it and
    # filtering afterwards leaves a hole in the letter range, so that
    # chr(65+n_states) below could coincide with the letter of a real state.
    observed_states = sorted(state for state in unique_states if state != missing_data)

    # make a map from states (excluding missing data) to characters in the alphabet
    # note that gap character '-' is chr(45) and will never be included here
    reverse_alphabet = {state: chr(65+i) for i, state in enumerate(observed_states)}
    alphabet = list(reverse_alphabet.values())
    # construct a look up from alphabet character to states
    letter_to_state = {v: k for k, v in reverse_alphabet.items()}

    # construct the vector with weights to be used as equilibrium frequency;
    # states without a weight get the mean of the supplied weights
    if weight_dict is not None:
        mean_weight = np.mean(list(weight_dict.values()))
        weights = np.array([weight_dict.get(letter_to_state[c], mean_weight) for c in alphabet],
                           dtype=float)
        weights /= weights.sum()

    # consistency checks
    if len(alphabet)<2:
        print("mugration: only one or zero states found -- this doesn't make any sense", file=sys.stderr)
        return None, None, None

    # the character right after the last state letter stands for missing data
    n_states = len(alphabet)
    missing_char = chr(65+n_states)
    reverse_alphabet[missing_data] = missing_char
    letter_to_state[missing_char] = missing_data

    ###########################################################################
    ### construct gtr model
    ###########################################################################

    # set up dummy matrix
    W = np.ones((n_states, n_states), dtype=float)

    mugration_GTR = GTR.custom(pi=weights, W=W, alphabet=np.array(alphabet))
    # missing data is compatible with every state
    mugration_GTR.profile_map[missing_char] = np.ones(n_states)
    mugration_GTR.ambiguous = missing_char

    ###########################################################################
    ### set up treeanc
    ###########################################################################
    treeanc = TreeAnc(tree, gtr=mugration_GTR, verbose=verbose,
                      convert_upper=False, one_mutation=0.001)
    treeanc.use_mutation_length = False

    # one single-character pseudo sequence per tip; tips without a trait get missing_char
    pseudo_seqs = [SeqRecord(id=n.name, name=n.name,
                             seq=Seq(reverse_alphabet[traits[n.name]]
                                     if n.name in traits else missing_char))
                   for n in treeanc.tree.get_terminals()]
    valid_seq = np.array([str(s.seq)!=missing_char for s in pseudo_seqs])
    print("Assigned discrete traits to %d out of %d taxa.\n"%(np.sum(valid_seq),len(valid_seq)))
    treeanc.aln = MultipleSeqAlignment(pseudo_seqs)

    try:
        treeanc.infer_ancestral_sequences(method='ml', infer_gtr=True,
                                          store_compressed=False, pc=pc, marginal=True,
                                          normalized_rate=False, fixed_pi=weights,
                                          reconstruct_tip_states=True)
        treeanc.optimize_gtr_rate()
    except TreeTimeError:
        print("\nAncestral reconstruction failed, please see above for error messages and/or rerun with --verbose 4\n")
        raise

    # alternate between re-estimating the GTR model and the overall rate
    for _ in range(iterations):
        treeanc.infer_gtr(marginal=True, normalized_rate=False, pc=pc, fixed_pi=weights)
        treeanc.optimize_gtr_rate()

    if sampling_bias_correction:
        treeanc.gtr.mu *= sampling_bias_correction

    # final reconstruction with the refined (and possibly rescaled) model
    treeanc.infer_ancestral_sequences(infer_gtr=False, store_compressed=False,
                                      marginal=True, normalized_rate=False,
                                      reconstruct_tip_states=True)

    print(fill("NOTE: previous versions (<0.7.0) of this command made a 'short-branch length assumption. "
               "TreeTime now optimizes the overall rate numerically and thus allows for long branches "
               "along which multiple changes accumulated. This is expected to affect estimates of the "
               "overall rate while leaving the relative rates mostly unchanged."))

    return treeanc, letter_to_state, reverse_alphabet
def reconstruct_discrete_traits(tree, traits, missing_data='?', pc=1.0, sampling_bias_correction=None,
                                weights=None, verbose=0, iterations=5):
    """take a set of discrete states associated with tips of a tree
    and reconstruct their ancestral states along with a GTR model that
    approximately maximizes the likelihood of the states on the tree.

    Parameters
    ----------
    tree : str, Bio.Phylo.Tree
        name of tree file or Biopython tree object
    traits : dict
        dictionary linking tips to traits
    missing_data : str, optional
        string indicating missing data
    pc : float, optional
        number of pseudo-counts to be used during GTR inference, default 1.0
    sampling_bias_correction : float, optional
        factor to inflate overall switching rate by to counteract sampling bias
    weights : str, optional
        name of file with equilibrium frequencies (read with pandas.read_csv;
        tab separated if the name ends in 'tsv', comma separated otherwise;
        first column is the state, second column the weight). Any non-string
        value is ignored.
    verbose : int, optional
        level of verbosity in output
    iterations : int, optional
        number of times non-linear optimization of overall rate and
        transmission estimation are iterated

    Returns
    -------
    tuple
        ``(treeanc, letter_to_state, reverse_alphabet)``: the treeanc object,
        the map from alphabet character to state and the map from state to
        alphabet character. ``(None, None, None)`` is returned if fewer than
        two or more than 180 states (not counting missing data) are found.

    Raises
    ------
    TreeTimeError
        raise error if ancestral reconstruction errors out
    """
    # NOTE(review): an earlier definition of reconstruct_discrete_traits exists
    # in this file; being the later one, this definition is the one that is
    # bound at import time -- confirm which variant is meant to be live.

    # The missing-data marker is not a state of the model: keep it out of the
    # alphabet even if it shows up explicitly among the trait values.
    unique_states = sorted(state for state in set(traits.values()) if state != missing_data)
    nc = len(unique_states)
    if nc>180:
        print("mugration: can't have more than 180 states!", file=sys.stderr)
        return None, None, None
    elif nc<2:
        print("mugration: only one or zero states found -- this doesn't make any sense", file=sys.stderr)
        return None, None, None

    ###########################################################################
    ### make a single character alphabet that maps to discrete states
    ###########################################################################
    alphabet = [chr(65+i) for i in range(nc)]
    # the character right after the last state letter stands for missing data
    missing_char = chr(65+nc)
    letter_to_state = dict(zip(alphabet, unique_states))
    letter_to_state[missing_char] = missing_data
    reverse_alphabet = {v: k for k, v in letter_to_state.items()}

    ###########################################################################
    ### construct gtr model
    ###########################################################################
    if isinstance(weights, str):
        tmp_weights = pd.read_csv(weights, sep='\t' if weights.endswith('tsv') else ',',
                                  skipinitialspace=True)
        # explicit positional column access: first column state, second column weight
        weight_dict = dict(zip(tmp_weights.iloc[:, 0], tmp_weights.iloc[:, 1]))
        # states without a weight get the mean of the supplied weights
        mean_weight = np.mean(list(weight_dict.values()))
        weights = np.array([weight_dict.get(c, mean_weight) for c in unique_states], dtype=float)
        weights /= weights.sum()
    else:
        weights = None

    # set up dummy matrix
    W = np.ones((nc, nc), dtype=float)

    mugration_GTR = GTR.custom(pi=weights, W=W, alphabet=np.array(alphabet))
    # missing data is compatible with every state
    mugration_GTR.profile_map[missing_char] = np.ones(nc)
    mugration_GTR.ambiguous = missing_char

    ###########################################################################
    ### set up treeanc
    ###########################################################################
    treeanc = TreeAnc(tree, gtr=mugration_GTR, verbose=verbose,
                      convert_upper=False, one_mutation=0.001)
    treeanc.use_mutation_length = False

    # one single-character pseudo sequence per tip; tips without a trait get missing_char
    pseudo_seqs = [SeqRecord(id=n.name, name=n.name,
                             seq=Seq(reverse_alphabet[traits[n.name]]
                                     if n.name in traits else missing_char))
                   for n in treeanc.tree.get_terminals()]
    treeanc.aln = MultipleSeqAlignment(pseudo_seqs)

    try:
        treeanc.infer_ancestral_sequences(method='ml', infer_gtr=True,
                                          store_compressed=False, pc=pc, marginal=True,
                                          normalized_rate=False, fixed_pi=weights,
                                          reconstruct_tip_states=True)
        treeanc.optimize_gtr_rate()
    except TreeTimeError:
        print("\nAncestral reconstruction failed, please see above for error messages and/or rerun with --verbose 4\n")
        raise

    # alternate between re-estimating the GTR model and the overall rate;
    # fixed_pi keeps the user supplied equilibrium frequencies in place
    # instead of dropping them after the first inference above
    for _ in range(iterations):
        treeanc.infer_gtr(marginal=True, normalized_rate=False, pc=pc, fixed_pi=weights)
        treeanc.optimize_gtr_rate()

    if sampling_bias_correction:
        treeanc.gtr.mu *= sampling_bias_correction

    # final reconstruction with the refined (and possibly rescaled) model
    treeanc.infer_ancestral_sequences(infer_gtr=False, store_compressed=False,
                                      marginal=True, normalized_rate=False,
                                      reconstruct_tip_states=True)

    print(fill("NOTE: previous versions (<0.7.0) of this command made a 'short-branch length assumption. "
               "TreeTime now optimizes the overall rate numerically and thus allows for long branches "
               "along which multiple changes accumulated. This is expected to affect estimates of the "
               "overall rate while leaving the relative rates mostly unchanged."))

    return treeanc, letter_to_state, reverse_alphabet