def target_distance_fcn(args, this_seq, target_seqs):
    '''
    Return the smallest Hamming distance from this_seq to any target sequence.

    args.metric_for_target_distance selects what is compared:
        'aa':  the .aa attribute of this_seq and of every target
        'nuc': the .nuc attribute of this_seq and of every target

    Raises AssertionError for any other metric, and ValueError (from min)
    when target_seqs is empty.
    '''
    metric = args.metric_for_target_distance
    if metric == 'aa':
        return min(hamming_distance(this_seq.aa, t.aa) for t in target_seqs)
    if metric == 'nuc':
        return min(hamming_distance(this_seq.nuc, t.nuc) for t in target_seqs)
    # Raised explicitly rather than via "assert False": a bare assert is
    # stripped under "python -O", which would make this function silently
    # return None. The exception type is kept for existing callers.
    raise AssertionError(
        'unknown metric_for_target_distance: {!r}'.format(metric))
def calc_Kd(seqAA, targetAAseqs, hd2affy):
    '''
    Compute the Kd of an amino acid sequence from its closest target.

    The Hamming distance to the nearest sequence in targetAAseqs is passed
    through hd2affy, the "hamming distance to affinity" transformation, and
    that result is returned.
    '''
    if '*' not in seqAA:
        nearest = min(
            hamming_distance(seqAA, target) for target in targetAAseqs)
        return hd2affy(nearest)
    # Nonsense sequences (containing a stop codon) have zero affinity,
    # which corresponds to an infinite Kd.
    return float('inf')
def compare(self, tree2, method='identity'):
    '''
    Compare this tree to the other tree.

    method:
        'identity': return True if both trees hold the same sorted list of
                    (sequence, abundance, parent sequence) records, else
                    False.
        'MRCA':     for every pair of observed taxa (nodes of this tree with
                    nonzero frequency), compare the sequence of their most
                    recent common ancestor in this (true) tree and in tree2
                    (inferred). Returns the summed Hamming distance divided
                    by the summed MRCA sequence lengths.
        'RF':       Robinson-Foulds distance between deep copies of both
                    trees, after hanging an extra leaf carrying the sequence
                    under every node with positive frequency.

    tree2 must expose a .tree attribute like this object does.
    Raises ValueError for any other method.
    '''
    if method == 'identity':
        # we compare lists of seq, abundance, parent seq
        list1 = sorted((node.sequence, node.frequency,
                        node.up.sequence if node.up is not None else None)
                       for node in self.tree.traverse())
        list2 = sorted((node.sequence, node.frequency,
                        node.up.sequence if node.up is not None else None)
                       for node in tree2.tree.traverse())
        return list1 == list2
    elif method == 'MRCA':
        # matrix of hamming distance of common ancestors of taxa
        # takes a true and inferred tree as CollapsedTree objects
        taxa = [
            node.sequence for node in self.tree.traverse() if node.frequency
        ]
        n_taxa = len(taxa)
        d = scipy.zeros(shape=(n_taxa, n_taxa))
        sum_sites = scipy.zeros(shape=(n_taxa, n_taxa))
        # Find each taxon once per tree instead of once per pair. The
        # built-in next() is used because generators have no .next() method
        # on Python 3.
        nodes_true = [
            next(self.tree.iter_search_nodes(sequence=taxon))
            for taxon in taxa
        ]
        nodes_inferred = [
            next(tree2.tree.iter_search_nodes(sequence=taxon))
            for taxon in taxa
        ]
        for i in range(n_taxa):
            for j in range(i + 1, n_taxa):
                MRCA_true = self.tree.get_common_ancestor(
                    (nodes_true[i], nodes_true[j])).sequence
                MRCA = tree2.tree.get_common_ancestor(
                    (nodes_inferred[i], nodes_inferred[j])).sequence
                d[i, j] = hamming_distance(MRCA_true, MRCA)
                sum_sites[i, j] = len(MRCA_true)
        return d.sum() / sum_sites.sum()
    elif method == 'RF':
        tree1_copy = self.tree.copy(method='deepcopy')
        tree2_copy = tree2.tree.copy(method='deepcopy')
        for treex in (tree1_copy, tree2_copy):
            # Snapshot the traversal: leaves are added while we loop.
            for node in list(treex.traverse()):
                if node.frequency > 0:
                    child = TreeNode()
                    child.add_feature('sequence', node.sequence)
                    node.add_child(child)
        try:
            return tree1_copy.robinson_foulds(tree2_copy,
                                              attr_t1='sequence',
                                              attr_t2='sequence',
                                              unrooted_trees=True)[0]
        except Exception:
            # Retry allowing duplicated sequence labels. Catching Exception
            # (not a bare except) lets KeyboardInterrupt/SystemExit through.
            return tree1_copy.robinson_foulds(tree2_copy,
                                              attr_t1='sequence',
                                              attr_t2='sequence',
                                              unrooted_trees=True,
                                              allow_dup=True)[0]
    else:
        raise ValueError('invalid distance method: ' + method)
def ASR_parser(args):
    '''
    Turn an IQ-TREE ancestral sequence reconstruction into a collapsed forest.

    Reads from args:
        tree:      newick tree file (parsed with format=1)
        counts:    text file with "<name>,<count>" on each line
        asr_seq, leaf_seq, naive: passed on to map_asr_to_tree / reroot_tree
        colormap, idmap: optional pickles used to color the rendered tree
        name:      name given to the CollapsedTree and CollapsedForest
        outbase:   prefix of the output files

    Writes <outbase>.svg (rendering), <outbase>.tree (newick) and
    <outbase>.p (pickled CollapsedForest).

    Raises TreeFileParsingError when the tree file cannot be read.
    '''
    try:
        import cPickle as pickle
    except ImportError:
        import pickle
    from GCutils import CollapsedForest, CollapsedTree, hamming_distance

    try:
        tree = Tree(args.tree, format=1)
    except Exception as e:
        print(e)
        raise TreeFileParsingError('Could not read the input tree. Is this really newick format?')

    # Read the abundances inside a context manager so the handle is closed.
    counts = {}
    with open(args.counts) as counts_fh:
        for line in counts_fh:
            fields = line.split(',')
            counts[fields[0]] = int(fields[1])

    tree.add_feature('frequency', 0)       # Placeholder will be deleted when rerooting
    tree.add_feature('sequence', 'DUMMY')  # Placeholder will be deleted when rerooting
    tree = map_asr_to_tree(args.asr_seq, args.leaf_seq, tree, args.naive, counts)

    # Reroot to make the naive sequence the real root instead of just an outgroup:
    tree = reroot_tree(tree, pattern=args.naive)

    # Recompute branch lengths as hamming distances:
    tree.dist = 0  # No branch above root
    for node in tree.iter_descendants():
        node.dist = hamming_distance(node.sequence, node.up.sequence)

    iqtree_tree = CollapsedTree(tree=tree, name=args.name)

    # Add colors:
    if args.colormap is not None:
        # NOTE: unpickling runs arbitrary code from the file; only pass
        # colormap/idmap files that come from a trusted source.
        with open(args.colormap, 'rb') as fh:
            colormap = pickle.load(fh)
        with open(args.idmap, 'rb') as fh:
            id_map = pickle.load(fh)
        # Reverse the id_map:
        id_map = {cs: seq_id for seq_id, cell_ids in id_map.items() for cs in cell_ids}
        # Expand the colormap and map to sequence ids:
        colormap_seqid = dict()
        for key, color in colormap.items():
            if isinstance(key, str) and key in id_map:
                colormap_seqid[id_map[key]] = color
            else:
                # Otherwise the key is iterated as a collection of cell ids.
                for cell_id in key:
                    if cell_id in id_map:
                        colormap_seqid[id_map[cell_id]] = color
        colormap = colormap_seqid
    else:
        colormap = None

    iqtree_tree.render(args.outbase + '.svg', colormap=colormap)
    iqtree_forest = CollapsedForest(forest=[iqtree_tree], name=args.name)
    # Dump tree as newick:
    iqtree_forest.write_random_tree(args.outbase+'.tree')
    print('number of trees with integer branch lengths:', iqtree_forest.n_trees)

    with open(args.outbase + '.p', 'wb') as f:
        pickle.dump(iqtree_forest, f)

    print('Done parsing IQ-TREE tree')
def build_tree(sequences, parents, counts=None, naive='naive'):
    '''
    Build an ete tree from named sequences and a child -> parent mapping.

    sequences: mapping from node name to nucleotide sequence
    parents:   mapping from node name to the name of its parent; a name
               missing from this mapping is used as the root
    counts:    optional mapping from node name to abundance (default 0)
    naive:     substring identifying the naive node to reroot on, or None
               to keep the root as found

    Every node gets nuc_seq, aa_seq and frequency features, and branch
    lengths are Hamming distances between nucleotide sequences.
    '''
    # start from a dictionary of disconnected nodes
    nodes = {}
    for name in sequences:
        nuc = sequences[name]
        node = Tree()
        node.name = name
        node.add_feature('nuc_seq', nuc)
        node.add_feature('aa_seq', local_translate(nuc))
        observed = counts is not None and name in counts
        node.add_feature('frequency', counts[name] if observed else 0)
        nodes[name] = node

    # connect every node to its parent; a parentless node is the root
    for name in sequences:
        if name not in parents:
            tree = nodes[name]
            continue
        nodes[parents[name]].add_child(nodes[name])

    # Reroot on naive:
    if naive is not None:
        naive_id = [n for n in nodes if naive in n][0]
        naive_node = nodes[naive_id]
        assert len(naive_node.children) == 0
        naive_parent = naive_node.up
        naive_parent.remove_child(naive_node)
        naive_node.add_child(naive_parent)
        # remove possible unnecessary unifurcation after rerooting
        if len(naive_parent.children) == 1:
            naive_parent.delete(prevent_nondicotomic=False)
            only_child = naive_parent.children[0]
            only_child.dist = hamming_distance(only_child.nuc_seq,
                                               naive_node.nuc_seq)
        tree = naive_node

    # make random choices for ambiguous bases
    tree = disambiguate(tree)

    # compute branch lengths
    tree.dist = 0  # no branch above root
    for descendant in tree.iter_descendants():
        descendant.dist = hamming_distance(descendant.nuc_seq,
                                           descendant.up.nuc_seq)

    return tree
def one_mutant(self, sequence, Nmuts, lambda0=0.1):
    '''
    Mutate sequence until its translation is exactly Nmuts away, in amino
    acid Hamming distance, from the translation of the starting sequence.
    Returns the mutated amino acid sequence, which never contains a stop
    codon. Raises RuntimeError after 100 failed attempts.
    '''
    # Allow 100 trials before quitting
    for _attempt in range(100):
        mut_seq = sequence[:]
        aa = translate(sequence)
        aa_mut = translate(mut_seq)
        dist = hamming_distance(aa, aa_mut)
        while dist < Nmuts:
            mut_seq = self.mutate(mut_seq, lambda0=lambda0)
            aa_mut = translate(mut_seq)
            dist = hamming_distance(aa, aa_mut)
        # Overshooting Nmuts or hitting a stop codon wastes this attempt
        if dist == Nmuts and '*' not in aa_mut:
            return aa_mut
    raise RuntimeError(
        '100 consecutive attempts for creating a target sequence failed.')
def simulate(args):
    '''
    Simulation subprogram. Can simulate in two modes.
    a) Neutral mode. A Galton-Watson process, with mutation probabilities
       according to a user defined motif model e.g. S5F.
    b) Selection mode. Using the same mutation process as in a), but in
       selection mode the poisson progeny distribution's lambda parameter is
       dynamically adjusted according to the hamming distance to a list of
       target sequences. The closer a sequence gets to one of the targets
       the higher fitness and the closer lambda will approach 2, vice versa
       when the sequence is far away lambda approaches 0.

    All settings are read from args. Note that args.lambda0 and
    args.sequence are overwritten with their resolved values.

    Files written, all prefixed with args.outbase:
        .fasta (or _seq1.fasta and _seq2.fasta for a sequence pair),
        _stats.tsv, _lineage_tree.svg, _lineage_tree.p, _collapsed_tree.p,
        _collapsed_tree.svg, _collapsed_tree_colormap.tsv,
        _collapsed_tree_colormap.p and, in selection mode,
        _collapsed_runstat_color_tree.p / .svg.

    Raises:
        Exception: a sequence pair was given with more than two lambda0.
        RuntimeError: no acceptable tree was simulated within 1000 attempts.
    '''
    if args.random_seed is not None:
        numpy.random.seed(args.random_seed)
        random.seed(args.random_seed)
    mutation_model = MutationModel(args.mutability, args.substitution)
    if args.lambda0 is None:
        args.lambda0 = [max([1, int(.01 * len(args.sequence))])]
    if args.random_seq is not None:
        from Bio import SeqIO
        records = list(SeqIO.parse(args.random_seq, "fasta"))
        random.shuffle(records)
        args.sequence = str(records[0].seq).upper()
    else:
        args.sequence = args.sequence.upper()
    if args.sequence2 is not None:
        if len(args.lambda0) == 1:
            # Use the same mutation rate on both sequences
            args.lambda0 = [args.lambda0[0], args.lambda0[0]]
        elif len(args.lambda0) != 2:
            raise Exception(
                'Only one or two lambda0 can be defined for a two sequence simulation.'
            )
        # Extract the bounds between sequence 1 and 2:
        pair_bounds = ((0, len(args.sequence)),
                       (len(args.sequence),
                        len(args.sequence) + len(args.sequence2)))
        # Merge the two sequences to simplify future dealing with the pair:
        args.sequence += args.sequence2.upper()
    else:
        pair_bounds = None

    if args.selection:
        # The fully activating fraction on BA must be possible to reach
        # within B_total:
        assert (args.B_total >= args.f_full)
        # Find the total amount of A necessary for sustaining the inputted
        # carrying capacity:
        print((args.carry_cap, args.B_total, args.f_full, args.mature_affy))
        A_total = selection_utils.find_A_total(args.carry_cap, args.B_total,
                                               args.f_full, args.mature_affy,
                                               args.U)
        # Calculate the parameters for the logistic function:
        Lp = selection_utils.find_Lp(args.f_full, args.U)
        selection_params = [
            args.stop_dist, args.mature_affy, args.naive_affy,
            args.target_dist, args.target_count, args.skip_update, A_total,
            args.B_total, Lp, args.k, args.outbase
        ]
    else:
        selection_params = None

    trials = 1000
    # This loop makes us resimulate if size too small, or backmutation.
    # The for/else raises only when every attempt failed; a success on the
    # very last attempt is accepted like any other.
    for trial in range(trials):
        try:
            tree = mutation_model.simulate(args.sequence,
                                           pair_bounds=pair_bounds,
                                           lambda_=args.lambda_,
                                           lambda0=args.lambda0,
                                           n=args.n,
                                           N=args.N,
                                           T=args.T,
                                           verbose=args.verbose,
                                           selection_params=selection_params)
            if args.selection:
                collapsed_tree = CollapsedTree(tree=tree,
                                               name='GCsim selection',
                                               collapse_syn=False,
                                               allow_repeats=True)
            else:
                # <-- This will fail if backmutations
                collapsed_tree = CollapsedTree(tree=tree,
                                               name='GCsim neutral')
            tree.ladderize()
            uniques = sum(node.frequency > 0
                          for node in collapsed_tree.tree.traverse())
            if uniques < 2:
                raise RuntimeError(
                    'collapsed tree contains {} sampled sequences'.format(
                        uniques))
            break
        except RuntimeError as e:
            print('{}, trying again'.format(e))
    else:
        raise RuntimeError('{} attempts exceeded'.format(trials))

    # In the case of a sequence pair print them to separate files:
    if args.sequence2 is not None:
        (start1, end1), (start2, end2) = pair_bounds
        with open(args.outbase + '_seq1.fasta', 'w') as fh1, \
                open(args.outbase + '_seq2.fasta', 'w') as fh2:
            fh1.write('>naive\n')
            fh1.write(args.sequence[start1:end1] + '\n')
            fh2.write('>naive\n')
            fh2.write(args.sequence[start2:end2] + '\n')
            for leaf in tree.iter_leaves():
                if leaf.frequency != 0:
                    fh1.write('>' + leaf.name + '\n')
                    fh1.write(leaf.sequence[start1:end1] + '\n')
                    fh2.write('>' + leaf.name + '\n')
                    fh2.write(leaf.sequence[start2:end2] + '\n')
    else:
        with open(args.outbase + '.fasta', 'w') as f:
            f.write('>naive\n')
            f.write(args.sequence + '\n')
            for leaf in tree.iter_leaves():
                if leaf.frequency != 0:
                    f.write('>' + leaf.name + '\n')
                    f.write(leaf.sequence + '\n')

    # Some observable simulation stats to write:
    frequency, distance_from_naive, degree = zip(*[
        (node.frequency,
         hamming_distance(node.sequence, args.sequence),
         sum(hamming_distance(node.sequence, node2.sequence) == 1
             for node2 in collapsed_tree.tree.traverse()
             if node2.frequency and node2 is not node))
        for node in collapsed_tree.tree.traverse() if node.frequency
    ])
    stats = pd.DataFrame({
        'genotype abundance': frequency,
        'Hamming distance to root genotype': distance_from_naive,
        'Hamming neighbor genotypes': degree
    })
    stats.to_csv(args.outbase + '_stats.tsv', sep='\t', index=False)

    print('{} simulated observed sequences'.format(
        sum(leaf.frequency for leaf in collapsed_tree.tree.traverse())))

    # Render the full lineage tree:
    ts = TreeStyle()
    ts.rotation = 90
    ts.show_leaf_name = False
    ts.show_scale = False

    colors = {}
    # Build a new set: an in-place "-=" would strip these colors from the
    # shared SVG_COLORS set for every other user of it.
    palette = SVG_COLORS - set(['black', 'white', 'gray'])
    palette = cycle(list(palette))  # <-- Circular iterator

    # Either plot by DNA sequence or amino acid sequence. Amino acid
    # sequences are only read in selection mode, so the same rule is used
    # for the lineage tree and for the collapsed tree below.
    plot_aa = args.plotAA and args.selection
    colors[tree.AAseq if plot_aa else tree.sequence] = 'gray'

    for n in tree.traverse():
        nstyle = NodeStyle()
        nstyle["size"] = 10
        genotype = n.AAseq if plot_aa else n.sequence
        if genotype not in colors:
            colors[genotype] = next(palette)
        nstyle['fgcolor'] = colors[genotype]
        n.set_style(nstyle)

    # Render and pickle lineage tree:
    tree.render(args.outbase + '_lineage_tree.svg', tree_style=ts)
    with open(args.outbase + '_lineage_tree.p', 'wb') as f:
        pickle.dump(tree, f)

    # Render collapsed tree,
    # create an id-wise colormap
    # NOTE: node.name can be a set
    if plot_aa:
        colormap = {
            node.name: colors[node.AAseq]
            for node in collapsed_tree.tree.traverse()
        }
    else:
        colormap = {
            node.name: colors[node.sequence]
            for node in collapsed_tree.tree.traverse()
        }
    collapsed_tree.write(args.outbase + '_collapsed_tree.p')
    collapsed_tree.render(args.outbase + '_collapsed_tree.svg',
                          idlabel=args.idlabel,
                          colormap=colormap)
    # Print colormap to file:
    with open(args.outbase + '_collapsed_tree_colormap.tsv', 'w') as f:
        for name, color in colormap.items():
            f.write((name if isinstance(name, str) else ','.join(name)) +
                    '\t' + color + '\n')
    with open(args.outbase + '_collapsed_tree_colormap.p', 'wb') as f:
        pickle.dump(colormap, f)

    if args.selection:
        # Define a list of suitable colors that are easy to distinguish:
        palette = [
            'crimson', 'purple', 'hotpink', 'limegreen', 'darkorange',
            'darkkhaki', 'brown', 'lightsalmon', 'darkgreen', 'darkseagreen',
            'darkslateblue', 'teal', 'olive', 'wheat', 'magenta',
            'lightsteelblue', 'plum', 'gold'
        ]
        palette = cycle(list(palette))  # <-- circular iterator
        colors = {
            i: next(palette)
            for i in range(int(len(args.sequence) // 3))
        }
        # The minimum distance to the target is colored:
        colormap = {
            node.name: colors[node.target_dist]
            for node in collapsed_tree.tree.traverse()
        }
        collapsed_tree.write(args.outbase + '_collapsed_runstat_color_tree.p')
        collapsed_tree.render(args.outbase +
                              '_collapsed_runstat_color_tree.svg',
                              idlabel=args.idlabel,
                              colormap=colormap)
        # Load the selection run stats from <outbase>_selection_runstats.p.
        # These are also plotted:
        with open(args.outbase + '_selection_runstats.p', 'rb') as fh:
            runstats = pickle.load(fh)
            selection_utils.plot_runstats(runstats, args.outbase, colors)
def simulate(self,
             sequence,
             pair_bounds=None,
             lambda_=0.9,
             lambda0=[1],
             N=None,
             T=None,
             n=None,
             verbose=False,
             selection_params=None):
    '''
    Simulate a poisson branching process with mutation introduced
    by the chosen mutation model e.g. motif or uniform.
    Can either simulate under a neutral model without selection,
    or using an affinity maturation inspired model for selection.

    sequence:    sequence planted at the root of the tree
    pair_bounds: None, or ((start1, end1), (start2, end2)); the two slices
                 are then mutated separately with lambda0[0] and lambda0[1]
    lambda_:     mean of the default poisson progeny distribution
    lambda0:     list of mutation parameters handed to self.mutate
                 (read only, never modified here)
    N:           stop once this many unterminated leaves exist
    T:           list of sampling times; the simulation runs to max(T)
    n:           list with the number of leaves to sample (one value, or
                 one per entry of T)
    selection_params: None for a neutral simulation, otherwise the 11
                 values stop_dist, mature_affy, naive_affy, target_dist,
                 target_count, skip_update, A_total, B_total, Lp, k, outbase

    Returns the uncollapsed tree, pruned to observed lineages, with nodes
    named 'simcell_<i>'. In selection mode the per-generation histograms
    are pickled to <outbase>_selection_runstats.p.

    Raises ValueError for inconsistent N / T / n / selection_params and
    RuntimeError when the tree cannot satisfy the requested sampling.
    '''
    progeny = poisson(lambda_)  # Default progeny distribution
    stop_dist = None  # Default stopping criterion for affinity simulation
    # Checking the validity of the input parameters:
    if N is not None and T is not None:
        raise ValueError(
            'Only one of N and T can be used. One must be None.')
    if selection_params is not None and T is None:
        raise ValueError(
            'Simulation with selection was chosen. A time, T, must be specified.'
        )
    elif N is None and T is None:
        raise ValueError('Either N or T must be specified.')
    if N is not None and n is not None and n[-1] > N:
        raise ValueError('n ({}) must not larger than N ({})'.format(
            n[-1], N))
    elif N is not None and n is not None and len(n) != 1:
        raise ValueError(
            'n ({}) must a single value when specifying N'.format(n))
    if T is not None and len(T) > 1 and (n is None or
                                         (len(n) != 1 and len(n) != len(T))):
        raise ValueError(
            'n must be specified when using intermediate sampling:', n)
    elif T is not None and len(T) > 1 and len(n) == 1:
        # One sample size was given: use it at every sampling time.
        n = [n[-1]] * len(T)

    # Planting the tree:
    tree = TreeNode()
    tree.dist = 0
    tree.add_feature('sequence', sequence)
    tree.add_feature('terminated', False)
    tree.add_feature('sampled', False)
    tree.add_feature('frequency', 0)
    tree.add_feature('time', 0)

    if selection_params is not None:
        # Collect the counts of each hamming distance at each time step:
        hd_generation = list()
        stop_dist, mature_affy, naive_affy, target_dist, target_count, skip_update, A_total, B_total, Lp, k, outbase = selection_params
        # Make a list of target sequences:
        targetAAseqs = [
            self.one_mutant(sequence, target_dist)
            for i in range(target_count)
        ]
        # Assert that the target sequences are comparable to the naive sequence:
        aa = translate(tree.sequence)
        # All targets are same length:
        assert (sum([1 for t in targetAAseqs if len(t) != len(aa)]) == 0)
        # All targets are "target_dist" away from the naive sequence. An
        # empty target list is still rejected, as before.
        assert targetAAseqs and all(
            hamming_distance(aa, t) == target_dist for t in targetAAseqs)
        # Affinity is an exponential function of hamming distance:
        assert (target_dist > 0)

        def hd2affy(hd):
            return (mature_affy + hd**k *
                    (naive_affy - mature_affy) / target_dist**k)

        # We store both the amino acid sequence and the affinity as tree features:
        tree.add_feature('AAseq', str(aa))
        tree.add_feature(
            'Kd', selection_utils.calc_Kd(tree.AAseq, targetAAseqs, hd2affy))
        tree.add_feature(
            'target_dist',
            min([hamming_distance(tree.AAseq, taa) for taa in targetAAseqs]))

    t = 0  # <-- Time at start
    leaves_unterminated = 1
    # Small lambdas are causing problems so make a minimum:
    lambda_min = 10e-10
    hd_distrib = []
    # NOTE(review): the loop keeps running while stop_dist >= min(hd_distrib);
    # confirm this is the intended direction for a stopping distance.
    while leaves_unterminated > 0 and (
            leaves_unterminated < N if N is not None else
            True) and (t < max(T) if T is not None else True) and (
                stop_dist >= min(hd_distrib)
                if stop_dist is not None and t > 0 else True):
        if verbose:
            print('At time:', t)
        t += 1
        # Sample intermediate time point:
        if T is not None and len(T) > 1 and (t - 1) in T:
            si = T.index(t - 1)
            live_nostop_leaves = [
                l for l in tree.iter_leaves()
                if not l.terminated and not has_stop(l.sequence)
            ]
            random.shuffle(live_nostop_leaves)
            if len(live_nostop_leaves) < n[si]:
                raise RuntimeError(
                    'tree with {} leaves, less than what desired for intermediate sampling {}. Try later generation or increasing the carrying capacity.'
                    .format(leaves_unterminated, n))
            # Make the sample and kill the cells sampled:
            for leaf in live_nostop_leaves[:n[si]]:
                leaves_unterminated -= 1
                leaf.sampled = True
                leaf.terminated = True
            if verbose:
                print('Made an intermediate sample at time:', t - 1)
        live_leaves = [l for l in tree.iter_leaves() if not l.terminated]
        random.shuffle(live_leaves)
        skip_lambda_n = 0  # At every new round reset all the lambdas
        # Draw progeny for each leaf:
        for leaf in live_leaves:
            if selection_params is not None:
                if skip_lambda_n == 0:
                    # Add one so skip_update=0 is no skip
                    skip_lambda_n = skip_update + 1
                    tree = selection_utils.lambda_selection(
                        tree, targetAAseqs, hd2affy, A_total, B_total, Lp)
                if leaf.lambda_ > lambda_min:
                    progeny = poisson(leaf.lambda_)
                else:
                    progeny = poisson(lambda_min)
                skip_lambda_n -= 1
            n_children = progeny.rvs()
            # <-- Getting 1, is equal to staying alive
            leaves_unterminated += n_children - 1
            if not n_children:
                leaf.terminated = True
            for child_count in range(n_children):
                # If sequence pair mutate them separately with their own mutation rate:
                if pair_bounds is not None:
                    mutated_sequence1 = self.mutate(
                        leaf.sequence[pair_bounds[0][0]:pair_bounds[0][1]],
                        lambda0=lambda0[0])
                    mutated_sequence2 = self.mutate(
                        leaf.sequence[pair_bounds[1][0]:pair_bounds[1][1]],
                        lambda0=lambda0[1])
                    mutated_sequence = mutated_sequence1 + mutated_sequence2
                else:
                    mutated_sequence = self.mutate(leaf.sequence,
                                                   lambda0=lambda0[0])
                child = TreeNode()
                child.dist = sum(
                    x != y for x, y in zip(mutated_sequence, leaf.sequence))
                child.add_feature('sequence', mutated_sequence)
                if selection_params is not None:
                    aa = translate(child.sequence)
                    child.add_feature('AAseq', str(aa))
                    child.add_feature(
                        'Kd',
                        selection_utils.calc_Kd(child.AAseq, targetAAseqs,
                                                hd2affy))
                    child.add_feature(
                        'target_dist',
                        min([
                            hamming_distance(child.AAseq, taa)
                            for taa in targetAAseqs
                        ]))
                child.add_feature('frequency', 0)
                child.add_feature('terminated', False)
                child.add_feature('sampled', False)
                child.add_feature('time', t)
                leaf.add_child(child)
        if selection_params is not None:
            hd_distrib = [
                min([hamming_distance(tn.AAseq, ta) for ta in targetAAseqs])
                for tn in tree.iter_leaves() if not tn.terminated
            ]
            if target_dist > 0:
                hist = scipy.histogram(hd_distrib,
                                       bins=list(range(target_dist * 10)))
            else:  # Just make a minimum of 10 bins
                hist = scipy.histogram(hd_distrib, bins=list(range(10)))
            hd_generation.append(hist)
            if verbose and hd_distrib:
                print('Total cell population:', sum(hist[0]))
                print('Majority hamming distance:', scipy.argmax(hist[0]))
                print('Affinity of latest sampled leaf:', leaf.Kd)
                print(
                    'Progeny distribution lambda for the latest sampled leaf:',
                    leaf.lambda_)

    # N is None when simulating to a time T; comparing an int with None
    # raises TypeError on Python 3, so only check when N was requested.
    if N is not None and leaves_unterminated < N:
        raise RuntimeError(
            'Tree terminated with {} leaves, {} desired'.format(
                leaves_unterminated, N))

    # Keep a histogram of the hamming distances at each generation:
    if selection_params is not None:
        with open(outbase + '_selection_runstats.p', 'wb') as f:
            pickle.dump(hd_generation, f)

    # Each leaf in final generation gets an observation frequency of 1, unless downsampled:
    if T is not None and len(T) > 1:
        # Iterate the intermediate time steps (excluding the last time):
        for Ti in sorted(T)[:-1]:
            si = T.index(Ti)
            # Only sample those that have been 'sampled' at intermediate sampling times:
            final_leaves = [
                leaf for leaf in tree.iter_descendants()
                if leaf.time == Ti and leaf.sampled
            ]
            if len(final_leaves) < n[si]:
                raise RuntimeError(
                    'tree terminated with {} leaves, less than what desired after downsampling {}'
                    .format(leaves_unterminated, n[si]))
            for leaf in final_leaves:
                # No need to down-sample, this was already done in the simulation loop
                leaf.frequency = 1
    if selection_params and max(T) != t:
        raise RuntimeError(
            'tree terminated with before the requested sample time.')
    # Do the normal sampling of the last time step:
    final_leaves = [
        leaf for leaf in tree.iter_leaves()
        if leaf.time == t and not has_stop(leaf.sequence)
    ]
    # Report stop codon sequences:
    stop_leaves = [
        leaf for leaf in tree.iter_leaves()
        if leaf.time == t and has_stop(leaf.sequence)
    ]
    if stop_leaves:
        print(
            'Tree contains {} leaves with stop codons, out of {} total at last time point.'
            .format(len(stop_leaves), len(final_leaves)))

    if T is not None:
        si = T.index(sorted(T)[-1])
    else:
        si = 0
    # By default, downsample to the target simulation size:
    if n is not None and len(final_leaves) >= n[si]:
        for leaf in random.sample(final_leaves, n[si]):
            leaf.frequency = 1
    elif n is None and N is not None:
        # Removed nonsense sequences might decrease the number of final
        # leaves to less than N
        if len(final_leaves) < N:
            N = len(final_leaves)
        for leaf in random.sample(final_leaves, N):
            leaf.frequency = 1
    elif N is None and T is not None:
        for leaf in final_leaves:
            leaf.frequency = 1
    elif n is not None and len(final_leaves) < n[si]:
        raise RuntimeError(
            'tree terminated with {} leaves, less than what desired after downsampling {}'
            .format(leaves_unterminated, n[si]))
    else:
        raise RuntimeError('Unknown option.')

    # Prune away lineages that are unobserved:
    for node in tree.iter_descendants():
        if sum(node2.frequency for node2 in node.traverse()) == 0:
            node.detach()

    # Remove unobserved unifurcations:
    for node in tree.iter_descendants():
        parent = node.up
        if node.frequency == 0 and len(node.children) == 1:
            node.delete(prevent_nondicotomic=False)
            node.children[0].dist = hamming_distance(
                node.children[0].sequence, parent.sequence)

    # Assign unique names to each node:
    for i, node in enumerate(tree.traverse(), 1):
        node.name = 'simcell_{}'.format(i)

    # Return the uncollapsed tree:
    return tree
def align_lineages(seq,
                   tree_t,
                   tree_i,
                   gap_penalty_pct=0,
                   known_root=True,
                   allow_double_gap=False):
    '''
    Align the lineage leading to seq in the true tree (tree_t) against the
    lineage leading to seq in the inferred tree (tree_i).

    Standard implementation of a Needleman-Wunsch algorithm as described here:
    http://telliott99.blogspot.com/2009/08/alignment-needleman-wunsch.html
    https://en.wikipedia.org/wiki/Needleman%E2%80%93Wunsch_algorithm
    And implemented here:
    https://github.com/alevchuk/pairwise-alignment-in-python/blob/master/alignment.py

    gap_penalty_pct is the gap penalty relative to the sequence length of
    the sequences on the tree.

    Returns False when both lineages have at most two entries, otherwise
    the list [align_t, align_i, alignment_score, max_penalty]. The two
    aligned lineages are listed in traceback order (last entry first) and
    use '-' for a gap.
    '''
    lt = reconstruct_lineage(tree_t, find_node_by_seq(tree_t, seq))
    li = reconstruct_lineage(tree_i, find_node_by_seq(tree_i, seq))
    kt, ki = len(lt), len(li)
    # One lineage must be longer than just the root and the terminal node
    if kt <= 2 and ki <= 2:
        return False

    # Gap penalty chosen not too large:
    gap_penalty = -int((len(seq) / 100.0) * gap_penalty_pct)
    assert (gap_penalty <= 0)  # Penalties must be negative
    if gap_penalty == 0:
        # With a zero penalty only gaps in the shortest lineage are allowed
        assert (allow_double_gap is False)

    # Disallow gaps in the longest list by making them infinitely costly:
    forbidden = float('-inf')
    if allow_double_gap is False:
        if kt > ki:
            # True is longer than inferred: gap only in inferred
            gap_penalty_i, gap_penalty_j = gap_penalty, forbidden
        elif kt < ki:
            # Inferred is longer than true: gap only in true
            gap_penalty_i, gap_penalty_j = forbidden, gap_penalty
        else:
            # Equally long lists: no gaps at all
            gap_penalty_i = gap_penalty_j = forbidden
    else:
        gap_penalty_i = gap_penalty_j = gap_penalty

    # Pairwise scores; notice the score is defined by number of mismatches:
    sc_mat = np.zeros((kt, ki), dtype=np.float64)
    for row, seq_t in enumerate(lt):
        for col, seq_i in enumerate(li):
            sc_mat[row, col] = -hamming_distance(seq_t, seq_i)

    # Calculate the alignment scores, starting with the borders:
    aln_sc = np.zeros((kt + 1, ki + 1), dtype=np.float64)
    for i in range(kt + 1):
        aln_sc[i, 0] = forbidden if known_root is True else gap_penalty_i * i
    for j in range(ki + 1):
        aln_sc[0, j] = forbidden if known_root is True else gap_penalty_j * j
    aln_sc[0, 0] = 0  # The top left is fixed to zero
    for i in range(1, kt + 1):
        for j in range(1, ki + 1):
            aln_sc[i, j] = max(
                aln_sc[i - 1, j - 1] + sc_mat[i - 1, j - 1],  # match
                aln_sc[i - 1, j] + gap_penalty_i,  # gap in inferred
                aln_sc[i, j - 1] + gap_penalty_j)  # gap in true

    # Traceback to compute the alignment:
    align_t, align_i = [], []
    i, j = kt, ki
    alignment_score = aln_sc[i, j]
    while i > 0 and j > 0:
        here = aln_sc[i, j]
        if here == aln_sc[i - 1, j - 1] + sc_mat[i - 1, j - 1]:
            align_t.append(lt[i - 1])
            align_i.append(li[j - 1])
            i -= 1
            j -= 1
        elif here == aln_sc[i - 1, j] + gap_penalty_i:
            align_t.append(lt[i - 1])
            align_i.append('-')
            i -= 1
        elif here == aln_sc[i, j - 1] + gap_penalty_j:
            align_t.append('-')
            align_i.append(li[j - 1])
            j -= 1
    # If space left fill it with gaps:
    while i > 0:
        align_t.append(lt[i - 1])
        align_i.append('-')
        i -= 1
    while j > 0:
        align_t.append('-')
        align_i.append(li[j - 1])
        j -= 1

    # Worst possible score of this alignment: every aligned pair fully
    # mismatched, every gap charged the gap penalty.
    max_penalty = 0
    for a, b in zip(align_t, align_i):
        max_penalty += gap_penalty if (a == '-' or b == '-') else -len(a)
    # Notice that the root and the terminal node are excluded from this
    # comparison by adding their length back to the max_penalty. With an
    # unknown root only the terminal node is added back.
    excluded_nodes = 2 if known_root is True else 1
    max_penalty += excluded_nodes * len(lt[0])
    return [align_t, align_i, alignment_score, max_penalty]
def render(self,
           outfile,
           idlabel=False,
           colormap=None,
           show_support=False,
           chain_split=None):
    '''
    Render to image file, filetype inferred from suffix, svg for color images.

    outfile:      path of the image to write
    idlabel:      if true, label nodes with their name and also write the
                  node sequences to a fasta file next to outfile
    colormap:     optional mapping from node name to either a color string
                  (drawn as a circle) or a mapping color -> count (drawn as
                  a pie chart); unmapped nodes are 'lightgray'
    show_support: passed on as the tree style's show_branch_support
    chain_split:  optional index splitting each sequence in two chains;
                  branches are colored by which chain mutated

    Raises NotImplementedError if chain_split is combined with a frame.
    '''
    def my_layout(node):
        circle_color = 'lightgray' if colormap is None or node.name not in colormap else colormap[
            node.name]
        text_color = 'black'
        if isinstance(circle_color, str):
            C = CircleFace(radius=max(3, 10 * scipy.sqrt(node.frequency)),
                           color=circle_color,
                           label={
                               'text': str(node.frequency),
                               'color': text_color
                           } if node.frequency > 0 else None)
            C.rotation = -90
            C.hz_align = 1
            faces.add_face_to_node(C, node, 0)
        else:
            P = PieChartFace(
                [100 * x / node.frequency for x in circle_color.values()],
                2 * 10 * scipy.sqrt(node.frequency),
                2 * 10 * scipy.sqrt(node.frequency),
                colors=[(color if color != 'None' else 'lightgray')
                        for color in list(circle_color.keys())],
                line_color=None)
            T = TextFace(' '.join(
                [str(x) for x in list(circle_color.values())]),
                         tight_text=True)
            T.hz_align = 1
            T.rotation = -90
            faces.add_face_to_node(P, node, 0, position='branch-right')
            faces.add_face_to_node(T, node, 1, position='branch-right')
        if idlabel:
            T = TextFace(node.name, tight_text=True, fsize=6)
            T.rotation = -90
            T.hz_align = 1
            # The id goes in the column after the faces added above.
            faces.add_face_to_node(
                T,
                node,
                1 if isinstance(circle_color, str) else 2,
                position='branch-right')

    for node in self.tree.traverse():
        nstyle = NodeStyle()
        nstyle['size'] = 0
        if node.up is not None:
            # NOTE(review): this requires all four bases to occur in the
            # sequence; confirm that "==" rather than "<=" is intended.
            if set(node.sequence.upper()) == set('ACGT'):
                if chain_split is not None:
                    if self.frame is not None:
                        raise NotImplementedError(
                            'frame not implemented with chain_split')
                    leftseq_mutated = hamming_distance(
                        node.sequence[:chain_split],
                        node.up.sequence[:chain_split]) > 0
                    rightseq_mutated = hamming_distance(
                        node.sequence[chain_split:],
                        node.up.sequence[chain_split:]) > 0
                    if leftseq_mutated and rightseq_mutated:
                        nstyle['hz_line_color'] = 'purple'
                        nstyle['hz_line_width'] = 3
                    elif leftseq_mutated:
                        nstyle['hz_line_color'] = 'red'
                        nstyle['hz_line_width'] = 2
                    elif rightseq_mutated:
                        nstyle['hz_line_color'] = 'blue'
                        nstyle['hz_line_width'] = 2
                if self.frame is not None:
                    # Translate the whole codons from the frame offset on.
                    # The slice bounds come from this node's sequence length
                    # and are applied to the parent sequence too.
                    start = self.frame - 1
                    end = start + 3 * ((len(node.sequence) - start) // 3)
                    aa = Seq(node.sequence[start:end],
                             generic_dna).translate()
                    aa_parent = Seq(node.up.sequence[start:end],
                                    generic_dna).translate()
                    nonsyn = hamming_distance(aa, aa_parent)
                    if '*' in aa:
                        nstyle['bgcolor'] = 'red'
                    if nonsyn > 0:
                        # Line width is the number of amino acid changes.
                        nstyle['hz_line_color'] = 'black'
                        nstyle['hz_line_width'] = nonsyn
                    else:
                        nstyle['hz_line_type'] = 1
        node.set_style(nstyle)

    ts = TreeStyle()
    ts.show_leaf_name = False
    ts.rotation = 90
    ts.draw_aligned_faces_as_table = False
    ts.allow_face_overlap = True
    ts.layout_fn = my_layout
    ts.show_scale = False
    ts.show_branch_support = show_support
    self.tree.render(outfile, tree_style=ts)
    # if we labelled seqs, let's also write the alignment out so we have
    # the sequences (including of internal nodes)
    if idlabel:
        aln = MultipleSeqAlignment([])
        for node in self.tree.traverse():
            aln.append(
                SeqRecord(Seq(str(node.sequence), generic_dna),
                          id=str(node.name),
                          description='abundance={}'.format(
                              node.frequency)))
        # Context manager so the fasta file is flushed and closed.
        with open(os.path.splitext(outfile)[0] + '.fasta', 'w') as fasta_fh:
            AlignIO.write(aln, fasta_fh, 'fasta')
def __init__(self,
             params=None,
             tree=None,
             frame=None,
             collapse_syn=False,
             allow_repeats=False):
    '''
    For initialization, either params or tree (or both) must be provided.

    params: offspring distribution parameters, passed on to
            LeavesAndClades.__init__
    tree: ete tree with frequency node feature. If uncollapsed, it will be
          collapsed. The collapse works on a copy of the tree.
    frame: translation frame (1, 2 or 3), with default None, no translation
           attempted
    collapse_syn: if True, branch lengths of the passed-in tree are first
                  overwritten with amino acid Hamming distances, so that
                  branches without amino acid change have length zero and
                  get collapsed. This reads both tree and frame, so neither
                  may be None in that case.
    allow_repeats: if False, repeated observed sequences in the collapsed
                   tree raise RuntimeError; if True they are only reported

    Raises RuntimeError for an invalid frame, when the set of observed
    genotype names changes during the collapse, or for repeated sequences
    when allow_repeats is False.
    '''
    LeavesAndClades.__init__(self, params=params)
    if frame is not None and frame not in (1, 2, 3):
        raise RuntimeError('frame must be 1, 2, 3, or None')
    self.frame = frame

    if collapse_syn is True:
        # NOTE: this modifies the dist attributes of the caller's tree,
        # before the copy below is taken.
        tree.dist = 0  # no branch above root
        for node in tree.iter_descendants():
            # Translate the whole codons from the frame offset on; the slice
            # bounds are computed from this node's sequence length and
            # applied to the parent sequence as well.
            aa = Seq(
                node.sequence[(frame - 1):(frame - 1 + (3 * (((len(node.sequence) - (frame - 1)) // 3))))],
                generic_dna).translate()
            aa_parent = Seq(
                node.up.sequence[(frame - 1):(frame - 1 + (3 * (((len(node.sequence) - (frame - 1)) // 3))))],
                generic_dna).translate()
            node.dist = hamming_distance(aa, aa_parent)

    if tree is not None:
        self.tree = tree.copy()
        # remove unobserved internal unifurcations
        for node in self.tree.iter_descendants():
            parent = node.up
            if node.frequency == 0 and len(node.children) == 1:
                node.delete(prevent_nondicotomic=False)
                # node.children is read after the delete; the single child
                # gets its distance to the grandparent it now hangs under
                node.children[0].dist = hamming_distance(
                    node.children[0].sequence, parent.sequence)

        # iterate over the tree below root and collapse edges of zero length
        # if the node is a leaf and its parent has nonzero frequency we combine taxa names to a set
        # this accommodates bootstrap samples that result in repeated genotypes
        # observed_genotypes: names obtained by iterating self.tree, plus
        # the name of the root
        observed_genotypes = set((leaf.name for leaf in self.tree))
        observed_genotypes.add(self.tree.name)
        for node in self.tree.get_descendants(strategy='postorder'):
            if node.dist == 0:
                # merge this node into its parent: abundance first
                node.up.frequency += node.frequency
                # a name is either a single string or a collection of names
                node_set = set([node.name]) if isinstance(
                    node.name, str) else set(node.name)
                node_up_set = set([node.up.name]) if isinstance(
                    node.up.name, str) else set(node.up.name)
                # "<" is the proper-subset test on sets
                if node_up_set < observed_genotypes:
                    if node_set < observed_genotypes:
                        # both are observed names: the parent carries both
                        node.up.name = tuple(node_set | node_up_set)
                        if len(node.up.name) == 1:
                            node.up.name = node.up.name[0]
                elif node_set < observed_genotypes:
                    # only the child is observed: the parent takes its name
                    node.up.name = tuple(node_set)
                    if len(node.up.name) == 1:
                        node.up.name = node.up.name[0]
                node.delete(prevent_nondicotomic=False)

        # sanity check: collapsing must not lose or invent observed names
        final_observed_genotypes = set([
            name for node in self.tree.traverse()
            if node.frequency > 0 or node == self.tree
            for name in ((
                node.name, ) if isinstance(node.name, str) else node.name)
        ])
        if final_observed_genotypes != observed_genotypes:
            raise RuntimeError(
                'observed genotypes don\'t match after collapse\n\tbefore: {}\n\tafter: {}\n\tsymmetric diff: {}'
                .format(observed_genotypes, final_observed_genotypes,
                        observed_genotypes ^ final_observed_genotypes))
        # total abundance must be unchanged by the collapse
        assert sum(node.frequency for node in tree.traverse()) == sum(
            node.frequency for node in self.tree.traverse())

        # number of observed nodes minus number of distinct observed
        # sequences, i.e. how many observed nodes repeat a sequence
        rep_seq = sum(
            node.frequency > 0 for node in self.tree.traverse()) - len(
                set([
                    node.sequence for node in self.tree.traverse()
                    if node.frequency > 0
                ]))
        if not allow_repeats and rep_seq:
            raise RuntimeError(
                'Repeated observed sequences in collapsed tree. {} sequences were found repeated.'
                .format(rep_seq))
        elif allow_repeats and rep_seq:
            # same computation as above, repeated before reporting
            rep_seq = sum(node.frequency > 0
                          for node in self.tree.traverse()) - len(
                              set([
                                  node.sequence
                                  for node in self.tree.traverse()
                                  if node.frequency > 0
                              ]))
            print(
                'Repeated observed sequences in collapsed tree. {} sequences were found repeated.'
                .format(rep_seq))
        # a custom ladderize accounting for abundance and sequence to break ties in abundance
        for node in self.tree.traverse(strategy='postorder'):
            # add a partition feature and compute it recursively up the tree
            node.add_feature(
                'partition', node.frequency +
                sum(node2.partition for node2 in node.children))
            # sort children of this node based on partition and sequence
            node.children.sort(
                key=lambda node: (node.partition, node.sequence))
    else:
        # no tree given: self.tree is stored as None
        self.tree = tree