def get_nonrep_matrix(tids, rep_ids, dist):
    """
    Get a distance matrix for non-representative species using a
    distance matrix for representative species

    The matrix for the unique representatives is extracted from *dist*,
    expanded with one extra sample for every additional taxon that shares
    a representative (via ``duplicate_dmat_samples``), relabelled with the
    taxon IDs, and finally reordered to match *tids*.

    Args:
        tids (array-like)      : taxon IDs for taxa to orient matrix to
        rep_ids (array-like)   : taxon IDs for the representative taxa in
                                 *tids*; paired element-wise with *tids*
        dist (DistanceMatrix)  : the distance matrix to orient

    Returns:
        DistanceMatrix: the result of filtering the expanded, relabelled
        matrix by *tids*
    """
    uniq, counts = np.unique(rep_ids, return_counts=True)
    rep_dist = dist.filter(uniq).data

    # A representative shared by k taxa needs k - 1 additional copies.
    dupes = np.repeat(np.arange(len(uniq)), counts - 1)

    # Group the taxon IDs under the representative they map to.
    rep_map = dict()
    for rep, tid in zip(rep_ids, tids):
        rep_map.setdefault(rep, list()).append(tid)

    # Order of representatives in the expanded matrix: every unique
    # representative once, followed by the duplicated entries.
    # NOTE(review): assumes duplicate_dmat_samples appends the duplicated
    # samples after the original ones in the order given by *dupes* - confirm.
    rep_order = np.concatenate([np.arange(rep_dist.shape[0]), dupes])

    # Hand out one taxon ID per slot; pop() consumes each ID exactly once.
    new_tids = [rep_map[uniq[i]].pop() for i in rep_order]

    dupe_dist = duplicate_dmat_samples(rep_dist, dupes)
    ret = ssd.DistanceMatrix(dupe_dist, ids=new_tids)
    return ret.filter(tids)
def get_toy_dmat():
    """
    Build a small 3x3 skbio DistanceMatrix, labelled 'a', 'b', 'c', for testing
    """
    labels = ['a', 'b', 'c']
    values = np.array([
        [0, 1, 2],
        [1, 0, 3],
        [2, 3, 0],
    ])
    return ssd.DistanceMatrix(values, ids=labels)
def update_max_gnid_distances(msd, dm, tree, gnids):
    """Recompute the maximum-distance entries that involve any of gnids.

    msd is the current matrix of maximum distances, indexed by GNID. dm is
    looked up by pairs of tip names, and the tips considered are those of
    tree. The GNID of a tip is the second ':'-separated field of its name.
    A new DistanceMatrix with the same IDs, in the same order, is returned.
    """
    def gnid_of(name):
        return name.split(':')[1]

    # Work on the negated matrix: an old entry then compares below any
    # incoming distance in max() (assuming distances in dm are non-negative),
    # so touched entries are rebuilt from the current tips while untouched
    # ones simply get their sign restored by abs() at the end.
    maxima = -msd.data

    all_tips = [tip.name for tip in tree.tips()]
    affected_tips = [name for name in all_tips if gnid_of(name) in gnids]
    position = {name: msd.index(gnid_of(name)) for name in all_tips}

    for name_a in all_tips:
        row = position[name_a]
        for name_b in affected_tips:
            col = position[name_b]
            best = max(maxima[row, col], dm[name_a, name_b])
            # Keep the matrix symmetric
            maxima[row, col] = best
            maxima[col, row] = best

    np.fill_diagonal(maxima, 0)  # Set diagonal to 0
    restored = np.abs(maxima)
    ordered_ids = sorted(msd.ids, key=msd.index)  # Ensure sort
    return distance.DistanceMatrix(restored, ids=ordered_ids)
def get_max_gnid_distances(dm):
    """Collapse a tip-by-tip distance matrix into maximum distances per GNID pair.

    The GNID of a tip is the second ':'-separated field of its ID. Each GNID
    receives an index in order of first appearance in dm.ids, and the
    returned DistanceMatrix lists the GNIDs in that order.
    """
    names = dm.ids

    # Assign each new GNID the next free index; map every tip to its GNID's index
    gnid_index = {}
    tip_index = {}
    for name in names:
        gnid = name.split(':')[1]
        tip_index[name] = gnid_index.setdefault(gnid, len(gnid_index))

    size = len(gnid_index)
    maxima = np.zeros((size, size))
    for name_a in names:
        row = tip_index[name_a]
        for name_b in names:
            col = tip_index[name_b]
            maxima[row, col] = max(maxima[row, col], dm[name_a, name_b])

    np.fill_diagonal(maxima, 0)  # Set diagonal to 0
    ordered_gnids = sorted(gnid_index, key=gnid_index.get)  # Ensure sort
    return distance.DistanceMatrix(maxima, ids=ordered_gnids)
def reduce(OGid, OG):
    """Return representative PPIDs for all GNIDs associated with tips in node.

    A neighbor-joining tree is built from pairwise k-tuple distances between
    the translated sequences of the group, then repeatedly pruned and split
    until it has no more tips than there are distinct GNIDs in the group.

    Args:
        OGid: identifier of the group; returned unchanged.
        OG: collection of PPIDs. It is iterated more than once, so it must
            not be a one-shot iterator. Every PPID must be a key of the
            module-level ppid2seq and ppid2meta mappings.

    Returns:
        A tuple (OGid, rOG) where rOG holds, for every tip left in the tree,
        the list obtained by splitting its name on ':'.
    """
    # Extract sequences
    raw_seqs = []
    ids = []
    for ppid in OG:
        raw_seqs.append(ppid2seq[ppid])
        gnid, spid, _ = ppid2meta[ppid]
        ids.append(f"'{spid}:{gnid}:{ppid}'")  # Wrap in quotes to ensure correct parsing

    # Translate each sequence once up front rather than once per pair
    seqs = [seq.translate(table) for seq in raw_seqs]

    # Make distance matrix
    k, p = 4, 2  # Tuple size and power
    dm0 = np.zeros((len(seqs), len(seqs)))
    for (i, seq1), (j, seq2) in combinations(enumerate(seqs), 2):
        # Calculate distance and store in both halves of the matrix
        d = get_ktuple_distance(seq1, seq2, k, p)
        dm0[i, j] = d
        dm0[j, i] = d
    dm0 = distance.DistanceMatrix(dm0, ids=ids)

    # Make tree
    tree = skbio.tree.nj(dm0)
    # NOTE(review): update_tip_names presumably (re)computes the tip_names
    # sets read from the nodes below - confirm against its definition.
    update_tip_names(tree)
    dm = tree.tip_tip_distances()  # Use tree distances for pruning
    msd = get_max_gnid_distances(dm)

    # Prune tree
    gnids = {ppid2meta[ppid][0] for ppid in OG}
    while len(tree.tip_names) > len(gnids):
        # Remove non-minimal tips in single-species clades
        # min_tips holds (tip name, accumulated branch length) pairs. A clade
        # whose candidate tips all share one GNID keeps only the closest one.
        for node in tree.postorder():
            if node.is_tip():
                node.min_tips = {(node.name, node.length)}
            elif len({
                    name.split(':')[1]
                    for child in node.children
                    for name, _ in child.min_tips
            }) == 1:
                name, length = min([
                    min_tip
                    for child in node.children
                    for min_tip in child.min_tips
                ], key=lambda x: x[1])
                node.min_tips = {(name, length + node.length)}
            else:
                node.min_tips = {
                    min_tip
                    for child in node.children
                    for min_tip in child.min_tips
                }
        min_names = {tip_name for tip_name, _ in tree.min_tips}
        if min_names < tree.tip_names:
            # GNIDs of the tips about to be dropped; their maxima must be refreshed
            tip_gnids = {
                tip_name.split(':')[1]
                for tip_name in (tree.tip_names - min_names)
            }
            tree = tree.shear(min_names)
            update_tip_names(tree)
            msd = update_max_gnid_distances(msd, dm, tree, tip_gnids)

        # Split tree
        # Each non-root node splits the tips into two sides. A side is a
        # candidate replacement tree only if it still contains every GNID.
        trees = []
        for node in tree.traverse(include_self=False):
            tip_gnids1 = {
                tip_name.split(':')[1]
                for tip_name in node.tip_names
            }
            tip_gnids2 = {
                tip_name.split(':')[1]
                for tip_name in (tree.tip_names - node.tip_names)
            }
            if tip_gnids1 == gnids:
                tree1 = tree.shear(node.tip_names)
                msd1 = update_max_gnid_distances(msd, dm, tree1, tip_gnids2)
                trees.append((tree1, msd1))
            if tip_gnids2 == gnids:
                tree2 = tree.shear(tree.tip_names - node.tip_names)
                msd2 = update_max_gnid_distances(msd, dm, tree2, tip_gnids1)
                trees.append((tree2, msd2))
        if trees:
            # Keep the candidate whose maximum-distance matrix has the smallest sum
            tree, msd = min(trees, key=lambda x: x[1].data.sum())
            update_tip_names(tree)

    # Extract sequences from tree
    rOG = [tip_name.split(':') for tip_name in tree.tip_names]
    return OGid, rOG