def jackknifing_tree(file_pattern, di_method):
    """
    Cluster every subsampled DI file and compare each tree to the 'real' tree.

    Every file matched by file_pattern must have the per-line format
        <sample>,<size>,<comma-separated DI>
    and one of the sizes is expected to be the literal string 'real' (the
    full pool). For each file, the samples of each size are clustered
    together and the first tree produced by the clustering is kept.

    Args:
        file_pattern: glob pattern for the subsampled DI files
        di_method: passed to Cluster.init_from_di_list as `method`

    Returns:
        trees (size --> list of trees, one per file containing that size),
        symmetric_difference (size --> list of diffs),
        robinson_foulds_distance (size --> list of diffs)
        The two diff dicts have no 'real' key. All three dicts are empty
        when no file matches the pattern or no record is read.

    Raises:
        KeyError: subsampled trees were built but no 'real' tree was read.
    """
    from clustering import Cluster
    import dendropy

    def to_tree(newick):
        # NOTE(review): every tree is parsed independently; some DendroPy
        # versions require compared trees to share one taxon set/namespace
        # -- confirm against the installed version.
        return dendropy.Tree.get_from_string(newick, "newick")

    trees = {}  # size --> list of trees (one per file)
    for path in glob.iglob(file_pattern):
        sys.stderr.write("reading subsampled DI file {0}....\n".format(path))
        d = {}  # size --> sample --> DI vector
        with open(path) as f:
            for line in f:
                line = line.strip()
                if not line:
                    # tolerate blank (e.g. trailing) lines
                    continue
                sample, size, di = line.split(',', 2)
                d.setdefault(size, {})[sample] = np.array(
                    [float(x) for x in di.split(',')])

        for size, di_dict in d.items():
            c = Cluster(None)
            c.init_from_di_list(di_dict, method=di_method, threshold=0)
            c.run_till_end()
            # setdefault (rather than try/except KeyError) so that a KeyError
            # raised while building the tree is not mistaken for a new size
            trees.setdefault(size, []).append(to_tree(str(c.trees[0])))

    # tally (1) symmetric differences (edge weight ignored)
    #       (2) robinson_foulds_distance (edge weight considered)
    # 'real' is the size that is the full pool that we compare all other
    # trees to. Iterate over every size that produced a tree, not only the
    # sizes seen in the first file.
    sym_diff = {}
    rob_diff = {}
    subsampled_sizes = [size for size in trees if size != 'real']
    if subsampled_sizes:
        if 'real' not in trees:
            raise KeyError("no 'real' records found in files matching "
                           "{0}".format(file_pattern))
        t_real = trees['real'][0]
        for size in subsampled_sizes:
            sym_diff[size] = [t_real.symmetric_difference(t)
                              for t in trees[size]]
            rob_diff[size] = [t_real.robinson_foulds_distance(t)
                              for t in trees[size]]

    return trees, sym_diff, rob_diff
def jackknifing_tree_DF(file_pattern, di_method, samples_to_exclude=('1412-1', '1412-4')):
    """
    Similar to jackknifing_tree but reads DF files (and uses the probably
    improved clustering in clustering.py, which has to be turned on manually).

    For each file matched by file_pattern, the DF records are grouped by
    their 'size' annotation, each group is clustered, and the first tree
    produced by the clustering is kept and printed to stdout.

    Args:
        file_pattern: glob pattern for the subsampled DF files
        di_method: passed to Cluster as `method`
        samples_to_exclude: sample names (df.name) to skip; None or an empty
            collection excludes nothing

    Returns:
        trees (size --> list of trees, one per file containing that size),
        symmetric_difference (size --> list of diffs),
        robinson_foulds_distance (size --> list of diffs)
        The two diff dicts have no 'real' key.

    Raises:
        KeyError: subsampled trees were built but no 'real' tree was read.
    """
    from clustering import Cluster
    import dendropy

    if samples_to_exclude is None:
        samples_to_exclude = ()

    def to_tree(newick):
        # NOTE(review): every tree is parsed independently; some DendroPy
        # versions require compared trees to share one taxon set/namespace
        # -- confirm against the installed version.
        return dendropy.Tree.get_from_string(newick, "newick")

    trees = {}  # size --> list of trees (one per file)
    for path in glob.iglob(file_pattern):
        sys.stderr.write("reading subsampled DF file {0}....\n".format(path))
        d = {}  # size --> list of dfs
        with open(path) as f:
            for df in DF.DFReader(f):
                sample = df.name
                if sample in samples_to_exclude:
                    sys.stderr.write("EXCLUDING SAMPLE {0}!\n".format(sample))
                    continue
                size = df.annotations['size']
                # The mask has to be changed for each df here; this was not
                # needed for the DI files because they were already masked.
                df.change_vec_mask(valid_DI_pos)
                d.setdefault(size, []).append(df)

        for size, df_list in d.items():
            c = Cluster(df_list, method=di_method, threshold=0)
            c.run_till_end()
            # setdefault (rather than try/except KeyError) so that a KeyError
            # raised while building the tree is not mistaken for a new size
            trees.setdefault(size, []).append(to_tree(str(c.trees[0])))
            sys.stdout.write("size {0} file {1}\n".format(size, path))
            sys.stdout.write(str(c.trees[0]) + "\n")

    # tally (1) symmetric differences (edge weight ignored)
    #       (2) robinson_foulds_distance (edge weight considered)
    # 'real' is the size that is the full pool that we compare all other
    # trees to
    sym_diff = {}
    rob_diff = {}
    subsampled_sizes = [size for size in trees if size != 'real']
    if subsampled_sizes:
        if 'real' not in trees:
            raise KeyError("no 'real' records found in files matching "
                           "{0}".format(file_pattern))
        t_real = trees['real'][0]
        for size in subsampled_sizes:
            sym_diff[size] = [t_real.symmetric_difference(t)
                              for t in trees[size]]
            rob_diff[size] = [t_real.robinson_foulds_distance(t)
                              for t in trees[size]]

    return trees, sym_diff, rob_diff