def jackknifing_tree(file_pattern, di_method):
	"""
	Cluster every subsampled DI file matching <file_pattern> and compare the
	resulting trees against the tree built from the full ('real') pool.

	Each file should have per-line format <sample>,<size>,<comma-separated DI>
	and have one of the sizes be 'real'. Blank lines are skipped.

	file_pattern --- glob pattern for the subsampled DI files
	di_method --- passed through as method= to Cluster.init_from_di_list

	Returns: trees (size-->list of trees, one tree per file containing that size),
	         symmetric_difference (size-->list of diffs),
	         robinson_foulds_distance (size-->list of diffs)
	The diffs are computed against the first 'real' tree; 'real' itself is
	never a key of the two diff dicts. If no file matches, or only 'real'
	trees were built, both diff dicts are empty.

	Raises KeyError if subsampled trees exist but no 'real' tree was read.
	"""
	from clustering import Cluster
	import dendropy

	trees = {}  # size --> list of dendropy trees
	for path in glob.iglob(file_pattern):
		sys.stderr.write("reading subsampled DI file {0}....\n".format(path))
		di_by_size = {}  # size --> sample --> DI vector
		with open(path) as handle:
			for line in handle:
				line = line.strip()
				if not line:
					# tolerate blank/trailing lines instead of failing the unpack
					continue
				sample, size, di = line.split(',', 2)
				di_vec = np.array([float(x) for x in di.split(',')])
				di_by_size.setdefault(size, {})[sample] = di_vec
		for size, di_dict in di_by_size.items():
			c = Cluster(None)
			c.init_from_di_list(di_dict, method=di_method, threshold=0)
			c.run_till_end()
			# build the tree once, outside any try/except, so that an error
			# raised while parsing is never mistaken for a missing dict key
			newick = str(c.trees[0])
			trees.setdefault(size, []).append(dendropy.Tree.get_from_string(newick, "newick"))

	# tally (1) symmetric differences (edge weight ignored)
	# (2) robinson_foulds_distance (edge weight considered)
	# 'real' is the size that is the full pool that we compare all other trees to
	sym_diff = {}
	rob_diff = {}
	# iterate over every size actually seen (not just those of the first file)
	subsampled_sizes = [size for size in trees if size != 'real']
	if not subsampled_sizes:
		return trees, sym_diff, rob_diff
	if 'real' not in trees:
		raise KeyError("no 'real' size found in files matching {0!r}; nothing to compare against".format(file_pattern))
	t_real = trees['real'][0]
	# NOTE(review): each tree is parsed by its own get_from_string call; newer
	# DendroPy releases require compared trees to share a taxon namespace --
	# confirm against the installed DendroPy version.
	for size in subsampled_sizes:
		sym_diff[size] = [t_real.symmetric_difference(t) for t in trees[size]]
		rob_diff[size] = [t_real.robinson_foulds_distance(t) for t in trees[size]]

	return trees, sym_diff, rob_diff
def jackknifing_tree_DF(file_pattern, di_method, samples_to_exclude=('1412-1', '1412-4')):
	"""
	Similar to jackknifing_tree but reads DF files (via DF.DFReader) and
	clusters the DF objects directly
	(probably improved clustering in clustering.py which I need manually turn on)
	Run clustering and count the differences between subsampled and real trees

	file_pattern --- glob pattern for the subsampled DF files
	di_method --- passed through as method= to Cluster
	samples_to_exclude --- container of sample names (df.name) to skip;
	                       only membership-tested, never modified

	Each df's size is taken from df.annotations['size'], and its mask is
	changed to valid_DI_pos before clustering.

	Returns: trees (size-->list of trees, one tree per file containing that size),
	         symmetric_difference (size-->list of diffs),
	         robinson_foulds_distance (size-->list of diffs)
	The diffs are computed against the first 'real' tree; 'real' itself is
	never a key of the two diff dicts.

	Raises KeyError if subsampled trees exist but no 'real' tree was read.
	"""
	from clustering import Cluster
	import dendropy

	trees = {}  # size --> list of dendropy trees
	for path in glob.iglob(file_pattern):
		sys.stderr.write("reading subsampled DF file {0}....\n".format(path))
		dfs_by_size = {}  # size --> list of dfs
		with open(path) as handle:
			for df in DF.DFReader(handle):
				sample = df.name
				if sample in samples_to_exclude:
					sys.stderr.write("EXCLUDING SAMPLE {0}!\n".format(sample))
					continue
				size = df.annotations['size']
				# need to change the mask for df!!!
				# not a problem when we did with DI because it was already masked
				df.change_vec_mask(valid_DI_pos)
				dfs_by_size.setdefault(size, []).append(df)
		for size, df_list in dfs_by_size.items():
			c = Cluster(df_list, method=di_method, threshold=0)
			c.run_till_end()
			# build the tree once, outside any try/except, so that an error
			# raised while parsing is never mistaken for a missing dict key
			newick = str(c.trees[0])
			trees.setdefault(size, []).append(dendropy.Tree.get_from_string(newick, "newick"))
			# progress output on stdout: which size/file produced which tree
			sys.stdout.write("size {0} file {1}\n".format(size, path))
			sys.stdout.write("{0}\n".format(newick))

	# tally (1) symmetric differences (edge weight ignored)
	# (2) robinson_foulds_distance (edge weight considered)
	# 'real' is the size that is the full pool that we compare all other trees to
	sym_diff = {}
	rob_diff = {}
	subsampled_sizes = [size for size in trees if size != 'real']
	if not subsampled_sizes:
		return trees, sym_diff, rob_diff
	if 'real' not in trees:
		raise KeyError("no 'real' size found in files matching {0!r}; nothing to compare against".format(file_pattern))
	t_real = trees['real'][0]
	# NOTE(review): each tree is parsed by its own get_from_string call; newer
	# DendroPy releases require compared trees to share a taxon namespace --
	# confirm against the installed DendroPy version.
	for size in subsampled_sizes:
		sym_diff[size] = [t_real.symmetric_difference(t) for t in trees[size]]
		rob_diff[size] = [t_real.robinson_foulds_distance(t) for t in trees[size]]

	return trees, sym_diff, rob_diff