# NOTE: this snippet assumes the enclosing module provides the project-specific
# pieces it references (DF, Read, tree, hello, DiversityIndexRunner, load, refmap,
# phred_cutoff, min_length, max_degen, ecoli_lo, ecoli_hi, L2, SUBSAMPLE_SIZES,
# inhouse, inhouse_eligible, valid_DI_pos).
import sys
import glob
import math
import numpy as np

def subsampling_BowTie_n_inhouse(file_iter, di_method, df_output):
	"""
	Run subsampling for BowTie+inhouse (inhouse must already have been pre-processed for eligibility)
	Simply prints out per line: <sample>,<size>,<comma-separated DI>
	"""
	runner = DiversityIndexRunner()
	seqvec = np.zeros((5,520), dtype=np.int)
	print >> sys.stderr, "DFs will be written to {0}....".format(df_output)
	h = open(df_output, 'w')
	w = DF.DFWriter(h)
	for sample,file in file_iter:
		eligible_bowtie = hello.subsample_reads_BowTie_prepare(file, refmap, phred_cutoff, min_length, ecoli_lo, ecoli_hi)
		eligible_inhouse = load(open(inhouse_eligible.format(sample, phred_cutoff, min_length, max_degen, L2[ecoli_lo], L2[ecoli_hi])))
		print >> sys.stderr, "eligible reads for {0}: bowtie {1}, inhouse {2}".format(sample, \
				len(eligible_bowtie), len(eligible_inhouse))
		p = len(eligible_bowtie)*1./(len(eligible_bowtie)+len(eligible_inhouse))
		for size in SUBSAMPLE_SIZES:
			print >> sys.stderr, sample, size
			seqvec[:] = 0
			hello.subsample_reads_BowTie(file, refmap, seqvec, eligible_bowtie, int(size*p))
			hello.subsample_reads_inhouse(refmap, seqvec, eligible_inhouse, phred_cutoff, min_length, size-int(size*p))
			df = Read.ReadDF(sample, refmap)
			df.len = 520
			df.assign_vec(seqvec)
			df.add_annotation('size', size)
			w.write(df)
			di=runner.run(df, method=di_method, threshold=0, vec_pre_normalized=False, ignoreN=True)[valid_DI_pos]
			print("{0},{1},{2}".format(sample,size,",".join(map(str,di))))
	h.close()
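
# Hypothetical invocation sketch (the sample names/paths below are made up; refmap,
# phred_cutoff, min_length, max_degen, ecoli_lo, ecoli_hi, SUBSAMPLE_SIZES,
# inhouse_eligible and valid_DI_pos are assumed to already be set at module level):
#
#	file_iter = [('sampleA', 'sampleA.bowtie.out'), ('sampleB', 'sampleB.bowtie.out')]
#	subsampling_BowTie_n_inhouse(file_iter, 'Simpson', 'subsampled_DF.txt')
#
# One CSV line per (sample, size) pair is printed to stdout, and every subsampled
# DF is appended to subsampled_DF.txt.
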
def main(file_iter, output_df_filename, log_f):
	log_f.write("phred cutoff:{0}\n".format(phred_cutoff))
	log_f.write("min length:{0}\n".format(min_length))
	log_f.write("max degen (if used):{0}\n".format(max_degen))
	log_f.write("use ecoli range {0}-{1}\n".format(ecoli_lo, ecoli_hi))
	f = open(output_df_filename, 'w')
	dfwriter = DF.DFWriter(f)
	for sample,file in file_iter:
		print >> sys.stderr, "processing {0}.........".format(sample)
		seqvec = np.zeros((5,520), dtype=np.int)
		# --------------- for in-house aligned ------------- #
		for file in glob.iglob(inhouse.format(sample)):
#			if file.endswith('.bz2'):
#				os.system("bunzip2 " + file)
#				file = file[:-4]
			used, discarded = hello.gather_reads_inhouse(file, refmap, seqvec, phred_cutoff, min_length, max_degen, ecoli_lo, ecoli_hi)
			print >> sys.stderr, file, used, discarded
			log_f.write("FILE:{0} USED:{1} DISCARDED:{2}\n".format(file, used, discarded))
#			os.system("bzip2 " + file)
		#  ---------------- for BowTie-aligned ---------------#
#		used, discarded = hello.gather_reads_BowTie(file, refmap, seqvec, phred_cutoff, min_length, ecoli_lo, ecoli_hi)
#		print >> sys.stderr, "used:", used, "discarded:", discarded
#		log_f.write("FILE:{0} USED:{1} DISCARDED:{2}\n".format(file, used, discarded))
		df = Read.ReadDF(sample, refmap)
		df.len = 520
		df.assign_vec(seqvec)
		dfwriter.write(df)
		runner = DiversityIndexRunner()
		di=runner.run(df, method='Simpson', threshold=0, vec_pre_normalized=False, ignoreN=True)[valid_DI_pos]
		print("{0},{1}".format(sample,",".join(map(str,di))))
	f.close()
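
# Hypothetical driver sketch for main() (file names below are made up; the
# `inhouse` glob pattern and the other module-level settings are assumed set):
#
#	with open('gather_reads.log', 'w') as log_f:
#		main([('sampleA', 'sampleA.bowtie.out')], 'gathered_DF.txt', log_f)
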
class Cluster:
	def __init__(self, df_list, **kwargs):
		if df_list is None:
			print >> sys.stderr, "called with a None, I hope you know what you're doing!"
			print >> sys.stderr, "calling init_from_di_list later perhaps?"
			self.df_list = None
			return
		self.df_list = df_list
		self.original_names = [df.name for df in self.df_list]
		self.mask = kwargs['mask'] if 'mask' in kwargs else 1.
		self.method = kwargs['method'] if 'method' in kwargs else 'Simpson'
		self.threshold = kwargs['threshold'] if 'threshold' in kwargs else 10

		self.m = len(df_list) # number of samples (rows)
		self.n = df_list[0].len # length of the DF vectors (columns)

		self.runner = DiversityIndexRunner(self.mask)

		self.trees = [tree.Leaf(self.df_list[i].name) for i in xrange(self.m)]

		self.X = np.zeros((self.m, self.n), dtype=np.float)
		for i,df in enumerate(self.df_list):
			#print >> sys.stderr, "normalizing {0}....".format(df.name)
			df.normalized_vec(ignoreN=True)
			di = self.runner.run(df, method=self.method, threshold=self.threshold, \
					vec_pre_normalized=True, ignoreN=True)
			self.X[i, :] = di	

		# calculate the initial distance matrix
		self._dist = np.zeros((self.m, self.m), dtype=np.float)
		for i in xrange(self.m):
			self._dist[i, i] = float("inf")
			for j in xrange(i+1, self.m):
				# method 1: Euclidean distance between DIs
				d = math.sqrt(sum(x**2 for x in self.X[i,:]-self.X[j,:]))
				# method 2: sum of sum of distances squared between DFs
				#d = self.df_list[i].get_vec_diff_sqsum(self.df_list[j])
				self._dist[i, j] = d
				self._dist[j, i] = d
	
	def init_from_di_list(self, di_dict, **kwargs):
		"""
		alternative __init__ taking a dict sample_name --> array of DI as input
		if this init is used, we're expecting to run UPGMA clustering
		"""
		self.original_names = di_dict.keys()
		self.original_names.sort()
		self.mask = kwargs['mask'] if 'mask' in kwargs else 1.
		self.method = kwargs['method'] if 'method' in kwargs else 'Simpson'
		self.threshold = kwargs['threshold'] if 'threshold' in kwargs else 10
		self.m = len(di_dict)
		self.n = len(di_dict.itervalues().next())
		self.trees = [tree.Leaf(x) for x in self.original_names]
		self.X = np.zeros((self.m, self.n), dtype=np.float) 
		# fill up X using di_dict
		for i in xrange(self.m):
			self.X[i] = di_dict[self.original_names[i]]
		self._dist = np.zeros((self.m, self.m), dtype=np.float)
		for i in xrange(self.m):
			self._dist[i, i] = float("inf")
			for j in xrange(i+1, self.m):
				# method 1: Euclidean distance between DIs
				d = math.sqrt(sum(x**2 for x in self.X[i,:]-self.X[j,:]))
				# method 2: sum of sum of distances squared between DFs
				#d = self.df_list[i].get_vec_diff_sqsum(self.df_list[j])
				self._dist[i, j] = d
				self._dist[j, i] = d
	
	def write_DI(self, output_filename, mask=None):
		with open(output_filename, 'w') as f:
			for i, name in enumerate(self.original_names):
				di = self.X[i, ] if mask is None else self.X[i, mask]
				f.write(name + ',')
				di.tofile(f, sep=",")
				f.write('\n')
						
	def run_one_cluster_step(self):
		d = self._dist.argmin()
		i, j = divmod(d, self.m)  # convert the flat argmin index to (row, col)
		_min_val = self._dist[i, j]
		if _min_val == float("inf"):
			raise StopIteration("done!")
		#print >> sys.stderr, "combining {0} and {1}".format(self.trees[i], self.trees[j])
		
		# merge j into i
		size_i = len(self.trees[i].get_leaves())
		size_j = len(self.trees[j].get_leaves())
		t = tree.Tree()
		t.add_edge((self.trees[i], 0, _min_val/2)) # (subtree, bootstrap=0, branch length=half the merge distance)
		t.add_edge((self.trees[j], 0, _min_val/2))
		self.trees[i] = t
		self.trees[j] = None
		# NEW!!! instead of just adding df_list[j] to df_list[i], normalize the counts FIRST!!!
		if self.df_list is not None:
			self.df_list[i].normalized_vec_add(self.df_list[j], vec_pre_normalized=True, ignoreN=True)

#			print "before", self.X[i, ]
			self.X[i] = self.runner.run(self.df_list[i], method=self.method, threshold=self.threshold,\
					vec_pre_normalized=True, ignoreN=True)
#			print("merged {0} and {1}".format(i, j))
#			print "new vec is now", self.X[i, ]
		
#		self._dist[j, :] = float("inf")
#		self._dist[:, j] = float("inf")
		for k in xrange(self.m):
			if k==i or k==j or self.trees[k] is None: continue
			# method 1:
			#d = math.sqrt(sum(x**2 for x in self.X[i,:]-self.X[k,:]))
			# method 2:
			#d = self.df_list[i].get_vec_diff_sqsum(self.df_list[k])
			# method 3: UPGMA
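			# weighted average of the old distances, weighted by cluster sizes:
			# d(k, merged) = (d(k,i)*size_i + d(k,j)*size_j) / (size_i + size_j)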
			d = (self._dist[k, i] * size_i + self._dist[k, j] * size_j) / (size_i + size_j)
			# method 4: complete linkage
			#d = max(self._dist[k, i], self._dist[k, j])
			#print >> sys.stderr, "using Euclidean dist: {0}, using vecdiff: {1}".format(\
			#		math.sqrt(sum(x**2 for x in self.X[i,:]-self.X[k,:])), d)
			self._dist[i, k] = d
			self._dist[k, i] = d
		self._dist[j, :] = float("inf")
		self._dist[:, j] = float("inf")

	def run_till_end(self):
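		# merged entries in self.trees are set to None rather than removed, so
		# len(self.trees) never shrinks; termination actually relies on the
		# StopIteration raised once every remaining pairwise distance is inf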
		while len(self.trees) > 1:
			try:
				self.run_one_cluster_step()
			except StopIteration:
				break
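
# A minimal, hypothetical usage sketch for the di_dict/UPGMA path described in
# init_from_di_list (sample names and DI values below are made up; this block is
# only a sketch, not part of the original module):
if __name__ == '__main__':
	_di_dict = {'sampleA': [0.10, 0.30, 0.25],
			'sampleB': [0.12, 0.28, 0.30],
			'sampleC': [0.90, 0.85, 0.70]}
	_c = Cluster(None)              # deferred init; prints a warning to stderr
	_c.init_from_di_list(_di_dict)  # fills X and the initial distance matrix
	_c.run_till_end()               # UPGMA merging until all distances are inf
	_final = [t for t in _c.trees if t is not None]
	# _final[0] is the single remaining tree covering all three samples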