Exemple #1
0
def smythEmissionDistribution(pair):
	"""
	Build the emission distribution for Smyth's "default" HMM.

	The sequences in S are merged by prepareSeqs and clustered with k-means;
	every resulting cluster contributes one (mean, stddev) emission pair.
	target_m is only an upper bound on the number of states: the number of
	clusters requested is min(target_m, number of distinct observation values
	reported by prepareSeqs).

	@param pair: A tuple of the form (S: list of sequences, target_m: int)
	@return: A tuple (B, labels, has_zero), where
		* B is a list of (mu, stddev) pairs, one per cluster,
		* labels is the label assignment returned by k_means,
		* has_zero is True if some cluster's stddev is below 0.001.
	"""
	S, target_m = pair
	merged, distinct = prepareSeqs(S)

	# Never ask k-means for more clusters than there are distinct values.
	n_states = min(target_m, len(distinct))

	# Only the label assignment is used; centroids and inertia are discarded.
	_centroids, labels, _inertia = k_means(merged, n_states, init='k-means++')

	B = []
	has_zero = False
	for members in partition(merged, labels):
		assert len(members) > 0
		center = mean(members)
		spread = std(members)
		B.append((center, spread))
		# Flag (near-)degenerate clusters so the caller can react to them.
		if spread < 0.001:
			has_zero = True
	return (B, labels, has_zero)
Exemple #2
0
def smythEmissionDistribution(pair):
	"""
	Compute the emission distribution for Smyth's "default" HMM from a pair
	(S: list of sequences, target_m: int).

	target_m caps the number of states. If the data only contains m' < target_m
	distinct observation values, the distribution of an m'-state HMM is
	returned instead.

	@param pair: A tuple of the form (S: list of sequences, target_m: int)
	@return: (B, labels, has_zero), where, writing merged for the merged
		observations produced by prepareSeqs and [C_0,...,C_{m'-1}] for the
		k-means clusters of merged:
	   * B[i] = (mean(C_i), stddev(C_i)).
	   * labels[i] = j means merged[i] was assigned to cluster C_j.
	   * has_zero = True if some B[i][1] is below 0.001 (i.e. ~= 0.0).
	"""
	S, target_m = pair

	# prepareSeqs yields the merged observations plus the collection of
	# distinct observation values found in them.
	merged, distinct = prepareSeqs(S)

	# Effective state count m': target_m, capped by the number of distinct
	# observation values.
	m_prime = min(target_m, len(distinct))

	# k_means returns (centroids, labels, inertia); only labels is needed here.
	#   centroids[i] is the center c_i of cluster C_i,
	#   labels[i] is the index of the cluster that merged[i] belongs to,
	#   inertia measures how far the samples are from their closest center.
	_centers, labels, _inertia = k_means(merged, m_prime, init='k-means++')

	# Regroup merged by label and summarize each group as (mean, stddev).
	B = []
	has_zero = False
	for group in partition(merged, labels):
		assert len(group) > 0
		mu = mean(group)
		sigma = std(group)
		B.append((mu, sigma))
		if sigma < 0.001:
			# At least one cluster has (almost) no spread.
			has_zero = True

	return (B, labels, has_zero)
Exemple #3
0
	def _kMedoids(self):
		"""
		Partition self.S once per k in self.k_values using k-medoids.

		The distance matrix is computed once and stored in self.dist_matrix.
		One (dist_matrix, k, 10) work item per k is handed to self._doMap
		together with kMedoids. For every k the returned labels are stored in
		self.labelings[k] and the matching clusters in self.partitions[k].
		"""
		self.dist_matrix = self._getDistMatrix()
		# NOTE(review): the meaning of the constant 10 is defined by kMedoids
		# (presumably a number of passes/restarts) -- confirm.
		work_items = ((self.dist_matrix, k, 10) for k in self.k_values)
		printAndFlush("K-medoids clustering (parallel)...")
		results = self._doMap(kMedoids, work_items)
		printAndFlush("done")
		# results[i] corresponds to self.k_values[i].
		for idx, k in enumerate(self.k_values):
			# Each result is a (labels, error, nfound) triple; only the labels
			# are kept.
			labels, _error, _nfound = results[idx]
			self.labelings[k] = labels
			self.partitions[k] = partition(self.S, labels)
Exemple #4
0
	def model(self):
		"""
		Build the HMM mixtures for the sequences in self.S over the user
		specified k range.

		The clustering step (self._cluster()) is currently disabled, so
		self.labelings must already hold a labeling for every k in
		self.k_values; the partitions are rebuilt from those labelings before
		training. Training is delegated to self._trainModels(), which is
		expected to populate self.components with a dict mapping k values to
		HMM triples. The elapsed time is recorded in self.times['total'], and
		the worker pool is closed unless running single threaded.
		"""
		started_at = clock()

		# Derive the clusters for each k from the stored labelings.
		for k in self.k_values:
			self.partitions[k] = partition(self.S, self.labelings[k])

		self._trainModels()
		self.times['total'] = clock() - started_at

		# There is no pool to shut down in single-threaded mode.
		if self.single_threaded:
			return
		self.pool.close()
Exemple #5
0
	def _hierarchical(self):
		"""
		Partition self.S once per k in self.k_values using hierarchical,
		agglomerative clustering with complete linkage.

		The linkage is computed a single time from self.dist_matrix and then
		cut at each k. Labels are stored in self.labelings[k] and the
		resulting clusters in self.partitions[k].

		@raise ValueError: if cutting the tree for some k does not yield
			exactly k clusters.
		"""
		self.dist_matrix = self._getDistMatrix()
		printAndFlush("Hierarchical clustering (serial)...")
		# NOTE(review): if this is scipy's linkage, it expects a condensed
		# distance matrix -- confirm that _getDistMatrix returns that form.
		linkage_matrix = linkage(self.dist_matrix, method='complete')
		for k in self.k_values:
			labels = fcluster(linkage_matrix, k, 'maxclust')
			self.labelings[k] = labels
			clusters = partition(self.S, labels)
			n_found = len(clusters)
			if n_found == k:
				self.partitions[k] = clusters
				continue
			# scipy's tree cutting is not guaranteed to return k clusters. This
			# has only been seen with a very lopsided distance matrix, as was
			# the case before log observations were used. With log
			# observations it has been fine, and it performs better than
			# Pycluster's analogous routine (treecluster / tree.cut).
			raise ValueError("fcluster could only produce %i clusters!" %
				n_found)
		printAndFlush("done")
Exemple #6
0
				seq = list(model.sampleSingle(LEN, seed=j))
				create = 0
				destroy = LEN*WINDOW_SIZE
				records.append({
					'ident': (i, i),
					'create': create,
					'destroy': destroy,
					'relays_in': [],
					'relays_out': map(lambda o: max(0, exp(o)-1), seq)
				})
	elif mode == "-clusters":
		data_path = sys.argv[5]
		with open(data_path) as data_file:
			orig_records = cPickle.load(data_file)['records']
		labels = results['labelings'][k]
		clusters = partition(orig_records, labels)
		sampled = map(lambda c: sample(c, 100) if len(c) > 100 else c, clusters)
		for i, cluster in enumerate(sampled):
			for record in cluster:
				record['ident'] = (i, i)
				records.append(record)
		for record in records:
			if record['ident'] == (6, 6):
				print len(record['relays_out'])

	output = {
		'window_size': WINDOW_SIZE,
		'records': records
	}
	with open(out_path, 'w') as outfile:
		cPickle.dump(output, outfile, protocol=2)