Example #1
# NOTE: the imports below are assumed for this excerpt; Alphabet, BaseClassifier,
# and matrix_to_string come from the assignment's own scaffold module(s).
import cPickle
import random

import numpy


class HMM(BaseClassifier):

	def __init__(self):
		self.label_alphabet = Alphabet()
		self.feature_alphabet = Alphabet()
		self.transition_matrix = None
		self.emission_matrix = None
		self.initial_probability = None
		
	@property
	def num_states(self):
		return self.label_alphabet.size()
	
	@property
	def num_observations(self):
		return self.feature_alphabet.size()
		
	def _mutate_data(self, instance):
		"""Convert instance.data to feature indices once, caching the original."""
		try:
			_ = instance.old_data
		except AttributeError:
			instance.old_data = instance.data
			instance.data = self.feature_alphabet.get_indices(instance.data)

	def _mutate_label(self, instance):
		"""Convert instance.label to label indices once, caching the original."""
		try:
			_ = instance.old_label
		except AttributeError:
			instance.old_label = instance.label
			instance.label = self.label_alphabet.get_indices(instance.label)
	
	def populate_alphabets(self, instance_list):
		"""Populate alphabets
		
		You guys have done this twice already. So I'm doing it for you this time.
		But a few things to note
			the labels get converted to label indices
			the feature vectors get converted to sparse vector	
			each time step contains exactly one feature (observation)
		
		Feel free to edit/modify/tear apart this function
		"""
		for instance in instance_list:
			for label in instance.label:
				self.label_alphabet.add(label)
			for observation in instance.data:
				self.feature_alphabet.add(observation)
			
			self._mutate_data(instance)
			self._mutate_label(instance)
			
		#for test cases and unsupervised training
		self.transition_matrix = numpy.zeros((self.num_states, self.num_states))
		self.emission_matrix = numpy.zeros((self.num_states, self.num_observations))
		self.initial_probability = numpy.zeros(self.num_states)
	
	def collect_counts(self, instance_list):
		"""Collect counts for fitting HMM parameters
		
		Very similar to Naive Bayes, we have to collect counts for estimating parameters:
		transition_counts[i,j] = the number of times state i is immediately followed by state j
		observation_counts[i,j] = the number of times state i is aligned with observation j
		initial_state_counts[i] = the number of times state i appears at the beginning of a sequence
		
		Add your implementation
		"""
		transition_counts = numpy.zeros((self.num_states, self.num_states))
		initial_state_counts = numpy.zeros(self.num_states)
		observation_counts = numpy.zeros((self.num_states, self.num_observations))
		for instance in instance_list:

			#count adjacent (state, next state) pairs; with fancy indexing each
			#duplicate pair is written its per-instance count, which nets out
			#to the same total as adding 1 per occurrence
			trans = zip(instance.label[:-1], instance.label[1:])
			transition_counts[instance.label[:-1], instance.label[1:]] += \
				map(trans.count, trans)
			#count (state, observation) alignments the same way
			obs = zip(instance.label, instance.data)
			observation_counts[instance.label, instance.data] += \
				map(obs.count, obs)
			initial_state_counts[instance.label[0]] += 1 #increment initial state
			
		return (transition_counts, initial_state_counts, observation_counts)
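	# Worked example (illustrative): for an instance whose label indices are
	# [0, 1, 1, 0], the adjacent pairs are (0,1), (1,1), (1,0), so
	# transition_counts[0,1], [1,1], and [1,0] each gain 1, and
	# initial_state_counts[0] gains 1 because the sequence starts in state 0.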
	
	def train(self, instance_list):
		"""Train the HMM 
		
		Collect counts and find the best parameters for 
		transition matrix, emission matrix, and initial probability
		
		DO NOT smooth the counts
		
		Add your implementation
		"""
		self.populate_alphabets(instance_list)
		transition_counts, initial_state_counts, observation_counts = self.collect_counts(instance_list)
		
		#fill in these matrices by normalizing each row of counts
		self.transition_matrix = (transition_counts.T / \
			numpy.sum(transition_counts, 1)).T
		#a state's total count = times it is transitioned into + times it starts a sequence
		self.emission_matrix = (observation_counts.T / (numpy.sum(transition_counts, 0) + \
			initial_state_counts)).T
		self.initial_probability = initial_state_counts / sum(initial_state_counts) #p(X|Start)
	
	def forward_algorithm(self, instance):
		"""Run forward algorithm
		
		Add your implementation
		"""
		sequence_length = len(instance.data) 
		alpha = numpy.zeros((self.num_states, sequence_length))
		
		#initialization: alpha[i, 0] = pi_i * b_i(o_0)
		alpha[:, 0] = self.initial_probability * self.emission_matrix[:, instance.data[0]]
		
		#recursion: alpha[j, t] = b_j(o_t) * sum_i alpha[i, t-1] * a_ij
		for t in range(1, sequence_length):
			alpha[:, t] = self.emission_matrix[:, instance.data[t]] * \
				numpy.sum(alpha[:, t-1] * self.transition_matrix.T, 1)

		return alpha
	
	def backward_algorithm(self, instance):
		"""Run backward algorithm
		
		Add your implementation
		"""
		sequence_length = len(instance.data) 
		beta = numpy.zeros((self.num_states, sequence_length))
		
		#initialization
		beta[:, -1] += 1
		
		#recursion
		for t in reversed(xrange(sequence_length - 1)):
			beta[:, t] = numpy.sum(self.transition_matrix * \
				self.emission_matrix[:, instance.data[t + 1]] * \
				beta[:, t + 1], 1)
				
		return beta
	
	def compute_likelihood(self, alpha):
		"""Compute likelihood P(O1:T) given forward values
		
		This function is necessary for computing expected counts.
		It should assume that alpha (forward) values are computed correctly.
		
		This function should be just one line
		
		Add your implementation
		"""
		return numpy.sum(alpha[:, -1])
		
	def compute_expected_counts(self, instance):
		"""E-step for EM Algorithm for learning HMM parameters
		
		This function is fully implemented for you
		"""
		alpha = self.forward_algorithm(instance)
		beta = self.backward_algorithm(instance)
		sequence_length = len(instance.data)
		likelihood = self.compute_likelihood(alpha)
		
		gamma = alpha * beta / likelihood
		expected_observation_counts = numpy.zeros((self.num_states, self.num_observations)) 
		for t in xrange(sequence_length):
			feature_index = instance.data[t]
			expected_observation_counts[:, feature_index] += gamma[:, t]
		
		expected_transition_counts = numpy.zeros((self.num_states, self.num_states))
		for t in xrange(sequence_length-1):
			feature_index = instance.data[t+1]
			obs = self.emission_matrix[:, feature_index]
			m1 = numpy.matrix(alpha[:, t])
			m2 = numpy.matrix(beta[:, t+1] * obs)
			xi = numpy.multiply(m1.transpose().dot(m2), self.transition_matrix) / likelihood
			expected_transition_counts += xi
		return (expected_transition_counts, expected_observation_counts, likelihood)
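	# Note on the E-step above: gamma[i, t] = alpha[i, t] * beta[i, t] / P(O)
	# is the posterior probability of being in state i at time t, and the
	# outer product m1.T * m2 times the transition matrix builds
	# xi[i, j] = alpha[i, t] * a_ij * b_j(o_{t+1}) * beta[j, t+1] / P(O),
	# the posterior probability of transitioning from i to j at time t.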
		
	def _be_prepared_for_baum_welch(self, training_set, mode = 'uniform', inf = None):
		"""Initialize transition_matrix, emission_matrix, and initial_probability for Baum-Welch.
		
		@param training_set: the training data
		@param mode: can be 'uniform', 'random', or 'sneaky'
		@param inf: used in sneaky mode; an open file containing a dictionary
			serialization of an HMM object
		"""
	
		self.populate_alphabets(training_set)
		
		HVAL = 100 #added to high positions in sparse rows for weak training
		LVAL = 1 #added to low positions in sparse rows for weak training
		
		if mode == 'uniform': #all elements in a row are equal
			self.transition_matrix += (1.0 / numpy.size(self.transition_matrix, 1))
			self.emission_matrix += (1.0 / numpy.size(self.emission_matrix, 1))
			self.initial_probability += (1.0 / numpy.size(self.initial_probability))
			
		else: #elements will be unequal
			#choose one element per row per matrix to be 
			#much higher than its dear siblings
			
			if mode == 'random': #high element is selected randomly
				random.seed()
				trans = [random.choice(range(numpy.size(self.transition_matrix, 1))) \
					for i in range(numpy.size(self.transition_matrix, 0))]
				emits = [random.choice(range(numpy.size(self.emission_matrix, 1))) \
					for i in range(numpy.size(self.emission_matrix, 0))]
				init = random.choice(range(len(self.initial_probability)))
			
			elif mode == 'sneaky': #use some information from the data, but don't tell anyone!
				temp_hmm = HMM.from_dict(cPickle.load(inf))
				tcounts = temp_hmm.transition_matrix
				icounts = temp_hmm.initial_probability
				ocounts = temp_hmm.emission_matrix

				trans = numpy.argmax(tcounts, 1)
				emits = numpy.argmax(ocounts, 1)
				init = numpy.argmax(icounts)
			
			#ensure that no element is zero and that the selected element is substantially higher
			self.transition_matrix[range(numpy.size(self.transition_matrix, 0)), trans] += HVAL
			self.transition_matrix += LVAL
			self.emission_matrix[range(numpy.size(self.emission_matrix, 0)), emits] += HVAL
			self.emission_matrix += LVAL
			self.initial_probability[init] += HVAL
			self.initial_probability += LVAL
			
			#normalize
			self.transition_matrix = (self.transition_matrix.T / numpy.sum(\
				self.transition_matrix, 1)).T
			self.emission_matrix = (self.emission_matrix.T / numpy.sum(\
				self.emission_matrix, 1)).T
			self.initial_probability /= sum(self.initial_probability)
	
	def baum_welch_train(self, instance_list):
		"""Baum-Welch unsupervised training
		
		Before calling this function, you have to call
			self.populate_alphabets(instance_list)
			and then initialize transition matrix, observation matrix, and initial probability.
		It's ok to fix initial probability to 1 / self.num_states (Uniform)
			
		This function is not so optimized, so it can't turn the crank on too large a dataset.
		"""
		num_states = self.label_alphabet.size()
		num_features = self.feature_alphabet.size()
		old_total_loglikelihood = - numpy.Infinity
		for i in xrange(30):
			expected_observation_counts = numpy.zeros((num_states, num_features)) 
			expected_transition_counts = numpy.zeros((num_states, num_states)) 
			total_log_likelihood = 0
			#E-Step
			for instance in instance_list:
				transition_counts, obs_counts, likelihood = self.compute_expected_counts(instance)
				expected_observation_counts += obs_counts
				expected_transition_counts += transition_counts
				total_log_likelihood += numpy.log(likelihood)
			#M-Step
			self.transition_matrix = (expected_transition_counts.transpose() / numpy.sum(expected_transition_counts, 1)).transpose()
			self.emission_matrix = (expected_observation_counts.transpose() / numpy.sum(expected_observation_counts, 1)).transpose()
			print 'Iteration %s : %s ' % (i, total_log_likelihood)
			if total_log_likelihood < old_total_loglikelihood:
				break
			old_total_loglikelihood = total_log_likelihood
		self.initial_probability = numpy.zeros(num_states) + 1.0/num_states

	def classify_instance(self, instance):
		"""Viterbi decoding algorithm

		Returns a list of label strings e.g. ['Hot', 'Cold', 'Cold']
		
		Add your implementation
		"""
		
		self._mutate_data(instance) #just in case
		
		#initialization
		slength = len(instance.data)
		v = numpy.zeros((self.num_states, slength))
		backtrace = numpy.zeros((self.num_states, slength))
		v[:, 0] = self.initial_probability * self.emission_matrix[:, \
			instance.data[0]]
			
		#recursion
		for t in range(1, slength):
			tempmat = v[:, t-1] * self.transition_matrix.T
			maxis = numpy.argmax(tempmat, axis = 1)
			backtrace[:, slength - t] = maxis #facilitates reversal later
			v[:, t] = v[maxis, t-1] * self.transition_matrix[maxis, \
				xrange(numpy.size(self.transition_matrix, 1))] * \
				self.emission_matrix[:, instance.data[t]]
			
		#termination: stash the final Viterbi scores in the (otherwise unused)
		#first column so the backtrace helper can pick the best final state
		backtrace[:, 0] = v[:, -1]
		
		return self._run_backtrace(backtrace)
		
	def _run_backtrace(self, back_mat):
		"""Recover the best label sequence from the backpointer matrix
		
		@param back_mat: a numpy array whose first column holds the final
			Viterbi scores and whose remaining columns hold backpointers
			in reverse time order
		"""
		stack = [numpy.argmax(back_mat[:, 0])]
		for ind in xrange(1, numpy.size(back_mat, 1)):
			stack.append(int(back_mat[stack[-1], ind]))
		res = []
		while stack:
			res.append(self.label_alphabet.get_label(stack.pop()))
		return res
		
	def print_parameters(self):
		"""Print the two parameter matrices
		
		You should take advantage of this function in debugging
		and inspecting the resulting parameters.
		
		This function is implemented for you.
		"""
		state_header = map(str, [self.label_alphabet.get_label(i) \
			for i in xrange(self.label_alphabet.size())])
		obs_header = map(str, [self.feature_alphabet.get_label(i) \
			for i in xrange(self.feature_alphabet.size())])
		print matrix_to_string(self.emission_matrix, state_header, obs_header)
		print matrix_to_string(self.transition_matrix, state_header, state_header)

	def to_dict(self):
		"""Convert HMM instance into a dictionary representation

		The implementation of this should be in sync with from_dict function.
		You should be able to use these two functions to convert the model into
		either representation (object or dictionary)
		
		We have enough of this. This is fully implemented for you.
		"""
		model_dict = {
			'label_alphabet': self.label_alphabet.to_dict(),
			'feature_alphabet': self.feature_alphabet.to_dict(),
			'transition_matrix': self.transition_matrix.tolist(),
			'emission_matrix': self.emission_matrix.tolist(),
			'initial_probability': self.initial_probability.tolist()
		}
		return model_dict

	@classmethod
	def from_dict(cls, model_dict):
		"""Convert a dictionary into HMM instance
		
		The implementation of this should be in sync with to_dict function.
		
		This is fully implemented for you.
		"""
		hmm = HMM()
		hmm.label_alphabet = Alphabet.from_dict(model_dict['label_alphabet'])
		hmm.feature_alphabet = Alphabet.from_dict(model_dict['feature_alphabet'])
		hmm.transition_matrix = numpy.array(model_dict['transition_matrix'])
		hmm.emission_matrix = numpy.array(model_dict['emission_matrix'])
		hmm.initial_probability = numpy.array(model_dict['initial_probability'])
		return hmm
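
# Minimal usage sketch (not part of the original assignment code). It assumes
# the scaffold's Alphabet/BaseClassifier behave as used above; _ToyInstance is
# a hypothetical stand-in for the assignment's Instance class, holding one
# label and one observation per time step.
class _ToyInstance(object):
	def __init__(self, label, data):
		self.label = label  # list of state names, one per time step
		self.data = data    # list of observations, one per time step

if __name__ == '__main__':
	train_set = [
		_ToyInstance(['Hot', 'Hot', 'Cold'], ['3', '2', '1']),
		_ToyInstance(['Cold', 'Cold', 'Hot'], ['1', '1', '2']),
	]
	hmm = HMM()
	hmm.train(train_set)  #supervised maximum-likelihood fit
	hmm.print_parameters()
	test = _ToyInstance(['Hot', 'Hot'], ['3', '2'])
	print hmm.classify_instance(test)  #e.g. ['Hot', 'Hot'] on this toy data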
Example #2
# NOTE: the imports below are assumed for this excerpt; Alphabet and
# BaseClassifier come from the assignment's scaffold. DEBUG_1 and DEBUG_2 are
# module-level debug flags referenced further down.
import numpy
from scipy.optimize import fmin_l_bfgs_b

DEBUG_1 = False
DEBUG_2 = False


class MaxEnt(BaseClassifier):

	def __init__(self, gaussian_prior_variance = 1):
		"""Initialize the model

		label_alphabet, feature_alphabet, parameters must be
		consistent in order for the model to work.

		The parameters numpy.array assumes a specific shape. Look at the assignment sheet for details.

		Add your implementation
		"""
		super(MaxEnt, self).__init__()
		self.label_alphabet = Alphabet()
		self.feature_alphabet = Alphabet()
		self.gaussian_prior_variance = gaussian_prior_variance
		self.parameters = numpy.array([])
		self.feature_counts = None

	def get_parameter_indices(self, feature_indices, label_index):
		"""Get the indices on the parameter vector

		Given a list of feature indices and the label index, 
		the function will give you a numpy array of the corresponding indices on self.parameters
		
		This function is fully implemented for you.
		"""
		indices = numpy.array(feature_indices) + 1
		intercept = numpy.array([0])
		indices = numpy.concatenate((intercept, indices))
		indices = indices + (label_index * (self.feature_alphabet.size() + 1))
		return indices
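	# Worked example (illustrative): with 3 features and 2 labels the parameter
	# vector has 2 * (3 + 1) = 8 entries (an intercept plus one weight per
	# feature, per label). For feature_indices [0, 2] and label_index 1 the
	# result is [0, 1, 3] + 1 * 4 = [4, 5, 7]: the label-1 intercept and the
	# label-1 weights for features 0 and 2.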

	def compute_observed_counts(self, instance_list):
		"""Compute observed feature counts

		It should only be done once because it's parameter-independent.
		The observed feature counts are then stored internally.
		Note that we are fitting the model with the intercept terms
		so the count of intercept term is the count of that class.
		
		Additionally, we have to
			1) populate alphabet
			2) convert instance.data into a vector of feature indices aka sparse vectors
				(use the alphabet)

		Add your implementation
		"""
		#If the counts have already been computed, just return the cached value
		if self.feature_counts is None:
			#populate alphabets here
			for instance in instance_list:
				self.label_alphabet.add(instance.label) #update label dictionary
				for datum in instance.data:
					self.feature_alphabet.add(datum) #update feature dictionary
			self.feature_counts = numpy.zeros((self.feature_alphabet.size() \
				+ 1) * self.label_alphabet.size()) #generate observed count vector

		else:
			return self.feature_counts

		#compute the feature counts here
		for instance in instance_list:
			newinds = self.feature_alphabet.get_indices(instance.data)
			sparse_vector = self.get_parameter_indices(newinds, \
				self.label_alphabet.get_index(instance.label))
			self.feature_counts[sparse_vector] += 1
			if not instance.converted:
				instance.data = numpy.array(sorted(set(newinds))) #remove duplicate features
				instance.converted = True #mark the instance as already indexed
		return self.feature_counts
		
	def compute_label_unnormalized_loglikelihood_vector(self, sparse_feature_vector):
		"""Compute unnormalized log score from log-linear model

		log P(Y|X) is proportional to feature vector * parameter vector
		But we use a sparse vector representation, so we need to use
		index tricks that numpy allows us to do.
		"""
		loglikelihood_score_vector = numpy.zeros(self.label_alphabet.size())
		for index, label in self.label_alphabet:
			loglikelihood_score_vector[index] = sum(\
				self.parameters[self.get_parameter_indices(\
				sparse_feature_vector, index)])
			#dot product of parameters and feature functions
			#which yields sum of parameters at indices
			
		return loglikelihood_score_vector

	def compute_posterior_distribution(self, instance):
		"""Compute P(Y|X)

		Return a vector of the same size as the label_alphabet	
		
		Add your implementation
		"""
		unnorm = self.compute_label_unnormalized_loglikelihood_vector(\
				instance.data) #compute unnormalized log scores
		if DEBUG_2:
			print unnorm
		unnorm = unnorm - numpy.max(unnorm) #shift for numerical stability; doesn't change the result
		posterior_distribution = numpy.exp(unnorm) / numpy.sum(numpy.exp(unnorm)) #normalize
		return posterior_distribution
		
	def _argmax(self, func, *args):
		"""Not needed because numpy's is better"""
		res = [func(arg) for arg in args]
		m = max(res)
		for arg in args:
			if func(arg) == m:
				return arg

	def compute_expected_feature_counts(self, instance_list):
		"""Compute expected feature counts

		We take advantage of compute_posterior_distribution in this class to compute
		expected feature counts, which is only needed for training.

		Add your implementation
		"""
		expected_feature_counts = numpy.zeros((self.feature_alphabet.size() + 1) * self.label_alphabet.size())
		for instance in instance_list:
			#add posterior to expected_feature_counts at appropriate indices
			post_dist = self.compute_posterior_distribution(instance) #posterior distribution
			for jndex, label in self.label_alphabet:
				indices = self.get_parameter_indices(\
					instance.data, jndex)
				expected_feature_counts[indices] += post_dist[jndex] 
				#	increment expected counts at appropriate indices
		return expected_feature_counts

	def classify_instance(self, instance):
		"""Applying the model to a new ins
		tance

		Convert instance.data into a sparse vector and then classify the instance.
		Returns the predicted label. 

		Add your implementation
		"""
		if DEBUG_2:
			print instance.data
		if not instance.converted:
			instance.data = self.feature_alphabet.get_indices(instance.data) 
			instance.converted = True
			#	get_indices eliminates any heretofore unseen features
		if DEBUG_2:
			print instance.data
			print self.compute_posterior_distribution(instance)
		return self.label_alphabet.get_label(numpy.argmax( \
			self.compute_posterior_distribution(instance))) #return label corresponding to best index

	def objective_function(self, parameters):
		"""Compute negative (log P(Y|X,lambdas) + log P(lambdas))

		The function that we want to optimize over.
		You won't have to call this function yourself. fmin_l_bfgs_b will call it.

		Add your implementation
		"""
		total_loglikelihood = 0.0
		self.parameters = parameters
		#subtract the (log) Gaussian prior penalty on the parameters
		total_loglikelihood -= numpy.sum(parameters * parameters) / \
			self.gaussian_prior_variance
		# Compute the conditional log-likelihood here
		for instance in self.training_data:
			#add the log posterior probability of the correct label
			total_loglikelihood += numpy.log(self.compute_posterior_distribution(instance) \
				[self.label_alphabet.get_index(instance.label)])
		return - total_loglikelihood


	def gradient_function(self, parameters):
		"""Compute gradient of negative (log P(Y|X,lambdas) + log P(lambdas)) wrt lambdas

		With some algebra, we have that
		gradient wrt lambda i = observed_count of feature i - expected_count of feature i
			- derivative of the log Gaussian prior wrt lambda i
		The first term is computed before running the optimization function and is a constant.
		The second term needs inference to get P(Y|X, lambdas) and is a bit expensive.
		The third term comes from taking the derivative of the log Gaussian prior.

		Returns:
			a vector of gradient

		Add your implementation
		"""
		gradient_vector = numpy.zeros(len(parameters))
		# compute gradient here
		gradient_vector += self.feature_counts - \
			self.compute_expected_feature_counts(self.training_data) - \
			2 * (parameters) / self.gaussian_prior_variance
		if DEBUG_1:
			print gradient_vector
		return - gradient_vector


	def train(self, instance_list):
		"""Find the optimal parameters for maximum entropy classifier

		We leave the actual number crunching and search to the fmin_l_bfgs_b function.
		There are a few tunable parameters for the optimization function but
		the default is usually well-tuned and sufficient for most purposes.

		Arg:
			instance_list: each instance.data should be a string feature vector

		This function is fully implemented. But you are allowed to make changes 
		"""
		self.training_data = instance_list
		self.compute_observed_counts(instance_list)
		num_labels = self.label_alphabet.size()
		num_features = self.feature_alphabet.size()
		init_point = numpy.zeros(num_labels * (num_features + 1))
		optimal_parameters, _, _ = fmin_l_bfgs_b(self.objective_function, init_point, fprime=self.gradient_function)
		self.parameters = optimal_parameters


	def to_dict(self):
		"""Convert MaxEnt into a dictionary so that save() will work
		
		Add your implementation
		"""
		res = {}
		res['labalph'] = self.label_alphabet.to_dict()
		res['feaalph'] = self.feature_alphabet.to_dict()
		res['gpv'] = self.gaussian_prior_variance
		res['param'] = self.parameters.tolist()
		return res


	@classmethod
	def from_dict(cls, model_dictionary):
		"""Return an instance of MaxEnt based on the dictionary created by to_dict
		
		Add your implementation
		"""
		res = MaxEnt()
		res.label_alphabet = Alphabet.from_dict(model_dictionary['labalph'])
		res.feature_alphabet = Alphabet.from_dict(model_dictionary['feaalph'])
		res.gaussian_prior_variance = model_dictionary['gpv']
		res.parameters = numpy.array(model_dictionary['param'])
		return res
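
# Minimal usage sketch (not part of the original assignment code). _ToyDoc is
# a hypothetical stand-in for the assignment's Instance class; only the
# .label, .data, and .converted attributes used above are assumed.
class _ToyDoc(object):
	def __init__(self, label, data):
		self.label = label       # a single class label
		self.data = data         # list of string-valued features
		self.converted = False   # flipped once .data has been indexed

if __name__ == '__main__':
	train_set = [
		_ToyDoc('pos', ['good', 'fun']),
		_ToyDoc('neg', ['bad', 'boring']),
	]
	maxent = MaxEnt(gaussian_prior_variance=1.0)
	maxent.train(train_set)  #L-BFGS fit of the log-linear weights
	print maxent.classify_instance(_ToyDoc(None, ['fun']))  #likely 'pos' on this toy data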