class EMStep(MRJob):

    INTERNAL_PROTOCOL = PickleProtocol

    def __init__(self, *args, **kwargs):
        MRJob.__init__(self, *args, **kwargs)
        # Create HMM object.
        self.hmm = HMM(word_dict, tag_dict)
        from os import path
        filename = 'hmm.txt'
        if path.exists(filename):
            # Load the HMM parameters from a text file.
            load_parameters(filename, self.hmm, smoothing=0.1)
        else:
            # Initialize the HMM parameters randomly.
            self.hmm.initialize_random()
        self.log_likelihood = 0
        self.initial_counts = 0
        self.emission_counts = 0
        self.transition_counts = 0
        self.final_counts = 0

    def mapper(self, key, s):
        seq = load_sequence(s, self.hmm.observation_labels,
                            self.hmm.state_labels)
        log_likelihood, initial_counts, transition_counts, final_counts, \
            emission_counts = predict_sequence(seq, self.hmm)
        self.log_likelihood += log_likelihood
        self.initial_counts += initial_counts
        self.emission_counts += emission_counts
        self.transition_counts += transition_counts
        self.final_counts += final_counts

    def mapper_final(self):
        num_states = self.hmm.get_num_states()  # Number of states.
        num_observations = self.hmm.get_num_observations()  # Number of observation symbols.
        yield 'log-likelihood', self.log_likelihood
        for y in xrange(num_states):
            name_y = self.hmm.state_labels.get_label_name(y)
            for s in xrange(num_states):
                name_s = self.hmm.state_labels.get_label_name(s)
                yield 'transition %s %s' % (name_y, name_s), \
                    self.transition_counts[y, s]
            yield 'final ' + name_y, self.final_counts[y]
            yield 'initial ' + name_y, self.initial_counts[y]
        for w in xrange(num_observations):
            name_w = self.hmm.observation_labels.get_label_name(w)
            if self.emission_counts[w].any():
                for s in xrange(num_states):
                    name_s = self.hmm.state_labels.get_label_name(s)
                    if self.emission_counts[w, s]:
                        yield 'emission %s %s' % (name_w, name_s), \
                            self.emission_counts[w, s]

    def reducer(self, key, counts):
        yield key, sum(counts)
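To see what this first version produces, the job can be run locally with mrjob's inline runner and its key/value output (JSON-encoded by mrjob's default output protocol) printed. This is only a minimal driver sketch: the module name em_step.py, the input file train.txt, and the assumption that word_dict and tag_dict are defined at module level are all hypothetical.

from em_step import EMStep  # hypothetical module holding the listing above

# Run one EM step locally; each output line is a summed partial count keyed
# by its textual name (e.g. 'transition <state> <state>').
job = EMStep(args=['-r', 'inline', 'train.txt'])
with job.make_runner() as runner:
    runner.run()
    for line in runner.stream_output():
        key, value = job.parse_output_line(line)
        print key, value

Reading the parameters from a text file and emitting every count as a separate textual key/value pair is exactly what the next listings replace with pickle-based serialization.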
def __init__(self, *args, **kwargs):
    MRJob.__init__(self, *args, **kwargs)
    from os import path
    filename = 'hmm.pkl'
    if path.exists(filename):
        self.hmm = pickle.loads(
            open(filename).read().decode('string-escape'))
    else:
        # Initialize the HMM parameters randomly.
        self.hmm = HMM(word_dict, tag_dict)
        self.hmm.initialize_random()
    self.log_likelihood = 0
    self.initial_counts = 0
    self.emission_counts = 0
    self.transition_counts = 0
    self.final_counts = 0
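The modified __init__ expects hmm.pkl to contain a string-escaped pickle of the model, so whatever writes that file must encode it the same way. A minimal sketch of the write side, assuming hmm holds the current model:

import pickle

# Serialize the model the way __init__ reads it back: pickle it, then escape
# the bytes to match the decode('string-escape') call above.
with open('hmm.pkl', 'w') as f:
    f.write(pickle.dumps(hmm).encode('string-escape'))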
class EMStep(MRJob):

    INTERNAL_PROTOCOL = PickleProtocol
    OUTPUT_PROTOCOL = PickleValueProtocol

    def __init__(self, *args, **kwargs):
        MRJob.__init__(self, *args, **kwargs)
        from os import path
        filename = 'hmm.pkl'
        if path.exists(filename):
            self.hmm = pickle.loads(
                open(filename).read().decode('string-escape'))
        else:
            # Initialize the HMM parameters randomly.
            self.hmm = HMM(word_dict, tag_dict)
            self.hmm.initialize_random()
        self.log_likelihood = 0
        self.initial_counts = 0
        self.emission_counts = 0
        self.transition_counts = 0
        self.final_counts = 0

    def mapper(self, key, s):
        seq = load_sequence(s, self.hmm.observation_labels,
                            self.hmm.state_labels)
        log_likelihood, initial_counts, transition_counts, final_counts, \
            emission_counts = predict_sequence(seq, self.hmm)
        self.log_likelihood += log_likelihood
        self.initial_counts += initial_counts
        self.emission_counts += emission_counts
        self.transition_counts += transition_counts
        self.final_counts += final_counts

    def mapper_final(self):
        yield 'result', (self.log_likelihood, self.initial_counts,
                         self.transition_counts, self.final_counts,
                         self.emission_counts)

    def reducer(self, key, counts):
        combine_partials(counts, self.hmm)
        self.hmm.compute_parameters()
        yield 'hmm', self.hmm
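With the reducer now emitting the re-estimated model as a single pickled value, a driver can chain EM iterations by rewriting hmm.pkl between runs. The following is only a sketch against mrjob's older stream_output/parse_output_line API; the module name em_step.py, the input file train.txt, and the fixed iteration count are assumptions.

import pickle
from em_step import EMStep  # hypothetical module holding the listing above

for iteration in xrange(10):  # assumed fixed number of EM iterations
    # The inline runner shares the working directory, so the hmm.pkl written
    # below is found by __init__ on the next pass; on Hadoop one would also
    # pass '--file hmm.pkl' so the model is shipped to every task.
    job = EMStep(args=['-r', 'inline', 'train.txt'])
    with job.make_runner() as runner:
        runner.run()
        for line in runner.stream_output():
            _, hmm = job.parse_output_line(line)
    # Re-serialize the updated model, string-escaped to match __init__.
    with open('hmm.pkl', 'w') as f:
        f.write(pickle.dumps(hmm).encode('string-escape'))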