Example 1
from mrjob.job import MRJob
from mrjob.protocol import PickleProtocol

# HMM, word_dict, tag_dict, load_parameters, load_sequence and
# predict_sequence are project-specific helpers assumed to be defined or
# imported in the surrounding module.


class EMStep(MRJob):
    INTERNAL_PROTOCOL = PickleProtocol

    def __init__(self, *args, **kwargs):
        MRJob.__init__(self, *args, **kwargs)

        # Create HMM object.
        self.hmm = HMM(word_dict, tag_dict)

        from os import path
        filename = 'hmm.txt'
        if path.exists(filename):
            # Load the HMM parameters from a text file.
            load_parameters(filename, self.hmm, smoothing=0.1)
        else:
            # Initialize the HMM parameters randomly.
            self.hmm.initialize_random()

        self.log_likelihood = 0
        self.initial_counts = 0
        self.emission_counts = 0
        self.transition_counts = 0
        self.final_counts = 0

    def mapper(self, key, s):
        # E-step for one training sequence: compute its expected counts and
        # accumulate them locally; nothing is emitted until mapper_final.
        seq = load_sequence(s, self.hmm.observation_labels, self.hmm.state_labels)

        log_likelihood, initial_counts, transition_counts, final_counts, emission_counts = predict_sequence(
            seq, self.hmm)

        self.log_likelihood += log_likelihood
        self.initial_counts += initial_counts
        self.emission_counts += emission_counts
        self.transition_counts += transition_counts
        self.final_counts += final_counts

    def mapper_final(self):
        # Emit the locally accumulated counts once per mapper, keyed by
        # parameter name, so the reducer can sum them across mappers.

        num_states = self.hmm.get_num_states()  # Number of states.
        num_observations = self.hmm.get_num_observations()  # Number of observation symbols.

        yield 'log-likelihood', self.log_likelihood
        for y in xrange(num_states):
            name_y = self.hmm.state_labels.get_label_name(y)
            for s in xrange(num_states):
                name_s = self.hmm.state_labels.get_label_name(s)
                yield 'transition %s %s' % (name_y, name_s), self.transition_counts[y, s]
            yield 'final '+name_y, self.final_counts[y]
            yield 'initial '+name_y, self.initial_counts[y]

        for w in xrange(num_observations):
            name_w = self.hmm.observation_labels.get_label_name(w)
            if self.emission_counts[w].any():
                for s in xrange(num_states):
                    name_s = self.hmm.state_labels.get_label_name(s)
                    if self.emission_counts[w, s]:
                        yield 'emission %s %s' % (name_w, name_s), self.emission_counts[w, s]

    def reducer(self, key, counts):
        # Sum the partial counts for one parameter across all mappers.
        yield key, sum(counts)
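
In this variant the reducer emits one text line per model parameter ('initial NAME', 'transition FROM TO', 'final NAME', 'emission WORD TAG') plus a 'log-likelihood' line, and __init__ reloads them from hmm.txt via load_parameters on the next iteration. Below is a minimal driver sketch for closing that loop; the module name em_step.py, the input file train_sequences.txt, and the exact hmm.txt format expected by load_parameters are assumptions not shown in the excerpt, and stream_output / parse_output_line are the older mrjob calls that match the Python 2 style of this code.

# Hypothetical driver for the text-output variant above; file and module
# names are assumptions, not part of the original excerpt.
from em_step import EMStep  # assumed module holding the EMStep class above

job = EMStep(args=['train_sequences.txt'])  # assumed input file of sequences
with job.make_runner() as runner:
    runner.run()
    with open('hmm.txt', 'w') as out:
        for line in runner.stream_output():
            key, value = job.parse_output_line(line)
            if key == 'log-likelihood':
                # Useful for monitoring EM convergence between iterations.
                print 'log-likelihood:', value
            else:
                # Write "name<TAB>count" lines in the shape that
                # load_parameters() is assumed to expect.
                out.write('%s\t%s\n' % (key, value))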
Example 2
    def __init__(self, *args, **kwargs):
        MRJob.__init__(self, *args, **kwargs)

        from os import path
        filename = 'hmm.pkl'
        if path.exists(filename):
            # Load the HMM pickled (string-escaped, one line) by a previous
            # EM iteration.
            self.hmm = pickle.loads(open(filename).read().decode('string-escape'))
        else:
            # Initialize the HMM parameters randomly.
            self.hmm = HMM(word_dict, tag_dict)
            self.hmm.initialize_random()

        self.log_likelihood = 0
        self.initial_counts = 0
        self.emission_counts = 0
        self.transition_counts = 0
        self.final_counts = 0
Example 3
import pickle

from mrjob.job import MRJob
from mrjob.protocol import PickleProtocol, PickleValueProtocol

# HMM, word_dict, tag_dict, load_sequence, predict_sequence and
# combine_partials are project-specific helpers assumed to be defined or
# imported in the surrounding module.


class EMStep(MRJob):
    INTERNAL_PROTOCOL = PickleProtocol
    OUTPUT_PROTOCOL = PickleValueProtocol

    def __init__(self, *args, **kwargs):
        MRJob.__init__(self, *args, **kwargs)

        from os import path
        filename = 'hmm.pkl'
        if path.exists(filename):
            # Load the HMM pickled (string-escaped, one line) by a previous
            # EM iteration.
            self.hmm = pickle.loads(open(filename).read().decode('string-escape'))
        else:
            # Initialize the HMM parameters randomly.
            self.hmm = HMM(word_dict, tag_dict)
            self.hmm.initialize_random()

        self.log_likelihood = 0
        self.initial_counts = 0
        self.emission_counts = 0
        self.transition_counts = 0
        self.final_counts = 0

    def mapper(self, key, s):
        # E-step for one training sequence: accumulate its expected counts
        # locally; they are emitted only in mapper_final.
        seq = load_sequence(s, self.hmm.observation_labels, self.hmm.state_labels)

        log_likelihood, initial_counts, transition_counts, final_counts,\
            emission_counts = predict_sequence(seq, self.hmm)

        self.log_likelihood += log_likelihood
        self.initial_counts += initial_counts
        self.emission_counts += emission_counts
        self.transition_counts += transition_counts
        self.final_counts += final_counts

    def mapper_final(self):
        # Ship all locally accumulated statistics to the reducer in one record.
        yield 'result', (self.log_likelihood,
                         self.initial_counts,
                         self.transition_counts,
                         self.final_counts,
                         self.emission_counts)

    def reducer(self, key, counts):
        # M-step: merge the partial counts from every mapper into the model,
        # re-estimate its parameters, and emit the updated HMM.
        combine_partials(counts, self.hmm)
        self.hmm.compute_parameters()
        yield 'hmm', self.hmm
Example 4
import pickle

from mrjob.job import MRJob
from mrjob.protocol import PickleProtocol, PickleValueProtocol

# HMM, word_dict, tag_dict, load_sequence, predict_sequence and
# combine_partials are project-specific helpers assumed to be defined or
# imported in the surrounding module.


class EMStep(MRJob):
    INTERNAL_PROTOCOL = PickleProtocol
    OUTPUT_PROTOCOL = PickleValueProtocol

    def __init__(self, *args, **kwargs):
        MRJob.__init__(self, *args, **kwargs)

        from os import path
        filename = 'hmm.pkl'
        if path.exists(filename):
            # Load the HMM pickled (string-escaped, one line) by a previous
            # EM iteration.
            self.hmm = pickle.loads(
                open(filename).read().decode('string-escape'))
        else:
            # Initialize the HMM parameters randomly.
            self.hmm = HMM(word_dict, tag_dict)
            self.hmm.initialize_random()

        self.log_likelihood = 0
        self.initial_counts = 0
        self.emission_counts = 0
        self.transition_counts = 0
        self.final_counts = 0

    def mapper(self, key, s):
        # E-step for one training sequence: accumulate its expected counts
        # locally; they are emitted only in mapper_final.
        seq = load_sequence(s, self.hmm.observation_labels,
                            self.hmm.state_labels)

        log_likelihood, initial_counts, transition_counts, final_counts,\
            emission_counts = predict_sequence(seq, self.hmm)

        self.log_likelihood += log_likelihood
        self.initial_counts += initial_counts
        self.emission_counts += emission_counts
        self.transition_counts += transition_counts
        self.final_counts += final_counts

    def mapper_final(self):
        # Ship all locally accumulated statistics to the reducer in one record.
        yield 'result', (self.log_likelihood, self.initial_counts,
                         self.transition_counts, self.final_counts,
                         self.emission_counts)

    def reducer(self, key, counts):
        # M-step: merge the partial counts from every mapper into the model,
        # re-estimate its parameters, and emit the updated HMM.
        combine_partials(counts, self.hmm)
        self.hmm.compute_parameters()
        yield 'hmm', self.hmm
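
The pickle-based variants make the outer EM loop very simple, because the reducer emits the fully re-estimated HMM as a single pickled value. A minimal driver sketch under the same assumptions as before (hypothetical em_step.py module, input file, and iteration count) follows; writing hmm.pkl with the string-escape encoding mirrors how __init__ reads it back.

# Hypothetical EM driver loop for the pickle-output variant; names marked
# as assumptions are not part of the original excerpts.
import pickle

from em_step import EMStep  # assumed module holding the EMStep class above

NUM_ITERATIONS = 10  # assumed; in practice, stop when the log-likelihood converges

for iteration in range(NUM_ITERATIONS):
    job = EMStep(args=['train_sequences.txt'])  # assumed input file
    with job.make_runner() as runner:
        runner.run()
        for line in runner.stream_output():
            # With OUTPUT_PROTOCOL = PickleValueProtocol the output line holds
            # only the pickled value, so the parsed key is None.
            _, new_hmm = job.parse_output_line(line)
    # Persist the re-estimated model the same way PickleValueProtocol does
    # (string-escaped pickle on one line), so that the next iteration's
    # __init__ can load it from hmm.pkl.
    with open('hmm.pkl', 'w') as f:
        f.write(pickle.dumps(new_hmm).encode('string-escape'))

On a real Hadoop cluster the refreshed hmm.pkl would also have to be shipped to the tasks, for example with mrjob's --file option, so that path.exists('hmm.pkl') succeeds in each mapper's working directory.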
Example 5
    def __init__(self, *args, **kwargs):
        MRJob.__init__(self, *args, **kwargs)

        # Create HMM object.
        self.hmm = HMM(word_dict, tag_dict)

        from os import path
        filename = 'hmm.txt'
        if path.exists(filename):
            # Load the HMM parameters from a text file.
            load_parameters(filename, self.hmm, smoothing=0.1)
        else:
            # Initialize the HMM parameters randomly.
            self.hmm.initialize_random()

        self.log_likelihood = 0
        self.initial_counts = 0
        self.emission_counts = 0
        self.transition_counts = 0
        self.final_counts = 0