Code Example #1
File: test-maxent.py Project: tensorspace/nlp-python
    def test_zero_weight(self):
        weights = CounterMap()
        weights['dog'] = Counter({'warm': 2.0})
        labels = set(weights.iterkeys())
        logp = maxent.get_log_probabilities(self.features, weights, labels)

        self.assertEqual(logp['dog'], 0.0)
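With only the 'dog' label in play, the log probability necessarily comes out as log(1) = 0. The tests never show get_log_probabilities itself; the following is a minimal sketch of the log-softmax it appears to compute, with plain dicts standing in for the project's Counter/CounterMap types (an assumption about the interface, not the project's implementation):

from math import exp, log

def sketch_log_probabilities(features, weights, labels):
    # Score each label as the dot product of its weight vector with the feature vector.
    scores = {label: sum(weights.get(label, {}).get(f, 0.0) * v
                         for f, v in features.items())
              for label in labels}
    # Log-normalize: log P(label) = score(label) - log(sum over labels of exp(score)).
    log_total = log(sum(exp(s) for s in scores.values()))
    return {label: s - log_total for label, s in scores.items()}

print(sketch_log_probabilities({'warm': 1.0, 'fuzzy': 1.0}, {'dog': {'warm': 2.0}}, {'dog'}))
# {'dog': 0.0}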
Code Example #2
    def _linear_smooth(cls, labels, fallback_transition, label_history_size):
        transition = CounterMap()
        linear_smoothing_weights = [1.0 - 0.1 * (label_history_size - 1)]
        linear_smoothing_weights.extend(
            0.1 for _ in xrange(label_history_size - 1))

        # This is super inefficient - it should be caching smoothings involving the less-specific counters
        # e.g. smoothed['NN']['CD'] = cnter['NN']['CD'] * \lambda * smoothed['NN'] and so on
        all_label_histories = set(permutations(labels, label_history_size - 1))
        for label_history in all_label_histories:
            histories = [
                history for history in (label_history[i:]
                                        for i in xrange(label_history_size))
            ]
            # >>> label_history = ('WDT', 'RBR')
            # histories = [('WDT', 'RBR'), ('RBR')]

            history_strings = ['::'.join(history) for history in histories]
            history_scores = [
                fallback_transition[len(history)][history_string]
                for history, history_string in izip(histories, history_strings)
            ]

            transition[history_strings[0]] = Counter()
            for smoothing, history_score in izip(linear_smoothing_weights,
                                                 history_scores):
                transition[history_strings[0]] += history_score * smoothing

        transition.normalize()

        return transition
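The smoothing weights above implement plain linear interpolation: the most specific history gets 1.0 - 0.1 * (n - 1) and each shorter fallback history gets 0.1, so the weights always sum to one. A standalone illustration (hypothetical helper, not part of the project):

def smoothing_weights(label_history_size):
    # Bulk of the mass on the most specific history, 0.1 on each fallback.
    weights = [1.0 - 0.1 * (label_history_size - 1)]
    weights.extend(0.1 for _ in range(label_history_size - 1))
    return weights

print(smoothing_weights(2))  # [0.9, 0.1]
print(smoothing_weights(3))  # [0.8, 0.1, 0.1]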
Code Example #3
File: test-maxent.py Project: tensorspace/nlp-python
    def test_extraneous_label(self):
        weights = CounterMap()
        weights['dog'] = Counter({'warm': 2.0, 'fuzzy': 0.5})
        labels = set(weights.iterkeys())
        logp = maxent.get_log_probabilities(self.features, weights, labels)

        self.assertEqual(logp['cat'], float('-inf'))
Code Example #4
File: test-maxent.py Project: beckgom/python-nlp
	def test_zero_weight(self):
		weights = CounterMap()
		weights['dog'] = Counter({'warm' : 2.0})
		labels = set(weights.iterkeys())
		logp = maxent.get_log_probabilities(self.features, weights, labels)

		self.assertEqual(logp['dog'], 0.0)
Code Example #5
File: test-maxent.py Project: beckgom/python-nlp
	def test_extraneous_label(self):
		weights = CounterMap()
		weights['dog'] = Counter({'warm' : 2.0, 'fuzzy' : 0.5})
		labels = set(weights.iterkeys())
		logp = maxent.get_log_probabilities(self.features, weights, labels)

		self.assertEqual(logp['cat'], float('-inf'))
Code Example #6
File: test-maxent.py Project: tensorspace/nlp-python
    def setUp(self):
        self.features = Counter((key, 1.0) for key in ['warm', 'fuzzy'])

        self.weights = CounterMap()
        self.weights['dog'] = Counter({'warm': 2.0, 'fuzzy': 0.5})
        self.weights['cat'] = Counter({'warm': 0.5, 'fuzzy': 2.0})

        self.labels = set(self.weights.iterkeys())
        self.logp = maxent.get_log_probabilities(self.features, self.weights,
                                                 self.labels)
Code Example #7
    def value_and_gradient(self, weights, verbose=False):
        if weights == self.last_vg_weights:
            return self.last_vg
        objective = 0.0
        gradient = CounterMap()

        if verbose:
            print "Calculating log probabilities and objective..."

        # log_prob
        log_probs = list()
        for pos, (label, features) in enumerate(self.labeled_extracted_features):
            log_probs.append(get_log_probs(features, weights, self.labels))
            assert (
                abs(sum(exp(log_probs[pos][label]) for label in self.labels) - 1.0) < 0.0001
            ), "Not a distribution: P[any | features] = %f" % (sum(exp(log_probs[pos][label]) for label in self.labels))

        objective = -sum(log_prob[label] for (log_prob, (label, _)) in zip(log_probs, self.labeled_extracted_features))

        if verbose:
            print "Raw objective: %f" % objective

        if verbose:
            print "Calculating expected counts..."

        expected_counts = get_expected_counts(self.labeled_extracted_features, self.labels, log_probs, CounterMap())

        if verbose:
            print "Calculating gradient..."

        gradient = expected_counts - self.empirical_counts

        if verbose:
            print "Applying penalty"

        # Apply a penalty (e.g. smooth the results)
        if self.sigma:
            penalty = 0.0

            for label, feature_weights in gradient.iteritems():
                for feature in feature_weights:
                    weight = weights[label][feature]
                    penalty += weight ** 2
                    gradient[label][feature] += weight / (self.sigma ** 2)

            penalty /= 2 * self.sigma ** 2
            objective += penalty
            if verbose:
                print "Penalized objective: %f" % objective

        self.last_vg_weights = weights
        self.last_vg = (objective, gradient)
        return (objective, gradient)
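The sigma branch above is the usual Gaussian (L2) prior on the weights: each weight w adds w^2 / (2 * sigma^2) to the objective and w / sigma^2 to its gradient entry. A tiny standalone illustration with hypothetical numbers:

sigma = 1.0
weights = {'dog': {'warm': 2.0, 'fuzzy': 0.5}}
penalty = sum(w ** 2 for feats in weights.values() for w in feats.values()) / (2 * sigma ** 2)
gradient_penalty = {label: {f: w / sigma ** 2 for f, w in feats.items()}
                    for label, feats in weights.items()}
print(penalty)            # 2.125 = (2.0**2 + 0.5**2) / 2
print(gradient_penalty)   # with sigma == 1 the penalty gradient equals the weights themselves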
Code Example #8
    def value_and_gradient(self, weights, verbose=False):
        if weights == self.last_vg_weights:
            return self.last_vg
        objective = 0.0
        gradient = CounterMap()

        if verbose: print "Calculating log probabilities and objective..."

        # log_prob
        log_probs = list()
        for pos, (label,
                  features) in enumerate(self.labeled_extracted_features):
            log_probs.append(get_log_probs(features, weights, self.labels))
            assert abs(
                sum(exp(log_probs[pos][label]) for label in self.labels) -
                1.0) < 0.0001, "Not a distribution: P[any | features] = %f" % (
                    sum(exp(log_probs[pos][label]) for label in self.labels))

        objective = -sum(log_prob[label] for (log_prob, (
            label, _)) in zip(log_probs, self.labeled_extracted_features))

        if verbose: print "Raw objective: %f" % objective

        if verbose: print "Calculating expected counts..."

        expected_counts = get_expected_counts(self.labeled_extracted_features,
                                              self.labels, log_probs,
                                              CounterMap())

        if verbose: print "Calculating gradient..."

        gradient = expected_counts - self.empirical_counts

        if verbose: print "Applying penalty"

        # Apply a penalty (e.g. smooth the results)
        if self.sigma:
            penalty = 0.0

            for label, feature_weights in gradient.iteritems():
                for feature in feature_weights:
                    weight = weights[label][feature]
                    penalty += weight**2
                    gradient[label][feature] += (weight / (self.sigma**2))

            penalty /= 2 * self.sigma**2
            objective += penalty
            if verbose: print "Penalized objective: %f" % objective

        self.last_vg_weights = weights
        self.last_vg = (objective, gradient)
        return (objective, gradient)
Code Example #9
	def train(self, labeled_data):
		self.feature_distribution = CounterMap()
		labels = set()

		for label, datum in labeled_data:
			labels.add(label)
			for feature in ngrams(datum, 3):
				self.feature_distribution[feature][label] += 1

		for feature in self.feature_distribution.iterkeys():
			self.feature_distribution[feature].default = 0.01

		self.feature_distribution.normalize()
		self.feature_distribution.log()
Code Example #10
File: hmm.py Project: beckgom/python-nlp
	def __init__(self, label_history_size=2):
		# Distribution over next state given current state
		self.labels = list()
		self.label_history_size = label_history_size
		self.transition = CounterMap()
		self.reverse_transition = CounterMap() # same as transitions but indexed in reverse (useful for decoding)

		self.fallback_emissions_model = None
		self.fallback_transition = None
		self.fallback_reverse_transition = None

		# Multinomial distribution over emissions given label
		self.emission = CounterMap()
		# p(label | emission)
		self.label_emissions = CounterMap()
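Almost every example on this page builds on CounterMap. From the way it is used (two-level indexing that never raises KeyError, plus normalize(), log() and inverted()), it behaves like a dictionary that creates a Counter on demand for each missing key. A rough stand-in using only the standard library, offered as an assumption about the interface rather than the project's actual implementation:

from collections import defaultdict, Counter

class SketchCounterMap(defaultdict):
    """Rough stand-in: a dict of Counters created on demand."""
    def __init__(self):
        super(SketchCounterMap, self).__init__(Counter)

    def normalize(self):
        # Turn each inner Counter into a probability distribution over its keys.
        for key, counter in self.items():
            total = float(sum(counter.values()))
            if total:
                self[key] = Counter({k: v / total for k, v in counter.items()})

cm = SketchCounterMap()
cm['NN']['DT'] += 2.0
cm['NN']['JJ'] += 1.0
cm.normalize()
print(cm['NN'])  # Counter({'DT': 0.666..., 'JJ': 0.333...})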
Code Example #11
File: test-maxent.py Project: beckgom/python-nlp
	def setUp(self):
		self.features = Counter((key, 1.0) for key in ['warm', 'fuzzy'])

		self.weights = CounterMap()
		self.weights['dog'] = Counter({'warm' : 2.0, 'fuzzy' : 0.5})
		self.weights['cat'] = Counter({'warm' : 0.5, 'fuzzy' : 2.0})

		self.labels = set(self.weights.iterkeys())
		self.logp = maxent.get_log_probabilities(self.features, self.weights, self.labels)
Code Example #12
def slow_expected_counts(labeled_extracted_features, labels, log_probs):
    expected_counts = CounterMap()

    for (index, (_, datum_features)) in enumerate(labeled_extracted_features):
        for (feature, cnt) in datum_features.iteritems():
            for label in labels:
                expected_counts[label][feature] += exp(
                    log_probs[index][label]) * cnt

    return expected_counts
Code Example #13
File: test-maxent.py Project: tensorspace/nlp-python
    def test_uneven_weights(self):
        weights = CounterMap()
        weights['dog'] = Counter({'warm': 2.0, 'fuzzy': 1.0})
        weights['cat'] = Counter({'warm': 1.0, 'fuzzy': 1.0})
        labels = set(weights.iterkeys())
        logp = maxent.get_log_probabilities(self.features, weights, labels)

        # construct scores
        scores = Counter()
        scores['dog'] = 2.0 * 1.0 + 1.0 * 1.0
        scores['cat'] = 1.0 * 1.0 + 1.0 * 1.0
        scores.log_normalize()

        # check scores explicitly
        self.assertAlmostEqual(scores['dog'], log(0.731), 3)
        self.assertAlmostEqual(scores['cat'], log(0.269), 3)

        # check that log probs is correct
        self.assertEqual(logp['dog'], scores['dog'])
        self.assertEqual(logp['cat'], scores['cat'])
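The expected values in this test follow from a two-way softmax over the raw scores 3.0 ('dog') and 2.0 ('cat'): e^3 / (e^3 + e^2) is roughly 0.731 and e^2 / (e^3 + e^2) roughly 0.269. A quick standalone check with plain math:

from math import exp

dog_score = 2.0 * 1.0 + 1.0 * 1.0   # 3.0
cat_score = 1.0 * 1.0 + 1.0 * 1.0   # 2.0
total = exp(dog_score) + exp(cat_score)
print(exp(dog_score) / total)   # 0.7310..., so log_normalize() puts 'dog' at log(0.731) to 3 places
print(exp(cat_score) / total)   # 0.2689..., so 'cat' lands at log(0.269) to 3 places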
Code Example #14
File: test-maxent.py Project: beckgom/python-nlp
	def test_uneven_weights(self):
		weights = CounterMap()
		weights['dog'] = Counter({'warm' : 2.0, 'fuzzy' : 1.0})
		weights['cat'] = Counter({'warm' : 1.0, 'fuzzy' : 1.0})
		labels = set(weights.iterkeys())
		logp = maxent.get_log_probabilities(self.features, weights, labels)

		# construct scores
		scores = Counter()
		scores['dog'] = 2.0 * 1.0 + 1.0 * 1.0
		scores['cat'] = 1.0 * 1.0 + 1.0 * 1.0
		scores.log_normalize()

		# check scores explicitly
		self.assertAlmostEqual(scores['dog'], log(0.731), 3)
		self.assertAlmostEqual(scores['cat'], log(0.269), 3)

		# check that log probs is correct
		self.assertEqual(logp['dog'], scores['dog'])
		self.assertEqual(logp['cat'], scores['cat'])
Code Example #15
    def _gather_colocation_counts(self, files):
        files = [open(path) for path in files]
        triples = chain(*[self._file_triples(file) for file in files])

        pre_counts = CounterMap()
        post_counts = CounterMap()
        full_counts = CounterMap()

        for pre, word, post in triples:
            full_context = '::'.join(pre + post)
            pre_context = '::'.join(pre)
            post_context = '::'.join(post)

            pre_counts[word][pre_context] += 1
            post_counts[word][post_context] += 1
            full_counts[word][full_context] += 1

        for file in files:
            file.close()

        return pre_counts, post_counts, full_counts
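Assuming _file_triples yields (pre, word, post) tuples whose contexts are sequences of tokens, the three keys built above are just '::'-joined context strings. A small made-up example:

pre, word, post = ('the', 'big'), 'dog', ('ran', 'away')
print('::'.join(pre))          # 'the::big'
print('::'.join(post))         # 'ran::away'
print('::'.join(pre + post))   # 'the::big::ran::away'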
Code Example #16
File: test-maxent.py Project: tensorspace/nlp-python
    def test_fast_slow_equal(self):
        weights = CounterMap()
        weights['cat'] = Counter(
            (key, 1.0)
            for key in ('fuzzy', 'claws', 'small', 'medium', 'large'))
        weights['bear'] = Counter(
            (key, 1.0)
            for key in ('fuzzy', 'claws', 'small', 'medium', 'large'))

        log_probs = [
            maxent.get_log_probabilities(datum[1], weights, self.labels)
            for datum in self.labeled_extracted_features
        ]

        slow_expectation = maximumentropy.slow_expected_counts(
            self.labeled_extracted_features, self.labels, log_probs)
        fast_expectation = maxent.get_expected_counts(
            self.labeled_extracted_features, self.labels, log_probs,
            CounterMap())

        self.assertEqual(slow_expectation, fast_expectation)

        # And try again with different weights
        weights['cat'] = Counter(
            (key, 1.0) for key in ('fuzzy', 'claws', 'small', 'medium'))
        weights['bear'] = Counter(
            (key, 1.0) for key in ('fuzzy', 'claws', 'big'))

        log_probs = [
            maxent.get_log_probabilities(datum[1], weights, self.labels)
            for datum in self.labeled_extracted_features
        ]

        slow_expectation = maximumentropy.slow_expected_counts(
            self.labeled_extracted_features, self.labels, log_probs)
        fast_expectation = maxent.get_expected_counts(
            self.labeled_extracted_features, self.labels, log_probs,
            CounterMap())

        self.assertEqual(slow_expectation, fast_expectation)
Code Example #17
    def __init__(self, labeled_extracted_features, labels, features):
        self.labeled_extracted_features = labeled_extracted_features
        self.labels = labels
        self.features = features
        self.empirical_counts = CounterMap()

        print "Calculating empirical counts..."

        for (index,
             (datum_label,
              datum_features)) in enumerate(self.labeled_extracted_features):
            for (feature, cnt) in datum_features.iteritems():
                self.empirical_counts[datum_label][feature] += cnt
Code Example #18
class NaiveBayesClassifier:
	def train(self, labeled_data):
		self.feature_distribution = CounterMap()
		labels = set()

		for label, datum in labeled_data:
			labels.add(label)
			for feature in ngrams(datum, 3):
				self.feature_distribution[feature][label] += 1

		for feature in self.feature_distribution.iterkeys():
			self.feature_distribution[feature].default = 0.01

		self.feature_distribution.normalize()
		self.feature_distribution.log()

	def label_distribution(self, datum):
		distribution = None

		for feature in ngrams(datum, 3):
			if distribution:
				distribution += self.feature_distribution[feature]
			else:
				distribution = copy(self.feature_distribution[feature])

		distribution.log_normalize()

		return distribution

	def label(self, datum):
		distribution = None

		for feature in ngrams(datum, 3):
			if distribution:
				distribution += self.feature_distribution[feature]
			else:
				distribution = copy(self.feature_distribution[feature])

		return distribution.arg_max()
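A hypothetical usage sketch for the classifier above, assuming ngrams(datum, 3) yields character trigrams and that the project's Counter/CounterMap types and the copy helper are importable; the labels and strings here are made up for illustration:

classifier = NaiveBayesClassifier()
classifier.train([('greeting', 'hello there'),
                  ('greeting', 'hi there'),
                  ('farewell', 'goodbye now')])
print(classifier.label('hello again'))               # expected: 'greeting'
print(classifier.label_distribution('hello again'))  # log-normalized Counter over both labels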
Code Example #19
File: hmm.py Project: beckgom/python-nlp
	def _linear_smooth(cls, labels, fallback_transition, label_history_size):
		transition = CounterMap()
		linear_smoothing_weights = [1.0 - 0.1 * (label_history_size-1)]
		linear_smoothing_weights.extend(0.1 for _ in xrange(label_history_size-1))

		# This is super inefficient - it should be caching smoothings involving the less-specific counters
		# e.g. smoothed['NN']['CD'] = cnter['NN']['CD'] * \lambda * smoothed['NN'] and so on
		all_label_histories = set(permutations(labels, label_history_size-1))
		for label_history in all_label_histories:
			histories = [history for history in (label_history[i:] for i in xrange(label_history_size))]
			# >>> label_history = ('WDT', 'RBR')
			# histories = [('WDT', 'RBR'), ('RBR')]

			history_strings = ['::'.join(history) for history in histories]
			history_scores = [fallback_transition[len(history)][history_string] for history, history_string in izip(histories, history_strings)]

			transition[history_strings[0]] = Counter()
			for smoothing, history_score in izip(linear_smoothing_weights, history_scores):
				transition[history_strings[0]] += history_score * smoothing

		transition.normalize()

		return transition
Code Example #20
    def train_with_features(self, labeled_features, sigma=None, quiet=False):
        print "Optimizing weights..."
        weight_function = MaxEntWeightFunction(labeled_features, self.labels,
                                               self.features)
        weight_function.sigma = sigma

        print "Building initial dictionary..."
        initial_weights = CounterMap()

        print "Training on %d labelled features" % (len(labeled_features))

        print "Minimizing..."
        self.weights = Minimizer.minimize(weight_function,
                                          initial_weights,
                                          quiet=quiet)
Code Example #21
    def __init__(self, label_history_size=2):
        # Distribution over next state given current state
        self.labels = list()
        self.label_history_size = label_history_size
        self.transition = CounterMap()
        # same as transitions but indexed in reverse (useful for decoding)
        self.reverse_transition = CounterMap()

        self.fallback_emissions_model = None
        self.fallback_transition = None
        self.fallback_reverse_transition = None

        # Multinomial distribution over emissions given label
        self.emission = CounterMap()
        # p(label | emission)
        self.label_emissions = CounterMap()
Code Example #22
class HiddenMarkovModel:
    def __init__(self, label_history_size=2):
        # Distribution over next state given current state
        self.labels = list()
        self.label_history_size = label_history_size
        self.transition = CounterMap()
        # same as transitions but indexed in reverse (useful for decoding)
        self.reverse_transition = CounterMap()

        self.fallback_emissions_model = None
        self.fallback_transition = None
        self.fallback_reverse_transition = None

        # Multinomial distribution over emissions given label
        self.emission = CounterMap()
        # p(label | emission)
        self.label_emissions = CounterMap()

    def _pad_sequence(self, sequence, pairs=False):
        if pairs: yield (START_LABEL, START_LABEL)
        else: yield START_LABEL

        for item in sequence:
            yield item

        # Pad the end so we'll decode the whole thing
        for _ in xrange(self.label_history_size):
            if pairs: yield (STOP_LABEL, STOP_LABEL)
            else: yield STOP_LABEL

    @classmethod
    def _extend_labels(cls, sequence, label_history_size):
        '''
		>>> foo = HiddenMarkovModel()
		>>> foo._extend_labels((('A', 3), ('B', 4), ('C', 5)), 1)
		[('A', (), 3), ('B', (), 4), ('C', (), 5)]
		>>> foo._extend_labels((('A', 3), ('B', 4), ('C', 5)), 2)
		[('A', ('<START>',), 3),
		 ('B', ('A',), 4),
		 ('C', ('B',), 5)]
		'''
        last_labels = [START_LABEL for _ in xrange(label_history_size)]

        for label, emission in sequence:
            last_labels.append(label)
            last_labels.pop(0)

            if label == START_LABEL:
                last_labels = [START_LABEL for _ in xrange(label_history_size)]

            all_labels = ('::'.join(last_labels[label_history_size - length -
                                                2:-1])
                          for length in xrange(label_history_size - 1))
            yield (label, tuple(all_labels), emission)

    @property
    def start_label(self):
        return '::'.join(repeat(START_LABEL, self.label_history_size))

    @property
    def stop_label(self):
        return '::'.join(repeat(STOP_LABEL, self.label_history_size))

    def push_label(self, history, label):
        return '::'.join(history.split('::')[1:] + [
            label,
        ])

    @classmethod
    def _linear_smooth(cls, labels, fallback_transition, label_history_size):
        transition = CounterMap()
        linear_smoothing_weights = [1.0 - 0.1 * (label_history_size - 1)]
        linear_smoothing_weights.extend(
            0.1 for _ in xrange(label_history_size - 1))

        # This is super inefficient - it should be caching smoothings involving the less-specific counters
        # e.g. smoothed['NN']['CD'] = cnter['NN']['CD'] * \lambda * smoothed['NN'] and so on
        all_label_histories = set(permutations(labels, label_history_size - 1))
        for label_history in all_label_histories:
            histories = [
                history for history in (label_history[i:]
                                        for i in xrange(label_history_size))
            ]
            # >>> label_history = ('WDT', 'RBR')
            # histories = [('WDT', 'RBR'), ('RBR')]

            history_strings = ['::'.join(history) for history in histories]
            history_scores = [
                fallback_transition[len(history)][history_string]
                for history, history_string in izip(histories, history_strings)
            ]

            transition[history_strings[0]] = Counter()
            for smoothing, history_score in izip(linear_smoothing_weights,
                                                 history_scores):
                transition[history_strings[0]] += history_score * smoothing

        transition.normalize()

        return transition

    def train(self,
              labeled_sequence,
              fallback_model=None,
              fallback_training_limit=None,
              use_linear_smoothing=True):
        label_counts = [Counter() for _ in xrange(self.label_history_size)]
        self.fallback_transition = [
            CounterMap() for _ in xrange(self.label_history_size)
        ]
        self.fallback_reverse_transition = [
            CounterMap() for _ in xrange(self.label_history_size)
        ]

        labeled_sequence = self._pad_sequence(labeled_sequence, pairs=True)
        labeled_sequence = list(
            HiddenMarkovModel._extend_labels(labeled_sequence,
                                             self.label_history_size + 1))

        # Load emission and transition counters from the raw data
        for label, label_histories, emission in labeled_sequence:
            full_label = self.push_label(label_histories[-1], label)

            self.emission[full_label][emission] += 1.0
            self.label_emissions[emission][full_label] += 1.0

            for history_size, label_history in enumerate(label_histories):
                label_counts[history_size][label_history] += 1.0
                self.fallback_transition[history_size][label_history][
                    full_label] += 1.0

        # Make the counters distributions
        for transition in self.fallback_transition:
            transition.normalize()
        self.label_emissions.normalize()
        self.emission.normalize()
        self.labels = self.emission.keys()

        # Smooth transitions using fallback data
        # Doesn't work with label history size 1!
        if use_linear_smoothing and self.label_history_size > 1:
            self.transition = \
             HiddenMarkovModel._linear_smooth(self.labels,
                      self.fallback_transition,
                      self.label_history_size)
        else:
            self.transition = self.fallback_transition[-1]

        # Convert to log score counters
        self.transition.log()
        self.label_emissions.log()
        self.emission.log()

        self.reverse_transition = self.transition.inverted()

        # Train the fallback model on the label-emission pairs
        if fallback_model:
            try:
                start = time()
                pickle_file = open("fallback_model.pickle")
                self.fallback_emissions_model, training_pairs_length = pickle.load(
                    pickle_file)
                pickle_file.close()

                if fallback_training_limit and fallback_training_limit != training_pairs_length:
                    raise IOError()
                elif not fallback_training_limit and len(
                        labeled_sequence) != training_pairs_length:
                    raise IOError()

                print "Unpickling fallback model: %f" % (time() - start)
            except (IOError, EOFError), e:
                print "Training fallback model"
                self.fallback_emissions_model = fallback_model()

                emissions_training_pairs = [
                    (emission_history[-1] + '::' + label, emission)
                    for label, emission_history, emission in labeled_sequence
                    if label != START_LABEL and label != STOP_LABEL
                ]

                if fallback_training_limit:
                    emissions_training_pairs = islice(emissions_training_pairs,
                                                      fallback_training_limit)

                self.fallback_emissions_model.train(emissions_training_pairs)

                serialized = (self.fallback_emissions_model,
                              len(labeled_sequence))
                pickle_file = open("fallback_model.pickle", "w")
                pickle.dump(serialized,
                            pickle_file,
                            protocol=pickle.HIGHEST_PROTOCOL)
                pickle_file.close()

        self._post_training()
Code Example #23
File: test-maxent.py Project: beckgom/python-nlp
class MaximumEntropyLogProbsTest(unittest.TestCase):
	def setUp(self):
		self.features = Counter((key, 1.0) for key in ['warm', 'fuzzy'])

		self.weights = CounterMap()
		self.weights['dog'] = Counter({'warm' : 2.0, 'fuzzy' : 0.5})
		self.weights['cat'] = Counter({'warm' : 0.5, 'fuzzy' : 2.0})

		self.labels = set(self.weights.iterkeys())
		self.logp = maxent.get_log_probabilities(self.features, self.weights, self.labels)

	def test_fast_slow_equal(self):
		slow_logp = maximumentropy.slow_log_probs(self.features, self.weights, self.labels)

		self.assertEqual(self.logp, slow_logp)

	def test_logp_is_probability_distribution(self):
		"""
		Verify that all log probs are <= 0 and total probability is 1.0
		"""
		self.assertTrue(max(self.logp.itervalues()) <= 0.0)
		self.assertAlmostEqual(sum(exp(val) for val in self.logp.itervalues()), 1.0)

	def test_basic_values(self):
		"""
		Are the log probs as expected?
		"""
		self.assertAlmostEqual(exp(self.logp['cat']), 0.5)
		self.assertAlmostEqual(exp(self.logp['dog']), 0.5)

	def test_single_label(self):
		weights = CounterMap()
		weights['dog'] = Counter({'warm' : 2.0, 'fuzzy' : 0.5})
		labels = set(weights.iterkeys())
		logp = maxent.get_log_probabilities(self.features, weights, labels)

		self.assertEqual(logp['dog'], 0.0)

	def test_extraneous_label(self):
		weights = CounterMap()
		weights['dog'] = Counter({'warm' : 2.0, 'fuzzy' : 0.5})
		labels = set(weights.iterkeys())
		logp = maxent.get_log_probabilities(self.features, weights, labels)

		self.assertEqual(logp['cat'], float('-inf'))

	def test_zero_weight(self):
		weights = CounterMap()
		weights['dog'] = Counter({'warm' : 2.0})
		labels = set(weights.iterkeys())
		logp = maxent.get_log_probabilities(self.features, weights, labels)

		self.assertEqual(logp['dog'], 0.0)
		
	def test_uneven_weights(self):
		weights = CounterMap()
		weights['dog'] = Counter({'warm' : 2.0, 'fuzzy' : 1.0})
		weights['cat'] = Counter({'warm' : 1.0, 'fuzzy' : 1.0})
		labels = set(weights.iterkeys())
		logp = maxent.get_log_probabilities(self.features, weights, labels)

		# construct scores
		scores = Counter()
		scores['dog'] = 2.0 * 1.0 + 1.0 * 1.0
		scores['cat'] = 1.0 * 1.0 + 1.0 * 1.0
		scores.log_normalize()

		# check scores explicitly
		self.assertAlmostEqual(scores['dog'], log(0.731), 3)
		self.assertAlmostEqual(scores['cat'], log(0.269), 3)

		# check that log probs is correct
		self.assertEqual(logp['dog'], scores['dog'])
		self.assertEqual(logp['cat'], scores['cat'])

	def test_performance(self):
		"""
		C API should be faster than the Python API (this is potentially flaky, depending on system load patterns)
		"""
		start = time.time()
		for i in xrange(100000):
			test = maximumentropy.slow_log_probs(self.features, self.weights, self.labels)

		slow_time = time.time() - start

		start = time.time()
		for i in xrange(100000):
			test = maxent.get_log_probabilities(self.features, self.weights, self.labels)

		fast_time = time.time() - start

		self.assertTrue(fast_time < slow_time)
Code Example #24
File: perftimings.py Project: tensorspace/nlp-python
def countermap_init(iter_src):
    test_countermap = CounterMap()
    for i in iter_src:
        test_countermap[i] += 1
    return test_countermap
Code Example #25
File: test-maxent.py Project: tensorspace/nlp-python
class MaximumEntropyLogProbsTest(unittest.TestCase):
    def setUp(self):
        self.features = Counter((key, 1.0) for key in ['warm', 'fuzzy'])

        self.weights = CounterMap()
        self.weights['dog'] = Counter({'warm': 2.0, 'fuzzy': 0.5})
        self.weights['cat'] = Counter({'warm': 0.5, 'fuzzy': 2.0})

        self.labels = set(self.weights.iterkeys())
        self.logp = maxent.get_log_probabilities(self.features, self.weights,
                                                 self.labels)

    def test_fast_slow_equal(self):
        slow_logp = maximumentropy.slow_log_probs(self.features, self.weights,
                                                  self.labels)

        self.assertEqual(self.logp, slow_logp)

    def test_logp_is_probability_distribution(self):
        """
		Verify that all log probs are <= 0 and total probability is 1.0
		"""
        self.assertTrue(max(self.logp.itervalues()) <= 0.0)
        self.assertAlmostEqual(sum(exp(val) for val in self.logp.itervalues()),
                               1.0)

    def test_basic_values(self):
        """
		Are the log probs as expected?
		"""
        self.assertAlmostEqual(exp(self.logp['cat']), 0.5)
        self.assertAlmostEqual(exp(self.logp['dog']), 0.5)

    def test_single_label(self):
        weights = CounterMap()
        weights['dog'] = Counter({'warm': 2.0, 'fuzzy': 0.5})
        labels = set(weights.iterkeys())
        logp = maxent.get_log_probabilities(self.features, weights, labels)

        self.assertEqual(logp['dog'], 0.0)

    def test_extraneous_label(self):
        weights = CounterMap()
        weights['dog'] = Counter({'warm': 2.0, 'fuzzy': 0.5})
        labels = set(weights.iterkeys())
        logp = maxent.get_log_probabilities(self.features, weights, labels)

        self.assertEqual(logp['cat'], float('-inf'))

    def test_zero_weight(self):
        weights = CounterMap()
        weights['dog'] = Counter({'warm': 2.0})
        labels = set(weights.iterkeys())
        logp = maxent.get_log_probabilities(self.features, weights, labels)

        self.assertEqual(logp['dog'], 0.0)

    def test_uneven_weights(self):
        weights = CounterMap()
        weights['dog'] = Counter({'warm': 2.0, 'fuzzy': 1.0})
        weights['cat'] = Counter({'warm': 1.0, 'fuzzy': 1.0})
        labels = set(weights.iterkeys())
        logp = maxent.get_log_probabilities(self.features, weights, labels)

        # construct scores
        scores = Counter()
        scores['dog'] = 2.0 * 1.0 + 1.0 * 1.0
        scores['cat'] = 1.0 * 1.0 + 1.0 * 1.0
        scores.log_normalize()

        # check scores explicitly
        self.assertAlmostEqual(scores['dog'], log(0.731), 3)
        self.assertAlmostEqual(scores['cat'], log(0.269), 3)

        # check that log probs is correct
        self.assertEqual(logp['dog'], scores['dog'])
        self.assertEqual(logp['cat'], scores['cat'])

    def test_performance(self):
        """
		C API should be faster than the Python API (this is potentially flaky, depending on system load patterns)
		"""
        start = time.time()
        for i in xrange(100000):
            test = maximumentropy.slow_log_probs(self.features, self.weights,
                                                 self.labels)

        slow_time = time.time() - start

        start = time.time()
        for i in xrange(100000):
            test = maxent.get_log_probabilities(self.features, self.weights,
                                                self.labels)

        fast_time = time.time() - start

        self.assertTrue(fast_time < slow_time)
Code Example #26
from itertools import izip, repeat, chain

from maxent import get_log_probabilities, get_expected_counts
from countermap import CounterMap
from counter import Counter


def cnter(l):
    return Counter(izip(l, repeat(1.0, len(l))))


training_data = (('cat', cnter(
    ('fuzzy', 'claws', 'small'))), ('bear', cnter(
        ('fuzzy', 'claws', 'big'))), ('cat', cnter(('claws', 'medium'))))

labels = set([label for label, _ in training_data])
features = set()
for _, counter in training_data:
    features.update(set(counter.keys()))

weights = CounterMap()

log_probs = list()
for pos, (label, features) in enumerate(training_data):
    log_probs.append(get_log_probabilities(features, weights, labels))

test = get_expected_counts(training_data, labels, log_probs, CounterMap())

print test
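With an empty CounterMap of weights, every label scores zero, so each datum's probability mass is split evenly across the labels and the expected count of a feature is just its observed count divided by the number of labels, summed over the data. A plain-dict sketch of that special case (not the project's get_expected_counts):

def uniform_expected_counts(training_data, labels):
    # Expected feature counts when P(label | features) is uniform over the labels.
    expected = {}
    for _, datum_features in training_data:
        for feature, count in datum_features.items():
            for label in labels:
                expected.setdefault(label, {}).setdefault(feature, 0.0)
                expected[label][feature] += count / float(len(labels))
    return expected

print(uniform_expected_counts([('cat', {'fuzzy': 1.0, 'claws': 1.0})], {'cat', 'bear'}))
# {'cat': {'fuzzy': 0.5, 'claws': 0.5}, 'bear': {'fuzzy': 0.5, 'claws': 0.5}}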
Code Example #27
File: hmm.py Project: beckgom/python-nlp
class HiddenMarkovModel:
	def __init__(self, label_history_size=2):
		# Distribution over next state given current state
		self.labels = list()
		self.label_history_size = label_history_size
		self.transition = CounterMap()
		self.reverse_transition = CounterMap() # same as transitions but indexed in reverse (useful for decoding)

		self.fallback_emissions_model = None
		self.fallback_transition = None
		self.fallback_reverse_transition = None

		# Multinomial distribution over emissions given label
		self.emission = CounterMap()
		# p(label | emission)
		self.label_emissions = CounterMap()

	def _pad_sequence(self, sequence, pairs=False):
		if pairs: yield (START_LABEL, START_LABEL)
		else: yield START_LABEL

		for item in sequence: yield item

		# Pad the end so we'll decode the whole thing
		for _ in xrange(self.label_history_size):
			if pairs: yield (STOP_LABEL, STOP_LABEL)
			else: yield STOP_LABEL

	@classmethod
	def _extend_labels(cls, sequence, label_history_size):
		'''
		>>> foo = HiddenMarkovModel()
		>>> foo._extend_labels((('A', 3), ('B', 4), ('C', 5)), 1)
		[('A', (), 3), ('B', (), 4), ('C', (), 5)]
		>>> foo._extend_labels((('A', 3), ('B', 4), ('C', 5)), 2)
		[('A', ('<START>',), 3),
		 ('B', ('A',), 4),
		 ('C', ('B',), 5)]
		'''
		last_labels = [START_LABEL for _ in xrange(label_history_size)]

		for label, emission in sequence:
			last_labels.append(label)
			last_labels.pop(0)

			if label == START_LABEL:
				last_labels = [START_LABEL for _ in xrange(label_history_size)]

			all_labels = ('::'.join(last_labels[label_history_size-length-2:-1])
						  for length in xrange(label_history_size-1))
			yield (label, tuple(all_labels), emission)

	@property
	def start_label(self):
		return '::'.join(repeat(START_LABEL, self.label_history_size))

	@property
	def stop_label(self):
		return '::'.join(repeat(STOP_LABEL, self.label_history_size))

	def push_label(self, history, label):
		return '::'.join(history.split('::')[1:] + [label,])

	@classmethod
	def _linear_smooth(cls, labels, fallback_transition, label_history_size):
		transition = CounterMap()
		linear_smoothing_weights = [1.0 - 0.1 * (label_history_size-1)]
		linear_smoothing_weights.extend(0.1 for _ in xrange(label_history_size-1))

		# This is super inefficient - it should be caching smoothings involving the less-specific counters
		# e.g. smoothed['NN']['CD'] = cnter['NN']['CD'] * \lambda * smoothed['NN'] and so on
		all_label_histories = set(permutations(labels, label_history_size-1))
		for label_history in all_label_histories:
			histories = [history for history in (label_history[i:] for i in xrange(label_history_size))]
			# >>> label_history = ('WDT', 'RBR')
			# histories = [('WDT', 'RBR'), ('RBR')]

			history_strings = ['::'.join(history) for history in histories]
			history_scores = [fallback_transition[len(history)][history_string] for history, history_string in izip(histories, history_strings)]

			transition[history_strings[0]] = Counter()
			for smoothing, history_score in izip(linear_smoothing_weights, history_scores):
				transition[history_strings[0]] += history_score * smoothing

		transition.normalize()

		return transition

	def train(self, labeled_sequence, fallback_model=None, fallback_training_limit=None, use_linear_smoothing=True):
		label_counts = [Counter() for _ in xrange(self.label_history_size)]
		self.fallback_transition = [CounterMap() for _ in xrange(self.label_history_size)]
		self.fallback_reverse_transition = [CounterMap() for _ in xrange(self.label_history_size)]

		labeled_sequence = self._pad_sequence(labeled_sequence, pairs=True)
		labeled_sequence = list(HiddenMarkovModel._extend_labels(labeled_sequence, self.label_history_size+1))

		# Load emission and transition counters from the raw data
		for label, label_histories, emission in labeled_sequence:
			full_label = self.push_label(label_histories[-1], label)

			self.emission[full_label][emission] += 1.0
			self.label_emissions[emission][full_label] += 1.0

			for history_size, label_history in enumerate(label_histories):
				label_counts[history_size][label_history] += 1.0
				self.fallback_transition[history_size][label_history][full_label] += 1.0

		# Make the counters distributions
		for transition in self.fallback_transition:	transition.normalize()
		self.label_emissions.normalize()
		self.emission.normalize()
		self.labels = self.emission.keys()

		# Smooth transitions using fallback data
		# Doesn't work with label history size 1!
		if use_linear_smoothing and self.label_history_size > 1:
			self.transition = \
				HiddenMarkovModel._linear_smooth(self.labels,
												 self.fallback_transition,
												 self.label_history_size)
		else:
			self.transition = self.fallback_transition[-1]

		# Convert to log score counters
		self.transition.log()
		self.label_emissions.log()
		self.emission.log()

		self.reverse_transition = self.transition.inverted()

		# Train the fallback model on the label-emission pairs
		if fallback_model:
			try:
				start = time()
				pickle_file = open("fallback_model.pickle")
				self.fallback_emissions_model, training_pairs_length = pickle.load(pickle_file)
				pickle_file.close()

				if fallback_training_limit and fallback_training_limit != training_pairs_length:
					raise IOError()
				elif not fallback_training_limit and len(labeled_sequence) != training_pairs_length:
					raise IOError()

				print "Unpickling fallback model: %f" % (time() - start)
			except (IOError, EOFError), e:
				print "Training fallback model"
				self.fallback_emissions_model = fallback_model()

				emissions_training_pairs = [(emission_history[-1] + '::' + label, emission) for label, emission_history, emission in labeled_sequence if label != START_LABEL and label != STOP_LABEL]

				if fallback_training_limit:
					emissions_training_pairs = islice(emissions_training_pairs, fallback_training_limit)

				self.fallback_emissions_model.train(emissions_training_pairs)

				serialized = (self.fallback_emissions_model, len(labeled_sequence))
				pickle_file = open("fallback_model.pickle", "w")
				pickle.dump(serialized, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)
				pickle_file.close()

		self._post_training()
Code Example #28
    def _sample_datum(self, datum):
        likelihoods = Counter(float("-inf"))
        priors = Counter(float("-inf"))
        posteriors = Counter(float("-inf"))
        sizes = Counter()

        # Regenerate all the cluster params (should be caching this,
        # not doing it inline)
        for c_idx, cluster in self._cluster_to_datum.iteritems():
            if not cluster:
                continue
            sizes[c_idx] = len(cluster)
            cluster_mean = sum(cluster) / float(sizes[c_idx])
            cluster_covariance = 1.0 / float(len(cluster) + 1) * sum(
                outer_product((pt - cluster_mean), (pt - cluster_mean))
                for pt in cluster)

            posteriors[c_idx], priors[c_idx], likelihoods[c_idx] = \
                self._cluster_log_probs(cluster, sizes[c_idx], cluster_mean,
                                        cluster_covariance, datum)

            if all(prob == float("-inf")
                   for prob in (priors[c_idx], likelihoods[c_idx],
                                posteriors[c_idx])):
                del priors[c_idx]
                del likelihoods[c_idx]
                del posteriors[c_idx]
                del sizes[c_idx]
                continue

        # Now generate probs for the new cluster
        # prefer to reuse an old cluster # if possible
        empty_clusters = [c for c, d in self._cluster_to_datum.iteritems() if not d]
        new_cluster = min(empty_clusters + [len(self._cluster_to_datum)])

        sizes[new_cluster] = self._concentration

        # build a really lame covariance matrix for single points
        covariance = CounterMap()
        for axis in datum:
            covariance[axis] = 1.0

        posteriors[new_cluster], priors[new_cluster], likelihoods[new_cluster] = \
            self._cluster_log_probs([], sizes[new_cluster], datum, covariance, datum)

        for dist in priors, likelihoods, posteriors:
            if not all(v <= 0.0 for v in dist.itervalues()):
                print "Not a log distribution: %s" % dist
                print "(new cluster %d)" % new_cluster
                print datum
                for k, scores in dist.iteritems():
                    if all(v <= 0.0 for v in scores.itervalues()): continue
                    print "error on cluster %d" % k
                    print "posteriors: %r" % posteriors[k]
                    print "priors: %r" % priors[k]
                    print "likelihoods: %r" % likelihoods[k]
                    print "sizes: %r" % sizes[k]
                raise Exception()

        probs = likelihoods + priors - posteriors
        probs.exp()
        probs *= sizes

        # filter out nan
        for k, v in probs.items():
            if v != v:
                del probs[k]

        probs.normalize()

        assert all(
            0.0 <= p <= 1.0
            for p in probs.itervalues()), "Not a distribution: %s" % probs
        return probs.sample()
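The final weighting above is the usual Chinese-restaurant-process step: each existing cluster is weighted by its size times the datum's exponentiated log score, and the candidate new cluster uses the concentration parameter in place of a size. A minimal sketch of that last step with made-up numbers:

from math import exp

log_scores = {0: -1.2, 1: -2.5, 'new': -3.0}        # hypothetical likelihood + prior - posterior
sizes = {0: 4.0, 1: 2.0, 'new': 0.5}                # cluster sizes; 'new' gets the concentration
weights = {c: sizes[c] * exp(s) for c, s in log_scores.items()}
total = sum(weights.values())
probs = {c: w / total for c, w in weights.items()}  # distribution the assignment is sampled from
print(probs)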
Code Example #29
    def train(self,
              labeled_sequence,
              fallback_model=None,
              fallback_training_limit=None,
              use_linear_smoothing=True):
        label_counts = [Counter() for _ in xrange(self.label_history_size)]
        self.fallback_transition = [
            CounterMap() for _ in xrange(self.label_history_size)
        ]
        self.fallback_reverse_transition = [
            CounterMap() for _ in xrange(self.label_history_size)
        ]

        labeled_sequence = self._pad_sequence(labeled_sequence, pairs=True)
        labeled_sequence = list(
            HiddenMarkovModel._extend_labels(labeled_sequence,
                                             self.label_history_size + 1))

        # Load emission and transition counters from the raw data
        for label, label_histories, emission in labeled_sequence:
            full_label = self.push_label(label_histories[-1], label)

            self.emission[full_label][emission] += 1.0
            self.label_emissions[emission][full_label] += 1.0

            for history_size, label_history in enumerate(label_histories):
                label_counts[history_size][label_history] += 1.0
                self.fallback_transition[history_size][label_history][
                    full_label] += 1.0

        # Make the counters distributions
        for transition in self.fallback_transition:
            transition.normalize()
        self.label_emissions.normalize()
        self.emission.normalize()
        self.labels = self.emission.keys()

        # Smooth transitions using fallback data
        # Doesn't work with label history size 1!
        if use_linear_smoothing and self.label_history_size > 1:
            self.transition = \
             HiddenMarkovModel._linear_smooth(self.labels,
                      self.fallback_transition,
                      self.label_history_size)
        else:
            self.transition = self.fallback_transition[-1]

        # Convert to log score counters
        self.transition.log()
        self.label_emissions.log()
        self.emission.log()

        self.reverse_transition = self.transition.inverted()

        # Train the fallback model on the label-emission pairs
        if fallback_model:
            try:
                start = time()
                pickle_file = open("fallback_model.pickle")
                self.fallback_emissions_model, training_pairs_length = pickle.load(
                    pickle_file)
                pickle_file.close()

                if fallback_training_limit and fallback_training_limit != training_pairs_length:
                    raise IOError()
                elif not fallback_training_limit and len(
                        labeled_sequence) != training_pairs_length:
                    raise IOError()

                print "Unpickling fallback model: %f" % (time() - start)
            except (IOError, EOFError), e:
                print "Training fallback model"
                self.fallback_emissions_model = fallback_model()

                emissions_training_pairs = [
                    (emission_history[-1] + '::' + label, emission)
                    for label, emission_history, emission in labeled_sequence
                    if label != START_LABEL and label != STOP_LABEL
                ]

                if fallback_training_limit:
                    emissions_training_pairs = islice(emissions_training_pairs,
                                                      fallback_training_limit)

                self.fallback_emissions_model.train(emissions_training_pairs)

                serialized = (self.fallback_emissions_model,
                              len(labeled_sequence))
                pickle_file = open("fallback_model.pickle", "w")
                pickle.dump(serialized,
                            pickle_file,
                            protocol=pickle.HIGHEST_PROTOCOL)
                pickle_file.close()
Code Example #30
def toy_problem(args):  # pragma: no cover
    # Simulate a 3 state markov chain with transition matrix (given states in row vector):
    #  (destination)
    #   1    2    3
    # 1 0.7  0.3  0
    # 2 0.05 0.4  0.55
    # 3 0.25 0.25 0.5
    transitions = CounterMap()

    transitions['1']['1'] = 0.7
    transitions['1']['2'] = 0.3
    transitions['1']['3'] = 0.0

    transitions['2']['1'] = 0.05
    transitions['2']['2'] = 0.4
    transitions['2']['3'] = 0.55

    transitions['3']['1'] = 0.25
    transitions['3']['2'] = 0.25
    transitions['3']['3'] = 0.5

    def sample_transition(label):
        sample = random.random()

        for next, prob in transitions[label].iteritems():
            sample -= prob
            if sample <= 0.0: return next

        assert False, "Should have returned a next state"

    # And emissions (state, (counter distribution)): {1 : (yes : 0.5, sure : 0.5), 2 : (maybe : 0.75, who_knows : 0.25), 3 : (no : 1)}
    emissions = {
        '1': {
            'yes': 0.5,
            'sure': 0.5
        },
        '2': {
            'maybe': 0.75,
            'who_knows': 0.25
        },
        '3': {
            'no': 1.0
        }
    }

    def sample_emission(label):
        if label in [START_LABEL, STOP_LABEL]: return label
        choice = random.random()

        for emission, prob in emissions[label].iteritems():
            choice -= prob
            if choice <= 0.0: return emission

        assert False, "Should have returned an emission"

    # Create the training/test data
    states = ['1', '2', '3']
    start = random.choice(states)

    # Burn-in (easier than hand-calculating stationary distribution & sampling)
    for i in xrange(10000):
        start = sample_transition(start)

    def label_generator(start_label):
        next = start_label
        while True:
            yield next
            next = sample_transition(next)

    training_labels = [
        val for _, val in izip(xrange(1000), label_generator('1'))
    ]
    training_labels.extend((START_LABEL, STOP_LABEL))
    training_labels.extend(
        [val for _, val in izip(xrange(1000), label_generator('2'))])
    training_labels.extend((START_LABEL, STOP_LABEL))
    training_labels.extend(
        [val for _, val in izip(xrange(1000), label_generator('3'))])

    training_emissions = [sample_emission(label) for label in training_labels]

    training_signal = zip(training_labels, training_emissions)

    # Training phase
    signal_decoder = HiddenMarkovModel(label_history_size=1)
    signal_decoder.train(training_signal)

    # Labeling phase: given a set of emissions, guess the correct states
    start = random.choice(states)
    for i in xrange(10000):
        start = sample_transition(start)
    test_labels = [val for _, val in izip(xrange(500), label_generator(start))]
    test_emissions = [sample_emission(label) for label in test_labels]

    guessed_labels = signal_decoder.label(test_emissions)
    correct = sum(1 for guessed, correct in izip(guessed_labels, test_labels)
                  if guessed == correct)

    print "%d labels recovered correctly (%.2f%% correct out of %d)" % (
        correct, 100.0 * float(correct) / float(len(test_labels)),
        len(test_labels))
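Each row of the toy transition matrix above already sums to one, which is what keeps the inverse-CDF loop in sample_transition from falling through to its assertion. A quick standalone check:

rows = {'1': [0.7, 0.3, 0.0], '2': [0.05, 0.4, 0.55], '3': [0.25, 0.25, 0.5]}
for state, probs in rows.items():
    assert abs(sum(probs) - 1.0) < 1e-9, state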