def label(self, datum):
    """Return the most likely label for *datum* based on its unigram features."""
    counts = Counter()
    for unigram in ngrams(datum, 1):
        counts[unigram] += 1.0
    return self.get_log_probabilities(counts).arg_max()
def label_distribution(self, datum):
    """Return the full label log-probability distribution for *datum*,
    using unigram feature counts."""
    counts = Counter()
    for unigram in ngrams(datum, 1):
        counts[unigram] += 1.0
    return self.get_log_probabilities(counts)
def label(self, datum):
    """Return the most likely label for *datum* by summing the stored
    per-feature label distributions over its trigram features.

    NOTE(review): if ngrams(datum, 3) yields nothing, `distribution`
    stays None and .arg_max() raises AttributeError -- confirm callers
    never pass such data.
    """
    distribution = None
    for feature in ngrams(datum, 3):
        # BUG FIX: original tested `if distribution:` (truthiness), so a
        # falsy (e.g. empty) accumulated distribution was clobbered by a
        # fresh copy, silently discarding accumulated mass. Test identity
        # against None instead.
        if distribution is None:
            # Copy so we never mutate the stored feature distribution.
            distribution = copy(self.feature_distribution[feature])
        else:
            distribution += self.feature_distribution[feature]
    return distribution.arg_max()
def test_one_gram(self):
    """Unigram features of "hello" are exactly its individual characters."""
    word = "hello"
    actual = set(tuple(gram) for gram in features.ngrams(word, 1))
    expected = set(tuple(ch) for ch in word)
    # Mutual containment <=> set equality, checked element-by-element.
    for gram in expected:
        self.assertTrue(gram in actual)
    for gram in actual:
        self.assertTrue(gram in expected)
def label_distribution(self, datum):
    """Return the log-normalized label distribution for *datum*, built by
    summing the stored per-feature label distributions over its trigram
    features.

    NOTE(review): if ngrams(datum, 3) yields nothing, `distribution`
    stays None and .log_normalize() raises AttributeError -- confirm
    callers never pass such data.
    """
    distribution = None
    for feature in ngrams(datum, 3):
        # BUG FIX: original tested `if distribution:` (truthiness), so a
        # falsy (e.g. empty) accumulated distribution was clobbered by a
        # fresh copy, silently discarding accumulated mass. Test identity
        # against None instead.
        if distribution is None:
            # Copy so we never mutate the stored feature distribution.
            distribution = copy(self.feature_distribution[feature])
        else:
            distribution += self.feature_distribution[feature]
    distribution.log_normalize()
    return distribution
def test_three_grams(self):
    """Trigram features of "hello" (with START/STOP padding) match the
    expected set of 1-, 2-, and 3-grams.

    BUG FIX: the original built both sets but asserted nothing, so the
    test passed vacuously. Assertions added, mirroring test_one_gram.
    """
    test_string = "hello"
    start = "<START>"
    stop = "<STOP>"
    test_features = set(tuple(x) for x in features.ngrams(test_string, 3, start, stop))
    expected_features = set(tuple(x) for x in ([start, start, 'h'], [start, 'h'], ['h'],
                                               [start, 'h', 'e'], ['h', 'e'], ['e'],
                                               ['h', 'e', 'l'], ['e', 'l'], ['l'],
                                               ['e', 'l', 'l'], ['l', 'l'], ['l'],
                                               ['l', 'l', 'o'], ['l', 'o'], ['o'],
                                               ['l', 'o', stop], ['o', stop], ['o', stop, stop]))
    for f in expected_features:
        self.assertTrue(f in test_features)
    for f in test_features:
        self.assertTrue(f in expected_features)
def train(self, labeled_data):
    """Estimate per-feature label distributions from (label, datum) pairs.

    Counts trigram-feature -> label occurrences into a CounterMap,
    smooths unseen labels with a small default count, normalizes each
    feature's counter, and stores log probabilities in
    self.feature_distribution.
    """
    self.feature_distribution = CounterMap()
    for label, datum in labeled_data:
        # BUG FIX: the original was missing the ':' after this `for`
        # statement (a syntax error). The unused local `labels` set was
        # also dropped.
        for feature in ngrams(datum, 3):
            self.feature_distribution[feature][label] += 1
    # Smoothing: give every label a tiny default count per feature so
    # unseen (feature, label) pairs do not get zero probability.
    for feature in self.feature_distribution.iterkeys():
        self.feature_distribution[feature].default = 0.01
    self.feature_distribution.normalize()
    self.feature_distribution.log()
def test_three_grams(self):
    """Trigram features of "hello" (with START/STOP padding) match the
    expected set of 1-, 2-, and 3-grams.

    BUG FIX: the original built both sets but asserted nothing, so the
    test passed vacuously. Assertions added, mirroring test_one_gram.
    """
    test_string = "hello"
    start = "<START>"
    stop = "<STOP>"
    test_features = set(
        tuple(x) for x in features.ngrams(test_string, 3, start, stop))
    expected_features = set(
        tuple(x) for x in ([start, start, 'h'], [start, 'h'], ['h'],
                           [start, 'h', 'e'], ['h', 'e'], ['e'],
                           ['h', 'e', 'l'], ['e', 'l'], ['l'],
                           ['e', 'l', 'l'], ['l', 'l'], ['l'],
                           ['l', 'l', 'o'], ['l', 'o'], ['o'],
                           ['l', 'o', stop], ['o', stop], ['o', stop, stop]))
    for f in expected_features:
        self.assertTrue(f in test_features)
    for f in test_features:
        self.assertTrue(f in expected_features)
def train(self, labeled_data): self.labels, self.features = set(), set() print "Building features..." labeled_features = [] for label, datum in labeled_data: self.labels.add(label) features = Counter() for feature in ngrams(datum, 1): features[feature] += 1.0 self.features.add(feature) labeled_features.append((label, features)) print "%d features" % len(self.features) print "%d labels" % len(self.labels) self.train_with_features(labeled_features)