def refine(self, labeled_featuresets, entropy_cutoff, depth_cutoff, support_cutoff, binary=False, feature_values=None, verbose=False):
    """Recursively re-train any branch of this node whose training subset
    is still too impure.

    Each explicit decision branch — and the default branch, if present —
    is checked against ``entropy_cutoff``; impure branches are replaced
    with freshly trained subtrees.

    :param labeled_featuresets: list of ``(featureset, label)`` pairs.
    :param entropy_cutoff: branches at or below this label entropy are
        left alone.
    :param depth_cutoff: no refinement happens once the depth budget is 0.
    :param support_cutoff: subsets this small or smaller are left alone.
    """
    # Guard clauses: not enough data, this node is a leaf, or no depth left.
    if len(labeled_featuresets) <= support_cutoff:
        return
    if self._fname is None:
        return
    if depth_cutoff <= 0:
        return

    for fval in self._decisions:
        # Training pairs that this node routes down the fval branch.
        branch_pairs = [
            (fs, lbl)
            for (fs, lbl) in labeled_featuresets
            if fs.get(self._fname) == fval
        ]
        branch_freqs = FreqDist(lbl for (fs, lbl) in branch_pairs)
        # Branch still impure? Replace it with a newly trained subtree.
        if entropy(MLEProbDist(branch_freqs)) > entropy_cutoff:
            self._decisions[fval] = DecisionTreeClassifier.train(
                branch_pairs, entropy_cutoff, depth_cutoff,
                support_cutoff, binary, feature_values, verbose)

    if self._default is not None:
        # Pairs not matched by any explicit decision fall through to default.
        leftover_pairs = [
            (fs, lbl)
            for (fs, lbl) in labeled_featuresets
            if fs.get(self._fname) not in self._decisions
        ]
        leftover_freqs = FreqDist(lbl for (fs, lbl) in leftover_pairs)
        if entropy(MLEProbDist(leftover_freqs)) > entropy_cutoff:
            self._default = DecisionTreeClassifier.train(
                leftover_pairs, entropy_cutoff, depth_cutoff,
                support_cutoff, binary, feature_values, verbose)
def refine(self, labeled_featuresets, entropy_cutoff, depth_cutoff, support_cutoff, binary=False, feature_values=None, verbose=False):
    """Re-train impure subtrees of this node in place.

    For every decision value (and for the default branch when one exists),
    gather the training pairs that reach that branch; if their label
    entropy exceeds ``entropy_cutoff``, train a replacement subtree on
    just those pairs.

    :param labeled_featuresets: ``(featureset, label)`` training pairs.
    :param entropy_cutoff: purity threshold below which branches stay as-is.
    :param depth_cutoff: refinement is skipped when this reaches zero.
    :param support_cutoff: minimum subset size worth refining.
    """
    # Bail out early when refinement cannot or should not proceed.
    too_small = len(labeled_featuresets) <= support_cutoff
    is_leaf = self._fname is None
    out_of_depth = depth_cutoff <= 0
    if too_small or is_leaf or out_of_depth:
        return

    def _needs_retrain(pairs):
        # True when the labels in `pairs` are still too mixed.
        freqs = FreqDist(pair[1] for pair in pairs)
        return entropy(MLEProbDist(freqs)) > entropy_cutoff

    for fval in self._decisions:
        matching = [
            pair for pair in labeled_featuresets
            if pair[0].get(self._fname) == fval
        ]
        if _needs_retrain(matching):
            self._decisions[fval] = DecisionTreeClassifier.train(
                matching, entropy_cutoff, depth_cutoff,
                support_cutoff, binary, feature_values, verbose)

    if self._default is not None:
        # Everything that no explicit decision value captured.
        unmatched = [
            pair for pair in labeled_featuresets
            if pair[0].get(self._fname) not in self._decisions
        ]
        if _needs_retrain(unmatched):
            self._default = DecisionTreeClassifier.train(
                unmatched, entropy_cutoff, depth_cutoff,
                support_cutoff, binary, feature_values, verbose)
def get_features(self):
    """Get sausage features as a list of dicts, one dict per slot."""
    total_slots = len(self.sausage)
    features = []
    for position, slot in enumerate(self.sausage):
        top = slot.max()
        posteriors = [slot.prob(sample) for sample in slot.samples()]
        is_delete = top == '*DELETE*'
        features.append({
            'sausage_length': total_slots,        # length of the sausage
            'slot_position': position,            # index of this slot
            'slot_mean': np.mean(posteriors),     # mean of slot arc posteriors
            'slot_stdev': np.std(posteriors),     # stdev of slot arc posteriors
            'slot_entropy': entropy(slot),        # entropy of slot arc posteriors
            'slot_highest': slot.prob(top),       # highest posterior in slot
            # length of the highest-posterior word (0 for the *DELETE* arc)
            'slot_best_length': 0 if is_delete else len(top),
            'delete': int(is_delete),             # 1 iff best arc is *DELETE*
        })
    return features
def get_features(self):
    """Get sausage features as a list of dicts (one per slot)."""
    feature_dicts = []
    sausage_len = len(self.sausage)
    for idx, slot in enumerate(self.sausage):
        best_word = slot.max()
        arc_posteriors = [slot.prob(w) for w in slot.samples()]

        row = {}
        # Global context: sausage length and where this slot sits in it.
        row['sausage_length'] = sausage_len
        row['slot_position'] = idx
        # Distribution statistics over this slot's arc posteriors.
        row['slot_mean'] = np.mean(arc_posteriors)
        row['slot_stdev'] = np.std(arc_posteriors)
        row['slot_entropy'] = entropy(slot)
        # Properties of the single most probable arc.
        row['slot_highest'] = slot.prob(best_word)
        row['slot_best_length'] = len(best_word) if best_word != '*DELETE*' else 0
        row['delete'] = int(best_word == '*DELETE*')

        feature_dicts.append(row)
    return feature_dicts
def classification_feature_vector(hyp, aligned_hyp, sausage, score, ascore, lscore):
    '''Returns a feature vector for the logistic regression.'''
    # Per-slot entropies over the whole sausage; min and max are the features.
    slot_entropies = [entropy(s) for s in sausage.sausage]
    # How many alignment tokens are deletions.
    deletions = sum(1 for token in aligned_hyp if token == DELETE_TOKEN)
    return [
        sausage.score_hyp(aligned_hyp),
        score,
        ascore,
        lscore,
        min(slot_entropies),
        max(slot_entropies),
        deletions,
        len(hyp) / len(aligned_hyp),  # hypothesis-to-alignment length ratio
    ]
def get_stats(slot):
    """Return (mean, stdev, entropy) of a slot's arc posterior distribution."""
    posteriors = [slot.prob(sample) for sample in slot.samples()]
    return np.mean(posteriors), np.std(posteriors), entropy(slot)
# Build labeled feature sets from the movie_reviews corpus and inspect labels.
lfeats = label_feats_from_corpus(movie_reviews)
print(lfeats.keys())
# dict_keys(['neg', 'pos'])

# 75/25 train/test split.
train_feats, test_feats = split_label_feats(lfeats, split=0.75)
print("Training Set: " + str(len(train_feats)))
# Training Set: 1500
print("Test Set: " + str(len(test_feats)))
# Test Set: 500

# Train a decision tree with pruning cutoffs and evaluate it.
dt_classifier = DecisionTreeClassifier.train(
    train_feats,
    binary=True,
    entropy_cutoff=0.8,
    depth_cutoff=5,
    support_cutoff=30,
)
print("Accuracy: " + str(accuracy(dt_classifier, test_feats)))
# Accuracy: 0.688

# Entropy demo: it rises toward an even label split and falls as one
# label dominates.
fd = FreqDist({'pos': 30, 'neg': 10})
print(entropy(MLEProbDist(fd)))
for neg_count in (25, 30, 1):
    fd['neg'] = neg_count
    print(entropy(MLEProbDist(fd)))