Example #1
0
    def refine(self,
               labeled_featuresets,
               entropy_cutoff,
               depth_cutoff,
               support_cutoff,
               binary=False,
               feature_values=None,
               verbose=False):
        """Recursively improve this decision-tree node by retraining any
        child branch whose training examples still have a high-entropy
        (impure) label distribution.

        :param labeled_featuresets: list of ``(featureset, label)`` pairs
            used to refine this subtree.
        :param entropy_cutoff: a branch is retrained only when the entropy
            of its label distribution exceeds this value.
        :param depth_cutoff: remaining depth budget; refinement stops once
            it is <= 0.
        :param support_cutoff: minimum number of examples required before
            any refinement is attempted.
        :param binary: forwarded to ``DecisionTreeClassifier.train``.
        :param feature_values: forwarded to ``DecisionTreeClassifier.train``.
        :param verbose: forwarded to ``DecisionTreeClassifier.train``.
        """
        # Guard clauses: too few examples, leaf node (no split feature),
        # or depth budget exhausted.
        if len(labeled_featuresets) <= support_cutoff: return
        if self._fname is None: return
        if depth_cutoff <= 0: return
        for fval in self._decisions:
            # Examples routed down this branch (feature value == fval).
            fval_featuresets = [(featureset, label)
                                for (featureset, label) in labeled_featuresets
                                if featureset.get(self._fname) == fval]

            label_freqs = FreqDist(label
                                   for (featureset, label) in fval_featuresets)
            # Retrain the branch if its label distribution is still too mixed.
            if entropy(MLEProbDist(label_freqs)) > entropy_cutoff:
                self._decisions[fval] = DecisionTreeClassifier.train(
                    fval_featuresets, entropy_cutoff, depth_cutoff,
                    support_cutoff, binary, feature_values, verbose)
        if self._default is not None:
            # Examples whose feature value has no dedicated branch fall
            # through to the default subtree; refine it the same way.
            default_featuresets = [
                (featureset, label)
                for (featureset, label) in labeled_featuresets
                if featureset.get(self._fname) not in self._decisions
            ]
            label_freqs = FreqDist(label for (featureset,
                                              label) in default_featuresets)
            if entropy(MLEProbDist(label_freqs)) > entropy_cutoff:
                self._default = DecisionTreeClassifier.train(
                    default_featuresets, entropy_cutoff, depth_cutoff,
                    support_cutoff, binary, feature_values, verbose)
Example #2
0
    def refine(self, labeled_featuresets, entropy_cutoff, depth_cutoff,
               support_cutoff, binary=False, feature_values=None,
               verbose=False):
        """Retrain any subtree of this node whose training examples still
        show a high-entropy label distribution.
        """
        # Nothing to refine: sample too small, leaf node, or depth spent.
        if len(labeled_featuresets) <= support_cutoff:
            return
        if self._fname is None:
            return
        if depth_cutoff <= 0:
            return

        fname = self._fname
        for value in self._decisions:
            # Examples that follow this branch of the split feature.
            subset = [pair for pair in labeled_featuresets
                      if pair[0].get(fname) == value]
            freqs = FreqDist(lbl for _, lbl in subset)
            if entropy(MLEProbDist(freqs)) > entropy_cutoff:
                self._decisions[value] = DecisionTreeClassifier.train(
                    subset, entropy_cutoff, depth_cutoff,
                    support_cutoff, binary, feature_values, verbose)

        if self._default is not None:
            # Examples whose feature value matched no explicit branch.
            leftovers = [pair for pair in labeled_featuresets
                         if pair[0].get(fname) not in self._decisions]
            freqs = FreqDist(lbl for _, lbl in leftovers)
            if entropy(MLEProbDist(freqs)) > entropy_cutoff:
                self._default = DecisionTreeClassifier.train(
                    leftovers, entropy_cutoff, depth_cutoff,
                    support_cutoff, binary, feature_values, verbose)
 def get_features(self):
     """get sausage features as a list of dicts

     Builds one feature dict per slot of ``self.sausage``, summarizing the
     slot's arc-posterior distribution and its highest-posterior arc.

     :return: list of feature dicts, one per sausage slot, in slot order.
     """
     features = []
     for i,slot in enumerate(self.sausage):
         feats = {}
         # Arc with the highest posterior in this slot.
         best = slot.max()
         # Posterior probability of every arc in the slot.
         probs = [slot.prob(s) for s in slot.samples()]
         # length of the sausage
         feats['sausage_length'] = len(self.sausage)
         # position of this slot in the sausage
         feats['slot_position'] = i
         # mean of slot arc posteriors
         feats['slot_mean'] = np.mean(probs)
         # standard deviation of slot arc posteriors
         feats['slot_stdev'] = np.std(probs)
         # entropy of slot arc posteriors
         feats['slot_entropy'] = entropy(slot)
         # highest posterior in slot
         feats['slot_highest'] = slot.prob(best)
         # length of highest posterior word in slot
         feats['slot_best_length'] = 0 if best == '*DELETE*' else len(best)
         # 1 if highest posterior arc is *DELETE*
         feats['delete'] = int(best == '*DELETE*')
         features.append(feats)
     return features
 def get_features(self):
     """get sausage features as a list of dicts"""
     slot_count = len(self.sausage)
     feature_dicts = []
     for position, slot in enumerate(self.sausage):
         # Best arc and the full posterior distribution for this slot.
         top = slot.max()
         posteriors = [slot.prob(sample) for sample in slot.samples()]
         feature_dicts.append({
             # length of the sausage
             'sausage_length': slot_count,
             # position of this slot in the sausage
             'slot_position': position,
             # mean of slot arc posteriors
             'slot_mean': np.mean(posteriors),
             # standard deviation of slot arc posteriors
             'slot_stdev': np.std(posteriors),
             # entropy of slot arc posteriors
             'slot_entropy': entropy(slot),
             # highest posterior in slot
             'slot_highest': slot.prob(top),
             # length of highest posterior word in slot
             'slot_best_length': len(top) if top != '*DELETE*' else 0,
             # 1 if highest posterior arc is *DELETE*
             'delete': int(top == '*DELETE*'),
         })
     return feature_dicts
Example #5
0
def classification_feature_vector(hyp, aligned_hyp, sausage, score, ascore, lscore):
    '''returns a feature vector for the logistic regression'''
    # Per-slot entropies across the whole confusion network.
    slot_entropies = [entropy(s) for s in sausage.sausage]
    # Deletion count and length ratio of the aligned hypothesis.
    deletions = sum(1 for token in aligned_hyp if token == DELETE_TOKEN)
    ratio = len(hyp) / len(aligned_hyp)
    return [
        sausage.score_hyp(aligned_hyp),
        score,
        ascore,
        lscore,
        min(slot_entropies),
        max(slot_entropies),
        deletions,
        ratio,
    ]
Example #6
0
def get_stats(slot):
    """Return (mean, stdev, entropy) summary statistics for one slot's
    arc-posterior distribution."""
    posteriors = [slot.prob(sample) for sample in slot.samples()]
    return np.mean(posteriors), np.std(posteriors), entropy(slot)
# Extract labeled features from the movie_reviews corpus and inspect the labels.
lfeats = label_feats_from_corpus(movie_reviews)
print(lfeats.keys())
# dict_keys(['neg', 'pos'])

# 75/25 train/test split.
train_feats, test_feats = split_label_feats(lfeats, split=0.75)

print(f"Training Set: {len(train_feats)}")
# Training Set: 1500
print(f"Test Set: {len(test_feats)}")
# Test Set: 500

# Train a decision tree and evaluate it on the held-out set.
dt_classifier = DecisionTreeClassifier.train(
    train_feats,
    binary=True,
    entropy_cutoff=0.8,
    depth_cutoff=5,
    support_cutoff=30,
)
print(f"Accuracy: {accuracy(dt_classifier, test_feats)}")
# Accuracy: 0.688

# Show how label entropy changes as the 'neg' count approaches, matches,
# and then diverges from the 'pos' count (maximal at 30/30, minimal at 30/1).
fd = FreqDist({'pos': 30, 'neg': 10})
print(entropy(MLEProbDist(fd)))

for neg_count in (25, 30, 1):
    fd['neg'] = neg_count
    print(entropy(MLEProbDist(fd)))