def classify_multilabel(self, unlabeled_token):
    """
    Assign to the given token every label whose weighted vote is
    non-negative.

    For each label (in the order of C{self._labels}), every entry of
    C{self._classifiers} is asked, through its per-label element
    C{classifier[j]}, for the probability of the token labeled with
    that label.  Each probability C{p} is turned into a vote
    C{2*p - 1} (so C{p = 0.5} is a zero vote), scaled by the matching
    entry of C{self._weights}, and the votes are summed.

    @param unlabeled_token: The token to classify.  Only its
        C{type()} and C{loc()} are read.
    @return: A new token at the same location whose type is a
        C{LabeledText} pairing the original type with a tuple of the
        selected labels.  The tuple is empty if no label reaches a
        non-negative total.
    """
    # type_safety_level(0) is used here as "set level, get previous
    # level back"; the previous level must be restored even if a
    # classifier raises, otherwise checking stays disabled globally.
    old_tsl = type_safety_level(0)
    try:
        text = unlabeled_token.type()
        loc = unlabeled_token.loc()
        selected = []
        for j, label in enumerate(self._labels):
            # NOTE(review): the sibling classify() queries the
            # per-label classifiers with the fixed label 'positive',
            # whereas this method queries with the real label --
            # confirm which form the per-label classifiers expect.
            candidate = Token(LabeledText(text, label), loc)
            score = 0
            for classifier, weight in zip(self._classifiers, self._weights):
                p = classifier[j].prob(candidate)
                score += (p * 2.0 - 1.0) * weight
            if score >= 0:
                selected.append(label)
    finally:
        type_safety_level(old_tsl)

    # Built after the level is restored, as in the original code.
    return Token(LabeledText(text, tuple(selected)), loc)
def classify(self, unlabeled_token):
    """
    Assign to the given token the single label with the highest
    weighted vote.

    For each label index C{j}, every entry of C{self._classifiers} is
    asked, through C{classifier[j]}, for the probability of the token
    labeled C{'positive'}.  Each probability C{p} is turned into a
    vote C{2*p - 1}, scaled by the matching entry of
    C{self._weights}, and the votes are summed.  The label with the
    largest total wins; ties on the total are broken by the ordering
    of the labels themselves (the greatest label wins), because
    C{(total, label)} pairs are sorted.

    @param unlabeled_token: The token to classify.  Only its
        C{type()} and C{loc()} are read.
    @return: A new token at the same location whose type is a
        C{LabeledText} pairing the original type with a one-element
        tuple holding the winning label.
    @raise IndexError: If C{self._labels} is empty.
    """
    # type_safety_level(0) is used here as "set level, get previous
    # level back"; the previous level must be restored even on error
    # (e.g. the IndexError raised when there are no labels).
    old_tsl = type_safety_level(0)
    try:
        text = unlabeled_token.type()
        loc = unlabeled_token.loc()
        # The query token is the same for every label and classifier,
        # so build it once.
        candidate = Token(LabeledText(text, 'positive'), loc)
        scored = []
        for j, label in enumerate(self._labels):
            score = 0
            for classifier, weight in zip(self._classifiers, self._weights):
                p = classifier[j].prob(candidate)
                score += (p * 2.0 - 1.0) * weight
            scored.append((score, label))
        scored.sort()
        best = scored[-1][1]
    finally:
        type_safety_level(old_tsl)

    # Built after the level is restored, as in the original code.
    return Token(LabeledText(text, (best,)), loc)
def train(self, labeled_tokens, **kwargs):
    """
    Build a new C{NBClassifier} from the given training data.

    @type labeled_tokens: C{list} of (C{Token} with type C{LabeledText})
    @param labeled_tokens: A list of correctly labeled texts.
        These texts will be used as training samples to construct
        new classifiers.
    @param kwargs: Keyword arguments, applied on top of
        C{self._kwargs}.  Recognized keys:
          - C{estimator}: C{'ELE'} (the default), C{'MLE'} or
            C{'Laplace'} (matched case-insensitively), or a
            non-string sequence C{('Lidstone', gamma)}.
          - C{labels}: the label set passed to the new classifier;
            when omitted it is computed with C{find_labels}.
          - C{weights}: one weight per training token (default: 1
            for every token).  Tokens and weights are paired with
            C{zip}, so a shorter weight sequence truncates the
            training data.
    @return: A new classifier, trained from the given labeled
        tokens.
    @rtype: C{ClassifierI}
    @raise TypeError: If an unknown keyword argument is supplied.
    @raise ValueError: If the estimator is not one of the
        recognized forms.
    """
    assert _chktype(1, labeled_tokens, [Token], (Token,))

    # Process the keyword arguments
    estimator = 'ELE'
    labels = None
    weights = [1 for tk in labeled_tokens]
    all_kwargs = dict(self._kwargs)
    all_kwargs.update(kwargs)
    for key, val in all_kwargs.items():
        if key == 'estimator':
            estimator = val
        elif key == 'labels':
            labels = val
        elif key == 'weights':
            weights = val
        else:
            raise TypeError('Unknown keyword arg %s' % key)
    if labels is None:
        labels = find_labels(labeled_tokens)

    # Work around for bug in freq dist: lower the type safety level
    # while counting, and always restore it, even if counting fails.
    tsl = type_safety_level(0)
    try:
        # Construct a frequency distribution from the training data
        label_freqdist = FreqDist()
        fval_freqdist = ConditionalFreqDist()
        for labeled_token, weight in zip(labeled_tokens, weights):
            labeled_type = labeled_token.type()
            label = labeled_type.label()
            label_freqdist.inc(label, weight)
            fv_list = self._fd_list.detect(labeled_type.text())
            # only care about assignments - or do we?
            for fid, fval in fv_list.assignments():
                fval_freqdist[label].inc((fid, fval), weight)
    finally:
        type_safety_level(tsl)

    # Pick the probability-distribution factory for the estimator.
    if not isinstance(estimator, str):
        # Non-string estimators must look like ('Lidstone', gamma).
        # Anything else used to fall through and fail later with a
        # NameError; report it as an unknown estimator instead.
        if estimator[0].lower() != 'lidstone':
            raise ValueError('Unknown estimator type %r' % (estimator,))
        gamma = estimator[1]

        def make_probdist(fdist, gamma=gamma):
            return LidstoneProbDist(fdist, gamma)
    elif estimator.lower() == 'mle':
        make_probdist = MLEProbDist
    elif estimator.lower() == 'laplace':
        make_probdist = LaplaceProbDist
    elif estimator.lower() == 'ele':
        make_probdist = ELEProbDist
    else:
        raise ValueError('Unknown estimator type %r' % estimator)

    # Construct a probability distribution from the freq dist
    label_probdist = make_probdist(label_freqdist)
    fval_probdist = ConditionalProbDist(fval_freqdist, make_probdist)

    return NBClassifier(self._fd_list, labels, label_probdist, fval_probdist)