def train(cls, labels, multi_label_feats, trainf, **train_kwargs):
    '''Train one binary classifier per label and wrap them in ``cls``.

    labels: iterable of all known labels.
    multi_label_feats: iterable of (feat, multi_labels) pairs, where feat
        is a dict of features and multi_labels the labels assigned to it.
    trainf: callable invoked as trainf(feats, **train_kwargs) for each label,
        where feats is a list of (feat, bool) pairs.

    Returns cls(label_classifiers), with label_classifiers mapping each
    label to whatever trainf returned for it.
    '''
    def hashable(feat):
        # dicts are unhashable, so use a normalized tuple of key-values
        return tuple(sorted(feat.items()))

    all_labels = set(labels)
    label_feats = collections.defaultdict(list)
    pos_label_feats = collections.defaultdict(set)

    for feat, multi_labels in multi_label_feats:
        # positive example for every label attached to this instance
        for label in multi_labels:
            label_feats[label].append((feat, True))
            pos_label_feats[label].add(hashable(feat))

        # negative example for every known label that is not attached
        for label in all_labels - set(multi_labels):
            label_feats[label].append((feat, False))

    # rebuild each label's list, dropping negative feat dicts that also
    # occur as positives for that label, so we don't create training
    # conflicts
    for label in label_feats.keys():
        positives = pos_label_feats[label]
        label_feats[label] = [
            (feat, is_pos) for feat, is_pos in label_feats[label]
            if is_pos or hashable(feat) not in positives
        ]

    label_classifiers = dict(
        (label, trainf(feats, **train_kwargs))
        for label, feats in iteritems(label_feats))

    return cls(label_classifiers)
def train(cls, labels, multi_label_feats, trainf, **train_kwargs):
    '''Build a binary training set per label, train each, and return cls(...).

    labels: iterable of all known labels.
    multi_label_feats: iterable of (feat, multi_labels) pairs; feat is a
        feature dict, multi_labels the labels that apply to it.
    trainf: called as trainf(feats, **train_kwargs) once per label with a
        list of (feat, bool) pairs; its result is stored under that label.

    Returns cls(label_classifiers).
    '''
    def feat_key(feat):
        # dicts are unhashable, so use a normalized tuple of key-values
        return tuple(sorted(feat.items()))

    known_labels = set(labels)
    label_feats = collections.defaultdict(list)
    pos_label_feats = collections.defaultdict(set)

    for feat, multi_labels in multi_label_feats:
        for label in multi_labels:
            label_feats[label].append((feat, True))
            pos_label_feats[label].add(feat_key(feat))

        for label in known_labels - set(multi_labels):
            label_feats[label].append((feat, False))

    for label in label_feats.keys():
        seen_positive = pos_label_feats[label]
        # keep every positive; keep a negative only if the same feature
        # dict was never a positive for this label (avoids training
        # conflicts)
        label_feats[label] = [
            (feat, flag) for feat, flag in label_feats[label]
            if flag or feat_key(feat) not in seen_positive
        ]

    label_classifiers = dict(
        (label, trainf(feats, **train_kwargs))
        for label, feats in iteritems(label_feats))

    return cls(label_classifiers)
def classify(self, feats):
    '''Return the set of labels whose classifier returns True for feats.

    Every classifier in self._label_classifiers is asked to classify
    feats; only results that are exactly True (identity check) count.
    '''
    return set(
        label
        for label, classifier in iteritems(self._label_classifiers)
        if classifier.classify(feats) is True
    )
def category_words():
    '''
    Return an iterator of (category, words) tuples, where words lazily
    yields every word of every instance in that category.
    Used if we are scoring the words for correlation to categories for
    feature selection (i.e., score_fn and max_feats are set).
    '''
    def all_words(instance_list):
        # flatten the instances of one category into a single word stream
        return (word for instance in instance_list for word in instance)

    return ((cat, all_words(instance_list))
            for cat, instance_list in iteritems(train_instances))
def feature_detector(self, tokens, index, history):
    '''Return the base tagger features plus one feature per self.funs entry.

    The base features come from ClassifierBasedPOSTagger.feature_detector;
    each (key, fun) in self.funs then adds feats[key] = fun(token), where
    token is tokens[index].
    '''
    features = ClassifierBasedPOSTagger.feature_detector(
        self, tokens, index, history)
    token = tokens[index]

    for name, func in iteritems(self.funs):
        features[name] = func(token)

    return features
def feature_detector(self, tokens, index, history):
    '''Extend the ClassifierBasedPOSTagger features with self.funs values.

    For every (key, fun) pair in self.funs, the result of fun applied to
    the current token (tokens[index]) is stored under key in the feature
    mapping produced by the base feature detector.
    '''
    base_feats = ClassifierBasedPOSTagger.feature_detector(
        self, tokens, index, history)
    current = tokens[index]

    for feat_name, feat_fun in iteritems(self.funs):
        base_feats[feat_name] = feat_fun(current)

    return base_feats
def extract_features(label_instances, featx):
    '''Return a list of (featx(instance), label) pairs.

    label_instances is either a dict mapping label -> list of instances,
    or an iterable of (instance, label) pairs.
    '''
    if not isinstance(label_instances, dict):
        # for arg.multi and args.binary
        # e.g., li = [ (['hello','world',...],label1), (['lorem','ipsum'],label2) ]
        return [(featx(instance), label)
                for instance, label in label_instances]

    # for not (args.multi and args.binary)
    # e.g., li = { 'spam': [ ['hello','world',...], ... ], 'ham': [ ['lorem','ipsum'...], ... ] }
    return [(featx(instance), label)
            for label, instances in iteritems(label_instances)
            for instance in instances]
def extract_features(label_instances, featx):
    '''Apply featx to every instance and pair the result with its label.

    Accepts either a dict of label -> instances or an iterable of
    (instance, label) pairs; always returns a list of (features, label).
    '''
    if isinstance(label_instances, dict):
        # for not (args.multi and args.binary)
        # e.g., li = { 'spam': [ ['hello','world',...], ... ], 'ham': [ ['lorem','ipsum'...], ... ] }
        pairs = (
            (instance, label)
            for label, instances in iteritems(label_instances)
            for instance in instances
        )
    else:
        # for arg.multi and args.binary
        # e.g., li = [ (['hello','world',...],label1), (['lorem','ipsum'],label2) ]
        pairs = label_instances

    return [(featx(instance), label) for instance, label in pairs]
def category_words():
    '''
    return an iteration of tuples of category and list of all words in
    instances of that category. Used if we are scoring the words for
    correlation to categories for feature selection (i.e., score_fn and
    max_feats are set)

    Each element of train_instances is a (words, cats) pair; cats may be
    a single category or an iterable of categories.
    '''
    # collections.Iterable was removed in Python 3.10; fall back for Python 2
    try:
        from collections.abc import Iterable
    except ImportError:
        from collections import Iterable

    # strings are iterable, but a string is one category, not a sequence
    # of single-character categories
    text_types = (str, bytes, type(u''))

    # the default factory must be a callable (list), not a list instance
    cat_words = defaultdict(list)

    for words, cats in train_instances:
        if isinstance(cats, Iterable) and not isinstance(cats, text_types):
            for cat in cats:
                cat_words[cat].extend(words)
        else:
            cat_words[cats].extend(words)

    return iteritems(cat_words)
def sum_category_word_scores(categorized_words, score_fn):
    '''Sum, per word, score_fn over every category the word occurs in.

    categorized_words: iterable of (category, words) pairs.
    score_fn: called as score_fn(n_ii, (n_ix, n_xi), n_xx), where
        n_ii is the count of the word in the category,
        n_ix the count of the word over all categories,
        n_xi the category's N() and n_xx the N() of the conditional
        frequency distribution.

    Returns a defaultdict(int) mapping word -> summed score.
    '''
    overall_fd = FreqDist()
    per_category_fd = ConditionalFreqDist()

    for category, words in categorized_words:
        for word in words:
            overall_fd.inc(word)
            per_category_fd[category].inc(word)

    n_xx = per_category_fd.N()
    scores = collections.defaultdict(int)

    for category in per_category_fd.conditions():
        category_fd = per_category_fd[category]
        n_xi = category_fd.N()

        for word, n_ii in iteritems(category_fd):
            scores[word] += score_fn(n_ii, (overall_fd[word], n_xi), n_xx)

    return scores
def sum_category_word_scores(categorized_words, score_fn):
    '''Sum, per word, score_fn over every category the word occurs in.

    categorized_words: iterable of (category, words) pairs.
    score_fn: called as score_fn(n_ii, (n_ix, n_xi), n_xx), where
        n_ii is the count of the word in the category,
        n_ix the count of the word over all categories,
        n_xi the number of word occurrences in the category and
        n_xx the number of word occurrences over all categories.

    Returns a defaultdict(int) mapping word -> summed score.
    '''
    overall_counts = collections.Counter()
    per_category_counts = collections.defaultdict(collections.Counter)

    for category, words in categorized_words:
        for word in words:
            overall_counts[word] += 1
            per_category_counts[category][word] += 1

    # total number of word occurrences across all categories
    n_xx = sum(sum(counts.values())
               for counts in per_category_counts.values())
    scores = collections.defaultdict(int)

    for counts in per_category_counts.values():
        n_xi = sum(counts.values())

        for word, n_ii in iteritems(counts):
            scores[word] += score_fn(n_ii, (overall_counts[word], n_xi), n_xx)

    return scores
def sum_category_word_scores(categorized_words, score_fn):
    '''Accumulate a score per word across all categories.

    categorized_words: iterable of (category, words) pairs.
    score_fn: called as score_fn(n_ii, (n_ix, n_xi), n_xx) with
        n_ii = occurrences of the word within the category,
        n_ix = occurrences of the word in any category,
        n_xi = total word occurrences within the category,
        n_xx = total word occurrences in all categories.

    Returns a defaultdict(int) of word -> sum of score_fn results.
    '''
    word_totals = collections.Counter()
    category_totals = collections.defaultdict(collections.Counter)

    for category, words in categorized_words:
        for word in words:
            word_totals[word] += 1
            category_totals[category][word] += 1

    # grand total of counted words, summed category by category
    n_xx = sum(sum(fd.values()) for fd in category_totals.values())
    scores = collections.defaultdict(int)

    for fd in category_totals.values():
        n_xi = sum(fd.values())

        for word, n_ii in iteritems(fd):
            n_ix = word_totals[word]
            scores[word] += score_fn(n_ii, (n_ix, n_xi), n_xx)

    return scores
def cross_fold(instances, trainf, testf, folds=10, trace=1, metrics=True, informative=0):
    '''Run cross-fold evaluation and return the per-fold results.

    instances: iterable of instances; copied to a list and shuffled.
    trainf: called as trainf(train_instances); its result is the trained
        object for that fold.
    testf: called as testf(obj, test_instances); its result is recorded as
        the fold's accuracy.
    folds: number of folds, at least 2.
    trace: when truthy, progress and summary statistics are printed.
    metrics: when truthy, precision, recall and f-measure are computed per
        key of the sets returned by ref_test_sets.
    informative: when truthy (together with trace), passed to
        obj.show_most_informative_features if obj has that method.

    Returns (accuracies, precisions, recalls, f_measures): a list plus three
    defaultdict(list) keyed as the ref/test sets are.

    Raises ValueError if folds < 2.

    Each fold tests on a slice of len(instances) // folds instances, so when
    the length is not a multiple of folds the trailing remainder is only
    ever used for training.
    '''
    if folds < 2:
        raise ValueError('must have at least 2 folds')
    # ensure isn't an exhaustible iterable
    instances = list(instances)
    # randomize so get an even distribution, in case labeled instances are
    # ordered by label
    random.shuffle(instances)
    n_instances = len(instances)
    # floor division keeps step an exact int usable as a slice boundary
    step = n_instances // folds

    if trace:
        print('step %d over %d folds of %d instances' % (step, folds, n_instances))

    accuracies = []
    precisions = collections.defaultdict(list)
    recalls = collections.defaultdict(list)
    f_measures = collections.defaultdict(list)

    for fold in range(folds):
        if trace:
            print('\nfold %d' % (fold + 1))
            print('-----%s' % ('-' * len('%s' % (fold + 1))))

        start = fold * step
        end = start + step
        train_instances = instances[:start] + instances[end:]
        test_instances = instances[start:end]

        if trace:
            print('training on %d:%d + %d:%d' % (0, start, end, n_instances))

        obj = trainf(train_instances)

        if trace:
            print('testing on %d:%d' % (start, end))

        if metrics:
            refsets, testsets = ref_test_sets(obj, test_instances)

            for key in set(refsets.keys()) | set(testsets.keys()):
                ref = refsets[key]
                test = testsets[key]
                # the metric functions may return a falsy value (e.g. None);
                # record 0 in that case
                p = precision(ref, test) or 0
                r = recall(ref, test) or 0
                fm = f_measure(ref, test) or 0
                precisions[key].append(p)
                recalls[key].append(r)
                f_measures[key].append(fm)

                if trace:
                    print('%s precision: %f' % (key, p))
                    print('%s recall: %f' % (key, r))
                    print('%s f-measure: %f' % (key, fm))

        accuracy = testf(obj, test_instances)

        if trace:
            print('accuracy: %f' % accuracy)

        accuracies.append(accuracy)

        if trace and informative and hasattr(obj, 'show_most_informative_features'):
            obj.show_most_informative_features(informative)

    if trace:
        print('\nmean and variance across folds')
        print('------------------------------')
        print('accuracy mean: %f' % (sum(accuracies) / folds))
        print('accuracy variance: %f' % array(accuracies).var())

        for key, ps in iteritems(precisions):
            print('%s precision mean: %f' % (key, sum(ps) / folds))
            print('%s precision variance: %f' % (key, array(ps).var()))

        for key, rs in iteritems(recalls):
            print('%s recall mean: %f' % (key, sum(rs) / folds))
            print('%s recall variance: %f' % (key, array(rs).var()))

        for key, fs in iteritems(f_measures):
            print('%s f_measure mean: %f' % (key, sum(fs) / folds))
            print('%s f_measure variance: %f' % (key, array(fs).var()))

    return accuracies, precisions, recalls, f_measures
def cross_fold(instances, trainf, testf, folds=10, trace=1, metrics=True, informative=0):
    '''Run cross-fold evaluation and return the per-fold results.

    instances: iterable of instances; copied to a list and shuffled.
    trainf: called as trainf(train_instances); its result is the trained
        object for that fold.
    testf: called as testf(obj, test_instances); its result is recorded as
        the fold's accuracy.
    folds: number of folds, at least 2.
    trace: when truthy, progress and summary statistics are printed.
    metrics: when truthy, precision, recall and f-measure are computed per
        key of the sets returned by ref_test_sets.
    informative: when truthy (together with trace), passed to
        obj.show_most_informative_features if obj has that method.

    Returns (accuracies, precisions, recalls, f_measures): a list plus three
    defaultdict(list) keyed as the ref/test sets are.

    Raises ValueError if folds < 2.

    Each fold tests on a slice of len(instances) // folds instances, so when
    the length is not a multiple of folds the trailing remainder is only
    ever used for training.
    '''
    if folds < 2:
        raise ValueError('must have at least 2 folds')
    # ensure isn't an exhaustible iterable
    instances = list(instances)
    # randomize so get an even distribution, in case labeled instances are
    # ordered by label
    random.shuffle(instances)
    n_instances = len(instances)
    # floor division: true division would give a float on Python 3, which
    # cannot be used as a slice boundary
    step = n_instances // folds

    if trace:
        print('step %d over %d folds of %d instances' % (step, folds, n_instances))

    accuracies = []
    precisions = collections.defaultdict(list)
    recalls = collections.defaultdict(list)
    f_measures = collections.defaultdict(list)

    for fold in range(folds):
        if trace:
            print('\nfold %d' % (fold+1))
            print('-----%s' % ('-'*len('%s' % (fold+1))))

        start = fold * step
        end = start + step
        train_instances = instances[:start] + instances[end:]
        test_instances = instances[start:end]

        if trace:
            print('training on %d:%d + %d:%d' % (0, start, end, n_instances))

        obj = trainf(train_instances)

        if trace:
            print('testing on %d:%d' % (start, end))

        if metrics:
            refsets, testsets = ref_test_sets(obj, test_instances)

            # union of sets: key views cannot be concatenated with + on
            # Python 3
            for key in set(refsets.keys()) | set(testsets.keys()):
                ref = refsets[key]
                test = testsets[key]
                # the metric functions may return a falsy value (e.g. None);
                # record 0 in that case
                p = precision(ref, test) or 0
                r = recall(ref, test) or 0
                fm = f_measure(ref, test) or 0
                precisions[key].append(p)
                recalls[key].append(r)
                f_measures[key].append(fm)

                if trace:
                    print('%s precision: %f' % (key, p))
                    print('%s recall: %f' % (key, r))
                    print('%s f-measure: %f' % (key, fm))

        accuracy = testf(obj, test_instances)

        if trace:
            print('accuracy: %f' % accuracy)

        accuracies.append(accuracy)

        if trace and informative and hasattr(obj, 'show_most_informative_features'):
            obj.show_most_informative_features(informative)

    if trace:
        print('\nmean and variance across folds')
        print('------------------------------')
        print('accuracy mean: %f' % (sum(accuracies) / folds))
        print('accuracy variance: %f' % array(accuracies).var())

        for key, ps in iteritems(precisions):
            print('%s precision mean: %f' % (key, sum(ps) / folds))
            print('%s precision variance: %f' % (key, array(ps).var()))

        for key, rs in iteritems(recalls):
            print('%s recall mean: %f' % (key, sum(rs) / folds))
            print('%s recall variance: %f' % (key, array(rs).var()))

        for key, fs in iteritems(f_measures):
            print('%s f_measure mean: %f' % (key, sum(fs) / folds))
            print('%s f_measure variance: %f' % (key, array(fs).var()))

    return accuracies, precisions, recalls, f_measures