Example #1
def _censor_sparse_vectors_gen(vecs, idxs):
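    # Yield a censored copy of each sparse vector (a dict), dropping the feature indices in idxs.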
    for vec in vecs:
        new_vec = hashabledict()
        for idx in vec:
            if idx not in idxs:
                new_vec[idx] = vec[idx]
        yield new_vec
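A minimal usage sketch (not from the original module), assuming the generator and a stand-in hashabledict live in the same script; the real hashabledict is imported from classifier.liblinear in the examples below.

class hashabledict(dict):
    # Inline stand-in for classifier.liblinear.hashabledict, defined here only
    # to make the snippet self-contained.
    def __hash__(self):
        return hash(frozenset(self.items()))

# Censor feature indices 2 and 7 from two sparse vectors.
vecs = [{1: 0.5, 2: 1.0, 7: 0.25}, {2: 2.0, 3: 1.0}]
censored = list(_censor_sparse_vectors_gen(vecs, idxs={2, 7}))
# censored == [{1: 0.5}, {3: 1.0}]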
Example #2
def _learning_curve_test_data_set(classifiers,
                                  train,
                                  test,
                                  worker_pool,
                                  verbose=False,
                                  no_simstring_cache=False,
                                  use_test_set=False,
                                  folds=10,
                                  min_perc=5,
                                  max_perc=100,
                                  step_perc=5,
                                  it_factor=1):

    # XXX: Not necessary any more!
    if verbose:
        print >> stderr, 'Calculating train set size...',
    train_size = 0
    for d in train:
        for s in d:
            for a in s:
                train_size += 1
    if verbose:
        print >> stderr, 'Done!'
    # XXX:

    if not no_simstring_cache:
        simstring_caching(classifiers, (train, test), verbose=verbose)

    # Collect the seen types to iterate over later
    seen_types = set()
    results_by_classifier = {}

    for classifier_id, classifier_class in classifiers.iteritems():
        if verbose:
            print >> stderr, 'Classifier:', classifier_id, '...',

        from classifier.liblinear import hashabledict

        classifier = classifier_class()
        if verbose:
            print >> stderr, 'featurising train:', '...',
        train_lbls, train_vecs = classifier._gen_lbls_vecs(train)
        train_set = [e for e in izip(train_lbls, train_vecs)]
        assert len(train_lbls) == train_size, '{} != {}'.format(
            len(train_lbls), train_size)
        assert len(train_vecs) == train_size, '{} != {}'.format(
            len(train_vecs), train_size)
        assert len(train_set) == train_size, '{} != {}'.format(
            len(train_set), train_size)
        del train_lbls
        del train_vecs
        if verbose:
            print >> stderr, 'Done!',
            print >> stderr, 'featurising test', '...',
        test_lbls, test_vecs = classifier._gen_lbls_vecs(test)
        test_vecs = [hashabledict(d) for d in test_vecs]
        if verbose:
            print >> stderr, 'Done!',

        # Fix the seed so that we get comparable folds
        seed(0xd5347d33)
        args = ((classifier, fold) for fold in _train_fold_gen(
            train_set, min_perc, max_perc, step_perc, it_factor))

        if worker_pool is None:
            res_it = (_train_fold(*arg) for arg in args)
        else:
            res_it = worker_pool.imap(__train_fold, args)

        classifier_results = defaultdict(list)

        print >> stderr, 'Training and evaluating models: ...',

        i = 0
        for sample_size, fold_classifier in res_it:
            score, new_score = _score_classifier(fold_classifier, test_lbls,
                                                 test_vecs)
            classifier_results[sample_size].append((score, new_score))
            i += 1
            if i % 10 == 0:
                print >> stderr, i, '...',
        print >> stderr, 'Done!'

        # Process the results
        for sample_size in sorted(e for e in classifier_results):
            results = classifier_results[sample_size]
            scores = [score for score, _ in results]
            new_scores = [new_score for _, new_score in results]

            macro_scores = [ms for ms, _, _, _, _ in scores]
            micro_scores = [ms for _, ms, _, _, _ in scores]
            tps = [tp for _, _, tp, _, _ in scores]
            fns = [fn for _, _, _, fn, _ in scores]
            res_dics = [d for _, _, _, _, d in scores]

            # New metrics
            ranks = [mean(rs) for rs, _, _ in new_scores]
            ambiguities = [mean(ambs) for _, ambs, _ in new_scores]
            recalls = [r for _, _, r in new_scores]

            # These are means of means
            ranks_mean = mean(ranks)
            ranks_stddev = stddev(ranks)
            ambiguities_mean = mean(ambiguities)
            ambiguities_stddev = stddev(ambiguities)
            recalls_mean = mean(recalls)
            recalls_stddev = stddev(recalls)

            classifier_result = (
                mean(macro_scores),
                stddev(macro_scores),
                mean(micro_scores),
                stddev(micro_scores),
                mean(tps),
                stddev(tps),
                mean(fns),
                stddev(fns),
                res_dics,
                # New metrics
                ranks_mean,
                ranks_stddev,
                ambiguities_mean,
                ambiguities_stddev,
                recalls_mean,
                recalls_stddev)

            classifier_results[sample_size] = classifier_result

            if verbose:
                res_str = (
                    'Results {size}: '
                    'MACRO: {0:.3f} MACRO_STDDEV: {1:.3f} '
                    'MICRO: {2:.3f} MICRO_STDDEV: {3:.3f} '
                    'TP: {4:.3f} TP_STDDEV: {5:.3f} '
                    'MEAN_RANK: {mean_rank:.3f} MEAN_RANK_STDDEV: {mean_rank_stddev:.3f} '
                    'AVG_AMB: {avg_amb:.3f} AVG_AMB_STDDEV: {avg_amb_stddev:.3f} '
                    'RECALL: {recall:.3f} RECALL_STDDEV: {recall_stddev:.3f}'
                ).format(*classifier_result,
                         size=sample_size,
                         mean_rank=ranks_mean,
                         mean_rank_stddev=ranks_stddev,
                         avg_amb=ambiguities_mean,
                         avg_amb_stddev=ambiguities_stddev,
                         recall=recalls_mean,
                         recall_stddev=recalls_stddev)
                print res_str

        results_by_classifier[classifier_id] = classifier_results
    return results_by_classifier
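The "Process the results" block above is where the learning curve itself is assembled: per-fold scores are grouped by sample size and reduced to a mean and standard deviation. A standalone sketch of just that aggregation step, with simple stand-ins for the mean/stddev helpers the original module imports (values are hypothetical):

# Illustrative aggregation of per-fold macro scores into a learning curve,
# mirroring the "Process the results" block above.
def mean(xs):
    return sum(xs) / float(len(xs))

def stddev(xs):
    m = mean(xs)
    return (sum((x - m) ** 2 for x in xs) / float(len(xs))) ** 0.5

# Hypothetical macro scores from three folds at each training-set percentage.
scores_by_sample_size = {
    25: [0.61, 0.63, 0.60],
    50: [0.68, 0.70, 0.69],
    100: [0.74, 0.73, 0.75],
}

curve = dict((size, (mean(scores), stddev(scores)))
             for size, scores in scores_by_sample_size.items())
# curve[100] == (0.74, ~0.008)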
Example #3
def _lexical_descent(classifiers, datasets, outdir, verbose=False,
        worker_pool=None, no_simstring_cache=False, use_test_set=False):
    # Check that we can in fact do a lexical descent for the classifier
    for classifier_name in classifiers:
        assert ('SIMSTRING' in classifier_name
                or 'TSURUOKA' in classifier_name
                or 'GAZETTER' in classifier_name)

    for classifier_name, classifier_class in classifiers.iteritems():
        print 'Classifier:', classifier_name
        classifier = classifier_class()

        for dataset_name, dataset_getter in datasets.iteritems():
            print 'Dataset:', dataset_name
            if verbose:
                print >> stderr, 'Reading data...',

            train_set, dev_set, test_set = dataset_getter()
            if use_test_set:
                train, test = list(chain(train_set, dev_set)), list(test_set)
            else:
                train, test = list(train_set), list(dev_set)
            del train_set, dev_set, test_set

            if verbose:
                print >> stderr, 'Done!'

            if not no_simstring_cache:
                simstring_caching((classifier_name, ),
                    (train, test, ), verbose=verbose)


            train_lbls, train_vecs = classifier._gen_lbls_vecs(train)
            test_lbls, test_vecs = classifier._gen_lbls_vecs(test)
            train_vecs = [hashabledict(d) for d in train_vecs]
            test_vecs = [hashabledict(d) for d in test_vecs]
            train_uncensored_vecs = deepcopy(train_vecs)

            # Generate the folds for all iterations
            folds = [f for f in _k_folds(5,
                set(izip(train_lbls, train_vecs)))] #XXX: Constant

            # XXX: This is an ugly hack and bound to break:
            # Locate which vector IDs are used by SimString features and
            # by which feature
            from classifier.simstring.features import SIMSTRING_FEATURES
            sf_ids = [f().get_id() for f in SIMSTRING_FEATURES]

            vec_idxs_by_feat_id = defaultdict(set)
            for sf_id in sf_ids:
                for f_id in classifier.vec_index_by_feature_id:
                    # NOTE: Not 100% safe check, could match by accident
                    if sf_id in f_id:
                        vec_idxs_by_feat_id[sf_id].add(
                                classifier.vec_index_by_feature_id[f_id])

            # Which ones never fired?
            i = 0
            for i, sf_id in enumerate((id for id in sf_ids
                if id not in vec_idxs_by_feat_id), start=1):
                print sf_id, 'never fired'
            else:
                print '{} SimString feature(s) never fired'.format(i)

            res_dic = defaultdict(lambda: defaultdict(lambda: '-'))

            # Iteratively find the best candidate
            to_evaluate = set((f_id for f_id in vec_idxs_by_feat_id))
            removed = set()
            iteration = 1
            last_macro_score = None
            while to_evaluate:
                print 'Iteration:', iteration

                print 'Censoring vectors...',
                # Censor everything we have removed so far
                idxs_to_censor = set(i for i in chain(
                    *(vec_idxs_by_feat_id[f_id] for f_id in removed)))
                train_vecs = [d for d in _censor_sparse_vectors_gen(
                    train_vecs, idxs_to_censor)]

                train_data = set(izip(train_lbls, train_vecs))

                train_folds = []
                for fold in folds:
                    f_lbls = (l for l, _ in fold)
                    f_vecs = (d for d in _censor_sparse_vectors_gen(
                        (v for _, v in fold), idxs_to_censor))
                    train_folds.append(set(izip(f_lbls, f_vecs)))
                print 'Done!'
                
                print 'Training and evaluating a model of our current state...',
                classifier._liblinear_train(train_lbls, train_vecs)
                print 'Done!'

                test_censored_vecs = [d for d in _censor_sparse_vectors_gen(
                    test_vecs, idxs_to_censor)]
                curr_macro_score = score_classifier_by_tup(classifier,
                        (test_lbls, test_censored_vecs))[0]

                print 'Current state on test is: {}'.format(curr_macro_score)
                if last_macro_score is not None:
                    print 'Last state was: {} (diff: {})'.format(last_macro_score,
                        curr_macro_score - last_macro_score)
                last_macro_score = curr_macro_score

                # Prepare to go parallel
                f_args = ((f_id, classifier, train_data, train_folds,
                    to_censor) for f_id, to_censor
                    in vec_idxs_by_feat_id.iteritems() if f_id in to_evaluate)
                # Also cram in our non-censored baseline (no knockout)
                f_args = chain(((None, classifier, train_data, train_folds,
                    set()), ), f_args)

                score_by_knockout = {}
                print 'Evaluating knockouts ({} in total)'.format(
                        len(to_evaluate) + 1)
                # TODO: A bit redundant, prettify!
                if worker_pool is not None:
                    i = 1
                    for f_id, mean in worker_pool.imap_unordered(
                            __knockout_pass, f_args):
                        score_by_knockout[f_id] = mean
                        print 'it: {} k: {} res: {} {}'.format(
                                iteration, i, f_id, mean)
                        i += 1
                else:
                    for i, args in enumerate(f_args, start=1):
                        f_id, mean = _knockout_pass(*args)
                        score_by_knockout[f_id] = mean
                        print 'it: {} k: {} res: {} {}'.format(
                                iteration, i, f_id, mean)

                # Set the result dictionary
                for f_id, mean in score_by_knockout.iteritems():
                    res_dic[str(iteration)][f_id] = mean
                # And write the results incrementally for each round
                with open(join_path(outdir, 'descent_{}_{}.md'.format(
                    classifier_name, dataset_name)), 'w') as md_file:
                    from md import dict_to_table
                    md_file.write(dict_to_table(res_dic, total=False, perc=False))
                    md_file.write('\n')
                
                # Find the best scoring one...
                scores = [(s, f_id)
                        for f_id, s in score_by_knockout.iteritems()]
                scores.sort()
                scores.reverse()

                best_score, best_f_id = scores[0]

                print 'Round winner: {} with {}'.format(best_f_id, best_score)

                if best_f_id is None:
                    # We are done, no removal gave a better score
                    break

                removed.add(best_f_id)
                to_evaluate.remove(best_f_id)
                
                iteration += 1

            if removed:
                # TODO: Could do more metrics here?

                print 'Training and evaluating a model of our previous state...',
                classifier._liblinear_train(train_lbls, train_uncensored_vecs)
                before_macro_score = score_classifier_by_tup(classifier,
                        (test_lbls, test_vecs))[0]
                print 'Done!'

                print 'Training and evaluating a model of our current state...',
                train_censored_vecs = [d for d in _censor_sparse_vectors_gen(train_vecs,
                    set(i for i in chain(*(vec_idxs_by_feat_id[f_id] for f_id in removed))))]
                classifier._liblinear_train(train_lbls, train_censored_vecs)
                print 'Done!'

                test_censored_vecs = [d for d in _censor_sparse_vectors_gen(test_vecs,
                    set(i for i in chain(*(vec_idxs_by_feat_id[f_id] for f_id in removed))))]
                after_macro_score = score_classifier_by_tup(classifier,
                        (test_lbls, test_censored_vecs))[0]

                res_str = 'Before: {} After: {}'.format(before_macro_score,
                        after_macro_score)
                print res_str
                print 'Happy?'
            else:
                res_str = 'Unable to remove any lexical resource to make improvements...'
                print res_str

            # Ugly but saves the final result safely
            with open(join_path(outdir, 'descent_{}_{}.txt'.format(
                classifier_name, dataset_name)), 'w') as res_file:
                res_file.write(res_str)
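The while-loop in this example is a greedy backward elimination ("knockout") over groups of SimString feature indices: each round removes the feature group whose knockout scores best, until no removal beats keeping the current state. A compact, self-contained sketch of that descent idea with a toy scoring function and a simplified strict-improvement stop criterion (the names knockout_descent and toy_score are illustrative, not from the original code):

# Illustrative sketch of the greedy knockout descent, not the original code:
# repeatedly remove the feature group whose removal yields the best score,
# stopping when no single removal improves on the current state.
def knockout_descent(feature_groups, score):
    removed = set()
    remaining = set(feature_groups)
    best = score(removed)
    while remaining:
        # Score removing each remaining group on top of what is already removed.
        round_scores = dict((g, score(removed | set([g]))) for g in remaining)
        winner = max(round_scores, key=round_scores.get)
        if round_scores[winner] <= best:
            break  # no knockout beats the current state
        removed.add(winner)
        remaining.remove(winner)
        best = round_scores[winner]
    return removed, best

# Toy score: removing either of two 'harmful' groups gains 0.02 macro score.
def toy_score(removed):
    return 0.70 + 0.02 * len(removed & set(['TSURUOKA', 'GAZETTEER']))

# knockout_descent(['SIMSTRING', 'TSURUOKA', 'GAZETTEER'], toy_score)
# -> ({'TSURUOKA', 'GAZETTEER'}, 0.74)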