Example #1
def _knockout_pass(f_id, classifier, train_data, folds, to_censor):
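    """One knockout pass: cross-validate the classifier over the given folds
    with the vector indices in to_censor removed from every sparse vector,
    and return (f_id, mean macro score across the folds)."""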
    macro_scores = []
    for fold_num, fold in enumerate(folds, start=1):
        train_set = train_data - fold
        test_set = fold

        assert len(train_set) + len(test_set) == len(train_data)

        train_vecs = [d for d in _censor_sparse_vectors_gen(
            (v for _, v in train_set), to_censor)]
        train_lbls = [l for l, _ in train_set]

        classifier._liblinear_train(train_lbls, train_vecs)

        test_vecs = [d for d in _censor_sparse_vectors_gen(
            (v for _, v in test_set), to_censor)]
        test_lbls = (l for l, _ in test_set)
        res_tup = score_classifier_by_tup(classifier, (test_lbls, test_vecs))
        macro_scores.append(res_tup[0])

    # Use a separate name so we do not shadow the mean() helper
    mean_score = mean(macro_scores)

    return f_id, mean_score
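Note that the example assumes a _censor_sparse_vectors_gen helper and a mean helper defined elsewhere in the module. A minimal sketch of the generator, inferred only from how it is called here and in Example #2 (not taken from the original code), could look like this:

def _censor_sparse_vectors_gen(vecs, to_censor):
    # Yield a copy of each sparse vector (a dict keyed by vector index) with
    # the censored indices dropped; type(vec)(...) preserves the concrete dict
    # subclass, since Example #2 later puts censored vectors into sets and so
    # needs them to stay hashable
    for vec in vecs:
        yield type(vec)((idx, val) for idx, val in vec.iteritems()
                if idx not in to_censor)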
Example #2
def _lexical_descent(classifiers, datasets, outdir, verbose=False,
        worker_pool=None, no_simstring_cache=False, use_test_set=False):
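    """Greedy backward elimination over lexical (SimString) feature groups.

    For each classifier/dataset pair, repeatedly knock out the feature group
    whose removal yields the best mean cross-validation macro score, stop once
    no removal improves on keeping the remaining features, and write
    per-iteration result tables and a final before/after summary to outdir."""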
    # Check that we can in fact do a lexical descent for the classifier
    for classifier_name in classifiers:
        assert ('SIMSTRING' in classifier_name
                or 'TSURUOKA' in classifier_name
                or 'GAZETTER' in classifier_name)

    for classifier_name, classifier_class in classifiers.iteritems():
        print 'Classifier:', classifier_name
        classifier = classifier_class()

        for dataset_name, dataset_getter in datasets.iteritems():
            print 'Dataset:', dataset_name
            if verbose:
                print >> stderr, 'Reading data...',

            train_set, dev_set, test_set = dataset_getter()
            if use_test_set:
                train, test = list(chain(train_set, dev_set)), list(test_set)
            else:
                train, test = list(train_set), list(dev_set)
            del train_set, dev_set, test_set

            if verbose:
                print >> stderr, 'Done!'

            if not no_simstring_cache:
                simstring_caching((classifier_name, ),
                    (train, test, ), verbose=verbose)

            train_lbls, train_vecs = classifier._gen_lbls_vecs(train)
            test_lbls, test_vecs = classifier._gen_lbls_vecs(test)
            train_vecs = [hashabledict(d) for d in train_vecs]
            test_vecs = [hashabledict(d) for d in test_vecs]
            train_uncensored_vecs = deepcopy(train_vecs)

            # Generate the folds for all iterations
            folds = [f for f in _k_folds(5,
                set(izip(train_lbls, train_vecs)))] #XXX: Constant

            # XXX: This is an ugly hack and bound to break:
            # Locate which vector IDs are used by SimString features, and by
            # which feature
            from classifier.simstring.features import SIMSTRING_FEATURES
            sf_ids = [f().get_id() for f in SIMSTRING_FEATURES]

            vec_idxs_by_feat_id = defaultdict(set)
            for sf_id in sf_ids:
                for f_id in classifier.vec_index_by_feature_id:
                    # NOTE: Not 100% safe check, could match by accident
                    if sf_id in f_id:
                        vec_idxs_by_feat_id[sf_id].add(
                                classifier.vec_index_by_feature_id[f_id])

            # Which ones never fired?
            never_fired = [sf_id for sf_id in sf_ids
                    if sf_id not in vec_idxs_by_feat_id]
            for sf_id in never_fired:
                print sf_id, 'never fired'
            print '{} SimString feature(s) never fired'.format(len(never_fired))

            res_dic = defaultdict(lambda : defaultdict(lambda : '-'))

            # Iteratively find the best candidate
            to_evaluate = set((f_id for f_id in vec_idxs_by_feat_id))
            removed = set()
            iteration = 1
            last_macro_score = None
            while to_evaluate:
                print 'Iteration:', iteration

                print 'Censoring vectors...',
                # Censor everything we have removed so far
                idxs_to_censor = set(i for i in chain(
                    *(vec_idxs_by_feat_id[f_id] for f_id in removed)))
                train_vecs = [d for d in _censor_sparse_vectors_gen(
                    train_vecs, idxs_to_censor)]

                train_data = set(izip(train_lbls, train_vecs))

                train_folds = []
                for fold in folds:
                    f_lbls = (l for l, _ in fold)
                    f_vecs = (d for d in _censor_sparse_vectors_gen(
                        (v for _, v in fold), idxs_to_censor))
                    train_folds.append(set(izip(f_lbls, f_vecs)))
                print 'Done!'
                
                print 'Training and evaluating a model of our current state...',
                classifier._liblinear_train(train_lbls, train_vecs)
                print 'Done!'

                test_censored_vecs = [d for d in _censor_sparse_vectors_gen(
                    test_vecs, idxs_to_censor)]
                curr_macro_score = score_classifier_by_tup(classifier,
                        (test_lbls, test_censored_vecs))[0]

                print 'Current state on test is: {}'.format(curr_macro_score)
                if last_macro_score is not None:
                    print 'Last state was: {} (diff: {})'.format(last_macro_score,
                        curr_macro_score - last_macro_score)
                last_macro_score = curr_macro_score

                # Prepare to go parallel
                f_args = ((f_id, classifier, train_data, train_folds,
                    to_censor) for f_id, to_censor
                    in vec_idxs_by_feat_id.iteritems() if f_id in to_evaluate)
                # Also cram in our non-censored one in there
                f_args = chain(((None, classifier, train_data, train_folds,
                    set()), ), f_args)

                score_by_knockout = {}
                print 'Evaluating knockouts ({} in total)'.format(
                        len(to_evaluate) + 1)
                # TODO: A bit redundant, prettify!
                if worker_pool is not None:
                    i = 1
                    for f_id, mean in worker_pool.imap_unordered(
                            __knockout_pass, f_args):
                        score_by_knockout[f_id] = mean
                        print 'it: {} k: {} res: {} {}'.format(
                                iteration, i, f_id, mean)
                        i += 1
                else:
                    for i, args in enumerate(f_args, start=1):
                        f_id, mean = _knockout_pass(*args)
                        score_by_knockout[f_id] = mean
                        print 'it: {} k: {} res: {} {}'.format(
                                iteration, i, f_id, mean)

                # Set the result dictionary
                for f_id, mean in score_by_knockout.iteritems():
                    res_dic[str(iteration)][f_id] = mean
                # And write the results incrementally for each round
                with open(join_path(outdir, 'descent_{}_{}.md'.format(
                    classifier_name, dataset_name)), 'w') as md_file:
                    from md import dict_to_table
                    md_file.write(dict_to_table(res_dic, total=False, perc=False))
                    md_file.write('\n')
                
                # Find the best scoring one...
                scores = [(s, f_id)
                        for f_id, s in score_by_knockout.iteritems()]
                scores.sort()
                scores.reverse()

                best_score, best_f_id = scores[0]

                print 'Round winner: {} with {}'.format(best_f_id, best_score)

                if best_f_id is None:
                    # We are done, no removal gave a better score
                    break

                removed.add(best_f_id)
                to_evaluate.remove(best_f_id)
                
                iteration += 1

            if removed:
                # TODO: Could do more metrics here?

                print 'Training and evaluating a model of our previous state...',
                classifier._liblinear_train(train_lbls, train_uncensored_vecs)
                before_macro_score = score_classifier_by_tup(classifier,
                        (test_lbls, test_vecs))[0]
                print 'Done!'

                print 'Training and evaluating a model of our current state...',
                train_censored_vecs = [d for d in _censor_sparse_vectors_gen(train_vecs,
                    set(i for i in chain(*(vec_idxs_by_feat_id[f_id] for f_id in removed))))]
                classifier._liblinear_train(train_lbls, train_censored_vecs)
                print 'Done!'

                test_censored_vecs = [d for d in _censor_sparse_vectors_gen(test_vecs,
                    set(i for i in chain(*(vec_idxs_by_feat_id[f_id] for f_id in removed))))]
                after_macro_score = score_classifier_by_tup(classifier,
                        (test_lbls, test_censored_vecs))[0]

                res_str = 'Before: {} After: {}'.format(before_macro_score,
                        after_macro_score)
                print res_str
                print 'Happy?'
            else:
                res_str = 'Unable to remove any lexical resource to make improvements...'
                print res_str

            # Ugly but saves the final result safely
            with open(join_path(outdir, 'descent_{}_{}.txt'.format(
                classifier_name, dataset_name)), 'w') as res_file:
                res_file.write(res_str)
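When a worker_pool is supplied, imap_unordered hands each element of f_args to its callable as a single argument, so the example relies on a module-level __knockout_pass wrapper that is not shown above. A minimal sketch under that assumption:

def __knockout_pass(args):
    # multiprocessing's imap_unordered passes each item of f_args as one
    # argument; unpack the tuple and delegate to _knockout_pass from Example #1
    return _knockout_pass(*args)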