Example #1
def _ranking(classifiers, datasets, outdir, verbose=False, worker_pool=None,
        no_simstring_cache=False, use_test_set=False):
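    # For every (data set, classifier) pair: train on the training documents
    # (train+dev when use_test_set is set), score ranked predictions on the
    # held-out documents and write a MEAN/MEDIAN/DISC_MEAN/AMBIGUITY/LOST
    # summary to ranked_<classifier>_<dataset>.txt in outdir.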

    if worker_pool is not None:
        raise NotImplementedError

    for dataset_id, dataset_getter in datasets.iteritems():
        if verbose:
            print >> stderr, 'Data set:', dataset_id

        if verbose:
            print >> stderr, 'Caching data set...',

        train_set, dev_set, test_set = dataset_getter()
        if use_test_set:
            train, dev = list(chain(train_set, dev_set)), list(test_set)
        else:
            train, dev = list(train_set), list(dev_set)
        del train_set, dev_set, test_set

        if verbose:
            print >> stderr, 'Done!'

        if not no_simstring_cache:
            simstring_caching(classifiers, (train, dev), verbose=verbose)

        for classifier_id, classifier_class in classifiers.iteritems():
            if verbose:
                print >> stderr, 'Classifier:', classifier_id

            classifier = classifier_class()

            classifier.train(train)

            test_lbls, test_vecs = classifier._gen_lbls_vecs(dev)

            
            res_tup = score_classifier_by_tup_ranked(classifier,
                    (test_lbls, test_vecs), conf_threshold=0.995)
            mean, median, truncated_mean, avg_ambiguity, lost_by_threshold = res_tup

            res_str = ('Results: '
                    '{0:.3f}/'
                    '{1:.3f}/'
                    '{2:.3f}/'
                    '{3:.3f}/'
                    '{4:.3f} '
                    '(MEAN/MEDIAN/DISC_MEAN/AMBIGUITY/LOST)'
                    ).format(mean, median, truncated_mean, avg_ambiguity, lost_by_threshold)

            if verbose:
                print res_str

            with open(join_path(outdir, 'ranked_{}_{}.txt'.format(
                classifier_id, dataset_id)), 'w') as out_file:

                out_file.write(res_str)
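_ranking, like the other drivers in these examples, expects classifiers to map an id string to a classifier class and datasets to map an id string to a zero-argument getter returning the (train, dev, test) splits. A minimal invocation sketch follows; MyClassifier, get_my_dataset and the id strings are hypothetical placeholders, not names from the original module.

classifiers = {'MY-CLASSIFIER': MyClassifier}
datasets = {'MY-CORPUS': get_my_dataset}  # getter() -> (train, dev, test)
_ranking(classifiers, datasets, 'out', verbose=True)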
Example #2
def _quick_test(classifiers, datasets, outdir, verbose=False, worker_pool=None,
        no_simstring_cache=False, use_test_set=False):
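    # Train every classifier on every data set, score it on the held-out
    # documents (macro/micro scores plus TP/FN sums) and pickle the nested
    # {dataset_id: {classifier_id: score}} dictionary into outdir.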
    
    if worker_pool is not None:
        raise NotImplementedError

    results_file_path = _get_quick_pickle_path(outdir)
    results_by_dataset = {}

    for dataset_id, dataset_getter in datasets.iteritems():
        if verbose:
            print >> stderr, 'Data set:', dataset_id

        if verbose:
            print >> stderr, 'Caching data set...',
        train, dev, test = dataset_getter()
        if use_test_set:
            train, dev = list(chain(train, dev)), list(test)
        else:
            train, dev = list(train), list(dev)
        if verbose:
            print >> stderr, 'Done!'
      
        if not no_simstring_cache:
            simstring_caching(classifiers, (train, dev), verbose=verbose)
    
        # Collect the seen types (note: not used further in this quick test)
        seen_types = set()
        results_by_classifier = {}

        for classifier_id, classifier_class in classifiers.iteritems():
            if verbose:
                print >> stderr, 'Classifier:', classifier_id
            
            classifier = classifier_class()

            classifier.train(train)

            score = score_classifier(classifier, dev)
            results_by_classifier[classifier_id] = score
            macro_score, micro_score, tp_sum, fn_sum, _ = score
            
            if verbose:
                res_str = ('Results: '
                        '{0:.3f}/'
                        '{1:.3f}/'
                        '{2}/{3} (MACRO/MICRO/TP/FN)'
                        ).format(macro_score, micro_score, tp_sum, fn_sum)
                print res_str

        results_by_dataset[dataset_id] = results_by_classifier

    with open(results_file_path, 'wb') as results_file:
        pickle_dump(results_by_dataset, results_file)

    if verbose:
        print >> stderr, 'Results written to:', results_file_path
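Since _quick_test pickles its nested results dictionary, the scores can be read back later. A sketch, assuming the hypothetical ids from the earlier sketch and the standard pickle module (the original only shows the aliased pickle_dump):

import pickle

with open(_get_quick_pickle_path('out'), 'rb') as results_file:
    results_by_dataset = pickle.load(results_file)

macro, micro, tp, fn, _ = results_by_dataset['MY-CORPUS']['MY-CLASSIFIER']
print 'macro: {0:.3f} micro: {1:.3f} tp: {2} fn: {3}'.format(macro, micro, tp, fn)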
Example #3
def _cache(datasets, verbose=False):
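    # Read every data set and warm the SimString cache over all of its splits.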
    dataset_getters = []
    for dataset_name, dataset_getter in datasets.iteritems():
        if verbose:
            print >> stderr, 'Reading data for {}...'.format(dataset_name),
        # The getter returns the train/dev/test splits; collect them all
        dataset_getters.extend(dataset_getter())
        if verbose:
            print >> stderr, 'Done!'

    # Hijack simstring_caching by making it believe we are a SimString classifier
    simstring_caching(('SIMSTRING', ),
        dataset_getters, verbose=verbose)
Example #4
def _lexical_descent(classifiers, datasets, outdir, verbose=False,
        worker_pool=None, no_simstring_cache=False, use_test_set=False):
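    # Greedy backwards elimination over the lexical (SimString-backed)
    # features: in each iteration, knock out the single feature whose removal
    # scores best under cross-validation on the training folds, stop as soon
    # as no removal beats the current state, and write per-iteration tables
    # plus a final before/after summary to outdir.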
    # Check that we can in fact do a lexical descent for the classifier
    for classifier_name in classifiers:
        assert ('SIMSTRING' in classifier_name
                or 'TSURUOKA' in classifier_name
                or 'GAZETTER' in classifier_name)

    for classifier_name, classifier_class in classifiers.iteritems():
        print 'Classifier:', classifier_name
        classifier = classifier_class()

        for dataset_name, dataset_getter in datasets.iteritems():
            print 'Dataset:', dataset_name
            if verbose:
                print >> stderr, 'Reading data...',

            train_set, dev_set, test_set = dataset_getter()
            if use_test_set:
                train, test = list(chain(train_set, dev_set)), list(test_set)
            else:
                train, test = list(train_set), list(dev_set)
            del train_set, dev_set, test_set

            if verbose:
                print >> stderr, 'Done!'

            if not no_simstring_cache:
                simstring_caching((classifier_name, ),
                    (train, test, ), verbose=verbose)


            train_lbls, train_vecs = classifier._gen_lbls_vecs(train)
            test_lbls, test_vecs = classifier._gen_lbls_vecs(test)
            train_vecs = [hashabledict(d) for d in train_vecs]
            test_vecs = [hashabledict(d) for d in test_vecs]
            train_uncensored_vecs = deepcopy(train_vecs)

            # Generate the folds for all iterations
            folds = [f for f in _k_folds(5,
                set(izip(train_lbls, train_vecs)))] #XXX: Constant

            # XXX: This is an ugly hack and bound to break:
            # Locate which vector indices are used by SimString features,
            # and by which feature
            from classifier.simstring.features import SIMSTRING_FEATURES
            sf_ids = [f().get_id() for f in SIMSTRING_FEATURES]

            vec_idxs_by_feat_id = defaultdict(set)
            for sf_id in sf_ids:
                for f_id in classifier.vec_index_by_feature_id:
                    # NOTE: Not 100% safe check, could match by accident
                    if sf_id in f_id:
                        vec_idxs_by_feat_id[sf_id].add(
                                classifier.vec_index_by_feature_id[f_id])

            # Which ones never fired?
            i = 0
            for i, sf_id in enumerate((fid for fid in sf_ids
                if fid not in vec_idxs_by_feat_id), start=1):
                print sf_id, 'never fired'
            print '{} SimString feature(s) never fired'.format(i)

            res_dic = defaultdict(lambda : defaultdict(lambda : '-'))

            # Iteratively find the best candidate
            to_evaluate = set((f_id for f_id in vec_idxs_by_feat_id))
            removed = set()
            iteration = 1
            last_macro_score = None
            while to_evaluate:
                print 'Iteration:', iteration

                print 'Censoring vectors...',
                # Censor everything we have removed so far
                idxs_to_censor = set(i for i in chain(
                    *(vec_idxs_by_feat_id[f_id] for f_id in removed)))
                train_vecs = [d for d in _censor_sparse_vectors_gen(
                    train_vecs, idxs_to_censor)]

                train_data = set(izip(train_lbls, train_vecs))

                train_folds = []
                for fold in folds:
                    f_lbls = (l for l, _ in fold)
                    f_vecs = (d for d in _censor_sparse_vectors_gen(
                        (v for _, v in fold), idxs_to_censor))
                    train_folds.append(set(izip(f_lbls, f_vecs)))
                print 'Done!'
                
                print 'Training and evaluating a model of our current state...',
                classifier._liblinear_train(train_lbls, train_vecs)
                print 'Done!'

                test_censored_vecs = [d for d in _censor_sparse_vectors_gen(
                    test_vecs, idxs_to_censor)]
                curr_macro_score = score_classifier_by_tup(classifier,
                        (test_lbls, test_censored_vecs))[0]

                print 'Current state on test is: {}'.format(curr_macro_score)
                if last_macro_score is not None:
                    print 'Last state was: {} (diff: {})'.format(last_macro_score,
                        curr_macro_score - last_macro_score)
                last_macro_score = curr_macro_score

                # Prepare to go parallel
                f_args = ((f_id, classifier, train_data, train_folds,
                    to_censor) for f_id, to_censor
                    in vec_idxs_by_feat_id.iteritems() if f_id in to_evaluate)
                # Also cram in our non-censored baseline
                f_args = chain(((None, classifier, train_data, train_folds,
                    set()), ), f_args)

                score_by_knockout = {}
                print 'Evaluating knockouts ({} in total)'.format(
                        len(to_evaluate) + 1)
                # TODO: A bit redundant, prettify!
                if worker_pool is not None:
                    i = 1
                    for f_id, mean in worker_pool.imap_unordered(
                            __knockout_pass, f_args):
                        score_by_knockout[f_id] = mean
                        print 'it: {} k: {} res: {} {}'.format(
                                iteration, i, f_id, mean)
                        i += 1
                else:
                    for i, args in enumerate(f_args, start=1):
                        f_id, mean = _knockout_pass(*args)
                        score_by_knockout[f_id] = mean
                        print 'it: {} k: {} res: {} {}'.format(
                                iteration, i, f_id, mean)

                # Set the result dictionary
                for f_id, mean in score_by_knockout.iteritems():
                    res_dic[str(iteration)][f_id] = mean
                # And write the results incrementally for each round
                with open(join_path(outdir, 'descent_{}_{}.md'.format(
                    classifier_name, dataset_name)), 'w') as md_file:
                    from md import dict_to_table
                    md_file.write(dict_to_table(res_dic, total=False, perc=False))
                    md_file.write('\n')
                
                # Find the best scoring one...
                scores = [(s, f_id)
                        for f_id, s in score_by_knockout.iteritems()]
                scores.sort()
                scores.reverse()

                best_score, best_f_id = scores[0]

                print 'Round winner: {} with {}'.format(best_f_id, best_score)

                if best_f_id is None:
                    # We are done, no removal gave a better score
                    break

                removed.add(best_f_id)
                to_evaluate.remove(best_f_id)
                
                iteration += 1

            if removed:
                # TODO: Could do more metrics here?

                print 'Training and evaluating a model of our previous state...',
                classifier._liblinear_train(train_lbls, train_uncensored_vecs)
                before_macro_score = score_classifier_by_tup(classifier,
                        (test_lbls, test_vecs))[0]
                print 'Done!'

                print 'Training and evaluating a model of our current state...',
                train_censored_vecs = [d for d in _censor_sparse_vectors_gen(train_vecs,
                    set(i for i in chain(*(vec_idxs_by_feat_id[f_id] for f_id in removed))))]
                classifier._liblinear_train(train_lbls, train_censored_vecs)
                print 'Done!'

                test_censored_vecs = [d for d in _censor_sparse_vectors_gen(test_vecs,
                    set(i for i in chain(*(vec_idxs_by_feat_id[f_id] for f_id in removed))))]
                after_macro_score = score_classifier_by_tup(classifier,
                        (test_lbls, test_censored_vecs))[0]

                res_str = 'Before: {} After: {}'.format(before_macro_score,
                        after_macro_score)
                print res_str
                print 'Happy?'
            else:
                res_str = 'Removing any single lexical resource gave no improvement...'
                print res_str

            # Ugly but saves the final result safely
            with open(join_path(outdir, 'descent_{}_{}.txt'.format(
                classifier_name, dataset_name)), 'w') as res_file:
                res_file.write(res_str)
Example #5
def _confusion_matrix_test(classifiers, datasets, outdir,
        verbose=False, no_simstring_cache=False, worker_pool=None):
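    # Build a confusion matrix (gold type vs. predicted type) over the
    # development set for every (data set, classifier) pair and dump the
    # row-normalised tables to data.js for the accompanying index.html viewer.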
    results_by_dataset = {}

    if worker_pool is not None:
        raise NotImplementedError

    for dataset_id, dataset_getter in datasets.iteritems():
        if verbose:
            print >> stderr, 'Data set:', dataset_id

        if verbose:
            print >> stderr, 'Caching data set...',
        train, dev, _ = dataset_getter()
        train, dev = list(train), list(dev)
        if verbose:
            print >> stderr, 'Done!'

        if not no_simstring_cache:
            simstring_caching(classifiers, (train, dev), verbose=verbose)

        # Collect the seen types to iterate over later
        seen_types = set()
        results_by_classifier = {}

        for classifier_id, classifier_class in classifiers.iteritems():
            if verbose:
                print >> stderr, 'Classifier:', classifier_id

            classifier = classifier_class()

            if verbose:
                print >> stderr, 'Training...',
            classifier.train(train)
            if verbose:
                print >> stderr, 'Done!'

            confusion_matrix = defaultdict(lambda : defaultdict(int))
            if verbose:
                print >> stderr, 'Evaluating...',
            for document in dev:
                for sentence in document:
                    for annotation in sentence:
                        predicted = classifier.classify(document,
                                sentence, annotation)

                        #if predicted != annotation.type:
                        confusion_matrix[annotation.type][predicted] += 1

                        seen_types.add(annotation.type)
            if verbose:
                print >> stderr, 'Done!'

            results_by_classifier[classifier_id] = confusion_matrix

        # Ensure every row has an explicit 0 entry for each seen type (a
        # no-op in practice, since the inner defaultdict already yields 0
        # for missing keys)
        for _, confusion_matrix in results_by_classifier.iteritems():
            for from_type in confusion_matrix:
                row = confusion_matrix[from_type]
                for type in seen_types:
                    if type not in row:
                        row[type] = 0

        results_by_dataset[dataset_id] = results_by_classifier

    json_dic = defaultdict(lambda : defaultdict(dict))
    for dataset_id, results_by_classifier in results_by_dataset.iteritems():
        for classifier_id, confusion_matrix in results_by_classifier.iteritems():
            # Get the column names and order
            col_names = [k for k in confusion_matrix]
            col_names.sort()

            max_by_col = defaultdict(int)
            min_by_col = defaultdict(lambda : 2**32)

            table_data = []
            for row_name in col_names:
                row_data = confusion_matrix[row_name]

                # Not yet normalised
                raw_row = [row_data[k] for k in col_names]
                row_sum = float(sum(raw_row))

                table_data.append([e / row_sum for e in raw_row])

                for k in col_names:
                    max_by_col[k] = max(row_data[k], max_by_col[k])
                    min_by_col[k] = min(row_data[k], min_by_col[k])

            col_ranges = [None for _ in col_names]
            col_frmts = ['%.2f' for _ in col_names]

            # Now prepend a name column to the formats/ranges and each row
            col_ranges.insert(0, None)
            col_frmts.insert(0, '%s')

            for row_name, row in izip(col_names, table_data):
                row.insert(0, row_name)

            col_names.insert(0, 'From \\ To')

            table_ftr = [-1 for _ in col_names]

            # NAMES, FRMT, RANGES, TABLE_DATA, TABLE_FTR
            json_dic[classifier_id][dataset_id] = (
                    col_names,
                    col_frmts,
                    col_ranges,
                    table_data,
                    table_ftr,
                    )

            # XXX: Will overwrite!
    with open(join_path(outdir, 'data.js'), 'w') as data_file:
        data_file.write('data = ')
        data_file.write(json_dumps(json_dic, encoding='utf-8',
            indent=2, sort_keys=True))

        if verbose:
            print >> stderr, 'Output: file://{0}'.format(
                    abspath(join_path(outdir, 'index.html')))