def _knockout_pass(f_id, classifier, train_data, folds, to_censor):
    """Cross-validate the classifier with one feature's vector indices censored.

    Trains and scores the classifier once per fold, censoring the vector
    indices in `to_censor` from both the training and test vectors, and
    returns the mean macro score across folds.

    f_id -- identifier of the knocked-out feature; passed through unchanged so
            parallel callers can match a result to its feature (None denotes
            the "censor nothing" baseline pass)
    classifier -- object exposing _liblinear_train(lbls, vecs)
    train_data -- set of (label, vector) pairs; every fold is a subset of it
    folds -- iterable of sets of (label, vector) pairs partitioning train_data
    to_censor -- set of vector indices to censor before training/testing

    Returns (f_id, mean macro score over the folds).
    """
    macro_scores = []
    for fold_num, fold in enumerate(folds, start=1):
        train_set = train_data - fold
        test_set = fold
        # Sanity check: the fold must be wholly contained in train_data
        assert len(train_set) + len(test_set) == len(train_data)

        train_vecs = [d for d in _censor_sparse_vectors_gen(
            (v for _, v in train_set), to_censor)]
        train_lbls = [l for l, _ in train_set]

        classifier._liblinear_train(train_lbls, train_vecs)

        test_vecs = [d for d in _censor_sparse_vectors_gen(
            (v for _, v in test_set), to_censor)]
        test_lbls = (l for l, _ in test_set)

        res_tup = score_classifier_by_tup(classifier, (test_lbls, test_vecs))
        # res_tup[0] is the macro score component of the result tuple
        macro_scores.append(res_tup[0])

    # BUG FIX: the original did `mean = mean(macro_scores)`, which makes
    # `mean` function-local at compile time and therefore raises
    # UnboundLocalError on the call; bind the result to a distinct name.
    macro_mean = mean(macro_scores)
    return f_id, macro_mean
def _lexical_descent(classifiers, datasets, outdir, verbose=False,
        worker_pool=None, no_simstring_cache=False, use_test_set=False):
    """Greedy knockout descent over lexical (SimString) features.

    For each classifier/dataset pair, repeatedly cross-validates with each
    remaining SimString feature censored, removes the feature whose removal
    scores best, and stops when censoring nothing wins the round.  Writes
    per-round scores to descent_<classifier>_<dataset>.md and the final
    before/after summary to descent_<classifier>_<dataset>.txt in `outdir`.
    """
    # Check that we can in fact do a lexical descent for the classifier
    for classifier_name in classifiers:
        assert ('SIMSTRING' in classifier_name
                or 'TSURUOKA' in classifier_name
                or 'GAZETTER' in classifier_name)

    for classifier_name, classifier_class in classifiers.iteritems():
        print 'Classifier:', classifier_name
        classifier = classifier_class()

        for dataset_name, dataset_getter in datasets.iteritems():
            print 'Dataset:', dataset_name

            if verbose:
                print >> stderr, 'Reading data...',
            train_set, dev_set, test_set = dataset_getter()
            if use_test_set:
                # Final evaluation: fold dev into training, score on test
                train, test = list(chain(train_set, dev_set)), list(test_set)
            else:
                # Development mode: hold the test set out entirely
                train, test = list(train_set), list(dev_set)
            # Free the raw splits as early as possible
            del train_set, dev_set, test_set
            if verbose:
                print >> stderr, 'Done!'

            if not no_simstring_cache:
                simstring_caching((classifier_name, ), (train, test, ),
                        verbose=verbose)

            train_lbls, train_vecs = classifier._gen_lbls_vecs(train)
            test_lbls, test_vecs = classifier._gen_lbls_vecs(test)
            # Vectors must be hashable so (label, vector) pairs can live in
            # sets (used for fold arithmetic below)
            train_vecs = [hashabledict(d) for d in train_vecs]
            test_vecs = [hashabledict(d) for d in test_vecs]
            # Keep an uncensored copy for the final before/after comparison;
            # train_vecs itself is destructively censored each iteration
            train_uncensored_vecs = deepcopy(train_vecs)

            # Generate the folds for all iterations
            folds = [f for f in _k_folds(5,
                    set(izip(train_lbls, train_vecs)))] #XXX: Constant

            # XXX: This is an ugly hack and bound to break:
            # Locate which vector ID;s that are used by SimString features and
            # by which feature
            from classifier.simstring.features import SIMSTRING_FEATURES
            sf_ids = [f().get_id() for f in SIMSTRING_FEATURES]

            vec_idxs_by_feat_id = defaultdict(set)
            for sf_id in sf_ids:
                for f_id in classifier.vec_index_by_feature_id:
                    # NOTE: Not 100% safe check, could match by accident
                    if sf_id in f_id:
                        vec_idxs_by_feat_id[sf_id].add(
                                classifier.vec_index_by_feature_id[f_id])

            # Which ones never fired?
            # i stays 0 if the generator below yields nothing
            i = 0
            for i, sf_id in enumerate((id for id in sf_ids
                    if id not in vec_idxs_by_feat_id), start=1):
                print sf_id, 'never fired'
            else:
                # for/else runs unconditionally here (no break above); i is
                # the count of never-fired features
                print '{} SimString feature(s) never fired'.format(i)

            # res_dic[iteration][feature_id] -> mean score ('-' if missing)
            res_dic = defaultdict(lambda : defaultdict(lambda : '-'))

            # Iteratively find the best candidate
            to_evaluate = set((f_id for f_id in vec_idxs_by_feat_id))
            removed = set()
            iteration = 1
            last_macro_score = None
            while to_evaluate:
                print 'Iteration:', iteration

                print 'Censoring vectors...',
                # Censor everything we have removed so far
                idxs_to_censor = set(i for i in chain(
                    *(vec_idxs_by_feat_id[f_id] for f_id in removed)))
                train_vecs = [d for d in _censor_sparse_vectors_gen(
                    train_vecs, idxs_to_censor)]
                train_data = set(izip(train_lbls, train_vecs))
                # Re-censor the folds the same way so fold arithmetic in the
                # knockout passes stays consistent with train_data
                train_folds = []
                for fold in folds:
                    f_lbls = (l for l, _ in fold)
                    f_vecs = (d for d in _censor_sparse_vectors_gen(
                        (v for _, v in fold), idxs_to_censor))
                    train_folds.append(set(izip(f_lbls, f_vecs)))
                print 'Done!'

                print 'Training and evaluating a model of our current state...',
                classifier._liblinear_train(train_lbls, train_vecs)
                print 'Done!'

                test_censored_vecs = [d for d in _censor_sparse_vectors_gen(
                    test_vecs, idxs_to_censor)]
                curr_macro_score = score_classifier_by_tup(classifier,
                        (test_lbls, test_censored_vecs))[0]

                print 'Current state on test is: {}'.format(curr_macro_score)
                if last_macro_score is not None:
                    print 'Last state was: {} (diff: {})'.format(last_macro_score,
                            curr_macro_score - last_macro_score)
                last_macro_score = curr_macro_score

                # Prepare to go parallel
                f_args = ((f_id, classifier, train_data, train_folds, to_censor)
                        for f_id, to_censor in vec_idxs_by_feat_id.iteritems()
                        if f_id in to_evaluate)
                # Also cram in our non-censored one in there
                # (f_id None is the "remove nothing" baseline for this round)
                f_args = chain(((None, classifier, train_data, train_folds,
                    set()), ), f_args)

                score_by_knockout = {}
                print 'Evaluating knockouts ({} in total)'.format(
                        len(to_evaluate) + 1)
                # TODO: A bit reduntant, prettify!
                if worker_pool is not None:
                    # NOTE(review): __knockout_pass is not defined in this
                    # chunk; presumably a module-level wrapper that unpacks
                    # the argument tuple into _knockout_pass (needed since
                    # imap_unordered passes each tuple as a single argument,
                    # unlike the starred call in the serial branch) — confirm
                    # it exists elsewhere in the file.
                    i = 1
                    for f_id, mean in worker_pool.imap_unordered(
                            __knockout_pass, f_args):
                        score_by_knockout[f_id] = mean
                        print 'it: {} k: {} res: {} {}'.format(
                                iteration, i, f_id, mean)
                        i += 1
                else:
                    for i, args in enumerate(f_args, start=1):
                        f_id, mean = _knockout_pass(*args)
                        score_by_knockout[f_id] = mean
                        print 'it: {} k: {} res: {} {}'.format(
                                iteration, i, f_id, mean)

                # Set the result dictionary
                for f_id, mean in score_by_knockout.iteritems():
                    res_dic[str(iteration)][f_id] = mean

                # And write the results incrementally for each round
                with open(join_path(outdir, 'descent_{}_{}.md'.format(
                    classifier_name, dataset_name)), 'w') as md_file:
                    from md import dict_to_table
                    md_file.write(dict_to_table(res_dic, total=False,
                        perc=False))
                    md_file.write('\n')

                # Find the best scoring one...
                # (sort ascending then reverse, so highest score is first)
                scores = [(s, f_id) for f_id, s in
                        score_by_knockout.iteritems()]
                scores.sort()
                scores.reverse()
                best_score, best_f_id = scores[0]

                print 'Round winner: {} with {}'.format(best_f_id, best_score)
                if best_f_id is None:
                    # We are done, no removal gave a better score
                    break
                removed.add(best_f_id)
                to_evaluate.remove(best_f_id)
                iteration += 1

            if removed:
                # TODO: Could do more metrics here?
                print 'Training and evaluating a model of our previous state...',
                classifier._liblinear_train(train_lbls, train_uncensored_vecs)
                before_macro_score = score_classifier_by_tup(classifier,
                        (test_lbls, test_vecs))[0]
                print 'Done!'

                print 'Training and evaluating a model of our current state...',
                train_censored_vecs = [d for d in _censor_sparse_vectors_gen(
                    train_vecs, set(i for i in chain(
                        *(vec_idxs_by_feat_id[f_id] for f_id in removed))))]
                classifier._liblinear_train(train_lbls, train_censored_vecs)
                print 'Done!'

                test_censored_vecs = [d for d in _censor_sparse_vectors_gen(
                    test_vecs, set(i for i in chain(
                        *(vec_idxs_by_feat_id[f_id] for f_id in removed))))]
                after_macro_score = score_classifier_by_tup(classifier,
                        (test_lbls, test_censored_vecs))[0]

                res_str = 'Before: {} After: {}'.format(before_macro_score,
                        after_macro_score)
                print res_str
                print 'Happy?'
            else:
                res_str = 'Unable to remove any lexical resource to make improvements...'
                print res_str

            # Ugly but saves the final result safely
            with open(join_path(outdir, 'descent_{}_{}.txt'.format(
                classifier_name, dataset_name)), 'w') as res_file:
                res_file.write(res_str)