def calc_results():
    toks, tags, groups = load(tag_prefix_masks=[])  # [l[:3000] for l in load(tag_prefix_masks=[])] #
    ctr = Counter(groups)

    # train - test split
    train, test = next(MCInDocSplitter(seed=SEED).split(toks, tags, groups))
    # take the training data for train/eval cross-validation
    toks_t, tags_t, groups_t = [l[train] for l in [toks, tags, groups]]
    # take the held-out test data for evaluation
    toks_e, tags_e, groups_e = [l[test] for l in [toks, tags, groups]]

    d = {}
    for doc, _ in filter(lambda t: t[1] > 7000, ctr.items()):
        mask = groups != doc
        mask_e = groups_e == doc
        Xtrain, ytrain = toks[mask], tags[mask]
        Xeval, yeval = toks_e[mask_e], tags_e[mask_e]
        print(doc, np.sum(mask), np.sum(mask_e))
        maskedtoks, maskedtags = np.vstack((Xtrain, Xeval)), np.append(ytrain, yeval)
        for clf in clfs:
            istrain = np.append(np.full(ytrain.shape, True), np.full(yeval.shape, False))
            clfname = 'baseline' if clf.get('clf', None) == MostFrequentTag else (
                'TreeTagger' if clf.get('clf', None) == TreeTagger else r'\textsc{c}ore\textsc{nlp}')
            docname = ''.join((r'\textsc{' + c.lower() + '}' if re.match('[A-Z]', c) is not None else c)
                              for c in doc)
            # two clfs, 9 docs, 3 metrics -> two grouped barplots (over datasets), one for acc; util
            # table (clf row, dataset col, subtable in cell for each metric)
            with stopwatch():
                _, scores = evaluate(clf, maskedtoks, maskedtags, istrain)
            d[(docname, clfname)] = scores
            print(scores)
        print()
    return d
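
# Hedged sketch (not part of the original pipeline): one way to turn the dict returned
# by calc_results() into the "clf row, dataset col" table described in the comments
# above. Assumes each `scores` value is a pandas Series of the three metrics and that
# pandas is imported as pd; the helper name is hypothetical.
def results_to_latex(d):
    df = pd.DataFrame(d).T  # MultiIndex rows: (docname, clfname); columns: metrics
    df = df.unstack(level=0)  # classifiers as rows, (metric, dataset) as columns
    return df.to_latex(escape=False)  # keep the \textsc{} markup intact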
def calc_results():
    doc = 'St2'
    parts = 50
    toks, tags, groups = load(tag_prefix_masks=[])

    # train - test split
    train, test = next(MCInDocSplitter(seed=SEED).split(toks, tags, groups))
    # take the training data for train/eval cross-validation
    x_t, y_t, g_t = [l[train] for l in [toks, tags, groups]]
    x_e, y_e, g_e = [l[test] for l in [toks, tags, groups]]
    # restrict both splits to the selected document
    x_t, y_t = [l[g_t == doc] for l in [x_t, y_t]]
    x_e, y_e = [l[g_e == doc] for l in [x_e, y_e]]

    d = {}
    for frac in (1 + np.array([-.5, 0, 1, 2, 5, 10, 19, 34, 49])) * x_t.shape[0] / parts:
        frac = int(frac)
        print(frac, x_e.shape[0])
        x_t_, y_t_ = [l[:frac] for l in [x_t, y_t]]
        maskedtoks, maskedtags = np.vstack((x_t_, x_e)), np.append(y_t_, y_e)
        for clf in clfs:
            istrain = np.append(np.full(y_t_.shape, True), np.full(y_e.shape, False))
            clfname = 'baseline' if clf.get('clf', None) == MostFrequentTag else r'\textsc{c}ore\textsc{nlp}'
            # two clfs, 10 fractions, 1 doc, 1 metric -> line plots (both clfs, over fractions)
            # with marks and a dashed line; table (clf row, fracs col)
            with stopwatch():
                _, scores = evaluate(clf, maskedtoks, maskedtags, istrain)
            d[(clfname, frac)] = scores
            print(scores)
        print()
    return d
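
# Hedged sketch (illustrative only, not in the original script): drawing the learning-curve
# line plot described in the comments above. Assumes matplotlib is available, pandas is
# imported as pd, and each value in `d` is a scalar score; the file name is hypothetical.
def plot_learning_curve(d):
    import matplotlib.pyplot as plt
    curve = pd.Series(d).unstack(level=0)  # rows: training sizes, columns: classifiers
    for clfname in curve.columns:
        plt.plot(curve.index, curve[clfname], marker='o', linestyle='--', label=clfname)
    plt.xlabel('training tokens')
    plt.ylabel('score')
    plt.legend()
    plt.savefig('learning_curve.pdf')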
def get_ytrain():
    toks, tags, groups = load(tag_prefix_masks=[])
    # train - test split
    train, _ = next(MCInDocSplitter(seed=SEED).split(toks, tags, groups))
    # take the training data for train/eval cross-validation
    _, ytrain, _ = [l[train] for l in [toks, tags, groups]]
    return ytrain
def calc_results():
    toks, tags, groups = load(tag_prefix_masks=[])  # [l[:3000] for l in load(tag_prefix_masks=[])] #

    # train - test split
    train, test = next(MCInDocSplitter(seed=SEED).split(toks, tags, groups))
    # take the training data for train/eval cross-validation
    Xtrain, ytrain, _ = [l[train] for l in [toks, tags, groups]]
    Xeval, yeval, _ = [l[test] for l in [toks, tags, groups]]
    maskedtoks, maskedtags = np.vstack((Xtrain, Xeval)), np.append(ytrain, yeval)
    istrain = np.append(np.full(ytrain.shape, True), np.full(yeval.shape, False))

    d = {}
    for params in clfs:
        if params.get('clf', None) == MostFrequentTag:
            clfname = 'baseline'
            clf, total_score = evaluate(params, maskedtoks, maskedtags, istrain)
            print('known word frac:', clf.knownwords(Xeval))
            clf.scope = 'known'
            _, known_scores = evaluate(params, maskedtoks, maskedtags, istrain, oldclf=clf)
            clf.scope = 'unk'
            _, unk_scores = evaluate(params, maskedtoks, maskedtags, istrain, oldclf=clf)
            df = pd.DataFrame([unk_scores, known_scores, total_score]).T
        else:
            clfname = r'\textsc{c}ore\textsc{nlp}'
            with stopwatch():
                # 2 clfs, 3 scopes, 3 metrics
                _, score = evaluate(params, maskedtoks, maskedtags, istrain, raw=True)
            df = pd.DataFrame([
                score.loc[measure, :]
                for measure in ['accuracy', 'avg util', 'avg setsize']
            ])
        df.index = ['Accuracy', 'Utility', 'Set size']
        df.columns = ['Unknown', 'Known', 'Total']
        for col in df:
            d[(clfname, col)] = df[col]
        print()
    return d
def evaluate(rules, n=1):
    with stopwatch(False) as sw:
        clf = CoreNLPTagger(tag_expansion_rules=rules)
        scores = pd.Series(
            cross_val_score(clf, toks, tags, groups,
                            cv=MCInDocSplitter(train_frac=.8, splits=n), n_jobs=-1))
    acc, ci = scores.mean(), scores.sem() * 1.96
    # print accuracy and the current rule set
    print(f'{acc:.2%}±{ci:.2%} with {rules} took {sw.interval:.2f}s')
    return dict(acc=acc, ci=ci, n=n)
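
# Illustrative usage sketch (hypothetical, not part of the original file): score a single
# candidate tag-expansion rule over five Monte-Carlo splits. The merged tag pair
# {'DSG', 'DGN'} is taken from the commented-out parameter grid elsewhere in this repo.
def example_rule_evaluation():
    result = evaluate([{'DSG', 'DGN'}], n=5)
    print(f"{result['acc']:.2%} ± {result['ci']:.2%}")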
def calculate_setsizes():
    toks, tags, groups = load(tag_prefix_masks=[])

    # train - test split
    train, test = next(MCInDocSplitter(seed=SEED).split(toks, tags, groups))
    # take the training data for train/eval cross-validation
    Xtrain, ytrain, _ = [l[train] for l in [toks, tags, groups]]
    Xeval, yeval, _ = [l[test] for l in [toks, tags, groups]]

    clf = CoreNLPTagger()
    clf.fit(Xtrain, ytrain)
    return clf.setsizes(Xeval)
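
# Hedged sketch (not in the original script): summarizing the prediction-set sizes
# returned above. Assumes calculate_setsizes() yields an iterable of integers and
# that numpy is imported as np; the helper name is hypothetical.
def summarize_setsizes():
    sizes = np.asarray(list(calculate_setsizes()))
    print(f'mean set size: {sizes.mean():.2f}, '
          f'singleton predictions: {(sizes == 1).mean():.2%}')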
def calc_results():
    toks, tags, groups = load(tag_prefix_masks=[])

    # train - test split
    train, test = next(MCInDocSplitter(seed=SEED).split(toks, tags, groups))
    # take the training data for train/eval cross-validation
    Xtrain, ytrain, _ = [l[train] for l in [toks, tags, groups]]
    Xeval, yeval, _ = [l[test] for l in [toks, tags, groups]]
    maskedtoks, maskedtags = np.vstack((Xtrain, Xeval)), np.append(ytrain, yeval)
    istrain = np.append(np.full(ytrain.shape, True), np.full(yeval.shape, False))

    with stopwatch():
        df, _ = evaluate(clf, maskedtoks, maskedtags, istrain, raw=True)
    return df
def optimize():
    SEED = 1
    toks, tags, groups = load(tag_prefix_masks=[])  # [l[:3000] for l in load(tag_prefix_masks=[])] #

    # train - test split
    train, _ = next(MCInDocSplitter(seed=SEED).split(toks, tags, groups))
    # take the training data for train/eval cross-validation
    toks, tags, groups = [l[train] for l in [toks, tags, groups]]

    def tae(cfg, seed=0):
        return callee.evaluate(cfg, toks, tags, groups, timeout=1200, seed=seed, k=5)

    scenario_dict = {
        'run_obj': 'quality',  # we optimize quality (alternatively runtime)
        'runcount-limit': 100,
        'algo_runs_timelimit': 60 * 60 * 14,
        'cutoff': 1200,  # stop algorithms after 10x default runtime
        'cs': cs,  # configuration space
        'deterministic': 'true',
        'output_dir': 'smac3_test_treetagger',
    }
    smac = create_or_restore_smac(scenario_dict=scenario_dict,
                                  rng=np.random.RandomState(SEED),
                                  tae=tae)
    incumbent = smac.optimize()

    inc_value = tae(incumbent)
    print("Optimized Value: %.2f" % inc_value)
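
# Hedged sketch (illustrative, not part of the original tuning script): contrast the SMAC
# incumbent with the search space's default configuration using the same target-algorithm
# executor. get_default_configuration() is a standard ConfigSpace method; `tae`, `cs`, and
# `incumbent` are assumed to be available as in optimize() above.
def compare_to_default(tae, cs, incumbent):
    default_value = tae(cs.get_default_configuration())
    incumbent_value = tae(incumbent)
    print('default: %.2f, incumbent: %.2f' % (default_value, incumbent_value))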
def calc_results(betas=None):
    toks, tags, groups = load(tag_prefix_masks=[])

    # train - test split
    train, test = next(MCInDocSplitter(seed=SEED).split(toks, tags, groups))
    # take the training data for train/eval cross-validation
    Xtrain, ytrain, _ = [l[train] for l in [toks, tags, groups]]
    Xeval, yeval, eval_g = [l[test] for l in [toks, tags, groups]]

    d = {}
    print(Xtrain.shape[0], Xeval.shape[0])
    for clf in clfs:
        clfname = 'baseline' if clf.get('clf', None) == MostFrequentTag else (
            'TreeTagger' if clf.get('clf', None) == TreeTagger else r'\textsc{c}ore\textsc{nlp}')
        # two clfs, 9 docs, 3 metrics -> two grouped barplots (over datasets), one for acc; util
        # table (clf row, dataset col, subtable in cell for each metric)
        with stopwatch():
            old_clf = None
            for g, _ in groupby(eval_g):
                Xeval_, yeval_ = [l[eval_g == g] for l in [Xeval, yeval]]
                maskedtoks, maskedtags = np.vstack((Xtrain, Xeval_)), np.append(ytrain, yeval_)
                istrain = np.append(np.full(ytrain.shape, True), np.full(yeval_.shape, False))
                old_clf, scores = evaluate(clf, maskedtoks, maskedtags, istrain,
                                           oldclf=old_clf, betas=betas)
                d[(clfname, g)] = scores
        print(pd.DataFrame(d).T.loc[clfname].mean().tolist())
        print()
    return d
toks, tags, groups = load()
train, test = next(MCInDocSplitter(seed=1).split(toks, tags, groups))
clf = CoreNLPTagger(
    set_valued=False,
    corenlp_train_params=[
        '--arch',
        'left5words,suffix(1),prefix(1),suffix(2),prefix(2),suffix(3),prefix(3)'
    ])
clf.fit(toks[train], tags[train])
print(f'meansetsize: {clf.meansetsize(toks[test]):.2f}')
print(
    f'accuracy: {cross_val_score(clf, toks, tags, groups, cv=KFoldInDocSplitter(5, seed=1), n_jobs=-1).mean():.2%}'
)

clf.set_valued = True
print(
    clf.predict_proba(
        sents_to_dataset(['schap vnde dar to hebbe ik ere gegeuen'.split()])))
print(
    clf.setpredict(
        sents_to_dataset(['schap vnde dar to hebbe ik ere gegeuen'.split()])))

results = clf.score(toks[np.isin(groups, g[1:2])],
                    tags[np.isin(groups, g[1:2])],
                    raw=True)
print_classical_pred_stats(*results)
import json

import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import confusion_matrix

from setpos.tagger import CoreNLPTagger
from setpos.data.split import MCInDocSplitter, load
from setpos.util import stopwatch

if __name__ == '__main__':
    SEED, n = 7, 6
    toks, tags, groups = load()  # [l[:3000] for l in load()] #

    with stopwatch():
        targets = []
        preds = []
        for train, test in MCInDocSplitter(seed=SEED).split(toks, tags, groups):
            clf = CoreNLPTagger()
            clf.fit(toks[train], tags[train])
            targets.extend([list(json.loads(tags_).keys())[0] for tags_ in tags[test]])
            preds.extend(clf.predict(toks[test]))

    tags = set(targets) | set(preds)
    print(len(tags))
    conf_mat = confusion_matrix(targets, preds)
    with np.errstate(divide='ignore'):
        conf_mat = conf_mat / conf_mat.sum(axis=1)
    df = pd.DataFrame(conf_mat.T, columns=sorted(tags), index=sorted(tags))  # y axis is true, x axis is pred
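
    # Hedged sketch (assumed follow-up, not shown in the original snippet): render the
    # normalized confusion matrix with the seaborn import above; the output file name
    # is hypothetical.
    ax = sns.heatmap(df, xticklabels=True, yticklabels=True)
    ax.figure.tight_layout()
    ax.figure.savefig('confusion_matrix.png', dpi=300)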
    # dict(tag_expansion_rules=[{'DSG', 'DGN'}, {'DDSA', 'DDD'}]),
    # dict(augment_setvalued_targets=True),
    # dict(corenlp_train_params=['-arch', 'bidirectional5words']),
    # dict(corenlp_train_params=['-arch', 'words(-2,1),order(-2,1),twoTags(-1,1), wordTag(0,-1),wordTag(0,1),biwords(-1,1)'], memlimit='32g'),
    # dict(corenlp_train_params=['-arch', 'words(-1,1),order(-1,1),twoTags(-1,1), wordTag(0,-1),wordTag(0,1),biwords(-1,1)']),
    # dict(corenlp_train_params=['-arch', 'bidirectional'], memlimit='16g'),
    # dict(corenlp_train_params=['-arch', 'naacl2003unknowns']),
    # dict(corenlp_train_params=['-arch', 'left3words,suffix(1,0),suffix(2,0),suffix(3,0)']),
    # dict(corenlp_train_params=['-arch', 'left3words,suffix(1,0),suffix(2,0),suffix(3,0),prefix(1,0),prefix(2,0),prefix(3,0)']),
]

# check if all training parameters for all runs are identical. If yes, we can reuse the fitted classifier
reuse_training = len(set(json.dumps(d.get('corenlp_train_params', [])) for d in paramspace)) == 1

# train - test split
train, test = next(MCInDocSplitter(seed=SEED).split(toks, tags, groups))
# take the training data for train/eval cross-validation
toks, tags, groups = [l[train] for l in [toks, tags, groups]]

with shelve.open(f'result_seed={SEED}_n={len(toks)}_smac-validation2.pkl') as db:
    for params in paramspace:
        param_as_key = param_to_key(params)

        if 'filter_tags' in params:
            mask = np.vectorize(partial(is_masked, prefixes=params['filter_tags']))(tags)
            toks, tags, groups = [a[mask] for a in [toks, tags, groups]]

        clf_class = params.get('clf', CoreNLPTagger)
        # remove higher level parameters
        params = {k: v for k, v in params.items() if k not in {'filter_tags', 'clf'}}