def calc_results():
    doc = 'St2'
    parts = 50
    toks, tags, groups = load(tag_prefix_masks=[])
    # train - test split
    train, test = next(MCInDocSplitter(seed=SEED).split(toks, tags, groups))
    # take the training data for train/eval cross-validation
    x_t, y_t, g_t = [l[train] for l in [toks, tags, groups]]
    x_e, y_e, g_e = [l[test] for l in [toks, tags, groups]]
    x_t, y_t = [l[g_t == doc] for l in [x_t, y_t]]
    x_e, y_e = [l[g_e == doc] for l in [x_e, y_e]]

    d = {}
    for frac in (1 + np.array([-.5, 0, 1, 2, 5, 10, 19, 34, 49])) * x_t.shape[0] / parts:
        frac = int(frac)
        print(frac, x_e.shape[0])
        x_t_, y_t_ = [l[:frac] for l in [x_t, y_t]]
        maskedtoks, maskedtags = np.vstack((x_t_, x_e)), np.append(y_t_, y_e)
        for clf in clfs:
            istrain = np.append(np.full(y_t_.shape, True), np.full(y_e.shape, False))
            clfname = 'baseline' if clf.get('clf', None) == MostFrequentTag else r'\textsc{c}ore\textsc{nlp}'
            # two clfs, 10 fractions, 1 doc, 1 metric -> lineplots (both clfs) (over fractions) with marks and dashed line
            # table (clf row, fracs col)
            with stopwatch():
                _, scores = evaluate(clf, maskedtoks, maskedtags, istrain)
            d[(clfname, frac)] = scores
            print(scores)
            print()
    return d
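# Hypothetical post-processing sketch (not part of the experiment code above): it assumes
# each value stored in d is a pandas Series of metrics keyed by (classifier name, training
# fraction), and pivots the dict into the "clf row, fracs col" table mentioned in the comment.
import pandas as pd


def results_to_table(d, metric='accuracy'):
    wide = pd.DataFrame(d)  # columns: (clfname, frac) MultiIndex, rows: metric names
    return wide.loc[metric].unstack(level=0).T  # rows: classifiers, columns: fractions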
def calc_results():
    toks, tags, groups = load(tag_prefix_masks=[])  # [l[:3000] for l in load(tag_prefix_masks=[])] #
    ctr = Counter(groups)  # token count per document (Counter from collections)
    # train - test split
    train, test = next(MCInDocSplitter(seed=SEED).split(toks, tags, groups))
    # take the training data for train/eval cross-validation
    toks_t, tags_t, groups_t = [l[train] for l in [toks, tags, groups]]
    # take the eval data
    toks_e, tags_e, groups_e = [l[test] for l in [toks, tags, groups]]

    d = {}
    # only evaluate on documents with more than 7000 tokens
    for doc, _ in filter(lambda t: t[1] > 7000, ctr.items()):
        mask = groups != doc
        mask_e = groups_e == doc
        Xtrain, ytrain = toks[mask], tags[mask]
        Xeval, yeval = toks_e[mask_e], tags_e[mask_e]
        print(doc, np.sum(mask), np.sum(mask_e))
        maskedtoks, maskedtags = np.vstack((Xtrain, Xeval)), np.append(ytrain, yeval)
        for clf in clfs:
            istrain = np.append(np.full(ytrain.shape, True), np.full(yeval.shape, False))
            clfname = 'baseline' if clf.get('clf', None) == MostFrequentTag else (
                'TreeTagger' if clf.get('clf', None) == TreeTagger else r'\textsc{c}ore\textsc{nlp}')
            docname = ''.join((r'\textsc{' + c.lower() + '}' if re.match('[A-Z]', c) is not None else c)
                              for c in doc)
            # two clfs, 9 docs, 3 metrics -> two grouped barplots (over datasets), one for acc; util
            # table (clf row, dataset col, subtable in cell for each metric)
            with stopwatch():
                _, scores = evaluate(clf, maskedtoks, maskedtags, istrain)
            d[(docname, clfname)] = scores
            print(scores)
            print()
    return d
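# Hypothetical plotting sketch for the grouped bar plot described in the comment above.
# It assumes each value in d is a pandas Series containing an 'accuracy' entry; the metric
# name and the long-form column names are assumptions, not part of the original code.
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


def plot_accuracy_bars(d, metric='accuracy'):
    long = pd.DataFrame(d).loc[metric].rename(metric).reset_index()  # (doc, clf) -> value
    long.columns = ['document', 'classifier', metric]
    sns.barplot(data=long, x='document', y=metric, hue='classifier')
    plt.tight_layout()
    plt.show()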
def calc_results():
    toks, tags, groups = load(tag_prefix_masks=[])  # [l[:3000] for l in load(tag_prefix_masks=[])] #
    # train - test split
    train, test = next(MCInDocSplitter(seed=SEED).split(toks, tags, groups))
    # take the training data for train/eval cross-validation
    Xtrain, ytrain, _ = [l[train] for l in [toks, tags, groups]]
    Xeval, yeval, _ = [l[test] for l in [toks, tags, groups]]
    maskedtoks, maskedtags = np.vstack((Xtrain, Xeval)), np.append(ytrain, yeval)
    istrain = np.append(np.full(ytrain.shape, True), np.full(yeval.shape, False))

    d = {}
    for params in clfs:
        if params.get('clf', None) == MostFrequentTag:
            clfname = 'baseline'
            clf, total_score = evaluate(params, maskedtoks, maskedtags, istrain)
            print('known word frac:', clf.knownwords(Xeval))
            clf.scope = 'known'
            _, known_scores = evaluate(params, maskedtoks, maskedtags, istrain, oldclf=clf)
            clf.scope = 'unk'
            _, unk_scores = evaluate(params, maskedtoks, maskedtags, istrain, oldclf=clf)
            df = pd.DataFrame([unk_scores, known_scores, total_score]).T
        else:
            clfname = r'\textsc{c}ore\textsc{nlp}'
            with stopwatch():
                # 2 clfs, 3 scopes, 3 metrics
                _, score = evaluate(params, maskedtoks, maskedtags, istrain, raw=True)
            df = pd.DataFrame([score.loc[measure, :]
                               for measure in ['accuracy', 'avg util', 'avg setsize']])
        df.index = ['Accuracy', 'Utility', 'Set size']
        df.columns = ['Unknown', 'Known', 'Total']
        for col in df:
            d[(clfname, col)] = df[col]
        print()
    return d
def evaluate(rules, n=1):
    with stopwatch(False) as sw:
        clf = CoreNLPTagger(tag_expansion_rules=rules)
        scores = pd.Series(
            cross_val_score(clf, toks, tags, groups,
                            cv=MCInDocSplitter(train_frac=.8, splits=n),
                            n_jobs=-1))
    acc, ci = scores.mean(), scores.sem() * 1.96
    # print accuracy and current ruleset (curr_rules is the module-level list of accepted rules)
    print(f'{acc:.2%}±{ci:.2%} with {curr_rules} took {sw.interval:.2f}s')
    return dict(acc=acc, ci=ci, n=n)
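# Hypothetical usage sketch (assumes toks, tags, groups and curr_rules exist at module
# level, as in the rule-search script): score one candidate rule set with six resamples.
# candidate = curr_rules + [frozenset({'DSG', 'DGN'})]
# result = evaluate(candidate, n=6)  # -> {'acc': ..., 'ci': ..., 'n': 6}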
def calc_results():
    toks, tags, groups = load(tag_prefix_masks=[])
    # train - test split
    train, test = next(MCInDocSplitter(seed=SEED).split(toks, tags, groups))
    # take the training data for train/eval cross-validation
    Xtrain, ytrain, _ = [l[train] for l in [toks, tags, groups]]
    Xeval, yeval, _ = [l[test] for l in [toks, tags, groups]]
    maskedtoks, maskedtags = np.vstack((Xtrain, Xeval)), np.append(ytrain, yeval)
    istrain = np.append(np.full(ytrain.shape, True), np.full(yeval.shape, False))
    with stopwatch():
        # clf is the module-level classifier configuration
        df, _ = evaluate(clf, maskedtoks, maskedtags, istrain, raw=True)
    return df
def calc_results(betas=None):
    toks, tags, groups = load(tag_prefix_masks=[])
    # train - test split
    train, test = next(MCInDocSplitter(seed=SEED).split(toks, tags, groups))
    # take the training data for train/eval cross-validation
    Xtrain, ytrain, _ = [l[train] for l in [toks, tags, groups]]
    Xeval, yeval, eval_g = [l[test] for l in [toks, tags, groups]]

    d = {}
    print(Xtrain.shape[0], Xeval.shape[0])
    for clf in clfs:
        clfname = 'baseline' if clf.get('clf', None) == MostFrequentTag else (
            'TreeTagger' if clf.get('clf', None) == TreeTagger else r'\textsc{c}ore\textsc{nlp}')
        # two clfs, 9 docs, 3 metrics -> two grouped barplots (over datasets), one for acc; util
        # table (clf row, dataset col, subtable in cell for each metric)
        with stopwatch():
            old_clf = None
            # groupby over consecutive runs: assumes eval_g lists each document contiguously
            for g, _ in groupby(eval_g):
                Xeval_, yeval_ = [l[eval_g == g] for l in [Xeval, yeval]]
                maskedtoks, maskedtags = np.vstack((Xtrain, Xeval_)), np.append(ytrain, yeval_)
                istrain = np.append(np.full(ytrain.shape, True), np.full(yeval_.shape, False))
                old_clf, scores = evaluate(clf, maskedtoks, maskedtags, istrain,
                                           oldclf=old_clf, betas=betas)
                d[(clfname, g)] = scores
        print(pd.DataFrame(d).T.loc[clfname].mean().tolist())
        print()
    return d
def evaluate(cfg, toks, tags, groups, timeout=1200, seed=0, k=5):
    params = parse_params(cfg)
    params_key = json.dumps(params)
    print(params_key)
    with stopwatch(verbose=False) as sw:
        if 'filter_tags' in params:
            mask = np.vectorize(partial(is_masked, prefixes=params['filter_tags']))(tags)
            toks, tags, groups = [a[mask] for a in [toks, tags, groups]]
        clf_class = params.get('clf', TreeTagger)
        # remove higher-level parameters that are not constructor arguments
        params = {key: v for key, v in params.items() if key not in {'filter_tags', 'clf'}}
        clf = clf_class(**params)
        # Note: because of a SMAC bug, using loky in parallel will yield many error messages.
        # However, they do not affect execution.
        scores = cross_val_score(clf, toks, tags, groups,
                                 cv=KFoldInDocSplitter(k=k, seed=seed), n_jobs=-1)
    elapsed = sw.elapsed
    score = scores.mean()
    print(dict(score=score, seed=seed, elapsed=elapsed, params=params_key,
               hash_of_param=hash(params_key)))
    return 1 - score  # minimization objective: 1 - mean cross-validation score
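# Hypothetical usage sketch: an external optimizer (e.g. SMAC) typically only passes a
# configuration, so the data arguments are bound in advance; the name `objective` is an
# assumption, not part of the original code.
# objective = partial(evaluate, toks=toks, tags=tags, groups=groups, seed=0, k=5)
# objective(cfg)  # returns 1 - mean CV score, i.e. a value to minimize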
import json

import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import confusion_matrix

from setpos.tagger import CoreNLPTagger
from setpos.data.split import MCInDocSplitter, load
from setpos.util import stopwatch

if __name__ == '__main__':
    SEED, n = 7, 6
    toks, tags, groups = load()  # [l[:3000] for l in load()] #

    with stopwatch():
        targets = []
        preds = []
        for train, test in MCInDocSplitter(seed=SEED).split(toks, tags, groups):
            clf = CoreNLPTagger()
            clf.fit(toks[train], tags[train])
            # the gold annotation stores a tag set as JSON; take its first key as the target tag
            targets.extend([list(json.loads(tags_).keys())[0] for tags_ in tags[test]])
            preds.extend(clf.predict(toks[test]))

    tags = set(targets) | set(preds)
    print(len(tags))
    conf_mat = confusion_matrix(targets, preds)
    with np.errstate(divide='ignore'):
        # normalize each row (true tag) to fractions; keepdims keeps the division row-wise
        conf_mat = conf_mat / conf_mat.sum(axis=1, keepdims=True)
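    # Possible continuation (hypothetical; seaborn is imported above but the plotting code is
    # not shown here): draw the row-normalized confusion matrix as a heatmap. The label order
    # matches sklearn's confusion_matrix, which sorts the union of observed labels.
    import matplotlib.pyplot as plt

    labels = sorted(tags)
    ax = sns.heatmap(conf_mat, xticklabels=labels, yticklabels=labels)
    ax.set_xlabel('predicted tag')
    ax.set_ylabel('true tag')
    plt.tight_layout()
    plt.show()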
def merge(rules: List[frozenset]) -> Iterable[frozenset]:
    # drop every rule that is a strict subset of another kept rule, so only maximal rule
    # sets survive, e.g. [{'DSG'}, {'DSG', 'DGN'}] -> [{'DSG', 'DGN'}]
    merged = rules.copy()
    for rule in rules:
        if any(rule < r for r in merged):
            merged.remove(rule)
    return merged


def hand_crafted_rules():
    return [{'DSG', 'DGN'}, {'DDSA', 'DDD'}]


if __name__ == '__main__':
    with shelve.open('results_handcraft.pkl') as db, stopwatch():
        try:
            # generate all base-rules
            # rules = [r for r in gen_base_rules(max_len_powerset=1)]
            rules = [r for r in hand_crafted_rules()]
            print(f'generated {len(rules)} rules', flush=True)
            toks, tags, groups = load()  # [l[:3000] for l in load()] #
            curr_rules = []
            if 'rejected_rules' not in db:
                db['rejected_rules'] = []
            if 'results' not in db:
                db['results'] = {}
            else:
                print(f'loading {len(db["results"])} previous results')