def calc_results():
    """Evaluate every classifier config against each sufficiently large document.

    For each document with more than 7000 tokens, trains on everything except
    that document and evaluates on the held-out eval rows of that document.
    Returns a dict keyed by (docname, clfname) with the score objects.
    """
    toks, tags, groups = load(tag_prefix_masks=[])  # [l[:3000] for l in load(tag_prefix_masks=[])] #
    ctr = Counter(groups)
    # train - test split
    train, test = next(MCInDocSplitter(seed=SEED).split(toks, tags, groups))
    # take the training data for train/eval cross-validation
    toks_t, tags_t, groups_t = [l[train] for l in [toks, tags, groups]]
    # take the eval part of the split
    toks_e, tags_e, groups_e = [l[test] for l in [toks, tags, groups]]
    d = {}
    # only documents with more than 7000 tokens
    for doc, _ in filter(lambda t: t[1] > 7000, ctr.items()):
        mask = groups != doc        # train on all rows outside the document
        mask_e = groups_e == doc    # evaluate on the eval rows of the document
        Xtrain, ytrain = toks[mask], tags[mask]
        Xeval, yeval = toks_e[mask_e], tags_e[mask_e]
        print(doc, np.sum(mask), np.sum(mask_e))
        maskedtoks, maskedtags = np.vstack((Xtrain, Xeval)), np.append(ytrain, yeval)
        for clf in clfs:
            istrain = np.append(np.full(ytrain.shape, True), np.full(yeval.shape, False))
            clfname = 'baseline' if clf.get('clf', None) == MostFrequentTag else (
                'TreeTagger' if clf.get('clf', None) == TreeTagger else r'\textsc{c}ore\textsc{nlp}')
            # LaTeX-escape uppercase letters of the document name for table/plot labels
            docname = ''.join((r'\textsc{' + c.lower() + '}' if re.match('[A-Z]', c) is not None else c)
                              for c in doc)
            # two clfs, 9 docs, 3 metrics -> two grouped barplots (over datasets), one for acc; util
            # table (clf row, dataset col, subtable in cell for each metric)
            with stopwatch():
                _, scores = evaluate(clf, maskedtoks, maskedtags, istrain)
            d[(docname, clfname)] = scores
            print(scores)
        print()
    return d
def calc_results():
    """Learning-curve experiment on a single document ('St2').

    Trains each classifier on growing fractions of the document's training
    split (out of `parts` equal parts) and evaluates on its eval split.
    Returns a dict keyed by (clfname, frac) with the score objects.
    """
    doc = 'St2'
    parts = 50
    toks, tags, groups = load(tag_prefix_masks=[])
    # train - test split
    train, test = next(MCInDocSplitter(seed=SEED).split(toks, tags, groups))
    # take the training data for train/eval cross-validation
    x_t, y_t, g_t = [l[train] for l in [toks, tags, groups]]
    x_e, y_e, g_e = [l[test] for l in [toks, tags, groups]]
    # restrict both splits to the single document under study
    x_t, y_t = [l[g_t == doc] for l in [x_t, y_t]]
    x_e, y_e = [l[g_e == doc] for l in [x_e, y_e]]
    d = {}
    # fractions of the training data measured in "parts": 0.5, 1, 2, 3, 6, 11, 20, 35, 50
    for frac in (1 + np.array([-.5, 0, 1, 2, 5, 10, 19, 34, 49])) * x_t.shape[0] / parts:
        frac = int(frac)
        print(frac, x_e.shape[0])
        x_t_, y_t_ = [l[:frac] for l in [x_t, y_t]]
        maskedtoks, maskedtags = np.vstack((x_t_, x_e)), np.append(y_t_, y_e)
        for clf in clfs:
            istrain = np.append(np.full(y_t_.shape, True), np.full(y_e.shape, False))
            clfname = ('baseline' if clf.get('clf', None) == MostFrequentTag
                       else r'\textsc{c}ore\textsc{nlp}')
            # two clfs, 10 fractions, 1 doc, 1 metrics -> lineplots (both clfs)
            # (over fractions) with marks and dashed line
            # table (clf row, fracs col)
            with stopwatch():
                _, scores = evaluate(clf, maskedtoks, maskedtags, istrain)
            d[(clfname, frac)] = scores
            print(scores)
        print()
    return d
def get_ytrain():
    """Return only the tag labels of the training split of the corpus."""
    toks, tags, groups = load(tag_prefix_masks=[])
    # train - test split
    train, _ = next(MCInDocSplitter(seed=SEED).split(toks, tags, groups))
    # take the training data for train/eval cross-validation
    _, ytrain, _ = [l[train] for l in [toks, tags, groups]]
    return ytrain
def calc_results():
    """Score each classifier config on unknown/known/total word scopes.

    The baseline classifier is re-evaluated with its `scope` attribute set to
    'known' and 'unk'; other configs get the raw score frame and the three
    metric rows are extracted. Returns a dict keyed by (clfname, scope column)
    with per-metric Series (index: Accuracy, Utility, Set size).
    """
    toks, tags, groups = load(tag_prefix_masks=[])  # [l[:3000] for l in load(tag_prefix_masks=[])] #
    # train - test split
    train, test = next(MCInDocSplitter(seed=SEED).split(toks, tags, groups))
    # take the training data for train/eval cross-validation
    Xtrain, ytrain, _ = [l[train] for l in [toks, tags, groups]]
    Xeval, yeval, _ = [l[test] for l in [toks, tags, groups]]
    maskedtoks, maskedtags = np.vstack((Xtrain, Xeval)), np.append(ytrain, yeval)
    istrain = np.append(np.full(ytrain.shape, True), np.full(yeval.shape, False))
    d = {}
    for params in clfs:
        if params.get('clf', None) == MostFrequentTag:
            clfname = 'baseline'
            clf, total_score = evaluate(params, maskedtoks, maskedtags, istrain)
            print('known word frac:', clf.knownwords(Xeval))
            # re-score the already fitted classifier restricted to known words …
            clf.scope = 'known'
            _, known_scores = evaluate(params, maskedtoks, maskedtags, istrain, oldclf=clf)
            # … and to unknown words
            clf.scope = 'unk'
            _, unk_scores = evaluate(params, maskedtoks, maskedtags, istrain, oldclf=clf)
            df = pd.DataFrame([unk_scores, known_scores, total_score]).T
        else:
            clfname = r'\textsc{c}ore\textsc{nlp}'
            with stopwatch():
                # 2clf, 3 scopes, 3 metrics
                _, score = evaluate(params, maskedtoks, maskedtags, istrain, raw=True)
            df = pd.DataFrame([score.loc[measure, :]
                               for measure in ['accuracy', 'avg util', 'avg setsize']])
        df.index = ['Accuracy', 'Utility', 'Set size']
        df.columns = ['Unknown', 'Known', 'Total']
        for col in df:
            d[(clfname, col)] = df[col]
        print()
    return d
def calculate_setsizes():
    """Fit a CoreNLP tagger on the training split and return the predicted
    set sizes for the eval split."""
    toks, tags, groups = load(tag_prefix_masks=[])
    # train - test split
    train, test = next(MCInDocSplitter(seed=SEED).split(toks, tags, groups))
    # take the training data for train/eval cross-validation
    Xtrain, ytrain, _ = [l[train] for l in [toks, tags, groups]]
    Xeval, yeval, _ = [l[test] for l in [toks, tags, groups]]
    clf = CoreNLPTagger()
    clf.fit(Xtrain, ytrain)
    return clf.setsizes(Xeval)
def calc_results():
    """Evaluate a single classifier config on the train/eval split and return
    the raw score DataFrame.

    NOTE(review): `clf` is not assigned anywhere in this function — it is
    presumably a module-level classifier config; verify it exists at call time.
    """
    toks, tags, groups = load(tag_prefix_masks=[])
    # train - test split
    train, test = next(MCInDocSplitter(seed=SEED).split(toks, tags, groups))
    # take the training data for train/eval cross-validation
    Xtrain, ytrain, _ = [l[train] for l in [toks, tags, groups]]
    Xeval, yeval, _ = [l[test] for l in [toks, tags, groups]]
    maskedtoks, maskedtags = np.vstack((Xtrain, Xeval)), np.append(ytrain, yeval)
    istrain = np.append(np.full(ytrain.shape, True), np.full(yeval.shape, False))
    with stopwatch():
        df, _ = evaluate(clf, maskedtoks, maskedtags, istrain, raw=True)
    return df
def optimize():
    """Run SMAC hyper-parameter optimization for the tagger on the training split.

    Prints the objective value of the incumbent configuration at the end.
    """
    SEED = 1
    toks, tags, groups = load(tag_prefix_masks=[])  # [l[:3000] for l in load(tag_prefix_masks=[])] #
    # train - test split
    train, _ = next(MCInDocSplitter(seed=SEED).split(toks, tags, groups))
    # take the training data for train/eval cross-validation
    toks, tags, groups = [l[train] for l in [toks, tags, groups]]

    def tae(cfg, seed=0):
        # target-algorithm executor: k-fold evaluation of one configuration
        return callee.evaluate(cfg, toks, tags, groups, timeout=1200, seed=seed, k=5)

    scenario_dict = {
        'run_obj': 'quality',  # we optimize quality (alternatively runtime)
        'runcount-limit': 100,
        'algo_runs_timelimit': 60 * 60 * 14,
        'cutoff': 1200,  # stop algorithms after 10x default runtime
        "cs": cs,  # configuration space
        "deterministic": 'true',
        'output_dir': 'smac3_test_treetagger',
    }
    smac = create_or_restore_smac(scenario_dict=scenario_dict,
                                  rng=np.random.RandomState(SEED),
                                  tae=tae)
    incumbent = smac.optimize()
    inc_value = tae(incumbent)
    print("Optimized Value: %.2f" % (inc_value))
def calc_results(betas=None):
    """Evaluate every classifier config on each eval-document group.

    Iterates the eval groups via itertools.groupby (relies on `eval_g` being
    ordered by group), re-using the fitted classifier via `oldclf`.
    Returns a dict keyed by (clfname, group) with the score objects.
    """
    toks, tags, groups = load(tag_prefix_masks=[])
    # train - test split
    train, test = next(MCInDocSplitter(seed=SEED).split(toks, tags, groups))
    # take the training data for train/eval cross-validation
    Xtrain, ytrain, _ = [l[train] for l in [toks, tags, groups]]
    Xeval, yeval, eval_g = [l[test] for l in [toks, tags, groups]]
    d = {}
    print(Xtrain.shape[0], Xeval.shape[0])
    for clf in clfs:
        clfname = 'baseline' if clf.get('clf', None) == MostFrequentTag else (
            'TreeTagger' if clf.get('clf', None) == TreeTagger else r'\textsc{c}ore\textsc{nlp}')
        # two clfs, 9 docs, 3 metrics -> two grouped barplots (over datasets), one for acc; util
        # table (clf row, dataset col, subtable in cell for each metric)
        with stopwatch():
            old_clf = None
            for g, _ in groupby(eval_g):
                Xeval_, yeval_ = [l[eval_g == g] for l in [Xeval, yeval]]
                maskedtoks, maskedtags = np.vstack((Xtrain, Xeval_)), np.append(ytrain, yeval_)
                istrain = np.append(np.full(ytrain.shape, True), np.full(yeval_.shape, False))
                # pass the previously fitted clf so only the eval part changes
                old_clf, scores = evaluate(clf, maskedtoks, maskedtags, istrain,
                                           oldclf=old_clf, betas=betas)
                d[(clfname, g)] = scores
        print(pd.DataFrame(d).T.loc[clfname].mean().tolist())
        print()
    return d
import json
import re
from collections import Counter
from itertools import chain

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from natsort import natsorted

from setpos.data.split import load

if __name__ == '__main__':
    # Per-document tag relative frequencies plus the corpus-wide prior.
    _, tags, groups = load()
    # natural sort, REN* documents first
    docs = natsorted(set(groups), key=lambda x: str(int(not x.startswith('REN'))) + x)
    result = {}
    for doc in docs:
        # all tag keys of the document's (set-valued) annotations
        t = list(chain.from_iterable(json.loads(tags_).keys()
                                     for tags_ in tags[groups == doc]))
        counter = pd.Series(dict(Counter(t)))
        counter /= len(t)  # relative frequency within the document
        result[doc] = counter
    df = pd.DataFrame(result).sort_index().T
    # corpus-wide tag prior
    t = list(chain.from_iterable(json.loads(tags_).keys() for tags_ in tags))
    prior = pd.Series(dict(Counter(t))).sort_index()
    prior /= prior.sum()
    doc_sizes = pd.Series([len(tags[groups == doc]) for doc in docs], index=docs)
    prior_df = prior.sort_values(ascending=False).head(10)
    prior_df.name = 'Probability'
import json

import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import confusion_matrix

from setpos.tagger import CoreNLPTagger
from setpos.data.split import MCInDocSplitter, load
from setpos.util import stopwatch

if __name__ == '__main__':
    # Build a row-normalized confusion matrix of first-choice tags over all
    # MCInDocSplitter folds.
    SEED, n = 7, 6
    toks, tags, groups = load()  # [l[:3000] for l in load()] #
    with stopwatch():
        targets = []
        preds = []
        for train, test in MCInDocSplitter(seed=SEED).split(toks, tags, groups):
            clf = CoreNLPTagger()
            clf.fit(toks[train], tags[train])
            # gold label: first key of the (set-valued) JSON tag annotation
            targets.extend([list(json.loads(tags_).keys())[0] for tags_ in tags[test]])
            preds.extend(clf.predict(toks[test]))
    tags = set(targets) | set(preds)
    print(len(tags))
    conf_mat = confusion_matrix(targets, preds)
    with np.errstate(divide='ignore'):
        # BUGFIX: normalize each ROW by its own sum. The previous
        # `conf_mat / conf_mat.sum(axis=1)` broadcast the row sums along the
        # last axis, i.e. it divided column j by the sum of row j.
        conf_mat = conf_mat / conf_mat.sum(axis=1, keepdims=True)
# pad the probabilities with zero rows for every tag that has been unseen during training tag_idx = {t: i for i, t in enumerate(self.clf.classes_)} new_probas = np.zeros((probas.shape[0], len(self._tags))) for i, t in enumerate(self._tags): if t in tag_idx: new_probas[:, i] = probas[:, tag_idx[t]] return new_probas def predict(self, X): if self.clf is None: raise NotFittedError return super().predict(X) if __name__ == '__main__': toks, tags, groups = [l[:] for l in load()] # load() train, test = next(MCInDocSplitter(seed=1).split(toks, tags, groups)) # train, test = next(LeaveOneGroupOut().split(toks, tags, groups)) clf = SimpleTagger() clf.fit(toks[train], tags[train]) print(f'meansetsize: {clf.meansetsize(toks[test]):.2f}') print(f'knownwords: {clf.knownwords(toks[test]):.2%}') print( f'accuracy: {cross_val_score(clf, toks, tags, groups, cv=KFoldInDocSplitter(5, seed=1), n_jobs=3).mean():.2%}' ) clf.set_valued = True print( f'utility: {cross_val_score(clf, toks, tags, groups, cv=KFoldInDocSplitter(5, seed=1), n_jobs=4).mean():.2%}' )
import json import shelve from functools import partial import numpy as np import pandas as pd from sklearn.model_selection import cross_val_score from scripts.util.func import param_to_key from setpos.data.split import load, MCInDocSplitter, is_masked from setpos.tagger import CoreNLPTagger from setpos.util import stopwatch, draw_cd_diagram if __name__ == '__main__': SEED, n = 1, 15 toks, tags, groups = load( tag_prefix_masks=[]) # load(tag_prefix_masks=[]) #[l[:3000] for l in load(tag_prefix_masks=[])] # opt = {"augment_setvalued_targets": False, "filter_tags": [], "corenlp_train_params": ["-curWordMinFeatureThreshold", "4", "-minFeatureThreshold", "1", "-rareWordMinFeatureThresh", "1", "-rareWordThresh", "6", "-sigmaSquared", "0.7676194187745077", "-veryCommonWordThresh", "234", "-arch", "order(-2,0),prefix(1,0),prefix(2,0),prefix(3,0),suffix(2,0),suffix(3,0),suffix(4,0),suffix(5,0),wordTag(0,-1),words(-3,2)"]} opt_set_open_classes = opt.copy() opt_set_open_classes['corenlp_train_params'] = opt_set_open_classes['corenlp_train_params'] + ['-openClassTags', '"ADJA ADJA<VVPP ADJA<VVPS ADJD ADJN ADJN<VVPP ADJS ADJV CARDA CARDN CARDS NA NE VAFIN VAFIN.* VAFIN.ind VAFIN.konj VAINF VAPP VKFIN.* VKFIN.ind VKFIN.konj VKINF VKPP VKPS VMFIN.* VMFIN.ind VMFIN.konj VMINF VVFIN.* VVFIN.ind VVFIN.konj VVIMP VVINF VVPP VVPS OA XY FM"', '-lang', ''] opt_lang_english = opt.copy() opt_lang_english['corenlp_train_params'] = opt_lang_english['corenlp_train_params'] + ['-lang', 'english'] opt_lang_german = opt.copy() opt_lang_german['corenlp_train_params'] = opt_lang_german['corenlp_train_params'] + ['-lang', 'german'] opt_learn_closed = opt.copy() opt_learn_closed['corenlp_train_params'] = opt_learn_closed['corenlp_train_params'] + ['-lang', '',
import pandas as pd

from setpos.data.split import load, join_to_sents

if __name__ == '__main__':
    # Estimate how many sentences would result from expanding set-valued tag
    # annotations under three different expansion strategies.
    X, y, groups = load()  # [l[:3000] for l in load()] #
    results = []
    sents = join_to_sents(X, y)
    for sent in sents:
        # full cartesian product over all tags of every token
        new_sents_count_full_carth = pd.Series(
            [len(tags) * 1. for _, tags in sent]).prod()
        # cartesian product restricted to tags with value >= 1
        new_sents_count_ambig_carth = pd.Series(
            [1.] + [max(1., len(list(filter(lambda v: v >= 1, list(tags.values())))))
                    for _, tags in sent]).prod()
        # just enough copies so every tag is used at least once
        new_sents_count_each_tag_used = pd.Series(
            [len(tags) for _, tags in sent]).max()
        results.append(dict(
            before=len(sent),
            carth=new_sents_count_full_carth * len(sent),
            ambig_carth=new_sents_count_ambig_carth * len(sent),
            each_tag_used=new_sents_count_each_tag_used * len(sent),
        ))
    df = pd.DataFrame(results)
    with pd.option_context('display.float_format', '{:,.0f}'.format):
        print(df.sum())
import shelve from functools import partial import numpy as np import pandas as pd from sklearn.model_selection import cross_val_score from scripts.util.func import param_to_key from setpos.data.split import load, is_masked, MCInDocSplitter, KFoldInDocSplitter from setpos.tagger import CoreNLPTagger from setpos.util import stopwatch, draw_cd_diagram if __name__ == '__main__': SEED, n = 1, 2 toks, tags, groups = [ l[:3000] for l in load(tag_prefix_masks=[]) ] # load() # load(tag_prefix_masks=[]) #[l[:3000] for l in load(tag_prefix_masks=[])] # paramspace = [ # dict(clf=IntergrammTagger), {}, dict( augment_setvalued_targets='True', corenlp_train_params=[ '--arch', 'left5words,suffix(1),prefix(1),suffix(2),prefix(2),suffix(3),prefix(3)' ], filter_tags=()), dict( augment_setvalued_targets='False', corenlp_train_params=[ '--arch', 'left5words,suffix(1),prefix(1),suffix(2),prefix(2),suffix(3),prefix(3)'
results = pd.concat([results, results2]) if long_result: return df, results if score is None: score = 'accuracy' if self.set_valued == False else 'avg util' return results[scope].loc[score] if __name__ == '__main__': import numpy as np from setpos.data.split import load, sents_to_dataset import logging logging.basicConfig() toks, tags, groups = load() g = list(set(groups)) clf = CoreNLPTagger(loglevel=logging.INFO) clf.fit(toks[np.isin(groups, g[1:2])], tags[np.isin(groups, g[1:2])]) clf.fit(toks[np.isin(groups, g[0:1])], tags[np.isin(groups, g[0:1])]) # clf.fit(toks, tags) # schap/NA vnde/KON dar/PAVD to/PAVAP hebbe/VAFIN ik/PPER ere/PPER gegeuen/VVPP print( clf.predict_proba( sents_to_dataset( ['schap vnde dar to hebbe ik ere gegeuen'.split()]))) print( clf.setpredict( sents_to_dataset(
): dist = dists[end_index][0] acc = evaluate(train_docs, eval_doc) print("curr max dist:", dist) print("newly added:", dists[end_index][1]) print("training docs:", train_docs) print("excluded docs:", set(doc_titles - train_docs)) for i, val in acc.items(): print(f'{i}: {val:.4f}') print() return dict(dist=dist, train_docs=train_docs, **acc) if __name__ == '__main__': X, y, groups = load() doc_titles = set(groups) metadata = pd.read_csv( "corpus.metadata.csv", index_col='sigle', usecols=["sigle", 'year_norm', 'lon_norm', 'lat_norm']) os.makedirs("out", exist_ok=True) for seed in range(4): for selector in doc_titles: prefix = f'{selector}-seed:{seed}' if isfile(prefix + '.csv'): continue print('test doc:', prefix, end="\n-------------------------\n\n")
print_stats(data, "avg util", lambda x: x[prefix + "const_util"].mean())) result.update( print_stats(data, "avg recall", lambda x: x[prefix + "const_recall"].mean())) print( f"avg sent recall {data.groupby(['sentID'])[[prefix + 'const_recall']].all().mean().iloc[0]:.2%}" ) result.update( print_stats(data, "avg precision", lambda x: x[prefix + "const_precision"].mean())) result.update( print_stats( data, "avg agreement of set-prediction with default prediction", lambda x: x[prefix + "original_pred_in_set_pred"].mean())) return data, pd.DataFrame.from_dict(result, 'index') if __name__ == '__main__': print(score(dict(a=1, b=1), {'b'})) print(score(dict(a=.6, b=.3), {'b'})) print(score(dict(a=1), {'b'})) print(score(dict(b=1), {'b'})) df, tagsdict = load() df, _ = print_classical_pred_stats(df, tagsdict) df, _ = print_set_valued_pred_stats(df, tagsdict, True) df.to_excel("out.xlsx")
import json
from collections import defaultdict

import matplotlib.pyplot as plt
import pandas as pd

from setpos.data.split import load

if __name__ == '__main__':
    X, y, g = load()
    # word -> set of tags observed for that word
    # (the code maps words to tag sets; an earlier comment had it inverted)
    word_tags_map = defaultdict(set)
    for tags, (_, word) in zip(y, X):
        tags = json.loads(tags).keys()
        for tag in tags:
            word_tags_map[word].add(tag)
    # distribution of the number of distinct tags per word
    series = pd.Series(word_tags_map).apply(len)
    print(series.describe())
    series.hist()
    plt.show()
import logging

from sklearn.model_selection import LeaveOneGroupOut, cross_val_score

from setpos.tagger import CoreNLPTagger, TreeTagger
from setpos.data.split import load

if __name__ == '__main__':
    # Leave-one-document-out cross-validation accuracy for the CoreNLP tagger.
    logging.basicConfig(level=logging.INFO)
    toks, tags, groups = load(n=5000)
    clf = CoreNLPTagger()
    scores = cross_val_score(clf, toks, tags, groups,
                             cv=LeaveOneGroupOut(), n_jobs=-1)
    print(f"total accuracy: {scores.mean():.2%} ± {scores.std():.2%}")