Example 1
def calc_results():
    doc = 'St2'
    parts = 50
    toks, tags, groups = load(tag_prefix_masks=[])

    # train - test split
    train, test = next(MCInDocSplitter(seed=SEED).split(toks, tags, groups))

    # take the train and eval partitions and restrict both to the chosen document
    x_t, y_t, g_t = [l[train] for l in [toks, tags, groups]]
    x_e, y_e, g_e = [l[test] for l in [toks, tags, groups]]
    x_t, y_t = [l[g_t == doc] for l in [x_t, y_t]]
    x_e, y_e = [l[g_e == doc] for l in [x_e, y_e]]

    d = {}
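    # nine training-set sizes: 1%, 2%, 4%, 6%, 12%, 22%, 40%, 70% and 100% of the document's training tokens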
    sizes = (1 + np.array([-.5, 0, 1, 2, 5, 10, 19, 34, 49])) * x_t.shape[0] / parts
    for frac in sizes:
        frac = int(frac)
        print(frac, x_e.shape[0])
        x_t_, y_t_ = [l[:frac] for l in [x_t, y_t]]
        maskedtoks, maskedtags = np.vstack((x_t_, x_e)), np.append(y_t_, y_e)
        for clf in clfs:
            istrain = np.append(np.full(y_t_.shape, True),
                                np.full(y_e.shape, False))
            clfname = ('baseline' if clf.get('clf', None) == MostFrequentTag
                       else r'\textsc{c}ore\textsc{nlp}')
            # two clfs, nine fractions, 1 doc, 1 metric -> lineplots (both clfs, over fractions) with marks and a dashed line
            #           table (clf rows, fraction columns)
            with stopwatch():
                _, scores = evaluate(clf, maskedtoks, maskedtags, istrain)
            d[(clfname, frac)] = scores
            print(scores)
        print()
    return d
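The dict keyed by (clfname, frac) maps directly onto the table sketched in the comment; assuming evaluate returns a Series of metrics, it can be pivoted with pandas (the same pattern Example 6 uses):

df = pd.DataFrame(d).T  # rows: (clf, fraction) pairs, columns: metrics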
Example 2
def calc_results():
    toks, tags, groups = load(tag_prefix_masks=[])  # for quick tests: [l[:3000] for l in load(tag_prefix_masks=[])]
    ctr = Counter(groups)

    # train - test split
    train, test = next(MCInDocSplitter(seed=SEED).split(toks, tags, groups))

    # take the training data for train/eval cross-validation
    toks_t, tags_t, groups_t = [l[train] for l in [toks, tags, groups]]

    # take the held-out eval data
    toks_e, tags_e, groups_e = [l[test] for l in [toks, tags, groups]]

    d = {}
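    # evaluate only the documents with more than 7000 tokens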
    for doc, _ in filter(lambda t: t[1] > 7000, ctr.items()):
        mask = groups != doc
        mask_e = groups_e == doc
        Xtrain, ytrain = toks[mask], tags[mask]
        Xeval, yeval = toks_e[mask_e], tags_e[mask_e]
        print(doc, np.sum(mask), np.sum(mask_e))
        maskedtoks, maskedtags = np.vstack((Xtrain, Xeval)), np.append(ytrain, yeval)
        for clf in clfs:
            istrain = np.append(np.full(ytrain.shape, True), np.full(yeval.shape, False))

            clfname = ('baseline' if clf.get('clf', None) == MostFrequentTag else
                       ('TreeTagger' if clf.get('clf', None) == TreeTagger
                        else r'\textsc{c}ore\textsc{nlp}'))
            docname = ''.join((r'\textsc{' + c.lower() + '}' if re.match('[A-Z]', c) else c)
                              for c in doc)
            # two clfs, 9 docs, 3 metrics -> two grouped barplots (over datasets), one each for accuracy and utility
            #           table (clf rows, dataset columns, per-metric subtable in each cell)
            with stopwatch():
                _, scores = evaluate(clf, maskedtoks, maskedtags, istrain)
            d[(docname, clfname)] = scores
            print(scores)
    print()
    return d
Example 3
def calc_results():
    toks, tags, groups = load(tag_prefix_masks=[])  # for quick tests: [l[:3000] for l in load(tag_prefix_masks=[])]

    # train - test split
    train, test = next(MCInDocSplitter(seed=SEED).split(toks, tags, groups))

    # take the train and eval partitions
    Xtrain, ytrain, _ = [l[train] for l in [toks, tags, groups]]
    Xeval, yeval, _ = [l[test] for l in [toks, tags, groups]]

    maskedtoks = np.vstack((Xtrain, Xeval))
    maskedtags = np.append(ytrain, yeval)
    istrain = np.append(np.full(ytrain.shape, True),
                        np.full(yeval.shape, False))

    d = {}
    for params in clfs:
        if params.get('clf', None) == MostFrequentTag:
            clfname = 'baseline'
            clf, total_score = evaluate(params, maskedtoks, maskedtags,
                                        istrain)
            print('known word frac:', clf.knownwords(Xeval))
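            # re-score the fitted baseline restricted to known words, then to unknown words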
            clf.scope = 'known'
            _, known_scores = evaluate(params,
                                       maskedtoks,
                                       maskedtags,
                                       istrain,
                                       oldclf=clf)
            clf.scope = 'unk'
            _, unk_scores = evaluate(params,
                                     maskedtoks,
                                     maskedtags,
                                     istrain,
                                     oldclf=clf)
            df = pd.DataFrame([unk_scores, known_scores, total_score]).T
        else:
            clfname = r'\textsc{c}ore\textsc{nlp}'
            with stopwatch():
                # two clfs, 3 scopes, 3 metrics
                _, score = evaluate(params,
                                    maskedtoks,
                                    maskedtags,
                                    istrain,
                                    raw=True)
            df = pd.DataFrame([
                score.loc[measure, :]
                for measure in ['accuracy', 'avg util', 'avg setsize']
            ])
        df.index = ['Accuracy', 'Utility', 'Set size']
        df.columns = ['Unknown', 'Known', 'Total']
        for col in df:
            d[(clfname, col)] = df[col]
    print()
    return d
Example 4
def evaluate(rules, n=1):
    with stopwatch(False) as sw:
        clf = CoreNLPTagger(tag_expansion_rules=rules)

        scores = pd.Series(
            cross_val_score(clf, toks, tags, groups, cv=MCInDocSplitter(train_frac=.8, splits=n), n_jobs=-1))
        acc, ci = scores.mean(), scores.sem() * 1.96

    # print accuracy and current ruleset
    print(f'{acc:.2%}±{ci:.2%} with {curr_rules} took {sw.interval:.2f}s')
    return dict(acc=acc, ci=ci, n=n)
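All of these examples time their work with stopwatch from setpos.util, which is not shown here; a minimal stand-in consistent with the calls above (the verbose flag and the interval/elapsed attributes are assumptions) could be:

import time
from contextlib import contextmanager
from types import SimpleNamespace


@contextmanager
def stopwatch(verbose=True):
    # hypothetical re-implementation for illustration; the real helper may differ
    t = SimpleNamespace()
    start = time.perf_counter()
    try:
        yield t
    finally:
        t.interval = t.elapsed = time.perf_counter() - start  # both names are used above
        if verbose:
            print(f'took {t.interval:.2f}s')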
Example 5
def calc_results():
    toks, tags, groups = load(tag_prefix_masks=[])

    # train - test split
    train, test = next(MCInDocSplitter(seed=SEED).split(toks, tags, groups))

    # take the train and eval partitions
    Xtrain, ytrain, _ = [l[train] for l in [toks, tags, groups]]
    Xeval, yeval, _ = [l[test] for l in [toks, tags, groups]]

    maskedtoks = np.vstack((Xtrain, Xeval))
    maskedtags = np.append(ytrain, yeval)
    istrain = np.append(np.full(ytrain.shape, True),
                        np.full(yeval.shape, False))
    with stopwatch():
        df, _ = evaluate(clf, maskedtoks, maskedtags, istrain, raw=True)
    return df
Example 6
def calc_results(betas=None):
    toks, tags, groups = load(tag_prefix_masks=[])

    # train - test split
    train, test = next(MCInDocSplitter(seed=SEED).split(toks, tags, groups))

    # take the train and eval partitions
    Xtrain, ytrain, _ = [l[train] for l in [toks, tags, groups]]
    Xeval, yeval, eval_g = [l[test] for l in [toks, tags, groups]]

    d = {}
    print(Xtrain.shape[0], Xeval.shape[0])

    for clf in clfs:
        clfname = ('baseline' if clf.get('clf', None) == MostFrequentTag else
                   ('TreeTagger' if clf.get('clf', None) == TreeTagger
                    else r'\textsc{c}ore\textsc{nlp}'))
        # two clfs, 9 docs, 3 metrics -> two grouped barplots (over datasets), one each for accuracy and utility
        #           table (clf rows, dataset columns, per-metric subtable in each cell)
        with stopwatch():
            old_clf = None
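            # itertools.groupby only merges consecutive runs, so eval_g must be ordered by document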
            for g, _ in groupby(eval_g):
                Xeval_, yeval_ = [l[eval_g == g] for l in [Xeval, yeval]]
                maskedtoks = np.vstack((Xtrain, Xeval_))
                maskedtags = np.append(ytrain, yeval_)
                istrain = np.append(np.full(ytrain.shape, True),
                                    np.full(yeval_.shape, False))
                old_clf, scores = evaluate(clf,
                                           maskedtoks,
                                           maskedtags,
                                           istrain,
                                           oldclf=old_clf,
                                           betas=betas)

                d[(clfname, g)] = scores
        print(pd.DataFrame(d).T.loc[clfname].mean().tolist())
    print()
    return d
Example 7
def evaluate(cfg, toks, tags, groups, timeout=1200, seed=0, k=5):
    params = parse_params(cfg)
    params_key = json.dumps(params)
    print(params_key)
    with stopwatch(verbose=False) as sw:
        if 'filter_tags' in params:
            mask = np.vectorize(partial(is_masked, prefixes=params['filter_tags']))(tags)
            toks, tags, groups = [a[mask] for a in [toks, tags, groups]]
        clf_class = params.get('clf', TreeTagger)

        # remove higher level parameters
        params = {k: v for k, v in params.items() if k not in {'filter_tags', 'clf'}}

        clf = clf_class(**params)
        # Note: because of a SMAC bug, running loky in parallel yields many
        # error messages; they do not affect execution.
        scores = cross_val_score(clf, toks, tags, groups, cv=KFoldInDocSplitter(k=k, seed=seed), n_jobs=-1)

        elapsed = sw.elapsed
    score = scores.mean()
    print(dict(score=score, seed=seed, elapsed=elapsed, params=params_key, hash_of_param=hash(params_key)))

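    # SMAC minimizes its objective, so return the error rate (1 - mean accuracy)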
    return 1 - score
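Since evaluate already returns 1 - score, it can be handed to a minimizer as-is; a hypothetical binding of the data arguments could look like:

from functools import partial

objective = partial(evaluate, toks=toks, tags=tags, groups=groups)
# the optimizer (e.g. SMAC) then only supplies the configuration: objective(cfg)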
Example 8
import json

import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import confusion_matrix

from setpos.tagger import CoreNLPTagger
from setpos.data.split import MCInDocSplitter, load
from setpos.util import stopwatch

if __name__ == '__main__':
    SEED, n = 7, 6
    toks, tags, groups = load()  # for quick tests: [l[:3000] for l in load()]

    with stopwatch():
        targets = []
        preds = []
        for train, test in MCInDocSplitter(seed=SEED).split(
                toks, tags, groups):
            clf = CoreNLPTagger()
            clf.fit(toks[train], tags[train])
            targets.extend(
                [list(json.loads(tags_).keys())[0] for tags_ in tags[test]])
            preds.extend(clf.predict(toks[test]))

    tags = set(targets) | set(preds)
    print(len(tags))
    conf_mat = confusion_matrix(targets, preds)
    with np.errstate(divide='ignore', invalid='ignore'):
        # normalize each row by its true-tag count (keepdims keeps the division row-wise)
        conf_mat = conf_mat / conf_mat.sum(axis=1, keepdims=True)
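The seaborn import above suggests the normalized matrix is plotted next; a minimal, assumed continuation (the matplotlib import is an addition):

import matplotlib.pyplot as plt

labels = sorted(tags)  # confusion_matrix orders rows/columns by sorted label
ax = sns.heatmap(pd.DataFrame(conf_mat, index=labels, columns=labels))
ax.set(xlabel='predicted tag', ylabel='true tag')
plt.show()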
Example 9

def merge(rules: List[frozenset]) -> Iterable[frozenset]:
    """Drop every rule that is a proper subset of another rule."""
    merged = rules.copy()
    for rule in rules:
        if any(rule < r for r in merged):
            merged.remove(rule)
    return merged
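A quick demonstration of merge, with a redundant subset rule added to hypothetical input (for illustration only):

rules = [frozenset({'DSG'}), frozenset({'DSG', 'DGN'}), frozenset({'DDSA', 'DDD'})]
print([sorted(r) for r in merge(rules)])  # -> [['DGN', 'DSG'], ['DDD', 'DDSA']]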


def hand_crafted_rules():
    return [{'DSG', 'DGN'}, {'DDSA', 'DDD'}]


if __name__ == '__main__':
    with shelve.open('results_handcraft.pkl') as db, stopwatch():
        try:
            # generate all base-rules
            # rules = [r for r in gen_base_rules(max_len_powerset=1)]
            rules = list(hand_crafted_rules())
            print(f'generated {len(rules)} rules', flush=True)

            toks, tags, groups = load()  # for quick tests: [l[:3000] for l in load()]
            curr_rules = []
            if 'rejected_rules' not in db:
                db['rejected_rules'] = []
            if 'results' not in db:
                db['results'] = {}
            else:
                print(f'loading {len(db["results"])} previous results')