Esempio n. 1
0
def linear_cv_score(dataset, alpha, l1_ratio, constraints):
    """Cross-validate the linear baseline and cache the per-fold scores.

    The result is memoized on disk: the cache file name is derived from the
    function name and all four arguments via ``cache_fname``. If that file
    exists it is loaded with ``dill`` and returned without recomputation.

    Parameters
    ----------
    dataset : str
        Dataset name; ``'ukp'`` uses 5 folds, anything else uses 3.
    alpha : float
        Regularization strength, passed for both alpha arguments of
        ``saga_decision_function``.
    l1_ratio : float
        Forwarded unchanged to ``saga_decision_function``.
    constraints
        Forwarded unchanged to the fitted baseline's ``fast_decode``.

    Returns
    -------
    list
        One score per fold, as returned by the baseline's ``_score``.
    """
    cache_key = (dataset, alpha, l1_ratio, constraints)
    cache_path = cache_fname("linear_cv_score", cache_key)

    # Fast path: reuse a previously stored result.
    if os.path.exists(cache_path):
        logging.info("Loading {}".format(cache_path))
        with open(cache_path, "rb") as cached:
            return dill.load(cached)

    load, ids = get_dataset_loader(dataset, split="train")
    if dataset == 'ukp':
        n_folds = 5
    else:
        n_folds = 3

    fold_scores = []
    for fold, (_, val_idx) in enumerate(KFold(n_folds).split(ids)):
        # Marginals on the held-out fold plus the fitted baseline model.
        marginals, model = saga_decision_function(dataset, fold, alpha, alpha,
                                                  l1_ratio)

        held_out = list(load(ids[val_idx]))
        gold = [doc.label for doc in held_out]
        decoded = model.fast_decode(marginals, held_out, constraints)

        fold_scores.append(model._score(gold, decoded))

    with open(cache_path, "wb") as out:
        logging.info("Saving {}".format(cache_path))
        dill.dump(fold_scores, out)
    return fold_scores
Esempio n. 2
0
def baseline_argrnn_cv_score(dataset, dynet_weight_decay, mlp_dropout,
                             rnn_dropout, prop_layers, constraints):
    """Cross-validate ``BaselineArgumentLSTM`` and cache the outcome.

    The result is memoized on disk under a name derived (via ``cache_fname``)
    from the function name and all six arguments; an existing cache file is
    loaded with ``dill`` and returned directly.

    Parameters
    ----------
    dataset : str
        Dataset name; ``'ukp'`` uses 5 folds, anything else uses 3.
    dynet_weight_decay
        Only used as part of the cache key inside this function.
    mlp_dropout, rnn_dropout, prop_layers, constraints
        Forwarded to ``BaselineArgumentLSTM`` as ``mlp_dropout``,
        ``lstm_dropout``, ``prop_mlp_layers`` and ``constraints``.

    Returns
    -------
    tuple
        ``(scores, score_at_iter, Y_pred)`` where ``scores`` holds each
        fold's ``scores_`` attribute, ``score_at_iter`` is the list of
        iteration counts handed to the model, and ``Y_pred`` is the
        concatenation of validation predictions over all folds.
    """
    cache_key = (dataset, dynet_weight_decay, mlp_dropout, rnn_dropout,
                 prop_layers, constraints)
    cache_path = cache_fname("baseline_argrnn_cv_score", cache_key)

    if os.path.exists(cache_path):
        logging.info("Cached file already exists.")
        with open(cache_path, "rb") as cached:
            return dill.load(cached)

    load, ids = get_dataset_loader(dataset, split="train")
    embeds = load_embeds(dataset)

    score_at_iter = [10, 25, 50, 75, 100]
    n_folds = 5 if dataset == 'ukp' else 3

    # Model configuration is identical for every fold, so build it once.
    model_settings = dict(lstm_dropout=rnn_dropout,
                          mlp_dropout=mlp_dropout,
                          prop_mlp_layers=prop_layers,
                          max_iter=100,
                          score_at_iter=score_at_iter,
                          n_mlp=128,
                          n_lstm=128,
                          lstm_layers=2,
                          link_mlp_layers=1,
                          embeds=embeds,
                          link_bilinear=True,
                          constraints=constraints)

    fold_scores = []
    pooled_pred = []
    for train_idx, val_idx in KFold(n_folds).split(ids):
        fold_train = list(load(ids[train_idx]))
        fold_val = list(load(ids[val_idx]))

        gold_train = [doc.label for doc in fold_train]
        gold_val = [doc.label for doc in fold_val]

        # A fresh model per fold; validation data is passed to fit as well.
        model = BaselineArgumentLSTM(**model_settings)
        model.fit(fold_train, gold_train, fold_val, gold_val)

        pooled_pred.extend(model.predict(fold_val))
        fold_scores.append(model.scores_)

    outcome = (fold_scores, score_at_iter, pooled_pred)
    with open(cache_path, "wb") as out:
        dill.dump(outcome, out)

    return outcome
Esempio n. 3
0
def store_optimized_embeddings(dataset, glove_path):
    """Save the GloVe vectors restricted to a dataset's training vocabulary.

    Collects every token of the dataset's training documents, passes that
    vocabulary and ``glove_path`` to ``optimize_glove``, and writes the
    returned vocabulary and embeddings to ``data/<dataset>-glove.npz``
    (arrays named ``vocab`` and ``embeds``). The share of the training
    vocabulary present in the returned GloVe vocabulary is logged as a
    percentage.

    Parameters
    ----------
    dataset : str
        Dataset name, used both to load documents and to name the output.
    glove_path
        Forwarded unchanged to ``optimize_glove``.
    """
    from marseille.datasets import get_dataset_loader

    target = os.path.join('data', '{}-glove.npz'.format(dataset))

    load, ids = get_dataset_loader(dataset, "train")
    train_vocab = {token for doc in load(ids) for token in doc.tokens()}

    glove_vocab, glove_embeds = optimize_glove(glove_path, train_vocab)

    # Fraction of training-vocabulary entries kept by optimize_glove.
    covered = len(glove_vocab) / len(train_vocab)

    np.savez(target, vocab=glove_vocab, embeds=glove_embeds)
    logging.info("GloVe coverage: {:.2f}%".format(100 * covered))
Esempio n. 4
0
def test_merge_spans():
    """Print proposition-label statistics for the cdcp training split.

    Tallies every entry of each document's ``prop_labels`` and separately
    counts how many of those entries are ``None``, then prints the label
    frequencies (most common first) followed by the ``None`` count.
    """
    from collections import Counter
    from marseille.datasets import get_dataset_loader

    load, ids = get_dataset_loader("cdcp", "train")

    tally = Counter()
    missing = 0
    for doc in load(ids):
        labels = doc.prop_labels
        tally.update(labels)
        # drops 14 links in training and 8 in test split
        missing += sum(1 for label in labels if label is None)

    print(tally.most_common())
    print(missing)
Esempio n. 5
0
def svmstruct_cv_score(dataset, C, class_weight, constraints,
                       compat_features, second_order_features):
    """Cross-validate the structured SVM and cache scores and predictions.

    The result is memoized on disk under a name derived (via ``cache_fname``)
    from the function name and all six arguments; an existing cache file is
    loaded with ``dill`` and returned directly.

    Parameters
    ----------
    dataset : str
        Dataset name; ``'ukp'`` uses 5 folds, anything else uses 3.
    C, class_weight, constraints, compat_features
        Forwarded unchanged to ``fit_predict``.
    second_order_features
        Forwarded to ``fit_predict`` and also used to derive the
        ``grandparents``, ``coparents`` and ``siblings`` arguments.

    Returns
    -------
    tuple
        ``(scores, all_Y_pred)``: one score per fold and the validation
        predictions of all folds concatenated in fold order.
    """
    cache_path = cache_fname(
        "svmstruct_cv_score",
        (dataset, C, class_weight, constraints, compat_features,
         second_order_features))

    if os.path.exists(cache_path):
        logging.info("Cached file already exists.")
        with open(cache_path, "rb") as cached:
            return dill.load(cached)

    load, ids = get_dataset_loader(dataset, split="train")

    is_ukp = dataset == 'ukp'
    n_folds = 5 if is_ukp else 3

    # Which second-order factors are enabled depends on the dataset:
    # grandparents only for ukp, siblings only for cdcp, coparents for both.
    coparents = second_order_features
    grandparents = second_order_features and is_ukp
    siblings = second_order_features and dataset == 'cdcp'

    fold_scores = []
    pooled_pred = []

    for train_idx, val_idx in KFold(n_folds).split(ids):
        fold_train = list(load(ids[train_idx]))
        fold_val = list(load(ids[val_idx]))

        clf, Y_val, Y_pred = fit_predict(
            fold_train, fold_val, dataset, C, class_weight, constraints,
            compat_features, second_order_features, grandparents, coparents,
            siblings)

        pooled_pred.extend(Y_pred)
        fold_scores.append(clf.model._score(Y_val, Y_pred))

    outcome = (fold_scores, pooled_pred)
    with open(cache_path, "wb") as out:
        dill.dump(outcome, out)

    return outcome
Esempio n. 6
0
def saga_decision_function(dataset, k, link_alpha, prop_alpha, l1_ratio):
    """Fit the linear baseline on fold ``k`` and score its validation part.

    The result is memoized on disk under a name derived (via ``cache_fname``)
    from ``"linear_val_df"`` and all five arguments; an existing cache file
    is loaded with ``dill`` and returned directly.

    Parameters
    ----------
    dataset : str
        Dataset name; ``'ukp'`` uses 5 folds, anything else uses 3.
    k : int
        Zero-based index of the cross-validation fold to evaluate.
    link_alpha, prop_alpha, l1_ratio
        Forwarded, in this order, to the ``BaselineStruct`` constructor.

    Returns
    -------
    tuple
        ``(Y_marg, baseline)``: the output of the fitted model's
        ``decision_function`` on the validation fold, and the fitted
        ``BaselineStruct`` itself.

    Raises
    ------
    ValueError
        If ``k`` does not match any fold index. Without this check the
        validation documents of the last fold would silently be paired
        with the feature files of fold ``k``.
    """
    fn = cache_fname("linear_val_df",
                     (dataset, k, link_alpha, prop_alpha, l1_ratio))

    if os.path.exists(fn):
        logging.info("Loading {}".format(fn))
        with open(fn, "rb") as f:
            return dill.load(f)

    # The preprocessed features live under a directory name that differs
    # from the dataset name used everywhere else.
    ds = 'erule' if dataset == 'cdcp' else 'ukp-essays'
    path = os.path.join("data", "process", ds, "folds", "{}", "{}")

    # Re-create the same split used elsewhere to recover the validation
    # documents belonging to fold k.
    n_folds = 5 if dataset == 'ukp' else 3
    load, ids = get_dataset_loader(dataset, "train")
    for k_, (_, val) in enumerate(KFold(n_folds).split(ids)):
        if k_ == k:
            break
    else:
        # The loop ran to completion, so no fold matched k.
        raise ValueError(
            "Fold index k={} is out of range for {} folds.".format(k,
                                                                   n_folds))
    val_docs = list(load(ids[val]))

    X_tr_link, y_tr_link = load_csr(path.format(k, 'train.npz'), return_y=True)
    X_te_link, y_te_link = load_csr(path.format(k, 'val.npz'), return_y=True)

    X_tr_prop, y_tr_prop = load_csr(path.format(k, 'prop-train.npz'),
                                    return_y=True)
    X_te_prop, y_te_prop = load_csr(path.format(k, 'prop-val.npz'),
                                    return_y=True)

    baseline = BaselineStruct(link_alpha, prop_alpha, l1_ratio)
    baseline.fit(X_tr_link, y_tr_link, X_tr_prop, y_tr_prop)

    Y_marg = baseline.decision_function(X_te_link, X_te_prop, val_docs)

    with open(fn, "wb") as f:
        logging.info("Saving {}".format(fn))
        dill.dump((Y_marg, baseline), f)

    return Y_marg, baseline
Esempio n. 7
0
            'Cannot create svg representation by running dot from string: {}'
            ''.format(dot_string))

    return out


def render_prediction(doc, Y):
    """Render a predicted structure for ``doc`` through ``_svg``.

    Node captions are built from ``Y.nodes`` as ``"(<i>) <label>"`` with
    ``i`` counting from 1 and the label formatted with the ``.2`` format
    spec (presumably string labels cut to two characters; confirm against
    callers). Links are the entries of ``doc.link_to_prop`` selected by
    indexing with ``Y.links``.

    Returns whatever ``_svg`` returns for those captions and links.
    """
    captions = []
    for position, node_label in enumerate(Y.nodes, 1):
        captions.append('({}) {:.2}'.format(position, node_label))

    selected_links = doc.link_to_prop[Y.links]
    return _svg(captions, selected_links)


if __name__ == '__main__':
    dataset = sys.argv[1]
    load, ids = get_dataset_loader(dataset, split="test")
    docs = list(load(ids))
    Y_true = [doc.label for doc in docs]

    prop_labels = (['MajorClaim', 'Claim', 'Premise'] if dataset == 'ukp'
                   else ['value', 'policy', 'testimony', 'fact', 'reference'])

    predictions = dict()
    model_names = []
    doc_scores = []
    for method in ("linear", "linear-struct", "rnn", "rnn-struct"):
        for model in ("bare", "full", "strict"):

            fn = tpl.format(dataset, method, model)

            if not os.path.isfile(fn):
Esempio n. 8
0
import numpy as np

from marseille.datasets import get_dataset_loader, load_embeds
from marseille.custom_logging import logging
from marseille.argrnn import BaselineArgumentLSTM, ArgumentLSTM
from marseille.io import load_csr

from .exp_svmstruct import fit_predict as fit_pred_pystruct
from .exp_linear import BaselineStruct

if __name__ == '__main__':
    exact_test = True
    dataset = 'cdcp'

    load_tr, ids_tr = get_dataset_loader(dataset, split="train")
    train_docs = list(load_tr(ids_tr))[:20]

    filename = "pickle_test"

    constraints = ''
    compat_features = False
    second_order = False

    grandparents = coparents = siblings = False

    Y_train = [doc.label for doc in train_docs]

    pkl = False

    if pkl:
Esempio n. 9
0
                   # "exact_predictions",
                   "exact=True_{}_{}_{}.predictions.dill")

if __name__ == '__main__':
    # The dataset name is taken from the first command-line argument.
    dataset = sys.argv[1]

    # Fail early on anything other than the two supported datasets.
    if dataset not in ('cdcp', 'ukp'):
        raise ValueError("Unknown dataset {}. "
                         "Supported: ukp|cdcp.".format(dataset))

    # Label inventories; the proposition label set depends on the dataset.
    link_labels = [False, True]
    prop_labels = (['MajorClaim', 'Claim', 'Premise'] if dataset == 'ukp'
                   else ['value', 'policy', 'testimony', 'fact', 'reference'])

    # get true test labels
    load_te, ids_te = get_dataset_loader(dataset, split='test')
    Y_true = [doc.label for doc in load_te(ids_te)]

    print("dataset={}".format(dataset))

    # Nested mapping: scores[method][model] is a dict per combination.
    scores = dict()
    for method in ("linear", "linear-struct", "rnn", "rnn-struct"):
        scores[method] = dict()
        for model in ("bare", "full", "strict"):
            scores_ = scores[method][model] = dict()

            # NOTE(review): tpl is defined earlier in the file; presumably
            # the path template for stored prediction files -- confirm.
            fn = tpl.format(dataset, method, model)

            # Skip combinations whose file is missing; their entry in
            # `scores` is left as the empty dict created above.
            if not os.path.isfile(fn):
                logging.info("Could not find {}".format(fn))
                continue
Esempio n. 10
0
        exp_train_test (cdcp|ukp) --method=M --model=N [--dynet-seed N --dynet-mem N]

    Options:
        --method: one of (linear, linear-struct, rnn, rnn-struct)
        --model: one of (bare, full, strict)
    """

    args = docopt(usage)

    dataset = 'cdcp' if args['cdcp'] else 'ukp'
    method = args['--method']
    model = args['--model']

    params = hyperparams[method][model][dataset]

    load_tr, ids_tr = get_dataset_loader(dataset, split="train")
    load_te, ids_te = get_dataset_loader(dataset, split="test")

    train_docs = list(load_tr(ids_tr))
    test_docs = list(load_te(ids_te))

    logging.info("{} {} on {} ({})".format(method, model, dataset, params))

    filename = os.path.join(
        'test_results', 'exact={}_{}_{}_{}'.format(exact_test, dataset, method,
                                                   model))
    if not os.path.exists('test_results'):
        os.makedirs('test_results')

    # logic for constraints and compat features
    # note that compat_features and second_order aren't used