def saga_decision_function(dataset, k, link_alpha, prop_alpha, l1_ratio):
    """Fit the linear baseline on fold ``k`` of ``dataset`` and return its
    validation-set decision values, memoizing the result on disk.

    Returns
    -------
    (Y_marg, baseline)
        Per-document marginals and the fitted ``BaselineStruct`` model.
    """
    cache_path = cache_fname("linear_val_df",
                             (dataset, k, link_alpha, prop_alpha, l1_ratio))

    # Fast path: a previous run already serialized this configuration.
    if os.path.exists(cache_path):
        logging.info("Loading {}".format(cache_path))
        with open(cache_path, "rb") as fh:
            return dill.load(fh)

    # Map the dataset name onto its on-disk directory layout.
    subdir = 'erule' if dataset == 'cdcp' else 'ukp-essays'
    fold_tmpl = os.path.join("data", "process", subdir, "folds", "{}", "{}")

    # Recover the raw validation documents belonging to fold `k`.
    n_folds = 5 if dataset == 'ukp' else 3
    load, ids = get_dataset_loader(dataset, "train")
    for fold_ix, (_, val) in enumerate(KFold(n_folds).split(ids)):
        if fold_ix == k:
            break
    val_docs = list(load(ids[val]))

    # Vectorized features for link and proposition classification.
    X_tr_link, y_tr_link = load_csr(fold_tmpl.format(k, 'train.npz'),
                                    return_y=True)
    X_te_link, y_te_link = load_csr(fold_tmpl.format(k, 'val.npz'),
                                    return_y=True)
    X_tr_prop, y_tr_prop = load_csr(fold_tmpl.format(k, 'prop-train.npz'),
                                    return_y=True)
    X_te_prop, y_te_prop = load_csr(fold_tmpl.format(k, 'prop-val.npz'),
                                    return_y=True)

    baseline = BaselineStruct(link_alpha, prop_alpha, l1_ratio)
    baseline.fit(X_tr_link, y_tr_link, X_tr_prop, y_tr_prop)
    Y_marg = baseline.decision_function(X_te_link, X_te_prop, val_docs)

    # Persist marginals + model so repeated sweeps skip refitting.
    with open(cache_path, "wb") as fh:
        logging.info("Saving {}".format(cache_path))
        dill.dump((Y_marg, baseline), fh)

    return Y_marg, baseline
def main():
    """Command-line entry point: score trivial link-prediction baselines.

    For every cross-validation fold, loads the vectorized validation
    split and evaluates four rule-based predictors — all links false,
    all links true, and the "adjacent propositions" heuristic in each
    direction — then prints precision/recall/F1 averaged over folds.
    """
    from docopt import docopt

    usage = """
    Usage:
        baselines (cdcp|ukp) [--n-folds=N]

    Options:
        --n-folds=N  number of cross-val folds to generate. [default: 3]
    """

    args = docopt(usage)
    n_folds = int(args['--n-folds'])

    def _score(y_true, y_pred):
        # Binary P/R/F for the positive (link) class.
        p, r, f, _ = precision_recall_fscore_support(y_true, y_pred,
                                                     pos_label=1,
                                                     average='binary')
        return p, r, f

    all_true = []
    all_false = []
    adjacent = []
    adjacent_ltr = []
    adjacent_rtl = []

    if args['cdcp']:
        path = os.path.join("data", "process", "erule", "folds", "{}", "{}")
    elif args['ukp']:
        path = os.path.join("data", "process", "ukp-essays", "folds",
                            "{}", "{}")

    for k in range(n_folds):
        fname = path.format(k, 'val.npz')
        logging.info("Loading sparse vectorized file {}".format(fname))
        X_te, y_te = load_csr(fname, return_y=True)

        with open(path.format(k, "fnames.txt")) as f:
            fnames = [line.strip() for line in f]

        props_between = fnames.index('nrm__props_between')
        src_precedes_trg = fnames.index('raw__src_precedes_trg')
        trg_precedes_src = fnames.index('raw__trg_precedes_src')

        y_all_true = np.ones_like(y_te)
        y_all_false = np.zeros_like(y_te)

        # Two propositions are "adjacent" iff nothing lies between them.
        # .toarray() replaces the deprecated sparse .A shorthand.
        y_adj = ~(X_te[:, props_between] != 0).toarray().ravel()
        # np.bool was removed in NumPy 1.24; the builtin behaves identically.
        is_src_first = X_te[:, src_precedes_trg].astype(bool).toarray().ravel()
        is_trg_first = X_te[:, trg_precedes_src].astype(bool).toarray().ravel()
        y_adj_ltr = y_adj & is_src_first
        y_adj_rtl = y_adj & is_trg_first

        all_true.append(_score(y_te, y_all_true))
        all_false.append(_score(y_te, y_all_false))
        adjacent.append(_score(y_te, y_adj))
        adjacent_ltr.append(_score(y_te, y_adj_ltr))
        adjacent_rtl.append(_score(y_te, y_adj_rtl))

    preds = (all_false, all_true, adjacent, adjacent_ltr, adjacent_rtl)
    preds = [np.array(x).mean(axis=0) for x in preds]
    names = ["All false", "All true", "Adjacent", "Adj s -> t", "Adj t <- s"]
    for name, scores in zip(names, preds):
        print("{:18} {:.4f} {:.4f} {:.4f}".format(name, *scores))
def saga_cv(which, alphas, l1_ratio):
    """Cross-validate elastic-net SAGA classifiers for links and props.

    Parameters
    ----------
    which : {'cdcp', 'ukp'}
        Which dataset's precomputed folds to load from disk.
    alphas : sequence of float
        Overall regularization strengths to sweep.
    l1_ratio : float
        Fraction of each alpha assigned to the l1 penalty (rest is l2).

    Returns
    -------
    (link_scores, prop_scores)
        Two ``(n_folds, len(alphas))`` arrays holding binary link-F1 and
        macro proposition-F1 on each fold's validation split.
    """
    if which == 'cdcp':
        n_folds = 3
        path = os.path.join("data", "process", "erule", "folds", "{}", "{}")
    elif which == 'ukp':
        n_folds = 5
        path = os.path.join("data", "process", "ukp-essays", "folds",
                            "{}", "{}")
    else:
        raise ValueError("which must be 'cdcp' or 'ukp', got {!r}"
                         .format(which))

    clf_link = SAGAClassifier(loss='smooth_hinge', penalty='l1', tol=1e-4,
                              max_iter=100, random_state=0, verbose=0)
    clf_prop = clone(clf_link)

    link_scores = np.zeros((n_folds, len(alphas)))
    prop_scores = np.zeros_like(link_scores)

    for k in range(n_folds):
        X_tr_link, y_tr_link = load_csr(path.format(k, 'train.npz'),
                                        return_y=True)
        X_te_link, y_te_link = load_csr(path.format(k, 'val.npz'),
                                        return_y=True)
        X_tr_prop, y_tr_prop = load_csr(path.format(k, 'prop-train.npz'),
                                        return_y=True)
        X_te_prop, y_te_prop = load_csr(path.format(k, 'prop-val.npz'),
                                        return_y=True)

        le = LabelEncoder()
        y_tr_prop_enc = le.fit_transform(y_tr_prop)
        y_te_prop_enc = le.transform(y_te_prop)

        # Reweight link samples: the positive (linked) class is rare.
        link_sw = compute_sample_weight('balanced', y_tr_link)

        for j, alpha in enumerate(alphas):
            # Split the overall strength into l1 (beta) and l2 parts,
            # without mutating the loop variable as the old code did.
            beta = alpha * l1_ratio
            alpha_l2 = alpha * (1 - l1_ratio)
            clf_link.set_params(alpha=alpha_l2, beta=beta)
            clf_prop.set_params(alpha=alpha_l2, beta=beta)

            clf_link.fit(X_tr_link, y_tr_link, sample_weight=link_sw)
            y_pred_link = clf_link.predict(X_te_link)

            clf_prop.fit(X_tr_prop, y_tr_prop_enc)
            y_pred_prop = clf_prop.predict(X_te_prop)

            # catch_warnings() yields None unless record=True, so the old
            # `as w` binding was meaningless and is dropped; we only want
            # to silence undefined-metric warnings from f1_score.
            with warnings.catch_warnings():
                warnings.simplefilter('ignore')
                link_f = f1_score(y_te_link, y_pred_link, average='binary')
                prop_f = f1_score(y_te_prop_enc, y_pred_prop,
                                  average='macro')

            link_scores[k, j] = link_f
            prop_scores[k, j] = prop_f

    return link_scores, prop_scores
# NOTE(review): fragment of a larger experiment driver — the opening `if`
# branches and the enclosing function signature are outside this view, as
# is the body of the trailing `with` statement. Indentation reconstructed.
elif model == 'strict':
    # Strict decoding: dataset-specific constraints plus compat features
    # and second-order structure — presumably the most constrained model.
    constraints = '{}+strict'.format(dataset)
    compat_features = True
    second_order = True
else:
    raise ValueError('Invalid model: {}'.format(model))

# logic for which second order features to use, if any
grandparents = second_order and dataset == 'ukp'
coparents = second_order
siblings = second_order and dataset == 'cdcp'

if method == 'linear':
    # Train/test (not CV) split of the vectorized features on disk.
    ds = 'erule' if dataset == 'cdcp' else 'ukp-essays'
    path = os.path.join("data", "process", ds, "folds", "traintest", "{}")
    X_tr_link, y_tr_link = load_csr(path.format('train.npz'),
                                    return_y=True)
    X_te_link, y_te_link = load_csr(path.format('test.npz'),
                                    return_y=True)
    X_tr_prop, y_tr_prop = load_csr(path.format('prop-train.npz'),
                                    return_y=True)
    X_te_prop, y_te_prop = load_csr(path.format('prop-test.npz'),
                                    return_y=True)

    # A single alpha is reused for both link and prop models; pure l2
    # (l1_ratio=0). `params` and `exact_test` come from outside the view.
    baseline = BaselineStruct(alpha_link=params['alpha'],
                              alpha_prop=params['alpha'],
                              l1_ratio=0,
                              exact_test=exact_test)
    baseline.fit(X_tr_link, y_tr_link, X_tr_prop, y_tr_prop)

    Y_pred = baseline.predict(X_te_link, X_te_prop, test_docs, constraints)

    # Persist the fitted model (the `with` body lies outside this view).
    with open('{}.model.pickle'.format(filename), "wb") as fp:
def saga_score_struct(which, link_alpha, prop_alpha, l1_ratio, decode=False):
    """Cross-validate the BaselineStruct model with structured scoring.

    Parameters
    ----------
    which : {'cdcp', 'ukp'}
        Dataset selector; also passed through as the constraint set name
        during decoding.
    link_alpha, prop_alpha : float
        Regularization strengths for the link and proposition models.
    l1_ratio : float
        Elastic-net mixing parameter forwarded to BaselineStruct.
    decode : bool, optional
        If True, run constrained inference per document; otherwise just
        round the raw marginals into hard predictions.

    Returns
    -------
    (scores, all_Y_pred)
        Per-fold structured scores and the concatenated predictions.
    """
    if which == 'cdcp':
        n_folds = 3
        ids = np.array(cdcp_train_ids)
        path = os.path.join("data", "process", "erule", "folds", "{}", "{}")
        _tpl = os.path.join("data", "process", "erule", "{}", "{:05d}")
        # NOTE(review): the lambda's `which` parameter shadows the outer
        # argument; here it selects the split subdirectory ("train").
        _load = lambda which, ks: (CdcpArgumentationDoc(_tpl.format(which, k))
                                   for k in ks)
    elif which == 'ukp':
        n_folds = 5
        ids = np.array(ukp_train_ids)
        path = os.path.join("data", "process", "ukp-essays", "folds",
                            "{}", "{}")
        _tpl = os.path.join("data", "process", "ukp-essays", "essay{:03d}")
        # `which` is accepted but unused: essay paths have no split dir.
        _load = lambda which, ks: (UkpEssayArgumentationDoc(_tpl.format(k))
                                   for k in ks)
    else:
        raise ValueError

    baseline = BaselineStruct(link_alpha, prop_alpha, l1_ratio)

    all_Y_pred = []
    scores = []
    for k, (tr, val) in enumerate(KFold(n_folds).split(ids)):
        val_docs = list(_load("train", ids[val]))

        # Gold structured labels, built from the per-doc feature dicts.
        Y_true = []
        for doc in val_docs:
            y_prop = np.array([str(f['label_']) for f in doc.prop_features])
            y_link = np.array([f['label_'] for f in doc.features])
            Y_true.append(DocLabel(y_prop, y_link))

        X_tr_link, y_tr_link = load_csr(path.format(k, 'train.npz'),
                                        return_y=True)
        X_te_link, y_te_link = load_csr(path.format(k, 'val.npz'),
                                        return_y=True)
        X_tr_prop, y_tr_prop = load_csr(path.format(k, 'prop-train.npz'),
                                        return_y=True)
        X_te_prop, y_te_prop = load_csr(path.format(k, 'prop-val.npz'),
                                        return_y=True)

        baseline.fit(X_tr_link, y_tr_link, X_tr_prop, y_tr_prop)
        Y_marg = baseline.decision_function(X_te_link, X_te_prop, val_docs)

        # Compatibility potentials fixed at zero: the baseline has no
        # learned prop-prop-link compatibility factor.
        zero_compat = np.zeros((baseline.n_prop_states,
                                baseline.n_prop_states,
                                baseline.n_link_states))

        if decode:
            # Constrained inference per document; tally solver statuses.
            statuses = Counter()
            Y_pred = []
            for doc, y in zip(val_docs, Y_marg):
                # Attach the link->(src, trg) mapping that inference needs.
                doc.link_to_node_ = np.array(
                    [(f['src__prop_id_'], f['trg__prop_id_'])
                     for f in doc.features],
                    dtype=np.intp)
                doc.second_order_ = []
                # Higher-order potentials are empty for the baseline.
                potentials = (y.nodes, y.links, zero_compat, [], [], [])
                y_decoded, status = baseline._inference(doc, potentials,
                                                        relaxed=False,
                                                        constraints=which)
                Y_pred.append(y_decoded)
                statuses[status] += 1
            logging.info("Test inference status: " + ", ".join(
                "{:.1f}% {}".format(100 * val / len(val_docs), key)
                for key, val in statuses.most_common()))
        else:
            # No decoding: round raw marginals into hard labels.
            Y_pred = [
                baseline._round(y.nodes, y.links, inverse_transform=True)
                for y in Y_marg
            ]

        all_Y_pred.extend(Y_pred)
        scores.append(baseline._score(Y_true, Y_pred))

    return scores, all_Y_pred