def main():
    """Train a linear model (logreg / SVC / SVR) on document features and report results."""
    args = parse_args()
    # When evaluating on the held-out test set, train on train+dev combined;
    # otherwise train on train and evaluate on dev.
    if args.eval_on_test:
        train_meta = load_split("train,dev", round_cefr=args.round_cefr)
        test_meta = load_split("test", round_cefr=args.round_cefr)
    else:
        train_meta = load_split("train", round_cefr=args.round_cefr)
        test_meta = load_split("dev", round_cefr=args.round_cefr)
    # num_features is unused here, but is part of preprocess' return signature.
    train_x, test_x, num_features = preprocess(
        args.kind, None, train_meta, test_meta, eval_on_test=args.eval_on_test
    )
    print(train_x.shape)
    print(test_x.shape)
    # Map CEFR labels to integer class indices (sorted for determinism).
    labels = sorted(train_meta.cefr.unique())
    train_y = [labels.index(c) for c in train_meta.cefr]
    test_y = [labels.index(c) for c in test_meta.cefr]
    print(len(train_y))
    print(len(test_y))
    print("Fitting classifier ...")
    if args.algorithm == "logreg":
        clf = LogisticRegression(solver="lbfgs", multi_class="multinomial")
    elif args.algorithm == "svc":
        clf = LinearSVC()
    elif args.algorithm == "svr":
        # NOTE(review): clf is unbound for any other algorithm value —
        # presumably argparse restricts the choices; verify in parse_args.
        clf = LinearSVR()
    clf.fit(train_x, train_y)
    predictions = clf.predict(test_x)
    if args.algorithm == "svr":
        # Regression output: round to nearest integer, clip to the label range.
        predictions = np.clip(np.floor(predictions + 0.5), 0, max(train_y))
    report(test_y, predictions, labels)
    if args.nli:
        name = "linear_%s_nli" % args.algorithm
    else:
        name = "linear_" + args.algorithm
    name = get_file_name(name)
    save_results(name, args.__dict__, None, test_y, predictions)
def iterate_docs(split: str = 'train') -> Iterable[Iterable[str]]:
    """Yield a lazy token stream for each document in a split.

    Args:
        split: Name of the split to iterate over.

    Yields:
        One generator per document, producing its whitespace-separated
        tokens. Each generator must be consumed before advancing to the
        next document, since the underlying file is closed at that point.
    """

    def _tokens(handle):
        for row in handle:
            yield from row.split()

    meta = load_split(split)
    for doc_name in meta.filename:
        txt_path = (data_folder / 'txt' / doc_name).with_suffix('.txt')
        with txt_path.open(encoding='utf-8') as handle:
            yield _tokens(handle)
def iterate_pos_docs(split: str = 'train') -> Iterable[Iterable[str]]:
    """Yield a generator of UPOS tags for each document in a split.

    Args:
        split: Name of the split to iterate over.

    Yields:
        One generator per document, producing the UPOS tag of each token.
    """

    def _inner_iter(sents):
        # Iterate over the argument rather than the enclosing loop variable:
        # the original ignored its parameter and read the closure's ``sents``,
        # so every yielded generator would see whichever document ``sents``
        # last referred to (late binding) unless consumed strictly in order.
        for sent in sents:
            for (pos,) in sent:
                yield pos

    meta = load_split(split)
    filenames = filename_iter(meta, suffix='conll')
    for filename in filenames:
        sents = conll_reader(filename, cols=['UPOS'], tags=False)
        yield _inner_iter(sents)
def main():
    """Evaluate previously trained models on the test split and save results."""
    args = parse_args()
    model_paths = find_model_paths(args.job_ids)
    print(model_paths)
    if args.collapsed:
        # Collapsed mode: coarse CEFR scale (class indices 0..3).
        test_meta = load_split("test", round_cefr=True)
        round_target_scores = np.array(
            [ROUND_CEFR_LABELS.index(c) for c in test_meta["cefr"]], dtype=int)
        targets = round_target_scores
        highest_class = 3
        labels = ROUND_CEFR_LABELS
    else:
        # Full CEFR scale (class indices 0..6).
        test_meta = load_split("test", round_cefr=False)
        target_scores = np.array(
            [CEFR_LABELS.index(c) for c in test_meta["cefr"]], dtype=int)
        targets = target_scores
        highest_class = 6
        labels = CEFR_LABELS
    for model_path in model_paths:
        model, w2i = load_model_and_w2i(model_path)
        # Models may take one or two inputs and may have auxiliary outputs.
        multi_input = isinstance(model.input, list) and len(model.input) == 2
        multi_output = isinstance(model.outputs, list) and len(model.outputs) > 1
        x = get_input_reps(w2i, multi_input)
        del w2i
        predictions = get_predictions(model, x, multi_output)
        # Free large objects before loading the next model.
        del x
        del model
        # Round to integers and clip to score range
        pred = rescale_regression_results(predictions, highest_class).ravel()
        report(targets, pred, labels)
        name = model_path.stem + "_test_eval"
        save_results(name, {}, {}, targets, pred)
def bag_of_words(split, **kwargs):
    """Fit a CountVectorizer on a split.

    Args:
        split: Name of the split {'train', 'test', 'dev'}
        **kwargs: Are passed to CountVectorizer's constructor

    Returns:
        The transformed documents and the trained vectorizer.
    """
    meta = load_split(split)
    vectorizer = CountVectorizer(input='filename', **kwargs)
    matrix = vectorizer.fit_transform(filename_iter(meta))
    return matrix, vectorizer
def iterate_mixed_pos_docs(split: str = 'train'):
    """Mixed POS-Function Word n-grams.

    E.g. NOUN kan også VERB NOUN til å VERB dem

    Stop words (function words) are kept as surface forms; every other
    token is replaced by its UPOS tag.

    Args:
        split: Name of the split to iterate over.

    Yields:
        One generator per document, producing mixed form/POS tokens.
    """
    sw = get_stopwords()

    def _inner_iter(sents):
        # Iterate over the argument rather than the enclosing loop variable:
        # the original ignored its parameter and read the closure's ``sents``,
        # so every yielded generator would see whichever document ``sents``
        # last referred to (late binding) unless consumed strictly in order.
        for sent in sents:
            for (form, pos) in sent:
                if form.lower() in sw:
                    yield form
                else:
                    yield pos

    meta = load_split(split)
    filenames = filename_iter(meta, suffix='conll')
    for filename in filenames:
        sents = conll_reader(filename, cols=['FORM', 'UPOS'], tags=False)
        yield _inner_iter(sents)
def main():
    """Train a windowed (CNN-style) sequence model on CEFR or NLI labels; evaluate on dev."""
    args = parse_args()
    set_reproducible(args.seed_delta)
    train_meta = load_split("train", round_cefr=args.round_cefr)
    dev_meta = load_split("dev", round_cefr=args.round_cefr)
    # NLI mode predicts the native language instead of the CEFR score.
    target_col = "lang" if args.nli else "cefr"
    labels = sorted(train_meta[target_col].unique())
    train_x, dev_x, num_pos, w2i = get_sequence_input_reps(args)
    args.vocab_size = len(w2i)
    print("Vocabulary size is {}".format(args.vocab_size))
    train_target_scores = np.array(
        [labels.index(c) for c in train_meta[target_col]], dtype=int)
    dev_target_scores = np.array(
        [labels.index(c) for c in dev_meta[target_col]], dtype=int)
    del target_col
    train_y, dev_y, output_units = get_targets_and_output_units(
        train_target_scores, dev_target_scores, args.method)
    optimizer, loss, metrics = get_compile_args(args.method, args.lr)
    # A positive auxiliary loss weight enables multi-task learning, with
    # native-language identification as the secondary objective.
    multi_task = args.aux_loss_weight > 0
    if multi_task:
        assert not args.nli, "Both NLI and multi-task specified"
        lang_labels = sorted(train_meta.lang.unique())
        train_y.append(
            to_categorical([lang_labels.index(l) for l in train_meta.lang]))
        dev_y.append(
            to_categorical([lang_labels.index(l) for l in dev_meta.lang]))
        output_units.append(len(lang_labels))
        loss_weights = {
            AUX_OUTPUT_NAME: args.aux_loss_weight,
            OUTPUT_NAME: 1.0 - args.aux_loss_weight,
        }
    else:
        # Single-task: keep only the main output's loss and metrics.
        loss = loss[OUTPUT_NAME]
        metrics = metrics[OUTPUT_NAME]
        loss_weights = None
    del train_meta, dev_meta
    model = build_model(
        args.vocab_size,
        args.doc_length,
        output_units,
        args.embed_dim,
        windows=args.windows,
        num_pos=num_pos,
        constraint=args.constraint,
        static_embs=args.static_embs,
        classification=args.method == "classification",
    )
    model.summary()
    if args.vectors:
        # Initialise the embedding layer from pretrained word vectors.
        init_pretrained_embs(model, args.vectors, w2i)
    model.compile(optimizer=optimizer, loss=loss, loss_weights=loss_weights,
                  metrics=metrics)
    logger.debug("Train y\n%r", train_y[0][:5])
    logger.debug("Model config\n%r", model.get_config())
    # mkstemp instead of a context manager: on Windows an open file
    # cannot be opened a second time (see sibling scripts in this file).
    temp_handle, weights_path = tempfile.mkstemp(suffix=".h5")
    val_y = dev_target_scores
    callbacks = [
        F1Metrics(dev_x, val_y, weights_path,
                  ranked=args.method == "ranked")
    ]
    history = model.fit(
        train_x,
        train_y,
        epochs=args.epochs,
        batch_size=args.batch_size,
        callbacks=callbacks,
        validation_data=(dev_x, dev_y),
        verbose=2,
    )
    # Restore the best weights saved by the F1Metrics callback, then clean up.
    model.load_weights(weights_path)
    os.close(temp_handle)
    os.remove(weights_path)
    true = dev_target_scores
    if multi_task:
        # First output is the main task; the rest are auxiliary.
        predictions = model.predict(dev_x)[0]
    else:
        predictions = model.predict(dev_x)
    if args.method == "classification":
        pred = np.argmax(predictions, axis=1)
    elif args.method == "regression":
        # Round to integers and clip to score range
        highest_class = train_target_scores.max()
        pred = rescale_regression_results(predictions, highest_class).ravel()
    elif args.method == "ranked":
        pred = K.eval(ranked_prediction(predictions))
    try:
        if multi_task:
            multi_task_report(history.history, true, pred, labels)
        else:
            report(true, pred, labels)
    except Exception:
        # Reporting is best effort; results are still saved below.
        pass
    name = get_name(args.nli, multi_task)
    name = get_file_name(name)
    if args.save_model:
        save_model(name, model, w2i)
    save_results(name, args.__dict__, history.history, true, pred)
    plt.show()
def main():
    """Project document representations to 2-D and draw a labelled scatter plot.

    Representations come either from precomputed embedding fingerprints
    (``args.embeddings``) or from a saved model (``args.model``), and are
    reduced to two components with t-SNE or PCA before plotting.
    """
    args = parse_args()
    meta = load_split(args.split)
    # Replace ISO 639-3 language codes with display names.
    meta["lang"].replace(iso639_3, inplace=True)
    if args.embeddings:
        representations = get_fingerprints(args.embeddings, meta.filename)
    elif args.model:
        representations = get_model_representations(args.model, args.split)
    logger.info("Computing embeddings ...")
    if args.decomposition == "tsne":
        decomposer = TSNE(n_components=2, verbose=True)
    elif args.decomposition == "pca":
        decomposer = PCA(n_components=2)
    embedded = decomposer.fit_transform(representations)
    meta["Component 1"] = embedded[:, 0]
    meta["Component 2"] = embedded[:, 1]
    # Rename columns to presentation-friendly labels for the plot legend.
    meta.rename(
        {
            "cefr": "CEFR",
            "testlevel": "Test level",
            "num_tokens": "Length",
            "lang": "L1"
        },
        axis="columns",
        inplace=True)
    meta["Test level"].replace(
        {
            "Språkprøven": "IL test",
            "Høyere nivå": "AL test"
        }, inplace=True)
    fig, ax = (plt.gcf(), plt.gca())
    if args.hue == "CEFR":
        palette = sns.mpl_palette('cool', 7)
        hue_order = CEFR_LABELS
    else:
        palette = None
        hue_order = None
    sns.scatterplot(
        x="Component 1",
        y="Component 2",
        hue=args.hue,
        style="Test level",
        data=meta,
        ax=ax,
        size="Length",
        palette=palette,
        hue_order=hue_order,
    )
    # Hide all ticks and tick labels. These tick_params kwargs take booleans;
    # the previous string values ("off") are deprecated since matplotlib 2.2
    # and, being truthy, would actually *enable* the ticks on newer versions.
    ax.tick_params(
        axis="both",
        which="both",
        bottom=False,
        top=False,
        labelbottom=False,
        right=False,
        left=False,
        labelleft=False,
    )
    # Split the combined legend: the first 8 entries (hue block) go into a
    # separate legend on the left, the remaining (style/size) on the right.
    handles, labels = ax.get_legend_handles_labels()
    cefr_legend = ax.legend(handles[:8], labels[:8], loc="center right",
                            bbox_to_anchor=(-0.1, 0.5))
    ax.legend(handles[8:], labels[8:], loc="center left",
              bbox_to_anchor=(1.05, 0.5))
    # ax.legend replaces the previous legend, so re-add the first one manually.
    ax.add_artist(cefr_legend)
    fig.set_size_inches(5, 3)
    plt.tight_layout()
    plt.show()
def main():
    """Train an RNN sequence model on CEFR or NLI labels; evaluate on dev."""
    args = parse_args()
    set_reproducible(args.seed_delta)
    train_meta = load_split('train', round_cefr=args.round_cefr)
    dev_meta = load_split('dev', round_cefr=args.round_cefr)
    # NLI mode predicts the native language instead of the CEFR score.
    target_col = 'lang' if args.nli else 'cefr'
    labels = sorted(train_meta[target_col].unique())
    train_x, dev_x, num_pos, w2i = get_sequence_input_reps(args)
    args.vocab_size = len(w2i)
    print("Vocabulary size is {}".format(args.vocab_size))
    train_target_scores = np.array(
        [labels.index(c) for c in train_meta[target_col]], dtype=int)
    dev_target_scores = np.array(
        [labels.index(c) for c in dev_meta[target_col]], dtype=int)
    del target_col
    train_y, dev_y, output_units = get_targets_and_output_units(
        train_target_scores, dev_target_scores, args.method)
    optimizer, loss, metrics = get_compile_args(args)
    # A positive auxiliary loss weight enables multi-task learning, with
    # native-language identification as the secondary objective.
    multi_task = args.aux_loss_weight > 0
    if multi_task:
        assert not args.nli, "Both NLI and multi-task specified"
        lang_labels = sorted(train_meta.lang.unique())
        train_y.append(
            to_categorical([lang_labels.index(l) for l in train_meta.lang]))
        dev_y.append(
            to_categorical([lang_labels.index(l) for l in dev_meta.lang]))
        output_units.append(len(lang_labels))
        loss_weights = {
            AUX_OUTPUT_NAME: args.aux_loss_weight,
            OUTPUT_NAME: 1.0 - args.aux_loss_weight,
        }
    else:
        # Single-task: keep only the main output's loss and metrics.
        loss = loss[OUTPUT_NAME]
        metrics = metrics[OUTPUT_NAME]
        loss_weights = None
    model = build_model(args, output_units=output_units, num_pos=num_pos)
    model.summary()
    if args.vectors:
        # Initialise the embedding layer from pretrained word vectors.
        init_pretrained_embs(model, args.vectors, w2i)
    model.compile(optimizer=optimizer, loss=loss, loss_weights=loss_weights,
                  metrics=metrics)
    # Context manager fails on Windows (can't open an open file again)
    temp_handle, weights_path = tempfile.mkstemp(suffix='.h5')
    val_y = dev_target_scores
    callbacks = [
        F1Metrics(dev_x, val_y, weights_path, ranked=args.method == 'ranked')
    ]
    history = model.fit(
        train_x,
        train_y,
        epochs=args.epochs,
        batch_size=args.batch_size,
        callbacks=callbacks,
        validation_data=(dev_x, dev_y),
        verbose=2,
    )
    # Restore the best weights saved by the F1Metrics callback, then clean up.
    model.load_weights(weights_path)
    os.close(temp_handle)
    os.remove(weights_path)
    predictions = get_predictions(model, dev_x, multi_task)
    true = dev_target_scores
    if args.method == 'classification':
        pred = np.argmax(predictions, axis=1)
    elif args.method == 'regression':
        # Round to integers and clip to score range
        highest_class = train_target_scores.max()
        pred = rescale_regression_results(predictions, highest_class).ravel()
    elif args.method == 'ranked':
        pred = K.eval(ranked_prediction(predictions))
    try:
        if multi_task:
            multi_task_report(history.history, true, pred, labels)
        else:
            report(true, pred, labels)
    except Exception:
        # Reporting is best effort; results are still saved below.
        pass
    if args.nli:
        name = 'rnn-nli'
    elif multi_task:
        name = 'rnn-multi'
    else:
        name = 'rnn'
    name = get_file_name(name)
    if args.save_model:
        save_model(name, model, w2i)
    save_results(name, args.__dict__, history.history, true, pred)
    plt.show()
def main():
    """Train an MLP on bag-of-features representations; evaluate on dev."""
    args = parse_args()
    set_reproducible(args.seed_delta)
    do_classification = args.method == 'classification'
    train_meta = load_split('train', round_cefr=args.round_cefr)
    dev_meta = load_split('dev', round_cefr=args.round_cefr)
    kind = args.featuretype
    train_x, dev_x, num_features = preprocess(
        kind, args.max_features, train_meta, dev_meta
    )
    # NLI mode predicts the native language instead of the CEFR score.
    target_col = 'lang' if args.nli else 'cefr'
    labels = sorted(train_meta[target_col].unique())
    train_target_scores = np.array(
        [labels.index(c) for c in train_meta[target_col]], dtype=int
    )
    dev_target_scores = np.array(
        [labels.index(c) for c in dev_meta[target_col]], dtype=int
    )
    train_y, dev_y, output_units = get_targets_and_output_units(
        train_target_scores, dev_target_scores, args.method
    )
    # A positive auxiliary loss weight enables multi-task learning, with
    # native-language identification as the secondary objective.
    multi_task = args.aux_loss_weight > 0
    if multi_task:
        assert not args.nli, "Both NLI and multi-task specified"
        lang_labels = sorted(train_meta.lang.unique())
        train_y.append(to_categorical([lang_labels.index(l) for l in train_meta.lang]))
        dev_y.append(to_categorical([lang_labels.index(l) for l in dev_meta.lang]))
        output_units.append(len(lang_labels))
        loss_weights = {
            AUX_OUTPUT_NAME: args.aux_loss_weight,
            OUTPUT_NAME: 1.0 - args.aux_loss_weight,
        }
    else:
        loss_weights = None
    del train_meta, dev_meta
    model = build_model(num_features, output_units, do_classification)
    model.summary()
    optimizer, loss, metrics = get_compile_args(args.method, args.lr)
    model.compile(
        optimizer=optimizer, loss=loss, loss_weights=loss_weights, metrics=metrics
    )
    # Context manager fails on Windows (can't open an open file again)
    temp_handle, weights_path = tempfile.mkstemp(suffix='.h5')
    val_y = dev_target_scores
    callbacks = [F1Metrics(dev_x, val_y, weights_path, ranked=args.method == 'ranked')]
    history = model.fit(
        train_x,
        train_y,
        epochs=args.epochs,
        batch_size=args.batch_size,
        callbacks=callbacks,
        validation_data=(dev_x, dev_y),
        verbose=2,
    )
    # Restore the best weights saved by the F1Metrics callback, then clean up.
    model.load_weights(weights_path)
    os.close(temp_handle)
    os.remove(weights_path)
    true = dev_target_scores
    if multi_task:
        # First output is the main task; the rest are auxiliary.
        predictions = model.predict(dev_x)[0]
    else:
        predictions = model.predict(dev_x)
    if args.method == 'classification':
        pred = np.argmax(predictions, axis=1)
    elif args.method == 'regression':
        # Round to integers and clip to score range
        highest_class = train_target_scores.max()
        pred = rescale_regression_results(predictions, highest_class).ravel()
    elif args.method == 'ranked':
        pred = K.eval(ranked_prediction(predictions))
    if multi_task:
        multi_task_report(history.history, true, pred, labels)
    else:
        report(true, pred, labels)
    plt.show()
    prefix = 'mlp_%s' % args.featuretype
    fname = get_file_name(prefix)
    save_results(fname, args.__dict__, history.history, true, pred)
    if args.save_model:
        save_model(fname, model, None)