Example no. 1
def main():
    args = parse_args()
    if args.eval_on_test:
        train_meta = load_split("train,dev", round_cefr=args.round_cefr)
        test_meta = load_split("test", round_cefr=args.round_cefr)
    else:
        train_meta = load_split("train", round_cefr=args.round_cefr)
        test_meta = load_split("dev", round_cefr=args.round_cefr)

    train_x, test_x, num_features = preprocess(
        args.kind, None, train_meta, test_meta, eval_on_test=args.eval_on_test
    )
    print(train_x.shape)
    print(test_x.shape)

    labels = sorted(train_meta.cefr.unique())

    train_y = [labels.index(c) for c in train_meta.cefr]
    test_y = [labels.index(c) for c in test_meta.cefr]
    print(len(train_y))
    print(len(test_y))

    print("Fitting classifier ...")
    if args.algorithm == "logreg":
        clf = LogisticRegression(solver="lbfgs", multi_class="multinomial")
    elif args.algorithm == "svc":
        clf = LinearSVC()
    elif args.algorithm == "svr":
        clf = LinearSVR()

    clf.fit(train_x, train_y)

    predictions = clf.predict(test_x)
    if args.algorithm == "svr":
        predictions = np.clip(np.floor(predictions + 0.5), 0, max(train_y))

    report(test_y, predictions, labels)

    if args.nli:
        name = "linear_%s_nli" % args.algorithm
    else:
        name = "linear_" + args.algorithm
    name = get_file_name(name)
    save_results(name, args.__dict__, None, test_y, predictions)
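
For reference, the SVR branch maps continuous predictions back onto the discrete label indices by rounding to the nearest integer and clipping to the observed label range. A minimal standalone sketch of that step, with illustrative values:

import numpy as np

# Continuous regressor outputs on a 7-label scale (indices 0..6)
raw = np.array([-0.3, 1.49, 1.51, 6.8])
rounded = np.clip(np.floor(raw + 0.5), 0, 6)
print(rounded)  # [0. 1. 2. 6.]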
Example no. 2
def iterate_docs(split: str = 'train') -> Iterable[Iterable[str]]:
    def _inner_iter(stream):
        for line in stream:
            for token in line.split():
                yield token

    meta = load_split(split)
    for filename in meta.filename:
        path = (data_folder / 'txt' / filename).with_suffix('.txt')
        with path.open(encoding='utf-8') as stream:
            yield _inner_iter(stream)
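
A usage sketch for the generator above (assuming load_split and data_folder are configured as in the rest of the project). Each inner generator should be consumed before the outer loop advances, since the underlying file is only held open while its document is being yielded:

for doc in iterate_docs('train'):
    tokens = list(doc)  # materialise the tokens while the file is still open
    print(len(tokens))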
Example no. 3
def iterate_pos_docs(split: str = 'train') -> Iterable[Iterable[str]]:
    def _inner_iter(stream):
        for sent in stream:
            for (pos,) in sent:
                yield pos

    meta = load_split(split)
    filenames = filename_iter(meta, suffix='conll')
    for filename in filenames:
        sents = conll_reader(filename, cols=['UPOS'], tags=False)
        yield _inner_iter(sents)
Example no. 4
def main():
    args = parse_args()
    model_paths = find_model_paths(args.job_ids)

    print(model_paths)

    if args.collapsed:
        test_meta = load_split("test", round_cefr=True)
        round_target_scores = np.array(
            [ROUND_CEFR_LABELS.index(c) for c in test_meta["cefr"]], dtype=int)
        targets = round_target_scores
        highest_class = 3
        labels = ROUND_CEFR_LABELS
    else:
        test_meta = load_split("test", round_cefr=False)
        target_scores = np.array(
            [CEFR_LABELS.index(c) for c in test_meta["cefr"]], dtype=int)
        targets = target_scores
        highest_class = 6
        labels = CEFR_LABELS

    for model_path in model_paths:
        model, w2i = load_model_and_w2i(model_path)

        multi_input = isinstance(model.input, list) and len(model.input) == 2
        multi_output = isinstance(model.outputs,
                                  list) and len(model.outputs) > 1

        x = get_input_reps(w2i, multi_input)
        del w2i
        predictions = get_predictions(model, x, multi_output)
        del x
        del model

        # Round to integers and clip to score range
        pred = rescale_regression_results(predictions, highest_class).ravel()
        report(targets, pred, labels)

        name = model_path.stem + "_test_eval"
        save_results(name, {}, {}, targets, pred)
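
rescale_regression_results is a project helper; judging from the comment above the call, a hypothetical stand-in would round to the nearest integer and clip to the valid score range. The actual helper may differ (e.g. by first scaling sigmoid outputs up to the class range), so treat this purely as a sketch:

import numpy as np

def rescale_regression_results_sketch(predictions, highest_class):
    # Round to the nearest integer and clip into [0, highest_class]
    return np.clip(np.rint(predictions), 0, highest_class).astype(int)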
Example no. 5
def bag_of_words(split, **kwargs):
    """Fit a CountVectorizer on a split.

    Args:
        split: Name of the split {'train', 'test', 'dev'}
        **kwargs: Passed on to CountVectorizer's constructor

    Returns:
        The transformed documents and the trained vectorizer.
    """
    vectorizer = CountVectorizer(input='filename', **kwargs)
    meta = load_split(split)
    x = vectorizer.fit_transform(filename_iter(meta))
    return x, vectorizer
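
A usage sketch: the keyword arguments are forwarded to CountVectorizer, so an n-gram range or a vocabulary cap can be passed straight through (the values here are illustrative):

train_x, vectorizer = bag_of_words('train', ngram_range=(1, 2), max_features=10000)
print(train_x.shape)  # (num_documents, vocabulary_size)
print(len(vectorizer.vocabulary_))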
Example no. 6
def iterate_mixed_pos_docs(split: str = 'train'):
    """Mixed POS-Function Word n-grams.

    E.g. NOUN kan også VERB NOUN til å VERB dem
    """
    sw = get_stopwords()

    def _inner_iter(stream):
        for sent in stream:
            for (form, pos) in sent:
                if form.lower() in sw:
                    yield form
                else:
                    yield pos

    meta = load_split(split)
    filenames = filename_iter(meta, suffix='conll')
    for filename in filenames:
        sents = conll_reader(filename, cols=['FORM', 'UPOS'], tags=False)
        yield _inner_iter(sents)
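
Because the iterator already yields token streams, it can feed scikit-learn directly by passing a pass-through callable as the analyzer; a sketch:

from sklearn.feature_extraction.text import CountVectorizer

# Each document is already a sequence of mixed POS/function-word tokens
docs = (list(doc) for doc in iterate_mixed_pos_docs('train'))
vectorizer = CountVectorizer(analyzer=lambda doc: doc)
x = vectorizer.fit_transform(docs)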
Example no. 7
def main():
    args = parse_args()

    set_reproducible(args.seed_delta)

    train_meta = load_split("train", round_cefr=args.round_cefr)
    dev_meta = load_split("dev", round_cefr=args.round_cefr)

    target_col = "lang" if args.nli else "cefr"
    labels = sorted(train_meta[target_col].unique())

    train_x, dev_x, num_pos, w2i = get_sequence_input_reps(args)
    args.vocab_size = len(w2i)
    print("Vocabulary size is {}".format(args.vocab_size))

    train_target_scores = np.array(
        [labels.index(c) for c in train_meta[target_col]], dtype=int)
    dev_target_scores = np.array(
        [labels.index(c) for c in dev_meta[target_col]], dtype=int)
    del target_col

    train_y, dev_y, output_units = get_targets_and_output_units(
        train_target_scores, dev_target_scores, args.method)

    optimizer, loss, metrics = get_compile_args(args.method, args.lr)
    multi_task = args.aux_loss_weight > 0
    if multi_task:
        assert not args.nli, "Both NLI and multi-task specified"
        lang_labels = sorted(train_meta.lang.unique())
        train_y.append(
            to_categorical([lang_labels.index(l) for l in train_meta.lang]))
        dev_y.append(
            to_categorical([lang_labels.index(l) for l in dev_meta.lang]))
        output_units.append(len(lang_labels))
        loss_weights = {
            AUX_OUTPUT_NAME: args.aux_loss_weight,
            OUTPUT_NAME: 1.0 - args.aux_loss_weight,
        }
    else:
        loss = loss[OUTPUT_NAME]
        metrics = metrics[OUTPUT_NAME]
        loss_weights = None
    del train_meta, dev_meta

    model = build_model(
        args.vocab_size,
        args.doc_length,
        output_units,
        args.embed_dim,
        windows=args.windows,
        num_pos=num_pos,
        constraint=args.constraint,
        static_embs=args.static_embs,
        classification=args.method == "classification",
    )
    model.summary()

    if args.vectors:
        init_pretrained_embs(model, args.vectors, w2i)

    model.compile(optimizer=optimizer,
                  loss=loss,
                  loss_weights=loss_weights,
                  metrics=metrics)

    logger.debug("Train y\n%r", train_y[0][:5])
    logger.debug("Model config\n%r", model.get_config())

    temp_handle, weights_path = tempfile.mkstemp(suffix=".h5")
    val_y = dev_target_scores
    callbacks = [
        F1Metrics(dev_x, val_y, weights_path, ranked=args.method == "ranked")
    ]
    history = model.fit(
        train_x,
        train_y,
        epochs=args.epochs,
        batch_size=args.batch_size,
        callbacks=callbacks,
        validation_data=(dev_x, dev_y),
        verbose=2,
    )
    model.load_weights(weights_path)
    os.close(temp_handle)
    os.remove(weights_path)

    true = dev_target_scores
    if multi_task:
        predictions = model.predict(dev_x)[0]
    else:
        predictions = model.predict(dev_x)
    if args.method == "classification":
        pred = np.argmax(predictions, axis=1)
    elif args.method == "regression":
        # Round to integers and clip to score range
        highest_class = train_target_scores.max()
        pred = rescale_regression_results(predictions, highest_class).ravel()
    elif args.method == "ranked":
        pred = K.eval(ranked_prediction(predictions))
    try:
        if multi_task:
            multi_task_report(history.history, true, pred, labels)
        else:
            report(true, pred, labels)
    except Exception:
        pass

    name = get_name(args.nli, multi_task)
    name = get_file_name(name)

    if args.save_model:
        save_model(name, model, w2i)

    save_results(name, args.__dict__, history.history, true, pred)

    plt.show()
Example no. 8
def main():
    args = parse_args()
    meta = load_split(args.split)
    meta["lang"].replace(iso639_3, inplace=True)

    if args.embeddings:
        representations = get_fingerprints(args.embeddings, meta.filename)
    elif args.model:
        representations = get_model_representations(args.model, args.split)

    logger.info("Computing embeddings ...")
    if args.decomposition == "tsne":
        decomposer = TSNE(n_components=2, verbose=True)
    elif args.decomposition == "pca":
        decomposer = PCA(n_components=2)
    embedded = decomposer.fit_transform(representations)
    meta["Component 1"] = embedded[:, 0]
    meta["Component 2"] = embedded[:, 1]
    meta.rename(
        {
            "cefr": "CEFR",
            "testlevel": "Test level",
            "num_tokens": "Length",
            "lang": "L1"
        },
        axis="columns",
        inplace=True)
    meta["Test level"].replace(
        {
            "Språkprøven": "IL test",
            "Høyere nivå": "AL test"
        }, inplace=True)

    fig, ax = (plt.gcf(), plt.gca())
    if args.hue == "CEFR":
        palette = sns.mpl_palette('cool', 7)
        hue_order = CEFR_LABELS
    else:
        palette = None
        hue_order = None
    sns.scatterplot(
        x="Component 1",
        y="Component 2",
        hue=args.hue,
        style="Test level",
        data=meta,
        ax=ax,
        size="Length",
        palette=palette,
        hue_order=hue_order,
    )
    ax.tick_params(
        axis="both",
        which="both",
        bottom="off",
        top="off",
        labelbottom="off",
        right="off",
        left="off",
        labelleft="off",
    )
    handles, labels = ax.get_legend_handles_labels()
    cefr_legend = ax.legend(handles[:8],
                            labels[:8],
                            loc="center right",
                            bbox_to_anchor=(-0.1, 0.5))
    ax.legend(handles[8:],
              labels[8:],
              loc="center left",
              bbox_to_anchor=(1.05, 0.5))
    ax.add_artist(cefr_legend)
    fig.set_size_inches(5, 3)
    plt.tight_layout()
    plt.show()
Example no. 9
def main():
    args = parse_args()

    set_reproducible(args.seed_delta)

    train_meta = load_split('train', round_cefr=args.round_cefr)
    dev_meta = load_split('dev', round_cefr=args.round_cefr)

    target_col = 'lang' if args.nli else 'cefr'
    labels = sorted(train_meta[target_col].unique())

    train_x, dev_x, num_pos, w2i = get_sequence_input_reps(args)
    args.vocab_size = len(w2i)
    print("Vocabulary size is {}".format(args.vocab_size))

    train_target_scores = np.array(
        [labels.index(c) for c in train_meta[target_col]], dtype=int)
    dev_target_scores = np.array(
        [labels.index(c) for c in dev_meta[target_col]], dtype=int)
    del target_col

    train_y, dev_y, output_units = get_targets_and_output_units(
        train_target_scores, dev_target_scores, args.method)

    optimizer, loss, metrics = get_compile_args(args)
    multi_task = args.aux_loss_weight > 0
    if multi_task:
        assert not args.nli, "Both NLI and multi-task specified"
        lang_labels = sorted(train_meta.lang.unique())
        train_y.append(
            to_categorical([lang_labels.index(l) for l in train_meta.lang]))
        dev_y.append(
            to_categorical([lang_labels.index(l) for l in dev_meta.lang]))
        output_units.append(len(lang_labels))
        loss_weights = {
            AUX_OUTPUT_NAME: args.aux_loss_weight,
            OUTPUT_NAME: 1.0 - args.aux_loss_weight,
        }
    else:
        loss = loss[OUTPUT_NAME]
        metrics = metrics[OUTPUT_NAME]
        loss_weights = None

    model = build_model(args, output_units=output_units, num_pos=num_pos)
    model.summary()

    if args.vectors:
        init_pretrained_embs(model, args.vectors, w2i)

    model.compile(optimizer=optimizer,
                  loss=loss,
                  loss_weights=loss_weights,
                  metrics=metrics)

    # Context manager fails on Windows (can't open an open file again)
    temp_handle, weights_path = tempfile.mkstemp(suffix='.h5')
    val_y = dev_target_scores
    callbacks = [
        F1Metrics(dev_x, val_y, weights_path, ranked=args.method == 'ranked')
    ]
    history = model.fit(
        train_x,
        train_y,
        epochs=args.epochs,
        batch_size=args.batch_size,
        callbacks=callbacks,
        validation_data=(dev_x, dev_y),
        verbose=2,
    )
    model.load_weights(weights_path)
    os.close(temp_handle)
    os.remove(weights_path)

    predictions = get_predictions(model, dev_x, multi_task)
    true = dev_target_scores
    if args.method == 'classification':
        pred = np.argmax(predictions, axis=1)
    elif args.method == 'regression':
        # Round to integers and clip to score range
        highest_class = train_target_scores.max()
        pred = rescale_regression_results(predictions, highest_class).ravel()
    elif args.method == 'ranked':
        pred = K.eval(ranked_prediction(predictions))
    try:
        if multi_task:
            multi_task_report(history.history, true, pred, labels)
        else:
            report(true, pred, labels)
    except Exception:
        pass

    if args.nli:
        name = 'rnn-nli'
    elif multi_task:
        name = 'rnn-multi'
    else:
        name = 'rnn'
    name = get_file_name(name)

    if args.save_model:
        save_model(name, model, w2i)

    save_results(name, args.__dict__, history.history, true, pred)

    plt.show()
Example no. 10
def main():
    args = parse_args()

    set_reproducible(args.seed_delta)
    do_classification = args.method == 'classification'

    train_meta = load_split('train', round_cefr=args.round_cefr)
    dev_meta = load_split('dev', round_cefr=args.round_cefr)

    kind = args.featuretype
    train_x, dev_x, num_features = preprocess(
        kind, args.max_features, train_meta, dev_meta
    )

    target_col = 'lang' if args.nli else 'cefr'
    labels = sorted(train_meta[target_col].unique())

    train_target_scores = np.array(
        [labels.index(c) for c in train_meta[target_col]], dtype=int
    )
    dev_target_scores = np.array(
        [labels.index(c) for c in dev_meta[target_col]], dtype=int
    )

    train_y, dev_y, output_units = get_targets_and_output_units(
        train_target_scores, dev_target_scores, args.method
    )

    multi_task = args.aux_loss_weight > 0
    if multi_task:
        assert not args.nli, "Both NLI and multi-task specified"
        lang_labels = sorted(train_meta.lang.unique())
        train_y.append(to_categorical([lang_labels.index(l) for l in train_meta.lang]))
        dev_y.append(to_categorical([lang_labels.index(l) for l in dev_meta.lang]))
        output_units.append(len(lang_labels))
        loss_weights = {
            AUX_OUTPUT_NAME: args.aux_loss_weight,
            OUTPUT_NAME: 1.0 - args.aux_loss_weight,
        }
    else:
        loss_weights = None
    del train_meta, dev_meta

    model = build_model(num_features, output_units, do_classification)
    model.summary()

    optimizer, loss, metrics = get_compile_args(args.method, args.lr)
    model.compile(
        optimizer=optimizer, loss=loss, loss_weights=loss_weights, metrics=metrics
    )

    # Context manager fails on Windows (can't open an open file again)
    temp_handle, weights_path = tempfile.mkstemp(suffix='.h5')
    val_y = dev_target_scores
    callbacks = [F1Metrics(dev_x, val_y, weights_path, ranked=args.method == 'ranked')]
    history = model.fit(
        train_x,
        train_y,
        epochs=args.epochs,
        batch_size=args.batch_size,
        callbacks=callbacks,
        validation_data=(dev_x, dev_y),
        verbose=2,
    )
    model.load_weights(weights_path)
    os.close(temp_handle)
    os.remove(weights_path)

    true = dev_target_scores
    if multi_task:
        predictions = model.predict(dev_x)[0]
    else:
        predictions = model.predict(dev_x)
    if args.method == 'classification':
        pred = np.argmax(predictions, axis=1)
    elif args.method == 'regression':
        # Round to integers and clip to score range
        highest_class = train_target_scores.max()
        pred = rescale_regression_results(predictions, highest_class).ravel()
    elif args.method == 'ranked':
        pred = K.eval(ranked_prediction(predictions))
    if multi_task:
        multi_task_report(history.history, true, pred, labels)
    else:
        report(true, pred, labels)

    plt.show()

    prefix = 'mlp_%s' % args.featuretype
    fname = get_file_name(prefix)
    save_results(fname, args.__dict__, history.history, true, pred)

    if args.save_model:
        save_model(fname, model, None)