Example #1
def get_entity_scores(train,
                      dev=None,
                      test=None,
                      weights=None,
                      embeddings=None,
                      avg_embeddings=None):
    # First do the normal thing to make sure our cache is reasonable
    train_X, train_Y, _ = build_feature_array(train, avg_embeddings)
    if test is not None:
        test_X, test_Y, test_words_Y, sents = build_feature_array(
            test, embeddings, get_sents=True)
        logistic_regression(train_X,
                            train_Y,
                            test_X,
                            test_Y,
                            weights,
                            do_print=True,
                            return_preds=True,
                            is_token=True,
                            test_words_Y=test_words_Y)

    # We set is_token to false so that we skip the aggregation -- we don't need it
    preds = logistic_regression(train_X,
                                train_Y,
                                embeddings.m,
                                embeddings.iw,
                                weights,
                                do_print=False,
                                return_preds=True,
                                is_token=False)

    # Our default settings +1 the scores since we use them as indices, so we -1
    # here to undo that (though it doesn't really matter: we only ever compare
    # scores to each other, never caring about absolute values)
    return [p - 1 for p in preds]
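
A minimal usage sketch for get_entity_scores. Everything project-specific here is an assumption: `train` would come from one of the project's loaders (e.g. load_power_all in Example #6), `avg_emb` is a second embeddings object for build_feature_array, and the stand-in class below only mimics the .m matrix and .iw index-to-word attributes the function actually reads.

import numpy as np

class FakeEmbeddings:
    # Hypothetical stand-in exposing only the two attributes the function uses
    def __init__(self, m, iw):
        self.m = m    # (n_words, dim) embedding matrix
        self.iw = iw  # list mapping row index -> word

emb = FakeEmbeddings(np.random.rand(3, 50), ['judge', 'defendant', 'witness'])
scores = get_entity_scores(train, embeddings=emb, avg_embeddings=avg_emb)
# scores[i] is the shifted prediction for the word emb.iw[i]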
Example #2
def trainModels(articles, args):
    with open(args.emb_file, 'rb') as embed_fh:
        training_embeddings = pickle.load(embed_fh)
    models = {}
    # Indices into the (train, dev, test) split list and each split's
    # (X, Y, words) tuple returned by buildDataset
    TRAIN, DEV, TEST = 0, 1, 2
    X, Y, WORDS = 0, 1, 2

    for operation, load_function in OPERATIONS:
        cf_splits = load_function()
        # TODO: Choose between type vs embedding prediction task
        splits = [buildDataset(split, training_embeddings) for split in cf_splits]
        print("Starting to tune {} model".format(operation))
        dev_score, optimized_weights = find_logistic_regression_weights(
                splits[TRAIN][X], splits[TRAIN][Y],
                splits[DEV][X], splits[DEV][Y],
                verbose=False)
        clf, test_score = logistic_regression(
                splits[TRAIN][X], splits[TRAIN][Y],
                splits[TEST][X], splits[TEST][Y],
                weights=optimized_weights,
                do_print=True,
                return_clf=True)
        models[operation] = {'model': clf,
                             'test_score': test_score,
                             'dev_score': dev_score,
                             'weights': optimized_weights}

    with open(args.model_file, 'wb+') as models_fh:
        pickle.dump(models, models_fh)
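
A hedged driver sketch for trainModels. The flag names are assumptions inferred from the attributes the function reads (args.emb_file, args.model_file); note that the `articles` argument is never used in the body above, so the sketch passes None.

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--emb_file', default='embeddings.pkl')    # pickled training embeddings
parser.add_argument('--model_file', default='models.pkl')      # where to dump the models
args = parser.parse_args()

trainModels(articles=None, args=args)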
Example #3
def avg_token_eval(headers,
                   embeddings,
                   avg_embeddings,
                   sent_to_key,
                   weights=None):
    print(
        "What's the accuracy of using type-level training and token-level test? i.e. what we did in the paper"
    )
    for h, h2 in headers:
        train, test, dev = load_hannah_split(config.CONNO_DIR,
                                             h,
                                             binarize=True,
                                             remove_neutral=False,
                                             plus_one=True)
        sent_to_score = load_raw_annotations(config.RAW_CONNOTATIONS,
                                             h2,
                                             binarize=True,
                                             plus_one=True)

        print(h)

        train_X, train_Y, _ = build_feature_array(train, avg_embeddings)
        test_X, test_Y = build_sent_array(test, embeddings, sent_to_score,
                                          sent_to_key)

        if weights is not None:
            print(weights)
            logistic_regression(train_X,
                                train_Y,
                                test_X,
                                test_Y,
                                weights[h],
                                do_print=True)
        else:
            dev_X, dev_Y = build_sent_array(dev, embeddings, sent_to_score,
                                            sent_to_key)

            score, new_weights = find_logistic_regression_weights(
                train_X, train_Y, dev_X, dev_Y)
            print("Running logistic regression with weights", new_weights,
                  "Dev F1:", score)
            logistic_regression(train_X,
                                train_Y,
                                test_X,
                                test_Y,
                                new_weights,
                                do_print=True)
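
For reference, `headers` is a list of (h, h2) pairs, where h names a labeled split under config.CONNO_DIR and h2 the matching raw-annotation header, and `weights`, when supplied, maps each h to weights previously produced by find_logistic_regression_weights. A hypothetical call (header names invented; the remaining arguments are the same objects the other examples pass around):

headers = [('effect', 'effect_raw'), ('value', 'value_raw')]  # invented names
avg_token_eval(headers, embeddings, avg_embeddings, sent_to_key)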
Example #4
def do_normal_regression(headers, avg_embeddings, weights=None):
    for h, h2 in headers:
        print(h)
        train, test, dev = load_hannah_split(config.CONNO_DIR,
                                             h,
                                             binarize=True,
                                             remove_neutral=False)

        if weights is not None:
            print("Running normal type-level regression")
            train_X, train_Y, _ = build_feature_array(train, avg_embeddings)
            test_X, test_Y, _ = build_feature_array(test, avg_embeddings)
            logistic_regression(train_X,
                                train_Y,
                                test_X,
                                test_Y,
                                weights=weights[h],
                                do_print=True,
                                is_token=False,
                                test_words_Y=None,
                                return_preds=True)
        else:
            do_logistic_regression(train, dev, test, avg_embeddings)
Example #5
def type_to_token_eval(headers, embeddings, avg_embeddings):
    print(
        "What's the accuracy of using the learned type-level predictions on token-level test data?"
    )
    for h, h2 in headers:
        train, test, dev = load_hannah_split(config.CONNO_DIR,
                                             h,
                                             binarize=True,
                                             remove_neutral=False,
                                             plus_one=True)
        sent_to_score = load_raw_annotations(config.RAW_CONNOTATIONS,
                                             h2,
                                             binarize=True,
                                             plus_one=True)

        print(h)
        # raw_header_to_weights is a module-level map from header to tuned weights
        weights = raw_header_to_weights[h]

        train_X, train_Y, _ = build_feature_array(train, avg_embeddings)
        test_X, test_Y, test_words = build_feature_array(test, avg_embeddings)
        preds = logistic_regression(train_X,
                                    train_Y,
                                    test_X,
                                    test_Y,
                                    weights=weights,
                                    do_print=False,
                                    is_token=False,
                                    test_words_Y=None,
                                    return_preds=True)
        # Map each test word to its type-level prediction
        word_to_pred = dict(zip(test_words, preds))
        print(len(word_to_pred), len(test))

        # Compare each word's type-level prediction against every sentence-level
        # annotation for that word (key[0] is the word)
        type_level = []
        sent_level = []
        for key, score in sent_to_score.items():
            if key[0] in word_to_pred:
                type_level.append(word_to_pred[key[0]])
                sent_level.append(score)

        print("Macro F1", f1_score(sent_level, type_level, average='macro'))
        print("Accuracy", accuracy_score(sent_level, type_level))
Example #6
def score_keyed_embeddings(key_to_embeds, key_to_signs, m, avg_embeddings):
    train = load_power_all(cfg.POWER_AGENCY)

    train_X, train_Y, _ = build_feature_array(train, avg_embeddings)

    # Score every row of the embedding matrix m; the empty dict stands in for
    # gold labels since we only need the raw predictions back
    preds = logistic_regression(train_X,
                                train_Y,
                                m, {},
                                weights.power_token_regression,
                                do_print=False,
                                return_preds=True,
                                is_token=False)
    # Average the sign-adjusted prediction over each entity's embedding rows
    ent_to_score = {}
    for key, idxs in key_to_embeds.items():
        signs = key_to_signs[key]
        scores = [preds[i] for i in idxs]
        sum_score = sum(p * s for p, s in zip(scores, signs))
        ent_to_score[key] = sum_score / len(scores)
    return ent_to_score
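
The two keyed inputs are parallel: key_to_embeds maps each entity to row indices of the embedding matrix m, and key_to_signs supplies one sign per listed row (the sign semantics are project-specific). A toy walk-through of the averaging step with invented numbers:

preds = [0.2, -0.5, 0.9]                    # invented per-row predictions
key_to_embeds = {'entityA': [0, 2]}         # rows of m belonging to entityA
key_to_signs = {'entityA': [1, -1]}         # one sign per listed row
scores = [preds[i] for i in key_to_embeds['entityA']]   # [0.2, 0.9]
signs = key_to_signs['entityA']
print(sum(p * s for p, s in zip(scores, signs)) / len(scores))  # (0.2 - 0.9) / 2 = -0.35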