Example 1
def test(model_props=None,
         model_name=None,
         weights_file='best_weights',
         dataset_name='test',
         save_output=True,
         save_scores=False):
    if model_props is None:
        model_props = model_properties.MentionRankingProps(
            name=model_name,
            load_weights_from=model_name,
            weights_file=weights_file)

    print "Loading data"
    vectors = np.load(directories.RELEVANT_VECTORS + 'word_vectors.npy')
    dataset = datasets.DocumentBatchedDataset(dataset_name,
                                              model_props,
                                              with_ids=True)
    docs = utils.load_pickle(directories.DOCUMENTS + dataset_name +
                             '_docs.pkl')
    stats = {}

    print "Building model"
    model, _ = pairwise_models.get_model(dataset, vectors, model_props)

    print "Evaluating model on", dataset_name
    evaluate_model(dataset,
                   docs,
                   model,
                   model_props,
                   stats,
                   save_output=save_output,
                   save_scores=save_scores)
    timer.clear()
    utils.write_pickle(stats, model_props.path + dataset_name + "_results.pkl")
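A minimal usage sketch for the test function above, assuming the deep-coref modules it relies on are on the path and a model named 'ranking' has already been trained and saved under that name (the argument values are illustrative):

# Hypothetical call: evaluate the saved 'ranking' model on the 'test' split and
# write the collected stats to <model path>/test_results.pkl.
test(model_name='ranking', weights_file='best_weights', dataset_name='test',
     save_output=True, save_scores=False)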
Example 2
def main(model_props=None, cluster_props=None):
    if model_props is None:
        model_props = model_properties.MentionRankingProps()
    if cluster_props is None:
        cluster_props = model_properties.ClusterRankingProps()

    directories.set_model_name('classification')
    model_props.load_weights_from = None
    model_props.set_mode('classification')
    pairwise_learning.main(model_props=model_props, n_epochs=150)

    directories.set_model_name('top_pairs')
    model_props.set_mode('top_pairs')
    model_props.load_weights_from = 'classification'
    model_props.weights_file = 'weights_140'
    timer.clear()
    pairwise_learning.main(model_props=model_props, n_epochs=50)

    directories.set_model_name('ranking')
    model_props.set_mode('ranking')
    model_props.load_weights_from = 'top_pairs'
    model_props.weights_file = 'weights_40'
    timer.clear()
    pairwise_learning.main(model_props=model_props, n_epochs=50)

    model_props.load_weights_from = 'ranking'
    pairwise_learning.main(model_props=model_props, write_scores=True, test_only=True)
    pairwise_learning.main(model_props=model_props, write_scores=True, test_only=True,
                           validate="test")

    clustering_preprocessing.main('ranking')
    cluster_props.load_weights_from = 'ranking'
    cluster_props.weights_file = 'weights_40'
    timer.clear()
    clustering_learning.main(cluster_props)
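The main driver above runs the full pipeline in order: classification pre-training, then top_pairs, then ranking (each stage warm-started from the previous stage's saved weights), followed by test-set scoring and cluster-ranking training. A minimal sketch of launching it with the default property objects (an assumption; the original entry point may differ):

# Hypothetical entry point: build default MentionRankingProps / ClusterRankingProps
# objects and run every stage of the pipeline.
if __name__ == '__main__':
    main()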
Example 3
def test(model_props=None, model_name=None, weights_file='best_weights', dataset_name='test',
         save_output=True, save_scores=False):
    if model_props is None:
        model_props = model_properties.MentionRankingProps(name=model_name,
                                                           load_weights_from=model_name,
                                                           weights_file=weights_file)

    print "Loading data"
    vectors = np.load(directories.RELEVANT_VECTORS + 'word_vectors.npy')
    dataset = datasets.DocumentBatchedDataset(dataset_name, model_props, with_ids=True)
    docs = utils.load_pickle(directories.DOCUMENTS + dataset_name + '_docs.pkl')
    stats = {}

    print "Building model"
    model, _ = pairwise_models.get_model(dataset, vectors, model_props)

    print "Evaluating model on", dataset_name
    evaluate_model(dataset, docs, model, model_props, stats,
                   save_output=save_output, save_scores=save_scores)
    timer.clear()
    utils.write_pickle(stats, model_props.path + dataset_name + "_results.pkl")
Example 4
def train(model_props, n_epochs=10000, reduced=False, dev_set_name='dev'):
    print "Training", model_props.path
    pprint(model_props.__dict__)

    model_props.write(model_props.path + 'model_props.pkl')
    utils.rmkdir(model_props.path + 'src')
    for fname in os.listdir('.'):
        if fname.endswith('.py'):
            shutil.copyfile(fname, model_props.path + 'src/' + fname)
    # Ranking and top-pairs models checkpoint from the first epoch and more
    # frequently; other model types only start checkpointing after epoch 80.
    if model_props.ranking or \
            model_props.top_pairs:
        write_start = 0
        write_every = 10
    else:
        write_start = 80
        write_every = 20

    print "Loading data"
    vectors = np.load(directories.RELEVANT_VECTORS + 'word_vectors.npy')
    train = datasets.DocumentBatchedDataset(
        "train_reduced" if reduced else "train", model_props, with_ids=True)
    dev = datasets.DocumentBatchedDataset(
        dev_set_name + "_reduced" if reduced else dev_set_name,
        model_props,
        with_ids=True)

    print "Building model"
    model, _ = pairwise_models.get_model(dev, vectors, model_props)
    json_string = model.to_json()
    open(model_props.path + 'architecture.json', 'w').write(json_string)

    best_val_score = 1000
    best_val_score_in_window = 1000
    history = []
    for epoch in range(n_epochs):
        timer.start("train")
        print "EPOCH {:}, model = {:}".format((epoch + 1), model_props.path)

        epoch_stats = {}
        model_weights = model.get_weights()
        train_docs = utils.load_pickle(directories.DOCUMENTS +
                                       'train_docs.pkl')
        dev_docs = utils.load_pickle(directories.DOCUMENTS + dev_set_name +
                                     '_docs.pkl')
        if reduced:
            dev_docs = dev_docs[:3]

        if model_props.ranking:
            print "Running over training set"
            run_model_over_docs(train, train_docs, model)
            epoch_stats.update(compute_metrics(train_docs, "train"))
            if model_props.use_rewards:
                print "Setting costs"
                set_costs(train, train_docs)

        print "Training"
        prog = utils.Progbar(train.n_batches)
        train.shuffle()
        loss_sum, n_examples = 0, 0
        for i, X in enumerate(train):
            if X['y'].size == 0:
                continue
            batch_loss = model.train_on_batch(X)
            loss_sum += batch_loss * train.scale_factor
            n_examples += X['y'].size
            prog.update(i + 1, exact=[("train loss", loss_sum / n_examples)])
        epoch_stats["train time"] = time.time() - prog.start
        for k in prog.unique_values:
            epoch_stats[k] = prog.sum_values[k][0] / max(
                1, prog.sum_values[k][1])

        epoch_stats["weight diffs"] = [
            (np.sum(np.abs(new_weight - old_weight)), new_weight.size)
            for new_weight, old_weight in zip(model.get_weights(),
                                              model_weights)
        ]
        summed = np.sum(map(np.array, epoch_stats["weight diffs"][1:]), axis=0)
        epoch_stats["total weight diff"] = tuple(summed)

        print "Testing on dev set"
        evaluate_model(dev, dev_docs, model, model_props, epoch_stats)

        history.append(epoch_stats)
        utils.write_pickle(history, model_props.path + 'history.pkl')
        # Lower score is better: use the negated dev CoNLL F1 for ranking models,
        # otherwise the dev loss (or the dev anaphoricity loss for
        # anaphoricity-only models).
        score = -epoch_stats["dev conll"] if model_props.ranking else \
            (epoch_stats["dev loss"] if not model_props.anaphoricity_only else
             epoch_stats["dev anaphoricity loss"])
        if score < best_val_score:
            best_val_score = score
            print "New best {:}, saving model".format(
                "CoNLL F1" if model_props.ranking else "validation loss")
            model.save_weights(model_props.path + "best_weights.hdf5",
                               overwrite=True)
        if score < best_val_score_in_window and epoch > write_start:
            print "Best in last {:}, saved to weights_{:}".format(
                write_every, write_every * (epoch / write_every))
            best_val_score_in_window = score
            model.save_weights(
                model_props.path +
                "weights_{:}.hdf5".format(write_every * (epoch / write_every)),
                overwrite=True)
            if epoch + write_every >= n_epochs:
                model.save_weights(model_props.path + "final_weights.hdf5",
                                   overwrite=True)
        if epoch % write_every == 0:
            best_val_score_in_window = 1000

        timer.stop("train")
        timer.print_totals()
        print

    timer.clear()
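A note on the windowed checkpointing above: the filename is derived with Python 2 integer division, so every epoch in a write_every-sized window maps to the same weights file, and the best score seen in that window overwrites it. A small sketch of the arithmetic (the epoch value is illustrative):

# With write_every = 20, epochs 40-59 all map to weights_40.hdf5
# (under Python 3 the same expression would need // instead of /).
write_every = 20
epoch = 57
print "weights_{:}.hdf5".format(write_every * (epoch / write_every))  # -> weights_40.hdf5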
Example 5
def train(model_props, n_epochs=10000, reduced=False, dev_set_name='dev'):
    print "Training", model_props.path
    pprint(model_props.__dict__)

    model_props.write(model_props.path + 'model_props.pkl')
    utils.rmkdir(model_props.path + 'src')
    for fname in os.listdir('.'):
        if fname.endswith('.py'):
            shutil.copyfile(fname, model_props.path + 'src/' + fname)
    if model_props.ranking or \
            model_props.top_pairs:
        write_start = 0
        write_every = 10
    else:
        write_start = 80
        write_every = 20

    print "Loading data"
    vectors = np.load(directories.RELEVANT_VECTORS + 'word_vectors.npy')
    train = datasets.DocumentBatchedDataset("train_reduced" if reduced else "train",
                                            model_props, with_ids=True)
    dev = datasets.DocumentBatchedDataset(dev_set_name + "_reduced" if reduced else dev_set_name,
                                          model_props, with_ids=True)

    print "Building model"
    model, _ = pairwise_models.get_model(dev, vectors, model_props)
    json_string = model.to_json()
    open(model_props.path + 'architecture.json', 'w').write(json_string)

    best_val_score = 1000
    best_val_score_in_window = 1000
    history = []
    for epoch in range(n_epochs):
        timer.start("train")
        print "EPOCH {:}, model = {:}".format((epoch + 1), model_props.path)

        epoch_stats = {}
        model_weights = model.get_weights()
        train_docs = utils.load_pickle(directories.DOCUMENTS + 'train_docs.pkl')
        dev_docs = utils.load_pickle(directories.DOCUMENTS + dev_set_name + '_docs.pkl')
        if reduced:
            dev_docs = dev_docs[:3]

        if model_props.ranking:
            print "Running over training set"
            run_model_over_docs(train, train_docs, model)
            epoch_stats.update(compute_metrics(train_docs, "train"))
            if model_props.use_rewards:
                print "Setting costs"
                set_costs(train, train_docs)

        print "Training"
        prog = utils.Progbar(train.n_batches)
        train.shuffle()
        loss_sum, n_examples = 0, 0
        for i, X in enumerate(train):
            if X['y'].size == 0:
                continue
            batch_loss = model.train_on_batch(X)
            loss_sum += batch_loss * train.scale_factor
            n_examples += X['y'].size
            prog.update(i + 1, exact=[("train loss", loss_sum / n_examples)])
        epoch_stats["train time"] = time.time() - prog.start
        for k in prog.unique_values:
            epoch_stats[k] = prog.sum_values[k][0] / max(1, prog.sum_values[k][1])

        epoch_stats["weight diffs"] = [
            (np.sum(np.abs(new_weight - old_weight)), new_weight.size)
            for new_weight, old_weight in zip(model.get_weights(), model_weights)]
        summed = np.sum(map(np.array, epoch_stats["weight diffs"][1:]), axis=0)
        epoch_stats["total weight diff"] = tuple(summed)

        print "Testing on dev set"
        evaluate_model(dev, dev_docs, model, model_props, epoch_stats)

        history.append(epoch_stats)
        utils.write_pickle(history, model_props.path + 'history.pkl')
        score = -epoch_stats["dev conll"] if model_props.ranking else \
            (epoch_stats["dev loss"] if not model_props.anaphoricity_only else
             epoch_stats["dev anaphoricity loss"])
        if score < best_val_score:
            best_val_score = score
            print "New best {:}, saving model".format(
                "CoNLL F1" if model_props.ranking else "validation loss")
            model.save_weights(model_props.path + "best_weights.hdf5", overwrite=True)
        if score < best_val_score_in_window and epoch > write_start:
            print "Best in last {:}, saved to weights_{:}".format(
                write_every, write_every * (epoch / write_every))
            best_val_score_in_window = score
            model.save_weights(model_props.path + "weights_{:}.hdf5".format(
                write_every * (epoch / write_every)), overwrite=True)
            if epoch + write_every >= n_epochs:
                model.save_weights(model_props.path + "final_weights.hdf5", overwrite=True)
        if epoch % write_every == 0:
            best_val_score_in_window = 1000

        timer.stop("train")
        timer.print_totals()
        print

    timer.clear()