Esempio n. 1
0
def train(model_to_train, model_base, lookup, oov_embedding, train_path, dev_path, window_size, data_name, overwrite):
    """Train the requested window-based labeling model(s) on CoNLL data.

    Args:
        model_to_train: 'vanilla', 'auto', or 'all' (both). Any other value
            trains nothing and an empty dict is returned.
        model_base: Base output directory.
        lookup: Object providing ``w2v_lookup(alphabet)``, whose result is
            handed to the models as their embeddings.
        oov_embedding: When 'random', the word alphabet stops growing before
            the dev set is indexed.
        train_path: Path of the CoNLL training file.
        dev_path: Path of the CoNLL dev file; falsy to train without dev data.
        window_size: Forwarded to the sliding-window step and to the models.
        data_name: Sub-directory name under ``model_base``.
        overwrite: Forwarded to ``train_model``.

    Returns:
        dict mapping 'vanilla' / 'auto' to ``(model, pos_alphabet, word_alphabet)``.
    """
    logger.info("Loading CoNLL data.")
    train_words, train_tags, word_alphabet, pos_alphabet = processor.read_conll(train_path)

    # Save the word alphabet as returned for the training file, before any dev data is processed.
    alphabet_dir = os.path.join(model_base, data_name)
    ensure_dir(alphabet_dir)
    word_alphabet.save(alphabet_dir, 'training_words')

    logger.info("Sliding window on the data.")
    x_train = processor.slide_all_sentences(train_words, word_alphabet, window_size)
    y_train = processor.get_all_one_hots(train_tags, pos_alphabet)

    if oov_embedding == 'random':
        logger.info("Dev set word vectors are not added to alphabet.")
        word_alphabet.stop_auto_grow()

    if dev_path:
        dev_words, dev_tags, _, _ = processor.read_conll(dev_path)
        x_dev = processor.slide_all_sentences(dev_words, word_alphabet, window_size)
        y_dev = processor.get_all_one_hots(dev_tags, pos_alphabet)
    else:
        x_dev = y_dev = None

    # Whatever the OOV mode, the alphabet stops growing before the embedding lookup.
    word_alphabet.stop_auto_grow()

    # Embeddings for the entries of the word alphabet.
    embeddings = lookup.w2v_lookup(word_alphabet)

    train_shape = str(x_train.shape)
    logger.info("Training data dimension is %s, here is a sample:" % train_shape)
    logger.info(x_train[0])

    label_shape = str(y_train.shape)
    logger.info("Label data dimension is %s, here is a sample:" % label_shape)
    logger.info(y_train[0])

    models = {}

    def fit(name, mlp_class):
        # Build, train and register one model variant under ``name``.
        output_dir = get_model_directory(model_base, data_name, name)
        network = mlp_class(embeddings=embeddings, pos_dim=pos_alphabet.size(),
                            vocabulary_size=word_alphabet.size(), window_size=window_size)
        train_model(network, x_train, y_train, x_dev, y_dev, pos_alphabet, word_alphabet, output_dir, overwrite)
        models[name] = (network, pos_alphabet, word_alphabet)

    if model_to_train in ('vanilla', 'all'):
        fit('vanilla', VanillaLabelingMlp)
    if model_to_train in ('auto', 'all'):
        fit('auto', AutoEmbeddingMlp)
    return models
Esempio n. 2
0
def test(trained_models, lookup, oov_embedding, test_conll, window_size):
    """Evaluate every trained model on a CoNLL test file and log the results.

    Args:
        trained_models: dict mapping a model name to
            ``(model, pos_alphabet, train_alphabet)``.
        lookup: Object providing ``w2v_lookup(alphabet)``; only used when
            ``oov_embedding`` is "pretrained".
        oov_embedding: "pretrained" lets a copy of the training alphabet grow
            on the test data and augments the model with the embeddings of
            the added entries; "random" keeps the alphabet copy from growing.
        test_conll: Path of the CoNLL test file.
        window_size: Forwarded to the sliding-window step.

    Returns:
        None. Results are only written to the log.
    """
    logger.info("Testing condition - [OOV Vector] : %s ; [Test Data] : %s ." % (oov_embedding, test_conll))
    # items() is available on both Python 2 and 3; iteritems() is Python 2 only.
    for model_name, (model, pos_alphabet, train_alphabet) in trained_models.items():
        # Work on a copy so the alphabet stored with the trained model is not touched here.
        alphabet_for_test = train_alphabet.get_copy()

        if oov_embedding == "pretrained":
            alphabet_for_test.restart_auto_grow()
        elif oov_embedding == "random":
            alphabet_for_test.stop_auto_grow()

        original_alphabet_size = alphabet_for_test.size()
        logger.info("Original alphabet size is %d" % original_alphabet_size)

        word_sentences_test, pos_sentences_test, _, _ = processor.read_conll(test_conll)
        x_test = processor.slide_all_sentences(word_sentences_test, alphabet_for_test, window_size)
        y_test = processor.get_all_one_hots(pos_sentences_test, pos_alphabet)

        test_model = model
        if oov_embedding == "pretrained":
            logger.info("New alphabet size is %d" % alphabet_for_test.size())
            # Look up embeddings for the extended alphabet and keep only the rows
            # past the original size, i.e. the entries added while reading the test data.
            new_embeddings = lookup.w2v_lookup(alphabet_for_test)
            additional_embeddings = new_embeddings[original_alphabet_size:]
            logger.info("New embedding size is %d" % len(additional_embeddings))
            test_model = model.augment_embedding(additional_embeddings)

        evaluate_result = test_model.test(x_test, y_test)
        # The evaluation result may be a sequence of scores or a single number;
        # iterating a non-iterable raises TypeError, which selects the scalar format.
        try:
            result_str = ", ".join("%.4f" % f for f in evaluate_result)
        except TypeError:
            result_str = "%.4f" % evaluate_result
        logger.info("Direct test results are [%s] by model %s." % (result_str, model_name))
Esempio n. 3
0
def test(trained_models, label_alphabet, lookup, oov_embedding, test_conll,
         window_size):
    """Evaluate every trained model on a CoNLL test file and log the results.

    Args:
        trained_models: dict mapping a model name to
            ``(model, embedding_alphabet)``.
        label_alphabet: Alphabet used to build the one-hot test labels.
        lookup: Object providing
            ``load_additional_embeddings(old_alphabet, new_alphabet)``; only
            used when ``oov_embedding`` is "pretrained".
        oov_embedding: "pretrained" lets a copy of the model's alphabet grow
            on the test data and augments the model with the additional
            embeddings, if any are returned.
        test_conll: Path of the CoNLL test file.
        window_size: Forwarded to the sliding-window step.

    Returns:
        None. Results are only written to the log.
    """
    logger.info("Testing condition - [OOV Vector] : %s ; [Test Data] : %s ." %
                (oov_embedding, test_conll))

    # items() is available on both Python 2 and 3; iteritems() is Python 2 only.
    for model_name, (model, embedding_alphabet) in trained_models.items():
        # Work on a copy so the alphabet stored with the trained model is not touched here.
        alphabet_for_test = embedding_alphabet.get_copy()
        original_alphabet_size = alphabet_for_test.size()
        logger.info(
            "Original alphabet used to train the model is of size %d ." %
            original_alphabet_size)

        if oov_embedding == "pretrained":
            alphabet_for_test.restart_auto_grow()

        word_sentences_test, pos_sentences_test, _, _ = processor.read_conll(
            test_conll)
        x_test = processor.slide_all_sentences(word_sentences_test,
                                               alphabet_for_test, window_size)
        y_test = processor.get_all_one_hots(pos_sentences_test, label_alphabet)

        logger.info("New alphabet size is %d" % alphabet_for_test.size())

        # TODO: we probably need to work on a copy of the model here; whether
        # augment_embedding modifies the original is not visible from this code.
        test_model = model
        if oov_embedding == "pretrained":
            additional_embeddings = lookup.load_additional_embeddings(
                embedding_alphabet, alphabet_for_test)
            # Explicit None/length check instead of a bare truth test: if the
            # lookup returns a numpy array, ``if array:`` raises ValueError
            # for more than one element. This form also works for lists.
            if (additional_embeddings is not None
                    and len(additional_embeddings) > 0):
                logger.info("New embedding size is %d" %
                            len(additional_embeddings))
                test_model = model.augment_embedding(additional_embeddings)

        evaluate_result = test_model.test(x_test, y_test)
        # The evaluation result may be a sequence of scores or a single number;
        # iterating a non-iterable raises TypeError, which selects the scalar format.
        try:
            result_str = ", ".join("%.4f" % f for f in evaluate_result)
        except TypeError:
            result_str = "%.4f" % evaluate_result
        logger.info("Direct test results are [%s] by model %s." %
                    (result_str, model_name))
Esempio n. 4
0
def train(models_to_train, model_base, lookup, oov_handling, train_path,
          dev_path, window_size, data_name, overwrite):
    """Train the requested labeling models on CoNLL data.

    Args:
        models_to_train: Iterable of model names. 'vanilla' and 'auto' are
            recognized; any other name is logged as unknown and skipped.
        model_base: Base output directory.
        lookup: Embedding lookup object. This function calls its
            ``initail_lookup`` method and reads its ``table``, ``full_table``
            and ``full_alphabet`` attributes.
        oov_handling: When 'random', the word alphabet stops growing before
            the dev set is indexed and the 'auto' model is not trained.
            For any other value the dev set is indexed while the alphabet
            can still grow.
        train_path: Path of the CoNLL training file.
        dev_path: Path of the CoNLL dev file; falsy to train without dev data.
        window_size: Forwarded to the sliding-window step and to the models.
        data_name: Sub-directory name under ``model_base``.
        overwrite: Forwarded to ``train_model``.

    Returns:
        Tuple ``(models, train_alphabet, label_alphabet)``. ``models`` maps a
        model name to ``(model, alphabet)``: vanilla models are keyed by
        'vanilla' followed by the string form of the ``fix_embedding`` value,
        auto models by 'auto_' followed by the auto option.
        ``train_alphabet`` is the copy of the word alphabet taken right
        after the training file is read.
    """
    logger.info("Loading CoNLL data.")
    word_sentences_train, pos_sentences_train, word_alphabet, label_alphabet = processor.read_conll(
        train_path)

    # Take a snapshot of the current alphabet, which only contains training words. This is useful in fine tuning.
    train_alphabet = word_alphabet.get_copy()

    logger.info("Sliding window on the data.")
    x_train = processor.slide_all_sentences(word_sentences_train,
                                            word_alphabet, window_size)
    y_train = processor.get_all_one_hots(pos_sentences_train, label_alphabet)

    label_alphabet.stop_auto_grow()

    if oov_handling == 'random':
        logger.info("Dev set word vectors are not added to alphabet.")
        word_alphabet.stop_auto_grow()
    else:
        # We will add development word embeddings to the alphabet so that their weights can be used.
        logger.info("Dev set word vectors will be added to alphabet.")

    x_dev, y_dev = None, None
    if dev_path:
        word_sentences_dev, pos_sentences_dev, _, _ = processor.read_conll(
            dev_path)
        x_dev = processor.slide_all_sentences(word_sentences_dev,
                                              word_alphabet, window_size)
        y_dev = processor.get_all_one_hots(pos_sentences_dev, label_alphabet)

    # Alphabet stop growing now anyways.
    word_alphabet.stop_auto_grow()

    logger.info("Training data dimension is %s, here is a sample:" %
                (str(x_train.shape)))
    logger.info(x_train[0])

    logger.info("Training label data dimension is %s, here is a sample:" %
                (str(y_train.shape)))
    logger.info(y_train[0])

    models = {}

    for model_name in models_to_train:
        # NOTE: "initail_lookup" is the method name exactly as this code calls
        # it on the lookup object; do not correct the spelling here without
        # renaming the method itself.
        lookup.initail_lookup(word_alphabet)

        if model_name == 'vanilla':
            model_output = os.path.join(model_base, data_name, 'vanilla')
            train_x = {ExperimentConfig.main_input_name: x_train}
            train_y = {ExperimentConfig.main_output_name: y_train}
            dev_data = ({
                ExperimentConfig.main_input_name: x_dev
            }, {
                ExperimentConfig.main_output_name: y_dev
            })

            # NOTE(review): every fix_embedding variant is given the same
            # model_output directory; confirm train_model does not let one
            # variant clobber or reuse another's saved files.
            for fix_embedding in MLPConfig.fix_embedding:
                mlp = VanillaLabelingMlp(embeddings=lookup.table,
                                         pos_dim=label_alphabet.size(),
                                         vocabulary_size=word_alphabet.size(),
                                         window_size=window_size,
                                         fix_embedding=fix_embedding)
                train_model(mlp, train_x, train_y, dev_data, label_alphabet,
                            word_alphabet, model_output, overwrite)
                actual_model_name = model_name + "%s" % fix_embedding
                models[actual_model_name] = (mlp, word_alphabet)

        elif model_name == 'auto':
            if oov_handling == 'random':
                logger.info(
                    "We do not train the auto model when the embedding is initialized randomly."
                )
                continue

            train_x = {ExperimentConfig.main_input_name: x_train}

            y_auto_train = processor.get_center_embedding(
                x_train, lookup.table)
            # The dev auto-encoding target is built from the dev inputs
            # (x_dev), the same way the training target is built from
            # x_train; the one-hot labels (y_dev) are not word windows.
            # With no dev set there is nothing to look up.
            y_auto_dev = None
            if x_dev is not None:
                y_auto_dev = processor.get_center_embedding(
                    x_dev, lookup.table)

            train_y = {
                ExperimentConfig.main_output_name: y_train,
                AutoConfig.auto_output_name: y_auto_train
            }

            dev_data = ({
                ExperimentConfig.main_input_name: x_dev
            }, {
                ExperimentConfig.main_output_name: y_dev,
                AutoConfig.auto_output_name: y_auto_dev
            })

            for auto_option in AutoConfig.auto_options:
                model_output = os.path.join(model_base, data_name, 'auto',
                                            auto_option)
                mlp = AutoEmbeddingMlp(
                    embeddings=lookup.full_table,
                    pos_dim=label_alphabet.size(),
                    vocabulary_size=lookup.full_alphabet.size(),
                    window_size=window_size,
                    auto_option=auto_option)
                train_model(mlp, train_x, train_y, dev_data, label_alphabet,
                            lookup.full_alphabet, model_output, overwrite)
                models[model_name + "_" + auto_option] = (mlp,
                                                          lookup.full_alphabet)
        else:
            # Logger.warn is a deprecated alias; warning() is the supported name.
            logger.warning("Unknown model name %s." % model_name)

    return models, train_alphabet, label_alphabet