def train(model_to_train, model_base, lookup, oov_embedding, train_path, dev_path, window_size, data_name, overwrite):
    """Train POS-labeling MLP models ('vanilla', 'auto', or 'all') on CoNLL data.

    NOTE(review): this definition is shadowed by the later ``train`` defined
    further down in this module (same name, different signature); at import
    time only the later one survives. Confirm whether this version is dead
    code that should be removed.

    :param model_to_train: Which model(s) to train: 'vanilla', 'auto' or 'all'.
    :param model_base: Base output directory for models and alphabets.
    :param lookup: Embedding lookup object providing ``w2v_lookup(alphabet)``.
    :param oov_embedding: 'random' keeps dev-set words out of the alphabet;
        otherwise dev words are added while the alphabet is still growing.
    :param train_path: Path to the CoNLL training file.
    :param dev_path: Path to the CoNLL development file, or falsy to skip dev.
    :param window_size: Sliding-window width used to build training windows.
    :param data_name: Dataset name used to build output sub-directories.
    :param overwrite: Forwarded to ``train_model`` (overwrite existing output).
    :return: Dict mapping model name -> (mlp, pos_alphabet, word_alphabet).
    """
    logger.info("Loading CoNLL data.")
    word_sentences_train, pos_sentences_train, word_alphabet, pos_alphabet = processor.read_conll(train_path)
    # Persist the training-only vocabulary snapshot next to the models.
    training_alphabet_output = os.path.join(model_base, data_name)
    ensure_dir(training_alphabet_output)
    word_alphabet.save(training_alphabet_output, 'training_words')
    logger.info("Sliding window on the data.")
    x_train = processor.slide_all_sentences(word_sentences_train, word_alphabet, window_size)
    y_train = processor.get_all_one_hots(pos_sentences_train, pos_alphabet)
    if oov_embedding == 'random':
        # Freeze the alphabet BEFORE touching the dev data, so dev-only words
        # never receive indices of their own.
        logger.info("Dev set word vectors are not added to alphabet.")
        word_alphabet.stop_auto_grow()
    x_dev, y_dev = None, None
    if dev_path:
        # If the alphabet is still growing here, sliding over the dev data
        # adds dev words to it as a side effect.
        word_sentences_dev, pos_sentences_dev, _, _ = processor.read_conll(dev_path)
        x_dev = processor.slide_all_sentences(word_sentences_dev, word_alphabet, window_size)
        y_dev = processor.get_all_one_hots(pos_sentences_dev, pos_alphabet)
    # The alphabet stops growing now in every configuration (idempotent if
    # already stopped above).
    word_alphabet.stop_auto_grow()
    # An embedding subset covering exactly the words in the alphabet.
    embeddings = lookup.w2v_lookup(word_alphabet)
    logger.info("Training data dimension is %s, here is a sample:" % (str(x_train.shape)))
    logger.info(x_train[0])
    logger.info("Label data dimension is %s, here is a sample:" % (str(y_train.shape)))
    logger.info(y_train[0])
    models = {}
    if model_to_train == 'vanilla' or model_to_train == 'all':
        model_output = get_model_directory(model_base, data_name, 'vanilla')
        mlp = VanillaLabelingMlp(embeddings=embeddings, pos_dim=pos_alphabet.size(), vocabulary_size=word_alphabet.size(), window_size=window_size)
        train_model(mlp, x_train, y_train, x_dev, y_dev, pos_alphabet, word_alphabet, model_output, overwrite)
        models['vanilla'] = (mlp, pos_alphabet, word_alphabet)
    if model_to_train == 'auto' or model_to_train == 'all':
        model_output = get_model_directory(model_base, data_name, 'auto')
        mlp = AutoEmbeddingMlp(embeddings=embeddings, pos_dim=pos_alphabet.size(), vocabulary_size=word_alphabet.size(), window_size=window_size)
        train_model(mlp, x_train, y_train, x_dev, y_dev, pos_alphabet, word_alphabet, model_output, overwrite)
        models['auto'] = (mlp, pos_alphabet, word_alphabet)
    return models
def test(trained_models, lookup, oov_embedding, test_conll, window_size):
    """Evaluate every trained model on a CoNLL test file.

    NOTE(review): shadowed by the later ``test`` defined below in this module
    (same name, different signature); confirm this version is still needed.

    :param trained_models: Dict mapping model name ->
        (model, pos_alphabet, train_alphabet), as produced by training.
    :param lookup: Embedding lookup object providing ``w2v_lookup(alphabet)``.
    :param oov_embedding: 'pretrained' lets the test alphabet grow and
        augments the model's embedding with the new rows; 'random' freezes it.
    :param test_conll: Path to the CoNLL test file.
    :param window_size: Sliding-window width used to build test windows.
    """
    logger.info("Testing condition - [OOV Vector] : %s ; [Test Data] : %s ." % (oov_embedding, test_conll))
    for model_name, (model, pos_alphabet, train_alphabet) in trained_models.iteritems():
        # Work on a copy so the training alphabet itself is never mutated.
        test_alphabet = train_alphabet.get_copy()
        if oov_embedding == "pretrained":
            # Unseen test words may receive fresh indices of their own.
            test_alphabet.restart_auto_grow()
        elif oov_embedding == "random":
            test_alphabet.stop_auto_grow()
        size_before_test = test_alphabet.size()
        logger.info("Original alphabet size is %d" % size_before_test)
        word_sentences_test, pos_sentences_test, _, _ = processor.read_conll(test_conll)
        # Sliding over the test data grows the copy when auto-grow is on.
        x_test = processor.slide_all_sentences(word_sentences_test, test_alphabet, window_size)
        y_test = processor.get_all_one_hots(pos_sentences_test, pos_alphabet)
        test_model = model
        if oov_embedding == "pretrained":
            logger.info("New alphabet size is %d" % test_alphabet.size())
            # A new embedding using the extended word alphabet; the rows past
            # the original size are exactly the newly seen words.
            extended_embeddings = lookup.w2v_lookup(test_alphabet)
            extra_rows = extended_embeddings[size_before_test:]
            logger.info("New embedding size is %d" % len(extra_rows))
            test_model = model.augment_embedding(extra_rows)
        scores = test_model.test(x_test, y_test)
        try:
            result_str = ", ".join(["%.4f" % value for value in scores])
        except TypeError:
            # A single scalar metric rather than an iterable of metrics.
            result_str = "%.4f" % scores
        logger.info("Direct test results are [%s] by model %s." % (result_str, model_name))
def test(trained_models, label_alphabet, lookup, oov_embedding, test_conll, window_size):
    """Evaluate every trained model on a CoNLL test file.

    NOTE(review): this definition shadows the earlier ``test`` defined above
    in this module (same name, different signature) — only this one is live
    after import.

    :param trained_models: Dict mapping model name -> (model, embedding_alphabet).
    :param label_alphabet: Alphabet used to one-hot encode the POS labels.
    :param lookup: Embedding lookup providing ``load_additional_embeddings``.
    :param oov_embedding: 'pretrained' lets the test alphabet grow and
        augments the model's embedding with rows for the new words.
    :param test_conll: Path to the CoNLL test file.
    :param window_size: Sliding-window width used to build test windows.
    """
    logger.info("Testing condition - [OOV Vector] : %s ; [Test Data] : %s ." % (oov_embedding, test_conll))
    for model_name, (model, embedding_alphabet) in trained_models.iteritems():
        # Work on a copy so the alphabet stored with the model stays intact.
        alphabet_for_test = embedding_alphabet.get_copy()
        original_alphabet_size = alphabet_for_test.size()
        logger.info(
            "Original alphabet used to train the model is of size %d ." % original_alphabet_size)
        if oov_embedding == "pretrained":
            # Allow unseen test words to receive fresh indices.
            alphabet_for_test.restart_auto_grow()
        word_sentences_test, pos_sentences_test, _, _ = processor.read_conll(
            test_conll)
        # Sliding over the test data grows the copy when auto-grow is on.
        x_test = processor.slide_all_sentences(word_sentences_test, alphabet_for_test, window_size)
        y_test = processor.get_all_one_hots(pos_sentences_test, label_alphabet)
        logger.info("New alphabet size is %d" % alphabet_for_test.size())
        # TODO: it seems we need to make a copy of the model here — otherwise
        # augment_embedding below may mutate the shared trained model.
        test_model = model
        if oov_embedding == "pretrained":
            # Embedding rows for exactly the words added beyond the training
            # alphabet (semantics of load_additional_embeddings presumed from
            # its name — confirm against the lookup implementation).
            additional_embeddings = lookup.load_additional_embeddings(
                embedding_alphabet, alphabet_for_test)
            if additional_embeddings:
                logger.info("New embedding size is %d" % len(additional_embeddings))
                test_model = model.augment_embedding(additional_embeddings)
        evaluate_result = test_model.test(x_test, y_test)
        try:
            result_str = ", ".join("%.4f" % f for f in evaluate_result)
        except TypeError:
            # A single scalar metric rather than an iterable of metrics.
            result_str = "%.4f" % evaluate_result
        logger.info("Direct test results are [%s] by model %s." % (result_str, model_name))
def train(models_to_train, model_base, lookup, oov_handling, train_path, dev_path, window_size, data_name, overwrite):
    """Train the requested POS-labeling models on CoNLL data.

    :param models_to_train: Iterable of model names to train ('vanilla', 'auto').
    :param model_base: Base output directory for trained models.
    :param lookup: Embedding lookup object providing ``initail_lookup`` (sic),
        ``table``, ``full_table`` and ``full_alphabet``.
    :param oov_handling: 'random' keeps dev-set words out of the alphabet;
        any other value lets the alphabet grow over the dev set so those
        words get pretrained vectors.
    :param train_path: Path to the CoNLL training file.
    :param dev_path: Path to the CoNLL development file, or falsy to skip dev.
    :param window_size: Sliding-window width used to build context windows.
    :param data_name: Dataset name used to build output sub-directories.
    :param overwrite: Forwarded to ``train_model`` (overwrite existing output).
    :return: (models dict mapping name -> (mlp, alphabet used by its
        embedding), training-words-only alphabet snapshot, label alphabet).
    """
    logger.info("Loading CoNLL data.")
    word_sentences_train, pos_sentences_train, word_alphabet, label_alphabet = processor.read_conll(
        train_path)
    # Snapshot of the alphabet containing only training words; useful in fine
    # tuning, where the original training vocabulary must be recoverable.
    train_alphabet = word_alphabet.get_copy()
    logger.info("Sliding window on the data.")
    x_train = processor.slide_all_sentences(word_sentences_train, word_alphabet, window_size)
    y_train = processor.get_all_one_hots(pos_sentences_train, label_alphabet)
    label_alphabet.stop_auto_grow()
    if oov_handling == 'random':
        # Freeze BEFORE reading the dev data so dev-only words get no indices.
        logger.info("Dev set word vectors are not added to alphabet.")
        word_alphabet.stop_auto_grow()
    else:
        # Dev word embeddings are added to the alphabet so their pretrained
        # weights can be used.
        logger.info("Dev set word vectors will be added to alphabet.")
    x_dev, y_dev = None, None
    if dev_path:
        word_sentences_dev, pos_sentences_dev, _, _ = processor.read_conll(
            dev_path)
        x_dev = processor.slide_all_sentences(word_sentences_dev, word_alphabet, window_size)
        y_dev = processor.get_all_one_hots(pos_sentences_dev, label_alphabet)
    # The alphabet stops growing now in either case (idempotent if frozen above).
    word_alphabet.stop_auto_grow()
    logger.info("Training data dimension is %s, here is a sample:" % (str(x_train.shape)))
    logger.info(x_train[0])
    logger.info("Training label data dimension is %s, here is a sample:" % (str(y_train.shape)))
    logger.info(y_train[0])
    models = {}
    for model_name in models_to_train:
        # NOTE(review): the method really is named 'initail_lookup' (sic) in
        # the lookup API; fixing the typo requires changing that class too.
        lookup.initail_lookup(word_alphabet)
        if model_name == 'vanilla':
            model_output = os.path.join(model_base, data_name, 'vanilla')
            train_x = {ExperimentConfig.main_input_name: x_train}
            train_y = {ExperimentConfig.main_output_name: y_train}
            # NOTE(review): when dev_path is falsy this carries None values —
            # presumably train_model tolerates that; confirm.
            dev_data = ({
                ExperimentConfig.main_input_name: x_dev
            }, {
                ExperimentConfig.main_output_name: y_dev
            })
            # Train one variant per embedding-freezing setting.
            for fix_embedding in MLPConfig.fix_embedding:
                mlp = VanillaLabelingMlp(embeddings=lookup.table,
                                         pos_dim=label_alphabet.size(),
                                         vocabulary_size=word_alphabet.size(),
                                         window_size=window_size,
                                         fix_embedding=fix_embedding)
                train_model(mlp, train_x, train_y, dev_data, label_alphabet,
                            word_alphabet, model_output, overwrite)
                actual_model_name = model_name + "%s" % fix_embedding
                models[actual_model_name] = (mlp, word_alphabet)
        elif model_name == 'auto':
            if oov_handling == 'random':
                logger.info(
                    "We do not train the auto model when the embedding is initialized randomly."
                )
                continue
            train_x = {ExperimentConfig.main_input_name: x_train}
            # The auto-encoding target is the embedding of each window's
            # center word, derived from the input windows.
            y_auto_train = processor.get_center_embedding(
                x_train, lookup.table)
            # BUG FIX: this previously passed y_dev (the one-hot dev labels);
            # the dev auto target must come from the dev input windows
            # (x_dev), mirroring y_auto_train built from x_train. Also guard
            # against a missing dev set instead of crashing on None.
            y_auto_dev = processor.get_center_embedding(
                x_dev, lookup.table) if x_dev is not None else None
            train_y = {
                ExperimentConfig.main_output_name: y_train,
                AutoConfig.auto_output_name: y_auto_train
            }
            dev_data = ({
                ExperimentConfig.main_input_name: x_dev
            }, {
                ExperimentConfig.main_output_name: y_dev,
                AutoConfig.auto_output_name: y_auto_dev
            })
            for auto_option in AutoConfig.auto_options:
                model_output = os.path.join(model_base, data_name, 'auto',
                                            auto_option)
                mlp = AutoEmbeddingMlp(
                    embeddings=lookup.full_table,
                    pos_dim=label_alphabet.size(),
                    vocabulary_size=lookup.full_alphabet.size(),
                    window_size=window_size,
                    auto_option=auto_option)
                train_model(mlp, train_x, train_y, dev_data, label_alphabet,
                            lookup.full_alphabet, model_output, overwrite)
                models[model_name + "_" + auto_option] = (mlp, lookup.full_alphabet)
        else:
            logger.warn("Unknown model name %s." % model_name)
            continue
    return models, train_alphabet, label_alphabet