def ensure_model_trains_and_loads(self, model_class, args: Params):
    """
    Train a model, load a second copy from what was serialized, and compare the two.

    ``args`` is modified in place: ``save_models`` is forced to ``True`` and
    ``data_generator`` defaults to ``None`` when the caller did not set it.  Returns the
    ``(trained_model, loaded_model)`` pair so callers can make further assertions.
    """
    args['save_models'] = True
    # The loading checks work better without data generators, so they are switched off unless
    # the test asked for them explicitly.  When a test _does_ use them, the prediction
    # comparisons below are skipped because they are not compatible.
    args.setdefault('data_generator', None)

    trained = self.get_model(model_class, args)
    trained.train()

    # Build a second model from the same arguments and load the serialized state into it.
    restored = self.get_model(model_class, args)
    restored.load_model()

    # Both models should predict the same outputs on the original validation arrays.  The data
    # generator shuffles its data, so rather than complicating that logic we skip this check
    # when generators are in use.
    if not trained._uses_data_generators():
        assert_allclose(trained.model.predict(trained.validation_arrays[0]),
                        restored.model.predict(trained.validation_arrays[0]))

    # Indexing the validation files with the loaded model should give the same predictions.
    # The arrays are loaded either way; only the comparison is skipped for data generators.
    _, reindexed_validation_arrays = restored.load_data_arrays(trained.validation_files)
    if not trained._uses_data_generators():
        assert_allclose(trained.model.predict(trained.validation_arrays[0]),
                        restored.model.predict(reindexed_validation_arrays[0]))
    return trained, restored
def test_works_with_word_and_character_tokenizer(self):
    # Checks that a `TupleInferenceInstance` read from a line is indexed and padded as expected
    # when `TextInstance.tokenizer` is a `WordAndCharacterTokenizer`.
    answer_options_simple = "a<>a sentence<><>"
    background_simple = "a<>a sentence<><>"
    line_simple = "\t".join([answer_options_simple, background_simple, "0"])

    TextInstance.tokenizer = WordAndCharacterTokenizer(Params({}))
    # `TextInstance.tokenizer` is class-level state.  Resetting it in `finally` guarantees that
    # a failing assertion below cannot leave the word-and-character tokenizer installed.
    try:
        data_indexer = DataIndexer()
        a_word_index = data_indexer.add_word_to_index("a", namespace='words')
        sentence_index = data_indexer.add_word_to_index("sentence", namespace='words')
        a_index = data_indexer.add_word_to_index("a", namespace='characters')
        s_index = data_indexer.add_word_to_index("s", namespace='characters')
        e_index = data_indexer.add_word_to_index("e", namespace='characters')

        new_instance = TupleInferenceInstance.read_from_line(line_simple)
        indexed = new_instance.to_indexed_instance(data_indexer)
        padding_lengths = {
            'num_question_tuples': 1,
            'num_background_tuples': 1,
            'num_slots': 2,
            'num_sentence_words': 2,
            'num_options': 1,
            'num_word_characters': 3,
        }
        indexed.pad(padding_lengths)

        # Each innermost list is [word index, character indices...] padded to 3 entries.
        expected_indexed_tuple = [[[0, 0, 0], [a_word_index, a_index, 0]],
                                  [[a_word_index, a_index, 0],
                                   [sentence_index, s_index, e_index]]]
        expected_answers_indexed = numpy.asarray([expected_indexed_tuple])
        expected_background_indexed = numpy.asarray(expected_indexed_tuple)
        assert numpy.all(indexed.answers_indexed == expected_answers_indexed)
        assert numpy.all(indexed.background_indexed == expected_background_indexed)
    finally:
        TextInstance.tokenizer = tokenizers['words'](Params({}))
def main():
    """
    Command-line entry point: build the model named in a parameter file, then train or load it.

    Expects exactly one command-line argument (the parameter file); otherwise prints a usage
    line and exits with status -1.  When the parameters contain a
    ``model_serialization_prefix``, stdout, stderr and Python logging are additionally written
    to files under that prefix, and the parameter file is copied next to them.
    """
    if len(sys.argv) != 2:
        print('USAGE: run_model.py [param_file]')
        sys.exit(-1)

    log_keras_version_info()
    param_file = sys.argv[1]
    params = Params(replace_none(pyhocon.ConfigFactory.parse_file(param_file)))

    log_dir = params.get("model_serialization_prefix", None)  # pylint: disable=no-member
    if log_dir is not None:
        sys.stdout = TeeLogger(log_dir + "_stdout.log", sys.stdout)
        sys.stderr = TeeLogger(log_dir + "_stderr.log", sys.stderr)

        file_handler = logging.FileHandler(log_dir + "_python_logging.log")
        file_handler.setLevel(logging.INFO)
        log_format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s'
        file_handler.setFormatter(logging.Formatter(log_format))
        logging.getLogger().addHandler(file_handler)

        # Keep a copy of the parameters that produced this run alongside the logs.
        shutil.copyfile(param_file, log_dir + "_model_params.json")

    model_type = params.pop_choice('model_class', concrete_models.keys())
    model = concrete_models[model_type](params)

    if not model.can_train():
        logger.info(
            "Not enough training inputs. Assuming you wanted to load a model instead."
        )
        # TODO(matt): figure out a way to specify which epoch you want to load a model from.
        model.load_model()
    else:
        logger.info("Training model")
        model.train()
    K.clear_session()
def check_experiment_type_can_train(self, param_file):
    """
    Load an experiment parameter file, point it at the dummy test data, and train one epoch.

    Serialization is disabled and the train/validation paths are replaced because the real
    experiment files reference /net/efs paths, whose root directories the tests would otherwise
    try to create.
    """
    params = Params(replace_none(pyhocon.ConfigFactory.parse_file(param_file)))
    model_class = concrete_models[params.pop("model_class")]

    params["model_serialization_prefix"] = None

    # More than one train file means the experiment also takes a background file.
    has_background_file = len(params["train_files"]) > 1
    if has_background_file:
        params["train_files"] = [self.TRAIN_FILE, self.TRAIN_BACKGROUND]
        params["validation_files"] = [self.VALIDATION_FILE, self.VALIDATION_BACKGROUND]
    else:
        params["train_files"] = [self.TRAIN_FILE]
        params["validation_files"] = [self.TRAIN_FILE]

    # A single epoch is enough to show that training runs, and keeps the test fast.
    params["num_epochs"] = 1

    try:
        word_embedding_params = params["embeddings"]["words"]
        if word_embedding_params["pretrained_file"]:
            word_embedding_params["pretrained_file"] = self.PRETRAINED_VECTORS_GZIP
    except KeyError:
        # No embeddings/words/pretrained_file entry in the parameters, so nothing to change.
        pass

    self.get_model(model_class, params).train()
def test_trains_and_loads_correctly(self):
    # Trains a BiDAF model first, then feeds its parameters to a `MultipleChoiceBidaf` model
    # with `train_bidaf` switched off, and checks that model trains, loads, and has the
    # expected number of trainable parameters.
    self.write_span_prediction_files()
    bidaf_args = Params({
        'model_serialization_prefix': self.TEST_DIR + "_bidaf",
        'embeddings': {"words": {'dimension': 4}, "characters": {'dimension': 4}},
        'save_models': True,
        'tokenizer': {'type': 'words and characters'},
        'show_summary_with_masking_info': True,
    })
    bidaf_model = self.get_model(BidirectionalAttentionFlow, bidaf_args)
    bidaf_model.train()
    K.clear_session()

    bidaf_model_params = self.get_model_params(BidirectionalAttentionFlow, bidaf_args)
    args = Params({
        'bidaf_params': bidaf_model_params,
        'train_bidaf': False,
        'similarity_function': {'type': 'linear', 'combination': 'x,y'},
    })
    self.write_who_did_what_files()
    model, _ = self.ensure_model_trains_and_loads(MultipleChoiceBidaf, args)

    # All of the params come from the linear similarity function in the attention layer,
    # because we set `train_bidaf` to `False`.  41 comes from 32 + 8 + 1, where 32 is from the
    # modeled passage (see the equations in the paper for why it's 32), 8 is from the Bi-LSTM
    # operating on the answer options (embedding_dim * 2), and 1 is from the bias.
    trainable_parameter_count = sum(K.count_params(weight)
                                    for weight in model.model.trainable_weights)
    assert trainable_parameter_count == 41
def serve(port: int, config_file: str):
    """
    Build the solver described by ``config_file`` and serve it over HTTP on ``port``.

    Parameters
    ----------
    port : int
        Port the server listens on.
    config_file : str
        Path to a Typesafe-style (HOCON) config file.  Its ``model_class`` entry selects the
        solver class from ``concrete_models``; the remaining parameters are passed to that
        class.

    Sets the module-level ``mySolver`` as a side effect.
    """
    global mySolver

    # read in the Typesafe-style config file
    solver_params = ConfigFactory.parse_file(config_file)
    params = Params(replace_none(solver_params))
    model_type = params.pop_choice('model_class', concrete_models.keys())
    solver_class = concrete_models[model_type]
    mySolver = SolverServer(solver_class(params))

    # Start the server on the specified port.  `port` was previously accepted but never passed
    # on, so the server always ran on the framework's default port.
    # NOTE(review): assumes `app` is a Flask-style application whose `run` accepts `port`.
    print("starting server")
    app.run(host='0.0.0.0', port=port)
def test_to_indexed_instance_converts_correctly(self):
    # Indexes the same text under three tokenizers (the one active on entry, 'characters', and
    # 'words and characters') and checks the resulting `word_indices` each time.
    # NOTE(review): the first assertion expects word-level tokenization on entry, and the
    # tokenizer is left as 'words and characters' on exit; presumably a tearDown resets it.
    data_indexer = DataIndexer()

    # Registration order is kept exactly, since it determines the indices handed out.
    word_ids = {
        token: data_indexer.add_word_to_index(token, namespace='words')
        for token in ("sentence", "A", " ", "a", "s", "e", "n", "t", "c")
    }
    char_ids = {
        char: data_indexer.add_word_to_index(char, namespace='characters')
        for char in ("a", "s", "e", "n", "t", "c")
    }
    text = "A sentence"

    instance = TextClassificationInstance(text, None).to_indexed_instance(data_indexer)
    assert instance.word_indices == [word_ids["a"], word_ids["sentence"]]

    # With the character tokenizer every character, including the space and the capital "A",
    # is looked up in the 'words' namespace.
    TextInstance.tokenizer = tokenizers['characters'](Params({}))
    instance = TextClassificationInstance(text, None).to_indexed_instance(data_indexer)
    assert instance.word_indices == [word_ids[char] for char in text]

    # With both, each entry is the word index followed by that word's character indices.
    TextInstance.tokenizer = tokenizers['words and characters'](Params({}))
    instance = TextClassificationInstance(text, None).to_indexed_instance(data_indexer)
    expected_word_and_character_indices = [
        [word_ids["a"], char_ids["a"]],
        [word_ids["sentence"]] + [char_ids[char] for char in "sentence"],
    ]
    assert instance.word_indices == expected_word_and_character_indices
def test_end_to_end_conversion_to_arrays_with_word_and_character_tokenizer(self):
    # Runs a `SentenceInstance` through indexing, padding and conversion to training data with
    # the 'words and characters' tokenizer, and checks both resulting arrays.
    # NOTE(review): the tokenizer is not reset here; presumably handled in tearDown.
    TextInstance.tokenizer = tokenizers['words and characters'](Params({}))
    indexed = SentenceInstance("this is a sentence").to_indexed_instance(self.data_indexer)
    indexed.pad({'num_sentence_words': 6, 'num_word_characters': 5})
    inputs, targets = indexed.as_training_data()

    # Each row is [word index, character indices...], padded with zeros to 5 entries.
    expected_inputs = [
        [0, 0, 0, 0, 0],
        [self.start_index, 0, 0, 0, 0],
        [self.this_index, self.t_char_index, self.h_char_index, self.i_char_index,
         self.s_char_index],
        [self.is_index, self.i_char_index, self.s_char_index, 0, 0],
        [self.a_index, self.a_char_index, 0, 0, 0],
        [self.sentence_index, self.s_char_index, self.e_char_index, self.n_char_index,
         self.t_char_index],
    ]
    assert_array_equal(inputs, expected_inputs)

    expected_target_indices = (0, self.this_index, self.is_index, self.a_index,
                               self.sentence_index, self.end_index)
    assert_array_equal(targets, [[index] for index in expected_target_indices])
def test_padding_works_correctly(self, _output_debug_info):
    # Replaces the (mocked) debug-output hook with a checker that inspects the
    # 'answer_similarity_softmax' layer output during training.
    self.write_question_answer_memory_network_files()
    args = Params({
        'num_hidden_layers': 1,
        'hidden_layer_width': 2,
        'show_summary_with_masking_info': True,
        'debug': {'data': 'training', 'layer_names': ['answer_similarity_softmax']},
    })
    model = self.get_model(QuestionAnswerSimilarity, args)

    def check_padded_option_scores(output_dict, epoch):  # pylint: disable=unused-argument
        # The final answer-option softmax must give zero weight to the padded-out third
        # option.  To see why instances 0, 1 and 3 are the ones checked, look at the data
        # written by `write_question_answer_memory_network_files()`.
        print(output_dict)
        answer_scores = output_dict['answer_similarity_softmax']
        for instance_index in (0, 1, 3):
            assert answer_scores[instance_index][2] == 0

    _output_debug_info.side_effect = check_padded_option_scores
    model.train()
def test_cloze_train_does_not_crash(self):
    # Trains and reloads a `GatedAttentionReader` in cloze mode, then checks that the gating
    # function reached both the model object and its first gated attention layer.
    self.write_who_did_what_files()

    def bi_gru_seq2seq(wrapper_params):
        # A fresh dict per encoder, so no two entries share mutable state.
        return {
            "type": "bi_gru",
            "encoder_params": {"units": 3},
            "wrapper_params": wrapper_params,
        }

    args = Params({
        "qd_common_feature": True,
        "gating_function": "+",
        "cloze_token": "xxxxx",
        "num_gated_attention_layers": 2,
        "tokenizer": {"type": "words and characters"},
        "encoder": {"word": {"type": "bi_gru", "units": 2}},
        "seq2seq_encoder": {
            "question_0": bi_gru_seq2seq({}),
            "document_0": bi_gru_seq2seq({}),
            "document_final": bi_gru_seq2seq({}),
            "question_final": bi_gru_seq2seq({"merge_mode": None}),
        },
    })
    model, loaded_model = self.ensure_model_trains_and_loads(GatedAttentionReader, args)

    # The gating function must be set on the trained model and on its layer...
    assert model.gating_function == "+"
    trained_layer = model.model.get_layer("gated_attention_0")
    assert model.gating_function == trained_layer.gating_function

    # ...and likewise on the model that was loaded from disk.
    assert loaded_model.gating_function == "+"
    loaded_layer = loaded_model.model.get_layer("gated_attention_0")
    assert loaded_model.gating_function == loaded_layer.gating_function
def test_trains_and_loads_correctly(self):
    # Trains a `FrameClozeModel` with pretrained word embeddings and checks it reloads.
    self.write_frame_cloze_files()
    # The pretrained vectors used to come from an absolute path on one developer's machine
    # (/Users/nikett/...), so the test could not run anywhere else.  It now uses the
    # pretrained-vector fixture written by the test case itself.
    # NOTE(review): the `pretrained_file` key and dimension 8 mirror
    # `test_pretrained_embeddings_works_correctly`; confirm that the fixture's vector size
    # suits this model, and that this test class provides `write_pretrained_vector_files`.
    self.write_pretrained_vector_files()
    args = Params({
        'save_models': True,
        'show_summary_with_masking_info': True,
        'instance_type': 'FrameEmbeddedLabelInstance',
        'validation_metric': 'val_loss',
        'loss': 'mean_squared_error',
        # TODO: passing 'num_slots': 27 here fails with
        # "Extra parameters passed to Trainer: {'num_slots': 27}"; find out why.
        'embeddings': {
            'words': {'dimension': 8, 'pretrained_file': self.PRETRAINED_VECTORS_GZIP},
            'characters': {'dimension': 8},
        },
        'tokenizer': {'processor': {'word_splitter': 'simple'}},
    })
    self.ensure_model_trains_and_loads(FrameClozeModel, args)
def test_trains_and_loads_correctly(self):
    # Trains and reloads a BiDAF model, then checks that the layer named
    # 'characters_embedding' exists and that its output's last dimension is 4.
    self.write_span_prediction_files()
    embedding_params = {'words': {'dimension': 8}, 'characters': {'dimension': 4}}
    args = Params({
        'embeddings': embedding_params,
        'save_models': True,
        'tokenizer': {'type': 'words and characters'},
        'show_summary_with_masking_info': True,
    })
    model, _ = self.ensure_model_trains_and_loads(BidirectionalAttentionFlow, args)

    # Only the first layer with the expected name is inspected.
    character_embedding = next(
        (layer for layer in model.model.layers if layer.name == 'characters_embedding'),
        None)
    assert character_embedding is not None, "couldn't find character embedding layer"
    assert character_embedding.get_output_shape_at(0)[-1] == 4
def test_loss_function_uses_mask(self):
    # The loss and accuracy must be the same for every permutation of labels on padded
    # tokens; if they differ, the loss/accuracy function is looking at labels it should be
    # masking out.  No particular value is tested, only that all of them agree.  (The
    # original author ran this by hand a few times to confirm that the values do vary with
    # initialization.)
    self.write_sequence_tagging_files()
    args = Params({
        'show_summary_with_masking_info': True,
        'instance_type': 'PreTokenizedTaggingInstance',
        'tokenizer': {'processor': {'word_splitter': 'no_op'}},
    })
    tagger = self.get_model(SimpleTagger, args)
    tagger.train()

    # Two real tokens followed by two padding tokens (index 0).
    token_indices = [3, 2, 0, 0]
    tag_options = ([1, 0], [0, 1])
    # Labels for the two real tokens stay fixed; the two padded positions take every
    # combination of tags.
    label_variants = [[[0, 1], [1, 0], third_tag, fourth_tag]
                      for third_tag in tag_options
                      for fourth_tag in tag_options]

    evaluations = [
        tagger.model.evaluate(numpy.asarray([token_indices]), numpy.asarray([variant]))
        for variant in label_variants
    ]
    losses, accuracies = zip(*evaluations)
    assert len(set(losses)) == 1
    assert len(set(accuracies)) == 1
def test_tf_and_keras_optimise_identical_variables(self):
    # Tensorflow and Keras must agree on which variables are trainable.  They could disagree
    # if a Keras layer were used without `layer.build()` being called.  That is hard to do,
    # but this solver mixes in some tensorflow, which has no "build" equivalent and will
    # happily train a variable that lives in an unbuilt layer, so we check.
    self.write_memory_network_files()

    # Start from a new tf session so variables created by other tests don't interfere.
    K.clear_session()

    # The parameterized knowledge selector gives the adaptive memory step a layer that
    # actually has parameters.
    args = Params({
        'recurrence_mode': {'type': 'adaptive'},
        'knowledge_selector': {'type': 'parameterized'},
    })
    solver = self.get_model(MemoryNetwork, args)

    # Set the solver state up from the training data by hand, then build without training.
    solver.training_dataset = solver.load_dataset_from_files(solver.train_files)
    solver.set_model_state_from_dataset(solver.training_dataset)
    indexed_dataset = solver.training_dataset.to_indexed_dataset(solver.data_indexer)
    solver.set_model_state_from_indexed_dataset(indexed_dataset)
    built_model = solver._build_model()

    tf_variable_names = [variable.name for variable in tf.trainable_variables()]
    keras_weight_names = [weight.name for weight in built_model.trainable_weights]
    assert tf_variable_names == keras_weight_names
def get_model_params(self, model_class, additional_arguments=None):
    """
    Build the default test ``Params`` for ``model_class``.

    Starts from small defaults (one epoch, no saving, tiny embeddings, the test data files),
    adds the background files and memory-network settings when the model class calls for
    them, and finally overlays deep copies of everything in ``additional_arguments``.
    """
    params = Params({})
    base_settings = [
        ('save_models', False),
        ('model_serialization_prefix', self.TEST_DIR),
        ('train_files', [self.TRAIN_FILE]),
        ('validation_files', [self.VALIDATION_FILE]),
        ('embeddings', {'words': {'dimension': 6}, 'characters': {'dimension': 2}}),
        ('encoder', {"default": {'type': 'bow'}}),
        ('num_epochs', 1),
        ('validation_split', 0.0),
    ]
    for name, setting in base_settings:
        params[name] = setting

    if self.is_model_with_background(model_class):
        # pylint: disable=no-member
        params['train_files'].append(self.TRAIN_BACKGROUND)
        params['validation_files'].append(self.VALIDATION_BACKGROUND)
        # pylint: enable=no-member

    if self.is_memory_network(model_class):
        params['knowledge_selector'] = {'type': 'dot_product'}
        params['memory_updater'] = {'type': 'sum'}
        params['entailment_input_combiner'] = {'type': 'memory_only'}

    # Caller-supplied values win over the defaults.  They are deep-copied so the caller's
    # objects are not shared with the returned parameters.
    if additional_arguments:
        for name, setting in additional_arguments.items():
            params[name] = deepcopy(setting)
    return params
def test_load_model_and_fit(self):
    # Trains and reloads a `ClassificationModel`, then fits both the original and the loaded
    # model for one more epoch on additional data.
    # NOTE(review): nothing is asserted after the extra fitting, so this only checks that
    # both `fit` calls run; confirm whether a result comparison was intended.
    args = Params({
        'test_files': [self.TEST_FILE],
        'embedding_dim': {'words': 4, 'characters': 2},
        'save_models': True,
        'tokenizer': {'type': 'words and characters'},
        'show_summary_with_masking_info': True,
    })
    self.write_true_false_model_files()
    model, loaded_model = self.ensure_model_trains_and_loads(ClassificationModel, args)

    # Now fit both models on some more data.
    self.write_additional_true_false_model_files()
    _, training_arrays = loaded_model.load_data_arrays(loaded_model.train_files)
    inputs, labels = training_arrays[0], training_arrays[1]
    for trainer in (model, loaded_model):
        trainer.model.fit(inputs, labels, shuffle=False, nb_epoch=1)
def setUp(self):
    # Creates the tagging instance shared by the tests and installs a word tokenizer whose
    # word splitter is 'no_op'.
    super(TestPreTokenizedTaggingInstance, self).setUp()
    self.instance = PreTokenizedTaggingInstance(["cats", "are", "animals", "."],
                                                ["N", "V", "N", "."])
    word_tokenizer_class = tokenizers['words']
    TextInstance.tokenizer = word_tokenizer_class(
        Params({'processor': {'word_splitter': 'no_op'}}))
def test_trains_and_loads_correctly(self):
    # A BiDAF model with word-and-character tokenization must train and reload consistently.
    self.write_span_prediction_files()
    embedding_sizes = {'words': 4, 'characters': 4}
    tokenizer_params = {'type': 'words and characters'}
    args = Params({
        'embedding_dim': embedding_sizes,
        'save_models': True,
        'tokenizer': tokenizer_params,
        'show_summary_with_masking_info': True,
    })
    self.ensure_model_trains_and_loads(BidirectionalAttentionFlow, args)
def test_read_from_file(self):
    # Reads the training file with a sequence length of 4 and checks the text of the first
    # three instances.
    dataset = LanguageModelingDataset.read_from_file(
        self.TRAIN_FILE, SentenceInstance, Params({"sequence_length": 4}))
    instances = dataset.instances
    expected_texts = [
        "This is a sentence",
        "for language modelling. Here's",
        "another one for language",
    ]
    for position, expected_text in enumerate(expected_texts):
        assert instances[position].text == expected_text
def test_passes_through_correctly(self):
    # With default parameters the processor splits brackets, quotes and the final period
    # into separate tokens and changes nothing else.
    sentence = "this (sentence) has 'crazy' \"punctuation\"."
    actual_tokens = WordProcessor(Params({})).get_tokens(sentence)
    assert actual_tokens == [
        "this",
        "(", "sentence", ")",
        "has",
        "'", "crazy", "'",
        "\"", "punctuation", "\"",
        ".",
    ]
def test_words_tokenizes_the_sentence_correctly(self):
    # Checks `words()` on one instance under three tokenizers: the one active on entry,
    # 'characters', and 'words and characters'.
    # NOTE(review): the first assertion expects lowercased word-level tokens on entry, and
    # the tokenizer is left as 'words and characters' on exit; presumably tearDown resets it.
    sentence = "This is a sentence."
    instance = TextClassificationInstance(sentence, None)
    lowercased_words = ['this', 'is', 'a', 'sentence', '.']
    assert instance.words() == {'words': lowercased_words}

    # The character tokenizer keeps case and spaces: one token per character of the input.
    TextInstance.tokenizer = tokenizers['characters'](Params({}))
    assert instance.words() == {'words': list(sentence)}

    # With both, 'characters' holds the characters of each word token in order.
    TextInstance.tokenizer = tokenizers['words and characters'](Params({}))
    assert instance.words() == {
        'words': lowercased_words,
        'characters': [char for word in lowercased_words for char in word],
    }
def test_initialize_lsh_does_not_crash(self):
    # Smoke test: `_initialize_lsh()` must run once the encoder is replaced by a
    # `FakeEncoder`.
    lsh_args = Params({
        'corpus_path': self.corpus_path,
        'model_serialization_prefix': './',
        'num_sentence_words': 3,
    })
    search_model = self.get_model(DifferentiableSearchMemoryNetwork, lsh_args)
    search_model.encoder_model = FakeEncoder()
    search_model._initialize_lsh()
def test_train_does_not_crash_with_parameterized_heuristic_matching_knowledge_selector(self):
    # Smoke test: a `MemoryNetwork` using this knowledge selector must get through training.
    selector_params = {'type': 'parameterized_heuristic_matching'}
    args = Params({'knowledge_selector': selector_params})
    self.get_model(MemoryNetwork, args).train()
def test_trains_and_loads_correctly(self):
    # A `SimpleTagger` on pre-tokenized tagging instances must train and reload consistently.
    self.write_sequence_tagging_files()
    tokenizer_params = {'processor': {'word_splitter': 'no_op'}}
    tagger_args = Params({
        'save_models': True,
        'show_summary_with_masking_info': True,
        'instance_type': 'PreTokenizedTaggingInstance',
        'tokenizer': tokenizer_params,
    })
    self.ensure_model_trains_and_loads(SimpleTagger, tagger_args)
def test_pretrained_embeddings_works_correctly(self):
    # Smoke test: a `ClassificationModel` whose word embeddings come from the pretrained
    # vector fixture must get through training.
    self.write_true_false_model_files()
    self.write_pretrained_vector_files()
    word_embedding = {'dimension': 8, 'pretrained_file': self.PRETRAINED_VECTORS_GZIP}
    character_embedding = {'dimension': 8}
    args = Params({
        'embeddings': {'words': word_embedding, 'characters': character_embedding},
    })
    self.get_model(ClassificationModel, args).train()
def test_stems_and_filters_correctly(self):
    # With the 'porter' stemmer and the 'stopwords' filter, only the stemmed content words
    # should remain.
    processor_params = Params({'word_stemmer': 'porter', 'word_filter': 'stopwords'})
    stemming_processor = WordProcessor(processor_params)
    sentence = "this (sentence) has 'crazy' \"punctuation\"."
    assert stemming_processor.get_tokens(sentence) == ["sentenc", "ha", "crazi", "punctuat"]
def setUp(self):
    # Writes the true/false fixture files and prepares the two-GPU arguments shared by the
    # tests in this class.
    super(TestMultiGpu, self).setUp()
    self.write_true_false_model_files()
    multi_gpu_settings = {
        'embedding_dim': {'words': 4, 'characters': 2},
        'batch_size': 8,
        'num_gpus': 2,
        'save_models': True,
        'show_summary_with_masking_info': True,
    }
    self.args = Params(multi_gpu_settings)
def serve(port: int, param_file: str):
    """
    Set up vector-based retrieval from ``param_file`` and serve it over HTTP on ``port``.

    Parameters
    ----------
    port : int
        Port the server listens on.
    param_file : str
        Path to a Typesafe-style (HOCON) config file.  Its ``retrieval`` entry is passed to
        ``VectorBasedRetrieval``.  When a ``corpus`` entry is present, the retrieval model is
        fit on that corpus and saved; otherwise a previously saved model is loaded.

    Sets the module-level ``retrieval`` as a side effect.
    """
    global retrieval

    # read in the Typesafe-style config file
    params = Params(replace_none(pyhocon.ConfigFactory.parse_file(param_file)))
    retrieval_params = params.pop('retrieval')
    corpus_file = params.pop('corpus', None)
    # The popped value was never used inside this function, so it is no longer bound to a
    # local.  The pop itself is kept so that `params` is consumed exactly as before.
    # NOTE(review): possibly meant to configure a module-level default - confirm.
    params.pop('num_neighbors', 10)

    retrieval = VectorBasedRetrieval(retrieval_params)
    if corpus_file is not None:
        retrieval.read_background(corpus_file)
        retrieval.fit()
        retrieval.save_model()
    else:
        retrieval.load_model()

    # Start the server on the specified port.  `port` was previously accepted but never passed
    # on, so the server always ran on the framework's default port.
    # NOTE(review): assumes `app` is a Flask-style application whose `run` accepts `port`.
    print("starting server")
    app.run(host='0.0.0.0', port=port)
def test_data_generator_works(self):
    # A `ClassificationModel` must train and reload when a data generator with default
    # settings is requested explicitly.
    embedding_params = {'words': {'dimension': 4}, 'characters': {'dimension': 2}}
    generator_args = Params({
        'test_files': [self.TEST_FILE],
        'embeddings': embedding_params,
        'save_models': True,
        'tokenizer': {'type': 'words and characters'},
        'data_generator': {},
        'show_summary_with_masking_info': True,
    })
    self.write_true_false_model_files()
    self.ensure_model_trains_and_loads(ClassificationModel, generator_args)
def test_dynamic_padding_works(self):
    # A `ClassificationModel` must train and reload when the data generator uses dynamic
    # padding with a batch size of 2.
    embedding_params = {'words': {'dimension': 4}, 'characters': {'dimension': 2}}
    padding_args = Params({
        'test_files': [self.TEST_FILE],
        'embeddings': embedding_params,
        'save_models': True,
        'tokenizer': {'type': 'words and characters'},
        'data_generator': {'dynamic_padding': True},
        'batch_size': 2,
    })
    self.write_true_false_model_files()
    self.ensure_model_trains_and_loads(ClassificationModel, padding_args)