def testBuildLexicon(self):
  empty_input_path = os.path.join(test_flags.temp_dir(), 'empty-input')
  lexicon_output_path = os.path.join(test_flags.temp_dir(), 'lexicon-output')

  with open(empty_input_path, 'w'):
    pass

  # The directory may already exist when running locally multiple times.
  if not os.path.exists(lexicon_output_path):
    os.mkdir(lexicon_output_path)

  # Just make sure this doesn't crash; the lexicon builder op is already
  # exercised in its own unit test.
  lexicon.build_lexicon(lexicon_output_path, empty_input_path)
def CreateLocalSpec(self, spec_path):
  master_spec = self.LoadSpec(spec_path)
  master_spec_name = os.path.basename(spec_path)
  outfile = os.path.join(test_flags.temp_dir(), master_spec_name)
  with open(outfile, 'w') as fout:
    fout.write(text_format.MessageToString(master_spec))
  return outfile
def testWordEmbeddingInitializerRepeatability(self):
  records_path = os.path.join(test_flags.temp_dir(), 'records2')
  writer = tf.python_io.TFRecordWriter(records_path)
  writer.write(self._token_embedding('.', [1, 2, 3]))  # 3 dims
  del writer

  # As long as there is one non-zero seed, the result should be repeatable.
  for seed1, seed2 in [(0, 1), (1, 0), (123, 456)]:
    with tf.Graph().as_default(), self.test_session():
      embeddings1 = gen_parser_ops.word_embedding_initializer(
          vectors=records_path,
          task_context=self._task_context,
          seed=seed1,
          seed2=seed2)
      embeddings2 = gen_parser_ops.word_embedding_initializer(
          vectors=records_path,
          task_context=self._task_context,
          seed=seed1,
          seed2=seed2)

      # The number of terms is based on the word map, which may change if the
      # test corpus is updated.  Just assert that there are some terms.
      self.assertGreater(tf.shape(embeddings1)[0].eval(), 0)
      self.assertGreater(tf.shape(embeddings2)[0].eval(), 0)
      self.assertEqual(tf.shape(embeddings1)[1].eval(), 3)
      self.assertEqual(tf.shape(embeddings2)[1].eval(), 3)
      self.assertAllEqual(embeddings1.eval(), embeddings2.eval())
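# For reference: the _token_embedding(...) helper used by the embedding tests
# above and below is defined elsewhere in this test class. A minimal sketch of
# what it presumably does, assuming the TokenEmbedding proto from syntaxnet's
# dictionary.proto; it is kept commented out here so it does not shadow the
# real helper:
#
#   def _token_embedding(self, token, vector):
#     embedding = dictionary_pb2.TokenEmbedding()
#     embedding.token = token
#     embedding.vector.values.extend(vector)
#     return embedding.SerializeToString()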
def ValidateTagToCategoryMap(self):
  with open(os.path.join(test_flags.temp_dir(), 'tag-to-category'), 'r') as f:
    entries = [line.strip().split('\t') for line in f.readlines()]
  for tag, category in entries:
    self.assertIn(tag, TAGS)
    self.assertIn(category, CATEGORIES)
def testModelExport(self):
  # Get the master spec and params for this graph.
  master_spec = self.LoadSpec('ud-hungarian.master-spec')
  params_path = os.path.join(
      test_flags.source_root(), 'dragnn/python/testdata/ud-hungarian.params')

  # Export the graph via SavedModel. (Here, we maintain a handle to the graph
  # for comparison, but that's usually not necessary.)
  export_path = os.path.join(test_flags.temp_dir(), 'export')
  dragnn_model_saver_lib.clean_output_paths(export_path)
  saver_graph = tf.Graph()

  shortened_to_original = dragnn_model_saver_lib.shorten_resource_paths(
      master_spec)

  dragnn_model_saver_lib.export_master_spec(master_spec, saver_graph)

  dragnn_model_saver_lib.export_to_graph(
      master_spec,
      params_path,
      export_path,
      saver_graph,
      export_moving_averages=False,
      build_runtime_graph=False)

  # Export the assets as well.
  dragnn_model_saver_lib.export_assets(master_spec, shortened_to_original,
                                       export_path)

  # Validate that the assets are all in the exported directory.
  path_set = self.ValidateAssetExistence(master_spec, export_path)

  # This master-spec has 4 unique assets. If there are more, we have not
  # uniquified the assets properly.
  self.assertEqual(len(path_set), 4)

  # Restore the graph from the checkpoint into a new Graph object.
  restored_graph = tf.Graph()
  restoration_config = tf.ConfigProto(
      log_device_placement=False,
      intra_op_parallelism_threads=10,
      inter_op_parallelism_threads=10)

  with tf.Session(graph=restored_graph, config=restoration_config) as sess:
    tf.saved_model.loader.load(sess, [tf.saved_model.tag_constants.SERVING],
                               export_path)

    averaged_hook_name, non_averaged_hook_name, _ = self.GetHookNodeNames(
        master_spec)

    # Check that the averaged runtime hook node does not exist.
    with self.assertRaises(KeyError):
      restored_graph.get_operation_by_name(averaged_hook_name)

    # Check that the non-averaged version also does not exist.
    with self.assertRaises(KeyError):
      restored_graph.get_operation_by_name(non_averaged_hook_name)
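# Note on testModelExport above: with export_moving_averages=False and
# build_runtime_graph=False, the exported graph is expected to contain no
# runtime hook nodes, which is why both hook-node lookups should raise
# KeyError. The averaged/hooked variant is exercised in
# testModelExportWithAveragesAndHooks below.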
def LoadMap(self, map_name):
  loaded_map = {}
  with open(os.path.join(test_flags.temp_dir(), map_name), 'r') as f:
    for line in f:
      entries = line.strip().split(' ')
      if len(entries) >= 2:
        loaded_map[' '.join(entries[:-1])] = entries[-1]
  return loaded_map
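# Hedged illustration (not part of the original suite): how the LoadMap helper
# above is typically consumed. Each retained line of a lexicon map file holds a
# term followed by a numeric value (presumably a frequency count), so a spot
# check over a map produced by the lexicon builder might look like the sketch
# below. The method name is an assumption, and it deliberately lacks the
# 'test' prefix so a test runner will not pick it up automatically.
def SketchLoadWordMap(self):
  word_map = self.LoadMap('word-map')
  self.assertTrue(word_map)  # the map should not be empty
  for term, count in word_map.items():
    self.assertTrue(term)  # every key is a non-empty term
    self.assertTrue(count.isdigit())  # every value parses as a count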
def testWordEmbeddingInitializerPresetRowNumber(self):
  records_path = os.path.join(test_flags.temp_dir(), 'records3')
  writer = tf.python_io.TFRecordWriter(records_path)
  writer.write(self._token_embedding('a', [1, 2, 3]))
  writer.write(self._token_embedding('b', [2, 3, 4]))
  writer.write(self._token_embedding('c', [3, 4, 5]))
  writer.write(self._token_embedding('d', [4, 5, 6]))
  writer.write(self._token_embedding('e', [5, 6, 7]))
  del writer

  vocabulary_path = os.path.join(test_flags.temp_dir(), 'vocabulary3')
  with open(vocabulary_path, 'w') as vocabulary_file:
    vocabulary_file.write('a\nc\ne\nx\n')  # 'x' not in pretrained embeddings

  # Enumerate a variety of configurations.
  for cache_vectors_locally in [False, True]:
    for num_special_embeddings in [None, 1, 2, 5]:  # None = use default of 3
      for override_num_embeddings in [-1, 8, 10]:
        with self.test_session():
          embeddings = gen_parser_ops.word_embedding_initializer(
              vectors=records_path,
              vocabulary=vocabulary_path,
              override_num_embeddings=override_num_embeddings,
              cache_vectors_locally=cache_vectors_locally,
              num_special_embeddings=num_special_embeddings)

          # Expect 4 embeddings from the vocabulary plus special embeddings.
          expected_num_embeddings = 4 + (num_special_embeddings or 3)
          if override_num_embeddings > 0:
            expected_num_embeddings = override_num_embeddings
          self.assertAllEqual([expected_num_embeddings, 3],
                              tf.shape(embeddings).eval())

          # The first 3 embeddings should be pretrained.
          norm_a = (1.0 + 4.0 + 9.0)**0.5
          norm_c = (9.0 + 16.0 + 25.0)**0.5
          norm_e = (25.0 + 36.0 + 49.0)**0.5
          self.assertAllClose(
              [[1.0 / norm_a, 2.0 / norm_a, 3.0 / norm_a],
               [3.0 / norm_c, 4.0 / norm_c, 5.0 / norm_c],
               [5.0 / norm_e, 6.0 / norm_e, 7.0 / norm_e]],
              embeddings[:3].eval())
def testWordEmbeddingInitializerVocabularyFileWithDuplicates(self):
  records_path = os.path.join(test_flags.temp_dir(), 'records4')
  writer = tf.python_io.TFRecordWriter(records_path)
  writer.write(self._token_embedding('a', [1, 2, 3]))
  writer.write(self._token_embedding('b', [2, 3, 4]))
  writer.write(self._token_embedding('c', [3, 4, 5]))
  writer.write(self._token_embedding('d', [4, 5, 6]))
  writer.write(self._token_embedding('e', [5, 6, 7]))
  del writer

  vocabulary_path = os.path.join(test_flags.temp_dir(), 'vocabulary4')
  with open(vocabulary_path, 'w') as vocabulary_file:
    vocabulary_file.write('a\nc\ne\nx\ny\nx')  # 'x' duplicated

  with self.test_session():
    with self.assertRaises(Exception):
      gen_parser_ops.word_embedding_initializer(
          vectors=records_path, vocabulary=vocabulary_path).eval()
def setUp(self):
  # Creates a task context with the correct testing paths.
  initial_task_context = os.path.join(
      test_flags.source_root(), 'syntaxnet/testdata/context.pbtxt')
  self._task_context = os.path.join(test_flags.temp_dir(), 'context.pbtxt')
  with open(initial_task_context, 'r') as fin:
    with open(self._task_context, 'w') as fout:
      fout.write(fin.read().replace('SRCDIR', test_flags.source_root())
                 .replace('OUTPATH', test_flags.temp_dir()))

  # Creates necessary term maps.
  with self.test_session() as sess:
    gen_parser_ops.lexicon_builder(
        task_context=self._task_context,
        corpus_name='training-corpus').run()
    self._num_features, self._num_feature_ids, _, self._num_actions = (
        sess.run(gen_parser_ops.feature_size(
            task_context=self._task_context, arg_prefix='brain_parser')))
def testModelExportProducesRunnableModel(self):
  # Get the master spec and params for this graph.
  master_spec = self.LoadSpec('ud-hungarian.master-spec')
  params_path = os.path.join(
      test_flags.source_root(), 'dragnn/python/testdata/ud-hungarian.params')

  # Export the graph via SavedModel. (Here, we maintain a handle to the graph
  # for comparison, but that's usually not necessary.)
  export_path = os.path.join(test_flags.temp_dir(), 'export')
  dragnn_model_saver_lib.clean_output_paths(export_path)
  saver_graph = tf.Graph()

  shortened_to_original = dragnn_model_saver_lib.shorten_resource_paths(
      master_spec)

  dragnn_model_saver_lib.export_master_spec(master_spec, saver_graph)

  dragnn_model_saver_lib.export_to_graph(
      master_spec,
      params_path,
      export_path,
      saver_graph,
      export_moving_averages=False,
      build_runtime_graph=False)

  # Export the assets as well.
  dragnn_model_saver_lib.export_assets(master_spec, shortened_to_original,
                                       export_path)

  # Restore the graph from the checkpoint into a new Graph object.
  restored_graph = tf.Graph()
  restoration_config = tf.ConfigProto(
      log_device_placement=False,
      intra_op_parallelism_threads=10,
      inter_op_parallelism_threads=10)

  with tf.Session(graph=restored_graph, config=restoration_config) as sess:
    tf.saved_model.loader.load(sess, [tf.saved_model.tag_constants.SERVING],
                               export_path)

    test_doc = sentence_pb2.Sentence()
    text_format.Parse(_DUMMY_TEST_SENTENCE, test_doc)
    test_reader_string = test_doc.SerializeToString()
    test_inputs = [test_reader_string]

    tf_out = sess.run(
        'annotation/annotations:0',
        feed_dict={'annotation/ComputeSession/InputBatch:0': test_inputs})

    # We don't care about accuracy, only that running the session doesn't
    # crash.
    del tf_out
def WriteContext(self, corpus_format):
  context = task_spec_pb2.TaskSpec()
  self.AddInput('documents', self.corpus_file, corpus_format, context)
  for name in ('word-map', 'lcword-map', 'tag-map', 'category-map',
               'label-map', 'prefix-table', 'suffix-table',
               'tag-to-category'):
    self.AddInput(name, os.path.join(test_flags.temp_dir(), name), '', context)
  logging.info('Writing context to: %s', self.context_file)
  with open(self.context_file, 'w') as f:
    f.write(str(context))
def WriteContext(self, corpus_format):
  context = task_spec_pb2.TaskSpec()
  self.AddParameter('brain_parser_embedding_names', 'words;tags', context)
  self.AddParameter('brain_parser_features', 'input.token.word;input.tag',
                    context)
  self.AddInput('documents', self.corpus_file, corpus_format, context)
  for name in ('word-map', 'lcword-map', 'tag-map', 'category-map',
               'label-map', 'prefix-table', 'suffix-table',
               'tag-to-category', 'char-map', 'char-ngram-map'):
    self.AddInput(name, os.path.join(test_flags.temp_dir(), name), '', context)
  logging.info('Writing context to: %s', self.context_file)
  with open(self.context_file, 'w') as f:
    f.write(str(context))
def testWordEmbeddingInitializer(self):
  # Provide embeddings for the first three words in the word map.
  records_path = os.path.join(test_flags.temp_dir(), 'records1')
  writer = tf.python_io.TFRecordWriter(records_path)
  writer.write(self._token_embedding('.', [1, 2]))
  writer.write(self._token_embedding(',', [3, 4]))
  writer.write(self._token_embedding('the', [5, 6]))
  del writer

  with self.test_session():
    embeddings = gen_parser_ops.word_embedding_initializer(
        vectors=records_path, task_context=self._task_context).eval()
  self.assertAllClose(
      np.array([[1. / (1 + 4)**.5, 2. / (1 + 4)**.5],
                [3. / (9 + 16)**.5, 4. / (9 + 16)**.5],
                [5. / (25 + 36)**.5, 6. / (25 + 36)**.5]]),
      embeddings[:3, ])
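# Note on the expected values above: each expected row is the corresponding
# pretrained vector divided by its Euclidean norm, i.e. the initializer
# appears to L2-normalize the vectors it reads from the TFRecord file, which
# is why the assertions divide by (1 + 4)**.5 and so on.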
def testModelExportWithAveragesAndHooks(self):
  # Get the master spec and params for this graph.
  master_spec = self.LoadSpec('ud-hungarian.master-spec')
  params_path = os.path.join(
      test_flags.source_root(), 'dragnn/python/testdata/ud-hungarian.params')

  # Export the graph via SavedModel. (Here, we maintain a handle to the graph
  # for comparison, but that's usually not necessary.) Note that the export
  # path must not already exist.
  export_path = os.path.join(test_flags.temp_dir(), 'export2')
  dragnn_model_saver_lib.clean_output_paths(export_path)
  saver_graph = tf.Graph()

  shortened_to_original = dragnn_model_saver_lib.shorten_resource_paths(
      master_spec)

  dragnn_model_saver_lib.export_master_spec(master_spec, saver_graph)

  dragnn_model_saver_lib.export_to_graph(
      master_spec,
      params_path,
      export_path,
      saver_graph,
      export_moving_averages=True,
      build_runtime_graph=True)

  # Export the assets as well.
  dragnn_model_saver_lib.export_assets(master_spec, shortened_to_original,
                                       export_path)

  # Validate that the assets are all in the exported directory.
  path_set = self.ValidateAssetExistence(master_spec, export_path)

  # This master-spec has 4 unique assets. If there are more, we have not
  # uniquified the assets properly.
  self.assertEqual(len(path_set), 4)

  # Restore the graph from the checkpoint into a new Graph object.
  restored_graph = tf.Graph()
  restoration_config = tf.ConfigProto(
      log_device_placement=False,
      intra_op_parallelism_threads=10,
      inter_op_parallelism_threads=10)

  with tf.Session(graph=restored_graph, config=restoration_config) as sess:
    tf.saved_model.loader.load(sess, [tf.saved_model.tag_constants.SERVING],
                               export_path)

    averaged_hook_name, non_averaged_hook_name, cell_subgraph_hook_name = (
        self.GetHookNodeNames(master_spec))

    # Check that an averaged runtime hook node exists.
    restored_graph.get_operation_by_name(averaged_hook_name)

    # Check that the non-averaged version does not exist.
    with self.assertRaises(KeyError):
      restored_graph.get_operation_by_name(non_averaged_hook_name)

    # Load the cell subgraph.
    cell_subgraph_bytes = restored_graph.get_tensor_by_name(
        cell_subgraph_hook_name + ':0')
    cell_subgraph_bytes = cell_subgraph_bytes.eval(
        feed_dict={'annotation/ComputeSession/InputBatch:0': []})
    cell_subgraph_spec = export_pb2.CellSubgraphSpec()
    cell_subgraph_spec.ParseFromString(cell_subgraph_bytes)
    tf.logging.info('cell_subgraph_spec = %s', cell_subgraph_spec)

    # Sanity check inputs.
    for cell_input in cell_subgraph_spec.input:
      self.assertGreater(len(cell_input.name), 0)
      self.assertGreater(len(cell_input.tensor), 0)
      self.assertNotEqual(cell_input.type,
                          export_pb2.CellSubgraphSpec.Input.TYPE_UNKNOWN)
      restored_graph.get_tensor_by_name(cell_input.tensor)  # shouldn't raise

    # Sanity check outputs.
    for cell_output in cell_subgraph_spec.output:
      self.assertGreater(len(cell_output.name), 0)
      self.assertGreater(len(cell_output.tensor), 0)
      restored_graph.get_tensor_by_name(cell_output.tensor)  # shouldn't raise

    # GetHookNames() finds a component with a fixed feature, so at least the
    # first feature ID should exist.
    self.assertTrue(
        any(cell_input.name == 'fixed_channel_0_index_0_ids'
            for cell_input in cell_subgraph_spec.input))

    # Most dynamic components produce a logits layer.
    self.assertTrue(
        any(cell_output.name == 'logits'
            for cell_output in cell_subgraph_spec.output))
def setUp(self):
  self.corpus_file = os.path.join(test_flags.temp_dir(), 'documents.conll')
  self.context_file = os.path.join(test_flags.temp_dir(), 'context.pbtxt')
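# Hedged illustration (not part of the original tests): how the fixture paths
# above are typically used together. A test writes a corpus to
# self.corpus_file in a supported format and then points a task context at it
# via WriteContext. The corpus constant, the 'conll-sentence' format string,
# and the method name below are assumptions for illustration only; the method
# lacks the 'test' prefix so a runner will not pick it up automatically.
def SketchWriteContextForConllCorpus(self):
  with open(self.corpus_file, 'w') as f:
    f.write(_SOME_CONLL_DOC)  # hypothetical module-level CoNLL corpus constant
  self.WriteContext('conll-sentence')
  self.assertTrue(os.path.exists(self.context_file))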
def RunFullTrainingAndInference(self,
                                test_name,
                                master_spec_path=None,
                                master_spec=None,
                                hyperparam_config=None,
                                component_weights=None,
                                unroll_using_oracle=None,
                                num_evaluated_components=1,
                                expected_num_actions=None,
                                expected=None,
                                batch_size_limit=None):
  if not master_spec:
    master_spec = self.LoadSpec(master_spec_path)

  gold_doc = sentence_pb2.Sentence()
  text_format.Parse(_DUMMY_GOLD_SENTENCE, gold_doc)
  gold_doc_2 = sentence_pb2.Sentence()
  text_format.Parse(_DUMMY_GOLD_SENTENCE_2, gold_doc_2)
  gold_reader_strings = [
      gold_doc.SerializeToString(),
      gold_doc_2.SerializeToString()
  ]

  test_doc = sentence_pb2.Sentence()
  text_format.Parse(_DUMMY_TEST_SENTENCE, test_doc)
  test_doc_2 = sentence_pb2.Sentence()
  text_format.Parse(_DUMMY_TEST_SENTENCE_2, test_doc_2)
  test_reader_strings = [
      test_doc.SerializeToString(),
      test_doc.SerializeToString(),
      test_doc_2.SerializeToString(),
      test_doc.SerializeToString()
  ]

  if batch_size_limit is not None:
    gold_reader_strings = gold_reader_strings[:batch_size_limit]
    test_reader_strings = test_reader_strings[:batch_size_limit]

  with tf.Graph().as_default():
    tf.set_random_seed(1)
    if not hyperparam_config:
      hyperparam_config = spec_pb2.GridPoint()
    builder = graph_builder.MasterBuilder(
        master_spec, hyperparam_config, pool_scope=test_name)
    target = spec_pb2.TrainTarget()
    target.name = 'testFullInference-train-%s' % test_name
    if component_weights:
      target.component_weights.extend(component_weights)
    else:
      target.component_weights.extend([0] * len(master_spec.component))
      target.component_weights[-1] = 1.0
    if unroll_using_oracle:
      target.unroll_using_oracle.extend(unroll_using_oracle)
    else:
      target.unroll_using_oracle.extend([False] * len(master_spec.component))
      target.unroll_using_oracle[-1] = True
    train = builder.add_training_from_config(target)
    oracle_trace = builder.add_training_from_config(
        target, prefix='train_traced-', trace_only=True)
    builder.add_saver()
    anno = builder.add_annotation(test_name)
    trace = builder.add_annotation(test_name + '-traced', enable_tracing=True)

    # Verifies that the summaries can be built.
    for component in builder.components:
      component.get_summaries()

    config = tf.ConfigProto(
        intra_op_parallelism_threads=0, inter_op_parallelism_threads=0)
    with self.test_session(config=config) as sess:
      logging.info('Initializing')
      sess.run(tf.global_variables_initializer())

      logging.info('Dry run oracle trace...')
      traces = sess.run(
          oracle_trace['traces'],
          feed_dict={oracle_trace['input_batch']: gold_reader_strings})

      # Check that the oracle traces are not empty.
      for serialized_trace in traces:
        master_trace = trace_pb2.MasterTrace()
        master_trace.ParseFromString(serialized_trace)
        self.assertTrue(master_trace.component_trace)
        self.assertTrue(master_trace.component_trace[0].step_trace)

      logging.info('Simulating training...')
      break_iter = 400
      is_resolved = False
      # Needs ~100 iterations, but is not deterministic.
      for i in range(0, 400):
        cost, eval_res_val = sess.run(
            [train['cost'], train['metrics']],
            feed_dict={train['input_batch']: gold_reader_strings})
        logging.info('cost = %s', cost)
        self.assertFalse(np.isnan(cost))
        total_val = eval_res_val.reshape((-1, 2))[:, 0].sum()
        correct_val = eval_res_val.reshape((-1, 2))[:, 1].sum()
        if correct_val == total_val and not is_resolved:
          logging.info('... converged on iteration %d with (correct, total) '
                       '= (%d, %d)', i, correct_val, total_val)
          is_resolved = True
          # Run for slightly longer than convergence to help with quantized
          # weight tiebreakers.
          break_iter = i + 50
        if i == break_iter:
          break

      # If training failed, report total/correct actions for each component.
      if not expected_num_actions:
        expected_num_actions = 4 * num_evaluated_components
      if (correct_val != total_val or correct_val != expected_num_actions or
          total_val != expected_num_actions):
        for c in xrange(len(master_spec.component)):
          logging.error('component %s:\nname=%s\ntotal=%s\ncorrect=%s', c,
                        master_spec.component[c].name, eval_res_val[2 * c],
                        eval_res_val[2 * c + 1])

      assert correct_val == total_val, 'Did not converge! %d vs %d.' % (
          correct_val, total_val)

      self.assertEqual(expected_num_actions, correct_val)
      self.assertEqual(expected_num_actions, total_val)

      builder.saver.save(sess, os.path.join(test_flags.temp_dir(), 'model'))

      logging.info('Running test.')
      logging.info('Printing annotations')
      annotations = sess.run(
          anno['annotations'],
          feed_dict={anno['input_batch']: test_reader_strings})
      logging.info('Put %d inputs in, got %d annotations out.',
                   len(test_reader_strings), len(annotations))

      # Also run the annotation graph with tracing enabled.
      annotations_with_trace, traces = sess.run(
          [trace['annotations'], trace['traces']],
          feed_dict={trace['input_batch']: test_reader_strings})

      # The result of the two annotation graphs should be identical.
      self.assertItemsEqual(annotations, annotations_with_trace)

      # Check that the inference traces are not empty.
      for serialized_trace in traces:
        master_trace = trace_pb2.MasterTrace()
        master_trace.ParseFromString(serialized_trace)
        self.assertTrue(master_trace.component_trace)
        self.assertTrue(master_trace.component_trace[0].step_trace)

      self.assertEqual(len(test_reader_strings), len(annotations))
      pred_sentences = []
      for annotation in annotations:
        pred_sentences.append(sentence_pb2.Sentence())
        pred_sentences[-1].ParseFromString(annotation)

      if expected is None:
        expected = _TAGGER_EXPECTED_SENTENCES

      expected_sentences = [expected[i] for i in [0, 0, 1, 0]]

      for i, pred_sentence in enumerate(pred_sentences):
        self.assertProtoEquals(expected_sentences[i], pred_sentence)