Example #1
  def testBuildLexicon(self):
    empty_input_path = os.path.join(test_flags.temp_dir(), 'empty-input')
    lexicon_output_path = os.path.join(test_flags.temp_dir(), 'lexicon-output')

    with open(empty_input_path, 'w'):
      pass

    # The directory may already exist when running locally multiple times.
    if not os.path.exists(lexicon_output_path):
      os.mkdir(lexicon_output_path)

    # Just make sure this doesn't crash; the lexicon builder op is already
    # exercised in its own unit test.
    lexicon.build_lexicon(lexicon_output_path, empty_input_path)
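Every example on this page resolves its test paths through a small test_flags helper module whose implementation is not shown. A minimal stand-in, assuming the standard Bazel test environment variables (this sketch is an assumption, not the actual module), could look like:

import os

def temp_dir():
    # TEST_TMPDIR is set by the Bazel test runner; fall back to /tmp locally.
    return os.environ.get('TEST_TMPDIR', '/tmp')

def source_root():
    # TEST_SRCDIR points at the runfiles tree holding checked-in test data.
    return os.environ.get('TEST_SRCDIR', '')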
Example #2
    def testBuildLexicon(self):
        empty_input_path = os.path.join(test_flags.temp_dir(), 'empty-input')
        lexicon_output_path = os.path.join(test_flags.temp_dir(),
                                           'lexicon-output')

        with open(empty_input_path, 'w'):
            pass

        # The directory may already exist when running locally multiple times.
        if not os.path.exists(lexicon_output_path):
            os.mkdir(lexicon_output_path)

        # Just make sure this doesn't crash; the lexicon builder op is already
        # exercised in its own unit test.
        lexicon.build_lexicon(lexicon_output_path, empty_input_path)
Example #3
 def CreateLocalSpec(self, spec_path):
   master_spec = self.LoadSpec(spec_path)
   master_spec_name = os.path.basename(spec_path)
   outfile = os.path.join(test_flags.temp_dir(), master_spec_name)
   with open(outfile, 'w') as fout:
     fout.write(text_format.MessageToString(master_spec))
   return outfile
Example #4
    def testWordEmbeddingInitializerRepeatability(self):
        records_path = os.path.join(test_flags.temp_dir(), 'records2')
        writer = tf.python_io.TFRecordWriter(records_path)
        writer.write(self._token_embedding('.', [1, 2, 3]))  # 3 dims
        del writer  # Destroying the writer flushes and closes the record file.

        # As long as there is one non-zero seed, the result should be repeatable.
        for seed1, seed2 in [(0, 1), (1, 0), (123, 456)]:
            with tf.Graph().as_default(), self.test_session():
                embeddings1 = gen_parser_ops.word_embedding_initializer(
                    vectors=records_path,
                    task_context=self._task_context,
                    seed=seed1,
                    seed2=seed2)
                embeddings2 = gen_parser_ops.word_embedding_initializer(
                    vectors=records_path,
                    task_context=self._task_context,
                    seed=seed1,
                    seed2=seed2)

                # The number of terms is based on the word map, which may change if the
                # test corpus is updated.  Just assert that there are some terms.
                self.assertGreater(tf.shape(embeddings1)[0].eval(), 0)
                self.assertGreater(tf.shape(embeddings2)[0].eval(), 0)
                self.assertEqual(tf.shape(embeddings1)[1].eval(), 3)
                self.assertEqual(tf.shape(embeddings2)[1].eval(), 3)
                self.assertAllEqual(embeddings1.eval(), embeddings2.eval())
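The _token_embedding helper used throughout these examples is defined elsewhere in the test class. A plausible sketch, assuming SyntaxNet's TokenEmbedding message from dictionary.proto (the proto and field names are assumptions inferred from how the records are consumed), serializes one token/vector pair per TFRecord:

def _token_embedding(self, token, vector):
    # Assumed message: dictionary_pb2.TokenEmbedding with a 'token' string
    # field and repeated float 'vector.values'.
    embedding = dictionary_pb2.TokenEmbedding()
    embedding.token = token
    embedding.vector.values.extend(vector)
    return embedding.SerializeToString()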
Example #5
 def CreateLocalSpec(self, spec_path):
     master_spec = self.LoadSpec(spec_path)
     master_spec_name = os.path.basename(spec_path)
     outfile = os.path.join(test_flags.temp_dir(), master_spec_name)
     with open(outfile, 'w') as fout:
         fout.write(text_format.MessageToString(master_spec))
     return outfile
Example #6
 def ValidateTagToCategoryMap(self):
     with open(os.path.join(test_flags.temp_dir(), 'tag-to-category'),
               'r') as f:
         entries = [line.strip().split('\t') for line in f.readlines()]
     for tag, category in entries:
         self.assertIn(tag, TAGS)
         self.assertIn(category, CATEGORIES)
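The tag-to-category file parsed above is plain tab-separated text, one tag/category pair per line. A minimal fixture that satisfies the loop (the tag and category values here are illustrative only) could be written as:

tag_to_category_path = os.path.join(test_flags.temp_dir(), 'tag-to-category')
with open(tag_to_category_path, 'w') as f:
    f.write('NN\tNOUN\n')   # each line: tag, tab, category
    f.write('VBZ\tVERB\n')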
Example #7
  def testWordEmbeddingInitializerRepeatability(self):
    records_path = os.path.join(test_flags.temp_dir(), 'records2')
    writer = tf.python_io.TFRecordWriter(records_path)
    writer.write(self._token_embedding('.', [1, 2, 3]))  # 3 dims
    del writer

    # As long as there is one non-zero seed, the result should be repeatable.
    for seed1, seed2 in [(0, 1), (1, 0), (123, 456)]:
      with tf.Graph().as_default(), self.test_session():
        embeddings1 = gen_parser_ops.word_embedding_initializer(
            vectors=records_path,
            task_context=self._task_context,
            seed=seed1,
            seed2=seed2)
        embeddings2 = gen_parser_ops.word_embedding_initializer(
            vectors=records_path,
            task_context=self._task_context,
            seed=seed1,
            seed2=seed2)

        # The number of terms is based on the word map, which may change if the
        # test corpus is updated.  Just assert that there are some terms.
        self.assertGreater(tf.shape(embeddings1)[0].eval(), 0)
        self.assertGreater(tf.shape(embeddings2)[0].eval(), 0)
        self.assertEqual(tf.shape(embeddings1)[1].eval(), 3)
        self.assertEqual(tf.shape(embeddings2)[1].eval(), 3)
        self.assertAllEqual(embeddings1.eval(), embeddings2.eval())
Example #8
    def testModelExport(self):
        # Get the master spec and params for this graph.
        master_spec = self.LoadSpec('ud-hungarian.master-spec')
        params_path = os.path.join(
            test_flags.source_root(), 'dragnn/python/testdata'
            '/ud-hungarian.params')

        # Export the graph via SavedModel. (Here, we maintain a handle to the graph
        # for comparison, but that's usually not necessary.)
        export_path = os.path.join(test_flags.temp_dir(), 'export')
        dragnn_model_saver_lib.clean_output_paths(export_path)
        saver_graph = tf.Graph()

        shortened_to_original = dragnn_model_saver_lib.shorten_resource_paths(
            master_spec)

        dragnn_model_saver_lib.export_master_spec(master_spec, saver_graph)

        dragnn_model_saver_lib.export_to_graph(master_spec,
                                               params_path,
                                               export_path,
                                               saver_graph,
                                               export_moving_averages=False,
                                               build_runtime_graph=False)

        # Export the assets as well.
        dragnn_model_saver_lib.export_assets(master_spec,
                                             shortened_to_original,
                                             export_path)

        # Validate that the assets are all in the exported directory.
        path_set = self.ValidateAssetExistence(master_spec, export_path)

        # This master-spec has 4 unique assets. If there are more, we have not
        # uniquified the assets properly.
        self.assertEqual(len(path_set), 4)

        # Restore the graph from the checkpoint into a new Graph object.
        restored_graph = tf.Graph()
        restoration_config = tf.ConfigProto(log_device_placement=False,
                                            intra_op_parallelism_threads=10,
                                            inter_op_parallelism_threads=10)

        with tf.Session(graph=restored_graph,
                        config=restoration_config) as sess:
            tf.saved_model.loader.load(sess,
                                       [tf.saved_model.tag_constants.SERVING],
                                       export_path)

        averaged_hook_name, non_averaged_hook_name, _ = self.GetHookNodeNames(
            master_spec)

        # Check that the averaged runtime hook node does not exist.
        with self.assertRaises(KeyError):
            restored_graph.get_operation_by_name(averaged_hook_name)

        # Check that the non-averaged version also does not exist.
        with self.assertRaises(KeyError):
            restored_graph.get_operation_by_name(non_averaged_hook_name)
Example #9
  def testModelExport(self):
    # Get the master spec and params for this graph.
    master_spec = self.LoadSpec('ud-hungarian.master-spec')
    params_path = os.path.join(
        test_flags.source_root(),
        'dragnn/python/testdata'
        '/ud-hungarian.params')

    # Export the graph via SavedModel. (Here, we maintain a handle to the graph
    # for comparison, but that's usually not necessary.)
    export_path = os.path.join(test_flags.temp_dir(), 'export')
    dragnn_model_saver_lib.clean_output_paths(export_path)
    saver_graph = tf.Graph()

    shortened_to_original = dragnn_model_saver_lib.shorten_resource_paths(
        master_spec)

    dragnn_model_saver_lib.export_master_spec(master_spec, saver_graph)

    dragnn_model_saver_lib.export_to_graph(
        master_spec,
        params_path,
        export_path,
        saver_graph,
        export_moving_averages=False,
        build_runtime_graph=False)

    # Export the assets as well.
    dragnn_model_saver_lib.export_assets(master_spec, shortened_to_original,
                                         export_path)

    # Validate that the assets are all in the exported directory.
    path_set = self.ValidateAssetExistence(master_spec, export_path)

    # This master-spec has 4 unique assets. If there are more, we have not
    # uniquified the assets properly.
    self.assertEqual(len(path_set), 4)

    # Restore the graph from the checkpoint into a new Graph object.
    restored_graph = tf.Graph()
    restoration_config = tf.ConfigProto(
        log_device_placement=False,
        intra_op_parallelism_threads=10,
        inter_op_parallelism_threads=10)

    with tf.Session(graph=restored_graph, config=restoration_config) as sess:
      tf.saved_model.loader.load(sess, [tf.saved_model.tag_constants.SERVING],
                                 export_path)

    averaged_hook_name, non_averaged_hook_name, _ = self.GetHookNodeNames(
        master_spec)

    # Check that the averaged runtime hook node does not exist.
    with self.assertRaises(KeyError):
      restored_graph.get_operation_by_name(averaged_hook_name)

    # Check that the non-averaged version also does not exist.
    with self.assertRaises(KeyError):
      restored_graph.get_operation_by_name(non_averaged_hook_name)
Example #10
 def LoadMap(self, map_name):
     loaded_map = {}
     with open(os.path.join(test_flags.temp_dir(), map_name), 'r') as f:
         for line in f:
             entries = line.strip().split(' ')
             if len(entries) >= 2:
                 loaded_map[' '.join(entries[:-1])] = entries[-1]
     return loaded_map
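LoadMap splits each line on spaces but joins everything before the last field back into the key, so multi-word terms survive intact. For instance, a map file containing the lines (contents hypothetical)

  the 103
  New York 17

would load as {'the': '103', 'New York': '17'}; note that the values remain strings.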
Example #11
    def testWordEmbeddingInitializerPresetRowNumber(self):
        records_path = os.path.join(test_flags.temp_dir(), 'records3')
        writer = tf.python_io.TFRecordWriter(records_path)
        writer.write(self._token_embedding('a', [1, 2, 3]))
        writer.write(self._token_embedding('b', [2, 3, 4]))
        writer.write(self._token_embedding('c', [3, 4, 5]))
        writer.write(self._token_embedding('d', [4, 5, 6]))
        writer.write(self._token_embedding('e', [5, 6, 7]))
        del writer

        vocabulary_path = os.path.join(test_flags.temp_dir(), 'vocabulary3')
        with open(vocabulary_path, 'w') as vocabulary_file:
            vocabulary_file.write(
                'a\nc\ne\nx\n')  # 'x' not in pretrained embeddings

        # Enumerate a variety of configurations.
        for cache_vectors_locally in [False, True]:
            for num_special_embeddings in [None, 1, 2,
                                           5]:  # None = use default of 3
                for override_num_embeddings in [-1, 8, 10]:
                    with self.test_session():
                        embeddings = gen_parser_ops.word_embedding_initializer(
                            vectors=records_path,
                            vocabulary=vocabulary_path,
                            override_num_embeddings=override_num_embeddings,
                            cache_vectors_locally=cache_vectors_locally,
                            num_special_embeddings=num_special_embeddings)

                        # Expect 4 embeddings from the vocabulary plus special embeddings.
                        expected_num_embeddings = 4 + (num_special_embeddings
                                                       or 3)
                        if override_num_embeddings > 0:
                            expected_num_embeddings = override_num_embeddings
                        self.assertAllEqual([expected_num_embeddings, 3],
                                            tf.shape(embeddings).eval())

                        # The first 3 embeddings should be pretrained.
                        norm_a = (1.0 + 4.0 + 9.0)**0.5
                        norm_c = (9.0 + 16.0 + 25.0)**0.5
                        norm_e = (25.0 + 36.0 + 49.0)**0.5
                        self.assertAllClose(
                            [[1.0 / norm_a, 2.0 / norm_a, 3.0 / norm_a],
                             [3.0 / norm_c, 4.0 / norm_c, 5.0 / norm_c],
                             [5.0 / norm_e, 6.0 / norm_e, 7.0 / norm_e]],
                            embeddings[:3].eval())
Example #12
    def testWordEmbeddingInitializerVocabularyFileWithDuplicates(self):
        records_path = os.path.join(test_flags.temp_dir(), 'records4')
        writer = tf.python_io.TFRecordWriter(records_path)
        writer.write(self._token_embedding('a', [1, 2, 3]))
        writer.write(self._token_embedding('b', [2, 3, 4]))
        writer.write(self._token_embedding('c', [3, 4, 5]))
        writer.write(self._token_embedding('d', [4, 5, 6]))
        writer.write(self._token_embedding('e', [5, 6, 7]))
        del writer

        vocabulary_path = os.path.join(test_flags.temp_dir(), 'vocabulary4')
        with open(vocabulary_path, 'w') as vocabulary_file:
            vocabulary_file.write('a\nc\ne\nx\ny\nx')  # 'x' duplicated

        with self.test_session():
            with self.assertRaises(Exception):
                gen_parser_ops.word_embedding_initializer(
                    vectors=records_path, vocabulary=vocabulary_path).eval()
Example #13
  def setUp(self):
    # Creates a task context with the correct testing paths.
    initial_task_context = os.path.join(test_flags.source_root(),
                                        'syntaxnet/'
                                        'testdata/context.pbtxt')
    self._task_context = os.path.join(test_flags.temp_dir(), 'context.pbtxt')
    with open(initial_task_context, 'r') as fin:
      with open(self._task_context, 'w') as fout:
        fout.write(fin.read().replace('SRCDIR', test_flags.source_root())
                   .replace('OUTPATH', test_flags.temp_dir()))

    # Creates necessary term maps.
    with self.test_session() as sess:
      gen_parser_ops.lexicon_builder(task_context=self._task_context,
                                     corpus_name='training-corpus').run()
      self._num_features, self._num_feature_ids, _, self._num_actions = (
          sess.run(gen_parser_ops.feature_size(task_context=self._task_context,
                                               arg_prefix='brain_parser')))
Example #14
  def testWordEmbeddingInitializerVocabularyFileWithDuplicates(self):
    records_path = os.path.join(test_flags.temp_dir(), 'records4')
    writer = tf.python_io.TFRecordWriter(records_path)
    writer.write(self._token_embedding('a', [1, 2, 3]))
    writer.write(self._token_embedding('b', [2, 3, 4]))
    writer.write(self._token_embedding('c', [3, 4, 5]))
    writer.write(self._token_embedding('d', [4, 5, 6]))
    writer.write(self._token_embedding('e', [5, 6, 7]))
    del writer

    vocabulary_path = os.path.join(test_flags.temp_dir(), 'vocabulary4')
    with open(vocabulary_path, 'w') as vocabulary_file:
      vocabulary_file.write('a\nc\ne\nx\ny\nx')  # 'x' duplicated

    with self.test_session():
      with self.assertRaises(Exception):
        gen_parser_ops.word_embedding_initializer(
            vectors=records_path, vocabulary=vocabulary_path).eval()
Example #15
  def setUp(self):
    # Creates a task context with the correct testing paths.
    initial_task_context = os.path.join(test_flags.source_root(),
                                        'syntaxnet/'
                                        'testdata/context.pbtxt')
    self._task_context = os.path.join(test_flags.temp_dir(), 'context.pbtxt')
    with open(initial_task_context, 'r') as fin:
      with open(self._task_context, 'w') as fout:
        fout.write(fin.read().replace('SRCDIR', test_flags.source_root())
                   .replace('OUTPATH', test_flags.temp_dir()))

    # Creates necessary term maps.
    with self.test_session() as sess:
      gen_parser_ops.lexicon_builder(task_context=self._task_context,
                                     corpus_name='training-corpus').run()
      self._num_features, self._num_feature_ids, _, self._num_actions = (
          sess.run(gen_parser_ops.feature_size(task_context=self._task_context,
                                               arg_prefix='brain_parser')))
Example #16
    def testModelExportProducesRunnableModel(self):
        # Get the master spec and params for this graph.
        master_spec = self.LoadSpec('ud-hungarian.master-spec')
        params_path = os.path.join(
            test_flags.source_root(), 'dragnn/python/testdata'
            '/ud-hungarian.params')

        # Export the graph via SavedModel. (Here, we maintain a handle to the graph
        # for comparison, but that's usually not necessary.)
        export_path = os.path.join(test_flags.temp_dir(), 'export')
        dragnn_model_saver_lib.clean_output_paths(export_path)
        saver_graph = tf.Graph()

        shortened_to_original = dragnn_model_saver_lib.shorten_resource_paths(
            master_spec)

        dragnn_model_saver_lib.export_master_spec(master_spec, saver_graph)

        dragnn_model_saver_lib.export_to_graph(master_spec,
                                               params_path,
                                               export_path,
                                               saver_graph,
                                               export_moving_averages=False,
                                               build_runtime_graph=False)

        # Export the assets as well.
        dragnn_model_saver_lib.export_assets(master_spec,
                                             shortened_to_original,
                                             export_path)

        # Restore the graph from the checkpoint into a new Graph object.
        restored_graph = tf.Graph()
        restoration_config = tf.ConfigProto(log_device_placement=False,
                                            intra_op_parallelism_threads=10,
                                            inter_op_parallelism_threads=10)

        with tf.Session(graph=restored_graph,
                        config=restoration_config) as sess:
            tf.saved_model.loader.load(sess,
                                       [tf.saved_model.tag_constants.SERVING],
                                       export_path)

            test_doc = sentence_pb2.Sentence()
            text_format.Parse(_DUMMY_TEST_SENTENCE, test_doc)
            test_reader_string = test_doc.SerializeToString()
            test_inputs = [test_reader_string]

            tf_out = sess.run('annotation/annotations:0',
                              feed_dict={
                                  'annotation/ComputeSession/InputBatch:0':
                                  test_inputs
                              })

            # We don't care about accuracy, only that the run sessions don't crash.
            del tf_out
Example #17
 def WriteContext(self, corpus_format):
     context = task_spec_pb2.TaskSpec()
     self.AddInput('documents', self.corpus_file, corpus_format, context)
     for name in ('word-map', 'lcword-map', 'tag-map', 'category-map',
                  'label-map', 'prefix-table', 'suffix-table',
                  'tag-to-category'):
         self.AddInput(name, os.path.join(test_flags.temp_dir(), name), '',
                       context)
     logging.info('Writing context to: %s', self.context_file)
     with open(self.context_file, 'w') as f:
         f.write(str(context))
Example #18
 def WriteContext(self, corpus_format):
   context = task_spec_pb2.TaskSpec()
   self.AddInput('documents', self.corpus_file, corpus_format, context)
   for name in ('word-map', 'lcword-map', 'tag-map', 'category-map',
                'label-map', 'prefix-table', 'suffix-table',
                'tag-to-category'):
     self.AddInput(name, os.path.join(test_flags.temp_dir(), name), '',
                   context)
   logging.info('Writing context to: %s', self.context_file)
   with open(self.context_file, 'w') as f:
     f.write(str(context))
Example #19
  def testWordEmbeddingInitializerPresetRowNumber(self):
    records_path = os.path.join(test_flags.temp_dir(), 'records3')
    writer = tf.python_io.TFRecordWriter(records_path)
    writer.write(self._token_embedding('a', [1, 2, 3]))
    writer.write(self._token_embedding('b', [2, 3, 4]))
    writer.write(self._token_embedding('c', [3, 4, 5]))
    writer.write(self._token_embedding('d', [4, 5, 6]))
    writer.write(self._token_embedding('e', [5, 6, 7]))
    del writer

    vocabulary_path = os.path.join(test_flags.temp_dir(), 'vocabulary3')
    with open(vocabulary_path, 'w') as vocabulary_file:
      vocabulary_file.write('a\nc\ne\nx\n')  # 'x' not in pretrained embeddings

    # Enumerate a variety of configurations.
    for cache_vectors_locally in [False, True]:
      for num_special_embeddings in [None, 1, 2, 5]:  # None = use default of 3
        for override_num_embeddings in [-1, 8, 10]:
          with self.test_session():
            embeddings = gen_parser_ops.word_embedding_initializer(
                vectors=records_path,
                vocabulary=vocabulary_path,
                override_num_embeddings=override_num_embeddings,
                cache_vectors_locally=cache_vectors_locally,
                num_special_embeddings=num_special_embeddings)

            # Expect 4 embeddings from the vocabulary plus special embeddings.
            expected_num_embeddings = 4 + (num_special_embeddings or 3)
            if override_num_embeddings > 0:
              expected_num_embeddings = override_num_embeddings
            self.assertAllEqual([expected_num_embeddings, 3],
                                tf.shape(embeddings).eval())

            # The first 3 embeddings should be pretrained.
            norm_a = (1.0 + 4.0 + 9.0)**0.5
            norm_c = (9.0 + 16.0 + 25.0)**0.5
            norm_e = (25.0 + 36.0 + 49.0)**0.5
            self.assertAllClose([[1.0 / norm_a, 2.0 / norm_a, 3.0 / norm_a], [
                3.0 / norm_c, 4.0 / norm_c, 5.0 / norm_c
            ], [5.0 / norm_e, 6.0 / norm_e, 7.0 / norm_e]],
                                embeddings[:3].eval())
Example #20
  def testModelExportProducesRunnableModel(self):
    # Get the master spec and params for this graph.
    master_spec = self.LoadSpec('ud-hungarian.master-spec')
    params_path = os.path.join(
        test_flags.source_root(),
        'dragnn/python/testdata'
        '/ud-hungarian.params')

    # Export the graph via SavedModel. (Here, we maintain a handle to the graph
    # for comparison, but that's usually not necessary.)
    export_path = os.path.join(test_flags.temp_dir(), 'export')
    dragnn_model_saver_lib.clean_output_paths(export_path)
    saver_graph = tf.Graph()

    shortened_to_original = dragnn_model_saver_lib.shorten_resource_paths(
        master_spec)

    dragnn_model_saver_lib.export_master_spec(master_spec, saver_graph)

    dragnn_model_saver_lib.export_to_graph(
        master_spec,
        params_path,
        export_path,
        saver_graph,
        export_moving_averages=False,
        build_runtime_graph=False)

    # Export the assets as well.
    dragnn_model_saver_lib.export_assets(master_spec, shortened_to_original,
                                         export_path)

    # Restore the graph from the checkpoint into a new Graph object.
    restored_graph = tf.Graph()
    restoration_config = tf.ConfigProto(
        log_device_placement=False,
        intra_op_parallelism_threads=10,
        inter_op_parallelism_threads=10)

    with tf.Session(graph=restored_graph, config=restoration_config) as sess:
      tf.saved_model.loader.load(sess, [tf.saved_model.tag_constants.SERVING],
                                 export_path)

      test_doc = sentence_pb2.Sentence()
      text_format.Parse(_DUMMY_TEST_SENTENCE, test_doc)
      test_reader_string = test_doc.SerializeToString()
      test_inputs = [test_reader_string]

      tf_out = sess.run(
          'annotation/annotations:0',
          feed_dict={'annotation/ComputeSession/InputBatch:0': test_inputs})

      # We don't care about accuracy, only that the run sessions don't crash.
      del tf_out
Example #21
 def WriteContext(self, corpus_format):
     context = task_spec_pb2.TaskSpec()
     self.AddParameter('brain_parser_embedding_names', 'words;tags',
                       context)
     self.AddParameter('brain_parser_features',
                       'input.token.word;input.tag', context)
     self.AddInput('documents', self.corpus_file, corpus_format, context)
     for name in ('word-map', 'lcword-map', 'tag-map', 'category-map',
                  'label-map', 'prefix-table', 'suffix-table',
                  'tag-to-category', 'char-map', 'char-ngram-map'):
         self.AddInput(name, os.path.join(test_flags.temp_dir(), name), '',
                       context)
     logging.info('Writing context to: %s', self.context_file)
     with open(self.context_file, 'w') as f:
         f.write(str(context))
Example #22
    def testWordEmbeddingInitializer(self):
        # Provide embeddings for the first three words in the word map.
        records_path = os.path.join(test_flags.temp_dir(), 'records1')
        writer = tf.python_io.TFRecordWriter(records_path)
        writer.write(self._token_embedding('.', [1, 2]))
        writer.write(self._token_embedding(',', [3, 4]))
        writer.write(self._token_embedding('the', [5, 6]))
        del writer

        with self.test_session():
            embeddings = gen_parser_ops.word_embedding_initializer(
                vectors=records_path, task_context=self._task_context).eval()
        self.assertAllClose(
            np.array([[1. / (1 + 4)**.5, 2. / (1 + 4)**.5],
                      [3. / (9 + 16)**.5, 4. / (9 + 16)**.5],
                      [5. / (25 + 36)**.5, 6. / (25 + 36)**.5]]),
            embeddings[:3, ])
Example #23
  def testWordEmbeddingInitializer(self):
    # Provide embeddings for the first three words in the word map.
    records_path = os.path.join(test_flags.temp_dir(), 'records1')
    writer = tf.python_io.TFRecordWriter(records_path)
    writer.write(self._token_embedding('.', [1, 2]))
    writer.write(self._token_embedding(',', [3, 4]))
    writer.write(self._token_embedding('the', [5, 6]))
    del writer

    with self.test_session():
      embeddings = gen_parser_ops.word_embedding_initializer(
          vectors=records_path,
          task_context=self._task_context).eval()
    self.assertAllClose(
        np.array([[1. / (1 + 4) ** .5, 2. / (1 + 4) ** .5],
                  [3. / (9 + 16) ** .5, 4. / (9 + 16) ** .5],
                  [5. / (25 + 36) ** .5, 6. / (25 + 36) ** .5]]),
        embeddings[:3,])
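The expected matrix above is just each input row rescaled to unit L2 norm. The same normalization can be spelled out with NumPy, which makes the assertion easier to read:

import numpy as np

vectors = np.array([[1., 2.], [3., 4.], [5., 6.]])
normalized = vectors / np.linalg.norm(vectors, axis=1, keepdims=True)
# normalized[0] == [1 / (1 + 4)**.5, 2 / (1 + 4)**.5], and so on per row.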
Example #24
    def RunFullTrainingAndInference(self,
                                    test_name,
                                    master_spec_path=None,
                                    master_spec=None,
                                    hyperparam_config=None,
                                    component_weights=None,
                                    unroll_using_oracle=None,
                                    num_evaluated_components=1,
                                    expected_num_actions=None,
                                    expected=None,
                                    batch_size_limit=None):
        if not master_spec:
            master_spec = self.LoadSpec(master_spec_path)

        gold_doc = sentence_pb2.Sentence()
        text_format.Parse(_DUMMY_GOLD_SENTENCE, gold_doc)
        gold_doc_2 = sentence_pb2.Sentence()
        text_format.Parse(_DUMMY_GOLD_SENTENCE_2, gold_doc_2)
        gold_reader_strings = [
            gold_doc.SerializeToString(),
            gold_doc_2.SerializeToString()
        ]

        test_doc = sentence_pb2.Sentence()
        text_format.Parse(_DUMMY_TEST_SENTENCE, test_doc)
        test_doc_2 = sentence_pb2.Sentence()
        text_format.Parse(_DUMMY_TEST_SENTENCE_2, test_doc_2)
        test_reader_strings = [
            test_doc.SerializeToString(),
            test_doc.SerializeToString(),
            test_doc_2.SerializeToString(),
            test_doc.SerializeToString()
        ]

        if batch_size_limit is not None:
            gold_reader_strings = gold_reader_strings[:batch_size_limit]
            test_reader_strings = test_reader_strings[:batch_size_limit]

        with tf.Graph().as_default():
            tf.set_random_seed(1)
            if not hyperparam_config:
                hyperparam_config = spec_pb2.GridPoint()
            builder = graph_builder.MasterBuilder(master_spec,
                                                  hyperparam_config,
                                                  pool_scope=test_name)
            target = spec_pb2.TrainTarget()
            target.name = 'testFullInference-train-%s' % test_name
            if component_weights:
                target.component_weights.extend(component_weights)
            else:
                target.component_weights.extend([0] *
                                                len(master_spec.component))
                target.component_weights[-1] = 1.0
            if unroll_using_oracle:
                target.unroll_using_oracle.extend(unroll_using_oracle)
            else:
                target.unroll_using_oracle.extend([False] *
                                                  len(master_spec.component))
                target.unroll_using_oracle[-1] = True
            train = builder.add_training_from_config(target)
            oracle_trace = builder.add_training_from_config(
                target, prefix='train_traced-', trace_only=True)
            builder.add_saver()

            anno = builder.add_annotation(test_name)
            trace = builder.add_annotation(test_name + '-traced',
                                           enable_tracing=True)

            # Verifies that the summaries can be built.
            for component in builder.components:
                component.get_summaries()

            config = tf.ConfigProto(intra_op_parallelism_threads=0,
                                    inter_op_parallelism_threads=0)
            with self.test_session(config=config) as sess:
                logging.info('Initializing')
                sess.run(tf.global_variables_initializer())

                logging.info('Dry run oracle trace...')
                traces = sess.run(oracle_trace['traces'],
                                  feed_dict={
                                      oracle_trace['input_batch']:
                                      gold_reader_strings
                                  })

                # Check that the oracle traces are not empty.
                for serialized_trace in traces:
                    master_trace = trace_pb2.MasterTrace()
                    master_trace.ParseFromString(serialized_trace)
                    self.assertTrue(master_trace.component_trace)
                    self.assertTrue(master_trace.component_trace[0].step_trace)

                logging.info('Simulating training...')
                break_iter = 400
                is_resolved = False
                for i in range(0, 400):
                    # Needs ~100 iterations, but is not deterministic.
                    cost, eval_res_val = sess.run(
                        [train['cost'], train['metrics']],
                        feed_dict={train['input_batch']: gold_reader_strings})
                    logging.info('cost = %s', cost)
                    self.assertFalse(np.isnan(cost))
                    total_val = eval_res_val.reshape((-1, 2))[:, 0].sum()
                    correct_val = eval_res_val.reshape((-1, 2))[:, 1].sum()
                    if correct_val == total_val and not is_resolved:
                        logging.info(
                            '... converged on iteration %d with (correct, total) '
                            '= (%d, %d)', i, correct_val, total_val)
                        is_resolved = True
                        # Run for slightly longer than convergence to help with quantized
                        # weight tiebreakers.
                        break_iter = i + 50

                    if i == break_iter:
                        break

                # If training failed, report total/correct actions for each component.
                if not expected_num_actions:
                    expected_num_actions = 4 * num_evaluated_components
                if (correct_val != total_val
                        or correct_val != expected_num_actions
                        or total_val != expected_num_actions):
                    for c in xrange(len(master_spec.component)):
                        logging.error(
                            'component %s:\nname=%s\ntotal=%s\ncorrect=%s', c,
                            master_spec.component[c].name, eval_res_val[2 * c],
                            eval_res_val[2 * c + 1])

                assert correct_val == total_val, 'Did not converge! %d vs %d.' % (
                    correct_val, total_val)

                self.assertEqual(expected_num_actions, correct_val)
                self.assertEqual(expected_num_actions, total_val)

                builder.saver.save(
                    sess, os.path.join(test_flags.temp_dir(), 'model'))

                logging.info('Running test.')
                logging.info('Printing annotations')
                annotations = sess.run(
                    anno['annotations'],
                    feed_dict={anno['input_batch']: test_reader_strings})
                logging.info('Put %d inputs in, got %d annotations out.',
                             len(test_reader_strings), len(annotations))

                # Also run the annotation graph with tracing enabled.
                annotations_with_trace, traces = sess.run(
                    [trace['annotations'], trace['traces']],
                    feed_dict={trace['input_batch']: test_reader_strings})

                # The result of the two annotation graphs should be identical.
                self.assertItemsEqual(annotations, annotations_with_trace)

                # Check that the inference traces are not empty.
                for serialized_trace in traces:
                    master_trace = trace_pb2.MasterTrace()
                    master_trace.ParseFromString(serialized_trace)
                    self.assertTrue(master_trace.component_trace)
                    self.assertTrue(master_trace.component_trace[0].step_trace)

                self.assertEqual(len(test_reader_strings), len(annotations))
                pred_sentences = []
                for annotation in annotations:
                    pred_sentences.append(sentence_pb2.Sentence())
                    pred_sentences[-1].ParseFromString(annotation)

                if expected is None:
                    expected = _TAGGER_EXPECTED_SENTENCES

                expected_sentences = [expected[i] for i in [0, 0, 1, 0]]

                for i, pred_sentence in enumerate(pred_sentences):
                    self.assertProtoEquals(expected_sentences[i],
                                           pred_sentence)
Example #25
  def testModelExportWithAveragesAndHooks(self):
    # Get the master spec and params for this graph.
    master_spec = self.LoadSpec('ud-hungarian.master-spec')
    params_path = os.path.join(
        test_flags.source_root(),
        'dragnn/python/testdata'
        '/ud-hungarian.params')

    # Export the graph via SavedModel. (Here, we maintain a handle to the graph
    # for comparison, but that's usually not necessary.)  Note that the export
    # path must not already exist.
    export_path = os.path.join(test_flags.temp_dir(), 'export2')
    dragnn_model_saver_lib.clean_output_paths(export_path)
    saver_graph = tf.Graph()

    shortened_to_original = dragnn_model_saver_lib.shorten_resource_paths(
        master_spec)

    dragnn_model_saver_lib.export_master_spec(master_spec, saver_graph)

    dragnn_model_saver_lib.export_to_graph(
        master_spec,
        params_path,
        export_path,
        saver_graph,
        export_moving_averages=True,
        build_runtime_graph=True)

    # Export the assets as well.
    dragnn_model_saver_lib.export_assets(master_spec, shortened_to_original,
                                         export_path)

    # Validate that the assets are all in the exported directory.
    path_set = self.ValidateAssetExistence(master_spec, export_path)

    # This master-spec has 4 unique assets. If there are more, we have not
    # uniquified the assets properly.
    self.assertEqual(len(path_set), 4)

    # Restore the graph from the checkpoint into a new Graph object.
    restored_graph = tf.Graph()
    restoration_config = tf.ConfigProto(
        log_device_placement=False,
        intra_op_parallelism_threads=10,
        inter_op_parallelism_threads=10)

    with tf.Session(graph=restored_graph, config=restoration_config) as sess:
      tf.saved_model.loader.load(sess, [tf.saved_model.tag_constants.SERVING],
                                 export_path)

      averaged_hook_name, non_averaged_hook_name, cell_subgraph_hook_name = (
          self.GetHookNodeNames(master_spec))

      # Check that an averaged runtime hook node exists.
      restored_graph.get_operation_by_name(averaged_hook_name)

      # Check that the non-averaged version does not exist.
      with self.assertRaises(KeyError):
        restored_graph.get_operation_by_name(non_averaged_hook_name)

      # Load the cell subgraph.
      cell_subgraph_bytes = restored_graph.get_tensor_by_name(
          cell_subgraph_hook_name + ':0')
      cell_subgraph_bytes = cell_subgraph_bytes.eval(
          feed_dict={'annotation/ComputeSession/InputBatch:0': []})
      cell_subgraph_spec = export_pb2.CellSubgraphSpec()
      cell_subgraph_spec.ParseFromString(cell_subgraph_bytes)
      tf.logging.info('cell_subgraph_spec = %s', cell_subgraph_spec)

      # Sanity check inputs.
      for cell_input in cell_subgraph_spec.input:
        self.assertGreater(len(cell_input.name), 0)
        self.assertGreater(len(cell_input.tensor), 0)
        self.assertNotEqual(cell_input.type,
                            export_pb2.CellSubgraphSpec.Input.TYPE_UNKNOWN)
        restored_graph.get_tensor_by_name(cell_input.tensor)  # shouldn't raise

      # Sanity check outputs.
      for cell_output in cell_subgraph_spec.output:
        self.assertGreater(len(cell_output.name), 0)
        self.assertGreater(len(cell_output.tensor), 0)
        restored_graph.get_tensor_by_name(cell_output.tensor)  # shouldn't raise

      # GetHookNames() finds a component with a fixed feature, so at least the
      # first feature ID should exist.
      self.assertTrue(
          any(cell_input.name == 'fixed_channel_0_index_0_ids'
              for cell_input in cell_subgraph_spec.input))

      # Most dynamic components produce a logits layer.
      self.assertTrue(
          any(cell_output.name == 'logits'
              for cell_output in cell_subgraph_spec.output))
Example #26
 def setUp(self):
   self.corpus_file = os.path.join(test_flags.temp_dir(), 'documents.conll')
   self.context_file = os.path.join(test_flags.temp_dir(), 'context.pbtxt')
Example #27
    def testModelExportWithAveragesAndHooks(self):
        # Get the master spec and params for this graph.
        master_spec = self.LoadSpec('ud-hungarian.master-spec')
        params_path = os.path.join(
            test_flags.source_root(), 'dragnn/python/testdata'
            '/ud-hungarian.params')

        # Export the graph via SavedModel. (Here, we maintain a handle to the graph
        # for comparison, but that's usually not necessary.)  Note that the export
        # path must not already exist.
        export_path = os.path.join(test_flags.temp_dir(), 'export2')
        dragnn_model_saver_lib.clean_output_paths(export_path)
        saver_graph = tf.Graph()

        shortened_to_original = dragnn_model_saver_lib.shorten_resource_paths(
            master_spec)

        dragnn_model_saver_lib.export_master_spec(master_spec, saver_graph)

        dragnn_model_saver_lib.export_to_graph(master_spec,
                                               params_path,
                                               export_path,
                                               saver_graph,
                                               export_moving_averages=True,
                                               build_runtime_graph=True)

        # Export the assets as well.
        dragnn_model_saver_lib.export_assets(master_spec,
                                             shortened_to_original,
                                             export_path)

        # Validate that the assets are all in the exported directory.
        path_set = self.ValidateAssetExistence(master_spec, export_path)

        # This master-spec has 4 unique assets. If there are more, we have not
        # uniquified the assets properly.
        self.assertEqual(len(path_set), 4)

        # Restore the graph from the checkpoint into a new Graph object.
        restored_graph = tf.Graph()
        restoration_config = tf.ConfigProto(log_device_placement=False,
                                            intra_op_parallelism_threads=10,
                                            inter_op_parallelism_threads=10)

        with tf.Session(graph=restored_graph,
                        config=restoration_config) as sess:
            tf.saved_model.loader.load(sess,
                                       [tf.saved_model.tag_constants.SERVING],
                                       export_path)

            averaged_hook_name, non_averaged_hook_name, cell_subgraph_hook_name = (
                self.GetHookNodeNames(master_spec))

            # Check that an averaged runtime hook node exists.
            restored_graph.get_operation_by_name(averaged_hook_name)

            # Check that the non-averaged version does not exist.
            with self.assertRaises(KeyError):
                restored_graph.get_operation_by_name(non_averaged_hook_name)

            # Load the cell subgraph.
            cell_subgraph_bytes = restored_graph.get_tensor_by_name(
                cell_subgraph_hook_name + ':0')
            cell_subgraph_bytes = cell_subgraph_bytes.eval(
                feed_dict={'annotation/ComputeSession/InputBatch:0': []})
            cell_subgraph_spec = export_pb2.CellSubgraphSpec()
            cell_subgraph_spec.ParseFromString(cell_subgraph_bytes)
            tf.logging.info('cell_subgraph_spec = %s', cell_subgraph_spec)

            # Sanity check inputs.
            for cell_input in cell_subgraph_spec.input:
                self.assertGreater(len(cell_input.name), 0)
                self.assertGreater(len(cell_input.tensor), 0)
                self.assertNotEqual(
                    cell_input.type,
                    export_pb2.CellSubgraphSpec.Input.TYPE_UNKNOWN)
                restored_graph.get_tensor_by_name(
                    cell_input.tensor)  # shouldn't raise

            # Sanity check outputs.
            for cell_output in cell_subgraph_spec.output:
                self.assertGreater(len(cell_output.name), 0)
                self.assertGreater(len(cell_output.tensor), 0)
                restored_graph.get_tensor_by_name(
                    cell_output.tensor)  # shouldn't raise

            # GetHookNames() finds a component with a fixed feature, so at least the
            # first feature ID should exist.
            self.assertTrue(
                any(cell_input.name == 'fixed_channel_0_index_0_ids'
                    for cell_input in cell_subgraph_spec.input))

            # Most dynamic components produce a logits layer.
            self.assertTrue(
                any(cell_output.name == 'logits'
                    for cell_output in cell_subgraph_spec.output))
Example #28
  def RunFullTrainingAndInference(self,
                                  test_name,
                                  master_spec_path=None,
                                  master_spec=None,
                                  hyperparam_config=None,
                                  component_weights=None,
                                  unroll_using_oracle=None,
                                  num_evaluated_components=1,
                                  expected_num_actions=None,
                                  expected=None,
                                  batch_size_limit=None):
    if not master_spec:
      master_spec = self.LoadSpec(master_spec_path)

    gold_doc = sentence_pb2.Sentence()
    text_format.Parse(_DUMMY_GOLD_SENTENCE, gold_doc)
    gold_doc_2 = sentence_pb2.Sentence()
    text_format.Parse(_DUMMY_GOLD_SENTENCE_2, gold_doc_2)
    gold_reader_strings = [
        gold_doc.SerializeToString(),
        gold_doc_2.SerializeToString()
    ]

    test_doc = sentence_pb2.Sentence()
    text_format.Parse(_DUMMY_TEST_SENTENCE, test_doc)
    test_doc_2 = sentence_pb2.Sentence()
    text_format.Parse(_DUMMY_TEST_SENTENCE_2, test_doc_2)
    test_reader_strings = [
        test_doc.SerializeToString(),
        test_doc.SerializeToString(),
        test_doc_2.SerializeToString(),
        test_doc.SerializeToString()
    ]

    if batch_size_limit is not None:
      gold_reader_strings = gold_reader_strings[:batch_size_limit]
      test_reader_strings = test_reader_strings[:batch_size_limit]

    with tf.Graph().as_default():
      tf.set_random_seed(1)
      if not hyperparam_config:
        hyperparam_config = spec_pb2.GridPoint()
      builder = graph_builder.MasterBuilder(
          master_spec, hyperparam_config, pool_scope=test_name)
      target = spec_pb2.TrainTarget()
      target.name = 'testFullInference-train-%s' % test_name
      if component_weights:
        target.component_weights.extend(component_weights)
      else:
        target.component_weights.extend([0] * len(master_spec.component))
        target.component_weights[-1] = 1.0
      if unroll_using_oracle:
        target.unroll_using_oracle.extend(unroll_using_oracle)
      else:
        target.unroll_using_oracle.extend([False] * len(master_spec.component))
        target.unroll_using_oracle[-1] = True
      train = builder.add_training_from_config(target)
      oracle_trace = builder.add_training_from_config(
          target, prefix='train_traced-', trace_only=True)
      builder.add_saver()

      anno = builder.add_annotation(test_name)
      trace = builder.add_annotation(test_name + '-traced', enable_tracing=True)

      # Verifies that the summaries can be built.
      for component in builder.components:
        component.get_summaries()

      config = tf.ConfigProto(
          intra_op_parallelism_threads=0, inter_op_parallelism_threads=0)
      with self.test_session(config=config) as sess:
        logging.info('Initializing')
        sess.run(tf.global_variables_initializer())

        logging.info('Dry run oracle trace...')
        traces = sess.run(
            oracle_trace['traces'],
            feed_dict={oracle_trace['input_batch']: gold_reader_strings})

        # Check that the oracle traces are not empty.
        for serialized_trace in traces:
          master_trace = trace_pb2.MasterTrace()
          master_trace.ParseFromString(serialized_trace)
          self.assertTrue(master_trace.component_trace)
          self.assertTrue(master_trace.component_trace[0].step_trace)

        logging.info('Simulating training...')
        break_iter = 400
        is_resolved = False
        for i in range(0, 400):
          # Needs ~100 iterations, but is not deterministic.
          cost, eval_res_val = sess.run(
              [train['cost'], train['metrics']],
              feed_dict={train['input_batch']: gold_reader_strings})
          logging.info('cost = %s', cost)
          self.assertFalse(np.isnan(cost))
          total_val = eval_res_val.reshape((-1, 2))[:, 0].sum()
          correct_val = eval_res_val.reshape((-1, 2))[:, 1].sum()
          if correct_val == total_val and not is_resolved:
            logging.info('... converged on iteration %d with (correct, total) '
                         '= (%d, %d)', i, correct_val, total_val)
            is_resolved = True
            # Run for slightly longer than convergence to help with quantized
            # weight tiebreakers.
            break_iter = i + 50

          if i == break_iter:
            break

        # If training failed, report total/correct actions for each component.
        if not expected_num_actions:
          expected_num_actions = 4 * num_evaluated_components
        if (correct_val != total_val or correct_val != expected_num_actions or
            total_val != expected_num_actions):
          for c in xrange(len(master_spec.component)):
            logging.error('component %s:\nname=%s\ntotal=%s\ncorrect=%s', c,
                          master_spec.component[c].name, eval_res_val[2 * c],
                          eval_res_val[2 * c + 1])

        assert correct_val == total_val, 'Did not converge! %d vs %d.' % (
            correct_val, total_val)

        self.assertEqual(expected_num_actions, correct_val)
        self.assertEqual(expected_num_actions, total_val)

        builder.saver.save(sess, os.path.join(test_flags.temp_dir(), 'model'))

        logging.info('Running test.')
        logging.info('Printing annotations')
        annotations = sess.run(
            anno['annotations'],
            feed_dict={anno['input_batch']: test_reader_strings})
        logging.info('Put %d inputs in, got %d annotations out.',
                     len(test_reader_strings), len(annotations))

        # Also run the annotation graph with tracing enabled.
        annotations_with_trace, traces = sess.run(
            [trace['annotations'], trace['traces']],
            feed_dict={trace['input_batch']: test_reader_strings})

        # The result of the two annotation graphs should be identical.
        self.assertItemsEqual(annotations, annotations_with_trace)

        # Check that the inference traces are not empty.
        for serialized_trace in traces:
          master_trace = trace_pb2.MasterTrace()
          master_trace.ParseFromString(serialized_trace)
          self.assertTrue(master_trace.component_trace)
          self.assertTrue(master_trace.component_trace[0].step_trace)

        self.assertEqual(len(test_reader_strings), len(annotations))
        pred_sentences = []
        for annotation in annotations:
          pred_sentences.append(sentence_pb2.Sentence())
          pred_sentences[-1].ParseFromString(annotation)

        if expected is None:
          expected = _TAGGER_EXPECTED_SENTENCES

        expected_sentences = [expected[i] for i in [0, 0, 1, 0]]

        for i, pred_sentence in enumerate(pred_sentences):
          self.assertProtoEquals(expected_sentences[i], pred_sentence)
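The metrics tensor decoded in this test interleaves (total, correct) action counts, one pair per component. A small NumPy sketch of the same bookkeeping, using hypothetical values for a two-component model:

import numpy as np

eval_res_val = np.array([4, 4, 4, 3])  # hypothetical (total, correct) pairs
pairs = eval_res_val.reshape((-1, 2))
total_val = pairs[:, 0].sum()    # 8
correct_val = pairs[:, 1].sum()  # 7
# Training counts as converged once correct_val == total_val.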
Example #29
 def setUp(self):
     self.corpus_file = os.path.join(test_flags.temp_dir(),
                                     'documents.conll')
     self.context_file = os.path.join(test_flags.temp_dir(),
                                      'context.pbtxt')