Example No. 1
  def testInitEmbeddingColumnWeightsFromCkpt(self):
    sparse_col = fc.sparse_column_with_hash_bucket(
        column_name="object_in_image", hash_bucket_size=4)
    # Create _EmbeddingColumn which randomly initializes embedding of size
    # [4, 16].
    embedding_col = fc.embedding_column(sparse_col, dimension=16)

    # Creating a SparseTensor which has all the ids possible for the given
    # vocab.
    input_tensor = sparse_tensor_lib.SparseTensor(
        indices=[[0, 0], [1, 1], [2, 2], [3, 3]],
        values=[0, 1, 2, 3],
        dense_shape=[4, 4])

    # Invoking 'layers.input_from_feature_columns' will create the embedding
    # variable. Creating under scope 'run_1' so as to prevent name conflicts
    # when creating embedding variable for 'embedding_column_pretrained'.
    with variable_scope.variable_scope("run_1"):
      with variable_scope.variable_scope(embedding_col.name):
        # This will return a [4, 16] tensor, the same as the embedding variable.
        embeddings = feature_column_ops.input_from_feature_columns({
            embedding_col: input_tensor
        }, [embedding_col])

    save = saver.Saver()
    ckpt_dir_prefix = os.path.join(self.get_temp_dir(),
                                   "init_embedding_col_w_from_ckpt")
    ckpt_dir = tempfile.mkdtemp(prefix=ckpt_dir_prefix)
    checkpoint_path = os.path.join(ckpt_dir, "model.ckpt")

    with self.test_session() as sess:
      sess.run(variables.global_variables_initializer())
      saved_embedding = embeddings.eval()
      save.save(sess, checkpoint_path)

    embedding_col_initialized = fc.embedding_column(
        sparse_id_column=sparse_col,
        dimension=16,
        ckpt_to_load_from=checkpoint_path,
        tensor_name_in_ckpt=("run_1/object_in_image_embedding/"
                             "input_from_feature_columns/object"
                             "_in_image_embedding/weights"))

    with variable_scope.variable_scope("run_2"):
      # This will initialize the embedding from the provided checkpoint and
      # return a [4, 16] tensor, the same as the embedding variable. Since we
      # didn't modify the embeddings, this should equal 'saved_embedding'.
      pretrained_embeddings = feature_column_ops.input_from_feature_columns({
          embedding_col_initialized: input_tensor
      }, [embedding_col_initialized])

    with self.test_session() as sess:
      sess.run(variables.global_variables_initializer())
      loaded_embedding = pretrained_embeddings.eval()

    self.assertAllClose(saved_embedding, loaded_embedding)
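Stripped of the test scaffolding, the checkpoint-loading pattern above comes down to one extra pair of arguments. A minimal sketch, assuming a TF 1.x contrib environment with the same 'fc' alias as the tests, and reusing the checkpoint path and tensor name produced above:

  # Minimal sketch (assumes checkpoint_path points at an existing checkpoint
  # that stores a [hash_bucket_size, dimension] weights tensor under the
  # given name).
  sparse_col = fc.sparse_column_with_hash_bucket(
      column_name="object_in_image", hash_bucket_size=4)
  pretrained_col = fc.embedding_column(
      sparse_col,
      dimension=16,
      ckpt_to_load_from=checkpoint_path,
      tensor_name_in_ckpt=("run_1/object_in_image_embedding/"
                           "input_from_feature_columns/object"
                           "_in_image_embedding/weights"))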
Example No. 3
  def testRegression_TensorData(self):
    """Tests regression using tensor data as input."""

    def _input_fn(num_epochs=None):
      features = {
          'age':
              input_lib.limit_epochs(
                  constant_op.constant([[.8], [.15], [0.]]),
                  num_epochs=num_epochs),
          'language':
              sparse_tensor.SparseTensor(
                  values=input_lib.limit_epochs(
                      ['en', 'fr', 'zh'], num_epochs=num_epochs),
                  indices=[[0, 0], [0, 1], [2, 0]],
                  dense_shape=[3, 2])
      }
      return features, constant_op.constant([1., 0., 0.2], dtype=dtypes.float32)

    language_column = feature_column.sparse_column_with_hash_bucket(
        'language', hash_bucket_size=20)
    feature_columns = [
        feature_column.embedding_column(
            language_column, dimension=1),
        feature_column.real_valued_column('age')
    ]

    regressor = dnn.DNNRegressor(
        feature_columns=feature_columns,
        hidden_units=[3, 3],
        config=run_config.RunConfig(tf_random_seed=1))

    regressor.fit(input_fn=_input_fn, steps=200)

    scores = regressor.evaluate(input_fn=_input_fn, steps=1)
    self.assertIn('loss', scores)
Example No. 4
 def testEmbeddingColumn(self):
   a = fc.sparse_column_with_hash_bucket(
       "aaa", hash_bucket_size=100, combiner="sum")
   b = fc.embedding_column(a, dimension=4, combiner="mean")
   self.assertEqual(b.sparse_id_column.name, "aaa")
   self.assertEqual(b.dimension, 4)
   self.assertEqual(b.combiner, "mean")
Example No. 5
  def testExport(self):
    """Tests export model for servo."""

    def input_fn():
      return {
          'age':
              constant_op.constant([1]),
          'language':
              sparse_tensor.SparseTensor(
                  values=['english'], indices=[[0, 0]], dense_shape=[1, 1])
      }, constant_op.constant([[1]])

    language = feature_column.sparse_column_with_hash_bucket('language', 100)
    feature_columns = [
        feature_column.real_valued_column('age'),
        feature_column.embedding_column(
            language, dimension=1)
    ]

    classifier = debug.DebugClassifier(config=run_config.RunConfig(
        tf_random_seed=1))
    classifier.fit(input_fn=input_fn, steps=5)

    def default_input_fn(unused_estimator, examples):
      return feature_column_ops.parse_feature_columns_from_examples(
          examples, feature_columns)

    export_dir = tempfile.mkdtemp()
    classifier.export(export_dir, input_fn=default_input_fn)
Example No. 7
  def testExport(self):
    """Tests export model for servo."""

    def input_fn():
      return {
          'age':
              constant_op.constant([1]),
          'language':
              sparse_tensor.SparseTensor(
                  values=['english'], indices=[[0, 0]], dense_shape=[1, 1])
      }, constant_op.constant([[1]])

    language = feature_column.sparse_column_with_hash_bucket('language', 100)
    feature_columns = [
        feature_column.real_valued_column('age'),
        feature_column.embedding_column(
            language, dimension=1)
    ]

    classifier = dnn.DNNClassifier(
        feature_columns=feature_columns, hidden_units=[3, 3])
    classifier.fit(input_fn=input_fn, steps=5)

    export_dir = tempfile.mkdtemp()
    classifier.export(export_dir)
Example No. 8
  def testMultipliesGradient(self):
    embedding_language = feature_column.embedding_column(
        feature_column.sparse_column_with_hash_bucket('language', 10),
        dimension=1,
        initializer=init_ops.constant_initializer(0.1))
    embedding_wire = feature_column.embedding_column(
        feature_column.sparse_column_with_hash_bucket('wire', 10),
        dimension=1,
        initializer=init_ops.constant_initializer(0.1))

    params = {
        'feature_columns': [embedding_language, embedding_wire],
        'head': head_lib._multi_class_head(2),
        'hidden_units': [1],
        # Set lr mult to 0. to keep embeddings constant.
        'embedding_lr_multipliers': {
            embedding_language: 0.0
        },
    }
    features = {
        'language':
            sparse_tensor.SparseTensor(
                values=['en', 'fr', 'zh'],
                indices=[[0, 0], [1, 0], [2, 0]],
                dense_shape=[3, 1]),
        'wire':
            sparse_tensor.SparseTensor(
                values=['omar', 'stringer', 'marlo'],
                indices=[[0, 0], [1, 0], [2, 0]],
                dense_shape=[3, 1]),
    }
    labels = constant_op.constant([[0], [0], [0]], dtype=dtypes.int32)
    model_ops = dnn._dnn_model_fn(features, labels, model_fn.ModeKeys.TRAIN,
                                  params)
    with monitored_session.MonitoredSession() as sess:
      language_var = dnn_linear_combined._get_embedding_variable(
          embedding_language, 'dnn', 'dnn/input_from_feature_columns')
      wire_var = dnn_linear_combined._get_embedding_variable(
          embedding_wire, 'dnn', 'dnn/input_from_feature_columns')
      for _ in range(2):
        _, language_value, wire_value = sess.run(
            [model_ops.train_op, language_var, wire_var])
      initial_value = np.full_like(language_value, 0.1)
      self.assertTrue(np.all(np.isclose(language_value, initial_value)))
      self.assertFalse(np.all(np.isclose(wire_value, initial_value)))
Example No. 9
 def testEmbeddingColumnDeepCopy(self):
   a = fc.sparse_column_with_hash_bucket(
       "aaa", hash_bucket_size=100, combiner="sum")
   column = fc.embedding_column(a, dimension=4, combiner="mean")
   column_copy = copy.deepcopy(column)
   self.assertEqual(column_copy.name, "aaa_embedding")
   self.assertEqual(column_copy.sparse_id_column.name, "aaa")
   self.assertEqual(column_copy.dimension, 4)
   self.assertEqual(column_copy.combiner, "mean")
Example No. 11
 def testEmbeddingMultiplier(self):
   embedding_language = feature_column.embedding_column(
       feature_column.sparse_column_with_hash_bucket('language', 10),
       dimension=1,
       initializer=init_ops.constant_initializer(0.1))
   classifier = dnn.DNNClassifier(
       feature_columns=[embedding_language],
       hidden_units=[3, 3],
       embedding_lr_multipliers={embedding_language: 0.8})
   self.assertEqual({
       embedding_language: 0.8
   }, classifier._estimator.params['embedding_lr_multipliers'])
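Read together with Example No. 8 above, the multiplier dict is keyed by the _EmbeddingColumn object itself, and a value of 0.0 zeroes that column's gradients entirely. A minimal sketch combining the two tests (same contrib modules as above, hypothetical standalone usage):

  embedding_language = feature_column.embedding_column(
      feature_column.sparse_column_with_hash_bucket('language', 10),
      dimension=1,
      initializer=init_ops.constant_initializer(0.1))
  classifier = dnn.DNNClassifier(
      feature_columns=[embedding_language],
      hidden_units=[3, 3],
      # 0.0 scales this embedding's gradients to zero, so the weights stay
      # at their initial 0.1; other columns train at the normal rate.
      embedding_lr_multipliers={embedding_language: 0.0})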
Example No. 12
  def testCreateSequenceFeatureSpec(self):
    sparse_col = fc.sparse_column_with_hash_bucket(
        "sparse_column", hash_bucket_size=100)
    embedding_col = fc.embedding_column(
        fc.sparse_column_with_hash_bucket(
            "sparse_column_for_embedding", hash_bucket_size=10),
        dimension=4)
    sparse_id_col = fc.sparse_column_with_keys("id_column",
                                               ["marlo", "omar", "stringer"])
    weighted_id_col = fc.weighted_sparse_column(sparse_id_col,
                                                "id_weights_column")
    real_valued_col1 = fc.real_valued_column("real_valued_column", dimension=2)
    real_valued_col2 = fc.real_valued_column(
        "real_valued_default_column", dimension=5, default_value=3.0)
    real_valued_col3 = fc._real_valued_var_len_column(
        "real_valued_var_len_column", default_value=3.0, is_sparse=True)
    real_valued_col4 = fc._real_valued_var_len_column(
        "real_valued_var_len_dense_column", default_value=4.0, is_sparse=False)

    feature_columns = set([
        sparse_col, embedding_col, weighted_id_col, real_valued_col1,
        real_valued_col2, real_valued_col3, real_valued_col4
    ])

    feature_spec = fc._create_sequence_feature_spec_for_parsing(feature_columns)

    expected_feature_spec = {
        "sparse_column":
            parsing_ops.VarLenFeature(dtypes.string),
        "sparse_column_for_embedding":
            parsing_ops.VarLenFeature(dtypes.string),
        "id_column":
            parsing_ops.VarLenFeature(dtypes.string),
        "id_weights_column":
            parsing_ops.VarLenFeature(dtypes.float32),
        "real_valued_column":
            parsing_ops.FixedLenSequenceFeature(
                shape=[2], dtype=dtypes.float32, allow_missing=False),
        "real_valued_default_column":
            parsing_ops.FixedLenSequenceFeature(
                shape=[5], dtype=dtypes.float32, allow_missing=True),
        "real_valued_var_len_column":
            parsing_ops.VarLenFeature(dtype=dtypes.float32),
        "real_valued_var_len_dense_column":
            parsing_ops.FixedLenSequenceFeature(
                shape=[], dtype=dtypes.float32, allow_missing=True,
                default_value=4.0),
    }

    self.assertDictEqual(expected_feature_spec, feature_spec)
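The spec built here is intended for the sequence_features argument of SequenceExample parsing. A rough sketch of that consumption, assuming 'serialized' is a hypothetical scalar string tensor holding one serialized tf.SequenceExample (the parsing call is core TF 1.x API, not part of the test above):

  # Hypothetical usage of the generated spec.
  sequence_spec = fc._create_sequence_feature_spec_for_parsing(feature_columns)
  _, sequence_tensors = parsing_ops.parse_single_sequence_example(
      serialized, sequence_features=sequence_spec)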
Example No. 14
  def testTrainWithPartitionedVariables(self):
    """Tests training with partitioned variables."""

    def _input_fn(num_epochs=None):
      features = {
          'age':
              input_lib.limit_epochs(
                  constant_op.constant([[.8], [.2], [.1]]),
                  num_epochs=num_epochs),
          'language':
              sparse_tensor.SparseTensor(
                  values=input_lib.limit_epochs(
                      ['en', 'fr', 'zh'], num_epochs=num_epochs),
                  indices=[[0, 0], [0, 1], [2, 0]],
                  dense_shape=[3, 2])
      }
      return features, constant_op.constant([[1], [0], [0]], dtype=dtypes.int32)

    # The given hash_bucket_size results in variables larger than the
    # default min_slice_size attribute, so the variables are partitioned.
    sparse_column = feature_column.sparse_column_with_hash_bucket(
        'language', hash_bucket_size=2e7)
    feature_columns = [
        feature_column.embedding_column(
            sparse_column, dimension=1)
    ]

    tf_config = {
        'cluster': {
            run_config.TaskType.PS: ['fake_ps_0', 'fake_ps_1']
        }
    }
    with test.mock.patch.dict('os.environ',
                              {'TF_CONFIG': json.dumps(tf_config)}):
      config = run_config.RunConfig(tf_random_seed=1)
      # Because we did not start a distributed cluster, we need to pass an
      # empty ClusterSpec, otherwise the device_setter will look for
      # distributed jobs, such as "/job:ps" which are not present.
      config._cluster_spec = server_lib.ClusterSpec({})

    classifier = dnn.DNNClassifier(
        n_classes=3,
        feature_columns=feature_columns,
        hidden_units=[3, 3],
        config=config)

    classifier.fit(input_fn=_input_fn, steps=5)
    scores = classifier.evaluate(input_fn=_input_fn, steps=1)
    self._assertInRange(0.0, 1.0, scores['accuracy'])
    self.assertIn('loss', scores)
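The partitioning itself is applied by the estimator's input-layer partitioner once a variable's shards exceed the default min_slice_size mentioned in the comment above. A rough sketch of the underlying mechanism (hypothetical standalone usage of TF 1.x partitioned_variables, not code from the test):

  # With two ps tasks, a [2e7, 1] float32 variable exceeds the default
  # min_slice_size, so it is split across partitions.
  partitioner = partitioned_variables.min_max_variable_partitioner(
      max_partitions=2)
  with variable_scope.variable_scope('input_layer', partitioner=partitioner):
    weights = variable_scope.get_variable(
        'weights', shape=[int(2e7), 1], dtype=dtypes.float32)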
Example No. 15
  def testPrepareInputsForRnnSparseAndDense(self):
    num_unroll = 2
    embedding_dimension = 8
    dense_dimension = 2

    expected = [
        np.array([[1., 1., 1., 1., 1., 1., 1., 1., 111., 112.],
                  [1., 1., 1., 1., 1., 1., 1., 1., 211., 212.],
                  [1., 1., 1., 1., 1., 1., 1., 1., 311., 312.]]),
        np.array([[1., 1., 1., 1., 1., 1., 1., 1., 121., 122.],
                  [2., 2., 2., 2., 2., 2., 2., 2., 221., 222.],
                  [1., 1., 1., 1., 1., 1., 1., 1., 321., 322.]])
    ]

    sequence_features = {
        'wire_cast':
            sparse_tensor.SparseTensor(
                indices=[[0, 0, 0], [0, 1, 0], [1, 0, 0], [1, 1, 0], [1, 1, 1],
                         [2, 0, 0], [2, 1, 1]],
                values=[
                    b'marlo', b'stringer', b'omar', b'stringer', b'marlo',
                    b'marlo', b'omar'
                ],
                dense_shape=[3, 2, 2]),
        'seq_feature0':
            constant_op.constant([[[111., 112.], [121., 122.]],
                                  [[211., 212.], [221., 222.]],
                                  [[311., 312.], [321., 322.]]])
    }

    wire_cast = feature_column.sparse_column_with_keys(
        'wire_cast', ['marlo', 'omar', 'stringer'])
    wire_cast_embedded = feature_column.embedding_column(
        wire_cast,
        dimension=embedding_dimension,
        combiner='sum',
        initializer=init_ops.ones_initializer())
    seq_feature0_column = feature_column.real_valued_column(
        'seq_feature0', dimension=dense_dimension)

    sequence_feature_columns = [seq_feature0_column, wire_cast_embedded]

    context_features = None

    self._test_prepare_inputs_for_rnn(sequence_features, context_features,
                                      sequence_feature_columns, num_unroll,
                                      expected)
Example No. 16
  def benchmarkLogisticFloatLabel(self):

    def _input_fn(num_epochs=None):
      features = {
          'age':
              input_lib.limit_epochs(
                  constant_op.constant(((50,), (20,), (10,))),
                  num_epochs=num_epochs),
          'language':
              sparse_tensor.SparseTensor(
                  values=input_lib.limit_epochs(
                      ('en', 'fr', 'zh'), num_epochs=num_epochs),
                  indices=((0, 0), (0, 1), (2, 0)),
                  dense_shape=(3, 2))
      }
      return features, constant_op.constant(
          ((0.8,), (0.,), (0.2,)), dtype=dtypes.float32)

    lang_column = feature_column.sparse_column_with_hash_bucket(
        'language', hash_bucket_size=20)
    n_classes = 2
    classifier = dnn.DNNClassifier(
        n_classes=n_classes,
        feature_columns=(feature_column.embedding_column(
            lang_column, dimension=1),
                         feature_column.real_valued_column('age')),
        hidden_units=(3, 3),
        config=run_config.RunConfig(tf_random_seed=1))
    steps = 1000
    metrics = classifier.fit(input_fn=_input_fn, steps=steps).evaluate(
        input_fn=_input_fn, steps=1)
    estimator_test_utils.assert_in_range(steps, steps + 5, 'global_step',
                                         metrics)

    # Prediction probabilities mirror the labels column, which proves that the
    # classifier learns from float input.
    self._report_metrics(metrics)
    self._report_predictions(
        classifier=classifier,
        input_fn=functools.partial(_input_fn, num_epochs=1),
        iters=metrics['global_step'],
        n_examples=3,
        n_classes=n_classes,
        expected_probabilities=((0.2, 0.8), (1., 0.), (0.8, 0.2)),
        expected_classes=(1, 0, 0),
        benchmark_name_override=(
            'DNNClassifierBenchmark.benchmarkLogisticFloatLabel_predictions'))
Example No. 19
  def testPredict_AsIterable(self):
    """Tests predict and predict_prob methods with as_iterable=True."""

    def _input_fn(num_epochs=None):
      features = {
          'age':
              input_lib.limit_epochs(
                  constant_op.constant([[.8], [.2], [.1]]),
                  num_epochs=num_epochs),
          'language':
              sparse_tensor.SparseTensor(
                  values=input_lib.limit_epochs(
                      ['en', 'fr', 'zh'], num_epochs=num_epochs),
                  indices=[[0, 0], [0, 1], [2, 0]],
                  dense_shape=[3, 2])
      }
      return features, constant_op.constant([[1], [0], [0]], dtype=dtypes.int32)

    language_column = feature_column.sparse_column_with_hash_bucket(
        'language', hash_bucket_size=20)
    feature_columns = [
        feature_column.embedding_column(
            language_column, dimension=1),
        feature_column.real_valued_column('age')
    ]

    classifier = dnn.DNNClassifier(
        n_classes=3,
        feature_columns=feature_columns,
        hidden_units=[3, 3],
        config=run_config.RunConfig(tf_random_seed=1))

    classifier.fit(input_fn=_input_fn, steps=200)

    scores = classifier.evaluate(input_fn=_input_fn, steps=1)
    self._assertInRange(0.0, 1.0, scores['accuracy'])
    self.assertIn('loss', scores)
    predict_input_fn = functools.partial(_input_fn, num_epochs=1)
    predictions = list(
        classifier.predict(
            input_fn=predict_input_fn, as_iterable=True))
    self.assertListEqual(predictions, [1, 0, 0])
    predictions = list(
        classifier.predict_proba(
            input_fn=predict_input_fn, as_iterable=True))
    self.assertAllClose(
        predictions, [[0., 1., 0.], [1., 0., 0.], [1., 0., 0.]], atol=0.3)
Example No. 20
  def setUp(self):
    super(DynamicRnnEstimatorTest, self).setUp()
    self.rnn_cell = rnn_cell.BasicRNNCell(self.NUM_RNN_CELL_UNITS)
    self.mock_target_column = MockTargetColumn(
        num_label_columns=self.NUM_LABEL_COLUMNS)

    location = feature_column.sparse_column_with_keys(
        'location', keys=['west_side', 'east_side', 'nyc'])
    location_onehot = feature_column.one_hot_column(location)
    self.context_feature_columns = [location_onehot]

    wire_cast = feature_column.sparse_column_with_keys(
        'wire_cast', ['marlo', 'omar', 'stringer'])
    wire_cast_embedded = feature_column.embedding_column(wire_cast, dimension=8)
    measurements = feature_column.real_valued_column(
        'measurements', dimension=2)
    self.sequence_feature_columns = [measurements, wire_cast_embedded]
Example No. 21
  def setUp(self):
    super(DynamicRnnEstimatorTest, self).setUp()
    self.rnn_cell = core_rnn_cell_impl.BasicRNNCell(self.NUM_RNN_CELL_UNITS)
    self.mock_target_column = MockTargetColumn(
        num_label_columns=self.NUM_LABEL_COLUMNS)

    location = feature_column.sparse_column_with_keys(
        'location', keys=['west_side', 'east_side', 'nyc'])
    location_onehot = feature_column.one_hot_column(location)
    self.context_feature_columns = [location_onehot]

    wire_cast = feature_column.sparse_column_with_keys(
        'wire_cast', ['marlo', 'omar', 'stringer'])
    wire_cast_embedded = feature_column.embedding_column(wire_cast, dimension=8)
    measurements = feature_column.real_valued_column(
        'measurements', dimension=2)
    self.sequence_feature_columns = [measurements, wire_cast_embedded]
Example No. 22
  def testTrainSaveLoad(self):
    """Tests that insures you can save and reload a trained model."""

    def _input_fn(num_epochs=None):
      features = {
          'age':
              input_lib.limit_epochs(
                  constant_op.constant([[.8], [.2], [.1]]),
                  num_epochs=num_epochs),
          'language':
              sparse_tensor.SparseTensor(
                  values=input_lib.limit_epochs(
                      ['en', 'fr', 'zh'], num_epochs=num_epochs),
                  indices=[[0, 0], [0, 1], [2, 0]],
                  dense_shape=[3, 2])
      }
      return features, constant_op.constant([[1], [0], [0]], dtype=dtypes.int32)

    sparse_column = feature_column.sparse_column_with_hash_bucket(
        'language', hash_bucket_size=20)
    feature_columns = [
        feature_column.embedding_column(
            sparse_column, dimension=1)
    ]

    model_dir = tempfile.mkdtemp()
    classifier = dnn.DNNClassifier(
        model_dir=model_dir,
        n_classes=3,
        feature_columns=feature_columns,
        hidden_units=[3, 3],
        config=run_config.RunConfig(tf_random_seed=1))

    classifier.fit(input_fn=_input_fn, steps=5)
    predict_input_fn = functools.partial(_input_fn, num_epochs=1)
    predictions1 = classifier.predict(input_fn=predict_input_fn)
    del classifier

    classifier2 = dnn.DNNClassifier(
        model_dir=model_dir,
        n_classes=3,
        feature_columns=feature_columns,
        hidden_units=[3, 3],
        config=run_config.RunConfig(tf_random_seed=1))
    predictions2 = classifier2.predict(input_fn=predict_input_fn)
    self.assertEqual(list(predictions1), list(predictions2))
Example No. 23
  def testTrainSaveLoad(self):
    """Tests that insures you can save and reload a trained model."""

    def _input_fn(num_epochs=None):
      features = {
          'age':
              input_lib.limit_epochs(
                  constant_op.constant([[0.8], [0.15], [0.]]),
                  num_epochs=num_epochs),
          'language':
              sparse_tensor.SparseTensor(
                  values=input_lib.limit_epochs(
                      ['en', 'fr', 'zh'], num_epochs=num_epochs),
                  indices=[[0, 0], [0, 1], [2, 0]],
                  dense_shape=[3, 2])
      }
      return features, constant_op.constant([1., 0., 0.2], dtype=dtypes.float32)

    sparse_column = feature_column.sparse_column_with_hash_bucket(
        'language', hash_bucket_size=20)
    feature_columns = [
        feature_column.embedding_column(
            sparse_column, dimension=1),
        feature_column.real_valued_column('age')
    ]

    model_dir = tempfile.mkdtemp()
    regressor = dnn.DNNRegressor(
        model_dir=model_dir,
        feature_columns=feature_columns,
        hidden_units=[3, 3],
        config=run_config.RunConfig(tf_random_seed=1))

    regressor.fit(input_fn=_input_fn, steps=5)
    predict_input_fn = functools.partial(_input_fn, num_epochs=1)
    predictions = list(regressor.predict(input_fn=predict_input_fn))
    del regressor

    regressor2 = dnn.DNNRegressor(
        model_dir=model_dir,
        feature_columns=feature_columns,
        hidden_units=[3, 3],
        config=run_config.RunConfig(tf_random_seed=1))
    predictions2 = list(regressor2.predict(input_fn=predict_input_fn))
    self.assertAllClose(predictions, predictions2)
Example No. 24
  def test_make_parsing_export_strategy(self):
    """Only tests that an ExportStrategy instance is created."""
    sparse_col = fc.sparse_column_with_hash_bucket(
        "sparse_column", hash_bucket_size=100)
    embedding_col = fc.embedding_column(
        fc.sparse_column_with_hash_bucket(
            "sparse_column_for_embedding", hash_bucket_size=10),
        dimension=4)
    real_valued_col1 = fc.real_valued_column("real_valued_column1")
    bucketized_col1 = fc.bucketized_column(
        fc.real_valued_column("real_valued_column_for_bucketization1"), [0, 4])
    feature_columns = [sparse_col, embedding_col, real_valued_col1,
                       bucketized_col1]

    export_strategy = saved_model_export_utils.make_parsing_export_strategy(
        feature_columns=feature_columns)
    self.assertTrue(
        isinstance(export_strategy, export_strategy_lib.ExportStrategy))
Example No. 26
  def testLogisticRegression_FloatLabel(self):
    """Tests binary classification with float labels."""

    def _input_fn_float_label(num_epochs=None):
      features = {
          'age':
              input_lib.limit_epochs(
                  constant_op.constant([[50], [20], [10]]),
                  num_epochs=num_epochs),
          'language':
              sparse_tensor.SparseTensor(
                  values=input_lib.limit_epochs(
                      ['en', 'fr', 'zh'], num_epochs=num_epochs),
                  indices=[[0, 0], [0, 1], [2, 0]],
                  dense_shape=[3, 2])
      }
      labels = constant_op.constant([[0.8], [0.], [0.2]], dtype=dtypes.float32)
      return features, labels

    language_column = feature_column.sparse_column_with_hash_bucket(
        'language', hash_bucket_size=20)
    feature_columns = [
        feature_column.embedding_column(
            language_column, dimension=1),
        feature_column.real_valued_column('age')
    ]

    classifier = dnn.DNNClassifier(
        n_classes=2,
        feature_columns=feature_columns,
        hidden_units=[3, 3],
        config=run_config.RunConfig(tf_random_seed=1))

    classifier.fit(input_fn=_input_fn_float_label, steps=50)

    predict_input_fn = functools.partial(_input_fn_float_label, num_epochs=1)
    predictions = list(
        classifier.predict(
            input_fn=predict_input_fn, as_iterable=True))
    self._assertBinaryPredictions(3, predictions)
    predictions_proba = list(
        classifier.predict_proba(
            input_fn=predict_input_fn, as_iterable=True))
    self._assertProbabilities(3, 2, predictions_proba)
Example No. 28
  def benchmarkLogisticTensorData(self):

    def _input_fn(num_epochs=None):
      features = {
          'age':
              input_lib.limit_epochs(
                  constant_op.constant(((.8,), (0.2,), (.1,))),
                  num_epochs=num_epochs),
          'language':
              sparse_tensor.SparseTensor(
                  values=input_lib.limit_epochs(
                      ('en', 'fr', 'zh'), num_epochs=num_epochs),
                  indices=((0, 0), (0, 1), (2, 0)),
                  dense_shape=(3, 2))
      }
      return features, constant_op.constant(
          ((1,), (0,), (0,)), dtype=dtypes.int32)

    lang_column = feature_column.sparse_column_with_hash_bucket(
        'language', hash_bucket_size=20)
    classifier = dnn.DNNClassifier(
        feature_columns=(feature_column.embedding_column(
            lang_column, dimension=1),
                         feature_column.real_valued_column('age')),
        hidden_units=(3, 3),
        config=run_config.RunConfig(tf_random_seed=1))
    steps = 100
    metrics = classifier.fit(input_fn=_input_fn, steps=steps).evaluate(
        input_fn=_input_fn, steps=1)
    estimator_test_utils.assert_in_range(steps, steps + 5, 'global_step',
                                         metrics)
    estimator_test_utils.assert_in_range(0.9, 1.0, 'accuracy', metrics)
    estimator_test_utils.assert_in_range(0.0, 0.3, 'loss', metrics)

    self._report_metrics(metrics)
    self._report_predictions(
        classifier=classifier,
        input_fn=functools.partial(_input_fn, num_epochs=1),
        iters=metrics['global_step'],
        n_examples=3,
        n_classes=2,
        expected_classes=(1, 0, 0),
        benchmark_name_override=(
            'DNNClassifierBenchmark.benchmarkLogisticTensorData_predictions'))
Example No. 29
  def testPredict_AsIterableFalse(self):
    """Tests predict and predict_prob methods with as_iterable=False."""

    def _input_fn(num_epochs=None):
      features = {
          'age':
              input_lib.limit_epochs(
                  constant_op.constant([[.8], [.2], [.1]]),
                  num_epochs=num_epochs),
          'language':
              sparse_tensor.SparseTensor(
                  values=input_lib.limit_epochs(
                      ['en', 'fr', 'zh'], num_epochs=num_epochs),
                  indices=[[0, 0], [0, 1], [2, 0]],
                  dense_shape=[3, 2])
      }
      return features, constant_op.constant([[1], [0], [0]], dtype=dtypes.int32)

    sparse_column = feature_column.sparse_column_with_hash_bucket(
        'language', hash_bucket_size=20)
    feature_columns = [
        feature_column.embedding_column(
            sparse_column, dimension=1)
    ]

    n_classes = 3
    classifier = dnn.DNNClassifier(
        n_classes=n_classes,
        feature_columns=feature_columns,
        hidden_units=[10, 10],
        config=run_config.RunConfig(tf_random_seed=1))

    classifier.fit(input_fn=_input_fn, steps=100)

    scores = classifier.evaluate(input_fn=_input_fn, steps=1)
    self._assertInRange(0.0, 1.0, scores['accuracy'])
    self.assertIn('loss', scores)
    predictions = classifier.predict(input_fn=_input_fn, as_iterable=False)
    self._assertBinaryPredictions(3, predictions)
    probabilities = classifier.predict_proba(
        input_fn=_input_fn, as_iterable=False)
    self._assertProbabilities(3, n_classes, probabilities)
Example No. 30
  def testLogisticRegression_TensorData(self):
    """Tests binary classification using tensor data as input."""

    def _input_fn(num_epochs=None):
      features = {
          'age':
              input_lib.limit_epochs(
                  constant_op.constant([[.8], [0.2], [.1]]),
                  num_epochs=num_epochs),
          'language':
              sparse_tensor.SparseTensor(
                  values=input_lib.limit_epochs(
                      ['en', 'fr', 'zh'], num_epochs=num_epochs),
                  indices=[[0, 0], [0, 1], [2, 0]],
                  dense_shape=[3, 2])
      }
      return features, constant_op.constant([[1], [0], [0]], dtype=dtypes.int32)

    language_column = feature_column.sparse_column_with_hash_bucket(
        'language', hash_bucket_size=20)
    feature_columns = [
        feature_column.embedding_column(
            language_column, dimension=1),
        feature_column.real_valued_column('age')
    ]

    classifier = dnn.DNNClassifier(
        n_classes=2,
        feature_columns=feature_columns,
        hidden_units=[10, 10],
        config=run_config.RunConfig(tf_random_seed=1))

    classifier.fit(input_fn=_input_fn, steps=50)

    scores = classifier.evaluate(input_fn=_input_fn, steps=1)
    self._assertInRange(0.0, 1.0, scores['accuracy'])
    self.assertIn('loss', scores)
    predict_input_fn = functools.partial(_input_fn, num_epochs=1)
    predictions = list(
        classifier.predict(
            input_fn=predict_input_fn, as_iterable=True))
    self._assertBinaryPredictions(3, predictions)
Example No. 31
  def benchmarkPartitionedVariables(self):

    def _input_fn():
      features = {
          'language':
              sparse_tensor.SparseTensor(
                  values=('en', 'fr', 'zh'),
                  indices=((0, 0), (0, 1), (2, 0)),
                  dense_shape=(3, 2))
      }
      labels = constant_op.constant(((1,), (0,), (0,)))
      return features, labels

    # The given hash_bucket_size results in variables larger than the
    # default min_slice_size attribute, so the variables are partitioned.
    sparse_feature = feature_column.sparse_column_with_hash_bucket(
        'language', hash_bucket_size=2e7)
    embedding_feature = feature_column.embedding_column(
        sparse_feature, dimension=1)

    tf_config = {
        'cluster': {
            run_config.TaskType.PS: ['fake_ps_0', 'fake_ps_1']
        }
    }
    with test.mock.patch.dict('os.environ',
                              {'TF_CONFIG': json.dumps(tf_config)}):
      config = run_config.RunConfig()
      # Because we did not start a distributed cluster, we need to pass an
      # empty ClusterSpec, otherwise the device_setter will look for
      # distributed jobs, such as "/job:ps" which are not present.
      config._cluster_spec = server_lib.ClusterSpec({})

    classifier = dnn_linear_combined.DNNLinearCombinedClassifier(
        linear_feature_columns=(sparse_feature,),
        dnn_feature_columns=(embedding_feature,),
        dnn_hidden_units=(3, 3),
        config=config)

    metrics = classifier.fit(input_fn=_input_fn, steps=_ITERS).evaluate(
        input_fn=_input_fn, steps=100)
    self._assertCommonMetrics(metrics)
Example No. 32
  def testPrepareInputsForRnnSparse(self):
    num_unroll = 2
    embedding_dimension = 8

    expected = [
        np.array([[1., 1., 1., 1., 1., 1., 1., 1.],
                  [1., 1., 1., 1., 1., 1., 1., 1.],
                  [1., 1., 1., 1., 1., 1., 1., 1.]]),
        np.array([[1., 1., 1., 1., 1., 1., 1., 1.],
                  [2., 2., 2., 2., 2., 2., 2., 2.],
                  [1., 1., 1., 1., 1., 1., 1., 1.]])
    ]

    sequence_features = {
        'wire_cast':
            sparse_tensor.SparseTensor(
                indices=[[0, 0, 0], [0, 1, 0], [1, 0, 0], [1, 1, 0], [1, 1, 1],
                         [2, 0, 0], [2, 1, 1]],
                values=[
                    b'marlo', b'stringer', b'omar', b'stringer', b'marlo',
                    b'marlo', b'omar'
                ],
                dense_shape=[3, 2, 2])
    }

    wire_cast = feature_column.sparse_column_with_keys(
        'wire_cast', ['marlo', 'omar', 'stringer'])
    sequence_feature_columns = [
        feature_column.embedding_column(
            wire_cast,
            dimension=embedding_dimension,
            combiner='sum',
            initializer=init_ops.ones_initializer())
    ]

    context_features = None

    self._test_prepare_inputs_for_rnn(sequence_features, context_features,
                                      sequence_feature_columns, num_unroll,
                                      expected)
Example No. 34
  def testPredict_AsIterable(self):
    """Tests predict method with as_iterable=True."""
    labels = [1., 0., 0.2]

    def _input_fn(num_epochs=None):
      features = {
          'age':
              input_lib.limit_epochs(
                  constant_op.constant([[0.8], [0.15], [0.]]),
                  num_epochs=num_epochs),
          'language':
              sparse_tensor.SparseTensor(
                  values=input_lib.limit_epochs(
                      ['en', 'fr', 'zh'], num_epochs=num_epochs),
                  indices=[[0, 0], [0, 1], [2, 0]],
                  dense_shape=[3, 2])
      }
      return features, constant_op.constant(labels, dtype=dtypes.float32)

    sparse_column = feature_column.sparse_column_with_hash_bucket(
        'language', hash_bucket_size=20)
    feature_columns = [
        feature_column.embedding_column(
            sparse_column, dimension=1),
        feature_column.real_valued_column('age')
    ]

    regressor = dnn.DNNRegressor(
        feature_columns=feature_columns,
        hidden_units=[3, 3],
        config=run_config.RunConfig(tf_random_seed=1))

    regressor.fit(input_fn=_input_fn, steps=200)

    scores = regressor.evaluate(input_fn=_input_fn, steps=1)
    self.assertIn('loss', scores)
    predict_input_fn = functools.partial(_input_fn, num_epochs=1)
    predictions = list(
        regressor.predict(
            input_fn=predict_input_fn, as_iterable=True))
    self.assertAllClose(labels, predictions, atol=0.2)
Example No. 36
  def testDisableCenteredBias(self):
    """Tests that we can disable centered bias."""

    def _input_fn(num_epochs=None):
      features = {
          'age':
              input_lib.limit_epochs(
                  constant_op.constant([[0.8], [0.15], [0.]]),
                  num_epochs=num_epochs),
          'language':
              sparse_tensor.SparseTensor(
                  values=input_lib.limit_epochs(
                      ['en', 'fr', 'zh'], num_epochs=num_epochs),
                  indices=[[0, 0], [0, 1], [2, 0]],
                  dense_shape=[3, 2])
      }
      return features, constant_op.constant([1., 0., 0.2], dtype=dtypes.float32)

    sparse_column = feature_column.sparse_column_with_hash_bucket(
        'language', hash_bucket_size=20)
    feature_columns = [
        feature_column.embedding_column(
            sparse_column, dimension=1),
        feature_column.real_valued_column('age')
    ]

    regressor = dnn.DNNRegressor(
        feature_columns=feature_columns,
        hidden_units=[3, 3],
        enable_centered_bias=False,
        config=run_config.RunConfig(tf_random_seed=1))

    regressor.fit(input_fn=_input_fn, steps=5)
    self.assertNotIn('centered_bias_weight', regressor.get_variable_names())

    scores = regressor.evaluate(input_fn=_input_fn, steps=1)
    self.assertIn('loss', scores)

  def testCreateFeatureSpec(self):
    sparse_col = fc.sparse_column_with_hash_bucket(
        "sparse_column", hash_bucket_size=100)
    embedding_col = fc.embedding_column(
        fc.sparse_column_with_hash_bucket(
            "sparse_column_for_embedding", hash_bucket_size=10),
        dimension=4)
    sparse_id_col = fc.sparse_column_with_keys("id_column",
                                               ["marlo", "omar", "stringer"])
    weighted_id_col = fc.weighted_sparse_column(sparse_id_col,
                                                "id_weights_column")
    real_valued_col1 = fc.real_valued_column("real_valued_column1")
    real_valued_col2 = fc.real_valued_column("real_valued_column2", 5)
    real_valued_col3 = fc.real_valued_column(
        "real_valued_column3", dimension=None)
    bucketized_col1 = fc.bucketized_column(
        fc.real_valued_column("real_valued_column_for_bucketization1"), [0, 4])
    bucketized_col2 = fc.bucketized_column(
        fc.real_valued_column("real_valued_column_for_bucketization2", 4),
        [0, 4])
    a = fc.sparse_column_with_hash_bucket("cross_aaa", hash_bucket_size=100)
    b = fc.sparse_column_with_hash_bucket("cross_bbb", hash_bucket_size=100)
    cross_col = fc.crossed_column(set([a, b]), hash_bucket_size=10000)
    feature_columns = set([
        sparse_col, embedding_col, weighted_id_col, real_valued_col1,
        real_valued_col2, real_valued_col3, bucketized_col1, bucketized_col2,
        cross_col
    ])
    expected_config = {
        "sparse_column":
            parsing_ops.VarLenFeature(dtypes.string),
        "sparse_column_for_embedding":
            parsing_ops.VarLenFeature(dtypes.string),
        "id_column":
            parsing_ops.VarLenFeature(dtypes.string),
        "id_weights_column":
            parsing_ops.VarLenFeature(dtypes.float32),
        "real_valued_column1":
            parsing_ops.FixedLenFeature([1], dtype=dtypes.float32),
        "real_valued_column2":
            parsing_ops.FixedLenFeature([5], dtype=dtypes.float32),
        "real_valued_column3":
            parsing_ops.VarLenFeature(dtype=dtypes.float32),
        "real_valued_column_for_bucketization1":
            parsing_ops.FixedLenFeature([1], dtype=dtypes.float32),
        "real_valued_column_for_bucketization2":
            parsing_ops.FixedLenFeature([4], dtype=dtypes.float32),
        "cross_aaa":
            parsing_ops.VarLenFeature(dtypes.string),
        "cross_bbb":
            parsing_ops.VarLenFeature(dtypes.string)
    }

    config = fc.create_feature_spec_for_parsing(feature_columns)
    self.assertDictEqual(expected_config, config)

    # Test that the same config is parsed out if we pass a dictionary.
    feature_columns_dict = {
        str(i): val
        for i, val in enumerate(feature_columns)
    }
    config = fc.create_feature_spec_for_parsing(feature_columns_dict)
    self.assertDictEqual(expected_config, config)
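
A note on what these feature-spec tests exercise: the dict returned by
create_feature_spec_for_parsing is meant to be handed straight to
parse_example. Below is a minimal standalone sketch of that round trip,
assuming a TF 1.x runtime and the public tf namespace rather than the
internal module aliases used in the tests; the two-feature spec and the
example proto are made up for illustration.

import tensorflow as tf

# Hypothetical serialized tf.train.Example carrying one variable-length
# string feature and one fixed-length float feature.
example = tf.train.Example(features=tf.train.Features(feature={
    "sparse_column": tf.train.Feature(
        bytes_list=tf.train.BytesList(value=[b"marlo", b"omar"])),
    "real_valued_column1": tf.train.Feature(
        float_list=tf.train.FloatList(value=[0.5])),
}))

spec = {
    "sparse_column": tf.VarLenFeature(tf.string),
    "real_valued_column1": tf.FixedLenFeature([1], dtype=tf.float32),
}

parsed = tf.parse_example([example.SerializeToString()], features=spec)
with tf.Session() as sess:
  out = sess.run(parsed)
  # VarLenFeature parses to a SparseTensorValue; FixedLenFeature parses to
  # a dense [batch_size, 1] float array.
  print(out["sparse_column"].values)  # [b'marlo' b'omar']
  print(out["real_valued_column1"])   # [[0.5]]
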
Example No. 38
0
  def testPrepareFeaturesForSQSS(self):
    mode = model_fn_lib.ModeKeys.TRAIN
    seq_feature_name = 'seq_feature'
    sparse_seq_feature_name = 'wire_cast'
    ctx_feature_name = 'ctx_feature'
    sequence_length = 4
    embedding_dimension = 8

    features = {
        sparse_seq_feature_name:
            sparse_tensor.SparseTensor(
                indices=[[0, 0, 0], [0, 1, 0], [1, 0, 0], [1, 1, 0], [1, 1, 1],
                         [2, 0, 0], [2, 1, 1]],
                values=[
                    b'marlo', b'stringer', b'omar', b'stringer', b'marlo',
                    b'marlo', b'omar'
                ],
                dense_shape=[3, 2, 2]),
        seq_feature_name:
            constant_op.constant(
                1.0, shape=[sequence_length]),
        ctx_feature_name:
            constant_op.constant(2.0)
    }

    labels = constant_op.constant(5.0, shape=[sequence_length])

    wire_cast = feature_column.sparse_column_with_keys(
        'wire_cast', ['marlo', 'omar', 'stringer'])
    sequence_feature_columns = [
        feature_column.real_valued_column(seq_feature_name, dimension=1),
        feature_column.embedding_column(
            wire_cast,
            dimension=embedding_dimension,
            initializer=init_ops.ones_initializer())
    ]

    context_feature_columns = [
        feature_column.real_valued_column(
            ctx_feature_name, dimension=1)
    ]

    expected_sequence = {
        rnn_common.RNNKeys.LABELS_KEY:
            np.array([5., 5., 5., 5.]),
        seq_feature_name:
            np.array([1., 1., 1., 1.]),
        sparse_seq_feature_name:
            sparse_tensor.SparseTensor(
                indices=[[0, 0, 0], [0, 1, 0], [1, 0, 0], [1, 1, 0], [1, 1, 1],
                         [2, 0, 0], [2, 1, 1]],
                values=[
                    b'marlo', b'stringer', b'omar', b'stringer', b'marlo',
                    b'marlo', b'omar'
                ],
                dense_shape=[3, 2, 2]),
    }

    expected_context = {ctx_feature_name: 2.}

    sequence, context = ssre._prepare_features_for_sqss(
        features, labels, mode, sequence_feature_columns,
        context_feature_columns)

    def assert_equal(expected, got):
      self.assertEqual(sorted(expected), sorted(got))
      for k, v in expected.items():
        if isinstance(v, sparse_tensor.SparseTensor):
          self.assertAllEqual(v.values.eval(), got[k].values)
          self.assertAllEqual(v.indices.eval(), got[k].indices)
          self.assertAllEqual(v.dense_shape.eval(), got[k].dense_shape)
        else:
          self.assertAllEqual(v, got[k])

    with self.test_session() as sess:
      sess.run(variables.global_variables_initializer())
      sess.run(lookup_ops.tables_initializer())
      actual_sequence, actual_context = sess.run(
          [sequence, context])
      assert_equal(expected_sequence, actual_sequence)
      assert_equal(expected_context, actual_context)
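
To make the wire_cast SparseTensor above easier to read: each index is
[batch, step, position_within_step], so dense_shape [3, 2, 2] means three
sequences, two steps each, and up to two ids per step. A pure-numpy sketch
of that interpretation (illustrative only, not part of the test):

import numpy as np

indices = [[0, 0, 0], [0, 1, 0], [1, 0, 0], [1, 1, 0], [1, 1, 1],
           [2, 0, 0], [2, 1, 1]]
values = [b'marlo', b'stringer', b'omar', b'stringer', b'marlo',
          b'marlo', b'omar']

# Scatter the sparse values into their dense [batch, step, position] slots.
dense = np.full((3, 2, 2), b'', dtype=object)
for (batch, step, pos), v in zip(indices, values):
  dense[batch, step, pos] = v

print(dense[1])  # sequence 1: step 0 -> [omar, ''], step 1 -> [stringer, marlo]
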
Example No. 39
0
  def testLearnLyrics(self):
    lyrics = 'if I go there will be trouble and if I stay it will be double'
    lyrics_list = lyrics.split()
    sequence_length = len(lyrics_list)
    vocab = set(lyrics_list)
    batch_size = 16
    num_classes = len(vocab)
    num_unroll = 7  # not a divisor of sequence_length
    train_steps = 350
    eval_steps = 30
    num_units = [4]
    learning_rate = 0.4
    accuracy_threshold = 0.65

    def get_lyrics_input_fn(seed):

      def input_fn():
        start = random_ops.random_uniform(
            (), minval=0, maxval=sequence_length, dtype=dtypes.int32, seed=seed)
        # Concatenate lyrics_list so inputs and labels wrap when start > 0.
        lyrics_list_concat = lyrics_list + lyrics_list
        inputs_dense = array_ops.slice(lyrics_list_concat, [start],
                                       [sequence_length])
        indices = array_ops.constant(
            [[i, 0] for i in range(sequence_length)], dtype=dtypes.int64)
        dense_shape = [sequence_length, 1]
        inputs = sparse_tensor.SparseTensor(
            indices=indices, values=inputs_dense, dense_shape=dense_shape)
        table = lookup.string_to_index_table_from_tensor(
            mapping=list(vocab), default_value=-1, name='lookup')
        labels = table.lookup(
            array_ops.slice(lyrics_list_concat, [start + 1], [sequence_length]))
        return {'lyrics': inputs}, labels

      return input_fn

    sequence_feature_columns = [
        feature_column.embedding_column(
            feature_column.sparse_column_with_keys('lyrics', vocab),
            dimension=8)
    ]
    config = run_config.RunConfig(tf_random_seed=21212)
    sequence_estimator = ssre.StateSavingRnnEstimator(
        constants.ProblemType.CLASSIFICATION,
        num_units=num_units,
        cell_type='basic_rnn',
        num_unroll=num_unroll,
        batch_size=batch_size,
        sequence_feature_columns=sequence_feature_columns,
        num_classes=num_classes,
        learning_rate=learning_rate,
        config=config,
        predict_probabilities=True,
        queue_capacity=2 + batch_size,
        seed=1234)

    train_input_fn = get_lyrics_input_fn(seed=12321)
    eval_input_fn = get_lyrics_input_fn(seed=32123)

    sequence_estimator.fit(input_fn=train_input_fn, steps=train_steps)

    evaluation = sequence_estimator.evaluate(
        input_fn=eval_input_fn, steps=eval_steps)
    accuracy = evaluation['accuracy']
    self.assertGreater(accuracy, accuracy_threshold,
                       'Accuracy should be higher than {}; got {}'.format(
                           accuracy_threshold, accuracy))
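
A note on the num_unroll = 7 choice above: a state-saving RNN consumes each
sequence in fixed-size chunks, padding the final chunk and carrying hidden
state across chunk boundaries, which is why a non-divisor of the sequence
length is the interesting case to test. The arithmetic, as a sketch under
the usual ceil-and-pad segmentation:

sequence_length = 15  # len(lyrics.split())
num_unroll = 7        # deliberately not a divisor of sequence_length

num_chunks = -(-sequence_length // num_unroll)  # ceil(15 / 7) = 3
padded_length = num_chunks * num_unroll         # 21 steps, 6 of them padding
print(num_chunks, padded_length)                # 3 21
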
Example No. 40
0
  def testCreateFeatureSpec(self):
    sparse_col = fc.sparse_column_with_hash_bucket(
        "sparse_column", hash_bucket_size=100)
    embedding_col = fc.embedding_column(
        fc.sparse_column_with_hash_bucket(
            "sparse_column_for_embedding", hash_bucket_size=10),
        dimension=4)
    sparse_id_col = fc.sparse_column_with_keys("id_column",
                                               ["marlo", "omar", "stringer"])
    weighted_id_col = fc.weighted_sparse_column(sparse_id_col,
                                                "id_weights_column")
    real_valued_col1 = fc.real_valued_column("real_valued_column1")
    real_valued_col2 = fc.real_valued_column("real_valued_column2", 5)
    real_valued_col3 = fc.real_valued_column(
        "real_valued_column3", dimension=None)
    bucketized_col1 = fc.bucketized_column(
        fc.real_valued_column("real_valued_column_for_bucketization1"), [0, 4])
    bucketized_col2 = fc.bucketized_column(
        fc.real_valued_column("real_valued_column_for_bucketization2", 4),
        [0, 4])
    a = fc.sparse_column_with_hash_bucket("cross_aaa", hash_bucket_size=100)
    b = fc.sparse_column_with_hash_bucket("cross_bbb", hash_bucket_size=100)
    cross_col = fc.crossed_column(set([a, b]), hash_bucket_size=10000)
    feature_columns = set([
        sparse_col, embedding_col, weighted_id_col, real_valued_col1,
        real_valued_col2, real_valued_col3, bucketized_col1, bucketized_col2,
        cross_col
    ])
    expected_config = {
        "sparse_column":
            parsing_ops.VarLenFeature(dtypes.string),
        "sparse_column_for_embedding":
            parsing_ops.VarLenFeature(dtypes.string),
        "id_column":
            parsing_ops.VarLenFeature(dtypes.string),
        "id_weights_column":
            parsing_ops.VarLenFeature(dtypes.float32),
        "real_valued_column1":
            parsing_ops.FixedLenFeature(
                [1], dtype=dtypes.float32),
        "real_valued_column2":
            parsing_ops.FixedLenFeature(
                [5], dtype=dtypes.float32),
        "real_valued_column3":
            parsing_ops.VarLenFeature(dtype=dtypes.float32),
        "real_valued_column_for_bucketization1":
            parsing_ops.FixedLenFeature(
                [1], dtype=dtypes.float32),
        "real_valued_column_for_bucketization2":
            parsing_ops.FixedLenFeature(
                [4], dtype=dtypes.float32),
        "cross_aaa":
            parsing_ops.VarLenFeature(dtypes.string),
        "cross_bbb":
            parsing_ops.VarLenFeature(dtypes.string)
    }

    config = fc.create_feature_spec_for_parsing(feature_columns)
    self.assertDictEqual(expected_config, config)

    # Test that the same config is parsed out if we pass a dictionary.
    feature_columns_dict = {
        str(i): val
        for i, val in enumerate(feature_columns)
    }
    config = fc.create_feature_spec_for_parsing(feature_columns_dict)
    self.assertDictEqual(expected_config, config)
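
Note that the bucketized columns contribute only the parse spec of their
underlying real-valued columns; bucketing happens after parsing. As a
sketch of the bucketing rule for boundaries [0, 4], assuming the
left-inclusive boundary convention of TensorFlow's Bucketize op:

import numpy as np

boundaries = [0, 4]  # len(boundaries) + 1 = 3 buckets
values = np.array([-1.0, 0.0, 2.5, 4.0, 9.0])

# Bucket i covers [boundaries[i-1], boundaries[i]); searchsorted with
# side='right' reproduces that assignment.
buckets = np.searchsorted(boundaries, values, side='right')
print(buckets)  # [0 1 1 2 2]
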
Example No. 41
0
  def testCreateFeatureSpec(self):
    sparse_col = fc.sparse_column_with_hash_bucket(
        "sparse_column", hash_bucket_size=100)
    embedding_col = fc.embedding_column(
        fc.sparse_column_with_hash_bucket(
            "sparse_column_for_embedding", hash_bucket_size=10),
        dimension=4)
    str_sparse_id_col = fc.sparse_column_with_keys(
        "str_id_column", ["marlo", "omar", "stringer"])
    int32_sparse_id_col = fc.sparse_column_with_keys(
        "int32_id_column", [42, 1, -1000], dtype=dtypes.int32)
    int64_sparse_id_col = fc.sparse_column_with_keys(
        "int64_id_column", [42, 1, -1000], dtype=dtypes.int64)
    weighted_id_col = fc.weighted_sparse_column(str_sparse_id_col,
                                                "str_id_weights_column")
    real_valued_col1 = fc.real_valued_column("real_valued_column1")
    real_valued_col2 = fc.real_valued_column("real_valued_column2", 5)
    bucketized_col1 = fc.bucketized_column(
        fc.real_valued_column("real_valued_column_for_bucketization1"), [0, 4])
    bucketized_col2 = fc.bucketized_column(
        fc.real_valued_column("real_valued_column_for_bucketization2", 4),
        [0, 4])
    a = fc.sparse_column_with_hash_bucket("cross_aaa", hash_bucket_size=100)
    b = fc.sparse_column_with_hash_bucket("cross_bbb", hash_bucket_size=100)
    cross_col = fc.crossed_column(set([a, b]), hash_bucket_size=10000)
    one_hot_col = fc.one_hot_column(fc.sparse_column_with_hash_bucket(
        "sparse_column_for_one_hot", hash_bucket_size=100))
    scattered_embedding_col = fc.scattered_embedding_column(
        "scattered_embedding_column", size=100, dimension=10, hash_key=1)
    feature_columns = set([
        sparse_col, embedding_col, weighted_id_col, int32_sparse_id_col,
        int64_sparse_id_col, real_valued_col1, real_valued_col2,
        bucketized_col1, bucketized_col2, cross_col, one_hot_col,
        scattered_embedding_col
    ])
    expected_config = {
        "sparse_column":
            parsing_ops.VarLenFeature(dtypes.string),
        "sparse_column_for_embedding":
            parsing_ops.VarLenFeature(dtypes.string),
        "str_id_column":
            parsing_ops.VarLenFeature(dtypes.string),
        "int32_id_column":
            parsing_ops.VarLenFeature(dtypes.int32),
        "int64_id_column":
            parsing_ops.VarLenFeature(dtypes.int64),
        "str_id_weights_column":
            parsing_ops.VarLenFeature(dtypes.float32),
        "real_valued_column1":
            parsing_ops.FixedLenFeature(
                [1], dtype=dtypes.float32),
        "real_valued_column2":
            parsing_ops.FixedLenFeature(
                [5], dtype=dtypes.float32),
        "real_valued_column_for_bucketization1":
            parsing_ops.FixedLenFeature(
                [1], dtype=dtypes.float32),
        "real_valued_column_for_bucketization2":
            parsing_ops.FixedLenFeature(
                [4], dtype=dtypes.float32),
        "cross_aaa":
            parsing_ops.VarLenFeature(dtypes.string),
        "cross_bbb":
            parsing_ops.VarLenFeature(dtypes.string),
        "sparse_column_for_one_hot":
            parsing_ops.VarLenFeature(dtypes.string),
        "scattered_embedding_column":
            parsing_ops.VarLenFeature(dtypes.string),
    }

    config = fc.create_feature_spec_for_parsing(feature_columns)
    self.assertDictEqual(expected_config, config)

    # Tests that contrib feature columns work with core library:
    config_core = fc_core.make_parse_example_spec(feature_columns)
    self.assertDictEqual(expected_config, config_core)

    # Test that the same config is parsed out if we pass a dictionary.
    feature_columns_dict = {
        str(i): val
        for i, val in enumerate(feature_columns)
    }
    config = fc.create_feature_spec_for_parsing(feature_columns_dict)
    self.assertDictEqual(expected_config, config)
Example No. 42
0
  def testCreateFeatureSpec(self):
    sparse_col = fc.sparse_column_with_hash_bucket(
        "sparse_column", hash_bucket_size=100)
    embedding_col = fc.embedding_column(
        fc.sparse_column_with_hash_bucket(
            "sparse_column_for_embedding", hash_bucket_size=10),
        dimension=4)
    str_sparse_id_col = fc.sparse_column_with_keys(
        "str_id_column", ["marlo", "omar", "stringer"])
    int32_sparse_id_col = fc.sparse_column_with_keys(
        "int32_id_column", [42, 1, -1000], dtype=dtypes.int32)
    int64_sparse_id_col = fc.sparse_column_with_keys(
        "int64_id_column", [42, 1, -1000], dtype=dtypes.int64)
    weighted_id_col = fc.weighted_sparse_column(str_sparse_id_col,
                                                "str_id_weights_column")
    real_valued_col1 = fc.real_valued_column("real_valued_column1")
    real_valued_col2 = fc.real_valued_column("real_valued_column2", 5)
    real_valued_col3 = fc._real_valued_var_len_column(
        "real_valued_column3", is_sparse=True)
    real_valued_col4 = fc._real_valued_var_len_column(
        "real_valued_column4", dtype=dtypes.int64, default_value=0,
        is_sparse=False)
    bucketized_col1 = fc.bucketized_column(
        fc.real_valued_column("real_valued_column_for_bucketization1"), [0, 4])
    bucketized_col2 = fc.bucketized_column(
        fc.real_valued_column("real_valued_column_for_bucketization2", 4),
        [0, 4])
    a = fc.sparse_column_with_hash_bucket("cross_aaa", hash_bucket_size=100)
    b = fc.sparse_column_with_hash_bucket("cross_bbb", hash_bucket_size=100)
    cross_col = fc.crossed_column(set([a, b]), hash_bucket_size=10000)
    one_hot_col = fc.one_hot_column(fc.sparse_column_with_hash_bucket(
        "sparse_column_for_one_hot", hash_bucket_size=100))
    scattered_embedding_col = fc.scattered_embedding_column(
        "scattered_embedding_column", size=100, dimension=10, hash_key=1)
    feature_columns = set([
        sparse_col, embedding_col, weighted_id_col, int32_sparse_id_col,
        int64_sparse_id_col, real_valued_col1, real_valued_col2,
        real_valued_col3, real_valued_col4, bucketized_col1, bucketized_col2,
        cross_col, one_hot_col, scattered_embedding_col
    ])
    expected_config = {
        "sparse_column":
            parsing_ops.VarLenFeature(dtypes.string),
        "sparse_column_for_embedding":
            parsing_ops.VarLenFeature(dtypes.string),
        "str_id_column":
            parsing_ops.VarLenFeature(dtypes.string),
        "int32_id_column":
            parsing_ops.VarLenFeature(dtypes.int32),
        "int64_id_column":
            parsing_ops.VarLenFeature(dtypes.int64),
        "str_id_weights_column":
            parsing_ops.VarLenFeature(dtypes.float32),
        "real_valued_column1":
            parsing_ops.FixedLenFeature(
                [1], dtype=dtypes.float32),
        "real_valued_column2":
            parsing_ops.FixedLenFeature(
                [5], dtype=dtypes.float32),
        "real_valued_column3":
            parsing_ops.VarLenFeature(dtype=dtypes.float32),
        "real_valued_column4":
            parsing_ops.FixedLenSequenceFeature(
                [], dtype=dtypes.int64, allow_missing=True, default_value=0),
        "real_valued_column_for_bucketization1":
            parsing_ops.FixedLenFeature(
                [1], dtype=dtypes.float32),
        "real_valued_column_for_bucketization2":
            parsing_ops.FixedLenFeature(
                [4], dtype=dtypes.float32),
        "cross_aaa":
            parsing_ops.VarLenFeature(dtypes.string),
        "cross_bbb":
            parsing_ops.VarLenFeature(dtypes.string),
        "sparse_column_for_one_hot":
            parsing_ops.VarLenFeature(dtypes.string),
        "scattered_embedding_column":
            parsing_ops.VarLenFeature(dtypes.string),
    }

    config = fc.create_feature_spec_for_parsing(feature_columns)
    self.assertDictEqual(expected_config, config)

    # Test that the same config is parsed out if we pass a dictionary.
    feature_columns_dict = {
        str(i): val
        for i, val in enumerate(feature_columns)
    }
    config = fc.create_feature_spec_for_parsing(feature_columns_dict)
    self.assertDictEqual(expected_config, config)
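
The non-sparse variable-length column above is the one case that maps to
FixedLenSequenceFeature with allow_missing=True, which lets parse_example
accept a different number of values per example and pad short rows with
default_value. A standalone sketch of that behavior, assuming a TF 1.x
runtime and the public tf namespace; the example protos are made up:

import tensorflow as tf

spec = {"real_valued_column4": tf.FixedLenSequenceFeature(
    [], dtype=tf.int64, allow_missing=True, default_value=0)}

def make_example(values):
  return tf.train.Example(features=tf.train.Features(feature={
      "real_valued_column4": tf.train.Feature(
          int64_list=tf.train.Int64List(value=values))})).SerializeToString()

parsed = tf.parse_example([make_example([1, 2, 3]), make_example([4])],
                          features=spec)
with tf.Session() as sess:
  # The ragged batch is padded to the longest row with default_value.
  print(sess.run(parsed["real_valued_column4"]))  # [[1 2 3] [4 0 0]]
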
Example No. 43
0
  def testLearnLyrics(self):
    lyrics = 'if I go there will be trouble and if I stay it will be double'
    lyrics_list = lyrics.split()
    sequence_length = len(lyrics_list)
    vocab = set(lyrics_list)
    batch_size = 16
    num_classes = len(vocab)
    num_unroll = 7  # not a divisor of sequence_length
    train_steps = 350
    eval_steps = 30
    num_units = [4]
    learning_rate = 0.4
    accuracy_threshold = 0.65

    def get_lyrics_input_fn(seed):

      def input_fn():
        start = random_ops.random_uniform(
            (), minval=0, maxval=sequence_length, dtype=dtypes.int32, seed=seed)
        # Concatenate lyrics_list so inputs and labels wrap when start > 0.
        lyrics_list_concat = lyrics_list + lyrics_list
        inputs_dense = array_ops.slice(lyrics_list_concat, [start],
                                       [sequence_length])
        indices = array_ops.constant(
            [[i, 0] for i in range(sequence_length)], dtype=dtypes.int64)
        dense_shape = [sequence_length, 1]
        inputs = sparse_tensor.SparseTensor(
            indices=indices, values=inputs_dense, dense_shape=dense_shape)
        table = lookup.string_to_index_table_from_tensor(
            mapping=list(vocab), default_value=-1, name='lookup')
        labels = table.lookup(
            array_ops.slice(lyrics_list_concat, [start + 1], [sequence_length]))
        return {'lyrics': inputs}, labels

      return input_fn

    sequence_feature_columns = [
        feature_column.embedding_column(
            feature_column.sparse_column_with_keys('lyrics', vocab),
            dimension=8)
    ]
    config = run_config.RunConfig(tf_random_seed=21212)
    sequence_estimator = ssre.StateSavingRnnEstimator(
        constants.ProblemType.CLASSIFICATION,
        num_units=num_units,
        cell_type='basic_rnn',
        num_unroll=num_unroll,
        batch_size=batch_size,
        sequence_feature_columns=sequence_feature_columns,
        num_classes=num_classes,
        learning_rate=learning_rate,
        config=config,
        predict_probabilities=True,
        queue_capacity=2 + batch_size,
        seed=1234)

    train_input_fn = get_lyrics_input_fn(seed=12321)
    eval_input_fn = get_lyrics_input_fn(seed=32123)

    sequence_estimator.fit(input_fn=train_input_fn, steps=train_steps)

    evaluation = sequence_estimator.evaluate(
        input_fn=eval_input_fn, steps=eval_steps)
    accuracy = evaluation['accuracy']
    self.assertGreater(accuracy, accuracy_threshold,
                       'Accuracy should be higher than {}; got {}'.format(
                           accuracy_threshold, accuracy))
Example No. 44
0
  def testPrepareFeaturesForSQSS(self):
    mode = model_fn_lib.ModeKeys.TRAIN
    seq_feature_name = 'seq_feature'
    sparse_seq_feature_name = 'wire_cast'
    ctx_feature_name = 'ctx_feature'
    sequence_length = 4
    embedding_dimension = 8

    features = {
        sparse_seq_feature_name:
            sparse_tensor.SparseTensor(
                indices=[[0, 0, 0], [0, 1, 0], [1, 0, 0], [1, 1, 0], [1, 1, 1],
                         [2, 0, 0], [2, 1, 1]],
                values=[
                    b'marlo', b'stringer', b'omar', b'stringer', b'marlo',
                    b'marlo', b'omar'
                ],
                dense_shape=[3, 2, 2]),
        seq_feature_name:
            constant_op.constant(
                1.0, shape=[sequence_length]),
        ctx_feature_name:
            constant_op.constant(2.0)
    }

    labels = constant_op.constant(5.0, shape=[sequence_length])

    wire_cast = feature_column.sparse_column_with_keys(
        'wire_cast', ['marlo', 'omar', 'stringer'])
    sequence_feature_columns = [
        feature_column.real_valued_column(seq_feature_name, dimension=1),
        feature_column.embedding_column(
            wire_cast,
            dimension=embedding_dimension,
            initializer=init_ops.ones_initializer())
    ]

    context_feature_columns = [
        feature_column.real_valued_column(
            ctx_feature_name, dimension=1)
    ]

    expected_sequence = {
        rnn_common.RNNKeys.LABELS_KEY:
            np.array([5., 5., 5., 5.]),
        seq_feature_name:
            np.array([1., 1., 1., 1.]),
        sparse_seq_feature_name:
            sparse_tensor.SparseTensor(
                indices=[[0, 0, 0], [0, 1, 0], [1, 0, 0], [1, 1, 0], [1, 1, 1],
                         [2, 0, 0], [2, 1, 1]],
                values=[
                    b'marlo', b'stringer', b'omar', b'stringer', b'marlo',
                    b'marlo', b'omar'
                ],
                dense_shape=[3, 2, 2]),
    }

    expected_context = {ctx_feature_name: 2.}

    sequence, context = ssre._prepare_features_for_sqss(
        features, labels, mode, sequence_feature_columns,
        context_feature_columns)

    def assert_equal(expected, got):
      self.assertEqual(sorted(expected), sorted(got))
      for k, v in expected.items():
        if isinstance(v, sparse_tensor.SparseTensor):
          self.assertAllEqual(v.values.eval(), got[k].values)
          self.assertAllEqual(v.indices.eval(), got[k].indices)
          self.assertAllEqual(v.dense_shape.eval(), got[k].dense_shape)
        else:
          self.assertAllEqual(v, got[k])

    with self.test_session() as sess:
      sess.run(variables.global_variables_initializer())
      sess.run(lookup_ops.tables_initializer())
      actual_sequence, actual_context = sess.run(
          [sequence, context])
      assert_equal(expected_sequence, actual_sequence)
      assert_equal(expected_context, actual_context)
Example No. 45
0
  def testCreateFeatureSpec(self):
    sparse_col = fc.sparse_column_with_hash_bucket(
        "sparse_column", hash_bucket_size=100)
    embedding_col = fc.embedding_column(
        fc.sparse_column_with_hash_bucket(
            "sparse_column_for_embedding", hash_bucket_size=10),
        dimension=4)
    str_sparse_id_col = fc.sparse_column_with_keys(
        "str_id_column", ["marlo", "omar", "stringer"])
    int32_sparse_id_col = fc.sparse_column_with_keys(
        "int32_id_column", [42, 1, -1000], dtype=dtypes.int32)
    int64_sparse_id_col = fc.sparse_column_with_keys(
        "int64_id_column", [42, 1, -1000], dtype=dtypes.int64)
    weighted_id_col = fc.weighted_sparse_column(str_sparse_id_col,
                                                "str_id_weights_column")
    real_valued_col1 = fc.real_valued_column("real_valued_column1")
    real_valued_col2 = fc.real_valued_column("real_valued_column2", 5)
    bucketized_col1 = fc.bucketized_column(
        fc.real_valued_column("real_valued_column_for_bucketization1"), [0, 4])
    bucketized_col2 = fc.bucketized_column(
        fc.real_valued_column("real_valued_column_for_bucketization2", 4),
        [0, 4])
    a = fc.sparse_column_with_hash_bucket("cross_aaa", hash_bucket_size=100)
    b = fc.sparse_column_with_hash_bucket("cross_bbb", hash_bucket_size=100)
    cross_col = fc.crossed_column(set([a, b]), hash_bucket_size=10000)
    one_hot_col = fc.one_hot_column(fc.sparse_column_with_hash_bucket(
        "sparse_column_for_one_hot", hash_bucket_size=100))
    scattered_embedding_col = fc.scattered_embedding_column(
        "scattered_embedding_column", size=100, dimension=10, hash_key=1)
    feature_columns = set([
        sparse_col, embedding_col, weighted_id_col, int32_sparse_id_col,
        int64_sparse_id_col, real_valued_col1, real_valued_col2,
        bucketized_col1, bucketized_col2, cross_col, one_hot_col,
        scattered_embedding_col
    ])
    expected_config = {
        "sparse_column":
            parsing_ops.VarLenFeature(dtypes.string),
        "sparse_column_for_embedding":
            parsing_ops.VarLenFeature(dtypes.string),
        "str_id_column":
            parsing_ops.VarLenFeature(dtypes.string),
        "int32_id_column":
            parsing_ops.VarLenFeature(dtypes.int32),
        "int64_id_column":
            parsing_ops.VarLenFeature(dtypes.int64),
        "str_id_weights_column":
            parsing_ops.VarLenFeature(dtypes.float32),
        "real_valued_column1":
            parsing_ops.FixedLenFeature(
                [1], dtype=dtypes.float32),
        "real_valued_column2":
            parsing_ops.FixedLenFeature(
                [5], dtype=dtypes.float32),
        "real_valued_column_for_bucketization1":
            parsing_ops.FixedLenFeature(
                [1], dtype=dtypes.float32),
        "real_valued_column_for_bucketization2":
            parsing_ops.FixedLenFeature(
                [4], dtype=dtypes.float32),
        "cross_aaa":
            parsing_ops.VarLenFeature(dtypes.string),
        "cross_bbb":
            parsing_ops.VarLenFeature(dtypes.string),
        "sparse_column_for_one_hot":
            parsing_ops.VarLenFeature(dtypes.string),
        "scattered_embedding_column":
            parsing_ops.VarLenFeature(dtypes.string),
    }

    config = fc.create_feature_spec_for_parsing(feature_columns)
    self.assertDictEqual(expected_config, config)

    # Tests that contrib feature columns work with core library:
    config_core = fc_core.make_parse_example_spec(feature_columns)
    self.assertDictEqual(expected_config, config_core)

    # Test that the same config is parsed out if we pass a dictionary.
    feature_columns_dict = {
        str(i): val
        for i, val in enumerate(feature_columns)
    }
    config = fc.create_feature_spec_for_parsing(feature_columns_dict)
    self.assertDictEqual(expected_config, config)
Example No. 46
0
  def testCreateFeatureSpec(self):
    sparse_col = fc.sparse_column_with_hash_bucket(
        "sparse_column", hash_bucket_size=100)
    embedding_col = fc.embedding_column(
        fc.sparse_column_with_hash_bucket(
            "sparse_column_for_embedding", hash_bucket_size=10),
        dimension=4)
    str_sparse_id_col = fc.sparse_column_with_keys(
        "str_id_column", ["marlo", "omar", "stringer"])
    int32_sparse_id_col = fc.sparse_column_with_keys(
        "int32_id_column", [42, 1, -1000], dtype=dtypes.int32)
    int64_sparse_id_col = fc.sparse_column_with_keys(
        "int64_id_column", [42, 1, -1000], dtype=dtypes.int64)
    weighted_id_col = fc.weighted_sparse_column(str_sparse_id_col,
                                                "str_id_weights_column")
    real_valued_col1 = fc.real_valued_column("real_valued_column1")
    real_valued_col2 = fc.real_valued_column("real_valued_column2", 5)
    real_valued_col3 = fc._real_valued_var_len_column(
        "real_valued_column3", is_sparse=True)
    real_valued_col4 = fc._real_valued_var_len_column(
        "real_valued_column4", dtype=dtypes.int64, default_value=0,
        is_sparse=False)
    bucketized_col1 = fc.bucketized_column(
        fc.real_valued_column("real_valued_column_for_bucketization1"), [0, 4])
    bucketized_col2 = fc.bucketized_column(
        fc.real_valued_column("real_valued_column_for_bucketization2", 4),
        [0, 4])
    a = fc.sparse_column_with_hash_bucket("cross_aaa", hash_bucket_size=100)
    b = fc.sparse_column_with_hash_bucket("cross_bbb", hash_bucket_size=100)
    cross_col = fc.crossed_column(set([a, b]), hash_bucket_size=10000)
    one_hot_col = fc.one_hot_column(fc.sparse_column_with_hash_bucket(
        "sparse_column_for_one_hot", hash_bucket_size=100))
    scattered_embedding_col = fc.scattered_embedding_column(
        "scattered_embedding_column", size=100, dimension=10, hash_key=1)
    feature_columns = set([
        sparse_col, embedding_col, weighted_id_col, int32_sparse_id_col,
        int64_sparse_id_col, real_valued_col1, real_valued_col2,
        real_valued_col3, real_valued_col4, bucketized_col1, bucketized_col2,
        cross_col, one_hot_col, scattered_embedding_col
    ])
    expected_config = {
        "sparse_column":
            parsing_ops.VarLenFeature(dtypes.string),
        "sparse_column_for_embedding":
            parsing_ops.VarLenFeature(dtypes.string),
        "str_id_column":
            parsing_ops.VarLenFeature(dtypes.string),
        "int32_id_column":
            parsing_ops.VarLenFeature(dtypes.int32),
        "int64_id_column":
            parsing_ops.VarLenFeature(dtypes.int64),
        "str_id_weights_column":
            parsing_ops.VarLenFeature(dtypes.float32),
        "real_valued_column1":
            parsing_ops.FixedLenFeature(
                [1], dtype=dtypes.float32),
        "real_valued_column2":
            parsing_ops.FixedLenFeature(
                [5], dtype=dtypes.float32),
        "real_valued_column3":
            parsing_ops.VarLenFeature(dtype=dtypes.float32),
        "real_valued_column4":
            parsing_ops.FixedLenSequenceFeature(
                [], dtype=dtypes.int64, allow_missing=True, default_value=0),
        "real_valued_column_for_bucketization1":
            parsing_ops.FixedLenFeature(
                [1], dtype=dtypes.float32),
        "real_valued_column_for_bucketization2":
            parsing_ops.FixedLenFeature(
                [4], dtype=dtypes.float32),
        "cross_aaa":
            parsing_ops.VarLenFeature(dtypes.string),
        "cross_bbb":
            parsing_ops.VarLenFeature(dtypes.string),
        "sparse_column_for_one_hot":
            parsing_ops.VarLenFeature(dtypes.string),
        "scattered_embedding_column":
            parsing_ops.VarLenFeature(dtypes.string),
    }

    config = fc.create_feature_spec_for_parsing(feature_columns)
    self.assertDictEqual(expected_config, config)

    # Test that the same config is parsed out if we pass a dictionary.
    feature_columns_dict = {
        str(i): val
        for i, val in enumerate(feature_columns)
    }
    config = fc.create_feature_spec_for_parsing(feature_columns_dict)
    self.assertDictEqual(expected_config, config)