Example 1
    def testSharedEmbeddingColumn(self):
        a1 = fc.sparse_column_with_keys("a1", ["marlo", "omar", "stringer"])
        a2 = fc.sparse_column_with_keys("a2", ["marlo", "omar", "stringer"])
        b = fc.shared_embedding_columns([a1, a2], dimension=4, combiner="mean")
        self.assertEqual(len(b), 2)
        self.assertEqual(b[0].shared_embedding_name, "a1_a2_shared_embedding")
        self.assertEqual(b[1].shared_embedding_name, "a1_a2_shared_embedding")

        # Create a sparse id tensor for a1.
        input_tensor_c1 = sparse_tensor_lib.SparseTensor(
            indices=[[0, 0], [1, 1], [2, 2]], values=[0, 1, 2],
            dense_shape=[3, 3])
        # Create a sparse id tensor for a2.
        input_tensor_c2 = sparse_tensor_lib.SparseTensor(
            indices=[[0, 0], [1, 1], [2, 2]], values=[0, 1, 2],
            dense_shape=[3, 3])
        with variable_scope.variable_scope("run_1"):
            b1 = feature_column_ops.input_from_feature_columns(
                {b[0]: input_tensor_c1}, [b[0]])
            b2 = feature_column_ops.input_from_feature_columns(
                {b[1]: input_tensor_c2}, [b[1]])
        with self.test_session() as sess:
            sess.run(variables.global_variables_initializer())
            b1_value = b1.eval()
            b2_value = b2.eval()
        for i in range(len(b1_value)):
            self.assertAllClose(b1_value[i], b2_value[i])

        # Test the case when a shared_embedding_name is explicitly specified.
        d = fc.shared_embedding_columns(
            [a1, a2],
            dimension=4,
            combiner="mean",
            shared_embedding_name="my_shared_embedding")
        # a3 is a completely different sparse column from a1 and a2, but since
        # the same shared_embedding_name is passed in, a3 will have the same
        # embedding as a1 and a2.
        a3 = fc.sparse_column_with_keys("a3", [42, 1, -1000],
                                        dtype=dtypes.int32)
        e = fc.shared_embedding_columns(
            [a3],
            dimension=4,
            combiner="mean",
            shared_embedding_name="my_shared_embedding")
        with variable_scope.variable_scope("run_2"):
            d1 = feature_column_ops.input_from_feature_columns(
                {d[0]: input_tensor_c1}, [d[0]])
            e1 = feature_column_ops.input_from_feature_columns(
                {e[0]: input_tensor_c1}, [e[0]])
        with self.test_session() as sess:
            sess.run(variables.global_variables_initializer())
            d1_value = d1.eval()
            e1_value = e1.eval()
        for i in range(len(d1_value)):
            self.assertAllClose(d1_value[i], e1_value[i])
Example 2
  def testSharedEmbeddingColumn(self):
    a1 = fc.sparse_column_with_keys("a1", ["marlo", "omar", "stringer"])
    a2 = fc.sparse_column_with_keys("a2", ["marlo", "omar", "stringer"])
    b = fc.shared_embedding_columns([a1, a2], dimension=4, combiner="mean")
    self.assertEqual(len(b), 2)
    self.assertEqual(b[0].shared_embedding_name, "a1_a2_shared_embedding")
    self.assertEqual(b[1].shared_embedding_name, "a1_a2_shared_embedding")

    # Create a sparse id tensor for a1.
    input_tensor_c1 = sparse_tensor_lib.SparseTensor(
        indices=[[0, 0], [1, 1], [2, 2]], values=[0, 1, 2], dense_shape=[3, 3])
    # Create a sparse id tensor for a2.
    input_tensor_c2 = sparse_tensor_lib.SparseTensor(
        indices=[[0, 0], [1, 1], [2, 2]], values=[0, 1, 2], dense_shape=[3, 3])
    with variable_scope.variable_scope("run_1"):
      b1 = feature_column_ops.input_from_feature_columns({
          b[0]: input_tensor_c1
      }, [b[0]])
      b2 = feature_column_ops.input_from_feature_columns({
          b[1]: input_tensor_c2
      }, [b[1]])
    with self.test_session() as sess:
      sess.run(variables.global_variables_initializer())
      b1_value = b1.eval()
      b2_value = b2.eval()
    for i in range(len(b1_value)):
      self.assertAllClose(b1_value[i], b2_value[i])

    # Test the case when a shared_embedding_name is explicitly specified.
    d = fc.shared_embedding_columns(
        [a1, a2],
        dimension=4,
        combiner="mean",
        shared_embedding_name="my_shared_embedding")
    # a3 is a completely different sparse column from a1 and a2, but since the
    # same shared_embedding_name is passed in, a3 will have the same embedding
    # as a1 and a2.
    a3 = fc.sparse_column_with_keys("a3", ["cathy", "tom", "anderson"])
    e = fc.shared_embedding_columns(
        [a3],
        dimension=4,
        combiner="mean",
        shared_embedding_name="my_shared_embedding")
    with variable_scope.variable_scope("run_2"):
      d1 = feature_column_ops.input_from_feature_columns({
          d[0]: input_tensor_c1
      }, [d[0]])
      e1 = feature_column_ops.input_from_feature_columns({
          e[0]: input_tensor_c1
      }, [e[0]])
    with self.test_session() as sess:
      sess.run(variables.global_variables_initializer())
      d1_value = d1.eval()
      e1_value = e1.eval()
    for i in range(len(d1_value)):
      self.assertAllClose(d1_value[i], e1_value[i])
Example 3
 def testSharedEmbeddingColumnDeepCopy(self):
   a1 = fc.sparse_column_with_keys("a1", ["marlo", "omar", "stringer"])
   a2 = fc.sparse_column_with_keys("a2", ["marlo", "omar", "stringer"])
   columns = fc.shared_embedding_columns(
       [a1, a2], dimension=4, combiner="mean")
   columns_copy = copy.deepcopy(columns)
   self.assertEqual(
       columns_copy[0].shared_embedding_name, "a1_a2_shared_embedding")
   self.assertEqual(
       columns_copy[1].shared_embedding_name, "a1_a2_shared_embedding")
Example 6
  def testSharedEmbeddingColumnErrors(self):
    # Tries passing in a string.
    with self.assertRaises(TypeError):
      invalid_string = "Invalid string."
      fc.shared_embedding_columns(invalid_string, dimension=2, combiner="mean")

    # Tries passing in a set of sparse columns.
    with self.assertRaises(TypeError):
      invalid_set = set([
          fc.sparse_column_with_keys("a", ["foo", "bar"]),
          fc.sparse_column_with_keys("b", ["foo", "bar"]),
      ])
      fc.shared_embedding_columns(invalid_set, dimension=2, combiner="mean")
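Both TypeError cases above come from passing the columns in a scalar or unordered container. A minimal sketch of the accepted call, assuming the same fc alias used throughout these examples; an ordered list (or tuple) works, presumably because the column order feeds the auto-generated shared_embedding_name:

  a = fc.sparse_column_with_keys("a", ["foo", "bar"])
  b = fc.sparse_column_with_keys("b", ["foo", "bar"])
  # A list or tuple of sparse columns is the accepted form.
  cols = fc.shared_embedding_columns([a, b], dimension=2, combiner="mean")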
Example 7
 def testWeightedSparseColumnDeepCopy(self):
   ids = fc.sparse_column_with_keys("ids", ["marlo", "omar", "stringer"])
   weighted = fc.weighted_sparse_column(ids, "weights")
   weighted_copy = copy.deepcopy(weighted)
   self.assertEqual(weighted_copy.sparse_id_column.name, "ids")
   self.assertEqual(weighted_copy.weight_column_name, "weights")
   self.assertEqual(weighted_copy.name, "ids_weighted_by_weights")
Example 8
 def test_exogenous_input(self):
   """Test that no errors are raised when using exogenous features."""
   dtype = dtypes.float64
   times = [1, 2, 3, 4, 5, 6]
   values = [[0.01], [5.10], [5.21], [0.30], [5.41], [0.50]]
   feature_a = [["off"], ["on"], ["on"], ["off"], ["on"], ["off"]]
   sparse_column_a = feature_column.sparse_column_with_keys(
       column_name="feature_a", keys=["on", "off"])
   one_hot_a = layers.one_hot_column(sparse_id_column=sparse_column_a)
   regressor = estimators.StructuralEnsembleRegressor(
       periodicities=[],
       num_features=1,
       moving_average_order=0,
       exogenous_feature_columns=[one_hot_a],
       dtype=dtype)
   features = {TrainEvalFeatures.TIMES: times,
               TrainEvalFeatures.VALUES: values,
               "feature_a": feature_a}
   train_input_fn = input_pipeline.RandomWindowInputFn(
       input_pipeline.NumpyReader(features),
       window_size=6, batch_size=1)
   regressor.train(input_fn=train_input_fn, steps=1)
   eval_input_fn = input_pipeline.WholeDatasetInputFn(
       input_pipeline.NumpyReader(features))
   evaluation = regressor.evaluate(input_fn=eval_input_fn, steps=1)
   predict_input_fn = input_pipeline.predict_continuation_input_fn(
       evaluation, times=[[7, 8, 9]],
       exogenous_features={"feature_a": [[["on"], ["off"], ["on"]]]})
   regressor.predict(input_fn=predict_input_fn)
Example 9
  def testOneHotColumnForWeightedSparseColumn(self):
    ids = fc.sparse_column_with_keys("ids", ["marlo", "omar", "stringer"])
    weighted_ids = fc.weighted_sparse_column(ids, "weights")
    one_hot = fc.one_hot_column(weighted_ids)
    self.assertEqual(one_hot.sparse_id_column.name, "ids_weighted_by_weights")
    self.assertEqual(one_hot.length, 3)
Example 10
 def testOneHotColumnDeepCopy(self):
   a = fc.sparse_column_with_keys("a", ["a", "b", "c", "d"])
   column = fc.one_hot_column(a)
   column_copy = copy.deepcopy(column)
   self.assertEqual(column_copy.sparse_id_column.name, "a")
   self.assertEqual(column.name, "a_one_hot")
   self.assertEqual(column.length, 4)
Example 14
 def testFloat32WeightedSparseStringColumnDtypes(self):
   ids = fc.sparse_column_with_keys("ids", ["marlo", "omar", "stringer"])
   weighted_ids = fc.weighted_sparse_column(ids, "weights")
   self.assertDictEqual({
       "ids": parsing_ops.VarLenFeature(dtypes.string),
       "weights": parsing_ops.VarLenFeature(dtypes.float32)
   }, weighted_ids.config)
Example 15
 def testFloat32WeightedSparseInt32ColumnDtypes(self):
   ids = fc.sparse_column_with_keys("ids", [42, 1, -1000], dtype=dtypes.int32)
   weighted_ids = fc.weighted_sparse_column(ids, "weights")
   self.assertDictEqual({
       "ids": parsing_ops.VarLenFeature(dtypes.int32),
       "weights": parsing_ops.VarLenFeature(dtypes.float32)
   }, weighted_ids.config)
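The config dicts asserted in Examples 14 and 15 are what a parsing spec is assembled from. A short sketch of that use, assuming the fc and parsing_ops imports already used in these examples:

  ids = fc.sparse_column_with_keys("ids", ["marlo", "omar", "stringer"])
  weighted_ids = fc.weighted_sparse_column(ids, "weights")
  # create_feature_spec_for_parsing merges each column's config into a single
  # dict usable with tf.parse_example (see Example 45).
  spec = fc.create_feature_spec_for_parsing([weighted_ids])
  # spec: {"ids": VarLenFeature(string), "weights": VarLenFeature(float32)}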
Example 17
  def setUp(self):
    super(DynamicRnnEstimatorTest, self).setUp()
    self.rnn_cell = core_rnn_cell_impl.BasicRNNCell(self.NUM_RNN_CELL_UNITS)
    self.mock_target_column = MockTargetColumn(
        num_label_columns=self.NUM_LABEL_COLUMNS)

    location = feature_column.sparse_column_with_keys(
        'location', keys=['west_side', 'east_side', 'nyc'])
    location_onehot = feature_column.one_hot_column(location)
    self.context_feature_columns = [location_onehot]

    wire_cast = feature_column.sparse_column_with_keys(
        'wire_cast', ['marlo', 'omar', 'stringer'])
    wire_cast_embedded = feature_column.embedding_column(wire_cast, dimension=8)
    measurements = feature_column.real_valued_column(
        'measurements', dimension=2)
    self.sequence_feature_columns = [measurements, wire_cast_embedded]
Example 18
  def setUp(self):
    super(DynamicRnnEstimatorTest, self).setUp()
    self.rnn_cell = rnn_cell.BasicRNNCell(self.NUM_RNN_CELL_UNITS)
    self.mock_target_column = MockTargetColumn(
        num_label_columns=self.NUM_LABEL_COLUMNS)

    location = feature_column.sparse_column_with_keys(
        'location', keys=['west_side', 'east_side', 'nyc'])
    location_onehot = feature_column.one_hot_column(location)
    self.context_feature_columns = [location_onehot]

    wire_cast = feature_column.sparse_column_with_keys(
        'wire_cast', ['marlo', 'omar', 'stringer'])
    wire_cast_embedded = feature_column.embedding_column(wire_cast, dimension=8)
    measurements = feature_column.real_valued_column(
        'measurements', dimension=2)
    self.sequence_feature_columns = [measurements, wire_cast_embedded]
Example 19
 def testMissingValueInOneHotColumnForSparseColumnWithKeys(self):
   ids = fc.sparse_column_with_keys("ids", ["marlo", "omar", "stringer"])
   one_hot = fc.one_hot_column(ids)
   features = {"ids": constant_op.constant([["marlo", "unknown", "omar"]])}
   one_hot_tensor = feature_column_ops.input_from_feature_columns(
       features, [one_hot])
   with self.test_session() as sess:
     sess.run(variables.global_variables_initializer())
     sess.run(lookup_ops.tables_initializer())
     self.assertAllEqual([[1., 1., 0.]], one_hot_tensor.eval())
Example 21
  def testOneHotColumn(self):
    a = fc.sparse_column_with_keys("a", ["a", "b", "c", "d"])
    onehot_a = fc.one_hot_column(a)
    self.assertEqual(onehot_a.sparse_id_column.name, "a")
    self.assertEqual(onehot_a.length, 4)

    b = fc.sparse_column_with_hash_bucket(
        "b", hash_bucket_size=100, combiner="sum")
    onehot_b = fc.one_hot_column(b)
    self.assertEqual(onehot_b.sparse_id_column.name, "b")
    self.assertEqual(onehot_b.length, 100)
Example 24
  def testInt32WeightedSparseInt64ColumnDtypes(self):
    ids = fc.sparse_column_with_keys("ids", [42, 1, -1000], dtype=dtypes.int64)
    weighted_ids = fc.weighted_sparse_column(ids, "weights", dtype=dtypes.int32)
    self.assertDictEqual({
        "ids": parsing_ops.VarLenFeature(dtypes.int64),
        "weights": parsing_ops.VarLenFeature(dtypes.int32)
    }, weighted_ids.config)

    with self.assertRaisesRegexp(ValueError,
                                 "dtype is not convertible to float"):
      weighted_ids = fc.weighted_sparse_column(
          ids, "weights", dtype=dtypes.string)
Example 25
 def testSharedEmbeddingColumnDeterminism(self):
   # Tests determinism in auto-generated shared_embedding_name.
   sparse_id_columns = tuple([
       fc.sparse_column_with_keys(k, ["foo", "bar"])
       for k in ["07", "02", "00", "03", "05", "01", "09", "06", "04", "08"]
   ])
   output = fc.shared_embedding_columns(
       sparse_id_columns, dimension=2, combiner="mean")
   self.assertEqual(len(output), 10)
   for x in output:
     self.assertEqual(x.shared_embedding_name,
                      "00_01_02_plus_7_others_shared_embedding")
Example 27
 def testSparseColumnKeysDeepCopy(self):
     """Tests deepcopy of sparse_column_with_keys."""
     column = fc.sparse_column_with_keys("a", keys=["key0", "key1", "key2"])
     self.assertEqual("a", column.name)
     column_copy = copy.deepcopy(column)
     self.assertEqual("a", column_copy.name)
     self.assertEqual(
         fc._SparseIdLookupConfig(  # pylint: disable=protected-access
             keys=("key0", "key1", "key2"),
             vocab_size=3,
             default_value=-1),
         column_copy.lookup_config)
     self.assertFalse(column_copy.is_integerized)
Example 28
  def testSharedEmbeddingColumnWithWeightedSparseColumn(self):
    # Tests creation of shared embeddings containing weighted sparse columns.
    sparse_col = fc.sparse_column_with_keys("a1", ["marlo", "omar", "stringer"])
    ids = fc.sparse_column_with_keys("ids", ["marlo", "omar", "stringer"])
    weighted_sparse_col = fc.weighted_sparse_column(ids, "weights")
    self.assertEqual(weighted_sparse_col.name, "ids_weighted_by_weights")

    b = fc.shared_embedding_columns([sparse_col, weighted_sparse_col],
                                    dimension=4, combiner="mean")
    self.assertEqual(len(b), 2)
    self.assertEqual(b[0].shared_embedding_name,
                     "a1_ids_weighted_by_weights_shared_embedding")
    self.assertEqual(b[1].shared_embedding_name,
                     "a1_ids_weighted_by_weights_shared_embedding")

    # Tries reversing order to check compatibility condition.
    b = fc.shared_embedding_columns([weighted_sparse_col, sparse_col],
                                    dimension=4, combiner="mean")
    self.assertEqual(len(b), 2)
    self.assertEqual(b[0].shared_embedding_name,
                     "a1_ids_weighted_by_weights_shared_embedding")
    self.assertEqual(b[1].shared_embedding_name,
                     "a1_ids_weighted_by_weights_shared_embedding")

    # Tries adding two weighted columns to check compatibility between them.
    weighted_sparse_col_2 = fc.weighted_sparse_column(ids, "weights_2")
    b = fc.shared_embedding_columns([weighted_sparse_col,
                                     weighted_sparse_col_2],
                                    dimension=4, combiner="mean")
    self.assertEqual(len(b), 2)
    self.assertEqual(
        b[0].shared_embedding_name,
        "ids_weighted_by_weights_ids_weighted_by_weights_2_shared_embedding"
    )
    self.assertEqual(
        b[1].shared_embedding_name,
        "ids_weighted_by_weights_ids_weighted_by_weights_2_shared_embedding"
    )
Example 29
  def testCreateSequenceFeatureSpec(self):
    sparse_col = fc.sparse_column_with_hash_bucket(
        "sparse_column", hash_bucket_size=100)
    embedding_col = fc.embedding_column(
        fc.sparse_column_with_hash_bucket(
            "sparse_column_for_embedding", hash_bucket_size=10),
        dimension=4)
    sparse_id_col = fc.sparse_column_with_keys("id_column",
                                               ["marlo", "omar", "stringer"])
    weighted_id_col = fc.weighted_sparse_column(sparse_id_col,
                                                "id_weights_column")
    real_valued_col1 = fc.real_valued_column("real_valued_column", dimension=2)
    real_valued_col2 = fc.real_valued_column(
        "real_valued_default_column", dimension=5, default_value=3.0)
    real_valued_col3 = fc._real_valued_var_len_column(
        "real_valued_var_len_column", default_value=3.0, is_sparse=True)
    real_valued_col4 = fc._real_valued_var_len_column(
        "real_valued_var_len_dense_column", default_value=4.0, is_sparse=False)

    feature_columns = set([
        sparse_col, embedding_col, weighted_id_col, real_valued_col1,
        real_valued_col2, real_valued_col3, real_valued_col4
    ])

    feature_spec = fc._create_sequence_feature_spec_for_parsing(feature_columns)

    expected_feature_spec = {
        "sparse_column":
            parsing_ops.VarLenFeature(dtypes.string),
        "sparse_column_for_embedding":
            parsing_ops.VarLenFeature(dtypes.string),
        "id_column":
            parsing_ops.VarLenFeature(dtypes.string),
        "id_weights_column":
            parsing_ops.VarLenFeature(dtypes.float32),
        "real_valued_column":
            parsing_ops.FixedLenSequenceFeature(
                shape=[2], dtype=dtypes.float32, allow_missing=False),
        "real_valued_default_column":
            parsing_ops.FixedLenSequenceFeature(
                shape=[5], dtype=dtypes.float32, allow_missing=True),
        "real_valued_var_len_column":
            parsing_ops.VarLenFeature(dtype=dtypes.float32),
        "real_valued_var_len_dense_column":
            parsing_ops.FixedLenSequenceFeature(
                shape=[], dtype=dtypes.float32, allow_missing=True,
                default_value=4.0),
    }

    self.assertDictEqual(expected_feature_spec, feature_spec)
Example 31
    def testPrepareInputsForRnnSparseAndDense(self):
        num_unroll = 2
        embedding_dimension = 8
        dense_dimension = 2

        expected = [
            np.array([[1., 1., 1., 1., 1., 1., 1., 1., 111., 112.],
                      [1., 1., 1., 1., 1., 1., 1., 1., 211., 212.],
                      [1., 1., 1., 1., 1., 1., 1., 1., 311., 312.]]),
            np.array([[1., 1., 1., 1., 1., 1., 1., 1., 121., 122.],
                      [2., 2., 2., 2., 2., 2., 2., 2., 221., 222.],
                      [1., 1., 1., 1., 1., 1., 1., 1., 321., 322.]])
        ]

        sequence_features = {
            'wire_cast':
                sparse_tensor.SparseTensor(
                    indices=[[0, 0, 0], [0, 1, 0], [1, 0, 0], [1, 1, 0],
                             [1, 1, 1], [2, 0, 0], [2, 1, 1]],
                    values=[
                        b'marlo', b'stringer', b'omar', b'stringer', b'marlo',
                        b'marlo', b'omar'
                    ],
                    dense_shape=[3, 2, 2]),
            'seq_feature0':
                constant_op.constant([[[111., 112.], [121., 122.]],
                                      [[211., 212.], [221., 222.]],
                                      [[311., 312.], [321., 322.]]])
        }

        wire_cast = feature_column.sparse_column_with_keys(
            'wire_cast', ['marlo', 'omar', 'stringer'])
        wire_cast_embedded = feature_column.embedding_column(
            wire_cast,
            dimension=embedding_dimension,
            combiner='sum',
            initializer=init_ops.ones_initializer())
        seq_feature0_column = feature_column.real_valued_column(
            'seq_feature0', dimension=dense_dimension)

        sequence_feature_columns = [seq_feature0_column, wire_cast_embedded]

        context_features = None

        self._test_prepare_inputs_for_rnn(sequence_features, context_features,
                                          sequence_feature_columns, num_unroll,
                                          expected)
Example 32
 def testMissingValueInOneHotColumnForWeightedSparseColumn(self):
   # Github issue 12583
   ids = fc.sparse_column_with_keys("ids", ["marlo", "omar", "stringer"])
   weighted_ids = fc.weighted_sparse_column(ids, "weights")
   one_hot = fc.one_hot_column(weighted_ids)
   features = {
       'ids': constant_op.constant([['marlo', 'unknown', 'omar']]),
       'weights': constant_op.constant([[2., 4., 6.]])
   }
   one_hot_tensor = feature_column_ops.input_from_feature_columns(
     features, [one_hot])
   with self.test_session() as sess:
     sess.run(variables.global_variables_initializer())
     sess.run(lookup_ops.tables_initializer())
     self.assertAllEqual([[2., 6., 0.]], one_hot_tensor.eval())
Example 34
  def testWeightedSparseColumnDtypes(self):
    ids = fc.sparse_column_with_keys("ids", ["marlo", "omar", "stringer"])
    weighted_ids = fc.weighted_sparse_column(ids, "weights")
    self.assertDictEqual({
        "ids": parsing_ops.VarLenFeature(dtypes.string),
        "weights": parsing_ops.VarLenFeature(dtypes.float32)
    }, weighted_ids.config)

    weighted_ids = fc.weighted_sparse_column(ids, "weights", dtype=dtypes.int32)
    self.assertDictEqual({
        "ids": parsing_ops.VarLenFeature(dtypes.string),
        "weights": parsing_ops.VarLenFeature(dtypes.int32)
    }, weighted_ids.config)

    with self.assertRaisesRegexp(ValueError,
                                 "dtype is not convertible to float"):
      weighted_ids = fc.weighted_sparse_column(
          ids, "weights", dtype=dtypes.string)
Example 35
  def testOneHotReshaping(self):
    """Tests reshaping behavior of `OneHotColumn`."""
    id_tensor_shape = [3, 2, 4, 5]

    sparse_column = fc.sparse_column_with_keys(
        "animals", ["squirrel", "moose", "dragon", "octopus"])
    one_hot = fc.one_hot_column(sparse_column)

    vocab_size = len(sparse_column.lookup_config.keys)
    id_tensor = _sparse_id_tensor(id_tensor_shape, vocab_size)

    for output_rank in range(1, len(id_tensor_shape) + 1):
      with variable_scope.variable_scope("output_rank_{}".format(output_rank)):
        one_hot_output = one_hot._to_dnn_input_layer(
            id_tensor, output_rank=output_rank)
      with self.test_session() as sess:
        one_hot_value = sess.run(one_hot_output)
        expected_shape = (id_tensor_shape[:output_rank - 1] + [vocab_size])
        self.assertEquals(expected_shape, list(one_hot_value.shape))
Example 37
    def testPrepareInputsForRnnSparse(self):
        num_unroll = 2
        embedding_dimension = 8

        expected = [
            np.array([[1., 1., 1., 1., 1., 1., 1., 1.],
                      [1., 1., 1., 1., 1., 1., 1., 1.],
                      [1., 1., 1., 1., 1., 1., 1., 1.]]),
            np.array([[1., 1., 1., 1., 1., 1., 1., 1.],
                      [2., 2., 2., 2., 2., 2., 2., 2.],
                      [1., 1., 1., 1., 1., 1., 1., 1.]])
        ]

        sequence_features = {
            'wire_cast':
                sparse_tensor.SparseTensor(
                    indices=[[0, 0, 0], [0, 1, 0], [1, 0, 0], [1, 1, 0],
                             [1, 1, 1], [2, 0, 0], [2, 1, 1]],
                    values=[
                        b'marlo', b'stringer', b'omar', b'stringer', b'marlo',
                        b'marlo', b'omar'
                    ],
                    dense_shape=[3, 2, 2])
        }

        wire_cast = feature_column.sparse_column_with_keys(
            'wire_cast', ['marlo', 'omar', 'stringer'])
        sequence_feature_columns = [
            feature_column.embedding_column(
                wire_cast,
                dimension=embedding_dimension,
                combiner='sum',
                initializer=init_ops.ones_initializer())
        ]

        context_features = None

        self._test_prepare_inputs_for_rnn(sequence_features, context_features,
                                          sequence_feature_columns, num_unroll,
                                          expected)
Example 40
    def gen_feature_column(self, feature_conf):
        feature_name = feature_conf['feature_name']

        if "comment" in feature_conf:
            return None

        if "vocab_size" in feature_conf:
            id_feature = fc.sparse_column_with_keys(
                column_name=feature_name,
                keys=[str(i) for i in range(feature_conf['vocab_size'])])

            return fc._EmbeddingColumn(
                id_feature,
                dimension=feature_conf['embedding_dimension'],
                shared_embedding_name=feature_conf.get('name'),
            )
        elif 'hash_bucket_size' in feature_conf:
            id_feature = tf.contrib.layers.sparse_column_with_hash_bucket(
                column_name=feature_name,
                hash_bucket_size=feature_conf['hash_bucket_size'],
                # use_hashmap=use_hashmap
            )
            return fc._EmbeddingColumn(
                id_feature,
                dimension=feature_conf['embedding_dimension'],
                shared_embedding_name=feature_conf.get('shared_name', None),
                max_norm=None)

        else:
            return tf.contrib.layers.real_valued_column(
                column_name=feature_name,
                dimension=feature_conf.get('dimension', 1),
                default_value=[
                    0.0 for _ in range(int(feature_conf.get('dimension', 1)))
                ],
                normalizer=None if 'l2_norm' not in feature_conf else
                lambda x: tf.nn.l2_normalize(x, dim=-1))
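For reference, hypothetical configs that would exercise each branch of gen_feature_column above; the dict keys mirror the literals the method reads, and the values are made up:

    # Vocabulary branch: sparse_column_with_keys wrapped in an embedding.
    vocab_conf = {"feature_name": "gender", "vocab_size": 3,
                  "embedding_dimension": 8, "name": "gender_embedding"}
    # Hash-bucket branch: sparse_column_with_hash_bucket plus embedding.
    hash_conf = {"feature_name": "city", "hash_bucket_size": 1000,
                 "embedding_dimension": 16}
    # Fallback branch: a real-valued column with zero defaults.
    dense_conf = {"feature_name": "age", "dimension": 1}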
Example 44
    def testInitCrossedColumnWeightsFromCkpt(self):
        sparse_col_1 = fc.sparse_column_with_hash_bucket(column_name="col_1",
                                                         hash_bucket_size=4)
        sparse_col_2 = fc.sparse_column_with_keys(column_name="col_2",
                                                  keys=("foo", "bar", "baz"))
        sparse_col_3 = fc.sparse_column_with_keys(column_name="col_3",
                                                  keys=(42, 1, -1000),
                                                  dtype=dtypes.int64)

        crossed_col = fc.crossed_column(
            columns=[sparse_col_1, sparse_col_2, sparse_col_3],
            hash_bucket_size=4)

        input_tensor = sparse_tensor_lib.SparseTensor(indices=[[0, 0], [1, 1],
                                                               [2, 2], [3, 3]],
                                                      values=[0, 1, 2, 3],
                                                      dense_shape=[4, 4])

        # Invoking 'weighted_sum_from_feature_columns' will create the crossed
        # column weights variable.
        with variable_scope.variable_scope("run_1"):
            with variable_scope.variable_scope(crossed_col.name):
                # Returns the looked-up column weights, which are the same as
                # the crossed column weights, as well as actual references to
                # the weights variables.
                _, col_weights, _ = (
                    feature_column_ops.weighted_sum_from_feature_columns(
                        {
                            sparse_col_1.name: input_tensor,
                            sparse_col_2.name: input_tensor,
                            sparse_col_3.name: input_tensor
                        }, [crossed_col], 1))
                # Update the weights, since the default initializer sets all
                # weights to 0.0.
                for weight in col_weights.values():
                    assign_op = state_ops.assign(weight[0], weight[0] + 0.5)

        save = saver.Saver()
        ckpt_dir_prefix = os.path.join(self.get_temp_dir(),
                                       "init_crossed_col_w_from_ckpt")
        ckpt_dir = tempfile.mkdtemp(prefix=ckpt_dir_prefix)
        checkpoint_path = os.path.join(ckpt_dir, "model.ckpt")

        with self.test_session() as sess:
            sess.run(variables.global_variables_initializer())
            sess.run(assign_op)
            saved_col_weights = col_weights[crossed_col][0].eval()
            save.save(sess, checkpoint_path)

        crossed_col_initialized = fc.crossed_column(
            columns=[sparse_col_1, sparse_col_2],
            hash_bucket_size=4,
            ckpt_to_load_from=checkpoint_path,
            tensor_name_in_ckpt=("run_1/col_1_X_col_2_X_col_3/"
                                 "weighted_sum_from_feature_columns/"
                                 "col_1_X_col_2_X_col_3/weights"))

        with variable_scope.variable_scope("run_2"):
            # This will initialize the crossed column weights from the provided
            # checkpoint and return a [4, 1] tensor that is the same as the
            # weights variable. Since we won't modify the weights, this should
            # be the same as 'saved_col_weights'.
            _, col_weights, _ = (
                feature_column_ops.weighted_sum_from_feature_columns(
                    {
                        sparse_col_1.name: input_tensor,
                        sparse_col_2.name: input_tensor
                    }, [crossed_col_initialized], 1))
            col_weights_from_ckpt = col_weights[crossed_col_initialized][0]

        with self.test_session() as sess:
            sess.run(variables.global_variables_initializer())
            loaded_col_weights = col_weights_from_ckpt.eval()

        self.assertAllClose(saved_col_weights, loaded_col_weights)
Example 45
  def testCreateFeatureSpec(self):
    sparse_col = fc.sparse_column_with_hash_bucket(
        "sparse_column", hash_bucket_size=100)
    embedding_col = fc.embedding_column(
        fc.sparse_column_with_hash_bucket(
            "sparse_column_for_embedding", hash_bucket_size=10),
        dimension=4)
    str_sparse_id_col = fc.sparse_column_with_keys(
        "str_id_column", ["marlo", "omar", "stringer"])
    int32_sparse_id_col = fc.sparse_column_with_keys(
        "int32_id_column", [42, 1, -1000], dtype=dtypes.int32)
    int64_sparse_id_col = fc.sparse_column_with_keys(
        "int64_id_column", [42, 1, -1000], dtype=dtypes.int64)
    weighted_id_col = fc.weighted_sparse_column(str_sparse_id_col,
                                                "str_id_weights_column")
    real_valued_col1 = fc.real_valued_column("real_valued_column1")
    real_valued_col2 = fc.real_valued_column("real_valued_column2", 5)
    real_valued_col3 = fc._real_valued_var_len_column(
        "real_valued_column3", is_sparse=True)
    real_valued_col4 = fc._real_valued_var_len_column(
        "real_valued_column4", dtype=dtypes.int64, default_value=0,
        is_sparse=False)
    bucketized_col1 = fc.bucketized_column(
        fc.real_valued_column("real_valued_column_for_bucketization1"), [0, 4])
    bucketized_col2 = fc.bucketized_column(
        fc.real_valued_column("real_valued_column_for_bucketization2", 4),
        [0, 4])
    a = fc.sparse_column_with_hash_bucket("cross_aaa", hash_bucket_size=100)
    b = fc.sparse_column_with_hash_bucket("cross_bbb", hash_bucket_size=100)
    cross_col = fc.crossed_column(set([a, b]), hash_bucket_size=10000)
    one_hot_col = fc.one_hot_column(fc.sparse_column_with_hash_bucket(
        "sparse_column_for_one_hot", hash_bucket_size=100))
    scattered_embedding_col = fc.scattered_embedding_column(
        "scattered_embedding_column", size=100, dimension=10, hash_key=1)
    feature_columns = set([
        sparse_col, embedding_col, weighted_id_col, int32_sparse_id_col,
        int64_sparse_id_col, real_valued_col1, real_valued_col2,
        real_valued_col3, real_valued_col4, bucketized_col1, bucketized_col2,
        cross_col, one_hot_col, scattered_embedding_col
    ])
    expected_config = {
        "sparse_column":
            parsing_ops.VarLenFeature(dtypes.string),
        "sparse_column_for_embedding":
            parsing_ops.VarLenFeature(dtypes.string),
        "str_id_column":
            parsing_ops.VarLenFeature(dtypes.string),
        "int32_id_column":
            parsing_ops.VarLenFeature(dtypes.int32),
        "int64_id_column":
            parsing_ops.VarLenFeature(dtypes.int64),
        "str_id_weights_column":
            parsing_ops.VarLenFeature(dtypes.float32),
        "real_valued_column1":
            parsing_ops.FixedLenFeature(
                [1], dtype=dtypes.float32),
        "real_valued_column2":
            parsing_ops.FixedLenFeature(
                [5], dtype=dtypes.float32),
        "real_valued_column3":
            parsing_ops.VarLenFeature(dtype=dtypes.float32),
        "real_valued_column4":
            parsing_ops.FixedLenSequenceFeature(
                [], dtype=dtypes.int64, allow_missing=True, default_value=0),
        "real_valued_column_for_bucketization1":
            parsing_ops.FixedLenFeature(
                [1], dtype=dtypes.float32),
        "real_valued_column_for_bucketization2":
            parsing_ops.FixedLenFeature(
                [4], dtype=dtypes.float32),
        "cross_aaa":
            parsing_ops.VarLenFeature(dtypes.string),
        "cross_bbb":
            parsing_ops.VarLenFeature(dtypes.string),
        "sparse_column_for_one_hot":
            parsing_ops.VarLenFeature(dtypes.string),
        "scattered_embedding_column":
            parsing_ops.VarLenFeature(dtypes.string),
    }

    config = fc.create_feature_spec_for_parsing(feature_columns)
    self.assertDictEqual(expected_config, config)

    # Test that the same config is parsed out if we pass a dictionary.
    feature_columns_dict = {
        str(i): val
        for i, val in enumerate(feature_columns)
    }
    config = fc.create_feature_spec_for_parsing(feature_columns_dict)
    self.assertDictEqual(expected_config, config)
Example 46
def gen_feature(feature_conf):
    name = feature_conf[feature_name_key]
    value_type = feature_conf[value_type_key]

    if "vocab_size" in feature_conf:
        id_feature = fc.sparse_column_with_keys(
            column_name=name,
            # Keys are stringified so they match dtype=tf.string (as in the
            # analogous gen_feature_column in Example 40).
            keys=[str(i) for i in range(feature_conf['vocab_size'])],
            dtype=tf.string)

        return fc._EmbeddingColumn(
            id_feature,
            dimension=feature_conf['embedding_dimension'],
            shared_embedding_name=feature_conf.get(feature_name_key),
        )
    elif "hash_bucket_size" in feature_conf \
            and "embedding_dimension" not in feature_conf:
        if value_type == "Int":
            id_feature = layers.sparse_column_with_integerized_feature(
                column_name=name,
                bucket_size=feature_conf['hash_bucket_size'],
                combiner=_get_combiner(feature_conf),
                # use_hashmap=use_hashmap
            )
        else:
            id_feature = layers.sparse_column_with_hash_bucket(
                column_name=name,
                hash_bucket_size=feature_conf['hash_bucket_size'],
                combiner=_get_combiner(feature_conf),
                # use_hashmap=use_hashmap
            )
        return id_feature
    elif "embedding_dimension" in feature_conf \
            and "hash_bucket_size" in feature_conf \
            and "boundaries" not in feature_conf \
            and "vocabulary_file" not in feature_conf:
        if value_type == "Int":
            return _EmbeddingColumn(
                sparse_id_column=layers.sparse_column_with_integerized_feature(
                    column_name=name,
                    bucket_size=feature_conf['hash_bucket_size'],
                    combiner=_get_combiner(feature_conf),
                    # use_hashmap=use_hashmap
                ),
                dimension=feature_conf['embedding_dimension'],
                combiner=_get_combiner(feature_conf),
                shared_embedding_name=feature_conf.get('shared_name', None))
        else:
            id_feature = layers.sparse_column_with_hash_bucket(
                column_name=name,
                hash_bucket_size=feature_conf['hash_bucket_size'],
                # use_hashmap=use_hashmap
            )
            return _EmbeddingColumn(
                id_feature,
                dimension=feature_conf['embedding_dimension'],
                combiner=_get_combiner(feature_conf),
                shared_embedding_name=feature_conf.get('shared_name', None),
                max_norm=None)
    elif "embedding_dimension" in feature_conf \
            and "boundaries" not in feature_conf and "vocabulary_file" in feature_conf:
        use_hashmap = feature_conf.get("use_hashmap", False)
        if value_type == "Int":
            raise ValueError(
                "embedding with vocabulary_file does not support Int type")
        else:
            id_feature = fc.sparse_column_with_vocabulary_file(
                column_name=name,
                vocabulary_file=feature_conf["vocabulary_file"],
                num_oov_buckets=feature_conf["num_oov_buckets"],
                vocab_size=feature_conf["vocab_size"],
            )
            return _EmbeddingColumn(
                id_feature,
                dimension=feature_conf['embedding_dimension'],
                combiner=_get_combiner(feature_conf),
                shared_embedding_name=feature_conf.get('shared_name', None),
                max_norm=None)
    elif "embedding_dimension" in feature_conf \
            and "boundaries" in feature_conf:
        return embedding_bucketized_column(
            layers.real_valued_column(
                column_name=name,
                dimension=feature_conf.get('dimension', 1),
                default_value=[
                    0.0 for _ in range(int(feature_conf.get('dimension', 1)))
                ]),
            boundaries=[
                float(b) for b in feature_conf['boundaries'].split(',')
            ],
            embedding_dimension=feature_conf["embedding_dimension"],
            max_norm=None,
            shared_name=feature_conf.get('shared_name', None),
            add_random=feature_conf.get('add_random', False))
    elif "embedding_dimension" not in feature_conf \
            and "boundaries" in feature_conf:
        return layers.bucketized_column(
            layers.real_valued_column(
                column_name=name,
                dimension=feature_conf.get('dimension', 1),
                default_value=[
                    0.0 for _ in range(int(feature_conf.get('dimension', 1)))
                ]),
            boundaries=[
                float(b) for b in feature_conf['boundaries'].split(',')
            ])
    else:
        return layers.real_valued_column(
            column_name=name,
            dimension=feature_conf.get('dimension', 1),
            default_value=[
                0.0 for _ in range(int(feature_conf.get('dimension', 1)))
            ],
            normalizer=None if 'l2_norm' not in feature_conf else
            lambda x: tf.nn.l2_normalize(x, dim=-1))
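The branches above translate a plain configuration dict into contrib feature columns. As a rough usage sketch (the configs are hypothetical; feature_name_key, value_type_key, and _get_combiner are assumed to be defined elsewhere in the same module):

# Hypothetical configs exercising three branches of gen_feature.
hash_embedding_conf = {
    feature_name_key: "query_terms",   # assumed module-level key constant
    value_type_key: "String",          # assumed module-level key constant
    "hash_bucket_size": 1000,
    "embedding_dimension": 16,
    "shared_name": "query_doc_terms",  # reuse one embedding table by name
}
bucketized_conf = {
    feature_name_key: "price",
    value_type_key: "Double",
    "boundaries": "0,10,100,1000",     # split on ',' and bucketized above
}
dense_conf = {
    feature_name_key: "ctr_7d",
    value_type_key: "Double",
    "l2_norm": True,                   # enables the l2_normalize normalizer
}

columns = [gen_feature(conf)
           for conf in (hash_embedding_conf, bucketized_conf, dense_conf)]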
  def testWeightedSparseColumn(self):
    ids = fc.sparse_column_with_keys("ids", ["marlo", "omar", "stringer"])
    weighted_ids = fc.weighted_sparse_column(ids, "weights")
    self.assertEqual(weighted_ids.name, "ids_weighted_by_weights")
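weighted_sparse_column only names the weight feature; at input time the ids and their weights arrive as parallel SparseTensors with matching indices. A minimal sketch of that pairing (illustrative values, using the sparse_tensor_lib alias seen in the other snippets):

# Ids and their weights share the same sparse indices and dense shape.
ids_tensor = sparse_tensor_lib.SparseTensor(
    indices=[[0, 0], [1, 0], [1, 1]],
    values=["marlo", "omar", "stringer"],
    dense_shape=[2, 2])
weights_tensor = sparse_tensor_lib.SparseTensor(
    indices=[[0, 0], [1, 0], [1, 1]],
    values=[10.0, 20.0, 30.0],
    dense_shape=[2, 2])
features = {"ids": ids_tensor, "weights": weights_tensor}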
Example no. 48
  def testPrepareFeaturesForSQSS(self):
    mode = model_fn_lib.ModeKeys.TRAIN
    seq_feature_name = 'seq_feature'
    sparse_seq_feature_name = 'wire_cast'
    ctx_feature_name = 'ctx_feature'
    sequence_length = 4
    embedding_dimension = 8

    features = {
        sparse_seq_feature_name:
            sparse_tensor.SparseTensor(
                indices=[[0, 0, 0], [0, 1, 0], [1, 0, 0], [1, 1, 0], [1, 1, 1],
                         [2, 0, 0], [2, 1, 1]],
                values=[
                    b'marlo', b'stringer', b'omar', b'stringer', b'marlo',
                    b'marlo', b'omar'
                ],
                dense_shape=[3, 2, 2]),
        seq_feature_name:
            constant_op.constant(
                1.0, shape=[sequence_length]),
        ctx_feature_name:
            constant_op.constant(2.0)
    }

    labels = constant_op.constant(5.0, shape=[sequence_length])

    wire_cast = feature_column.sparse_column_with_keys(
        'wire_cast', ['marlo', 'omar', 'stringer'])
    sequence_feature_columns = [
        feature_column.real_valued_column(
            seq_feature_name, dimension=1), feature_column.embedding_column(
                wire_cast,
                dimension=embedding_dimension,
                initializer=init_ops.ones_initializer())
    ]

    context_feature_columns = [
        feature_column.real_valued_column(
            ctx_feature_name, dimension=1)
    ]

    expected_sequence = {
        rnn_common.RNNKeys.LABELS_KEY:
            np.array([5., 5., 5., 5.]),
        seq_feature_name:
            np.array([1., 1., 1., 1.]),
        sparse_seq_feature_name:
            sparse_tensor.SparseTensor(
                indices=[[0, 0, 0], [0, 1, 0], [1, 0, 0], [1, 1, 0], [1, 1, 1],
                         [2, 0, 0], [2, 1, 1]],
                values=[
                    b'marlo', b'stringer', b'omar', b'stringer', b'marlo',
                    b'marlo', b'omar'
                ],
                dense_shape=[3, 2, 2]),
    }

    expected_context = {ctx_feature_name: 2.}

    sequence, context = ssre._prepare_features_for_sqss(
        features, labels, mode, sequence_feature_columns,
        context_feature_columns)

    def assert_equal(expected, got):
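      # sorted() over a dict iterates its keys, so this compares key sets.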
      self.assertEqual(sorted(expected), sorted(got))
      for k, v in expected.items():
        if isinstance(v, sparse_tensor.SparseTensor):
          self.assertAllEqual(v.values.eval(), got[k].values)
          self.assertAllEqual(v.indices.eval(), got[k].indices)
          self.assertAllEqual(v.dense_shape.eval(), got[k].dense_shape)
        else:
          self.assertAllEqual(v, got[k])

    with self.test_session() as sess:
      sess.run(variables.global_variables_initializer())
      sess.run(lookup_ops.tables_initializer())
      actual_sequence, actual_context = sess.run(
          [sequence, context])
      assert_equal(expected_sequence, actual_sequence)
      assert_equal(expected_context, actual_context)
  def testCreateFeatureSpec(self):
    sparse_col = fc.sparse_column_with_hash_bucket(
        "sparse_column", hash_bucket_size=100)
    embedding_col = fc.embedding_column(
        fc.sparse_column_with_hash_bucket(
            "sparse_column_for_embedding", hash_bucket_size=10),
        dimension=4)
    sparse_id_col = fc.sparse_column_with_keys("id_column",
                                               ["marlo", "omar", "stringer"])
    weighted_id_col = fc.weighted_sparse_column(sparse_id_col,
                                                "id_weights_column")
    real_valued_col1 = fc.real_valued_column("real_valued_column1")
    real_valued_col2 = fc.real_valued_column("real_valued_column2", 5)
    real_valued_col3 = fc.real_valued_column(
        "real_valued_column3", dimension=None)
    bucketized_col1 = fc.bucketized_column(
        fc.real_valued_column("real_valued_column_for_bucketization1"), [0, 4])
    bucketized_col2 = fc.bucketized_column(
        fc.real_valued_column("real_valued_column_for_bucketization2", 4),
        [0, 4])
    a = fc.sparse_column_with_hash_bucket("cross_aaa", hash_bucket_size=100)
    b = fc.sparse_column_with_hash_bucket("cross_bbb", hash_bucket_size=100)
    cross_col = fc.crossed_column(set([a, b]), hash_bucket_size=10000)
    feature_columns = set([
        sparse_col, embedding_col, weighted_id_col, real_valued_col1,
        real_valued_col2, real_valued_col3, bucketized_col1, bucketized_col2,
        cross_col
    ])
    expected_config = {
        "sparse_column":
            parsing_ops.VarLenFeature(dtypes.string),
        "sparse_column_for_embedding":
            parsing_ops.VarLenFeature(dtypes.string),
        "id_column":
            parsing_ops.VarLenFeature(dtypes.string),
        "id_weights_column":
            parsing_ops.VarLenFeature(dtypes.float32),
        "real_valued_column1":
            parsing_ops.FixedLenFeature(
                [1], dtype=dtypes.float32),
        "real_valued_column2":
            parsing_ops.FixedLenFeature(
                [5], dtype=dtypes.float32),
        "real_valued_column3":
            parsing_ops.VarLenFeature(dtype=dtypes.float32),
        "real_valued_column_for_bucketization1":
            parsing_ops.FixedLenFeature(
                [1], dtype=dtypes.float32),
        "real_valued_column_for_bucketization2":
            parsing_ops.FixedLenFeature(
                [4], dtype=dtypes.float32),
        "cross_aaa":
            parsing_ops.VarLenFeature(dtypes.string),
        "cross_bbb":
            parsing_ops.VarLenFeature(dtypes.string)
    }

    config = fc.create_feature_spec_for_parsing(feature_columns)
    self.assertDictEqual(expected_config, config)

    # Test that the same config is parsed out if we pass a dictionary.
    feature_columns_dict = {
        str(i): val
        for i, val in enumerate(feature_columns)
    }
    config = fc.create_feature_spec_for_parsing(feature_columns_dict)
    self.assertDictEqual(expected_config, config)
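The spec returned by create_feature_spec_for_parsing is an ordinary dict of FixedLenFeature/VarLenFeature entries, so it can be handed straight to the standard Example parser. A minimal sketch (the placeholder input is assumed, as is an array_ops import):

# Parse serialized tf.Example protos with the generated spec.
serialized = array_ops.placeholder(dtypes.string, shape=[None])
feature_map = parsing_ops.parse_example(serialized, features=config)
# feature_map["sparse_column"] is a SparseTensor, while
# feature_map["real_valued_column2"] is a dense [batch_size, 5] float32
# Tensor, mirroring the spec entries above.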
  def testPrepareFeaturesForSQSS(self):
    mode = model_fn_lib.ModeKeys.TRAIN
    seq_feature_name = 'seq_feature'
    sparse_seq_feature_name = 'wire_cast'
    ctx_feature_name = 'ctx_feature'
    sequence_length = 4
    embedding_dimension = 8

    features = {
        sparse_seq_feature_name:
            sparse_tensor.SparseTensor(
                indices=[[0, 0, 0], [0, 1, 0], [1, 0, 0], [1, 1, 0], [1, 1, 1],
                         [2, 0, 0], [2, 1, 1]],
                values=[
                    b'marlo', b'stringer', b'omar', b'stringer', b'marlo',
                    b'marlo', b'omar'
                ],
                dense_shape=[3, 2, 2]),
        seq_feature_name:
            constant_op.constant(
                1.0, shape=[sequence_length]),
        ctx_feature_name:
            constant_op.constant(2.0)
    }

    labels = constant_op.constant(5.0, shape=[sequence_length])

    wire_cast = feature_column.sparse_column_with_keys(
        'wire_cast', ['marlo', 'omar', 'stringer'])
    sequence_feature_columns = [
        feature_column.real_valued_column(
            seq_feature_name, dimension=1), feature_column.embedding_column(
                wire_cast,
                dimension=embedding_dimension,
                initializer=init_ops.ones_initializer())
    ]

    context_feature_columns = [
        feature_column.real_valued_column(
            ctx_feature_name, dimension=1)
    ]

    expected_sequence = {
        rnn_common.RNNKeys.LABELS_KEY:
            np.array([5., 5., 5., 5.]),
        seq_feature_name:
            np.array([1., 1., 1., 1.]),
        sparse_seq_feature_name:
            sparse_tensor.SparseTensor(
                indices=[[0, 0, 0], [0, 1, 0], [1, 0, 0], [1, 1, 0], [1, 1, 1],
                         [2, 0, 0], [2, 1, 1]],
                values=[
                    b'marlo', b'stringer', b'omar', b'stringer', b'marlo',
                    b'marlo', b'omar'
                ],
                dense_shape=[3, 2, 2]),
    }

    expected_context = {ctx_feature_name: 2.}

    sequence, context = ssre._prepare_features_for_sqss(
        features, labels, mode, sequence_feature_columns,
        context_feature_columns)

    def assert_equal(expected, got):
      self.assertEqual(sorted(expected), sorted(got))
      for k, v in expected.items():
        if isinstance(v, sparse_tensor.SparseTensor):
          self.assertAllEqual(v.values.eval(), got[k].values)
          self.assertAllEqual(v.indices.eval(), got[k].indices)
          self.assertAllEqual(v.dense_shape.eval(), got[k].dense_shape)
        else:
          self.assertAllEqual(v, got[k])

    with self.test_session() as sess:
      sess.run(variables.global_variables_initializer())
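      # initialize_all_tables is the deprecated predecessor of the
      # lookup_ops.tables_initializer() call used in the variant above.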
      sess.run(data_flow_ops.initialize_all_tables())
      actual_sequence, actual_context = sess.run(
          [sequence, context])
      assert_equal(expected_sequence, actual_sequence)
      assert_equal(expected_context, actual_context)
Example no. 51
  def testLearnLyrics(self):
    lyrics = 'if I go there will be trouble and if I stay it will be double'
    lyrics_list = lyrics.split()
    sequence_length = len(lyrics_list)
    vocab = set(lyrics_list)
    batch_size = 16
    num_classes = len(vocab)
    num_unroll = 7  # not a divisor of sequence_length
    train_steps = 350
    eval_steps = 30
    num_units = [4]
    learning_rate = 0.4
    accuracy_threshold = 0.65

    def get_lyrics_input_fn(seed):

      def input_fn():
        start = random_ops.random_uniform(
            (), minval=0, maxval=sequence_length, dtype=dtypes.int32, seed=seed)
        # Concatenate lyrics_list so inputs and labels wrap when start > 0.
        lyrics_list_concat = lyrics_list + lyrics_list
        inputs_dense = array_ops.slice(lyrics_list_concat, [start],
                                       [sequence_length])
        indices = array_ops.constant(
            [[i, 0] for i in range(sequence_length)], dtype=dtypes.int64)
        dense_shape = [sequence_length, 1]
        inputs = sparse_tensor.SparseTensor(
            indices=indices, values=inputs_dense, dense_shape=dense_shape)
        table = lookup.string_to_index_table_from_tensor(
            mapping=list(vocab), default_value=-1, name='lookup')
        labels = table.lookup(
            array_ops.slice(lyrics_list_concat, [start + 1], [sequence_length]))
        return {'lyrics': inputs}, labels

      return input_fn

    sequence_feature_columns = [
        feature_column.embedding_column(
            feature_column.sparse_column_with_keys('lyrics', vocab),
            dimension=8)
    ]
    config = run_config.RunConfig(tf_random_seed=21212)
    sequence_estimator = ssre.StateSavingRnnEstimator(
        constants.ProblemType.CLASSIFICATION,
        num_units=num_units,
        cell_type='basic_rnn',
        num_unroll=num_unroll,
        batch_size=batch_size,
        sequence_feature_columns=sequence_feature_columns,
        num_classes=num_classes,
        learning_rate=learning_rate,
        config=config,
        predict_probabilities=True,
        queue_capacity=2 + batch_size,
        seed=1234)

    train_input_fn = get_lyrics_input_fn(seed=12321)
    eval_input_fn = get_lyrics_input_fn(seed=32123)

    sequence_estimator.fit(input_fn=train_input_fn, steps=train_steps)

    evaluation = sequence_estimator.evaluate(
        input_fn=eval_input_fn, steps=eval_steps)
    accuracy = evaluation['accuracy']
    self.assertGreater(accuracy, accuracy_threshold,
                       'Accuracy should be higher than {}; got {}'.format(
                           accuracy_threshold, accuracy))
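The lyric has 15 words while num_unroll is 7, so each sequence is padded up to the next multiple of num_unroll before being chunked (assuming the padding behavior of batch_sequences_with_states, which this estimator relies on internally). A quick check of that arithmetic:

# 15 words, unrolled 7 steps at a time -> padded to 3 chunks of 7.
num_unroll = 7
sequence_length = 15
num_chunks = -(-sequence_length // num_unroll)  # ceiling division -> 3
padded_length = num_chunks * num_unroll         # 21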
    def testCreateFeatureSpec(self):
        sparse_col = fc.sparse_column_with_hash_bucket("sparse_column",
                                                       hash_bucket_size=100)
        embedding_col = fc.embedding_column(fc.sparse_column_with_hash_bucket(
            "sparse_column_for_embedding", hash_bucket_size=10),
                                            dimension=4)
        sparse_id_col = fc.sparse_column_with_keys(
            "id_column", ["marlo", "omar", "stringer"])
        weighted_id_col = fc.weighted_sparse_column(sparse_id_col,
                                                    "id_weights_column")
        real_valued_col1 = fc.real_valued_column("real_valued_column1")
        real_valued_col2 = fc.real_valued_column("real_valued_column2", 5)
        real_valued_col3 = fc.real_valued_column("real_valued_column3",
                                                 dimension=None)
        bucketized_col1 = fc.bucketized_column(
            fc.real_valued_column("real_valued_column_for_bucketization1"),
            [0, 4])
        bucketized_col2 = fc.bucketized_column(
            fc.real_valued_column("real_valued_column_for_bucketization2", 4),
            [0, 4])
        a = fc.sparse_column_with_hash_bucket("cross_aaa",
                                              hash_bucket_size=100)
        b = fc.sparse_column_with_hash_bucket("cross_bbb",
                                              hash_bucket_size=100)
        cross_col = fc.crossed_column(set([a, b]), hash_bucket_size=10000)
        feature_columns = set([
            sparse_col, embedding_col, weighted_id_col, real_valued_col1,
            real_valued_col2, real_valued_col3, bucketized_col1,
            bucketized_col2, cross_col
        ])
        expected_config = {
            "sparse_column":
            parsing_ops.VarLenFeature(dtypes.string),
            "sparse_column_for_embedding":
            parsing_ops.VarLenFeature(dtypes.string),
            "id_column":
            parsing_ops.VarLenFeature(dtypes.string),
            "id_weights_column":
            parsing_ops.VarLenFeature(dtypes.float32),
            "real_valued_column1":
            parsing_ops.FixedLenFeature([1], dtype=dtypes.float32),
            "real_valued_column2":
            parsing_ops.FixedLenFeature([5], dtype=dtypes.float32),
            "real_valued_column3":
            parsing_ops.VarLenFeature(dtype=dtypes.float32),
            "real_valued_column_for_bucketization1":
            parsing_ops.FixedLenFeature([1], dtype=dtypes.float32),
            "real_valued_column_for_bucketization2":
            parsing_ops.FixedLenFeature([4], dtype=dtypes.float32),
            "cross_aaa":
            parsing_ops.VarLenFeature(dtypes.string),
            "cross_bbb":
            parsing_ops.VarLenFeature(dtypes.string)
        }

        config = fc.create_feature_spec_for_parsing(feature_columns)
        self.assertDictEqual(expected_config, config)

        # Test that the same config is parsed out if we pass a dictionary.
        feature_columns_dict = {
            str(i): val
            for i, val in enumerate(feature_columns)
        }
        config = fc.create_feature_spec_for_parsing(feature_columns_dict)
        self.assertDictEqual(expected_config, config)
Example no. 56
  def testCreateFeatureSpec(self):
    sparse_col = fc.sparse_column_with_hash_bucket(
        "sparse_column", hash_bucket_size=100)
    embedding_col = fc.embedding_column(
        fc.sparse_column_with_hash_bucket(
            "sparse_column_for_embedding", hash_bucket_size=10),
        dimension=4)
    str_sparse_id_col = fc.sparse_column_with_keys(
        "str_id_column", ["marlo", "omar", "stringer"])
    int32_sparse_id_col = fc.sparse_column_with_keys(
        "int32_id_column", [42, 1, -1000], dtype=dtypes.int32)
    int64_sparse_id_col = fc.sparse_column_with_keys(
        "int64_id_column", [42, 1, -1000], dtype=dtypes.int64)
    weighted_id_col = fc.weighted_sparse_column(str_sparse_id_col,
                                                "str_id_weights_column")
    real_valued_col1 = fc.real_valued_column("real_valued_column1")
    real_valued_col2 = fc.real_valued_column("real_valued_column2", 5)
    bucketized_col1 = fc.bucketized_column(
        fc.real_valued_column("real_valued_column_for_bucketization1"), [0, 4])
    bucketized_col2 = fc.bucketized_column(
        fc.real_valued_column("real_valued_column_for_bucketization2", 4),
        [0, 4])
    a = fc.sparse_column_with_hash_bucket("cross_aaa", hash_bucket_size=100)
    b = fc.sparse_column_with_hash_bucket("cross_bbb", hash_bucket_size=100)
    cross_col = fc.crossed_column(set([a, b]), hash_bucket_size=10000)
    one_hot_col = fc.one_hot_column(fc.sparse_column_with_hash_bucket(
        "sparse_column_for_one_hot", hash_bucket_size=100))
    scattered_embedding_col = fc.scattered_embedding_column(
        "scattered_embedding_column", size=100, dimension=10, hash_key=1)
    feature_columns = set([
        sparse_col, embedding_col, weighted_id_col, int32_sparse_id_col,
        int64_sparse_id_col, real_valued_col1, real_valued_col2,
        bucketized_col1, bucketized_col2, cross_col, one_hot_col,
        scattered_embedding_col
    ])
    expected_config = {
        "sparse_column":
            parsing_ops.VarLenFeature(dtypes.string),
        "sparse_column_for_embedding":
            parsing_ops.VarLenFeature(dtypes.string),
        "str_id_column":
            parsing_ops.VarLenFeature(dtypes.string),
        "int32_id_column":
            parsing_ops.VarLenFeature(dtypes.int32),
        "int64_id_column":
            parsing_ops.VarLenFeature(dtypes.int64),
        "str_id_weights_column":
            parsing_ops.VarLenFeature(dtypes.float32),
        "real_valued_column1":
            parsing_ops.FixedLenFeature(
                [1], dtype=dtypes.float32),
        "real_valued_column2":
            parsing_ops.FixedLenFeature(
                [5], dtype=dtypes.float32),
        "real_valued_column_for_bucketization1":
            parsing_ops.FixedLenFeature(
                [1], dtype=dtypes.float32),
        "real_valued_column_for_bucketization2":
            parsing_ops.FixedLenFeature(
                [4], dtype=dtypes.float32),
        "cross_aaa":
            parsing_ops.VarLenFeature(dtypes.string),
        "cross_bbb":
            parsing_ops.VarLenFeature(dtypes.string),
        "sparse_column_for_one_hot":
            parsing_ops.VarLenFeature(dtypes.string),
        "scattered_embedding_column":
            parsing_ops.VarLenFeature(dtypes.string),
    }

    config = fc.create_feature_spec_for_parsing(feature_columns)
    self.assertDictEqual(expected_config, config)

    # Tests that contrib feature columns work with core library:
    config_core = fc_core.make_parse_example_spec(feature_columns)
    self.assertDictEqual(expected_config, config_core)

    # Test that the same config is parsed out if we pass a dictionary.
    feature_columns_dict = {
        str(i): val
        for i, val in enumerate(feature_columns)
    }
    config = fc.create_feature_spec_for_parsing(feature_columns_dict)
    self.assertDictEqual(expected_config, config)
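Besides building the contrib spec, this variant checks that fc_core.make_parse_example_spec produces an identical dict from the same columns, so either spec can drive parsing interchangeably. A minimal sketch of that equivalence (placeholder input assumed):

# Either spec parses the same serialized tf.Example batch identically.
serialized = array_ops.placeholder(dtypes.string, shape=[None])
parsed_contrib = parsing_ops.parse_example(serialized, features=config)
parsed_core = parsing_ops.parse_example(serialized, features=config_core)
# Both feature maps share keys and feature types, per assertDictEqual above.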