Ejemplo n.º 1
0
  def testInt32WeightedSparseInt64ColumnDtypes(self):
    ids = fc.sparse_column_with_keys("ids", [42, 1, -1000], dtype=dtypes.int64)
    weighted_ids = fc.weighted_sparse_column(ids, "weights", dtype=dtypes.int32)
    self.assertDictEqual({
        "ids": parsing_ops.VarLenFeature(dtypes.int64),
        "weights": parsing_ops.VarLenFeature(dtypes.int32)
    }, weighted_ids.config)

    with self.assertRaisesRegexp(ValueError,
                                 "dtype is not convertible to float"):
      weighted_ids = fc.weighted_sparse_column(
          ids, "weights", dtype=dtypes.string)
Ejemplo n.º 2
0
  def testInt32WeightedSparseInt64ColumnDtypes(self):
    ids = fc.sparse_column_with_keys("ids", [42, 1, -1000], dtype=dtypes.int64)
    weighted_ids = fc.weighted_sparse_column(ids, "weights", dtype=dtypes.int32)
    self.assertDictEqual({
        "ids": parsing_ops.VarLenFeature(dtypes.int64),
        "weights": parsing_ops.VarLenFeature(dtypes.int32)
    }, weighted_ids.config)

    with self.assertRaisesRegexp(ValueError,
                                 "dtype is not convertible to float"):
      weighted_ids = fc.weighted_sparse_column(
          ids, "weights", dtype=dtypes.string)
Ejemplo n.º 3
0
 def testFloat32WeightedSparseStringColumnDtypes(self):
   ids = fc.sparse_column_with_keys("ids", ["marlo", "omar", "stringer"])
   weighted_ids = fc.weighted_sparse_column(ids, "weights")
   self.assertDictEqual({
       "ids": parsing_ops.VarLenFeature(dtypes.string),
       "weights": parsing_ops.VarLenFeature(dtypes.float32)
   }, weighted_ids.config)
Ejemplo n.º 4
0
 def testFloat32WeightedSparseInt32ColumnDtypes(self):
   ids = fc.sparse_column_with_keys("ids", [42, 1, -1000], dtype=dtypes.int32)
   weighted_ids = fc.weighted_sparse_column(ids, "weights")
   self.assertDictEqual({
       "ids": parsing_ops.VarLenFeature(dtypes.int32),
       "weights": parsing_ops.VarLenFeature(dtypes.float32)
   }, weighted_ids.config)
Ejemplo n.º 5
0
 def testWeightedSparseColumnDeepCopy(self):
   ids = fc.sparse_column_with_keys("ids", ["marlo", "omar", "stringer"])
   weighted = fc.weighted_sparse_column(ids, "weights")
   weighted_copy = copy.deepcopy(weighted)
   self.assertEqual(weighted_copy.sparse_id_column.name, "ids")
   self.assertEqual(weighted_copy.weight_column_name, "weights")
   self.assertEqual(weighted_copy.name, "ids_weighted_by_weights")
Ejemplo n.º 6
0
 def testWeightedSparseColumnDeepCopy(self):
   ids = fc.sparse_column_with_keys("ids", ["marlo", "omar", "stringer"])
   weighted = fc.weighted_sparse_column(ids, "weights")
   weighted_copy = copy.deepcopy(weighted)
   self.assertEqual(weighted_copy.sparse_id_column.name, "ids")
   self.assertEqual(weighted_copy.weight_column_name, "weights")
   self.assertEqual(weighted_copy.name, "ids_weighted_by_weights")
 def testOneHotColumnForWeightedSparseColumn(self):
     ids = fc.sparse_column_with_keys("ids", ["marlo", "omar", "stringer"])
     weighted_ids = fc.weighted_sparse_column(ids, "weights")
     one_hot = fc.one_hot_column(weighted_ids)
     self.assertEqual(one_hot.sparse_id_column.name,
                      "ids_weighted_by_weights")
     self.assertEqual(one_hot.length, 3)
Ejemplo n.º 8
0
  def testWeightedSparseFeatures(self):
    """Tests SDCALogisticClassifier with weighted sparse features."""

    def input_fn():
      return {
          'example_id':
              constant_op.constant(['1', '2', '3']),
          'price':
              sparse_tensor.SparseTensor(
                  values=[2., 3., 1.],
                  indices=[[0, 0], [1, 0], [2, 0]],
                  dense_shape=[3, 5]),
          'country':
              sparse_tensor.SparseTensor(
                  values=['IT', 'US', 'GB'],
                  indices=[[0, 0], [1, 0], [2, 0]],
                  dense_shape=[3, 5])
      }, constant_op.constant([[1], [0], [1]])

    country = feature_column_lib.sparse_column_with_hash_bucket(
        'country', hash_bucket_size=5)
    country_weighted_by_price = feature_column_lib.weighted_sparse_column(
        country, 'price')
    classifier = sdca_estimator.SDCALogisticClassifier(
        example_id_column='example_id',
        feature_columns=[country_weighted_by_price])
    classifier.fit(input_fn=input_fn, steps=50)
    metrics = classifier.evaluate(input_fn=input_fn, steps=1)
    self.assertGreater(metrics['accuracy'], 0.9)
Ejemplo n.º 9
0
 def testFloat32WeightedSparseStringColumnDtypes(self):
   ids = fc.sparse_column_with_keys("ids", ["marlo", "omar", "stringer"])
   weighted_ids = fc.weighted_sparse_column(ids, "weights")
   self.assertDictEqual({
       "ids": parsing_ops.VarLenFeature(dtypes.string),
       "weights": parsing_ops.VarLenFeature(dtypes.float32)
   }, weighted_ids.config)
Ejemplo n.º 10
0
    def testWeightedSparseFeatures(self):
        """Tests SDCALogisticClassifier with weighted sparse features."""
        def input_fn():
            return {
                'example_id':
                constant_op.constant(['1', '2', '3']),
                'price':
                sparse_tensor.SparseTensor(values=[2., 3., 1.],
                                           indices=[[0, 0], [1, 0], [2, 0]],
                                           dense_shape=[3, 5]),
                'country':
                sparse_tensor.SparseTensor(values=['IT', 'US', 'GB'],
                                           indices=[[0, 0], [1, 0], [2, 0]],
                                           dense_shape=[3, 5])
            }, constant_op.constant([[1], [0], [1]])

        with self._single_threaded_test_session():
            country = feature_column_lib.sparse_column_with_hash_bucket(
                'country', hash_bucket_size=5)
            country_weighted_by_price = feature_column_lib.weighted_sparse_column(
                country, 'price')
            classifier = sdca_estimator.SDCALogisticClassifier(
                example_id_column='example_id',
                feature_columns=[country_weighted_by_price])
            classifier.fit(input_fn=input_fn, steps=50)
            metrics = classifier.evaluate(input_fn=input_fn, steps=1)
            self.assertGreater(metrics['accuracy'], 0.9)
Ejemplo n.º 11
0
 def testFloat32WeightedSparseInt32ColumnDtypes(self):
   ids = fc.sparse_column_with_keys("ids", [42, 1, -1000], dtype=dtypes.int32)
   weighted_ids = fc.weighted_sparse_column(ids, "weights")
   self.assertDictEqual({
       "ids": parsing_ops.VarLenFeature(dtypes.int32),
       "weights": parsing_ops.VarLenFeature(dtypes.float32)
   }, weighted_ids.config)
Ejemplo n.º 12
0
  def testWeightedSparseColumnDtypes(self):
    ids = fc.sparse_column_with_keys("ids", ["marlo", "omar", "stringer"])
    weighted_ids = fc.weighted_sparse_column(ids, "weights")
    self.assertDictEqual({
        "ids": parsing_ops.VarLenFeature(dtypes.string),
        "weights": parsing_ops.VarLenFeature(dtypes.float32)
    }, weighted_ids.config)

    weighted_ids = fc.weighted_sparse_column(ids, "weights", dtype=dtypes.int32)
    self.assertDictEqual({
        "ids": parsing_ops.VarLenFeature(dtypes.string),
        "weights": parsing_ops.VarLenFeature(dtypes.int32)
    }, weighted_ids.config)

    with self.assertRaisesRegexp(ValueError,
                                 "dtype is not convertible to float"):
      weighted_ids = fc.weighted_sparse_column(
          ids, "weights", dtype=dtypes.string)
Ejemplo n.º 13
0
 def testWeightedSparseColumnWithVocabularyFile(self):
   ids = fc.sparse_column_with_vocabulary_file(
       "ids", "a_file", num_oov_buckets=7, vocab_size=3)
   weighted_ids = fc.weighted_sparse_column(ids, "weights")
   self.assertEqual(weighted_ids.name, "ids_weighted_by_weights")
   self.assertEqual(weighted_ids.lookup_config, ids.lookup_config)
   self.assertEqual(weighted_ids.lookup_config.vocab_size, 3)
   self.assertEqual(weighted_ids.lookup_config.num_oov_buckets, 7)
   self.assertEqual(weighted_ids.lookup_config.vocabulary_file, "a_file")
Ejemplo n.º 14
0
  def testSharedEmbeddingColumnWithWeightedSparseColumn(self):
    # Tests creation of shared embeddings containing weighted sparse columns.
    sparse_col = fc.sparse_column_with_keys("a1", ["marlo", "omar", "stringer"])
    ids = fc.sparse_column_with_keys("ids", ["marlo", "omar", "stringer"])
    weighted_sparse_col = fc.weighted_sparse_column(ids, "weights")
    self.assertEqual(weighted_sparse_col.name, "ids_weighted_by_weights")

    b = fc.shared_embedding_columns([sparse_col, weighted_sparse_col],
                                    dimension=4, combiner="mean")
    self.assertEqual(len(b), 2)
    self.assertEqual(b[0].shared_embedding_name,
                     "a1_ids_weighted_by_weights_shared_embedding")
    self.assertEqual(b[1].shared_embedding_name,
                     "a1_ids_weighted_by_weights_shared_embedding")

    # Tries reversing order to check compatibility condition.
    b = fc.shared_embedding_columns([weighted_sparse_col, sparse_col],
                                    dimension=4, combiner="mean")
    self.assertEqual(len(b), 2)
    self.assertEqual(b[0].shared_embedding_name,
                     "a1_ids_weighted_by_weights_shared_embedding")
    self.assertEqual(b[1].shared_embedding_name,
                     "a1_ids_weighted_by_weights_shared_embedding")

    # Tries adding two weighted columns to check compatibility between them.
    weighted_sparse_col_2 = fc.weighted_sparse_column(ids, "weights_2")
    b = fc.shared_embedding_columns([weighted_sparse_col,
                                     weighted_sparse_col_2],
                                    dimension=4, combiner="mean")
    self.assertEqual(len(b), 2)
    self.assertEqual(
        b[0].shared_embedding_name,
        "ids_weighted_by_weights_ids_weighted_by_weights_2_shared_embedding"
    )
    self.assertEqual(
        b[1].shared_embedding_name,
        "ids_weighted_by_weights_ids_weighted_by_weights_2_shared_embedding"
    )
Ejemplo n.º 15
0
  def testCreateSequenceFeatureSpec(self):
    sparse_col = fc.sparse_column_with_hash_bucket(
        "sparse_column", hash_bucket_size=100)
    embedding_col = fc.embedding_column(
        fc.sparse_column_with_hash_bucket(
            "sparse_column_for_embedding", hash_bucket_size=10),
        dimension=4)
    sparse_id_col = fc.sparse_column_with_keys("id_column",
                                               ["marlo", "omar", "stringer"])
    weighted_id_col = fc.weighted_sparse_column(sparse_id_col,
                                                "id_weights_column")
    real_valued_col1 = fc.real_valued_column("real_valued_column", dimension=2)
    real_valued_col2 = fc.real_valued_column(
        "real_valued_default_column", dimension=5, default_value=3.0)
    real_valued_col3 = fc._real_valued_var_len_column(
        "real_valued_var_len_column", default_value=3.0, is_sparse=True)
    real_valued_col4 = fc._real_valued_var_len_column(
        "real_valued_var_len_dense_column", default_value=4.0, is_sparse=False)

    feature_columns = set([
        sparse_col, embedding_col, weighted_id_col, real_valued_col1,
        real_valued_col2, real_valued_col3, real_valued_col4
    ])

    feature_spec = fc._create_sequence_feature_spec_for_parsing(feature_columns)

    expected_feature_spec = {
        "sparse_column":
            parsing_ops.VarLenFeature(dtypes.string),
        "sparse_column_for_embedding":
            parsing_ops.VarLenFeature(dtypes.string),
        "id_column":
            parsing_ops.VarLenFeature(dtypes.string),
        "id_weights_column":
            parsing_ops.VarLenFeature(dtypes.float32),
        "real_valued_column":
            parsing_ops.FixedLenSequenceFeature(
                shape=[2], dtype=dtypes.float32, allow_missing=False),
        "real_valued_default_column":
            parsing_ops.FixedLenSequenceFeature(
                shape=[5], dtype=dtypes.float32, allow_missing=True),
        "real_valued_var_len_column":
            parsing_ops.VarLenFeature(dtype=dtypes.float32),
        "real_valued_var_len_dense_column":
            parsing_ops.FixedLenSequenceFeature(
                shape=[], dtype=dtypes.float32, allow_missing=True,
                default_value=4.0),
    }

    self.assertDictEqual(expected_feature_spec, feature_spec)
Ejemplo n.º 16
0
  def testCreateSequenceFeatureSpec(self):
    sparse_col = fc.sparse_column_with_hash_bucket(
        "sparse_column", hash_bucket_size=100)
    embedding_col = fc.embedding_column(
        fc.sparse_column_with_hash_bucket(
            "sparse_column_for_embedding", hash_bucket_size=10),
        dimension=4)
    sparse_id_col = fc.sparse_column_with_keys("id_column",
                                               ["marlo", "omar", "stringer"])
    weighted_id_col = fc.weighted_sparse_column(sparse_id_col,
                                                "id_weights_column")
    real_valued_col1 = fc.real_valued_column("real_valued_column", dimension=2)
    real_valued_col2 = fc.real_valued_column(
        "real_valued_default_column", dimension=5, default_value=3.0)
    real_valued_col3 = fc._real_valued_var_len_column(
        "real_valued_var_len_column", default_value=3.0, is_sparse=True)
    real_valued_col4 = fc._real_valued_var_len_column(
        "real_valued_var_len_dense_column", default_value=4.0, is_sparse=False)

    feature_columns = set([
        sparse_col, embedding_col, weighted_id_col, real_valued_col1,
        real_valued_col2, real_valued_col3, real_valued_col4
    ])

    feature_spec = fc._create_sequence_feature_spec_for_parsing(feature_columns)

    expected_feature_spec = {
        "sparse_column":
            parsing_ops.VarLenFeature(dtypes.string),
        "sparse_column_for_embedding":
            parsing_ops.VarLenFeature(dtypes.string),
        "id_column":
            parsing_ops.VarLenFeature(dtypes.string),
        "id_weights_column":
            parsing_ops.VarLenFeature(dtypes.float32),
        "real_valued_column":
            parsing_ops.FixedLenSequenceFeature(
                shape=[2], dtype=dtypes.float32, allow_missing=False),
        "real_valued_default_column":
            parsing_ops.FixedLenSequenceFeature(
                shape=[5], dtype=dtypes.float32, allow_missing=True),
        "real_valued_var_len_column":
            parsing_ops.VarLenFeature(dtype=dtypes.float32),
        "real_valued_var_len_dense_column":
            parsing_ops.FixedLenSequenceFeature(
                shape=[], dtype=dtypes.float32, allow_missing=True,
                default_value=4.0),
    }

    self.assertDictEqual(expected_feature_spec, feature_spec)
    def testWeightedSparseColumnDtypes(self):
        ids = fc.sparse_column_with_keys("ids", ["marlo", "omar", "stringer"])
        weighted_ids = fc.weighted_sparse_column(ids, "weights")
        self.assertDictEqual(
            {
                "ids": parsing_ops.VarLenFeature(dtypes.string),
                "weights": parsing_ops.VarLenFeature(dtypes.float32)
            }, weighted_ids.config)

        weighted_ids = fc.weighted_sparse_column(ids,
                                                 "weights",
                                                 dtype=dtypes.int32)
        self.assertDictEqual(
            {
                "ids": parsing_ops.VarLenFeature(dtypes.string),
                "weights": parsing_ops.VarLenFeature(dtypes.int32)
            }, weighted_ids.config)

        with self.assertRaisesRegexp(ValueError,
                                     "dtype is not convertible to float"):
            weighted_ids = fc.weighted_sparse_column(ids,
                                                     "weights",
                                                     dtype=dtypes.string)
Ejemplo n.º 18
0
 def testMissingValueInOneHotColumnForWeightedSparseColumn(self):
   # Github issue 12583
   ids = fc.sparse_column_with_keys("ids", ["marlo", "omar", "stringer"])
   weighted_ids = fc.weighted_sparse_column(ids, "weights")
   one_hot = fc.one_hot_column(weighted_ids)
   features = {
       'ids': constant_op.constant([['marlo', 'unknown', 'omar']]),
       'weights': constant_op.constant([[2., 4., 6.]])
   }
   one_hot_tensor = feature_column_ops.input_from_feature_columns(
     features, [one_hot])
   with self.test_session() as sess:
     sess.run(variables.global_variables_initializer())
     sess.run(lookup_ops.tables_initializer())
     self.assertAllEqual([[2., 6., 0.]], one_hot_tensor.eval())
Ejemplo n.º 19
0
  def testCreateFeatureSpec(self):
    sparse_col = fc.sparse_column_with_hash_bucket(
        "sparse_column", hash_bucket_size=100)
    embedding_col = fc.embedding_column(
        fc.sparse_column_with_hash_bucket(
            "sparse_column_for_embedding", hash_bucket_size=10),
        dimension=4)
    sparse_id_col = fc.sparse_column_with_keys("id_column",
                                               ["marlo", "omar", "stringer"])
    weighted_id_col = fc.weighted_sparse_column(sparse_id_col,
                                                "id_weights_column")
    real_valued_col1 = fc.real_valued_column("real_valued_column1")
    real_valued_col2 = fc.real_valued_column("real_valued_column2", 5)
    real_valued_col3 = fc.real_valued_column(
        "real_valued_column3", dimension=None)
    bucketized_col1 = fc.bucketized_column(
        fc.real_valued_column("real_valued_column_for_bucketization1"), [0, 4])
    bucketized_col2 = fc.bucketized_column(
        fc.real_valued_column("real_valued_column_for_bucketization2", 4),
        [0, 4])
    a = fc.sparse_column_with_hash_bucket("cross_aaa", hash_bucket_size=100)
    b = fc.sparse_column_with_hash_bucket("cross_bbb", hash_bucket_size=100)
    cross_col = fc.crossed_column(set([a, b]), hash_bucket_size=10000)
    feature_columns = set([
        sparse_col, embedding_col, weighted_id_col, real_valued_col1,
        real_valued_col2, real_valued_col3, bucketized_col1, bucketized_col2,
        cross_col
    ])
    expected_config = {
        "sparse_column":
            parsing_ops.VarLenFeature(dtypes.string),
        "sparse_column_for_embedding":
            parsing_ops.VarLenFeature(dtypes.string),
        "id_column":
            parsing_ops.VarLenFeature(dtypes.string),
        "id_weights_column":
            parsing_ops.VarLenFeature(dtypes.float32),
        "real_valued_column1":
            parsing_ops.FixedLenFeature(
                [1], dtype=dtypes.float32),
        "real_valued_column2":
            parsing_ops.FixedLenFeature(
                [5], dtype=dtypes.float32),
        "real_valued_column3":
            parsing_ops.VarLenFeature(dtype=dtypes.float32),
        "real_valued_column_for_bucketization1":
            parsing_ops.FixedLenFeature(
                [1], dtype=dtypes.float32),
        "real_valued_column_for_bucketization2":
            parsing_ops.FixedLenFeature(
                [4], dtype=dtypes.float32),
        "cross_aaa":
            parsing_ops.VarLenFeature(dtypes.string),
        "cross_bbb":
            parsing_ops.VarLenFeature(dtypes.string)
    }

    config = fc.create_feature_spec_for_parsing(feature_columns)
    self.assertDictEqual(expected_config, config)

    # Test that the same config is parsed out if we pass a dictionary.
    feature_columns_dict = {
        str(i): val
        for i, val in enumerate(feature_columns)
    }
    config = fc.create_feature_spec_for_parsing(feature_columns_dict)
    self.assertDictEqual(expected_config, config)
Ejemplo n.º 20
0
 def testOneHotColumnForWeightedSparseColumn(self):
   ids = fc.sparse_column_with_keys("ids", ["marlo", "omar", "stringer"])
   weighted_ids = fc.weighted_sparse_column(ids, "weights")
   one_hot = fc.one_hot_column(weighted_ids)
   self.assertEqual(one_hot.sparse_id_column.name, "ids_weighted_by_weights")
   self.assertEqual(one_hot.length, 3)
Ejemplo n.º 21
0
 def testWeightedSparseColumn(self):
   ids = fc.sparse_column_with_keys("ids", ["marlo", "omar", "stringer"])
   weighted_ids = fc.weighted_sparse_column(ids, "weights")
   self.assertEqual(weighted_ids.name, "ids_weighted_by_weights")
Ejemplo n.º 22
0
  def testCreateFeatureSpec(self):
    sparse_col = fc.sparse_column_with_hash_bucket(
        "sparse_column", hash_bucket_size=100)
    embedding_col = fc.embedding_column(
        fc.sparse_column_with_hash_bucket(
            "sparse_column_for_embedding", hash_bucket_size=10),
        dimension=4)
    str_sparse_id_col = fc.sparse_column_with_keys(
        "str_id_column", ["marlo", "omar", "stringer"])
    int32_sparse_id_col = fc.sparse_column_with_keys(
        "int32_id_column", [42, 1, -1000], dtype=dtypes.int32)
    int64_sparse_id_col = fc.sparse_column_with_keys(
        "int64_id_column", [42, 1, -1000], dtype=dtypes.int64)
    weighted_id_col = fc.weighted_sparse_column(str_sparse_id_col,
                                                "str_id_weights_column")
    real_valued_col1 = fc.real_valued_column("real_valued_column1")
    real_valued_col2 = fc.real_valued_column("real_valued_column2", 5)
    bucketized_col1 = fc.bucketized_column(
        fc.real_valued_column("real_valued_column_for_bucketization1"), [0, 4])
    bucketized_col2 = fc.bucketized_column(
        fc.real_valued_column("real_valued_column_for_bucketization2", 4),
        [0, 4])
    a = fc.sparse_column_with_hash_bucket("cross_aaa", hash_bucket_size=100)
    b = fc.sparse_column_with_hash_bucket("cross_bbb", hash_bucket_size=100)
    cross_col = fc.crossed_column(set([a, b]), hash_bucket_size=10000)
    one_hot_col = fc.one_hot_column(fc.sparse_column_with_hash_bucket(
        "sparse_column_for_one_hot", hash_bucket_size=100))
    scattered_embedding_col = fc.scattered_embedding_column(
        "scattered_embedding_column", size=100, dimension=10, hash_key=1)
    feature_columns = set([
        sparse_col, embedding_col, weighted_id_col, int32_sparse_id_col,
        int64_sparse_id_col, real_valued_col1, real_valued_col2,
        bucketized_col1, bucketized_col2, cross_col, one_hot_col,
        scattered_embedding_col
    ])
    expected_config = {
        "sparse_column":
            parsing_ops.VarLenFeature(dtypes.string),
        "sparse_column_for_embedding":
            parsing_ops.VarLenFeature(dtypes.string),
        "str_id_column":
            parsing_ops.VarLenFeature(dtypes.string),
        "int32_id_column":
            parsing_ops.VarLenFeature(dtypes.int32),
        "int64_id_column":
            parsing_ops.VarLenFeature(dtypes.int64),
        "str_id_weights_column":
            parsing_ops.VarLenFeature(dtypes.float32),
        "real_valued_column1":
            parsing_ops.FixedLenFeature(
                [1], dtype=dtypes.float32),
        "real_valued_column2":
            parsing_ops.FixedLenFeature(
                [5], dtype=dtypes.float32),
        "real_valued_column_for_bucketization1":
            parsing_ops.FixedLenFeature(
                [1], dtype=dtypes.float32),
        "real_valued_column_for_bucketization2":
            parsing_ops.FixedLenFeature(
                [4], dtype=dtypes.float32),
        "cross_aaa":
            parsing_ops.VarLenFeature(dtypes.string),
        "cross_bbb":
            parsing_ops.VarLenFeature(dtypes.string),
        "sparse_column_for_one_hot":
            parsing_ops.VarLenFeature(dtypes.string),
        "scattered_embedding_column":
            parsing_ops.VarLenFeature(dtypes.string),
    }

    config = fc.create_feature_spec_for_parsing(feature_columns)
    self.assertDictEqual(expected_config, config)

    # Tests that contrib feature columns work with core library:
    config_core = fc_core.make_parse_example_spec(feature_columns)
    self.assertDictEqual(expected_config, config_core)

    # Test that the same config is parsed out if we pass a dictionary.
    feature_columns_dict = {
        str(i): val
        for i, val in enumerate(feature_columns)
    }
    config = fc.create_feature_spec_for_parsing(feature_columns_dict)
    self.assertDictEqual(expected_config, config)
 def testWeightedSparseColumn(self):
     ids = fc.sparse_column_with_keys("ids", ["marlo", "omar", "stringer"])
     weighted_ids = fc.weighted_sparse_column(ids, "weights")
     self.assertEqual(weighted_ids.name, "ids_weighted_by_weights")
Ejemplo n.º 24
0
  def testCreateFeatureSpec(self):
    sparse_col = fc.sparse_column_with_hash_bucket(
        "sparse_column", hash_bucket_size=100)
    embedding_col = fc.embedding_column(
        fc.sparse_column_with_hash_bucket(
            "sparse_column_for_embedding", hash_bucket_size=10),
        dimension=4)
    str_sparse_id_col = fc.sparse_column_with_keys(
        "str_id_column", ["marlo", "omar", "stringer"])
    int32_sparse_id_col = fc.sparse_column_with_keys(
        "int32_id_column", [42, 1, -1000], dtype=dtypes.int32)
    int64_sparse_id_col = fc.sparse_column_with_keys(
        "int64_id_column", [42, 1, -1000], dtype=dtypes.int64)
    weighted_id_col = fc.weighted_sparse_column(str_sparse_id_col,
                                                "str_id_weights_column")
    real_valued_col1 = fc.real_valued_column("real_valued_column1")
    real_valued_col2 = fc.real_valued_column("real_valued_column2", 5)
    real_valued_col3 = fc._real_valued_var_len_column(
        "real_valued_column3", is_sparse=True)
    real_valued_col4 = fc._real_valued_var_len_column(
        "real_valued_column4", dtype=dtypes.int64, default_value=0,
        is_sparse=False)
    bucketized_col1 = fc.bucketized_column(
        fc.real_valued_column("real_valued_column_for_bucketization1"), [0, 4])
    bucketized_col2 = fc.bucketized_column(
        fc.real_valued_column("real_valued_column_for_bucketization2", 4),
        [0, 4])
    a = fc.sparse_column_with_hash_bucket("cross_aaa", hash_bucket_size=100)
    b = fc.sparse_column_with_hash_bucket("cross_bbb", hash_bucket_size=100)
    cross_col = fc.crossed_column(set([a, b]), hash_bucket_size=10000)
    one_hot_col = fc.one_hot_column(fc.sparse_column_with_hash_bucket(
        "sparse_column_for_one_hot", hash_bucket_size=100))
    scattered_embedding_col = fc.scattered_embedding_column(
        "scattered_embedding_column", size=100, dimension=10, hash_key=1)
    feature_columns = set([
        sparse_col, embedding_col, weighted_id_col, int32_sparse_id_col,
        int64_sparse_id_col, real_valued_col1, real_valued_col2,
        real_valued_col3, real_valued_col4, bucketized_col1, bucketized_col2,
        cross_col, one_hot_col, scattered_embedding_col
    ])
    expected_config = {
        "sparse_column":
            parsing_ops.VarLenFeature(dtypes.string),
        "sparse_column_for_embedding":
            parsing_ops.VarLenFeature(dtypes.string),
        "str_id_column":
            parsing_ops.VarLenFeature(dtypes.string),
        "int32_id_column":
            parsing_ops.VarLenFeature(dtypes.int32),
        "int64_id_column":
            parsing_ops.VarLenFeature(dtypes.int64),
        "str_id_weights_column":
            parsing_ops.VarLenFeature(dtypes.float32),
        "real_valued_column1":
            parsing_ops.FixedLenFeature(
                [1], dtype=dtypes.float32),
        "real_valued_column2":
            parsing_ops.FixedLenFeature(
                [5], dtype=dtypes.float32),
        "real_valued_column3":
            parsing_ops.VarLenFeature(dtype=dtypes.float32),
        "real_valued_column4":
            parsing_ops.FixedLenSequenceFeature(
                [], dtype=dtypes.int64, allow_missing=True, default_value=0),
        "real_valued_column_for_bucketization1":
            parsing_ops.FixedLenFeature(
                [1], dtype=dtypes.float32),
        "real_valued_column_for_bucketization2":
            parsing_ops.FixedLenFeature(
                [4], dtype=dtypes.float32),
        "cross_aaa":
            parsing_ops.VarLenFeature(dtypes.string),
        "cross_bbb":
            parsing_ops.VarLenFeature(dtypes.string),
        "sparse_column_for_one_hot":
            parsing_ops.VarLenFeature(dtypes.string),
        "scattered_embedding_column":
            parsing_ops.VarLenFeature(dtypes.string),
    }

    config = fc.create_feature_spec_for_parsing(feature_columns)
    self.assertDictEqual(expected_config, config)

    # Test that the same config is parsed out if we pass a dictionary.
    feature_columns_dict = {
        str(i): val
        for i, val in enumerate(feature_columns)
    }
    config = fc.create_feature_spec_for_parsing(feature_columns_dict)
    self.assertDictEqual(expected_config, config)
Ejemplo n.º 25
0
  def testCreateFeatureSpec(self):
    sparse_col = fc.sparse_column_with_hash_bucket(
        "sparse_column", hash_bucket_size=100)
    embedding_col = fc.embedding_column(
        fc.sparse_column_with_hash_bucket(
            "sparse_column_for_embedding", hash_bucket_size=10),
        dimension=4)
    str_sparse_id_col = fc.sparse_column_with_keys(
        "str_id_column", ["marlo", "omar", "stringer"])
    int32_sparse_id_col = fc.sparse_column_with_keys(
        "int32_id_column", [42, 1, -1000], dtype=dtypes.int32)
    int64_sparse_id_col = fc.sparse_column_with_keys(
        "int64_id_column", [42, 1, -1000], dtype=dtypes.int64)
    weighted_id_col = fc.weighted_sparse_column(str_sparse_id_col,
                                                "str_id_weights_column")
    real_valued_col1 = fc.real_valued_column("real_valued_column1")
    real_valued_col2 = fc.real_valued_column("real_valued_column2", 5)
    real_valued_col3 = fc._real_valued_var_len_column(
        "real_valued_column3", is_sparse=True)
    real_valued_col4 = fc._real_valued_var_len_column(
        "real_valued_column4", dtype=dtypes.int64, default_value=0,
        is_sparse=False)
    bucketized_col1 = fc.bucketized_column(
        fc.real_valued_column("real_valued_column_for_bucketization1"), [0, 4])
    bucketized_col2 = fc.bucketized_column(
        fc.real_valued_column("real_valued_column_for_bucketization2", 4),
        [0, 4])
    a = fc.sparse_column_with_hash_bucket("cross_aaa", hash_bucket_size=100)
    b = fc.sparse_column_with_hash_bucket("cross_bbb", hash_bucket_size=100)
    cross_col = fc.crossed_column(set([a, b]), hash_bucket_size=10000)
    one_hot_col = fc.one_hot_column(fc.sparse_column_with_hash_bucket(
        "sparse_column_for_one_hot", hash_bucket_size=100))
    scattered_embedding_col = fc.scattered_embedding_column(
        "scattered_embedding_column", size=100, dimension=10, hash_key=1)
    feature_columns = set([
        sparse_col, embedding_col, weighted_id_col, int32_sparse_id_col,
        int64_sparse_id_col, real_valued_col1, real_valued_col2,
        real_valued_col3, real_valued_col4, bucketized_col1, bucketized_col2,
        cross_col, one_hot_col, scattered_embedding_col
    ])
    expected_config = {
        "sparse_column":
            parsing_ops.VarLenFeature(dtypes.string),
        "sparse_column_for_embedding":
            parsing_ops.VarLenFeature(dtypes.string),
        "str_id_column":
            parsing_ops.VarLenFeature(dtypes.string),
        "int32_id_column":
            parsing_ops.VarLenFeature(dtypes.int32),
        "int64_id_column":
            parsing_ops.VarLenFeature(dtypes.int64),
        "str_id_weights_column":
            parsing_ops.VarLenFeature(dtypes.float32),
        "real_valued_column1":
            parsing_ops.FixedLenFeature(
                [1], dtype=dtypes.float32),
        "real_valued_column2":
            parsing_ops.FixedLenFeature(
                [5], dtype=dtypes.float32),
        "real_valued_column3":
            parsing_ops.VarLenFeature(dtype=dtypes.float32),
        "real_valued_column4":
            parsing_ops.FixedLenSequenceFeature(
                [], dtype=dtypes.int64, allow_missing=True, default_value=0),
        "real_valued_column_for_bucketization1":
            parsing_ops.FixedLenFeature(
                [1], dtype=dtypes.float32),
        "real_valued_column_for_bucketization2":
            parsing_ops.FixedLenFeature(
                [4], dtype=dtypes.float32),
        "cross_aaa":
            parsing_ops.VarLenFeature(dtypes.string),
        "cross_bbb":
            parsing_ops.VarLenFeature(dtypes.string),
        "sparse_column_for_one_hot":
            parsing_ops.VarLenFeature(dtypes.string),
        "scattered_embedding_column":
            parsing_ops.VarLenFeature(dtypes.string),
    }

    config = fc.create_feature_spec_for_parsing(feature_columns)
    self.assertDictEqual(expected_config, config)

    # Test that the same config is parsed out if we pass a dictionary.
    feature_columns_dict = {
        str(i): val
        for i, val in enumerate(feature_columns)
    }
    config = fc.create_feature_spec_for_parsing(feature_columns_dict)
    self.assertDictEqual(expected_config, config)
    def testCreateFeatureSpec(self):
        sparse_col = fc.sparse_column_with_hash_bucket("sparse_column",
                                                       hash_bucket_size=100)
        embedding_col = fc.embedding_column(fc.sparse_column_with_hash_bucket(
            "sparse_column_for_embedding", hash_bucket_size=10),
                                            dimension=4)
        sparse_id_col = fc.sparse_column_with_keys(
            "id_column", ["marlo", "omar", "stringer"])
        weighted_id_col = fc.weighted_sparse_column(sparse_id_col,
                                                    "id_weights_column")
        real_valued_col1 = fc.real_valued_column("real_valued_column1")
        real_valued_col2 = fc.real_valued_column("real_valued_column2", 5)
        real_valued_col3 = fc.real_valued_column("real_valued_column3",
                                                 dimension=None)
        bucketized_col1 = fc.bucketized_column(
            fc.real_valued_column("real_valued_column_for_bucketization1"),
            [0, 4])
        bucketized_col2 = fc.bucketized_column(
            fc.real_valued_column("real_valued_column_for_bucketization2", 4),
            [0, 4])
        a = fc.sparse_column_with_hash_bucket("cross_aaa",
                                              hash_bucket_size=100)
        b = fc.sparse_column_with_hash_bucket("cross_bbb",
                                              hash_bucket_size=100)
        cross_col = fc.crossed_column(set([a, b]), hash_bucket_size=10000)
        feature_columns = set([
            sparse_col, embedding_col, weighted_id_col, real_valued_col1,
            real_valued_col2, real_valued_col3, bucketized_col1,
            bucketized_col2, cross_col
        ])
        expected_config = {
            "sparse_column":
            parsing_ops.VarLenFeature(dtypes.string),
            "sparse_column_for_embedding":
            parsing_ops.VarLenFeature(dtypes.string),
            "id_column":
            parsing_ops.VarLenFeature(dtypes.string),
            "id_weights_column":
            parsing_ops.VarLenFeature(dtypes.float32),
            "real_valued_column1":
            parsing_ops.FixedLenFeature([1], dtype=dtypes.float32),
            "real_valued_column2":
            parsing_ops.FixedLenFeature([5], dtype=dtypes.float32),
            "real_valued_column3":
            parsing_ops.VarLenFeature(dtype=dtypes.float32),
            "real_valued_column_for_bucketization1":
            parsing_ops.FixedLenFeature([1], dtype=dtypes.float32),
            "real_valued_column_for_bucketization2":
            parsing_ops.FixedLenFeature([4], dtype=dtypes.float32),
            "cross_aaa":
            parsing_ops.VarLenFeature(dtypes.string),
            "cross_bbb":
            parsing_ops.VarLenFeature(dtypes.string)
        }

        config = fc.create_feature_spec_for_parsing(feature_columns)
        self.assertDictEqual(expected_config, config)

        # Test that the same config is parsed out if we pass a dictionary.
        feature_columns_dict = {
            str(i): val
            for i, val in enumerate(feature_columns)
        }
        config = fc.create_feature_spec_for_parsing(feature_columns_dict)
        self.assertDictEqual(expected_config, config)
  def testCreateFeatureSpec(self):
    sparse_col = fc.sparse_column_with_hash_bucket(
        "sparse_column", hash_bucket_size=100)
    embedding_col = fc.embedding_column(
        fc.sparse_column_with_hash_bucket(
            "sparse_column_for_embedding", hash_bucket_size=10),
        dimension=4)
    str_sparse_id_col = fc.sparse_column_with_keys(
        "str_id_column", ["marlo", "omar", "stringer"])
    int32_sparse_id_col = fc.sparse_column_with_keys(
        "int32_id_column", [42, 1, -1000], dtype=dtypes.int32)
    int64_sparse_id_col = fc.sparse_column_with_keys(
        "int64_id_column", [42, 1, -1000], dtype=dtypes.int64)
    weighted_id_col = fc.weighted_sparse_column(str_sparse_id_col,
                                                "str_id_weights_column")
    real_valued_col1 = fc.real_valued_column("real_valued_column1")
    real_valued_col2 = fc.real_valued_column("real_valued_column2", 5)
    bucketized_col1 = fc.bucketized_column(
        fc.real_valued_column("real_valued_column_for_bucketization1"), [0, 4])
    bucketized_col2 = fc.bucketized_column(
        fc.real_valued_column("real_valued_column_for_bucketization2", 4),
        [0, 4])
    a = fc.sparse_column_with_hash_bucket("cross_aaa", hash_bucket_size=100)
    b = fc.sparse_column_with_hash_bucket("cross_bbb", hash_bucket_size=100)
    cross_col = fc.crossed_column(set([a, b]), hash_bucket_size=10000)
    one_hot_col = fc.one_hot_column(fc.sparse_column_with_hash_bucket(
        "sparse_column_for_one_hot", hash_bucket_size=100))
    scattered_embedding_col = fc.scattered_embedding_column(
        "scattered_embedding_column", size=100, dimension=10, hash_key=1)
    feature_columns = set([
        sparse_col, embedding_col, weighted_id_col, int32_sparse_id_col,
        int64_sparse_id_col, real_valued_col1, real_valued_col2,
        bucketized_col1, bucketized_col2, cross_col, one_hot_col,
        scattered_embedding_col
    ])
    expected_config = {
        "sparse_column":
            parsing_ops.VarLenFeature(dtypes.string),
        "sparse_column_for_embedding":
            parsing_ops.VarLenFeature(dtypes.string),
        "str_id_column":
            parsing_ops.VarLenFeature(dtypes.string),
        "int32_id_column":
            parsing_ops.VarLenFeature(dtypes.int32),
        "int64_id_column":
            parsing_ops.VarLenFeature(dtypes.int64),
        "str_id_weights_column":
            parsing_ops.VarLenFeature(dtypes.float32),
        "real_valued_column1":
            parsing_ops.FixedLenFeature(
                [1], dtype=dtypes.float32),
        "real_valued_column2":
            parsing_ops.FixedLenFeature(
                [5], dtype=dtypes.float32),
        "real_valued_column_for_bucketization1":
            parsing_ops.FixedLenFeature(
                [1], dtype=dtypes.float32),
        "real_valued_column_for_bucketization2":
            parsing_ops.FixedLenFeature(
                [4], dtype=dtypes.float32),
        "cross_aaa":
            parsing_ops.VarLenFeature(dtypes.string),
        "cross_bbb":
            parsing_ops.VarLenFeature(dtypes.string),
        "sparse_column_for_one_hot":
            parsing_ops.VarLenFeature(dtypes.string),
        "scattered_embedding_column":
            parsing_ops.VarLenFeature(dtypes.string),
    }

    config = fc.create_feature_spec_for_parsing(feature_columns)
    self.assertDictEqual(expected_config, config)

    # Tests that contrib feature columns work with core library:
    config_core = fc_core.make_parse_example_spec(feature_columns)
    self.assertDictEqual(expected_config, config_core)

    # Test that the same config is parsed out if we pass a dictionary.
    feature_columns_dict = {
        str(i): val
        for i, val in enumerate(feature_columns)
    }
    config = fc.create_feature_spec_for_parsing(feature_columns_dict)
    self.assertDictEqual(expected_config, config)