  def testJointLinearModel(self):
    """Tests that loss goes down with training."""

    def input_fn():
      return {
          'age':
              sparse_tensor.SparseTensor(
                  values=['1'], indices=[[0, 0]], dense_shape=[1, 1]),
          'language':
              sparse_tensor.SparseTensor(
                  values=['english'], indices=[[0, 0]], dense_shape=[1, 1])
      }, constant_op.constant([[1]])

    language = feature_column.sparse_column_with_hash_bucket('language', 100)
    age = feature_column.sparse_column_with_hash_bucket('age', 2)

    head = head_lib._multi_class_head(n_classes=2)
    classifier = _joint_linear_estimator(head, feature_columns=[age, language])

    classifier.fit(input_fn=input_fn, steps=1000)
    loss1 = classifier.evaluate(input_fn=input_fn, steps=1)['loss']
    classifier.fit(input_fn=input_fn, steps=2000)
    loss2 = classifier.evaluate(input_fn=input_fn, steps=1)['loss']
    self.assertLess(loss2, loss1)
    self.assertLess(loss2, 0.01)
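
A note on what the hashed sparse columns in these examples do: sparse_column_with_hash_bucket maps each raw feature value to an integer id in [0, hash_bucket_size) with a deterministic hash, so no vocabulary file is needed and unseen values still get an id (at the cost of occasional collisions). A minimal pure-Python sketch of the idea; TensorFlow uses its own fingerprint op rather than hashlib, so the actual bucket ids differ:

import hashlib

def hash_bucket(value, hash_bucket_size):
  # Deterministically map a string to a bucket id in [0, hash_bucket_size).
  digest = hashlib.md5(value.encode('utf-8')).hexdigest()
  return int(digest, 16) % hash_bucket_size

# The same value always lands in the same bucket across runs.
assert hash_bucket('english', 100) == hash_bucket('english', 100)
print(hash_bucket('english', 100), hash_bucket('chinese', 100))
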
  def testCrossedFeatures(self):
    """Tests SDCALogisticClassifier with crossed features."""

    def input_fn():
      return {
          'example_id':
              constant_op.constant(['1', '2', '3']),
          'language':
              sparse_tensor.SparseTensor(
                  values=['english', 'italian', 'spanish'],
                  indices=[[0, 0], [1, 0], [2, 0]],
                  dense_shape=[3, 1]),
          'country':
              sparse_tensor.SparseTensor(
                  values=['US', 'IT', 'MX'],
                  indices=[[0, 0], [1, 0], [2, 0]],
                  dense_shape=[3, 1])
      }, constant_op.constant([[0], [0], [1]])

    language = feature_column_lib.sparse_column_with_hash_bucket(
        'language', hash_bucket_size=5)
    country = feature_column_lib.sparse_column_with_hash_bucket(
        'country', hash_bucket_size=5)
    country_language = feature_column_lib.crossed_column(
        [language, country], hash_bucket_size=10)
    classifier = sdca_estimator.SDCALogisticClassifier(
        example_id_column='example_id', feature_columns=[country_language])
    classifier.fit(input_fn=input_fn, steps=10)
    metrics = classifier.evaluate(input_fn=input_fn, steps=1)
    self.assertGreater(metrics['accuracy'], 0.9)
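
The crossed column above lets the model learn a weight per (language, country) co-occurrence rather than per individual feature. Conceptually, each pair of component values is hashed into hash_bucket_size buckets; a rough sketch of the idea (the real op crosses the components' fingerprints, so the exact ids differ):

import hashlib

def cross_bucket(component_values, hash_bucket_size):
  # Join the co-occurring values and hash the combination into one id,
  # e.g. ('english', 'US') -> a single bucket in [0, hash_bucket_size).
  key = '_X_'.join(component_values)
  return int(hashlib.md5(key.encode('utf-8')).hexdigest(), 16) % hash_bucket_size

print(cross_bucket(('english', 'US'), 10))
print(cross_bucket(('italian', 'IT'), 10))
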
  def testSparseFeaturesWithDuplicates(self):
    """Tests SDCALogisticClassifier with duplicated sparse features."""

    def input_fn():
      return {
          'example_id':
              constant_op.constant(['1', '2']),
          'age':
              sparse_tensor.SparseTensor(
                  values=['20-29'] * 5 + ['31-40'] * 5,
                  indices=[[0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [1, 0],
                           [1, 0], [1, 0], [1, 0], [1, 0]],
                  dense_shape=[2, 1]),
          'gender':
              sparse_tensor.SparseTensor(
                  values=['m'] * 5 + ['f'] * 5,
                  indices=[[0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [1, 0],
                           [1, 0], [1, 0], [1, 0], [1, 0]],
                  dense_shape=[2, 1]),
      }, constant_op.constant([[1], [0]])

    with self._single_threaded_test_session():
      age = feature_column_lib.sparse_column_with_hash_bucket(
          'age', hash_bucket_size=10)
      gender = feature_column_lib.sparse_column_with_hash_bucket(
          'gender', hash_bucket_size=10)
      classifier = sdca_estimator.SDCALogisticClassifier(
          example_id_column='example_id', feature_columns=[age, gender])
      classifier.fit(input_fn=input_fn, steps=50)
      metrics = classifier.evaluate(input_fn=input_fn, steps=1)
      self.assertLess(metrics['loss'], 0.060)
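
Each example above carries the same (value, index) pair five times; what the test exercises is that repeated ids are combined rather than rejected. Assuming the contrib default 'sum' combiner, an id occurring k times contributes k times its weight, as in this sketch:

def linear_logit(feature_ids, weights, bias=0.0):
  # Under 'sum', an id that appears k times contributes k * weights[id].
  return bias + sum(weights.get(i, 0.0) for i in feature_ids)

w = {3: 0.5}
print(linear_logit([3] * 5, w))  # 2.5: five duplicates of id 3
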
  def testCrossedColumnNameCreatesSortedNames(self):
    a = fc.sparse_column_with_hash_bucket("aaa", hash_bucket_size=100)
    b = fc.sparse_column_with_hash_bucket("bbb", hash_bucket_size=100)
    bucket = fc.bucketized_column(fc.real_valued_column("cost"), [0, 4])
    crossed = fc.crossed_column(set([b, bucket, a]), hash_bucket_size=10000)

    self.assertEqual("aaa_X_bbb_X_cost_bucketized", crossed.name,
                     "name should be generated by sorted column names")
    self.assertEqual("aaa", crossed.columns[0].name)
    self.assertEqual("bbb", crossed.columns[1].name)
    self.assertEqual("cost_bucketized", crossed.columns[2].name)
  def testSparseColumnWithHashBucket(self):
    a = fc.sparse_column_with_hash_bucket("aaa", hash_bucket_size=100)
    self.assertEqual(a.name, "aaa")
    self.assertEqual(a.dtype, dtypes.string)

    a = fc.sparse_column_with_hash_bucket(
        "aaa", hash_bucket_size=100, dtype=dtypes.int64)
    self.assertEqual(a.name, "aaa")
    self.assertEqual(a.dtype, dtypes.int64)

    with self.assertRaisesRegexp(ValueError, "dtype must be string or integer"):
      a = fc.sparse_column_with_hash_bucket(
          "aaa", hash_bucket_size=100, dtype=dtypes.float32)
  def testCreateSequenceFeatureSpec(self):
    sparse_col = fc.sparse_column_with_hash_bucket(
        "sparse_column", hash_bucket_size=100)
    embedding_col = fc.embedding_column(
        fc.sparse_column_with_hash_bucket(
            "sparse_column_for_embedding", hash_bucket_size=10),
        dimension=4)
    sparse_id_col = fc.sparse_column_with_keys("id_column",
                                               ["marlo", "omar", "stringer"])
    weighted_id_col = fc.weighted_sparse_column(sparse_id_col,
                                                "id_weights_column")
    real_valued_col1 = fc.real_valued_column("real_valued_column", dimension=2)
    real_valued_col2 = fc.real_valued_column(
        "real_valued_default_column", dimension=5, default_value=3.0)
    real_valued_col3 = fc._real_valued_var_len_column(
        "real_valued_var_len_column", default_value=3.0, is_sparse=True)
    real_valued_col4 = fc._real_valued_var_len_column(
        "real_valued_var_len_dense_column", default_value=4.0, is_sparse=False)

    feature_columns = set([
        sparse_col, embedding_col, weighted_id_col, real_valued_col1,
        real_valued_col2, real_valued_col3, real_valued_col4
    ])

    feature_spec = fc._create_sequence_feature_spec_for_parsing(feature_columns)

    expected_feature_spec = {
        "sparse_column":
            parsing_ops.VarLenFeature(dtypes.string),
        "sparse_column_for_embedding":
            parsing_ops.VarLenFeature(dtypes.string),
        "id_column":
            parsing_ops.VarLenFeature(dtypes.string),
        "id_weights_column":
            parsing_ops.VarLenFeature(dtypes.float32),
        "real_valued_column":
            parsing_ops.FixedLenSequenceFeature(
                shape=[2], dtype=dtypes.float32, allow_missing=False),
        "real_valued_default_column":
            parsing_ops.FixedLenSequenceFeature(
                shape=[5], dtype=dtypes.float32, allow_missing=True),
        "real_valued_var_len_column":
            parsing_ops.VarLenFeature(dtype=dtypes.float32),
        "real_valued_var_len_dense_column":
            parsing_ops.FixedLenSequenceFeature(
                shape=[], dtype=dtypes.float32, allow_missing=True,
                default_value=4.0),
    }

    self.assertDictEqual(expected_feature_spec, feature_spec)
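
The expected spec above follows a simple dispatch: sparse and variable-length sparse columns parse as VarLenFeature, while fixed-shape real-valued columns parse as FixedLenSequenceFeature, allowing missing steps only when the column has a default value. A sketch of that rule over hypothetical (is_sparse, default_value) column attributes, not the library's actual code:

def sequence_spec_kind(is_sparse, default_value):
  # Sparse/var-len columns -> VarLenFeature; dense columns ->
  # FixedLenSequenceFeature that may be missing only if a default exists.
  if is_sparse:
    return 'VarLenFeature'
  return 'FixedLenSequenceFeature(allow_missing=%s)' % (default_value is not None)

print(sequence_spec_kind(True, None))   # VarLenFeature
print(sequence_spec_kind(False, 3.0))   # FixedLenSequenceFeature(allow_missing=True)
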
Example #7
  def testRegression_TensorData(self):
    """Tests regression using tensor data as input."""

    def _input_fn(num_epochs=None):
      features = {
          'age':
              input_lib.limit_epochs(
                  constant_op.constant([[.8], [.15], [0.]]),
                  num_epochs=num_epochs),
          'language':
              sparse_tensor.SparseTensor(
                  values=input_lib.limit_epochs(
                      ['en', 'fr', 'zh'], num_epochs=num_epochs),
                  indices=[[0, 0], [0, 1], [2, 0]],
                  dense_shape=[3, 2])
      }
      return features, constant_op.constant([1., 0., 0.2], dtype=dtypes.float32)

    language_column = feature_column.sparse_column_with_hash_bucket(
        'language', hash_bucket_size=20)
    feature_columns = [
        feature_column.embedding_column(
            language_column, dimension=1),
        feature_column.real_valued_column('age')
    ]

    regressor = dnn.DNNRegressor(
        feature_columns=feature_columns,
        hidden_units=[3, 3],
        config=run_config.RunConfig(tf_random_seed=1))

    regressor.fit(input_fn=_input_fn, steps=200)

    scores = regressor.evaluate(input_fn=_input_fn, steps=1)
    self.assertIn('loss', scores)
Example #8
  def testExport(self):
    """Tests export model for servo."""

    def input_fn():
      return {
          'age':
              constant_op.constant([1]),
          'language':
              sparse_tensor.SparseTensor(
                  values=['english'], indices=[[0, 0]], dense_shape=[1, 1])
      }, constant_op.constant([[1]])

    language = feature_column.sparse_column_with_hash_bucket('language', 100)
    feature_columns = [
        feature_column.real_valued_column('age'),
        feature_column.embedding_column(
            language, dimension=1)
    ]

    classifier = dnn.DNNClassifier(
        feature_columns=feature_columns, hidden_units=[3, 3])
    classifier.fit(input_fn=input_fn, steps=5)

    export_dir = tempfile.mkdtemp()
    classifier.export(export_dir)
  def testCrossedColumnNotSupportRealValuedColumn(self):
    b = fc.sparse_column_with_hash_bucket("bbb", hash_bucket_size=100)
    with self.assertRaisesRegexp(
        TypeError, "columns must be a set of _SparseColumn, _CrossedColumn, "
        "or _BucketizedColumn instances"):
      fc.crossed_column(
          set([b, fc.real_valued_column("real")]), hash_bucket_size=10000)
  def testMakePlaceHolderTensorsForBaseFeatures(self):
    sparse_col = fc.sparse_column_with_hash_bucket(
        "sparse_column", hash_bucket_size=100)
    real_valued_col = fc.real_valued_column("real_valued_column", 5)
    vlen_real_valued_col = fc.real_valued_column(
        "vlen_real_valued_column", dimension=None)

    bucketized_col = fc.bucketized_column(
        fc.real_valued_column("real_valued_column_for_bucketization"), [0, 4])
    feature_columns = set(
        [sparse_col, real_valued_col, vlen_real_valued_col, bucketized_col])
    placeholders = (
        fc.make_place_holder_tensors_for_base_features(feature_columns))

    self.assertEqual(4, len(placeholders))
    self.assertTrue(
        isinstance(placeholders["sparse_column"],
                   sparse_tensor_lib.SparseTensor))
    self.assertTrue(
        isinstance(placeholders["vlen_real_valued_column"],
                   sparse_tensor_lib.SparseTensor))
    placeholder = placeholders["real_valued_column"]
    self.assertGreaterEqual(
        placeholder.name.find(u"Placeholder_real_valued_column"), 0)
    self.assertEqual(dtypes.float32, placeholder.dtype)
    self.assertEqual([None, 5], placeholder.get_shape().as_list())
    placeholder = placeholders["real_valued_column_for_bucketization"]
    self.assertGreaterEqual(
        placeholder.name.find(
            u"Placeholder_real_valued_column_for_bucketization"), 0)
    self.assertEqual(dtypes.float32, placeholder.dtype)
    self.assertEqual([None, 1], placeholder.get_shape().as_list())
Example #13
  def testLinearModel(self):
    """Tests that loss goes down with training."""

    def input_fn():
      return {
          'age':
              constant_op.constant([1]),
          'language':
              sparse_tensor.SparseTensor(
                  values=['english'], indices=[[0, 0]], dense_shape=[1, 1])
      }, constant_op.constant([[1]])

    language = feature_column.sparse_column_with_hash_bucket('language', 100)
    age = feature_column.real_valued_column('age')

    head = head_lib._multi_class_head(n_classes=2)
    classifier = _linear_estimator(head, feature_columns=[age, language])

    classifier.fit(input_fn=input_fn, steps=1000)
    loss1 = classifier.evaluate(input_fn=input_fn, steps=1)['loss']
    classifier.fit(input_fn=input_fn, steps=2000)
    loss2 = classifier.evaluate(input_fn=input_fn, steps=1)['loss']
    self.assertLess(loss2, loss1)
    self.assertLess(loss2, 0.01)
Example #14
  def testSparseFeatures(self):
    """Tests SDCALogisticClassifier with sparse features."""

    def input_fn():
      return {
          'example_id':
              constant_op.constant(['1', '2', '3']),
          'price':
              constant_op.constant([[0.4], [0.6], [0.3]]),
          'country':
              sparse_tensor.SparseTensor(
                  values=['IT', 'US', 'GB'],
                  indices=[[0, 0], [1, 3], [2, 1]],
                  dense_shape=[3, 5]),
          'weights':
              constant_op.constant([[1.0], [1.0], [1.0]])
      }, constant_op.constant([[1], [0], [1]])

    with self._single_threaded_test_session():
      price = feature_column_lib.real_valued_column('price')
      country = feature_column_lib.sparse_column_with_hash_bucket(
          'country', hash_bucket_size=5)
      classifier = sdca_estimator.SDCALogisticClassifier(
          example_id_column='example_id',
          feature_columns=[price, country],
          weight_column_name='weights')
      classifier.fit(input_fn=input_fn, steps=50)
      metrics = classifier.evaluate(input_fn=input_fn, steps=1)
      self.assertGreater(metrics['accuracy'], 0.9)
  def testEmbeddingColumn(self):
    a = fc.sparse_column_with_hash_bucket(
        "aaa", hash_bucket_size=100, combiner="sum")
    b = fc.embedding_column(a, dimension=4, combiner="mean")
    self.assertEqual(b.sparse_id_column.name, "aaa")
    self.assertEqual(b.dimension, 4)
    self.assertEqual(b.combiner, "mean")
Example #17
  def testExport(self):
    """Tests export model for servo."""

    def input_fn():
      return {
          'age':
              constant_op.constant([1]),
          'language':
              sparse_tensor.SparseTensor(
                  values=['english'], indices=[[0, 0]], dense_shape=[1, 1])
      }, constant_op.constant([[1]])

    language = feature_column.sparse_column_with_hash_bucket('language', 100)
    feature_columns = [
        feature_column.real_valued_column('age'),
        feature_column.embedding_column(
            language, dimension=1)
    ]

    classifier = debug.DebugClassifier(config=run_config.RunConfig(
        tf_random_seed=1))
    classifier.fit(input_fn=input_fn, steps=5)

    def default_input_fn(unused_estimator, examples):
      return feature_column_ops.parse_feature_columns_from_examples(
          examples, feature_columns)

    export_dir = tempfile.mkdtemp()
    classifier.export(export_dir, input_fn=default_input_fn)
Example #18
  def testSparseFeatures(self):
    """Tests SVM classifier with (hashed) sparse features."""

    def input_fn():
      return {
          'example_id':
              constant_op.constant(['1', '2', '3']),
          'price':
              constant_op.constant([[0.8], [0.6], [0.3]]),
          'country':
              sparse_tensor.SparseTensor(
                  values=['IT', 'US', 'GB'],
                  indices=[[0, 0], [1, 0], [2, 0]],
                  dense_shape=[3, 1]),
      }, constant_op.constant([[0], [1], [1]])

    price = feature_column.real_valued_column('price')
    country = feature_column.sparse_column_with_hash_bucket(
        'country', hash_bucket_size=5)
    svm_classifier = svm.SVM(feature_columns=[price, country],
                             example_id_column='example_id',
                             l1_regularization=0.0,
                             l2_regularization=1.0)
    svm_classifier.fit(input_fn=input_fn, steps=30)
    accuracy = svm_classifier.evaluate(input_fn=input_fn, steps=1)['accuracy']
    self.assertAlmostEqual(accuracy, 1.0, places=3)
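
For reference, the SVM estimator above is minimizing a soft-margin hinge loss plus the configured regularization (l1_regularization=0.0, l2_regularization=1.0 here). A sketch of the per-example hinge loss, with the {0, 1} labels mapped to {-1, +1}:

def hinge_loss(label, logit):
  # Zero once the example clears the margin on the correct side;
  # grows linearly with the violation otherwise.
  y = 2 * label - 1  # map {0, 1} -> {-1, +1}
  return max(0.0, 1.0 - y * logit)

print(hinge_loss(1, 2.0))   # 0.0: beyond the margin
print(hinge_loss(0, 0.5))   # 1.5: wrong side of the margin
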
Example #19
  def testWeightedSparseFeatures(self):
    """Tests SDCALogisticClassifier with weighted sparse features."""

    def input_fn():
      return {
          'example_id':
              constant_op.constant(['1', '2', '3']),
          'price':
              sparse_tensor.SparseTensor(
                  values=[2., 3., 1.],
                  indices=[[0, 0], [1, 0], [2, 0]],
                  dense_shape=[3, 5]),
          'country':
              sparse_tensor.SparseTensor(
                  values=['IT', 'US', 'GB'],
                  indices=[[0, 0], [1, 0], [2, 0]],
                  dense_shape=[3, 5])
      }, constant_op.constant([[1], [0], [1]])

    country = feature_column_lib.sparse_column_with_hash_bucket(
        'country', hash_bucket_size=5)
    country_weighted_by_price = feature_column_lib.weighted_sparse_column(
        country, 'price')
    classifier = sdca_estimator.SDCALogisticClassifier(
        example_id_column='example_id',
        feature_columns=[country_weighted_by_price])
    classifier.fit(input_fn=input_fn, steps=50)
    metrics = classifier.evaluate(input_fn=input_fn, steps=1)
    self.assertGreater(metrics['accuracy'], 0.9)
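
weighted_sparse_column pairs each id from the 'country' column with a float taken from the parallel 'price' feature, so the linear model sees weight-scaled contributions rather than plain indicator values. The semantics, roughly:

def weighted_linear_logit(id_weight_pairs, model_weights, bias=0.0):
  # Each (feature id, feature weight) pair contributes
  # feature_weight * model_weights[id] to the logit.
  return bias + sum(fw * model_weights.get(i, 0.0) for i, fw in id_weight_pairs)

w = {0: 0.4, 2: -0.1}
print(weighted_linear_logit([(0, 2.0), (2, 3.0)], w))  # 0.8 - 0.3 = 0.5
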
Example #21
  def testMixedFeatures(self):
    """Tests SDCALogisticClassifier with a mix of features."""

    def input_fn():
      return {
          'example_id':
              constant_op.constant(['1', '2', '3']),
          'price':
              constant_op.constant([[0.6], [0.8], [0.3]]),
          'sq_footage':
              constant_op.constant([[900.0], [700.0], [600.0]]),
          'country':
              sparse_tensor.SparseTensor(
                  values=['IT', 'US', 'GB'],
                  indices=[[0, 0], [1, 3], [2, 1]],
                  dense_shape=[3, 5]),
          'weights':
              constant_op.constant([[3.0], [1.0], [1.0]])
      }, constant_op.constant([[1], [0], [1]])

    price = feature_column_lib.real_valued_column('price')
    sq_footage_bucket = feature_column_lib.bucketized_column(
        feature_column_lib.real_valued_column('sq_footage'),
        boundaries=[650.0, 800.0])
    country = feature_column_lib.sparse_column_with_hash_bucket(
        'country', hash_bucket_size=5)
    sq_footage_country = feature_column_lib.crossed_column(
        [sq_footage_bucket, country], hash_bucket_size=10)
    classifier = sdca_estimator.SDCALogisticClassifier(
        example_id_column='example_id',
        feature_columns=[price, sq_footage_bucket, country, sq_footage_country],
        weight_column_name='weights')
    classifier.fit(input_fn=input_fn, steps=50)
    metrics = classifier.evaluate(input_fn=input_fn, steps=1)
    self.assertGreater(metrics['accuracy'], 0.9)
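
The bucketized 'sq_footage' column above turns a continuous value into a 3-way categorical feature (below 650, 650 to 800, 800 and up) that can then participate in the cross with 'country'. Assuming the usual left-inclusive bucket boundaries, the bucketing rule is just a sorted search:

import bisect

def bucketize(value, boundaries):
  # Index of the first boundary strictly greater than value:
  # 900 -> 2, 700 -> 1, 600 -> 0 for boundaries [650.0, 800.0].
  return bisect.bisect_right(boundaries, value)

print([bucketize(v, [650.0, 800.0]) for v in (900.0, 700.0, 600.0)])  # [2, 1, 0]
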
Example #22
  def testSparseFeatures(self):
    """Tests SDCALogisticClassifier with sparse features."""

    def input_fn():
      return {
          'example_id':
              constant_op.constant(['1', '2', '3']),
          'price':
              constant_op.constant([[0.4], [0.6], [0.3]]),
          'country':
              sparse_tensor.SparseTensor(
                  values=['IT', 'US', 'GB'],
                  indices=[[0, 0], [1, 3], [2, 1]],
                  dense_shape=[3, 5]),
          'weights':
              constant_op.constant([[1.0], [1.0], [1.0]])
      }, constant_op.constant([[1], [0], [1]])

    price = feature_column_lib.real_valued_column('price')
    country = feature_column_lib.sparse_column_with_hash_bucket(
        'country', hash_bucket_size=5)
    classifier = sdca_estimator.SDCALogisticClassifier(
        example_id_column='example_id',
        feature_columns=[price, country],
        weight_column_name='weights')
    classifier.fit(input_fn=input_fn, steps=50)
    metrics = classifier.evaluate(input_fn=input_fn, steps=1)
    self.assertGreater(metrics['accuracy'], 0.9)
Example #26
  def testSparseColumnHashBucketDeepCopy(self):
    """Tests deepcopy of sparse_column_with_hash_bucket."""
    column = fc.sparse_column_with_hash_bucket("a", 10)
    self.assertEqual("a", column.name)
    column_copy = copy.deepcopy(column)
    self.assertEqual("a", column_copy.name)
    self.assertEqual(10, column_copy.bucket_size)
    self.assertFalse(column_copy.is_integerized)
Example #28
  def testMultipliesGradient(self):
    embedding_language = feature_column.embedding_column(
        feature_column.sparse_column_with_hash_bucket('language', 10),
        dimension=1,
        initializer=init_ops.constant_initializer(0.1))
    embedding_wire = feature_column.embedding_column(
        feature_column.sparse_column_with_hash_bucket('wire', 10),
        dimension=1,
        initializer=init_ops.constant_initializer(0.1))

    params = {
        'feature_columns': [embedding_language, embedding_wire],
        'head': head_lib._multi_class_head(2),
        'hidden_units': [1],
        # Set lr mult to 0. to keep embeddings constant.
        'embedding_lr_multipliers': {
            embedding_language: 0.0
        },
    }
    features = {
        'language':
            sparse_tensor.SparseTensor(
                values=['en', 'fr', 'zh'],
                indices=[[0, 0], [1, 0], [2, 0]],
                dense_shape=[3, 1]),
        'wire':
            sparse_tensor.SparseTensor(
                values=['omar', 'stringer', 'marlo'],
                indices=[[0, 0], [1, 0], [2, 0]],
                dense_shape=[3, 1]),
    }
    labels = constant_op.constant([[0], [0], [0]], dtype=dtypes.int32)
    model_ops = dnn._dnn_model_fn(features, labels, model_fn.ModeKeys.TRAIN,
                                  params)
    with monitored_session.MonitoredSession() as sess:
      language_var = dnn_linear_combined._get_embedding_variable(
          embedding_language, 'dnn', 'dnn/input_from_feature_columns')
      wire_var = dnn_linear_combined._get_embedding_variable(
          embedding_wire, 'dnn', 'dnn/input_from_feature_columns')
      for _ in range(2):
        _, language_value, wire_value = sess.run(
            [model_ops.train_op, language_var, wire_var])
      initial_value = np.full_like(language_value, 0.1)
      self.assertTrue(np.all(np.isclose(language_value, initial_value)))
      self.assertFalse(np.all(np.isclose(wire_value, initial_value)))
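
The embedding_lr_multipliers entry above scales whatever gradient the optimizer would apply to that one embedding variable, so a multiplier of 0.0 freezes the language embedding at its 0.1 initialization while the wire embedding keeps training, which is exactly what the closing assertions verify. For plain SGD the rule reduces to:

def sgd_step(value, grad, learning_rate, lr_multiplier=1.0):
  # lr_multiplier == 0.0 freezes the variable; 1.0 is an ordinary step.
  return value - learning_rate * lr_multiplier * grad

print(sgd_step(0.1, 0.5, 0.05, lr_multiplier=0.0))  # 0.1, unchanged
print(sgd_step(0.1, 0.5, 0.05))                     # 0.075
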
  def testEmbeddingColumnDeepCopy(self):
    a = fc.sparse_column_with_hash_bucket(
        "aaa", hash_bucket_size=100, combiner="sum")
    column = fc.embedding_column(a, dimension=4, combiner="mean")
    column_copy = copy.deepcopy(column)
    self.assertEqual(column_copy.name, "aaa_embedding")
    self.assertEqual(column_copy.sparse_id_column.name, "aaa")
    self.assertEqual(column_copy.dimension, 4)
    self.assertEqual(column_copy.combiner, "mean")
Example #30
  def test_make_parsing_export_strategy(self):
    """Only tests that an ExportStrategy instance is created."""
    sparse_col = fc.sparse_column_with_hash_bucket(
        "sparse_column", hash_bucket_size=100)
    embedding_col = fc.embedding_column(
        fc.sparse_column_with_hash_bucket(
            "sparse_column_for_embedding", hash_bucket_size=10),
        dimension=4)
    real_valued_col1 = fc.real_valued_column("real_valued_column1")
    bucketized_col1 = fc.bucketized_column(
        fc.real_valued_column("real_valued_column_for_bucketization1"), [0, 4])
    feature_columns = [sparse_col, embedding_col, real_valued_col1,
                       bucketized_col1]

    export_strategy = saved_model_export_utils.make_parsing_export_strategy(
        feature_columns=feature_columns)
    self.assertTrue(
        isinstance(export_strategy, export_strategy_lib.ExportStrategy))
Example #33
  def testInitEmbeddingColumnWeightsFromCkpt(self):
    sparse_col = fc.sparse_column_with_hash_bucket(
        column_name="object_in_image", hash_bucket_size=4)
    # Create _EmbeddingColumn which randomly initializes embedding of size
    # [4, 16].
    embedding_col = fc.embedding_column(sparse_col, dimension=16)

    # Creating a SparseTensor which has all the ids possible for the given
    # vocab.
    input_tensor = sparse_tensor_lib.SparseTensor(
        indices=[[0, 0], [1, 1], [2, 2], [3, 3]],
        values=[0, 1, 2, 3],
        dense_shape=[4, 4])

    # Invoking 'layers.input_from_feature_columns' will create the embedding
    # variable. Creating under scope 'run_1' so as to prevent name conflicts
    # when creating embedding variable for 'embedding_column_pretrained'.
    with variable_scope.variable_scope("run_1"):
      with variable_scope.variable_scope(embedding_col.name):
        # This will return a [4, 16] tensor which is same as embedding variable.
        embeddings = feature_column_ops.input_from_feature_columns({
            embedding_col: input_tensor
        }, [embedding_col])

    save = saver.Saver()
    ckpt_dir_prefix = os.path.join(self.get_temp_dir(),
                                   "init_embedding_col_w_from_ckpt")
    ckpt_dir = tempfile.mkdtemp(prefix=ckpt_dir_prefix)
    checkpoint_path = os.path.join(ckpt_dir, "model.ckpt")

    with self.test_session() as sess:
      sess.run(variables.global_variables_initializer())
      saved_embedding = embeddings.eval()
      save.save(sess, checkpoint_path)

    embedding_col_initialized = fc.embedding_column(
        sparse_id_column=sparse_col,
        dimension=16,
        ckpt_to_load_from=checkpoint_path,
        tensor_name_in_ckpt=("run_1/object_in_image_embedding/"
                             "input_from_feature_columns/object"
                             "_in_image_embedding/weights"))

    with variable_scope.variable_scope("run_2"):
      # This will initialize the embedding from provided checkpoint and return a
      # [4, 16] tensor which is same as embedding variable. Since we didn't
      # modify embeddings, this should be same as 'saved_embedding'.
      pretrained_embeddings = feature_column_ops.input_from_feature_columns({
          embedding_col_initialized: input_tensor
      }, [embedding_col_initialized])

    with self.test_session() as sess:
      sess.run(variables.global_variables_initializer())
      loaded_embedding = pretrained_embeddings.eval()

    self.assertAllClose(saved_embedding, loaded_embedding)
Example #35
  def testOneHotColumn(self):
    a = fc.sparse_column_with_keys("a", ["a", "b", "c", "d"])
    onehot_a = fc.one_hot_column(a)
    self.assertEqual(onehot_a.sparse_id_column.name, "a")
    self.assertEqual(onehot_a.length, 4)

    b = fc.sparse_column_with_hash_bucket(
        "b", hash_bucket_size=100, combiner="sum")
    onehot_b = fc.one_hot_column(b)
    self.assertEqual(onehot_b.sparse_id_column.name, "b")
    self.assertEqual(onehot_b.length, 100)
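
one_hot_column densifies a sparse id column into an indicator vector whose length is the vocabulary size (4 keys) or the hash_bucket_size (100), as the two assertions show. The densification itself:

def one_hot(ids, length):
  # Indicator vector: slot i is 1.0 iff id i occurred (duplicates collapse).
  vec = [0.0] * length
  for i in ids:
    vec[i] = 1.0
  return vec

print(one_hot([2], 4))  # [0.0, 0.0, 1.0, 0.0]
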
Example #37
  def testEmbeddingMultiplier(self):
    embedding_language = feature_column.embedding_column(
        feature_column.sparse_column_with_hash_bucket('language', 10),
        dimension=1,
        initializer=init_ops.constant_initializer(0.1))
    classifier = dnn.DNNClassifier(
        feature_columns=[embedding_language],
        hidden_units=[3, 3],
        embedding_lr_multipliers={embedding_language: 0.8})
    self.assertEqual({
        embedding_language: 0.8
    }, classifier._estimator.params['embedding_lr_multipliers'])
Example #38
  def testTrainWithPartitionedVariables(self):
    """Tests training with partitioned variables."""

    def _input_fn(num_epochs=None):
      features = {
          'age':
              input_lib.limit_epochs(
                  constant_op.constant([[.8], [.2], [.1]]),
                  num_epochs=num_epochs),
          'language':
              sparse_tensor.SparseTensor(
                  values=input_lib.limit_epochs(
                      ['en', 'fr', 'zh'], num_epochs=num_epochs),
                  indices=[[0, 0], [0, 1], [2, 0]],
                  dense_shape=[3, 2])
      }
      return features, constant_op.constant([[1], [0], [0]], dtype=dtypes.int32)

    # The given hash_bucket_size results in variables larger than the
    # default min_slice_size attribute, so the variables are partitioned.
    sparse_column = feature_column.sparse_column_with_hash_bucket(
        'language', hash_bucket_size=2e7)
    feature_columns = [
        feature_column.embedding_column(
            sparse_column, dimension=1)
    ]

    tf_config = {
        'cluster': {
            run_config.TaskType.PS: ['fake_ps_0', 'fake_ps_1']
        }
    }
    with test.mock.patch.dict('os.environ',
                              {'TF_CONFIG': json.dumps(tf_config)}):
      config = run_config.RunConfig(tf_random_seed=1)
      # Because we did not start a distributed cluster, we need to pass an
      # empty ClusterSpec, otherwise the device_setter will look for
      # distributed jobs, such as "/job:ps" which are not present.
      config._cluster_spec = server_lib.ClusterSpec({})

    classifier = dnn.DNNClassifier(
        n_classes=3,
        feature_columns=feature_columns,
        hidden_units=[3, 3],
        config=config)

    classifier.fit(input_fn=_input_fn, steps=5)
    scores = classifier.evaluate(input_fn=_input_fn, steps=1)
    self._assertInRange(0.0, 1.0, scores['accuracy'])
    self.assertIn('loss', scores)
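
The partitioning comment above can be made concrete. With hash_bucket_size=2e7 and an embedding dimension of 1, the variable holds 2e7 float32 values, roughly 76 MiB; assuming the estimator's default min_slice_size of 64 MB (the tf.contrib.learn partitioner default, to the best of my knowledge), that exceeds one slice, so the variable is split across the two fake parameter servers:

num_buckets = int(2e7)
dimension = 1
bytes_per_float32 = 4
variable_bytes = num_buckets * dimension * bytes_per_float32
min_slice_size = 64 << 20  # assumed default slice size, 64 MB

print(variable_bytes / 2.0**20)          # ~76.3 MiB
print(variable_bytes > min_slice_size)   # True -> partitioned
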
  def benchmarkTensorData(self):

    def _input_fn():
      iris = test_data.prepare_iris_data_for_logistic_regression()
      features = {}
      for i in range(4):
        # The following shows how to provide the Tensor data for
        # RealValuedColumns.
        features.update({
            str(i):
                array_ops.reshape(
                    constant_op.constant(
                        iris.data[:, i], dtype=dtypes.float32), (-1, 1))
        })
      # The following shows how to provide the SparseTensor data for
      # a SparseColumn.
      features['dummy_sparse_column'] = sparse_tensor.SparseTensor(
          values=('en', 'fr', 'zh'),
          indices=((0, 0), (0, 1), (60, 0)),
          dense_shape=(len(iris.target), 2))
      labels = array_ops.reshape(
          constant_op.constant(
              iris.target, dtype=dtypes.int32), (-1, 1))
      return features, labels

    iris = test_data.prepare_iris_data_for_logistic_regression()
    cont_features = [
        feature_column.real_valued_column(str(i)) for i in range(4)
    ]
    linear_features = [
        feature_column.bucketized_column(
            cont_features[i],
            test_data.get_quantile_based_buckets(iris.data[:, i], 10))
        for i in range(4)
    ]
    linear_features.append(
        feature_column.sparse_column_with_hash_bucket(
            'dummy_sparse_column', hash_bucket_size=100))

    classifier = dnn_linear_combined.DNNLinearCombinedClassifier(
        model_dir=tempfile.mkdtemp(),
        linear_feature_columns=linear_features,
        dnn_feature_columns=cont_features,
        dnn_hidden_units=(3, 3))

    metrics = classifier.fit(input_fn=_input_fn, steps=_ITERS).evaluate(
        input_fn=_input_fn, steps=100)
    self._assertSingleClassMetrics(metrics)
Example #41
  def testExtractFeaturesWithTransformation(self):
    """Tests feature extraction."""
    with self.test_session():
      features = {}
      features["dense_float"] = array_ops.zeros([2, 1], dtypes.float32)
      features["sparse_float"] = sparse_tensor.SparseTensor(
          array_ops.zeros([2, 2], dtypes.int64),
          array_ops.zeros([2], dtypes.float32),
          array_ops.zeros([2], dtypes.int64))
      features["sparse_categorical"] = sparse_tensor.SparseTensor(
          array_ops.zeros([2, 2], dtypes.int64),
          array_ops.zeros([2], dtypes.string),
          array_ops.zeros([2], dtypes.int64))
      feature_columns = set()
      feature_columns.add(layers.real_valued_column("dense_float"))
      feature_columns.add(
          layers.feature_column._real_valued_var_len_column(
              "sparse_float", is_sparse=True))
      feature_columns.add(
          feature_column_lib.sparse_column_with_hash_bucket(
              "sparse_categorical", hash_bucket_size=1000000))
      (fc_names, dense_floats, sparse_float_indices, sparse_float_values,
       sparse_float_shapes, sparse_int_indices, sparse_int_values,
       sparse_int_shapes) = (gbdt_batch.extract_features(
           features, feature_columns))
      self.assertEqual(len(fc_names), 3)
      self.assertAllEqual(fc_names,
                          ["dense_float", "sparse_float", "sparse_categorical"])
      self.assertEqual(len(dense_floats), 1)
      self.assertEqual(len(sparse_float_indices), 1)
      self.assertEqual(len(sparse_float_values), 1)
      self.assertEqual(len(sparse_float_shapes), 1)
      self.assertEqual(len(sparse_int_indices), 1)
      self.assertEqual(len(sparse_int_values), 1)
      self.assertEqual(len(sparse_int_shapes), 1)
      self.assertAllEqual(dense_floats[0].eval(),
                          features["dense_float"].eval())
      self.assertAllEqual(sparse_float_indices[0].eval(),
                          features["sparse_float"].indices.eval())
      self.assertAllEqual(sparse_float_values[0].eval(),
                          features["sparse_float"].values.eval())
      self.assertAllEqual(sparse_float_shapes[0].eval(),
                          features["sparse_float"].dense_shape.eval())
      self.assertAllEqual(sparse_int_indices[0].eval(),
                          features["sparse_categorical"].indices.eval())
      self.assertAllEqual(sparse_int_values[0].eval(), [397263, 397263])
      self.assertAllEqual(sparse_int_shapes[0].eval(),
                          features["sparse_categorical"].dense_shape.eval())
Example #43
  def testPredict_AsIterable(self):
    """Tests predict and predict_prob methods with as_iterable=True."""

    def _input_fn(num_epochs=None):
      features = {
          'age':
              input_lib.limit_epochs(
                  constant_op.constant([[.8], [.2], [.1]]),
                  num_epochs=num_epochs),
          'language':
              sparse_tensor.SparseTensor(
                  values=input_lib.limit_epochs(
                      ['en', 'fr', 'zh'], num_epochs=num_epochs),
                  indices=[[0, 0], [0, 1], [2, 0]],
                  dense_shape=[3, 2])
      }
      return features, constant_op.constant([[1], [0], [0]], dtype=dtypes.int32)

    language_column = feature_column.sparse_column_with_hash_bucket(
        'language', hash_bucket_size=20)
    feature_columns = [
        feature_column.embedding_column(
            language_column, dimension=1),
        feature_column.real_valued_column('age')
    ]

    classifier = dnn.DNNClassifier(
        n_classes=3,
        feature_columns=feature_columns,
        hidden_units=[3, 3],
        config=run_config.RunConfig(tf_random_seed=1))

    classifier.fit(input_fn=_input_fn, steps=200)

    scores = classifier.evaluate(input_fn=_input_fn, steps=1)
    self._assertInRange(0.0, 1.0, scores['accuracy'])
    self.assertIn('loss', scores)
    predict_input_fn = functools.partial(_input_fn, num_epochs=1)
    predictions = list(
        classifier.predict(
            input_fn=predict_input_fn, as_iterable=True))
    self.assertListEqual(predictions, [1, 0, 0])
    predictions = list(
        classifier.predict_proba(
            input_fn=predict_input_fn, as_iterable=True))
    self.assertAllClose(
        predictions, [[0., 1., 0.], [1., 0., 0.], [1., 0., 0.]], atol=0.3)
  def benchmarkLogisticFloatLabel(self):

    def _input_fn(num_epochs=None):
      features = {
          'age':
              input_lib.limit_epochs(
                  constant_op.constant(((50,), (20,), (10,))),
                  num_epochs=num_epochs),
          'language':
              sparse_tensor.SparseTensor(
                  values=input_lib.limit_epochs(
                      ('en', 'fr', 'zh'), num_epochs=num_epochs),
                  indices=((0, 0), (0, 1), (2, 0)),
                  dense_shape=(3, 2))
      }
      return features, constant_op.constant(
          ((0.8,), (0.,), (0.2,)), dtype=dtypes.float32)

    lang_column = feature_column.sparse_column_with_hash_bucket(
        'language', hash_bucket_size=20)
    n_classes = 2
    classifier = dnn.DNNClassifier(
        n_classes=n_classes,
        feature_columns=(feature_column.embedding_column(
            lang_column, dimension=1),
                         feature_column.real_valued_column('age')),
        hidden_units=(3, 3),
        config=run_config.RunConfig(tf_random_seed=1))
    steps = 1000
    metrics = classifier.fit(input_fn=_input_fn, steps=steps).evaluate(
        input_fn=_input_fn, steps=1)
    estimator_test_utils.assert_in_range(steps, steps + 5, 'global_step',
                                         metrics)

    # Prediction probabilities mirror the labels column, which proves that the
    # classifier learns from float input.
    self._report_metrics(metrics)
    self._report_predictions(
        classifier=classifier,
        input_fn=functools.partial(_input_fn, num_epochs=1),
        iters=metrics['global_step'],
        n_examples=3,
        n_classes=n_classes,
        expected_probabilities=((0.2, 0.8), (1., 0.), (0.8, 0.2)),
        expected_classes=(1, 0, 0),
        benchmark_name_override=(
            'DNNClassifierBenchmark.benchmarkLogisticFloatLabel_predictions'))
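
Why float labels work here: the binary logistic head treats each label as a target probability, and cross-entropy over p is minimized exactly at p equal to the label, so the learned probabilities mirror (0.8, 0., 0.2), just as expected_probabilities encodes. A quick numeric check:

import math

def binary_cross_entropy(target, p):
  # Minimized over p exactly at p == target, so a float label acts as
  # a target probability for P(class=1).
  eps = 1e-7
  p = min(max(p, eps), 1 - eps)
  return -(target * math.log(p) + (1 - target) * math.log(1 - p))

print(binary_cross_entropy(0.8, 0.8) < binary_cross_entropy(0.8, 0.5))  # True
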
Example #46
  def testTrainSaveLoad(self):
    """Tests that insures you can save and reload a trained model."""

    def _input_fn(num_epochs=None):
      features = {
          'age':
              input_lib.limit_epochs(
                  constant_op.constant([[.8], [.2], [.1]]),
                  num_epochs=num_epochs),
          'language':
              sparse_tensor.SparseTensor(
                  values=input_lib.limit_epochs(
                      ['en', 'fr', 'zh'], num_epochs=num_epochs),
                  indices=[[0, 0], [0, 1], [2, 0]],
                  dense_shape=[3, 2])
      }
      return features, constant_op.constant([[1], [0], [0]], dtype=dtypes.int32)

    sparse_column = feature_column.sparse_column_with_hash_bucket(
        'language', hash_bucket_size=20)
    feature_columns = [
        feature_column.embedding_column(
            sparse_column, dimension=1)
    ]

    model_dir = tempfile.mkdtemp()
    classifier = dnn.DNNClassifier(
        model_dir=model_dir,
        n_classes=3,
        feature_columns=feature_columns,
        hidden_units=[3, 3],
        config=run_config.RunConfig(tf_random_seed=1))

    classifier.fit(input_fn=_input_fn, steps=5)
    predict_input_fn = functools.partial(_input_fn, num_epochs=1)
    # Materialize the predictions before deleting the estimator.
    predictions1 = list(classifier.predict(input_fn=predict_input_fn))
    del classifier

    classifier2 = dnn.DNNClassifier(
        model_dir=model_dir,
        n_classes=3,
        feature_columns=feature_columns,
        hidden_units=[3, 3],
        config=run_config.RunConfig(tf_random_seed=1))
    predictions2 = classifier2.predict(input_fn=predict_input_fn)
    self.assertEqual(predictions1, list(predictions2))
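
An aside on why the reload works: both estimators point at the same model_dir, and the second one restores the latest checkpoint written by fit. A hedged sketch reusing model_dir from the test above (tf.train.latest_checkpoint is the standard TF 1.x lookup; it is not part of the original test):

import tensorflow as tf

# After fit(steps=5), model_dir holds a checkpoint such as model.ckpt-5.
print(tf.train.latest_checkpoint(model_dir))
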
Example #47
  def testTrainSaveLoad(self):
    """Tests that insures you can save and reload a trained model."""

    def _input_fn(num_epochs=None):
      features = {
          'age':
              input_lib.limit_epochs(
                  constant_op.constant([[0.8], [0.15], [0.]]),
                  num_epochs=num_epochs),
          'language':
              sparse_tensor.SparseTensor(
                  values=input_lib.limit_epochs(
                      ['en', 'fr', 'zh'], num_epochs=num_epochs),
                  indices=[[0, 0], [0, 1], [2, 0]],
                  dense_shape=[3, 2])
      }
      return features, constant_op.constant([1., 0., 0.2], dtype=dtypes.float32)

    sparse_column = feature_column.sparse_column_with_hash_bucket(
        'language', hash_bucket_size=20)
    feature_columns = [
        feature_column.embedding_column(
            sparse_column, dimension=1),
        feature_column.real_valued_column('age')
    ]

    model_dir = tempfile.mkdtemp()
    regressor = dnn.DNNRegressor(
        model_dir=model_dir,
        feature_columns=feature_columns,
        hidden_units=[3, 3],
        config=run_config.RunConfig(tf_random_seed=1))

    regressor.fit(input_fn=_input_fn, steps=5)
    predict_input_fn = functools.partial(_input_fn, num_epochs=1)
    predictions = list(regressor.predict(input_fn=predict_input_fn))
    del regressor

    regressor2 = dnn.DNNRegressor(
        model_dir=model_dir,
        feature_columns=feature_columns,
        hidden_units=[3, 3],
        config=run_config.RunConfig(tf_random_seed=1))
    predictions2 = list(regressor2.predict(input_fn=predict_input_fn))
    self.assertAllClose(predictions, predictions2)
Example #48
    def benchmarkLogisticTensorData(self):
        def _input_fn(num_epochs=None):
            features = {
                'age':
                input_lib.limit_epochs(constant_op.constant(
                    ((.8, ), (0.2, ), (.1, ))),
                                       num_epochs=num_epochs),
                'language':
                sparse_tensor.SparseTensor(values=input_lib.limit_epochs(
                    ('en', 'fr', 'zh'), num_epochs=num_epochs),
                                           indices=((0, 0), (0, 1), (2, 0)),
                                           dense_shape=(3, 2))
            }
            return features, constant_op.constant(((1, ), (0, ), (0, )),
                                                  dtype=dtypes.int32)

        lang_column = feature_column.sparse_column_with_hash_bucket(
            'language', hash_bucket_size=20)
        classifier = dnn.DNNClassifier(
            feature_columns=(feature_column.embedding_column(lang_column,
                                                             dimension=1),
                             feature_column.real_valued_column('age')),
            hidden_units=(3, 3),
            config=run_config.RunConfig(tf_random_seed=1))
        steps = 100
        metrics = classifier.fit(input_fn=_input_fn,
                                 steps=steps).evaluate(input_fn=_input_fn,
                                                       steps=1)
        estimator_test_utils.assert_in_range(steps, steps + 5, 'global_step',
                                             metrics)
        estimator_test_utils.assert_in_range(0.9, 1.0, 'accuracy', metrics)
        estimator_test_utils.assert_in_range(0.0, 0.3, 'loss', metrics)

        self._report_metrics(metrics)
        self._report_predictions(
            classifier=classifier,
            input_fn=functools.partial(_input_fn, num_epochs=1),
            iters=metrics['global_step'],
            n_examples=3,
            n_classes=2,
            expected_classes=(1, 0, 0),
            benchmark_name_override=(
                'DNNClassifierBenchmark.benchmarkLogisticTensorData_predictions'
            ))
    def benchmarkPartitionedVariables(self):
        def _input_fn():
            features = {
                'language':
                sparse_tensor.SparseTensor(values=('en', 'fr', 'zh'),
                                           indices=((0, 0), (0, 1), (2, 0)),
                                           dense_shape=(3, 2))
            }
            labels = constant_op.constant(((1, ), (0, ), (0, )))
            return features, labels

        # The given hash_bucket_size results in variables larger than the
        # default min_slice_size attribute, so the variables are partitioned.
        sparse_feature = feature_column.sparse_column_with_hash_bucket(
            'language', hash_bucket_size=2e7)
        embedding_feature = feature_column.embedding_column(sparse_feature,
                                                            dimension=1)

        tf_config = {
            'cluster': {
                run_config.TaskType.PS: ['fake_ps_0', 'fake_ps_1']
            }
        }
        with test.mock.patch.dict('os.environ',
                                  {'TF_CONFIG': json.dumps(tf_config)}):
            config = run_config.RunConfig()
            # Because we did not start a distributed cluster, we need to pass an
            # empty ClusterSpec, otherwise the device_setter will look for
            # distributed jobs, such as "/job:ps" which are not present.
            config._cluster_spec = server_lib.ClusterSpec({})

        classifier = dnn_linear_combined.DNNLinearCombinedClassifier(
            linear_feature_columns=(sparse_feature, ),
            dnn_feature_columns=(embedding_feature, ),
            dnn_hidden_units=(3, 3),
            config=config)

        metrics = classifier.fit(input_fn=_input_fn,
                                 steps=_ITERS).evaluate(input_fn=_input_fn,
                                                        steps=100)
        self._assertCommonMetrics(metrics)
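
Why the 2e7-bucket column gets sharded: its weight variable is roughly 2e7 * 4 bytes = 80 MB, which exceeds the min_slice_size the contrib estimators pass to min_max_variable_partitioner (64 MB, if memory serves; treat the exact constant as an assumption). A self-contained sketch of the same mechanism, with sizes chosen so the shard count is unambiguous:

from tensorflow.python.framework import dtypes
from tensorflow.python.ops import partitioned_variables
from tensorflow.python.ops import variable_scope

# 2**25 float32 elements == 128 MB, an exact multiple of a 64 MB
# min_slice_size, so with two parameter servers the variable is split into
# exactly 2 shards along axis 0.
partitioner = partitioned_variables.min_max_variable_partitioner(
    max_partitions=2, axis=0, min_slice_size=64 << 20)
with variable_scope.variable_scope('sketch', partitioner=partitioner):
    weights = variable_scope.get_variable(
        'weights', shape=[2 ** 25, 1], dtype=dtypes.float32)
print(len(weights))  # PartitionedVariable reports its shard count: 2
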
Example #50
  def testMixedFeaturesArbitraryWeightsPartitioned(self):
    """Tests SDCALinearRegressor works with a mix of features (partitioned)."""

    def input_fn():
      return {
          'example_id':
              constant_op.constant(['1', '2', '3']),
          'price':
              constant_op.constant([[0.6], [0.8], [0.3]]),
          'sq_footage':
              constant_op.constant([[900.0], [700.0], [600.0]]),
          'country':
              sparse_tensor.SparseTensor(
                  values=['IT', 'US', 'GB'],
                  indices=[[0, 0], [1, 3], [2, 1]],
                  dense_shape=[3, 5]),
          'weights':
              constant_op.constant([[3.0], [5.0], [7.0]])
      }, constant_op.constant([[1.55], [-1.25], [-3.0]])

    with self._single_threaded_test_session():
      price = feature_column_lib.real_valued_column('price')
      sq_footage_bucket = feature_column_lib.bucketized_column(
          feature_column_lib.real_valued_column('sq_footage'),
          boundaries=[650.0, 800.0])
      country = feature_column_lib.sparse_column_with_hash_bucket(
          'country', hash_bucket_size=5)
      sq_footage_country = feature_column_lib.crossed_column(
          [sq_footage_bucket, country], hash_bucket_size=10)
      regressor = sdca_estimator.SDCALinearRegressor(
          example_id_column='example_id',
          feature_columns=[
              price, sq_footage_bucket, country, sq_footage_country
          ],
          l2_regularization=1.0,
          weight_column_name='weights',
          partitioner=partitioned_variables.fixed_size_partitioner(
              num_shards=2, axis=0))
      regressor.fit(input_fn=input_fn, steps=20)
      loss = regressor.evaluate(input_fn=input_fn, steps=1)['loss']
      self.assertLess(loss, 0.05)
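
The fixed_size_partitioner used above is simpler than the min_slice_size-based default: it always yields exactly num_shards partitions along the given axis, whatever the variable's size. A small self-contained sketch (scope name hypothetical):

from tensorflow.python.framework import dtypes
from tensorflow.python.ops import partitioned_variables
from tensorflow.python.ops import variable_scope

# Splits any variable it governs into exactly num_shards pieces along axis 0,
# e.g. a [10, 1] crossed-column weight becomes two [5, 1] shards.
fixed = partitioned_variables.fixed_size_partitioner(num_shards=2, axis=0)
with variable_scope.variable_scope('fixed_sketch', partitioner=fixed):
  w = variable_scope.get_variable('w', shape=[10, 1], dtype=dtypes.float32)
print([v.get_shape().as_list() for v in w])  # [[5, 1], [5, 1]]
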
Example #51
  def testPartitionedMixedFeatures(self):
    """Tests SDCALogisticClassifier with a mix of features (partitioned)."""

    def input_fn():
      return {
          'example_id':
              constant_op.constant(['1', '2', '3']),
          'price':
              constant_op.constant([[0.6], [0.8], [0.3]]),
          'sq_footage':
              constant_op.constant([900.0, 700.0, 600.0]),
          'country':
              sparse_tensor.SparseTensor(
                  values=['IT', 'US', 'GB'],
                  indices=[[0, 0], [1, 3], [2, 1]],
                  dense_shape=[3, 5]),
          'weights':
              constant_op.constant([[3.0], [1.0], [1.0]])
      }, constant_op.constant([[1], [0], [1]])

    with self._single_threaded_test_session():
      price = feature_column_lib.real_valued_column('price')
      sq_footage_bucket = feature_column_lib.bucketized_column(
          feature_column_lib.real_valued_column('sq_footage'),
          boundaries=[650.0, 800.0])
      country = feature_column_lib.sparse_column_with_hash_bucket(
          'country', hash_bucket_size=5)
      sq_footage_country = feature_column_lib.crossed_column(
          [sq_footage_bucket, country], hash_bucket_size=10)
      classifier = sdca_estimator.SDCALogisticClassifier(
          example_id_column='example_id',
          feature_columns=[
              price, sq_footage_bucket, country, sq_footage_country
          ],
          weight_column_name='weights',
          partitioner=partitioned_variables.fixed_size_partitioner(
              num_shards=2, axis=0))
      classifier.fit(input_fn=input_fn, steps=50)
      metrics = classifier.evaluate(input_fn=input_fn, steps=1)
      self.assertGreater(metrics['accuracy'], 0.9)
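
A note on the 'weights' feature: weight_column_name makes each example count with the given weight in the loss and the evaluation metrics. A numpy sketch of a weighted-mean reduction (the per-example losses here are made up, and the exact reduction inside the contrib head is an assumption):

import numpy as np

# Hypothetical per-example losses; the weights match input_fn above.
per_example_loss = np.array([0.2, 0.1, 0.4])
example_weights = np.array([3.0, 1.0, 1.0])
weighted_loss = ((example_weights * per_example_loss).sum()
                 / example_weights.sum())
print(weighted_loss)  # 0.22
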
Example #52
    def testMixedFeatures(self):
        """Tests SVM classifier with a mix of features."""
        def input_fn():
            return {
                'example_id':
                constant_op.constant(['1', '2', '3']),
                'price':
                constant_op.constant([[0.6], [0.8], [0.3]]),
                'sq_footage':
                constant_op.constant([[900.0], [700.0], [600.0]]),
                'country':
                sparse_tensor.SparseTensor(values=['IT', 'US', 'GB'],
                                           indices=[[0, 0], [1, 3], [2, 1]],
                                           dense_shape=[3, 5]),
                'weights':
                constant_op.constant([[3.0], [1.0], [1.0]])
            }, constant_op.constant([[1], [0], [1]])

        price = feature_column.real_valued_column('price')
        sq_footage_bucket = feature_column.bucketized_column(
            feature_column.real_valued_column('sq_footage'),
            boundaries=[650.0, 800.0])
        country = feature_column.sparse_column_with_hash_bucket(
            'country', hash_bucket_size=5)
        sq_footage_country = feature_column.crossed_column(
            [sq_footage_bucket, country], hash_bucket_size=10)
        svm_classifier = svm.SVM(feature_columns=[
            price, sq_footage_bucket, country, sq_footage_country
        ],
                                 example_id_column='example_id',
                                 weight_column_name='weights',
                                 l1_regularization=0.1,
                                 l2_regularization=1.0)

        svm_classifier.fit(input_fn=input_fn, steps=30)
        accuracy = svm_classifier.evaluate(input_fn=input_fn,
                                           steps=1)['accuracy']
        self.assertAlmostEqual(accuracy, 1.0, places=3)
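
For context, the SVM estimator trains a linear model under hinge loss with the configured L1/L2 regularization (solved via the SDCA optimizer). A self-contained numpy sketch of the per-example hinge loss, assuming {0, 1} labels as in input_fn:

import numpy as np

def hinge_loss(labels, logits):
    """Per-example hinge loss for labels in {0, 1}, mapped to {-1, +1}."""
    signed = 2.0 * labels - 1.0
    return np.maximum(0.0, 1.0 - signed * logits)

print(hinge_loss(np.array([1.0, 0.0, 1.0]), np.array([2.0, -0.5, 0.3])))
# -> [0.   0.5  0.7]
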
    def testInitCrossedColumnWeightsFromCkpt(self):
        sparse_col_1 = fc.sparse_column_with_hash_bucket(column_name="col_1",
                                                         hash_bucket_size=4)
        sparse_col_2 = fc.sparse_column_with_hash_bucket(column_name="col_2",
                                                         hash_bucket_size=4)

        crossed_col = fc.crossed_column(columns=[sparse_col_1, sparse_col_2],
                                        hash_bucket_size=4)

        input_tensor = sparse_tensor_lib.SparseTensor(indices=[[0, 0], [1, 1],
                                                               [2, 2], [3, 3]],
                                                      values=[0, 1, 2, 3],
                                                      dense_shape=[4, 4])

        # Invoking 'weighted_sum_from_feature_columns' will create the crossed
        # column weights variable.
        with variable_scope.variable_scope("run_1"):
            with variable_scope.variable_scope(crossed_col.name):
                # Returns the looked-up column weights (which equal the
                # crossed column weights) as well as references to the
                # underlying weight variables.
                _, col_weights, _ = (
                    feature_column_ops.weighted_sum_from_feature_columns(
                        {
                            sparse_col_1.name: input_tensor,
                            sparse_col_2.name: input_tensor
                        }, [crossed_col], 1))
                # Update the weights, since the default initializer sets all
                # weights to 0.0.
                for weight in col_weights.values():
                    assign_op = state_ops.assign(weight[0], weight[0] + 0.5)

        save = saver.Saver()
        ckpt_dir_prefix = os.path.join(self.get_temp_dir(),
                                       "init_crossed_col_w_from_ckpt")
        ckpt_dir = tempfile.mkdtemp(prefix=ckpt_dir_prefix)
        checkpoint_path = os.path.join(ckpt_dir, "model.ckpt")

        with self.test_session() as sess:
            sess.run(variables.global_variables_initializer())
            sess.run(assign_op)
            saved_col_weights = col_weights[crossed_col][0].eval()
            save.save(sess, checkpoint_path)

        crossed_col_initialized = fc.crossed_column(
            columns=[sparse_col_1, sparse_col_2],
            hash_bucket_size=4,
            ckpt_to_load_from=checkpoint_path,
            tensor_name_in_ckpt=("run_1/col_1_X_col_2/"
                                 "weighted_sum_from_feature_columns/"
                                 "col_1_X_col_2/weights"))

        with variable_scope.variable_scope("run_2"):
            # This initializes the crossed column weights from the provided
            # checkpoint and returns a [4, 1] tensor that matches the weights
            # variable. Since the weights are not modified afterwards, it
            # should equal 'saved_col_weights'.
            _, col_weights, _ = (
                feature_column_ops.weighted_sum_from_feature_columns(
                    {
                        sparse_col_1.name: input_tensor,
                        sparse_col_2.name: input_tensor
                    }, [crossed_col_initialized], 1))
            col_weights_from_ckpt = col_weights[crossed_col_initialized][0]

        with self.test_session() as sess:
            sess.run(variables.global_variables_initializer())
            loaded_col_weights = col_weights_from_ckpt.eval()

        self.assertAllClose(saved_col_weights, loaded_col_weights)
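
The same warm-start arguments (ckpt_to_load_from, tensor_name_in_ckpt) also exist on embedding_column in this contrib API. A hedged sketch of the analogous pattern, reusing checkpoint_path from the test above and with a hypothetical tensor name:

warm_started_embedding = fc.embedding_column(
    fc.sparse_column_with_hash_bucket("some_column", hash_bucket_size=4),
    dimension=1,
    ckpt_to_load_from=checkpoint_path,
    # Hypothetical tensor name; the real one depends on the saving scope.
    tensor_name_in_ckpt="some_scope/some_column_embedding/weights")
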
    def testImmutability(self):
        a = fc.sparse_column_with_hash_bucket("aaa", hash_bucket_size=100)
        with self.assertRaises(AttributeError):
            a.column_name = "bbb"
    def testCreateFeatureSpec(self):
        sparse_col = fc.sparse_column_with_hash_bucket("sparse_column",
                                                       hash_bucket_size=100)
        embedding_col = fc.embedding_column(fc.sparse_column_with_hash_bucket(
            "sparse_column_for_embedding", hash_bucket_size=10),
                                            dimension=4)
        sparse_id_col = fc.sparse_column_with_keys(
            "id_column", ["marlo", "omar", "stringer"])
        weighted_id_col = fc.weighted_sparse_column(sparse_id_col,
                                                    "id_weights_column")
        real_valued_col1 = fc.real_valued_column("real_valued_column1")
        real_valued_col2 = fc.real_valued_column("real_valued_column2", 5)
        real_valued_col3 = fc.real_valued_column("real_valued_column3",
                                                 dimension=None)
        bucketized_col1 = fc.bucketized_column(
            fc.real_valued_column("real_valued_column_for_bucketization1"),
            [0, 4])
        bucketized_col2 = fc.bucketized_column(
            fc.real_valued_column("real_valued_column_for_bucketization2", 4),
            [0, 4])
        a = fc.sparse_column_with_hash_bucket("cross_aaa",
                                              hash_bucket_size=100)
        b = fc.sparse_column_with_hash_bucket("cross_bbb",
                                              hash_bucket_size=100)
        cross_col = fc.crossed_column(set([a, b]), hash_bucket_size=10000)
        feature_columns = set([
            sparse_col, embedding_col, weighted_id_col, real_valued_col1,
            real_valued_col2, real_valued_col3, bucketized_col1,
            bucketized_col2, cross_col
        ])
        expected_config = {
            "sparse_column":
            parsing_ops.VarLenFeature(dtypes.string),
            "sparse_column_for_embedding":
            parsing_ops.VarLenFeature(dtypes.string),
            "id_column":
            parsing_ops.VarLenFeature(dtypes.string),
            "id_weights_column":
            parsing_ops.VarLenFeature(dtypes.float32),
            "real_valued_column1":
            parsing_ops.FixedLenFeature([1], dtype=dtypes.float32),
            "real_valued_column2":
            parsing_ops.FixedLenFeature([5], dtype=dtypes.float32),
            "real_valued_column3":
            parsing_ops.VarLenFeature(dtype=dtypes.float32),
            "real_valued_column_for_bucketization1":
            parsing_ops.FixedLenFeature([1], dtype=dtypes.float32),
            "real_valued_column_for_bucketization2":
            parsing_ops.FixedLenFeature([4], dtype=dtypes.float32),
            "cross_aaa":
            parsing_ops.VarLenFeature(dtypes.string),
            "cross_bbb":
            parsing_ops.VarLenFeature(dtypes.string)
        }

        config = fc.create_feature_spec_for_parsing(feature_columns)
        self.assertDictEqual(expected_config, config)

        # Test that the same config is parsed out if we pass a dictionary.
        feature_columns_dict = {
            str(i): val
            for i, val in enumerate(feature_columns)
        }
        config = fc.create_feature_spec_for_parsing(feature_columns_dict)
        self.assertDictEqual(expected_config, config)
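
A hedged usage sketch of what the feature spec is for: it is exactly the features argument of parse_example. This reuses feature_columns from the test above; the placeholder stands in for serialized tf.Example protos:

from tensorflow.python.framework import dtypes
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import parsing_ops

spec = fc.create_feature_spec_for_parsing(feature_columns)
serialized = array_ops.placeholder(dtypes.string, shape=[None])
parsed = parsing_ops.parse_example(serialized=serialized, features=spec)
# 'parsed' maps each column name to a SparseTensor (for VarLenFeature) or a
# dense Tensor (for FixedLenFeature).
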
Example #56
  def testCreateFeatureSpec(self):
    sparse_col = fc.sparse_column_with_hash_bucket(
        "sparse_column", hash_bucket_size=100)
    embedding_col = fc.embedding_column(
        fc.sparse_column_with_hash_bucket(
            "sparse_column_for_embedding", hash_bucket_size=10),
        dimension=4)
    str_sparse_id_col = fc.sparse_column_with_keys(
        "str_id_column", ["marlo", "omar", "stringer"])
    int32_sparse_id_col = fc.sparse_column_with_keys(
        "int32_id_column", [42, 1, -1000], dtype=dtypes.int32)
    int64_sparse_id_col = fc.sparse_column_with_keys(
        "int64_id_column", [42, 1, -1000], dtype=dtypes.int64)
    weighted_id_col = fc.weighted_sparse_column(str_sparse_id_col,
                                                "str_id_weights_column")
    real_valued_col1 = fc.real_valued_column("real_valued_column1")
    real_valued_col2 = fc.real_valued_column("real_valued_column2", 5)
    real_valued_col3 = fc._real_valued_var_len_column(
        "real_valued_column3", is_sparse=True)
    real_valued_col4 = fc._real_valued_var_len_column(
        "real_valued_column4", dtype=dtypes.int64, default_value=0,
        is_sparse=False)
    bucketized_col1 = fc.bucketized_column(
        fc.real_valued_column("real_valued_column_for_bucketization1"), [0, 4])
    bucketized_col2 = fc.bucketized_column(
        fc.real_valued_column("real_valued_column_for_bucketization2", 4),
        [0, 4])
    a = fc.sparse_column_with_hash_bucket("cross_aaa", hash_bucket_size=100)
    b = fc.sparse_column_with_hash_bucket("cross_bbb", hash_bucket_size=100)
    cross_col = fc.crossed_column(set([a, b]), hash_bucket_size=10000)
    one_hot_col = fc.one_hot_column(fc.sparse_column_with_hash_bucket(
        "sparse_column_for_one_hot", hash_bucket_size=100))
    scattered_embedding_col = fc.scattered_embedding_column(
        "scattered_embedding_column", size=100, dimension=10, hash_key=1)
    feature_columns = set([
        sparse_col, embedding_col, weighted_id_col, int32_sparse_id_col,
        int64_sparse_id_col, real_valued_col1, real_valued_col2,
        real_valued_col3, real_valued_col4, bucketized_col1, bucketized_col2,
        cross_col, one_hot_col, scattered_embedding_col
    ])
    expected_config = {
        "sparse_column":
            parsing_ops.VarLenFeature(dtypes.string),
        "sparse_column_for_embedding":
            parsing_ops.VarLenFeature(dtypes.string),
        "str_id_column":
            parsing_ops.VarLenFeature(dtypes.string),
        "int32_id_column":
            parsing_ops.VarLenFeature(dtypes.int32),
        "int64_id_column":
            parsing_ops.VarLenFeature(dtypes.int64),
        "str_id_weights_column":
            parsing_ops.VarLenFeature(dtypes.float32),
        "real_valued_column1":
            parsing_ops.FixedLenFeature(
                [1], dtype=dtypes.float32),
        "real_valued_column2":
            parsing_ops.FixedLenFeature(
                [5], dtype=dtypes.float32),
        "real_valued_column3":
            parsing_ops.VarLenFeature(dtype=dtypes.float32),
        "real_valued_column4":
            parsing_ops.FixedLenSequenceFeature(
                [], dtype=dtypes.int64, allow_missing=True, default_value=0),
        "real_valued_column_for_bucketization1":
            parsing_ops.FixedLenFeature(
                [1], dtype=dtypes.float32),
        "real_valued_column_for_bucketization2":
            parsing_ops.FixedLenFeature(
                [4], dtype=dtypes.float32),
        "cross_aaa":
            parsing_ops.VarLenFeature(dtypes.string),
        "cross_bbb":
            parsing_ops.VarLenFeature(dtypes.string),
        "sparse_column_for_one_hot":
            parsing_ops.VarLenFeature(dtypes.string),
        "scattered_embedding_column":
            parsing_ops.VarLenFeature(dtypes.string),
    }

    config = fc.create_feature_spec_for_parsing(feature_columns)
    self.assertDictEqual(expected_config, config)

    # Test that the same config is parsed out if we pass a dictionary.
    feature_columns_dict = {
        str(i): val
        for i, val in enumerate(feature_columns)
    }
    config = fc.create_feature_spec_for_parsing(feature_columns_dict)
    self.assertDictEqual(expected_config, config)
  def testCreateFeatureSpec(self):
    sparse_col = fc.sparse_column_with_hash_bucket(
        "sparse_column", hash_bucket_size=100)
    embedding_col = fc.embedding_column(
        fc.sparse_column_with_hash_bucket(
            "sparse_column_for_embedding", hash_bucket_size=10),
        dimension=4)
    str_sparse_id_col = fc.sparse_column_with_keys(
        "str_id_column", ["marlo", "omar", "stringer"])
    int32_sparse_id_col = fc.sparse_column_with_keys(
        "int32_id_column", [42, 1, -1000], dtype=dtypes.int32)
    int64_sparse_id_col = fc.sparse_column_with_keys(
        "int64_id_column", [42, 1, -1000], dtype=dtypes.int64)
    weighted_id_col = fc.weighted_sparse_column(str_sparse_id_col,
                                                "str_id_weights_column")
    real_valued_col1 = fc.real_valued_column("real_valued_column1")
    real_valued_col2 = fc.real_valued_column("real_valued_column2", 5)
    bucketized_col1 = fc.bucketized_column(
        fc.real_valued_column("real_valued_column_for_bucketization1"), [0, 4])
    bucketized_col2 = fc.bucketized_column(
        fc.real_valued_column("real_valued_column_for_bucketization2", 4),
        [0, 4])
    a = fc.sparse_column_with_hash_bucket("cross_aaa", hash_bucket_size=100)
    b = fc.sparse_column_with_hash_bucket("cross_bbb", hash_bucket_size=100)
    cross_col = fc.crossed_column(set([a, b]), hash_bucket_size=10000)
    one_hot_col = fc.one_hot_column(fc.sparse_column_with_hash_bucket(
        "sparse_column_for_one_hot", hash_bucket_size=100))
    scattered_embedding_col = fc.scattered_embedding_column(
        "scattered_embedding_column", size=100, dimension=10, hash_key=1)
    feature_columns = set([
        sparse_col, embedding_col, weighted_id_col, int32_sparse_id_col,
        int64_sparse_id_col, real_valued_col1, real_valued_col2,
        bucketized_col1, bucketized_col2, cross_col, one_hot_col,
        scattered_embedding_col
    ])
    expected_config = {
        "sparse_column":
            parsing_ops.VarLenFeature(dtypes.string),
        "sparse_column_for_embedding":
            parsing_ops.VarLenFeature(dtypes.string),
        "str_id_column":
            parsing_ops.VarLenFeature(dtypes.string),
        "int32_id_column":
            parsing_ops.VarLenFeature(dtypes.int32),
        "int64_id_column":
            parsing_ops.VarLenFeature(dtypes.int64),
        "str_id_weights_column":
            parsing_ops.VarLenFeature(dtypes.float32),
        "real_valued_column1":
            parsing_ops.FixedLenFeature(
                [1], dtype=dtypes.float32),
        "real_valued_column2":
            parsing_ops.FixedLenFeature(
                [5], dtype=dtypes.float32),
        "real_valued_column_for_bucketization1":
            parsing_ops.FixedLenFeature(
                [1], dtype=dtypes.float32),
        "real_valued_column_for_bucketization2":
            parsing_ops.FixedLenFeature(
                [4], dtype=dtypes.float32),
        "cross_aaa":
            parsing_ops.VarLenFeature(dtypes.string),
        "cross_bbb":
            parsing_ops.VarLenFeature(dtypes.string),
        "sparse_column_for_one_hot":
            parsing_ops.VarLenFeature(dtypes.string),
        "scattered_embedding_column":
            parsing_ops.VarLenFeature(dtypes.string),
    }

    config = fc.create_feature_spec_for_parsing(feature_columns)
    self.assertDictEqual(expected_config, config)

    # Tests that contrib feature columns work with core library:
    config_core = fc_core.make_parse_example_spec(feature_columns)
    self.assertDictEqual(expected_config, config_core)

    # Test that the same config is parsed out if we pass a dictionary.
    feature_columns_dict = {
        str(i): val
        for i, val in enumerate(feature_columns)
    }
    config = fc.create_feature_spec_for_parsing(feature_columns_dict)
    self.assertDictEqual(expected_config, config)
    def testSdcaOptimizerSparseFeaturesWithL1Reg(self):
        """SDCALinearRegressor works with sparse features and L1 regularization."""
        def input_fn():
            return {
                'example_id':
                constant_op.constant(['1', '2', '3']),
                'price':
                constant_op.constant([0.4, 0.6, 0.3]),
                'country':
                sparse_tensor.SparseTensor(values=['IT', 'US', 'GB'],
                                           indices=[[0, 0], [1, 3], [2, 1]],
                                           dense_shape=[3, 5]),
                'weights':
                constant_op.constant([[10.0], [10.0], [10.0]])
            }, constant_op.constant([[1.4], [-0.8], [2.6]])

        with self._single_threaded_test_session():
            price = feature_column_lib.real_valued_column('price')
            country = feature_column_lib.sparse_column_with_hash_bucket(
                'country', hash_bucket_size=5)
            # Regressor with no L1 regularization.
            regressor = sdca_estimator.SDCALinearRegressor(
                example_id_column='example_id',
                feature_columns=[price, country],
                weight_column_name='weights')
            regressor.fit(input_fn=input_fn, steps=20)
            no_l1_reg_loss = regressor.evaluate(input_fn=input_fn,
                                                steps=1)['loss']
            variable_names = regressor.get_variable_names()
            self.assertIn('linear/price/weight', variable_names)
            self.assertIn('linear/country/weights', variable_names)
            no_l1_reg_weights = {
                'linear/price/weight':
                regressor.get_variable_value('linear/price/weight'),
                'linear/country/weights':
                regressor.get_variable_value('linear/country/weights'),
            }

            # Regressor with L1 regularization.
            regressor = sdca_estimator.SDCALinearRegressor(
                example_id_column='example_id',
                feature_columns=[price, country],
                l1_regularization=1.0,
                weight_column_name='weights')
            regressor.fit(input_fn=input_fn, steps=20)
            l1_reg_loss = regressor.evaluate(input_fn=input_fn,
                                             steps=1)['loss']
            l1_reg_weights = {
                'linear/price/weight':
                regressor.get_variable_value('linear/price/weight'),
                'linear/country/weights':
                regressor.get_variable_value('linear/country/weights'),
            }

            # Unregularized loss is lower when there is no L1 regularization.
            self.assertLess(no_l1_reg_loss, l1_reg_loss)
            self.assertLess(no_l1_reg_loss, 0.05)

            # But weights returned by the regressor with L1 regularization have
            # smaller L1 norm.
            l1_reg_weights_norm, no_l1_reg_weights_norm = 0.0, 0.0
            for var_name in sorted(l1_reg_weights):
                l1_reg_weights_norm += sum(
                    np.absolute(l1_reg_weights[var_name].flatten()))
                no_l1_reg_weights_norm += sum(
                    np.absolute(no_l1_reg_weights[var_name].flatten()))
                print('Var name: %s, value: %s' %
                      (var_name, no_l1_reg_weights[var_name].flatten()))
            self.assertLess(l1_reg_weights_norm, no_l1_reg_weights_norm)
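
Why L1 shrinks the norm: coordinate methods in the SDCA family apply a soft-threshold (proximal) step per weight, which shrinks every weight and zeroes the small ones. A self-contained numpy sketch of the canonical soft-threshold operator (the exact update in the TF SDCA kernel may differ in detail):

import numpy as np

def soft_threshold(v, l1):
    """argmin_w 0.5 * (w - v)**2 + l1 * |w|, applied elementwise."""
    return np.sign(v) * np.maximum(np.abs(v) - l1, 0.0)

v = np.array([1.4, -0.2, 0.05])
print(soft_threshold(v, 0.1))  # [ 1.3 -0.1  0. ]
print(np.abs(soft_threshold(v, 0.1)).sum() < np.abs(v).sum())  # True
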