def sequence_categorical_column_with_vocabulary_list(key,
                                                     vocabulary_list,
                                                     dtype=None,
                                                     default_value=-1,
                                                     num_oov_buckets=0):
    """A sequence of categorical terms where ids use an in-memory list.

  Pass this to `embedding_column` or `indicator_column` to convert sequence
  categorical data into a dense representation for input to a sequence NN,
  such as an RNN.

  Example:

  ```python
  colors = sequence_categorical_column_with_vocabulary_list(
      key='colors', vocabulary_list=('R', 'G', 'B', 'Y'),
      num_oov_buckets=2)
  colors_embedding = embedding_column(colors, dimension=3)
  columns = [colors_embedding]

  features = tf.parse_example(..., features=make_parse_example_spec(columns))
  input_layer, sequence_length = sequence_input_layer(features, columns)

  rnn_cell = tf.nn.rnn_cell.BasicRNNCell(hidden_size)
  outputs, state = tf.nn.dynamic_rnn(
      rnn_cell, inputs=input_layer, sequence_length=sequence_length)
  ```

  Args:
    key: A unique string identifying the input feature.
    vocabulary_list: An ordered iterable defining the vocabulary. Each feature
      is mapped to the index of its value (if present) in `vocabulary_list`.
      Must be castable to `dtype`.
    dtype: The type of features. Only string and integer types are supported.
      If `None`, it will be inferred from `vocabulary_list`.
    default_value: The integer ID value to return for out-of-vocabulary feature
      values; defaults to `-1`. This cannot be specified with a positive
      `num_oov_buckets`.
    num_oov_buckets: Non-negative integer, the number of out-of-vocabulary
      buckets. All out-of-vocabulary inputs will be assigned IDs in the range
      `[len(vocabulary_list), len(vocabulary_list)+num_oov_buckets)` based on a
      hash of the input value. A positive `num_oov_buckets` cannot be specified
      together with `default_value`.

  Returns:
    A `_SequenceCategoricalColumn`.

  Raises:
    ValueError: if `vocabulary_list` is empty, or contains duplicate keys.
    ValueError: if `num_oov_buckets` is a negative integer.
    ValueError: if `num_oov_buckets` and `default_value` are both specified.
    ValueError: if `dtype` is not integer or string.
  """
  return fc._SequenceCategoricalColumn(
      fc.categorical_column_with_vocabulary_list(
          key=key,
          vocabulary_list=vocabulary_list,
          dtype=dtype,
          default_value=default_value,
          num_oov_buckets=num_oov_buckets))
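For context, a minimal end-to-end sketch of how the column above consumes ragged sequence input (assuming a TF 1.x runtime where these functions are exposed under `tf.contrib.feature_column`; the feature values and session wiring are illustrative, not from the original source):

```python
import tensorflow as tf

colors = tf.contrib.feature_column.sequence_categorical_column_with_vocabulary_list(
    key='colors', vocabulary_list=('R', 'G', 'B', 'Y'), num_oov_buckets=2)
colors_embedding = tf.feature_column.embedding_column(colors, dimension=3)

# Sequence input arrives as a SparseTensor: a batch of 2 sequences with
# lengths 3 and 1. 'P' is out of vocabulary and hashes into one of the
# 2 OOV buckets (ids 4-5).
features = {
    'colors': tf.SparseTensor(
        indices=[[0, 0], [0, 1], [0, 2], [1, 0]],
        values=['R', 'G', 'P', 'B'],
        dense_shape=[2, 3]),
}
input_layer, sequence_length = tf.contrib.feature_column.sequence_input_layer(
    features, [colors_embedding])

with tf.Session() as sess:
    sess.run([tf.global_variables_initializer(), tf.tables_initializer()])
    print(sess.run(sequence_length))  # [3 1]
```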
Example 2
  def test_dnn_classifier(self):
    embedding = feature_column_lib.embedding_column(
        feature_column_lib.categorical_column_with_vocabulary_list(
            'wire_cast', ['kima', 'omar', 'stringer']), 8)
    dnn = estimator_lib.DNNClassifier(
        feature_columns=[embedding], hidden_units=[3, 1])

    def train_input_fn():
      return dataset_ops.Dataset.from_tensors(({
          'wire_cast': [['omar'], ['kima']]
      }, [[0], [1]])).repeat(3)

    def eval_input_fn():
      return dataset_ops.Dataset.from_tensors(({
          'wire_cast': [['stringer'], ['kima']]
      }, [[0], [1]])).repeat(2)

    evaluator = hooks_lib.InMemoryEvaluatorHook(
        dnn, eval_input_fn, name='in-memory')
    dnn.train(train_input_fn, hooks=[evaluator])
    self.assertTrue(os.path.isdir(dnn.eval_dir('in-memory')))
    step_keyword_to_value = summary_step_keyword_to_value_mapping(
        dnn.eval_dir('in-memory'))

    final_metrics = dnn.evaluate(eval_input_fn)
    step = final_metrics[ops.GraphKeys.GLOBAL_STEP]
    for summary_tag in final_metrics:
      if summary_tag == ops.GraphKeys.GLOBAL_STEP:
        continue
      self.assertEqual(final_metrics[summary_tag],
                       step_keyword_to_value[step][summary_tag])
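`InMemoryEvaluatorHook` runs evaluation in-process against the estimator's current weights while `train()` is running. A hedged sketch of the same wiring through the TF 1.x contrib alias (`every_n_iter` shown with what I believe is its default; treat the exact signature as an assumption):

```python
import tensorflow as tf

# dnn, train_input_fn and eval_input_fn as defined in the test above.
evaluator = tf.contrib.estimator.InMemoryEvaluatorHook(
    dnn, eval_input_fn, every_n_iter=100, name='in-memory')
dnn.train(train_input_fn, hooks=[evaluator])
# Evaluation summaries land under dnn.eval_dir('in-memory').
```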
Example 4
  def testWarmStartMoreSettingsNoPartitioning(self):
    # Create old and new vocabs for sparse column "sc_vocab".
    prev_vocab_path = self._write_vocab(["apple", "banana", "guava", "orange"],
                                        "old_vocab")
    new_vocab_path = self._write_vocab(
        ["orange", "guava", "banana", "apple", "raspberry",
         "blueberry"], "new_vocab")
    # Create feature columns.
    sc_hash = fc.categorical_column_with_hash_bucket(
        "sc_hash", hash_bucket_size=15)
    sc_keys = fc.categorical_column_with_vocabulary_list(
        "sc_keys", vocabulary_list=["a", "b", "c", "e"])
    sc_vocab = fc.categorical_column_with_vocabulary_file(
        "sc_vocab", vocabulary_file=new_vocab_path, vocabulary_size=6)
    all_linear_cols = [sc_hash, sc_keys, sc_vocab]

    # Save checkpoint from which to warm-start.
    with ops.Graph().as_default() as g:
      with self.test_session(graph=g) as sess:
        variable_scope.get_variable(
            "linear_model/sc_hash/weights", shape=[15, 1], initializer=norms())
        sc_keys_weights = variable_scope.get_variable(
            "some_other_name", shape=[4, 1], initializer=rand())
        variable_scope.get_variable(
            "linear_model/sc_vocab/weights",
            initializer=[[0.5], [1.], [2.], [3.]])
        self._write_checkpoint(sess)
        prev_keys_val = sess.run(sc_keys_weights)

    # New graph, new session with warm-starting.
    with ops.Graph().as_default() as g:
      with self.test_session(graph=g) as sess:
        cols_to_vars = self._create_linear_model(all_linear_cols,
                                                 partitioner=None)
        vocab_info = ws_util._VocabInfo(
            new_vocab=sc_vocab.vocabulary_file,
            new_vocab_size=sc_vocab.vocabulary_size,
            num_oov_buckets=sc_vocab.num_oov_buckets,
            old_vocab=prev_vocab_path
        )
        ws_settings = ws_util._WarmStartSettings(
            self.get_temp_dir(),
            vars_to_warmstart=".*(sc_keys|sc_vocab).*",
            var_name_to_vocab_info={
                ws_util._infer_var_name(cols_to_vars[sc_vocab]): vocab_info
            },
            var_name_to_prev_var_name={
                ws_util._infer_var_name(cols_to_vars[sc_keys]):
                    "some_other_name"
            })
        ws_util._warmstart(ws_settings)
        sess.run(variables.global_variables_initializer())
        # Verify weights were correctly warm-started.  Var corresponding to
        # sc_hash should not be warm-started.  Var corresponding to sc_vocab
        # should be correctly warm-started after vocab remapping.
        self._assert_cols_to_vars(cols_to_vars, {
            sc_keys: [prev_keys_val],
            sc_hash: [np.zeros([15, 1])],
            sc_vocab: [np.array([[3.], [2.], [1.], [0.5], [0.], [0.]])]
        }, sess)
  def testWarmStartInputLayerMoreSettings(self):
    # Create old and new vocabs for sparse column "sc_vocab".
    prev_vocab_path = self._write_vocab(["apple", "banana", "guava", "orange"],
                                        "old_vocab")
    new_vocab_path = self._write_vocab(
        ["orange", "guava", "banana", "apple", "raspberry",
         "blueberry"], "new_vocab")
    # Create feature columns.
    sc_hash = fc.categorical_column_with_hash_bucket(
        "sc_hash", hash_bucket_size=15)
    sc_keys = fc.categorical_column_with_vocabulary_list(
        "sc_keys", vocabulary_list=["a", "b", "c", "e"])
    sc_vocab = fc.categorical_column_with_vocabulary_file(
        "sc_vocab", vocabulary_file=new_vocab_path, vocabulary_size=6)
    all_linear_cols = [sc_hash, sc_keys, sc_vocab]

    # Save checkpoint from which to warm-start.
    with ops.Graph().as_default() as g:
      with self.test_session(graph=g) as sess:
        _ = variable_scope.get_variable(
            "linear_model/sc_hash/weights", shape=[15, 1], initializer=norms())
        sc_keys_weights = variable_scope.get_variable(
            "some_other_name", shape=[4, 1], initializer=rand())
        _ = variable_scope.get_variable(
            "linear_model/sc_vocab/weights",
            initializer=[[0.5], [1.], [2.], [3.]])
        self._write_checkpoint(sess)
        prev_keys_val = sess.run(sc_keys_weights)

    def _partitioner(shape, dtype):  # pylint:disable=unused-argument
      # Partition each var into 2 equal slices.
      partitions = [1] * len(shape)
      partitions[0] = min(2, shape[0].value)
      return partitions

    # New graph, new session with warm-starting.
    with ops.Graph().as_default() as g:
      with self.test_session(graph=g) as sess:
        cols_to_vars = self._create_linear_model(all_linear_cols, _partitioner)
        ws_settings = ws_util._WarmStartSettings(
            self.get_temp_dir(),
            col_to_prev_vocab={sc_vocab: prev_vocab_path},
            col_to_prev_tensor={sc_keys: "some_other_name"},
            exclude_columns=[sc_hash])
        ws_util._warmstart_input_layer(cols_to_vars, ws_settings)
        sess.run(variables.global_variables_initializer())
        # Verify weights were correctly warm-started.  Var corresponding to
        # sc_hash should not be warm-started.  Var corresponding to sc_vocab
        # should be correctly warm-started after vocab remapping.
        self._assert_cols_to_vars(cols_to_vars, {
            sc_keys:
                np.split(prev_keys_val, 2),
            sc_hash: [np.zeros([8, 1]), np.zeros([7, 1])],
            sc_vocab: [
                np.array([[3.], [2.], [1.]]),
                np.array([[0.5], [0.], [0.]])
            ]
        }, sess)
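The `ws_util` helpers exercised above correspond to the public TF 1.x warm-starting API. A hedged sketch of the same vocab-remapping setup via `tf.train.VocabInfo` and `tf.train.warm_start` (paths and variable names are illustrative):

```python
import tensorflow as tf

vocab_info = tf.train.VocabInfo(
    new_vocab='new_vocab.txt',   # illustrative vocab files
    new_vocab_size=6,
    num_oov_buckets=0,
    old_vocab='old_vocab.txt')
tf.train.warm_start(
    ckpt_to_initialize_from='/tmp/prev_ckpt',  # illustrative checkpoint
    vars_to_warm_start='.*(sc_keys|sc_vocab).*',
    var_name_to_vocab_info={
        'linear_model/sc_vocab/weights': vocab_info
    },
    var_name_to_prev_var_name={
        'linear_model/sc_keys/weights': 'some_other_name'
    })
```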
def sequence_categorical_column_with_vocabulary_list(
    key, vocabulary_list, dtype=None, default_value=-1, num_oov_buckets=0):
  """A sequence of categorical terms where ids use an in-memory list.

  Pass this to `embedding_column` or `indicator_column` to convert sequence
  categorical data into a dense representation for input to a sequence NN,
  such as an RNN.

  Example:

  ```python
  colors = sequence_categorical_column_with_vocabulary_list(
      key='colors', vocabulary_list=('R', 'G', 'B', 'Y'),
      num_oov_buckets=2)
  colors_embedding = embedding_column(colors, dimension=3)
  columns = [colors_embedding]

  features = tf.parse_example(..., features=make_parse_example_spec(columns))
  input_layer, sequence_length = sequence_input_layer(features, columns)

  rnn_cell = tf.nn.rnn_cell.BasicRNNCell(hidden_size)
  outputs, state = tf.nn.dynamic_rnn(
      rnn_cell, inputs=input_layer, sequence_length=sequence_length)
  ```

  Args:
    key: A unique string identifying the input feature.
    vocabulary_list: An ordered iterable defining the vocabulary. Each feature
      is mapped to the index of its value (if present) in `vocabulary_list`.
      Must be castable to `dtype`.
    dtype: The type of features. Only string and integer types are supported.
      If `None`, it will be inferred from `vocabulary_list`.
    default_value: The integer ID value to return for out-of-vocabulary feature
      values; defaults to `-1`. This cannot be specified with a positive
      `num_oov_buckets`.
    num_oov_buckets: Non-negative integer, the number of out-of-vocabulary
      buckets. All out-of-vocabulary inputs will be assigned IDs in the range
      `[len(vocabulary_list), len(vocabulary_list)+num_oov_buckets)` based on a
      hash of the input value. A positive `num_oov_buckets` cannot be specified
      together with `default_value`.

  Returns:
    A `_SequenceCategoricalColumn`.

  Raises:
    ValueError: if `vocabulary_list` is empty, or contains duplicate keys.
    ValueError: if `num_oov_buckets` is a negative integer.
    ValueError: if `num_oov_buckets` and `default_value` are both specified.
    ValueError: if `dtype` is not integer or string.
  """
  return fc_old._SequenceCategoricalColumn(
      fc_old.categorical_column_with_vocabulary_list(
          key=key,
          vocabulary_list=vocabulary_list,
          dtype=dtype,
          default_value=default_value,
          num_oov_buckets=num_oov_buckets))
Example 7
  def testBaseLinearRegressorTraining3D(self):
    # Also tests a categorical feature with a vocabulary list.
    feature_columns = [
        feature_column_lib.numeric_column('x0'),
        feature_column_lib.numeric_column('x1'),
        feature_column_lib.categorical_column_with_vocabulary_list(
            'x2', ['Y', 'N'])
    ]
    self._TestRegressor(feature_columns,
                        self._test_data.threed_input_fn(False, 1))
Example 8
    def test_warm_starting_selective_variables(self):
        """Tests selecting variables to warm-start."""
        age = feature_column.numeric_column('age')
        city = feature_column.embedding_column(
            feature_column.categorical_column_with_vocabulary_list(
                'city', vocabulary_list=['Mountain View', 'Palo Alto']),
            dimension=5)

        # Create a DNNLinearCombinedClassifier and train to save a checkpoint.
        dnn_lc_classifier = dnn_linear_combined.DNNLinearCombinedClassifier(
            linear_feature_columns=[age],
            dnn_feature_columns=[city],
            dnn_hidden_units=[256, 128],
            model_dir=self._ckpt_and_vocab_dir,
            n_classes=4,
            linear_optimizer='SGD',
            dnn_optimizer='SGD')
        dnn_lc_classifier.train(input_fn=self._input_fn, max_steps=1)

        # Create a second DNNLinearCombinedClassifier, warm-started from the first.
        # Use a learning_rate = 0.0 optimizer to check values (use SGD so we don't
        # have accumulator values that change).
        warm_started_dnn_lc_classifier = (
            dnn_linear_combined.DNNLinearCombinedClassifier(
                linear_feature_columns=[age],
                dnn_feature_columns=[city],
                dnn_hidden_units=[256, 128],
                n_classes=4,
                linear_optimizer=gradient_descent.GradientDescentOptimizer(
                    learning_rate=0.0),
                dnn_optimizer=gradient_descent.GradientDescentOptimizer(
                    learning_rate=0.0),
                # The provided regular expression will only warm-start the deep
                # portion of the model.
                warm_start_from=estimator.WarmStartSettings(
                    ckpt_to_initialize_from=dnn_lc_classifier.model_dir,
                    vars_to_warm_start='.*(dnn).*')))

        warm_started_dnn_lc_classifier.train(input_fn=self._input_fn,
                                             max_steps=1)
        for variable_name in warm_started_dnn_lc_classifier.get_variable_names(
        ):
            if 'dnn' in variable_name:
                self.assertAllClose(
                    dnn_lc_classifier.get_variable_value(variable_name),
                    warm_started_dnn_lc_classifier.get_variable_value(
                        variable_name))
            elif 'linear' in variable_name:
                linear_values = warm_started_dnn_lc_classifier.get_variable_value(
                    variable_name)
                # Since they're not warm-started, the linear weights will be
                # zero-initialized.
                self.assertAllClose(np.zeros_like(linear_values),
                                    linear_values)
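The same selective warm start can be expressed with the public estimator API instead of a full checkpoint directory. A hedged sketch (TF 1.x; the checkpoint path is illustrative):

```python
import tensorflow as tf

city = tf.feature_column.embedding_column(
    tf.feature_column.categorical_column_with_vocabulary_list(
        'city', vocabulary_list=['Mountain View', 'Palo Alto']),
    dimension=5)
ws = tf.estimator.WarmStartSettings(
    ckpt_to_initialize_from='/tmp/prev_model',  # illustrative path
    vars_to_warm_start='.*(dnn).*')             # deep portion only
clf = tf.estimator.DNNLinearCombinedClassifier(
    linear_feature_columns=[tf.feature_column.numeric_column('age')],
    dnn_feature_columns=[city],
    dnn_hidden_units=[256, 128],
    n_classes=4,
    warm_start_from=ws)
```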
Example 9
def build_model_columns():
    week_list = fc.categorical_column_with_vocabulary_list(
        "week_list",
        vocabulary_list=['mon', 'tue', 'wed', 'thur', 'fri', 'sat', 'sun'])
    week = fc.weighted_categorical_column(week_list, 'week_weight')
    week = fc.embedding_column(week, 3)

    wide = []
    deep = [week]
    return wide, deep
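`weighted_categorical_column` pairs each categorical value with a numeric weight read from a second feature, here `'week_weight'`. A hedged usage sketch (assuming `fc` is `tf.feature_column` and a TF 1.x session; the feature values are made up):

```python
import tensorflow as tf

_, deep = build_model_columns()
features = {
    'week_list': [['mon'], ['sat']],
    'week_weight': [[1.0], [0.5]],  # weight applied to each category id
}
dense = tf.feature_column.input_layer(features, deep)
with tf.Session() as sess:
    sess.run([tf.global_variables_initializer(), tf.tables_initializer()])
    print(sess.run(dense))  # one 3-d embedding row per example
```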
Example 10
def test_reuse():
    data = {
        'gender': [['M'], ['G'], ['M'], ['M']],
        'user': [['A'], ['B'], ['C'], ['C']],
        'pos': [['a'], ['d'], ['f'], ['c']],
        'neg': [['c'], ['e'], ['d'], ['a']]
    }
    user_v_list = ['A', 'B', 'C', 'D']
    item_v_list = ['a', 'b', 'c', 'd', 'e', 'f']

    gender_col = feature_column.categorical_column_with_vocabulary_list(
        'gender', ['M', "G"], dtype=tf.string)
    user_col = feature_column.categorical_column_with_vocabulary_list(
        'user', user_v_list, dtype=tf.string)
    pos_item_col = feature_column.categorical_column_with_vocabulary_list(
        'pos', item_v_list, dtype=tf.string)
    neg_item_col = feature_column.categorical_column_with_vocabulary_list(
        'neg', item_v_list, dtype=tf.string)

    gender_embedding = feature_column.embedding_column(gender_col, 2)
    user_embedding = feature_column.embedding_column(user_col, 2)
    pos_embedding, neg_embedding = feature_column.shared_embedding_columns(
        [pos_item_col, neg_item_col], 3)
    columns = [gender_embedding, user_embedding, pos_embedding, neg_embedding]

    with tf.variable_scope("a") as scope:
        aa = scope.name
        ret = tf.feature_column.input_layer(data, columns)
    print(tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=aa))

    with tf.variable_scope("b") as scope:
        bb = scope.name
        ret1 = tf.feature_column.input_layer(data, columns)
    print(tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=bb))
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(tf.tables_initializer())
        print(sess.run(ret))
        print('------------------')
        print(sess.run(ret1))
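Note that the two `input_layer` calls above build two independent sets of embedding variables, one under scope `a` and one under `b`. A hedged sketch of how to actually share them via variable-scope reuse (standard TF 1.x scoping semantics assumed):

```python
with tf.variable_scope('shared') as scope:
    ret = tf.feature_column.input_layer(data, columns)
with tf.variable_scope(scope, reuse=True):
    ret1 = tf.feature_column.input_layer(data, columns)  # reuses the variables
```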
Example 13
    def test_forward_in_exported_sparse(self):
        features_columns = [
            fc.indicator_column(
                fc.categorical_column_with_vocabulary_list('x', range(10)))
        ]

        classifier = linear.LinearClassifier(feature_columns=features_columns)

        def train_input_fn():
            dataset = dataset_ops.Dataset.from_tensors({
                'x':
                sparse_tensor.SparseTensor(values=[1, 2, 3],
                                           indices=[[0, 0], [1, 0], [1, 1]],
                                           dense_shape=[2, 2]),
                'labels': [[0], [1]]
            })

            def _split(x):
                labels = x.pop('labels')
                return x, labels

            dataset = dataset.map(_split)
            return dataset

        classifier.train(train_input_fn, max_steps=1)

        classifier = extenders.forward_features(classifier,
                                                keys=['x'],
                                                sparse_default_values={'x': 0})

        def serving_input_fn():
            features_ph = array_ops.placeholder(dtype=dtypes.int32,
                                                name='x',
                                                shape=[None])
            features = {'x': layers.dense_to_sparse(features_ph)}
            return estimator_lib.export.ServingInputReceiver(
                features, {'x': features_ph})

        export_dir, tmpdir = self._export_estimator(classifier,
                                                    serving_input_fn)
        prediction_fn = from_saved_model(export_dir,
                                         signature_def_key='predict')

        features = (0, 2)
        prediction = prediction_fn({'x': features})

        self.assertIn('x', prediction)
        self.assertEqual(features, tuple(prediction['x']))
        gfile.DeleteRecursively(tmpdir)
Example 14
  def testTrainEvaluateAndPredictWithIndicatorColumn(self):
    categorical = feature_column.categorical_column_with_vocabulary_list(
        key='categorical', vocabulary_list=('bad', 'good', 'ok'))
    feature_indicator = feature_column.indicator_column(categorical)
    bucketized_col = feature_column.bucketized_column(
        feature_column.numeric_column(
            'an_uninformative_feature', dtype=dtypes.float32),
        BUCKET_BOUNDARIES)

    labels = np.array([[0.], [5.7], [5.7], [0.], [0.]], dtype=np.float32)
    # Our categorical feature defines the labels perfectly
    input_fn = numpy_io.numpy_input_fn(
        x={
            'an_uninformative_feature': np.array([1, 1, 1, 1, 1]),
            'categorical': np.array(['bad', 'good', 'good', 'ok', 'bad']),
        },
        y=labels,
        batch_size=5,
        shuffle=False)

    # Train depth 1 tree.
    est = boosted_trees.BoostedTreesRegressor(
        feature_columns=[bucketized_col, feature_indicator],
        n_batches_per_layer=1,
        n_trees=1,
        learning_rate=1.0,
        max_depth=1)

    num_steps = 1
    est.train(input_fn, steps=num_steps)
    ensemble = self._assert_checkpoint_and_return_model(
        est.model_dir, global_step=1, finalized_trees=1, attempted_layers=1)

    # We learnt perfectly.
    eval_res = est.evaluate(input_fn=input_fn, steps=1)
    self.assertAllClose(eval_res['loss'], 0)

    predictions = list(est.predict(input_fn))
    self.assertAllClose(
        labels,
        [pred['predictions'] for pred in predictions])

    self.assertEqual(3, len(ensemble.trees[0].nodes))

    # Check that the split happened on the 'good' value, which is encoded as
    # the feature with index 2 (index 0 is the numeric feature, 1 is 'bad').
    self.assertEqual(2, ensemble.trees[0].nodes[0].bucketized_split.feature_id)
    self.assertEqual(0, ensemble.trees[0].nodes[0].bucketized_split.threshold)
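For reference, a hedged sketch of what the indicator column feeds the trees: a one-hot encoding over the vocabulary, shown here through `input_layer` (TF 1.x assumed; values made up):

```python
import tensorflow as tf

cat = tf.feature_column.categorical_column_with_vocabulary_list(
    'categorical', vocabulary_list=('bad', 'good', 'ok'))
ind = tf.feature_column.indicator_column(cat)
dense = tf.feature_column.input_layer(
    {'categorical': [['bad'], ['good'], ['ok']]}, [ind])
with tf.Session() as sess:
    sess.run(tf.tables_initializer())
    print(sess.run(dense))  # [[1,0,0], [0,1,0], [0,0,1]]
```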
  def testCalibratedLinearRegressorTraining3D(self):
    # Also tests categorical features that have a limited number
    # of valid values.
    feature_columns = [
        feature_column_lib.numeric_column('x0'),
        feature_column_lib.numeric_column('x1'),
        feature_column_lib.categorical_column_with_vocabulary_list(
            'x2', ['Y', 'N'])
    ]
    with ops.Graph().as_default():
      estimator = self._CalibratedLinearRegressorWithQuantiles(
          ['x0', 'x1', 'x2'], feature_columns)
    estimator.train(input_fn=self._test_data.threed_input_fn(False, 4))
    results = estimator.evaluate(
        input_fn=self._test_data.threed_input_fn(False, 1))
    # For the record:
    #   average_loss(CalibratedLinear, 4 epochs) = ~1e-5
    #   average_loss(LinearRegressor, 100 epochs) = ~0.159
    self.assertLess(results['average_loss'], 1e-4)
Example 16
    def test_classifier_basic_warm_starting(self):
        """Tests correctness of DNNLinearCombinedClassifier default warm-start."""
        age = feature_column.numeric_column('age')
        city = feature_column.embedding_column(
            feature_column.categorical_column_with_vocabulary_list(
                'city', vocabulary_list=['Mountain View', 'Palo Alto']),
            dimension=5)

        # Create a DNNLinearCombinedClassifier and train to save a checkpoint.
        dnn_lc_classifier = dnn_linear_combined.DNNLinearCombinedClassifier(
            linear_feature_columns=[age],
            dnn_feature_columns=[city],
            dnn_hidden_units=[256, 128],
            model_dir=self._ckpt_and_vocab_dir,
            n_classes=4,
            linear_optimizer='SGD',
            dnn_optimizer='SGD')
        dnn_lc_classifier.train(input_fn=self._input_fn, max_steps=1)

        # Create a second DNNLinearCombinedClassifier, warm-started from the first.
        # Use a learning_rate = 0.0 optimizer to check values (use SGD so we don't
        # have accumulator values that change).
        warm_started_dnn_lc_classifier = (
            dnn_linear_combined.DNNLinearCombinedClassifier(
                linear_feature_columns=[age],
                dnn_feature_columns=[city],
                dnn_hidden_units=[256, 128],
                n_classes=4,
                linear_optimizer=gradient_descent.GradientDescentOptimizer(
                    learning_rate=0.0),
                dnn_optimizer=gradient_descent.GradientDescentOptimizer(
                    learning_rate=0.0),
                warm_start_from=dnn_lc_classifier.model_dir))

        warm_started_dnn_lc_classifier.train(input_fn=self._input_fn,
                                             max_steps=1)
        for variable_name in warm_started_dnn_lc_classifier.get_variable_names(
        ):
            self.assertAllClose(
                dnn_lc_classifier.get_variable_value(variable_name),
                warm_started_dnn_lc_classifier.get_variable_value(
                    variable_name))
Example 17
  def _testAnnotationsPresentForEstimator(self, estimator_class):
    feature_columns = [
        feature_column.numeric_column('x', shape=(1,)),
        feature_column.embedding_column(
            feature_column.categorical_column_with_vocabulary_list(
                'y', vocabulary_list=['a', 'b', 'c']),
            dimension=3)
    ]
    estimator = estimator_class(
        hidden_units=(2, 2),
        feature_columns=feature_columns,
        model_dir=self._model_dir)
    model_fn = estimator.model_fn

    graph = ops.Graph()
    with graph.as_default():
      model_fn({
          'x': array_ops.constant([1.0]),
          'y': array_ops.constant(['a'])
      }, {},
               model_fn_lib.ModeKeys.PREDICT,
               config=None)

      unprocessed_features = self._getLayerAnnotationCollection(
          graph, dnn_with_layer_annotations.LayerAnnotationsCollectionNames
          .UNPROCESSED_FEATURES)
      processed_features = self._getLayerAnnotationCollection(
          graph, dnn_with_layer_annotations.LayerAnnotationsCollectionNames
          .PROCESSED_FEATURES)
      feature_columns = graph.get_collection(
          dnn_with_layer_annotations.LayerAnnotationsCollectionNames
          .FEATURE_COLUMNS)

      self.assertItemsEqual(unprocessed_features.keys(), ['x', 'y'])
      self.assertEqual(2, len(processed_features.keys()))
      self.assertEqual(2, len(feature_columns))
    def test_complete_flow(self):
        n_classes = 3
        input_dimension = 2
        batch_size = 12

        data = np.linspace(0.,
                           n_classes - 1.,
                           batch_size * input_dimension,
                           dtype=np.float32)
        x_data = data.reshape(batch_size, input_dimension)
        categorical_data = np.random.random_integers(0,
                                                     len(x_data),
                                                     size=len(x_data))
        y_data = np.reshape(self._as_label(data[:batch_size]), (batch_size, 1))
        train_input_fn = numpy_io.numpy_input_fn(
            x={'x': x_data, 'categories': categorical_data},
            y=y_data,
            batch_size=batch_size,
            num_epochs=None,
            shuffle=True)
        eval_input_fn = numpy_io.numpy_input_fn(
            x={'x': x_data, 'categories': categorical_data},
            y=y_data,
            batch_size=batch_size,
            shuffle=False)
        predict_input_fn = numpy_io.numpy_input_fn(
            x={'x': x_data, 'categories': categorical_data},
            batch_size=batch_size,
            shuffle=False)

        feature_columns = [
            feature_column.numeric_column('x', shape=(input_dimension, )),
            feature_column.embedding_column(
                feature_column.categorical_column_with_vocabulary_list(
                    'categories',
                    vocabulary_list=np.linspace(0.,
                                                len(x_data),
                                                len(x_data),
                                                dtype=np.int64)), 1)
        ]

        estimator = dnn.DNNClassifier(hidden_units=(2, 2),
                                      feature_columns=feature_columns,
                                      n_classes=n_classes,
                                      model_dir=self._model_dir)

        def optimizer_fn():
            return optimizers.get_optimizer_instance('Adagrad',
                                                     learning_rate=0.05)

        estimator = estimator_lib.Estimator(
            model_fn=replicate_model_fn.replicate_model_fn(
                estimator.model_fn,
                optimizer_fn,
                devices=['/gpu:0', '/gpu:1', '/gpu:2']),
            model_dir=estimator.model_dir,
            config=estimator.config,
            params=estimator.params)

        num_steps = 10
        estimator.train(train_input_fn, steps=num_steps)

        scores = estimator.evaluate(eval_input_fn)
        self.assertEqual(num_steps, scores[ops_lib.GraphKeys.GLOBAL_STEP])
        self.assertIn('loss', six.iterkeys(scores))

        predicted_proba = np.array([
            x[prediction_keys.PredictionKeys.PROBABILITIES]
            for x in estimator.predict(predict_input_fn)
        ])
        self.assertAllEqual((batch_size, n_classes), predicted_proba.shape)

        feature_spec = feature_column.make_parse_example_spec(feature_columns)
        serving_input_receiver_fn = export.build_parsing_serving_input_receiver_fn(
            feature_spec)
        export_dir = estimator.export_savedmodel(tempfile.mkdtemp(),
                                                 serving_input_receiver_fn)
        self.assertTrue(gfile.Exists(export_dir))
Example 22
from tensorflow.contrib.learn import LinearRegressor, pandas_input_fn, DNNRegressor, Experiment
from tensorflow.python.feature_column.feature_column import categorical_column_with_hash_bucket, numeric_column, \
    categorical_column_with_vocabulary_list, embedding_column, indicator_column

make = categorical_column_with_hash_bucket('make', 100)
horsepower = numeric_column('horsepower', shape=[])
cylinders = categorical_column_with_vocabulary_list(
    'num-of-cylinders', ['two', 'three', 'four', 'six', 'eight'])

###############
regressor = DNNRegressor(
    feature_columns=[
        embedding_column(make, 10),
        horsepower,
        indicator_column(cylinders),  # indicator_column takes a single column
    ],
    hidden_units=[50, 30, 10])
################
regressor = LinearRegressor(feature_columns=[make, horsepower, cylinders])

# input_data (a pandas DataFrame) and input_label (a pandas Series) are defined elsewhere
train_input_fn = pandas_input_fn(x=input_data,
                                 y=input_label,
                                 batch_size=64,
                                 shuffle=True,
                                 num_epochs=None)

regressor.train(train_input_fn, steps=10000)


def experiment_fn(run_config, hparams):
    regressor = DNNRegressor(...,
                             config=run_config)
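A hedged sketch of how a function of this shape is typically completed with the `Experiment` class imported at the top; the eval input and every other specific here are assumptions, not the original author's code:

```python
def experiment_fn(run_config, hparams):
    regressor = DNNRegressor(
        feature_columns=[embedding_column(make, 10), horsepower,
                         indicator_column(cylinders)],
        hidden_units=[50, 30, 10],
        config=run_config)
    return Experiment(
        estimator=regressor,
        train_input_fn=train_input_fn,
        eval_input_fn=train_input_fn)  # eval input assumed for the sketch
```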
  def _complete_flow_with_mode(self, mode):
    n_classes = 3
    input_dimension = 2
    batch_size = 12

    data = np.linspace(
        0., n_classes - 1., batch_size * input_dimension, dtype=np.float32)
    x_data = data.reshape(batch_size, input_dimension)
    categorical_data = np.random.random_integers(
        0, len(x_data), size=len(x_data))
    y_data = np.reshape(self._as_label(data[:batch_size]), (batch_size, 1))
    train_input_fn = numpy_io.numpy_input_fn(
        x={'x': x_data,
           'categories': categorical_data},
        y=y_data,
        batch_size=batch_size,
        num_epochs=None,
        shuffle=True)
    eval_input_fn = numpy_io.numpy_input_fn(
        x={'x': x_data,
           'categories': categorical_data},
        y=y_data,
        batch_size=batch_size,
        shuffle=False)
    predict_input_fn = numpy_io.numpy_input_fn(
        x={'x': x_data,
           'categories': categorical_data},
        batch_size=batch_size,
        shuffle=False)

    feature_columns = [
        feature_column.numeric_column('x', shape=(input_dimension,)),
        feature_column.embedding_column(
            feature_column.categorical_column_with_vocabulary_list(
                'categories',
                vocabulary_list=np.linspace(
                    0., len(x_data), len(x_data), dtype=np.int64)), 1)
    ]

    estimator = dnn.DNNClassifier(
        hidden_units=(2, 2),
        feature_columns=feature_columns,
        n_classes=n_classes,
        model_dir=self._model_dir)

    def optimizer_fn():
      return optimizers.get_optimizer_instance('Adagrad', learning_rate=0.05)

    if not mode:  # Use the public `replicate_model_fn`.
      model_fn = replicate_model_fn.replicate_model_fn(
          estimator.model_fn,
          optimizer_fn,
          devices=['/gpu:0', '/gpu:1', '/gpu:2'])
    else:
      model_fn = replicate_model_fn._replicate_model_fn_with_mode(
          estimator.model_fn,
          optimizer_fn,
          devices=['/gpu:0', '/gpu:1', '/gpu:2'],
          mode=mode)

    estimator = estimator_lib.Estimator(
        model_fn=model_fn,
        model_dir=estimator.model_dir,
        config=estimator.config,
        params=estimator.params)

    num_steps = 10
    estimator.train(train_input_fn, steps=num_steps)

    scores = estimator.evaluate(eval_input_fn)
    self.assertEqual(num_steps, scores[ops_lib.GraphKeys.GLOBAL_STEP])
    self.assertIn('loss', six.iterkeys(scores))

    predicted_proba = np.array([
        x[prediction_keys.PredictionKeys.PROBABILITIES]
        for x in estimator.predict(predict_input_fn)
    ])
    self.assertAllEqual((batch_size, n_classes), predicted_proba.shape)

    feature_spec = feature_column.make_parse_example_spec(feature_columns)
    serving_input_receiver_fn = export.build_parsing_serving_input_receiver_fn(
        feature_spec)
    export_dir = estimator.export_savedmodel(tempfile.mkdtemp(),
                                             serving_input_receiver_fn)
    self.assertTrue(gfile.Exists(export_dir))
Example 24
    def _build_feature_columns(self):
        multi_hot_feature_columns = {}
        multi_hot_feature_columns_deep = {}
        multi_category_feature_columns = {}
        continuous_feature_columns = {}
        crossed_feature_columns = []
        bucketized_feature_columns = []
        embedding_feature_columns = []

        if self._data_conf.multi_hot_columns is not None:
            for column in self._data_conf.multi_hot_columns:
                multi_hot_feature_columns[
                    column] = categorical_column_with_vocabulary_list(
                        column,
                        self._data_conf.multi_hot_columns[column],
                        dtype=tf.string)
                multi_hot_feature_columns_deep[column] = indicator_column(
                    multi_hot_feature_columns[column])

        if self._data_conf.multi_category_columns is not None:
            multi_category_feature_columns = {
                column:
                categorical_column_with_hash_bucket(column,
                                                    hash_bucket_size=1000)
                for column in self._data_conf.multi_category_columns
            }

        if self._data_conf.continuous_columns is not None:
            continuous_feature_columns = {
                column: numeric_column(column)
                for column in self._data_conf.continuous_columns
            }

        if self._data_conf.crossed_columns is not None:
            crossed_feature_columns = [
                crossed_column(_, hash_bucket_size=100000)
                for _ in self._data_conf.crossed_columns
            ]

        if self._data_conf.bucketized_columns is not None:
            for column, boundary in self._data_conf.bucketized_columns.items():
                bucketized_feature_columns.append(
                    bucketized_column(continuous_feature_columns[column],
                                      boundaries=boundary))

        if multi_category_feature_columns:
            embedding_feature_columns = [
                embedding_column(
                    _, dimension=self._model_conf.embedding_dimension)
                for _ in multi_category_feature_columns.values()
            ]

        self._feature_mapping = {
            0: list(multi_hot_feature_columns.values()),
            1: list(multi_category_feature_columns.values()),
            2: list(continuous_feature_columns.values()),
            3: crossed_feature_columns,
            4: bucketized_feature_columns,
            5: embedding_feature_columns,
            6: list(multi_hot_feature_columns_deep.values())
        }

        self._build_feature_columns_for_model()
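A hedged sketch of the configuration object `_build_feature_columns` appears to expect (every attribute value below is hypothetical; `None` disables a group):

```python
class DataConf(object):
    """Hypothetical shape of self._data_conf."""
    multi_hot_columns = {'week': ['mon', 'tue', 'wed']}  # column -> vocabulary
    multi_category_columns = ['user_id']                 # hashed, then embedded
    continuous_columns = ['age', 'price']
    crossed_columns = [['age_bucket', 'user_id']]        # feature keys to cross
    bucketized_columns = {'age': [18, 35, 65]}           # column -> boundaries
```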
  def testWarmStartVarsToWarmstartIsNone(self):
    # Create old and new vocabs for sparse column "sc_vocab".
    prev_vocab_path = self._write_vocab(["apple", "banana", "guava", "orange"],
                                        "old_vocab")
    new_vocab_path = self._write_vocab(
        ["orange", "guava", "banana", "apple", "raspberry",
         "blueberry"], "new_vocab")
    # Create feature columns.
    sc_hash = fc.categorical_column_with_hash_bucket(
        "sc_hash", hash_bucket_size=15)
    sc_keys = fc.categorical_column_with_vocabulary_list(
        "sc_keys", vocabulary_list=["a", "b", "c", "e"])
    sc_vocab = fc.categorical_column_with_vocabulary_file(
        "sc_vocab", vocabulary_file=new_vocab_path, vocabulary_size=6)
    all_linear_cols = [sc_hash, sc_keys, sc_vocab]

    # Save checkpoint from which to warm-start.
    with ops.Graph().as_default() as g:
      with self.test_session(graph=g) as sess:
        variable_scope.get_variable(
            "linear_model/sc_hash/weights", shape=[15, 1], initializer=norms())
        variable_scope.get_variable(
            "some_other_name", shape=[4, 1], initializer=rand())
        variable_scope.get_variable(
            "linear_model/sc_vocab/weights",
            initializer=[[0.5], [1.], [2.], [3.]])
        self._write_checkpoint(sess)

    def _partitioner(shape, dtype):  # pylint:disable=unused-argument
      # Partition each var into 2 equal slices.
      partitions = [1] * len(shape)
      partitions[0] = min(2, shape[0].value)
      return partitions

    # New graph, new session with warm-starting.
    with ops.Graph().as_default() as g:
      with self.test_session(graph=g) as sess:
        cols_to_vars = self._create_linear_model(all_linear_cols, _partitioner)
        vocab_info = ws_util.VocabInfo(
            new_vocab=sc_vocab.vocabulary_file,
            new_vocab_size=sc_vocab.vocabulary_size,
            num_oov_buckets=sc_vocab.num_oov_buckets,
            old_vocab=prev_vocab_path)
        ws_settings = ws_util.WarmStartSettings(
            self.get_temp_dir(),
            # The special value of None here will ensure that only the variable
            # specified in var_name_to_vocab_info (sc_vocab embedding) is
            # warm-started.
            vars_to_warm_start=None,
            var_name_to_vocab_info={
                ws_util._infer_var_name(cols_to_vars[sc_vocab]): vocab_info
            },
            # Even though this is provided, the None value for
            # vars_to_warm_start overrides the logic, and this will not be
            # warm-started.
            var_name_to_prev_var_name={
                ws_util._infer_var_name(cols_to_vars[sc_keys]):
                    "some_other_name"
            })
        ws_util._warm_start(ws_settings)
        sess.run(variables.global_variables_initializer())
        # Verify weights were correctly warm-started.  Var corresponding to
        # sc_vocab should be correctly warm-started after vocab remapping,
        # and neither of the other two should be warm-started.
        self._assert_cols_to_vars(cols_to_vars, {
            sc_keys: [np.zeros([2, 1]), np.zeros([2, 1])],
            sc_hash: [np.zeros([8, 1]), np.zeros([7, 1])],
            sc_vocab: [
                np.array([[3.], [2.], [1.]]),
                np.array([[0.5], [0.], [0.]])
            ]
        }, sess)
  def testWarmStart_MultipleCols(self):
    # Create vocab for sparse column "sc_vocab".
    vocab_path = self._write_vocab(["apple", "banana", "guava", "orange"],
                                   "vocab")

    # Create feature columns.
    sc_int = fc.categorical_column_with_identity("sc_int", num_buckets=10)
    sc_hash = fc.categorical_column_with_hash_bucket(
        "sc_hash", hash_bucket_size=15)
    sc_keys = fc.categorical_column_with_vocabulary_list(
        "sc_keys", vocabulary_list=["a", "b", "c", "e"])
    sc_vocab = fc.categorical_column_with_vocabulary_file(
        "sc_vocab", vocabulary_file=vocab_path, vocabulary_size=4)
    real = fc.numeric_column("real")
    real_bucket = fc.bucketized_column(real, boundaries=[0., 1., 2., 3.])
    cross = fc.crossed_column([sc_keys, sc_vocab], hash_bucket_size=20)
    all_linear_cols = [sc_int, sc_hash, sc_keys, sc_vocab, real_bucket, cross]

    # Save checkpoint from which to warm-start.  Also create a bias variable,
    # so we can check that it's also warm-started.
    with ops.Graph().as_default() as g:
      with self.test_session(graph=g) as sess:
        sc_int_weights = variable_scope.get_variable(
            "linear_model/sc_int/weights", shape=[10, 1], initializer=ones())
        sc_hash_weights = variable_scope.get_variable(
            "linear_model/sc_hash/weights", shape=[15, 1], initializer=norms())
        sc_keys_weights = variable_scope.get_variable(
            "linear_model/sc_keys/weights", shape=[4, 1], initializer=rand())
        sc_vocab_weights = variable_scope.get_variable(
            "linear_model/sc_vocab/weights", shape=[4, 1], initializer=ones())
        real_bucket_weights = variable_scope.get_variable(
            "linear_model/real_bucketized/weights",
            shape=[5, 1],
            initializer=norms())
        cross_weights = variable_scope.get_variable(
            "linear_model/sc_keys_X_sc_vocab/weights",
            shape=[20, 1],
            initializer=rand())
        bias = variable_scope.get_variable(
            "linear_model/bias_weights",
            shape=[1],
            initializer=rand())
        self._write_checkpoint(sess)
        (prev_int_val, prev_hash_val, prev_keys_val, prev_vocab_val,
         prev_bucket_val, prev_cross_val, prev_bias_val) = sess.run([
             sc_int_weights, sc_hash_weights, sc_keys_weights, sc_vocab_weights,
             real_bucket_weights, cross_weights, bias
         ])

    partitioner = lambda shape, dtype: [1] * len(shape)
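    # A pass-through partitioner: one partition along every axis, so each
    # variable is created as a single unpartitioned slice.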
    # New graph, new session WITHOUT warm-starting.
    with ops.Graph().as_default() as g:
      with self.test_session(graph=g) as sess:
        cols_to_vars = self._create_linear_model(all_linear_cols, partitioner)
        sess.run(variables.global_variables_initializer())
        # Without warm-starting, all weights should be initialized using the
        # default initializer (which is init_ops.zeros_initializer).
        self._assert_cols_to_vars(cols_to_vars, {
            sc_int: [np.zeros([10, 1])],
            sc_hash: [np.zeros([15, 1])],
            sc_keys: [np.zeros([4, 1])],
            sc_vocab: [np.zeros([4, 1])],
            real_bucket: [np.zeros([5, 1])],
            cross: [np.zeros([20, 1])],
        }, sess)

    # New graph, new session with warm-starting.
    with ops.Graph().as_default() as g:
      with self.test_session(graph=g) as sess:
        cols_to_vars = self._create_linear_model(all_linear_cols, partitioner)
        vocab_info = ws_util.VocabInfo(
            new_vocab=sc_vocab.vocabulary_file,
            new_vocab_size=sc_vocab.vocabulary_size,
            num_oov_buckets=sc_vocab.num_oov_buckets,
            old_vocab=vocab_path)
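        # Old and new vocab are the same file here, so the remapping is the
        # identity and the full previous sc_vocab weights are loaded.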
        ws_util._warm_start(
            ws_util.WarmStartSettings(
                self.get_temp_dir(),
                var_name_to_vocab_info={
                    "linear_model/sc_vocab/weights": vocab_info
                }))
        sess.run(variables.global_variables_initializer())
        # Verify weights were correctly warm-started.
        self._assert_cols_to_vars(cols_to_vars, {
            sc_int: [prev_int_val],
            sc_hash: [prev_hash_val],
            sc_keys: [prev_keys_val],
            sc_vocab: [prev_vocab_val],
            real_bucket: [prev_bucket_val],
            cross: [prev_cross_val],
            "bias": [prev_bias_val],
        }, sess)
Esempio n. 28
  def testWarmStartVarsToWarmstartIsNone(self):
    # Create old and new vocabs for sparse column "sc_vocab".
    prev_vocab_path = self._write_vocab(["apple", "banana", "guava", "orange"],
                                        "old_vocab")
    new_vocab_path = self._write_vocab(
        ["orange", "guava", "banana", "apple", "raspberry",
         "blueberry"], "new_vocab")
    # Create feature columns.
    sc_hash = fc.categorical_column_with_hash_bucket(
        "sc_hash", hash_bucket_size=15)
    sc_keys = fc.categorical_column_with_vocabulary_list(
        "sc_keys", vocabulary_list=["a", "b", "c", "e"])
    sc_vocab = fc.categorical_column_with_vocabulary_file(
        "sc_vocab", vocabulary_file=new_vocab_path, vocabulary_size=6)
    all_linear_cols = [sc_hash, sc_keys, sc_vocab]

    # Save checkpoint from which to warm-start.
    with ops.Graph().as_default() as g:
      with self.test_session(graph=g) as sess:
        variable_scope.get_variable(
            "linear_model/sc_hash/weights", shape=[15, 1], initializer=norms())
        variable_scope.get_variable(
            "some_other_name", shape=[4, 1], initializer=rand())
        variable_scope.get_variable(
            "linear_model/sc_vocab/weights",
            initializer=[[0.5], [1.], [2.], [3.]])
        self._write_checkpoint(sess)

    def _partitioner(shape, dtype):  # pylint:disable=unused-argument
      # Partition each var into 2 equal slices.
      partitions = [1] * len(shape)
      partitions[0] = min(2, shape[0].value)
      return partitions
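    # E.g. shape [15, 1] -> partitions [2, 1]: the sc_hash weights become two
    # slices of shapes [8, 1] and [7, 1], as asserted below.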

    # New graph, new session with warmstarting.
    with ops.Graph().as_default() as g:
      with self.test_session(graph=g) as sess:
        cols_to_vars = self._create_linear_model(all_linear_cols, _partitioner)
        vocab_info = ws_util._VocabInfo(
            new_vocab=sc_vocab.vocabulary_file,
            new_vocab_size=sc_vocab.vocabulary_size,
            num_oov_buckets=sc_vocab.num_oov_buckets,
            old_vocab=prev_vocab_path
        )
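        # The old vocab has 4 entries and the new one 6; "raspberry" and
        # "blueberry" have no previous rows, so their weights keep the default
        # zero initialization.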
        ws_settings = ws_util._WarmStartSettings(
            self.get_temp_dir(),
            # The special value of None here will ensure that only the variable
            # specified in var_name_to_vocab_info (the sc_vocab weights) is
            # warmstarted.
            vars_to_warmstart=None,
            var_name_to_vocab_info={
                ws_util._infer_var_name(cols_to_vars[sc_vocab]): vocab_info
            },
            # Even though this is provided, the None value for vars_to_warmstart
            # overrides the logic, and this will not be warmstarted.
            var_name_to_prev_var_name={
                ws_util._infer_var_name(cols_to_vars[sc_keys]):
                    "some_other_name"
            })
        ws_util._warmstart(ws_settings)
        sess.run(variables.global_variables_initializer())
        # Verify weights were correctly warmstarted.  Var corresponding to
        # sc_vocab should be correctly warmstarted after vocab remapping,
        # and neither of the other two should be warmstarted.
        self._assert_cols_to_vars(cols_to_vars, {
            sc_keys: [np.zeros([2, 1]), np.zeros([2, 1])],
            sc_hash: [np.zeros([8, 1]), np.zeros([7, 1])],
            sc_vocab: [
                np.array([[3.], [2.], [1.]]),
                np.array([[0.5], [0.], [0.]])
            ]
        }, sess)
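
# The sc_vocab values asserted above follow from remapping checkpoint rows
# from the old vocab order into the new one. A minimal NumPy sketch of that
# remapping (illustrative only: `remap_rows` is a hypothetical helper, not
# part of ws_util, which performs the remapping internally via _VocabInfo):
import numpy as np

def remap_rows(old_weights, old_vocab, new_vocab, missing_value=0.):
  """Reorders checkpoint rows from old_vocab order into new_vocab order.

  New-vocab entries with no old-vocab row get `missing_value`; real
  warm-starting falls back to the variable's initializer instead.
  """
  old_index = {word: i for i, word in enumerate(old_vocab)}
  new_weights = np.full((len(new_vocab),) + old_weights.shape[1:],
                        missing_value)
  for row, word in enumerate(new_vocab):
    if word in old_index:
      new_weights[row] = old_weights[old_index[word]]
  return new_weights

# remap_rows(np.array([[0.5], [1.], [2.], [3.]]),
#            ["apple", "banana", "guava", "orange"],
#            ["orange", "guava", "banana", "apple", "raspberry", "blueberry"])
# -> [[3.], [2.], [1.], [0.5], [0.], [0.]], which, split into two [3, 1]
# partitions, matches the asserted sc_vocab values.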
Esempio n. 29
  def testWarmStart_MultipleCols(self):
    # Create vocab for sparse column "sc_vocab".
    vocab_path = self._write_vocab(["apple", "banana", "guava", "orange"],
                                   "vocab")

    # Create feature columns.
    sc_int = fc.categorical_column_with_identity("sc_int", num_buckets=10)
    sc_hash = fc.categorical_column_with_hash_bucket(
        "sc_hash", hash_bucket_size=15)
    sc_keys = fc.categorical_column_with_vocabulary_list(
        "sc_keys", vocabulary_list=["a", "b", "c", "e"])
    sc_vocab = fc.categorical_column_with_vocabulary_file(
        "sc_vocab", vocabulary_file=vocab_path, vocabulary_size=4)
    real = fc.numeric_column("real")
    real_bucket = fc.bucketized_column(real, boundaries=[0., 1., 2., 3.])
    cross = fc.crossed_column([sc_keys, sc_vocab], hash_bucket_size=20)
    all_linear_cols = [sc_int, sc_hash, sc_keys, sc_vocab, real_bucket, cross]

    # Save checkpoint from which to warm-start.  Also create a bias variable,
    # so we can check that it's also warmstarted.
    with ops.Graph().as_default() as g:
      with self.test_session(graph=g) as sess:
        sc_int_weights = variable_scope.get_variable(
            "linear_model/sc_int/weights", shape=[10, 1], initializer=ones())
        sc_hash_weights = variable_scope.get_variable(
            "linear_model/sc_hash/weights", shape=[15, 1], initializer=norms())
        sc_keys_weights = variable_scope.get_variable(
            "linear_model/sc_keys/weights", shape=[4, 1], initializer=rand())
        sc_vocab_weights = variable_scope.get_variable(
            "linear_model/sc_vocab/weights", shape=[4, 1], initializer=ones())
        real_bucket_weights = variable_scope.get_variable(
            "linear_model/real_bucketized/weights",
            shape=[5, 1],
            initializer=norms())
        cross_weights = variable_scope.get_variable(
            "linear_model/sc_keys_X_sc_vocab/weights",
            shape=[20, 1],
            initializer=rand())
        bias = variable_scope.get_variable(
            "linear_model/bias_weights",
            shape=[1],
            initializer=rand())
        self._write_checkpoint(sess)
        (prev_int_val, prev_hash_val, prev_keys_val, prev_vocab_val,
         prev_bucket_val, prev_cross_val, prev_bias_val) = sess.run([
             sc_int_weights, sc_hash_weights, sc_keys_weights, sc_vocab_weights,
             real_bucket_weights, cross_weights, bias
         ])

    partitioner = lambda shape, dtype: [1] * len(shape)
    # New graph, new session WITHOUT warmstarting.
    with ops.Graph().as_default() as g:
      with self.test_session(graph=g) as sess:
        cols_to_vars = self._create_linear_model(all_linear_cols, partitioner)
        sess.run(variables.global_variables_initializer())
        # Without warmstarting, all weights should be initialized using the
        # default initializer (which is init_ops.zeros_initializer).
        self._assert_cols_to_vars(cols_to_vars, {
            sc_int: [np.zeros([10, 1])],
            sc_hash: [np.zeros([15, 1])],
            sc_keys: [np.zeros([4, 1])],
            sc_vocab: [np.zeros([4, 1])],
            real_bucket: [np.zeros([5, 1])],
            cross: [np.zeros([20, 1])],
        }, sess)

    # New graph, new session with warmstarting.
    with ops.Graph().as_default() as g:
      with self.test_session(graph=g) as sess:
        cols_to_vars = self._create_linear_model(all_linear_cols, partitioner)
        vocab_info = ws_util._VocabInfo(
            new_vocab=sc_vocab.vocabulary_file,
            new_vocab_size=sc_vocab.vocabulary_size,
            num_oov_buckets=sc_vocab.num_oov_buckets,
            old_vocab=vocab_path
        )
        ws_util._warmstart(
            ws_util._WarmStartSettings(
                self.get_temp_dir(),
                var_name_to_vocab_info={
                    "linear_model/sc_vocab/weights": vocab_info
                }))
        sess.run(variables.global_variables_initializer())
        # Verify weights were correctly warmstarted.
        self._assert_cols_to_vars(cols_to_vars, {
            sc_int: [prev_int_val],
            sc_hash: [prev_hash_val],
            sc_keys: [prev_keys_val],
            sc_vocab: [prev_vocab_val],
            real_bucket: [prev_bucket_val],
            cross: [prev_cross_val],
            "bias": [prev_bias_val],
        }, sess)
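
# The private ws_util helpers exercised above back TensorFlow's public
# warm-start API. A minimal usage sketch (assuming a TF 1.x release that
# exports tf.estimator.WarmStartSettings; the paths are placeholders):
import tensorflow as tf

cols = [tf.feature_column.categorical_column_with_vocabulary_file(
    "sc_vocab", vocabulary_file="/path/to/new_vocab", vocabulary_size=6)]
ws = tf.estimator.WarmStartSettings(
    ckpt_to_initialize_from="/path/to/prev_model_dir",
    vars_to_warm_start=".*sc_vocab.*")
estimator = tf.estimator.LinearClassifier(
    feature_columns=cols, warm_start_from=ws)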