Example #1
 def call(self, inputs):
     # TODO(tanzheny): Add ragged support.
     # TODO(tanzheny): Add int support.
     if isinstance(inputs, sparse_tensor.SparseTensor):
         sparse_values = inputs.values
         sparse_hashed_values = string_ops.string_to_hash_bucket_fast(
             sparse_values, self._num_bins, name='lookup')
         return sparse_tensor.SparseTensor(indices=inputs.indices,
                                           values=sparse_hashed_values,
                                           dense_shape=inputs.dense_shape)
     # string_to_hash_bucket_fast uses FarmHash as hash function.
     return string_ops.string_to_hash_bucket_fast(inputs,
                                                  self._num_bins,
                                                  name='lookup')
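For illustration only, a minimal sketch of what the two branches above compute, written against the public TF 2.x aliases (tf.strings.to_hash_bucket_fast, tf.sparse.SparseTensor); the bucket count and input values are assumptions.

import tensorflow as tf

num_bins = 8  # assumed bucket count
dense = tf.constant(["cat", "dog", "cat"])
# Dense branch: each string maps to an int64 id in [0, num_bins).
dense_ids = tf.strings.to_hash_bucket_fast(dense, num_bins)

sparse = tf.sparse.SparseTensor(
    indices=[[0, 0], [1, 0]], values=["cat", "dog"], dense_shape=[2, 1])
# Sparse branch: only the values are hashed; indices and shape are reused.
sparse_ids = tf.sparse.SparseTensor(
    indices=sparse.indices,
    values=tf.strings.to_hash_bucket_fast(sparse.values, num_bins),
    dense_shape=sparse.dense_shape)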
Example #2
  def testStringToOneHashBucketFast(self):
    with self.cached_session():
      input_string = array_ops.placeholder(dtypes.string)
      output = string_ops.string_to_hash_bucket_fast(input_string, 1)
      result = output.eval(feed_dict={input_string: ['a', 'b', 'c']})

      self.assertAllEqual([0, 0, 0], result)
Example #3
    def _transform_feature(self, inputs):
        input_tensor = inputs.get(self.key)
        if not isinstance(input_tensor, sparse_tensor_lib.SparseTensor):
            raise ValueError('SparseColumn input must be a SparseTensor.')

        if (input_tensor.dtype != dtypes.string
                and not input_tensor.dtype.is_integer):
            raise ValueError('input tensors dtype must be string or integer. '
                             'dtype: {}, column_name: {}'.format(
                                 input_tensor.dtype, self.key))

        if self.dtype.is_integer != input_tensor.dtype.is_integer:
            raise ValueError(
                'Column dtype and SparseTensors dtype must be compatible. '
                'key: {}, column dtype: {}, tensor dtype: {}'.format(
                    self.key, self.dtype, input_tensor.dtype))

        if self.dtype == dtypes.string:
            sparse_values = input_tensor.values
        else:
            sparse_values = string_ops.as_string(input_tensor.values)

        sparse_id_values = string_ops.string_to_hash_bucket_fast(
            sparse_values, self.hash_bucket_size, name='lookup')
        return sparse_tensor_lib.SparseTensor(input_tensor.indices,
                                              sparse_id_values,
                                              input_tensor.dense_shape)
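A brief hedged sketch of the integer branch above, using TF 2.x aliases and made-up values: integer ids are first rendered as strings with as_string, then hashed, so the integer 17 lands in the same bucket as the string "17".

import tensorflow as tf

int_values = tf.constant([17, 42], dtype=tf.int64)
as_str = tf.strings.as_string(int_values)          # ["17", "42"]
ids = tf.strings.to_hash_bucket_fast(as_str, 20)   # int64 ids in [0, 20)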
Example #4
def default_partition_fn(keys, shard_num):
    """The default partition function.
      partition keys by "mod" strategy.

      keys: a tensor presents the keys to be partitioned.
      shard_num: the num of partitions
    Returns:
      a tensor with same shape as keys with type of `tf.int32`,
        represents the corresponding partition-ids of keys.
    """
    keys_op = ops.convert_to_tensor(keys, name="keys")
    gpu_mode = _pywrap_util_port.IsGoogleCudaEnabled()

    with ops.colocate_with(keys_op):
        if keys_op.dtype == dtypes.int64 and gpu_mode:
            # This branch performs poorly in some multi-CPU scenarios,
            # so we fall back to the default branch when GPUs are not available.
            mask = constant_op.constant(0x7fffffff, dtypes.int64)
            keys_int32 = math_ops.cast(bitwise_ops.bitwise_and(keys_op, mask),
                                       dtypes.int32)
            mod = math_ops.mod(keys_int32,
                               constant_op.constant(shard_num, dtypes.int32))
            ids = math_ops.cast(mod, dtype=dtypes.int32)
        elif keys_op.dtype == dtypes.string:
            ids = string_ops.string_to_hash_bucket_fast(keys_op, shard_num)
            mask = constant_op.constant(0x7fffffff, dtypes.int64)
            ids = math_ops.cast(bitwise_ops.bitwise_and(ids, mask),
                                dtypes.int32)
        else:
            ids = math_ops.cast(math_ops.mod(keys_op, shard_num),
                                dtype=dtypes.int32)
    return ids
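For illustration, a hedged sketch of the two simple branches of default_partition_fn (the GPU bitmask branch is omitted), using public TF 2.x ops and made-up keys and shard count.

import tensorflow as tf

shard_num = 4
int_keys = tf.constant([3, 7, 10], dtype=tf.int64)
# Integer keys: plain mod partitioning -> [3, 3, 2].
int_ids = tf.cast(tf.math.floormod(int_keys, shard_num), tf.int32)

str_keys = tf.constant(["user_a", "user_b"])
# String keys: hash into shard_num buckets instead of taking a mod.
str_ids = tf.cast(
    tf.strings.to_hash_bucket_fast(str_keys, shard_num), tf.int32)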
Example #5
 def _shard_indices(self, keys):
     if self._key_dtype == dtypes.string:
         indices = string_ops.string_to_hash_bucket_fast(
             keys, self._num_shards)
     else:
         indices = math_ops.mod(keys, self._num_shards)
     return math_ops.cast(indices, dtypes.int32)
Example #6
  def _transform_feature(self, inputs):
    input_tensor = inputs.get(self.key)
    if not isinstance(input_tensor, sparse_tensor_lib.SparseTensor):
      raise ValueError('SparseColumn input must be a SparseTensor.')

    if (input_tensor.dtype != dtypes.string and
        not input_tensor.dtype.is_integer):
      raise ValueError('input tensors dtype must be string or integer. '
                       'dtype: {}, column_name: {}'.format(
                           input_tensor.dtype, self.key))

    if self.dtype.is_integer != input_tensor.dtype.is_integer:
      raise ValueError(
          'Column dtype and SparseTensors dtype must be compatible. '
          'key: {}, column dtype: {}, tensor dtype: {}'.format(
              self.key, self.dtype, input_tensor.dtype))

    if self.dtype == dtypes.string:
      sparse_values = input_tensor.values
    else:
      sparse_values = string_ops.as_string(input_tensor.values)

    sparse_id_values = string_ops.string_to_hash_bucket_fast(
        sparse_values, self.hash_bucket_size, name='lookup')
    return sparse_tensor_lib.SparseTensor(
        input_tensor.indices, sparse_id_values, input_tensor.dense_shape)
Example #7
 def insert_transformed_feature(self, columns_to_tensors):
   """Handles sparse column to id conversion."""
   sparse_id_values = string_ops.string_to_hash_bucket_fast(
       columns_to_tensors[self.name].values,
       self.bucket_size,
       name=self.name + "_lookup")
   columns_to_tensors[self] = ops.SparseTensor(
       columns_to_tensors[self.name].indices, sparse_id_values,
       columns_to_tensors[self.name].shape)
Example #9
    def input_fn(params):
        """Generates an input function for training or evaluation.
    This uses the input pipeline based approach using file name queue
    to read data so that entire data is not loaded in memory.

    Args:
      params (dict): Dictionary of additional params like batch_size
    Returns:
      A function () -> (features, indices) where features is a dictionary of
      Tensors, and indices is a single Tensor of label indices.
    """
        if FLAGS.use_tpu:
            batch_size = params['batch_size']
        else:
            batch_size = FLAGS.train_batch_size
        shuffle = True

        dataset = tf.contrib.data.TextLineDataset([filename])
        dataset = dataset.cache().repeat(FLAGS.num_epochs)
        if shuffle:
            dataset = dataset.shuffle(batch_size * 10)
        dataset = dataset.batch(batch_size)
        iterator = dataset.make_one_shot_iterator()
        rows = iterator.get_next()

        # Parse the CSV File
        features = parse_csv(rows)

        table = tf.contrib.lookup.index_table_from_tensor(tf.constant(LABELS))
        label_tensor = table.lookup(features.pop(LABEL_COLUMN))

        # Convert categorical (string) values to one_hot values
        for col, vals in CATEGORICAL_COLS:
            bucket_size = vals if isinstance(vals, int) else len(vals)

            if isinstance(vals, int):
                indices = string_ops.string_to_hash_bucket_fast(
                    features[col], bucket_size)
            else:
                table = tf.contrib.lookup.index_table_from_tensor(vals)
                indices = table.lookup(features[col])

            indices = tf.cast(indices, tf.int32)
            features[col] = tf.reshape(
                indices,
                [batch_size, indices.get_shape().as_list()[1]])

        for feature in CONTINUOUS_COLS:
            real_valued_tensor = tf.to_float(features[feature])
            features[feature] = tf.reshape(
                real_valued_tensor,
                [batch_size,
                 real_valued_tensor.get_shape().as_list()[1]])

        labels = tf.reshape(tf.cast(label_tensor, tf.int32), [batch_size])
        return features, labels
Example #10
 def call(self, inputs):
     # TODO(tanzheny): Add int support.
     # string_to_hash_bucket_fast uses FarmHash as hash function.
     if ragged_tensor.is_ragged(inputs):
         return ragged_functional_ops.map_flat_values(
             string_ops.string_to_hash_bucket_fast,
             inputs,
             num_buckets=self._num_bins,
             name='hash')
     elif isinstance(inputs, sparse_tensor.SparseTensor):
         sparse_values = inputs.values
         sparse_hashed_values = string_ops.string_to_hash_bucket_fast(
             sparse_values, self._num_bins, name='hash')
         return sparse_tensor.SparseTensor(indices=inputs.indices,
                                           values=sparse_hashed_values,
                                           dense_shape=inputs.dense_shape)
     else:
         return string_ops.string_to_hash_bucket_fast(inputs,
                                                      self._num_bins,
                                                      name='hash')
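A minimal sketch of the ragged branch above, written with the public TF 2.x API (tf.ragged.map_flat_values, tf.strings.to_hash_bucket_fast) and an assumed bucket count: the flat values are hashed while the ragged row structure is preserved.

import tensorflow as tf

rt = tf.ragged.constant([["a", "b"], ["c"]])
hashed = tf.ragged.map_flat_values(tf.strings.to_hash_bucket_fast, rt, 4)
# hashed is a RaggedTensor with the same row lengths and int64 bucket ids.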
Example #11
  def replace_oov_buckets(self, inputs, lookups):
    if self.num_oov_tokens <= 1:
      return lookups

    if inputs.dtype.is_integer:
      inputs = string_ops.as_string(inputs)
    hashed_inputs = string_ops.string_to_hash_bucket_fast(
        inputs, num_buckets=self.num_oov_tokens)
    if self.reserve_zero:
      hashed_inputs = math_ops.add(hashed_inputs, 1)
    return array_ops.where(math_ops.equal(lookups, -1), hashed_inputs, lookups)
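A hedged sketch of the OOV trick in replace_oov_buckets, with assumed values and TF 2.x aliases: tokens whose vocabulary lookup returned -1 are re-routed to one of num_oov_tokens hashed buckets, while in-vocabulary lookups pass through unchanged.

import tensorflow as tf

num_oov_tokens = 3
tokens = tf.constant(["known", "mystery"])
lookups = tf.constant([5, -1], dtype=tf.int64)             # pretend table output
hashed = tf.strings.to_hash_bucket_fast(tokens, num_oov_tokens)
result = tf.where(tf.equal(lookups, -1), hashed, lookups)  # [5, hash("mystery") % 3]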
Example #12
 def get_indices(col, embedding_size, bucket_size):
     if col_type != 'int':
         indices = string_ops.string_to_hash_bucket_fast(
             features[col],
             bucket_size,
             name="deep_shared_hash_" + col + str(shared_flag))
     else:
         table = tf.contrib.lookup.index_table_from_tensor(
             embedding_size)
         indices = table.lookup(features[col])
     return indices
Example #13
  def testStringToHashBucketsFast(self):
    with self.cached_session():
      input_string = array_ops.placeholder(dtypes.string)
      output = string_ops.string_to_hash_bucket_fast(input_string, 10)
      result = output.eval(feed_dict={input_string: ['a', 'b', 'c', 'd']})

      # Fingerprint64('a') -> 12917804110809363939 -> mod 10 -> 9
      # Fingerprint64('b') -> 11795596070477164822 -> mod 10 -> 2
      # Fingerprint64('c') -> 11430444447143000872 -> mod 10 -> 2
      # Fingerprint64('d') -> 4470636696479570465 -> mod 10 -> 5
      self.assertAllEqual([9, 2, 2, 5], result)
Example #14
    def _transform_feature(self, inputs):
        input_tensor = inputs.get(self.key)
        flat_input = array_ops.reshape(input_tensor, (-1,))
        input_tensor = tf.string_split(flat_input, self.category_delimiter)

        if not isinstance(input_tensor, sparse_tensor_lib.SparseTensor):
            raise ValueError('SparseColumn input must be a SparseTensor.')

        sparse_values = input_tensor.values
        # tf.summary.text(self.key, flat_input)
        sparse_id_values = string_ops.string_to_hash_bucket_fast(
            sparse_values, self.num_buckets, name='lookup')
        return sparse_tensor_lib.SparseTensor(
            input_tensor.indices, sparse_id_values, input_tensor.dense_shape)
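A short sketch of the split-then-hash pattern above for multi-valued string columns, using the public TF 2.x API; the delimiter and bucket count are assumptions.

import tensorflow as tf

raw = tf.constant(["red|blue", "green"])
split = tf.strings.split(raw, sep="|").to_sparse()    # SparseTensor of tokens
ids = tf.strings.to_hash_bucket_fast(split.values, 50)
hashed = tf.sparse.SparseTensor(split.indices, ids, split.dense_shape)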
Example #15
    def _apply_transform(self, input_tensors, **kwargs):
        """Applies the transformation to the `transform_input`.

    Args:
      input_tensors: a list of Tensors representing the input to
        the Transform.
      **kwargs: additional keyword arguments, unused here.

    Returns:
        A namedtuple of Tensors representing the transformed output.
    """
        result = string_ops.string_to_hash_bucket_fast(
            input_tensors[0], self._num_buckets, name=None)
        # pylint: disable=not-callable
        return self.return_type(result)
Example #16
  def _apply_transform(self, input_tensors, **kwargs):
    """Applies the transformation to the `transform_input`.

    Args:
      input_tensors: a list of Tensors representing the input to
        the Transform.
      **kwargs: additional keyword arguments, unused here.

    Returns:
        A namedtuple of Tensors representing the transformed output.
    """
    result = string_ops.string_to_hash_bucket_fast(input_tensors[0],
                                                   self._num_buckets,
                                                   name=None)
    # pylint: disable=not-callable
    return self.return_type(result)
Example #17
    def _replace_oov_buckets(self, inputs, lookups):
        """Replace the default OOV value with one of the OOV bucket values."""
        if self.oov_tokens is None:
            return lookups

        num_oov_elements = self.oov_tokens.shape.num_elements()
        if inputs.dtype.is_integer:
            oov_indices = math_ops.floormod(inputs, num_oov_elements)
        else:
            oov_indices = string_ops.string_to_hash_bucket_fast(
                inputs, num_buckets=num_oov_elements)

        oov_values = array_ops.gather(self.oov_tokens, oov_indices)
        oov_locations = math_ops.equal(lookups, self.table._default_value)  # pylint: disable=protected-access

        return array_ops.where(oov_locations, oov_values, lookups)
Example #18
 def model_fn(features, labels, mode):
   _, _ = features, labels
   v = variables.Variable(0, name='some_var', dtype=dtypes.int64)
   # We verify the value of filepath_tensor is replaced with a path to the
   # saved model's assets directory by assigning a hash of filepath_tensor
   # to some_var.
   filepath_tensor = ops.convert_to_tensor(absolute_filepath)
   ops.add_to_collection(ops.GraphKeys.ASSET_FILEPATHS, filepath_tensor)
   scaffold = monitored_session.Scaffold(
       local_init_op=state_ops.assign(
           v, string_ops.string_to_hash_bucket_fast(
               filepath_tensor, num_buckets)).op
   )
   return model_fn_lib.EstimatorSpec(
       mode,
       scaffold=scaffold,
       train_op=state_ops.assign_add(training.get_global_step(), 1),
       loss=array_ops.identity(0))
Example #19
    def test_with_assets(self):
        filename = 'test_asset'
        tmpdir = tempfile.mkdtemp()
        absolute_filepath = os.path.join(tmpdir, filename)
        num_buckets = 1000
        with open(absolute_filepath, 'w') as f:
            f.write('test')

        def model_fn(features, labels, mode):
            _, _ = features, labels
            v = variables.Variable(0, name='some_var', dtype=dtypes.int64)
            # We verify the value of filepath_tensor is replaced with a path to the
            # saved model's assets directory by assigning a hash of filepath_tensor
            # to some_var.
            filepath_tensor = ops.convert_to_tensor(absolute_filepath)
            ops.add_to_collection(ops.GraphKeys.ASSET_FILEPATHS,
                                  filepath_tensor)
            scaffold = monitored_session.Scaffold(
                local_init_op=state_ops.assign(
                    v,
                    string_ops.string_to_hash_bucket_fast(
                        filepath_tensor, num_buckets)).op)
            return model_fn_lib.EstimatorSpec(mode,
                                              scaffold=scaffold,
                                              train_op=state_ops.assign_add(
                                                  training.get_global_step(),
                                                  1),
                                              loss=array_ops.identity(0))

        export_dir = self._export_estimator(predict=False, model_fn=model_fn)
        sme = saved_model_estimator.SavedModelEstimator(
            export_dir, self._get_tmp_dir())

        with self.session() as sess:
            expected_bucket = sess.run(
                string_ops.string_to_hash_bucket_fast(
                    os.path.join(export_dir, constants.ASSETS_DIRECTORY,
                                 filename), num_buckets))

        sme.train(dummy_input_fn, steps=1)
        self.assertEqual(expected_bucket, sme.get_variable_value('some_var'))
Example #20
 def _shard_indices(self, keys):
   if self._key_dtype == dtypes.string:
     indices = string_ops.string_to_hash_bucket_fast(keys, self._num_shards)
   else:
     indices = math_ops.mod(keys, self._num_shards)
   return math_ops.cast(indices, dtypes.int32)
Example #21
def wide_and_deep(features=None, params=None):
    ###############
    WIDE_CATE_COLS = params['WIDE_CATE_COLS']
    CONTINUOUS_COLS = params['CONTINUOUS_COLS']
    DEEP_EMBEDDING_COLS = params['DEEP_EMBEDDING_COLS']
    WIDE_CROSS_COLS = params['WIDE_CROSS_COLS']
    DEEP_SHARED_EMBEDDING_COLS = params['DEEP_SHARED_EMBEDDING_COLS']
    _HIDDEN_UNITS = params['_HIDDEN_UNITS']
    _LINEAR_LEARNING_RATE = params['_LINEAR_LEARNING_RATE']
    _DNN_LEARNING_RATE = params['_DNN_LEARNING_RATE']

    wide_logits = None
    linear_absolute_scope = None
    if params['WIDE']:
        wide_sum = []
        with variable_scope.variable_scope(
                'linear', values=tuple(six.itervalues(features))) as scope:
            linear_absolute_scope = scope.name
            for col, size in WIDE_CATE_COLS:
                w_wide = tf.get_variable(
                    shape=[size, 1],
                    initializer=init_ops.zeros_initializer,
                    trainable=True,
                    name="Wide_Part_Weights_Cate" + col)
                indices = string_ops.string_to_hash_bucket_fast(
                    features[col], size, name="wide_hash_" + col)
                wide_sum.append(
                    tf.nn.embedding_lookup(w_wide,
                                           indices,
                                           name="wide_cat_lookup_" + col))
            # for col, size in WIDE_BUCKET_COLS:
            #     w_wide = tf.get_variable(shape=[size, 1], initializer=init_ops.zeros_initializer, trainable=True,
            #                              name="Wide_Part_Weights_Bucket" + col)
            #     indices = string_ops.string_to_hash_bucket_fast(
            #         features[col], size, name="wide_hash_" + col)
            #     wide_sum.append(tf.nn.embedding_lookup(w_wide, indices, name="wide_bucket_lookup_" + col))
            for col1, col2, size in WIDE_CROSS_COLS:
                w_wide = tf.get_variable(
                    shape=[size, 1],
                    initializer=init_ops.zeros_initializer,
                    trainable=True,
                    name="Wide_Part_Weights_Cross" + col1 + '_' + col2)
                # cross_input = tf.as_string(tf.string_to_number(features[col1],_dtypes.int64)*tf.string_to_number(features[col2],_dtypes.int64))
                cross_input = tf.string_join([features[col1], features[col2]],
                                             separator="_")
                indices = string_ops.string_to_hash_bucket_fast(
                    cross_input, size, name="wide_hash_" + col1 + '_' + col2)
                wide_sum.append(
                    tf.nn.embedding_lookup(w_wide,
                                           indices,
                                           name="wide_cross_lookup_" + col1 +
                                           '_' + col2))

            w_wide = tf.get_variable(shape=[len(CONTINUOUS_COLS), 1],
                                     initializer=init_ops.zeros_initializer,
                                     trainable=True,
                                     name="Wide_Part_Weights_Continus")
            bias = tf.get_variable(shape=[1],
                                   initializer=init_ops.zeros_initializer,
                                   trainable=True,
                                   name="Wide_Part_Bias")
            x = tf.concat([
                tf.expand_dims(tf.to_float(features[col]), -1)
                for col in CONTINUOUS_COLS
            ],
                          1,
                          name='continus_concat')
            continue_logits = tf.matmul(x, w_wide) + bias

            wide_logits = tf.reduce_sum(wide_sum, 0)
            wide_logits += continue_logits
    ##################
    deep_logits = None
    dnn_absolute_scope = None
    if params['DEEP']:
        # with tf.variable_scope('Deep_model'):
        with variable_scope.variable_scope(
                'Deep_model',
                values=tuple(six.itervalues(features)),
        ) as scope:
            dnn_absolute_scope = scope.name
            # Convert categorical (string) values to embeddings
            deep_sum = []
            for col, vals, embedding_size, col_type in DEEP_EMBEDDING_COLS:
                bucket_size = vals if isinstance(vals, int) else len(vals)
                # embed_initializer = tf.truncated_normal_initializer(
                #     stddev=(1.0 / tf.sqrt(float(embedding_size))))
                embeddings = tf.get_variable(
                    shape=[bucket_size, embedding_size],
                    initializer=init_ops.glorot_uniform_initializer(),
                    name="deep_embedding_" + col)

                if col_type != 'int':
                    indices = string_ops.string_to_hash_bucket_fast(
                        features[col], bucket_size, name="deep_hash_" + col)
                else:
                    table = tf.contrib.lookup.index_table_from_tensor(vals)
                    indices = table.lookup(features[col])
                seq_emb = tf.nn.embedding_lookup(embeddings,
                                                 indices,
                                                 name="deep_lookup_" + col)
                if col_type == 'seq':
                    print("test my seq:", col)
                    seq_emb = tf.reduce_mean(seq_emb, 1)
                deep_sum.append(seq_emb)
            for cols, vals, embedding_size, col_type, shared_flag in DEEP_SHARED_EMBEDDING_COLS:

                def get_indices(col, embedding_size, bucket_size):
                    if col_type != 'int':
                        indices = string_ops.string_to_hash_bucket_fast(
                            features[col],
                            bucket_size,
                            name="deep_shared_hash_" + col + str(shared_flag))
                    else:
                        table = tf.contrib.lookup.index_table_from_tensor(
                            embedding_size)
                        indices = table.lookup(features[col])
                    return indices

                bucket_size = vals if isinstance(vals, int) else len(vals)
                embeddings = tf.get_variable(
                    shape=[bucket_size, embedding_size],
                    initializer=init_ops.glorot_uniform_initializer(),
                    name="deep_shared_embedding_" + '_'.join(c for c in cols) +
                    str(shared_flag))
                for col in cols:
                    indices = get_indices(col, embedding_size, bucket_size)
                    seq_emb = tf.nn.embedding_lookup(
                        embeddings,
                        indices,
                        name="deep_shared_lookup_" + col + str(shared_flag))
                    if col.endswith('seq'):
                        seq_emb = tf.reduce_mean(seq_emb, 1)
                    deep_sum.append(seq_emb)
            for col in CONTINUOUS_COLS:
                deep_sum.append(
                    tf.expand_dims(tf.to_float(features[col]),
                                   -1,
                                   name='continuous_' + col))
            curr_layer = tf.concat(deep_sum, 1, name="deep_inputs_layer")

            # Build the DNN

            for index, layer_size in enumerate(_HIDDEN_UNITS):
                curr_layer = tf.layers.dense(
                    curr_layer,
                    layer_size,
                    activation=tf.nn.relu,
                    kernel_initializer=init_ops.glorot_uniform_initializer(),
                    name="deep_hidden_layer" + str(index))
            deep_logits = tf.layers.dense(curr_layer,
                                          units=1,
                                          name="deep_logits")
    ####################################

    my_head = head._binary_logistic_head_with_sigmoid_cross_entropy_loss(  # pylint: disable=protected-access
        loss_reduction=losses.Reduction.SUM)
    print(my_head.logits_dimension)

    if deep_logits is not None and wide_logits is not None:
        logits = deep_logits + wide_logits
    elif deep_logits is not None:
        logits = deep_logits
    else:
        logits = wide_logits

    dnn_optimizer = optimizers.get_optimizer_instance(
        'Adagrad', learning_rate=_DNN_LEARNING_RATE)

    def _linear_learning_rate(num_linear_feature_columns):
        default_learning_rate = 1. / math.sqrt(num_linear_feature_columns)
        return min(_LINEAR_LEARNING_RATE, default_learning_rate)

    linear_optimizer = optimizers.get_optimizer_instance(
        'Ftrl', learning_rate=_linear_learning_rate(len(WIDE_CATE_COLS)))

    def _train_op_fn(loss):
        train_ops = []
        global_step = training_util.get_global_step()
        if deep_logits is not None:
            train_ops.append(
                dnn_optimizer.minimize(loss,
                                       var_list=ops.get_collection(
                                           ops.GraphKeys.TRAINABLE_VARIABLES,
                                           scope=dnn_absolute_scope)))
        if wide_logits is not None:
            train_ops.append(
                linear_optimizer.minimize(
                    loss,
                    var_list=ops.get_collection(
                        ops.GraphKeys.TRAINABLE_VARIABLES,
                        scope=linear_absolute_scope)))

        train_op = control_flow_ops.group(*train_ops)
        with ops.control_dependencies([train_op]):
            return state_ops.assign_add(global_step, 1).op

    return my_head, logits, _train_op_fn
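A minimal sketch of the wide cross-column trick used above (join the two string features, then hash the joined value), written with TF 2.x aliases; the feature values and bucket size are made up.

import tensorflow as tf

f1 = tf.constant(["US", "DE"])
f2 = tf.constant(["mobile", "web"])
cross = tf.strings.join([f1, f2], separator="_")         # ["US_mobile", "DE_web"]
cross_ids = tf.strings.to_hash_bucket_fast(cross, 1000)  # ids in [0, 1000)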
Example #22
def embedding_layer(features=None, params=None):
    ###############

    CONTINUOUS_COLS = params['CONTINUOUS_COLS']
    DEEP_EMBEDDING_COLS = params['DEEP_EMBEDDING_COLS']

    DEEP_SHARED_EMBEDDING_COLS = params['DEEP_SHARED_EMBEDDING_COLS']
    _HIDDEN_UNITS = params['_HIDDEN_UNITS']

    ##################
    if True:
        with variable_scope.variable_scope(
                'Deep_model',
                values=tuple(six.itervalues(features)),
        ) as scope:
            deep_sum = []
            for col, vals, embedding_size, col_type in DEEP_EMBEDDING_COLS:
                bucket_size = vals if isinstance(vals, int) else len(vals)
                embeddings = tf.get_variable(
                    shape=[bucket_size, embedding_size],
                    initializer=init_ops.glorot_uniform_initializer(),
                    name="deep_embedding_" + col)
                if col_type != 'int':
                    indices = string_ops.string_to_hash_bucket_fast(
                        features[col], bucket_size, name="deep_hash_" + col)
                else:
                    table = tf.contrib.lookup.index_table_from_tensor(vals)
                    indices = table.lookup(features[col])
                seq_emb = tf.nn.embedding_lookup(embeddings,
                                                 indices,
                                                 name="deep_lookup_" + col)
                if col_type == 'list':
                    print("test my seq:", col)
                    seq_emb = tf.reduce_mean(seq_emb, 1)
                    print(seq_emb)
                deep_sum.append(seq_emb)
            for cols, vals, embedding_size, col_type, shared_flag in DEEP_SHARED_EMBEDDING_COLS:

                def get_indices(col, embedding_size, bucket_size):
                    if col_type != 'int':
                        indices = string_ops.string_to_hash_bucket_fast(
                            features[col],
                            bucket_size,
                            name="deep_shared_hash_" + col + str(shared_flag))
                    else:
                        table = tf.contrib.lookup.index_table_from_tensor(
                            embedding_size)
                        indices = table.lookup(features[col])
                    return indices

                bucket_size = vals if isinstance(vals, int) else len(vals)
                embeddings = tf.get_variable(
                    shape=[bucket_size, embedding_size],
                    initializer=init_ops.glorot_uniform_initializer(),
                    name="deep_shared_embedding_" + '_'.join(c for c in cols) +
                    str(shared_flag))
                for col in cols:
                    indices = get_indices(col, embedding_size, bucket_size)
                    seq_emb = tf.nn.embedding_lookup(
                        embeddings,
                        indices,
                        name="deep_shared_lookup_" + col + str(shared_flag))
                    if col.endswith('seq'):
                        print("into...")
                        seq_emb = tf.reduce_mean(seq_emb, 1)
                    deep_sum.append(seq_emb)
            for col in CONTINUOUS_COLS:
                deep_sum.append(
                    tf.expand_dims(tf.to_float(features[col]),
                                   -1,
                                   name='continuous_' + col))
            curr_layer = tf.concat(deep_sum, 1, name="deep_inputs_layer")

            # Build the DNN
    ####################################
    return curr_layer
Example #23
def model_fn(mode,
             features,
             labels,
             embedding_size=8,
             hidden_units=[100, 70, 50, 20],
             learning_rate=0.1):
  """Creates a feed forward network classification network.

  Args:
    mode (str): Mode running training, evaluation or prediction.
    features (dict): Dictionary of input feature Tensors.
    labels (Tensor): Class label Tensor.
    embedding_size (int): Size of embeddings.
    hidden_units (list): Hidden units.
    learning_rate (float): Learning rate for the SGD.

  Returns:
    A Tuple or Dict depending on the mode.
  """
  label_values = tf.constant(LABELS)

  # Keep variance constant with changing embedding sizes.
  embed_initializer = tf.truncated_normal_initializer(
      stddev=(1.0 / tf.sqrt(float(embedding_size))))

  with tf.variable_scope('embeddings', initializer=embed_initializer):
    # Convert categorical (string) values to embeddings
    for col, vals in CATEGORICAL_COLS:
      bucket_size = vals if isinstance(vals, int) else len(vals)
      embeddings = tf.get_variable(col, shape=[bucket_size, embedding_size])
      if isinstance(vals, int):
        indices = string_ops.string_to_hash_bucket_fast(features[col],
                                                        bucket_size)
      else:
        table = tf.contrib.lookup.index_table_from_tensor(vals)
        indices = table.lookup(features[col])

      features[col] = tf.nn.embedding_lookup(embeddings, indices)

  for col in CONTINUOUS_COLS:
    # Give continuous columns an extra trivial dimension
    # So they can be concatenated with embedding tensors
    features[col] = tf.expand_dims(tf.to_float(features[col]), -1)

  # Concatenate the (now all dense) features.
  # We need to sort the tensors so that they end up in the same order for
  # prediction, evaluation, and training
  sorted_feature_tensors = list(zip(*sorted(features.items())))[1]
  inputs = tf.concat(sorted_feature_tensors, 1)

  # Build the DNN
  curr_layer = inputs

  for layer_size in hidden_units:
    curr_layer = tf.layers.dense(
        curr_layer,
        layer_size,
        activation=tf.nn.relu,
        # This initializer prevents variance from exploding or vanishing when
        # compounded through different sized layers.
        kernel_initializer=tf.contrib.layers.variance_scaling_initializer(),
    )

  # Add the output layer
  logits = tf.layers.dense(
      curr_layer,
      len(LABELS),
      # Do not use ReLU on last layer
      activation=None,
      kernel_initializer=tf.contrib.layers.variance_scaling_initializer())

  if mode in (PREDICT, EVAL):
    probabilities = tf.nn.softmax(logits)
    predicted_indices = tf.argmax(probabilities, 1)

  if mode in (TRAIN, EVAL):
    # Convert the string label column to indices
    # Build a lookup table inside the graph
    table = tf.contrib.lookup.index_table_from_tensor(label_values)

    # Use the lookup table to convert string labels to ints
    label_indices = table.lookup(labels)

    # Make labels a vector
    label_indices_vector = tf.squeeze(label_indices)

    # global_step is necessary in eval to correctly load the step
    # of the checkpoint we are evaluating
    global_step = tf.train.get_or_create_global_step()

  if mode == PREDICT:
    # Convert predicted_indices back into strings
    return {
        'predictions': tf.gather(label_values, predicted_indices),
        'confidence': tf.reduce_max(probabilities, axis=1)
    }

  if mode == TRAIN:
    # Build training operation.
    cross_entropy = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=logits, labels=label_indices_vector))
    tf.summary.scalar('loss', cross_entropy)
    train_op = tf.train.FtrlOptimizer(
        learning_rate=learning_rate,
        l1_regularization_strength=3.0,
        l2_regularization_strength=10.0).minimize(
            cross_entropy, global_step=global_step)
    return train_op, global_step

  if mode == EVAL:
    # Return accuracy and area under ROC curve metrics
    # See https://en.wikipedia.org/wiki/Receiver_operating_characteristic
    # See https://www.kaggle.com/wiki/AreaUnderCurve\
    labels_one_hot = tf.one_hot(
        label_indices_vector,
        depth=label_values.shape[0],
        on_value=True,
        off_value=False,
        dtype=tf.bool)
    return {
        'accuracy': tf.metrics.accuracy(label_indices, predicted_indices),
        'auroc': tf.metrics.auc(labels_one_hot, probabilities)
    }
Example #24
def model_fn(mode,
             features,
             labels,
             embedding_size=8,
             hidden_units=[100, 70, 50, 20],
             learning_rate=0.1):
    """Creates a feed forward network classification network.

  Args:
    mode (str): Mode running training, evaluation or prediction.
    features (dict): Dictionary of input feature Tensors.
    labels (Tensor): Class label Tensor.
    embedding_size (int): Size of embeddings.
    hidden_units (list): Hidden units.
    learning_rate (float): Learning rate for the SGD.

  Returns:
    A Tuple or Dict depending on the mode.
  """
    label_values = tf.constant(LABELS)

    # Keep variance constant with changing embedding sizes.
    embed_initializer = tf.truncated_normal_initializer(
        stddev=(1.0 / tf.sqrt(float(embedding_size))))

    with tf.variable_scope('embeddings', initializer=embed_initializer):
        # Convert categorical (string) values to embeddings
        for col, vals in CATEGORICAL_COLS:
            bucket_size = vals if isinstance(vals, int) else len(vals)
            embeddings = tf.get_variable(col,
                                         shape=[bucket_size, embedding_size])
            if isinstance(vals, int):
                indices = string_ops.string_to_hash_bucket_fast(
                    features[col], bucket_size)
            else:
                table = tf.contrib.lookup.index_table_from_tensor(vals)
                indices = table.lookup(features[col])

            features[col] = tf.nn.embedding_lookup(embeddings, indices)

    for col in CONTINUOUS_COLS:
        # Give continuous columns an extra trivial dimension
        # So they can be concatenated with embedding tensors
        features[col] = tf.expand_dims(tf.to_float(features[col]), -1)

    # Concatenate the (now all dense) features.
    # We need to sort the tensors so that they end up in the same order for
    # prediction, evaluation, and training
    sorted_feature_tensors = list(zip(*sorted(features.items())))[1]
    inputs = tf.concat(sorted_feature_tensors, 1)

    # Build the DNN
    curr_layer = inputs

    for layer_size in hidden_units:
        curr_layer = tf.layers.dense(
            curr_layer,
            layer_size,
            activation=tf.nn.relu,
            # This initializer prevents variance from exploding or vanishing when
            # compounded through different sized layers.
            kernel_initializer=tf.contrib.layers.variance_scaling_initializer(
            ),
        )

    # Add the output layer
    logits = tf.layers.dense(
        curr_layer,
        len(LABELS),
        # Do not use ReLU on last layer
        activation=None,
        kernel_initializer=tf.contrib.layers.variance_scaling_initializer())

    if mode in (PREDICT, EVAL):
        probabilities = tf.nn.softmax(logits)
        predicted_indices = tf.argmax(probabilities, 1)

    if mode in (TRAIN, EVAL):
        # Convert the string label column to indices
        # Build a lookup table inside the graph
        table = tf.contrib.lookup.index_table_from_tensor(label_values)

        # Use the lookup table to convert string labels to ints
        label_indices = table.lookup(labels)

        # Make labels a vector
        label_indices_vector = tf.squeeze(label_indices)

        # global_step is necessary in eval to correctly load the step
        # of the checkpoint we are evaluating
        global_step = tf.contrib.framework.get_or_create_global_step()

    if mode == PREDICT:
        # Convert predicted_indices back into strings
        return {
            'predictions': tf.gather(label_values, predicted_indices),
            'confidence': tf.reduce_max(probabilities, axis=1)
        }

    if mode == TRAIN:
        # Build training operation.
        cross_entropy = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=logits, labels=label_indices_vector))
        tf.summary.scalar('loss', cross_entropy)
        train_op = tf.train.FtrlOptimizer(
            learning_rate=learning_rate,
            l1_regularization_strength=3.0,
            l2_regularization_strength=10.0).minimize(cross_entropy,
                                                      global_step=global_step)
        return train_op, global_step

    if mode == EVAL:
        # Return accuracy and area under ROC curve metrics
        # See https://en.wikipedia.org/wiki/Receiver_operating_characteristic
        # See https://www.kaggle.com/wiki/AreaUnderCurve\
        labels_one_hot = tf.one_hot(label_indices_vector,
                                    depth=label_values.shape[0],
                                    on_value=True,
                                    off_value=False,
                                    dtype=tf.bool)
        return {
            'accuracy': tf.metrics.accuracy(label_indices, predicted_indices),
            'auroc': tf.metrics.auc(labels_one_hot, probabilities)
        }
Example #25
def model_fn(mode,
             features,
             labels,
             embedding_size=8,
             hidden_units=[100, 70, 50, 20],
             learning_rate=0.1):
    """Create a Feed forward network classification network

  Args:
    mode (string): Mode running training, evaluation or prediction
    features (dict): Dictionary of input feature Tensors
    labels (Tensor): Class label Tensor
    hidden_units (list): Hidden units
    learning_rate (float): Learning rate for the SGD

  Returns:
    Depending on the mode returns Tuple or Dict
  """
    label_values = tf.constant(LABELS)

    # Keep variance constant with changing embedding sizes.
    with tf.variable_scope('embeddings',
                           initializer=tf.truncated_normal_initializer(
                               stddev=(1.0 / tf.sqrt(float(embedding_size))))):
        # Convert categorical (string) values to one_hot values
        for col, bucket_size in CATEGORICAL_COLS:
            embeddings = tf.get_variable(col,
                                         shape=[bucket_size, embedding_size])

            indices = string_ops.string_to_hash_bucket_fast(
                features[col], bucket_size)

            features[col] = tf.squeeze(tf.nn.embedding_lookup(
                embeddings, indices),
                                       axis=[1])

    for feature in CONTINUOUS_COLS:
        features[feature] = tf.to_float(features[feature])

    # Concatenate the (now all dense) features.
    # We need to sort the tensors so that they end up in the same order for
    # prediction, evaluation, and training
    sorted_feature_tensors = list(zip(*sorted(features.items())))[1]
    inputs = tf.concat(sorted_feature_tensors, 1)

    # Build the DNN

    layers_size = [inputs.get_shape()[1]] + hidden_units
    layers_shape = list(zip(layers_size[0:], layers_size[1:] + [len(LABELS)]))

    curr_layer = inputs
    # Set default initializer to variance_scaling_initializer
    # This initializer prevents variance from exploding or vanishing when
    # compounded through different sized layers.
    with tf.variable_scope(
            'dnn',
            initializer=tf.contrib.layers.variance_scaling_initializer()):
        # Creates the relu hidden layers
        for num, shape in enumerate(layers_shape):
            with tf.variable_scope('relu_{}'.format(num)):

                weights = tf.get_variable('weights', shape)

                biases = tf.get_variable('biases',
                                         shape[1],
                                         initializer=tf.zeros_initializer(
                                             tf.float32))

            activations = tf.matmul(curr_layer, weights) + biases
            if num < len(layers_shape) - 1:
                curr_layer = tf.nn.relu(activations)
            else:
                curr_layer = activations

    # Make predictions
    logits = curr_layer

    if mode in (PREDICT, EVAL):
        probabilities = tf.nn.softmax(logits)
        predicted_indices = tf.argmax(probabilities, 1)

    if mode in (TRAIN, EVAL):
        # Convert the string label column to indices
        # Build a lookup table inside the graph
        table = tf.contrib.lookup.string_to_index_table_from_tensor(
            label_values)

        # Use the lookup table to convert string labels to ints
        label_indices = table.lookup(labels)

        # Make labels a vector
        label_indices_vector = tf.squeeze(label_indices)

        # global_step is necessary in eval to correctly load the step
        # of the checkpoint we are evaluating
        global_step = tf.contrib.framework.get_or_create_global_step()

    if mode == PREDICT:
        # Convert predicted_indices back into strings
        return {
            'predictions': tf.gather(label_values, predicted_indices),
            'confidence': tf.reduce_max(probabilities, axis=1)
        }

    if mode == TRAIN:
        # Build training operation.
        cross_entropy = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=logits, labels=label_indices_vector))
        tf.summary.scalar('loss', cross_entropy)
        train_op = tf.train.FtrlOptimizer(
            learning_rate=learning_rate,
            l1_regularization_strength=3.0,
            l2_regularization_strength=10.0).minimize(cross_entropy,
                                                      global_step=global_step)
        return train_op, global_step

    if mode == EVAL:
        # Return accuracy and area under ROC curve metrics
        # See https://en.wikipedia.org/wiki/Receiver_operating_characteristic
        # See https://www.kaggle.com/wiki/AreaUnderCurve
        return {
            'accuracy':
            tf.contrib.metrics.streaming_accuracy(predicted_indices,
                                                  label_indices),
            'auroc':
            tf.contrib.metrics.streaming_auc(predicted_indices, label_indices)
        }
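Finally, a self-contained hash-then-embed sketch in TF 2.x style that mirrors the pattern running through these examples; the bucket size, embedding size, and inputs are illustrative assumptions.

import tensorflow as tf

bucket_size, embedding_size = 100, 8
embeddings = tf.Variable(tf.random.normal([bucket_size, embedding_size]))
# Hash raw strings into bucket ids, then look up their embedding rows.
ids = tf.strings.to_hash_bucket_fast(tf.constant(["red", "blue"]), bucket_size)
vectors = tf.nn.embedding_lookup(embeddings, ids)   # shape [2, embedding_size]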