def test_maxlength(self):
  """Checks the output of `bincount` when `maxlength=3` is supplied."""
  # Each case is (input values, expected counts); all share maxlength=3.
  cases = (
      ([5], [0, 0, 0]),  # 5 does not fit into any of the three bins.
      ([1], [0, 1]),
      ([], []),
  )
  with self.session():
    for values, expected in cases:
      counts = self.evaluate(bincount_ops.bincount(values, maxlength=3))
      self.assertAllEqual(counts, expected)
def test_bincount_determinism_error(self):
  """Checks that bincount raises UnimplementedError under deterministic ops.

  Covers both the flattened call (`axis=None`) and the per-row call
  (`axis=-1`), each of which is expected to report its own message.
  """
  flat_values = np.random.randint(0, 1000, size=1000)
  flat_message = (
      "Determinism is not yet supported in GPU implementation of Bincount.")
  with test_util.deterministic_ops():
    with self.assertRaisesRegex(errors_impl.UnimplementedError, flat_message):
      self.evaluate(bincount_ops.bincount(flat_values, None, axis=None))

  matrix_values = np.random.randint(0, 1000, size=(100, 100))
  matrix_message = (
      "Determinism is not yet supported in GPU implementation of "
      "DenseBincount.")
  with test_util.deterministic_ops():
    with self.assertRaisesRegex(errors_impl.UnimplementedError,
                                matrix_message):
      self.evaluate(bincount_ops.bincount(matrix_values, None, axis=-1))
def call(self, inputs):
  """Encodes `inputs` as tf-idf, binary, or count data.

  Args:
    inputs: The category indices to encode; may be a `SparseTensor`.

  Returns:
    For tf-idf mode, the per-row token counts multiplied by
    `self.tf_idf_weights`, with static shape `(None, out_depth)`. Otherwise
    the result of `sparse_bincount` when `self._sparse` is set, or an int64
    `bincount` result with static shape `(None, out_depth)`.
  """
  self._called = True

  # The output width is `max_tokens` when configured, otherwise the current
  # value of the `num_elements` variable.
  out_depth = self._max_tokens
  if out_depth is None:
    out_depth = K.get_value(self.num_elements)

  if self._output_mode == TFIDF:
    dense_inputs = inputs
    if isinstance(dense_inputs, sparse_tensor.SparseTensor):
      # Densify with a default of -1. Because -1 is ignored by one_hot, this
      # effectively drops the non-set positions from the output encoding.
      dense_inputs = sparse_ops.sparse_tensor_to_dense(
          dense_inputs, default_value=-1)
    token_counts = math_ops.reduce_sum(
        array_ops.one_hot(dense_inputs, depth=out_depth), axis=1)
    weighted = math_ops.multiply(token_counts, self.tf_idf_weights)
    weighted.set_shape(tensor_shape.TensorShape((None, out_depth)))
    return weighted

  shared_kwargs = dict(
      minlength=out_depth,
      axis=-1,
      binary_output=(self._output_mode == BINARY))
  if self._sparse:
    return bincount_ops.sparse_bincount(inputs, **shared_kwargs)

  encoded = bincount_ops.bincount(inputs, dtype=dtypes.int64, **shared_kwargs)
  encoded.set_shape(tensor_shape.TensorShape((None, out_depth)))
  return encoded
def call(self, inputs, count_weights=None):
  """Encodes `inputs` as tf-idf, binary, or (optionally weighted) count data.

  Args:
    inputs: The category indices to encode. Lists and numpy arrays are
      converted to tensors, and rank-1 inputs get an axis added at position 1.
    count_weights: Optional weights forwarded to the bincount op. Only
      accepted when the output mode is `COUNT`.

  Returns:
    A dense tensor with static shape `(None, out_depth)`, or, when
    `self._sparse` is set, a `SparseTensor` whose dense shape is
    `[batch_size, out_depth]`.

  Raises:
    ValueError: If `count_weights` is passed for a non-count output mode, or
      if tf-idf output is requested while `self._sparse` is set.
    RuntimeError: If `max_tokens` is None and `num_elements` evaluates to 0.
  """
  if isinstance(inputs, (list, np.ndarray)):
    inputs = ops.convert_to_tensor_v2(inputs)
  if inputs.shape.rank == 1:
    inputs = array_ops.expand_dims(inputs, 1)

  has_weights = count_weights is not None
  if has_weights and self._output_mode != COUNT:
    raise ValueError(
        "count_weights is not used in `output_mode='tf-idf'`, "
        "or `output_mode='binary'`. Please pass a single input.")
  self._called = True

  out_depth = self._max_tokens
  if out_depth is None:
    out_depth = K.get_value(self.num_elements)
    if out_depth == 0:
      raise RuntimeError(
          "If you construct a `CategoryEncoding` layer with "
          "`max_tokens=None`, you need to call `adapt()` "
          "on it before using it")

  if self._output_mode == TFIDF:
    if self._sparse:
      raise ValueError("`sparse=True` with `output_mode=tfidf` "
                       "is not supported.")
    dense_inputs = inputs
    if isinstance(dense_inputs, sparse_tensor.SparseTensor):
      # Densify with a default of -1. Because -1 is ignored by one_hot, this
      # effectively drops the non-set positions from the output encoding.
      dense_inputs = sparse_ops.sparse_tensor_to_dense(
          dense_inputs, default_value=-1)
    token_counts = math_ops.reduce_sum(
        array_ops.one_hot(dense_inputs, depth=out_depth), axis=1)
    weighted = math_ops.multiply(token_counts, self.tf_idf_weights)
    weighted.set_shape(tensor_shape.TensorShape((None, out_depth)))
    return weighted

  is_binary = (self._output_mode == BINARY)
  if not self._sparse:
    dense_result = bincount_ops.bincount(
        inputs,
        weights=count_weights,
        minlength=out_depth,
        dtype=K.floatx(),
        axis=-1,
        binary_output=is_binary)
    dense_result.set_shape(tensor_shape.TensorShape((None, out_depth)))
    return dense_result

  sparse_result = bincount_ops.sparse_bincount(
      inputs,
      weights=count_weights,
      minlength=out_depth,
      axis=-1,
      binary_output=is_binary)
  sparse_result = math_ops.cast(sparse_result, K.floatx())
  # Rebuild the sparse tensor so that its dense shape is pinned to
  # [batch_size, out_depth].
  batch_size = array_ops.shape(sparse_result)[0]
  return sparse_tensor.SparseTensor(
      indices=sparse_result.indices,
      values=sparse_result.values,
      dense_shape=[batch_size, out_depth])
def test_sparse_input_col_reduce_binary(self, dtype):
  """Compares per-row binary bincount of a sparse input against numpy."""
  num_rows = 128
  num_cols = 27
  size = 100
  np.random.seed(42)
  inp = np.random.randint(0, size, (num_rows, num_cols), dtype=dtype)

  # Reference result: one 0/1 presence vector of length `size` per input row.
  presence_rows = [
      np.where(np.bincount(row, minlength=size) > 0, 1, 0) for row in inp
  ]
  np_out = np.reshape(np.concatenate(presence_rows, axis=0), (num_rows, size))

  # from_dense will filter out 0s, so shift all values up by one before the
  # conversion and shift the stored values back down afterwards.
  shifted = inp + 1
  # from_dense will cause OOM in GPU.
  with ops.device("/CPU:0"):
    shifted_sparse = sparse_ops.from_dense(shifted)
    inp_sparse = sparse_tensor.SparseTensor(shifted_sparse.indices,
                                            shifted_sparse.values - 1,
                                            shifted_sparse.dense_shape)

  actual = self.evaluate(
      bincount_ops.bincount(arr=inp_sparse, axis=-1, binary_output=True))
  self.assertAllEqual(np_out, actual)
def test_empty(self):
  """Checks values and dtypes of `bincount` applied to an empty input."""
  # (minlength, expected counts) for an empty input.
  value_cases = (
      (5, [0, 0, 0, 0, 0]),
      (1, [0]),
      (0, []),
  )
  # (minlength, requested dtype); the evaluated result must carry that dtype.
  dtype_cases = (
      (0, np.float32),
      (3, np.float64),
  )
  with self.session():
    for minlength, expected in value_cases:
      counts = self.evaluate(bincount_ops.bincount([], minlength=minlength))
      self.assertAllEqual(counts, expected)
    for minlength, dtype in dtype_cases:
      counts = self.evaluate(
          bincount_ops.bincount([], minlength=minlength, dtype=dtype))
      self.assertEqual(counts.dtype, dtype)
def test_random_without_weights(self):
  """Checks unweighted `bincount` against numpy with all-ones weights."""
  num_samples = 10000
  with self.session():
    np.random.seed(42)
    for dtype in [np.int32, np.float32]:
      arr = np.random.randint(0, 1000, num_samples)
      unit_weights = np.ones(num_samples).astype(dtype)
      actual = self.evaluate(bincount_ops.bincount(arr, None))
      expected = np.bincount(arr, unit_weights)
      self.assertAllClose(actual, expected)
def test_ragged_input_binary(self, dtype):
  """Checks per-row binary `bincount` on a ragged input with empty rows."""
  ragged_rows = ragged_factory_ops.constant(
      [[], [], [3, 0, 1], [], [5, 0, 4, 4]])
  # pyformat: disable
  expected_output = [[0, 0, 0, 0, 0, 0],
                     [0, 0, 0, 0, 0, 0],
                     [1, 1, 0, 1, 0, 0],
                     [0, 0, 0, 0, 0, 0],
                     [1, 0, 0, 0, 1, 1]]
  # pyformat: enable
  actual = self.evaluate(
      bincount_ops.bincount(arr=ragged_rows, axis=-1, binary_output=True))
  self.assertAllEqual(expected_output, actual)
def dense_bincount(inputs, out_depth, multi_hot_output, count_weights=None):
  """Encodes `inputs` as dense multi-hot or count vectors of width `out_depth`.

  Args:
    inputs: The values to count along the last axis.
    out_depth: Used as both `minlength` and `maxlength` of the bincount, and
      as the second dimension of the static output shape.
    multi_hot_output: Forwarded to bincount as `binary_output`.
    count_weights: Optional weights forwarded to bincount.

  Returns:
    The bincount result in the backend float type, with its static shape set
    to `(inputs.shape[0], out_depth)`.
  """
  encoded = bincount_ops.bincount(
      inputs,
      weights=count_weights,
      minlength=out_depth,
      maxlength=out_depth,
      dtype=backend.floatx(),
      axis=-1,
      binary_output=multi_hot_output)
  # Carry over the statically known leading dimension of the input (which is
  # None when it is not known).
  static_dims = inputs.shape.as_list()
  encoded.set_shape(tensor_shape.TensorShape((static_dims[0], out_depth)))
  return encoded
def dense_bincount(inputs, out_depth, binary_output, count_weights=None):
  """Encodes `inputs` as dense binary or count vectors of width `out_depth`.

  Args:
    inputs: The values to count along the last axis.
    out_depth: Used as both `minlength` and `maxlength` of the bincount, and
      as the second dimension of the static output shape.
    binary_output: Forwarded to bincount as `binary_output`.
    count_weights: Optional weights forwarded to bincount.

  Returns:
    The bincount result in the backend float type, with its static shape set
    to `(None, out_depth)`.
  """
  bincount_kwargs = dict(
      weights=count_weights,
      minlength=out_depth,
      maxlength=out_depth,
      dtype=K.floatx(),
      axis=-1,
      binary_output=binary_output)
  encoded = bincount_ops.bincount(inputs, **bincount_kwargs)
  encoded.set_shape(tensor_shape.TensorShape((None, out_depth)))
  return encoded
def test_random_with_weights(self):
  """Checks weighted `bincount` against numpy for int and float weights."""
  num_samples = 10000
  integer_dtypes = (dtypes.int32, dtypes.int64)
  with self.session():
    np.random.seed(42)
    for dtype in [dtypes.int32, dtypes.int64, dtypes.float32, dtypes.float64]:
      arr = np.random.randint(0, 1000, num_samples)
      # Integer dtypes get signed integer weights; float dtypes get weights
      # drawn from np.random.random.
      if dtype in integer_dtypes:
        weights = np.random.randint(-100, 100, num_samples)
      else:
        weights = np.random.random(num_samples)
      actual = self.evaluate(bincount_ops.bincount(arr, weights))
      expected = np.bincount(arr, weights)
      self.assertAllClose(actual, expected)
def test_ragged_input_count_with_weights(self, dtype):
  """Checks per-row weighted `bincount` on a ragged input with empty rows."""
  ragged_rows = ragged_factory_ops.constant(
      [[], [], [3, 0, 1], [], [5, 0, 4, 4]])
  ragged_weights = ragged_factory_ops.constant(
      [[], [], [.1, .2, .3], [], [.2, .5, .6, .3]])
  # pyformat: disable
  expected_output = [[0, 0, 0, 0, 0, 0],
                     [0, 0, 0, 0, 0, 0],
                     [.2, .3, 0, .1, 0, 0],
                     [0, 0, 0, 0, 0, 0],
                     [.5, 0, 0, 0, .9, .2]]
  # pyformat: enable
  actual = self.evaluate(
      bincount_ops.bincount(arr=ragged_rows, weights=ragged_weights, axis=-1))
  self.assertAllClose(expected_output, actual)
def test_bincount_determinism_error(self):
  """Checks that GPU bincount raises when deterministic ops are enabled.

  The assertion only runs when a CUDA GPU is available; otherwise nothing is
  checked. Deterministic-op mode is switched back off on exit, whether or not
  the assertion passed.
  """
  num_samples = 10000
  np.random.seed(42)
  arr = np.random.randint(0, 1000, num_samples)
  try:
    config.enable_deterministic_ops(True)
    with test_util.use_gpu():
      if test_util.is_gpu_available(cuda_only=True):
        # Use `assertRaisesRegex`: the `assertRaisesRegexp` spelling is a
        # deprecated unittest alias that was removed in Python 3.12.
        with self.assertRaisesRegex(
            errors_impl.UnimplementedError, "Determinism is not yet "
            "supported for Bincount."):
          self.evaluate(bincount_ops.bincount(arr, None))
  finally:
    config.enable_deterministic_ops(False)
def test_ragged_input_count_np(self, dtype):
  """Compares per-row `bincount` of a ragged input against numpy."""
  np.random.seed(42)
  num_rows = 128
  num_cols = 27
  size = 1000
  inp = np.random.randint(0, size, (num_rows, num_cols), dtype=dtype)

  # Reference result: one count vector of length `size` per input row.
  row_counts = [np.bincount(row, minlength=size) for row in inp]
  np_out = np.reshape(np.concatenate(row_counts, axis=0), (num_rows, size))

  ragged_inp = ragged_tensor.RaggedTensor.from_tensor(inp)
  actual = self.evaluate(
      bincount_ops.bincount(arr=ragged_inp, minlength=size, axis=-1))
  self.assertAllEqual(np_out, actual)
def test_sparse_input_all_count(self, dtype):
  """Compares `bincount(axis=0)` over a sparse input against numpy."""
  np.random.seed(42)
  num_rows = 128
  size = 1000
  n_elems = 4096

  # Random row index for every element; the column index is always zero.
  row_ids = np.random.randint(0, num_rows, (n_elems, 1))
  col_ids = np.zeros((n_elems, 1))
  inp_indices = np.concatenate([row_ids, col_ids], axis=1)
  inp_vals = np.random.randint(0, size, (n_elems,), dtype=dtype)
  sparse_inp = sparse_tensor.SparseTensor(inp_indices, inp_vals,
                                          [num_rows, 1])

  np_out = np.bincount(inp_vals, minlength=size)
  actual = self.evaluate(bincount_ops.bincount(sparse_inp, axis=0))
  self.assertAllEqual(np_out, actual)
def call(self, inputs, count_weights=None):
  """Encodes `inputs` as tf-idf, binary, or (optionally weighted) count data.

  Args:
    inputs: The category indices to encode; may be a `SparseTensor`.
    count_weights: Optional weights forwarded to the bincount op. Only
      accepted when the output mode is `COUNT`.

  Returns:
    For tf-idf mode, the per-row token counts multiplied by
    `self.tf_idf_weights`. Otherwise the bincount of `inputs` in the backend
    float type: the cast `sparse_bincount` result when `self._sparse` is set,
    else a dense tensor. Dense results have static shape `(None, out_depth)`.

  Raises:
    ValueError: If `count_weights` is passed for a non-count output mode.
  """
  has_weights = count_weights is not None
  if has_weights and self._output_mode != COUNT:
    raise ValueError(
        "count_weights is not used in `output_mode='tf-idf'`, "
        "or `output_mode='binary'`. Please pass a single input.")
  self._called = True

  # The output width is `max_tokens` when configured, otherwise the current
  # value of the `num_elements` variable.
  out_depth = self._max_tokens
  if out_depth is None:
    out_depth = K.get_value(self.num_elements)

  if self._output_mode == TFIDF:
    dense_inputs = inputs
    if isinstance(dense_inputs, sparse_tensor.SparseTensor):
      # Densify with a default of -1. Because -1 is ignored by one_hot, this
      # effectively drops the non-set positions from the output encoding.
      dense_inputs = sparse_ops.sparse_tensor_to_dense(
          dense_inputs, default_value=-1)
    token_counts = math_ops.reduce_sum(
        array_ops.one_hot(dense_inputs, depth=out_depth), axis=1)
    weighted = math_ops.multiply(token_counts, self.tf_idf_weights)
    weighted.set_shape(tensor_shape.TensorShape((None, out_depth)))
    return weighted

  is_binary = (self._output_mode == BINARY)
  if self._sparse:
    sparse_counts = bincount_ops.sparse_bincount(
        inputs,
        weights=count_weights,
        minlength=out_depth,
        axis=-1,
        binary_output=is_binary)
    return math_ops.cast(sparse_counts, K.floatx())

  dense_counts = bincount_ops.bincount(
      inputs,
      weights=count_weights,
      minlength=out_depth,
      dtype=K.floatx(),
      axis=-1,
      binary_output=is_binary)
  dense_counts.set_shape(tensor_shape.TensorShape((None, out_depth)))
  return dense_counts
def test_values(self):
  """Checks `bincount` counts for a handful of small, hand-computed inputs."""
  staircase = [1, 1, 2, 1, 2, 3, 1, 2, 3, 4, 1, 2, 3, 4, 5]
  staircase_with_zeros = staircase + [0, 0, 0, 0, 0, 0]
  # Each case is (input values, expected counts).
  cases = (
      ([1, 1, 1, 2, 2, 3], [0, 3, 2, 1]),
      (staircase, [0, 5, 4, 3, 2, 1]),
      (staircase_with_zeros, [6, 5, 4, 3, 2, 1]),
      ([], []),
      ([0, 0, 0], [3]),
      ([5], [0, 0, 0, 0, 0, 1]),
      (np.arange(10000), np.ones(10000)),
  )
  with self.session():
    for values, expected in cases:
      counts = self.evaluate(bincount_ops.bincount(values))
      self.assertAllEqual(counts, expected)
def from_value_rowids(cls,
                      value_rowids,
                      nrows=None,
                      validate=True,
                      preferred_dtype=None):
  """Creates a `RowPartition` with rows partitioned by `value_rowids`.

  This `RowPartition` divides a sequence `values` into rows by specifying
  which row each value should be added to:

  ```python
  partitioned_rows = [[] for _ in range(nrows)]
  for (value, rowid) in zip(values, value_rowids):
    partitioned_rows[rowid].append(value)
  ```

  Args:
    value_rowids: A 1-D integer tensor with shape `[nvals]`, which corresponds
      one-to-one with `values`, and specifies each value's row index.  Must be
      nonnegative, and must be sorted in ascending order.
    nrows: An integer scalar specifying the number of rows.  This should be
      specified if the `RowPartition` may contain empty trailing rows.  Must
      be greater than `value_rowids[-1]` (or greater than or equal to zero if
      `value_rowids` is empty). Defaults to `value_rowids[-1] + 1` (or zero if
      `value_rowids` is empty).
    validate: If true, then use assertions to check that the arguments form a
      valid `RowPartition`.
    preferred_dtype: The dtype to encode value_rowids if it doesn't already
      have one. The default is tf.int64.

  Returns:
    A `RowPartition`.

  Raises:
    TypeError: If `validate` is not a `bool`.
    ValueError: If `nrows` is incompatible with `value_rowids`.

  #### Example:

  >>> print(RowPartition.from_value_rowids(
  ...     value_rowids=[0, 0, 0, 0, 2, 2, 2, 3],
  ...     nrows=4))
  tf.RowPartition(row_splits=tf.Tensor([0 4 4 7 8], shape=(5,), dtype=int64))
  """
  # Local import bincount_ops to avoid import-cycle since bincount_ops
  # imports ragged_tensor.
  from tensorflow.python.ops import bincount_ops  # pylint: disable=g-import-not-at-top
  if not isinstance(validate, bool):
    raise TypeError("validate must have type bool")
  with ops.name_scope(None, "RowPartitionFromValueRowIds",
                      [value_rowids, nrows]):
    value_rowids = cls._convert_row_partition(value_rowids, "value_rowids",
                                              preferred_dtype)
    if nrows is None:
      const_rowids = tensor_util.constant_value(value_rowids)
      if const_rowids is None:
        # The row ids are only known at run time. Appending -1 before taking
        # element 0 yields the last row id when one exists and -1 otherwise,
        # so adding 1 gives the row count (0 for an empty `value_rowids`).
        nrows = array_ops.concat([value_rowids[-1:], [-1]], axis=0)[0] + 1
        const_nrows = None
      else:
        const_nrows = const_rowids[-1] + 1 if const_rowids.size > 0 else 0
        nrows = ops.convert_to_tensor(
            const_nrows, value_rowids.dtype, name="nrows")
    else:
      nrows = ops.convert_to_tensor(nrows, value_rowids.dtype, "nrows")
      const_nrows = tensor_util.constant_value(nrows)
      if const_nrows is not None:
        # Both checks below need a statically known `nrows`, so they can fail
        # at construction time rather than when the graph runs.
        if const_nrows < 0:
          raise ValueError("Expected nrows >= 0; got %d" % const_nrows)
        const_rowids = tensor_util.constant_value(value_rowids)
        if const_rowids is not None and const_rowids.size > 0:
          if not const_nrows >= const_rowids[-1] + 1:
            raise ValueError(
                "Expected nrows >= value_rowids[-1] + 1; got nrows=%d, "
                "value_rowids[-1]=%d" % (const_nrows, const_rowids[-1]))

    value_rowids.shape.assert_has_rank(1)
    nrows.shape.assert_has_rank(0)

    if validate:
      msg = ("Arguments to from_value_rowids do not form a valid "
             "RowPartition")
      checks = [
          check_ops.assert_rank(value_rowids, 1, message=msg),
          check_ops.assert_rank(nrows, 0, message=msg),
          check_ops.assert_non_negative(value_rowids[:1], message=msg),
          _assert_monotonic_increasing(value_rowids, message=msg),
          check_ops.assert_less(value_rowids[-1:], nrows, message=msg),
      ]
      value_rowids = control_flow_ops.with_dependencies(checks, value_rowids)

    # Convert value_rowids & nrows to row_splits.
    # Note: we don't use segment_ids_to_row_splits() here because we want
    # to save the intermediate value `row_lengths`, so we can cache it.
    # TODO(b/116708836) Upgrade bincount to accept int64 so we can skip the
    # cast.
    value_rowids_int32 = math_ops.cast(value_rowids, dtypes.int32)
    nrows_int32 = math_ops.cast(nrows, dtypes.int32)
    row_lengths = bincount_ops.bincount(
        value_rowids_int32,
        minlength=nrows_int32,
        maxlength=nrows_int32,
        dtype=value_rowids.dtype)
    row_splits = array_ops.concat([[0], math_ops.cumsum(row_lengths)], axis=0)
    if const_nrows is not None:
      # Record the static sizes when the row count is known up front.
      row_lengths.set_shape([const_nrows])
      row_splits.set_shape([const_nrows + 1])

    return cls(
        row_splits=row_splits,
        row_lengths=row_lengths,
        value_rowids=value_rowids,
        nrows=nrows,
        internal=_row_partition_factory_key)
def test_negative(self):
  """Checks that a negative input value is rejected by `bincount` on CPU."""
  values_with_negative = [1, 2, 3, -1, 6, 8]
  # unsorted_segment_sum will only report InvalidArgumentError on CPU
  with self.cached_session():
    with ops.device("/CPU:0"):
      with self.assertRaises(errors.InvalidArgumentError):
        self.evaluate(bincount_ops.bincount(values_with_negative))
def test_zero_weights(self):
  """Checks that all-zero weights produce an all-zero `bincount` result."""
  values = np.arange(1000)
  zero_weights = np.zeros(1000)
  with self.session():
    counts = self.evaluate(bincount_ops.bincount(values, zero_weights))
    self.assertAllEqual(counts, np.zeros(1000))
def segment_ids_to_row_splits(segment_ids,
                              num_segments=None,
                              out_type=None,
                              name=None):
  """Builds the RaggedTensor `row_splits` vector for a segmentation.

  The result `splits` satisfies `splits[0] = 0` and
  `splits[i] = splits[i-1] + count(segment_ids==i)`.

  Example:

  >>> print(tf.ragged.segment_ids_to_row_splits([0, 0, 0, 2, 2, 3, 4, 4, 4]))
  tf.Tensor([0 3 3 5 6 9], shape=(6,), dtype=int64)

  Args:
    segment_ids: A 1-D integer Tensor.
    num_segments: A scalar integer indicating the number of segments.  Defaults
      to `max(segment_ids) + 1` (or zero if `segment_ids` is empty).
    out_type: The dtype for the return value.  When omitted, the dtype of
      `segment_ids` is used if it is a Tensor, else the dtype of
      `num_segments` if that is a Tensor, else `tf.int64`.
    name: A name prefix for the returned tensor (optional).

  Returns:
    A sorted 1-D integer Tensor, with `shape=[num_segments + 1]`.
  """
  # Local import bincount_ops to avoid import-cycle.
  from tensorflow.python.ops import bincount_ops  # pylint: disable=g-import-not-at-top

  # Resolve the output dtype before any argument is converted below.
  if out_type is not None:
    out_type = dtypes.as_dtype(out_type)
  elif isinstance(segment_ids, ops.Tensor):
    out_type = segment_ids.dtype
  elif isinstance(num_segments, ops.Tensor):
    out_type = num_segments.dtype
  else:
    out_type = dtypes.int64

  with ops.name_scope(name, "SegmentIdsToRaggedSplits", [segment_ids]):
    # Note: we cast int64 tensors to int32, since bincount currently only
    # supports int32 inputs.
    segment_ids = ragged_util.convert_to_int_tensor(
        segment_ids, "segment_ids", dtype=dtypes.int32)
    segment_ids.shape.assert_has_rank(1)
    if num_segments is not None:
      num_segments = ragged_util.convert_to_int_tensor(
          num_segments, "num_segments", dtype=dtypes.int32)
      num_segments.shape.assert_has_rank(0)

    # Per-segment lengths, then a running total with a leading zero.
    segment_lengths = bincount_ops.bincount(
        segment_ids,
        minlength=num_segments,
        maxlength=num_segments,
        dtype=out_type)
    running_total = math_ops.cumsum(segment_lengths)
    splits = array_ops.concat([[0], running_total], axis=0)

    # Update shape information, if possible.
    static_num_segments = None
    if num_segments is not None:
      static_num_segments = tensor_util.constant_value(num_segments)
    if static_num_segments is not None:
      splits.set_shape(tensor_shape.TensorShape([static_num_segments + 1]))

    return splits