def testSampleFromDatasets(self):
  cluster = data_service_test_base.TestCluster(num_workers=3)
  random_seed.set_random_seed(1619)
  num_samples = 5000
  rand_probs = np.random.random_sample((5,))
  rand_probs = rand_probs / rand_probs.sum()
  # Use chi-squared test to assert that the observed distribution matches the
  # expected distribution. Based on the implementation in
  # "third_party/tensorflow/python/kernel_tests/multinomial_op_test.py".
  for weights in [[.85, .05, .1], rand_probs, [1.]]:
    classes = len(weights)

    # Create a dataset that samples each integer in `[0, num_datasets)`
    # with probability given by `weights[i]`.
    ds = interleave_ops.sample_from_datasets([
        dataset_ops.Dataset.from_tensors(i).repeat() for i in range(classes)
    ], weights)
    ds = self.make_distributed_dataset(
        ds, cluster, processing_mode="distributed_epoch")
    ds = ds.take(num_samples)

    freqs = np.zeros([classes])
    for v in self.getDatasetOutput(ds):
      freqs[v] += 1

    expected = np.asarray(weights)
    actual = np.asarray(freqs / num_samples)
    diff = actual - expected
    chi2 = np.sum(diff * diff / expected, axis=0)
    self.assertLess(chi2, 1e-2)
def testSampleFromDatasets(self, weights_as_dataset):
  random_seed.set_random_seed(1619)
  num_samples = 5000
  rand_probs = self._normalize(np.random.random_sample((5,)))
  # Use chi-squared test to assert that the observed distribution matches the
  # expected distribution. Based on the implementation in
  # "third_party/tensorflow/python/kernel_tests/multinomial_op_test.py".
  for probs in [[.85, .05, .1], rand_probs, [1.]]:
    weights = np.asarray(probs)
    if weights_as_dataset:
      weights = dataset_ops.Dataset.from_tensors(weights).repeat()

    classes = len(probs)
    # Create a dataset that samples each integer in `[0, num_datasets)`
    # with probability given by `weights[i]`.
    dataset = interleave_ops.sample_from_datasets([
        dataset_ops.Dataset.from_tensors(i).repeat() for i in range(classes)
    ], weights)
    dataset = dataset.take(num_samples)
    next_element = self.getNext(dataset)

    freqs = np.zeros([classes])
    for _ in range(num_samples):
      freqs[self.evaluate(next_element())] += 1
    with self.assertRaises(errors.OutOfRangeError):
      self.evaluate(next_element())

    self.assertLess(self._chi2(probs, freqs / num_samples), 1e-2)
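# `_normalize` and `_chi2` are helpers on the test class that are not shown
# in this excerpt. Minimal sketches of what they plausibly do, inferred from
# the inline computation in the distributed test above (illustrative, not
# verbatim from the source):
def _normalize(self, vec):
  # Scale a vector of non-negative values so that it sums to 1.
  return vec / vec.sum()

def _chi2(self, expected, actual):
  # Chi-squared statistic between an expected and an observed distribution:
  # sum_i (actual_i - expected_i)^2 / expected_i.
  expected = np.asarray(expected)
  actual = np.asarray(actual)
  diff = actual - expected
  return np.sum(diff * diff / expected, axis=0)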
def testSampleFromDatasetsNested(self):
  ds1 = dataset_ops.Dataset.range(10).window(2)
  ds2 = dataset_ops.Dataset.range(10, 20).window(2)
  ds = interleave_ops.sample_from_datasets([ds1, ds2], weights=[0.3, 0.7])
  ds = ds.flat_map(lambda x: x)
  next_element = self.getNext(ds)
  self.evaluate(next_element())
def _build_dataset(self, probs, num_samples):
  dataset = interleave_ops.sample_from_datasets(
      [
          dataset_ops.Dataset.from_tensors(i).repeat(None)
          for i in range(len(probs))
      ],
      probs,
      seed=1813)
  return dataset.take(num_samples)
def testSampleFromEmptyDataset(self, weights_type):
  weights = _get_weights_of_type(np.asarray([1., 0.]), weights_type)
  datasets = [
      dataset_ops.Dataset.range(0),
      dataset_ops.Dataset.range(1).repeat()
  ]
  sample_dataset = interleave_ops.sample_from_datasets(
      datasets, weights=weights, stop_on_empty_dataset=True)
  self.assertDatasetProduces(sample_dataset, [])
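# `_get_weights_of_type` is a module-level helper not included in this
# excerpt. A plausible sketch, assuming the parameterized tests exercise
# weights passed as a Python list, a tensor, and a dataset (the string
# values of `weights_type` are assumptions):
def _get_weights_of_type(weights, weights_type):
  if weights_type == "list":
    return weights.tolist()
  if weights_type == "tensor":
    return ops.convert_to_tensor(weights, name="weights")
  # Otherwise, wrap the weights in a repeated dataset, matching the
  # `weights_as_dataset` pattern used by the other tests in this file.
  return dataset_ops.Dataset.from_tensors(weights).repeat()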
def testErrors(self):
  with self.assertRaisesRegex(ValueError, r"must have the same length"):
    interleave_ops.sample_from_datasets(
        [dataset_ops.Dataset.range(10),
         dataset_ops.Dataset.range(20)],
        weights=[0.25, 0.25, 0.25, 0.25])

  with self.assertRaisesRegex(TypeError, "`tf.float32` or `tf.float64`"):
    interleave_ops.sample_from_datasets(
        [dataset_ops.Dataset.range(10),
         dataset_ops.Dataset.range(20)],
        weights=[1, 1])

  with self.assertRaisesRegex(TypeError, "must have the same type"):
    interleave_ops.sample_from_datasets([
        dataset_ops.Dataset.from_tensors(0),
        dataset_ops.Dataset.from_tensors(0.0)
    ])

  with self.assertRaisesRegex(
      ValueError, r"`datasets` must be a non-empty list of datasets."):
    interleave_ops.sample_from_datasets(datasets=[], weights=[])

  with self.assertRaisesRegex(TypeError, "tf.int64"):
    interleave_ops.choose_from_datasets(
        [
            dataset_ops.Dataset.from_tensors(0),
            dataset_ops.Dataset.from_tensors(1)
        ],
        choice_dataset=dataset_ops.Dataset.from_tensors(1.0))

  with self.assertRaisesRegex(TypeError, "scalar"):
    interleave_ops.choose_from_datasets(
        [
            dataset_ops.Dataset.from_tensors(0),
            dataset_ops.Dataset.from_tensors(1)
        ],
        choice_dataset=dataset_ops.Dataset.from_tensors([1.0]))

  with self.assertRaisesRegex(errors.InvalidArgumentError, "out of range"):
    dataset = interleave_ops.choose_from_datasets(
        [dataset_ops.Dataset.from_tensors(0)],
        choice_dataset=dataset_ops.Dataset.from_tensors(
            constant_op.constant(1, dtype=dtypes.int64)))
    next_element = self.getNext(dataset)
    self.evaluate(next_element())

  with self.assertRaisesRegex(
      ValueError, r"`datasets` must be a non-empty list of datasets."):
    interleave_ops.choose_from_datasets(
        datasets=[], choice_dataset=dataset_ops.Dataset.from_tensors(1.0))

  with self.assertRaisesRegex(
      TypeError, r"`choice_dataset` must be a dataset of scalar"):
    interleave_ops.choose_from_datasets(
        [
            dataset_ops.Dataset.from_tensors(0),
            dataset_ops.Dataset.from_tensors(1)
        ],
        choice_dataset=None)
def _apply_fn(dataset):
  """Function from `Dataset` to `Dataset` that applies the transformation."""
  target_dist_t = ops.convert_to_tensor(target_dist, name="target_dist")
  target_dist_t = math_ops.cast(target_dist_t, dtypes.float32)

  # Get initial distribution.
  if initial_dist is not None:
    initial_dist_t = ops.convert_to_tensor(initial_dist, name="initial_dist")
    initial_dist_t = math_ops.cast(initial_dist_t, dtypes.float32)

    acceptance_dist, prob_of_original = (
        _calculate_acceptance_probs_with_mixing(initial_dist_t,
                                                target_dist_t))
    initial_dist_ds = dataset_ops.Dataset.from_tensors(
        initial_dist_t).repeat()
    acceptance_dist_ds = dataset_ops.Dataset.from_tensors(
        acceptance_dist).repeat()
    prob_of_original_ds = dataset_ops.Dataset.from_tensors(
        prob_of_original).repeat()
  else:
    initial_dist_ds = _estimate_initial_dist_ds(target_dist_t,
                                                dataset.map(class_func))
    acceptance_and_original_prob_ds = initial_dist_ds.map(
        lambda initial: _calculate_acceptance_probs_with_mixing(  # pylint: disable=g-long-lambda
            initial, target_dist_t))
    acceptance_dist_ds = acceptance_and_original_prob_ds.map(
        lambda accept_prob, _: accept_prob)
    prob_of_original_ds = acceptance_and_original_prob_ds.map(
        lambda _, prob_original: prob_original)

  filtered_ds = _filter_ds(dataset, acceptance_dist_ds, initial_dist_ds,
                           class_func, seed)
  # Prefetch filtered dataset for speed.
  filtered_ds = filtered_ds.prefetch(3)

  prob_original_static = _get_prob_original_static(
      initial_dist_t, target_dist_t) if initial_dist is not None else None

  def add_class_value(*x):
    if len(x) == 1:
      return class_func(*x), x[0]
    else:
      return class_func(*x), x

  if prob_original_static == 1:
    return dataset.map(add_class_value)
  elif prob_original_static == 0:
    return filtered_ds
  else:
    return interleave_ops.sample_from_datasets(
        [dataset.map(add_class_value), filtered_ds],
        weights=prob_of_original_ds.map(lambda prob: [(prob, 1.0 - prob)]),
        seed=seed,
        stop_on_empty_dataset=True)
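# Usage sketch for the transformation built by `_apply_fn` above. Assuming
# this inner function backs a rejection-resampling transform along the lines
# of `tf.data.experimental.rejection_resample` (an assumption about the
# enclosing API; the 90/10 class split is illustrative), a caller resamples
# an imbalanced dataset toward a target class distribution:
#
#   resampled = dataset.apply(
#       tf.data.experimental.rejection_resample(
#           class_func=lambda features, label: label,
#           target_dist=[0.5, 0.5],
#           initial_dist=[0.9, 0.1]))
#
# Each output element is a `(class_value, original_element)` pair, matching
# the `add_class_value` mapping in the function body.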
def testSampleFromDatasetsWithZeroWeight(self, weights_type):
  # Sampling stops when the second dataset is exhausted.
  weights = _get_weights_of_type(np.asarray([0., 1.]), weights_type)
  datasets = [
      dataset_ops.Dataset.from_tensors(-1).repeat(2),
      dataset_ops.Dataset.from_tensors(1).repeat(2)
  ]
  sample_dataset = interleave_ops.sample_from_datasets(
      datasets, weights=weights, stop_on_empty_dataset=True)
  self.assertDatasetProduces(sample_dataset, [1, 1])
def testSampleFromDatasetsSkippingDatasetsWithZeroWeight(self):
  # Sampling skips the first dataset.
  weights = np.asarray([0., 1.])
  datasets = [
      dataset_ops.Dataset.from_tensors(-1).repeat(),
      dataset_ops.Dataset.from_tensors(1)
  ]
  sample_dataset = interleave_ops.sample_from_datasets(
      datasets, weights=weights, stop_on_empty_dataset=False)
  self.assertDatasetProduces(sample_dataset, [1])
def testSampleFromDatasetsAllWeightsAreZero(self):
  # Sampling skips both datasets.
  weights = np.asarray([0., 0.])
  datasets = [
      dataset_ops.Dataset.from_tensors(-1).repeat(),
      dataset_ops.Dataset.from_tensors(1).repeat()
  ]
  sample_dataset = interleave_ops.sample_from_datasets(
      datasets, weights=weights, stop_on_empty_dataset=False)
  self.assertDatasetProduces(sample_dataset, [])
def testSampleFromEmptyDataset(self, weights_as_dataset):
  weights = np.asarray([1., 0.])
  if weights_as_dataset:
    weights = dataset_ops.Dataset.from_tensors(weights).repeat()

  datasets = [
      dataset_ops.Dataset.from_tensors(-1).skip(5),
      dataset_ops.Dataset.from_tensors(1).repeat()
  ]
  sample_dataset = interleave_ops.sample_from_datasets(
      datasets, weights=weights, stop_on_empty_dataset=True)
  self.assertDatasetProduces(sample_dataset, [])
def testErrors(self):
  with self.assertRaisesRegexp(ValueError,
                               r"vector of length `len\(datasets\)`"):
    interleave_ops.sample_from_datasets(
        [dataset_ops.Dataset.range(10),
         dataset_ops.Dataset.range(20)],
        weights=[0.25, 0.25, 0.25, 0.25])

  with self.assertRaisesRegexp(TypeError, "`tf.float32` or `tf.float64`"):
    interleave_ops.sample_from_datasets(
        [dataset_ops.Dataset.range(10),
         dataset_ops.Dataset.range(20)],
        weights=[1, 1])

  with self.assertRaisesRegexp(TypeError, "must have the same type"):
    interleave_ops.sample_from_datasets([
        dataset_ops.Dataset.from_tensors(0),
        dataset_ops.Dataset.from_tensors(0.0)
    ])

  with self.assertRaisesRegexp(TypeError, "tf.int64"):
    interleave_ops.choose_from_datasets(
        [
            dataset_ops.Dataset.from_tensors(0),
            dataset_ops.Dataset.from_tensors(1)
        ],
        choice_dataset=dataset_ops.Dataset.from_tensors(1.0))

  with self.assertRaisesRegexp(TypeError, "scalar"):
    interleave_ops.choose_from_datasets(
        [
            dataset_ops.Dataset.from_tensors(0),
            dataset_ops.Dataset.from_tensors(1)
        ],
        choice_dataset=dataset_ops.Dataset.from_tensors([1.0]))
def testSampleFromDatasetsStoppingOnEmptyDataset(self, weights_type):
  # Sampling stops when the first dataset is exhausted.
  weights = _get_weights_of_type(np.asarray([.5, .1, .4]), weights_type)
  datasets = [
      dataset_ops.Dataset.from_tensors(np.int64(-1)),
      dataset_ops.Dataset.from_tensors(np.int64(1)).repeat(),
      dataset_ops.Dataset.range(10).repeat()
  ]
  sample_dataset = interleave_ops.sample_from_datasets(
      datasets, weights=weights, stop_on_empty_dataset=True)

  samples_list = self.getIteratorOutput(self.getNext(sample_dataset))
  self.assertEqual(samples_list.count(-1), 1)
def testSampleFromDatasetsSkippingEmptyDataset(self, weights_type):
  # Sampling skips the first dataset after it becomes empty.
  weights = _get_weights_of_type(np.asarray([.5, .1, .4]), weights_type)
  datasets = [
      dataset_ops.Dataset.from_tensors(np.int64(-1)),
      dataset_ops.Dataset.from_tensors(np.int64(1)).repeat(),
      dataset_ops.Dataset.range(10).repeat()
  ]
  sample_dataset = interleave_ops.sample_from_datasets(
      datasets, weights=weights, stop_on_empty_dataset=False).take(100)

  samples_list = self.getIteratorOutput(self.getNext(sample_dataset))
  self.assertLen(samples_list, 100)
  self.assertEqual(samples_list.count(-1), 1)
def testSampleFromDatasetsWithZeroWeight(self, weights_as_dataset):
  weights = np.asarray([0., 1.])
  if weights_as_dataset:
    weights = dataset_ops.Dataset.from_tensors(weights).repeat()

  # Sampling stops when the second dataset is exhausted.
  datasets = [
      dataset_ops.Dataset.from_tensors(-1).repeat(2),
      dataset_ops.Dataset.from_tensors(1).repeat(2)
  ]
  sample_dataset = interleave_ops.sample_from_datasets(
      datasets, weights=weights, stop_on_empty_dataset=True)

  samples_list = self.getIteratorOutput(self.getNext(sample_dataset))
  self.assertEqual(samples_list, [1, 1])
def _testSampleFromDatasetsHelper(self, weights, num_datasets, num_samples):
  # Create a dataset that samples each integer in `[0, num_datasets)`
  # with probability given by `weights[i]`.
  dataset = interleave_ops.sample_from_datasets([
      dataset_ops.Dataset.from_tensors(i).repeat(None)
      for i in range(num_datasets)
  ], weights)
  dataset = dataset.take(num_samples)

  next_element = self.getNext(dataset)
  freqs = np.zeros([num_datasets])
  for _ in range(num_samples):
    freqs[self.evaluate(next_element())] += 1
  with self.assertRaises(errors.OutOfRangeError):
    self.evaluate(next_element())

  return freqs
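# Illustrative call of the helper above (an assumption about how tests in
# the class consume the returned frequencies; it mirrors the chi-squared
# check used by the `testSampleFromDatasets` variants earlier in this file):
#
#   freqs = self._testSampleFromDatasetsHelper([0.25, 0.25, 0.5], 3, 1000)
#   self.assertLess(self._chi2([0.25, 0.25, 0.5], freqs / 1000), 1e-2)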
def _apply_fn(dataset):
  """Function from `Dataset` to `Dataset` that applies the transformation."""
  target_dist_t = ops.convert_to_tensor(target_dist, name="target_dist")
  class_values_ds = dataset.map(class_func)

  # Get initial distribution.
  if initial_dist is not None:
    initial_dist_t = ops.convert_to_tensor(initial_dist, name="initial_dist")
    acceptance_dist, prob_of_original = (
        _calculate_acceptance_probs_with_mixing(initial_dist_t,
                                                target_dist_t))
    initial_dist_ds = dataset_ops.Dataset.from_tensors(
        initial_dist_t).repeat()
    acceptance_dist_ds = dataset_ops.Dataset.from_tensors(
        acceptance_dist).repeat()
    prob_of_original_ds = dataset_ops.Dataset.from_tensors(
        prob_of_original).repeat()
  else:
    initial_dist_ds = _estimate_initial_dist_ds(target_dist_t,
                                                class_values_ds)
    acceptance_and_original_prob_ds = initial_dist_ds.map(
        lambda initial: _calculate_acceptance_probs_with_mixing(  # pylint: disable=g-long-lambda
            initial, target_dist_t))
    acceptance_dist_ds = acceptance_and_original_prob_ds.map(
        lambda accept_prob, _: accept_prob)
    prob_of_original_ds = acceptance_and_original_prob_ds.map(
        lambda _, prob_original: prob_original)

  filtered_ds = _filter_ds(dataset, acceptance_dist_ds, initial_dist_ds,
                           class_values_ds, seed)
  # Prefetch filtered dataset for speed.
  filtered_ds = filtered_ds.prefetch(3)

  prob_original_static = _get_prob_original_static(
      initial_dist_t, target_dist_t) if initial_dist is not None else None

  if prob_original_static == 1:
    return dataset_ops.Dataset.zip((class_values_ds, dataset))
  elif prob_original_static == 0:
    return filtered_ds
  else:
    return interleave_ops.sample_from_datasets(
        [dataset_ops.Dataset.zip((class_values_ds, dataset)), filtered_ds],
        weights=prob_of_original_ds.map(lambda prob: [(prob, 1.0 - prob)]),
        seed=seed)
def _testSampleFromDatasetsHelper(self, weights, num_datasets, num_samples):
  # Create a dataset that samples each integer in `[0, num_datasets)`
  # with probability given by `weights[i]`.
  dataset = interleave_ops.sample_from_datasets([
      dataset_ops.Dataset.from_tensors(i).repeat(None)
      for i in range(num_datasets)
  ], weights)
  dataset = dataset.take(num_samples)
  iterator = dataset.make_one_shot_iterator()
  next_element = iterator.get_next()

  with self.cached_session() as sess:
    freqs = np.zeros([num_datasets])
    for _ in range(num_samples):
      freqs[sess.run(next_element)] += 1
    with self.assertRaises(errors.OutOfRangeError):
      sess.run(next_element)

  return freqs
def testSampleFromDatasets(self):
  cluster = data_service_test_base.TestCluster(num_workers=3)
  num_samples = 200
  weights = [.6, .3, .1]
  classes = len(weights)

  # Create a dataset that samples each integer in `[0, num_datasets)`
  # with probability given by `weights[i]`.
  ds = interleave_ops.sample_from_datasets(
      [dataset_ops.Dataset.from_tensors(i).repeat() for i in range(classes)],
      weights)
  ds = self._make_dynamic_sharding_dataset(ds, cluster)
  ds = ds.take(num_samples)

  freqs = np.zeros([classes])
  for v in self.getDatasetOutput(ds):
    freqs[v] += 1

  self.assertGreater(freqs[0], freqs[1])
  self.assertGreater(freqs[1], freqs[2])
def sample_from_datasets(datasets, weights=None, seed=None):
  """Samples elements at random from the datasets in `datasets`.

  Args:
    datasets: A list of `tf.data.Dataset` objects with compatible structure.
    weights: (Optional.) A list of `len(datasets)` floating-point values where
      `weights[i]` represents the probability with which an element should be
      sampled from `datasets[i]`, or a `tf.data.Dataset` object where each
      element is such a list. Defaults to a uniform distribution across
      `datasets`.
    seed: (Optional.) A `tf.int64` scalar `tf.Tensor`, representing the random
      seed that will be used to create the distribution. See
      `tf.set_random_seed` for behavior.

  Returns:
    A dataset that interleaves elements from `datasets` at random, according
    to `weights` if provided, otherwise with uniform probability.

  Raises:
    TypeError: If the `datasets` or `weights` arguments have the wrong type.
    ValueError: If the `weights` argument is specified and does not match the
      length of the `datasets` element.
  """
  return interleave_ops.sample_from_datasets(datasets, weights, seed)
def sample_from_datasets(datasets, weights=None, seed=None):
  """Samples elements at random from the datasets in `datasets`.

  Args:
    datasets: A list of `tf.data.Dataset` objects with compatible structure.
    weights: (Optional.) A list of `len(datasets)` floating-point values where
      `weights[i]` represents the probability with which an element should be
      sampled from `datasets[i]`, or a `tf.data.Dataset` object where each
      element is such a list. Defaults to a uniform distribution across
      `datasets`.
    seed: (Optional.) A `tf.int64` scalar `tf.Tensor`, representing the random
      seed that will be used to create the distribution. See
      `tf.compat.v1.set_random_seed` for behavior.

  Returns:
    A dataset that interleaves elements from `datasets` at random, according
    to `weights` if provided, otherwise with uniform probability.

  Raises:
    TypeError: If the `datasets` or `weights` arguments have the wrong type.
    ValueError: If the `weights` argument is specified and does not match the
      length of the `datasets` element.
  """
  return interleave_ops.sample_from_datasets(datasets, weights, seed)
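# Usage sketch for `sample_from_datasets` (illustrative; the 80/20 weighting
# and variable names are not from the source, and the comment describes the
# expected distribution rather than a fixed output sequence):
#
#   ds1 = dataset_ops.Dataset.from_tensors(0).repeat()
#   ds2 = dataset_ops.Dataset.from_tensors(1).repeat()
#   # Roughly 80% of the sampled elements come from `ds1`.
#   mixed = sample_from_datasets([ds1, ds2], weights=[0.8, 0.2], seed=42)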
def testSampleFromDatasetsCardinality(self):
  ds1 = dataset_ops.Dataset.from_tensors([1.0]).repeat()
  ds2 = dataset_ops.Dataset.from_tensors([2.0]).repeat()
  ds = interleave_ops.sample_from_datasets([ds1, ds2])
  self.assertEqual(self.evaluate(ds.cardinality()), dataset_ops.INFINITE)