def testDistribution(self, initial_known):
  """Checks that resampled classes match `target_dist` within 1e-2.

  Also verifies that the string payload carried with every element still
  corresponds to the class the resampler reported for it.

  Args:
    initial_known: When truthy, a uniform `initial_dist` is handed to the
      resampler; otherwise `initial_dist` is `None`.
  """
  raw_classes = np.random.randint(5, size=(20000,))  # Uniformly sampled
  target_dist = [0.9, 0.05, 0.05, 0.0, 0.0]
  if initial_known:
    initial_dist = [0.2] * 5
  else:
    initial_dist = None
  # The int64 cast is needed for the Windows build.
  classes = math_ops.cast(raw_classes, dtypes.int64)

  # Pair every class id with its string form so the payload can be checked.
  dataset = dataset_ops.Dataset.from_tensor_slices(classes)
  dataset = dataset.shuffle(200, seed=21)
  dataset = dataset.map(lambda c: (c, string_ops.as_string(c)))
  dataset = dataset.repeat()

  resampler = resampling.rejection_resample(
      target_dist=target_dist,
      initial_dist=initial_dist,
      class_func=lambda c, _: c,
      seed=27)
  get_next = self.getNext(dataset.apply(resampler))

  # Pull exactly 4000 elements from the (repeated) resampled dataset.
  returned = [self.evaluate(get_next()) for _ in range(4000)]

  returned_classes, returned_classes_and_data = zip(*returned)
  _, returned_data = zip(*returned_classes_and_data)
  expected_data = [compat.as_bytes(str(c)) for c in returned_classes]
  self.assertAllEqual(expected_data, returned_data)

  total_returned = len(returned_classes)
  class_counts = np.array([
      sum(1 for v in returned_classes if v == c) for c in range(5)
  ])
  returned_dist = class_counts / total_returned
  self.assertAllClose(target_dist, returned_dist, atol=1e-2)
def benchmark_resample_performance(self):
  """Benchmarks rejection resampling from a uniform to a one-class target."""
  # We don't need many samples to test a dirac-delta target distribution
  num_samples = 1000
  init_dist = [0.25, 0.25, 0.25, 0.25]
  target_dist = [0.0, 0.0, 0.0, 1.0]
  num_classes = len(init_dist)
  data_np = np.random.choice(num_classes, num_samples, p=init_dist)

  # Reshape distribution via rejection sampling.
  resampler = resampling.rejection_resample(
      class_func=lambda x: x,
      target_dist=target_dist,
      initial_dist=init_dist,
      seed=142)
  source = dataset_ops.Dataset.from_tensor_slices(data_np).repeat()
  dataset = source.apply(resampler)

  # Run with the default optimizations switched off.
  options = dataset_ops.Options()
  options.experimental_optimization.apply_default_optimizations = False
  dataset = dataset.with_options(options)

  wall_time = self.run_benchmark(
      dataset=dataset, num_elements=num_samples, iters=10, warmup=True)
  # The value from run_benchmark is scaled by the sample count for reporting.
  resample_time = wall_time * num_samples

  self.report_benchmark(
      iters=10,
      wall_time=resample_time,
      name="resample_{}".format(num_samples))
def testDistribution(self, initial_known):
  """Verifies the empirical class distribution after rejection resampling.

  Draws 4000 resampled elements, confirms each element's string payload
  matches its reported class, and asserts the observed class frequencies are
  within 1e-2 of `target_dist`.

  Args:
    initial_known: If truthy, the resampler receives a uniform
      `initial_dist`; if falsy it receives `None`.
  """
  sampled = np.random.randint(5, size=(20000,))  # Uniformly sampled
  target_dist = [0.9, 0.05, 0.05, 0.0, 0.0]
  initial_dist = None
  if initial_known:
    initial_dist = [0.2] * 5
  classes = math_ops.cast(sampled, dtypes.int64)  # needed for Windows build.

  shuffled = dataset_ops.Dataset.from_tensor_slices(classes).shuffle(
      200, seed=21)
  labelled = shuffled.map(lambda c: (c, string_ops.as_string(c))).repeat()
  resampled = labelled.apply(
      resampling.rejection_resample(
          target_dist=target_dist,
          initial_dist=initial_dist,
          class_func=lambda c, _: c,
          seed=27))
  get_next = self.getNext(resampled)

  returned = []
  for _ in range(4000):
    returned.append(self.evaluate(get_next()))

  # Each element is (class, (class, string_data)); peel off the pieces.
  returned_classes, returned_classes_and_data = zip(*returned)
  _, returned_data = zip(*returned_classes_and_data)
  self.assertAllEqual(
      [compat.as_bytes(str(c)) for c in returned_classes], returned_data)

  total_returned = len(returned_classes)
  per_class = []
  for c in range(5):
    per_class.append(sum(1 for v in returned_classes if v == c))
  class_counts = np.array(per_class)
  returned_dist = class_counts / total_returned
  self.assertAllClose(target_dist, returned_dist, atol=1e-2)
def testOtherDtypes(self, target_dtype, init_dtype):
  """Smoke-tests resampling when the distributions use the given dtypes.

  Args:
    target_dtype: NumPy dtype used to build `target_dist`.
    init_dtype: NumPy dtype used to build `initial_dist`, or `None` to pass
      no initial distribution at all.
  """
  target_dist = np.array([0.5, 0.5], dtype=target_dtype)
  init_dist = (
      None if init_dtype is None else np.array([0.5, 0.5], dtype=init_dtype))

  dataset = dataset_ops.Dataset.range(10)
  dataset = dataset.apply(
      resampling.rejection_resample(
          class_func=lambda x: x % 2,
          target_dist=target_dist,
          initial_dist=init_dist))

  # Producing a single element is enough; no value is asserted.
  get_next = self.getNext(dataset)
  self.evaluate(get_next())
def _time_resampling(test_obj, data_np, target_dist, init_dist, num_to_sample):
  """Times pulling `num_to_sample` elements from a resampled dataset.

  Args:
    test_obj: Object providing `test_session()`, used to run the iterator.
    data_np: Data passed to `Dataset.from_tensor_slices`; each element is its
      own class (the class function is the identity).
    target_dist: Target distribution forwarded to the resampler.
    init_dist: Initial distribution forwarded to the resampler.
    num_to_sample: Number of `sess.run` calls to time.

  Returns:
    Elapsed `time.time()` seconds spent in the sampling loop.
  """
  # Reshape distribution via rejection sampling.
  resampler = resampling.rejection_resample(
      class_func=lambda x: x,
      target_dist=target_dist,
      initial_dist=init_dist,
      seed=142)
  dataset = dataset_ops.Dataset.from_tensor_slices(data_np).repeat()
  dataset = dataset.apply(resampler)

  next_element = dataset_ops.make_one_shot_iterator(dataset).get_next()

  with test_obj.test_session() as sess:
    began = time.time()
    for _ in xrange(num_to_sample):
      sess.run(next_element)
    elapsed = time.time() - began

  return elapsed
def rejection_resample(class_func, target_dist, initial_dist=None, seed=None):
  """Builds a transformation that resamples a dataset toward `target_dist`.

  **NOTE** The resampling is done by rejection sampling, so some fraction of
  the input elements will be dropped.

  Args:
    class_func: A function that maps an element of the input dataset to a
      scalar `tf.int32` tensor whose value lies in `[0, num_classes)`.
    target_dist: A floating point type tensor, shaped `[num_classes]`.
    initial_dist: (Optional.) A floating point type tensor, shaped
      `[num_classes]`. When omitted, the true class distribution is estimated
      live in a streaming fashion.
    seed: (Optional.) Python integer seed for the resampler.

  Returns:
    A `Dataset` transformation function, which can be passed to
    `tf.data.Dataset.apply`.
  """
  # Pure delegation: every argument is forwarded positionally, unchanged.
  transformation = resampling.rejection_resample(
      class_func, target_dist, initial_dist, seed)
  return transformation
def _time_resampling(
    test_obj, data_np, target_dist, init_dist, num_to_sample):
  """Measures how long it takes to draw samples from a resampled dataset.

  Args:
    test_obj: Object providing `test_session()`, used to run the iterator.
    data_np: Data passed to `Dataset.from_tensor_slices`; each element is its
      own class (the class function is the identity).
    target_dist: Target distribution forwarded to the resampler.
    init_dist: Initial distribution forwarded to the resampler.
    num_to_sample: Number of `sess.run` calls to time.

  Returns:
    Elapsed `time.time()` seconds spent in the sampling loop.
  """
  dataset = dataset_ops.Dataset.from_tensor_slices(data_np).repeat()

  # Reshape distribution via rejection sampling.
  dataset = dataset.apply(
      resampling.rejection_resample(
          class_func=lambda x: x,
          target_dist=target_dist,
          initial_dist=init_dist,
          seed=142))

  # Use the module-level helper rather than the deprecated
  # `Dataset.make_one_shot_iterator()` method.
  get_next = dataset_ops.make_one_shot_iterator(dataset).get_next()

  with test_obj.test_session() as sess:
    start_time = time.time()
    for _ in xrange(num_to_sample):
      sess.run(get_next)
    end_time = time.time()

  return end_time - start_time
def rejection_resample(class_func, target_dist, initial_dist=None, seed=None):
  """Returns a dataset transformation that resamples to a target distribution.

  **NOTE** Rejection sampling is used to perform the resampling, which means
  that some fraction of the input values will be dropped.

  Args:
    class_func: A function mapping an element of the input dataset to a scalar
      `tf.int32` tensor. Values should be in `[0, num_classes)`.
    target_dist: A floating point type tensor, shaped `[num_classes]`.
    initial_dist: (Optional.) A floating point type tensor, shaped
      `[num_classes]`. If it is not provided, the true class distribution is
      estimated live in a streaming fashion.
    seed: (Optional.) Python integer seed for the resampler.

  Returns:
    A `Dataset` transformation function, which can be passed to
    `tf.data.Dataset.apply`.
  """
  # Thin wrapper: hand all four arguments straight to the implementation.
  forwarded_args = (class_func, target_dist, initial_dist, seed)
  return resampling.rejection_resample(*forwarded_args)
def testExhaustion(self):
  """Drains a finite resampled dataset and checks its class frequencies."""
  init_dist = [0.5, 0.5]
  target_dist = [0.9, 0.1]

  # Classes alternate 0/1 over the range, via `x % 2`.
  dataset = dataset_ops.Dataset.range(10000).apply(
      resampling.rejection_resample(
          class_func=lambda x: x % 2,
          target_dist=target_dist,
          initial_dist=init_dist))
  get_next = self.getNext(dataset)

  # Read until the dataset signals exhaustion with OutOfRangeError.
  returned = []
  with self.assertRaises(errors.OutOfRangeError):
    while True:
      returned.append(self.evaluate(get_next()))

  classes, _ = zip(*returned)
  counts = np.bincount(np.array(classes), minlength=len(init_dist))
  bincount = counts.astype(np.float32) / len(classes)
  self.assertAllClose(target_dist, bincount, atol=1e-2)
def testEdgeCasesSampleFromInitialDataset(self, only_initial_dist):
  """Runs resampling to exhaustion for two edge-case target distributions.

  Args:
    only_initial_dist: When truthy the target equals the initial distribution
      (`[0.5, 0.5]`); otherwise the target is `[0.0, 1.0]`.
  """
  init_dist = [0.5, 0.5]
  if only_initial_dist:
    target_dist = [0.5, 0.5]
  else:
    target_dist = [0.0, 1.0]
  num_classes = len(init_dist)

  # We don't need many samples to test that this works.
  num_samples = 100
  data_np = np.random.choice(num_classes, num_samples, p=init_dist)

  # Reshape distribution.
  resampler = resampling.rejection_resample(
      class_func=lambda x: x,
      target_dist=target_dist,
      initial_dist=init_dist)
  dataset = dataset_ops.Dataset.from_tensor_slices(data_np).apply(resampler)
  get_next = self.getNext(dataset)

  # Only reaching OutOfRangeError is checked; the elements are not inspected.
  returned = []
  with self.assertRaises(errors.OutOfRangeError):
    while True:
      returned.append(self.evaluate(get_next()))
def _time_resampling(data_np, target_dist, init_dist, num_to_sample):
  """Times pulling `num_to_sample` elements from a resampled dataset.

  Args:
    data_np: Data passed to `Dataset.from_tensor_slices`; each element is its
      own class (the class function is the identity).
    target_dist: Target distribution forwarded to the resampler.
    init_dist: Initial distribution forwarded to the resampler.
    num_to_sample: Number of `sess.run` calls to time.

  Returns:
    Elapsed `time.time()` seconds spent in the sampling loop.
  """
  # Reshape distribution via rejection sampling.
  resampler = resampling.rejection_resample(
      class_func=lambda x: x,
      target_dist=target_dist,
      initial_dist=init_dist,
      seed=142)
  dataset = dataset_ops.Dataset.from_tensor_slices(data_np).repeat()
  dataset = dataset.apply(resampler)

  # Run with the default optimizations switched off.
  options = dataset_ops.Options()
  options.experimental_optimization.apply_default_optimizations = False
  dataset = dataset.with_options(options)

  next_element = dataset_ops.make_one_shot_iterator(dataset).get_next()

  with session.Session() as sess:
    began = time.time()
    for _ in xrange(num_to_sample):
      sess.run(next_element)
    elapsed = time.time() - began

  return elapsed
def testEdgeCasesSampleFromInitialDataset(self, only_initial_dist):
  """Exercises resampling edge cases until the input dataset is exhausted.

  Args:
    only_initial_dist: If truthy, `target_dist` is identical to `init_dist`;
      if falsy, `target_dist` puts all mass on class 1.
  """
  init_dist = [0.5, 0.5]
  target_dist = [0.0, 1.0]
  if only_initial_dist:
    target_dist = [0.5, 0.5]
  num_classes = len(init_dist)
  num_samples = 100  # We don't need many samples to test that this works.

  data_np = np.random.choice(num_classes, num_samples, p=init_dist)
  dataset = dataset_ops.Dataset.from_tensor_slices(data_np)

  # Reshape distribution.
  reshape_fn = resampling.rejection_resample(
      class_func=lambda x: x,
      target_dist=target_dist,
      initial_dist=init_dist)
  dataset = dataset.apply(reshape_fn)

  get_next = self.getNext(dataset)

  # The loop can only end by raising, which assertRaises expects.
  returned = []
  with self.assertRaises(errors.OutOfRangeError):
    while True:
      element = self.evaluate(get_next())
      returned.append(element)
def _time_resampling(data_np, target_dist, init_dist, num_to_sample):
  """Measures how long it takes to draw samples from a resampled dataset.

  Args:
    data_np: Data passed to `Dataset.from_tensor_slices`; the identity
      function is used as the class function.
    target_dist: Target distribution forwarded to the resampler.
    init_dist: Initial distribution forwarded to the resampler.
    num_to_sample: How many times the next element is fetched while timing.

  Returns:
    The difference between `time.time()` after and before the fetch loop.
  """
  source = dataset_ops.Dataset.from_tensor_slices(data_np).repeat()

  # Reshape distribution via rejection sampling.
  resampled = source.apply(
      resampling.rejection_resample(
          class_func=lambda x: x,
          target_dist=target_dist,
          initial_dist=init_dist,
          seed=142))

  # Default optimizations are disabled for the timed dataset.
  options = dataset_ops.Options()
  options.experimental_optimization.apply_default_optimizations = False
  timed_dataset = resampled.with_options(options)

  iterator = dataset_ops.make_one_shot_iterator(timed_dataset)
  get_next = iterator.get_next()

  with session.Session() as sess:
    start_time = time.time()
    for _ in xrange(num_to_sample):
      sess.run(get_next)
    duration = time.time() - start_time

  return duration
def testRandomClasses(self):
  """Resamples randomly remapped classes onto a single-class target."""
  init_dist = [0.25, 0.25, 0.25, 0.25]
  target_dist = [0.0, 0.0, 0.0, 1.0]
  num_classes = len(init_dist)
  # We don't need many samples to test a dirac-delta target distribution.
  num_samples = 100
  data_np = np.random.choice(num_classes, num_samples, p=init_dist)

  # Apply a random mapping that preserves the data distribution.
  def _random_class(_):
    scaled = random_ops.random_uniform([1]) * num_classes
    return math_ops.cast(scaled, dtypes.int32)[0]

  dataset = dataset_ops.Dataset.from_tensor_slices(data_np).map(_random_class)

  # Reshape distribution.
  resampler = resampling.rejection_resample(
      class_func=lambda x: x,
      target_dist=target_dist,
      initial_dist=init_dist)
  dataset = dataset.apply(resampler)

  get_next = dataset_ops.make_one_shot_iterator(dataset).get_next()

  with self.cached_session() as sess:
    # Read until the iterator raises OutOfRangeError.
    returned = []
    with self.assertRaises(errors.OutOfRangeError):
      while True:
        returned.append(sess.run(get_next))

  classes, _ = zip(*returned)
  counts = np.bincount(np.array(classes), minlength=num_classes)
  bincount = counts.astype(np.float32) / len(classes)
  self.assertAllClose(target_dist, bincount, atol=1e-2)
def testRandomClasses(self):
  """Resamples randomly remapped classes onto a single-class target.

  The input elements are replaced by random class ids drawn inside the
  dataset pipeline, then resampled toward a target that puts all mass on the
  last class. The observed class frequencies must match the target within
  1e-2.
  """
  init_dist = [0.25, 0.25, 0.25, 0.25]
  target_dist = [0.0, 0.0, 0.0, 1.0]
  num_classes = len(init_dist)
  # We don't need many samples to test a dirac-delta target distribution.
  num_samples = 100
  data_np = np.random.choice(num_classes, num_samples, p=init_dist)

  dataset = dataset_ops.Dataset.from_tensor_slices(data_np)

  # Apply a random mapping that preserves the data distribution.
  def _remap_fn(_):
    return math_ops.cast(random_ops.random_uniform([1]) * num_classes,
                         dtypes.int32)[0]
  dataset = dataset.map(_remap_fn)

  # Reshape distribution.
  dataset = dataset.apply(
      resampling.rejection_resample(
          class_func=lambda x: x,
          target_dist=target_dist,
          initial_dist=init_dist))

  # Use the module-level helper rather than the deprecated
  # `Dataset.make_one_shot_iterator()` method.
  get_next = dataset_ops.make_one_shot_iterator(dataset).get_next()

  with self.cached_session() as sess:
    # Read until the iterator raises OutOfRangeError.
    returned = []
    with self.assertRaises(errors.OutOfRangeError):
      while True:
        returned.append(sess.run(get_next))

  classes, _ = zip(*returned)
  bincount = np.bincount(
      np.array(classes),
      minlength=num_classes).astype(np.float32) / len(classes)

  self.assertAllClose(target_dist, bincount, atol=1e-2)