def test_not_parallel_read(self):
    """Round-trip test: write a tensor, read it back via parallel_read_rows.

    Uses num_parallel_calls=2 against a table pre-split at three row keys.
    Parallel reads do not preserve row order, so both sides are sorted
    before comparison.
    """
    os.environ["BIGTABLE_EMULATOR_HOST"] = self.emulator.get_addr()
    self.emulator.create_table(
        "fake_project",
        "fake_instance",
        "test-table",
        ["fam1", "fam2"],
        splits=["row005", "row010", "row015"],
    )
    expected = [[f"[{i,j}]" for j in range(2)] for i in range(20)]
    tensor = tf.constant(expected)
    client = BigtableClient("fake_project", "fake_instance")
    table = client.get_table("test-table")
    self.emulator.write_tensor(
        "fake_project",
        "fake_instance",
        "test-table",
        tensor,
        ["row" + str(i).rjust(3, "0") for i in range(20)],
        ["fam1:col1", "fam2:col2"],
    )
    ds = table.parallel_read_rows(
        ["fam1:col1", "fam2:col2"],
        row_set=row_set.from_rows_or_ranges(row_range.infinite()),
        num_parallel_calls=2,
    )
    actual = [[cell.numpy().decode() for cell in row] for row in ds]
    # Compare via repr of the sorted nested lists, matching the
    # expected values' string form.
    self.assertEqual(repr(sorted(expected)), repr(sorted(actual)))
def test_parallel_read(self):
    """Write a tensor, then check every cell read in parallel was written.

    Because parallel reads return rows in no particular order, this test
    only asserts set membership: each decoded cell must be one of the
    values originally written.
    """
    os.environ["BIGTABLE_EMULATOR_HOST"] = self.emulator.get_addr()
    self.emulator.create_table(
        "fake_project",
        "fake_instance",
        "test-table",
        ["fam1", "fam2"],
        splits=["row005", "row010", "row015"],
    )
    expected = [[f"[{i,j}]" for j in range(2)] for i in range(20)]
    written = [cell for row in expected for cell in row]
    tensor = tf.constant(expected)
    client = BigtableClient("fake_project", "fake_instance")
    table = client.get_table("test-table")
    self.emulator.write_tensor(
        "fake_project",
        "fake_instance",
        "test-table",
        tensor,
        ["row" + str(i).rjust(3, "0") for i in range(20)],
        ["fam1:col1", "fam2:col2"],
    )
    ds = table.parallel_read_rows(
        ["fam1:col1", "fam2:col2"],
        row_set=row_set.from_rows_or_ranges(row_range.infinite()),
    )
    for row in ds:
        for cell in row:
            self.assertTrue(cell.numpy().decode() in written)
def check_values(self, values, table, type_name, tf_dtype):
    """Assert rows read from `table` column "fam1:<type_name>" match `values`.

    Floating-point dtypes are compared with assertAlmostEqual to tolerate
    precision loss in the write/read round trip; everything else must
    match exactly.
    """
    # Decide once, outside the loop, which comparison to use.
    approximate = tf_dtype in [tf.float64, tf.float32]
    rows = table.read_rows(
        ["fam1:" + type_name],
        row_set=row_set.from_rows_or_ranges(row_range.infinite()),
        output_type=tf_dtype,
    )
    for idx, row in enumerate(rows):
        if approximate:
            self.assertAlmostEqual(values[idx].numpy(), row.numpy()[0])
        else:
            self.assertEqual(values[idx].numpy(), row.numpy()[0])
def parallel_read_rows( self, columns: List[str], num_parallel_calls=tf.data.AUTOTUNE, row_set: bigtable_row_set.RowSet = None, filter: filters.BigtableFilter = None, output_type=tf.string, ): """Retrieves values from Google Bigtable in parallel. The ammount of work is split between workers based on SampleRowKeys. Keep in mind that when reading in parallel, rows are not read in any particular order. Args: columns (List[str]): the list of columns to read from; the order on this list will determine the order in the output tensors num_parallel_calls: number of workers assigned to reading the data. row_set (RowSet): set of rows to read. Returns: A `tf.data.Dataset` returning the cell contents. """ # We have to make sure that all the default arguments are initialized # on each invocation. For more info see read_rows method. if row_set is None: row_set = bigtable_row_set.from_rows_or_ranges( bigtable_row_range.infinite()) if filter is None: filter = filters.latest() samples = core_ops.bigtable_split_row_set_evenly( self._client_resource, row_set._impl, self._table_id, num_parallel_calls) def map_func(idx): return self.read_rows(columns, bigtable_row_set.RowSet(samples[idx]), filter, output_type) # We interleave a dataset of sample's indexes instead of a dataset of # samples, because Dataset.from_tensor_slices attempts to copy the # resource tensors using DeepCopy from tensor_util.cc which is not # possible for tensors of type DT_RESOURCE. return tf.data.Dataset.range(samples.shape[0]).interleave( map_func=map_func, cycle_length=num_parallel_calls, block_length=1, num_parallel_calls=num_parallel_calls, deterministic=False, )
def test_infinite(self):
    """An infinite row range must render as the empty string."""
    infinite_range = row_range.infinite()
    self.assertEqual("", repr(infinite_range))
def test_split_row_set(self):
    """bigtable_split_row_set_evenly should respect the requested parallelism.

    Checks the number of chunks produced for parallelism 2, 6 and 1
    against a table pre-split at six row keys.
    """
    os.environ["BIGTABLE_EMULATOR_HOST"] = self.emulator.get_addr()
    self.emulator.create_table(
        "fake_project",
        "fake_instance",
        "test-table",
        ["fam1", "fam2"],
        splits=[
            "row005", "row010", "row015", "row020", "row025", "row030"
        ],
    )
    data = [[f"[{i,j}]" for j in range(2)] for i in range(40)]
    tensor = tf.constant(data)
    client = BigtableClient("fake_project", "fake_instance")
    self.emulator.write_tensor(
        "fake_project",
        "fake_instance",
        "test-table",
        tensor,
        ["row" + str(i).rjust(3, "0") for i in range(40)],
        ["fam1:col1", "fam2:col2"],
    )
    rs = row_set.from_rows_or_ranges(row_range.infinite())

    def split(num_parallel_calls):
        # Materialize the op's output so the chunks can be counted.
        return [
            s
            for s in core_ops.bigtable_split_row_set_evenly(
                client._client_resource,
                rs._impl,
                "test-table",
                num_parallel_calls,
            )
        ]

    self.assertEqual(len(split(2)), 2)
    # The emulator may return different samples each time, so we can't
    # expect an exact number, but it must be no more than num_parallel_calls
    self.assertLessEqual(len(split(6)), 6)
    self.assertEqual(len(split(1)), 1)