コード例 #1
0
    def test_not_parallel_read(self):
        """Read all rows with parallel_read_rows (2 workers) and check that
        every written cell comes back; order is not guaranteed, so compare
        sorted results."""
        os.environ["BIGTABLE_EMULATOR_HOST"] = self.emulator.get_addr()
        self.emulator.create_table(
            "fake_project",
            "fake_instance",
            "test-table",
            ["fam1", "fam2"],
            splits=["row005", "row010", "row015"],
        )

        # 20 rows x 2 columns of distinct string payloads.
        expected = [[f"[{row,col}]" for col in range(2)] for row in range(20)]
        tensor = tf.constant(expected)

        client = BigtableClient("fake_project", "fake_instance")
        table = client.get_table("test-table")

        row_keys = ["row" + str(i).rjust(3, "0") for i in range(20)]
        self.emulator.write_tensor(
            "fake_project",
            "fake_instance",
            "test-table",
            tensor,
            row_keys,
            ["fam1:col1", "fam2:col2"],
        )

        dataset = table.parallel_read_rows(
            ["fam1:col1", "fam2:col2"],
            row_set=row_set.from_rows_or_ranges(row_range.infinite()),
            num_parallel_calls=2,
        )
        got = [[cell.numpy().decode() for cell in row] for row in dataset]
        # Parallel reads give no ordering guarantee, so compare sorted.
        self.assertEqual(repr(sorted(expected)), repr(sorted(got)))
コード例 #2
0
    def test_parallel_read(self):
        """Every cell produced by parallel_read_rows must be one of the
        values that were written to the table."""
        os.environ["BIGTABLE_EMULATOR_HOST"] = self.emulator.get_addr()
        self.emulator.create_table(
            "fake_project",
            "fake_instance",
            "test-table",
            ["fam1", "fam2"],
            splits=["row005", "row010", "row015"],
        )

        values = [[f"[{i,j}]" for j in range(2)] for i in range(20)]
        # Flatten for the membership checks below.
        flat_values = [cell for row in values for cell in row]

        tensor = tf.constant(values)

        client = BigtableClient("fake_project", "fake_instance")
        table = client.get_table("test-table")

        self.emulator.write_tensor(
            "fake_project",
            "fake_instance",
            "test-table",
            tensor,
            ["row" + str(i).rjust(3, "0") for i in range(20)],
            ["fam1:col1", "fam2:col2"],
        )

        dataset = table.parallel_read_rows(
            ["fam1:col1", "fam2:col2"],
            row_set=row_set.from_rows_or_ranges(row_range.infinite()),
        )
        for row in dataset:
            for cell in row:
                self.assertTrue(cell.numpy().decode() in flat_values)
コード例 #3
0
 def check_values(self, values, table, type_name, tf_dtype):
     """Read column ``fam1:<type_name>`` back from ``table`` as ``tf_dtype``
     and compare each cell with ``values`` (approximately for float dtypes)."""
     rows = table.read_rows(
         ["fam1:" + type_name],
         row_set=row_set.from_rows_or_ranges(row_range.infinite()),
         output_type=tf_dtype,
     )
     # Floats need an approximate comparison to tolerate rounding.
     is_float = tf_dtype in (tf.float64, tf.float32)
     for idx, row in enumerate(rows):
         expected = values[idx].numpy()
         actual = row.numpy()[0]
         if is_float:
             self.assertAlmostEqual(expected, actual)
         else:
             self.assertEqual(expected, actual)
コード例 #4
0
    def parallel_read_rows(
        self,
        columns: List[str],
        num_parallel_calls=tf.data.AUTOTUNE,
        row_set: bigtable_row_set.RowSet = None,
        filter: filters.BigtableFilter = None,
        output_type=tf.string,
    ):
        """Retrieves values from Google Bigtable in parallel.

        The amount of work is split between workers based on SampleRowKeys.
        Keep in mind that when reading in parallel, rows are not read in any
        particular order.

        Args:
            columns (List[str]): the list of columns to read from; the order
                on this list will determine the order in the output tensors.
            num_parallel_calls: number of workers assigned to reading the
                data; defaults to `tf.data.AUTOTUNE`.
            row_set (RowSet): set of rows to read; defaults to all rows.
            filter (BigtableFilter): filter applied to the read; defaults to
                `filters.latest()`, i.e. the latest version of each cell.
            output_type: dtype of the values in the returned dataset;
                defaults to `tf.string`.

        Returns:
            A `tf.data.Dataset` returning the cell contents.
        """

        # We have to make sure that all the default arguments are initialized
        # on each invocation. For more info see read_rows method.
        if row_set is None:
            row_set = bigtable_row_set.from_rows_or_ranges(
                bigtable_row_range.infinite())
        if filter is None:
            filter = filters.latest()

        # Split the requested row set into roughly equal chunks, one per
        # worker, based on the table's SampleRowKeys.
        samples = core_ops.bigtable_split_row_set_evenly(
            self._client_resource, row_set._impl, self._table_id,
            num_parallel_calls)

        def map_func(idx):
            # Each worker reads one chunk via the regular read_rows path.
            return self.read_rows(columns,
                                  bigtable_row_set.RowSet(samples[idx]),
                                  filter, output_type)

        # We interleave a dataset of sample's indexes instead of a dataset of
        # samples, because Dataset.from_tensor_slices attempts to copy the
        # resource tensors using DeepCopy from tensor_util.cc which is not
        # possible for tensors of type DT_RESOURCE.
        return tf.data.Dataset.range(samples.shape[0]).interleave(
            map_func=map_func,
            cycle_length=num_parallel_calls,
            block_length=1,
            num_parallel_calls=num_parallel_calls,
            deterministic=False,
        )
コード例 #5
0
ファイル: test_row_set.py プロジェクト: yongtang/io
 def test_infinite(self):
     """An infinite row range has an empty textual representation."""
     self.assertEqual("", repr(row_range.infinite()))
コード例 #6
0
    def test_split_row_set(self):
        """Check how bigtable_split_row_set_evenly partitions an infinite
        row set for several worker counts."""
        os.environ["BIGTABLE_EMULATOR_HOST"] = self.emulator.get_addr()
        self.emulator.create_table(
            "fake_project",
            "fake_instance",
            "test-table",
            ["fam1", "fam2"],
            splits=[
                "row005", "row010", "row015", "row020", "row025", "row030"
            ],
        )

        values = [[f"[{i,j}]" for j in range(2)] for i in range(40)]
        tensor = tf.constant(values)

        client = BigtableClient("fake_project", "fake_instance")

        self.emulator.write_tensor(
            "fake_project",
            "fake_instance",
            "test-table",
            tensor,
            ["row" + str(i).rjust(3, "0") for i in range(40)],
            ["fam1:col1", "fam2:col2"],
        )

        rs = row_set.from_rows_or_ranges(row_range.infinite())

        def split(num_parallel_calls):
            # Materialize the op's output so the chunks can be counted.
            return list(
                core_ops.bigtable_split_row_set_evenly(
                    client._client_resource,
                    rs._impl,
                    "test-table",
                    num_parallel_calls,
                )
            )

        self.assertEqual(len(split(2)), 2)

        # The emulator may return different samples each time, so we can't
        # expect an exact number, but it must be no more than
        # num_parallel_calls
        self.assertLessEqual(len(split(6)), 6)

        self.assertEqual(len(split(1)), 1)