def test_xshards_predict_save_load(self):
    sc = init_nncontext()
    rdd = sc.range(0, 110).map(lambda x: np.array([x] * 50))
    shards = rdd.mapPartitions(lambda iter: chunks(iter, 5)).map(
        lambda x: {"x": np.stack(x)})
    shards = SparkXShards(shards)

    estimator = get_estimator(workers_per_node=2,
                              model_fn=lambda config: IdentityNet())
    result_shards = estimator.predict(shards, batch_size=4)
    result_before = np.concatenate(
        [shard["prediction"] for shard in result_shards.collect()])
    expected_result = np.concatenate(
        [shard["x"] for shard in result_shards.collect()])
    # IdentityNet echoes its input, so predictions must equal the input features.
    assert np.array_equal(result_before, expected_result)

    path = "/tmp/model.pth"
    try:
        estimator.save(path)
        estimator.load(path)
        result_shards = estimator.predict(shards, batch_size=4)
        result_after = np.concatenate(
            [shard["prediction"] for shard in result_shards.collect()])
    finally:
        os.remove(path)

    # Predictions should be unchanged after a save/load round trip.
    assert np.array_equal(result_before, result_after)
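# The chunks() helper used throughout these tests is not shown in this
# excerpt. A minimal sketch of the assumed behaviour (split an iterable into
# lists of at most `n` items); the real helper may differ:
def chunks(iterable, n):
    batch = []
    for item in iterable:
        batch.append(item)
        if len(batch) == n:
            yield batch
            batch = []
    if batch:
        yield batch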
def get_pred_xshards(key):
    # Helper (uses `self` from the enclosing scope): builds SparkXShards whose
    # value under `key` is a list of two arrays (columns 0-23 and 24-49),
    # i.e. a multi-input sample.
    rdd = self.sc.range(0, 110).map(lambda x: np.array([x] * 50))
    shards = rdd.mapPartitions(lambda iter: chunks(iter, 5)).map(
        lambda x: {key: np.stack(x)}).map(
        lambda x: {key: [x[key][:, :24], x[key][:, 24:]]})
    shards = SparkXShards(shards)
    return shards
def test_convert_predict_rdd_to_xshard(self):
    rdd = self.sc.range(0, 110).map(lambda x: np.array([x] * 50))
    shards = rdd.mapPartitions(lambda iter: chunks(iter, 5)).map(
        lambda x: {"x": np.stack(x)})
    shards = SparkXShards(shards)
    pred_rdd = self.sc.range(0, 110).map(lambda x: np.array([x] * 50))

    # The prediction RDD mirrors the feature RDD element for element, so after
    # conversion each shard's "prediction" should match its "x".
    result_shards = convert_predict_rdd_to_xshard(shards, pred_rdd)
    result = np.concatenate(
        [shard["prediction"] for shard in result_shards.collect()])
    expected_result = np.concatenate(
        [shard["x"] for shard in result_shards.collect()])

    assert np.array_equal(result, expected_result)
def test_xshards_predict(self):
    sc = init_nncontext()
    rdd = sc.range(0, 110).map(lambda x: np.array([x] * 50))
    shards = rdd.mapPartitions(lambda iter: chunks(iter, 5)).map(
        lambda x: {"x": np.stack(x)})
    shards = SparkXShards(shards)

    estimator = get_estimator(workers_per_node=2,
                              model_fn=lambda config: IdentityNet())
    result_shards = estimator.predict(shards, batch_size=4)
    result = np.concatenate(
        [shard["prediction"] for shard in result_shards.collect()])
    expected_result = np.concatenate(
        [shard["x"] for shard in result_shards.collect()])

    assert np.array_equal(result, expected_result)
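# The tests above compare "prediction" against the original "x", which only
# holds if IdentityNet echoes its input. A minimal sketch of the assumed
# model (the real IdentityNet used by get_estimator may differ):
import torch.nn as nn

class IdentityNet(nn.Module):
    def forward(self, x):
        # Return the input unchanged, so predict() reproduces the features.
        return x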
def write(path, generator, schema, block_size=1000, write_mode="overwrite", **kwargs):
    """
    Take each record in the generator and write it to a parquet file.

    **generator**
    Each record in the generator is a dict; the key is a string and will be the
    column name of the saved parquet record, and the value is the data.

    **schema**
    schema defines the name, dtype and shape of a column, as well as the feature
    type of a column. The feature type defines how to encode and decode the
    column value. There are three kinds of feature type:

    1. Scalar, such as an int or float number, or a string, which can be directly
       mapped to a parquet type.
    2. NDarray, which takes a np.ndarray and saves its serialized bytes. The
       corresponding parquet type is BYTE_ARRAY.
    3. Image, which takes a string representing an image file in the local file
       system and saves the raw file content bytes. The corresponding parquet
       type is BYTE_ARRAY.

    :param path: the output path, e.g. file:///output/path, hdfs:///output/path
    :param generator: generates dicts whose keys are strings and whose values are
           one of (a scalar value, ndarray, image file path)
    :param schema: a dict whose keys are strings and whose values are one of
           (schema_field.Scalar, schema_field.NDarray, schema_field.Image)
    :param block_size: number of records to write into each parquet chunk
    :param write_mode: Spark DataFrameWriter save mode, e.g. "overwrite"
    :param kwargs: other args
    """
    sc = init_nncontext()
    spark = SparkSession(sc)
    node_num, core_num = get_node_and_core_number()
    for i, chunk in enumerate(chunks(generator, block_size)):
        chunk_path = os.path.join(path, f"chunk={i}")
        rows_rdd = sc.parallelize(chunk, core_num * node_num) \
            .map(lambda x: dict_to_row(schema, x))
        spark.createDataFrame(rows_rdd).write.mode(write_mode).parquet(chunk_path)
    metadata_path = os.path.join(path, "_orca_metadata")
    write_text(metadata_path, encode_schema(schema))
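# A hedged usage sketch for write(). The generator side is concrete; the
# schema_field.Scalar / schema_field.NDarray constructors below are
# assumptions based on the docstring above (their exact signatures in the
# real library may differ), and schema_field is assumed to be importable from
# the same package as write().
import numpy as np

def record_generator():
    # Each record is a plain dict: one scalar column and one ndarray column.
    for i in range(10):
        yield {"id": i, "embedding": np.arange(16, dtype=np.float32) + i}

example_schema = {
    "id": schema_field.Scalar(dtype="int"),           # assumed constructor
    "embedding": schema_field.NDarray(shape=(16,)),   # assumed constructor
}

write("file:///tmp/example_parquet", record_generator(), example_schema,
      block_size=5)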