def test_xshards_predict_save_load(self):
    sc = init_nncontext()
    rdd = sc.range(0, 110).map(lambda x: np.array([x] * 50))
    shards = rdd.mapPartitions(lambda iter: chunks(iter, 5)).map(
        lambda x: {"x": np.stack(x)})
    shards = SparkXShards(shards)

    estimator = get_estimator(workers_per_node=2,
                              model_fn=lambda config: IdentityNet())
    result_shards = estimator.predict(shards, batch_size=4)
    result_before = np.concatenate(
        [shard["prediction"] for shard in result_shards.collect()])
    expected_result = np.concatenate(
        [shard["x"] for shard in result_shards.collect()])
    # IdentityNet echoes its input, so predictions must equal the input features.
    assert np.array_equal(result_before, expected_result)

    path = "/tmp/model.pth"
    try:
        estimator.save(path)
        estimator.load(path)
        result_shards = estimator.predict(shards, batch_size=4)
        result_after = np.concatenate(
            [shard["prediction"] for shard in result_shards.collect()])
    finally:
        os.remove(path)

    # Predictions should be unchanged after a save/load round trip.
    assert np.array_equal(result_before, result_after)
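# The chunks() helper used throughout these tests is not shown in this
# excerpt. A minimal sketch of the assumed behaviour (split an iterable into
# lists of at most `n` items); the real helper may differ:
def chunks(iterable, n):
    batch = []
    for item in iterable:
        batch.append(item)
        if len(batch) == n:
            yield batch
            batch = []
    if batch:
        yield batch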
def get_pred_xshards(key):
    # Helper (uses `self` from the enclosing scope): builds SparkXShards whose
    # value under `key` is a list of two arrays (columns 0-23 and 24-49),
    # i.e. a multi-input sample.
    rdd = self.sc.range(0, 110).map(lambda x: np.array([x] * 50))
    shards = rdd.mapPartitions(lambda iter: chunks(iter, 5)).map(
        lambda x: {key: np.stack(x)}).map(
        lambda x: {key: [x[key][:, :24], x[key][:, 24:]]})
    shards = SparkXShards(shards)
    return shards
def test_convert_predict_rdd_to_xshard(self):
    rdd = self.sc.range(0, 110).map(lambda x: np.array([x] * 50))
    shards = rdd.mapPartitions(lambda iter: chunks(iter, 5)).map(
        lambda x: {"x": np.stack(x)})
    shards = SparkXShards(shards)
    pred_rdd = self.sc.range(0, 110).map(lambda x: np.array([x] * 50))

    # The prediction RDD mirrors the feature RDD element for element, so after
    # conversion each shard's "prediction" should match its "x".
    result_shards = convert_predict_rdd_to_xshard(shards, pred_rdd)
    result = np.concatenate(
        [shard["prediction"] for shard in result_shards.collect()])
    expected_result = np.concatenate(
        [shard["x"] for shard in result_shards.collect()])

    assert np.array_equal(result, expected_result)
def test_xshards_predict(self):
    sc = init_nncontext()
    rdd = sc.range(0, 110).map(lambda x: np.array([x] * 50))
    shards = rdd.mapPartitions(lambda iter: chunks(iter, 5)).map(
        lambda x: {"x": np.stack(x)})
    shards = SparkXShards(shards)

    estimator = get_estimator(workers_per_node=2,
                              model_fn=lambda config: IdentityNet())
    result_shards = estimator.predict(shards, batch_size=4)
    result = np.concatenate(
        [shard["prediction"] for shard in result_shards.collect()])
    expected_result = np.concatenate(
        [shard["x"] for shard in result_shards.collect()])

    assert np.array_equal(result, expected_result)
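# The tests above compare "prediction" against the original "x", which only
# holds if IdentityNet echoes its input. A minimal sketch of the assumed
# model (the real IdentityNet used by get_estimator may differ):
import torch.nn as nn

class IdentityNet(nn.Module):
    def forward(self, x):
        # Return the input unchanged, so predict() reproduces the features.
        return x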
def write(path, generator, schema, block_size=1000, write_mode="overwrite", **kwargs):
    """
    Take each record in the generator and write it to a parquet file.

    **generator**
    Each record in the generator is a dict; the key is a string and will be the
    column name of the saved parquet record, and the value is the data.

    **schema**
    schema defines the name, dtype and shape of a column, as well as the feature
    type of a column. The feature type defines how to encode and decode the
    column value. There are three kinds of feature type:

    1. Scalar, such as an int or float number, or a string, which can be directly
       mapped to a parquet type.
    2. NDarray, which takes a np.ndarray and saves its serialized bytes. The
       corresponding parquet type is BYTE_ARRAY.
    3. Image, which takes a string representing an image file in the local file
       system and saves the raw file content bytes. The corresponding parquet
       type is BYTE_ARRAY.

    :param path: the output path, e.g. file:///output/path, hdfs:///output/path
    :param generator: generates dicts whose keys are strings and whose values are
           one of (a scalar value, ndarray, image file path)
    :param schema: a dict whose keys are strings and whose values are one of
           (schema_field.Scalar, schema_field.NDarray, schema_field.Image)
    :param block_size: number of records to write into each parquet chunk
    :param write_mode: Spark DataFrameWriter save mode, e.g. "overwrite"
    :param kwargs: other args
    """
    sc = init_nncontext()
    spark = SparkSession(sc)
    node_num, core_num = get_node_and_core_number()
    for i, chunk in enumerate(chunks(generator, block_size)):
        chunk_path = os.path.join(path, f"chunk={i}")
        rows_rdd = sc.parallelize(chunk, core_num * node_num) \
            .map(lambda x: dict_to_row(schema, x))
        spark.createDataFrame(rows_rdd).write.mode(write_mode).parquet(chunk_path)
    metadata_path = os.path.join(path, "_orca_metadata")
    write_text(metadata_path, encode_schema(schema))
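# A hedged usage sketch for write(). The generator side is concrete; the
# schema_field.Scalar / schema_field.NDarray constructors below are
# assumptions based on the docstring above (their exact signatures in the
# real library may differ), and schema_field is assumed to be importable from
# the same package as write().
import numpy as np

def record_generator():
    # Each record is a plain dict: one scalar column and one ndarray column.
    for i in range(10):
        yield {"id": i, "embedding": np.arange(16, dtype=np.float32) + i}

example_schema = {
    "id": schema_field.Scalar(dtype="int"),           # assumed constructor
    "embedding": schema_field.NDarray(shape=(16,)),   # assumed constructor
}

write("file:///tmp/example_parquet", record_generator(), example_schema,
      block_size=5)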