def main():
    args = parser.parse_args()
    cluster_mode = args.cluster_mode
    if cluster_mode.startswith("yarn"):
        hadoop_conf = os.environ.get("HADOOP_CONF_DIR")
        assert hadoop_conf, "Directory path to hadoop conf not found for yarn-client mode. Please " \
                            "set the environment variable HADOOP_CONF_DIR"
        spark_conf = create_spark_conf().set("spark.executor.memory", "5g") \
            .set("spark.executor.cores", 2) \
            .set("spark.executor.instances", 2) \
            .set("spark.driver.memory", "2g")
        if cluster_mode == "yarn-client":
            sc = init_nncontext(spark_conf, cluster_mode="yarn-client", hadoop_conf=hadoop_conf)
        else:
            sc = init_nncontext(spark_conf, cluster_mode="yarn-cluster", hadoop_conf=hadoop_conf)
    else:
        sc = init_nncontext()

    def model_fn(features, labels, mode):
        from nets import lenet
        slim = tf.contrib.slim
        with slim.arg_scope(lenet.lenet_arg_scope()):
            logits, end_points = lenet.lenet(features, num_classes=10, is_training=True)

        if mode == tf.estimator.ModeKeys.EVAL or mode == tf.estimator.ModeKeys.TRAIN:
            loss = tf.reduce_mean(
                tf.losses.sparse_softmax_cross_entropy(logits=logits, labels=labels))
            optimizer = ZooOptimizer(tf.train.AdamOptimizer())
            train_op = optimizer.minimize(loss)
            return tf.estimator.EstimatorSpec(mode, predictions=logits,
                                              loss=loss, train_op=train_op)
        else:
            return tf.estimator.EstimatorSpec(mode, predictions=logits)

    def input_fn(mode):
        if mode == tf.estimator.ModeKeys.TRAIN:
            training_data = get_data("train")
            dataset = TFDataset.from_ndarrays(training_data, batch_size=320)
        elif mode == tf.estimator.ModeKeys.EVAL:
            testing_data = get_data("test")
            dataset = TFDataset.from_ndarrays(testing_data, batch_per_thread=80)
        else:
            images, _ = get_data("test")
            dataset = TFDataset.from_ndarrays(images, batch_per_thread=80)
        return dataset

    estimator = TFEstimator.from_model_fn(model_fn, model_dir="/tmp/estimator")
    estimator.train(input_fn, steps=10)

    metrics = estimator.evaluate(input_fn, ["acc"])
    print(metrics)

    predictions = estimator.predict(input_fn)
    print(predictions.first())
    print("finished...")
    sc.stop()
def main(data_num):
    data_path = '/tmp/mnist' if not args.data_path else args.data_path
    cluster_mode = args.cluster_mode
    if cluster_mode.startswith("yarn"):
        hadoop_conf = os.environ.get("HADOOP_CONF_DIR")
        assert hadoop_conf, "Directory path to hadoop conf not found for yarn-client mode. Please " \
                            "set the environment variable HADOOP_CONF_DIR"
        spark_conf = create_spark_conf().set("spark.executor.memory", "5g") \
            .set("spark.executor.cores", 2) \
            .set("spark.executor.instances", 2) \
            .set("spark.executorEnv.HTTP_PROXY", "http://child-prc.intel.com:913") \
            .set("spark.executorEnv.HTTPS_PROXY", "http://child-prc.intel.com:913") \
            .set("spark.driver.memory", "2g")
        if cluster_mode == "yarn-client":
            sc = init_nncontext(spark_conf, cluster_mode="yarn-client", hadoop_conf=hadoop_conf)
        else:
            sc = init_nncontext(spark_conf, cluster_mode="yarn-cluster", hadoop_conf=hadoop_conf)
    else:
        sc = init_nncontext()

    # get data, pre-process and create TFDataset
    (images_data, labels_data) = mnist.read_data_sets(data_path, "test")
    images_data = (images_data[:data_num] - mnist.TRAIN_MEAN) / mnist.TRAIN_STD
    labels_data = labels_data[:data_num].astype(np.int32)
    dataset = TFDataset.from_ndarrays((images_data, labels_data), batch_per_thread=20)

    # construct the model from TFDataset
    images, labels = dataset.tensors
    labels = tf.squeeze(labels)
    with slim.arg_scope(lenet.lenet_arg_scope()):
        logits, end_points = lenet.lenet(images, num_classes=10, is_training=False)
    predictions = tf.to_int32(tf.argmax(logits, axis=1))
    correct = tf.expand_dims(tf.to_int32(tf.equal(predictions, labels)), axis=1)

    saver = tf.train.Saver()
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        saver.restore(sess, "/tmp/lenet/model")
        predictor = TFPredictor(sess, [correct])
        accuracy = predictor.predict().mean()
        print("predict accuracy is %s" % accuracy)
def setup_method(self, method):
    """
    setup any state tied to the execution of the given method in a class.
    setup_method is invoked for every test method of a class.
    """
    sparkConf = init_spark_conf().setMaster("local[1]").setAppName("testEstimator")
    self.sc = init_nncontext(sparkConf)
def test_dataframe_shard_size(self):
    from bigdl.orca import OrcaContext
    OrcaContext._shard_size = 3
    sc = init_nncontext()
    rdd = sc.range(0, 10)
    from pyspark.sql import SparkSession
    spark = SparkSession(sc)
    from pyspark.ml.linalg import DenseVector
    df = rdd.map(lambda x: (DenseVector(np.random.randn(1,).astype(np.float)),
                            int(np.random.randint(0, 1, size=())))).toDF(["feature", "label"])

    config = {"lr": 0.8}
    trainer = Estimator.from_keras(model_creator=model_creator,
                                   verbose=True,
                                   config=config,
                                   workers_per_node=2)

    trainer.fit(df, epochs=1, batch_size=4, steps_per_epoch=25,
                feature_cols=["feature"],
                label_cols=["label"])

    trainer.evaluate(df, batch_size=4, num_steps=25,
                     feature_cols=["feature"],
                     label_cols=["label"])

    trainer.predict(df, feature_cols=["feature"]).collect()
    OrcaContext._shard_size = None
def test_estimator_graph_dataframe(self):
    tf.reset_default_graph()

    model = SimpleModel()
    file_path = os.path.join(resource_path, "orca/learn/ncf.csv")

    sc = init_nncontext()
    sqlcontext = SQLContext(sc)
    df = sqlcontext.read.csv(file_path, header=True, inferSchema=True)

    est = Estimator.from_graph(inputs=[model.user, model.item],
                               labels=[model.label],
                               outputs=[model.logits],
                               loss=model.loss,
                               optimizer=tf.train.AdamOptimizer(),
                               metrics={"loss": model.loss})

    est.fit(data=df,
            batch_size=8,
            epochs=10,
            feature_cols=['user', 'item'],
            label_cols=['label'],
            validation_data=df)

    result = est.evaluate(df, batch_size=4,
                          feature_cols=['user', 'item'],
                          label_cols=['label'])
    print(result)

    prediction_df = est.predict(df, batch_size=4, feature_cols=['user', 'item'])
    assert 'prediction' in prediction_df.columns
    predictions = prediction_df.collect()
    assert len(predictions) == 48
def test_partition_num_less_than_workers(self):
    sc = init_nncontext()
    rdd = sc.range(200, numSlices=1)
    assert rdd.getNumPartitions() == 1
    from pyspark.sql import SparkSession
    spark = SparkSession(sc)
    from pyspark.ml.linalg import DenseVector
    df = rdd.map(lambda x: (DenseVector(np.random.randn(1,).astype(np.float)),
                            int(np.random.randint(0, 1, size=())))).toDF(["feature", "label"])

    config = {"lr": 0.8}
    trainer = Estimator.from_keras(model_creator=model_creator,
                                   verbose=True,
                                   config=config,
                                   workers_per_node=2)
    assert df.rdd.getNumPartitions() < trainer.num_workers

    trainer.fit(df, epochs=1, batch_size=4, steps_per_epoch=25,
                validation_data=df, validation_steps=1,
                feature_cols=["feature"],
                label_cols=["label"])

    trainer.evaluate(df, batch_size=4, num_steps=25,
                     feature_cols=["feature"],
                     label_cols=["label"])

    trainer.predict(df, feature_cols=["feature"]).collect()
def test_xshards_predict_save_load(self):
    sc = init_nncontext()
    rdd = sc.range(0, 110).map(lambda x: np.array([x] * 50))
    shards = rdd.mapPartitions(lambda iter: chunks(iter, 5)).map(
        lambda x: {"x": np.stack(x)})
    shards = SparkXShards(shards)

    estimator = get_estimator(workers_per_node=2,
                              model_fn=lambda config: IdentityNet())
    result_shards = estimator.predict(shards, batch_size=4)
    result_before = np.concatenate(
        [shard["prediction"] for shard in result_shards.collect()])
    expected_result = np.concatenate(
        [shard["x"] for shard in result_shards.collect()])
    assert np.array_equal(result_before, expected_result)

    path = "/tmp/model.pth"
    try:
        estimator.save(path)
        estimator.load(path)
        result_shards = estimator.predict(shards, batch_size=4)
        result_after = np.concatenate(
            [shard["prediction"] for shard in result_shards.collect()])
    finally:
        os.remove(path)

    assert np.array_equal(result_before, result_after)
def read_rdd(esConfig, esResource=None, filter=None, esQuery=None):
    """
    Read the data from elastic search into Spark RDD.

    :param esConfig: Dictionary which represents configuration for
           elastic search (eg. ip, port, es query etc).
    :param esResource: Optional. resource file in elastic search.
           It also can be set in esConfig.
    :param filter: Optional. Request only those fields from Elasticsearch.
    :param esQuery: Optional. es query.
    :return: Spark RDD
    """
    sc = init_nncontext()
    if "es.resource" not in esConfig:
        esConfig["es.resource"] = esResource
    if filter is not None:
        esConfig["es.read.source.filter"] = filter
    if esQuery is not None:
        esConfig["es.query"] = esQuery
    rdd = sc.newAPIHadoopRDD("org.elasticsearch.hadoop.mr.EsInputFormat",
                             "org.apache.hadoop.io.NullWritable",
                             "org.elasticsearch.hadoop.mr.LinkedMapWritable",
                             conf=esConfig)
    return rdd
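A minimal usage sketch for read_rdd, assuming a reachable Elasticsearch cluster; the connection settings, index name, and query below are placeholders, not values from the source:

es_conf = {"es.nodes": "localhost", "es.port": "9200"}    # placeholder connection info
query = '{"query": {"match_all": {}}}'                    # hypothetical match-all query
rdd = read_rdd(es_conf, esResource="my_index", esQuery=query)
print(rdd.take(1))    # elements are the key/value pairs produced by EsInputFormat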
def partition(data, num_shards=None):
    """
    Partition local in memory data and form a SparkXShards.

    :param data: np.ndarray, a tuple, list, dict of np.ndarray, or a nested structure
           made of tuple, list, dict with ndarray as the leaf value.
    :param num_shards: the number of shards that the data will be partitioned into.
    :return: a SparkXShards
    """
    sc = init_nncontext()
    node_num, core_num = get_node_and_core_number()
    shard_num = node_num * core_num if num_shards is None else num_shards
    import numpy as np
    type_err_msg = """
The types supported in bigdl.orca.data.XShards.partition are
1. np.ndarray
2. a tuple, list, dict of np.ndarray
3. nested structure made of tuple, list, dict with ndarray as the leaf value

But got data of type {}
""".format(type(data))
    supported_types = {list, tuple, dict}
    if isinstance(data, np.ndarray):
        if data.shape[0] < shard_num:
            raise ValueError(
                "The length of data {} is smaller than the total number "
                "of shards {}. Please adjust the num_shards option to be "
                "at most {}.".format(data.shape[0], shard_num, data.shape[0]))
        arrays = np.array_split(data, shard_num)
        rdd = sc.parallelize(arrays)
    else:
        assert type(data) in supported_types, type_err_msg
        flattened = nest.flatten(data)
        data_length = len(flattened[0])
        data_to_be_shard = []
        if data_length < shard_num:
            raise ValueError(
                "The length of data {} is smaller than the total number "
                "of shards {}. Please adjust the num_shards option to be "
                "at most {}.".format(data_length, shard_num, data_length))
        for i in range(shard_num):
            data_to_be_shard.append([])
        for x in flattened:
            assert len(x) == data_length, \
                "the ndarrays in data must all have the same size in first dimension, " \
                "got first ndarray of size {} and another {}".format(data_length, len(x))
            x_parts = np.array_split(x, shard_num)
            for idx, x_part in enumerate(x_parts):
                data_to_be_shard[idx].append(x_part)

        data_to_be_shard = [nest.pack_sequence_as(data, shard)
                            for shard in data_to_be_shard]
        rdd = sc.parallelize(data_to_be_shard)

    data_shards = SparkXShards(rdd)
    return data_shards
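A minimal usage sketch for partition, assuming a Spark context can be created by init_nncontext() (the function does this internally); the toy data below is illustrative only:

import numpy as np

data = {"x": np.random.rand(100, 4).astype(np.float32),
        "y": np.random.randint(0, 2, size=(100, 1))}
shards = partition(data, num_shards=4)        # SparkXShards with 4 shards
print(shards.rdd.getNumPartitions())          # -> 4
print(shards.rdd.first().keys())              # each shard keeps the dict structure of `data`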
def test_not_sync_stats(self):
    sc = init_nncontext()
    rdd = sc.range(0, 100).repartition(2)

    # the data and model are constructed so that loss on worker 0 is always 0.0
    # and loss on worker 1 is always 1.0
    df = rdd.mapPartitionsWithIndex(
        lambda idx, iter: [([float(idx)], [0.0]) for _ in iter]).toDF(["feature", "label"])

    estimator = get_estimator(workers_per_node=2,
                              model_fn=lambda config: LinearModel(),
                              loss=nn.MSELoss(),
                              optimizer=get_zero_optimizer,
                              sync_stats=False)

    stats = estimator.fit(df, batch_size=4, epochs=2,
                          feature_cols=["feature"],
                          label_cols=["label"],
                          reduce_results=False)
    worker_0_stats, worker_1_stats = stats[0]
    train_loss_0 = worker_0_stats["train_loss"]
    train_loss_1 = worker_1_stats["train_loss"]
    error_msg = f"stats from all workers should not be the same, " \
                f"but got worker_0_stats: {worker_0_stats}, worker_1_stats: {worker_1_stats}"
    assert abs(train_loss_0 - train_loss_1) > 0.9, error_msg
def test_num_part_data_diff_val_data(self):
    sc = init_nncontext()
    rdd = sc.range(200, numSlices=10)
    val_rdd = sc.range(60, numSlices=8)
    from pyspark.sql import SparkSession
    spark = SparkSession(sc)
    from pyspark.ml.linalg import DenseVector
    df = rdd.map(lambda x: (DenseVector(np.random.randn(1,).astype(np.float)),
                            int(np.random.randint(0, 1, size=())))).toDF(["feature", "label"])
    val_df = val_rdd.map(lambda x: (DenseVector(np.random.randn(1,).astype(np.float)),
                                    int(np.random.randint(0, 1, size=()))))\
        .toDF(["feature", "label"])

    config = {"lr": 0.8}
    trainer = Estimator.from_keras(model_creator=model_creator,
                                   verbose=True,
                                   config=config,
                                   workers_per_node=2)
    assert df.rdd.getNumPartitions() > trainer.num_workers
    assert df.rdd.getNumPartitions() != val_df.rdd.getNumPartitions()

    trainer.fit(df, epochs=1, batch_size=4, steps_per_epoch=25,
                validation_data=val_df, validation_steps=1,
                feature_cols=["feature"],
                label_cols=["label"])
def load_pickle(cls, path, minPartitions=None):
    """
    Load XShards from pickle files.

    :param path: The pickle file path/directory.
    :param minPartitions: The minimum partitions for the XShards.
    :return: SparkXShards object
    """
    sc = init_nncontext()
    return SparkXShards(sc.pickleFile(path, minPartitions))
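A minimal round-trip sketch, assuming load_pickle is exposed as a classmethod of bigdl.orca.data.XShards (as the cls argument suggests) and that the pickle directory path below, which is illustrative, is writable:

import numpy as np
from bigdl.dllib.nncontext import init_nncontext
from bigdl.orca.data import XShards

sc = init_nncontext()
# first write two small dict-shards as a pickle directory
sc.parallelize([{"x": np.arange(4)}, {"x": np.arange(4, 8)}], 2) \
    .saveAsPickleFile("/tmp/xshards_pickle")
shards = XShards.load_pickle("/tmp/xshards_pickle")
print(shards.rdd.count())   # -> 2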
def test_dataframe_predict(self):
    sc = init_nncontext()
    rdd = sc.parallelize(range(20))
    df = rdd.map(lambda x: ([float(x)] * 5,
                            [int(np.random.randint(0, 2, size=()))])).toDF(["feature", "label"])

    estimator = get_estimator(workers_per_node=2,
                              model_fn=lambda config: IdentityNet())
    result = estimator.predict(df, batch_size=4, feature_cols=["feature"])
    expr = "sum(cast(feature <> to_array(prediction) as int)) as error"
    assert result.selectExpr(expr).first()["error"] == 0
def test_estimator_graph_dataframe_exception(self):
    tf.reset_default_graph()

    model = SimpleModel()
    file_path = os.path.join(resource_path, "orca/learn/ncf.csv")

    sc = init_nncontext()
    sqlcontext = SQLContext(sc)
    df = sqlcontext.read.csv(file_path, header=True, inferSchema=True)

    est = Estimator.from_graph(inputs=[model.user, model.item],
                               labels=[model.label],
                               outputs=[model.logits],
                               loss=model.loss,
                               optimizer=tf.train.AdamOptimizer(),
                               metrics={"loss": model.loss})

    with self.assertRaises(Exception) as context:
        est.fit(data=df,
                batch_size=8,
                epochs=10,
                feature_cols=['user', 'item'],
                validation_data=df)
    self.assertTrue('label columns is None; it should not be None in training'
                    in str(context.exception))

    est.fit(data=df,
            batch_size=8,
            epochs=10,
            feature_cols=['user', 'item'],
            label_cols=['label'])

    with self.assertRaises(Exception) as context:
        predictions = est.predict(df, batch_size=4).collect()
    self.assertTrue('feature columns is None; it should not be None in prediction'
                    in str(context.exception))

    with self.assertRaises(Exception) as context:
        est.fit(data=df,
                batch_size=8,
                epochs=10,
                feature_cols=['user', 'item'],
                label_cols=['label'],
                validation_data=[1, 2, 3])
    self.assertTrue('train data and validation data should be both Spark DataFrame'
                    in str(context.exception))
def write(path, generator, schema, block_size=1000, write_mode="overwrite", **kwargs):
    """
    Take each record in the generator and write it to a parquet file.

    **generator**
    Each record in the generator is a dict; the key is a string that becomes the column
    name of the saved parquet record, and the value is the data.

    **schema**
    schema defines the name, dtype, and shape of a column, as well as the feature type
    of a column. The feature type defines how to encode and decode the column value.
    There are three kinds of feature type:
    1. Scalar, such as an int or float number, or a string, which can be directly mapped
       to a parquet type.
    2. NDarray, which takes a np.ndarray and saves it as serialized bytes. The
       corresponding parquet type is BYTE_ARRAY.
    3. Image, which takes a string representing an image file in the local file system
       and saves the raw file content bytes. The corresponding parquet type is BYTE_ARRAY.

    :param path: the output path, e.g. file:///output/path, hdfs:///output/path
    :param generator: generate a dict, whose key is a string and value is one of
           (a scalar value, ndarray, image file path)
    :param schema: a dict, whose key is a string, value is one of
           (schema_field.Scalar, schema_field.NDarray, schema_field.Image)
    :param kwargs: other args
    """
    sc = init_nncontext()
    spark = SparkSession(sc)
    node_num, core_num = get_node_and_core_number()
    for i, chunk in enumerate(chunks(generator, block_size)):
        chunk_path = os.path.join(path, f"chunk={i}")
        rows_rdd = sc.parallelize(chunk, core_num * node_num) \
            .map(lambda x: dict_to_row(schema, x))
        spark.createDataFrame(rows_rdd).write.mode(write_mode).parquet(chunk_path)
    metadata_path = os.path.join(path, "_orca_metadata")
    write_text(metadata_path, encode_schema(schema))
def test_spark_xshards(self):
    from bigdl.dllib.nncontext import init_nncontext
    from bigdl.orca.data import SparkXShards
    estimator = get_estimator(workers_per_node=1)
    sc = init_nncontext()
    x_rdd = sc.parallelize(np.random.rand(4000, 1, 50).astype(np.float32))
    # torch 1.7.1+ requires target size same as output size, which is (batch, 1)
    y_rdd = sc.parallelize(np.random.randint(0, 2, size=(4000, 1, 1)).astype(np.float32))
    rdd = x_rdd.zip(y_rdd).map(lambda x_y: {'x': x_y[0], 'y': x_y[1]})
    train_rdd, val_rdd = rdd.randomSplit([0.9, 0.1])
    train_xshards = SparkXShards(train_rdd)
    val_xshards = SparkXShards(val_rdd)
    train_stats = estimator.fit(train_xshards, batch_size=256, epochs=2)
    print(train_stats)
    val_stats = estimator.evaluate(val_xshards, batch_size=128)
    print(val_stats)
def test_dataframe_train_eval(self):
    sc = init_nncontext()
    rdd = sc.range(0, 100)
    df = rdd.map(lambda x: (np.random.randn(50).astype(np.float).tolist(),
                            [int(np.random.randint(0, 2, size=()))])).toDF(["feature", "label"])

    estimator = get_estimator(workers_per_node=2)
    estimator.fit(df, batch_size=4, epochs=2,
                  feature_cols=["feature"],
                  label_cols=["label"])
    estimator.evaluate(df, batch_size=4,
                       feature_cols=["feature"],
                       label_cols=["label"])
def test_xshards_predict(self):
    sc = init_nncontext()
    rdd = sc.range(0, 110).map(lambda x: np.array([x] * 50))
    shards = rdd.mapPartitions(lambda iter: chunks(iter, 5)).map(
        lambda x: {"x": np.stack(x)})
    shards = SparkXShards(shards)

    estimator = get_estimator(workers_per_node=2,
                              model_fn=lambda config: IdentityNet())
    result_shards = estimator.predict(shards, batch_size=4)
    result = np.concatenate(
        [shard["prediction"] for shard in result_shards.collect()])
    expected_result = np.concatenate(
        [shard["x"] for shard in result_shards.collect()])
    assert np.array_equal(result, expected_result)
def test_data_parallel_sgd_correctness(self):
    sc = init_nncontext()
    rdd = sc.range(0, 100).repartition(2)

    # partition 0: [(0, 0), (0, 0)]
    # partition 1: [(1, 0), (1, 0)]
    #
    # model: y = w * x
    # loss = (wx)^2
    # dloss/dw = 2x^2*w
    #
    # end of first iteration:
    #   partition 0 loss: 0.0
    #   partition 1 loss: 1.0
    #   avg_grad = avg([0, 0, 2, 2]) = 1
    #   weight = 1.0 - 0.5 * avg_grad = 0.5
    #
    # end of second iteration:
    #   partition 0 loss: 0.0
    #   partition 1 loss: 0.25
    #   avg_grad = avg([0, 0, 1, 1]) = 0.5
    #   weight = 0.5 - 0.5 * avg_grad = 0.25
    df = rdd.mapPartitionsWithIndex(
        lambda idx, iter: [([float(idx)], [0.0]) for _ in iter][:2]).toDF(["feature", "label"])

    def get_optimizer(model, config):
        return torch.optim.SGD(model.parameters(), lr=0.5)

    estimator = Estimator.from_torch(model=lambda config: LinearModel(),
                                     optimizer=get_optimizer,
                                     loss=torch.nn.MSELoss(),
                                     metrics=Accuracy(),
                                     config={},
                                     workers_per_node=2,
                                     backend="torch_distributed",
                                     sync_stats=False)

    stats = estimator.fit(df, batch_size=4, epochs=2,
                          feature_cols=["feature"],
                          label_cols=["label"],
                          reduce_results=False)
    state = estimator.get_state_dict()
    assert state['models'][0]['fc1.weight'].item() == 0.25
def test_partition_num_less_than_workers(self):
    sc = init_nncontext()
    rdd = sc.range(200, numSlices=1)
    df = rdd.map(lambda x: (np.random.randn(50).astype(np.float).tolist(),
                            [int(np.random.randint(0, 2, size=()))])).toDF(["feature", "label"])

    estimator = get_estimator(workers_per_node=2)
    assert df.rdd.getNumPartitions() < estimator.num_workers

    estimator.fit(df, batch_size=4, epochs=2,
                  feature_cols=["feature"],
                  label_cols=["label"])
    estimator.evaluate(df, batch_size=4,
                       feature_cols=["feature"],
                       label_cols=["label"])
    estimator.predict(df, feature_cols=["feature"]).collect()
def test_openvino_predict_spark_df(self):
    from pyspark.sql import SparkSession
    self.load_resnet()
    sc = init_nncontext()
    spark = SparkSession(sc)
    input_list = self.input.tolist()

    rdd = sc.range(0, 18, numSlices=3)
    input_df = rdd.map(lambda x: [input_list]).toDF(["feature"])
    with self.assertRaises(Exception):
        self.est.predict(input_df, feature_cols=["feature"])

    rdd = sc.range(0, 18, numSlices=5)
    input_df = rdd.map(lambda x: [input_list]).toDF(["feature"])
    result_df = self.est.predict(input_df, feature_cols=["feature"])
    result = list(map(lambda row: np.array(row["prediction"]),
                      result_df.select("prediction").collect()))
    assert np.array(result_df.select("prediction").first()).shape == (1, 1000)
    assert result_df.count() == 18
    assert self.check_result(result, 18)
def save(self, path):
    """Save the ML instance to the input path."""
    super(NNModelWriter, self).save(path)
    sc = init_nncontext()
    # change class name in metadata to python class name
    metadata_path = os.path.join(path, "metadata")
    metadataStr = sc.textFile(metadata_path, 1).first()
    metadata = json.loads(metadataStr)
    py_type = metadata['class'].replace("com.intel.analytics.zoo", "zoo")
    metadata['class'] = py_type
    metadata_json = json.dumps(metadata, separators=[',', ':'])
    # replace old metadata
    temp_dir = tempfile.mkdtemp()
    temp_meta_path = os.path.join(temp_dir, "metadata")
    sc.parallelize([metadata_json], 1).saveAsTextFile(temp_meta_path)
    for file in os.listdir(temp_meta_path):
        put_local_file_to_remote(os.path.join(temp_meta_path, file),
                                 os.path.join(metadata_path, file), True)
    import shutil
    shutil.rmtree(temp_dir)
def predict(model_path, img_path):
    model = InferenceModel()
    model.load_openvino(model_path,
                        weight_path=model_path[:model_path.rindex(".")] + ".bin",
                        batch_size=BATCH_SIZE)
    sc = init_nncontext("OpenVINO Python resnet_v1_50 Inference Example")
    # pre-processing
    infer_transformer = ChainedPreprocessing([ImageBytesToMat(),
                                              ImageResize(256, 256),
                                              ImageCenterCrop(224, 224),
                                              ImageMatToTensor(format="NHWC", to_RGB=True)])
    image_set = ImageSet.read(img_path, sc).\
        transform(infer_transformer).get_image().collect()
    image_set = np.expand_dims(image_set, axis=1)

    for i in range(len(image_set) // BATCH_SIZE + 1):
        index = i * BATCH_SIZE
        # check whether out of index
        if index >= len(image_set):
            break
        batch = image_set[index]
        # stack up to BATCH_SIZE images into one batch
        for j in range(index + 1, min(index + BATCH_SIZE, len(image_set))):
            batch = np.vstack((batch, image_set[j]))
        batch = np.expand_dims(batch, axis=0)
        # predict batch
        predictions = model.predict(batch)
        result = predictions[0]

        # post-processing for Top-1
        print("batch_" + str(i))
        for r in result:
            output = {}
            max_index = np.argmax(r)
            output["Top-1"] = str(max_index)
            print("* Predict result " + str(output))
    print("finished...")
    sc.stop()
def test_openvino_predict_xshards(self):
    self.load_resnet()
    input_data_list = [np.array([self.input] * 4),
                       np.concatenate([np.array([self.input] * 2),
                                       np.zeros([1, 3, 224, 224])])]
    sc = init_nncontext()
    rdd = sc.parallelize(input_data_list, numSlices=2)
    shards = SparkXShards(rdd)

    def pre_processing(images):
        return {"x": images}

    shards = shards.transform_shard(pre_processing)
    result = self.est.predict(shards)
    result_c = result.collect()
    assert isinstance(result, SparkXShards)
    assert result_c[0]["prediction"].shape == (4, 1000)
    assert result_c[1]["prediction"].shape == (3, 1000)
    assert self.check_result(result_c[0]["prediction"], 4)
    assert self.check_result(result_c[1]["prediction"], 2)
    assert not self.check_result(result_c[1]["prediction"][2:], 1)
def read_parquet(file_path, columns=None, schema=None, **options):
    """
    Read parquet files to SparkXShards of pandas DataFrames.

    :param file_path: Parquet file path, a list of multiple parquet file paths, or a
           directory containing parquet files. Local file system, HDFS, and AWS S3 are
           supported.
    :param columns: list of column names, default=None. If not None, only these columns
           will be read from the file.
    :param schema: pyspark.sql.types.StructType for the input schema or a DDL-formatted
           string (For example col0 INT, col1 DOUBLE).
    :param options: other options for reading parquet.
    :return: An instance of SparkXShards.
    """
    sc = init_nncontext()
    spark = OrcaContext.get_spark_session()
    # df = spark.read.parquet(file_path)
    df = spark.read.load(file_path, "parquet", schema=schema, **options)
    if columns:
        df = df.select(*columns)

    def to_pandas(columns):
        def f(iter):
            import pandas as pd
            data = list(iter)
            pd_df = pd.DataFrame(data, columns=columns)
            return [pd_df]

        return f

    pd_rdd = df.rdd.mapPartitions(to_pandas(df.columns))
    try:
        data_shards = SparkXShards(pd_rdd)
    except Exception as e:
        print("An error occurred when reading parquet files")
        raise e
    return data_shards
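A minimal usage sketch, assuming an Orca Spark session is available via OrcaContext; the parquet path and column names below are placeholders:

from bigdl.orca import OrcaContext

spark = OrcaContext.get_spark_session()
# write a toy parquet file first (path is illustrative)
spark.createDataFrame([(1, 10.0), (2, 20.0)], ["user", "rating"]) \
    .write.mode("overwrite").parquet("/tmp/sample_parquet")

shards = read_parquet("/tmp/sample_parquet", columns=["user", "rating"])
print(shards.rdd.first().head())   # each shard is a pandas DataFrame with the selected columns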
def read_df(esConfig, esResource, schema=None):
    """
    Read the data from elastic search into a Spark DataFrame.

    :param esConfig: Dictionary which represents configuration for
           elastic search (eg. ip, port etc).
    :param esResource: resource file in elastic search.
    :param schema: Optional. Defines the schema of the Spark DataFrame. If each column
           in ES holds a single value, the schema does not need to be set.
    :return: Spark DataFrame. Each row represents a document in ES.
    """
    sc = init_nncontext()
    spark = OrcaContext.get_spark_session()

    reader = spark.read.format("org.elasticsearch.spark.sql")
    for key in esConfig:
        reader.option(key, esConfig[key])
    if schema:
        reader.schema(schema)

    df = reader.load(esResource)
    return df
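A minimal usage sketch with an explicit schema, assuming a reachable Elasticsearch cluster; the connection settings, index name, and field names are placeholders:

from pyspark.sql.types import StructType, StructField, StringType, IntegerType

es_conf = {"es.nodes": "localhost", "es.port": "9200"}            # placeholder connection info
schema = StructType([StructField("title", StringType(), True),
                     StructField("year", IntegerType(), True)])
df = read_df(es_conf, "movies", schema=schema)                    # "movies" is a hypothetical index
df.show(5)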
def test_multiple_inputs_model(self):
    sc = init_nncontext()
    rdd = sc.parallelize(range(100))
    from pyspark.sql import SparkSession
    spark = SparkSession(sc)
    df = rdd.map(lambda x: ([float(x)] * 25, [float(x)] * 25,
                            [int(np.random.randint(0, 2, size=()))])).toDF(["f1", "f2", "label"])

    estimator = get_estimator(workers_per_node=2,
                              model_fn=lambda config: MultiInputNet())
    estimator.fit(df, batch_size=4, epochs=2,
                  feature_cols=["f1", "f2"],
                  label_cols=["label"])
    estimator.evaluate(df, batch_size=4,
                       feature_cols=["f1", "f2"],
                       label_cols=["label"])
    result = estimator.predict(df, batch_size=4,
                               feature_cols=["f1", "f2"])
    result.collect()
def main(max_epoch):
    args = parser.parse_args()
    cluster_mode = args.cluster_mode
    if cluster_mode.startswith("yarn"):
        hadoop_conf = os.environ.get("HADOOP_CONF_DIR")
        assert hadoop_conf, "Directory path to hadoop conf not found for yarn-client mode. Please " \
                            "set the environment variable HADOOP_CONF_DIR"
        spark_conf = create_spark_conf().set("spark.executor.memory", "5g") \
            .set("spark.executor.cores", 2) \
            .set("spark.executor.instances", 2) \
            .set("spark.driver.memory", "2g")
        if cluster_mode == "yarn-client":
            _ = init_nncontext(spark_conf, cluster_mode="yarn-client", hadoop_conf=hadoop_conf)
        else:
            _ = init_nncontext(spark_conf, cluster_mode="yarn-cluster", hadoop_conf=hadoop_conf)
    else:
        _ = init_nncontext()

    (training_images_data, training_labels_data) = mnist.read_data_sets("/tmp/mnist", "train")
    (testing_images_data, testing_labels_data) = mnist.read_data_sets("/tmp/mnist", "test")

    training_images_data = (training_images_data - mnist.TRAIN_MEAN) / mnist.TRAIN_STD
    testing_images_data = (testing_images_data - mnist.TRAIN_MEAN) / mnist.TRAIN_STD

    model = tf.keras.Sequential([
        tf.keras.layers.Flatten(input_shape=(28, 28, 1)),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(10, activation='softmax'),
    ])

    model.compile(optimizer='rmsprop',
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])

    keras_model = KerasModel(model)

    keras_model.fit(training_images_data,
                    training_labels_data,
                    validation_data=(testing_images_data, testing_labels_data),
                    epochs=max_epoch,
                    batch_size=320,
                    distributed=True)

    result = keras_model.evaluate(testing_images_data, testing_labels_data,
                                  distributed=True, batch_per_thread=80)
    print(result)
    # >> [0.08865142822265625, 0.9722]

    # the following assert is used for internal testing
    assert result['acc Top1Accuracy'] > 0.95

    keras_model.save_weights("/tmp/mnist_keras.h5")
def main(option):
    batch_size = 16 if not option.batch_size else int(option.batch_size)
    cluster_mode = option.cluster_mode
    if cluster_mode.startswith("yarn"):
        hadoop_conf = os.environ.get("HADOOP_CONF_DIR")
        assert hadoop_conf, "Directory path to hadoop conf not found for yarn-client mode. Please " \
                            "set the environment variable HADOOP_CONF_DIR"
        spark_conf = create_spark_conf().set("spark.executor.memory", "5g") \
            .set("spark.executor.cores", 2) \
            .set("spark.executor.instances", 2) \
            .set("spark.driver.memory", "2g")
        if cluster_mode == "yarn-client":
            sc = init_nncontext(spark_conf, cluster_mode="yarn-client", hadoop_conf=hadoop_conf)
        else:
            sc = init_nncontext(spark_conf, cluster_mode="yarn-cluster", hadoop_conf=hadoop_conf)
    else:
        sc = init_nncontext()

    def input_fn(mode, params):
        if mode == tf.estimator.ModeKeys.TRAIN:
            image_set = ImageSet.read(params["image_path"],
                                      sc=sc, with_label=True, one_based_label=False)
            train_transformer = ChainedPreprocessing([
                ImageBytesToMat(),
                ImageResize(256, 256),
                ImageRandomCrop(224, 224),
                ImageRandomPreprocessing(ImageHFlip(), 0.5),
                ImageChannelNormalize(0.485, 0.456, 0.406, 0.229, 0.224, 0.225),
                ImageMatToTensor(to_RGB=True, format="NHWC"),
                ImageSetToSample(input_keys=["imageTensor"], target_keys=["label"])
            ])
            feature_set = FeatureSet.image_frame(image_set.to_image_frame())
            feature_set = feature_set.transform(train_transformer)
            feature_set = feature_set.transform(ImageFeatureToSample())
            dataset = TFDataset.from_feature_set(feature_set,
                                                 features=(tf.float32, [224, 224, 3]),
                                                 labels=(tf.int32, [1]),
                                                 batch_size=batch_size)
        else:
            raise NotImplementedError
        return dataset

    def model_fn(features, labels, mode, params):
        from nets import inception
        slim = tf.contrib.slim
        labels = tf.squeeze(labels, axis=1)
        with slim.arg_scope(inception.inception_v1_arg_scope()):
            logits, end_points = inception.inception_v1(features,
                                                        num_classes=int(params["num_classes"]),
                                                        is_training=True)

        if mode == tf.estimator.ModeKeys.TRAIN:
            loss = tf.reduce_mean(
                tf.losses.sparse_softmax_cross_entropy(logits=logits, labels=labels))
            train_op = ZooOptimizer(tf.train.AdamOptimizer()).minimize(loss)
            return tf.estimator.EstimatorSpec(mode, train_op=train_op,
                                              predictions=logits, loss=loss)
        else:
            raise NotImplementedError

    estimator = TFEstimator.from_model_fn(model_fn,
                                          params={"image_path": option.image_path,
                                                  "num_classes": option.num_classes,
                                                  "batch_size": option.batch_size})

    estimator.train(input_fn, steps=100)
    print("finished...")
    sc.stop()
def predict(self, data, feature_cols=None, batch_size=4):
    """
    Predict input data.

    :param batch_size: Int. Set batch size, default is 4.
    :param data: data to be predicted. XShards, Spark DataFrame, numpy array and list of
           numpy arrays are supported. If data is XShards, each partition is a dictionary
           of {'x': feature}, where feature(label) is a numpy array or a list of numpy
           arrays.
    :param feature_cols: Feature column name(s) of data. Only used when data is a Spark
           DataFrame. Default: None.
    :return: predicted result.
             If the input data is XShards, the predict result is an XShards, and each
             partition of the XShards is a dictionary of {'prediction': result}, where the
             result is a numpy array or a list of numpy arrays.
             If the input data is numpy arrays or a list of numpy arrays, the predict
             result is a numpy array or a list of numpy arrays.
    """
    sc = init_nncontext()
    model_bytes_broadcast = sc.broadcast(self.model_bytes)
    weight_bytes_broadcast = sc.broadcast(self.weight_bytes)

    def partition_inference(partition):
        model_bytes = model_bytes_broadcast.value
        weight_bytes = weight_bytes_broadcast.value
        partition = list(partition)
        data_num = len(partition)
        ie = IECore()
        config = {'CPU_THREADS_NUM': str(self.core_num)}
        ie.set_config(config, 'CPU')
        net = ie.read_network(model=model_bytes, weights=weight_bytes, init_from_buffer=True)
        net.batch_size = batch_size
        local_model = ie.load_network(network=net, device_name="CPU", num_requests=data_num)
        inputs = list(iter(local_model.requests[0].input_blobs))
        outputs = list(iter(local_model.requests[0].output_blobs))
        assert len(outputs) != 0, "The number of model outputs should not be 0."

        def add_elem(d):
            d_len = len(d)
            if d_len < batch_size:
                rep_time = [1] * (d_len - 1)
                rep_time.append(batch_size - d_len + 1)
                return np.repeat(d, rep_time, axis=0), d_len
            else:
                return d, d_len

        results = []
        for idx, batch_data in enumerate(partition):
            infer_request = local_model.requests[idx]
            input_dict = dict()
            elem_num = 0
            if isinstance(batch_data, list):
                for i, input in enumerate(inputs):
                    input_dict[input], elem_num = add_elem(batch_data[i])
            else:
                input_dict[inputs[0]], elem_num = add_elem(batch_data)
            infer_request.infer(input_dict)
            if len(outputs) == 1:
                results.append(infer_request.output_blobs[outputs[0]].buffer[:elem_num])
            else:
                results.append(list(map(lambda output:
                                        infer_request.output_blobs[output].buffer[:elem_num],
                                        outputs)))

        return results

    def predict_transform(dict_data, batch_size):
        assert isinstance(dict_data, dict), "each shard should be an dict"
        assert "x" in dict_data, "key x should in each shard"
        feature_data = dict_data["x"]
        if isinstance(feature_data, np.ndarray):
            assert feature_data.shape[0] <= batch_size, \
                "The batch size of input data (the second dim) should be less than the model " \
                "batch size, otherwise some inputs will be ignored."
        elif isinstance(feature_data, list):
            for elem in feature_data:
                assert isinstance(elem, np.ndarray), "Each element in the x list should be " \
                                                     "a ndarray, but get " + \
                                                     elem.__class__.__name__
                assert elem.shape[0] <= batch_size, "The batch size of each input data (the " \
                                                    "second dim) should be less than the " \
                                                    "model batch size, otherwise some inputs " \
                                                    "will be ignored."
        else:
            raise ValueError("x in each shard should be a ndarray or a list of ndarray.")
        return feature_data

    if isinstance(data, DataFrame):
        from bigdl.orca.learn.utils import dataframe_to_xshards, convert_predict_rdd_to_dataframe
        xshards, _ = dataframe_to_xshards(data,
                                          validation_data=None,
                                          feature_cols=feature_cols,
                                          label_cols=None,
                                          mode="predict")
        transformed_data = xshards.transform_shard(predict_transform, batch_size)
        result_rdd = transformed_data.rdd.mapPartitions(lambda iter: partition_inference(iter))
        return convert_predict_rdd_to_dataframe(data, result_rdd.flatMap(lambda data: data))
    elif isinstance(data, SparkXShards):
        transformed_data = data.transform_shard(predict_transform, batch_size)
        result_rdd = transformed_data.rdd.mapPartitions(lambda iter: partition_inference(iter))

        def update_result_shard(data):
            shard, y = data
            shard["prediction"] = y
            return shard

        return SparkXShards(data.rdd.zip(result_rdd).map(update_result_shard))
    elif isinstance(data, (np.ndarray, list)):
        if isinstance(data, np.ndarray):
            split_num = math.ceil(len(data) / batch_size)
            arrays = np.array_split(data, split_num)
            num_slices = min(split_num, self.node_num)
            data_rdd = sc.parallelize(arrays, numSlices=num_slices)
        elif isinstance(data, list):
            flattened = nest.flatten(data)
            data_length = len(flattened[0])
            data_to_be_rdd = []
            split_num = math.ceil(flattened[0].shape[0] / batch_size)
            num_slices = min(split_num, self.node_num)
            for i in range(split_num):
                data_to_be_rdd.append([])
            for x in flattened:
                assert isinstance(x, np.ndarray), "the data in the data list should be " \
                                                  "ndarrays, but get " + \
                                                  x.__class__.__name__
                assert len(x) == data_length, \
                    "the ndarrays in data must all have the same size in first dimension" \
                    ", got first ndarray of size {} and another {}".format(data_length, len(x))
                x_parts = np.array_split(x, split_num)
                for idx, x_part in enumerate(x_parts):
                    data_to_be_rdd[idx].append(x_part)

            data_to_be_rdd = [nest.pack_sequence_as(data, shard) for shard in data_to_be_rdd]
            data_rdd = sc.parallelize(data_to_be_rdd, numSlices=num_slices)

        print("Partition number: ", data_rdd.getNumPartitions())
        result_rdd = data_rdd.mapPartitions(lambda iter: partition_inference(iter))
        result_arr_list = result_rdd.collect()
        result_arr = None
        if isinstance(result_arr_list[0], list):
            result_arr = [np.concatenate([r[i] for r in result_arr_list], axis=0)
                          for i in range(len(result_arr_list[0]))]
        elif isinstance(result_arr_list[0], np.ndarray):
            result_arr = np.concatenate(result_arr_list, axis=0)
        return result_arr
    else:
        raise ValueError("Only XShards, Spark DataFrame, a numpy array and a list of numpy arr"
                         "ays are supported as input data, but get " + data.__class__.__name__)
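A minimal usage sketch of this predict method on a plain numpy batch; it assumes the estimator is the Orca OpenVINO Estimator and that it is constructed via Estimator.from_openvino (that constructor and the model path below are assumptions, not shown in this section):

import numpy as np
from bigdl.orca.learn.openvino import Estimator   # assumption: the class providing predict above

est = Estimator.from_openvino(model_path="/tmp/resnet_v1_50.xml")   # hypothetical OpenVINO IR path
batch = np.random.rand(4, 3, 224, 224).astype(np.float32)           # NCHW batch, rows <= batch_size
preds = est.predict(batch, batch_size=4)
print(preds.shape)   # e.g. (4, 1000) for an ImageNet classifier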