def main(data_num):
    data_path = '/tmp/mnist' if not args.data_path else args.data_path

    cluster_mode = args.cluster_mode
    if cluster_mode.startswith("yarn"):
        hadoop_conf = os.environ.get("HADOOP_CONF_DIR")
        assert hadoop_conf, "Directory path to hadoop conf not found for yarn-client mode. Please " \
                            "set the environment variable HADOOP_CONF_DIR"
        spark_conf = create_spark_conf().set("spark.executor.memory", "5g") \
            .set("spark.executor.cores", 2) \
            .set("spark.executor.instances", 2) \
            .set("spark.executorEnv.HTTP_PROXY", "http://child-prc.intel.com:913") \
            .set("spark.executorEnv.HTTPS_PROXY", "http://child-prc.intel.com:913") \
            .set("spark.driver.memory", "2g")
        if cluster_mode == "yarn-client":
            sc = init_nncontext(spark_conf, cluster_mode="yarn-client", hadoop_conf=hadoop_conf)
        else:
            sc = init_nncontext(spark_conf, cluster_mode="yarn-cluster", hadoop_conf=hadoop_conf)
    else:
        sc = init_nncontext()

    # get data, pre-process and create TFDataset
    (images_data, labels_data) = mnist.read_data_sets(data_path, "test")
    images_data = (images_data[:data_num] - mnist.TRAIN_MEAN) / mnist.TRAIN_STD
    labels_data = labels_data[:data_num].astype(np.int32)
    dataset = TFDataset.from_ndarrays((images_data, labels_data), batch_per_thread=20)

    # construct the model from TFDataset
    images, labels = dataset.tensors
    labels = tf.squeeze(labels)
    with slim.arg_scope(lenet.lenet_arg_scope()):
        logits, end_points = lenet.lenet(images, num_classes=10, is_training=False)

    predictions = tf.to_int32(tf.argmax(logits, axis=1))
    correct = tf.expand_dims(tf.to_int32(tf.equal(predictions, labels)), axis=1)

    saver = tf.train.Saver()

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        saver.restore(sess, "/tmp/lenet/model")
        predictor = TFPredictor(sess, [correct])
        accuracy = predictor.predict().mean()
        print("predict accuracy is %s" % accuracy)
def get_data_rdd(dataset, sc):
    data_path = args.data_path
    from bigdl.dllib.feature.dataset import mnist
    (images_data, labels_data) = mnist.read_data_sets(data_path, dataset)
    image_rdd = sc.parallelize(images_data)
    labels_rdd = sc.parallelize(labels_data)
    rdd = image_rdd.zip(labels_rdd) \
        .map(lambda rec_tuple: ((rec_tuple[0] - mnist.TRAIN_MEAN) / mnist.TRAIN_STD,
                                np.array(rec_tuple[1])))
    return rdd
def get_mnist(data_type="train", location="/tmp/mnist"):
    """
    Get mnist dataset with features and label as ndarray.
    Data would be downloaded automatically if it is not present at the specified location.

    :param data_type: "train" for training data and "test" for testing data.
    :param location: Location to store mnist dataset.
    :return: (features: ndarray, label: ndarray)
    """
    from bigdl.dllib.feature.dataset import mnist
    X, Y = mnist.read_data_sets(location, data_type)
    return X, Y + 1  # The label of ClassNLLCriterion starts from 1 instead of 0
def get_mnist(sc, data_type="train", location="/tmp/mnist"):
    """
    Download or load MNIST dataset to/from the specified path.
    Normalize and transform input data into an RDD of Sample.
    """
    from bigdl.dllib.feature.dataset import mnist
    from bigdl.dllib.feature.dataset.transformer import normalizer
    (images, labels) = mnist.read_data_sets(location, data_type)
    # input_shape is expected to be defined at module level, e.g. the image shape
    images = images.reshape((images.shape[0], ) + input_shape)
    images = sc.parallelize(images)
    labels = sc.parallelize(labels + 1)  # Target start from 1 in BigDL
    record = images.zip(labels) \
        .map(lambda rec_tuple: (normalizer(rec_tuple[0], mnist.TRAIN_MEAN, mnist.TRAIN_STD),
                                rec_tuple[1])) \
        .map(lambda t: Sample.from_ndarray(t[0], t[1]))
    return record
def get_mnist(sc, data_type="train", location="/tmp/mnist"):
    """
    Get mnist dataset and parallelize into RDDs.
    Data would be downloaded automatically if it is not present at the specified location.

    :param sc: SparkContext.
    :param data_type: "train" for training data and "test" for testing data.
    :param location: Location to store mnist dataset.
    :return: RDD of (features: ndarray, label: ndarray).
    """
    from bigdl.dllib.feature.dataset import mnist
    (images, labels) = mnist.read_data_sets(location, data_type)
    images = sc.parallelize(images)
    labels = sc.parallelize(labels + 1)  # Target start from 1 in BigDL
    record = images.zip(labels)
    return record
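# A minimal usage sketch for get_mnist above, not part of the original example:
# it assumes a live SparkContext `sc` and mirrors the normalize-then-Sample
# pattern used by the other helpers in this file. The Sample import path is
# an assumption (bigdl.dllib.utils.common).
from bigdl.dllib.feature.dataset import mnist
from bigdl.dllib.feature.dataset.transformer import normalizer
from bigdl.dllib.utils.common import Sample

train_record = get_mnist(sc, "train", "/tmp/mnist") \
    .map(lambda rec: Sample.from_ndarray(
        normalizer(rec[0], mnist.TRAIN_MEAN, mnist.TRAIN_STD), rec[1]))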
def get_data(dataset):
    from bigdl.dllib.feature.dataset import mnist
    (images_data, labels_data) = mnist.read_data_sets("/tmp/mnist", dataset)
    images_data = (images_data - mnist.TRAIN_MEAN) / mnist.TRAIN_STD
    return (images_data, labels_data.astype(np.int32))
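# A small sketch (assumption: TFDataset is imported at module level, as the main
# functions in this file do) showing how get_data's ndarrays can be wrapped into
# a TFDataset for inference, reusing batch_per_thread=20 from the example above.
test_images, test_labels = get_data("test")
test_dataset = TFDataset.from_ndarrays((test_images, test_labels), batch_per_thread=20)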
def main(max_epoch):
    args = parser.parse_args()
    cluster_mode = args.cluster_mode
    if cluster_mode.startswith("yarn"):
        hadoop_conf = os.environ.get("HADOOP_CONF_DIR")
        assert hadoop_conf, "Directory path to hadoop conf not found for yarn-client mode. Please " \
                            "set the environment variable HADOOP_CONF_DIR"
        spark_conf = create_spark_conf().set("spark.executor.memory", "5g") \
            .set("spark.executor.cores", 2) \
            .set("spark.executor.instances", 2) \
            .set("spark.driver.memory", "2g")
        if cluster_mode == "yarn-client":
            _ = init_nncontext(spark_conf, cluster_mode="yarn-client", hadoop_conf=hadoop_conf)
        else:
            _ = init_nncontext(spark_conf, cluster_mode="yarn-cluster", hadoop_conf=hadoop_conf)
    else:
        _ = init_nncontext()

    (training_images_data, training_labels_data) = mnist.read_data_sets("/tmp/mnist", "train")
    (testing_images_data, testing_labels_data) = mnist.read_data_sets("/tmp/mnist", "test")

    training_images_data = (training_images_data - mnist.TRAIN_MEAN) / mnist.TRAIN_STD
    testing_images_data = (testing_images_data - mnist.TRAIN_MEAN) / mnist.TRAIN_STD

    model = tf.keras.Sequential([
        tf.keras.layers.Flatten(input_shape=(28, 28, 1)),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(10, activation='softmax'),
    ])

    model.compile(optimizer='rmsprop',
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])

    keras_model = KerasModel(model)

    keras_model.fit(training_images_data,
                    training_labels_data,
                    validation_data=(testing_images_data, testing_labels_data),
                    epochs=max_epoch,
                    batch_size=320,
                    distributed=True)

    result = keras_model.evaluate(testing_images_data, testing_labels_data,
                                  distributed=True, batch_per_thread=80)
    print(result)
    # >> [0.08865142822265625, 0.9722]

    # the following assert is used for internal testing
    assert result['acc Top1Accuracy'] > 0.95

    keras_model.save_weights("/tmp/mnist_keras.h5")
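# Hedged sketch: reloading the weights written by main above into an identically
# structured tf.keras model for local inference. This assumes KerasModel.save_weights
# produces standard Keras HDF5 weights at /tmp/mnist_keras.h5.
inference_model = tf.keras.Sequential([
    tf.keras.layers.Flatten(input_shape=(28, 28, 1)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(10, activation='softmax'),
])
inference_model.load_weights("/tmp/mnist_keras.h5")
predictions = inference_model.predict(
    (mnist.read_data_sets("/tmp/mnist", "test")[0] - mnist.TRAIN_MEAN) / mnist.TRAIN_STD)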
def main(max_epoch, data_num):
    args = parser.parse_args()
    cluster_mode = args.cluster_mode
    if cluster_mode.startswith("yarn"):
        hadoop_conf = os.environ.get("HADOOP_CONF_DIR")
        assert hadoop_conf, "Directory path to hadoop conf not found for yarn-client mode. Please " \
                            "set the environment variable HADOOP_CONF_DIR"
        spark_conf = create_spark_conf().set("spark.executor.memory", "5g") \
            .set("spark.executor.cores", 2) \
            .set("spark.executor.instances", 2) \
            .set("spark.driver.memory", "2g")
        if cluster_mode == "yarn-client":
            sc = init_nncontext(spark_conf, cluster_mode="yarn-client", hadoop_conf=hadoop_conf)
        else:
            sc = init_nncontext(spark_conf, cluster_mode="yarn-cluster", hadoop_conf=hadoop_conf)
    else:
        sc = init_nncontext()

    # get data, pre-process and create TFDataset
    (train_images_data, train_labels_data) = mnist.read_data_sets("/tmp/mnist", "train")
    (test_images_data, test_labels_data) = mnist.read_data_sets("/tmp/mnist", "test")

    train_images_data = (train_images_data[:data_num] - mnist.TRAIN_MEAN) / mnist.TRAIN_STD
    train_labels_data = train_labels_data[:data_num].astype(np.int32)
    test_images_data = (test_images_data[:data_num] - mnist.TRAIN_MEAN) / mnist.TRAIN_STD
    test_labels_data = test_labels_data[:data_num].astype(np.int32)
    dataset = TFDataset.from_ndarrays((train_images_data, train_labels_data),
                                      batch_size=360,
                                      val_tensors=(test_images_data, test_labels_data))

    # construct the model from TFDataset
    images, labels = dataset.tensors
    with slim.arg_scope(lenet.lenet_arg_scope()):
        logits, end_points = lenet.lenet(images, num_classes=10, is_training=True)

    loss = tf.reduce_mean(
        tf.losses.sparse_softmax_cross_entropy(logits=logits, labels=labels))

    acc = accuracy(logits, labels)

    # create an optimizer
    optimizer = TFOptimizer.from_loss(loss, Adam(1e-3),
                                      metrics={"acc": acc},
                                      model_dir="/tmp/lenet/")
    # kick off training
    optimizer.optimize(end_trigger=MaxEpoch(max_epoch))

    saver = tf.train.Saver()
    saver.save(optimizer.sess, "/tmp/lenet/model")