Example #1
def main():
    args = parser.parse_args()
    cluster_mode = args.cluster_mode
    if cluster_mode.startswith("yarn"):
        hadoop_conf = os.environ.get("HADOOP_CONF_DIR")
        assert hadoop_conf, "Directory path to hadoop conf not found for yarn mode. Please " \
                "set the environment variable HADOOP_CONF_DIR"
        spark_conf = create_spark_conf().set("spark.executor.memory", "5g") \
            .set("spark.executor.cores", 2) \
            .set("spark.executor.instances", 2) \
            .set("spark.driver.memory", "2g")
        if cluster_mode == "yarn-client":
            sc = init_nncontext(spark_conf, cluster_mode="yarn-client", hadoop_conf=hadoop_conf)
        else:
            sc = init_nncontext(spark_conf, cluster_mode="yarn-cluster", hadoop_conf=hadoop_conf)
    else:
        sc = init_nncontext()

    def model_fn(features, labels, mode):
        from nets import lenet
        slim = tf.contrib.slim
        with slim.arg_scope(lenet.lenet_arg_scope()):
            logits, end_points = lenet.lenet(features, num_classes=10, is_training=True)

        if mode == tf.estimator.ModeKeys.EVAL or mode == tf.estimator.ModeKeys.TRAIN:
            loss = tf.reduce_mean(
                tf.losses.sparse_softmax_cross_entropy(logits=logits, labels=labels))

            optimizer = ZooOptimizer(tf.train.AdamOptimizer())
            train_op = optimizer.minimize(loss)
            return tf.estimator.EstimatorSpec(mode, predictions=logits,
                                              loss=loss, train_op=train_op)
        else:
            return tf.estimator.EstimatorSpec(mode, predictions=logits)

    def input_fn(mode):
        if mode == tf.estimator.ModeKeys.TRAIN:
            training_data = get_data("train")
            dataset = TFDataset.from_ndarrays(training_data, batch_size=320)
        elif mode == tf.estimator.ModeKeys.EVAL:
            testing_data = get_data("test")
            dataset = TFDataset.from_ndarrays(testing_data, batch_per_thread=80)
        else:
            images, _ = get_data("test")
            dataset = TFDataset.from_ndarrays(images, batch_per_thread=80)

        return dataset
    estimator = TFEstimator.from_model_fn(model_fn, model_dir="/tmp/estimator")

    estimator.train(input_fn, steps=10)

    metrics = estimator.evaluate(input_fn, ["acc"])
    print(metrics)

    predictions = estimator.predict(input_fn)

    print(predictions.first())
    print("finished...")
    sc.stop()
Example #2
def main(data_num):

    data_path = '/tmp/mnist' if not args.data_path else args.data_path
    cluster_mode = args.cluster_mode
    if cluster_mode.startswith("yarn"):
        hadoop_conf = os.environ.get("HADOOP_CONF_DIR")
        assert hadoop_conf, "Directory path to hadoop conf not found for yarn mode. Please " \
                "set the environment variable HADOOP_CONF_DIR"
        spark_conf = create_spark_conf().set("spark.executor.memory", "5g") \
            .set("spark.executor.cores", 2) \
            .set("spark.executor.instances", 2) \
            .set("spark.executorEnv.HTTP_PROXY", "http://child-prc.intel.com:913") \
            .set("spark.executorEnv.HTTPS_PROXY", "http://child-prc.intel.com:913") \
            .set("spark.driver.memory", "2g")
        if cluster_mode == "yarn-client":
            sc = init_nncontext(spark_conf,
                                cluster_mode="yarn-client",
                                hadoop_conf=hadoop_conf)
        else:
            sc = init_nncontext(spark_conf,
                                cluster_mode="yarn-cluster",
                                hadoop_conf=hadoop_conf)
    else:
        sc = init_nncontext()

    # get data, pre-process and create TFDataset
    (images_data, labels_data) = mnist.read_data_sets(data_path, "test")
    images_data = (images_data[:data_num] - mnist.TRAIN_MEAN) / mnist.TRAIN_STD
    labels_data = labels_data[:data_num].astype(np.int32)
    dataset = TFDataset.from_ndarrays((images_data, labels_data),
                                      batch_per_thread=20)

    # construct the model from TFDataset
    images, labels = dataset.tensors

    labels = tf.squeeze(labels)

    with slim.arg_scope(lenet.lenet_arg_scope()):
        logits, end_points = lenet.lenet(images,
                                         num_classes=10,
                                         is_training=False)

    predictions = tf.to_int32(tf.argmax(logits, axis=1))
    correct = tf.expand_dims(tf.to_int32(tf.equal(predictions, labels)),
                             axis=1)

    saver = tf.train.Saver()

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        saver.restore(sess, "/tmp/lenet/model")

        predictor = TFPredictor(sess, [correct])

        accuracy = predictor.predict().mean()

        print("predict accuracy is %s" % accuracy)
Example #3
 def setup_method(self, method):
     """ setup any state tied to the execution of the given method in a
     class.  setup_method is invoked for every test method of a class.
     """
     sparkConf = init_spark_conf().setMaster("local[1]").setAppName(
         "testEstimator")
     self.sc = init_nncontext(sparkConf)
Example #4
    def test_dataframe_shard_size(self):
        from bigdl.orca import OrcaContext
        OrcaContext._shard_size = 3
        sc = init_nncontext()
        rdd = sc.range(0, 10)
        from pyspark.sql import SparkSession
        spark = SparkSession(sc)
        from pyspark.ml.linalg import DenseVector
        df = rdd.map(lambda x:
                     (DenseVector(np.random.randn(1, ).astype(np.float)),
                      int(np.random.randint(0, 1, size=())))).toDF(
                          ["feature", "label"])

        config = {"lr": 0.8}
        trainer = Estimator.from_keras(model_creator=model_creator,
                                       verbose=True,
                                       config=config,
                                       workers_per_node=2)

        trainer.fit(df,
                    epochs=1,
                    batch_size=4,
                    steps_per_epoch=25,
                    feature_cols=["feature"],
                    label_cols=["label"])
        trainer.evaluate(df,
                         batch_size=4,
                         num_steps=25,
                         feature_cols=["feature"],
                         label_cols=["label"])
        trainer.predict(df, feature_cols=["feature"]).collect()
        OrcaContext._shard_size = None
Example #5
    def test_estimator_graph_dataframe(self):
        tf.reset_default_graph()

        model = SimpleModel()
        file_path = os.path.join(resource_path, "orca/learn/ncf.csv")
        sc = init_nncontext()
        sqlcontext = SQLContext(sc)
        df = sqlcontext.read.csv(file_path, header=True, inferSchema=True)

        est = Estimator.from_graph(inputs=[model.user, model.item],
                                   labels=[model.label],
                                   outputs=[model.logits],
                                   loss=model.loss,
                                   optimizer=tf.train.AdamOptimizer(),
                                   metrics={"loss": model.loss})

        est.fit(data=df,
                batch_size=8,
                epochs=10,
                feature_cols=['user', 'item'],
                label_cols=['label'],
                validation_data=df)

        result = est.evaluate(df,
                              batch_size=4,
                              feature_cols=['user', 'item'],
                              label_cols=['label'])
        print(result)

        prediction_df = est.predict(df,
                                    batch_size=4,
                                    feature_cols=['user', 'item'])
        assert 'prediction' in prediction_df.columns
        predictions = prediction_df.collect()
        assert len(predictions) == 48
Example #6
    def test_partition_num_less_than_workers(self):
        sc = init_nncontext()
        rdd = sc.range(200, numSlices=1)
        assert rdd.getNumPartitions() == 1
        from pyspark.sql import SparkSession
        spark = SparkSession(sc)
        from pyspark.ml.linalg import DenseVector
        df = rdd.map(lambda x:
                     (DenseVector(np.random.randn(1, ).astype(np.float)),
                      int(np.random.randint(0, 1, size=())))).toDF(
                          ["feature", "label"])

        config = {"lr": 0.8}
        trainer = Estimator.from_keras(model_creator=model_creator,
                                       verbose=True,
                                       config=config,
                                       workers_per_node=2)
        assert df.rdd.getNumPartitions() < trainer.num_workers

        trainer.fit(df,
                    epochs=1,
                    batch_size=4,
                    steps_per_epoch=25,
                    validation_data=df,
                    validation_steps=1,
                    feature_cols=["feature"],
                    label_cols=["label"])
        trainer.evaluate(df,
                         batch_size=4,
                         num_steps=25,
                         feature_cols=["feature"],
                         label_cols=["label"])
        trainer.predict(df, feature_cols=["feature"]).collect()
Example #7
    def test_xshards_predict_save_load(self):

        sc = init_nncontext()
        rdd = sc.range(0, 110).map(lambda x: np.array([x] * 50))
        shards = rdd.mapPartitions(lambda iter: chunks(iter, 5)).map(
            lambda x: {"x": np.stack(x)})
        shards = SparkXShards(shards)

        estimator = get_estimator(workers_per_node=2,
                                  model_fn=lambda config: IdentityNet())
        result_shards = estimator.predict(shards, batch_size=4)
        result_before = np.concatenate(
            [shard["prediction"] for shard in result_shards.collect()])
        expected_result = np.concatenate(
            [shard["x"] for shard in result_shards.collect()])
        assert np.array_equal(result_before, expected_result)

        path = "/tmp/model.pth"
        try:
            estimator.save(path)
            estimator.load(path)
            result_shards = estimator.predict(shards, batch_size=4)
            result_after = np.concatenate(
                [shard["prediction"] for shard in result_shards.collect()])

        finally:
            os.remove(path)

        assert np.array_equal(result_before, result_after)
Example #8
    def read_rdd(esConfig, esResource=None, filter=None, esQuery=None):
        """
        Read the data from elastic search into Spark RDD.

        :param esConfig: Dictionary which represents the configuration for
               Elasticsearch (e.g. ip, port, es query, etc.).
        :param esResource: Optional. Resource file in Elasticsearch.
               It can also be set in esConfig.
        :param filter: Optional. Request only those fields from Elasticsearch
        :param esQuery: Optional. es query
        :return: Spark RDD
        """
        sc = init_nncontext()
        if "es.resource" not in esConfig:
            esConfig["es.resource"] = esResource
        if filter is not None:
            esConfig["es.read.source.filter"] = filter
        if esQuery is not None:
            esConfig["es.query"] = esQuery
        rdd = sc.newAPIHadoopRDD(
            "org.elasticsearch.hadoop.mr.EsInputFormat",
            "org.apache.hadoop.io.NullWritable",
            "org.elasticsearch.hadoop.mr.LinkedMapWritable",
            conf=esConfig)
        return rdd
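
A minimal usage sketch for read_rdd, assuming the function is accessible in scope and that a local Elasticsearch instance and an index named "orca_demo" exist (both hypothetical); "es.nodes" and "es.port" are standard elasticsearch-hadoop settings.

# Hypothetical usage sketch for read_rdd; host, port and index name are placeholders.
es_config = {
    "es.nodes": "localhost",   # assumed Elasticsearch host
    "es.port": "9200",         # assumed Elasticsearch port
}
rdd = read_rdd(es_config, esResource="orca_demo")
# Each element is a (key, document) pair as returned by EsInputFormat.
print(rdd.take(2))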
Example #9
    def partition(data, num_shards=None):
        """

        Partition local in-memory data and form a SparkXShards.

        :param data: np.ndarray, a tuple, list, dict of np.ndarray, or a nested structure
        made of tuple, list, dict with ndarray as the leaf value
        :param num_shards: the number of shards that the data will be partitioned into
        :return: a SparkXShards
        """
        sc = init_nncontext()
        node_num, core_num = get_node_and_core_number()
        shard_num = node_num * core_num if num_shards is None else num_shards
        import numpy as np
        type_err_msg = """
The types supported in bigdl.orca.data.XShards.partition are
1. np.ndarray
2. a tuple, list, dict of np.ndarray
3. nested structure made of tuple, list, dict with ndarray as the leaf value

But got data of type {}
        """.format(type(data))
        supported_types = {list, tuple, dict}
        if isinstance(data, np.ndarray):
            if data.shape[0] < shard_num:
                raise ValueError(
                    "The length of data {} is smaller than the total number "
                    "of shards {}. Please adjust the num_shards option to be "
                    "at most {}.".format(data.shape[0], shard_num,
                                         data.shape[0]))
            arrays = np.array_split(data, shard_num)
            rdd = sc.parallelize(arrays)
        else:
            assert type(data) in supported_types, type_err_msg
            flattened = nest.flatten(data)
            data_length = len(flattened[0])
            data_to_be_shard = []
            if data_length < shard_num:
                raise ValueError(
                    "The length of data {} is smaller than the total number "
                    "of shards {}. Please adjust the num_shards option to be "
                    "at most {}.".format(data_length, shard_num, data_length))
            for i in range(shard_num):
                data_to_be_shard.append([])
            for x in flattened:
                assert len(x) == data_length, \
                    "the ndarrays in data must all have the same size in first dimension, " \
                    "got first ndarray of size {} and another {}".format(data_length, len(x))
                x_parts = np.array_split(x, shard_num)
                for idx, x_part in enumerate(x_parts):
                    data_to_be_shard[idx].append(x_part)

            data_to_be_shard = [
                nest.pack_sequence_as(data, shard)
                for shard in data_to_be_shard
            ]
            rdd = sc.parallelize(data_to_be_shard)

        data_shards = SparkXShards(rdd)
        return data_shards
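
A small usage sketch for this method, assuming it is exposed as bigdl.orca.data.XShards.partition (the import path is taken from the error message above); the array shapes are illustrative.

import numpy as np
from bigdl.orca.data import XShards   # assumed import path, per the error message above

# Hypothetical usage: split a dict of ndarrays into 4 shards along the first dimension.
data = {"x": np.random.randn(100, 10), "y": np.random.randint(0, 2, size=(100,))}
shards = XShards.partition(data, num_shards=4)
# Each element of the underlying RDD keeps the same nested structure,
# with roughly 25 rows per ndarray.
print(shards.rdd.first()["x"].shape)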
Example #10
    def test_not_sync_stats(self):
        sc = init_nncontext()
        rdd = sc.range(0, 100).repartition(2)

        # the data and model are constructed that loss on worker 0 is always 0.0
        # and loss on worker 1 is always 1.0

        df = rdd.mapPartitionsWithIndex(
            lambda idx, iter: [([float(idx)], [0.0]) for _ in iter]).toDF(
                ["feature", "label"])

        estimator = get_estimator(workers_per_node=2,
                                  model_fn=lambda config: LinearModel(),
                                  loss=nn.MSELoss(),
                                  optimizer=get_zero_optimizer,
                                  sync_stats=False)
        stats = estimator.fit(df,
                              batch_size=4,
                              epochs=2,
                              feature_cols=["feature"],
                              label_cols=["label"],
                              reduce_results=False)
        worker_0_stats, worker_1_stats = stats[0]
        train_loss_0 = worker_0_stats["train_loss"]
        train_loss_1 = worker_1_stats["train_loss"]
        error_msg = f"stats from all workers should not be the same, " \
                    f"but got worker_0_stats: {worker_0_stats}, worker_1_stats: {worker_1_stats}"
        assert abs(train_loss_0 - train_loss_1) > 0.9, error_msg
Example #11
    def test_num_part_data_diff_val_data(self):
        sc = init_nncontext()
        rdd = sc.range(200, numSlices=10)
        val_rdd = sc.range(60, numSlices=8)
        from pyspark.sql import SparkSession
        spark = SparkSession(sc)
        from pyspark.ml.linalg import DenseVector
        df = rdd.map(lambda x:
                     (DenseVector(np.random.randn(1, ).astype(np.float)),
                      int(np.random.randint(0, 1, size=())))).toDF(
                          ["feature", "label"])
        val_df = val_rdd.map(lambda x: (DenseVector(np.random.randn(1,).astype(np.float)),
                                        int(np.random.randint(0, 1, size=()))))\
            .toDF(["feature", "label"])

        config = {"lr": 0.8}
        trainer = Estimator.from_keras(model_creator=model_creator,
                                       verbose=True,
                                       config=config,
                                       workers_per_node=2)
        assert df.rdd.getNumPartitions() > trainer.num_workers
        assert df.rdd.getNumPartitions() != val_df.rdd.getNumPartitions()

        trainer.fit(df,
                    epochs=1,
                    batch_size=4,
                    steps_per_epoch=25,
                    validation_data=val_df,
                    validation_steps=1,
                    feature_cols=["feature"],
                    label_cols=["label"])
Example #12
    def load_pickle(cls, path, minPartitions=None):
        """

        Load XShards from pickle files.

        :param path: The pickle file path/directory
        :param minPartitions: The minimum partitions for the XShards
        :return: SparkXShards object
        """
        sc = init_nncontext()
        return SparkXShards(sc.pickleFile(path, minPartitions))
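
A hedged usage sketch, assuming the enclosing class is SparkXShards (or its XShards parent) and that the path below points to a directory previously written with rdd.saveAsPickleFile(...).

# Hypothetical usage: load previously saved pickle files back into shards.
shards = SparkXShards.load_pickle("/tmp/xshards_pickle", minPartitions=4)  # path is a placeholder
print(shards.rdd.getNumPartitions())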
Example #13
    def test_dataframe_predict(self):

        sc = init_nncontext()
        rdd = sc.parallelize(range(20))
        df = rdd.map(lambda x: ([float(x)] * 5,
                                [int(np.random.randint(0, 2, size=()))])).toDF(
                                    ["feature", "label"])

        estimator = get_estimator(workers_per_node=2,
                                  model_fn=lambda config: IdentityNet())
        result = estimator.predict(df, batch_size=4, feature_cols=["feature"])
        expr = "sum(cast(feature <> to_array(prediction) as int)) as error"
        assert result.selectExpr(expr).first()["error"] == 0
Example #14
    def test_estimator_graph_dataframe_exception(self):
        tf.reset_default_graph()

        model = SimpleModel()
        file_path = os.path.join(resource_path, "orca/learn/ncf.csv")
        sc = init_nncontext()
        sqlcontext = SQLContext(sc)
        df = sqlcontext.read.csv(file_path, header=True, inferSchema=True)

        est = Estimator.from_graph(inputs=[model.user, model.item],
                                   labels=[model.label],
                                   outputs=[model.logits],
                                   loss=model.loss,
                                   optimizer=tf.train.AdamOptimizer(),
                                   metrics={"loss": model.loss})

        with self.assertRaises(Exception) as context:
            est.fit(data=df,
                    batch_size=8,
                    epochs=10,
                    feature_cols=['user', 'item'],
                    validation_data=df)
        self.assertTrue(
            'label columns is None; it should not be None in training' in str(
                context.exception))

        est.fit(data=df,
                batch_size=8,
                epochs=10,
                feature_cols=['user', 'item'],
                label_cols=['label'])
        with self.assertRaises(Exception) as context:
            predictions = est.predict(df, batch_size=4).collect()
        self.assertTrue(
            'feature columns is None; it should not be None in prediction' in
            str(context.exception))

        with self.assertRaises(Exception) as context:
            est.fit(data=df,
                    batch_size=8,
                    epochs=10,
                    feature_cols=['user', 'item'],
                    label_cols=['label'],
                    validation_data=[1, 2, 3])
        self.assertTrue(
            'train data and validation data should be both Spark DataFrame' in
            str(context.exception))
Example #15
    def write(path,
              generator,
              schema,
              block_size=1000,
              write_mode="overwrite",
              **kwargs):
        """
        Take each record in the generator and write it to a parquet file.

        **generator**
        Each record in the generator is a dict, the key is a string and will be the
        column name of saved parquet record and the value is the data.

        **schema**
        schema defines the name, dtype, shape of a column, as well as the feature
        type of a column. The feature type, defines how to encode and decode the column value.

        There are three kinds of feature type:
        1. Scalar, such as an int or float number, or a string, which can be directly mapped
           to a parquet type.
        2. NDarray, which takes a np.ndarray and saves it as serialized bytes. The corresponding
           parquet type is BYTE_ARRAY.
        3. Image, which takes a string representing an image file in the local file system and
           saves the raw file content bytes. The corresponding parquet type is BYTE_ARRAY.

        :param path: the output path, e.g. file:///output/path, hdfs:///output/path
        :param generator: a generator that yields dicts, where each key is a string and each
                          value is one of (a scalar value, ndarray, image file path)
        :param schema: a dict, whose key is a string, value is one of
                      (schema_field.Scalar, schema_field.NDarray, schema_field.Image)
        :param kwargs: other args
        """

        sc = init_nncontext()
        spark = SparkSession(sc)
        node_num, core_num = get_node_and_core_number()
        for i, chunk in enumerate(chunks(generator, block_size)):
            chunk_path = os.path.join(path, f"chunk={i}")
            rows_rdd = sc.parallelize(chunk, core_num * node_num) \
                .map(lambda x: dict_to_row(schema, x))
            spark.createDataFrame(rows_rdd).write.mode(write_mode).parquet(
                chunk_path)
        metadata_path = os.path.join(path, "_orca_metadata")

        write_text(metadata_path, encode_schema(schema))
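
A hedged sketch of calling write. The schema helper names (schema_field.Scalar, schema_field.NDarray) follow the docstring wording and are assumptions, as are the output path and column names.

import numpy as np

# Hypothetical sketch: write 10 records with a scalar id and an ndarray embedding.
def record_generator():
    for i in range(10):
        yield {"id": i, "embedding": np.random.randn(8).astype(np.float32)}

schema = {
    "id": schema_field.Scalar,         # assumed helper, per the :param schema: description
    "embedding": schema_field.NDarray, # assumed helper
}
write("file:///tmp/orca_parquet_demo", record_generator(), schema, block_size=5)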
Example #16
 def test_spark_xshards(self):
     from bigdl.dllib.nncontext import init_nncontext
     from bigdl.orca.data import SparkXShards
     estimator = get_estimator(workers_per_node=1)
     sc = init_nncontext()
     x_rdd = sc.parallelize(np.random.rand(4000, 1, 50).astype(np.float32))
     # torch 1.7.1+ requires target size same as output size, which is (batch, 1)
     y_rdd = sc.parallelize(
         np.random.randint(0, 2, size=(4000, 1, 1)).astype(np.float32))
     rdd = x_rdd.zip(y_rdd).map(lambda x_y: {'x': x_y[0], 'y': x_y[1]})
     train_rdd, val_rdd = rdd.randomSplit([0.9, 0.1])
     train_xshards = SparkXShards(train_rdd)
     val_xshards = SparkXShards(val_rdd)
     train_stats = estimator.fit(train_xshards, batch_size=256, epochs=2)
     print(train_stats)
     val_stats = estimator.evaluate(val_xshards, batch_size=128)
     print(val_stats)
Example #17
    def test_dataframe_train_eval(self):

        sc = init_nncontext()
        rdd = sc.range(0, 100)
        df = rdd.map(lambda x: (np.random.randn(50).astype(np.float).tolist(
        ), [int(np.random.randint(0, 2, size=()))])).toDF(["feature", "label"])

        estimator = get_estimator(workers_per_node=2)
        estimator.fit(df,
                      batch_size=4,
                      epochs=2,
                      feature_cols=["feature"],
                      label_cols=["label"])
        estimator.evaluate(df,
                           batch_size=4,
                           feature_cols=["feature"],
                           label_cols=["label"])
Example #18
    def test_xshards_predict(self):

        sc = init_nncontext()
        rdd = sc.range(0, 110).map(lambda x: np.array([x] * 50))
        shards = rdd.mapPartitions(lambda iter: chunks(iter, 5)).map(
            lambda x: {"x": np.stack(x)})
        shards = SparkXShards(shards)

        estimator = get_estimator(workers_per_node=2,
                                  model_fn=lambda config: IdentityNet())
        result_shards = estimator.predict(shards, batch_size=4)
        result = np.concatenate(
            [shard["prediction"] for shard in result_shards.collect()])
        expected_result = np.concatenate(
            [shard["x"] for shard in result_shards.collect()])

        assert np.array_equal(result, expected_result)
Example #19
    def test_data_parallel_sgd_correctness(self):
        sc = init_nncontext()
        rdd = sc.range(0, 100).repartition(2)

        # partition 0: [(0, 0), (0, 0)]
        # partition 1: [(1, 0), (1, 0)]
        # model: y = w * x
        # loss = (wx)^2
        # dloss/dw = 2x^2*w
        # end of first iteration:
        #    partition 0 loss: 0.0
        #    partition 1 loss: 1.0
        #    avg_grad = avg([0, 0, 2, 2]) = 1
        #    weight = 1.0 - 0.5 * avg_grad = 0.5
        # end of second iteration:
        #    partition 0 loss: 0.0
        #    partition 1 loss: 0.25
        #    avg_grad = avg([0, 0, 1, 1]) = 0.5
        #    weight = 0.5 - 0.5 * avg_grad = 0.25
        df = rdd.mapPartitionsWithIndex(
            lambda idx, iter: [([float(idx)], [0.0]) for _ in iter][:2]).toDF(
                ["feature", "label"])

        def get_optimizer(model, config):
            return torch.optim.SGD(model.parameters(), lr=0.5)

        estimator = Estimator.from_torch(model=lambda config: LinearModel(),
                                         optimizer=get_optimizer,
                                         loss=torch.nn.MSELoss(),
                                         metrics=Accuracy(),
                                         config={},
                                         workers_per_node=2,
                                         backend="torch_distributed",
                                         sync_stats=False)

        stats = estimator.fit(df,
                              batch_size=4,
                              epochs=2,
                              feature_cols=["feature"],
                              label_cols=["label"],
                              reduce_results=False)

        state = estimator.get_state_dict()
        assert state['models'][0]['fc1.weight'].item() == 0.25
Example #20
    def test_partition_num_less_than_workers(self):
        sc = init_nncontext()
        rdd = sc.range(200, numSlices=1)
        df = rdd.map(lambda x: (np.random.randn(50).astype(np.float).tolist(
        ), [int(np.random.randint(0, 2, size=()))])).toDF(["feature", "label"])

        estimator = get_estimator(workers_per_node=2)
        assert df.rdd.getNumPartitions() < estimator.num_workers

        estimator.fit(df,
                      batch_size=4,
                      epochs=2,
                      feature_cols=["feature"],
                      label_cols=["label"])
        estimator.evaluate(df,
                           batch_size=4,
                           feature_cols=["feature"],
                           label_cols=["label"])
        estimator.predict(df, feature_cols=["feature"]).collect()
Example #21
    def test_openvino_predict_spark_df(self):
        from pyspark.sql import SparkSession

        self.load_resnet()
        sc = init_nncontext()
        spark = SparkSession(sc)
        input_list = self.input.tolist()
        rdd = sc.range(0, 18, numSlices=3)
        input_df = rdd.map(lambda x: [input_list]).toDF(["feature"])
        with self.assertRaises(Exception):
            self.est.predict(input_df, feature_cols=["feature"])
        rdd = sc.range(0, 18, numSlices=5)
        input_df = rdd.map(lambda x: [input_list]).toDF(["feature"])
        result_df = self.est.predict(input_df, feature_cols=["feature"])
        result = list(map(lambda row: np.array(row["prediction"]),
                          result_df.select("prediction").collect()))
        assert np.array(result_df.select("prediction").first()).shape == (1, 1000)
        assert result_df.count() == 18
        assert self.check_result(result, 18)
Example #22
 def save(self, path):
     """Save the ML instance to the input path."""
     super(NNModelWriter, self).save(path)
     sc = init_nncontext()
     # change class name in metadata to python class name
     metadata_path = os.path.join(path, "metadata")
     metadataStr = sc.textFile(metadata_path, 1).first()
     metadata = json.loads(metadataStr)
     py_type = metadata['class'].replace("com.intel.analytics.zoo", "zoo")
     metadata['class'] = py_type
     metadata_json = json.dumps(metadata, separators=[',', ':'])
     # replace old metadata
     temp_dir = tempfile.mkdtemp()
     temp_meta_path = os.path.join(temp_dir, "metadata")
     sc.parallelize([metadata_json], 1).saveAsTextFile(temp_meta_path)
     for file in os.listdir(temp_meta_path):
         put_local_file_to_remote(os.path.join(temp_meta_path, file),
                                  os.path.join(metadata_path, file), True)
     import shutil
     shutil.rmtree(temp_dir)
Example #23
def predict(model_path, img_path):
    model = InferenceModel()
    model.load_openvino(model_path,
                        weight_path=model_path[:model_path.rindex(".")] +
                        ".bin",
                        batch_size=BATCH_SIZE)
    sc = init_nncontext("OpenVINO Python resnet_v1_50 Inference Example")
    # pre-processing
    infer_transformer = ChainedPreprocessing([
        ImageBytesToMat(),
        ImageResize(256, 256),
        ImageCenterCrop(224, 224),
        ImageMatToTensor(format="NHWC", to_RGB=True)
    ])
    image_set = ImageSet.read(img_path, sc).\
        transform(infer_transformer).get_image().collect()
    image_set = np.expand_dims(image_set, axis=1)

    for i in range(len(image_set) // BATCH_SIZE + 1):
        index = i * BATCH_SIZE
        # check whether out of index
        if index >= len(image_set):
            break
        batch = image_set[index]
        # put 4 images in one batch
        for j in range(index + 1, min(index + BATCH_SIZE, len(image_set))):
            batch = np.vstack((batch, image_set[j]))
        batch = np.expand_dims(batch, axis=0)
        # predict batch
        predictions = model.predict(batch)
        result = predictions[0]

        # post-processing for Top-1
        print("batch_" + str(i))
        for r in result:
            output = {}
            max_index = np.argmax(r)
            output["Top-1"] = str(max_index)
            print("* Predict result " + str(output))
    print("finished...")
    sc.stop()
Example #24
    def test_openvino_predict_xshards(self):
        self.load_resnet()
        input_data_list = [np.array([self.input] * 4),
                           np.concatenate([np.array([self.input] * 2),
                                           np.zeros([1, 3, 224, 224])])]
        sc = init_nncontext()
        rdd = sc.parallelize(input_data_list, numSlices=2)
        shards = SparkXShards(rdd)

        def pre_processing(images):
            return {"x": images}

        shards = shards.transform_shard(pre_processing)
        result = self.est.predict(shards)
        result_c = result.collect()
        assert isinstance(result, SparkXShards)
        assert result_c[0]["prediction"].shape == (4, 1000)
        assert result_c[1]["prediction"].shape == (3, 1000)
        assert self.check_result(result_c[0]["prediction"], 4)
        assert self.check_result(result_c[1]["prediction"], 2)
        assert not self.check_result(result_c[1]["prediction"][2:], 1)
Example #25
def read_parquet(file_path, columns=None, schema=None, **options):
    """

    Read parquet files to SparkXShards of pandas DataFrames.

    :param file_path: Parquet file path, a list of multiple parquet file paths, or a directory
    containing parquet files. Local file system, HDFS, and AWS S3 are supported.
    :param columns: list of column names, default=None.
    If not None, only these columns will be read from the file.
    :param schema: pyspark.sql.types.StructType for the input schema or
    a DDL-formatted string (For example col0 INT, col1 DOUBLE).
    :param options: other options for reading parquet.
    :return: An instance of SparkXShards.
    """
    sc = init_nncontext()
    spark = OrcaContext.get_spark_session()
    # df = spark.read.parquet(file_path)
    df = spark.read.load(file_path, "parquet", schema=schema, **options)

    if columns:
        df = df.select(*columns)

    def to_pandas(columns):
        def f(iter):
            import pandas as pd
            data = list(iter)
            pd_df = pd.DataFrame(data, columns=columns)
            return [pd_df]

        return f

    pd_rdd = df.rdd.mapPartitions(to_pandas(df.columns))
    try:
        data_shards = SparkXShards(pd_rdd)
    except Exception as e:
        print("An error occurred when reading parquet files")
        raise e
    return data_shards
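
A brief usage sketch for read_parquet; the path and column names below are placeholders.

# Hypothetical usage: load a parquet directory into pandas-backed shards.
shards = read_parquet("hdfs:///data/ratings_parquet", columns=["user", "item", "label"])
first_df = shards.rdd.first()   # a pandas DataFrame holding one partition of rows
print(first_df.head())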
Example #26
    def read_df(esConfig, esResource, schema=None):
        """
        Read the data from elastic search into DataFrame.

        :param esConfig: Dictionary which represents the configuration for
               Elasticsearch (e.g. ip, port, etc.).
        :param esResource: Resource file in Elasticsearch.
        :param schema: Optional. Defines the schema of the Spark DataFrame.
               If each column in ES holds a single value, the schema does not need to be set.
        :return: Spark DataFrame. Each row represents a document in ES.
        """
        sc = init_nncontext()
        spark = OrcaContext.get_spark_session()

        reader = spark.read.format("org.elasticsearch.spark.sql")

        for key in esConfig:
            reader.option(key, esConfig[key])
        if schema:
            reader.schema(schema)

        df = reader.load(esResource)
        return df
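
A hedged sketch of read_df with an explicit schema, assuming the function is accessible in scope; the Elasticsearch endpoint, index name, and fields are assumptions.

from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Hypothetical usage: read an index into a Spark DataFrame with a fixed schema.
es_config = {"es.nodes": "localhost", "es.port": "9200"}   # assumed endpoint
schema = StructType([StructField("title", StringType()),
                     StructField("views", IntegerType())])
df = read_df(es_config, "articles", schema=schema)        # "articles" is a placeholder index
df.show(5)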
Example #27
    def test_multiple_inputs_model(self):

        sc = init_nncontext()
        rdd = sc.parallelize(range(100))

        from pyspark.sql import SparkSession
        spark = SparkSession(sc)
        df = rdd.map(lambda x: ([float(x)] * 25, [float(x)] * 25,
                                [int(np.random.randint(0, 2, size=()))])).toDF(
                                    ["f1", "f2", "label"])

        estimator = get_estimator(workers_per_node=2,
                                  model_fn=lambda config: MultiInputNet())
        estimator.fit(df,
                      batch_size=4,
                      epochs=2,
                      feature_cols=["f1", "f2"],
                      label_cols=["label"])
        estimator.evaluate(df,
                           batch_size=4,
                           feature_cols=["f1", "f2"],
                           label_cols=["label"])
        result = estimator.predict(df, batch_size=4, feature_cols=["f1", "f2"])
        result.collect()
Example #28
def main(max_epoch):
    args = parser.parse_args()
    cluster_mode = args.cluster_mode
    if cluster_mode.startswith("yarn"):
        hadoop_conf = os.environ.get("HADOOP_CONF_DIR")
        assert hadoop_conf, "Directory path to hadoop conf not found for yarn mode. Please " \
                "set the environment variable HADOOP_CONF_DIR"
        spark_conf = create_spark_conf().set("spark.executor.memory", "5g") \
            .set("spark.executor.cores", 2) \
            .set("spark.executor.instances", 2) \
            .set("spark.driver.memory", "2g")
        if cluster_mode == "yarn-client":
            _ = init_nncontext(spark_conf,
                               cluster_mode="yarn-client",
                               hadoop_conf=hadoop_conf)
        else:
            _ = init_nncontext(spark_conf,
                               cluster_mode="yarn-cluster",
                               hadoop_conf=hadoop_conf)
    else:
        _ = init_nncontext()

    (training_images_data,
     training_labels_data) = mnist.read_data_sets("/tmp/mnist", "train")
    (testing_images_data,
     testing_labels_data) = mnist.read_data_sets("/tmp/mnist", "test")

    training_images_data = (training_images_data -
                            mnist.TRAIN_MEAN) / mnist.TRAIN_STD
    testing_images_data = (testing_images_data -
                           mnist.TRAIN_MEAN) / mnist.TRAIN_STD

    model = tf.keras.Sequential([
        tf.keras.layers.Flatten(input_shape=(28, 28, 1)),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(10, activation='softmax'),
    ])

    model.compile(optimizer='rmsprop',
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])

    keras_model = KerasModel(model)

    keras_model.fit(training_images_data,
                    training_labels_data,
                    validation_data=(testing_images_data, testing_labels_data),
                    epochs=max_epoch,
                    batch_size=320,
                    distributed=True)

    result = keras_model.evaluate(testing_images_data,
                                  testing_labels_data,
                                  distributed=True,
                                  batch_per_thread=80)

    print(result)
    # >> [0.08865142822265625, 0.9722]

    # the following assert is used for internal testing
    assert result['acc Top1Accuracy'] > 0.95

    keras_model.save_weights("/tmp/mnist_keras.h5")
Example #29
def main(option):
    batch_size = 16 if not option.batch_size else int(option.batch_size)
    cluster_mode = option.cluster_mode
    if cluster_mode.startswith("yarn"):
        hadoop_conf = os.environ.get("HADOOP_CONF_DIR")
        assert hadoop_conf, "Directory path to hadoop conf not found for yarn mode. Please " \
                "set the environment variable HADOOP_CONF_DIR"
        spark_conf = create_spark_conf().set("spark.executor.memory", "5g") \
            .set("spark.executor.cores", 2) \
            .set("spark.executor.instances", 2) \
            .set("spark.driver.memory", "2g")
        if cluster_mode == "yarn-client":
            sc = init_nncontext(spark_conf,
                                cluster_mode="yarn-client",
                                hadoop_conf=hadoop_conf)
        else:
            sc = init_nncontext(spark_conf,
                                cluster_mode="yarn-cluster",
                                hadoop_conf=hadoop_conf)
    else:
        sc = init_nncontext()

    def input_fn(mode, params):

        if mode == tf.estimator.ModeKeys.TRAIN:
            image_set = ImageSet.read(params["image_path"],
                                      sc=sc,
                                      with_label=True,
                                      one_based_label=False)
            train_transformer = ChainedPreprocessing([
                ImageBytesToMat(),
                ImageResize(256, 256),
                ImageRandomCrop(224, 224),
                ImageRandomPreprocessing(ImageHFlip(), 0.5),
                ImageChannelNormalize(0.485, 0.456, 0.406, 0.229, 0.224,
                                      0.225),
                ImageMatToTensor(to_RGB=True, format="NHWC"),
                ImageSetToSample(input_keys=["imageTensor"],
                                 target_keys=["label"])
            ])
            feature_set = FeatureSet.image_frame(image_set.to_image_frame())
            feature_set = feature_set.transform(train_transformer)
            feature_set = feature_set.transform(ImageFeatureToSample())
            dataset = TFDataset.from_feature_set(feature_set,
                                                 features=(tf.float32,
                                                           [224, 224, 3]),
                                                 labels=(tf.int32, [1]),
                                                 batch_size=batch_size)
        else:
            raise NotImplementedError

        return dataset

    def model_fn(features, labels, mode, params):
        from nets import inception
        slim = tf.contrib.slim
        labels = tf.squeeze(labels, axis=1)
        with slim.arg_scope(inception.inception_v1_arg_scope()):
            logits, end_points = inception.inception_v1(
                features,
                num_classes=int(params["num_classes"]),
                is_training=True)

        if mode == tf.estimator.ModeKeys.TRAIN:
            loss = tf.reduce_mean(
                tf.losses.sparse_softmax_cross_entropy(logits=logits,
                                                       labels=labels))
            train_op = ZooOptimizer(tf.train.AdamOptimizer()).minimize(loss)
            return tf.estimator.EstimatorSpec(mode,
                                              train_op=train_op,
                                              predictions=logits,
                                              loss=loss)
        else:
            raise NotImplementedError

    estimator = TFEstimator.from_model_fn(model_fn,
                                          params={
                                              "image_path": option.image_path,
                                              "num_classes":
                                              option.num_classes,
                                              "batch_size": option.batch_size
                                          })

    estimator.train(input_fn, steps=100)
    print("finished...")
    sc.stop()
Example #30
    def predict(self, data, feature_cols=None, batch_size=4):
        """
        Predict input data

        :param batch_size: Int. The batch size, default is 4.
        :param data: data to be predicted. XShards, Spark DataFrame, numpy array and list of numpy
               arrays are supported. If data is XShards, each partition is a dictionary of {'x':
               feature}, where feature is a numpy array or a list of numpy arrays.
        :param feature_cols: Feature column name(s) of data. Only used when data is a Spark
               DataFrame. Default: None.
        :return: predicted result.
                 If the input data is XShards, the predict result is a XShards, each partition
                 of the XShards is a dictionary of {'prediction': result}, where the result is a
                 numpy array or a list of numpy arrays.
                 If the input data is numpy arrays or list of numpy arrays, the predict result is
                 a numpy array or a list of numpy arrays.
        """
        sc = init_nncontext()
        model_bytes_broadcast = sc.broadcast(self.model_bytes)
        weight_bytes_broadcast = sc.broadcast(self.weight_bytes)

        def partition_inference(partition):
            model_bytes = model_bytes_broadcast.value
            weight_bytes = weight_bytes_broadcast.value
            partition = list(partition)
            data_num = len(partition)
            ie = IECore()
            config = {'CPU_THREADS_NUM': str(self.core_num)}
            ie.set_config(config, 'CPU')
            net = ie.read_network(model=model_bytes,
                                  weights=weight_bytes,
                                  init_from_buffer=True)
            net.batch_size = batch_size
            local_model = ie.load_network(network=net,
                                          device_name="CPU",
                                          num_requests=data_num)
            inputs = list(iter(local_model.requests[0].input_blobs))
            outputs = list(iter(local_model.requests[0].output_blobs))
            assert len(
                outputs) != 0, "The number of model outputs should not be 0."

            def add_elem(d):
                d_len = len(d)
                if d_len < batch_size:
                    rep_time = [1] * (d_len - 1)
                    rep_time.append(batch_size - d_len + 1)
                    return np.repeat(d, rep_time, axis=0), d_len
                else:
                    return d, d_len

            results = []
            for idx, batch_data in enumerate(partition):
                infer_request = local_model.requests[idx]
                input_dict = dict()
                elem_num = 0
                if isinstance(batch_data, list):
                    for i, input in enumerate(inputs):
                        input_dict[input], elem_num = add_elem(batch_data[i])
                else:
                    input_dict[inputs[0]], elem_num = add_elem(batch_data)
                infer_request.infer(input_dict)
                if len(outputs) == 1:
                    results.append(infer_request.output_blobs[
                        outputs[0]].buffer[:elem_num])
                else:
                    results.append(
                        list(
                            map(
                                lambda output: infer_request.output_blobs[
                                    output].buffer[:elem_num], outputs)))

            return results

        def predict_transform(dict_data, batch_size):
            assert isinstance(dict_data, dict), "each shard should be a dict"
            assert "x" in dict_data, "key 'x' should be in each shard"
            feature_data = dict_data["x"]
            if isinstance(feature_data, np.ndarray):
                assert feature_data.shape[0] <= batch_size, \
                    "The batch size of the input data (the first dim) should not exceed the model " \
                    "batch size, otherwise some inputs will be ignored."
            elif isinstance(feature_data, list):
                for elem in feature_data:
                    assert isinstance(elem, np.ndarray), "Each element in the x list should be " \
                                                         "a ndarray, but get " + \
                                                         elem.__class__.__name__
                    assert elem.shape[0] <= batch_size, "The batch size of each input data (the " \
                                                        "first dim) should not exceed the " \
                                                        "model batch size, otherwise some inputs " \
                                                        "will be ignored."
            else:
                raise ValueError(
                    "x in each shard should be a ndarray or a list of ndarray."
                )
            return feature_data

        if isinstance(data, DataFrame):
            from bigdl.orca.learn.utils import dataframe_to_xshards, convert_predict_rdd_to_dataframe
            xshards, _ = dataframe_to_xshards(data,
                                              validation_data=None,
                                              feature_cols=feature_cols,
                                              label_cols=None,
                                              mode="predict")
            transformed_data = xshards.transform_shard(predict_transform,
                                                       batch_size)
            result_rdd = transformed_data.rdd.mapPartitions(
                lambda iter: partition_inference(iter))
            return convert_predict_rdd_to_dataframe(
                data, result_rdd.flatMap(lambda data: data))
        elif isinstance(data, SparkXShards):
            transformed_data = data.transform_shard(predict_transform,
                                                    batch_size)
            result_rdd = transformed_data.rdd.mapPartitions(
                lambda iter: partition_inference(iter))

            def update_result_shard(data):
                shard, y = data
                shard["prediction"] = y
                return shard

            return SparkXShards(
                data.rdd.zip(result_rdd).map(update_result_shard))
        elif isinstance(data, (np.ndarray, list)):
            if isinstance(data, np.ndarray):
                split_num = math.ceil(len(data) / batch_size)
                arrays = np.array_split(data, split_num)
                num_slices = min(split_num, self.node_num)
                data_rdd = sc.parallelize(arrays, numSlices=num_slices)
            elif isinstance(data, list):
                flattened = nest.flatten(data)
                data_length = len(flattened[0])
                data_to_be_rdd = []
                split_num = math.ceil(flattened[0].shape[0] / batch_size)
                num_slices = min(split_num, self.node_num)
                for i in range(split_num):
                    data_to_be_rdd.append([])
                for x in flattened:
                    assert isinstance(x, np.ndarray), "the data in the data list should be " \
                                                      "ndarrays, but get " + \
                                                      x.__class__.__name__
                    assert len(x) == data_length, \
                        "the ndarrays in data must all have the same size in first dimension" \
                        ", got first ndarray of size {} and another {}".format(data_length, len(x))
                    x_parts = np.array_split(x, split_num)
                    for idx, x_part in enumerate(x_parts):
                        data_to_be_rdd[idx].append(x_part)

                data_to_be_rdd = [
                    nest.pack_sequence_as(data, shard)
                    for shard in data_to_be_rdd
                ]
                data_rdd = sc.parallelize(data_to_be_rdd, numSlices=num_slices)

            print("Partition number: ", data_rdd.getNumPartitions())
            result_rdd = data_rdd.mapPartitions(
                lambda iter: partition_inference(iter))
            result_arr_list = result_rdd.collect()
            result_arr = None
            if isinstance(result_arr_list[0], list):
                result_arr = [
                    np.concatenate([r[i] for r in result_arr_list], axis=0)
                    for i in range(len(result_arr_list[0]))
                ]
            elif isinstance(result_arr_list[0], np.ndarray):
                result_arr = np.concatenate(result_arr_list, axis=0)
            return result_arr
        else:
            raise ValueError(
                "Only XShards, Spark DataFrame, a numpy array and a list of numpy arr"
                "ays are supported as input data, but get " +
                data.__class__.__name__)
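
A short usage sketch for the numpy path of the predict method above, assuming est is an already-loaded OpenVINO estimator exposing this method and that its model takes 224x224 RGB input; the shapes are illustrative.

import numpy as np

# Hypothetical usage: run inference on a small in-memory batch.
dummy_input = np.random.rand(8, 3, 224, 224).astype(np.float32)
predictions = est.predict(dummy_input, batch_size=4)
# For a single-output model, this is one numpy array concatenated over the internal batches.
print(predictions.shape)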