Example #1
    def fit(
        self,
        input_df,
        validation_df=None,
        metric="mse",
        recipe=SmokeRecipe(),
        mc=False,
        resources_per_trial={"cpu": 2},
        upload_dir=None,
    ):
        """
        Trains the model for time sequence prediction.
        If the future sequence length is larger than 1, a seq2seq model is used; otherwise a
        vanilla LSTM model is used.
        :param input_df: The input time series data frame. Example:
            datetime    value  "extra feature 1"  "extra feature 2"
            2019-01-01  1.9    1                  2
            2019-01-02  2.3    0                  2
        :param validation_df: validation data
        :param metric: String. Metric used for train and validation. Available values are
                       "mean_squared_error" or "r_square"
        :param recipe: A Recipe object. Various recipes cover different search spaces and
                       stopping criteria. Default is SmokeRecipe().
        :param resources_per_trial: Machine resources to allocate per trial,
            e.g. ``{"cpu": 64, "gpu": 8}``
        :param upload_dir: Optional URI to which training results and checkpoints are synced.
            Only HDFS URIs are supported for now. It defaults to
            "hdfs:///user/{hadoop_user_name}/ray_checkpoints/{predictor_name}",
            where hadoop_user_name is the user name specified in init_orca_context or
            init_spark_on_yarn (default "root") and predictor_name is the name used
            when instantiating the predictor.
        :return: a pipeline constructed with the best model and configs.
        """
        self._check_df(input_df)
        if validation_df is not None:
            self._check_df(validation_df)

        ray_ctx = RayContext.get()
        is_local = ray_ctx.is_local
        # BasePredictor._check_fit_metric(metric)
        if not is_local:
            if not upload_dir:
                hadoop_user_name = os.getenv("HADOOP_USER_NAME")
                upload_dir = os.path.join(os.sep, "user", hadoop_user_name,
                                          "ray_checkpoints", self.name)
            cmd = "hadoop fs -mkdir -p {}".format(upload_dir)
            process(cmd)
        else:
            upload_dir = None

        self.pipeline = self._hp_search(
            input_df,
            validation_df=validation_df,
            metric=metric,
            recipe=recipe,
            mc=mc,
            resources_per_trial=resources_per_trial,
            remote_dir=upload_dir)
        return self.pipeline
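For orientation, here is a minimal, hedged usage sketch of the fit method above. It assumes `predictor` is an already constructed instance of the class defining fit (its constructor is not shown here) and reuses the column layout documented in the docstring.

# Usage sketch; `predictor` is assumed to be an instance of the class that defines fit() above.
import pandas as pd

def run_fit(predictor):
    # Build an input frame matching the documented schema: datetime, value, extra features.
    input_df = pd.DataFrame({
        "datetime": pd.date_range("2019-01-01", periods=100, freq="D"),
        "value": [float(i % 10) for i in range(100)],
        "extra feature 1": [1] * 100,
        "extra feature 2": [2] * 100,
    })
    # Default SmokeRecipe and 2 CPUs per trial; on YARN the checkpoints would be synced
    # to the default HDFS upload_dir described in the docstring.
    return predictor.fit(input_df, metric="mse", resources_per_trial={"cpu": 2})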
Example #2
    def __init__(self,
                 *,
                 model_creator,
                 data_creator,
                 optimizer_creator,
                 loss_creator=None,
                 scheduler_creator=None,
                 training_operator_cls=TrainingOperator,
                 initialization_hook=None,
                 config=None,
                 scheduler_step_freq="batch"):

        if not (callable(model_creator) and callable(optimizer_creator)
                and callable(data_creator)):
            raise ValueError(
                "Must provide a callable model_creator, optimizer_creator, "
                "and data_creator.")

        self.model_creator = model_creator
        self.optimizer_creator = optimizer_creator
        self.loss_creator = loss_creator
        self.data_creator = data_creator
        self.scheduler_creator = scheduler_creator
        self.training_operator_cls = training_operator_cls
        self.scheduler_step_freq = scheduler_step_freq

        if not training_operator_cls and not loss_creator:
            raise ValueError("If a loss_creator is not provided, you must "
                             "provide a custom training operator.")

        self.initialization_hook = initialization_hook
        self.config = {} if config is None else config

        self.param = dict(model_creator=self.model_creator,
                          data_creator=self.data_creator,
                          optimizer_creator=self.optimizer_creator,
                          loss_creator=self.loss_creator,
                          scheduler_creator=self.scheduler_creator,
                          training_operator_cls=self.training_operator_cls,
                          scheduler_step_freq=self.scheduler_step_freq)
        super().__init__(RayContext.get(),
                         worker_cls=TorchWorker,
                         worker_param=self.param)

        def init_func():
            import torch
            torch.set_num_threads(self.cores_per_node)
            print("Worker initialized")

        self.run(init_func)
        remote_setups = [
            worker.setup.remote(None, None, None)
            for i, worker in enumerate(self.remote_workers)
        ]
        # Wait for the setup tasks so that any failure surfaces as an error here
        ray.get(remote_setups)
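A hedged sketch of the creator callables this constructor expects. The (config) and (model, config) signatures follow the common Ray TrainingOperator convention and are assumptions here, as are the config keys lr and batch_size.

# Sketch only: example creator callables for this kind of PyTorch estimator.
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

def model_creator(config):
    # A tiny linear model; real code would build the model from `config`.
    return nn.Linear(10, 1)

def optimizer_creator(model, config):
    return torch.optim.SGD(model.parameters(), lr=config.get("lr", 0.01))

def loss_creator(config):
    return nn.MSELoss()

def data_creator(config):
    x, y = torch.randn(256, 10), torch.randn(256, 1)
    loader = DataLoader(TensorDataset(x, y), batch_size=config.get("batch_size", 32))
    return loader, loader  # train loader, validation loader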
Example #3
 def get_default_remote_dir(name):
     from zoo.ray import RayContext
     from zoo.orca.automl.search.utils import process
     ray_ctx = RayContext.get()
     if ray_ctx.is_local:
         return None
     else:
         default_remote_dir = f"hdfs:///tmp/{name}"
         process(command=f"hadoop fs -mkdir -p {default_remote_dir}")
         return default_remote_dir
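A brief usage sketch: on a cluster the helper above creates and returns an HDFS staging directory, while in local mode it returns None so nothing is synced. The experiment name below is a placeholder and an active RayContext is assumed.

# Illustrative only; requires an active RayContext.
remote_dir = get_default_remote_dir(name="my_experiment")
if remote_dir is not None:
    print("Checkpoints will be synced to", remote_dir)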
Example #4
 def to_spark_rdd(self):
     ray_ctx = RayContext.get()
     sc = ray_ctx.sc
     address = ray_ctx.redis_address
     password = ray_ctx.redis_password
     num_parts = ray.get(self.meta_store.num_partitions.remote())
     meta_store_name = f"meta_store:{self.uuid}"
     # Parallelize dummy elements just to create an RDD with num_parts partitions;
     # the real data is fetched from Ray by partition index in get_from_ray.
     rdd = sc.parallelize([0] * num_parts * 10, num_parts)\
         .mapPartitionsWithIndex(
         lambda idx, _: get_from_ray(idx, address, password, meta_store_name))
     return rdd
Example #5
    def predict(self, x=None, horizon=24, mc=False, num_workers=None):
        """
        Predict `horizon` time-points ahead of the input x used in fit_eval.
        :param x: Input x is not currently supported.
        :param horizon: horizon length to predict
        :param mc:
        :param num_workers: the number of workers to use. Note that there has to be an active
            RayContext if num_workers > 1.
        :return:
        """
        if x is not None:
            raise ValueError("We don't support input x directly.")
        if self.model is None:
            raise Exception(
                "Needs to call fit_eval or restore first before calling predict"
            )
        if num_workers is None:
            num_workers = TCMF.get_default_num_workers()
        if num_workers > 1:
            import ray
            from zoo.ray import RayContext
            try:
                RayContext.get(initialize=False)
            except Exception:
                try:
                    # detect whether ray has been started.
                    ray.put(None)
                except Exception:
                    raise RuntimeError(
                        f"There must be an active ray context while running with "
                        f"{num_workers} workers. You can either start and init a "
                        f"RayContext by init_orca_context(..., init_ray_on_spark="
                        f"True) or start Ray with ray.init()")

        out = self.model.predict_horizon(
            future=horizon,
            bsize=90,
            normalize=False,
            num_workers=num_workers,
        )
        return out[:, -horizon::]
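A hedged usage sketch for predict above, assuming `forecaster` is a fitted instance of this class (fit_eval or restore has already been called); the 24-step horizon is illustrative.

# Usage sketch; `forecaster` is assumed to be a fitted instance of the class defining predict().
def forecast_next_day(forecaster):
    # num_workers=1 avoids the active-RayContext requirement mentioned in the docstring.
    # The result contains the last `horizon` predicted time-points for each series.
    return forecaster.predict(horizon=24, num_workers=1)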
Example #6
 def test_gluon(self):
     current_ray_ctx = RayContext.get()
     address_info = current_ray_ctx.address_info
     assert "object_store_address" in address_info
     config = create_config(log_interval=2, optimizer="adam",
                            optimizer_params={'learning_rate': 0.02})
     estimator = Estimator(config, get_model, get_loss,
                           eval_metrics_creator=get_metrics,
                           validation_metrics_creator=get_metrics,
                           num_workers=2)
     estimator.fit(get_train_data_iter, validation_data=get_test_data_iter, epochs=2)
     estimator.shutdown()
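For reference, a hedged sketch of what the creator callables used above (get_model, get_loss, get_metrics) might look like for MXNet Gluon. The single config argument in each signature is an assumption based on how they are passed here, not a documented contract.

# Sketch only: illustrative MXNet Gluon creator functions.
import mxnet as mx
from mxnet import gluon

def get_model(config):
    net = gluon.nn.Sequential()
    net.add(gluon.nn.Dense(64, activation="relu"), gluon.nn.Dense(10))
    return net

def get_loss(config):
    return gluon.loss.SoftmaxCrossEntropyLoss()

def get_metrics(config):
    return mx.metric.Accuracy()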
Example #7
    def from_partition_refs(parts_refs, part_ids, part_id2ip):
        ray_ctx = RayContext.get()
        uuid_str = str(uuid.uuid4())
        meta_store = MetaStore.options(name=f"meta_store:{uuid_str}").remote()

        results = []
        for part_id, part_ref in zip(part_ids, parts_refs):
            result = meta_store.set_partition_ref.remote(part_id, [part_ref])
            results.append(result)
        ray.get(results)

        return RayRdd(uuid_str, meta_store, part_id2ip)
Example #8
    def impl_test_fit_and_evaluate(self, backend):
        import tensorflow as tf
        ray_ctx = RayContext.get()
        batch_size = 32
        global_batch_size = batch_size * ray_ctx.num_ray_nodes
        config = {
            "batch_size": global_batch_size
        }

        if backend == "horovod":
            trainer = Estimator.from_keras(
                model_creator=simple_model,
                compile_args_creator=compile_args,
                verbose=True,
                config=config,
                backend=backend)
        else:

            trainer = Estimator.from_keras(model_creator=model_creator,
                                           verbose=True,
                                           config=config,
                                           backend=backend,
                                           workers_per_node=2)

        # model baseline performance
        start_stats = trainer.evaluate(create_test_dataset,
                                       steps=NUM_TEST_SAMPLES // global_batch_size)
        print(start_stats)

        def scheduler(epoch):
            if epoch < 2:
                return 0.001
            else:
                return 0.001 * tf.math.exp(0.1 * (2 - epoch))

        scheduler = tf.keras.callbacks.LearningRateScheduler(scheduler, verbose=1)
        # train for 2 epochs
        trainer.fit(create_train_datasets, epochs=2, steps_per_epoch=10, callbacks=[scheduler])
        trainer.fit(create_train_datasets, epochs=2, steps_per_epoch=10, callbacks=[scheduler])

        # model performance after training (should improve)
        end_stats = trainer.evaluate(create_test_dataset,
                                     steps=NUM_TEST_SAMPLES // global_batch_size)
        print(end_stats)

        # sanity check that training worked
        dloss = end_stats["validation_loss"] - start_stats["validation_loss"]
        dmse = (end_stats["validation_mean_squared_error"] -
                start_stats["validation_mean_squared_error"])
        print(f"dLoss: {dloss}, dMSE: {dmse}")

        assert dloss < 0 and dmse < 0, "training sanity check failed. loss increased!"
Example #9
 def to_spark_xshards(self):
     from zoo.orca.data import SparkXShards
     ray_ctx = RayContext.get()
     sc = ray_ctx.sc
     address = ray_ctx.redis_address
     password = ray_ctx.redis_password
     num_parts = self.num_partitions()
     partition2store = self.partition2store_name
     rdd = sc.parallelize([0] * num_parts * 10, num_parts)\
         .mapPartitionsWithIndex(
         lambda idx, _: get_from_ray(idx, address, password, partition2store))
     spark_xshards = SparkXShards(rdd)
     return spark_xshards
Example #10
def stop_orca_context():
    """
    Stop the SparkContext (and stop Ray services across the cluster if necessary).
    """
    from pyspark import SparkContext
    from zoo.ray import RayContext
    ray_ctx = RayContext.get(initialize=False)
    if ray_ctx.initialized:
        ray_ctx.stop()
    sc = SparkContext.getOrCreate()
    if sc.getConf().get("spark.master").startswith("spark://"):
        from zoo import stop_spark_standalone
        stop_spark_standalone()
    sc.stop()
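For context, a hedged sketch of the surrounding lifecycle: init_orca_context starts Spark (and optionally Ray on Spark), and stop_orca_context above tears both down. The import path for init_orca_context and the argument values are assumptions.

# Lifecycle sketch; argument values are illustrative.
from zoo.orca import init_orca_context  # assumed import path

sc = init_orca_context(cluster_mode="local", cores=4, init_ray_on_spark=True)
try:
    pass  # run the Ray/Spark workload here
finally:
    stop_orca_context()  # the function defined above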
Example #11
    def test_horovod_learning_rate_schedule(self):
        import horovod
        major, minor, patch = horovod.__version__.split(".")

        larger_major = int(major) > 0
        larger_minor = int(major) == 0 and int(minor) > 19
        larger_patch = int(major) == 0 and int(minor) == 19 and int(patch) >= 2

        if larger_major or larger_minor or larger_patch:
            ray_ctx = RayContext.get()
            batch_size = 32
            workers_per_node = 4
            global_batch_size = batch_size * workers_per_node
            config = {"lr": 0.8}
            trainer = Estimator.from_keras(model_creator=simple_model,
                                           compile_args_creator=compile_args,
                                           verbose=True,
                                           config=config,
                                           backend="horovod",
                                           workers_per_node=workers_per_node)
            import horovod.tensorflow.keras as hvd
            callbacks = [
                hvd.callbacks.LearningRateWarmupCallback(warmup_epochs=5,
                                                         initial_lr=0.4,
                                                         verbose=True),
                hvd.callbacks.LearningRateScheduleCallback(start_epoch=5,
                                                           end_epoch=10,
                                                           multiplier=1.,
                                                           initial_lr=0.4),
                hvd.callbacks.LearningRateScheduleCallback(start_epoch=10,
                                                           end_epoch=15,
                                                           multiplier=1e-1,
                                                           initial_lr=0.4),
                hvd.callbacks.LearningRateScheduleCallback(start_epoch=15,
                                                           end_epoch=20,
                                                           multiplier=1e-2,
                                                           initial_lr=0.4),
                hvd.callbacks.LearningRateScheduleCallback(start_epoch=20,
                                                           multiplier=1e-3,
                                                           initial_lr=0.4),
                LRChecker()
            ]
            for i in range(30):
                trainer.fit(create_train_datasets,
                            epochs=1,
                            batch_size=global_batch_size,
                            callbacks=callbacks)
        else:
            # skip the test on older horovod versions
            pass
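The three boolean flags above implement a manual check for horovod >= 0.19.2. A more compact way to express the same gate is sketched below using packaging.version (the same style of version comparison that Example #24 uses for ray); this is a sketch, not the test's actual code.

# Compact version gate, sketched with packaging.version.
from packaging import version
import horovod

def horovod_supports_lr_schedule():
    return version.parse(horovod.__version__) >= version.parse("0.19.2")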
Example #12
def init_ray_context_fixture():
    from zoo import init_spark_on_local
    from zoo.ray import RayContext
    sc = init_spark_on_local(cores=4, spark_log_level="INFO")
    ray_ctx = RayContext(sc=sc, object_store_memory="1g")
    ray_ctx.init()
    yield
    ray_ctx.stop()
    sc.stop()
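These fixture functions are presumably registered with a pytest fixture decorator that the snippet omits. Below is a hedged sketch of how such a fixture is typically declared and consumed; the fixture name and scope are illustrative.

# Sketch: registering and consuming a RayOnSpark fixture in pytest.
import pytest

@pytest.fixture(scope="module")
def ray_on_spark():
    from zoo import init_spark_on_local
    from zoo.ray import RayContext
    sc = init_spark_on_local(cores=4, spark_log_level="INFO")
    ray_ctx = RayContext(sc=sc, object_store_memory="1g")
    ray_ctx.init()
    yield ray_ctx
    ray_ctx.stop()
    sc.stop()

def test_uses_ray(ray_on_spark):
    import ray
    # Ray stays up for the duration of the test; tear-down runs after the yield.
    assert ray.is_initialized()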
Example #13
    def fit(
            self,
            input_df,
            validation_df=None,
            metric="mse",
            recipe=SmokeRecipe(),
            mc=False,
            resources_per_trial={"cpu": 2},
    ):
        """
        Trains the model for time sequence prediction.
        If the future sequence length is larger than 1, a seq2seq model is used; otherwise a
        vanilla LSTM model is used.
        :param input_df: The input time series data frame. Example:
            datetime    value  "extra feature 1"  "extra feature 2"
            2019-01-01  1.9    1                  2
            2019-01-02  2.3    0                  2
        :param validation_df: validation data
        :param metric: String. Metric used for train and validation. Available values are
                       "mean_squared_error" or "r_square"
        :param recipe: A Recipe object. Various recipes cover different search spaces and
                       stopping criteria. Default is SmokeRecipe().
        :param resources_per_trial: Machine resources to allocate per trial,
            e.g. ``{"cpu": 64, "gpu": 8}``
        :return: a pipeline constructed with the best model and configs.
        """
        self._check_df(input_df)
        if validation_df is not None:
            self._check_df(validation_df)

        ray_ctx = RayContext.get()
        is_local = ray_ctx.is_local
        # BasePredictor._check_fit_metric(metric)
        if not is_local:
            remote_dir = os.path.join(os.sep, "ray_results", self.name)
            if self.name not in get_remote_list(os.path.dirname(remote_dir)):
                cmd = "hadoop fs -mkdir -p {}".format(remote_dir)
                process(cmd)
        else:
            remote_dir = None

        self.pipeline = self._hp_search(
            input_df,
            validation_df=validation_df,
            metric=metric,
            recipe=recipe,
            mc=mc,
            resources_per_trial=resources_per_trial,
            remote_dir=remote_dir)
        return self.pipeline
Example #14
    def to_ray(self):
        """
        Put data of this SparkXShards to Ray cluster object store.
        :return: a new RayXShards which contains data of this SparkXShards.
        """
        from zoo.ray import RayContext
        ray_ctx = RayContext.get()
        object_store_address = ray_ctx.address_info["object_store_address"]

        def put_to_plasma(ids):
            def f(index, iterator):
                import pyarrow.plasma as plasma
                from zoo.util.utils import get_node_ip
                res = list(iterator)
                client = plasma.connect(object_store_address)
                target_id = ids[index]
                # If the ObjectID exists in plasma, we assume a task trial
                # succeeds and the data is already in the object store.
                if not client.contains(target_id):
                    object_id = client.put(res, target_id)
                    assert object_id == target_id, \
                        "Errors occurred when putting data into plasma object store"
                client.disconnect()
                yield target_id, get_node_ip()

            return f

        # Create plasma ObjectIDs beforehand instead of generating a random one on every attempt,
        # to avoid a memory leak if putting data into plasma fails and Spark retries the task.
        # A plasma ObjectID is a 20-byte string of characters and digits; random generation
        # is generally sufficient to ensure uniqueness.
        import pyarrow.plasma as plasma
        object_ids = [
            plasma.ObjectID.from_random()
            for i in range(self.rdd.getNumPartitions())
        ]
        object_id_node_ips = self.rdd.mapPartitionsWithIndex(
            put_to_plasma(object_ids)).collect()
        self.uncache()
        # Sort the data according to the node_ips.
        object_id_node_ips.sort(key=lambda x: x[1])
        partitions = [
            RayPartition(object_id=id_ip[0],
                         node_ip=id_ip[1],
                         object_store_address=object_store_address)
            for id_ip in object_id_node_ips
        ]
        return RayXShards(partitions)
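A short usage note: to_ray above is called on an existing SparkXShards to move its partitions into the plasma object store and obtain a RayXShards handle. A minimal sketch, assuming `spark_xshards` is an existing SparkXShards instance:

# Usage sketch; `spark_xshards` is assumed to be an existing SparkXShards.
def move_to_ray(spark_xshards):
    ray_xshards = spark_xshards.to_ray()
    # Each RayPartition now references data in the plasma object store,
    # tagged with the node IP it lives on.
    return ray_xshards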
Example #15
def orca_data_fixture():
    from zoo import init_spark_on_local
    from zoo.ray import RayContext
    global ray_ctx
    sc = init_spark_on_local(cores=4, spark_log_level="INFO")
    access_key_id = os.getenv("AWS_ACCESS_KEY_ID")
    secret_access_key = os.getenv("AWS_SECRET_ACCESS_KEY")
    if access_key_id is not None and secret_access_key is not None:
        ray_ctx = RayContext(sc=sc,
                             object_store_memory="1g",
                             env={
                                 "AWS_ACCESS_KEY_ID": access_key_id,
                                 "AWS_SECRET_ACCESS_KEY": secret_access_key
                             })
    else:
        ray_ctx = RayContext(sc=sc, object_store_memory="1g")
    ray_ctx.init()
    yield
    ray_ctx.stop()
    sc.stop()
Example #16
def rayonspark_fixture():
    from zoo import init_spark_on_local
    from zoo.ray import RayContext
    global sc
    global ray_ctx
    sc = init_spark_on_local(cores=8, spark_log_level="INFO")
    ray_ctx = RayContext(sc=sc, object_store_memory="1g")
    ray_ctx.init()
    yield
    ray_ctx.stop()
    sc.stop()
Example #17
    def test_auto_shard_tf(self):
        # file 1 contains all 0s, file 2 contains all 1s
        # If shard by files, then each model will
        # see the same records in the same batch.
        # If shard by records, then each batch
        # will have different records.
        # The loss func is constructed such that
        # the former case will return 0, and the latter
        # case will return non-zero.

        ray_ctx = RayContext.get()
        trainer = Estimator.from_keras(
            model_creator=auto_shard_model_creator,
            verbose=True,
            backend="tf2", workers_per_node=2)
        stats = trainer.fit(create_auto_shard_datasets, epochs=1, batch_size=4, steps_per_epoch=2)
        assert stats["train_loss"] == 0.0
Example #18
    def _from_spark_xshards_ray_api(spark_xshards):
        ray_ctx = RayContext.get()
        address = ray_ctx.redis_address
        password = ray_ctx.redis_password
        driver_ip = ray._private.services.get_node_ip_address()
        uuid_str = str(uuid.uuid4())
        resources = ray.cluster_resources()
        nodes = []
        for key, value in resources.items():
            if key.startswith("node:"):
                # if running in cluster, filter out driver ip
                if key != f"node:{driver_ip}":
                    nodes.append(key)
        # for the case of local mode and single node spark standalone
        if not nodes:
            nodes.append(f"node:{driver_ip}")

        partition_stores = {}
        for node in nodes:
            name = f"partition:{uuid_str}:{node}"
            store = ray.remote(num_cpus=0, resources={node: 1e-4})(LocalStore)\
                .options(name=name).remote()
            partition_stores[name] = store

        # actor creation is async; this is to make sure they have all been started
        ray.get([v.get_partitions.remote() for v in partition_stores.values()])
        partition_store_names = list(partition_stores.keys())
        result = spark_xshards.rdd.mapPartitionsWithIndex(
            lambda idx, part: write_to_ray(idx, part, address, password,
                                           partition_store_names)).collect()

        num_empty_partitions = 0
        id2ip = {}
        id2store_name = {}
        for idx, ip, local_store_name, is_empty in result:
            id2ip[idx] = ip
            id2store_name[idx] = local_store_name
            if is_empty:
                num_empty_partitions += 1
        if num_empty_partitions > 0:
            logger.warning(
                f"Found {num_empty_partitions} empty partitions in your SparkXShards."
            )

        return RayXShards(uuid_str, dict(id2store_name), dict(id2ip),
                          partition_stores)
Example #19
 def test_gluon(self):
     current_ray_ctx = RayContext.get()
     address_info = current_ray_ctx.address_info
     assert "object_store_address" in address_info
     config = create_trainer_config(
         batch_size=32,
         log_interval=2,
         optimizer="adam",
         optimizer_params={'learning_rate': 0.02})
     trainer = MXNetTrainer(config,
                            get_train_data_iter,
                            get_model,
                            get_loss,
                            eval_metrics_creator=get_metrics,
                            validation_metrics_creator=get_metrics,
                            num_workers=2,
                            test_data=get_test_data_iter)
     trainer.train(nb_epoch=2)
Example #20
 def from_partition_refs(ip2part_id, part_id2ref):
     ray_ctx = RayContext.get()
     uuid_str = str(uuid.uuid4())
     id2store_name = {}
     partition_stores = {}
     part_id2ip = {}
     result = []
     for node, part_ids in ip2part_id.items():
         name = f"partition:{uuid_str}:{node}"
         store = ray.remote(num_cpus=0, resources={f"node:{node}": 1e-4})(LocalStore) \
             .options(name=name).remote()
         partition_stores[name] = store
         for idx in part_ids:
             result.append(store.upload_partition.remote(idx, part_id2ref[idx]))
             id2store_name[idx] = name
             part_id2ip[idx] = node
     ray.get(result)
     return RayXShards(uuid_str, id2store_name, part_id2ip, partition_stores)
Example #21
def orca_data_fixture():
    from zoo import init_spark_on_local
    from zoo.ray import RayContext
    ZooContext._orca_eager_mode = True
    sc = init_spark_on_local(cores=4, spark_log_level="INFO")
    access_key_id = os.getenv("AWS_ACCESS_KEY_ID")
    secret_access_key = os.getenv("AWS_SECRET_ACCESS_KEY")
    if access_key_id is not None and secret_access_key is not None:
        ray_ctx = RayContext(sc=sc,
                             object_store_memory="1g",
                             env={
                                 "AWS_ACCESS_KEY_ID": access_key_id,
                                 "AWS_SECRET_ACCESS_KEY": secret_access_key
                             })
    else:
        ray_ctx = RayContext(sc=sc, object_store_memory="1g")
    ray_ctx.init()
    yield
    ray_ctx.stop()
    sc.stop()
Example #22
    def to_spark_xshards(self):
        from zoo.orca.data import SparkXShards
        ray_ctx = RayContext.get()
        sc = ray_ctx.sc
        address = ray_ctx.redis_address
        password = ray_ctx.redis_password
        num_parts = self.num_partitions()
        partition2store = self.partition2store_name
        rdd = sc.parallelize([0] * num_parts * 10, num_parts)\
            .mapPartitionsWithIndex(
            lambda idx, _: get_from_ray(idx, address, password, partition2store))

        # Trigger computation here to ensure the data is fetched from Ray before the
        # RayXShards goes out of scope and its data gets garbage collected.
        from pyspark.storagelevel import StorageLevel
        rdd = rdd.cache()
        result_rdd = rdd.map(lambda x: x)  # SparkXShards uncaches the rdd when garbage collected
        spark_xshards = SparkXShards(result_rdd)
        return spark_xshards
Example #23
def stop_orca_context():
    """
    Stop the SparkContext (and stop Ray services across the cluster if necessary).
    """
    from pyspark import SparkContext
    # If users successfully call stop_orca_context after the program finishes,
    # namely when there is no active SparkContext, the registered exit function
    # should do nothing.
    if SparkContext._active_spark_context is not None:
        print("Stopping orca context")
        from zoo.ray import RayContext
        ray_ctx = RayContext.get(initialize=False)
        if ray_ctx.initialized:
            ray_ctx.stop()
        sc = SparkContext.getOrCreate()
        if sc.getConf().get("spark.master").startswith("spark://"):
            from zoo import stop_spark_standalone
            stop_spark_standalone()
        sc.stop()
Example #24
    def _from_spark_xshards_ray_api(spark_xshards):
        ray_ctx = RayContext.get()
        address = ray_ctx.redis_address
        password = ray_ctx.redis_password
        driver_ip = ray._private.services.get_node_ip_address()
        uuid_str = str(uuid.uuid4())
        resources = ray.cluster_resources()
        nodes = []
        for key, value in resources.items():
            if key.startswith("node:"):
                # if running in cluster, filter out driver ip
                if key != f"node:{driver_ip}":
                    nodes.append(key)
        # for the case of local mode and single node spark standalone
        if not nodes:
            nodes.append(f"node:{driver_ip}")

        partition_stores = {}
        for node in nodes:
            name = f"partition:{uuid_str}:{node}"
            if version.parse(ray.__version__) >= version.parse("1.4.0"):
                store = ray.remote(num_cpus=0, resources={node: 1e-4})(LocalStore)\
                    .options(name=name, lifetime="detached").remote()
            else:
                store = ray.remote(num_cpus=0, resources={node: 1e-4})(LocalStore) \
                    .options(name=name).remote()
            partition_stores[name] = store

        # actor creation is async; this is to make sure they have all been started
        ray.get([v.get_partitions.remote() for v in partition_stores.values()])
        partition_store_names = list(partition_stores.keys())
        result_rdd = spark_xshards.rdd.mapPartitionsWithIndex(
            lambda idx, part: write_to_ray(idx, part, address, password,
                                           partition_store_names)).cache()
        result = result_rdd.collect()

        id2ip = {}
        id2store_name = {}
        for idx, ip, local_store_name in result:
            id2ip[idx] = ip
            id2store_name[idx] = local_store_name

        return RayXShards(uuid_str, result_rdd, partition_stores)
Example #25
    def test_local(self):
        @ray.remote
        class TestRay:
            def hostname(self):
                import socket
                return socket.gethostname()

        sc = init_spark_on_local(cores=4)
        ray_ctx = RayContext(sc=sc, object_store_memory="1g")
        ray_ctx.init()
        actors = [TestRay.remote() for i in range(0, 4)]
        print(ray.get([actor.hostname.remote() for actor in actors]))
        ray_ctx.stop()
        sc.stop()
Example #26
    def to_ray(self):
        import random
        import string
        from zoo.ray import RayContext
        ray_ctx = RayContext.get()
        object_store_address = ray_ctx.address_info["object_store_address"]

        # TODO: Handle failure when doing this?
        # TODO: delete the data in the plasma?
        def put_to_plasma(seed):
            def f(index, iterator):
                import pyarrow.plasma as plasma
                from zoo.orca.data.utils import get_node_ip
                # mapPartitions may apply the same random seed in every partition, so use the
                # partition index to vary the seed and avoid identical object_ids in plasma.
                random.seed(seed + str(index))
                res = list(iterator)
                client = plasma.connect(object_store_address)
                object_id = client.put(res)
                yield object_id, get_node_ip()

            return f

        # Generate a random string here to make sure that when this method is called twice, the
        # seeds to generate plasma ObjectID are different.
        random_str = ''.join([
            random.choice(string.ascii_letters + string.digits)
            for i in range(32)
        ])
        object_id_node_ips = self.rdd.mapPartitionsWithIndex(
            put_to_plasma(random_str)).collect()
        self.uncache()
        # Sort the data according to the node_ips.
        object_id_node_ips.sort(key=lambda x: x[1])
        partitions = [
            RayPartition(shard_list=id_ip[0],
                         node_ip=id_ip[1],
                         object_store_address=object_store_address)
            for id_ip in object_id_node_ips
        ]
        return RayXShards(partitions)
Example #27
    def impl_test_auto_shard(self, backend):

        # file 1 contains all 0s, file 2 contains all 1s
        # If shard by files, then each model will
        # see the same records in the same batch.
        # If shard by records, then each batch
        # will have different records.
        # The loss func is constructed such that
        # the former case will return 0, and the latter
        # case will return non-zero.

        ray_ctx = RayContext.get()
        trainer = Estimator(
            model_creator=create_auto_shard_model,
            compile_args_creator=create_auto_shard_compile_args,
            verbose=True,
            config={},
            backend=backend,
            workers_per_node=2)
        stats = trainer.fit(create_auto_shard_datasets,
                            epochs=1,
                            steps_per_epoch=2)
        assert stats["train_loss"] == 0.0
Example #28
    def __init__(self,
                 config,
                 model_creator,
                 loss_creator=None,
                 eval_metrics_creator=None,
                 validation_metrics_creator=None,
                 num_workers=None,
                 num_servers=None,
                 runner_cores=None):
        ray_ctx = RayContext.get()
        if not num_workers:
            num_workers = ray_ctx.num_ray_nodes
        self.config = {} if config is None else config
        assert isinstance(config, dict), "config must be a dict"
        for param in ["optimizer", "optimizer_params", "log_interval"]:
            assert param in config, param + " must be specified in config"
        self.model_creator = model_creator
        self.loss_creator = loss_creator
        self.validation_metrics_creator = validation_metrics_creator
        self.eval_metrics_creator = eval_metrics_creator
        self.num_workers = num_workers
        self.num_servers = num_servers if num_servers else self.num_workers

        # Generate the actor classes.
        # Add dummy custom resources (_mxnet_worker and _mxnet_server) to distinguish workers
        # from servers when runner_cores is specified, so that one worker and one server can
        # be placed on each node for better performance.
        Worker = ray.remote(num_cpus=runner_cores, resources={"_mxnet_worker": 1})(MXNetRunner) \
            if runner_cores else ray.remote(MXNetRunner)
        Server = ray.remote(num_cpus=runner_cores, resources={"_mxnet_server": 1})(MXNetRunner) \
            if runner_cores else ray.remote(MXNetRunner)

        # Start runners: workers followed by servers
        self.workers = [Worker.remote() for i in range(self.num_workers)]
        self.servers = [Server.remote() for i in range(self.num_servers)]
        self.runners = self.workers + self.servers

        env = {
            "DMLC_PS_ROOT_URI": str(get_host_ip()),
            "DMLC_PS_ROOT_PORT": str(find_free_port()),
            "DMLC_NUM_SERVER": str(self.num_servers),
            "DMLC_NUM_WORKER": str(self.num_workers),
        }
        envs = []
        for i in range(self.num_workers):
            current_env = env.copy()
            current_env['DMLC_ROLE'] = 'worker'
            envs.append(current_env)
        for i in range(self.num_servers):
            current_env = env.copy()
            current_env['DMLC_ROLE'] = 'server'
            envs.append(current_env)

        env['DMLC_ROLE'] = 'scheduler'
        modified_env = os.environ.copy()
        modified_env.update(env)
        # The system environment must be included to run the shell command below
        # TODO: Need to kill this process manually?
        subprocess.Popen("python -c 'import mxnet'",
                         shell=True,
                         env=modified_env)

        ray.get([
            runner.setup_distributed.remote(envs[i], self.config,
                                            self.model_creator,
                                            self.loss_creator,
                                            self.validation_metrics_creator,
                                            self.eval_metrics_creator)
            for i, runner in enumerate(self.runners)
        ])
Example #29
                      help="The file path to be read")

    (options, args) = parser.parse_args(sys.argv)

    # Prepare csv files
    df = pd.read_csv(options.file_path)
    sc = init_spark_on_local(cores="*")
    sqlContext = SQLContext(sc)
    num_nodes, num_cores = get_node_and_core_number()
    df_spark = sqlContext.createDataFrame(df)
    df_spark.printSchema()
    df_spark.repartition(num_cores).write.\
        format('json').mode("overwrite").save("/tmp/ray-pandas-example")

    # init ray context
    ray_ctx = RayContext(sc=sc, object_store_memory="5g")
    ray_ctx.init(object_store_memory="5g")

    # read data
    data_shard = zoo.xshard.pandas.read_json("/tmp/ray-pandas-example",
                                             ray_ctx)

    # collect data
    data = data_shard.collect()
    print("collected data :")
    print(data[0].head())

    # repartition
    partitions = data_shard.get_partitions()
    print("get %d partitions" % len(partitions))
    data_shard.repartition(2)
Example #30
                        help='The number of executor cores you want to use.')
    parser.add_argument('-n',
                        '--num_workers',
                        type=int,
                        default=2,
                        help='The number of workers to be launched.')
    opt = parser.parse_args()
    if opt.hadoop_conf:
        assert opt.conda_name is not None, "conda_name must be specified for yarn mode"
        sc = init_spark_on_yarn(hadoop_conf=opt.hadoop_conf,
                                conda_name=opt.conda_name,
                                num_executors=opt.num_workers,
                                executor_cores=opt.executor_cores)
    else:
        sc = init_spark_on_local(cores="*")
    ray_ctx = RayContext(sc=sc)
    ray_ctx.init()

    import pandas as pd
    df = pd.read_csv(opt.path)
    feature_cols = [
        "FIPS", "Lower 95% Confidence Interval",
        "Upper 95% Confidence Interval", "Average Annual Count",
        "Recent 5-Year Trend"
    ]
    target_col = "Age-Adjusted Incidence Rate"
    train_df, val_df = train_test_split(df, test_size=0.2, random_state=2)

    config = {'random_state': 2, 'min_child_weight': 3, 'n_jobs': 2}
    estimator = AutoXGBoost().regressor(feature_cols=feature_cols,
                                        target_col=target_col,