Example #1
  def testFailImportingRemoteFunction(self):
    ray.init(start_ray_local=True, num_workers=2, driver_mode=ray.SILENT_MODE)

    # This example is somewhat contrived. It should be successfully pickled, and
    # then it should throw an exception when it is unpickled. This may depend a
    # bit on the specifics of our pickler.
    def reducer(*args):
      raise Exception("There is a problem here.")
    class Foo(object):
      def __init__(self):
        self.__name__ = "Foo_object"
        self.func_doc = ""
        self.__globals__ = {}
      def __reduce__(self):
        return reducer, ()
      def __call__(self):
        return
    ray.remote(Foo())
    for _ in range(100): # Retry if we need to wait longer.
      if len(ray.task_info()["failed_remote_function_imports"]) >= 1:
        break
      time.sleep(0.1)
    self.assertTrue("There is a problem here." in ray.task_info()["failed_remote_function_imports"][0]["error_message"])

    ray.worker.cleanup()
Example #2
def test_remote_training_loss(ray_start_regular):
    net = ray.remote(TrainActor).remote()
    net_values = TrainActor().values
    loss, variables, _, sess, grads, train, placeholders = net_values

    before_acc = sess.run(
        loss, feed_dict=dict(zip(placeholders, [[2] * 100, [4] * 100])))

    for _ in range(3):
        gradients_list = ray.get([
            net.training_step.remote(variables.get_weights()) for _ in range(2)
        ])
        mean_grads = [
            sum(gradients[i]
                for gradients in gradients_list) / len(gradients_list)
            for i in range(len(gradients_list[0]))
        ]
        feed_dict = {
            grad[0]: mean_grad
            for (grad, mean_grad) in zip(grads, mean_grads)
        }
        sess.run(train, feed_dict=feed_dict)
    after_acc = sess.run(
        loss, feed_dict=dict(zip(placeholders, [[2] * 100, [4] * 100])))
    assert before_acc < after_acc
Example #3
    def _setup_runner(self):
        self.status = Trial.RUNNING
        trainable_cls = get_registry().get(
            TRAINABLE_CLASS, self.trainable_name)
        cls = ray.remote(
            num_cpus=self.resources.driver_cpu_limit,
            num_gpus=self.resources.driver_gpu_limit)(trainable_cls)
        if not self.result_logger:
            if not os.path.exists(self.local_dir):
                os.makedirs(self.local_dir)
            self.logdir = tempfile.mkdtemp(
                prefix="{}_{}".format(
                    str(self)[:MAX_LEN_IDENTIFIER], date_str()),
                dir=self.local_dir)
            self.result_logger = UnifiedLogger(
                self.config, self.logdir, self.upload_dir)
        remote_logdir = self.logdir

        def logger_creator(config):
            # Set the working dir in the remote process, for user file writes
            if not os.path.exists(remote_logdir):
                os.makedirs(remote_logdir)
            os.chdir(remote_logdir)
            return NoopLogger(config, remote_logdir)

        # Logging for trials is handled centrally by TrialRunner, so
        # configure the remote runner to use a noop-logger.
        self.runner = cls.remote(
            config=self.config, registry=get_registry(),
            logger_creator=logger_creator)
Example #4
    def _init(self):
        self.local_evaluator = DQNEvaluator(
            self.registry, self.env_creator, self.config, self.logdir, 0)
        remote_cls = ray.remote(
            num_cpus=1, num_gpus=self.config["num_gpus_per_worker"])(
            DQNEvaluator)
        self.remote_evaluators = [
            remote_cls.remote(
                self.registry, self.env_creator, self.config, self.logdir,
                i)
            for i in range(self.config["num_workers"])]

        if self.config["force_evaluators_remote"]:
            self.remote_evaluators = drop_colocated(self.remote_evaluators)

        for k in OPTIMIZER_SHARED_CONFIGS:
            if k not in self.config["optimizer_config"]:
                self.config["optimizer_config"][k] = self.config[k]

        self.optimizer = getattr(optimizers, self.config["optimizer_class"])(
            self.config["optimizer_config"], self.local_evaluator,
            self.remote_evaluators)

        self.saver = tf.train.Saver(max_to_keep=None)
        self.last_target_update_ts = 0
        self.num_target_updates = 0
Example #5
    def testBasic(self):
        ray.init(num_cpus=4)
        local = _MockEvaluator()
        remotes = ray.remote(_MockEvaluator)
        remote_evaluators = [remotes.remote() for i in range(5)]
        test_optimizer = AsyncOptimizer(
            {"grads_per_step": 10}, local, remote_evaluators)
        test_optimizer.step()
        self.assertTrue(all(local.get_weights() == 0))
Example #6
    def test_simple_class(self):
        cls = ray.remote(cyth.simple_class)
        a1 = cls.remote()
        a2 = cls.remote()

        result1 = ray.get(a1.increment.remote())
        result2 = ray.get(a2.increment.remote())
        result3 = ray.get(a2.increment.remote())

        self.assertEqual(result1, 1)
        self.assertEqual(result2, 1)
        self.assertEqual(result3, 2)
Example #7
def example6():
    """Cython simple class"""

    ray.init()

    cls = ray.remote(cyth.simple_class)
    a1 = cls.remote()
    a2 = cls.remote()

    result1 = ray.get(a1.increment.remote())
    result2 = ray.get(a2.increment.remote())

    print(result1, result2)
Example #8
def test_network_driver_worker_independent(ray_start_regular):
    # Create a network on the driver locally.
    sess1 = tf.Session()
    loss1, init1, _, _ = make_linear_network()
    ray.experimental.TensorFlowVariables(loss1, sess1)
    sess1.run(init1)

    net2 = ray.remote(NetActor).remote()
    weights2 = ray.get(net2.get_weights.remote())

    new_weights2 = ray.get(
        net2.set_and_get_weights.remote(net2.get_weights.remote()))
    assert weights2 == new_weights2
Example #9
def run_func(func, *args, **kwargs):
    """Helper function for running examples"""
    ray.init()

    func = ray.remote(func)

    # NOTE: kwargs not allowed for now
    result = ray.get(func.remote(*args))

    # Inspect the stack to get calling example
    caller = inspect.stack()[1][3]
    print("%s: %s" % (caller, str(result)))

    return result
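A hedged usage sketch for the helper above: any plain function can be passed in, and the helper wraps it with ray.remote and fetches the result. The square function is illustrative only, and run_func calls ray.init() itself, so invoke it once per process.

def square(x):
    return x * x

print(run_func(square, 4))  # prints something like "<module>: 16" and returns 16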
Example #10
    def testNetworkDriverWorkerIndependent(self):
        ray.init(num_workers=1)

        # Create a network on the driver locally.
        sess1 = tf.Session()
        loss1, init1, _, _ = make_linear_network()
        ray.experimental.TensorFlowVariables(loss1, sess1)
        sess1.run(init1)

        net2 = ray.remote(NetActor).remote()
        weights2 = ray.get(net2.get_weights.remote())

        new_weights2 = ray.get(net2.set_and_get_weights.remote(
            net2.get_weights.remote()))
        self.assertEqual(weights2, new_weights2)
Example #11
    def make(
            cls, evaluator_cls, evaluator_args, num_workers, optimizer_config):
        """Create evaluators and an optimizer instance using those evaluators.

        Args:
            evaluator_cls (class): Python class of the evaluators to create.
            evaluator_args (list): List of constructor args for the evaluators.
            num_workers (int): Number of remote evaluators to create in
                addition to a local evaluator. This can be zero or greater.
            optimizer_config (dict): Keyword arguments to pass to the
                optimizer class constructor.
        """

        local_evaluator = evaluator_cls(*evaluator_args)
        remote_cls = ray.remote(num_cpus=1)(evaluator_cls)
        remote_evaluators = [
            remote_cls.remote(*evaluator_args)
            for _ in range(num_workers)]
        return cls(optimizer_config, local_evaluator, remote_evaluators)
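A minimal, self-contained sketch of the same pattern used by make() above: wrap a plain class with ray.remote(num_cpus=1)(cls), keep one local instance, and create several remote workers. The Counter class is illustrative and not part of the original codebase.

import ray

class Counter:
    def __init__(self, start):
        self.value = start

    def increment(self):
        self.value += 1
        return self.value

ray.init()
local_counter = Counter(0)
RemoteCounter = ray.remote(num_cpus=1)(Counter)
workers = [RemoteCounter.remote(0) for _ in range(4)]
print(ray.get([w.increment.remote() for w in workers]))  # [1, 1, 1, 1]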
Example #12
    def _init(self):
        self.global_step = 0
        self.kl_coeff = self.config["kl_coeff"]
        self.local_evaluator = PPOEvaluator(
            self.registry, self.env_creator, self.config, self.logdir, False)
        RemotePPOEvaluator = ray.remote(
            **self.config["worker_resources"])(PPOEvaluator)
        self.remote_evaluators = [
            RemotePPOEvaluator.remote(
                self.registry, self.env_creator, self.config, self.logdir,
                True)
            for _ in range(self.config["num_workers"])]
        self.start_time = time.time()
        if self.config["write_logs"]:
            self.file_writer = tf.summary.FileWriter(
                self.logdir, self.local_evaluator.sess.graph)
        else:
            self.file_writer = None
        self.saver = tf.train.Saver(max_to_keep=None)
Example #13
    def _setup_runner(self, trial):
        cls = ray.remote(
            num_cpus=trial.resources.cpu,
            num_gpus=trial.resources.gpu)(trial._get_trainable_cls())

        trial.init_logger()
        # We checkpoint metadata here to try mitigating logdir duplication
        self.try_checkpoint_metadata(trial)
        remote_logdir = trial.logdir

        def logger_creator(config):
            # Set the working dir in the remote process, for user file writes
            if not os.path.exists(remote_logdir):
                os.makedirs(remote_logdir)
            os.chdir(remote_logdir)
            return NoopLogger(config, remote_logdir)

        # Logging for trials is handled centrally by TrialRunner, so
        # configure the remote runner to use a noop-logger.
        return cls.remote(config=trial.config, logger_creator=logger_creator)
Example #14
    def __init__(self, env_fns, spaces=None):
        """
        envs: list of gym environments to run in subprocesses
        """
        self.waiting = False
        self.closed = False
        self.task_pool = TaskPool(timeout=10)

        nenvs = len(env_fns)

        self.actors = []
        self.actor_to_i = {}
        remote_actor = ray.remote(Actor)
        for i in range(nenvs):
            actor = remote_actor.remote(i, env_fns[i])
            self.actors.append(actor)
            self.actor_to_i[actor] = i

        observation_space, action_space = ray.get(self.actors[0].get_spaces.remote())
        VecEnv.__init__(self, len(env_fns), observation_space, action_space)

        self.results = [([0] * OBSERVATION_SPACE, 0, False, {"bad": True})] * self.num_envs
Example #15
    def testSynchronize(self):
        """Synchronize applies filter buffer onto own filter"""
        filt1 = MeanStdFilter(())
        for i in range(10):
            filt1(i)
        self.assertEqual(filt1.rs.n, 10)
        filt1.clear_buffer()
        self.assertEqual(filt1.buffer.n, 0)

        RemoteEvaluator = ray.remote(_MockEvaluator)
        remote_e = RemoteEvaluator.remote(sample_count=10)
        remote_e.sample.remote()

        FilterManager.synchronize({
            "obs_filter": filt1,
            "rew_filter": filt1.copy()
        }, [remote_e])

        filters = ray.get(remote_e.get_filters.remote())
        obs_f = filters["obs_filter"]
        self.assertEqual(filt1.rs.n, 20)
        self.assertEqual(filt1.buffer.n, 0)
        self.assertEqual(obs_f.rs.n, filt1.rs.n)
        self.assertEqual(obs_f.buffer.n, filt1.buffer.n)
Example #16
    def testRemoteTrainingLoss(self):
        ray.init(num_workers=2)

        net = ray.remote(TrainActor).remote()
        net_values = TrainActor().values
        loss, variables, _, sess, grads, train, placeholders = net_values

        before_acc = sess.run(loss, feed_dict=dict(zip(placeholders,
                                                       [[2] * 100,
                                                        [4] * 100])))

        for _ in range(3):
            gradients_list = ray.get(
                [net.training_step.remote(variables.get_weights())
                 for _ in range(2)])
            mean_grads = [sum([gradients[i] for gradients in gradients_list]) /
                          len(gradients_list) for i
                          in range(len(gradients_list[0]))]
            feed_dict = {grad[0]: mean_grad for (grad, mean_grad)
                         in zip(grads, mean_grads)}
            sess.run(train, feed_dict=feed_dict)
        after_acc = sess.run(loss, feed_dict=dict(zip(placeholders,
                                                      [[2] * 100, [4] * 100])))
        self.assertTrue(before_acc < after_acc)
Example #17
    def __init__(self,
                 model_creator,
                 data_creator,
                 optimizer_creator,
                 loss_creator,
                 train_function=None,
                 validation_function=None,
                 initialization_hook=None,
                 config=None,
                 num_replicas=1,
                 use_gpu=False,
                 batch_size=16,
                 backend="auto"):
        """Sets up the PyTorch trainer.

        Args:
            model_creator (dict -> torch.nn.Module): creates the model
                using the config.
            data_creator (int, dict -> DataLoader, DataLoader): Function that
                takes in (batch_size, config) and returns two Torch DataLoader
                objects.
            optimizer_creator (torch.nn.Module, dict -> optimizer):
                creates the loss and optimizer using the model and the config.
            loss_creator (dict -> loss): Creates the loss function/criterion
                using the config.
            train_function: Trains a model for an epoch. This takes in (
                model, train_dataloader, criterion, optimizer, config), and
                returns a dict of training stats.
            validation_function: Runs validation. This takes in (
                model, val_dataloader, criterion, config) and returns a dict of
                validation stats.
            config (dict): configuration passed to "model_creator",
                "data_creator", "optimizer_creator", and "loss_creator".
            num_replicas (int): the number of workers used in distributed
                training.
            use_gpu (bool): Sets resource allocation for workers to 1 GPU
                if true.
            batch_size (int): batch size for an update.
            backend (string): backend used by distributed PyTorch.
        """
        # TODO: add support for mixed precision
        # TODO: add support for callbacks
        if num_replicas > 1 and not dist.is_available():
            raise ValueError(
                ("Distributed PyTorch is not supported on macOS. "
                 "To run without distributed PyTorch, set 'num_replicas=1'. "
                 "For more information, see "
                 "https://github.com/pytorch/examples/issues/467."))

        self.model_creator = model_creator
        self.train_function = train_function
        self.validation_function = validation_function
        self.config = {} if config is None else config
        self.optimizer_timer = utils.TimerStat(window_size=1)

        if backend == "auto":
            backend = "nccl" if use_gpu else "gloo"

        logger.info("Using {} as backend.".format(backend))

        if num_replicas == 1:
            # Generate actor class
            Runner = ray.remote(num_cpus=1,
                                num_gpus=int(use_gpu))(PyTorchRunner)
            # Start workers
            self.workers = [
                Runner.remote(model_creator,
                              data_creator,
                              optimizer_creator,
                              loss_creator,
                              train_function=train_function,
                              validation_function=validation_function,
                              config=self.config,
                              batch_size=batch_size)
            ]
            if initialization_hook:
                self.apply_all_workers(initialization_hook)
            # Get setup tasks in order to throw errors on failure
            ray.get(self.workers[0].setup.remote())
        else:
            # Generate actor class
            Runner = ray.remote(
                num_cpus=1, num_gpus=int(use_gpu))(DistributedPyTorchRunner)
            # Compute batch size per replica
            batch_size_per_replica = batch_size // num_replicas
            if batch_size % num_replicas > 0:
                new_batch_size = batch_size_per_replica * num_replicas
                logger.warning(
                    ("Changing batch size from {old_batch_size} to "
                     "{new_batch_size} to evenly distribute batches across "
                     "{num_replicas} replicas.").format(
                         old_batch_size=batch_size,
                         new_batch_size=new_batch_size,
                         num_replicas=num_replicas))
            # Start workers
            self.workers = [
                Runner.remote(model_creator,
                              data_creator,
                              optimizer_creator,
                              loss_creator,
                              backend=backend,
                              train_function=train_function,
                              validation_function=validation_function,
                              config=self.config,
                              batch_size=batch_size_per_replica)
                for i in range(num_replicas)
            ]
            if initialization_hook:
                self.apply_all_workers(initialization_hook)

            # Compute URL for initializing distributed PyTorch
            ip = ray.get(self.workers[0].get_node_ip.remote())
            port = ray.get(self.workers[0].find_free_port.remote())
            address = "tcp://{ip}:{port}".format(ip=ip, port=port)
            # Get setup tasks in order to throw errors on failure
            ray.get([
                worker.setup.remote(address, i, len(self.workers))
                for i, worker in enumerate(self.workers)
            ])
Example #18
    def __init__(self,
                 model_creator,
                 num_workers,
                 devices_per_worker,
                 gpu=True,
                 strategy="ps",
                 grad_shard_bytes=10000000,
                 all_reduce_alg="simple"):

        if num_workers == 1 and strategy == "ps":
            logger.warning(
                "The parameter server strategy does not make sense for single "
                "worker operation, falling back to simple mode.")
            strategy = "simple"

        if strategy == "ps":
            use_plasma_op = True
        elif strategy == "simple":
            use_plasma_op = False
            grad_shard_bytes = 0  # tensor fusion doesn't make sense
        else:
            raise ValueError("strategy must be one of 'ps', 'simple'")
        self.strategy = strategy

        self.model_creator = model_creator
        if gpu:
            requests = {"num_gpus": devices_per_worker}
        else:
            requests = {"num_cpus": devices_per_worker}

        RemoteSGDWorker = ray.remote(**requests)(SGDWorker)
        self.workers = []
        logger.info(
            "Creating SGD workers ({} total, {} devices per worker)".format(
                num_workers, devices_per_worker))
        for worker_index in range(num_workers):
            self.workers.append(
                RemoteSGDWorker.remote(
                    worker_index,
                    model_creator,
                    num_devices=devices_per_worker,
                    plasma_op=use_plasma_op,
                    gpu=gpu,
                    max_bytes=grad_shard_bytes,
                    all_reduce_alg=all_reduce_alg))

        logger.info("Waiting for gradient configuration")
        shard_shapes = ray.get(self.workers[0].shard_shapes.remote())

        logger.info("Waiting for actors to start")
        ray.get([w.shard_shapes.remote() for w in self.workers])

        if strategy == "ps":
            logger.info("Starting parameter servers ({} shards)".format(
                len(shard_shapes)))
            self.ps_list = [
                ParameterServer.remote(len(self.workers), i)
                for i, s in enumerate(shard_shapes)
            ]
            ray.get([
                ps.initialize.remote(s)
                for ps, s in zip(self.ps_list, shard_shapes)
            ])
            logger.info("Parameter servers started")
        else:
            self.ps_list = []
Example #19
    def testRemoteTrainingStep(self):
        ray.init(num_workers=1)

        net = ray.remote(TrainActor).remote()
        ray.get(net.training_step.remote(net.get_weights.remote()))
Example #20
    def __init__(
            self,
            *,
            model_creator,
            optimizer_creator,
            loss_creator=None,
            scheduler_creator=None,
            training_operator_cls=TrainingOperator,
            initialization_hook=None,
            config=None,
            scheduler_step_freq="batch",
            use_tqdm=False,
            backend="torch_distributed",
            workers_per_node=1):

        # todo remove ray_ctx to run on workers
        ray_ctx = RayContext.get()
        if not (isinstance(model_creator, types.FunctionType) and
                isinstance(optimizer_creator, types.FunctionType)):  # Torch model is also callable.
            raise ValueError(
                "Must provide a function for both model_creator and optimizer_creator")

        self.model_creator = model_creator
        self.optimizer_creator = optimizer_creator
        self.loss_creator = loss_creator
        self.scheduler_creator = scheduler_creator
        self.training_operator_cls = training_operator_cls
        self.scheduler_step_freq = scheduler_step_freq
        self.use_tqdm = use_tqdm

        if not training_operator_cls and not loss_creator:
            raise ValueError("If a loss_creator is not provided, you must "
                             "provide a custom training operator.")

        self.initialization_hook = initialization_hook
        self.config = {} if config is None else config
        worker_config = self.config.copy()
        params = dict(
            model_creator=self.model_creator,
            optimizer_creator=self.optimizer_creator,
            loss_creator=self.loss_creator,
            scheduler_creator=self.scheduler_creator,
            training_operator_cls=self.training_operator_cls,
            scheduler_step_freq=self.scheduler_step_freq,
            use_tqdm=self.use_tqdm,
            config=worker_config)

        if backend == "torch_distributed":
            cores_per_node = ray_ctx.ray_node_cpu_cores // workers_per_node
            num_nodes = ray_ctx.num_ray_nodes * workers_per_node
            RemoteRunner = ray.remote(num_cpus=cores_per_node)(TorchRunner)
            self.remote_workers = [
                RemoteRunner.remote(**params) for i in range(num_nodes)
            ]
            ray.get([
                worker.setup.remote(cores_per_node)
                for i, worker in enumerate(self.remote_workers)
            ])

            head_worker = self.remote_workers[0]
            address = ray.get(head_worker.setup_address.remote())

            logger.info(f"initializing pytorch process group on {address}")

            ray.get([
                worker.setup_torch_distribute.remote(address, i, num_nodes)
                for i, worker in enumerate(self.remote_workers)
            ])

        elif backend == "horovod":
            from zoo.orca.learn.horovod.horovod_ray_runner import HorovodRayRunner
            self.horovod_runner = HorovodRayRunner(ray_ctx,
                                                   worker_cls=TorchRunner,
                                                   worker_param=params,
                                                   workers_per_node=workers_per_node)
            self.remote_workers = self.horovod_runner.remote_workers
            cores_per_node = self.horovod_runner.cores_per_node
            ray.get([
                worker.setup.remote(cores_per_node)
                for i, worker in enumerate(self.remote_workers)
            ])

            ray.get([
                worker.setup_horovod.remote()
                for i, worker in enumerate(self.remote_workers)
            ])
        else:
            raise Exception("Only \"torch_distributed\" and \"horovod\" are supported "
                            "values of backend, but got {}".format(backend))
        self.num_workers = len(self.remote_workers)
Example #21
# Declare this as a remote function with two return values (x and y), so that
# each .remote() call below yields a pair of object IDs.
@ray.remote(num_return_vals=2)
def generate_fake_x_y_data(num_data, seed=0):
  # Seed numpy to make the script deterministic.
  np.random.seed(seed)
  x = np.random.rand(num_data)
  y = x * 0.1 + 0.3
  return x, y

# Generate some training data.
batch_ids = [generate_fake_x_y_data.remote(BATCH_SIZE, seed=i) for i in range(NUM_BATCHES)]
x_ids = [x_id for x_id, y_id in batch_ids]
y_ids = [y_id for x_id, y_id in batch_ids]
# Generate some test data.
x_test, y_test = ray.get(generate_fake_x_y_data.remote(BATCH_SIZE, seed=NUM_BATCHES))

# Create actors to store the networks.
remote_network = ray.remote(Network)
actor_list = [remote_network.remote(x_ids[i], y_ids[i]) for i in range(NUM_BATCHES)]
local_network = Network(x_test, y_test)

# Get initial weights of local network.
weights = local_network.get_weights()

# Do some steps of training.
for iteration in range(NUM_ITERS):
  # Put the weights in the object store. This is optional. We could instead pass
  # the variable weights directly into step.remote, in which case it would be
  # placed in the object store under the hood. However, in that case multiple
  # copies of the weights would be put in the object store, so this approach is
  # more efficient.
  weights_id = ray.put(weights)
  # Call the remote function multiple times in parallel.
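The comments above describe placing the weights in the object store once with ray.put and handing the resulting ID to many tasks. A standalone sketch of that pattern; the use_weights task is hypothetical and only illustrates the idea:

import numpy as np
import ray

ray.init()

@ray.remote
def use_weights(weights, step):
    # Every task reads the single object-store copy created by ray.put below.
    return float(np.sum(weights)) + step

weights = np.ones(10)
weights_id = ray.put(weights)  # stored once, referenced by every task
print(ray.get([use_weights.remote(weights_id, i) for i in range(4)]))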
Example #22
def RayFuncWrapFunc(func):
    return ray.remote(func)
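A short, hedged usage sketch for the wrapper above: the returned object is a Ray remote function, so it is invoked with .remote() and resolved with ray.get(). The add function is illustrative.

import ray

def add(a, b):
    return a + b

ray.init()
remote_add = RayFuncWrapFunc(add)  # equivalent to ray.remote(add)
print(ray.get(remote_add.remote(1, 2)))  # 3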
Example #23
def get_ray_result(cython_func, *args):
    func = ray.remote(cython_func)
    return ray.get(func.remote(*args))
Example #24
def train(envMaker,
          policy,
          optPolicy,
          baseline=None,
          optBaselineMaker=None,
          saver=None,
          iterations: int = 100,
          batchSize: int = BATCH_SIZE,
          gamma: float = GAMMA,
          lmbd: float = LAMBDA,
          maxDKL: float = MAX_DKL,
          beta: float = BETA,
          maxEpisodeLength: int = MAX_LENGTH,
          pBatchMem: float = 1.0,
          nTests: int = TESTS,
          testFreq: int = TEST_FREQ,
          testSteps: int = MAX_LENGTH,
          device=DEVICE_DEFT,
          nWorkers=NCPUS,
          workerSeeds: list = [],
          testSeed: int = 69,
          logger=None,
          **kwargs):

    assert (gamma <= 1) and (gamma >=
                             0), "Gamma must be in the interval [0,1] "
    assert nWorkers > 0, "nWorkers must be greater than 0"
    assert batchSize > 32, "Just 'case"
    gae = kwargs.get("gae", False)
    print("Gae status", gae)

    # init ray if needed
    if nWorkers > 1:
        try:
            import ray
            RAY = True
            nWorkers = nWorkers if nWorkers <= NCPUS else NCPUS
            ray.init(num_cpus=nWorkers)
        except:
            RAY = False
    else:
        RAY = False

    # Test variables
    testRewardRes, testVar, testStepsRes = [], [], []
    envTest = envMaker(seed=testSeed)
    if testSeed > 0:
        torch.manual_seed(testSeed)

    # Finishing training
    def closeTent():
        if RAY:
            ray.shutdown()
        return (testRewardRes, testVar, testStepsRes)

    # Create and load the optimizers
    optPolicy = optPolicy(policy, **kwargs)
    optBaseline = optBaselineMaker(
        baseline.parameters()) if baseline is not None else None

    # Creating crawler
    if RAY:

        diffSeeds = len(workerSeeds) - (nWorkers - 1)
        if diffSeeds < 0:
            for _ in range(-diffSeeds):
                workerSeeds += [-1]

        crawler = ray.remote(Crawler)
        batchPerCrw = ceil(batchSize / (nWorkers - 1))
        crawlers = [
            crawler.remote(envMaker,
                           policy.clone().to(DEVICE_DEFT),
                           baseline.clone().to(DEVICE_DEFT)
                           if baseline is not None else None,
                           gamma,
                           maxEpisodeLength,
                           batchPerCrw,
                           pBatchMem,
                           gae=gae,
                           lmbd=lmbd,
                           seed=workerSeeds[i]) for i in range(nWorkers - 1)
        ]
    else:
        crawler = Crawler(envMaker,
                          policy.clone(),
                          baseline.clone() if baseline is not None else None,
                          gamma,
                          maxEpisodeLength,
                          batchSize,
                          pBatchMem,
                          gae=gae,
                          lmbd=lmbd,
                          device=device,
                          seed=workerSeeds[0])

    # iterations loop
    bar = tqdm(range(iterations), unit="updates", desc="Training Policy")
    for it in bar:
        # Checking saver
        if saver is not None:
            saver.check()
        # Checking and executing test
        if it % testFreq == 0:
            meanAcc, var, meanSteps = testRun(envTest,
                                              policy,
                                              nTests=nTests,
                                              testSteps=testSteps,
                                              logger=logger)
            testRewardRes += [meanAcc]
            testVar += [var]
            testStepsRes += [meanSteps]
            bar.write(
                "Test Results: meanGt {:.3f}, var {:.3f} meanEpSteps {:.3f}".
                format(meanAcc, var, meanSteps))

            if kwargs.get("desiredPerformance", False):
                upper = meanAcc + 0.5 * var**0.5
                if upper >= kwargs["desiredPerformance"]:
                    return closeTent()

        # Produce and get trajectories batches
        if not RAY:
            trajectories = [crawler.getBatch()]
        else:
            trajectories = ray.get([crw.getBatch.remote() for crw in crawlers])

        # Update policy parameters
        s = optPolicy.updateParams(*trajectories)
        bar.write(s) if s is not None else None
        if logger is not None and s is not None:
            logger.logr(s)

        # Update baseline parameters
        if baseline is not None:
            states, returns = optPolicy.states, optPolicy.returns
            states.detach_().to(device)
            returns = returns.detach_().to(device)
            # Doing mini batches - Information already scrambled
            n = returns.shape[0]
            for i in range(0, n, 32):
                s = i + 32
                s = s if s < n else n
                states_b, returns_b = states[i:s], returns[i:s]
                baseline_b = baseline.forward(states_b).squeeze()
                optBaseline.zero_grad()
                lossBaseline = F.mse_loss(baseline_b, returns_b)
                lossBaseline.backward()
                optBaseline.step()

        # Update crawlers
        if RAY:
            sdPi = policy.getState(cpu=True, lst=True)
            ray.get([cwr.updatePi.remote(sdPi) for cwr in crawlers])
            if baseline is not None:
                sdB = baseline.getState(cpu=True, lst=True)
                ray.get([cwr.updateBasline.remote(sdB) for cwr in crawlers])
            ray.get([cwr.clearMem.remote() for cwr in crawlers])
        else:
            crawler.updatePi(policy.getState(lst=True))
            if baseline is not None:
                crawler.updateBasline(baseline.getState(lst=True))
            crawler.clearMem()

    return closeTent()
Example #25
                        self.replay_batch_size,
                        beta=self.prioritized_replay_beta)
                return MultiAgentBatch(samples, self.replay_batch_size)

    def update_priorities(self, prio_dict: Dict) -> None:
        with self.update_priorities_timer:
            for policy_id, (batch_indexes, td_errors) in prio_dict.items():
                new_priorities = (np.abs(td_errors) +
                                  self.prioritized_replay_eps)
                self.replay_buffers[policy_id].update_priorities(
                    batch_indexes, new_priorities)

    def stats(self, debug: bool = False) -> Dict:
        stat = {
            "add_batch_time_ms":
            round(1000 * self.add_batch_timer.mean, 3),
            "replay_time_ms":
            round(1000 * self.replay_timer.mean, 3),
            "update_priorities_time_ms":
            round(1000 * self.update_priorities_timer.mean, 3),
        }
        for policy_id, replay_buffer in self.replay_buffers.items():
            stat.update({
                "policy_{}".format(policy_id):
                replay_buffer.stats(debug=debug)
            })
        return stat


ReplayActor = ray.remote(num_cpus=0)(LocalReplayBuffer)
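A self-contained sketch of the idea behind the ReplayActor line above: passing num_cpus=0 to ray.remote lets the actor be scheduled without reserving a CPU slot. The Buffer class is illustrative, not the real LocalReplayBuffer.

import ray

class Buffer:
    def __init__(self):
        self.items = []

    def add(self, item):
        self.items.append(item)

    def size(self):
        return len(self.items)

ray.init()
BufferActor = ray.remote(num_cpus=0)(Buffer)
buf = BufferActor.remote()
buf.add.remote("sample")
print(ray.get(buf.size.remote()))  # 1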
Example #26
    analysis = tune.run(
        train_convnet,
        name="pbt_test",
        scheduler=scheduler,
        metric="mean_accuracy",
        mode="max",
        verbose=1,
        stop=stopper,
        export_formats=[ExportFormat.MODEL],
        checkpoint_score_attr="mean_accuracy",
        keep_checkpoints_num=4,
        num_samples=4,
        config={
            "lr": tune.uniform(0.001, 1),
            "momentum": tune.uniform(0.001, 1),
        },
    )
    # __tune_end__

    if args.server_address:
        # If using Ray Client, we want to make sure checkpoint access
        # happens on the server. So we wrap `test_best_model` in a Ray task.
        # We have to make sure it gets executed on the same node that
        # ``tune.run`` is called on.
        from ray.util.ml_utils.node import force_on_current_node

        remote_fn = force_on_current_node(ray.remote(test_best_model))
        ray.get(remote_fn.remote(analysis))
    else:
        test_best_model(analysis)
Example #27
    def compute_gradients(self, samples):
        """Returns critic, actor gradients."""
        return self.model.compute_gradients(samples)

    def apply_gradients(self, grads):
        """Applies gradients to evaluator weights."""
        self.model.apply_gradients(grads)

    def compute_apply(self, samples):
        grads, _ = self.compute_gradients(samples)
        self.apply_gradients(grads)

    def get_weights(self):
        """Returns model weights."""
        return self.model.get_weights()

    def set_weights(self, weights):
        """Sets model weights."""
        self.model.set_weights(weights)

    def get_completed_rollout_metrics(self):
        """Returns metrics on previously completed rollouts.

        Calling this clears the queue of completed rollout metrics.
        """
        return self.sampler.get_metrics()


RemoteDDPGEvaluator = ray.remote(DDPGEvaluator)
Example #28
    def __init__(self):
        self._job_info_client = JobInfoStorageClient()
        self._log_client = JobLogStorageClient()
        self._supervisor_actor_cls = ray.remote(JobSupervisor)

        self._recover_running_jobs()
Example #29
    def __init__(self, env_fn: Callable[[], gym.Env]) -> None:
        super().__init__(env_fn)
        self.env = ray.remote(gym.Wrapper).options(num_cpus=0).remote(env_fn())
Example #30
    def get(self, trainable_cls):
        """Gets the wrapped trainable_cls, otherwise calls ray.remote."""
        if trainable_cls not in self._cache:
            remote_cls = ray.remote(trainable_cls)
            self._cache[trainable_cls] = remote_cls
        return self._cache[trainable_cls]
Example #31
    def as_remote(cls, num_cpus=None, num_gpus=None, resources=None):
        return ray.remote(
            num_cpus=num_cpus, num_gpus=num_gpus, resources=resources)(cls)
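An illustrative, self-contained variant of the as_remote classmethod above, showing how a caller picks resources at wrapping time. The Worker class and its ping method are hypothetical.

import ray

class Worker:
    @classmethod
    def as_remote(cls, num_cpus=None, num_gpus=None, resources=None):
        return ray.remote(
            num_cpus=num_cpus, num_gpus=num_gpus, resources=resources)(cls)

    def ping(self):
        return "pong"

ray.init()
RemoteWorker = Worker.as_remote(num_cpus=1)
worker = RemoteWorker.remote()
print(ray.get(worker.ping.remote()))  # "pong"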
Example #32
    def save(self):
        torch.save(self.model.state_dict(), "mnist_cnn.pt")


net = Network()
net.train()
# __torch_net_end__
# yapf: enable

# yapf: disable
# __torch_ray_start__
import ray
ray.init()

RemoteNetwork = ray.remote(Network)
# Use the below instead of `ray.remote(network)` to leverage the GPU.
# RemoteNetwork = ray.remote(num_gpus=1)(Network)
# __torch_ray_end__
# yapf: enable

# yapf: disable
# __torch_actor_start__
NetworkActor = RemoteNetwork.remote()
NetworkActor2 = RemoteNetwork.remote()

ray.get([NetworkActor.train.remote(), NetworkActor2.train.remote()])
# __torch_actor_end__
# yapf: enable

# yapf: disable
Example #33
    def setup(self, config: PartialAlgorithmConfigDict):
        # Call super's setup to validate config, create RolloutWorkers
        # (train and eval), etc..
        num_gpus_saved = config["num_gpus"]
        config["num_gpus"] = min(config["num_gpus"], 1)
        super().setup(config)
        self.config["num_gpus"] = num_gpus_saved

        # - Create n policy learner actors (@ray.remote-converted Policies) on
        #   one or more GPU nodes.
        # - On each such node, also locate one replay buffer shard.

        ma_cfg = self.config["multiagent"]
        # By default, set max_num_policies_to_train to the number of policy IDs
        # provided in the multiagent config.
        if self.config["max_num_policies_to_train"] is None:
            self.config["max_num_policies_to_train"] = len(
                self.workers.local_worker().get_policies_to_train()
            )

        # Single CPU replay shard (co-located with GPUs so we can place the
        # policies on the same machine(s)).
        num_gpus = (
            0.01 if (self.config["num_gpus"] and not self.config["_fake_gpus"]) else 0
        )
        ReplayActor = ray.remote(
            num_cpus=1,
            num_gpus=num_gpus,
        )(MixInMultiAgentReplayBuffer)

        # Setup remote replay buffer shards and policy learner actors
        # (located on any GPU machine in the cluster):
        replay_actor_args = [
            self.config["replay_buffer_capacity"],
            self.config["replay_buffer_replay_ratio"],
        ]

        # Create a DistributedLearners utility object and set it up with
        # the initial first n learnable policies (found in the config).
        distributed_learners = DistributedLearners(
            config=self.config,
            max_num_policies_to_train=self.config["max_num_policies_to_train"],
            replay_actor_class=ReplayActor,
            replay_actor_args=replay_actor_args,
        )
        for pid, policy_spec in ma_cfg["policies"].items():
            if pid in self.workers.local_worker().get_policies_to_train():
                distributed_learners.add_policy(pid, policy_spec)

        # Store distributed_learners on all RolloutWorkers
        # so they know which replay shard to send samples to.

        def _set_policy_learners(worker):
            worker._distributed_learners = distributed_learners

        ray.get(
            [
                w.apply.remote(_set_policy_learners)
                for w in self.workers.remote_workers()
            ]
        )

        self.distributed_learners = distributed_learners
        self._sampling_actor_manager = AsyncRequestsManager(
            self.workers.remote_workers(),
            max_remote_requests_in_flight_per_worker=self.config[
                "max_requests_in_flight_per_sampler_worker"
            ],
            ray_wait_timeout_s=self.config["timeout_s_sampler_manager"],
        )
        policy_actors = [policy_actor for _, policy_actor, _ in distributed_learners]
        self._learner_worker_manager = AsyncRequestsManager(
            workers=policy_actors,
            max_remote_requests_in_flight_per_worker=self.config[
                "max_requests_in_flight_per_learner_worker"
            ],
            ray_wait_timeout_s=self.config["timeout_s_learner_manager"],
        )
Example #34
    def setup(self):
        self.square = ray.remote(resources={"foo": 1})(square)
Example #35
    def as_remote(cls, num_cpus=None, num_gpus=None):
        return ray.remote(num_cpus=num_cpus, num_gpus=num_gpus)(cls)
Example #36
def send_dir_to_head(local_dir: str, remote_dir: str):
    import ray

    ip = ray.get(ray.remote(_get_head_ip).remote())
    return send_dir_to_node(ip, local_dir, remote_dir)
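A minimal standalone sketch of the same pattern: wrap a plain function with ray.remote() inline and fetch its result. The _current_node_ip name is hypothetical; ray.util.get_node_ip_address is an existing Ray call (also used in Example #44 below).

import ray

def _current_node_ip():
    return ray.util.get_node_ip_address()

ray.init()
print(ray.get(ray.remote(_current_node_ip).remote()))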
Example #37
async def router(serve_instance):
    q = ray.remote(Router).remote(serve_instance._controller)
    yield q
    ray.kill(q)
Example #38
    def apply_gradients(self, grads):
        self.policy.apply_gradients(grads)

    def get_weights(self):
        return self.policy.get_weights()

    def set_weights(self, params):
        self.policy.set_weights(params)

    def save(self):
        weights = self.get_weights()
        return pickle.dumps({
            "weights": weights})

    def restore(self, objs):
        objs = pickle.loads(objs)
        self.set_weights(objs["weights"])

    def get_metrics(self):
        completed = []
        while True:
            try:
                completed.append(self.metrics_queue.get_nowait())
            except queue.Empty:
                break
        return completed


RemoteBCEvaluator = ray.remote(BCEvaluator)
GPURemoteBCEvaluator = ray.remote(num_gpus=1)(BCEvaluator)
Example #39
    def testRemoteTrainingStep(self):
        ray.init(num_workers=1)

        net = ray.remote(TrainActor).remote()
        ray.get(net.training_step.remote(net.get_weights.remote()))
Example #40
    def apply(self, fn: Any, remote_args: dict,
              blocks: Iterable[Block]) -> Iterable[ObjectRef[Block]]:

        map_bar = ProgressBar("Map Progress", total=len(blocks))

        class BlockWorker:
            def ready(self):
                return "ok"

            @ray.method(num_returns=2)
            def process_block(self, block: Block,
                              meta: BlockMetadata) -> (Block, BlockMetadata):
                new_block = fn(block)
                accessor = BlockAccessor.for_block(new_block)
                new_metadata = BlockMetadata(num_rows=accessor.num_rows(),
                                             size_bytes=accessor.size_bytes(),
                                             schema=accessor.schema(),
                                             input_files=meta.input_files)
                return new_block, new_metadata

        if not remote_args:
            remote_args["num_cpus"] = 1
        BlockWorker = ray.remote(**remote_args)(BlockWorker)

        self.workers = [BlockWorker.remote()]
        metadata_mapping = {}
        tasks = {w.ready.remote(): w for w in self.workers}
        ready_workers = set()
        blocks_in = [(b, m) for (b, m) in zip(blocks, blocks.get_metadata())]
        blocks_out = []

        while len(blocks_out) < len(blocks):
            ready, _ = ray.wait(list(tasks),
                                timeout=0.01,
                                num_returns=1,
                                fetch_local=False)
            if not ready:
                if len(ready_workers) / len(self.workers) > 0.8:
                    w = BlockWorker.remote()
                    self.workers.append(w)
                    tasks[w.ready.remote()] = w
                    map_bar.set_description(
                        "Map Progress ({} actors {} pending)".format(
                            len(ready_workers),
                            len(self.workers) - len(ready_workers)))
                continue

            [obj_id] = ready
            worker = tasks[obj_id]
            del tasks[obj_id]

            # Process task result.
            if worker in ready_workers:
                blocks_out.append(obj_id)
                map_bar.update(1)
            else:
                ready_workers.add(worker)

            # Schedule a new task.
            if blocks_in:
                block_ref, meta_ref = worker.process_block.remote(
                    *blocks_in.pop())
                metadata_mapping[block_ref] = meta_ref
                tasks[block_ref] = worker

        new_metadata = ray.get([metadata_mapping[b] for b in blocks_out])
        map_bar.close()
        return BlockList(blocks_out, new_metadata)
Example #41
def test_remote_training_step(ray_start_regular):
    net = ray.remote(TrainActor).remote()
    ray.get(net.training_step.remote(net.get_weights.remote()))
Example #42
    def setup(self):
        self.square = ray.remote(num_cpus=1)(square)
Example #43
def run_experiments(
        experiments: Union[Experiment, Mapping, Sequence[Union[Experiment,
                                                               Mapping]]],
        scheduler: Optional[TrialScheduler] = None,
        server_port: Optional[int] = None,
        verbose: Union[int, Verbosity] = Verbosity.V3_TRIAL_DETAILS,
        progress_reporter: Optional[ProgressReporter] = None,
        resume: bool = False,
        reuse_actors: bool = False,
        trial_executor: Optional[RayTrialExecutor] = None,
        raise_on_failed_trial: bool = True,
        concurrent: bool = True,
        # Deprecated args.
        queue_trials: Optional[bool] = None,
        callbacks: Optional[Sequence[Callback]] = None,
        _remote: Optional[bool] = None):
    """Runs and blocks until all trials finish.

    Examples:
        >>> experiment_spec = Experiment("experiment", my_func)
        >>> run_experiments(experiments=experiment_spec)

        >>> experiment_spec = {"experiment": {"run": my_func}}
        >>> run_experiments(experiments=experiment_spec)

    Returns:
        List of Trial objects, holding data for each executed trial.

    """
    # To be removed in 1.9.
    if queue_trials is not None:
        raise DeprecationWarning(
            "`queue_trials` has been deprecated and is replaced by "
            "the `TUNE_MAX_PENDING_TRIALS_PG` environment variable. "
            "Per default at least one Trial is queued at all times, "
            "so you likely don't need to change anything other than "
            "removing this argument from your call to `tune.run()`")

    if _remote is None:
        _remote = ray.util.client.ray.is_connected()

    if _remote is True and trial_executor:
        raise ValueError("cannot use custom trial executor")

    if not trial_executor or isinstance(trial_executor, RayTrialExecutor):
        _ray_auto_init()

    if _remote:
        remote_run = ray.remote(num_cpus=0)(run_experiments)

        # Make sure tune.run_experiments is run on the server node.
        remote_run = force_on_current_node(remote_run)

        return ray.get(
            remote_run.remote(experiments,
                              scheduler,
                              server_port,
                              verbose,
                              progress_reporter,
                              resume,
                              reuse_actors,
                              trial_executor,
                              raise_on_failed_trial,
                              concurrent,
                              callbacks,
                              _remote=False))

    # This is important to do here because it schematizes the
    # experiments and conducts the implicit registration.
    experiments = convert_to_experiment_list(experiments)

    if concurrent:
        return run(experiments,
                   server_port=server_port,
                   verbose=verbose,
                   progress_reporter=progress_reporter,
                   resume=resume,
                   reuse_actors=reuse_actors,
                   trial_executor=trial_executor,
                   raise_on_failed_trial=raise_on_failed_trial,
                   scheduler=scheduler,
                   callbacks=callbacks).trials
    else:
        trials = []
        for exp in experiments:
            trials += run(exp,
                          server_port=server_port,
                          verbose=verbose,
                          progress_reporter=progress_reporter,
                          resume=resume,
                          reuse_actors=reuse_actors,
                          trial_executor=trial_executor,
                          raise_on_failed_trial=raise_on_failed_trial,
                          scheduler=scheduler,
                          callbacks=callbacks).trials
        return trials
Example #44
            # This will usually run on the head node
            @ray.remote
            def _get_head_ip():
                return ray.util.get_node_ip_address()

            ip = ray.get(_get_head_ip.remote())

            remote_tune_script = "/tmp/_tune_script.py"

            print(
                f"Sending tune script to remote node {ip} ({remote_tune_script})"
            )
            send_local_file_to_remote_file(TUNE_SCRIPT, remote_tune_script, ip)
            print("Starting remote cloud test using Ray client")

            _run_test_remote = ray.remote(resources={f"node:{ip}": 0.01},
                                          num_cpus=0)(_run_test)
            result = ray.get(
                _run_test_remote.remote(
                    args.variant,
                    args.trainable,
                    run_time,
                    bucket,
                    args.cpus_per_trial,
                    remote_tune_script,
                ))
    except Exception as e:
        err = e
        result = {}

    if bucket:
        try:
Example #45
def run(
    run_or_experiment: Union[str, Callable, Type],
    name: Optional[str] = None,
    metric: Optional[str] = None,
    mode: Optional[str] = None,
    stop: Union[None, Mapping, Stopper, Callable[[str, Mapping], bool]] = None,
    time_budget_s: Union[None, int, float, datetime.timedelta] = None,
    config: Optional[Dict[str, Any]] = None,
    resources_per_trial: Union[None, Mapping[str, Union[float, int, Mapping]],
                               PlacementGroupFactory] = None,
    num_samples: int = 1,
    local_dir: Optional[str] = None,
    search_alg: Optional[Union[Searcher, SearchAlgorithm, str]] = None,
    scheduler: Optional[Union[TrialScheduler, str]] = None,
    keep_checkpoints_num: Optional[int] = None,
    checkpoint_score_attr: Optional[str] = None,
    checkpoint_freq: int = 0,
    checkpoint_at_end: bool = False,
    verbose: Union[int, Verbosity] = Verbosity.V3_TRIAL_DETAILS,
    progress_reporter: Optional[ProgressReporter] = None,
    log_to_file: bool = False,
    trial_name_creator: Optional[Callable[[Trial], str]] = None,
    trial_dirname_creator: Optional[Callable[[Trial], str]] = None,
    sync_config: Optional[SyncConfig] = None,
    export_formats: Optional[Sequence] = None,
    max_failures: int = 0,
    fail_fast: bool = False,
    restore: Optional[str] = None,
    server_port: Optional[int] = None,
    resume: bool = False,
    reuse_actors: bool = False,
    trial_executor: Optional[RayTrialExecutor] = None,
    raise_on_failed_trial: bool = True,
    callbacks: Optional[Sequence[Callback]] = None,
    max_concurrent_trials: Optional[int] = None,
    # Deprecated args
    queue_trials: Optional[bool] = None,
    loggers: Optional[Sequence[Type[Logger]]] = None,
    _remote: Optional[bool] = None,
) -> ExperimentAnalysis:
    """Executes training.

    When a SIGINT signal is received (e.g. through Ctrl+C), the tuning run
    will gracefully shut down and checkpoint the latest experiment state.
    Sending SIGINT again (or SIGKILL/SIGTERM instead) will skip this step.

    Many aspects of Tune, such as the frequency of global checkpointing,
    maximum pending placement group trials and the path of the result
    directory can be configured through environment variables. Refer to
    :ref:`tune-env-vars` for a list of environment variables available.

    Examples:

    .. code-block:: python

        # Run 10 trials (each trial is one instance of a Trainable). Tune runs
        # in parallel and automatically determines concurrency.
        tune.run(trainable, num_samples=10)

        # Run 1 trial, stop when trial has reached 10 iterations
        tune.run(my_trainable, stop={"training_iteration": 10})

        # automatically retry failed trials up to 3 times
        tune.run(my_trainable, stop={"training_iteration": 10}, max_failures=3)

        # Run 1 trial, search over hyperparameters, stop after 10 iterations.
        space = {"lr": tune.uniform(0, 1), "momentum": tune.uniform(0, 1)}
        tune.run(my_trainable, config=space, stop={"training_iteration": 10})

        # Resumes training if a previous machine crashed
        tune.run(my_trainable, config=space,
                 local_dir=<path/to/dir>, resume=True)

        # Rerun ONLY failed trials after an experiment is finished.
        tune.run(my_trainable, config=space,
                 local_dir=<path/to/dir>, resume="ERRORED_ONLY")

    Args:
        run_or_experiment (function | class | str | :class:`Experiment`): If
            function|class|str, this is the algorithm or model to train.
            This may refer to the name of a built-in algorithm
            (e.g. RLLib's DQN or PPO), a user-defined trainable
            function or class, or the string identifier of a
            trainable function or class registered in the tune registry.
            If Experiment, then Tune will execute training based on
            Experiment.spec. If you want to pass in a Python lambda, you
            will need to first register the function:
            ``tune.register_trainable("lambda_id", lambda x: ...)``. You can
            then use ``tune.run("lambda_id")``.
        metric (str): Metric to optimize. This metric should be reported
            with `tune.report()`. If set, will be passed to the search
            algorithm and scheduler.
        mode (str): Must be one of [min, max]. Determines whether objective is
            minimizing or maximizing the metric attribute. If set, will be
            passed to the search algorithm and scheduler.
        name (str): Name of experiment.
        stop (dict | callable | :class:`Stopper`): Stopping criteria. If dict,
            the keys may be any field in the return result of 'train()',
            whichever is reached first. If function, it must take (trial_id,
            result) as arguments and return a boolean (True if trial should be
            stopped, False otherwise). This can also be a subclass of
            ``ray.tune.Stopper``, which allows users to implement
            custom experiment-wide stopping (i.e., stopping an entire Tune
            run based on some time constraint).
        time_budget_s (int|float|datetime.timedelta): Global time budget in
            seconds after which all trials are stopped. Can also be a
            ``datetime.timedelta`` object.
        config (dict): Algorithm-specific configuration for Tune variant
            generation (e.g. env, hyperparams). Defaults to empty dict.
            Custom search algorithms may ignore this.
        resources_per_trial (dict|PlacementGroupFactory): Machine resources
            to allocate per trial, e.g. ``{"cpu": 64, "gpu": 8}``.
            Note that GPUs will not be assigned unless you specify them here.
            Defaults to 1 CPU and 0 GPUs in
            ``Trainable.default_resource_request()``. This can also
            be a PlacementGroupFactory object wrapping arguments to create a
            per-trial placement group.
        num_samples (int): Number of times to sample from the
            hyperparameter space. Defaults to 1. If `grid_search` is
            provided as an argument, the grid will be repeated
            `num_samples` times. If this is -1, (virtually) infinite
            samples are generated until a stopping condition is met.
        local_dir (str): Local dir to save training results to.
            Defaults to ``~/ray_results``.
        search_alg (Searcher|SearchAlgorithm|str): Search algorithm for
            optimization. You can also use the name of the algorithm.
        scheduler (TrialScheduler|str): Scheduler for executing
            the experiment. Choose among FIFO (default), MedianStopping,
            AsyncHyperBand, HyperBand and PopulationBasedTraining. Refer to
            ray.tune.schedulers for more options. You can also use the
            name of the scheduler.
        keep_checkpoints_num (int): Number of checkpoints to keep. A value of
            `None` keeps all checkpoints. Defaults to `None`. If set, need
            to provide `checkpoint_score_attr`.
        checkpoint_score_attr (str): Specifies by which attribute to rank the
            best checkpoint. Default is increasing order. If the attribute
            starts with `min-`, it will rank the attribute in decreasing
            order, i.e. `min-validation_loss`.
        checkpoint_freq (int): How many training iterations between
            checkpoints. A value of 0 (default) disables checkpointing.
            This has no effect when using the Functional Training API.
        checkpoint_at_end (bool): Whether to checkpoint at the end of the
            experiment regardless of the checkpoint_freq. Default is False.
            This has no effect when using the Functional Training API.
        verbose (Union[int, Verbosity]): 0, 1, 2, or 3. Verbosity mode.
            0 = silent, 1 = only status updates, 2 = status and brief trial
            results, 3 = status and detailed trial results. Defaults to 3.
        progress_reporter (ProgressReporter): Progress reporter for reporting
            intermediate experiment progress. Defaults to CLIReporter if
            running in command-line, or JupyterNotebookReporter if running in
            a Jupyter notebook.
        log_to_file (bool|str|Sequence): Log stdout and stderr to files in
            Tune's trial directories. If this is `False` (default), no files
            are written. If `True`, outputs are written to `trialdir/stdout`
            and `trialdir/stderr`, respectively. If this is a single string,
            this is interpreted as a file relative to the trialdir, to which
            both streams are written. If this is a Sequence (e.g. a Tuple),
            it has to have length 2 and the elements indicate the files to
            which stdout and stderr are written, respectively.
        trial_name_creator (Callable[[Trial], str]): Optional function
            for generating the trial string representation.
        trial_dirname_creator (Callable[[Trial], str]): Function
            for generating the trial dirname. This function should take
            in a Trial object and return a string representing the
            name of the directory. The return value cannot be a path.
        sync_config (SyncConfig): Configuration object for syncing. See
            tune.SyncConfig.
        export_formats (list): List of formats that are exported at the end of
            the experiment. Default is None.
        max_failures (int): Try to recover a trial at least this many times.
            Ray will recover from the latest checkpoint if present.
            Setting to -1 will lead to infinite recovery retries.
            Setting to 0 will disable retries. Defaults to 0.
        fail_fast (bool | str): Whether to fail upon the first error.
            If fail_fast='raise' provided, Tune will automatically
            raise the exception received by the Trainable. fail_fast='raise'
            can easily leak resources and should be used with caution (it
            is best used with `ray.init(local_mode=True)`).
        restore (str): Path to checkpoint. Only makes sense to set if
            running 1 trial. Defaults to None.
        server_port (int): Port number for launching TuneServer.
        resume (str|bool): One of "LOCAL", "REMOTE", "PROMPT", "ERRORED_ONLY",
            or bool. LOCAL/True restores the checkpoint from the
            local experiment directory, determined
            by ``name`` and ``local_dir``. REMOTE restores the checkpoint
            from ``upload_dir`` (as passed to ``sync_config``).
            PROMPT provides CLI feedback.
            False forces a new experiment. ERRORED_ONLY resets and reruns
            ERRORED trials upon resume - previous trial artifacts will
            be left untouched.  If resume is set but checkpoint does not exist,
            ValueError will be thrown.
        reuse_actors (bool): Whether to reuse actors between different trials
            when possible. This can drastically speed up experiments that start
            and stop actors often (e.g., PBT in time-multiplexing mode). This
            requires trials to have the same resource requirements.
        trial_executor (TrialExecutor): Manage the execution of trials.
        raise_on_failed_trial (bool): Raise TuneError if any trial has failed
            (is in the ERROR state) when the experiments complete.
        callbacks (list): List of callbacks that will be called at different
            times in the training loop. Must be instances of the
            ``ray.tune.callback.Callback`` class. If not passed,
            `LoggerCallback` and `SyncerCallback` callbacks are automatically
            added.
        max_concurrent_trials (int): Maximum number of trials to run
            concurrently. Must be non-negative. If None or 0, no limit will
            be applied. This is achieved by wrapping the ``search_alg`` in
            a :class:`ConcurrencyLimiter`, and thus setting this argument
            will raise an exception if the ``search_alg`` is already a
            :class:`ConcurrencyLimiter`. Defaults to None.
        _remote (bool): Whether to run the Tune driver in a remote function.
            This is disabled automatically if a custom trial executor is
            passed in. This is enabled by default in Ray client mode.

    Returns:
        ExperimentAnalysis: Object for experiment analysis.

    Raises:
        TuneError: Any trials failed and `raise_on_failed_trial` is True.
    """

    # To be removed in 1.9.
    if queue_trials is not None:
        raise DeprecationWarning(
            "`queue_trials` has been deprecated and is replaced by "
            "the `TUNE_MAX_PENDING_TRIALS_PG` environment variable. "
            "Per default at least one Trial is queued at all times, "
            "so you likely don't need to change anything other than "
            "removing this argument from your call to `tune.run()`")

    # NO CODE IS TO BE ADDED ABOVE THIS COMMENT
    # remote_run_kwargs must be defined before any other
    # code is run to ensure that at this point,
    # `locals()` is equal to args and kwargs
    remote_run_kwargs = locals().copy()
    remote_run_kwargs.pop("_remote")

    if _remote is None:
        _remote = ray.util.client.ray.is_connected()

    if _remote is True and trial_executor:
        raise ValueError("cannot use custom trial executor")

    if not trial_executor or isinstance(trial_executor, RayTrialExecutor):
        _ray_auto_init()

    if _remote:
        remote_run = ray.remote(num_cpus=0)(run)

        # Make sure tune.run is called on the server node.
        remote_run = force_on_current_node(remote_run)

        # JupyterNotebooks don't work with remote tune runs out of the box
        # (e.g. via Ray client) as they don't have access to the main
        # process stdout. So we introduce a queue here that accepts
        # callables, which will then be executed on the driver side.
        if isinstance(progress_reporter, JupyterNotebookReporter):
            execute_queue = Queue(actor_options={
                "num_cpus": 0,
                **force_on_current_node(None)
            })
            progress_reporter.set_output_queue(execute_queue)

            def get_next_queue_item():
                try:
                    return execute_queue.get(block=False)
                except Empty:
                    return None

        else:
            # If we don't need a queue, use this dummy get fn instead of
            # scheduling an unneeded actor
            def get_next_queue_item():
                return None

        def _handle_execute_queue():
            execute_item = get_next_queue_item()
            while execute_item:
                if isinstance(execute_item, Callable):
                    execute_item()

                execute_item = get_next_queue_item()

        remote_future = remote_run.remote(_remote=False, **remote_run_kwargs)

        # ray.wait(...)[1] returns futures that are not ready yet
        while ray.wait([remote_future], timeout=0.2)[1]:
            # Check if we have items to execute
            _handle_execute_queue()

        # Handle queue one last time
        _handle_execute_queue()

        return ray.get(remote_future)

    del remote_run_kwargs

    all_start = time.time()

    if loggers:
        # Raise DeprecationWarning in 1.9, remove in 1.10/1.11
        warnings.warn(
            "The `loggers` argument is deprecated. Please pass the respective "
            "`LoggerCallback` classes to the `callbacks` argument instead. "
            "See https://docs.ray.io/en/latest/tune/api_docs/logging.html")

    if mode and mode not in ["min", "max"]:
        raise ValueError(
            "The `mode` parameter passed to `tune.run()` has to be one of "
            "['min', 'max']")

    set_verbosity(verbose)

    config = config or {}
    sync_config = sync_config or SyncConfig()
    set_sync_periods(sync_config)

    if num_samples == -1:
        num_samples = sys.maxsize

    result_buffer_length = None

    # Create scheduler here as we need access to some of its properties
    if isinstance(scheduler, str):
        # importing at top level causes a recursive dependency
        from ray.tune.schedulers import create_scheduler
        scheduler = create_scheduler(scheduler)
    scheduler = scheduler or FIFOScheduler()

    if not scheduler.supports_buffered_results:
        # Result buffering with e.g. a Hyperband scheduler is a bad idea, as
        # hyperband tries to stop trials when processing brackets. With result
        # buffering, we might trigger this multiple times when evaluating
        # a single trial, which leads to unexpected behavior.
        env_result_buffer_length = os.getenv("TUNE_RESULT_BUFFER_LENGTH", "")
        if env_result_buffer_length:
            warnings.warn(
                f"You are using a {type(scheduler)} scheduler, but "
                f"TUNE_RESULT_BUFFER_LENGTH is set "
                f"({env_result_buffer_length}). This can lead to undesired "
                f"and faulty behavior, so the buffer length was forcibly set "
                f"to 1 instead.")
        result_buffer_length = 1

    trial_executor = trial_executor or RayTrialExecutor(
        reuse_actors=reuse_actors, result_buffer_length=result_buffer_length)
    if isinstance(run_or_experiment, list):
        experiments = run_or_experiment
    else:
        experiments = [run_or_experiment]

    for i, exp in enumerate(experiments):
        if not isinstance(exp, Experiment):
            experiments[i] = Experiment(
                name=name,
                run=exp,
                stop=stop,
                time_budget_s=time_budget_s,
                config=config,
                resources_per_trial=resources_per_trial,
                num_samples=num_samples,
                local_dir=local_dir,
                sync_config=sync_config,
                trial_name_creator=trial_name_creator,
                trial_dirname_creator=trial_dirname_creator,
                log_to_file=log_to_file,
                checkpoint_freq=checkpoint_freq,
                checkpoint_at_end=checkpoint_at_end,
                keep_checkpoints_num=keep_checkpoints_num,
                checkpoint_score_attr=checkpoint_score_attr,
                export_formats=export_formats,
                max_failures=max_failures,
                restore=restore)
    else:
        logger.debug("Ignoring some parameters passed into tune.run.")

    if fail_fast and max_failures != 0:
        raise ValueError("max_failures must be 0 if fail_fast=True.")

    if isinstance(search_alg, str):
        # importing at top level causes a recursive dependency
        from ray.tune.suggest import create_searcher
        search_alg = create_searcher(search_alg)

    # Detect whether local_mode=True was set during ray.init().
    is_local_mode = ray.worker._mode() == ray.worker.LOCAL_MODE

    if is_local_mode:
        max_concurrent_trials = 1

    if not search_alg:
        search_alg = BasicVariantGenerator(
            max_concurrent=max_concurrent_trials or 0)
    elif max_concurrent_trials:
        if isinstance(search_alg, ConcurrencyLimiter):
            if search_alg.max_concurrent != max_concurrent_trials:
                raise ValueError(
                    "You have specified `max_concurrent_trials="
                    f"{max_concurrent_trials}`, but the `search_alg` is "
                    "already a `ConcurrencyLimiter` with `max_concurrent="
                    f"{search_alg.max_concurrent}. FIX THIS by setting "
                    "`max_concurrent_trials=None`.")
            else:
                logger.warning(
                    "You have specified `max_concurrent_trials="
                    f"{max_concurrent_trials}`, but the `search_alg` is "
                    "already a `ConcurrencyLimiter`. `max_concurrent_trials` "
                    "will be ignored.")
        else:
            if max_concurrent_trials < 1:
                raise ValueError(
                    "`max_concurrent_trials` must be greater than or equal "
                    f"to 1, got {max_concurrent_trials}.")
            if isinstance(search_alg, Searcher):
                search_alg = ConcurrencyLimiter(
                    search_alg, max_concurrent=max_concurrent_trials)
            elif not is_local_mode:
                logger.warning(
                    "You have passed a `SearchGenerator` instance as the "
                    "`search_alg`, but `max_concurrent_trials` requires a "
                    "`Searcher` instance. `max_concurrent_trials` "
                    "will be ignored.")

    if isinstance(search_alg, Searcher):
        search_alg = SearchGenerator(search_alg)

    if config and not set_search_properties_backwards_compatible(
            search_alg.set_search_properties, metric, mode, config,
            **experiments[0].public_spec):
        if has_unresolved_values(config):
            raise ValueError(
                "You passed a `config` parameter to `tune.run()` with "
                "unresolved parameters, but the search algorithm was already "
                "instantiated with a search space. Make sure that `config` "
                "does not contain any more parameter definitions - include "
                "them in the search algorithm's search space if necessary.")

    if not scheduler.set_search_properties(metric, mode):
        raise ValueError(
            "You passed a `metric` or `mode` argument to `tune.run()`, but "
            "the scheduler you are using was already instantiated with their "
            "own `metric` and `mode` parameters. Either remove the arguments "
            "from your scheduler or from your call to `tune.run()`")

    # Create syncer callbacks
    callbacks = create_default_callbacks(callbacks,
                                         sync_config,
                                         metric=metric,
                                         loggers=loggers)

    runner = TrialRunner(
        search_alg=search_alg,
        scheduler=scheduler,
        local_checkpoint_dir=experiments[0].checkpoint_dir,
        remote_checkpoint_dir=experiments[0].remote_checkpoint_dir,
        sync_config=sync_config,
        stopper=experiments[0].stopper,
        resume=resume,
        server_port=server_port,
        fail_fast=fail_fast,
        trial_executor=trial_executor,
        callbacks=callbacks,
        metric=metric,
        # Driver should only sync trial checkpoints if
        # checkpoints are not synced to cloud
        driver_sync_trial_checkpoints=not bool(sync_config.upload_dir))

    if not runner.resumed:
        for exp in experiments:
            search_alg.add_configurations([exp])
    else:
        logger.info("TrialRunner resumed, ignoring newly added experiments "
                    "but updating trial resources.")
        if resources_per_trial:
            runner.update_pending_trial_resources(resources_per_trial)

    progress_reporter = progress_reporter or detect_reporter()

    if not progress_reporter.set_search_properties(metric, mode):
        raise ValueError(
            "You passed a `metric` or `mode` argument to `tune.run()`, but "
            "the reporter you are using was already instantiated with their "
            "own `metric` and `mode` parameters. Either remove the arguments "
            "from your reporter or from your call to `tune.run()`")
    progress_reporter.set_total_samples(search_alg.total_samples)

    # Calls setup on callbacks
    runner.setup_experiments(experiments=experiments,
                             total_num_samples=search_alg.total_samples)

    # User Warning for GPUs
    if trial_executor.has_gpus():
        if isinstance(resources_per_trial,
                      dict) and "gpu" in resources_per_trial:
            # "gpu" is manually set.
            pass
        elif _check_default_resources_override(experiments[0].run_identifier):
            # "default_resources" is manually overridden.
            pass
        else:
            logger.warning("Tune detects GPUs, but no trials are using GPUs. "
                           "To enable trials to use GPUs, set "
                           "tune.run(resources_per_trial={'gpu': 1}...) "
                           "which allows Tune to expose 1 GPU to each trial. "
                           "You can also override "
                           "`Trainable.default_resource_request` if using the "
                           "Trainable API.")

    original_handler = signal.getsignal(signal.SIGINT)
    state = {signal.SIGINT: False}

    def sigint_handler(sig, frame):
        logger.warning(
            "SIGINT received (e.g. via Ctrl+C), ending Ray Tune run. "
            "This will try to checkpoint the experiment state one last time. "
            "Press CTRL+C one more time (or send SIGINT/SIGKILL/SIGTERM) "
            "to skip. ")
        state[signal.SIGINT] = True
        # Restore original signal handler to react to future SIGINT signals
        signal.signal(signal.SIGINT, original_handler)

    if not int(os.getenv("TUNE_DISABLE_SIGINT_HANDLER", "0")):
        signal.signal(signal.SIGINT, sigint_handler)

    tune_start = time.time()
    progress_reporter.set_start_time(tune_start)
    while not runner.is_finished() and not state[signal.SIGINT]:
        runner.step()
        if has_verbosity(Verbosity.V1_EXPERIMENT):
            _report_progress(runner, progress_reporter)
    tune_taken = time.time() - tune_start

    try:
        runner.checkpoint(force=True)
    except Exception as e:
        logger.warning(f"Trial Runner checkpointing failed: {str(e)}")

    if has_verbosity(Verbosity.V1_EXPERIMENT):
        _report_progress(runner, progress_reporter, done=True)

    wait_for_sync()
    runner.cleanup()

    incomplete_trials = []
    for trial in runner.get_trials():
        if trial.status != Trial.TERMINATED:
            incomplete_trials += [trial]

    if incomplete_trials:
        if raise_on_failed_trial and not state[signal.SIGINT]:
            raise TuneError("Trials did not complete", incomplete_trials)
        else:
            logger.error("Trials did not complete: %s", incomplete_trials)

    all_taken = time.time() - all_start
    if has_verbosity(Verbosity.V1_EXPERIMENT):
        logger.info(f"Total run time: {all_taken:.2f} seconds "
                    f"({tune_taken:.2f} seconds for the tuning loop).")

    if state[signal.SIGINT]:
        logger.warning(
            "Experiment has been interrupted, but the most recent state was "
            "saved. You can continue running this experiment by passing "
            "`resume=True` to `tune.run()`")

    trials = runner.get_trials()
    return ExperimentAnalysis(runner.checkpoint_file,
                              trials=trials,
                              default_metric=metric,
                              default_mode=mode)
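
For reference, here is a minimal, hedged sketch of calling the API documented in the docstring above. It assumes the Ray Tune 1.x `tune.run`/`tune.report` interface and uses an illustrative trainable and metric name (`mean_loss`) that do not come from the snippet itself:

from ray import tune

def trainable(config):
    # Report one value per call; `training_iteration` is tracked automatically.
    tune.report(mean_loss=(config["lr"] - 0.1) ** 2)

analysis = tune.run(
    trainable,
    metric="mean_loss",
    mode="min",
    num_samples=4,
    config={"lr": tune.uniform(0.001, 1.0)},
    stop={"training_iteration": 1},
)
print(analysis.best_config)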
Exemple #46
0
async def test_task_runner_custom_method_batch(serve_instance):
    q = ray.remote(Router).remote()
    await q.setup.remote("")

    @serve.accept_batch
    class Batcher:
        def a(self, _):
            return ["a-{}".format(i) for i in range(serve.context.batch_size)]

        def b(self, _):
            return ["b-{}".format(i) for i in range(serve.context.batch_size)]

        def error_different_size(self, _):
            return [""] * (serve.context.batch_size * 2)

        def error_non_iterable(self, _):
            return 42

        def return_np_array(self, _):
            return np.array([1] * serve.context.batch_size).astype(np.int32)

    CONSUMER_NAME = "runner"
    PRODUCER_NAME = "producer"

    backend_config = BackendConfig(
        {
            "max_batch_size": 4,
            "batch_wait_timeout": 2
        }, accepts_batches=True)
    worker = setup_worker(
        CONSUMER_NAME, Batcher, backend_config=backend_config)

    await q.set_traffic.remote(PRODUCER_NAME,
                               TrafficPolicy({
                                   CONSUMER_NAME: 1.0
                               }))
    await q.set_backend_config.remote(CONSUMER_NAME, backend_config)

    def make_request_param(call_method):
        return RequestMetadata(
            PRODUCER_NAME, context.TaskContext.Python, call_method=call_method)

    a_query_param = make_request_param("a")
    b_query_param = make_request_param("b")

    futures = [q.enqueue_request.remote(a_query_param) for _ in range(2)]
    futures += [q.enqueue_request.remote(b_query_param) for _ in range(2)]

    await q.add_new_worker.remote(CONSUMER_NAME, "replica1", worker)

    gathered = await asyncio.gather(*futures)
    assert set(gathered) == {"a-0", "a-1", "b-0", "b-1"}

    with pytest.raises(RayServeException, match="doesn't preserve batch size"):
        different_size = make_request_param("error_different_size")
        await q.enqueue_request.remote(different_size)

    with pytest.raises(RayServeException, match="iterable"):
        non_iterable = make_request_param("error_non_iterable")
        await q.enqueue_request.remote(non_iterable)

    np_array = make_request_param("return_np_array")
    result_np_value = await q.enqueue_request.remote(np_array)
    assert isinstance(result_np_value, np.int32)
Exemple #47
0
def test_gpu_ids(shutdown_only):
    num_gpus = 10
    ray.init(num_cpus=10, num_gpus=num_gpus)

    def get_gpu_ids(num_gpus_per_worker):
        time.sleep(0.1)
        gpu_ids = ray.get_gpu_ids()
        assert len(gpu_ids) == num_gpus_per_worker
        assert (os.environ["CUDA_VISIBLE_DEVICES"] == ",".join(
            [str(i) for i in gpu_ids]))
        for gpu_id in gpu_ids:
            assert gpu_id in range(num_gpus)
        return gpu_ids

    f0 = ray.remote(num_gpus=0)(lambda: get_gpu_ids(0))
    f1 = ray.remote(num_gpus=1)(lambda: get_gpu_ids(1))
    f2 = ray.remote(num_gpus=2)(lambda: get_gpu_ids(2))
    f4 = ray.remote(num_gpus=4)(lambda: get_gpu_ids(4))
    f5 = ray.remote(num_gpus=5)(lambda: get_gpu_ids(5))

    # Wait for all workers to start up.
    @ray.remote
    def f():
        time.sleep(0.1)
        return os.getpid()

    start_time = time.time()
    while True:
        if len(set(ray.get([f.remote() for _ in range(10)]))) == 10:
            break
        if time.time() > start_time + 10:
            raise RayTestTimeoutException(
                "Timed out while waiting for workers to start "
                "up.")

    list_of_ids = ray.get([f0.remote() for _ in range(10)])
    assert list_of_ids == 10 * [[]]

    list_of_ids = ray.get([f1.remote() for _ in range(10)])
    set_of_ids = {tuple(gpu_ids) for gpu_ids in list_of_ids}
    assert set_of_ids == {(i, ) for i in range(10)}

    list_of_ids = ray.get([f2.remote(), f4.remote(), f4.remote()])
    all_ids = [gpu_id for gpu_ids in list_of_ids for gpu_id in gpu_ids]
    assert set(all_ids) == set(range(10))

    # There are only 10 GPUs, and each task uses 5 GPUs, so there should only
    # be 2 tasks scheduled at a given time.
    t1 = time.time()
    ray.get([f5.remote() for _ in range(20)])
    assert time.time() - t1 >= 10 * 0.1

    # Test that actors have CUDA_VISIBLE_DEVICES set properly.

    @ray.remote
    class Actor0(object):
        def __init__(self):
            gpu_ids = ray.get_gpu_ids()
            assert len(gpu_ids) == 0
            assert (os.environ["CUDA_VISIBLE_DEVICES"] == ",".join(
                [str(i) for i in gpu_ids]))
            # Set self.x to make sure that we got here.
            self.x = 1

        def test(self):
            gpu_ids = ray.get_gpu_ids()
            assert len(gpu_ids) == 0
            assert (os.environ["CUDA_VISIBLE_DEVICES"] == ",".join(
                [str(i) for i in gpu_ids]))
            return self.x

    @ray.remote(num_gpus=1)
    class Actor1(object):
        def __init__(self):
            gpu_ids = ray.get_gpu_ids()
            assert len(gpu_ids) == 1
            assert (os.environ["CUDA_VISIBLE_DEVICES"] == ",".join(
                [str(i) for i in gpu_ids]))
            # Set self.x to make sure that we got here.
            self.x = 1

        def test(self):
            gpu_ids = ray.get_gpu_ids()
            assert len(gpu_ids) == 1
            assert (os.environ["CUDA_VISIBLE_DEVICES"] == ",".join(
                [str(i) for i in gpu_ids]))
            return self.x

    a0 = Actor0.remote()
    ray.get(a0.test.remote())

    a1 = Actor1.remote()
    ray.get(a1.test.remote())
Exemple #48
0
    def sync_filters(self, new_filters):
        """Changes self's filter to given and rebases any accumulated delta.

        Args:
            new_filters (dict): Filters with new state to update local copy.
        """
        assert all(k in new_filters for k in self.filters)
        for k in self.filters:
            self.filters[k].sync(new_filters[k])

    def get_filters(self, flush_after=False):
        """Returns a snapshot of filters.

        Args:
            flush_after (bool): Clears the filter buffer state.

        Returns:
            return_filters (dict): Dict for serializable filters
        """
        return_filters = {}
        for k, f in self.filters.items():
            return_filters[k] = f.as_serializable()
            if flush_after:
                f.clear_buffer()
        return return_filters


RemoteA3CEvaluator = ray.remote(A3CEvaluator)
GPURemoteA3CEvaluator = ray.remote(num_gpus=1)(A3CEvaluator)
Exemple #49
0
                                  gamma=self.config["gamma"],
                                  use_gae=False)
        return samples

    def get_completed_rollout_metrics(self):
        """Returns metrics on previously completed rollouts.

        Calling this clears the queue of completed rollout metrics.
        """
        return self.sampler.get_metrics()

    def compute_gradients(self, samples):
        """Returns gradient w.r.t. samples."""
        gradient, info = self.policy.compute_gradients(samples)
        return gradient

    def apply_gradients(self, grads):
        """Applies gradients to evaluator weights."""
        self.policy.apply_gradients(grads)

    def get_weights(self):
        """Returns model weights."""
        return self.policy.get_weights()

    def set_weights(self, weights):
        """Sets model weights."""
        return self.policy.set_weights(weights)


RemotePGEvaluator = ray.remote(PGEvaluator)
Exemple #50
0
    def apply(self, fn: Any, remote_args: dict,
              blocks: BlockList) -> BlockList:
        context = DatasetContext.get_current()

        blocks_in = list(blocks.iter_blocks_with_metadata())
        orig_num_blocks = len(blocks_in)
        results = []
        map_bar = ProgressBar("Map Progress", total=orig_num_blocks)

        class BlockWorker:
            def ready(self):
                return "ok"

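            # Two mapping paths: map_block_split may emit several output blocks
            # per input block (dynamic block splitting), while map_block_nosplit
            # returns exactly one block plus its metadata as two object refs.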
            def map_block_split(self, block: Block,
                                input_files: List[str]) -> BlockPartition:
                return _map_block_split(block, fn, input_files)

            @ray.method(num_returns=2)
            def map_block_nosplit(
                    self, block: Block,
                    input_files: List[str]) -> Tuple[Block, BlockMetadata]:
                return _map_block_nosplit(block, fn, input_files)

        if not remote_args:
            remote_args["num_cpus"] = 1

        BlockWorker = ray.remote(**remote_args)(BlockWorker)

        self.workers = [BlockWorker.remote()]
        tasks = {w.ready.remote(): w for w in self.workers}
        metadata_mapping = {}
        ready_workers = set()

        while len(results) < orig_num_blocks:
            ready, _ = ray.wait(list(tasks),
                                timeout=0.01,
                                num_returns=1,
                                fetch_local=False)
            if not ready:
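                # Nothing finished within the short wait. If more than 80% of
                # the current actors have already reported ready, add another
                # worker to the pool (a simple grow-only autoscaling heuristic).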
                if len(ready_workers) / len(self.workers) > 0.8:
                    w = BlockWorker.remote()
                    self.workers.append(w)
                    tasks[w.ready.remote()] = w
                    map_bar.set_description(
                        "Map Progress ({} actors {} pending)".format(
                            len(ready_workers),
                            len(self.workers) - len(ready_workers)))
                continue

            [obj_id] = ready
            worker = tasks[obj_id]
            del tasks[obj_id]

            # Process task result.
            if worker in ready_workers:
                results.append(obj_id)
                map_bar.update(1)
            else:
                ready_workers.add(worker)

            # Schedule a new task.
            if blocks_in:
                block, meta = blocks_in.pop()
                if context.block_splitting_enabled:
                    ref = worker.map_block_split.remote(
                        block, meta.input_files)
                else:
                    ref, meta_ref = worker.map_block_nosplit.remote(
                        block, meta.input_files)
                    metadata_mapping[ref] = meta_ref
                tasks[ref] = worker

        map_bar.close()
        new_blocks, new_metadata = [], []
        if context.block_splitting_enabled:
            for result in ray.get(results):
                for block, metadata in result:
                    new_blocks.append(block)
                    new_metadata.append(metadata)
        else:
            for block in results:
                new_blocks.append(block)
                new_metadata.append(metadata_mapping[block])
        return BlockList(new_blocks, new_metadata)
Exemple #51
0
    def repartition(self, num_partitions: int,
                    batch_ms: int = 0) -> "ParallelIterator[T]":
        """Returns a new ParallelIterator instance with num_partitions shards.

        The new iterator contains the same data in this instance except with
        num_partitions shards. The data is split in round-robin fashion for
        the new ParallelIterator.

        Args:
            num_partitions (int): The number of shards to use for the new
                ParallelIterator
            batch_ms (int): Batches items for batch_ms milliseconds
                on each shard before retrieving it.
                Increasing batch_ms increases latency but improves throughput.

        Returns:
            A ParallelIterator with num_partitions number of shards and the
            data of this ParallelIterator split round-robin among the new
            number of shards.

        Examples:
            >>> it = from_range(8, 2)
            >>> it = it.repartition(3)
            >>> list(it.get_shard(0))
            [0, 4, 3, 7]
            >>> list(it.get_shard(1))
            [1, 5]
            >>> list(it.get_shard(2))
            [2, 6]
        """

        # initialize the local iterators for all the actors
        all_actors = []
        for actor_set in self.actor_sets:
            actor_set.init_actors()
            all_actors.extend(actor_set.actors)

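        # Each output shard is backed by a generator that asks every source
        # actor for its slice of the data (every `num_partitions`-th item
        # starting at `partition_index`), optionally batching results for
        # `batch_ms` milliseconds before returning them.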
        def base_iterator(num_partitions, partition_index, timeout=None):
            futures = {}
            for a in all_actors:
                futures[a.par_iter_slice_batch.remote(
                    step=num_partitions,
                    start=partition_index,
                    batch_ms=batch_ms)] = a
            while futures:
                pending = list(futures)
                if timeout is None:
                    # First try to do a batch wait for efficiency.
                    ready, _ = ray.wait(
                        pending, num_returns=len(pending), timeout=0)
                    # Fall back to a blocking wait.
                    if not ready:
                        ready, _ = ray.wait(pending, num_returns=1)
                else:
                    ready, _ = ray.wait(
                        pending, num_returns=len(pending), timeout=timeout)
                for obj_ref in ready:
                    actor = futures.pop(obj_ref)
                    try:
                        batch = ray.get(obj_ref)
                        futures[actor.par_iter_slice_batch.remote(
                            step=num_partitions,
                            start=partition_index,
                            batch_ms=batch_ms)] = actor
                        for item in batch:
                            yield item
                    except StopIteration:
                        pass
                # Always yield after each round of wait with timeout.
                if timeout is not None:
                    yield _NextValueNotReady()

        def make_gen_i(i):
            return lambda: base_iterator(num_partitions, i)

        name = self.name + f".repartition[num_partitions={num_partitions}]"

        generators = [make_gen_i(s) for s in range(num_partitions)]
        worker_cls = ray.remote(ParallelIteratorWorker)
        actors = [worker_cls.remote(g, repeat=False) for g in generators]
        # need explicit reference to self so actors in this instance do not die
        return ParallelIterator(
            [_ActorSet(actors, [])], name, parent_iterators=[self])
Exemple #52
0
def test_options():
    """General test of option keywords in Ray."""
    import re
    from ray._private import ray_option_utils

    def f():
        return 1

    class A:
        x = 1

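    # Build the option sets from the library's declared defaults so the test
    # keeps working as options evolve; the options that `.options()` rejects
    # (max_calls for tasks, concurrency_groups for actors) are popped and
    # asserted on separately below.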
    task_defaults = {
        k: v.default_value
        for k, v in ray_option_utils.task_options.items()
    }
    task_defaults_for_options = task_defaults.copy()
    task_defaults_for_options.pop("max_calls")
    ray.remote(f).options(**task_defaults_for_options)
    ray.remote(**task_defaults)(f).options(**task_defaults_for_options)
    with pytest.raises(
            ValueError,
            match=re.escape(
                "Setting 'max_calls' is not supported in '.options()'."),
    ):
        ray.remote(f).options(max_calls=1)

    actor_defaults = {
        k: v.default_value
        for k, v in ray_option_utils.actor_options.items()
    }
    actor_defaults_for_options = actor_defaults.copy()
    actor_defaults_for_options.pop("concurrency_groups")
    ray.remote(A).options(**actor_defaults_for_options)
    ray.remote(**actor_defaults)(A).options(**actor_defaults_for_options)
    with pytest.raises(
            ValueError,
            match=re.escape(
                "Setting 'concurrency_groups' is not supported in '.options()'."
            ),
    ):
        ray.remote(A).options(concurrency_groups=[])

    unique_object = type("###", (), {})()
    for k, v in ray_option_utils.task_options.items():
        v.validate(k, v.default_value)
        with pytest.raises(TypeError):
            v.validate(k, unique_object)

    for k, v in ray_option_utils.actor_options.items():
        v.validate(k, v.default_value)
        with pytest.raises(TypeError):
            v.validate(k, unique_object)

    # test updating each namespace of "_metadata" independently
    assert ray_option_utils.update_options(
        {
            "_metadata": {
                "ns1": {
                    "a1": 1,
                    "b1": 2,
                    "c1": 3
                },
                "ns2": {
                    "a2": 1
                }
            },
            "num_cpus": 1,
            "xxx": {
                "x": 2
            },
            "zzz": 42,
        },
        {
            "_metadata": {
                "ns1": {
                    "b1": 22
                },
                "ns3": {
                    "b3": 2
                }
            },
            "num_cpus": 2,
            "xxx": {
                "y": 2
            },
            "yyy": 3,
        },
    ) == {
        "_metadata": {
            "ns1": {
                "a1": 1,
                "b1": 22,
                "c1": 3
            },
            "ns2": {
                "a2": 1
            },
            "ns3": {
                "b3": 2
            },
        },
        "num_cpus": 2,
        "xxx": {
            "y": 2
        },
        "yyy": 3,
        "zzz": 42,
    }

    # test options for other Ray libraries.
    namespace = "namespace"

    class mock_options:
        def __init__(self, **options):
            self.options = {"_metadata": {namespace: options}}

        def keys(self):
            return ("_metadata", )

        def __getitem__(self, key):
            return self.options[key]

        def __call__(self, f):
            f._default_options.update(self.options)
            return f

    @mock_options(a=1, b=2)
    @ray.remote(num_gpus=2)
    def foo():
        pass

    assert foo._default_options == {
        "_metadata": {
            "namespace": {
                "a": 1,
                "b": 2
            }
        },
        "num_gpus": 2,
    }

    f2 = foo.options(num_cpus=1, num_gpus=1, **mock_options(a=11, c=3))

    # TODO(suquark): The current implementation of `.options()` is so bad that we
    # cannot even access its options from outside. Here we hack the closures to
    # achieve our goal. Further efforts are needed to clean up the tech debt.
    assert f2.remote.__closure__[1].cell_contents == {
        "_metadata": {
            "namespace": {
                "a": 11,
                "b": 2,
                "c": 3
            }
        },
        "num_cpus": 1,
        "num_gpus": 1,
    }

    class mock_options2(mock_options):
        def __init__(self, **options):
            self.options = {"_metadata": {namespace + "2": options}}

    f3 = foo.options(num_cpus=1, num_gpus=1, **mock_options2(a=11, c=3))

    assert f3.remote.__closure__[1].cell_contents == {
        "_metadata": {
            "namespace": {
                "a": 1,
                "b": 2
            },
            "namespace2": {
                "a": 11,
                "c": 3
            }
        },
        "num_cpus": 1,
        "num_gpus": 1,
    }

    with pytest.raises(TypeError):
        # Ensure only a single "**option" per ".options()".
        # Otherwise it would be confusing.
        foo.options(
            num_cpus=1,
            num_gpus=1,
            **mock_options(a=11, c=3),
            **mock_options2(a=11, c=3),
        )
Exemple #53
0
    def _build_model(self):
        # For clarity, record whether a bias term is used.
        use_bias = self._config.use_bias
        
        self.observ = tf.placeholder(tf.float32, (None, 4), name='observ')
        self.target = tf.placeholder(tf.float32, name='target')
        x = tf.layers.dense(self.observ, 1, use_bias=use_bias,
                            kernel_initializer=tf.zeros_initializer)
        self.logits = x
    
    def _set_loss(self):
        losses = tf.losses.mean_squared_error(labels=self.target,
                                              predictions=self.logits)
        self.loss = tf.reduce_mean(losses)
    
    def predict(self, observ):
        baseline = self._sess.run([self.logits],
                                  feed_dict={self.observ: observ})
        return baseline
    
    def apply(self, observ, target):
        _, loss = self._sess.run([self.train_op, self.loss],
                                 feed_dict={self.observ: observ, self.target: target})
        return loss
    
    def get_weights(self):
        return self.variables.get_weights()

# Remote actor version of Policy, so it can be called in a distributed setting.
RemotePolicy = ray.remote(Policy)
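
A hedged sketch of how such a remote policy handle might be driven (it assumes Policy() can be constructed without arguments, which may not hold for the real class):

remote_policy = RemotePolicy.remote()
weights = ray.get(remote_policy.get_weights.remote())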
Exemple #55
0
    def __init__(self):
        self._status_client = JobStatusStorageClient()
        self._log_client = JobLogStorageClient()
        self._supervisor_actor_cls = ray.remote(JobSupervisor)
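
Elsewhere, a handle like this would typically be launched once per job with actor options; a speculative sketch (the constructor argument and the option values are assumptions, not taken from the snippet):

supervisor = self._supervisor_actor_cls.options(
    lifetime="detached",
    name=f"job_supervisor_{job_id}",
    num_cpus=0,
).remote(job_id)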
Exemple #56
0
def main(args=None, model=None) -> GenerativeQAModule:
    parser = argparse.ArgumentParser()
    parser = pl.Trainer.add_argparse_args(parser)
    parser = GenerativeQAModule.add_model_specific_args(parser, os.getcwd())
    parser = GenerativeQAModule.add_retriever_specific_args(parser)

    args = args or parser.parse_args()

    Path(args.output_dir).mkdir(exist_ok=True)

    named_actors = []
    if args.distributed_retriever == "ray" and args.gpus > 1:
        if not is_ray_available():
            raise RuntimeError("Please install Ray to use the Ray "
                               "distributed retriever.")
        # Connect to an existing Ray cluster.
        try:
            ray.init(address=args.ray_address)
        except (ConnectionError, ValueError):
            logger.warning(
                "Connection to Ray cluster failed. Make sure a Ray "
                "cluster is running by either using Ray's cluster "
                "launcher (`ray up`) or by manually starting Ray on "
                "each node via `ray start --head` for the head node "
                "and `ray start --address='<ip address>:6379'` for "
                "additional nodes. See "
                "https://docs.ray.io/en/master/cluster/index.html "
                "for more info.")
            raise

        # Create Ray actors only for rank 0.
        if ("LOCAL_RANK" not in os.environ or int(os.environ["LOCAL_RANK"])
                == 0) and ("NODE_RANK" not in os.environ
                           or int(os.environ["NODE_RANK"]) == 0):
            remote_cls = ray.remote(RayRetriever)
            named_actors = [
                remote_cls.options(
                    name="retrieval_worker_{}".format(i)).remote()
                for i in range(args.num_retrieval_workers)
            ]
        else:
            logger.info(
                "Getting named actors for NODE_RANK {}, LOCAL_RANK {}".format(
                    os.environ["NODE_RANK"], os.environ["LOCAL_RANK"]))
            named_actors = [
                ray.get_actor("retrieval_worker_{}".format(i))
                for i in range(args.num_retrieval_workers)
            ]
    args.actor_handles = named_actors
    assert args.actor_handles == named_actors

    if model is None:
        model: GenerativeQAModule = GenerativeQAModule(args)

    dataset = Path(args.data_dir).name
    if (args.logger_name == "default" or args.fast_dev_run
            or str(args.output_dir).startswith("/tmp")
            or str(args.output_dir).startswith("/var")):
        training_logger = True  # don't pollute wandb logs unnecessarily
    elif args.logger_name == "wandb":
        from pytorch_lightning.loggers import WandbLogger

        project = os.environ.get("WANDB_PROJECT", dataset)
        training_logger = WandbLogger(name=model.output_dir.name,
                                      project=project)

    elif args.logger_name == "wandb_shared":
        from pytorch_lightning.loggers import WandbLogger

        training_logger = WandbLogger(name=model.output_dir.name,
                                      project=f"hf_{dataset}")

    es_callback = (get_early_stopping_callback(model.val_metric,
                                               args.early_stopping_patience)
                   if args.early_stopping_patience >= 0 else False)

    trainer: pl.Trainer = generic_train(
        model,
        args,
        logging_callback=Seq2SeqLoggingCallback(),
        checkpoint_callback=get_checkpoint_callback(args.output_dir,
                                                    model.val_metric),
        early_stopping_callback=es_callback,
        logger=training_logger,
        custom_ddp_plugin=CustomDDP() if args.gpus > 1 else None,
        profiler=pl.profiler.AdvancedProfiler() if args.profile else None,
    )
    pickle_save(model.hparams, model.output_dir / "hparams.pkl")

    if not args.do_predict:
        return model

    # test() without a model tests using the best checkpoint automatically
    trainer.test()
    return model
Exemple #57
0
    def setup(self):
        self.square = ray.remote(square)
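
A speculative usage sketch (assuming `square` is a plain function defined elsewhere that returns its argument squared; the method name run_once is illustrative):

    def run_once(self):
        # Launch the wrapped function as a Ray task and block on the result.
        ref = self.square.remote(4)
        return ray.get(ref)  # == 16 under the assumption above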