Example #1
    def start_workers(self, num_workers):
        logger.debug(f"start_workers: Setting %d workers." % num_workers)

        if num_workers == 1:
            self.local_worker = TorchRunner(**self._params)
            if self._initialization_hook:
                self.apply_all_workers(self._initialization_hook)
            self.local_worker.setup_operator()
        else:
            # Start local worker
            self.local_worker = LocalDistributedRunner(
                num_cpus=self._num_cpus_per_worker,
                num_gpus=int(self._use_gpu),
                **{**self._params, **self._dist_params})
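            # Start the remote workers, which will take ranks 1..num_workers-1.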
            self.remote_worker_group._init_dist_workers(num_workers - 1)
            if self._initialization_hook:
                self.apply_all_workers(self._initialization_hook)

            # Compute URL for initializing distributed PyTorch.
            address = setup_address()

            remote_pgs = self.remote_worker_group._setup_process_group(
                address=address, world_size=num_workers, starting_rank=1)
            # Use the local worker as rank 0. This will help with debugging.
            self.local_worker.setup_process_group(
                url=address,
                world_rank=0,
                world_size=num_workers,
                timeout=timedelta(seconds=self._timeout_s))
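            # Get setup tasks in order to throw errors on failure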
            ray.get(remote_pgs)

            remote_operators = self.remote_worker_group._setup_operator()
            self.local_worker.setup_operator()
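            # Get setup tasks in order to throw errors on failure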
            ray.get(remote_operators)
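
Examples #1 and #3 call a setup_address() helper that is not defined in this snippet. Below is a minimal sketch of what it plausibly does, assuming it follows the inline pattern from Examples #2 and #4 (this node's IP plus a free port, joined into a tcp:// URL); the socket-based port probe is an assumption:

import socket

import ray


def setup_address():
    # Hypothetical reconstruction of setup_address(): mirror the inline
    # pattern in Examples #2 and #4 by combining this node's IP with a
    # free TCP port into a torch.distributed init URL.
    ip = ray.services.get_node_ip_address()
    # Bind to port 0 so the OS picks an unused ephemeral port.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        s.bind(("", 0))
        port = s.getsockname()[1]
    return f"tcp://{ip}:{port}"
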
Example #2
    def _start_workers(self, num_workers):
        logger.debug(f"start_workers: Setting %d workers." % num_workers)
        worker_config = self.config.copy()
        batch_size_per_worker = self._configure_and_split_batch(num_workers)
        if batch_size_per_worker:
            worker_config[BATCH_SIZE] = batch_size_per_worker

        if num_workers == 1:
            # Start local worker
            self.local_worker = TorchRunner(
                model_creator=self.model_creator,
                data_creator=self.data_creator,
                optimizer_creator=self.optimizer_creator,
                loss_creator=self.loss_creator,
                scheduler_creator=self.scheduler_creator,
                training_operator_cls=self.training_operator_cls,
                config=worker_config,
                use_fp16=self.use_fp16,
                use_tqdm=self.use_tqdm,
                apex_args=self.apex_args,
                scheduler_step_freq=self.scheduler_step_freq)

            if self.initialization_hook:
                self.apply_all_workers(self.initialization_hook)

            self.local_worker.setup()
        else:
            params = dict(model_creator=self.model_creator,
                          data_creator=self.data_creator,
                          optimizer_creator=self.optimizer_creator,
                          loss_creator=self.loss_creator,
                          scheduler_creator=self.scheduler_creator,
                          backend=self.backend,
                          training_operator_cls=self.training_operator_cls,
                          config=worker_config,
                          use_fp16=self.use_fp16,
                          use_tqdm=self.use_tqdm,
                          apex_args=self.apex_args,
                          scheduler_step_freq=self.scheduler_step_freq)

            # Start local worker
            self.local_worker = LocalDistributedRunner(
                num_cpus=1, num_gpus=int(self.use_gpu), **params)

            # Generate actor class
            RemoteRunner = ray.remote(
                num_cpus=1, num_gpus=int(self.use_gpu))(DistributedTorchRunner)
            # Start workers
            self.remote_workers = [
                RemoteRunner.remote(**params) for _ in range(num_workers - 1)
            ]
            if self.initialization_hook:
                self.apply_all_workers(self.initialization_hook)

            # Compute URL for initializing distributed PyTorch
            ip = ray.services.get_node_ip_address()
            port = self.local_worker.find_free_port()

            address = "tcp://{ip}:{port}".format(ip=ip, port=port)

            remote_setups = [
                worker.setup.remote(address, i + 1, num_workers)
                for i, worker in enumerate(self.remote_workers)
            ]
            self.local_worker.setup(address, 0, num_workers)
            # Get setup tasks in order to throw errors on failure
            ray.get(remote_setups)
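
The RemoteRunner line above uses ray.remote as a plain function to turn an existing class (DistributedTorchRunner) into an actor class whose instances each reserve the given CPUs and GPUs. A minimal, self-contained illustration of the same pattern; Counter is a hypothetical stand-in:

import ray

ray.init()


class Counter:
    def __init__(self):
        self.n = 0

    def increment(self):
        self.n += 1
        return self.n


# Same pattern as RemoteRunner above: wrap an existing class into an
# actor class, attaching resource requirements to every instance.
RemoteCounter = ray.remote(num_cpus=1)(Counter)

counters = [RemoteCounter.remote() for _ in range(3)]
print(ray.get([c.increment.remote() for c in counters]))  # -> [1, 1, 1]
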
Example #3
    def _start_workers(self, num_workers):
        logger.debug(f"start_workers: Setting %d workers." % num_workers)
        worker_config = self.config.copy()
        batch_size_per_worker = self._configure_and_split_batch(num_workers)
        if batch_size_per_worker:
            worker_config[BATCH_SIZE] = batch_size_per_worker

        params = dict(
            training_operator_cls=self.training_operator_cls,
            config=worker_config,
            serialize_data_creation=self.serialize_data_creation,
            use_fp16=self.use_fp16,
            use_gpu=self.use_gpu,
            use_tqdm=self.use_tqdm,
            apex_args=self.apex_args,
            scheduler_step_freq=self.scheduler_step_freq)

        if num_workers == 1:
            # Start local worker
            self.local_worker = TorchRunner(**params)
            if self.initialization_hook:
                self.apply_all_workers(self.initialization_hook)
            self.local_worker.setup_operator()
        else:
            params.update(
                backend=self.backend,
                add_dist_sampler=self.add_dist_sampler,
                wrap_ddp=self.wrap_ddp)

            # Start local worker
            self.local_worker = LocalDistributedRunner(
                num_cpus=self.num_cpus_per_worker,
                num_gpus=int(self.use_gpu),
                **params)

            # Generate actor class
            RemoteRunner = ray.remote(
                num_cpus=self.num_cpus_per_worker,
                num_gpus=int(self.use_gpu))(DistributedTorchRunner)
            # Start workers
            self.remote_workers = [
                RemoteRunner.remote(**params) for _ in range(num_workers - 1)
            ]
            if self.initialization_hook:
                self.apply_all_workers(self.initialization_hook)

            # Compute URL for initializing distributed PyTorch
            address = setup_address()

            # Setup the process group among all workers.
            remote_pgroup_setups = [
                worker.setup_process_group.remote(
                    address, i + 1, num_workers,
                    timedelta(seconds=self.timeout_s))
                for i, worker in enumerate(self.remote_workers)
            ]
            self.local_worker.setup_process_group(
                address, 0, num_workers, timedelta(seconds=self.timeout_s))
            # Get setup tasks in order to throw errors on failure
            ray.get(remote_pgroup_setups)

            # Runs code that requires all creator functions to have run.
            remote_operator_setups = [
                worker.setup_operator.remote()
                for worker in self.remote_workers
            ]
            self.local_worker.setup_operator()
            # Get setup tasks in order to throw errors on failure
            ray.get(remote_operator_setups)
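
setup_process_group(address, rank, world_size, timeout) is not defined in this snippet; presumably it wraps torch.distributed.init_process_group. A rough sketch under that assumption (the function body and the backend default are guesses):

import torch.distributed as dist


def setup_process_group(url, world_rank, world_size, timeout, backend="gloo"):
    # Hypothetical sketch: join the PyTorch process group. The rank-0
    # (local) worker listens at `url`; every rank blocks here until all
    # `world_size` processes have joined or `timeout` (a datetime.timedelta,
    # as in the examples above) expires.
    dist.init_process_group(
        backend=backend,   # "gloo" for CPU, "nccl" for GPU training
        init_method=url,   # the tcp://ip:port address computed above
        rank=world_rank,
        world_size=world_size,
        timeout=timeout)
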
Example #4
    def _start_workers(self, num_workers):
        logger.debug(f"start_workers: Setting {num_workers} workers.")
        worker_config = self.config.copy()
        batch_size_per_worker = self._configure_and_split_batch(num_workers)
        if batch_size_per_worker:
            worker_config[BATCH_SIZE] = batch_size_per_worker

        params = dict(
            model_creator=self.model_creator,
            data_creator=self.data_creator,
            optimizer_creator=self.optimizer_creator,
            loss_creator=self.loss_creator,
            scheduler_creator=self.scheduler_creator,
            training_operator_cls=self.training_operator_cls,
            config=worker_config,
            use_fp16=self.use_fp16,
            use_gpu=True,
            use_tqdm=self.use_tqdm,
            apex_args=self.apex_args,
            scheduler_step_freq=self.scheduler_step_freq,
        )

        if num_workers == 1:
            # Start local worker
            self.local_worker = TorchRunner(**params)
            self.apply_all_workers(_set_device_from_fluid_res)
            if self.initialization_hook:
                self.apply_all_workers(self.initialization_hook)
            self.local_worker.setup()
        else:
            params.update(
                backend=self.backend,
                add_dist_sampler=self.add_dist_sampler,
                wrap_ddp=self.wrap_ddp,
            )

            # Start local worker
            self.local_worker = LocalDistributedRunner(**params)

            # Start remote workers
            # assert num_workers == len(self.extra_assigned_worker_res) + 1
            self.remote_workers = []
            for res_name, res_val in self.extra_assigned_worker_res:
                # Generate actor class
                RemoteRunner = ray.remote(
                    num_cpus=1,
                    num_gpus=res_val,
                    resources={res_name: res_val})(DistributedTorchRunner)
                self.remote_workers.append(RemoteRunner.remote(**params))

            self.apply_all_workers(_set_device_from_fluid_res)
            if self.initialization_hook:
                self.apply_all_workers(self.initialization_hook)

            # Compute URL for initializing distributed PyTorch
            ip = ray.services.get_node_ip_address()
            port = self.local_worker.find_free_port()

            address = "tcp://{ip}:{port}".format(ip=ip, port=port)

            # Runs the creator functions.
            remote_component_setup = [
                worker.setup_components.remote()
                for worker in self.remote_workers
            ]
            self.local_worker.setup_components()
            # Get setup tasks in order to throw errors on failure
            ray.get(remote_component_setup)

            # Setup the process group among all workers.
            remote_pgroup_setups = [
                worker.setup_process_group.remote(address, i + 1, num_workers)
                for i, worker in enumerate(self.remote_workers)
            ]
            self.local_worker.setup_process_group(address, 0, num_workers)
            # Get setup tasks in order to throw errors on failure
            ray.get(remote_pgroup_setups)

            # Runs code that requires all creator functions to have run.
            remote_operator_setups = [
                worker.setup_ddp_and_operator.remote()
                for worker in self.remote_workers
            ]
            self.local_worker.setup_ddp_and_operator()
            # Get setup tasks in order to throw errors on failure
            ray.get(remote_operator_setups)
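
A recurring pattern in all four examples is kicking off a setup step on every remote worker, running the same step locally, and then calling ray.get on the collected futures purely to surface any remote failure. A minimal standalone illustration (the failing setup task here is hypothetical):

import ray

ray.init()


@ray.remote
def setup(rank):
    if rank == 2:
        raise RuntimeError(f"worker {rank} failed to set up")
    return rank


futures = [setup.remote(i) for i in range(4)]
try:
    # Without this ray.get, the failure on rank 2 would pass silently.
    ray.get(futures)
except ray.exceptions.RayTaskError as err:
    print("setup failed:", err)
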