Beispiel #1
0
    def machine_params(cls,
                       mode="train",
                       gpu_id="default",
                       n_train_processes="default",
                       **kwargs):
        """Build the `MachineParams` for the requested experiment mode.

        Args:
            mode: One of "train", "valid", or "test".
            gpu_id: Entry placed in the device list; "default" falls back to
                `cls.GPU_ID` (or to an empty device list when `cls.GPU_ID`
                is `None`).
            n_train_processes: Process count used in "train" mode; "default"
                falls back to `cls.NUM_TRAIN_SAMPLERS`.
            **kwargs: Accepted but unused.

        Returns:
            A `MachineParams` built from the chosen process count and
            device list.

        Raises:
            NotImplementedError: If `mode` is not one of the supported modes.
        """
        if mode == "train":
            nprocesses = (cls.NUM_TRAIN_SAMPLERS
                          if n_train_processes == "default" else
                          n_train_processes)
        elif mode == "valid":
            # No processes are requested for validation.
            nprocesses = 0
        elif mode == "test":
            # The cap on test processes depends on CUDA availability and is
            # further limited by the number of test tasks.
            process_cap = 100 if torch.cuda.is_available() else 8
            nprocesses = min(process_cap, cls.NUM_TEST_TASKS)
        else:
            raise NotImplementedError(
                "mode must be 'train', 'valid', or 'test'.")

        if gpu_id != "default":
            devices = [gpu_id]
        elif cls.GPU_ID is None:
            devices = []
        else:
            devices = [cls.GPU_ID]

        return MachineParams(nprocesses=nprocesses, devices=devices)
    def machine_params(cls, mode="train", **kwargs) -> MachineParams:
        """Choose process counts and devices for the given mode.

        Args:
            mode: "train" and "valid" are handled explicitly; any other value
                takes the remaining branch (20 processes with GPUs, else 1).
            **kwargs: Accepted but unused.

        Returns:
            A `MachineParams` whose `nprocesses` is the result of
            `split_processes_onto_devices` for the selected devices, with
            `sampler_devices` left as `None` and a ResNet preprocessor graph
            only when `cls.USE_RESNET_CNN` is truthy.
        """
        gpu_count = cuda.device_count()
        gpus_present = gpu_count != 0

        # Per-mode process budget, and whether the mode is pinned to the
        # last GPU instead of being spread over the leading ones.
        if mode == "train":
            if torch.cuda.is_available():
                total_processes = cls.num_train_processes()
            else:
                total_processes = 1
            pin_to_last_gpu = False
        elif mode == "valid":
            total_processes = 2 if gpus_present else 0
            pin_to_last_gpu = True
        else:
            total_processes = 20 if gpus_present else 1
            pin_to_last_gpu = False

        if not gpus_present:
            devices = [torch.device("cpu")]
        elif pin_to_last_gpu:
            devices = [gpu_count - 1]
        else:
            # Never use more GPUs than there are processes to place on them.
            devices = list(range(min(total_processes, gpu_count)))

        per_device_processes = split_processes_onto_devices(
            nprocesses=total_processes, ndevices=len(devices))

        preprocessor_graph = None
        if cls.USE_RESNET_CNN:
            preprocessor_graph = cls.resnet_preprocessor_graph(mode=mode)

        return MachineParams(
            nprocesses=per_device_processes,
            devices=devices,
            sampler_devices=None,
            sensor_preprocessor_graph=preprocessor_graph,
        )
    def machine_params(self, mode="train", **kwargs):
        """Assemble the `MachineParams` for "train", "valid", or "test".

        Without CUDA every mode runs a single process with an empty device
        list. With CUDA, training distributes `self.NUM_PROCESSES` over
        `self.TRAINING_GPUS`, while validation and testing use one process
        on `self.VALIDATION_GPUS` / `self.TESTING_GPUS` respectively.

        Args:
            mode: One of "train", "valid", or "test".
            **kwargs: Accepted but unused.

        Returns:
            A `MachineParams` carrying the process counts, devices, and
            (when applicable) a `SensorPreprocessorGraph`.

        Raises:
            NotImplementedError: If `mode` is not one of the supported modes.
        """
        if mode not in ("train", "valid", "test"):
            raise NotImplementedError(
                "mode must be 'train', 'valid', or 'test'.")

        cuda_ok = torch.cuda.is_available()

        if mode == "train":
            workers_per_device = 1
            if cuda_ok:
                gpu_ids = self.TRAINING_GPUS * workers_per_device
                nprocesses = evenly_distribute_count_into_bins(
                    self.NUM_PROCESSES, len(gpu_ids))
            else:
                gpu_ids = []
                nprocesses = 1
        else:
            nprocesses = 1
            if not cuda_ok:
                gpu_ids = []
            elif mode == "valid":
                gpu_ids = self.VALIDATION_GPUS
            else:
                gpu_ids = self.TESTING_GPUS

        # The graph is always built for training; otherwise only when at
        # least one process has been requested.
        needs_graph = mode == "train"
        if not needs_graph:
            if isinstance(nprocesses, int):
                needs_graph = nprocesses > 0
            elif isinstance(nprocesses, Sequence):
                needs_graph = sum(nprocesses) > 0

        sensor_preprocessor_graph = None
        if needs_graph:
            observation_spaces = SensorSuite(self.SENSORS).observation_spaces
            sensor_preprocessor_graph = SensorPreprocessorGraph(
                source_observation_spaces=observation_spaces,
                preprocessors=self.PREPROCESSORS,
            )

        return MachineParams(
            nprocesses=nprocesses,
            devices=gpu_ids,
            sensor_preprocessor_graph=sensor_preprocessor_graph,
        )
Beispiel #4
0
    def worker_devices(self, mode: str):
        """Return the devices the config assigns to workers in `mode`.

        Args:
            mode: Mode name forwarded to `self.config.machine_params`.

        Returns:
            The `devices` attribute of the resulting `MachineParams`.

        Raises:
            AssertionError: If the devices are not all equal and at least
                one of them lacks a non-negative index.
        """
        machine_params: MachineParams = MachineParams.instance_from(
            self.config.machine_params(mode))
        devices = machine_params.devices

        # A device without an explicit index (e.g. `torch.device("cpu")`)
        # reports `index` as `None`, and `None >= 0` raises a `TypeError`.
        # Checking for `None` first lets a CPU/GPU mix trip the assertion,
        # with its explanatory message, instead of an unrelated error.
        assert all_equal(devices) or all(
            d.index is not None and d.index >= 0 for d in devices
        ), f"Cannot have a mix of CPU and GPU devices (`devices == {devices}`)"

        get_logger().info("Using {} {} workers on devices {}".format(
            len(devices), mode, devices))
        return devices
Beispiel #5
0
    def machine_params(self, mode="train", **kwargs):
        """Assemble the `MachineParams` for "train", "valid", or "test".

        Training distributes `self.NUM_PROCESSES` over `self.TRAIN_GPU_IDS`
        when CUDA is available and samples on `self.SAMPLER_GPU_IDS`. The
        other modes reuse their own device list as sampler devices and drop
        every `ExpertActionSensor` from the sensor list.

        Args:
            mode: One of "train", "valid", or "test".
            **kwargs: Accepted but unused.

        Returns:
            A `MachineParams` carrying process counts, devices, sampler
            devices, and (when applicable) a `SensorPreprocessorGraph`.

        Raises:
            NotImplementedError: If `mode` is not one of the supported modes.
        """
        if mode not in ("train", "valid", "test"):
            raise NotImplementedError("mode must be 'train', 'valid', or 'test'.")

        cuda_ok = torch.cuda.is_available()
        is_training = mode == "train"

        if is_training:
            workers_per_device = 1
            if cuda_ok:
                gpu_ids = self.TRAIN_GPU_IDS * workers_per_device
                nprocesses = evenly_distribute_count_into_bins(
                    self.NUM_PROCESSES, len(gpu_ids)
                )
            else:
                gpu_ids = []
                nprocesses = 1
            sampler_devices = self.SAMPLER_GPU_IDS
        else:
            if mode == "valid":
                nprocesses = 1
                gpu_ids = self.VALID_GPU_IDS if cuda_ok else []
            else:
                nprocesses = 5 if cuda_ok else 1
                gpu_ids = self.TEST_GPU_IDS if cuda_ok else []
            sampler_devices = gpu_ids  # ignored with > 1 gpu_ids

        # Expert actions are only kept as a sensor while training.
        sensors = list(self.SENSORS)
        if not is_training:
            sensors = [s for s in sensors if not isinstance(s, ExpertActionSensor)]

        # The graph is always built for training; otherwise only when at
        # least one process has been requested.
        needs_graph = is_training
        if not needs_graph:
            if isinstance(nprocesses, int):
                needs_graph = nprocesses > 0
            elif isinstance(nprocesses, Sequence):
                needs_graph = sum(nprocesses) > 0

        sensor_preprocessor_graph = None
        if needs_graph:
            sensor_preprocessor_graph = SensorPreprocessorGraph(
                source_observation_spaces=SensorSuite(sensors).observation_spaces,
                preprocessors=self.preprocessors(),
            )

        return MachineParams(
            nprocesses=nprocesses,
            devices=gpu_ids,
            sampler_devices=sampler_devices,
            sensor_preprocessor_graph=sensor_preprocessor_graph,
        )
    def machine_params(self, mode="train", **kwargs):
        """Assemble the `MachineParams` for "train", "valid", or "test".

        Every mode falls back to a single CPU `torch.device` when CUDA is
        unavailable. Training distributes `self.num_train_processes` over its
        devices and samples on `self.sampler_devices`; the other modes reuse
        their own device list as sampler devices and drop every
        `ExpertActionSensor` from the sensor list.

        Args:
            mode: One of "train", "valid", or "test".
            **kwargs: Accepted but unused.

        Returns:
            A `MachineParams` carrying process counts, devices, sampler
            devices, and (when applicable) a `SensorPreprocessorGraph`.

        Raises:
            NotImplementedError: If `mode` is not one of the supported modes.
        """
        if mode not in ("train", "valid", "test"):
            raise NotImplementedError(
                "mode must be 'train', 'valid', or 'test'.")

        cuda_ok = torch.cuda.is_available()
        is_training = mode == "train"

        if is_training:
            workers_per_device = 1
            if cuda_ok:
                devices = (cast(Tuple, self.train_gpu_ids) *
                           workers_per_device)
            else:
                devices = [torch.device("cpu")]
            # Guard the bin count so it is never zero.
            nprocesses = evenly_distribute_count_into_bins(
                self.num_train_processes, max(len(devices), 1))
            sampler_devices = self.sampler_devices
        else:
            if mode == "valid":
                nprocesses = 1
                gpu_devices_attr = "val_gpu_ids"
            else:
                nprocesses = 10 if cuda_ok else 1
                gpu_devices_attr = "test_gpu_ids"
            if cuda_ok:
                devices = getattr(self, gpu_devices_attr)
            else:
                devices = [torch.device("cpu")]
            sampler_devices = devices  # ignored with > 1 gpu_ids

        # Expert actions are only kept as a sensor while training.
        sensors = list(self.SENSORS)
        if not is_training:
            sensors = [
                s for s in sensors if not isinstance(s, ExpertActionSensor)
            ]

        # The graph is always built for training; otherwise only when at
        # least one process has been requested.
        needs_graph = is_training
        if not needs_graph:
            if isinstance(nprocesses, int):
                needs_graph = nprocesses > 0
            elif isinstance(nprocesses, Sequence):
                needs_graph = sum(nprocesses) > 0

        sensor_preprocessor_graph = None
        if needs_graph:
            sensor_preprocessor_graph = SensorPreprocessorGraph(
                source_observation_spaces=SensorSuite(
                    sensors).observation_spaces,
                preprocessors=self.preprocessors(),
            )

        return MachineParams(
            nprocesses=nprocesses,
            devices=devices,
            sampler_devices=sampler_devices,
            sensor_preprocessor_graph=sensor_preprocessor_graph,
        )
    def machine_params(self, mode="train", **kwargs):
        """Assemble the `MachineParams` for "train", "valid", or "test".

        Training splits its processes over `self.TRAIN_GPU_IDS` (via
        `self.split_num_processes`) when CUDA is available and samples on
        `self.TRAIN_GPU_IDS`. Validation runs one process and testing seven;
        both reuse their own device list as sampler devices.

        Args:
            mode: One of "train", "valid", or "test".
            **kwargs: Accepted but unused.

        Returns:
            A `MachineParams` carrying process counts, devices, sampler
            devices, and (when applicable) a `SensorPreprocessorGraph`.

        Raises:
            NotImplementedError: If `mode` is not one of the supported modes.
        """
        if mode not in ("train", "valid", "test"):
            raise NotImplementedError(
                "mode must be 'train', 'valid', or 'test'.")

        cuda_ok = torch.cuda.is_available()
        is_training = mode == "train"

        if is_training:
            workers_per_device = 1
            if cuda_ok:
                gpu_ids = self.TRAIN_GPU_IDS * workers_per_device
                nprocesses = self.split_num_processes(len(gpu_ids))
            else:
                gpu_ids = []
                nprocesses = 1
            sampler_devices = self.TRAIN_GPU_IDS
        else:
            if mode == "valid":
                nprocesses = 1
                gpu_ids = self.VALID_GPU_IDS if cuda_ok else []
            else:
                nprocesses = 7
                gpu_ids = self.TEST_GPU_IDS if cuda_ok else []
            sampler_devices = gpu_ids  # ignored with > 1 gpu_ids

        # Disable parallelization for validation process
        # NOTE(review): this writes into the kwargs of the shared
        # `self.PREPROCESSORS` entries, so the setting presumably persists
        # for later calls in other modes within the same process -- confirm
        # whether that is intended.
        if mode == "valid":
            for preprocessor in self.PREPROCESSORS:
                preprocessor.kwargs["parallel"] = False

        # The graph is always built for training; otherwise only when at
        # least one process has been requested.
        needs_graph = is_training
        if not needs_graph:
            if isinstance(nprocesses, int):
                needs_graph = nprocesses > 0
            elif isinstance(nprocesses, Sequence):
                needs_graph = sum(nprocesses) > 0

        sensor_preprocessor_graph = None
        if needs_graph:
            observation_spaces = SensorSuite(self.SENSORS).observation_spaces
            sensor_preprocessor_graph = SensorPreprocessorGraph(
                source_observation_spaces=observation_spaces,
                preprocessors=self.PREPROCESSORS,
            )

        return MachineParams(
            nprocesses=nprocesses,
            devices=gpu_ids,
            sampler_devices=sampler_devices,
            sensor_preprocessor_graph=sensor_preprocessor_graph,
        )
Beispiel #8
0
    def start_train(
        self,
        checkpoint: Optional[str] = None,
        restart_pipeline: bool = False,
        max_sampler_processes_per_worker: Optional[int] = None,
    ):
        """Launch the training workers and, if enabled, a validation worker.

        One process targeting `self.train_loop` is started per train device.
        When `self.running_validation` is truthy, a single process targeting
        `self.valid_loop` is started on the first validation device.

        Args:
            checkpoint: Forwarded to every training worker.
            restart_pipeline: Forwarded to every training worker.
            max_sampler_processes_per_worker: Forwarded to the training and
                validation workers.

        Returns:
            `self.local_start_time_str`.
        """
        if not self.disable_config_saving:
            self.save_project_state()

        train_devices = self.worker_devices("train")
        num_workers = len(train_devices)

        # Be extra careful to ensure that all models start
        # with the same initializations.
        set_seed(self.seed)
        mode_params = MachineParams.instance_from(
            self.config.machine_params(self.mode))
        reference_model = self.config.create_model(
            sensor_preprocessor_graph=mode_params.sensor_preprocessor_graph)
        initial_model_state_dict = reference_model.state_dict()

        # A port is only looked up when there is more than one worker.
        distributed_port = find_free_port() if num_workers > 1 else 0

        for worker_id, worker_device in enumerate(train_devices):
            if self.running_validation:
                checkpoints_queue = self.queues["checkpoints"]
            else:
                checkpoints_queue = None

            worker_kwargs = dict(
                id=worker_id,
                checkpoint=checkpoint,
                restart_pipeline=restart_pipeline,
                experiment_name=self.experiment_name,
                config=self.config,
                results_queue=self.queues["results"],
                checkpoints_queue=checkpoints_queue,
                checkpoints_dir=self.checkpoint_dir(),
                seed=self.seed,
                deterministic_cudnn=self.deterministic_cudnn,
                mp_ctx=self.mp_ctx,
                num_workers=num_workers,
                device=worker_device,
                distributed_port=distributed_port,
                max_sampler_processes_per_worker=(
                    max_sampler_processes_per_worker),
                initial_model_state_dict=initial_model_state_dict,
            )
            train: BaseProcess = self.mp_ctx.Process(
                target=self.train_loop,
                kwargs=worker_kwargs,
            )
            train.start()
            self.processes["train"].append(train)

        get_logger().info("Started {} train processes".format(
            len(self.processes["train"])))

        # Validation
        if not self.running_validation:
            get_logger().info(
                "No processes allocated to validation, no validation will be run."
            )
        else:
            valid_device = self.worker_devices("valid")[0]
            self.init_visualizer("valid")
            valid_kwargs = dict(
                config=self.config,
                results_queue=self.queues["results"],
                checkpoints_queue=self.queues["checkpoints"],
                # TODO allow same order for randomly sampled tasks? Is this any useful anyway?
                seed=12345,
                deterministic_cudnn=self.deterministic_cudnn,
                deterministic_agents=self.deterministic_agents,
                mp_ctx=self.mp_ctx,
                device=valid_device,
                max_sampler_processes_per_worker=(
                    max_sampler_processes_per_worker),
            )
            valid: BaseProcess = self.mp_ctx.Process(
                target=self.valid_loop,
                args=(0, ),
                kwargs=valid_kwargs,
            )
            valid.start()
            self.processes["valid"].append(valid)

            get_logger().info("Started {} valid processes".format(
                len(self.processes["valid"])))

        self.log(self.local_start_time_str, num_workers)

        return self.local_start_time_str
Beispiel #9
0
 def init_visualizer(self, mode: str):
     """Store the visualizer that the config declares for `mode`.

     Does nothing when `self.disable_tensorboard` is truthy; otherwise sets
     `self.visualizer` from the `MachineParams` for `mode`.
     """
     if self.disable_tensorboard:
         return

     # Note: Avoid instantiating anything in machine_params (use Builder if needed)
     params_for_mode = MachineParams.instance_from(
         self.config.machine_params(mode))
     self.visualizer = params_for_mode.visualizer
Beispiel #10
0
 def running_validation(self):
     """Report whether the config requests any validation processes.

     Returns:
         `True` when the `nprocesses` entries of the "valid" machine
         parameters sum to more than zero.
     """
     valid_params = MachineParams.instance_from(
         self.config.machine_params("valid"))
     total_valid_processes = sum(valid_params.nprocesses)
     return total_valid_processes > 0
Beispiel #11
0
    def start_train(
        self,
        checkpoint: Optional[str] = None,
        restart_pipeline: bool = False,
        max_sampler_processes_per_worker: Optional[int] = None,
    ):
        """Launch the training workers and, if enabled, a validation worker.

        One process targeting `self.train_loop` is started per train device.
        If starting a worker fails with `ValueError("too many fds")`, the
        initial state dict is replaced by its MD5 hash for that worker and
        all later ones. When `self.running_validation` is truthy, a single
        process targeting `self.valid_loop` is started on the first
        validation device.

        Args:
            checkpoint: Forwarded to every training worker.
            restart_pipeline: Forwarded to every training worker.
            max_sampler_processes_per_worker: Forwarded to the training and
                validation workers.

        Returns:
            `self.local_start_time_str`.

        Raises:
            ValueError: Re-raised from `Process.start()` when the failure is
                not the "too many fds" case.
        """
        self._initialize_start_train_or_start_test()

        if not self.disable_config_saving:
            self.save_project_state()

        devices = self.worker_devices(TRAIN_MODE_STR)
        num_workers = len(devices)

        # Be extra careful to ensure that all models start
        # with the same initializations.
        set_seed(self.seed)
        initial_model_state_dict = self.config.create_model(
            sensor_preprocessor_graph=MachineParams.instance_from(
                self.config.machine_params(
                    self.mode)).sensor_preprocessor_graph).state_dict()

        distributed_port = 0
        if num_workers > 1:
            distributed_port = find_free_port()

        model_hash = None
        for trainer_it in range(num_workers):
            training_kwargs = dict(
                id=trainer_it,
                checkpoint=checkpoint,
                restart_pipeline=restart_pipeline,
                experiment_name=self.experiment_name,
                config=self.config,
                results_queue=self.queues["results"],
                checkpoints_queue=self.queues["checkpoints"]
                if self.running_validation else None,
                checkpoints_dir=self.checkpoint_dir(),
                seed=self.seed,
                deterministic_cudnn=self.deterministic_cudnn,
                mp_ctx=self.mp_ctx,
                num_workers=num_workers,
                device=devices[trainer_it],
                distributed_port=distributed_port,
                max_sampler_processes_per_worker=
                max_sampler_processes_per_worker,
                # Once the hash fallback has been triggered, every later
                # worker receives the hash instead of the full state dict.
                initial_model_state_dict=initial_model_state_dict
                if model_hash is None else model_hash,
            )
            train: BaseProcess = self.mp_ctx.Process(
                target=self.train_loop,
                kwargs=training_kwargs,
            )
            try:
                train.start()
            except ValueError as e:
                # If the `initial_model_state_dict` is too large we sometimes
                # run into errors passing it with multiprocessing. In such cases
                # we instead hash the state_dict and confirm, in each engine worker, that
                # this hash equals the model the engine worker instantiates.
                #
                # `e.args` may be empty, so check it before indexing; any
                # other `ValueError` is re-raised with its original traceback.
                if not (e.args and e.args[0] == "too many fds"):
                    raise
                model_hash = md5_hash_of_state_dict(
                    initial_model_state_dict)
                training_kwargs["initial_model_state_dict"] = model_hash
                train = self.mp_ctx.Process(
                    target=self.train_loop,
                    kwargs=training_kwargs,
                )
                train.start()

            self.processes[TRAIN_MODE_STR].append(train)

        get_logger().info("Started {} train processes".format(
            len(self.processes[TRAIN_MODE_STR])))

        # Validation
        if self.running_validation:
            device = self.worker_devices("valid")[0]
            self.init_visualizer("valid")
            valid: BaseProcess = self.mp_ctx.Process(
                target=self.valid_loop,
                args=(0, ),
                kwargs=dict(
                    config=self.config,
                    results_queue=self.queues["results"],
                    checkpoints_queue=self.queues["checkpoints"],
                    seed=
                    12345,  # TODO allow same order for randomly sampled tasks? Is this any useful anyway?
                    deterministic_cudnn=self.deterministic_cudnn,
                    deterministic_agents=self.deterministic_agents,
                    mp_ctx=self.mp_ctx,
                    device=device,
                    max_sampler_processes_per_worker=
                    max_sampler_processes_per_worker,
                ),
            )
            valid.start()
            self.processes["valid"].append(valid)

            get_logger().info("Started {} valid processes".format(
                len(self.processes["valid"])))
        else:
            get_logger().info(
                "No processes allocated to validation, no validation will be run."
            )

        self.log_and_close(self.local_start_time_str, num_workers)

        return self.local_start_time_str