Example #1
    def _dump_logs(self) -> None:
        """
        Write log.
        """
        time_elapsed = time.time() - self.start_time
        fps = int(self.num_timesteps / (time_elapsed + 1e-8))
        self.logger.record("time/episodes",
                           self._episode_num,
                           exclude="tensorboard")
        if len(self.ep_info_buffer) > 0 and len(self.ep_info_buffer[0]) > 0:
            self.logger.record(
                "rollout/ep_rew_mean",
                safe_mean([ep_info["r"] for ep_info in self.ep_info_buffer]))
            self.logger.record(
                "rollout/ep_len_mean",
                safe_mean([ep_info["l"] for ep_info in self.ep_info_buffer]))
        self.logger.record("time/fps", fps)
        self.logger.record("time/time_elapsed",
                           int(time_elapsed),
                           exclude="tensorboard")
        self.logger.record("time/total timesteps",
                           self.num_timesteps,
                           exclude="tensorboard")
        if self.use_sde:
            self.logger.record("train/std",
                               (self.actor.get_std()).mean().item())

        if len(self.ep_success_buffer) > 0:
            self.logger.record("rollout/success rate",
                               safe_mean(self.ep_success_buffer))
        # Pass the number of timesteps for tensorboard
        self.logger.dump(step=self.num_timesteps)
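
All of the examples below log episode statistics through a safe_mean helper. For reference, a minimal sketch of that helper, consistent with how it is used here (the actual stable-baselines3 utility lives in stable_baselines3.common.utils):

import numpy as np


def safe_mean(arr):
    """Mean that tolerates an empty sequence.

    Returning NaN instead of raising lets the logger record the key even
    before the first episode has finished.
    """
    return np.nan if len(arr) == 0 else np.mean(arr)
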
Example #2
    def step_wait(self):
        if self.needs_reset:
            raise RuntimeError(
                'Tried to step vectorized environment that needs reset!')

        obss, rews, dones, infos = self.venv.step_wait()

        self.curr_ep_rewards += rews
        self.curr_ep_lengths += 1

        new_infos = list(infos[:])
        for key in self.curr_ep_data:
            self.curr_ep_data[key] += [
                info[key] for info in infos
            ]  #[dk for dk in map(lambda d: d[key], infos)]

        for i in range(len(dones)):
            if dones[i]:
                info = infos[i].copy()
                ep_rew = self.curr_ep_rewards[i]
                ep_len = self.curr_ep_lengths[i]
                ep_time = round(time.time() - self.t_start, 6)
                ep_info = {'r': ep_rew, 'l': ep_len, 't': ep_time}
                for key in self.curr_ep_data:
                    # Change in behavior: grab only the values in episode that would be overwritten
                    ep_info[key] = self.curr_ep_data[key][i]
                    self.curr_ep_data[key][i] = 0
                self.episode_rewards.append(ep_rew)
                self.episode_lengths.append(ep_len)
                self.episode_times.append(ep_time)
                self.curr_ep_rewards[i] = 0
                self.curr_ep_lengths[i] = 0
                if self.logger:
                    for key in self.curr_rollout_data:
                        self.curr_rollout_data[key].append(ep_info[key])
                info['episode'] = ep_info
                new_infos[i] = info
        self.total_steps += self.num_envs
        self.step_idx_in_rollout += 1

        if self.step_idx_in_rollout == self.rollout_size:
            if self.logger:
                # Correct the value for time (a bit ugly, I know)
                if 't' in self.curr_rollout_data:
                    self.curr_rollout_data['t'] = [time.time() - self.t_start]
                # Store the average values per rollout
                self.logger.writerow({
                    k: safe_mean(self.curr_rollout_data[k])
                    for k in self.curr_rollout_data
                })
                self.file_handler.flush()
                for key in self.info_keywords:
                    logger.record(key, safe_mean(self.curr_rollout_data[key]))
                for key in self.curr_rollout_data:
                    self.curr_rollout_data[key] = []
                self.step_idx_in_rollout = 0

        return obss, rews, dones, new_infos
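
The info['episode'] dict produced above is what the ep_info_buffer loops in the surrounding examples consume. A minimal sketch of that consumer side, assuming the same {'r', 'l', 't'} keys (the helper name update_ep_info_buffer is illustrative, not from the original code):

from collections import deque


def update_ep_info_buffer(ep_info_buffer: deque, infos: list) -> None:
    """Collect Monitor-style episode dicts from the infos of one vectorized step.

    Only the environments that just finished an episode carry an 'episode'
    entry, so the buffer ends up holding one dict per completed episode.
    """
    for info in infos:
        maybe_ep_info = info.get("episode")
        if maybe_ep_info is not None:
            ep_info_buffer.append(maybe_ep_info)


# Usage sketch:
# ep_info_buffer = deque(maxlen=100)
# obss, rews, dones, infos = venv.step_wait()
# update_ep_info_buffer(ep_info_buffer, infos)
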
Example #3
    def learn(
        self,
        total_timesteps: int,
        callback: MaybeCallback = None,
        log_interval: int = 1,
        eval_env: Optional[GymEnv] = None,
        eval_freq: int = -1,
        n_eval_episodes: int = 5,
        tb_log_name: str = "OnPolicyAlgorithm",
        eval_log_path: Optional[str] = None,
        reset_num_timesteps: bool = True,
    ) -> "OnPolicyAlgorithm":
        iteration = 0
        
        total_timesteps, callback = self._setup_learn(
            total_timesteps, eval_env, callback, eval_freq, n_eval_episodes, eval_log_path, reset_num_timesteps, tb_log_name
        )

        callback.on_training_start(locals(), globals())
        
        # debug ===============================================================
        if mode == 'debug':
            print(['OPA.learn started, ready to loop (OPA.collect_rollouts + OPA.train)'])
            
        
        while self.num_timesteps < total_timesteps:

            continue_training = self.collect_rollouts(self.env, callback, self.rollout_buffer, n_rollout_steps=self.n_steps)

            if continue_training is False:
                break

            iteration += 1
            self._update_current_progress_remaining(self.num_timesteps, total_timesteps)
            
            # debug ===========================================================
            if mode == 'debug':
                print(['OPA.learn', 'num_timesteps:', self.num_timesteps, 'total_timesteps:', total_timesteps])

            # Display training infos
            if log_interval is not None and iteration % log_interval == 0:
                fps = int(self.num_timesteps / (time.time() - self.start_time))
                logger.record("time/iterations", iteration, exclude="tensorboard")
                if len(self.ep_info_buffer) > 0 and len(self.ep_info_buffer[0]) > 0:
                    logger.record("rollout/ep_rew_mean", safe_mean([ep_info["r"] for ep_info in self.ep_info_buffer]))
                    logger.record("rollout/ep_len_mean", safe_mean([ep_info["l"] for ep_info in self.ep_info_buffer]))
                logger.record("time/fps", fps)
                logger.record("time/time_elapsed", int(time.time() - self.start_time), exclude="tensorboard")
                logger.record("time/total_timesteps", self.num_timesteps, exclude="tensorboard")
                logger.dump(step=self.num_timesteps)
            # debug ===============================================================
            if mode == 'debug':
                print(['OPA.learn finished, ready to OPA.train'])
            self.train()

        callback.on_training_end()

        return self
    def learn(self, total_timesteps,
              callback=None,
              log_interval=1,
              tb_log_name="run",
              eval_env=None,
              eval_freq=-1,
              n_eval_episodes=5,
              eval_log_path=None,
              reset_num_timesteps=True,
              ):

        iteration = 0
        total_timesteps, callback = self._setup_learn(
            total_timesteps, eval_env, callback, eval_freq, n_eval_episodes, eval_log_path, reset_num_timesteps, tb_log_name
        )

        callback.on_training_start(locals(), globals())

        while self.num_timesteps < total_timesteps:
            continue_training = self.collect_rollouts(
                self.env, callback, n_rollout_steps=self.n_steps)

            if continue_training is False:
                break

            iteration += 1
            self._update_current_progress_remaining(
                self.num_timesteps, total_timesteps)

            # Display training infos
            if log_interval is not None and iteration % log_interval == 0:
                fps = int(self.num_timesteps / (time.time() - self.start_time))
                logger.record("time/iterations", iteration,
                              exclude="tensorboard")
                if len(self.ep_info_buffer) > 0 and len(self.ep_info_buffer[0]) > 0:
                    logger.record(
                        "rollout/ep_rew_mean", safe_mean([ep_info["r"] for ep_info in self.ep_info_buffer]))
                    logger.record(
                        "rollout/ep_len_mean", safe_mean([ep_info["l"] for ep_info in self.ep_info_buffer]))
                logger.record("time/fps", fps)
                logger.record("time/time_elapsed", int(time.time() -
                                                       self.start_time), exclude="tensorboard")
                logger.record("time/total_timesteps",
                              self.num_timesteps, exclude="tensorboard")
                logger.dump(step=self.num_timesteps)

            self.train()

        callback.on_training_end()

        return self
    def learn(
        self,
        total_timesteps: int,
        callback: MaybeCallback = None,
        log_interval: int = 1,
        eval_env: Optional[GymEnv] = None,
        eval_freq: int = -1,
        n_eval_episodes: int = 5,
        tb_log_name: str = "OnPolicyAlgorithm",
        eval_log_path: Optional[str] = None,
        reset_num_timesteps: bool = True,
    ) -> "OnPolicyAlgorithm":
        iteration = 0

        total_timesteps, callback = self._setup_learn(
            total_timesteps, eval_env, callback, eval_freq, n_eval_episodes, eval_log_path, reset_num_timesteps, tb_log_name
        )

        callback.on_training_start(locals(), globals())

        while self.num_timesteps < total_timesteps:

            continue_training = self.collect_rollouts(self.env, callback, self.rollout_buffer, n_rollout_steps=self.n_steps)

            if continue_training is False:
                break

            iteration += 1
            self._update_current_progress_remaining(self.num_timesteps, total_timesteps)

            # Display training infos
            if log_interval is not None and iteration % log_interval == 0:
                self.fps = int(self.num_timesteps / (time.time() - self.start_time))
                logger.record("time/iterations", iteration, exclude="tensorboard")
                if len(self.ep_info_buffer) > 0 and len(self.ep_info_buffer[0]) > 0:
                    # logger.record("rollout/ep_rew_mean", safe_mean([ep_info["r"] for ep_info in self.ep_info_buffer]))
                    # logger.record("rollout/ep_len_mean", safe_mean([ep_info["l"] for ep_info in self.ep_info_buffer]))
                    logger.record("rollout/ep_reward_mean", safe_mean([ep_info['r'] for ep_info in self.ep_info_buffer]))
                    logger.record("rollout/ep_len_mean", safe_mean([ep_info['l'] for ep_info in self.ep_info_buffer]))
                if len(self.specific_reward_info_buffer) > 0 and len(self.specific_reward_info_buffer[0]) > 0:
                    logger.record('rollout/mimic_qpos_reward', safe_mean([specific_reward_info['mimic_qpos_reward'] for specific_reward_info in self.specific_reward_info_buffer]))
                    logger.record('rollout/mimic_qvel_reward', safe_mean([specific_reward_info['mimic_qvel_reward'] for specific_reward_info in self.specific_reward_info_buffer]))
                    #logger.record('rollout/mimic_ee_reward', safe_mean([specific_reward_info['mimic_ee_reward'] for specific_reward_info in self.specific_reward_info_buffer]))
                    logger.record('rollout/mimic_body_orientation_reward', safe_mean([specific_reward_info['mimic_body_orientation_reward'] for specific_reward_info in self.specific_reward_info_buffer]))
                    logger.record('rollout/mimic_body_reward', safe_mean([specific_reward_info['mimic_body_reward'] for specific_reward_info in self.specific_reward_info_buffer]))
                    logger.record('rollout/mimic_body_vel_reward', safe_mean([specific_reward_info['mimic_body_vel_reward'] for specific_reward_info in self.specific_reward_info_buffer]))
                    logger.record('rollout/mimic_contact_reward', safe_mean([specific_reward_info['mimic_contact_reward'] for specific_reward_info in self.specific_reward_info_buffer]))
                logger.record("time/fps", self.fps)
                logger.record("time/time_elapsed", int(time.time() - self.start_time), exclude="tensorboard")
                logger.record("time/total_timesteps", self.num_timesteps, exclude="tensorboard")
                logger.dump(step=self.num_timesteps)

            self.train()

        callback.on_training_end()

        return self
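
The block of per-component logger.record calls above can also be written as a loop over the component names. A sketch, assuming every dict in specific_reward_info_buffer carries the same keys and reusing the safe_mean helper sketched after Example #1:

MIMIC_REWARD_KEYS = (
    "mimic_qpos_reward",
    "mimic_qvel_reward",
    "mimic_body_orientation_reward",
    "mimic_body_reward",
    "mimic_body_vel_reward",
    "mimic_contact_reward",
)


def record_reward_components(logger, reward_info_buffer, keys=MIMIC_REWARD_KEYS):
    """Record the mean of each reward component under rollout/<key>.

    logger is expected to expose record(key, value), and reward_info_buffer
    to hold dicts containing the listed keys, as in the example above.
    """
    for key in keys:
        logger.record("rollout/" + key,
                      safe_mean([info[key] for info in reward_info_buffer]))
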
    def _dump_logs(self) -> None:
        """
        Write log.
        """
        try:
            fps = int(self.num_timesteps / (time.time() - self.start_time))
        except ZeroDivisionError:
            warnings.warn("fps dump had zero division somehow, storing 0 instead.")
            fps = 0
        logger.record("time/episodes", self._episode_num, exclude="tensorboard")
        if len(self.ep_info_buffer) > 0 and len(self.ep_info_buffer[0]) > 0:
            logger.record("rollout/ep_rew_mean", safe_mean([ep_info["r"] for ep_info in self.ep_info_buffer]))
            logger.record("rollout/ep_len_mean", safe_mean([ep_info["l"] for ep_info in self.ep_info_buffer]))
        logger.record("time/fps", fps)
        logger.record("time/time_elapsed", int(time.time() - self.start_time), exclude="tensorboard")
        logger.record("time/total timesteps", self.num_timesteps, exclude="tensorboard")
        if self.use_sde:
            logger.record("train/std", (self.actor.get_std()).mean().item())

        if len(self.ep_success_buffer) > 0:
            logger.record("rollout/success rate", safe_mean(self.ep_success_buffer))
        # Pass the number of timesteps for tensorboard
        logger.dump(step=self.num_timesteps)
    def learn(
        self,
        total_timesteps: int,
        callback: MaybeCallback = None,
        log_interval: int = 1,
        eval_env: Optional[GymEnv] = None,
        eval_freq: int = -1,
        n_eval_episodes: int = 5,
        tb_log_name: str = "OnPolicyAlgorithm",
        eval_log_path: Optional[str] = None,
        reset_num_timesteps: bool = True,
    ) -> "OnPolicyAlgorithm":
        iteration = 0

        total_timesteps, callback = self._setup_learn(
            total_timesteps, eval_env, callback, eval_freq, n_eval_episodes,
            eval_log_path, reset_num_timesteps, tb_log_name)

        callback.on_training_start(locals(), globals())

        while self.num_timesteps < total_timesteps:

            continue_training = self.collect_rollouts(
                self.env,
                callback,
                self.rollout_buffer,
                n_rollout_steps=self.n_steps)

            if continue_training is False:
                break

            iteration += 1
            self._update_current_progress_remaining(self.num_timesteps,
                                                    total_timesteps)

            # Display training infos
            if log_interval is not None and iteration % log_interval == 0:
                fps = int(self.num_timesteps / (time.time() - self.start_time))
                logger.record("time/iterations",
                              iteration,
                              exclude="tensorboard")
                if len(self.ep_info_buffer) > 0 and len(
                        self.ep_info_buffer[0]) > 0:
                    logger.record(
                        "rollout/ep_rew_mean",
                        safe_mean(
                            [ep_info["r"] for ep_info in self.ep_info_buffer]))
                    logger.record(
                        "rollout/ep_len_mean",
                        safe_mean(
                            [ep_info["l"] for ep_info in self.ep_info_buffer]))

                    for k in self.ep_info_buffer[0].keys():
                        if k not in "lrt":
                            logger.record(
                                f"progress/{k}",
                                safe_mean([
                                    ep_info[k]
                                    for ep_info in self.ep_info_buffer
                                ]))

                logger.record("time/fps", fps)
                logger.record("time/time_elapsed",
                              int(time.time() - self.start_time),
                              exclude="tensorboard")
                logger.record("time/total_timesteps",
                              self.num_timesteps,
                              exclude="tensorboard")
                logger.dump(step=self.num_timesteps)

                if iteration % (log_interval *
                                10) == 0:  #save parameters every 10 log steps
                    self.save('./interim_trained_models/')

            self.train()

        callback.on_training_end()

        return self
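
The self.save('./interim_trained_models/') call issued every ten log intervals above could also be delegated to stable-baselines3's CheckpointCallback, which saves on a fixed step frequency instead; a sketch (the save_freq value here is only an example):

from stable_baselines3.common.callbacks import CheckpointCallback

# Saves a model snapshot (e.g. rl_model_<num_timesteps>_steps.zip)
# every save_freq callback steps.
checkpoint_callback = CheckpointCallback(
    save_freq=100_000,
    save_path="./interim_trained_models/",
    name_prefix="rl_model",
)

# model.learn(total_timesteps=1_000_000, callback=checkpoint_callback)
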
def train(args):
    cuda_availability = torch.cuda.is_available()
    print('\n*************************')
    print('`CUDA` available: {}'.format(cuda_availability))
    print('Device specified: {}'.format(args.device))
    print('*************************\n')

    # load the config of the trained model:
    with open(args.pretrained_output / "train_arguments.yaml") as yaml_data:
        pretrain_arguments = yaml.load(yaml_data, Loader=yaml.FullLoader)

    pretrained_model = algorithms[pretrain_arguments["alg"]].load(
        args.pretrained_output /
        "".join(pretrain_arguments["model_name"].split(".")[:-1]),
        device='cpu')

    # Prepare tensorboard logging
    log_name = '{}_{}_{}'.format(args.experiment_name, args.task_name,
                                 datetime.now().strftime('%d-%m_%H-%M-%S'))
    run_dir = args.tensorboard_log + "/" + log_name
    Path(run_dir).mkdir(parents=True, exist_ok=True)
    callbacks = []
    # callbacks.append(CheckpointCallback(
    #    save_freq=1000000, save_path=run_dir, name_prefix='rl_model'))
    callbacks.append(LoggingCallback(logpath=run_dir))

    train_args = copy.copy(args)
    train_args.config = train_args.config.name
    pyaml.dump(train_args.__dict__,
               open(os.path.join(run_dir, 'train_arguments.yaml'), 'w'))

    assert args.task_name == pretrain_arguments[
        "task_name"], "Envs must match for transfer learning"

    # Create the vectorized environment
    n_envs = train_args.n_envs  # Number of processes to use
    env = make_vec_env(args.task_name, n_envs=n_envs)

    # define network architecture
    if "GnnPolicy" in args.policy and args.net_arch is not None:
        for net_arch_part in args.net_arch.keys():
            for i, (layer_class_name,
                    layer_size) in enumerate(args.net_arch[net_arch_part]):
                if hasattr(nn, layer_class_name):
                    args.net_arch[net_arch_part][i] = (getattr(
                        nn, layer_class_name), layer_size)
                elif hasattr(nerve_net_conv, layer_class_name):
                    args.net_arch[net_arch_part][i] = (getattr(
                        nerve_net_conv, layer_class_name), layer_size)
                else:

                    def get_class(x):
                        return globals()[x]

                    c = get_class(layer_size)
                    assert c is not None, f"Unkown layer class '{layer_class_name}'"
                    args.net_arch[net_arch_part][i] = (c, layer_size)

    with open(os.path.join(run_dir, 'net_arch.txt'), 'w') as fp:
        fp.write(str(args.net_arch))

    # Create the model
    alg_class = algorithms[args.alg]
    policy_kwargs = dict()
    if args.net_arch is not None:
        policy_kwargs['net_arch'] = args.net_arch
    if args.activation_fn is not None:
        policy_kwargs["activation_fn"] = activation_functions[
            args.activation_fn]
    # policy_kwargs['device'] = args.device if args.device is not None else get_device('auto')
    if "GnnPolicy" in args.policy:
        policy_kwargs["mlp_extractor_kwargs"] = {
            "task_name": args.task_name,
            'device': args.device,
            'gnn_for_values': args.gnn_for_values,
            'controller_option': controller_option[args.controller_option],
            'embedding_option': embedding_option[args.embedding_option],
            'root_option': root_option[args.root_option],
            'drop_body_nodes': args.drop_body_nodes,
            'use_sibling_relations': args.use_sibling_relations,
            'xml_assets_path': args.xml_assets_path,
            'policy_readout_mode': args.policy_readout_mode
        }
    alg_kwargs = args.__dict__.copy()
    alg_kwargs.pop("config", None)
    alg_kwargs.pop("task_name", None)
    alg_kwargs.pop("policy", None)
    alg_kwargs.pop("activation_fn", None)
    alg_kwargs.pop("gnn_for_values", None)
    alg_kwargs.pop("embedding_option", None)
    alg_kwargs.pop("controller_option", None)
    alg_kwargs.pop("root_option", None)
    alg_kwargs.pop("xml_assets_path", None)
    alg_kwargs.pop("alg", None)
    alg_kwargs.pop("net_arch", None)
    alg_kwargs.pop("experiment_name", None)
    alg_kwargs.pop("job_dir", None)
    alg_kwargs.pop("total_timesteps", None)
    alg_kwargs.pop("model_name", None)
    alg_kwargs.pop("n_envs", None)
    alg_kwargs.pop("drop_body_nodes", None)
    alg_kwargs.pop("use_sibling_relations", None)
    alg_kwargs.pop("experiment_name_suffix", None)
    alg_kwargs.pop("policy_readout_mode", None)
    alg_kwargs.pop("pretrained_output", None)

    model = alg_class(
        args.policy,
        env,
        verbose=1,
        #   n_steps=args.n_steps,
        policy_kwargs=policy_kwargs,
        #   device=args.device,
        #   tensorboard_log=args.tensorboard_log,
        #   learning_rate=args.learning_rate,
        #   batch_size=args.batch_size,
        #   n_epochs=args.n_epochs,
        **alg_kwargs)

    # model.learn(total_timesteps=args.total_timesteps,
    #             callback=callbacks,
    #             tb_log_name=log_name)

    # PPO Learn parameters:
    total_timesteps = args.total_timesteps
    callback = callbacks
    log_interval = 1
    eval_env = make_vec_env(args.task_name, n_envs=1)
    eval_freq = 1e4
    n_eval_episodes = 3
    tb_log_name = log_name
    eval_log_path = None
    reset_num_timesteps = True

    #################################
    ##### Custom Transfer Learn #####
    #################################

    iteration = 0
    total_timesteps, callback = model._setup_learn(
        total_timesteps, eval_env, callback, eval_freq, n_eval_episodes,
        eval_log_path, reset_num_timesteps, tb_log_name)

    ### setup pretrained model ###
    pretrained_model.num_timesteps = 0
    pretrained_model._episode_num = 0
    pretrained_model._total_timesteps = total_timesteps
    pretrained_model.ep_info_buffer = deque(maxlen=100)
    pretrained_model.ep_success_buffer = deque(maxlen=100)
    pretrained_model._last_obs = model.env.reset()
    pretrained_model._last_dones = np.zeros((model.env.num_envs, ), dtype=bool)

    callback.on_training_start(locals(), globals())

    while pretrained_model.num_timesteps < total_timesteps:

        continue_training = pretrained_model.collect_rollouts(
            model.env,
            callback,
            model.rollout_buffer,
            n_rollout_steps=model.n_steps)

        if continue_training is False:
            break

        iteration += 1
        model._update_current_progress_remaining(
            pretrained_model.num_timesteps, total_timesteps)

        # Display training infos
        if log_interval is not None and iteration % log_interval == 0:
            fps = int(pretrained_model.num_timesteps /
                      (time.time() - model.start_time))
            logger.record("time/iterations", iteration, exclude="tensorboard")
            if len(model.ep_info_buffer) > 0 and len(
                    model.ep_info_buffer[0]) > 0:
                logger.record(
                    "rollout/ep_rew_mean",
                    safe_mean(
                        [ep_info["r"] for ep_info in model.ep_info_buffer]))
                logger.record(
                    "rollout/ep_len_mean",
                    safe_mean(
                        [ep_info["l"] for ep_info in model.ep_info_buffer]))
            logger.record("time/fps", fps)
            logger.record("time/time_elapsed",
                          int(time.time() - model.start_time),
                          exclude="tensorboard")
            logger.record("time/total_timesteps",
                          pretrained_model.num_timesteps,
                          exclude="tensorboard")
            logger.dump(step=pretrained_model.num_timesteps)

        model.train()

    callback.on_training_end()

    model.save(
        os.path.join(args.tensorboard_log + "/" + log_name, args.model_name))
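
The long chain of alg_kwargs.pop(...) calls in train() above can be condensed into a single comprehension over a blacklist of non-algorithm arguments; a sketch using the same keys:

NON_ALG_KEYS = {
    "config", "task_name", "policy", "activation_fn", "gnn_for_values",
    "embedding_option", "controller_option", "root_option", "xml_assets_path",
    "alg", "net_arch", "experiment_name", "job_dir", "total_timesteps",
    "model_name", "n_envs", "drop_body_nodes", "use_sibling_relations",
    "experiment_name_suffix", "policy_readout_mode", "pretrained_output",
}


def filter_alg_kwargs(args) -> dict:
    """Keep only the argparse entries the algorithm constructor should receive
    (same effect as the pop() chain above)."""
    return {k: v for k, v in vars(args).items() if k not in NON_ALG_KEYS}
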
Example #9
    def learn(
        self,
        total_timesteps: int,
        callback: MaybeCallback = None,
        log_interval: int = 1,
        eval_env: Optional[GymEnv] = None,
        eval_freq: int = -1,
        n_eval_episodes: int = 5,
        tb_log_name: str = "OnPolicyAlgorithm",
        eval_log_path: Optional[str] = None,
        reset_num_timesteps: bool = True,
    ) -> "OnPolicyAlgorithm":
        iteration = 0

        print('setup training')

        total_timesteps, callback = self._setup_learn(
            total_timesteps, eval_env, callback, eval_freq, n_eval_episodes,
            eval_log_path, reset_num_timesteps, tb_log_name)

        callback.on_training_start(locals(), globals())

        print(f'start training, total timesteps is {total_timesteps}')

        while self.num_timesteps < total_timesteps:

            print(f'num timesteps: {self.num_timesteps}/{total_timesteps}')
            print(f'collect rollouts, rollout steps = {self.n_steps}')

            continue_training = self.collect_rollouts(
                self.env,
                callback,
                self.rollout_buffer,
                n_rollout_steps=self.n_steps)

            if continue_training is False:
                print(
                    'stop training (only happens if callback on_step returns false)'
                )
                break

            iteration += 1
            self._update_current_progress_remaining(self.num_timesteps,
                                                    total_timesteps)

            # Display training infos
            print('display training infos')
            # print(f'len(self.ep_info_buffer)={len(self.ep_info_buffer)}, len(self.ep_info_buffer[0])={len(self.ep_info_buffer[0])}')

            if log_interval is not None and iteration % log_interval == 0:
                fps = int(self.num_timesteps / (time.time() - self.start_time))
                logger.record("time/iterations",
                              iteration,
                              exclude="tensorboard")
                if len(self.ep_info_buffer) > 0 and len(
                        self.ep_info_buffer[0]) > 0:
                    logger.record(
                        "rollout/ep_rew_mean",
                        safe_mean(
                            [ep_info["r"] for ep_info in self.ep_info_buffer]))
                    logger.record(
                        "rollout/ep_len_mean",
                        safe_mean(
                            [ep_info["l"] for ep_info in self.ep_info_buffer]))
                logger.record("time/fps", fps)
                logger.record("time/time_elapsed",
                              int(time.time() - self.start_time),
                              exclude="tensorboard")
                logger.record("time/total_timesteps",
                              self.num_timesteps,
                              exclude="tensorboard")
                logger.dump(step=self.num_timesteps)

            print('train')
            self.train()

        callback.on_training_end()

        return self
    def learn(
        self,
        total_timesteps: int,
        callback: MaybeCallback = None,
        log_interval: int = 1,
        eval_env: Optional[GymEnv] = None,
        eval_freq: int = -1,
        n_eval_episodes: int = 5,
        tb_log_name: str = "PPO",
        eval_log_path: Optional[str] = None,
        reset_num_timesteps: bool = True,
    ) -> "OnPolicyAlgorithm":
        iteration = 0

        total_timesteps, callback = self._setup_learn(
            total_timesteps, eval_env, callback, eval_freq, n_eval_episodes,
            eval_log_path, reset_num_timesteps, tb_log_name)

        callback.on_training_start(locals(), globals())

        while self.num_timesteps < total_timesteps:
            """Replay buffer size"""
            ### No need to use larger buffer, because that doesn't solve the catastrophic forgetting problem.
            ### For this experiment, just tracking the best score is enough.
            # Determine buffer size using safe_mean([ep_info["r"] for ep_info in self.ep_info_buffer])
            # I want it to be stable once walking has been learned.
            # Start with a small buffer; once the mean episode length is high enough, switch to the full one:
            # ep_len_mean = safe_mean([ep_info["l"] for ep_info in self.ep_info_buffer])
            # if ep_len_mean>=1000:
            #     self.use_small_buffer = False

            if not args.single and self.use_small_buffer:
                output(
                    f"Collect rollouts for {self.n_steps//self.env.num_envs} steps.",
                    2)
                continue_training = self.collect_rollouts(
                    self.env,
                    callback,
                    self.rollout_buffer_small,
                    n_rollout_steps=self.n_steps // self.env.num_envs)
            else:
                output(f"Collect rollouts for {self.n_steps} steps.", 2)
                continue_training = self.collect_rollouts(
                    self.env,
                    callback,
                    self.rollout_buffer,
                    n_rollout_steps=self.n_steps)

            if continue_training is False:
                break

            iteration += 1
            self._update_current_progress_remaining(self.num_timesteps,
                                                    total_timesteps)

            # Display training infos
            if log_interval is not None and iteration % log_interval == 0:
                fps = int(self.num_timesteps / (time.time() - self.start_time))
                logger.record("time/iterations",
                              iteration,
                              exclude="tensorboard")
                if len(self.ep_info_buffer) > 0 and len(
                        self.ep_info_buffer[0]) > 0:
                    logger.record(
                        "rollout/ep_rew_mean",
                        safe_mean(
                            [ep_info["r"] for ep_info in self.ep_info_buffer]))
                    logger.record(
                        "rollout/ep_len_mean",
                        safe_mean(
                            [ep_info["l"] for ep_info in self.ep_info_buffer]))
                logger.record("time/fps", fps)
                logger.record("time/time_elapsed",
                              int(time.time() - self.start_time),
                              exclude="tensorboard")
                logger.record("time/total_timesteps",
                              self.num_timesteps,
                              exclude="tensorboard")
                logger.dump(step=self.num_timesteps)

            self.train()

        callback.on_training_end()

        return self
Example #11
    def train(self) -> None:
        """
        Update policy using the currently gathered
        rollout buffer.
        """
        # Update optimizer learning rate
        self._update_learning_rate(self.policy.optimizer)
        # Compute current clip range
        clip_range = self.clip_range(self._current_progress_remaining)
        # Optional: clip range for the value function
        if self.clip_range_vf is not None:
            clip_range_vf = self.clip_range_vf(self._current_progress_remaining)

        entropy_losses, all_kl_divs = [], []
        pg_losses, value_losses = [], []
        clip_fractions = []

        # train for gradient_steps epochs
        for epoch in range(self.n_epochs):
            approx_kl_divs = []
            # Do a complete pass on the rollout buffer
            for rollout_data in self.rollout_buffer.get(self.batch_size):
                actions = rollout_data.actions
                if isinstance(self.action_space, spaces.Discrete):
                    # Convert discrete action from float to long
                    actions = rollout_data.actions.long().flatten()

                # Re-sample the noise matrix because the log_std has changed
                # TODO: investigate why there is no issue with the gradient
                # if that line is commented (as in SAC)
                if self.use_sde:
                    self.policy.reset_noise(self.batch_size)

                values, log_prob, entropy = self.policy.evaluate_actions(rollout_data.observations, actions)
                values = values.flatten()
                # Normalize advantage
                advantages = rollout_data.advantages
                advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)

                # ratio between old and new policy, should be one at the first iteration
                ratio = th.exp(log_prob - rollout_data.old_log_prob)
                # clipped surrogate loss
                policy_loss_1 = advantages * ratio
                policy_loss_2 = advantages * th.clamp(ratio, 1 - clip_range, 1 + clip_range)
                policy_loss = -th.min(policy_loss_1, policy_loss_2).mean()

                # Logging
                pg_losses.append(policy_loss.item())
                clip_fraction = th.mean((th.abs(ratio - 1) > clip_range).float()).item()
                clip_fractions.append(clip_fraction)

                if self.clip_range_vf is None:
                    # No clipping
                    values_pred = values
                else:
                    # Clip the difference between the old and new value
                    # NOTE: this depends on the reward scaling
                    values_pred = rollout_data.old_values + th.clamp(
                        values - rollout_data.old_values, -clip_range_vf, clip_range_vf
                    )
                # Value loss using the TD(gae_lambda) target
                value_loss = F.mse_loss(rollout_data.returns, values_pred)
                value_losses.append(value_loss.item())

                # Entropy loss favors exploration
                if entropy is None:
                    # Approximate entropy when no analytical form
                    entropy_loss = -th.mean(-log_prob)
                else:
                    entropy_loss = -th.mean(entropy)

                entropy_losses.append(entropy_loss.item())

                loss = policy_loss + self.vf_coef * value_loss

                # Optimization step
                # # Critic
                # self.policy.critic_optimizer.zero_grad()
                # value_loss.backward()
                # # Clip grad norm
                # th.nn.utils.clip_grad_norm_(self.policy.value_net.parameters(), self.max_grad_norm)
                # self.policy.critic_optimizer.step()

                # # Actor
                # self.policy.optimizer.zero_grad()
                # policy_loss.backward()
                # # Clip grad norm
                # th.nn.utils.clip_grad_norm_(self.policy.action_net.parameters(), self.max_grad_norm)
                # self.policy.optimizer.step()

                # Actor and Critic
                self.policy.optimizer.zero_grad()
                loss.backward()
                # Clip grad norm
                th.nn.utils.clip_grad_norm_(self.policy.parameters(), self.max_grad_norm)
                self.policy.optimizer.step()
                approx_kl_divs.append(th.mean(rollout_data.old_log_prob - log_prob).detach().cpu().numpy())

            all_kl_divs.append(np.mean(approx_kl_divs))

            if self.target_kl is not None and np.mean(approx_kl_divs) > 1.5 * self.target_kl:
                print(f"Early stopping at step {epoch} due to reaching max kl: {np.mean(approx_kl_divs):.2f}")
                break

        self._n_updates += self.n_epochs
        explained_var = explained_variance(self.rollout_buffer.returns.flatten(), self.rollout_buffer.values.flatten())

        # Logs
        logger.record("train/entropy_loss", np.mean(entropy_losses))
        logger.record("train/policy_gradient_loss", np.mean(pg_losses))
        logger.record("train/value_loss", np.mean(value_losses))
        logger.record("train/approx_kl", np.mean(approx_kl_divs))
        logger.record("train/clip_fraction", np.mean(clip_fractions))
        logger.record("train/loss", loss.item())
        logger.record("train/explained_variance", explained_var)
        if hasattr(self.policy, "log_std"):
            logger.record("train/std", th.exp(self.policy.log_std).mean().item())

        logger.record("train/n_updates", self._n_updates, exclude="tensorboard")
        logger.record("train/clip_range", clip_range)
        if self.clip_range_vf is not None:
            logger.record("train/clip_range_vf", clip_range_vf)

        if self.wandb_use:
            if (self._n_updates % 10 == 0):
                t_start = time.time()
                wandb_dict = dict()
                # wandb_dict["Mean Reward"] = np.mean(true_reward)
                wandb_dict["serial_timesteps"] = self._n_updates * self.n_steps
                wandb_dict["n_updates"] = self._n_updates
                wandb_dict["total_timesteps"] = self.num_timesteps
                wandb_dict["fps"] = self.fps
                if len(self.ep_info_buffer) > 0 and len(self.ep_info_buffer[0]) > 0:
                    wandb_dict["ep_reward_mean"] = safe_mean([ep_info['r'] for ep_info in self.ep_info_buffer])
                    wandb_dict["ep_len_mean"] = safe_mean([ep_info['l'] for ep_info in self.ep_info_buffer])
                if len(self.specific_reward_info_buffer) > 0 :
                    wandb_dict["mimic_qpos_reward"] = safe_mean([specific_reward_info['mimic_qpos_reward'] for specific_reward_info in self.specific_reward_info_buffer])
                    wandb_dict["mimic_qvel_reward"] = safe_mean([specific_reward_info['mimic_qvel_reward'] for specific_reward_info in self.specific_reward_info_buffer])
                    wandb_dict["mimic_body_orientation_reward"] = safe_mean([specific_reward_info['mimic_body_orientation_reward'] for specific_reward_info in self.specific_reward_info_buffer])
                    wandb_dict["mimic_body_reward"] = safe_mean([specific_reward_info['mimic_body_reward'] for specific_reward_info in self.specific_reward_info_buffer])
                    wandb_dict["mimic_body_vel_reward"] = safe_mean([specific_reward_info['mimic_body_vel_reward'] for specific_reward_info in self.specific_reward_info_buffer])
                    wandb_dict["mimic_contact_reward"] = safe_mean([specific_reward_info['mimic_contact_reward'] for specific_reward_info in self.specific_reward_info_buffer])
                wandb_dict["time_elapsed"] = t_start - self.t_first_start
                wandb_dict["train/entropy_loss"] = np.mean(entropy_losses)
                wandb_dict["train/policy_gradient_loss"] = np.mean(pg_losses)
                wandb_dict["train/value_loss"] = np.mean(value_losses)
                wandb_dict["train/approx_kl"] = np.mean(approx_kl_divs)
                wandb_dict["train/clip_fraction"] = np.mean(clip_fractions)
                wandb_dict["train/loss"] = loss.item()
                wandb_dict["train/explained_variance"]= explained_var
                wandb.log(wandb_dict)
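
For reference, the clipped surrogate computed inside the loop above, isolated as a standalone function (a sketch; torch is the only dependency, and the tensors are assumed to be flattened and on the same device):

import torch as th


def ppo_clipped_policy_loss(advantages: th.Tensor,
                            log_prob: th.Tensor,
                            old_log_prob: th.Tensor,
                            clip_range: float) -> th.Tensor:
    """Clipped surrogate objective, negated so it can be minimized.

    Mirrors the policy_loss computation in the train() example above.
    """
    # Probability ratio between the new and the old policy.
    ratio = th.exp(log_prob - old_log_prob)
    policy_loss_1 = advantages * ratio
    policy_loss_2 = advantages * th.clamp(ratio, 1 - clip_range, 1 + clip_range)
    return -th.min(policy_loss_1, policy_loss_2).mean()
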
Example #12
    def learn(
        self,
        total_timesteps: int,
        callback: MaybeCallback = None,
        log_interval: int = 1,
        eval_env: Optional[GymEnv] = None,
        eval_freq: int = -1,
        n_eval_episodes: int = 5,
        tb_log_name: str = "OnPolicyAlgorithm",
        eval_log_path: Optional[str] = None,
        reset_num_timesteps: bool = True,
    ) -> "OnPolicyAlgorithm":
        iteration = 0

        total_timesteps, callback = self._setup_learn(
            total_timesteps, eval_env, callback, eval_freq, n_eval_episodes,
            eval_log_path, reset_num_timesteps, tb_log_name)

        callback.on_training_start(locals(), globals())

        from stable_baselines3.common.utils import obs_as_tensor, safe_mean
        import time
        while self.num_timesteps < total_timesteps:

            continue_training = self.collect_rollouts(
                self.env,
                callback,
                self.rollout_buffer,
                n_rollout_steps=self.n_steps)

            if continue_training is False:
                break

            iteration += 1
            self._update_current_progress_remaining(self.num_timesteps,
                                                    total_timesteps)

            # Display training infos
            if log_interval is not None and iteration % log_interval == 0:
                fps = int((self.num_timesteps - self._num_timesteps_at_start) /
                          (time.time() - self.start_time))
                self.logger.record("time/iterations",
                                   iteration,
                                   exclude="tensorboard")
                if len(self.ep_info_buffer) > 0 and len(
                        self.ep_info_buffer[0]) > 0:
                    self.logger.record(
                        "rollout/ep_rew_mean",
                        safe_mean(
                            [ep_info["r"] for ep_info in self.ep_info_buffer]))
                    self.logger.record(
                        "rollout/ep_len_mean",
                        safe_mean(
                            [ep_info["l"] for ep_info in self.ep_info_buffer]))
                self.logger.record("time/fps", fps)
                self.logger.record("time/time_elapsed",
                                   int(time.time() - self.start_time),
                                   exclude="tensorboard")
                self.logger.record("time/total_timesteps",
                                   self.num_timesteps,
                                   exclude="tensorboard")
                # [RLA] set timesteps
                time_step_holder.set_time(self.num_timesteps)
                self.logger.dump()

            self.train()

        callback.on_training_end()

        return self
Example #13
    def learn(
        self,
        total_timesteps: int,
        callback: MaybeCallback = None,
        log_interval: int = 1,
        eval_env: Optional[GymEnv] = None,
        eval_freq: int = -1,
        n_eval_episodes: int = 5,
        tb_log_name: str = "OnPolicyAlgorithm",
        eval_log_path: Optional[str] = None,
        reset_num_timesteps: bool = True,
    ) -> "OnPolicyAlgorithm":
        iteration = 0

        total_timesteps, callback = self._setup_learn(
            total_timesteps, eval_env, callback, eval_freq, n_eval_episodes,
            eval_log_path, reset_num_timesteps, tb_log_name)

        callback.on_training_start(locals(), globals())

        while self.num_timesteps < total_timesteps:

            for partner_idx in range(self.policy.num_partners):
                try:
                    self.env.envs[0].switch_to_env(partner_idx)
                except:
                    pass
                continue_training = self.collect_rollouts(
                    self.env,
                    callback,
                    self.rollout_buffer[partner_idx],
                    n_rollout_steps=self.n_steps,
                    partner_idx=partner_idx)
            #continue_training = self.collect_rollouts(self.env, callback, self.rollout_buffer[partner_idx], n_rollout_steps=self.n_steps)

            if continue_training is False:
                break

            iteration += 1
            self._update_current_progress_remaining(self.num_timesteps,
                                                    total_timesteps)

            # Display training infos
            if log_interval is not None and iteration % log_interval == 0:
                fps = int(self.num_timesteps / (time.time() - self.start_time))
                logger.record("time/iterations",
                              iteration,
                              exclude="tensorboard")
                if len(self.ep_info_buffer) > 0 and len(
                        self.ep_info_buffer[0]) > 0:
                    logger.record(
                        "rollout/ep_rew_mean",
                        safe_mean(
                            [ep_info["r"] for ep_info in self.ep_info_buffer]))
                    logger.record(
                        "rollout/ep_len_mean",
                        safe_mean(
                            [ep_info["l"] for ep_info in self.ep_info_buffer]))
                logger.record("time/fps", fps)
                logger.record("time/time_elapsed",
                              int(time.time() - self.start_time),
                              exclude="tensorboard")
                logger.record("time/total_timesteps",
                              self.num_timesteps,
                              exclude="tensorboard")
                logger.dump(step=self.num_timesteps)

            self.train()

        callback.on_training_end()

        return self
Example #14
def main():
    # check for uncommitted changes
    commit_check()

    ##setup args
    parser = argparse.ArgumentParser(
        description='Reward learning from preferences')

    parser.add_argument('--env_type', type=str, default='atari')
    parser.add_argument('--env_name', type=str, default='BeamRider')
    parser.add_argument(
        '--distribution_mode',
        type=str,
        default='easy',
        choices=["easy", "hard", "exploration", "memory", "extreme"])
    parser.add_argument('--num_levels', type=int, default=1)
    parser.add_argument('--start_level', type=int, default=0)
    parser.add_argument('--log_dir', type=str, default='LOGS')
    parser.add_argument('--log_prefix', type=str, default='')
    parser.add_argument('--log_name', type=str, default='')
    parser.add_argument('--cpu_buffer',
                        dest='on_cuda',
                        action='store_false',
                        help='whether to store the buffer on CPU or on GPU; '
                             'by default it requires up to 8GB of memory on the GPU')

    parser.add_argument('--resume_training', action='store_true')

    parser.add_argument('--init_buffer_size', type=int, default=500)
    parser.add_argument(
        '--init_train_size',
        type=int,
        default=10**5,
        help=
        'number of labels to process during initial training of the reward model'
    )
    parser.add_argument(
        '--clip_size',
        type=int,
        default=25,
        help='number of frames in each clip generated for comparison')
    parser.add_argument('--total_timesteps',
                        type=int,
                        default=5 * 10**7,
                        help='total number of RL timesteps to be taken')
    parser.add_argument(
        '--n_labels',
        type=int,
        default=6800,
        help="total number of labels to collect throughout the training")
    parser.add_argument('--steps_per_iter',
                        type=int,
                        default=5 * 10**4,
                        help="number of RL steps taken on each iteration")
    parser.add_argument(
        '--pairs_per_iter',
        type=int,
        default=5 * 10**3,
        help='number of labels the reward model is trained on each iteration')
    parser.add_argument('--pairs_in_batch',
                        type=int,
                        default=16,
                        help='batch size for reward model training')
    parser.add_argument('--l2',
                        type=float,
                        default=0.0001,
                        help='initial l2 regularization for a reward model')
    parser.add_argument('--adaptive', dest='adaptive', action='store_true')
    parser.add_argument('--no-adaptive', dest='adaptive', action='store_false')
    parser.set_defaults(adaptive=True)
    parser.add_argument('--dropout', type=float, default=0.5)

    args = parser.parse_args()

    args.ppo_kwargs = dict(verbose=1,
                           n_steps=256,
                           noptepochs=3,
                           nminibatches=8)

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print(f'\n Using {device} for training')

    run_dir, monitor_dir, video_dir = setup_logging(args)
    global LOG_TIME
    LOG_TIME = os.path.join(run_dir, "TIME_LOG.txt")

    ### Initializing objects ###

    # If resuming some earlier training run - load stored objects
    if args.resume_training:
        args = load_args(args)
        reward_model, policy, data_buffer, i_num = load_state(run_dir)

    atari_name = args.env_name + "NoFrameskip-v4"
    venv_fn = lambda: make_atari_continuous(atari_name, n_envs=16)
    annotation_env = make_atari_continuous(atari_name, n_envs=16)
    annotation_env.reset()
    iter_time = 0

    # In case this is a fresh experiment - initialize fresh objects
    if not args.resume_training:
        store_args(args, run_dir)
        policy = A2C('CnnPolicy',
                     venv_fn(),
                     verbose=1,
                     tensorboard_log="TB_LOGS",
                     ent_coef=0.01,
                     learning_rate=0.0007,
                     policy_kwargs={
                         "optimizer_class": torch.optim.Adam,
                         "optimizer_kwargs": {
                             "eps": 1e-5,
                             "betas": [.99, .999]
                         }
                     })
        reward_model = RewardNet(l2=args.l2,
                                 dropout=args.dropout,
                                 env_type=args.env_type)
        data_buffer = AnnotationBuffer()

    # initializing RM optimizer
    rm_optimizer = optim.Adam(reward_model.parameters(),
                              lr=0.0003,
                              weight_decay=reward_model.l2)

    #creating the environment with reward replaced by the prediction from reward_model
    reward_model.to(device)
    proxy_reward_function = lambda x: reward_model(
        torch.from_numpy(x).float().to(device))
    proxy_reward_venv = Vec_reward_wrapper(venv_fn(), proxy_reward_function)

    # resetting the environment to avoid raising error from reset_num_timesteps
    proxy_reward_venv.reset()
    policy.set_env(proxy_reward_venv)

    # eval_env_fn = lambda: make_atari_default(atari_name, n_envs=16, seed = 0, vec_env_cls = SubprocVecEnv)
    # video_env_fn= lambda: make_atari_default(atari_name, vec_env_cls = DummyVecEnv)

    # in case this is a fresh run, collect init_buffer_size samples to AnnotationBuffer
    # and train the reward model on init_train_size number of samples with replacement
    if not args.resume_training:

        t_start = time.time()
        print(f'================== Initial iter ====================')

        annotations = collect_annotations(annotation_env, policy,
                                          args.init_buffer_size,
                                          args.clip_size, args.on_cuda)
        data_buffer.add(annotations)

        print(f'Buffer size = {data_buffer.current_size}')

        reward_model, rm_optimizer, rm_train_stats = train_reward(
            reward_model, rm_optimizer, args.adaptive, data_buffer,
            args.init_train_size, args.pairs_in_batch)
        # this callback adds values to TensorBoard logs for easier plotting
        reward_model.eval()
        callback = TensorboardCallback(
            (data_buffer.total_labels, data_buffer.loss_lb, iter_time,
             rm_train_stats))
        policy = train_policy(policy, args.steps_per_iter, 0, args.log_name,
                              callback)

        save_state(run_dir, 0, reward_model, policy, data_buffer)

        true_performance = safe_mean(
            [ep_info["r"] for ep_info in policy.ep_info_buffer])

        t_finish = time.time()
        iter_time = t_finish - t_start
        log_iter(run_dir, args.steps_per_iter, data_buffer, true_performance,
                 0, rm_train_stats, iter_time)

        print(
            f'Iteration took {time.gmtime(t_finish - t_start).tm_min} min {time.gmtime(t_finish - t_start).tm_sec} sec'
        )

        # i_num is the number of training iterations taken
        i_num = 1

    num_iters = int(args.total_timesteps / args.steps_per_iter)
    # calculating the initial number of pairs to collect
    num_pairs = init_num_pairs = round(
        (args.n_labels - args.init_buffer_size) / 0.239 / num_iters)

    print('init_num_pairs = {}'.format(init_num_pairs))
    for i in range(i_num, num_iters):
        t_start = time.time()
        print(f'================== iter : {i} ====================')

        rl_steps = i * args.steps_per_iter
        # decaying the number of pairs to collect
        num_pairs = round(init_num_pairs / (rl_steps /
                                            (args.total_timesteps / 10) + 1))

        annotations = collect_annotations(annotation_env, policy, num_pairs,
                                          args.clip_size, args.on_cuda)
        data_buffer.add(annotations)

        print(f'Buffer size = {data_buffer.current_size}')

        reward_model, rm_optimizer, rm_train_stats = train_reward(
            reward_model, rm_optimizer, args.adaptive, data_buffer,
            args.pairs_per_iter, args.pairs_in_batch)

        # TODO: prettify passing data to callback
        callback = TensorboardCallback(
            (data_buffer.total_labels, data_buffer.loss_lb, iter_time,
             rm_train_stats))
        policy = train_policy(policy, args.steps_per_iter, rl_steps,
                              args.log_name, callback)

        # storing the state every 1M steps
        # this assumes that steps_per_iter divides 10**6
        if rl_steps % (10**6) == 0:
            save_state(run_dir, i, reward_model, policy, data_buffer)

        # record_video(policy, video_env_fn(), video_dir, 4000, f"{i}_ITER00_{args.env_name}")
        # true_performance = eval_policy(venv_fn(), policy, n_eval_episodes=50)
        # proxy_performance = eval_policy(test_env, policy, n_eval_episodes=50)

        true_performance = safe_mean(
            [ep_info["r"] for ep_info in policy.ep_info_buffer])

        # print(f'True policy performance = {true_performance}')
        # print(f'Proxy policy performance = {proxy_performance}')

        t_finish = time.time()
        iter_time = t_finish - t_start
        log_iter(run_dir, rl_steps, data_buffer, true_performance, 0,
                 rm_train_stats, iter_time)

        if LOG_TIME:
            with open(LOG_TIME, 'a') as f:
                f.write(
                    f'Iteration took {time.gmtime(iter_time).tm_min} min {time.gmtime(iter_time).tm_sec} sec\n'
                )
                f.write(
                    f'================== iter : {i+1} ====================\n')
        else:
            print(
                f'Iteration took {time.gmtime(iter_time).tm_min} min {time.gmtime(iter_time).tm_sec} sec'
            )
Example #15
def main():
    def env_contr():
        return gym.make("CartPole-v0")  #
        # env = multiwalker_v0.env()
        # env = pad_observations(env)
        # env = pad_action_space(env)
        # markov_env = aec_to_markov(env)
        # venv = MarkovVectorEnv(markov_env)
        # return venv

    n_envs = 6
    # def nest_env_const():
    #     cat = ConcatVecEnv([env_contr]*envs_per_proc)
    #     return cat
    example_env = env_contr()
    num_envs = n_envs * 1  #example_env.num_envs
    #cat = ProcConcatVec([nest_env_const]*n_procs,example_env.observation_space, example_env.action_space, num_envs)
    cat = MakeCPUAsyncConstructor(0)([env_contr] * n_envs,
                                     example_env.observation_space,
                                     example_env.action_space)  #, num_envs)
    cat = VecEnvWrapper(cat)
    env = cat
    policy = "MlpPolicy"
    logger = make_logger("log")
    stable_baselines3.common.logger.Logger.CURRENT = logger
    a2c = PPO(policy, cat, n_steps=4, batch_size=6, n_epochs=3)
    print(type(a2c.env))
    #a2c.learn(1000000)

    total_timesteps, callback = a2c._setup_learn(10000,
                                                 None,
                                                 None,
                                                 None,
                                                 n_eval_episodes=5,
                                                 reset_num_timesteps=None,
                                                 tb_log_name="PPo")

    #total_timesteps = 100
    iteration = 0
    log_interval = 1
    for i in range(total_timesteps):
        continue_training = a2c.collect_rollouts(env,
                                                 callback,
                                                 a2c.rollout_buffer,
                                                 n_rollout_steps=a2c.n_steps)
        print(a2c.ep_info_buffer)
        if continue_training is False:
            break

        iteration += 1
        a2c._update_current_progress_remaining(a2c.num_timesteps,
                                               total_timesteps)

        # Display training infos
        if log_interval is not None and iteration % log_interval == 0:
            fps = int(a2c.num_timesteps / (time.time() - a2c.start_time))
            logger.record("time/iterations", iteration, exclude="tensorboard")
            print(a2c.ep_info_buffer)
            if len(a2c.ep_info_buffer) > 0 and len(a2c.ep_info_buffer[0]) > 0:
                logger.record(
                    "rollout/ep_rew_mean",
                    safe_mean([ep_info["r"]
                               for ep_info in a2c.ep_info_buffer]))
                logger.record(
                    "rollout/ep_len_mean",
                    safe_mean([ep_info["l"]
                               for ep_info in a2c.ep_info_buffer]))
            logger.record("time/fps", fps)
            logger.record("time/time_elapsed",
                          int(time.time() - a2c.start_time),
                          exclude="tensorboard")
            logger.record("time/total_timesteps",
                          a2c.num_timesteps,
                          exclude="tensorboard")
            logger.dump(step=a2c.num_timesteps)

        a2c.train()
Exemple #16
0
    def learn(
        self,
        total_timesteps: int,
        callback: MaybeCallback = None,
        log_interval: int = 1,
        eval_env: Optional[GymEnv] = None,
        eval_freq: int = -1,
        n_eval_episodes: int = 5,
        tb_log_name: str = "OnPolicyAlgorithm",
        eval_log_path: Optional[str] = None,
        reset_num_timesteps: bool = True,
        param_noise: bool = False,
        sigma: float = 0.1,
    ) -> "OnPolicyAlgorithm":
        iteration = 0

        total_timesteps, callback = self._setup_learn(
            total_timesteps, eval_env, callback, eval_freq, n_eval_episodes,
            eval_log_path, reset_num_timesteps, tb_log_name)

        callback.on_training_start(locals(), globals())

        while self.num_timesteps < total_timesteps:

            # during a rollout we collect batches of states and rewards
            continue_training = self.collect_rollouts(
                self.env,
                callback,
                self.rollout_buffer,
                n_rollout_steps=self.n_steps,
                param_noise=param_noise,
                sigma=sigma)

            if continue_training is False:
                break

            iteration += 1
            self._update_current_progress_remaining(self.num_timesteps,
                                                    total_timesteps)

            # Display training infos
            if log_interval is not None and iteration % log_interval == 0:
                fps = int(self.num_timesteps / (time.time() - self.start_time))
                logger.record("time/iterations",
                              iteration,
                              exclude="tensorboard")
                if len(self.ep_info_buffer) > 0 and len(
                        self.ep_info_buffer[0]) > 0:
                    logger.record(
                        "rollout/ep_rew_mean",
                        safe_mean(
                            [ep_info["r"] for ep_info in self.ep_info_buffer]))
                    logger.record(
                        "rollout/ep_len_mean",
                        safe_mean(
                            [ep_info["l"] for ep_info in self.ep_info_buffer]))
                logger.record("time/fps", fps)
                logger.record("time/time_elapsed",
                              int(time.time() - self.start_time),
                              exclude="tensorboard")
                logger.record("time/total_timesteps",
                              self.num_timesteps,
                              exclude="tensorboard")
                logger.dump(step=self.num_timesteps)

            # gradient-descent updates are performed during training
            self.train(param_noise, sigma)

            if param_noise:
                sigma = self.update_sigma(sigma)
                # print("current_sigma")
                # print(sigma)

        callback.on_training_end()

        return self
Exemple #17
0
    def learn(
        self,
        total_timesteps: int,
        callback: MaybeCallback = None,
        log_interval: int = 1,
        eval_env: Optional[GymEnv] = None,
        eval_freq: int = -1,
        n_eval_episodes: int = 5,
        tb_log_name: str = "OnPolicyAlgorithm",
        eval_log_path: Optional[str] = None,
        reset_num_timesteps: bool = True,
        parameter_noise: bool = False,
    ) -> "OnPolicyAlgorithm":
        iteration = 0

        total_timesteps, callback = self._setup_learn(
            total_timesteps, eval_env, callback, eval_freq, n_eval_episodes,
            eval_log_path, reset_num_timesteps, tb_log_name)

        callback.on_training_start(locals(), globals())

        # initializing the noise std used for parameter-space exploration
        current_sigma = 1.0
        while self.num_timesteps < total_timesteps:

            # roll out with the adaptively scaled noise std
            continue_training = self.collect_rollouts(
                self.env,
                callback,
                self.rollout_buffer,
                n_rollout_steps=self.n_steps,
                parameter_noise=parameter_noise,
                sigma=current_sigma)

            if continue_training is False:
                break

            iteration += 1
            self._update_current_progress_remaining(self.num_timesteps,
                                                    total_timesteps)

            # Display training infos
            if log_interval is not None and iteration % log_interval == 0:
                fps = int(self.num_timesteps / (time.time() - self.start_time))
                logger.record("time/iterations",
                              iteration,
                              exclude="tensorboard")
                if len(self.ep_info_buffer) > 0 and len(
                        self.ep_info_buffer[0]) > 0:
                    logger.record(
                        "rollout/ep_rew_mean",
                        safe_mean(
                            [ep_info["r"] for ep_info in self.ep_info_buffer]))
                    logger.record(
                        "rollout/ep_len_mean",
                        safe_mean(
                            [ep_info["l"] for ep_info in self.ep_info_buffer]))
                logger.record("time/fps", fps)
                logger.record("time/time_elapsed",
                              int(time.time() - self.start_time),
                              exclude="tensorboard")
                logger.record("time/total_timesteps",
                              self.num_timesteps,
                              exclude="tensorboard")
                logger.dump(step=self.num_timesteps)

            self.train()

            if parameter_noise:

                states = self.rollout_buffer.observations
                states = th.tensor(states)

                actions_unnoisy, values_unnoisy, log_prob_unnoisy = self.policy(
                    states, parameter_noise=False)
                actions_noisy, values_noisy, log_prob_noisy = self.policy(
                    states, parameter_noise=True, sigma=current_sigma)

                distance = th.sum((actions_unnoisy - actions_noisy)**2)**0.5

                distance_threshold = 1
                sigma_scalefactor = 1.01
                if distance > distance_threshold:
                    current_sigma /= sigma_scalefactor
                else:
                    current_sigma *= sigma_scalefactor

        callback.on_training_end()

        return self
    def learn(
        self,
        total_timesteps: int,
        callback: MaybeCallback = None,
        log_interval: int = 1,
        eval_env: Optional[GymEnv] = None,
        eval_freq: int = -1,
        n_eval_episodes: int = 5,
        tb_log_name: str = "OnPolicyAlgorithm",
        eval_log_path: Optional[str] = None,
        reset_num_timesteps: bool = True,
    ) -> "OnPolicyAlgorithm":
        iteration = 0

        total_timesteps, callback = self._setup_learn(
            total_timesteps, eval_env, callback, eval_freq, n_eval_episodes,
            eval_log_path, reset_num_timesteps, tb_log_name)

        callback.on_training_start(locals(), globals())

        while self.num_timesteps < total_timesteps:

            rollout = self.collect_rollouts(
                self.env,
                n_episodes=-1,
                n_steps=1,
                action_noise=self.action_noise,
                callback=callback,
                learning_starts=self.learning_starts,
                replay_buffer=self.replay_buffer,
                log_interval=log_interval,
            )

            if rollout.continue_training is False:
                break

            iteration += 1
            self._update_current_progress_remaining(self.num_timesteps,
                                                    total_timesteps)

            # Display training infos
            if log_interval is not None and iteration % log_interval == 0:
                fps = int(self.num_timesteps / (time.time() - self.start_time))
                logger.record("time/iterations",
                              iteration,
                              exclude="tensorboard")
                if len(self.ep_info_buffer) > 0 and len(
                        self.ep_info_buffer[0]) > 0:
                    logger.record(
                        "rollout/ep_rew_mean",
                        safe_mean(
                            [ep_info["r"] for ep_info in self.ep_info_buffer]))
                    logger.record(
                        "rollout/ep_len_mean",
                        safe_mean(
                            [ep_info["l"] for ep_info in self.ep_info_buffer]))
                logger.record("time/fps", fps)
                logger.record("time/time_elapsed",
                              int(time.time() - self.start_time),
                              exclude="tensorboard")
                logger.record("time/total_timesteps",
                              self.num_timesteps,
                              exclude="tensorboard")
                logger.dump(step=self.num_timesteps)

            if self.num_timesteps > 0 and self.num_timesteps > self.learning_starts:
                # one gradient step per call; since ``collect_rollouts`` above
                # is invoked with ``n_steps=1``, this amounts to one gradient
                # step per environment step once learning has started
                self.train(gradient_steps=1, batch_size=self.batch_size)

        callback.on_training_end()

        return self
    def collect_rollouts(
            self,  # noqa: C901
            env: VecEnv,
            # Type hint as string to avoid circular import
            callback: 'BaseCallback',
            n_episodes: int = 1,
            n_steps: int = -1,
            action_noise: Optional[ActionNoise] = None,
            learning_starts: int = 0,
            replay_buffer: Optional[ReplayBuffer] = None,
            log_interval: Optional[int] = None) -> RolloutReturn:
        """
        Collect experiences and store them into a ReplayBuffer.

        :param env: (VecEnv) The training environment
        :param callback: (BaseCallback) Callback that will be called at each step
            (and at the beginning and end of the rollout)
        :param n_episodes: (int) Number of episodes to use to collect rollout data
            You can also specify a ``n_steps`` instead
        :param n_steps: (int) Number of steps to use to collect rollout data
            You can also specify a ``n_episodes`` instead.
        :param action_noise: (Optional[ActionNoise]) Action noise that will be used for exploration
            Required for deterministic policy (e.g. TD3). This can also be used
            in addition to the stochastic policy for SAC.
        :param learning_starts: (int) Number of steps before learning for the warm-up phase.
        :param replay_buffer: (ReplayBuffer)
        :param log_interval: (int) Log data every ``log_interval`` episodes
        :return: (RolloutReturn)
        """
        episode_rewards, total_timesteps = [], []
        total_steps, total_episodes = 0, 0

        assert isinstance(env, VecEnv), "You must pass a VecEnv"
        assert env.num_envs == 1, "OffPolicyAlgorithm only supports a single environment"

        if n_episodes > 0 and n_steps > 0:
            # Note we are referring to the constructor arguments
            # that are named `train_freq` and `n_episodes_rollout`
            # but correspond to `n_steps` and `n_episodes` here
            warnings.warn(
                "You passed a positive value for `train_freq` and `n_episodes_rollout`."
                "Please make sure this is intended. "
                "The agent will collect data by stepping in the environment "
                "until both conditions are true: "
                "`number of steps in the env` >= `train_freq` and "
                "`number of episodes` > `n_episodes_rollout`")

        if self.use_sde:
            self.actor.reset_noise()

        callback.on_rollout_start()
        continue_training = True

        while total_steps < n_steps or total_episodes < n_episodes:
            done = False
            episode_reward, episode_timesteps = 0.0, 0

            while not done:

                if self.use_sde and self.sde_sample_freq > 0 and total_steps % self.sde_sample_freq == 0:
                    # Sample a new noise matrix
                    self.actor.reset_noise()

                # Select action randomly or according to policy
                if self.num_timesteps < learning_starts and not (
                        self.use_sde and self.use_sde_at_warmup):
                    # Warmup phase
                    unscaled_action = np.array([self.action_space.sample()])
                else:
                    # Note: we assume that the policy uses tanh to scale the action
                    # We use non-deterministic action in the case of SAC, for TD3, it does not matter
                    unscaled_action, _ = self.predict(self._last_obs,
                                                      deterministic=False)

                # Rescale the action from [low, high] to [-1, 1]
                if isinstance(self.action_space, gym.spaces.Box):
                    scaled_action = self.policy.scale_action(unscaled_action)

                    # Add noise to the action (improve exploration)
                    if action_noise is not None:
                        # NOTE: in the original implementation of TD3, the noise was applied to the unscaled action
                        # Update(October 2019): Not anymore
                        scaled_action = np.clip(scaled_action + action_noise(),
                                                -1, 1)

                    # We store the scaled action in the buffer
                    buffer_action = scaled_action
                    action = self.policy.unscale_action(scaled_action)
                else:
                    # Discrete case, no need to normalize or clip
                    buffer_action = unscaled_action
                    action = buffer_action

                # Rescale and perform action
                new_obs, reward, done, infos = env.step(action)

                # Only stop training if return value is False, not when it is None.
                if callback.on_step() is False:
                    return RolloutReturn(0.0,
                                         total_steps,
                                         total_episodes,
                                         continue_training=False)

                episode_reward += reward

                # Retrieve reward and episode length if using Monitor wrapper
                self._update_info_buffer(infos, done)

                # Store data in replay buffer
                if replay_buffer is not None:
                    # Store only the unnormalized version
                    if self._vec_normalize_env is not None:
                        new_obs_ = self._vec_normalize_env.get_original_obs()
                        reward_ = self._vec_normalize_env.get_original_reward()
                    else:
                        # Avoid changing the original ones
                        self._last_original_obs, new_obs_, reward_ = self._last_obs, new_obs, reward

                    replay_buffer.add(self._last_original_obs, new_obs_,
                                      buffer_action, reward_, done)

                self._last_obs = new_obs
                # Save the unnormalized observation
                if self._vec_normalize_env is not None:
                    self._last_original_obs = new_obs_

                self.num_timesteps += 1
                episode_timesteps += 1
                total_steps += 1
                if 0 < n_steps <= total_steps:
                    break

            if done:
                total_episodes += 1
                self._episode_num += 1
                episode_rewards.append(episode_reward)
                total_timesteps.append(episode_timesteps)

                if action_noise is not None:
                    action_noise.reset()

                # Log training infos
                if log_interval is not None and self._episode_num % log_interval == 0:
                    fps = int(self.num_timesteps /
                              (time.time() - self.start_time))
                    logger.record("time/episodes",
                                  self._episode_num,
                                  exclude="tensorboard")
                    if len(self.ep_info_buffer) > 0 and len(
                            self.ep_info_buffer[0]) > 0:
                        logger.record(
                            'rollout/ep_rew_mean',
                            safe_mean([
                                ep_info['r'] for ep_info in self.ep_info_buffer
                            ]))
                        logger.record(
                            'rollout/ep_len_mean',
                            safe_mean([
                                ep_info['l'] for ep_info in self.ep_info_buffer
                            ]))
                    logger.record("time/fps", fps)
                    logger.record('time/time_elapsed',
                                  int(time.time() - self.start_time),
                                  exclude="tensorboard")
                    logger.record("time/total timesteps",
                                  self.num_timesteps,
                                  exclude="tensorboard")
                    if self.use_sde:
                        logger.record("train/std",
                                      (self.actor.get_std()).mean().item())

                    if len(self.ep_success_buffer) > 0:
                        logger.record('rollout/success rate',
                                      safe_mean(self.ep_success_buffer))
                    # Pass the number of timesteps for tensorboard
                    logger.dump(step=self.num_timesteps)

        mean_reward = np.mean(episode_rewards) if total_episodes > 0 else 0.0

        callback.on_rollout_end()

        return RolloutReturn(mean_reward, total_steps, total_episodes,
                             continue_training)