Code example #1
File: shmem_vec_env.py  Project: MrGoogol/baselines
 def reset(self):
     if self.waiting_step:
         logger.warn('Called reset() while waiting for the step to complete')
         self.step_wait()
     for pipe in self.parent_pipes:
         pipe.send(('reset', None))
     return self._decode_obses([pipe.recv() for pipe in self.parent_pipes])
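reset() above is only the parent half of the pipe protocol: it broadcasts ('reset', None) and then collects one reply per worker. The following is a minimal, hypothetical worker loop written for illustration only; the real ShmemVecEnv worker moves observations through shared memory and uses the pipe mainly for control messages, so treat the names and payloads here as assumptions.
def example_worker(env_fn, pipe):
    # simplified stand-in for the subprocess side of the protocol shown above
    env = env_fn()
    while True:
        cmd, data = pipe.recv()
        if cmd == 'reset':
            pipe.send(env.reset())        # answers the ('reset', None) message sent by reset()
        elif cmd == 'step':
            pipe.send(env.step(data))     # (obs, reward, done, info)
        elif cmd == 'close':
            pipe.close()
            break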
Code example #2
File: tf_util.py  Project: MrGoogol/baselines
def save_state(fname, sess=None):
    from baselines import logger
    logger.warn('save_state method is deprecated, please use save_variables instead')
    sess = sess or get_session()
    dirname = os.path.dirname(fname)
    if any(dirname):
        os.makedirs(dirname, exist_ok=True)
    saver = tf.train.Saver()
    saver.save(tf.get_default_session(), fname)
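Both deprecated helpers point the caller at their replacements in the same module, and the later examples already call tf_util.load_variables. A minimal migration sketch (the checkpoint path is just a placeholder):
from baselines.common import tf_util

# instead of tf_util.save_state(...) / tf_util.load_state(...)
tf_util.save_variables('/tmp/model/checkpoint')   # placeholder path
tf_util.load_variables('/tmp/model/checkpoint')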
Code example #3
File: config.py  Project: MrGoogol/baselines
    def make_env(subrank=None):
        env = gym.make(env_name)
        if subrank is not None and logger.get_dir() is not None:
            try:
                from mpi4py import MPI
                mpi_rank = MPI.COMM_WORLD.Get_rank()
            except ImportError:
                MPI = None
                mpi_rank = 0
                logger.warn('Running with a single MPI process. This should work, but the results may differ from the ones published in Plappert et al.')

            max_episode_steps = env._max_episode_steps
            env = Monitor(env,
                           os.path.join(logger.get_dir(), str(mpi_rank) + '.' + str(subrank)),
                           allow_early_resets=True)
            # hack to re-expose _max_episode_steps (ideally should replace reliance on it downstream)
            env = gym.wrappers.TimeLimit(env, max_episode_steps=max_episode_steps)
        return env
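A short usage sketch, assuming logger.get_dir() returns a valid directory: because the Monitor-wrapped env is wrapped in TimeLimit again, downstream code that reads _max_episode_steps keeps working.
env = make_env(subrank=0)                  # hypothetical call for illustration
assert hasattr(env, '_max_episode_steps')  # re-exposed by the TimeLimit re-wrap above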
Code example #4
def load_state(fname, sess=None):
    from baselines import logger
    logger.warn('load_state method is deprecated, please use load_variables instead')
    sess = sess or get_session()
    saver = tf.train.Saver()
    saver.restore(tf.get_default_session(), fname)
Code example #5
 def render(self):
     logger.warn("Render not defined for %s" % self)
Code example #6
File: tf_util.py  Project: MrGoogol/baselines
def load_state(fname, sess=None):
    from baselines import logger
    logger.warn('load_state method is deprecated, please use load_variables instead')
    sess = sess or get_session()
    saver = tf.train.Saver()
    saver.restore(tf.get_default_session(), fname)
Code example #7
    def run(self, acer_step=None):
        if self.goals is None:
            self.goals, self.goal_info = self.dynamics.get_goal(nb_goal=self.nenv)
            if not self.goal_as_image:
                self.goals = self.goal_to_embedding(self.goal_info)
        # enc_obs = np.split(self.obs, self.nstack, axis=3)  # so now list of obs steps
        enc_obs = np.split(self.env.stackedobs, self.env.nstack, axis=-1)
        mb_obs = np.empty((self.nenv, self.nsteps + 1) + self.obs_shape, dtype=self.obs_dtype)
        mb_act = np.empty((self.nenv, self.nsteps) + self.ac_shape, dtype=self.ac_dtype)
        mb_mus = np.empty((self.nenv, self.nsteps, self.nact), dtype=np.float32)
        mb_dones = np.empty((self.nenv, self.nsteps), dtype=bool)
        mb_masks = np.empty((self.nenv, self.nsteps + 1), dtype=bool)
        mb_ext_rew = np.empty((self.nenv, self.nsteps), dtype=np.float32)
        mb_obs_infos = np.empty((self.nenv, self.nsteps), dtype=object)
        mb_goals = np.empty((self.nenv, self.nsteps + 1) + self.goal_shape, dtype=self.obs_dtype)
        mb_goal_infos = np.empty((self.nenv, self.nsteps), dtype=object)

        # mb_obs, mb_actions, mb_mus, mb_dones, mb_ext_rewards = [], [], [], [], []
        # mb_obs_infos, mb_goals, mb_goal_infos = [], [], []
        reached_step, done_step = np.array([None for _ in range(self.nenv)]), np.array([None for _ in range(self.nenv)])

        episode_infos = np.asarray([{} for _ in range(self.nenv)], dtype=object)
        for step in range(self.nsteps):
            try:
                check_obs(self.obs)
            except ValueError:
                logger.warn("acer_step:{}, runner_step:{}, empty obs".format(acer_step, step))
                raise ValueError
            actions, mus, states = self.model.step(self.obs, S=self.states, M=self.dones, goals=self.goals)
            if self.sample_goal:
                if self.use_random_policy_expl:
                    actions[self.reached_status] = self.simple_random_action(np.sum(self.reached_status))
                    mus[self.reached_status] = self.get_mu_of_random_action()
                else:
                    if np.sum(self.reached_status) > 0:
                        alt_action, alt_mu, alt_states = self.alt_model.step(self.obs, S=self.states, M=self.dones, goals=self.goals)
                        actions[self.reached_status] = alt_action[self.reached_status]
                        mus[self.reached_status] = alt_mu[self.reached_status]

            mb_obs[:, step] = deepcopy(self.obs)
            mb_act[:, step] = actions
            mb_mus[:, step, :] = mus
            mb_masks[:, step] = deepcopy(self.dones)

            obs, rewards, dones, infos = self.env.step(actions)
            try:
                check_infos(infos)
            except ValueError:
                logger.warn("warning!wrong infos!program continues anyway")
                logger.info("infos:{}, dones:{}, acer_step:{}".format(infos, dones, acer_step))
                logger.info("please debug it in runner_data/data.pkl")
                self.recorder.store(infos)
                self.recorder.dump()
            for info in infos:
                info.update({"source": self.name})

            enc_obs.append(obs[..., -self.nc:])
            mb_dones[:, step] = dones
            mb_ext_rew[:, step] = rewards
            self.episode_reward_to_go[self.reached_status] += rewards[self.reached_status]
            mb_obs_infos[:, step] = np.asarray(infos, dtype=object)
            mb_goals[:, step] = deepcopy(self.goals)
            mb_goal_infos[:, step] = deepcopy(self.goal_info)
            self.episode_step += 1
            # state information for stateful models such as LSTMs
            self.states = states
            self.dones = dones
            self.obs = obs

            # check reached
            if self.sample_goal:
                for env_idx in range(self.nenv):
                    if not self.reached_status[env_idx]:
                        if self.dist_type == "l1":
                            self.reached_status[env_idx] = self.check_goal_reached_v2(infos[env_idx],
                                                                                      self.goal_info[env_idx])
                        else:
                            raise NotImplementedError("I do not know how to compute goal_latent")
                        if self.reached_status[env_idx]:
                            reached_step[env_idx] = step
                            self.episode_reached_step[env_idx] = deepcopy(self.episode_step[env_idx])

            # check done
            done_step[self.dones] = step

            # revise goal
            if not self.sample_goal:
                for env_idx in range(self.nenv):
                    if self.dones[env_idx]:
                        # (- - done(t)) -> (done done, done(t))
                        start, end = 0, step + 1
                        if self.goal_as_image:
                            mb_goals[env_idx, start:end] = mb_obs[env_idx, step] 
                        else:
                            mb_goals[env_idx, start:end] = self.goal_to_embedding(infos[env_idx])
                        mb_goal_infos[env_idx, start:end] = infos[env_idx]
                    elif step == self.nsteps - 1:
                        if done_step[env_idx] is None:
                            # (- - t) -> (t, t, t)
                            start = 0
                        else:
                            # (- - done - - t) -> (- - - t, t, t)
                            start = done_step[env_idx] + 1
                        end = step + 1
                        if end == start:
                            continue
                        if self.goal_as_image:
                            mb_goals[env_idx, start:end] = mb_obs[env_idx, step]
                        else:
                            mb_goals[env_idx, start:end] = self.goal_to_embedding(infos[env_idx])
                        mb_goal_infos[env_idx, start:end] = infos[env_idx]
            else:
                for env_idx in range(self.nenv):
                    if step != self.nsteps - 1:
                        # dones is an instantaneous flag, while reached_status persists across steps
                        if self.dones[env_idx] and self.reached_status[env_idx]:
                            if reached_step[env_idx] is None:
                                # reach|[- - done] -> [done, done, done]
                                start, end = 0, step + 1
                                if self.goal_as_image:
                                    mb_goals[env_idx, start:end] = mb_obs[env_idx, step]
                                else:
                                    mb_goals[env_idx, start:end] = self.goal_to_embedding(infos[env_idx])
                                mb_goal_infos[env_idx, start:end] = infos[env_idx]
                            else:
                                # [- - reach(done)] -> [ - - -]  if reached_step[env_idx] == step
                                # [- - reach - - done] -> [- - - done done done]
                                start, end = reached_step[env_idx] + 1, step + 1
                                if end == start:
                                    continue
                                if self.goal_as_image:
                                    mb_goals[env_idx, start:end] = mb_obs[env_idx, step]
                                else:
                                    mb_goals[env_idx, start:end] = self.goal_to_embedding(infos[env_idx])
                                mb_goal_infos[env_idx, start:end] = infos[env_idx]
                        elif not self.dones[env_idx] and self.reached_status[env_idx]:
                            # reached|[ - - -]  if reached_step[env_idx] is None:
                            # [- - reached - -] if reached_step[env_idx] is not None
                            pass
                        else:
                            # [- - - done] if self.dones[env_idx] and not self.reached_status[env_idx]
                            # [- - - - -] if not self.dones[env_idx] and not self.reached_status[env_idx]
                            pass
                    else:
                        if self.dones[env_idx] and self.reached_status[env_idx]:
                            if reached_step[env_idx] is None:
                                # reach|[- - done(t)] -> [done, done, done(t)]
                                start, end = 0, step + 1
                                if self.goal_as_image:
                                    mb_goals[env_idx, start:end] = mb_obs[env_idx, step]
                                else:
                                    mb_goals[env_idx, start:end] = self.goal_to_embedding(infos[env_idx])
                                mb_goal_infos[env_idx, start:end] = infos[env_idx]
                            else:
                                # [- - reach(done)(t)] -> [- - -]
                                # [- - reach - - done(t)] -> [- - - done done done(t)]
                                start, end = reached_step[env_idx] + 1, step + 1
                                if end == start:
                                    continue
                                if self.goal_as_image:
                                    mb_goals[env_idx, start:end] = mb_obs[env_idx, step]
                                else:
                                    mb_goals[env_idx, start:end] = self.goal_to_embedding(infos[env_idx])
                                mb_goal_infos[env_idx, start:end] = infos[env_idx]
                        elif not self.dones[env_idx] and self.reached_status[env_idx]:
                            if reached_step[env_idx] is None:
                                # reached|[ - - t]  -> reached|[t t t]
                                start, end = 0, step + 1
                            else:
                                # reached[- - r - -] -> reached|[- - - t t]
                                start, end = reached_step[env_idx] + 1, step + 1
                            if end == start:
                                continue
                            if self.goal_as_image:
                                mb_goals[env_idx, start:end] = mb_obs[env_idx, step]
                            else:
                                mb_goals[env_idx, start:end] = self.goal_to_embedding(infos[env_idx])
                        else:
                            # [- - - done(t)]  if self.dones[env_idx] and not self.reached_status[env_idx]
                            # [- - - - (t)] if not self.dones[env_idx] and not self.reached_status[env_idx]
                            pass
            # summary
            for env_idx in range(self.nenv):
                info = infos[env_idx]
                if self.dones[env_idx]:
                    assert info.get("episode")
                    if info.get("episode"):
                        episode_infos[env_idx]["episode"] = info.get("episode")
                    if not self.sample_goal:
                        episode_infos[env_idx]["reached_info"] = dict(source=self.name,
                                                                      x_pos=infos[env_idx]["x_pos"],
                                                                      y_pos=infos[env_idx]["y_pos"])
                    else:
                        if self.reached_status[env_idx]:
                            reached = 1.0
                            time_ratio = self.episode_reached_step[env_idx] / self.episode_step[env_idx]
                            achieved_pos = {"x_pos": infos[env_idx]["x_pos"], "y_pos": infos[env_idx]["y_pos"]}
                            mem = dict(env=env_idx, is_succ=True, goal=self.goal_info[env_idx], final_pos=achieved_pos,
                                       timestep=acer_step, episode=self.episode[env_idx], step=self.episode_step[env_idx])
                            self.recorder.store(mem)
                            self.log(mem)
                            abs_dist = 10
                        else:
                            reached = 0.0
                            time_ratio = 1.0
                            achieved_pos = {"x_pos": infos[env_idx]["x_pos"], "y_pos": infos[env_idx]["y_pos"]}
                            mem = dict(env=env_idx, is_succ=False, goal=self.goal_info[env_idx], final_pos=achieved_pos,
                                       timestep=acer_step, episode=self.episode[env_idx], step=self.episode_step[env_idx])
                            self.recorder.store(mem)
                            self.log(mem)
                            abs_dist = abs(float(infos[env_idx]["x_pos"]) - float(self.goal_info[env_idx]["x_pos"])) + \
                                       abs(float(infos[env_idx]["y_pos"]) - float(self.goal_info[env_idx]["y_pos"]))
                        episode_infos[env_idx]["reached_info"] = dict(reached=reached, time_ratio=time_ratio,
                                                                      abs_dist=abs_dist, source=self.name,
                                                                      x_pos=infos[env_idx]["x_pos"],
                                                                      y_pos=infos[env_idx]["y_pos"])
                        episode_infos[env_idx]["goal_info"] = dict(x_pos=self.goal_info[env_idx]["x_pos"],
                                                                   y_pos=self.goal_info[env_idx]["y_pos"],
                                                                   source=self.goal_info[env_idx]["source"],
                                                                   reward_to_go=self.episode_reward_to_go[env_idx])
                        # re-plan goal
                        goal_obs, goal_info = self.dynamics.get_goal(nb_goal=1)
                        if self.goal_as_image:
                            self.goals[env_idx] = goal_obs[0]
                        else:
                            self.goals[env_idx] = self.goal_to_embedding(goal_info[0])
                        self.goal_info[env_idx] = goal_info[0]
                        self.episode[env_idx] += 1
                        self.episode_step[env_idx] = 0
                        self.episode_reached_step[env_idx] = 0
                        self.reached_status[env_idx] = False
                        self.episode_reward_to_go[env_idx] = 0

        # next obs and next goal
        mb_obs[:, -1] = deepcopy(self.obs)
        mb_goals[:, -1] = mb_goals[:, -2]  # we cannot use self.goals since it may be revised

        if self.dist_type == "l2":
            raise NotImplementedError
        else:
            mb_int_rewards = self.reward_fn(mb_obs_infos, mb_goal_infos)
        # shapes are adjusted to [nenv, nsteps, []]
        enc_obs = np.asarray(enc_obs, dtype=self.obs_dtype).swapaxes(1, 0)

        self.recorder.dump()
        results = dict(
            enc_obs=enc_obs,
            obs=mb_obs,
            actions=mb_act,
            ext_rewards=mb_ext_rew,
            mus=mb_mus,
            dones=mb_dones,
            masks=mb_masks,
            obs_infos=mb_obs_infos,  # nenv, nsteps, two purpose: 1)put into dynamics; 2) put into buffer
            episode_infos=episode_infos,
            goal_obs=mb_goals,  # nenv, nsteps+1,
            goal_infos=mb_goal_infos,
            int_rewards=mb_int_rewards
        )
        return results
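The preallocated buffers at the top of run() fix the layout of everything it returns: per-step arrays are [nenv, nsteps, ...], while observations and goals carry one extra bootstrap entry. A rough sketch of how a caller might rely on those shapes (the runner instance is assumed, not shown in the example):
results = runner.run(acer_step=0)                   # hypothetical runner instance
nenv, nsteps = results['dones'].shape               # per-step arrays: [nenv, nsteps]
assert results['obs'].shape[1] == nsteps + 1        # one extra bootstrap observation
assert results['goal_obs'].shape[1] == nsteps + 1   # goals are aligned with observations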
Code example #8
 def render(self, mode='human'):
     logger.warn('Render not defined for %s'%self)
Code example #9
def learn(*,
          network,
          env,
          total_timesteps,
          seed=None,
          eval_env=None,
          replay_strategy='future',
          policy_save_interval=5,
          clip_return=True,
          demo_file=None,
          override_params=None,
          load_path=None,
          save_path=None,
          **kwargs):

    override_params = override_params or {}
    if MPI is not None:
        rank = MPI.COMM_WORLD.Get_rank()
        num_cpu = MPI.COMM_WORLD.Get_size()
    else:
        rank, num_cpu = 0, 1  # single-process fallback so the variables below are always defined

    # Seed everything.
    rank_seed = seed + 1000000 * rank if seed is not None else None
    set_global_seeds(rank_seed)

    # Prepare params.
    params = config.DEFAULT_PARAMS
    env_name = env.specs[0].id
    params['env_name'] = env_name
    params['replay_strategy'] = replay_strategy
    if env_name in config.DEFAULT_ENV_PARAMS:
        params.update(config.DEFAULT_ENV_PARAMS[env_name])  # merge env-specific parameters in
    params.update(**override_params)  # makes it possible to override any parameter
    with open(os.path.join(logger.get_dir(), 'params.json'), 'w') as f:
        json.dump(params, f)
    params = config.prepare_params(params)
    params['rollout_batch_size'] = env.num_envs

    if demo_file is not None:
        params['bc_loss'] = 1
    params.update(kwargs)

    config.log_params(params, logger=logger)

    if num_cpu == 1:
        logger.warn()
        logger.warn('*** Warning ***')
        logger.warn(
            'You are running HER with just a single MPI worker. This will work, but the ' +
            'experiments that we report in Plappert et al. (2018, https://arxiv.org/abs/1802.09464) ' +
            'were obtained with --num_cpu 19. This makes a significant difference and if you ' +
            'are looking to reproduce those results, be aware of this. Please also refer to ' +
            'https://github.com/openai/baselines/issues/314 for further details.')
        logger.warn('****************')
        logger.warn()

    dims = config.configure_dims(params)
    policy = config.configure_ddpg(dims=dims,
                                   params=params,
                                   clip_return=clip_return)
    if load_path is not None:
        tf_util.load_variables(load_path)

    rollout_params = {
        'exploit': False,
        'use_target_net': False,
        'use_demo_states': True,
        'compute_Q': False,
        'T': params['T'],
    }

    eval_params = {
        'exploit': True,
        'use_target_net': params['test_with_polyak'],
        'use_demo_states': False,
        'compute_Q': True,
        'T': params['T'],
    }

    for name in [
            'T', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps'
    ]:
        rollout_params[name] = params[name]
        eval_params[name] = params[name]

    eval_env = eval_env or env

    rollout_worker = RolloutWorker(env,
                                   policy,
                                   dims,
                                   logger,
                                   monitor=True,
                                   **rollout_params)
    evaluator = RolloutWorker(eval_env, policy, dims, logger, **eval_params)

    n_cycles = params['n_cycles']
    n_epochs = total_timesteps // n_cycles // rollout_worker.T // rollout_worker.rollout_batch_size

    return train(save_path=save_path,
                 policy=policy,
                 rollout_worker=rollout_worker,
                 evaluator=evaluator,
                 n_epochs=n_epochs,
                 n_test_rollouts=params['n_test_rollouts'],
                 n_cycles=params['n_cycles'],
                 n_batches=params['n_batches'],
                 policy_save_interval=policy_save_interval,
                 demo_file=demo_file)
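The n_epochs arithmetic above divides the requested timesteps by the work done per epoch (cycles per epoch, rollout length, and parallel rollouts). A worked example with assumed values (n_cycles=50, T=50, rollout_batch_size=2 are illustrative and not guaranteed to match config.DEFAULT_PARAMS for every env):
total_timesteps = 5000
n_cycles, T, rollout_batch_size = 50, 50, 2            # assumed values for illustration
n_epochs = total_timesteps // n_cycles // T // rollout_batch_size
print(n_epochs)                                        # 1; anything below 5000 timesteps would give 0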
Code example #10
File: train.py  Project: marcelo-dalmeida/baselines
def launch(
    env_name, logdir, n_epochs, num_cpu, seed, replay_strategy, policy_save_interval, clip_return,
    override_params={}, save_policies=True
):
    # Fork for multi-CPU MPI implementation.
    if num_cpu > 1:
        whoami = mpi_fork(num_cpu)
        if whoami == 'parent':
            sys.exit(0)
        import baselines.common.tf_util as U
        U.single_threaded_session().__enter__()
    rank = MPI.COMM_WORLD.Get_rank()

    # Configure logging
    if rank == 0:
        if logdir or logger.get_dir() is None:
            logger.configure(dir=logdir)
    else:
        logger.configure()
    logdir = logger.get_dir()
    assert logdir is not None
    os.makedirs(logdir, exist_ok=True)

    # Seed everything.
    rank_seed = seed + 1000000 * rank
    set_global_seeds(rank_seed)

    # Prepare params.
    params = config.DEFAULT_PARAMS
    params['env_name'] = env_name
    params['replay_strategy'] = replay_strategy
    if env_name in config.DEFAULT_ENV_PARAMS:
        params.update(config.DEFAULT_ENV_PARAMS[env_name])  # merge env-specific parameters in
    params.update(**override_params)  # makes it possible to override any parameter
    with open(os.path.join(logger.get_dir(), 'params.json'), 'w') as f:
        json.dump(params, f)
    params = config.prepare_params(params)
    config.log_params(params, logger=logger)

    if num_cpu == 1:
        logger.warn()
        logger.warn('*** Warning ***')
        logger.warn(
            'You are running HER with just a single MPI worker. This will work, but the ' +
            'experiments that we report in Plappert et al. (2018, https://arxiv.org/abs/1802.09464) ' +
            'were obtained with --num_cpu 19. This makes a significant difference and if you ' +
            'are looking to reproduce those results, be aware of this. Please also refer to ' + 
            'https://github.com/openai/baselines/issues/314 for further details.')
        logger.warn('****************')
        logger.warn()

    dims = config.configure_dims(params)
    policy = config.configure_ddpg(dims=dims, params=params, clip_return=clip_return)

    rollout_params = {
        'exploit': False,
        'use_target_net': False,
        'use_demo_states': True,
        'compute_Q': False,
        'T': params['T'],
    }

    eval_params = {
        'exploit': True,
        'use_target_net': params['test_with_polyak'],
        'use_demo_states': False,
        'compute_Q': True,
        'T': params['T'],
    }

    for name in ['T', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps']:
        rollout_params[name] = params[name]
        eval_params[name] = params[name]

    rollout_worker = RolloutWorker(params['make_env'], policy, dims, logger, **rollout_params)
    rollout_worker.seed(rank_seed)

    evaluator = RolloutWorker(params['make_env'], policy, dims, logger, **eval_params)
    evaluator.seed(rank_seed)

    train(
        logdir=logdir, policy=policy, rollout_worker=rollout_worker,
        evaluator=evaluator, n_epochs=n_epochs, n_test_rollouts=params['n_test_rollouts'],
        n_cycles=params['n_cycles'], n_batches=params['n_batches'],
        policy_save_interval=policy_save_interval, save_policies=save_policies)
Code example #11
 def render(self):
     logger.warn('Render not defined for %s'%self)
Code example #12
def learn(*,
          network,
          env,
          total_timesteps,
          seed=None,
          eval_env=None,
          replay_strategy='future',
          policy_save_interval=5,
          clip_return=True,
          demo_file=None,
          override_params=None,
          load_path=None,
          save_path=None,
          params=None,
          **kwargs):
    override_params = override_params or {}
    if MPI is not None:
        rank = MPI.COMM_WORLD.Get_rank()
        num_cpu = MPI.COMM_WORLD.Get_size()
    else:
        rank, num_cpu = 0, 1  # single-process fallback so the variables below are always defined

    # Seed everything.
    rank_seed = seed + 1000000 * rank if seed is not None else None
    set_global_seeds(rank_seed)

    # Prepare params.
    params = {
        # env
        'max_u': 1.,  # max absolute value of actions on different coordinates
        # ddpg
        'layers': 3,  # number of layers in the critic/actor networks
        'hidden': 256,  # number of neurons in each hidden layer
        'network_class': 'baselines.her.actor_critic:ActorCritic',
        'Q_lr': 0.001,  # critic learning rate
        'pi_lr': 0.001,  # actor learning rate
        'buffer_size': int(1E6),  # for experience replay
        'polyak': 0.95,  # polyak averaging coefficient
        'action_l2': 1.0,  # quadratic penalty on actions (before rescaling by max_u)
        'clip_obs': 200.,
        'scope': 'ddpg',  # can be tweaked for testing
        'relative_goals': False,
        # training
        'n_cycles': 50,  # per epoch
        'rollout_batch_size': 2,  # per mpi thread
        'n_batches': 40,  # training batches per cycle
        'batch_size': 256,  # per mpi thread, measured in transitions and reduced to an even multiple of chunk_length
        'n_test_rollouts': 10,  # number of test rollouts per epoch, each consists of rollout_batch_size rollouts
        'test_with_polyak': False,  # run test episodes with the target network
        # exploration
        'random_eps': 0.2,  # percentage of time a random action is taken
        'noise_eps': 0.3,  # std of gaussian noise added to not-completely-random actions as a percentage of max_u
        # HER
        'replay_strategy': 'future',  # supported modes: future, none
        'replay_k': 4,  # number of additional goals used for replay, only used if off_policy_data=future
        # normalization
        'norm_eps': 0.01,  # epsilon used for observation normalization
        'norm_clip': 5,  # normalized observations are clipped to this value
        'bc_loss': 0,  # whether or not to use the behavior cloning loss as an auxiliary loss
        'q_filter': 0,  # whether or not a Q value filter should be used on the actor outputs
        'num_demo': 25,  # number of expert demo episodes
        'demo_batch_size': 128,  # number of samples used from the demonstrations buffer, per mpi thread (128/1024 or 32/256)
        'prm_loss_weight': 0.001,  # weight of the primary loss
        'aux_loss_weight': 0.0078,  # weight of the auxiliary (behavior cloning) loss
        'perturb': kwargs['pert_type'],
        'n_actions': kwargs['n_actions'],
    }
    params['replay_strategy'] = replay_strategy
    if env is not None:
        env_name = env.spec.id
        params['env_name'] = env_name
        if env_name in config.DEFAULT_ENV_PARAMS:
            params.update(config.DEFAULT_ENV_PARAMS[env_name])  # merge env-specific parameters in
    else:
        params['env_name'] = 'NuFingers_Experiment'
    params.update(**override_params)  # makes it possible to override any parameter
    with open(os.path.join(logger.get_dir(), 'params.json'), 'w') as f:
        json.dump(params, f)

    if demo_file is not None:
        params['bc_loss'] = 1
        params['q_filter'] = 1
        params['n_cycles'] = 20
        params['random_eps'] = 0.1  # chip: ON
        params['noise_eps'] = 0.1  # chip: ON
        # params['batch_size']: 1024
    params = config.prepare_params(params)
    params['rollout_batch_size'] = 1
    params.update(kwargs)

    config.log_params(params, logger=logger)

    if num_cpu == 1:
        logger.warn()
        logger.warn('*** Warning ***')
        logger.warn(
            'You are running HER with just a single MPI worker. This will work, but the ' +
            'experiments that we report in Plappert et al. (2018, https://arxiv.org/abs/1802.09464) ' +
            'were obtained with --num_cpu 19. This makes a significant difference and if you ' +
            'are looking to reproduce those results, be aware of this. Please also refer to ' +
            'https://github.com/openai/baselines/issues/314 for further details.')
        logger.warn('****************')
        logger.warn()

    if env is not None:
        dims = config.configure_dims(params)
    else:
        dims = dict(o=15, u=4, g=7, info_is_success=1)
    policy = config.configure_ddpg(dims=dims,
                                   params=params,
                                   clip_return=clip_return)
    if load_path is not None:
        tf_util.load_variables(load_path)

    rollout_params = {
        'exploit': False,
        'use_target_net': False,
        'use_demo_states': True,
        'compute_Q': False,
        'T': params['T'],
    }

    eval_params = {
        'exploit': True,
        'use_target_net': params['test_with_polyak'],
        'use_demo_states': False,
        'compute_Q': True,
        'T': params['T'],
    }

    for name in [
            'T', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps'
    ]:
        rollout_params[name] = params[name]
        eval_params[name] = params[name]

    eval_env = eval_env or env

    print("NAME={}".format(params['env_name']))

    print(rollout_params)

    if params['env_name'].find('NuFingers_Experiment') == -1:
        rollout_worker = RolloutWorker(env,
                                       policy,
                                       dims,
                                       logger,
                                       monitor=True,
                                       **rollout_params)
        evaluator = RolloutWorker(eval_env, policy, dims, logger,
                                  **eval_params)
    else:
        rollout_worker = RolloutNuFingers(policy,
                                          dims,
                                          logger,
                                          monitor=True,
                                          **rollout_params)
        evaluator = RolloutNuFingers(policy, dims, logger, **eval_params)

    n_cycles = params['n_cycles']
    n_epochs = total_timesteps // n_cycles // rollout_worker.T // rollout_worker.rollout_batch_size

    return train(save_path=save_path,
                 policy=policy,
                 rollout_worker=rollout_worker,
                 evaluator=evaluator,
                 n_epochs=n_epochs,
                 n_test_rollouts=params['n_test_rollouts'],
                 n_cycles=params['n_cycles'],
                 n_batches=params['n_batches'],
                 policy_save_interval=policy_save_interval,
                 demo_file=demo_file)
Code example #13
File: trainer.py  Project: wwxFromTju/transition
    def _update_one_trans_policy(self, seg, pi, adam, loss, zero_grad, update_trans_oldpi, num_batches):
        ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg["tdlamret"]
        cur_primitive, term = seg["cur_primitive"], seg["term"]
        if self._is_chef:
            info = defaultdict(list)

        optim_batchsize = min(self._optim_batchsize, ob.shape[0])
        logger.log("Optimizing trans_{}... {} epochs * {} batches * {} batchsize <- {} data".format(
            pi.env_name, self._optim_epochs, num_batches, optim_batchsize, ob.shape[0]))

        if np.shape(ob)[0] == 0:
            logger.warn('[!] No transition is used')
            for ob_name in pi.ob_type:
                pi.ob_rms[ob_name].noupdate()
            blank = zero_grad()
            for _ in range(self._optim_epochs):
                for _ in range(num_batches):
                    adam.update(blank, self._optim_stepsize * self._cur_lrmult)
            return None

        # normalize advantage
        atarg = (atarg - atarg.mean()) / max(atarg.std(), 0.000001)

        # prepare batches
        d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret,
                         cur_primitive=cur_primitive, term=term), shuffle=True)

        ob_dict = self._env.get_ob_dict(ob)
        for ob_name in pi.ob_type:
            pi.ob_rms[ob_name].update(ob_dict[ob_name])

        update_trans_oldpi()
        b = 0
        for _ in range(self._optim_epochs):
            for batch in d.iterate_once(optim_batchsize):
                if b == self._optim_epochs * num_batches:
                    break
                ob_list = pi.get_ob_list(batch["ob"])
                fetched = loss(
                    self._cur_lrmult, batch["cur_primitive"], batch["ac"],
                    batch["atarg"], batch["vtarg"], batch["term"], *ob_list)
                adam.update(fetched['g'], self._optim_stepsize * self._cur_lrmult)
                b += 1
                if self._is_chef:
                    for key, value in fetched.items():
                        if key != 'g':
                            if np.isscalar(value):
                                info[key].append(value)
                            else:
                                info[key].extend(value)
                        else:
                            grad_norm_value = np.linalg.norm(value)
                            info['grad_norm'].append(grad_norm_value)
                            info['grad_norm_clipped'].append(np.clip(
                                grad_norm_value, 0, self._config.trans_max_grad_norm))
        blank = zero_grad()
        for _ in range(self._optim_epochs * num_batches - b):
            adam.update(blank, self._optim_stepsize * self._cur_lrmult)

        term_pred = seg["term"]
        batchsize = optim_batchsize

        if self._is_chef:
            info['term_pred'] = [np.mean(term_pred)]
            info['batch_size'] = [batchsize]
            return info
        return None
Code example #14
File: trainer.py  Project: wwxFromTju/transition
    def _update_one_proximity_predictor_network(self, seg, proximity, adam, loss,
                                                zero_grad, num_batches, idx,
                                                only_use_trans_term_state=False):
        ob = seg["ob"]
        cur_primitive = seg["cur_primitive"]
        success = seg['success']

        logger.log("\033[93m"+"proximity_predictor_{}".format(proximity.env_name)+"\033[0m")

        # remove the info for meta task from ob
        # Assumption: info for the meta task is always appended at the end of ob
        # use the states that match the proximity_predictor idx
        if len(cur_primitive == idx) > 0:
            ob = ob[:, :proximity.observation_shape]
            ob_success_final = ob[((cur_primitive == idx) * success * seg['term']) == 1]
            ob_success_intermediate = ob[((cur_primitive == idx) * success * (1 - seg['term'])) == 1]
            ob_fail_final = ob[((cur_primitive == idx) * (1 - success) * seg['term']) == 1]
            ob_fail_intermediate = ob[((cur_primitive == idx) * (1 - success) * (1 - seg['term'])) == 1]

            final_state = seg['term'][(cur_primitive == idx) * success == 1]
            for i in range(final_state.shape[0] - 1, -1, -1):
                if final_state[i] != 1:
                    final_state[i] = final_state[i + 1] + 1
            final_state = final_state[final_state != 1]
            final_state = final_state - 1
            if self._config.proximity_weight_decay_linear:
                ob_success_intermediate_weight = (self._config.trans_duration - final_state) / self._config.trans_duration
            else:
                ob_success_intermediate_weight = self._config.proximity_weight_decay_rate ** final_state
            ob_success_intermediate_weight = ob_success_intermediate_weight.reshape((ob_success_intermediate.shape[0], 1))

            # proximity hist
            rew_success_final = proximity.proximity(ob_success_final)
            rew_success_intermediate = proximity.proximity(ob_success_intermediate)
            rew_fail_final = proximity.proximity(ob_fail_final)
            rew_fail_intermediate = proximity.proximity(ob_fail_intermediate)

            logger.log("    ob_success (final {}, intermediate {})  ob_fail (final {}, intermediate {})".format(
                ob_success_final.shape[0], ob_success_intermediate.shape[0], ob_fail_final.shape[0], ob_fail_intermediate.shape[0]))

            # add [obs, label]
            proximity.fail_buffer.add(np.concatenate((ob_fail_final, np.zeros(
                shape=[ob_fail_final.shape[0], 1])), axis=1))
            proximity.success_buffer.add(np.concatenate((ob_success_final, np.ones(
                shape=[ob_success_final.shape[0], 1])), axis=1))
            proximity.fail_buffer.add(np.concatenate((ob_fail_intermediate, np.zeros(
                shape=[ob_fail_intermediate.shape[0], 1])), axis=1))
            proximity.success_buffer.add(np.concatenate((ob_success_intermediate, np.ones(
                shape=[ob_success_intermediate.shape[0], 1])*ob_success_intermediate_weight), axis=1))

            if proximity.ob_rms:
                proximity.ob_rms.update(ob)
        else:
            ob_success_final = np.zeros(shape=(0, 0))
            ob_success_intermediate = np.zeros(shape=(0, 0))
            ob_fail_final = np.zeros(shape=(0, 0))
            ob_fail_intermediate = np.zeros(shape=(0, 0))
            if proximity.ob_rms:
                proximity.ob_rms.noupdate()

        num_state = ob.shape[0]
        optim_batchsize = self._optim_batchsize
        if 0 in [proximity.fail_buffer.size(), proximity.success_buffer.size()]:
            logger.warn('[!] No transition is used. So the proximity_predictor is not trained.')
            blank = zero_grad()
            for _ in range(self._optim_proximity_epochs * num_batches):
                adam.update(blank, self._optim_proximity_stepsize * self._cur_proximity_lrmult)
            return None

        logger.log("Optimizing proximity_predictor_{}... {} epochs * {} batches * {} batchsize <- {}/{} data".format(
            proximity.env_name, self._optim_proximity_epochs, num_batches,
            optim_batchsize, ob.shape[0], num_state))

        if self._is_chef:
            info = defaultdict(list)

        # update proximity_predictor with replay buffer
        current_iter_loss = []
        for _ in range(self._optim_proximity_epochs * num_batches):
            sampled_fail_states = proximity.sample_fail_batch(optim_batchsize)
            sampled_success_states = proximity.sample_success_batch(optim_batchsize)
            fetched = loss(sampled_fail_states, sampled_success_states)
            current_iter_loss.append(fetched['fake_loss']+fetched['real_loss'])
            adam.update(fetched['g'],
                        self._optim_proximity_stepsize * self._cur_proximity_lrmult)
            if self._is_chef:
                for key, value in fetched.items():
                    if key != 'g':
                        if np.isscalar(value):
                            info[key].append(value)
                        else:
                            info[key].extend(value)
                    else:
                        grad_norm_value = np.linalg.norm(value)
                        info['grad_norm'].append(grad_norm_value)
                        info['grad_norm_clipped'].append(np.clip(
                            grad_norm_value, 0, self._config.proximity_max_grad_norm))
        proximity.last_iter_loss = np.average(current_iter_loss)

        if self._is_chef:
            logger.warn('proximity.last_iter_loss: {}'.format(proximity.last_iter_loss))
            info['batch_size'] = [optim_batchsize]
            info['buffer_size_success_final'] = [proximity.success_buffer.size()]
            info['buffer_size_fail_final'] = [proximity.fail_buffer.size()]
            # hist summary
            if len(cur_primitive == idx) > 0:
                if rew_success_final.shape[0] > 0:
                    info['hist_success_final'] = rew_success_final
                if rew_success_intermediate.shape[0] > 0:
                    info['hist_success_intermediate'] = rew_success_intermediate
                if rew_fail_final.shape[0] > 0:
                    info['hist_fail_final'] = rew_fail_final
                if rew_fail_intermediate.shape[0] > 0:
                    info['hist_fail_intermediate'] = rew_fail_intermediate
            return info
        return None
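The two weighting schemes above discount intermediate success states by how many steps remain until the final state of the transition: linearly with trans_duration, or exponentially with proximity_weight_decay_rate. A small numeric sketch with assumed config values (trans_duration=100 and a decay rate of 0.95 are illustrative, not the repo's defaults):
import numpy as np

final_state = np.array([0, 1, 5, 20])       # steps remaining until the final success state
trans_duration, decay_rate = 100, 0.95      # assumed config values
linear_w = (trans_duration - final_state) / trans_duration
exp_w = decay_rate ** final_state
print(linear_w)   # approx [1.0, 0.99, 0.95, 0.80]
print(exp_w)      # approx [1.0, 0.95, 0.77, 0.36]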
Code example #15
 def render(self):
     logger.warn('Render not defined for %s' % self)
Code example #16
File: __init__.py  Project: Divyankpandey/baselines
 def render(self, mode='human'):
     logger.warn('Render not defined for %s'%self)
Code example #17
File: train.py  Project: wwxFromTju/curious
def launch(env, trial_id, n_epochs, num_cpu, seed, policy_save_interval, clip_return, normalize_obs,
           structure, task_selection, goal_selection, goal_replay, task_replay, perturb, save_policies=True):

    # Fork for multi-CPU MPI implementation.
    if num_cpu > 1:
        try:
            whoami = mpi_fork(num_cpu, ['--bind-to', 'core'])
        except CalledProcessError:
            # fancy version of mpi call failed, try simple version
            whoami = mpi_fork(num_cpu)

        if whoami == 'parent':
            sys.exit(0)
        import baselines.common.tf_util as U
        U.single_threaded_session().__enter__()
    rank = MPI.COMM_WORLD.Get_rank()

    # Configure logging
    if rank == 0:
        save_dir = find_save_path('./save/' + env + "/", trial_id)
        logger.configure(dir=save_dir)
    else:
        save_dir = None

    # Seed everything.
    rank_seed = seed + 1000000 * rank
    set_global_seeds(rank_seed)

    # Prepare params, add main function arguments and log all parameters
    if structure == 'curious' or structure == 'task_experts':
        params = config.MULTI_TASK_PARAMS
    else:
        params = config.DEFAULT_PARAMS

    time = str(datetime.datetime.now())
    params['time'] = time
    params['env_name'] = env
    params['task_selection'] = task_selection
    params['goal_selection'] = goal_selection
    params['task_replay'] = task_replay
    params['goal_replay'] = goal_replay
    params['structure'] = structure
    params['normalize_obs'] = normalize_obs
    params['num_cpu'] = num_cpu
    params['clip_return'] = clip_return
    params['trial_id'] = trial_id
    params['seed'] = seed
    if rank == 0:
        with open(os.path.join(logger.get_dir(), 'params.json'), 'w') as f:
            json.dump(params, f)
    params = config.prepare_params(params)
    params['ddpg_params']['normalize_obs'] = normalize_obs
    if rank == 0:
        config.log_params(params, logger=logger)

    if num_cpu == 1:
        logger.warn()
        logger.warn('*** Warning ***')
        logger.warn(
            'You are running HER with just a single MPI worker. This will work, but the ' +
            'experiments that we report in Colas et al. (2018, https://arxiv.org/abs/1810.06284) ' +
            'were obtained with --num_cpu 19. This makes a significant difference and if you ' +
            'are looking to reproduce those results, be aware of this.')
        logger.warn('****************')
        logger.warn()

    dims = config.configure_dims(params)

    buffers = config.configure_buffer(dims=dims, params=params)

    # creates several policies with shared buffers in the task-experts structure, otherwise use just one policy
    if structure == 'task_experts':
        policy = [config.configure_ddpg(dims=dims, params=params, buffers=buffers, clip_return=clip_return, t_id=i) for i in range(params['nb_tasks'])]
    else:
        policy = config.configure_ddpg(dims=dims, params=params, buffers=buffers, clip_return=clip_return)


    rollout_params = {
        'exploit': False,
        'use_target_net': False,
        'use_demo_states': True,
        'compute_Q': False,
        'T': params['T'],
        'structure': structure,
        'task_selection': task_selection,
        'goal_selection': goal_selection,
        'queue_length': params['queue_length'],
        'eval': False,
        'eps_task': params['eps_task']
    }

    eval_params = {
        'exploit': True,
        'use_target_net': params['test_with_polyak'],
        'use_demo_states': False,
        'compute_Q': True,
        'T': params['T'],
        'structure' : structure,
        'task_selection': task_selection,
        'goal_selection' : goal_selection,
        'queue_length': params['queue_length'],
        'eval': True,
    }

    for name in ['T', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps']:
        rollout_params[name] = params[name]
        eval_params[name] = params[name]

    if structure == 'task_experts':
        # create one rollout worker per policy/task
        rollout_worker = [RolloutWorker(params['make_env'], policy[i], dims, logger, unique_task=i, **rollout_params) for i in range(params['nb_tasks'])]
        for i in range(params['nb_tasks']):
            rollout_worker[i].seed(rank_seed + i)
    else:
        rollout_worker = RolloutWorker(params['make_env'], policy, dims, logger, **rollout_params)
        rollout_worker.seed(rank_seed)

    evaluator = RolloutWorker(params['make_env'], policy, dims, logger, **eval_params)
    evaluator.seed(rank_seed + 100)

    train(logdir=save_dir, policy=policy, rollout_worker=rollout_worker, evaluator=evaluator, n_epochs=n_epochs,
          n_test_rollouts=params['n_test_rollouts'], n_cycles=params['n_cycles'], n_batches=params['n_batches'], perturbation_study=perturb,
          policy_save_interval=policy_save_interval, save_policies=save_policies, structure=structure, task_selection=task_selection, params=params)
Code example #18
def launch(env_name,
           logdir,
           n_epochs,
           num_cpu,
           seed,
           replay_strategy,
           policy_save_interval,
           clip_return,
           override_params={},
           save_policies=True):
    # Fork for multi-CPU MPI implementation.
    if num_cpu > 1:
        whoami = mpi_fork(num_cpu)
        if whoami == 'parent':
            sys.exit(0)
        import baselines.common.tf_util as U
        U.single_threaded_session().__enter__()
    rank = MPI.COMM_WORLD.Get_rank()

    # Configure logging
    if rank == 0:
        if logdir or logger.get_dir() is None:
            logger.configure(dir=logdir)
    else:
        logger.configure()
    logdir = logger.get_dir()
    assert logdir is not None
    os.makedirs(logdir, exist_ok=True)

    # Seed everything.
    rank_seed = seed + 1000000 * rank
    set_global_seeds(rank_seed)

    # Prepare params.
    params = config.DEFAULT_PARAMS
    params['env_name'] = env_name
    params['replay_strategy'] = replay_strategy
    if env_name in config.DEFAULT_ENV_PARAMS:
        params.update(config.DEFAULT_ENV_PARAMS[env_name])  # merge env-specific parameters in
    params.update(**override_params)  # makes it possible to override any parameter
    with open(os.path.join(logger.get_dir(), 'params.json'), 'w') as f:
        json.dump(params, f)
    params = config.prepare_params(params)
    config.log_params(params, logger=logger)

    if num_cpu == 1:
        logger.warn()
        logger.warn('*** Warning ***')
        logger.warn(
            'You are running HER with just a single MPI worker. This will work, but the ' +
            'experiments that we report in Plappert et al. (2018, https://arxiv.org/abs/1802.09464) ' +
            'were obtained with --num_cpu 19. This makes a significant difference and if you ' +
            'are looking to reproduce those results, be aware of this. Please also refer to ' +
            'https://github.com/openai/baselines/issues/314 for further details.')
        logger.warn('****************')
        logger.warn()

    dims = config.configure_dims(params)
    policy = config.configure_ddpg(dims=dims,
                                   params=params,
                                   clip_return=clip_return)

    rollout_params = {
        'exploit': False,
        'use_target_net': False,
        'use_demo_states': True,
        'compute_Q': False,
        'T': params['T'],
        'use_imitation': False,
        'eval_play': False
    }

    eval_params = {
        'exploit': True,
        'use_target_net': params['test_with_polyak'],
        'use_demo_states': False,
        'compute_Q': True,
        'T': params['T'],
        'use_imitation': False,
        'eval_play': False
    }
    for name in [
            'T', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps'
    ]:
        rollout_params[name] = params[name]
        eval_params[name] = params[name]

    rollout_worker = RolloutWorker(params['make_env'], policy, dims, logger,
                                   **rollout_params)
    rollout_worker.seed(rank_seed)

    evaluator = RolloutWorker(params['make_env'], policy, dims, logger,
                              **eval_params)
    evaluator.seed(rank_seed)

    train(logdir=logdir,
          policy=policy,
          rollout_worker=rollout_worker,
          evaluator=evaluator,
          n_epochs=n_epochs,
          n_test_rollouts=params['n_test_rollouts'],
          n_cycles=params['n_cycles'],
          n_batches=params['n_batches'],
          policy_save_interval=policy_save_interval,
          save_policies=save_policies)
Code example #19
File: her.py  Project: MrGoogol/baselines
def learn(*, network, env, total_timesteps,
    seed=None,
    eval_env=None,
    replay_strategy='future',
    policy_save_interval=5,
    clip_return=True,
    demo_file=None,
    override_params=None,
    load_path=None,
    save_path=None,
    **kwargs
):

    override_params = override_params or {}
    if MPI is not None:
        rank = MPI.COMM_WORLD.Get_rank()
        num_cpu = MPI.COMM_WORLD.Get_size()
    else:
        rank, num_cpu = 0, 1  # single-process fallback so the variables below are always defined

    # Seed everything.
    rank_seed = seed + 1000000 * rank if seed is not None else None
    set_global_seeds(rank_seed)

    # Prepare params.
    params = config.DEFAULT_PARAMS
    env_name = env.spec.id
    params['env_name'] = env_name
    params['replay_strategy'] = replay_strategy
    if env_name in config.DEFAULT_ENV_PARAMS:
        params.update(config.DEFAULT_ENV_PARAMS[env_name])  # merge env-specific parameters in
    params.update(**override_params)  # makes it possible to override any parameter
    with open(os.path.join(logger.get_dir(), 'params.json'), 'w') as f:
        json.dump(params, f)
    params = config.prepare_params(params)
    params['rollout_batch_size'] = env.num_envs

    if demo_file is not None:
        params['bc_loss'] = 1
    params.update(kwargs)

    config.log_params(params, logger=logger)

    if num_cpu == 1:
        logger.warn()
        logger.warn('*** Warning ***')
        logger.warn(
            'You are running HER with just a single MPI worker. This will work, but the ' +
            'experiments that we report in Plappert et al. (2018, https://arxiv.org/abs/1802.09464) ' +
            'were obtained with --num_cpu 19. This makes a significant difference and if you ' +
            'are looking to reproduce those results, be aware of this. Please also refer to ' +
            'https://github.com/openai/baselines/issues/314 for further details.')
        logger.warn('****************')
        logger.warn()

    dims = config.configure_dims(params)
    policy = config.configure_ddpg(dims=dims, params=params, clip_return=clip_return)
    if load_path is not None:
        tf_util.load_variables(load_path)

    rollout_params = {
        'exploit': False,
        'use_target_net': False,
        'use_demo_states': True,
        'compute_Q': False,
        'T': params['T'],
    }

    eval_params = {
        'exploit': True,
        'use_target_net': params['test_with_polyak'],
        'use_demo_states': False,
        'compute_Q': True,
        'T': params['T'],
    }

    for name in ['T', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps']:
        rollout_params[name] = params[name]
        eval_params[name] = params[name]

    eval_env = eval_env or env

    rollout_worker = RolloutWorker(env, policy, dims, logger, monitor=True, **rollout_params)
    evaluator = RolloutWorker(eval_env, policy, dims, logger, **eval_params)

    n_cycles = params['n_cycles']
    n_epochs = total_timesteps // n_cycles // rollout_worker.T // rollout_worker.rollout_batch_size

    return train(
        save_path=save_path, policy=policy, rollout_worker=rollout_worker,
        evaluator=evaluator, n_epochs=n_epochs, n_test_rollouts=params['n_test_rollouts'],
        n_cycles=params['n_cycles'], n_batches=params['n_batches'],
        policy_save_interval=policy_save_interval, demo_file=demo_file)
Code example #20
    def __init__(self,
                 name,
                 path,
                 env,
                 ob_env_name,
                 is_train=True,
                 use_traj_portion_start=0.0,
                 use_traj_portion_end=1.0,
                 config=None):
        self._scope = 'proximity_predictor/' + name
        self.env_name = name.split('.')[0]
        self._config = config

        # make primitive env for observation
        self._env = make_env(ob_env_name, config)
        self._include_acc = config.proximity_include_acc
        self._ob_shape = self._env.unwrapped.ob_shape
        self.ob_type = sorted(self._env.unwrapped.ob_type)
        if not self._include_acc and 'acc' in self.ob_type:
            self._ob_shape.pop('acc')
            self.ob_type.remove('acc')

        self.obs_norm = config.proximity_obs_norm
        self.observation_shape = np.sum(
            [np.prod(ob) for ob in self._ob_shape.values()])

        # replay buffers
        self.fail_buffer = Replay(max_size=config.proximity_replay_size,
                                  name='fail_buffer')
        self.success_buffer = Replay(max_size=config.proximity_replay_size,
                                     name='success_buffer')

        # build the architecture
        self._num_hidden_layer = config.proximity_num_hid_layers
        self._hidden_size = config.proximity_hid_size
        self._activation_fn = activation(config.proximity_activation_fn)
        self._build_ph()

        logger.info('===== Proximity_predictor for {} ====='.format(
            self._scope))
        # load collected states
        if is_train or config.evaluate_proximity_predictor:
            state_file_path = osp.join(config.primitive_dir,
                                       path.split('/')[0], 'state')
            logger.info('Search state files from: {}'.format(
                config.primitive_dir))
            state_file_list = glob.glob(osp.join(state_file_path, '*.hdf5'))
            logger.info('Candidate state files: {}'.format(' '.join(
                [f.split('/')[-1] for f in state_file_list])))
            state_file = {}
            try:
                logger.info('Use state files: {}'.format(
                    state_file_list[0].split('/')[-1]))
                state_file = h5py.File(state_file_list[0], 'r')
            except:
                logger.warn(
                    "No collected state hdf5 file is located at {}".format(
                        state_file_path))
            logger.info('Use traj portion: {} to {}'.format(
                use_traj_portion_start, use_traj_portion_end))

            if self._config.proximity_keep_collected_obs:
                add_obs = self.success_buffer.add_collected_obs
            else:
                add_obs = self.success_buffer.add

            for k in list(state_file.keys()):
                traj_state = state_file[k]['obs'].value
                start_idx = int(traj_state.shape[0] * use_traj_portion_start)
                end_idx = int(traj_state.shape[0] * use_traj_portion_end)
                try:
                    if state_file[k]['success'].value == 1:
                        traj_state = traj_state[start_idx:end_idx]
                    else:
                        continue
                except:
                    traj_state = traj_state[start_idx:end_idx]
                for t in range(traj_state.shape[0]):
                    ob = traj_state[t][:self.observation_shape]
                    # [ob, label]
                    add_obs(np.concatenate((ob, [1.0]), axis=0))

            # shape [num_state, dim_state]
            logger.info('Size of collected state: {}'.format(
                self.success_buffer.size()))
            logger.info('Average of collected state: {}'.format(
                np.mean(self.success_buffer.list(), axis=0)))

        # build graph
        fail_logits, fail_target_value, success_logits, success_target_value = \
            self._build_graph(self.fail_obs_ph, self.success_obs_ph, reuse=False)

        # compute prob
        fake_prob = tf.reduce_mean(fail_logits)  # should go to 0
        real_prob = tf.reduce_mean(success_logits)  # should go to 1

        # compute loss
        if config.proximity_loss_type == 'lsgan':
            self.fake_loss = tf.reduce_mean(
                (fail_logits - fail_target_value)**2)
            self.real_loss = tf.reduce_mean(
                (success_logits - success_target_value)**2)
        elif config.proximity_loss_type == 'wgan':
            self.fake_loss = tf.reduce_mean(
                tf.abs(fail_logits - fail_target_value))
            self.real_loss = tf.reduce_mean(
                tf.abs(success_logits - success_target_value))

        # loss + accuracy terms
        self.total_loss = self.fake_loss + self.real_loss
        self.losses = {
            "fake_loss": self.fake_loss,
            "real_loss": self.real_loss,
            "fake_prob": fake_prob,
            "real_prob": real_prob,
            "total_loss": self.total_loss
        }

        # predict proximity
        self._proximity_op = tf.clip_by_value(success_logits, 0, 1)[:, 0]
Code example #21
def _clip_to_action_space(action):
    clipped_action = np.clip(action, 0., 1.)
    actions_equal = clipped_action == action
    if not np.all(actions_equal):
        logger.warn("Had to clip action since it wasn't constrained to the [0,1] action space:", action)
    return clipped_action
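A quick usage sketch of the helper above (the input values are arbitrary): components outside [0, 1] are clipped, and the mismatch triggers the warning.
import numpy as np

action = np.array([1.3, -0.2, 0.5])
clipped = _clip_to_action_space(action)   # logs the warning because the input left the [0, 1] box
print(clipped)                            # [1.  0.  0.5]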