def reset(self):
    if self.waiting_step:
        logger.warn('Called reset() while waiting for the step to complete')
        self.step_wait()
    for pipe in self.parent_pipes:
        pipe.send(('reset', None))
    return self._decode_obses([pipe.recv() for pipe in self.parent_pipes])
def save_state(fname, sess=None):
    from baselines import logger
    logger.warn('save_state method is deprecated, please use save_variables instead')
    sess = sess or get_session()
    dirname = os.path.dirname(fname)
    if dirname:
        os.makedirs(dirname, exist_ok=True)
    saver = tf.train.Saver()
    # save through the resolved session so an explicitly passed sess is honored
    saver.save(sess, fname)
def make_env(subrank=None):
    env = gym.make(env_name)
    if subrank is not None and logger.get_dir() is not None:
        try:
            from mpi4py import MPI
            mpi_rank = MPI.COMM_WORLD.Get_rank()
        except ImportError:
            MPI = None
            mpi_rank = 0
            logger.warn('Running with a single MPI process. This should work, but the results '
                        'may differ from the ones published in Plappert et al.')
        max_episode_steps = env._max_episode_steps
        env = Monitor(env,
                      os.path.join(logger.get_dir(), str(mpi_rank) + '.' + str(subrank)),
                      allow_early_resets=True)
        # hack to re-expose _max_episode_steps (ideally should replace reliance on it downstream)
        env = gym.wrappers.TimeLimit(env, max_episode_steps=max_episode_steps)
    return env
def load_state(fname, sess=None):
    from baselines import logger
    logger.warn('load_state method is deprecated, please use load_variables instead')
    sess = sess or get_session()
    saver = tf.train.Saver()
    # restore into the resolved session so an explicitly passed sess is honored
    saver.restore(sess, fname)
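# Minimal usage sketch of the deprecated checkpoint helpers above and their suggested
# replacements. The import path matches upstream baselines (baselines.common.tf_util);
# the checkpoint paths below are illustrative only.
import baselines.common.tf_util as U

U.get_session()                        # creates (or reuses) the default TF session
U.save_state('/tmp/model/checkpoint')  # deprecated: writes a tf.train.Saver checkpoint
U.load_state('/tmp/model/checkpoint')  # deprecated: restores into the current session
U.save_variables('/tmp/model/params')  # preferred: pickles {variable name: value}
U.load_variables('/tmp/model/params')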
def render(self): logger.warn("Render not defined for %s" % self)
def run(self, acer_step=None):
    if self.goals is None:
        self.goals, self.goal_info = self.dynamics.get_goal(nb_goal=self.nenv)
        if not self.goal_as_image:
            self.goals = self.goal_to_embedding(self.goal_info)
    # enc_obs = np.split(self.obs, self.nstack, axis=3)  # so now list of obs steps
    enc_obs = np.split(self.env.stackedobs, self.env.nstack, axis=-1)

    mb_obs = np.empty((self.nenv, self.nsteps + 1) + self.obs_shape, dtype=self.obs_dtype)
    mb_act = np.empty((self.nenv, self.nsteps) + self.ac_shape, dtype=self.ac_dtype)
    mb_mus = np.empty((self.nenv, self.nsteps, self.nact), dtype=np.float32)
    mb_dones = np.empty((self.nenv, self.nsteps), dtype=bool)
    mb_masks = np.empty((self.nenv, self.nsteps + 1), dtype=bool)
    mb_ext_rew = np.empty((self.nenv, self.nsteps), dtype=np.float32)
    mb_obs_infos = np.empty((self.nenv, self.nsteps), dtype=object)
    mb_goals = np.empty((self.nenv, self.nsteps + 1) + self.goal_shape, dtype=self.obs_dtype)
    mb_goal_infos = np.empty((self.nenv, self.nsteps), dtype=object)
    # mb_obs, mb_actions, mb_mus, mb_dones, mb_ext_rewards = [], [], [], [], []
    # mb_obs_infos, mb_goals, mb_goal_infos = [], [], []

    reached_step, done_step = np.array([None for _ in range(self.nenv)]), np.array([None for _ in range(self.nenv)])
    episode_infos = np.asarray([{} for _ in range(self.nenv)], dtype=object)

    for step in range(self.nsteps):
        try:
            check_obs(self.obs)
        except ValueError:
            logger.warn("acer_step:{}, runner_step:{}, empty obs".format(acer_step, step))
            raise ValueError

        actions, mus, states = self.model.step(self.obs, S=self.states, M=self.dones, goals=self.goals)
        if self.sample_goal:
            if self.use_random_policy_expl:
                actions[self.reached_status] = self.simple_random_action(np.sum(self.reached_status))
                mus[self.reached_status] = self.get_mu_of_random_action()
            else:
                if np.sum(self.reached_status) > 0:
                    alt_action, alt_mu, alt_states = self.alt_model.step(
                        self.obs, S=self.states, M=self.dones, goals=self.goals)
                    actions[self.reached_status] = alt_action[self.reached_status]
                    mus[self.reached_status] = alt_mu[self.reached_status]

        mb_obs[:, step] = deepcopy(self.obs)
        mb_act[:, step] = actions
        mb_mus[:, step, :] = mus
        mb_masks[:, step] = deepcopy(self.dones)

        obs, rewards, dones, infos = self.env.step(actions)
        try:
            check_infos(infos)
        except ValueError:
            logger.warn("warning! wrong infos! program continues anyway")
            logger.info("infos:{}, dones:{}, acer_step:{}".format(infos, dones, acer_step))
            logger.info("please debug it in runner_data/data.pkl")
            self.recorder.store(infos)
            self.recorder.dump()
        for info in infos:
            info.update({"source": self.name})

        enc_obs.append(obs[..., -self.nc:])
        mb_dones[:, step] = dones
        mb_ext_rew[:, step] = rewards
        self.episode_reward_to_go[self.reached_status] += rewards[self.reached_status]
        mb_obs_infos[:, step] = np.asarray(infos, dtype=object)
        mb_goals[:, step] = deepcopy(self.goals)
        mb_goal_infos[:, step] = deepcopy(self.goal_info)
        self.episode_step += 1

        # state information for stateful models like LSTM
        self.states = states
        self.dones = dones
        self.obs = obs

        # check reached
        if self.sample_goal:
            for env_idx in range(self.nenv):
                if not self.reached_status[env_idx]:
                    if self.dist_type == "l1":
                        self.reached_status[env_idx] = self.check_goal_reached_v2(
                            infos[env_idx], self.goal_info[env_idx])
                    else:
                        raise NotImplementedError("I do not know how to compute goal_latent")
                    if self.reached_status[env_idx]:
                        reached_step[env_idx] = step
                        self.episode_reached_step[env_idx] = deepcopy(self.episode_step[env_idx])

        # check done
        done_step[self.dones] = step

        # revise goal
        if not self.sample_goal:
            for env_idx in range(self.nenv):
                if self.dones[env_idx]:
                    # (- - done(t)) -> (done, done, done(t))
                    start, end = 0, step + 1
                    if self.goal_as_image:
                        mb_goals[env_idx, start:end] = mb_obs[env_idx, step]
                    else:
                        mb_goals[env_idx, start:end] = self.goal_to_embedding(infos[env_idx])
                    mb_goal_infos[env_idx, start:end] = infos[env_idx]
                elif step == self.nsteps - 1:
                    if done_step[env_idx] is None:
                        # (- - t) -> (t, t, t)
                        start = 0
                    else:
                        # (- - done - - t) -> (- - - t, t, t)
                        start = done_step[env_idx] + 1
                    end = step + 1
                    if end == start:
                        continue
                    if self.goal_as_image:
                        mb_goals[env_idx, start:end] = mb_obs[env_idx, step]
                    else:
                        mb_goals[env_idx, start:end] = self.goal_to_embedding(infos[env_idx])
                    mb_goal_infos[env_idx, start:end] = infos[env_idx]
        else:
            for env_idx in range(self.nenv):
                if step != self.nsteps - 1:
                    # dones is an instant variable but reached_status is a transitive variable
                    if self.dones[env_idx] and self.reached_status[env_idx]:
                        if reached_step[env_idx] is None:
                            # reach|[- - done] -> [done, done, done]
                            start, end = 0, step + 1
                            if self.goal_as_image:
                                mb_goals[env_idx, start:end] = mb_obs[env_idx, step]
                            else:
                                mb_goals[env_idx, start:end] = self.goal_to_embedding(infos[env_idx])
                            mb_goal_infos[env_idx, start:end] = infos[env_idx]
                        else:
                            # [- - reach(done)] -> [- - -]  if reached_step[env_idx] == step
                            # [- - reach - - done] -> [- - - done done done]
                            start, end = reached_step[env_idx] + 1, step + 1
                            if end == start:
                                continue
                            if self.goal_as_image:
                                mb_goals[env_idx, start:end] = mb_obs[env_idx, step]
                            else:
                                mb_goals[env_idx, start:end] = self.goal_to_embedding(infos[env_idx])
                            mb_goal_infos[env_idx, start:end] = infos[env_idx]
                    elif not self.dones[env_idx] and self.reached_status[env_idx]:
                        # reached|[ - - -]   if reached_step[env_idx] is None
                        # [- - reached - -]  if reached_step[env_idx] is not None
                        pass
                    else:
                        # [- - - done]   if self.dones[env_idx] and not self.reached_status[env_idx]
                        # [- - - - -]    if not self.dones[env_idx] and not self.reached_status[env_idx]
                        pass
                else:
                    if self.dones[env_idx] and self.reached_status[env_idx]:
                        if reached_step[env_idx] is None:
                            # reach|[- - done(t)] -> [done, done, done(t)]
                            start, end = 0, step + 1
                            if self.goal_as_image:
                                mb_goals[env_idx, start:end] = mb_obs[env_idx, step]
                            else:
                                mb_goals[env_idx, start:end] = self.goal_to_embedding(infos[env_idx])
                            mb_goal_infos[env_idx, start:end] = infos[env_idx]
                        else:
                            # [- - reach(done)(t)] -> [- - -]
                            # [- - reach - - done(t)] -> [- - - done done done(t)]
                            start, end = reached_step[env_idx] + 1, step + 1
                            if end == start:
                                continue
                            if self.goal_as_image:
                                mb_goals[env_idx, start:end] = mb_obs[env_idx, step]
                            else:
                                mb_goals[env_idx, start:end] = self.goal_to_embedding(infos[env_idx])
                            mb_goal_infos[env_idx, start:end] = infos[env_idx]
                    elif not self.dones[env_idx] and self.reached_status[env_idx]:
                        if reached_step[env_idx] is None:
                            # reached|[ - - t] -> reached|[t t t]
                            start, end = 0, step + 1
                        else:
                            # reached[- - r - -] -> reached|[- - - t t]
                            start, end = reached_step[env_idx] + 1, step + 1
                        if end == start:
                            continue
                        if self.goal_as_image:
                            mb_goals[env_idx, start:end] = mb_obs[env_idx, step]
                        else:
                            mb_goals[env_idx, start:end] = self.goal_to_embedding(infos[env_idx])
                    else:
                        # [- - - done(t)]  if self.dones[env_idx] and not self.reached_status[env_idx]
                        # [- - - - (t)]    if not self.dones[env_idx] and not self.reached_status[env_idx]
                        pass

        # summary
        for env_idx in range(self.nenv):
            info = infos[env_idx]
            if self.dones[env_idx]:
                assert info.get("episode")
                if info.get("episode"):
                    episode_infos[env_idx]["episode"] = info.get("episode")
                if not self.sample_goal:
                    episode_infos[env_idx]["reached_info"] = dict(
                        source=self.name, x_pos=infos[env_idx]["x_pos"], y_pos=infos[env_idx]["y_pos"])
                else:
                    if self.reached_status[env_idx]:
                        reached = 1.0
                        time_ratio = self.episode_reached_step[env_idx] / self.episode_step[env_idx]
                        achieved_pos = {"x_pos": infos[env_idx]["x_pos"], "y_pos": infos[env_idx]["y_pos"]}
                        mem = dict(env=env_idx, is_succ=True, goal=self.goal_info[env_idx],
                                   final_pos=achieved_pos, timestep=acer_step,
                                   episode=self.episode[env_idx], step=self.episode_step[env_idx])
                        self.recorder.store(mem)
                        self.log(mem)
                        abs_dist = 10
                    else:
                        reached = 0.0
                        time_ratio = 1.0
                        achieved_pos = {"x_pos": infos[env_idx]["x_pos"], "y_pos": infos[env_idx]["y_pos"]}
                        mem = dict(env=env_idx, is_succ=False, goal=self.goal_info[env_idx],
                                   final_pos=achieved_pos, timestep=acer_step,
                                   episode=self.episode[env_idx], step=self.episode_step[env_idx])
                        self.recorder.store(mem)
                        self.log(mem)
                        abs_dist = abs(float(infos[env_idx]["x_pos"]) - float(self.goal_info[env_idx]["x_pos"])) + \
                                   abs(float(infos[env_idx]["y_pos"]) - float(self.goal_info[env_idx]["y_pos"]))
                    episode_infos[env_idx]["reached_info"] = dict(
                        reached=reached, time_ratio=time_ratio, abs_dist=abs_dist, source=self.name,
                        x_pos=infos[env_idx]["x_pos"], y_pos=infos[env_idx]["y_pos"])
                    episode_infos[env_idx]["goal_info"] = dict(
                        x_pos=self.goal_info[env_idx]["x_pos"], y_pos=self.goal_info[env_idx]["y_pos"],
                        source=self.goal_info[env_idx]["source"],
                        reward_to_go=self.episode_reward_to_go[env_idx])
                    # re-plan goal
                    goal_obs, goal_info = self.dynamics.get_goal(nb_goal=1)
                    if self.goal_as_image:
                        self.goals[env_idx] = goal_obs[0]
                    else:
                        self.goals[env_idx] = self.goal_to_embedding(goal_info[0])
                    self.goal_info[env_idx] = goal_info[0]
                self.episode[env_idx] += 1
                self.episode_step[env_idx] = 0
                self.episode_reached_step[env_idx] = 0
                self.reached_status[env_idx] = False
                self.episode_reward_to_go[env_idx] = 0

    # next obs and next goal
    mb_obs[:, -1] = deepcopy(self.obs)
    mb_goals[:, -1] = mb_goals[:, -2]  # we cannot use self.goal since it may be revised

    if self.dist_type == "l2":
        raise NotImplementedError
    else:
        mb_int_rewards = self.reward_fn(mb_obs_infos, mb_goal_infos)

    # shapes are adjusted to [nenv, nsteps, []]
    enc_obs = np.asarray(enc_obs, dtype=self.obs_dtype).swapaxes(1, 0)
    self.recorder.dump()
    results = dict(
        enc_obs=enc_obs,
        obs=mb_obs,
        actions=mb_act,
        ext_rewards=mb_ext_rew,
        mus=mb_mus,
        dones=mb_dones,
        masks=mb_masks,
        obs_infos=mb_obs_infos,  # nenv, nsteps; two purposes: 1) put into dynamics; 2) put into buffer
        episode_infos=episode_infos,
        goal_obs=mb_goals,  # nenv, nsteps+1
        goal_infos=mb_goal_infos,
        int_rewards=mb_int_rewards,
    )
    return results
def render(self, mode='human'):
    logger.warn('Render not defined for %s' % self)
def learn(*, network, env, total_timesteps,
          seed=None,
          eval_env=None,
          replay_strategy='future',
          policy_save_interval=5,
          clip_return=True,
          demo_file=None,
          override_params=None,
          load_path=None,
          save_path=None,
          **kwargs):

    override_params = override_params or {}
    if MPI is not None:
        rank = MPI.COMM_WORLD.Get_rank()
        num_cpu = MPI.COMM_WORLD.Get_size()

    # Seed everything.
    rank_seed = seed + 1000000 * rank if seed is not None else None
    set_global_seeds(rank_seed)

    # Prepare params.
    params = config.DEFAULT_PARAMS
    env_name = env.specs[0].id
    params['env_name'] = env_name
    params['replay_strategy'] = replay_strategy
    if env_name in config.DEFAULT_ENV_PARAMS:
        params.update(config.DEFAULT_ENV_PARAMS[env_name])  # merge env-specific parameters in
    params.update(**override_params)  # makes it possible to override any parameter
    with open(os.path.join(logger.get_dir(), 'params.json'), 'w') as f:
        json.dump(params, f)
    params = config.prepare_params(params)
    params['rollout_batch_size'] = env.num_envs

    if demo_file is not None:
        params['bc_loss'] = 1
    params.update(kwargs)

    config.log_params(params, logger=logger)

    if num_cpu == 1:
        logger.warn()
        logger.warn('*** Warning ***')
        logger.warn(
            'You are running HER with just a single MPI worker. This will work, but the ' +
            'experiments that we report in Plappert et al. (2018, https://arxiv.org/abs/1802.09464) ' +
            'were obtained with --num_cpu 19. This makes a significant difference and if you ' +
            'are looking to reproduce those results, be aware of this. Please also refer to ' +
            'https://github.com/openai/baselines/issues/314 for further details.')
        logger.warn('****************')
        logger.warn()

    dims = config.configure_dims(params)
    policy = config.configure_ddpg(dims=dims, params=params, clip_return=clip_return)
    if load_path is not None:
        tf_util.load_variables(load_path)

    rollout_params = {
        'exploit': False,
        'use_target_net': False,
        'use_demo_states': True,
        'compute_Q': False,
        'T': params['T'],
    }

    eval_params = {
        'exploit': True,
        'use_target_net': params['test_with_polyak'],
        'use_demo_states': False,
        'compute_Q': True,
        'T': params['T'],
    }

    for name in ['T', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps']:
        rollout_params[name] = params[name]
        eval_params[name] = params[name]

    eval_env = eval_env or env

    rollout_worker = RolloutWorker(env, policy, dims, logger, monitor=True, **rollout_params)
    evaluator = RolloutWorker(eval_env, policy, dims, logger, **eval_params)

    n_cycles = params['n_cycles']
    n_epochs = total_timesteps // n_cycles // rollout_worker.T // rollout_worker.rollout_batch_size

    return train(
        save_path=save_path, policy=policy, rollout_worker=rollout_worker,
        evaluator=evaluator, n_epochs=n_epochs, n_test_rollouts=params['n_test_rollouts'],
        n_cycles=params['n_cycles'], n_batches=params['n_batches'],
        policy_save_interval=policy_save_interval, demo_file=demo_file)
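# Worked example (illustrative values, not from the original code) of the n_epochs
# arithmetic at the end of learn() above: the timestep budget is divided by the cycles
# per epoch, the episode horizon T, and the episodes collected per cycle per worker.
total_timesteps = 5_000_000
n_cycles = 50            # params['n_cycles']
T = 50                   # rollout_worker.T, the episode horizon
rollout_batch_size = 2   # rollout_worker.rollout_batch_size
n_epochs = total_timesteps // n_cycles // T // rollout_batch_size
print(n_epochs)  # -> 1000; one epoch = n_cycles * rollout_batch_size episodes of length T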
def launch(env_name, logdir, n_epochs, num_cpu, seed, replay_strategy, policy_save_interval,
           clip_return, override_params={}, save_policies=True):
    # Fork for multi-CPU MPI implementation.
    if num_cpu > 1:
        whoami = mpi_fork(num_cpu)
        if whoami == 'parent':
            sys.exit(0)
        import baselines.common.tf_util as U
        U.single_threaded_session().__enter__()
    rank = MPI.COMM_WORLD.Get_rank()

    # Configure logging
    if rank == 0:
        if logdir or logger.get_dir() is None:
            logger.configure(dir=logdir)
    else:
        logger.configure()
    logdir = logger.get_dir()
    assert logdir is not None
    os.makedirs(logdir, exist_ok=True)

    # Seed everything.
    rank_seed = seed + 1000000 * rank
    set_global_seeds(rank_seed)

    # Prepare params.
    params = config.DEFAULT_PARAMS
    params['env_name'] = env_name
    params['replay_strategy'] = replay_strategy
    if env_name in config.DEFAULT_ENV_PARAMS:
        params.update(config.DEFAULT_ENV_PARAMS[env_name])  # merge env-specific parameters in
    params.update(**override_params)  # makes it possible to override any parameter
    with open(os.path.join(logger.get_dir(), 'params.json'), 'w') as f:
        json.dump(params, f)
    params = config.prepare_params(params)
    config.log_params(params, logger=logger)

    if num_cpu == 1:
        logger.warn()
        logger.warn('*** Warning ***')
        logger.warn(
            'You are running HER with just a single MPI worker. This will work, but the ' +
            'experiments that we report in Plappert et al. (2018, https://arxiv.org/abs/1802.09464) ' +
            'were obtained with --num_cpu 19. This makes a significant difference and if you ' +
            'are looking to reproduce those results, be aware of this. Please also refer to ' +
            'https://github.com/openai/baselines/issues/314 for further details.')
        logger.warn('****************')
        logger.warn()

    dims = config.configure_dims(params)
    policy = config.configure_ddpg(dims=dims, params=params, clip_return=clip_return)

    rollout_params = {
        'exploit': False,
        'use_target_net': False,
        'use_demo_states': True,
        'compute_Q': False,
        'T': params['T'],
    }

    eval_params = {
        'exploit': True,
        'use_target_net': params['test_with_polyak'],
        'use_demo_states': False,
        'compute_Q': True,
        'T': params['T'],
    }

    for name in ['T', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps']:
        rollout_params[name] = params[name]
        eval_params[name] = params[name]

    rollout_worker = RolloutWorker(params['make_env'], policy, dims, logger, **rollout_params)
    rollout_worker.seed(rank_seed)

    evaluator = RolloutWorker(params['make_env'], policy, dims, logger, **eval_params)
    evaluator.seed(rank_seed)

    train(
        logdir=logdir, policy=policy, rollout_worker=rollout_worker,
        evaluator=evaluator, n_epochs=n_epochs, n_test_rollouts=params['n_test_rollouts'],
        n_cycles=params['n_cycles'], n_batches=params['n_batches'],
        policy_save_interval=policy_save_interval, save_policies=save_policies)
def render(self):
    logger.warn('Render not defined for %s' % self)
def learn(*, network, env, total_timesteps,
          seed=None,
          eval_env=None,
          replay_strategy='future',
          policy_save_interval=5,
          clip_return=True,
          demo_file=None,
          override_params=None,
          load_path=None,
          save_path=None,
          params=None,
          **kwargs):

    override_params = override_params or {}
    if MPI is not None:
        rank = MPI.COMM_WORLD.Get_rank()
        num_cpu = MPI.COMM_WORLD.Get_size()

    # Seed everything.
    rank_seed = seed + 1000000 * rank if seed is not None else None
    set_global_seeds(rank_seed)

    # Prepare params.
    params = {
        # env
        'max_u': 1.,  # max absolute value of actions on different coordinates
        # ddpg
        'layers': 3,  # number of layers in the critic/actor networks
        'hidden': 256,  # number of neurons in each hidden layer
        'network_class': 'baselines.her.actor_critic:ActorCritic',
        'Q_lr': 0.001,  # critic learning rate
        'pi_lr': 0.001,  # actor learning rate
        'buffer_size': int(1E6),  # for experience replay
        'polyak': 0.95,  # polyak averaging coefficient
        'action_l2': 1.0,  # quadratic penalty on actions (before rescaling by max_u)
        'clip_obs': 200.,
        'scope': 'ddpg',  # can be tweaked for testing
        'relative_goals': False,
        # training
        'n_cycles': 50,  # per epoch
        'rollout_batch_size': 2,  # per mpi thread
        'n_batches': 40,  # training batches per cycle
        'batch_size': 256,  # per mpi thread, measured in transitions and reduced to even multiple of chunk_length
        'n_test_rollouts': 10,  # number of test rollouts per epoch, each consists of rollout_batch_size rollouts
        'test_with_polyak': False,  # run test episodes with the target network
        # exploration
        'random_eps': 0.2,  # percentage of time a random action is taken
        'noise_eps': 0.3,  # std of gaussian noise added to not-completely-random actions as a percentage of max_u
        # HER
        'replay_strategy': 'future',  # supported modes: future, none
        'replay_k': 4,  # number of additional goals used for replay, only used if off_policy_data=future
        # normalization
        'norm_eps': 0.01,  # epsilon used for observation normalization
        'norm_clip': 5,  # normalized observations are cropped to this value
        'bc_loss': 0,  # whether or not to use the behavior cloning loss as an auxiliary loss
        'q_filter': 0,  # whether or not a Q value filter should be used on the actor outputs
        'num_demo': 25,  # number of expert demo episodes
        'demo_batch_size': 128,  # number of samples to be used from the demonstrations buffer, per mpi thread 128/1024 or 32/256
        'prm_loss_weight': 0.001,  # weight corresponding to the primary loss
        'aux_loss_weight': 0.0078,  # weight corresponding to the auxiliary loss, also called the cloning loss
        'perturb': kwargs['pert_type'],
        'n_actions': kwargs['n_actions'],
    }
    params['replay_strategy'] = replay_strategy
    if env is not None:
        env_name = env.spec.id
        params['env_name'] = env_name
        if env_name in config.DEFAULT_ENV_PARAMS:
            params.update(config.DEFAULT_ENV_PARAMS[env_name])  # merge env-specific parameters in
    else:
        params['env_name'] = 'NuFingers_Experiment'
    params.update(**override_params)  # makes it possible to override any parameter
    with open(os.path.join(logger.get_dir(), 'params.json'), 'w') as f:
        json.dump(params, f)

    if demo_file is not None:
        params['bc_loss'] = 1
        params['q_filter'] = 1
        params['n_cycles'] = 20
        params['random_eps'] = 0.1  # chip: ON
        params['noise_eps'] = 0.1  # chip: ON
        # params['batch_size']: 1024

    params = config.prepare_params(params)
    params['rollout_batch_size'] = 1
    params.update(kwargs)
    config.log_params(params, logger=logger)

    if num_cpu == 1:
        logger.warn()
        logger.warn('*** Warning ***')
        logger.warn(
            'You are running HER with just a single MPI worker. This will work, but the ' +
            'experiments that we report in Plappert et al. (2018, https://arxiv.org/abs/1802.09464) ' +
            'were obtained with --num_cpu 19. This makes a significant difference and if you ' +
            'are looking to reproduce those results, be aware of this. Please also refer to ' +
            'https://github.com/openai/baselines/issues/314 for further details.')
        logger.warn('****************')
        logger.warn()

    if env is not None:
        dims = config.configure_dims(params)
    else:
        dims = dict(o=15, u=4, g=7, info_is_success=1)
    policy = config.configure_ddpg(dims=dims, params=params, clip_return=clip_return)
    if load_path is not None:
        tf_util.load_variables(load_path)

    rollout_params = {
        'exploit': False,
        'use_target_net': False,
        'use_demo_states': True,
        'compute_Q': False,
        'T': params['T'],
    }

    eval_params = {
        'exploit': True,
        'use_target_net': params['test_with_polyak'],
        'use_demo_states': False,
        'compute_Q': True,
        'T': params['T'],
    }

    for name in ['T', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps']:
        rollout_params[name] = params[name]
        eval_params[name] = params[name]

    eval_env = eval_env or env

    print("NAME={}".format(params['env_name']))
    print(rollout_params)
    if params['env_name'].find('NuFingers_Experiment') == -1:
        rollout_worker = RolloutWorker(env, policy, dims, logger, monitor=True, **rollout_params)
        evaluator = RolloutWorker(eval_env, policy, dims, logger, **eval_params)
    else:
        rollout_worker = RolloutNuFingers(policy, dims, logger, monitor=True, **rollout_params)
        evaluator = RolloutNuFingers(policy, dims, logger, **eval_params)

    n_cycles = params['n_cycles']
    n_epochs = total_timesteps // n_cycles // rollout_worker.T // rollout_worker.rollout_batch_size

    return train(
        save_path=save_path, policy=policy, rollout_worker=rollout_worker,
        evaluator=evaluator, n_epochs=n_epochs, n_test_rollouts=params['n_test_rollouts'],
        n_cycles=params['n_cycles'], n_batches=params['n_batches'],
        policy_save_interval=policy_save_interval, demo_file=demo_file)
def _update_one_trans_policy(self, seg, pi, adam, loss, zero_grad, update_trans_oldpi, num_batches):
    ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg["tdlamret"]
    cur_primitive, term = seg["cur_primitive"], seg["term"]

    if self._is_chef:
        info = defaultdict(list)

    optim_batchsize = min(self._optim_batchsize, ob.shape[0])
    logger.log("Optimizing trans_{}... {} epochs * {} batches * {} batchsize <- {} data".format(
        pi.env_name, self._optim_epochs, num_batches, optim_batchsize, ob.shape[0]))

    if np.shape(ob)[0] == 0:
        logger.warn('[!] No transition is used')
        for ob_name in pi.ob_type:
            pi.ob_rms[ob_name].noupdate()
        blank = zero_grad()
        for _ in range(self._optim_epochs):
            for _ in range(num_batches):
                adam.update(blank, self._optim_stepsize * self._cur_lrmult)
        return None

    # normalize advantage
    atarg = (atarg - atarg.mean()) / max(atarg.std(), 0.000001)

    # prepare batches
    d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret,
                     cur_primitive=cur_primitive, term=term), shuffle=True)
    ob_dict = self._env.get_ob_dict(ob)
    for ob_name in pi.ob_type:
        pi.ob_rms[ob_name].update(ob_dict[ob_name])
    update_trans_oldpi()

    b = 0
    for _ in range(self._optim_epochs):
        for batch in d.iterate_once(optim_batchsize):
            if b == self._optim_epochs * num_batches:
                break
            ob_list = pi.get_ob_list(batch["ob"])
            fetched = loss(self._cur_lrmult, batch["cur_primitive"], batch["ac"],
                           batch["atarg"], batch["vtarg"], batch["term"], *ob_list)
            adam.update(fetched['g'], self._optim_stepsize * self._cur_lrmult)
            b += 1
            if self._is_chef:
                for key, value in fetched.items():
                    if key != 'g':
                        if np.isscalar(value):
                            info[key].append(value)
                        else:
                            info[key].extend(value)
                    else:
                        grad_norm_value = np.linalg.norm(value)
                        info['grad_norm'].append(grad_norm_value)
                        info['grad_norm_clipped'].append(np.clip(
                            grad_norm_value, 0, self._config.trans_max_grad_norm))

    blank = zero_grad()
    for _ in range(self._optim_epochs * num_batches - b):
        adam.update(blank, self._optim_stepsize * self._cur_lrmult)

    term_pred = seg["term"]
    batchsize = optim_batchsize

    if self._is_chef:
        info['term_pred'] = [np.mean(term_pred)]
        info['batch_size'] = [batchsize]
        return info
    return None
def _update_one_proximity_predictor_network(self, seg, proximity, adam, loss, zero_grad,
                                            num_batches, idx, only_use_trans_term_state=False):
    ob = seg["ob"]
    cur_primitive = seg["cur_primitive"]
    success = seg['success']

    logger.log("\033[93m" + "proximity_predictor_{}".format(proximity.env_name) + "\033[0m")

    # remove the info for meta task from ob
    # Assumption: info for meta task is always appended in the end of ob
    # use the states that match the proximity_predictor idx
    if len(cur_primitive == idx) > 0:
        ob = ob[:, :proximity.observation_shape]
        ob_success_final = ob[((cur_primitive == idx) * success * seg['term']) == 1]
        ob_success_intermediate = ob[((cur_primitive == idx) * success * (1 - seg['term'])) == 1]
        ob_fail_final = ob[((cur_primitive == idx) * (1 - success) * seg['term']) == 1]
        ob_fail_intermediate = ob[((cur_primitive == idx) * (1 - success) * (1 - seg['term'])) == 1]

        final_state = seg['term'][(cur_primitive == idx) * success == 1]
        for i in range(final_state.shape[0] - 1, -1, -1):
            if final_state[i] != 1:
                final_state[i] = final_state[i + 1] + 1
        final_state = final_state[final_state != 1]
        final_state = final_state - 1
        if self._config.proximity_weight_decay_linear:
            ob_success_intermediate_weight = (self._config.trans_duration - final_state) / self._config.trans_duration
        else:
            ob_success_intermediate_weight = self._config.proximity_weight_decay_rate ** final_state
        ob_success_intermediate_weight = ob_success_intermediate_weight.reshape(
            (ob_success_intermediate.shape[0], 1))

        # proximity hist
        rew_success_final = proximity.proximity(ob_success_final)
        rew_success_intermediate = proximity.proximity(ob_success_intermediate)
        rew_fail_final = proximity.proximity(ob_fail_final)
        rew_fail_intermediate = proximity.proximity(ob_fail_intermediate)

        logger.log("  ob_success (final {}, intermediate {}) ob_fail (final {}, intermediate {})".format(
            ob_success_final.shape[0], ob_success_intermediate.shape[0],
            ob_fail_final.shape[0], ob_fail_intermediate.shape[0]))

        # add [obs, label]
        proximity.fail_buffer.add(np.concatenate(
            (ob_fail_final, np.zeros(shape=[ob_fail_final.shape[0], 1])), axis=1))
        proximity.success_buffer.add(np.concatenate(
            (ob_success_final, np.ones(shape=[ob_success_final.shape[0], 1])), axis=1))
        proximity.fail_buffer.add(np.concatenate(
            (ob_fail_intermediate, np.zeros(shape=[ob_fail_intermediate.shape[0], 1])), axis=1))
        proximity.success_buffer.add(np.concatenate(
            (ob_success_intermediate,
             np.ones(shape=[ob_success_intermediate.shape[0], 1]) * ob_success_intermediate_weight), axis=1))

        if proximity.ob_rms:
            proximity.ob_rms.update(ob)
    else:
        ob_success_final = np.zeros(shape=(0, 0))
        ob_success_intermediate = np.zeros(shape=(0, 0))
        ob_fail_final = np.zeros(shape=(0, 0))
        ob_fail_intermediate = np.zeros(shape=(0, 0))
        if proximity.ob_rms:
            proximity.ob_rms.noupdate()

    num_state = ob.shape[0]
    optim_batchsize = self._optim_batchsize

    if 0 in [proximity.fail_buffer.size(), proximity.success_buffer.size()]:
        logger.warn('[!] No transition is used. So the proximity_predictor is not trained.')
        blank = zero_grad()
        for _ in range(self._optim_proximity_epochs * num_batches):
            adam.update(blank, self._optim_proximity_stepsize * self._cur_proximity_lrmult)
        return None

    logger.log("Optimizing proximity_predictor_{}... {} epochs * {} batches * {} batchsize <- {}/{} data".format(
        proximity.env_name, self._optim_proximity_epochs, num_batches,
        optim_batchsize, ob.shape[0], num_state))

    if self._is_chef:
        info = defaultdict(list)

    # update proximity_predictor with replay buffer
    current_iter_loss = []
    for _ in range(self._optim_proximity_epochs * num_batches):
        sampled_fail_states = proximity.sample_fail_batch(optim_batchsize)
        sampled_success_states = proximity.sample_success_batch(optim_batchsize)
        fetched = loss(sampled_fail_states, sampled_success_states)
        current_iter_loss.append(fetched['fake_loss'] + fetched['real_loss'])
        adam.update(fetched['g'], self._optim_proximity_stepsize * self._cur_proximity_lrmult)
        if self._is_chef:
            for key, value in fetched.items():
                if key != 'g':
                    if np.isscalar(value):
                        info[key].append(value)
                    else:
                        info[key].extend(value)
                else:
                    grad_norm_value = np.linalg.norm(value)
                    info['grad_norm'].append(grad_norm_value)
                    info['grad_norm_clipped'].append(np.clip(
                        grad_norm_value, 0, self._config.proximity_max_grad_norm))

    proximity.last_iter_loss = np.average(current_iter_loss)

    if self._is_chef:
        logger.warn('proximity.last_iter_loss: {}'.format(proximity.last_iter_loss))
        info['batch_size'] = [optim_batchsize]
        info['buffer_size_success_final'] = [proximity.success_buffer.size()]
        info['buffer_size_fail_final'] = [proximity.fail_buffer.size()]

        # hist summary
        if len(cur_primitive == idx) > 0:
            if rew_success_final.shape[0] > 0:
                info['hist_success_final'] = rew_success_final
            if rew_success_intermediate.shape[0] > 0:
                info['hist_success_intermediate'] = rew_success_intermediate
            if rew_fail_final.shape[0] > 0:
                info['hist_fail_final'] = rew_fail_final
            if rew_fail_intermediate.shape[0] > 0:
                info['hist_fail_intermediate'] = rew_fail_intermediate
        return info
    return None
def render(self):
    logger.warn('Render not defined for %s' % self)
def launch(env, trial_id, n_epochs, num_cpu, seed, policy_save_interval, clip_return, normalize_obs,
           structure, task_selection, goal_selection, goal_replay, task_replay, perturb,
           save_policies=True):
    # Fork for multi-CPU MPI implementation.
    if num_cpu > 1:
        try:
            whoami = mpi_fork(num_cpu, ['--bind-to', 'core'])
        except CalledProcessError:
            # fancy version of mpi call failed, try simple version
            whoami = mpi_fork(num_cpu)
        if whoami == 'parent':
            sys.exit(0)
        import baselines.common.tf_util as U
        U.single_threaded_session().__enter__()
    rank = MPI.COMM_WORLD.Get_rank()

    # Configure logging
    if rank == 0:
        save_dir = find_save_path('./save/' + env + "/", trial_id)
        logger.configure(dir=save_dir)
    else:
        save_dir = None

    # Seed everything.
    rank_seed = seed + 1000000 * rank
    set_global_seeds(rank_seed)

    # Prepare params, add main function arguments and log all parameters
    if structure == 'curious' or structure == 'task_experts':
        params = config.MULTI_TASK_PARAMS
    else:
        params = config.DEFAULT_PARAMS

    time = str(datetime.datetime.now())
    params['time'] = time
    params['env_name'] = env
    params['task_selection'] = task_selection
    params['goal_selection'] = goal_selection
    params['task_replay'] = task_replay
    params['goal_replay'] = goal_replay
    params['structure'] = structure
    params['normalize_obs'] = normalize_obs
    params['num_cpu'] = num_cpu
    params['clip_return'] = clip_return
    params['trial_id'] = trial_id
    params['seed'] = seed
    if rank == 0:
        with open(os.path.join(logger.get_dir(), 'params.json'), 'w') as f:
            json.dump(params, f)
    params = config.prepare_params(params)
    params['ddpg_params']['normalize_obs'] = normalize_obs
    if rank == 0:
        config.log_params(params, logger=logger)

    if num_cpu != 1:
        logger.warn()
        logger.warn('*** Warning ***')
        logger.warn(
            'You are running HER with just a single MPI worker. This will work, but the ' +
            'experiments that we report in Colas et al. (2018, https://arxiv.org/abs/1810.06284) ' +
            'were obtained with --num_cpu 19. This makes a significant difference and if you ' +
            'are looking to reproduce those results, be aware of this.')
        logger.warn('****************')
        logger.warn()

    dims = config.configure_dims(params)
    buffers = config.configure_buffer(dims=dims, params=params)

    # creates several policies with shared buffers in the task-experts structure, otherwise use just one policy
    if structure == 'task_experts':
        policy = [config.configure_ddpg(dims=dims, params=params, buffers=buffers,
                                        clip_return=clip_return, t_id=i)
                  for i in range(params['nb_tasks'])]
    else:
        policy = config.configure_ddpg(dims=dims, params=params, buffers=buffers, clip_return=clip_return)

    rollout_params = {
        'exploit': False,
        'use_target_net': False,
        'use_demo_states': True,
        'compute_Q': False,
        'T': params['T'],
        'structure': structure,
        'task_selection': task_selection,
        'goal_selection': goal_selection,
        'queue_length': params['queue_length'],
        'eval': False,
        'eps_task': params['eps_task'],
    }

    eval_params = {
        'exploit': True,
        'use_target_net': params['test_with_polyak'],
        'use_demo_states': False,
        'compute_Q': True,
        'T': params['T'],
        'structure': structure,
        'task_selection': task_selection,
        'goal_selection': goal_selection,
        'queue_length': params['queue_length'],
        'eval': True,
    }

    for name in ['T', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps']:
        rollout_params[name] = params[name]
        eval_params[name] = params[name]

    if structure == 'task_experts':
        # create one rollout worker per policy/task
        rollout_worker = [RolloutWorker(params['make_env'], policy[i], dims, logger, unique_task=i,
                                        **rollout_params) for i in range(params['nb_tasks'])]
        for i in range(params['nb_tasks']):
            rollout_worker[i].seed(rank_seed + i)
    else:
        rollout_worker = RolloutWorker(params['make_env'], policy, dims, logger, **rollout_params)
        rollout_worker.seed(rank_seed)

    evaluator = RolloutWorker(params['make_env'], policy, dims, logger, **eval_params)
    evaluator.seed(rank_seed + 100)

    train(logdir=save_dir, policy=policy, rollout_worker=rollout_worker, evaluator=evaluator,
          n_epochs=n_epochs, n_test_rollouts=params['n_test_rollouts'], n_cycles=params['n_cycles'],
          n_batches=params['n_batches'], perturbation_study=perturb,
          policy_save_interval=policy_save_interval, save_policies=save_policies,
          structure=structure, task_selection=task_selection, params=params)
def launch(env_name, logdir, n_epochs, num_cpu, seed, replay_strategy, policy_save_interval,
           clip_return, override_params={}, save_policies=True):
    # Fork for multi-CPU MPI implementation.
    if num_cpu > 1:
        whoami = mpi_fork(num_cpu)
        if whoami == 'parent':
            sys.exit(0)
        import baselines.common.tf_util as U
        U.single_threaded_session().__enter__()
    rank = MPI.COMM_WORLD.Get_rank()

    # Configure logging
    if rank == 0:
        if logdir or logger.get_dir() is None:
            logger.configure(dir=logdir)
    else:
        logger.configure()
    logdir = logger.get_dir()
    assert logdir is not None
    os.makedirs(logdir, exist_ok=True)

    # Seed everything.
    rank_seed = seed + 1000000 * rank
    set_global_seeds(rank_seed)

    # Prepare params.
    params = config.DEFAULT_PARAMS
    params['env_name'] = env_name
    params['replay_strategy'] = replay_strategy
    if env_name in config.DEFAULT_ENV_PARAMS:
        params.update(config.DEFAULT_ENV_PARAMS[env_name])  # merge env-specific parameters in
    params.update(**override_params)  # makes it possible to override any parameter
    with open(os.path.join(logger.get_dir(), 'params.json'), 'w') as f:
        json.dump(params, f)
    params = config.prepare_params(params)
    config.log_params(params, logger=logger)

    if num_cpu == 1:
        logger.warn()
        logger.warn('*** Warning ***')
        logger.warn(
            'You are running HER with just a single MPI worker. This will work, but the ' +
            'experiments that we report in Plappert et al. (2018, https://arxiv.org/abs/1802.09464) ' +
            'were obtained with --num_cpu 19. This makes a significant difference and if you ' +
            'are looking to reproduce those results, be aware of this. Please also refer to ' +
            'https://github.com/openai/baselines/issues/314 for further details.')
        logger.warn('****************')
        logger.warn()

    dims = config.configure_dims(params)
    policy = config.configure_ddpg(dims=dims, params=params, clip_return=clip_return)

    rollout_params = {
        'exploit': False,
        'use_target_net': False,
        'use_demo_states': True,
        'compute_Q': False,
        'T': params['T'],
        'use_imitation': False,
        'eval_play': False,
    }

    eval_params = {
        'exploit': True,
        'use_target_net': params['test_with_polyak'],
        'use_demo_states': False,
        'compute_Q': True,
        'T': params['T'],
        'use_imitation': False,
        'eval_play': False,
    }

    for name in ['T', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps']:
        rollout_params[name] = params[name]
        eval_params[name] = params[name]

    rollout_worker = RolloutWorker(params['make_env'], policy, dims, logger, **rollout_params)
    rollout_worker.seed(rank_seed)

    evaluator = RolloutWorker(params['make_env'], policy, dims, logger, **eval_params)
    evaluator.seed(rank_seed)

    train(logdir=logdir, policy=policy, rollout_worker=rollout_worker,
          evaluator=evaluator, n_epochs=n_epochs, n_test_rollouts=params['n_test_rollouts'],
          n_cycles=params['n_cycles'], n_batches=params['n_batches'],
          policy_save_interval=policy_save_interval, save_policies=save_policies)
def learn(*, network, env, total_timesteps,
          seed=None,
          eval_env=None,
          replay_strategy='future',
          policy_save_interval=5,
          clip_return=True,
          demo_file=None,
          override_params=None,
          load_path=None,
          save_path=None,
          **kwargs):

    override_params = override_params or {}
    if MPI is not None:
        rank = MPI.COMM_WORLD.Get_rank()
        num_cpu = MPI.COMM_WORLD.Get_size()

    # Seed everything.
    rank_seed = seed + 1000000 * rank if seed is not None else None
    set_global_seeds(rank_seed)

    # Prepare params.
    params = config.DEFAULT_PARAMS
    env_name = env.spec.id
    params['env_name'] = env_name
    params['replay_strategy'] = replay_strategy
    if env_name in config.DEFAULT_ENV_PARAMS:
        params.update(config.DEFAULT_ENV_PARAMS[env_name])  # merge env-specific parameters in
    params.update(**override_params)  # makes it possible to override any parameter
    with open(os.path.join(logger.get_dir(), 'params.json'), 'w') as f:
        json.dump(params, f)
    params = config.prepare_params(params)
    params['rollout_batch_size'] = env.num_envs

    if demo_file is not None:
        params['bc_loss'] = 1
    params.update(kwargs)

    config.log_params(params, logger=logger)

    if num_cpu == 1:
        logger.warn()
        logger.warn('*** Warning ***')
        logger.warn(
            'You are running HER with just a single MPI worker. This will work, but the ' +
            'experiments that we report in Plappert et al. (2018, https://arxiv.org/abs/1802.09464) ' +
            'were obtained with --num_cpu 19. This makes a significant difference and if you ' +
            'are looking to reproduce those results, be aware of this. Please also refer to ' +
            'https://github.com/openai/baselines/issues/314 for further details.')
        logger.warn('****************')
        logger.warn()

    dims = config.configure_dims(params)
    policy = config.configure_ddpg(dims=dims, params=params, clip_return=clip_return)
    if load_path is not None:
        tf_util.load_variables(load_path)

    rollout_params = {
        'exploit': False,
        'use_target_net': False,
        'use_demo_states': True,
        'compute_Q': False,
        'T': params['T'],
    }

    eval_params = {
        'exploit': True,
        'use_target_net': params['test_with_polyak'],
        'use_demo_states': False,
        'compute_Q': True,
        'T': params['T'],
    }

    for name in ['T', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps']:
        rollout_params[name] = params[name]
        eval_params[name] = params[name]

    eval_env = eval_env or env

    rollout_worker = RolloutWorker(env, policy, dims, logger, monitor=True, **rollout_params)
    evaluator = RolloutWorker(eval_env, policy, dims, logger, **eval_params)

    n_cycles = params['n_cycles']
    n_epochs = total_timesteps // n_cycles // rollout_worker.T // rollout_worker.rollout_batch_size

    return train(
        save_path=save_path, policy=policy, rollout_worker=rollout_worker,
        evaluator=evaluator, n_epochs=n_epochs, n_test_rollouts=params['n_test_rollouts'],
        n_cycles=params['n_cycles'], n_batches=params['n_batches'],
        policy_save_interval=policy_save_interval, demo_file=demo_file)
def __init__(self, name, path, env, ob_env_name, is_train=True,
             use_traj_portion_start=0.0, use_traj_portion_end=1.0, config=None):
    self._scope = 'proximity_predictor/' + name
    self.env_name = name.split('.')[0]
    self._config = config

    # make primitive env for observation
    self._env = make_env(ob_env_name, config)
    self._include_acc = config.proximity_include_acc
    self._ob_shape = self._env.unwrapped.ob_shape
    self.ob_type = sorted(self._env.unwrapped.ob_type)
    if not self._include_acc and 'acc' in self.ob_type:
        self._ob_shape.pop('acc')
        self.ob_type.remove('acc')

    self.obs_norm = config.proximity_obs_norm
    self.observation_shape = np.sum([np.prod(ob) for ob in self._ob_shape.values()])

    # replay buffers
    self.fail_buffer = Replay(max_size=config.proximity_replay_size, name='fail_buffer')
    self.success_buffer = Replay(max_size=config.proximity_replay_size, name='success_buffer')

    # build the architecture
    self._num_hidden_layer = config.proximity_num_hid_layers
    self._hidden_size = config.proximity_hid_size
    self._activation_fn = activation(config.proximity_activation_fn)

    self._build_ph()

    logger.info('===== Proximity_predictor for {} ====='.format(self._scope))

    # load collected states
    if is_train or config.evaluate_proximity_predictor:
        state_file_path = osp.join(config.primitive_dir, path.split('/')[0], 'state')
        logger.info('Search state files from: {}'.format(config.primitive_dir))
        state_file_list = glob.glob(osp.join(state_file_path, '*.hdf5'))
        logger.info('Candidate state files: {}'.format(
            ' '.join([f.split('/')[-1] for f in state_file_list])))
        state_file = {}
        try:
            logger.info('Use state files: {}'.format(state_file_list[0].split('/')[-1]))
            state_file = h5py.File(state_file_list[0], 'r')
        except:
            logger.warn("No collected state hdf5 file is located at {}".format(state_file_path))

        logger.info('Use traj portion: {} to {}'.format(use_traj_portion_start, use_traj_portion_end))
        if self._config.proximity_keep_collected_obs:
            add_obs = self.success_buffer.add_collected_obs
        else:
            add_obs = self.success_buffer.add

        for k in list(state_file.keys()):
            traj_state = state_file[k]['obs'].value
            start_idx = int(traj_state.shape[0] * use_traj_portion_start)
            end_idx = int(traj_state.shape[0] * use_traj_portion_end)
            try:
                if state_file[k]['success'].value == 1:
                    traj_state = traj_state[start_idx:end_idx]
                else:
                    continue
            except:
                traj_state = traj_state[start_idx:end_idx]
            for t in range(traj_state.shape[0]):
                ob = traj_state[t][:self.observation_shape]
                # [ob, label], shape [num_state, dim_state]
                add_obs(np.concatenate((ob, [1.0]), axis=0))

        logger.info('Size of collected state: {}'.format(self.success_buffer.size()))
        logger.info('Average of collected state: {}'.format(
            np.mean(self.success_buffer.list(), axis=0)))

    # build graph
    fail_logits, fail_target_value, success_logits, success_target_value = \
        self._build_graph(self.fail_obs_ph, self.success_obs_ph, reuse=False)

    # compute prob
    fake_prob = tf.reduce_mean(fail_logits)     # should go to 0
    real_prob = tf.reduce_mean(success_logits)  # should go to 1

    # compute loss
    if config.proximity_loss_type == 'lsgan':
        self.fake_loss = tf.reduce_mean((fail_logits - fail_target_value) ** 2)
        self.real_loss = tf.reduce_mean((success_logits - success_target_value) ** 2)
    elif config.proximity_loss_type == 'wgan':
        self.fake_loss = tf.reduce_mean(tf.abs(fail_logits - fail_target_value))
        self.real_loss = tf.reduce_mean(tf.abs(success_logits - success_target_value))

    # loss + accuracy terms
    self.total_loss = self.fake_loss + self.real_loss
    self.losses = {
        "fake_loss": self.fake_loss,
        "real_loss": self.real_loss,
        "fake_prob": fake_prob,
        "real_prob": real_prob,
        "total_loss": self.total_loss,
    }

    # predict proximity
    self._proximity_op = tf.clip_by_value(success_logits, 0, 1)[:, 0]
def _clip_to_action_space(action):
    clipped_action = np.clip(action, 0., 1.)
    actions_equal = clipped_action == action
    if not np.all(actions_equal):
        logger.warn("Had to clip action since it wasn't constrained to the [0,1] action space:", action)
    return clipped_action
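# Quick illustration of _clip_to_action_space with a made-up action vector, assuming it
# runs in the same module as the helper above (so its np and logger references resolve):
# the out-of-range components trigger the warning and are clipped into [0, 1].
action = np.array([0.3, 1.2, -0.1])
clipped = _clip_to_action_space(action)
print(clipped)  # [0.3 1.  0. ]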