def init_demo_buffer(self, demoDataFile, update_stats=True):
    """Initialize the demonstration buffer from a demo data file."""
    demoData = np.load(demoDataFile)  # load the demonstration data from the data file
    info_keys = [key.replace('info_', '') for key in self.input_dims.keys() if key.startswith('info_')]
    info_values = [np.empty((self.T - 1, 1, self.input_dims['info_' + key]), np.float32) for key in info_keys]

    demo_data_obs = demoData['obs']
    demo_data_acs = demoData['acs']
    demo_data_info = demoData['info']

    for epsd in range(self.num_demo):  # we initialize the whole demo buffer at the start of the training
        obs, acts, goals, achieved_goals = [], [], [], []
        i = 0
        for transition in range(self.T - 1):
            obs.append([demo_data_obs[epsd][transition].get('observation')])
            acts.append([demo_data_acs[epsd][transition]])
            goals.append([demo_data_obs[epsd][transition].get('desired_goal')])
            achieved_goals.append([demo_data_obs[epsd][transition].get('achieved_goal')])
            for idx, key in enumerate(info_keys):
                info_values[idx][transition, i] = demo_data_info[epsd][transition][key]

        obs.append([demo_data_obs[epsd][self.T - 1].get('observation')])
        achieved_goals.append([demo_data_obs[epsd][self.T - 1].get('achieved_goal')])

        episode = dict(o=obs, u=acts, g=goals, ag=achieved_goals)
        for key, value in zip(info_keys, info_values):
            episode['info_{}'.format(key)] = value

        episode = convert_episode_to_batch_major(episode)
        global DEMO_BUFFER
        DEMO_BUFFER.store_episode(episode)  # create the observation dict and append it to the demonstration buffer
        logger.debug("Demo buffer size currently ", DEMO_BUFFER.get_current_size())  # print out the demonstration buffer size

        if update_stats:
            # add transitions to the normalizer so the demo data is normalized as well
            episode['o_2'] = episode['o'][:, 1:, :]
            episode['ag_2'] = episode['ag'][:, 1:, :]
            num_normalizing_transitions = transitions_in_episode_batch(episode)
            transitions = self.sample_transitions(episode, num_normalizing_transitions)

            o, g, ag = transitions['o'], transitions['g'], transitions['ag']
            transitions['o'], transitions['g'] = self._preprocess_og(o, ag, g)
            # No need to preprocess o_2 and g_2 since they are only used for stats

            self.o_stats.update(transitions['o'])
            self.g_stats.update(transitions['g'])

            self.o_stats.recompute_stats()
            self.g_stats.recompute_stats()
        episode.clear()

    logger.info("Demo buffer size: ", DEMO_BUFFER.get_current_size())  # print out the demonstration buffer size
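# For reference, `convert_episode_to_batch_major` used above reorders each episode array from
# time-major to batch-major before storage. A minimal sketch of such a helper (modeled on
# baselines.her.util; treat the exact implementation as an assumption, not the original source):
import numpy as np

def convert_episode_to_batch_major(episode):
    """Swap the leading (time, batch, ...) axes of every array in the episode dict
    to (batch, time, ...) so the replay buffer can store it (sketch)."""
    episode_batch = {}
    for key in episode.keys():
        val = np.array(episode[key]).copy()
        episode_batch[key] = val.swapaxes(0, 1)  # (T, B, ...) -> (B, T, ...)
    return episode_batch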
def display_var_info(vars):
    from baselines import logger
    count_params = 0
    for v in vars:
        name = v.name
        if "/Adam" in name or "beta1_power" in name or "beta2_power" in name:
            continue
        count_params += np.prod(v.shape.as_list())
        if "/b:" in name:
            continue  # Wx+b: biases are not interesting to look at => count params, but do not print
        logger.info("   %s%s%s" % (name, " " * (55 - len(name)), str(v.shape)))
    logger.info("Total model parameters: %0.1f million" % (count_params * 1e-6))
def get_target_updates(vars, target_vars, tau):
    logger.info('setting up target updates ...')
    soft_updates = []
    init_updates = []
    assert len(vars) == len(target_vars)
    for var, target_var in zip(vars, target_vars):
        logger.info('  {} <- {}'.format(target_var.name, var.name))
        init_updates.append(tf.assign(target_var, var))
        soft_updates.append(tf.assign(target_var, (1. - tau) * target_var + tau * var))
    assert len(init_updates) == len(vars)
    assert len(soft_updates) == len(vars)
    return tf.group(*init_updates), tf.group(*soft_updates)
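# Usage sketch for the pair of grouped ops returned above: run the hard copy once after variable
# initialization, then run the Polyak step after each training update. `actor`, `target_actor`
# (each exposing `.vars`), `sess`, and `nb_train_steps` are assumed placeholders, not names
# taken from this codebase.
actor_init_updates, actor_soft_updates = get_target_updates(actor.vars, target_actor.vars, tau=0.01)
sess.run(actor_init_updates)         # hard copy: target <- main, done once
for _ in range(nb_train_steps):
    # ... one gradient step on the main networks ...
    sess.run(actor_soft_updates)     # soft update: target <- (1 - tau) * target + tau * main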
def get_perturbed_actor_updates(actor, perturbed_actor, param_noise_stddev):
    assert len(actor.vars) == len(perturbed_actor.vars)
    assert len(actor.perturbable_vars) == len(perturbed_actor.perturbable_vars)

    updates = []
    for var, perturbed_var in zip(actor.vars, perturbed_actor.vars):
        if var in actor.perturbable_vars:
            logger.info('  {} <- {} + noise'.format(perturbed_var.name, var.name))
            updates.append(tf.assign(perturbed_var,
                                     var + tf.random_normal(tf.shape(var), mean=0., stddev=param_noise_stddev)))
        else:
            logger.info('  {} <- {}'.format(perturbed_var.name, var.name))
            updates.append(tf.assign(perturbed_var, var))
    assert len(updates) == len(actor.vars)
    return tf.group(*updates)
def step_wait(self):
    obs, rews, dones, infos = self.venv.step_wait()

    self.step_id += 1
    if self.recording:
        self.video_recorder.capture_frame()
        self.recorded_frames += 1
        if self.recorded_frames > self.video_length:
            logger.info("Saving video to ", self.video_recorder.path)
            self.close_video_recorder()
    elif self._video_enabled():
        self.start_video_recorder()

    return obs, rews, dones, infos
def setup_param_noise(self, normalized_obs0):
    assert self.param_noise is not None

    # Configure perturbed actor.
    param_noise_actor = copy(self.actor)
    param_noise_actor.name = 'param_noise_actor'
    self.perturbed_actor_tf = param_noise_actor(normalized_obs0)
    logger.info('setting up param noise')
    self.perturb_policy_ops = get_perturbed_actor_updates(self.actor, param_noise_actor, self.param_noise_stddev)

    # Configure a separate copy for stddev adaptation.
    adaptive_param_noise_actor = copy(self.actor)
    adaptive_param_noise_actor.name = 'adaptive_param_noise_actor'
    adaptive_actor_tf = adaptive_param_noise_actor(normalized_obs0)
    self.perturb_adaptive_policy_ops = get_perturbed_actor_updates(self.actor, adaptive_param_noise_actor, self.param_noise_stddev)
    self.adaptive_policy_distance = tf.sqrt(tf.reduce_mean(tf.square(self.actor_tf - adaptive_actor_tf)))
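# The `adaptive_policy_distance` above is what drives the stddev adaptation. A minimal sketch of
# the adaptation rule, modeled on baselines' AdaptiveParamNoiseSpec (default coefficient assumed):
class AdaptiveParamNoiseSpec(object):
    """Grow or shrink the parameter-noise stddev so the induced action-space
    distance stays near a desired level (sketch)."""
    def __init__(self, initial_stddev=0.1, desired_action_stddev=0.2, adaptation_coefficient=1.01):
        self.desired_action_stddev = desired_action_stddev
        self.adaptation_coefficient = adaptation_coefficient
        self.current_stddev = initial_stddev

    def adapt(self, distance):
        if distance > self.desired_action_stddev:
            self.current_stddev /= self.adaptation_coefficient   # too noisy: shrink stddev
        else:
            self.current_stddev *= self.adaptation_coefficient   # too tame: grow stddev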
def setup_critic_optimizer(self):
    logger.info('setting up critic optimizer')
    normalized_critic_target_tf = tf.clip_by_value(normalize(self.critic_target, self.ret_rms),
                                                   self.return_range[0], self.return_range[1])
    self.critic_loss = tf.reduce_mean(tf.square(self.normalized_critic_tf - normalized_critic_target_tf))
    if self.critic_l2_reg > 0.:
        critic_reg_vars = [var for var in self.critic.trainable_vars
                           if 'kernel' in var.name and 'output' not in var.name]
        for var in critic_reg_vars:
            logger.info('  regularizing: {}'.format(var.name))
        logger.info('  applying l2 regularization with {}'.format(self.critic_l2_reg))
        critic_reg = tc.layers.apply_regularization(
            tc.layers.l2_regularizer(self.critic_l2_reg),
            weights_list=critic_reg_vars
        )
        self.critic_loss += critic_reg
    critic_shapes = [var.get_shape().as_list() for var in self.critic.trainable_vars]
    critic_nb_params = sum([reduce(lambda x, y: x * y, shape) for shape in critic_shapes])
    logger.info('  critic shapes: {}'.format(critic_shapes))
    logger.info('  critic params: {}'.format(critic_nb_params))
    self.critic_grads = U.flatgrad(self.critic_loss, self.critic.trainable_vars, clip_norm=self.clip_norm)
    self.critic_optimizer = MpiAdam(var_list=self.critic.trainable_vars,
                                    beta1=0.9, beta2=0.999, epsilon=1e-08)
def train(policy, rollout_worker, evaluator,
          n_epochs, n_test_rollouts, n_cycles, n_batches, policy_save_interval,
          save_policies, **kwargs):
    rank = MPI.COMM_WORLD.Get_rank()

    latest_policy_path = os.path.join(logger.get_dir(), 'policy_latest.pkl')
    best_policy_path = os.path.join(logger.get_dir(), 'policy_best.pkl')
    periodic_policy_path = os.path.join(logger.get_dir(), 'policy_{}.pkl')

    logger.info("Training...")
    best_success_rate = -1
    for epoch in range(n_epochs):
        # train
        rollout_worker.clear_history()
        for _ in range(n_cycles):
            episode = rollout_worker.generate_rollouts()
            policy.store_episode(episode)
            for _ in range(n_batches):
                policy.train()
            policy.update_target_net()

        # test
        evaluator.clear_history()
        for _ in range(n_test_rollouts):
            evaluator.generate_rollouts()

        # record logs
        logger.record_tabular('epoch', epoch)
        for key, val in evaluator.logs('test'):
            logger.record_tabular(key, mpi_average(val))
        for key, val in rollout_worker.logs('train'):
            logger.record_tabular(key, mpi_average(val))
        for key, val in policy.logs():
            logger.record_tabular(key, mpi_average(val))

        if rank == 0:
            logger.dump_tabular()

        # save the policy if it's better than the previous ones
        success_rate = mpi_average(evaluator.current_success_rate())
        if rank == 0 and success_rate >= best_success_rate and save_policies:
            best_success_rate = success_rate
            logger.info('New best success rate: {}. Saving policy to {} ...'.format(best_success_rate, best_policy_path))
            evaluator.save_policy(best_policy_path)
            evaluator.save_policy(latest_policy_path)
        if rank == 0 and policy_save_interval > 0 and epoch % policy_save_interval == 0 and save_policies:
            policy_path = periodic_policy_path.format(epoch)
            logger.info('Saving periodic policy to {} ...'.format(policy_path))
            evaluator.save_policy(policy_path)

        # make sure that different threads have different seeds
        local_uniform = np.random.uniform(size=(1,))
        root_uniform = local_uniform.copy()
        MPI.COMM_WORLD.Bcast(root_uniform, root=0)
        if rank != 0:
            assert local_uniform[0] != root_uniform[0]
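# Every tabular value above is reduced with `mpi_average`. A minimal sketch of that helper
# (assuming `mpi_moments` from baselines.common.mpi_moments, as in the HER training script):
import numpy as np
from baselines.common.mpi_moments import mpi_moments

def mpi_average(value):
    # Average a scalar or a list of scalars across all MPI workers (sketch).
    if not isinstance(value, list):
        value = [value]
    if not any(value):
        value = [0.]
    return mpi_moments(np.array(value))[0]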
def setup_actor_optimizer(self):
    logger.info('setting up actor optimizer')
    self.actor_loss = -tf.reduce_mean(self.critic_with_actor_tf)
    actor_shapes = [var.get_shape().as_list() for var in self.actor.trainable_vars]
    actor_nb_params = sum([reduce(lambda x, y: x * y, shape) for shape in actor_shapes])
    logger.info('  actor shapes: {}'.format(actor_shapes))
    logger.info('  actor params: {}'.format(actor_nb_params))
    self.actor_grads = U.flatgrad(self.actor_loss, self.actor.trainable_vars, clip_norm=self.clip_norm)
    self.actor_optimizer = MpiAdam(var_list=self.actor.trainable_vars,
                                   beta1=0.9, beta2=0.999, epsilon=1e-08)
def _create_network(self, reuse=False): logger.info("Creating a DDPG agent with action space %d x %s..." % (self.dimu, self.max_u)) self.sess = tf.get_default_session() if self.sess is None: self.sess = tf.InteractiveSession() # running averages with tf.variable_scope('o_stats') as vs: if reuse: vs.reuse_variables() self.o_stats = Normalizer(self.dimo, self.norm_eps, self.norm_clip, sess=self.sess) with tf.variable_scope('g_stats') as vs: if reuse: vs.reuse_variables() self.g_stats = Normalizer(self.dimg, self.norm_eps, self.norm_clip, sess=self.sess) # mini-batch sampling. batch = self.staging_tf.get() batch_tf = OrderedDict([(key, batch[i]) for i, key in enumerate(self.stage_shapes.keys())]) batch_tf['r'] = tf.reshape(batch_tf['r'], [-1, 1]) # networks with tf.variable_scope('main') as vs: if reuse: vs.reuse_variables() self.main = self.create_actor_critic(batch_tf, net_type='main', **self.__dict__) vs.reuse_variables() with tf.variable_scope('target') as vs: if reuse: vs.reuse_variables() target_batch_tf = batch_tf.copy() target_batch_tf['o'] = batch_tf['o_2'] target_batch_tf['g'] = batch_tf['g_2'] self.target = self.create_actor_critic( target_batch_tf, net_type='target', **self.__dict__) vs.reuse_variables() assert len(self._vars("main")) == len(self._vars("target")) # loss functions target_Q_pi_tf = self.target.Q_pi_tf clip_range = (-self.clip_return, 0. if self.clip_pos_returns else np.inf) target_tf = tf.clip_by_value(batch_tf['r'] + self.gamma * target_Q_pi_tf, *clip_range) self.Q_loss_tf = tf.reduce_mean(tf.square(tf.stop_gradient(target_tf) - self.main.Q_tf)) self.pi_loss_tf = -tf.reduce_mean(self.main.Q_pi_tf) self.pi_loss_tf += self.action_l2 * tf.reduce_mean(tf.square(self.main.pi_tf / self.max_u)) Q_grads_tf = tf.gradients(self.Q_loss_tf, self._vars('main/Q')) pi_grads_tf = tf.gradients(self.pi_loss_tf, self._vars('main/pi')) assert len(self._vars('main/Q')) == len(Q_grads_tf) assert len(self._vars('main/pi')) == len(pi_grads_tf) self.Q_grads_vars_tf = zip(Q_grads_tf, self._vars('main/Q')) self.pi_grads_vars_tf = zip(pi_grads_tf, self._vars('main/pi')) self.Q_grad_tf = flatten_grads(grads=Q_grads_tf, var_list=self._vars('main/Q')) self.pi_grad_tf = flatten_grads(grads=pi_grads_tf, var_list=self._vars('main/pi')) # optimizers self.Q_adam = MpiAdam(self._vars('main/Q'), scale_grad_by_procs=False) self.pi_adam = MpiAdam(self._vars('main/pi'), scale_grad_by_procs=False) # polyak averaging self.main_vars = self._vars('main/Q') + self._vars('main/pi') self.target_vars = self._vars('target/Q') + self._vars('target/pi') self.stats_vars = self._global_vars('o_stats') + self._global_vars('g_stats') self.init_target_net_op = list( map(lambda v: v[0].assign(v[1]), zip(self.target_vars, self.main_vars))) self.update_target_net_op = list( map(lambda v: v[0].assign(self.polyak * v[0] + (1. - self.polyak) * v[1]), zip(self.target_vars, self.main_vars))) # initialize all variables tf.variables_initializer(self._global_vars('')).run() self._sync_optimizers() self._init_target_net()
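# The per-variable gradients above are flattened into a single vector before being handed to
# MpiAdam. A minimal sketch of a `flatten_grads` helper (modeled on baselines.her.util; the exact
# implementation is an assumption):
import tensorflow as tf
from baselines.common import tf_util as U

def flatten_grads(var_list, grads):
    """Concatenate per-variable gradients into one flat vector (sketch)."""
    return tf.concat([tf.reshape(grad, [U.numel(v)])
                      for (v, grad) in zip(var_list, grads)], axis=0)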
def run_baselines(env, seed, log_dir):
    """
    Create baselines model and training.

    Replace the ddpg and its training with the algorithm you want to run.

    :param env: Environment of the task.
    :param seed: Random seed for the trial.
    :param log_dir: Log dir path.
    :return: Path to the progress.csv file written in log_dir.
    """
    rank = MPI.COMM_WORLD.Get_rank()
    seed = seed + 1000000 * rank
    set_global_seeds(seed)
    env.seed(seed)

    # Set up logger for baselines
    configure(dir=log_dir)
    baselines_logger.info('rank {}: seed={}, logdir={}'.format(
        rank, seed, baselines_logger.get_dir()))

    # Set up params for baselines ddpg
    nb_actions = env.action_space.shape[-1]
    layer_norm = False

    action_noise = OrnsteinUhlenbeckActionNoise(
        mu=np.zeros(nb_actions),
        sigma=float(params["sigma"]) * np.ones(nb_actions))
    memory = Memory(
        limit=params["replay_buffer_size"],
        action_shape=env.action_space.shape,
        observation_shape=env.observation_space.shape)
    critic = Critic(layer_norm=layer_norm)
    actor = Actor(nb_actions, layer_norm=layer_norm)

    training.train(
        env=env,
        eval_env=None,
        param_noise=None,
        action_noise=action_noise,
        actor=actor,
        critic=critic,
        memory=memory,
        nb_epochs=params["n_epochs"],
        nb_epoch_cycles=params["n_epoch_cycles"],
        render_eval=False,
        reward_scale=1.,
        render=False,
        normalize_returns=False,
        normalize_observations=False,
        critic_l2_reg=0,
        actor_lr=params["policy_lr"],
        critic_lr=params["qf_lr"],
        popart=False,
        gamma=params["discount"],
        clip_norm=None,
        nb_train_steps=params["n_train_steps"],
        nb_rollout_steps=params["n_rollout_steps"],
        nb_eval_steps=100,
        batch_size=64)

    return osp.join(log_dir, "progress.csv")
def learn(env, seed=None, total_timesteps=None, nb_epochs=None, # with default settings, perform 1M steps total nb_epoch_cycles=20, nb_rollout_steps=100, LOG_STD_MIN=0.1, LOG_STD_MAX=0.9, reward_scale=1.0, render=False, render_eval=False, normalize_returns=False, normalize_observations=True, actor_lr=1e-4, critic_lr=1e-3, gamma=0.99, nb_train_steps=50, # per epoch cycle and MPI worker, nb_eval_steps=100, nb_save_epochs=None, batch_size=64, # per MPI worker tau=0.01, action_range=(-250.0, 250.0), observation_range=(-5.0, 5.0), eval_env=None, load_path=None, save_dir=None, param_noise_adaption_interval=50, **network_kwargs): set_global_seeds(seed) if total_timesteps is not None: assert nb_epochs is None nb_epochs = int(total_timesteps) // (nb_epoch_cycles * nb_rollout_steps) else: nb_epochs = 500 if MPI is not None: rank = MPI.COMM_WORLD.Get_rank() else: rank = 0 memory = Memory(limit=int(1e6)) network_spec = [ { 'layer_type': 'dense', 'units': int (256), 'activation': 'relu', 'nodes_in': ['main'], 'nodes_out': ['main'] }, { 'layer_type': 'dense', 'units': int (128), 'activation': 'relu', 'nodes_in': ['main'], 'nodes_out': ['main'] } ] vnetwork_spec = [ { 'layer_type': 'concat', 'nodes_in': ['action_movement', 'observation_self'], 'nodes_out': ['main'] }, { 'layer_type': 'dense', 'units': int (256), 'activation': 'relu', 'nodes_in': ['main'], 'nodes_out': ['main'] }, { 'layer_type': 'dense', 'units': int (128), 'activation': 'relu', 'nodes_in': ['main'], 'nodes_out': ['main'] } ] vnetwork_spec2 = [ { 'layer_type': 'concat', 'nodes_in': ['observation_self'], 'nodes_out': ['main'] }, { 'layer_type': 'dense', 'units': int (256), 'activation': 'relu', 'nodes_in': ['main'], 'nodes_out': ['main'] }, { 'layer_type': 'dense', 'units': int (128), 'activation': 'relu', 'nodes_in': ['main'], 'nodes_out': ['main'] } ] ob_space = env.observation_space ac_space = env.action_space network = SacPolicy(scope="sac", ob_space=ob_space, ac_space=ac_space, network_spec=network_spec, v_network_spec=vnetwork_spec, v_network_spec2=vnetwork_spec2, stochastic=False, reuse=False, build_act=True, trainable_vars=None, not_trainable_vars=None, gaussian_fixed_var=False, weight_decay=0.0, ema_beta=0.99999, normalize_observations=normalize_observations, normalize_returns=normalize_returns, observation_range=observation_range, LOG_STD_MIN=LOG_STD_MIN, LOG_STD_MAX=LOG_STD_MAX) target_network = SacPolicy(scope="sac_target", ob_space=ob_space, ac_space=ac_space, network_spec=network_spec, v_network_spec=vnetwork_spec, v_network_spec2=vnetwork_spec2, stochastic=False, reuse=False, build_act=True, trainable_vars=None, not_trainable_vars=None, gaussian_fixed_var=False, weight_decay=0.0, ema_beta=0.99999, normalize_observations=normalize_observations, normalize_returns=normalize_returns, observation_range=observation_range, LOG_STD_MIN=LOG_STD_MIN, LOG_STD_MAX=LOG_STD_MAX) # obs0 obs0 = dict() for k, v in ob_space.spaces.items(): obs0[k] = network.phs[k] # obs1 obs1 = dict() for k, v in ob_space.spaces.items(): obs1[k] = target_network.phs[k] # actions acts = dict() for k, v in ac_space.spaces.items(): acts[k] = network.phs[k] self.dones = tf.placeholder(tf.float32, shape=(None, 1), name='dones') self.rewards = tf.placeholder(tf.float32, shape=(None, None, 1), name='rewards') # min double-q min_q_pi = tf.minimum(network.q1_pi, network.q2_pi) eval_episode_rewards_history = deque(maxlen=100) episode_rewards_history = deque(maxlen=100) sess = U.get_session() saver = functools.partial(save_variables, sess=sess) loader = 
functools.partial(load_variables, sess=sess) if load_path != None: loader(load_path) # Prepare everything. agent.initialize(sess) sess.graph.finalize() agent.reset() obs = env.reset() if eval_env is not None: eval_obs = eval_env.reset() nenvs = env.num_envs n_agents = obs['observation_self'].shape[0] episode_reward = np.zeros((nenvs, n_agents), dtype = np.float32) #vector episode_step = np.zeros(nenvs, dtype = int) # vector episodes = 0 #scalar t = 0 # scalar epoch = 0 start_time = time.time() epoch_episode_rewards = [] epoch_episode_steps = [] epoch_actions = [] epoch_qs = [] epoch_episodes = 0 for epoch in range(nb_epochs): for cycle in range(nb_epoch_cycles): # Perform rollouts. if nenvs > 1: # if simulating multiple envs in parallel, impossible to reset agent at the end of the episode in each # of the environments, so resetting here instead agent.reset() for t_rollout in range(nb_rollout_steps): # Predict next action. action, q, _, _ = agent.step(obs, apply_noise=True, compute_Q=True) # Execute next action. if rank == 0 and render: env.render() # max_action is of dimension A, whereas action is dimension (nenvs, A) - the multiplication gets broadcasted to the batch for k, v in action.items(): action[k] *= max_action nenvs_actions = [] for i in range(nenvs): nenv_action = {'action_movement' : action['action_movement'][i*n_agents:(i + 1)*n_agents]} nenvs_actions.append(nenv_action) new_obs, r, done, info = env.step(nenvs_actions) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) # note these outputs are batched from vecenv t += 1 if rank == 0 and render: env.render() episode_reward += r episode_step += 1 # Book-keeping. epoch_actions.append(action) epoch_qs.append(q) agent.store_transition(obs, action, r, new_obs, done) #the batched data will be unrolled in memory.py's append. obs = new_obs for d in range(len(done)): if done[d]: # Episode done. epoch_episode_rewards.append(episode_reward[d]) episode_rewards_history.append(episode_reward[d]) epoch_episode_steps.append(episode_step[d]) episode_reward[d] = 0. episode_step[d] = 0 epoch_episodes += 1 episodes += 1 if nenvs == 1: agent.reset() # Train. epoch_actor_losses = [] epoch_critic_losses = [] epoch_adaptive_distances = [] for t_train in range(nb_train_steps): # Adapt param noise, if necessary. if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0: distance = agent.adapt_param_noise() epoch_adaptive_distances.append(distance) cl, al = agent.train() epoch_critic_losses.append(cl) epoch_actor_losses.append(al) agent.update_target_net() # Evaluate. eval_episode_rewards = [] eval_qs = [] if eval_env is not None: nenvs_eval = eval_obs.shape[0] eval_episode_reward = np.zeros(nenvs_eval, dtype = np.float32) for t_rollout in range(nb_eval_steps): eval_action, eval_q, _, _ = agent.step(eval_obs, apply_noise=False, compute_Q=True) eval_obs, eval_r, eval_done, eval_info = eval_env.step(max_action * eval_action) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) if render_eval: eval_env.render() eval_episode_reward += eval_r eval_qs.append(eval_q) for d in range(len(eval_done)): if eval_done[d]: eval_episode_rewards.append(eval_episode_reward[d]) eval_episode_rewards_history.append(eval_episode_reward[d]) eval_episode_reward[d] = 0.0 if MPI is not None: mpi_size = MPI.COMM_WORLD.Get_size() else: mpi_size = 1 # Log stats. 
# XXX shouldn't call np.mean on variable length lists duration = time.time() - start_time stats = agent.get_stats() combined_stats = stats.copy() combined_stats['rollout/return'] = np.mean(epoch_episode_rewards) combined_stats['rollout/return_std'] = np.std(epoch_episode_rewards) combined_stats['rollout/return_history'] = np.mean(episode_rewards_history) combined_stats['rollout/return_history_std'] = np.std(episode_rewards_history) combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps) combined_stats['rollout/Q_mean'] = np.mean(epoch_qs) combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses) combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses) combined_stats['train/param_noise_distance'] = np.mean(epoch_adaptive_distances) combined_stats['total/duration'] = duration combined_stats['total/steps_per_second'] = float(t) / float(duration) combined_stats['total/episodes'] = episodes combined_stats['rollout/episodes'] = epoch_episodes # Evaluation statistics. if eval_env is not None: combined_stats['eval/return'] = eval_episode_rewards combined_stats['eval/return_history'] = np.mean(eval_episode_rewards_history) combined_stats['eval/Q'] = eval_qs combined_stats['eval/episodes'] = len(eval_episode_rewards) def as_scalar(x): if isinstance(x, np.ndarray): assert x.size == 1 return x[0] elif np.isscalar(x): return x else: raise ValueError('expected scalar, got %s'%x) combined_stats_sums = np.array([ np.array(x).flatten()[0] for x in combined_stats.values()]) if MPI is not None: combined_stats_sums = MPI.COMM_WORLD.allreduce(combined_stats_sums) combined_stats = {k : v / mpi_size for (k,v) in zip(combined_stats.keys(), combined_stats_sums)} # Total statistics. combined_stats['total/epochs'] = epoch + 1 combined_stats['total/steps'] = t for key in sorted(combined_stats.keys()): logger.record_tabular(key, combined_stats[key]) if rank == 0: logger.dump_tabular() logger.info('') logdir = logger.get_dir() if rank == 0 and logdir: if hasattr(env, 'get_state'): with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f: pickle.dump(env.get_state(), f) if eval_env and hasattr(eval_env, 'get_state'): with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f: pickle.dump(eval_env.get_state(), f) if nb_save_epochs != None and (epoch + 1) % nb_save_epochs == 0: if save_dir == None: checkdir = osp.join(logger.get_dir(), 'checkpoints') else: checkdir = osp.join(save_dir, 'checkpoints') os.makedirs(checkdir, exist_ok=True) savepath = osp.join(checkdir, '%.5i'%epoch) print('Saving to', savepath) saver(savepath) return agent
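# The "XXX shouldn't call np.mean on variable length lists" note above can be addressed with a
# small guard. A hypothetical helper (name and behavior are assumptions, not part of this code):
import numpy as np

def safe_mean(xs):
    # Return NaN instead of emitting a warning when the list is empty (sketch).
    return np.nan if len(xs) == 0 else float(np.mean(xs))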
def run(env_id, seed, noise_type, num_cpu, layer_norm, logdir, gym_monitor, evaluation, bind_to_core, **kwargs): kwargs['logdir'] = logdir whoami = mpi_fork(num_cpu, bind_to_core=bind_to_core) if whoami == 'parent': sys.exit(0) # Configure things. rank = MPI.COMM_WORLD.Get_rank() if rank != 0: # Write to temp directory for all non-master workers. actual_dir = None Logger.CURRENT.close() Logger.CURRENT = Logger(dir=mkdtemp(), output_formats=[]) logger.set_level(logger.DISABLED) # Create envs. if rank == 0: env = gym.make(env_id) if gym_monitor and logdir: env = gym.wrappers.Monitor(env, os.path.join(logdir, 'gym_train'), force=True) env = SimpleMonitor(env) if evaluation: eval_env = gym.make(env_id) if gym_monitor and logdir: eval_env = gym.wrappers.Monitor(eval_env, os.path.join(logdir, 'gym_eval'), force=True) eval_env = SimpleMonitor(eval_env) else: eval_env = None else: env = gym.make(env_id) if evaluation: eval_env = gym.make(env_id) else: eval_env = None # Parse noise_type action_noise = None param_noise = None nb_actions = env.action_space.shape[-1] for current_noise_type in noise_type.split(','): current_noise_type = current_noise_type.strip() if current_noise_type == 'none': pass elif 'adaptive-param' in current_noise_type: _, stddev = current_noise_type.split('_') param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev), desired_action_stddev=float(stddev)) elif 'normal' in current_noise_type: _, stddev = current_noise_type.split('_') action_noise = NormalActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions)) elif 'ou' in current_noise_type: _, stddev = current_noise_type.split('_') action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions)) else: raise RuntimeError('unknown noise type "{}"'.format(current_noise_type)) # Configure components. memory = Memory(limit=int(1e6), action_shape=env.action_space.shape, observation_shape=env.observation_space.shape) critic = Critic(layer_norm=layer_norm) actor = Actor(nb_actions, layer_norm=layer_norm) # Seed everything to make things reproducible. seed = seed + 1000000 * rank logger.info('rank {}: seed={}, logdir={}'.format(rank, seed, logger.get_dir())) tf.reset_default_graph() set_global_seeds(seed) env.seed(seed) if eval_env is not None: eval_env.seed(seed) # Disable logging for rank != 0 to avoid noise. if rank == 0: start_time = time.time() training.train(env=env, eval_env=eval_env, param_noise=param_noise, action_noise=action_noise, actor=actor, critic=critic, memory=memory, **kwargs) env.close() if eval_env is not None: eval_env.close() Logger.CURRENT.close() if rank == 0: logger.info('total runtime: {}s'.format(time.time() - start_time))
def train(policy, rollout_worker, evaluator, n_epochs, n_test_rollouts, n_cycles, n_batches, policy_save_interval, save_policies, num_cpu, dump_buffer, w_potential, w_linear, w_rotational, rank_method, clip_energy, **kwargs): rank = MPI.COMM_WORLD.Get_rank() latest_policy_path = os.path.join(logger.get_dir(), 'policy_latest.pkl') best_policy_path = os.path.join(logger.get_dir(), 'policy_best.pkl') periodic_policy_path = os.path.join(logger.get_dir(), 'policy_{}.pkl') logger.info("Training...") best_success_rate = -1 t = 1 for epoch in range(n_epochs): # train rollout_worker.clear_history() for cycle in range(n_cycles): episode = rollout_worker.generate_rollouts() policy.store_episode(episode, dump_buffer, w_potential, w_linear, w_rotational, rank_method, clip_energy) for batch in range(n_batches): t = ((epoch * n_cycles * n_batches) + (cycle * n_batches) + batch) * num_cpu policy.train(t, dump_buffer) policy.update_target_net() # test evaluator.clear_history() for _ in range(n_test_rollouts): evaluator.generate_rollouts() # record logs logger.record_tabular('epoch', epoch) for key, val in evaluator.logs('test'): logger.record_tabular(key, mpi_average(val)) for key, val in rollout_worker.logs('train'): logger.record_tabular(key, mpi_average(val)) for key, val in policy.logs(): logger.record_tabular(key, mpi_average(val)) if rank == 0: logger.dump_tabular() if dump_buffer: policy.dump_buffer(epoch) # save the policy if it's better than the previous ones success_rate = mpi_average(evaluator.current_success_rate()) if rank == 0 and success_rate >= best_success_rate and save_policies: best_success_rate = success_rate logger.info( 'New best success rate: {}. Saving policy to {} ...'.format( best_success_rate, best_policy_path)) evaluator.save_policy(best_policy_path) evaluator.save_policy(latest_policy_path) if rank == 0 and policy_save_interval > 0 and epoch % policy_save_interval == 0 and save_policies: policy_path = periodic_policy_path.format(epoch) logger.info('Saving periodic policy to {} ...'.format(policy_path)) evaluator.save_policy(policy_path) # make sure that different threads have different seeds local_uniform = np.random.uniform(size=(1, )) root_uniform = local_uniform.copy() MPI.COMM_WORLD.Bcast(root_uniform, root=0) if rank != 0: assert local_uniform[0] != root_uniform[0]
def train(env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, render, param_noise, actor, critic, normalize_returns, normalize_observations, critic_l2_reg, actor_lr, critic_lr, action_noise, popart, gamma, clip_norm, nb_train_steps, nb_rollout_steps, nb_eval_steps, batch_size, memory, tau=0.01, eval_env=None, param_noise_adaption_interval=50): rank = MPI.COMM_WORLD.Get_rank() assert (np.abs(env.action_space.low) == env.action_space.high).all() # we assume symmetric actions. max_action = env.action_space.high logger.info('scaling actions by {} before executing in env'.format(max_action)) agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape, gamma=gamma, tau=tau, normalize_returns=normalize_returns, normalize_observations=normalize_observations, batch_size=batch_size, action_noise=action_noise, param_noise=param_noise, critic_l2_reg=critic_l2_reg, actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart, clip_norm=clip_norm, reward_scale=reward_scale) logger.info('Using agent with the following configuration:') logger.info(str(agent.__dict__.items())) # Set up logging stuff only for a single worker. if rank == 0: saver = tf.train.Saver() else: saver = None step = 0 episode = 0 eval_episode_rewards_history = deque(maxlen=100) episode_rewards_history = deque(maxlen=100) with U.single_threaded_session() as sess: # Prepare everything. agent.initialize(sess) sess.graph.finalize() agent.reset() obs = env.reset() if eval_env is not None: eval_obs = eval_env.reset() done = False episode_reward = 0. episode_step = 0 episodes = 0 t = 0 epoch = 0 start_time = time.time() epoch_episode_rewards = [] epoch_episode_steps = [] epoch_episode_eval_rewards = [] epoch_episode_eval_steps = [] epoch_start_time = time.time() epoch_actions = [] epoch_qs = [] epoch_episodes = 0 for epoch in range(nb_epochs): for cycle in range(nb_epoch_cycles): # Perform rollouts. for t_rollout in range(nb_rollout_steps): # Predict next action. action, q = agent.pi(obs, apply_noise=True, compute_Q=True) assert action.shape == env.action_space.shape # Execute next action. if rank == 0 and render: env.render() assert max_action.shape == action.shape new_obs, r, done, info = env.step(max_action * action) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) t += 1 if rank == 0 and render: env.render() episode_reward += r episode_step += 1 # Book-keeping. epoch_actions.append(action) epoch_qs.append(q) agent.store_transition(obs, action, r, new_obs, done) obs = new_obs if done: # Episode done. epoch_episode_rewards.append(episode_reward) episode_rewards_history.append(episode_reward) epoch_episode_steps.append(episode_step) episode_reward = 0. episode_step = 0 epoch_episodes += 1 episodes += 1 agent.reset() obs = env.reset() # Train. epoch_actor_losses = [] epoch_critic_losses = [] epoch_adaptive_distances = [] for t_train in range(nb_train_steps): # Adapt param noise, if necessary. if memory.nb_entries >= batch_size and t % param_noise_adaption_interval == 0: distance = agent.adapt_param_noise() epoch_adaptive_distances.append(distance) cl, al = agent.train() epoch_critic_losses.append(cl) epoch_actor_losses.append(al) agent.update_target_net() # Evaluate. eval_episode_rewards = [] eval_qs = [] if eval_env is not None: eval_episode_reward = 0. 
for t_rollout in range(nb_eval_steps): eval_action, eval_q = agent.pi(eval_obs, apply_noise=False, compute_Q=True) eval_obs, eval_r, eval_done, eval_info = eval_env.step(max_action * eval_action) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) if render_eval: eval_env.render() eval_episode_reward += eval_r eval_qs.append(eval_q) if eval_done: eval_obs = eval_env.reset() eval_episode_rewards.append(eval_episode_reward) eval_episode_rewards_history.append(eval_episode_reward) eval_episode_reward = 0. mpi_size = MPI.COMM_WORLD.Get_size() # Log stats. # XXX shouldn't call np.mean on variable length lists duration = time.time() - start_time stats = agent.get_stats() combined_stats = stats.copy() combined_stats['rollout/return'] = np.mean(epoch_episode_rewards) combined_stats['rollout/return_history'] = np.mean(episode_rewards_history) combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps) combined_stats['rollout/actions_mean'] = np.mean(epoch_actions) combined_stats['rollout/Q_mean'] = np.mean(epoch_qs) combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses) combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses) combined_stats['train/param_noise_distance'] = np.mean(epoch_adaptive_distances) combined_stats['total/duration'] = duration combined_stats['total/steps_per_second'] = float(t) / float(duration) combined_stats['total/episodes'] = episodes combined_stats['rollout/episodes'] = epoch_episodes combined_stats['rollout/actions_std'] = np.std(epoch_actions) # Evaluation statistics. if eval_env is not None: combined_stats['eval/return'] = eval_episode_rewards combined_stats['eval/return_history'] = np.mean(eval_episode_rewards_history) combined_stats['eval/Q'] = eval_qs combined_stats['eval/episodes'] = len(eval_episode_rewards) def as_scalar(x): if isinstance(x, np.ndarray): assert x.size == 1 return x[0] elif np.isscalar(x): return x else: raise ValueError('expected scalar, got %s'%x) combined_stats_sums = MPI.COMM_WORLD.allreduce(np.array([as_scalar(x) for x in combined_stats.values()])) combined_stats = {k : v / mpi_size for (k,v) in zip(combined_stats.keys(), combined_stats_sums)} # Total statistics. combined_stats['total/epochs'] = epoch + 1 combined_stats['total/steps'] = t for key in sorted(combined_stats.keys()): logger.record_tabular(key, combined_stats[key]) logger.dump_tabular() logger.info('') logdir = logger.get_dir() if rank == 0 and logdir: if hasattr(env, 'get_state'): with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f: pickle.dump(env.get_state(), f) if eval_env and hasattr(eval_env, 'get_state'): with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f: pickle.dump(eval_env.get_state(), f)
def define_dynamics_prediction_rew(self, convfeat, rep_size, enlargement): # Dynamics loss with random features. # Random target network. for ph in self.ph_ob.values(): if len(ph.shape.as_list()) == 5: # B,T,H,W,C logger.info( f"CnnTarget: using '{ph.name}' shape {ph.shape} as image input" ) xr = ph[:, 1:] xr = tf.cast(xr, tf.float32) xr = tf.reshape(xr, (-1, *ph.shape.as_list()[-3:]))[:, :, :, -1:] xr = tf.clip_by_value((xr - self.ph_mean) / self.ph_std, -5.0, 5.0) xr = tf.nn.leaky_relu( conv( xr, "c1r", nf=convfeat * 1, rf=8, stride=4, init_scale=np.sqrt(2), )) xr = tf.nn.leaky_relu( conv( xr, "c2r", nf=convfeat * 2 * 1, rf=4, stride=2, init_scale=np.sqrt(2), )) xr = tf.nn.leaky_relu( conv( xr, "c3r", nf=convfeat * 2 * 1, rf=3, stride=1, init_scale=np.sqrt(2), )) rgbr = [to2d(xr)] X_r = fc(rgbr[0], "fc1r", nh=rep_size, init_scale=np.sqrt(2)) # Predictor network. ac_one_hot = tf.one_hot(self.ph_ac, self.ac_space.n, axis=2) assert ac_one_hot.get_shape().ndims == 3 assert ac_one_hot.get_shape().as_list() == [ None, None, self.ac_space.n, ], ac_one_hot.get_shape().as_list() ac_one_hot = tf.reshape(ac_one_hot, (-1, self.ac_space.n)) def cond(x): return tf.concat([x, ac_one_hot], 1) for ph in self.ph_ob.values(): if len(ph.shape.as_list()) == 5: # B,T,H,W,C logger.info( f"CnnTarget: using '{ph.name}' shape {ph.shape} as image input" ) xrp = ph[:, :-1] xrp = tf.cast(xrp, tf.float32) xrp = tf.reshape(xrp, (-1, *ph.shape.as_list()[-3:])) # ph_mean, ph_std are 84x84x1, so we subtract the average of the last channel from all channels. Is this ok? xrp = tf.clip_by_value((xrp - self.ph_mean) / self.ph_std, -5.0, 5.0) xrp = tf.nn.leaky_relu( conv( xrp, "c1rp_pred", nf=convfeat, rf=8, stride=4, init_scale=np.sqrt(2), )) xrp = tf.nn.leaky_relu( conv( xrp, "c2rp_pred", nf=convfeat * 2, rf=4, stride=2, init_scale=np.sqrt(2), )) xrp = tf.nn.leaky_relu( conv( xrp, "c3rp_pred", nf=convfeat * 2, rf=3, stride=1, init_scale=np.sqrt(2), )) rgbrp = to2d(xrp) X_r_hat = tf.nn.relu( fc( cond(rgbrp), "fc1r_hat1_pred", nh=256 * enlargement, init_scale=np.sqrt(2), )) X_r_hat = tf.nn.relu( fc( cond(X_r_hat), "fc1r_hat2_pred", nh=256 * enlargement, init_scale=np.sqrt(2), )) X_r_hat = fc(cond(X_r_hat), "fc1r_hat3_pred", nh=rep_size, init_scale=np.sqrt(2)) self.feat_var = tf.reduce_mean(tf.nn.moments(X_r, axes=[0])[1]) self.max_feat = tf.reduce_max(tf.abs(X_r)) self.int_rew = tf.reduce_mean( tf.square(tf.stop_gradient(X_r) - X_r_hat), axis=-1, keep_dims=True) self.int_rew = tf.reshape(self.int_rew, (self.sy_nenvs, self.sy_nsteps - 1)) noisy_targets = tf.stop_gradient(X_r) self.aux_loss = tf.reduce_mean(tf.square(noisy_targets - X_r_hat), -1) mask = tf.random_uniform(shape=tf.shape(self.aux_loss), minval=0.0, maxval=1.0, dtype=tf.float32) mask = tf.cast(mask < self.proportion_of_exp_used_for_predictor_update, tf.float32) self.aux_loss = tf.reduce_sum(mask * self.aux_loss) / tf.maximum( tf.reduce_sum(mask), 1.0)
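# The convolutional features above are flattened with `to2d` before the fully connected heads.
# A minimal sketch of that helper for TF1 tensors (exact original implementation assumed):
import tensorflow as tf

def to2d(x):
    """Flatten all dimensions of a TF1 tensor except the leading batch dimension (sketch)."""
    size = 1
    for dim in x.get_shape()[1:]:
        size *= int(dim)
    return tf.reshape(x, (-1, size))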
def apply_policy( ph_ob, reuse, scope, hidsize, memsize, extrahid, sy_nenvs, sy_nsteps, pdparamsize, additional_inputs=None, ): meta_rl = False data_format = "NHWC" ph = ph_ob assert len(ph.shape.as_list()) == 5 # B,T,H,W,C logger.info( f"CnnPolicy: using '{ph.name}' shape {ph.shape} as image input") X = tf.cast(ph, tf.float32) / 255.0 X = tf.reshape(X, (-1, *ph.shape.as_list()[-3:])) activ = tf.nn.relu yes_gpu = any(get_available_gpus()) with tf.variable_scope( scope, reuse=reuse), tf.device("/gpu:0" if yes_gpu else "/cpu:0"): X = activ( conv( X, "c1", nf=32, rf=8, stride=4, init_scale=np.sqrt(2), data_format=data_format, )) X = activ( conv( X, "c2", nf=64, rf=4, stride=2, init_scale=np.sqrt(2), data_format=data_format, )) X = activ( conv( X, "c3", nf=64, rf=4, stride=1, init_scale=np.sqrt(2), data_format=data_format, )) X = to2d(X) mix_other_observations = [X] if ('prev_acs' in additional_inputs) and ('prev_rew' in additional_inputs): # Cast numpy arrays to tf tensors prev_acs = tf.cast(additional_inputs['prev_acs'], tf.float32) prev_rew = tf.cast(additional_inputs['prev_rew'], tf.float32) # Flatten out time dimension prev_acs = tf.reshape(prev_acs, (-1, *prev_acs.shape.as_list()[2:])) prev_rew = tf.reshape(prev_rew, (-1, *prev_rew.shape.as_list()[2:])) # Add to 2D features going to FC layers mix_other_observations.extend([prev_acs, prev_rew]) X = tf.concat(mix_other_observations, axis=1) X = activ(fc(X, "fc1", nh=hidsize, init_scale=np.sqrt(2))) additional_size = 448 X = activ( fc(X, "fc_additional", nh=additional_size, init_scale=np.sqrt(2))) snext = tf.zeros((sy_nenvs, memsize)) mix_timeout = [X] Xtout = tf.concat(mix_timeout, axis=1) if extrahid: Xtout = X + activ( fc(Xtout, "fc2val", nh=additional_size, init_scale=0.1)) X = X + activ( fc(X, "fc2act", nh=additional_size, init_scale=0.1)) pdparam = fc(X, "pd", nh=pdparamsize, init_scale=0.01) vpred_int = fc(Xtout, "vf_int", nh=1, init_scale=0.01) vpred_ext = fc(Xtout, "vf_ext", nh=1, init_scale=0.01) pdparam = tf.reshape(pdparam, (sy_nenvs, sy_nsteps, pdparamsize)) vpred_int = tf.reshape(vpred_int, (sy_nenvs, sy_nsteps)) vpred_ext = tf.reshape(vpred_ext, (sy_nenvs, sy_nsteps)) return pdparam, vpred_int, vpred_ext, snext
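# `get_available_gpus` above decides whether the policy graph is placed on /gpu:0. A minimal
# sketch using TF1's device_lib (treated as an assumption about the original helper):
from tensorflow.python.client import device_lib

def get_available_gpus():
    # List the names of GPU devices visible to this TF1 process (sketch).
    return [d.name for d in device_lib.list_local_devices() if d.device_type == 'GPU']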
def learn(*, network, env, total_timesteps, seed=None, nsteps=2048, ent_coef=0.0, lr=3e-4, vf_coef=0.5, max_grad_norm=0.5, gamma=0.99, lam=0.95, log_interval=10, nminibatches=4, noptepochs=4, cliprange=0.2, save_interval=0, load_path=None, callback=None, **network_kwargs): ''' Learn policy using PPO algorithm (https://arxiv.org/abs/1707.06347) Parameters: ---------- network: policy network architecture. Either string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for full list) specifying the standard network architecture, or a function that takes tensorflow tensor as input and returns tuple (output_tensor, extra_feed) where output tensor is the last network layer output, extra_feed is None for feed-forward neural nets, and extra_feed is a dictionary describing how to feed state into the network for recurrent neural nets. See baselines.common/policies.py/lstm for more details on using recurrent nets in policies env: baselines.common.vec_env.VecEnv environment. Needs to be vectorized for parallel environment simulation. The environments produced by gym.make can be wrapped using baselines.common.vec_env.DummyVecEnv class. nsteps: int number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where nenv is number of environment copies simulated in parallel) total_timesteps: int number of timesteps (i.e. number of actions taken in the environment) ent_coef: float policy entropy coefficient in the optimization objective lr: float or function learning rate, constant or a schedule function [0,1] -> R+ where 1 is beginning of the training and 0 is the end of the training. vf_coef: float value function loss coefficient in the optimization objective max_grad_norm: float or None gradient norm clipping coefficient gamma: float discounting factor lam: float advantage estimation discounting factor (lambda in the paper) log_interval: int number of timesteps between logging events nminibatches: int number of training minibatches per update noptepochs: int number of training epochs per update cliprange: float or function clipping range, constant or schedule function [0,1] -> R+ where 1 is beginning of the training and 0 is the end of the training save_interval: int number of timesteps between saving events load_path: str path to load the model from **network_kwargs: keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network For instance, 'mlp' network architecture has arguments num_hidden and num_layers. 
''' set_global_seeds(seed) if isinstance(lr, float): lr = constfn(lr) else: assert callable(lr) if isinstance(cliprange, float): cliprange = constfn(cliprange) else: assert callable(cliprange) total_timesteps = int(total_timesteps) policy = build_policy(env, network, **network_kwargs) nenvs = env.num_envs ob_space = env.observation_space ac_space = env.action_space nbatch = nenvs * nsteps nbatch_train = nbatch // nminibatches make_model = lambda: Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=nenvs, nbatch_train=nbatch_train, nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef, max_grad_norm=max_grad_norm) # if save_interval and logger.get_dir(): # import cloudpickle # with open(osp.join(logger.get_dir(), 'make_model.pkl'), 'wb') as fh: # fh.write(cloudpickle.dumps(make_model)) model = make_model() if load_path is not None: model.load(load_path) runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam) epinfobuf = deque(maxlen=100) tfirststart = time.time() nupdates = total_timesteps // nbatch for update in range(1, nupdates + 1): if callback is not None: if callback(locals(), globals()): break assert nbatch % nminibatches == 0 nbatch_train = nbatch // nminibatches tstart = time.time() frac = 1.0 - (update - 1.0) / nupdates cliprangenow = cliprange(frac) obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run( ) # pylint: disable=E0632 epinfobuf.extend(epinfos) mblrvals = [] mblossvals = [] if states is None: # nonrecurrent version inds = np.arange(nbatch) for iteration in range(noptepochs): np.random.shuffle(inds) lrnow = lr(frac, iteration) logger.info('lrnow={} (frac={},iteration={})'.format( lrnow, frac, iteration)) mblrvals.append(lrnow) for start in range(0, nbatch, nbatch_train): end = start + nbatch_train mbinds = inds[start:end] slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mblossvals.append(model.train(lrnow, cliprangenow, *slices)) else: # recurrent version assert nenvs % nminibatches == 0 envsperbatch = nenvs // nminibatches envinds = np.arange(nenvs) flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps) envsperbatch = nbatch_train // nsteps for iteration in range(noptepochs): np.random.shuffle(envinds) lrnow = lr(frac, iteration) for start in range(0, nenvs, envsperbatch): end = start + envsperbatch mbenvinds = envinds[start:end] mbflatinds = flatinds[mbenvinds].ravel() slices = (arr[mbflatinds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mbstates = states[mbenvinds] mblossvals.append( model.train(lrnow, cliprangenow, *slices, mbstates)) lossvals = np.mean(mblossvals, axis=0) lrvals = np.mean(mblrvals) tnow = time.time() fps = int(nbatch / (tnow - tstart)) if update % log_interval == 0 or update == 1: ev = explained_variance(values, returns) logger.logkv("serial_timesteps", update * nsteps) logger.logkv("nupdates", update) logger.logkv("total_timesteps", update * nbatch) logger.logkv("fps", fps) logger.logkv("explained_variance", float(ev)) logger.logkv('eprewmean', safemean([epinfo['r'] for epinfo in epinfobuf])) logger.logkv('eplenmean', safemean([epinfo['l'] for epinfo in epinfobuf])) logger.logkv('time_elapsed', tnow - tfirststart) logger.logkv('lr_mean', lrvals) logger.logkv('epscoremean', safemean([epinfo['s'] for epinfo in epinfobuf])) for (lossval, lossname) in zip(lossvals, model.loss_names): logger.logkv(lossname, lossval) if MPI.COMM_WORLD.Get_rank() == 0: logger.dumpkvs() if save_interval and (update % save_interval == 0 or update == 1 or 
update == nupdates) and logger.get_dir( ) and MPI.COMM_WORLD.Get_rank() == 0: checkdir = osp.join(logger.get_dir(), 'checkpoints') os.makedirs(checkdir, exist_ok=True) savepath = osp.join(checkdir, '%.5i' % update) print('Saving to', savepath) model.save(savepath) env.close() return model
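# The learning loop above relies on two small helpers: `constfn`, which wraps a float learning
# rate or clip range into a schedule, and `safemean`, which guards np.mean against empty buffers.
# Sketches consistent with how they are called here (the variadic schedule signature is an
# assumption drawn from the `lr(frac, iteration)` call above):
import numpy as np

def constfn(val):
    # Constant schedule: ignore the progress arguments and always return val (sketch).
    def f(*_args):
        return val
    return f

def safemean(xs):
    # NaN instead of a warning when the episode-info buffer is still empty (sketch).
    return np.nan if len(xs) == 0 else np.mean(xs)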
def call(self, perform, save_networks, use_expert, expert, on_policy): runner, model, buffer, steps = self.runner, self.model, self.buffer, self.steps expert_buffer = self.expert_buffer if on_policy: enc_obs, obs, actions, rewards, mus, dones, masks = runner.run() runner.myrun() # if self.flag>0: # print(self.flag,'=================================') # print(enc_obs, obs, actions, rewards, mus, dones, masks) # self.flag = self.flag -1 self.episode_stats.feed(rewards, dones) if buffer is not None and not perform: buffer.put(enc_obs, actions, rewards, mus, dones, masks) else: # get obs, actions, rewards, mus, dones from buffer. obs, actions, rewards, mus, dones, masks = buffer.get() #enc_obs, obs, actions, rewards, mus, dones, masks = runner.run() if not perform: # reshape stuff correctly obs = obs.reshape(runner.batch_ob_shape) actions = actions.reshape([runner.nbatch]) rewards = rewards.reshape([runner.nbatch]) mus = mus.reshape([runner.nbatch, runner.nact]) dones = dones.reshape([runner.nbatch]) masks = masks.reshape([runner.batch_ob_shape[0]]) if not use_expert: names_ops, values_ops = model.train(obs, actions, rewards, dones, mus, model.initial_state, masks, steps) else: expert_obs, expert_actions, expert_rewards, expert_mus, expert_dones, expert_masks = expert.get( ) expert_obs = expert_obs.reshape(runner.batch_ob_shape) expert_actions = expert_actions.reshape([runner.nbatch]) expert_rewards = expert_rewards.reshape([runner.nbatch]) expert_mus = expert_mus.reshape([runner.nbatch, runner.nact]) expert_dones = expert_dones.reshape([runner.nbatch]) expert_masks = expert_masks.reshape([runner.batch_ob_shape[0]]) names_ops, values_ops = model.expert_train( obs, actions, rewards, dones, mus, model.initial_state, masks, steps, expert_obs, expert_actions, expert_rewards, expert_mus, expert_dones, expert_masks) if on_policy and (int(steps / runner.nbatch) % self.log_interval == 0): logger.record_tabular("total_timesteps", steps) logger.record_tabular("fps", int(steps / (time.time() - self.tstart))) # IMP: In EpisodicLife env, during training, we get done=True at each loss of life, not just at the terminal state. # Thus, this is mean until end of life, not end of episode. # For true episode rewards, see the monitor files in the log folder. 
logger.record_tabular("mean_episode_length", self.episode_stats.mean_length()) logger.record_tabular("mean_episode_reward", self.episode_stats.mean_reward()) for name, val in zip(names_ops, values_ops): logger.record_tabular(name, float(val)) logger.dump_tabular() if save_networks and (int(steps / runner.nbatch) % self.log_interval * 10 == 0): model.save(int(steps)) else: #if perform expert_buffer.append( [enc_obs, actions, rewards, mus, dones, masks]) if len(expert_buffer) > 0 and len(expert_buffer) % 100 == 0: expert_dir = os.path.join('./expert') + '/' if not os.path.exists(expert_dir): os.makedirs(expert_dir) pwritefile = open(os.path.join(expert_dir, 'expert_test.pkl'), 'wb') pickle.dump(expert_buffer, pwritefile, -1) pwritefile.close() logger.info('Successfully Saved the Expert Data') obs = obs.reshape(runner.batch_ob_shape) actions = actions.reshape([runner.nbatch]) rewards = rewards.reshape([runner.nbatch]) mus = mus.reshape([runner.nbatch, runner.nact]) dones = dones.reshape([runner.nbatch]) masks = masks.reshape([runner.batch_ob_shape[0]]) if on_policy and (int(steps / runner.nbatch) % self.log_interval == 0): logger.record_tabular("total_timesteps", steps) logger.record_tabular("fps", int(steps / (time.time() - self.tstart))) # IMP: In EpisodicLife env, during training, we get done=True at each loss of life, not just at the terminal state. # Thus, this is mean until end of life, not end of episode. # For true episode rewards, see the monitor files in the log folder. logger.record_tabular("mean_episode_length", self.episode_stats.mean_length()) logger.record_tabular("mean_episode_reward", self.episode_stats.mean_reward()) logger.dump_tabular()
def _create_network(self, reuse=False): logger.info("Creating a DDPG agent with action space %d x %s..." % (self.dimu, self.max_u)) self.sess = tf_util.get_session() # running averages with tf.variable_scope('o_stats') as vs: if reuse: vs.reuse_variables() self.o_stats = Normalizer(self.dimo, self.norm_eps, self.norm_clip, sess=self.sess) with tf.variable_scope('g_stats') as vs: if reuse: vs.reuse_variables() self.g_stats = Normalizer(self.dimg, self.norm_eps, self.norm_clip, sess=self.sess) # mini-batch sampling. batch = self.staging_tf.get() batch_tf = OrderedDict([(key, batch[i]) for i, key in enumerate(self.stage_shapes.keys())]) batch_tf['r'] = tf.reshape(batch_tf['r'], [-1, 1]) #choose only the demo buffer samples mask = np.concatenate((np.zeros(self.batch_size - self.demo_batch_size), np.ones(self.demo_batch_size)), axis = 0) # networks with tf.variable_scope('main') as vs: if reuse: vs.reuse_variables() self.main = self.create_actor_critic(batch_tf, net_type='main', **self.__dict__) vs.reuse_variables() with tf.variable_scope('target') as vs: if reuse: vs.reuse_variables() target_batch_tf = batch_tf.copy() target_batch_tf['o'] = batch_tf['o_2'] target_batch_tf['g'] = batch_tf['g_2'] self.target = self.create_actor_critic( target_batch_tf, net_type='target', **self.__dict__) vs.reuse_variables() assert len(self._vars("main")) == len(self._vars("target")) # loss functions target_Q_pi_tf = self.target.Q_pi_tf clip_range = (-self.clip_return, 0. if self.clip_pos_returns else np.inf) target_tf = tf.clip_by_value(batch_tf['r'] + self.gamma * target_Q_pi_tf, *clip_range) self.Q_loss_tf = tf.reduce_mean(tf.square(tf.stop_gradient(target_tf) - self.main.Q_tf)) if self.bc_loss ==1 and self.q_filter == 1 : # train with demonstrations and use bc_loss and q_filter both maskMain = tf.reshape(tf.boolean_mask(self.main.Q_tf > self.main.Q_pi_tf, mask), [-1]) #where is the demonstrator action better than actor action according to the critic? 
choose those samples only #define the cloning loss on the actor's actions only on the samples which adhere to the above masks self.cloning_loss_tf = tf.reduce_sum(tf.square(tf.boolean_mask(tf.boolean_mask((self.main.pi_tf), mask), maskMain, axis=0) - tf.boolean_mask(tf.boolean_mask((batch_tf['u']), mask), maskMain, axis=0))) self.pi_loss_tf = -self.prm_loss_weight * tf.reduce_mean(self.main.Q_pi_tf) #primary loss scaled by it's respective weight prm_loss_weight self.pi_loss_tf += self.prm_loss_weight * self.action_l2 * tf.reduce_mean(tf.square(self.main.pi_tf / self.max_u)) #L2 loss on action values scaled by the same weight prm_loss_weight self.pi_loss_tf += self.aux_loss_weight * self.cloning_loss_tf #adding the cloning loss to the actor loss as an auxilliary loss scaled by its weight aux_loss_weight elif self.bc_loss == 1 and self.q_filter == 0: # train with demonstrations without q_filter self.cloning_loss_tf = tf.reduce_sum(tf.square(tf.boolean_mask((self.main.pi_tf), mask) - tf.boolean_mask((batch_tf['u']), mask))) self.pi_loss_tf = -self.prm_loss_weight * tf.reduce_mean(self.main.Q_pi_tf) self.pi_loss_tf += self.prm_loss_weight * self.action_l2 * tf.reduce_mean(tf.square(self.main.pi_tf / self.max_u)) self.pi_loss_tf += self.aux_loss_weight * self.cloning_loss_tf else: #If not training with demonstrations self.pi_loss_tf = -tf.reduce_mean(self.main.Q_pi_tf) self.pi_loss_tf += self.action_l2 * tf.reduce_mean(tf.square(self.main.pi_tf / self.max_u)) Q_grads_tf = tf.gradients(self.Q_loss_tf, self._vars('main/Q')) pi_grads_tf = tf.gradients(self.pi_loss_tf, self._vars('main/pi')) assert len(self._vars('main/Q')) == len(Q_grads_tf) assert len(self._vars('main/pi')) == len(pi_grads_tf) self.Q_grads_vars_tf = zip(Q_grads_tf, self._vars('main/Q')) self.pi_grads_vars_tf = zip(pi_grads_tf, self._vars('main/pi')) self.Q_grad_tf = flatten_grads(grads=Q_grads_tf, var_list=self._vars('main/Q')) self.pi_grad_tf = flatten_grads(grads=pi_grads_tf, var_list=self._vars('main/pi')) # optimizers self.Q_adam = MpiAdam(self._vars('main/Q'), scale_grad_by_procs=False) self.pi_adam = MpiAdam(self._vars('main/pi'), scale_grad_by_procs=False) # polyak averaging self.main_vars = self._vars('main/Q') + self._vars('main/pi') self.target_vars = self._vars('target/Q') + self._vars('target/pi') self.stats_vars = self._global_vars('o_stats') + self._global_vars('g_stats') self.init_target_net_op = list( map(lambda v: v[0].assign(v[1]), zip(self.target_vars, self.main_vars))) self.update_target_net_op = list( map(lambda v: v[0].assign(self.polyak * v[0] + (1. - self.polyak) * v[1]), zip(self.target_vars, self.main_vars))) # initialize all variables tf.variables_initializer(self._global_vars('')).run() self._sync_optimizers() self._init_target_net()
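# The boolean mask built above assumes the demonstration samples occupy the last
# `demo_batch_size` slots of every training batch. A tiny NumPy illustration with toy sizes
# (the numbers are made up for the example):
import numpy as np

batch_size, demo_batch_size = 6, 2
mask = np.concatenate((np.zeros(batch_size - demo_batch_size),
                       np.ones(demo_batch_size)), axis=0).astype(bool)

actions = np.arange(batch_size)   # stand-in for batch_tf['u']
print(actions[mask])              # -> [4 5]: only the trailing demo samples are kept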
def learn(network, env, seed=None, total_timesteps=None, nb_epochs=None, # with default settings, perform 1M steps total nb_epoch_cycles=20, nb_rollout_steps=100, reward_scale=1.0, render=False, render_eval=False, noise_type='adaptive-param_0.2', normalize_returns=False, normalize_observations=True, critic_l2_reg=1e-2, actor_lr=1e-4, critic_lr=1e-3, popart=False, gamma=0.99, clip_norm=None, nb_train_steps=50, # per epoch cycle and MPI worker, nb_eval_steps=100, batch_size=64, # per MPI worker tau=0.01, eval_env=None, param_noise_adaption_interval=50, **network_kwargs): set_global_seeds(seed) if total_timesteps is not None: assert nb_epochs is None nb_epochs = int(total_timesteps) // (nb_epoch_cycles * nb_rollout_steps) else: nb_epochs = 500 if MPI is not None: rank = MPI.COMM_WORLD.Get_rank() else: rank = 0 nb_actions = env.action_space.shape[-1] assert (np.abs(env.action_space.low) == env.action_space.high).all() # we assume symmetric actions. memory = Memory(limit=int(1e6), action_shape=env.action_space.shape, observation_shape=env.observation_space.shape) critic = Critic(network=network, **network_kwargs) actor = Actor(nb_actions, network=network, **network_kwargs) action_noise = None param_noise = None if noise_type is not None: for current_noise_type in noise_type.split(','): current_noise_type = current_noise_type.strip() if current_noise_type == 'none': pass elif 'adaptive-param' in current_noise_type: _, stddev = current_noise_type.split('_') param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev), desired_action_stddev=float(stddev)) elif 'normal' in current_noise_type: _, stddev = current_noise_type.split('_') action_noise = NormalActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions)) elif 'ou' in current_noise_type: _, stddev = current_noise_type.split('_') action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions)) else: raise RuntimeError('unknown noise type "{}"'.format(current_noise_type)) max_action = env.action_space.high logger.info('scaling actions by {} before executing in env'.format(max_action)) agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape, gamma=gamma, tau=tau, normalize_returns=normalize_returns, normalize_observations=normalize_observations, batch_size=batch_size, action_noise=action_noise, param_noise=param_noise, critic_l2_reg=critic_l2_reg, actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart, clip_norm=clip_norm, reward_scale=reward_scale) logger.info('Using agent with the following configuration:') logger.info(str(agent.__dict__.items())) eval_episode_rewards_history = deque(maxlen=100) episode_rewards_history = deque(maxlen=100) sess = U.get_session() # Prepare everything. agent.initialize(sess) sess.graph.finalize() agent.reset() obs = env.reset() if eval_env is not None: eval_obs = eval_env.reset() nenvs = obs.shape[0] episode_reward = np.zeros(nenvs, dtype = np.float32) #vector episode_step = np.zeros(nenvs, dtype = int) # vector episodes = 0 #scalar t = 0 # scalar epoch = 0 start_time = time.time() epoch_episode_rewards = [] epoch_episode_steps = [] epoch_actions = [] epoch_qs = [] epoch_episodes = 0 for epoch in range(nb_epochs): for cycle in range(nb_epoch_cycles): # Perform rollouts. 
if nenvs > 1: # if simulating multiple envs in parallel, impossible to reset agent at the end of the episode in each # of the environments, so resetting here instead agent.reset() for t_rollout in range(nb_rollout_steps): # Predict next action. action, q, _, _ = agent.step(obs, apply_noise=True, compute_Q=True) # Execute next action. if rank == 0 and render: env.render() # max_action is of dimension A, whereas action is dimension (nenvs, A) - the multiplication gets broadcasted to the batch new_obs, r, done, info = env.step(max_action * action) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) # note these outputs are batched from vecenv t += 1 if rank == 0 and render: env.render() episode_reward += r episode_step += 1 # Book-keeping. epoch_actions.append(action) epoch_qs.append(q) agent.store_transition(obs, action, r, new_obs, done) #the batched data will be unrolled in memory.py's append. obs = new_obs for d in range(len(done)): if done[d]: # Episode done. epoch_episode_rewards.append(episode_reward[d]) episode_rewards_history.append(episode_reward[d]) epoch_episode_steps.append(episode_step[d]) episode_reward[d] = 0. episode_step[d] = 0 epoch_episodes += 1 episodes += 1 if nenvs == 1: agent.reset() # Train. epoch_actor_losses = [] epoch_critic_losses = [] epoch_adaptive_distances = [] for t_train in range(nb_train_steps): # Adapt param noise, if necessary. if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0: distance = agent.adapt_param_noise() epoch_adaptive_distances.append(distance) cl, al = agent.train() epoch_critic_losses.append(cl) epoch_actor_losses.append(al) agent.update_target_net() # Evaluate. eval_episode_rewards = [] eval_qs = [] if eval_env is not None: nenvs_eval = eval_obs.shape[0] eval_episode_reward = np.zeros(nenvs_eval, dtype = np.float32) for t_rollout in range(nb_eval_steps): eval_action, eval_q, _, _ = agent.step(eval_obs, apply_noise=False, compute_Q=True) eval_obs, eval_r, eval_done, eval_info = eval_env.step(max_action * eval_action) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) if render_eval: eval_env.render() eval_episode_reward += eval_r eval_qs.append(eval_q) for d in range(len(eval_done)): if eval_done[d]: eval_episode_rewards.append(eval_episode_reward[d]) eval_episode_rewards_history.append(eval_episode_reward[d]) eval_episode_reward[d] = 0.0 if MPI is not None: mpi_size = MPI.COMM_WORLD.Get_size() else: mpi_size = 1 # Log stats. # XXX shouldn't call np.mean on variable length lists duration = time.time() - start_time stats = agent.get_stats() combined_stats = stats.copy() combined_stats['rollout/return'] = np.mean(epoch_episode_rewards) combined_stats['rollout/return_history'] = np.mean(episode_rewards_history) combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps) combined_stats['rollout/actions_mean'] = np.mean(epoch_actions) combined_stats['rollout/Q_mean'] = np.mean(epoch_qs) combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses) combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses) combined_stats['train/param_noise_distance'] = np.mean(epoch_adaptive_distances) combined_stats['total/duration'] = duration combined_stats['total/steps_per_second'] = float(t) / float(duration) combined_stats['total/episodes'] = episodes combined_stats['rollout/episodes'] = epoch_episodes combined_stats['rollout/actions_std'] = np.std(epoch_actions) # Evaluation statistics. 
if eval_env is not None: combined_stats['eval/return'] = eval_episode_rewards combined_stats['eval/return_history'] = np.mean(eval_episode_rewards_history) combined_stats['eval/Q'] = eval_qs combined_stats['eval/episodes'] = len(eval_episode_rewards) def as_scalar(x): if isinstance(x, np.ndarray): assert x.size == 1 return x[0] elif np.isscalar(x): return x else: raise ValueError('expected scalar, got %s'%x) combined_stats_sums = np.array([ np.array(x).flatten()[0] for x in combined_stats.values()]) if MPI is not None: combined_stats_sums = MPI.COMM_WORLD.allreduce(combined_stats_sums) combined_stats = {k : v / mpi_size for (k,v) in zip(combined_stats.keys(), combined_stats_sums)} # Total statistics. combined_stats['total/epochs'] = epoch + 1 combined_stats['total/steps'] = t for key in sorted(combined_stats.keys()): logger.record_tabular(key, combined_stats[key]) if rank == 0: logger.dump_tabular() logger.info('') logdir = logger.get_dir() if rank == 0 and logdir: if hasattr(env, 'get_state'): with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f: pickle.dump(env.get_state(), f) if eval_env and hasattr(eval_env, 'get_state'): with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f: pickle.dump(eval_env.get_state(), f) return agent
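# The logging block above reduces each stat to a scalar, sums it across MPI workers with
# allreduce and divides by the world size, i.e. it reports the mean over workers. A minimal
# sketch of the same arithmetic without MPI, using a list of per-worker stat dicts as a
# stand-in for the allreduce (names here are illustrative):
import numpy as np

def average_stats(per_worker_stats):
    keys = list(per_worker_stats[0].keys())
    sums = np.sum([[np.asarray(s[k]).flatten()[0] for k in keys]
                   for s in per_worker_stats], axis=0)
    return {k: v / len(per_worker_stats) for k, v in zip(keys, sums)}

if __name__ == '__main__':
    workers = [{'rollout/return': 1.0, 'train/loss_actor': 0.2},
               {'rollout/return': 3.0, 'train/loss_actor': 0.4}]
    print(average_stats(workers))  # mean over the two "workers"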
def learn(*, network, env, env_name, total_timesteps, eval_env=None, seed=None, nsteps=2048, ent_coef=0.0, lr=3e-4, vf_coef=0.5, max_grad_norm=0.5, gamma=0.99, lam=0.95, log_interval=10, nminibatches=4, noptepochs=4, cliprange=0.2, save_interval=0, load_path=None, model_fn=None, update_fn=None, init_fn=None, mpi_rank_weight=1, comm=None, data_aug='normal', **network_kwargs): ''' Learn policy using PPO algorithm (https://arxiv.org/abs/1707.06347) Parameters: ---------- network: policy network architecture. Either string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for full list) specifying the standard network architecture, or a function that takes tensorflow tensor as input and returns tuple (output_tensor, extra_feed) where output tensor is the last network layer output, extra_feed is None for feed-forward neural nets, and extra_feed is a dictionary describing how to feed state into the network for recurrent neural nets. See common/models.py/lstm for more details on using recurrent nets in policies env: baselines.common.vec_env.VecEnv environment. Needs to be vectorized for parallel environment simulation. The environments produced by gym.make can be wrapped using baselines.common.vec_env.DummyVecEnv class. nsteps: int number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where nenv is number of environment copies simulated in parallel) total_timesteps: int number of timesteps (i.e. number of actions taken in the environment) ent_coef: float policy entropy coefficient in the optimization objective lr: float or function learning rate, constant or a schedule function [0,1] -> R+ where 1 is beginning of the training and 0 is the end of the training. vf_coef: float value function loss coefficient in the optimization objective max_grad_norm: float or None gradient norm clipping coefficient gamma: float discounting factor lam: float advantage estimation discounting factor (lambda in the paper) log_interval: int number of timesteps between logging events nminibatches: int number of training minibatches per update. For recurrent policies, should be smaller or equal than number of environments run in parallel. noptepochs: int number of training epochs per update cliprange: float or function clipping range, constant or schedule function [0,1] -> R+ where 1 is beginning of the training and 0 is the end of the training save_interval: int number of timesteps between saving events load_path: str path to load the model from **network_kwargs: keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network For instance, 'mlp' network architecture has arguments num_hidden and num_layers. 
''' set_global_seeds(seed) if isinstance(lr, float): lr = constfn(lr) else: assert callable(lr) if isinstance(cliprange, float): cliprange = constfn(cliprange) else: assert callable(cliprange) total_timesteps = int(total_timesteps) policy = build_policy(env, network, data_aug=data_aug, **network_kwargs) # Get the nb of env nenvs = env.num_envs # Get state_space and action_space ob_space = env.observation_space ac_space = env.action_space # Calculate the batch_size nbatch = nenvs * nsteps nbatch_train = nbatch // nminibatches is_mpi_root = (MPI is None or MPI.COMM_WORLD.Get_rank() == 0) # Instantiate the model object (that creates act_model and train_model) if model_fn is None: from baselines.ppo2.model import Model model_fn = Model model = model_fn(policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=nenvs, nbatch_train=nbatch_train, nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef, max_grad_norm=max_grad_norm, comm=comm, mpi_rank_weight=mpi_rank_weight) if load_path is not None: model.load(load_path) # Instantiate the runner object runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam, data_aug=data_aug) if eval_env is not None: eval_runner = Runner(env=eval_env, model=model, nsteps=nsteps, gamma=gamma, lam=lam, data_aug=data_aug, eval_flag=True) epinfobuf = deque(maxlen=100) if eval_env is not None: eval_epinfobuf = deque(maxlen=100) if init_fn is not None: init_fn() # Start total timer tfirststart = time.perf_counter() # random filter sess = tf.get_default_session() rand_processes = [v for v in tf.global_variables() if 'randcnn' in v.name] if len(rand_processes) > 0: init_process = tf.variables_initializer(rand_processes) else: init_process = None nupdates = total_timesteps // nbatch traj_dict = {'obs': [], 'acts': [], 'rews': [], 'dones': [], 'infos': []} traj_count = 0 for update in range(1, nupdates + 1): assert nbatch % nminibatches == 0 # Start timer tstart = time.perf_counter() frac = 1.0 - (update - 1.0) / nupdates # Calculate the learning rate lrnow = lr(frac) # Calculate the cliprange cliprangenow = cliprange(frac) if update % log_interval == 0 and is_mpi_root: logger.info('Stepping environment...') # randomize parameters if len(rand_processes) > 0: sess.run(init_process) logger.info('Randomizing parameters...') # Get minibatch obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run( ) #pylint: disable=E0632 traj_dict['obs'].append(obs) traj_dict['acts'].append(actions) traj_dict['rews'].append(returns) traj_dict['dones'].append(masks) traj_dict['infos'].append(epinfos) if eval_env is not None: eval_obs, eval_returns, eval_masks, eval_actions, eval_values, eval_neglogpacs, eval_states, eval_epinfos = eval_runner.run( ) #pylint: disable=E0632 if update % log_interval == 0 and is_mpi_root: logger.info('Done.') epinfobuf.extend(epinfos) if eval_env is not None: eval_epinfobuf.extend(eval_epinfos) # Here what we're going to do is for each minibatch calculate the loss and append it. 
mblossvals = [] if states is None: # nonrecurrent version # Index of each element of batch_size # Create the indices array inds = np.arange(nbatch) for _ in range(noptepochs): # Randomize the indexes np.random.shuffle(inds) # 0 to batch_size with batch_train_size step for start in range(0, nbatch, nbatch_train): end = start + nbatch_train mbinds = inds[start:end] slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs)) if load_path is not None: mblossvals.append( model.train(lrnow, cliprangenow, *slices)) else: # recurrent version assert nenvs % nminibatches == 0 envsperbatch = nenvs // nminibatches envinds = np.arange(nenvs) flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps) for _ in range(noptepochs): np.random.shuffle(envinds) for start in range(0, nenvs, envsperbatch): end = start + envsperbatch mbenvinds = envinds[start:end] mbflatinds = flatinds[mbenvinds].ravel() slices = (arr[mbflatinds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mbstates = states[mbenvinds] if load_path is not None: mblossvals.append( model.train(lrnow, cliprangenow, *slices, mbstates)) # Feedforward --> get losses --> update lossvals = np.mean(mblossvals, axis=0) # End timer tnow = time.perf_counter() # Calculate the fps (frame per second) fps = int(nbatch / (tnow - tstart)) if update_fn is not None: update_fn(update) if update % log_interval == 0 or update == 1: # Calculates if value function is a good predicator of the returns (ev > 1) # or if it's just worse than predicting nothing (ev =< 0) ev = explained_variance(values, returns) logger.logkv("misc/serial_timesteps", update * nsteps) logger.logkv("misc/nupdates", update) logger.logkv("misc/total_timesteps", update * nbatch) logger.logkv("fps", fps) logger.logkv("misc/explained_variance", float(ev)) logger.logkv('eprewmean', safemean([epinfo['r'] for epinfo in epinfobuf])) logger.logkv('eplenmean', safemean([epinfo['l'] for epinfo in epinfobuf])) if eval_env is not None: logger.logkv( 'eval_eprewmean', safemean([epinfo['r'] for epinfo in eval_epinfobuf])) logger.logkv( 'eval_eplenmean', safemean([epinfo['l'] for epinfo in eval_epinfobuf])) logger.logkv('misc/time_elapsed', tnow - tfirststart) for (lossval, lossname) in zip(lossvals, model.loss_names): logger.logkv('loss/' + lossname, lossval) logger.dumpkvs() if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir() and is_mpi_root: if load_path is not None: video_out_path = osp.join( logger.get_dir(), f'video_{env_name}_traj_{traj_count}.mp4') video_writer = TensorFrameWriter(video_out_path, make_grid=False, adjust_axis=False) obs = obs for image, done in zip(obs, masks): if not done: video_writer.add_tensor(image) else: video_writer.close() breakpoint() traj_count += 1 video_out_path = osp.join( logger.get_dir(), f'video_{env_name}_traj_{traj_count}.mp4') video_writer = TensorFrameWriter(video_out_path, make_grid=False, adjust_axis=False) else: checkdir = osp.join(logger.get_dir(), 'checkpoints') os.makedirs(checkdir, exist_ok=True) savepath = osp.join(checkdir, '%.5i' % update) print('Saving to', savepath) model.save(savepath) break all_dones = traj_dict['dones'].copy() all_dones = np.concatenate(all_dones, axis=0) traj_ends, = np.nonzero(all_dones) real_n_traj = len(traj_ends) trajectories = [] traj_starts = np.concatenate(([0], traj_ends[:-1]), axis=0) traj_start_end = np.stack((traj_starts, traj_ends), axis=1) for start_idx, end_idx in traj_start_end: trajectories.append( {k: v[start_idx:end_idx] for k, v in 
traj_dict.items()}) n_traj = len(trajectories) import pickle traj_filename = osp.join(logger.get_dir(), f'demo_{env_name}.pickle') with open(traj_filename, 'wb') as handle: pickle.dump(traj_dict, handle, protocol=pickle.HIGHEST_PROTOCOL) video_writer.close() return model
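# The trajectory export above concatenates the collected `dones`, takes the indices where an
# episode finished, and slices the buffers between consecutive end indices. A minimal NumPy
# sketch of that splitting step on a toy flat rollout (the real buffers are batched per
# environment; this is illustrative only):
import numpy as np

def split_by_dones(values, dones):
    ends, = np.nonzero(dones)                      # indices where an episode finished
    starts = np.concatenate(([0], ends[:-1]))      # each slice starts at the previous end index,
    return [values[s:e] for s, e in zip(starts, ends)]  # mirroring the slicing above

if __name__ == '__main__':
    dones = np.array([0, 0, 1, 0, 0, 0, 1, 0, 1], dtype=bool)
    obs = np.arange(len(dones))
    for i, traj in enumerate(split_by_dones(obs, dones)):
        print('traj', i, traj)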
def __init__( self, *, scope, ob_space, ac_space, stochpol_fn, nsteps, nepochs=4, nminibatches=1, gamma=0.99, gamma_ext=0.99, lam=0.95, ent_coef=0, cliprange=0.2, max_grad_norm=1.0, vf_coef=1.0, lr=30e-5, adam_hps=None, testing=False, comm=None, comm_train=None, use_news=False, update_ob_stats_every_step=True, int_coeff=None, ext_coeff=None, ): self.lr = lr self.ext_coeff = ext_coeff self.int_coeff = int_coeff self.use_news = use_news self.update_ob_stats_every_step = update_ob_stats_every_step self.abs_scope = (tf.get_variable_scope().name + '/' + scope).lstrip('/') self.testing = testing self.comm_log = MPI.COMM_SELF if comm is not None and comm.Get_size() > 1: self.comm_log = comm assert not testing or comm.Get_rank( ) != 0, "Worker number zero can't be testing" if comm_train is not None: self.comm_train, self.comm_train_size = comm_train, comm_train.Get_size( ) else: self.comm_train, self.comm_train_size = self.comm_log, self.comm_log.Get_size( ) self.is_log_leader = self.comm_log.Get_rank() == 0 self.is_train_leader = self.comm_train.Get_rank() == 0 with tf.variable_scope(scope): self.best_ret = -np.inf self.local_best_ret = -np.inf self.rooms = [] self.local_rooms = [] self.scores = [] self.ob_space = ob_space self.ac_space = ac_space self.stochpol = stochpol_fn() self.nepochs = nepochs self.cliprange = cliprange self.nsteps = nsteps self.nminibatches = nminibatches self.gamma = gamma self.gamma_ext = gamma_ext self.lam = lam self.adam_hps = adam_hps or dict() self.ph_adv = tf.placeholder(tf.float32, [None, None]) self.ph_ret_int = tf.placeholder(tf.float32, [None, None]) self.ph_ret_ext = tf.placeholder(tf.float32, [None, None]) self.ph_oldnlp = tf.placeholder(tf.float32, [None, None]) self.ph_oldvpred = tf.placeholder(tf.float32, [None, None]) self.ph_lr = tf.placeholder(tf.float32, []) self.ph_lr_pred = tf.placeholder(tf.float32, []) self.ph_cliprange = tf.placeholder(tf.float32, []) #Define loss. neglogpac = self.stochpol.pd_opt.neglogp(self.stochpol.ph_ac) entropy = tf.reduce_mean(self.stochpol.pd_opt.entropy()) vf_loss_int = (0.5 * vf_coef) * tf.reduce_mean( tf.square(self.stochpol.vpred_int_opt - self.ph_ret_int)) vf_loss_ext = (0.5 * vf_coef) * tf.reduce_mean( tf.square(self.stochpol.vpred_ext_opt - self.ph_ret_ext)) vf_loss = vf_loss_int + vf_loss_ext ratio = tf.exp(self.ph_oldnlp - neglogpac) # p_new / p_old negadv = -self.ph_adv pg_losses1 = negadv * ratio pg_losses2 = negadv * tf.clip_by_value( ratio, 1.0 - self.ph_cliprange, 1.0 + self.ph_cliprange) pg_loss = tf.reduce_mean(tf.maximum(pg_losses1, pg_losses2)) ent_loss = (-ent_coef) * entropy approxkl = .5 * tf.reduce_mean( tf.square(neglogpac - self.ph_oldnlp)) maxkl = .5 * tf.reduce_max(tf.square(neglogpac - self.ph_oldnlp)) clipfrac = tf.reduce_mean( tf.to_float(tf.greater(tf.abs(ratio - 1.0), self.ph_cliprange))) loss = pg_loss + ent_loss + vf_loss + self.stochpol.aux_loss #Create optimizer. params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.abs_scope) logger.info("PPO: using MpiAdamOptimizer connected to %i peers" % self.comm_train_size) trainer = MpiAdamOptimizer(self.comm_train, learning_rate=self.ph_lr, **self.adam_hps) grads_and_vars = trainer.compute_gradients(loss, params) grads, vars = zip(*grads_and_vars) if max_grad_norm: _, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm) global_grad_norm = tf.global_norm(grads) grads_and_vars = list(zip(grads, vars)) self._train = trainer.apply_gradients(grads_and_vars) #Quantities for reporting. 
self._losses = [ loss, pg_loss, vf_loss, entropy, clipfrac, approxkl, maxkl, self.stochpol.aux_loss, self.stochpol.feat_var, self.stochpol.max_feat, global_grad_norm ] self.loss_names = [ 'tot', 'pg', 'vf', 'ent', 'clipfrac', 'approxkl', 'maxkl', "auxloss", "featvar", "maxfeat", "gradnorm" ] self.I = None self.disable_policy_update = None allvars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=self.abs_scope) if self.is_log_leader: tf_util.display_var_info(allvars) tf.get_default_session().run(tf.variables_initializer(allvars)) sync_from_root(tf.get_default_session(), allvars) #Syncs initialization across mpi workers. self.t0 = time.time() self.global_tcount = 0
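# The policy loss assembled in the constructor above is the standard PPO clipped surrogate:
# the probability ratio comes from stored vs. current negative log-probabilities, and the
# pessimistic maximum of the clipped and unclipped terms is averaged. A minimal NumPy sketch of
# that objective plus the approxkl and clipfrac diagnostics logged alongside it:
import numpy as np

def ppo_policy_loss(neglogp_old, neglogp_new, adv, cliprange):
    ratio = np.exp(neglogp_old - neglogp_new)                 # p_new / p_old
    pg1 = -adv * ratio
    pg2 = -adv * np.clip(ratio, 1.0 - cliprange, 1.0 + cliprange)
    pg_loss = np.mean(np.maximum(pg1, pg2))
    approxkl = 0.5 * np.mean(np.square(neglogp_new - neglogp_old))
    clipfrac = np.mean(np.abs(ratio - 1.0) > cliprange)
    return pg_loss, approxkl, clipfrac

if __name__ == '__main__':
    rng = np.random.RandomState(0)
    old = rng.rand(64)
    new = old + 0.05 * rng.randn(64)
    print(ppo_policy_loss(old, new, rng.randn(64), cliprange=0.2))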
def save(training_step): logger.info( 'Saving network with {} training steps'.format(training_step)) self.saver.save(self.sess, self.cpk_dir + 'demo', global_step=training_step)
elif 'normal' in current_noise_type: _, stddev = current_noise_type.split('_') action_noise = NormalActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions)) elif 'ou' in current_noise_type: _, stddev = current_noise_type.split('_') action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions)) else: raise RuntimeError('unknown noise type "{}"'.format(current_noise_type)) # Configure components of DDPG memory = Memory(limit=int(1e6), action_shape=env.action_space.shape, observation_shape=env.observation_space.shape) critic = Critic(layer_norm=args.layer_norm) actor = Actor(nb_actions, layer_norm=args.layer_norm) # Seed everything to make things reproducible. seed = args.seed + 1000000 * rank logger.info('rank {}: seed={}, logdir={}'.format(rank, seed, logger.get_dir())) # tf.reset_default_graph() set_global_seeds(seed) env.seed(seed) if eval_env is not None: eval_env.seed(seed) # Disable logging for rank != 0 to avoid noise. if rank == 0: start_time = time.time() # Derive the different numbers for the training process num_timesteps = args.num_timesteps nb_rollout_steps = args.nb_rollout_steps nb_epoch_cycles = args.nb_epoch_cycles nb_epochs = num_timesteps//(nb_rollout_steps*nb_epoch_cycles)
def train(env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, render, param_noise, actor, critic, normalize_returns, normalize_observations, critic_l2_reg, actor_lr, critic_lr, action_noise, popart, gamma, clip_norm, nb_train_steps, nb_rollout_steps, nb_eval_steps, batch_size, memory, tau=0.01, eval_env=None, param_noise_adaption_interval=50): rank = MPI.COMM_WORLD.Get_rank() assert (np.abs(env.action_space.low) == env.action_space.high ).all() # we assume symmetric actions. max_action = env.action_space.high logger.info( 'scaling actions by {} before executing in env'.format(max_action)) agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape, gamma=gamma, tau=tau, normalize_returns=normalize_returns, normalize_observations=normalize_observations, batch_size=batch_size, action_noise=action_noise, param_noise=param_noise, critic_l2_reg=critic_l2_reg, actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart, clip_norm=clip_norm, reward_scale=reward_scale) logger.info('Using agent with the following configuration:') logger.info(str(agent.__dict__.items())) # Set up logging stuff only for a single worker. if rank == 0: saver = tf.train.Saver() else: saver = None step = 0 episode = 0 eval_episode_rewards_history = deque(maxlen=100) episode_rewards_history = deque(maxlen=100) tf_config = tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1) tf_config.gpu_options.allow_growth = True with tf.Session(config=tf_config) as sess: # Prepare everything. agent.initialize(sess) sess.graph.finalize() agent.reset() obs = env.reset() if eval_env is not None: eval_obs = eval_env.reset() done = False episode_reward = 0. episode_step = 0 episodes = 0 t = 0 epoch = 0 start_time = time.time() epoch_episode_rewards = [] epoch_episode_steps = [] epoch_episode_eval_rewards = [] epoch_episode_eval_steps = [] epoch_start_time = time.time() epoch_actions = [] epoch_qs = [] epoch_episodes = 0 for epoch in range(nb_epochs): for cycle in range(nb_epoch_cycles): # Perform rollouts. for t_rollout in range(nb_rollout_steps): # Predict next action. action, q = agent.pi(obs, apply_noise=True, compute_Q=True) assert action.shape == env.action_space.shape # Execute next action. if rank == 0 and render: env.render() assert max_action.shape == action.shape new_obs, r, done, info = env.step( max_action * action ) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) t += 1 if rank == 0 and render: env.render() episode_reward += r episode_step += 1 # Book-keeping. epoch_actions.append(action) epoch_qs.append(q) agent.store_transition(obs, action, r, new_obs, done) obs = new_obs if done: # Episode done. epoch_episode_rewards.append(episode_reward) episode_rewards_history.append(episode_reward) epoch_episode_steps.append(episode_step) episode_reward = 0. episode_step = 0 epoch_episodes += 1 episodes += 1 agent.reset() obs = env.reset() # Train. epoch_actor_losses = [] epoch_critic_losses = [] epoch_adaptive_distances = [] for t_train in range(nb_train_steps): # Adapt param noise, if necessary. if memory.nb_entries >= batch_size and t % param_noise_adaption_interval == 0: distance = agent.adapt_param_noise() epoch_adaptive_distances.append(distance) cl, al = agent.train() epoch_critic_losses.append(cl) epoch_actor_losses.append(al) agent.update_target_net() # Evaluate. eval_episode_rewards = [] eval_qs = [] if eval_env is not None: eval_episode_reward = 0. 
for t_rollout in range(nb_eval_steps): eval_action, eval_q = agent.pi(eval_obs, apply_noise=False, compute_Q=True) eval_obs, eval_r, eval_done, eval_info = eval_env.step( max_action * eval_action ) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) if render_eval: eval_env.render() eval_episode_reward += eval_r eval_qs.append(eval_q) if eval_done: eval_obs = eval_env.reset() eval_episode_rewards.append(eval_episode_reward) eval_episode_rewards_history.append( eval_episode_reward) eval_episode_reward = 0. # Log stats. epoch_train_duration = time.time() - epoch_start_time duration = time.time() - start_time stats = agent.get_stats() combined_stats = {} for key in sorted(stats.keys()): combined_stats[key] = mpi_mean(stats[key]) # Rollout statistics. combined_stats['rollout/return'] = mpi_mean(epoch_episode_rewards) combined_stats['rollout/return_history'] = mpi_mean( np.mean(episode_rewards_history)) combined_stats['rollout/episode_steps'] = mpi_mean( epoch_episode_steps) combined_stats['rollout/episodes'] = mpi_sum(epoch_episodes) combined_stats['rollout/actions_mean'] = mpi_mean(epoch_actions) combined_stats['rollout/actions_std'] = mpi_std(epoch_actions) combined_stats['rollout/Q_mean'] = mpi_mean(epoch_qs) # Train statistics. combined_stats['train/loss_actor'] = mpi_mean(epoch_actor_losses) combined_stats['train/loss_critic'] = mpi_mean(epoch_critic_losses) combined_stats['train/param_noise_distance'] = mpi_mean( epoch_adaptive_distances) # Evaluation statistics. if eval_env is not None: combined_stats['eval/return'] = mpi_mean(eval_episode_rewards) combined_stats['eval/return_history'] = mpi_mean( np.mean(eval_episode_rewards_history)) combined_stats['eval/Q'] = mpi_mean(eval_qs) combined_stats['eval/episodes'] = mpi_mean( len(eval_episode_rewards)) # Total statistics. combined_stats['total/duration'] = mpi_mean(duration) combined_stats['total/steps_per_second'] = mpi_mean( float(t) / float(duration)) combined_stats['total/episodes'] = mpi_mean(episodes) combined_stats['total/epochs'] = epoch + 1 combined_stats['total/steps'] = t for key in sorted(combined_stats.keys()): logger.record_tabular(key, combined_stats[key]) logger.dump_tabular() logger.info('') logdir = logger.get_dir() if rank == 0 and logdir: if hasattr(env, 'get_state'): with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f: pickle.dump(env.get_state(), f) if eval_env and hasattr(eval_env, 'get_state'): with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f: pickle.dump(eval_env.get_state(), f)
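# The rollouts above add exploration noise to the deterministic DDPG policy via parameter noise
# or an action-noise process such as OrnsteinUhlenbeckActionNoise. A minimal NumPy sketch of an
# OU process; the theta/dt coefficients are illustrative choices, not necessarily the library's
# exact defaults.
import numpy as np

class OUNoiseSketch:
    def __init__(self, mu, sigma, theta=0.15, dt=1e-2):
        self.mu, self.sigma, self.theta, self.dt = mu, sigma, theta, dt
        self.reset()

    def reset(self):
        self.x = np.copy(self.mu)

    def __call__(self):
        # mean-reverting random walk: drifts back towards mu with Gaussian kicks
        self.x = (self.x + self.theta * (self.mu - self.x) * self.dt
                  + self.sigma * np.sqrt(self.dt) * np.random.randn(*self.mu.shape))
        return self.x

if __name__ == '__main__':
    noise = OUNoiseSketch(mu=np.zeros(2), sigma=0.2 * np.ones(2))
    action = np.tanh(np.random.randn(2)) + noise()   # noisy action before clipping/scaling
    print(np.clip(action, -1.0, 1.0))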
def log_params(params, logger=logger): for key in sorted(params.keys()): logger.info('{}: {}'.format(key, params[key]))
def run(seed, noise_type, layer_norm, **kwargs): """Configure things.""" rank = MPI.COMM_WORLD.Get_rank() if rank != 0: logger.set_level(logger.DISABLED) """Create Simulation envs.""" env = PegintoHoles() """Create True envs""" # env = Env_robot_control() """Parse noise_type""" action_noise = None param_noise = None nb_actions = env.action_dim for current_noise_type in noise_type.split(','): current_noise_type = current_noise_type.strip() if current_noise_type == 'none': pass elif 'adaptive-param' in current_noise_type: _, stddev = current_noise_type.split('_') action_noise = OrnsteinUhlenbeckActionNoise( mu=np.zeros(nb_actions), sigma=float(0.2) * np.ones(nb_actions)) param_noise = AdaptiveParamNoiseSpec( initial_stddev=float(stddev), desired_action_stddev=float(stddev)) elif 'normal' in current_noise_type: _, stddev = current_noise_type.split('_') action_noise = NormalActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions)) elif 'ou' in current_noise_type: _, stddev = current_noise_type.split('_') action_noise = OrnsteinUhlenbeckActionNoise( mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions)) else: raise RuntimeError( 'unknown noise type "{}"'.format(current_noise_type)) """Configure components.""" memory = Memory(limit=int(1e6), action_shape=env.action_dim, observation_shape=env.state_dim) critic = Critic(layer_norm=layer_norm) actor = Actor(nb_actions, layer_norm=layer_norm) """Seed everything to make things reproducible.""" seed = seed + 1000000 * rank logger.info('rank {}: seed={}, logdir={}'.format(rank, seed, logger.get_dir())) tf.reset_default_graph() set_global_seeds(seed) """Disable logging to avoid noise.""" start_time = time.time() """Train the model""" training.train(env=env, param_noise=param_noise, action_noise=action_noise, actor=actor, critic=critic, memory=memory, **kwargs) """Eval the result""" logger.info('total runtime: {}s'.format(time.time() - start_time))
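# run() above parses a comma-separated `noise_type` string such as 'adaptive-param_0.2' or
# 'ou_0.3,normal_0.1' into noise objects. A minimal sketch of just the parsing step, returning
# plain dicts instead of the actual noise classes (the dict layout is an assumption for
# illustration):
def parse_noise_type(noise_type, nb_actions):
    specs = []
    for current in noise_type.split(','):
        current = current.strip()
        if current == 'none':
            continue
        if '_' not in current:
            raise RuntimeError('unknown noise type "{}"'.format(current))
        kind, stddev = current.rsplit('_', 1)
        if kind not in ('adaptive-param', 'normal', 'ou'):
            raise RuntimeError('unknown noise type "{}"'.format(current))
        specs.append({'kind': kind, 'stddev': float(stddev), 'nb_actions': nb_actions})
    return specs

if __name__ == '__main__':
    print(parse_noise_type('adaptive-param_0.2,ou_0.3', nb_actions=6))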
def learn(*, network, env, total_timesteps, seed=None, eval_env=None, replay_strategy='future', policy_save_interval=5, clip_return=True, demo_file=None, override_params=None, load_path=None, save_path=None, **kwargs): override_params = override_params or {} if MPI is not None: rank = MPI.COMM_WORLD.Get_rank() num_cpu = MPI.COMM_WORLD.Get_size() # Seed everything. rank_seed = seed + 1000000 * rank if seed is not None else None set_global_seeds(rank_seed) # Prepare params. params = config.DEFAULT_PARAMS env_name = env.spec.id params['env_name'] = env_name params['replay_strategy'] = replay_strategy if env_name in config.DEFAULT_ENV_PARAMS: params.update(config.DEFAULT_ENV_PARAMS[env_name] ) # merge env-specific parameters in params.update( **override_params) # makes it possible to override any parameter with open(os.path.join(logger.get_dir(), 'params.json'), 'w') as f: json.dump(params, f) params = config.prepare_params(params) params['rollout_batch_size'] = env.num_envs if demo_file is not None: params['bc_loss'] = 1 params.update(kwargs) config.log_params(params, logger=logger) if num_cpu == 1: logger.warn() logger.warn('*** Warning ***') logger.warn( 'You are running HER with just a single MPI worker. This will work, but the ' + 'experiments that we report in Plappert et al. (2018, https://arxiv.org/abs/1802.09464) ' + 'were obtained with --num_cpu 19. This makes a significant difference and if you ' + 'are looking to reproduce those results, be aware of this. Please also refer to ' + 'https://github.com/openai/baselines/issues/314 for further details.' ) logger.warn('****************') logger.warn() dims = config.configure_dims(params) policy = config.configure_ddpg(dims=dims, params=params, clip_return=clip_return) if load_path is not None: tf_util.load_variables(load_path) rollout_params = { 'exploit': False, 'use_target_net': False, 'use_demo_states': True, 'compute_Q': False, 'T': params['T'], } eval_params = { 'exploit': True, 'use_target_net': params['test_with_polyak'], 'use_demo_states': False, 'compute_Q': True, 'T': params['T'], } for name in [ 'T', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps' ]: rollout_params[name] = params[name] eval_params[name] = params[name] eval_env = eval_env or env rollout_worker = RolloutWorker(env, policy, dims, logger, monitor=True, **rollout_params) evaluator = RolloutWorker(eval_env, policy, dims, logger, **eval_params) n_cycles = params['n_cycles'] n_epochs = total_timesteps // n_cycles // rollout_worker.T // rollout_worker.rollout_batch_size logger.info("n_epochs:", n_epochs) return train(save_path=save_path, policy=policy, rollout_worker=rollout_worker, evaluator=evaluator, n_epochs=n_epochs, n_test_rollouts=params['n_test_rollouts'], n_cycles=params['n_cycles'], n_batches=params['n_batches'], policy_save_interval=policy_save_interval, demo_file=demo_file)
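# n_epochs above is derived from the timestep budget: each epoch runs n_cycles cycles, each
# cycle collects rollout_batch_size episodes of length T. A tiny sketch of that bookkeeping with
# assumed example values (the numbers below are illustrative, not defaults from the config):
def her_epochs(total_timesteps, n_cycles, T, rollout_batch_size):
    # same chained integer division as in learn() above
    return total_timesteps // n_cycles // T // rollout_batch_size

if __name__ == '__main__':
    # e.g. 2.5M steps, 50 cycles/epoch, 50-step episodes, 2 parallel envs -> 500 epochs
    print(her_epochs(2_500_000, n_cycles=50, T=50, rollout_batch_size=2))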
def define_self_prediction_rew(self, convfeat, rep_size, enlargement): logger.info( "Using RND BONUS ****************************************************" ) # RND bonus. # Random target network. for ph in self.ph_ob.values(): if len(ph.shape.as_list()) == 5: # B,T,H,W,C logger.info( f"CnnTarget: using '{ph.name}' shape {ph.shape} as image input" ) xr = ph[:, 1:] xr = tf.cast(xr, tf.float32) xr = tf.reshape(xr, (-1, *ph.shape.as_list()[-3:]))[:, :, :, -1:] xr = tf.clip_by_value((xr - self.ph_mean) / self.ph_std, -5.0, 5.0) xr = tf.nn.leaky_relu( conv( xr, "c1r", nf=convfeat * 1, rf=8, stride=4, init_scale=np.sqrt(2), )) xr = tf.nn.leaky_relu( conv( xr, "c2r", nf=convfeat * 2 * 1, rf=4, stride=2, init_scale=np.sqrt(2), )) xr = tf.nn.leaky_relu( conv( xr, "c3r", nf=convfeat * 2 * 1, rf=3, stride=1, init_scale=np.sqrt(2), )) rgbr = [to2d(xr)] X_r = fc(rgbr[0], "fc1r", nh=rep_size, init_scale=np.sqrt(2)) # Predictor network. for ph in self.ph_ob.values(): if len(ph.shape.as_list()) == 5: # B,T,H,W,C logger.info( f"CnnTarget: using '{ph.name}' shape {ph.shape} as image input" ) xrp = ph[:, 1:] xrp = tf.cast(xrp, tf.float32) xrp = tf.reshape(xrp, (-1, *ph.shape.as_list()[-3:]))[:, :, :, -1:] xrp = tf.clip_by_value((xrp - self.ph_mean) / self.ph_std, -5.0, 5.0) xrp = tf.nn.leaky_relu( conv( xrp, "c1rp_pred", nf=convfeat, rf=8, stride=4, init_scale=np.sqrt(2), )) xrp = tf.nn.leaky_relu( conv( xrp, "c2rp_pred", nf=convfeat * 2, rf=4, stride=2, init_scale=np.sqrt(2), )) xrp = tf.nn.leaky_relu( conv( xrp, "c3rp_pred", nf=convfeat * 2, rf=3, stride=1, init_scale=np.sqrt(2), )) rgbrp = to2d(xrp) X_r_hat = tf.nn.relu( fc( rgbrp, "fc1r_hat1_pred", nh=256 * enlargement, init_scale=np.sqrt(2), )) X_r_hat = tf.nn.relu( fc( X_r_hat, "fc1r_hat2_pred", nh=256 * enlargement, init_scale=np.sqrt(2), )) X_r_hat = fc(X_r_hat, "fc1r_hat3_pred", nh=rep_size, init_scale=np.sqrt(2)) self.feat_var = tf.reduce_mean(tf.nn.moments(X_r, axes=[0])[1]) self.max_feat = tf.reduce_max(tf.abs(X_r)) self.int_rew = tf.reduce_mean( tf.square(tf.stop_gradient(X_r) - X_r_hat), axis=-1, keep_dims=True) self.int_rew = tf.reshape(self.int_rew, (self.sy_nenvs, self.sy_nsteps - 1)) targets = tf.stop_gradient(X_r) self.aux_loss = tf.reduce_mean(tf.square(targets - X_r_hat), -1) mask = tf.random_uniform(shape=tf.shape(self.aux_loss), minval=0.0, maxval=1.0, dtype=tf.float32) mask = tf.cast(mask < self.proportion_of_exp_used_for_predictor_update, tf.float32) self.aux_loss = tf.reduce_sum(mask * self.aux_loss) / tf.maximum( tf.reduce_sum(mask), 1.0)
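# The RND bonus above feeds the observation through a fixed, randomly initialised target network
# and a trained predictor; the per-step intrinsic reward is the squared prediction error, and
# only a random fraction (proportion_of_exp_used_for_predictor_update) of samples contributes to
# the predictor's auxiliary loss. A minimal NumPy sketch with tiny dense nets standing in for
# the conv stacks; all shapes and the proportion value are illustrative assumptions.
import numpy as np

rng = np.random.RandomState(0)
obs_dim, rep_size, batch = 16, 8, 32

W_tgt = rng.randn(obs_dim, rep_size) / np.sqrt(obs_dim)    # fixed random target (never trained)
W_pred = rng.randn(obs_dim, rep_size) / np.sqrt(obs_dim)   # predictor (would be trained)

obs = rng.randn(batch, obs_dim)
feat_target = np.maximum(obs @ W_tgt, 0.0)                 # target features, treated as constants
feat_pred = np.maximum(obs @ W_pred, 0.0)

int_rew = np.mean(np.square(feat_target - feat_pred), axis=-1)   # per-sample novelty bonus

# subsample the auxiliary loss the same way as above: random mask, renormalised mean
proportion = 0.25
mask = (rng.rand(batch) < proportion).astype(np.float32)
aux_loss = np.sum(mask * int_rew) / max(np.sum(mask), 1.0)
print('mean intrinsic reward:', int_rew.mean(), 'aux loss:', aux_loss)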
def __init__(self, *, sess, policy, ob_space, ac_space, nbatch_act, nbatch_train, nsteps, ent_coef, vf_coef, max_grad_norm): # sess = tf.get_default_session() train_model = policy(sess, ob_space, ac_space, nbatch_train, nsteps) norm_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) act_model = policy(sess, ob_space, ac_space, nbatch_act, 1) A = train_model.pdtype.sample_placeholder([None]) ADV = tf.placeholder(tf.float32, [None]) R = tf.placeholder(tf.float32, [None]) OLDNEGLOGPAC = tf.placeholder(tf.float32, [None]) OLDVPRED = tf.placeholder(tf.float32, [None]) LR = tf.placeholder(tf.float32, []) CLIPRANGE = tf.placeholder(tf.float32, []) neglogpac = train_model.pd.neglogp(A) entropy = tf.reduce_mean(train_model.pd.entropy()) vpred = train_model.vf vpredclipped = OLDVPRED + tf.clip_by_value(train_model.vf - OLDVPRED, - CLIPRANGE, CLIPRANGE) vf_losses1 = tf.square(vpred - R) vf_losses2 = tf.square(vpredclipped - R) vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2)) ratio = tf.exp(OLDNEGLOGPAC - neglogpac) pg_losses = -ADV * ratio pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE) pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2)) approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC)) clipfrac = tf.reduce_mean(tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE))) params = tf.trainable_variables() weight_params = [v for v in params if '/b' not in v.name] total_num_params = 0 for p in params: shape = p.get_shape().as_list() num_params = np.prod(shape) # mpi_print('param', p, num_params) total_num_params += num_params logger.info('total num params:', total_num_params) l2_loss = tf.reduce_sum([tf.nn.l2_loss(v) for v in weight_params]) loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef + l2_loss * L2_WEIGHT trainer = tf.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5) grads_and_var = trainer.compute_gradients(loss, params) grads, var = zip(*grads_and_var) if max_grad_norm is not None: grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm) grads_and_var = list(zip(grads, var)) _train = trainer.apply_gradients(grads_and_var) def train(lr, cliprange, obs, returns, masks, actions, values, neglogpacs, states=None): advs = returns - values adv_mean = np.mean(advs, axis=0, keepdims=True) adv_std = np.std(advs, axis=0, keepdims=True) advs = (advs - adv_mean) / (adv_std + 1e-8) td_map = {train_model.X:obs, A:actions, ADV:advs, R:returns, LR:lr, CLIPRANGE:cliprange, OLDNEGLOGPAC:neglogpacs, OLDVPRED:values} if states is not None: td_map[train_model.S] = states td_map[train_model.M] = masks return sess.run( [pg_loss, vf_loss, entropy, approxkl, clipfrac, l2_loss, _train], td_map )[:-1] self.loss_names = ['policy_loss', 'value_loss', 'policy_entropy', 'approxkl', 'clipfrac', 'l2_loss'] def save(save_path): ps = sess.run(params) joblib.dump(ps, save_path) def load(load_path): loaded_params = joblib.load(load_path) restores = [] for p, loaded_p in zip(params, loaded_params): restores.append(p.assign(loaded_p)) sess.run(restores) self.train = train self.train_model = train_model self.act_model = act_model self.step = act_model.step self.value = act_model.value self.initial_state = act_model.initial_state self.save = save self.load = load initialize()
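# train() above normalises the advantages of every batch to zero mean and unit variance before
# feeding them to the clipped objective, which keeps the policy-gradient scale roughly constant
# across updates. A minimal NumPy sketch of that step:
import numpy as np

def normalize_advantages(returns, values, eps=1e-8):
    advs = returns - values
    return (advs - advs.mean()) / (advs.std() + eps)

if __name__ == '__main__':
    rng = np.random.RandomState(0)
    advs = normalize_advantages(rng.randn(64), rng.randn(64))
    print(advs.mean(), advs.std())   # ~0 and ~1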
def save_model(self, num_steps): logger.info("Saving model at step: {}".format(num_steps)) tf_util.save_state(self.model_path)
def learn(*, network, sess, env, nsteps, total_timesteps, ent_coef, lr, vf_coef=0.5, max_grad_norm=0.5, gamma=0.99, lam=0.95, log_interval=10, nminibatches=4, noptepochs=4, cliprange=0.2, save_interval=0, save_path=None, load_path=None, **network_kwargs): comm = MPI.COMM_WORLD rank = comm.Get_rank() mpi_size = comm.Get_size() #sess = tf.get_default_session() # tb_writer = TB_Writer(sess) if isinstance(lr, float): lr = constfn(lr) else: assert callable(lr) if isinstance(cliprange, float): cliprange = constfn(cliprange) else: assert callable(cliprange) total_timesteps = int(total_timesteps) nenvs = env.num_envs ob_space = env.observation_space ac_space = env.action_space nbatch = nenvs * nsteps nbatch_train = nbatch // nminibatches policy = RandCropCnnPolicy model = Model(policy=policy, sess=sess, ob_space=ob_space, ac_space=ac_space, nbatch_act=nenvs, nbatch_train=nbatch_train, nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef, max_grad_norm=max_grad_norm) # utils.load_all_params(sess) if load_path is not None: model.load(load_path) logger.info("Model pramas loaded from save") runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam) logger.info("Initilizing runner") epinfobuf10 = deque(maxlen=10) epinfobuf100 = deque(maxlen=100) tfirststart = time.time() active_ep_buf = epinfobuf100 nupdates = total_timesteps//nbatch logger.info("Running {} updates, each needs {} batches".format(nupdates, nbatch)) mean_rewards = [] datapoints = [] run_t_total = 0 train_t_total = 0 can_save = True checkpoints = list(range(0,2049,10)) saved_key_checkpoints = [False] * len(checkpoints) #init_rand = tf.variables_initializer([v for v in tf.global_variables() if 'randcnn' in v.name]) # if Config.SYNC_FROM_ROOT and rank != 0: # can_save = False # def save_model(base_name=None): # base_dict = {'datapoints': datapoints} # utils.save_params_in_scopes( # sess, ['model'], Config.get_save_file(base_name=base_name), base_dict) for update in range(1, nupdates+1): assert nbatch % nminibatches == 0 nbatch_train = nbatch // nminibatches tstart = time.time() frac = 1.0 - (update - 1.0) / nupdates lrnow = lr(frac) cliprangenow = cliprange(frac) logger.info('collecting rollouts...') run_tstart = time.time() obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run() epinfobuf10.extend(epinfos) epinfobuf100.extend(epinfos) run_elapsed = time.time() - run_tstart run_t_total += run_elapsed logger.info('rollouts complete') mblossvals = [] logger.info('updating parameters...') train_tstart = time.time() if states is None: # nonrecurrent version inds = np.arange(nbatch) for _ in range(noptepochs): np.random.shuffle(inds) for start in range(0, nbatch, nbatch_train): end = start + nbatch_train mbinds = inds[start:end] slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mblossvals.append(model.train(lrnow, cliprangenow, *slices)) else: # recurrent version assert nenvs % nminibatches == 0 envinds = np.arange(nenvs) flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps) envsperbatch = nbatch_train // nsteps for _ in range(noptepochs): np.random.shuffle(envinds) for start in range(0, nenvs, envsperbatch): end = start + envsperbatch mbenvinds = envinds[start:end] mbflatinds = flatinds[mbenvinds].ravel() slices = (arr[mbflatinds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mbstates = states[mbenvinds] mblossvals.append(model.train(lrnow, cliprangenow, *slices, mbstates)) # update the dropout mask sess.run([model.train_model.dropout_assign_ops]) 
train_elapsed = time.time() - train_tstart train_t_total += train_elapsed logger.info('update complete') lossvals = np.mean(mblossvals, axis=0) tnow = time.time() fps = int(nbatch / (tnow - tstart)) if update % log_interval == 0 or update == 1: step = update*nbatch #rew_mean_10 = utils.process_ep_buf(active_ep_buf, tb_writer=tb_writer, suffix='', step=step) rew_mean_10 = safemean([epinfo['r'] for epinfo in epinfobuf10]) rew_mean_100 = safemean([epinfo['r'] for epinfo in epinfobuf100]) ep_len_mean_10 = np.nanmean([epinfo['l'] for epinfo in epinfobuf10]) ep_len_mean_100 = np.nanmean([epinfo['l'] for epinfo in epinfobuf100]) logger.info('\n----', update) mean_rewards.append(rew_mean_10) datapoints.append([step, rew_mean_10]) mean_rewards.append(rew_mean_10) logger.logkv('eprew10', rew_mean_10) logger.logkv('eprew100', rew_mean_100) logger.logkv('eplenmean10', ep_len_mean_10) logger.logkv('eplenmean100', ep_len_mean_100) #logger.info('time_elapsed', tnow - tfirststart, run_t_total, train_t_total) logger.logkv('misc/total_time_elapsed', tnow - tfirststart) logger.logkv('misc/run_t_total', run_t_total) logger.logkv('misc/train_t_total', train_t_total) logger.logkv('nupdate', update) #logger.info('timesteps', update*nsteps, total_timesteps) logger.logkv("misc/total_timesteps", update*nbatch) logger.logkv("misc/serial_timesteps", update*nsteps) #logger.info('fps', fps) logger.logkv("fps", fps) if len(mblossvals): for (lossval, lossname) in zip(lossvals, model.loss_names): logger.info(lossname, lossval) #tb_writer.log_scalar(lossval, lossname) logger.logkv('loss/' + lossname, lossval) logger.info('----\n') logger.dumpkvs() #if can_save: if 0: ## not doing checkpoint saving yet if save_interval and (update % save_interval == 0): save_model() for j, checkpoint in enumerate(checkpoints): if (not saved_key_checkpoints[j]) and (step >= (checkpoint * 1e6)): saved_key_checkpoints[j] = True save_model(str(checkpoint) + 'M') # save_model() if save_path: model.save(save_path) env.close() return model
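# The optimisation loop above runs noptepochs passes over each rollout, reshuffling flat sample
# indices every pass (non-recurrent case) or whole environment indices (recurrent case) and
# training on consecutive slices. A minimal sketch of the non-recurrent index bookkeeping:
import numpy as np

def minibatch_indices(nbatch, nminibatches, noptepochs, seed=0):
    rng = np.random.RandomState(seed)
    nbatch_train = nbatch // nminibatches
    inds = np.arange(nbatch)
    for _ in range(noptepochs):
        rng.shuffle(inds)
        for start in range(0, nbatch, nbatch_train):
            yield inds[start:start + nbatch_train]

if __name__ == '__main__':
    for i, mb in enumerate(minibatch_indices(nbatch=8, nminibatches=2, noptepochs=2)):
        print('minibatch', i, mb)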
def update(self): #Some logic gathering best ret, rooms etc using MPI. temp = sum(MPI.COMM_WORLD.allgather(self.local_rooms), []) temp = sorted(list(set(temp))) self.rooms = temp temp = sum(MPI.COMM_WORLD.allgather(self.scores), []) temp = sorted(list(set(temp))) self.scores = temp temp = sum(MPI.COMM_WORLD.allgather([self.local_best_ret]), []) self.best_ret = max(temp) eprews = MPI.COMM_WORLD.allgather( np.mean(list(self.I.statlists["eprew"]))) local_best_rets = MPI.COMM_WORLD.allgather(self.local_best_ret) n_rooms = sum(MPI.COMM_WORLD.allgather([len(self.local_rooms)]), []) if MPI.COMM_WORLD.Get_rank() == 0: logger.info(f"Rooms visited {self.rooms}") logger.info(f"Best return {self.best_ret}") logger.info(f"Best local return {sorted(local_best_rets)}") logger.info(f"eprews {sorted(eprews)}") logger.info(f"n_rooms {sorted(n_rooms)}") logger.info(f"Extrinsic coefficient {self.ext_coeff}") logger.info(f"Gamma {self.gamma}") logger.info(f"Gamma ext {self.gamma_ext}") logger.info(f"All scores {sorted(self.scores)}") #Normalize intrinsic rewards. rffs_int = np.array( [self.I.rff_int.update(rew) for rew in self.I.buf_rews_int.T]) self.I.rff_rms_int.update(rffs_int.ravel()) rews_int = self.I.buf_rews_int / np.sqrt(self.I.rff_rms_int.var) self.mean_int_rew = np.mean(rews_int) self.max_int_rew = np.max(rews_int) #Don't normalize extrinsic rewards. rews_ext = self.I.buf_rews_ext rewmean, rewstd, rewmax = self.I.buf_rews_int.mean( ), self.I.buf_rews_int.std(), np.max(self.I.buf_rews_int) #Calculate intrinsic returns and advantages. lastgaelam = 0 for t in range(self.nsteps - 1, -1, -1): # nsteps-2 ... 0 if self.use_news: nextnew = self.I.buf_news[:, t + 1] if t + 1 < self.nsteps else self.I.buf_new_last else: nextnew = 0.0 #No dones for intrinsic reward. nextvals = self.I.buf_vpreds_int[:, t + 1] if t + 1 < self.nsteps else self.I.buf_vpred_int_last nextnotnew = 1 - nextnew delta = rews_int[:, t] + self.gamma * nextvals * nextnotnew - self.I.buf_vpreds_int[:, t] self.I.buf_advs_int[:, t] = lastgaelam = delta + self.gamma * self.lam * nextnotnew * lastgaelam rets_int = self.I.buf_advs_int + self.I.buf_vpreds_int #Calculate extrinsic returns and advantages. lastgaelam = 0 for t in range(self.nsteps - 1, -1, -1): # nsteps-2 ... 0 nextnew = self.I.buf_news[:, t + 1] if t + 1 < self.nsteps else self.I.buf_new_last #Use dones for extrinsic reward. nextvals = self.I.buf_vpreds_ext[:, t + 1] if t + 1 < self.nsteps else self.I.buf_vpred_ext_last nextnotnew = 1 - nextnew delta = rews_ext[:, t] + self.gamma_ext * nextvals * nextnotnew - self.I.buf_vpreds_ext[:, t] self.I.buf_advs_ext[:, t] = lastgaelam = delta + self.gamma_ext * self.lam * nextnotnew * lastgaelam rets_ext = self.I.buf_advs_ext + self.I.buf_vpreds_ext #Combine the extrinsic and intrinsic advantages. self.I.buf_advs = self.int_coeff * self.I.buf_advs_int + self.ext_coeff * self.I.buf_advs_ext #Collects info for reporting. 
info = dict( advmean=self.I.buf_advs.mean(), advstd=self.I.buf_advs.std(), retintmean=rets_int.mean(), # previously retmean retintstd=rets_int.std(), # previously retstd retextmean=rets_ext.mean(), # previously not there retextstd=rets_ext.std(), # previously not there rewintmean_unnorm=rewmean, # previously rewmean rewintmax_unnorm=rewmax, # previously not there rewintmean_norm=self.mean_int_rew, # previously rewintmean rewintmax_norm=self.max_int_rew, # previously rewintmax rewintstd_unnorm=rewstd, # previously rewstd vpredintmean=self.I.buf_vpreds_int.mean(), # previously vpredmean vpredintstd=self.I.buf_vpreds_int.std(), # previously vrpedstd vpredextmean=self.I.buf_vpreds_ext.mean(), # previously not there vpredextstd=self.I.buf_vpreds_ext.std(), # previously not there ev_int=np.clip( explained_variance(self.I.buf_vpreds_int.ravel(), rets_int.ravel()), -1, None), ev_ext=np.clip( explained_variance(self.I.buf_vpreds_ext.ravel(), rets_ext.ravel()), -1, None), rooms=SemicolonList(self.rooms), n_rooms=len(self.rooms), best_ret=self.best_ret, reset_counter=self.I.reset_counter) info[f'mem_available'] = psutil.virtual_memory().available to_record = { 'acs': self.I.buf_acs, 'rews_int': self.I.buf_rews_int, 'rews_int_norm': rews_int, 'rews_ext': self.I.buf_rews_ext, 'vpred_int': self.I.buf_vpreds_int, 'vpred_ext': self.I.buf_vpreds_ext, 'adv_int': self.I.buf_advs_int, 'adv_ext': self.I.buf_advs_ext, 'ent': self.I.buf_ent, 'ret_int': rets_int, 'ret_ext': rets_ext, } if self.I.venvs[0].record_obs: to_record['obs'] = self.I.buf_obs[None] self.recorder.record(bufs=to_record, infos=self.I.buf_epinfos) #Create feeddict for optimization. envsperbatch = self.I.nenvs // self.nminibatches ph_buf = [ (self.stochpol.ph_ac, self.I.buf_acs), (self.ph_ret_int, rets_int), (self.ph_ret_ext, rets_ext), (self.ph_oldnlp, self.I.buf_nlps), (self.ph_adv, self.I.buf_advs), ] if self.I.mem_state is not NO_STATES: ph_buf.extend([ (self.stochpol.ph_istate, self.I.seg_init_mem_state), (self.stochpol.ph_new, self.I.buf_news), ]) verbose = True if verbose and self.is_log_leader: samples = np.prod(self.I.buf_advs.shape) logger.info( "buffer shape %s, samples_per_mpi=%i, mini_per_mpi=%i, samples=%i, mini=%i " % (str(self.I.buf_advs.shape), samples, samples // self.nminibatches, samples * self.comm_train_size, samples * self.comm_train_size // self.nminibatches)) logger.info(" " * 6 + fmt_row(13, self.loss_names)) epoch = 0 start = 0 #Optimizes on current data for several epochs. while epoch < self.nepochs: end = start + envsperbatch mbenvinds = slice(start, end, None) fd = {ph: buf[mbenvinds] for (ph, buf) in ph_buf} fd.update({self.ph_lr: self.lr, self.ph_cliprange: self.cliprange}) fd[self.stochpol.ph_ob[None]] = np.concatenate([ self.I.buf_obs[None][mbenvinds], self.I.buf_ob_last[None][mbenvinds, None] ], 1) assert list(fd[self.stochpol.ph_ob[None]].shape) == [self.I.nenvs//self.nminibatches, self.nsteps + 1] + list(self.ob_space.shape), \ [fd[self.stochpol.ph_ob[None]].shape, [self.I.nenvs//self.nminibatches, self.nsteps + 1] + list(self.ob_space.shape)] fd.update({ self.stochpol.ph_mean: self.stochpol.ob_rms.mean, self.stochpol.ph_std: self.stochpol.ob_rms.var**0.5 }) ret = tf.get_default_session().run(self._losses + [self._train], feed_dict=fd)[:-1] if not self.testing: lossdict = dict(zip([n for n in self.loss_names], ret), axis=0) else: lossdict = {} #Synchronize the lossdict across mpi processes, otherwise weights may be rolled back on one process but not another. 
_maxkl = lossdict.pop('maxkl') lossdict = dict_gather(self.comm_train, lossdict, op='mean') maxmaxkl = dict_gather(self.comm_train, {"maxkl": _maxkl}, op='max') lossdict["maxkl"] = maxmaxkl["maxkl"] if verbose and self.is_log_leader: logger.info( "%i:%03i %s" % (epoch, start, fmt_row(13, [lossdict[n] for n in self.loss_names]))) start += envsperbatch if start == self.I.nenvs: epoch += 1 start = 0 if self.is_train_leader: self.I.stats["n_updates"] += 1 info.update([('opt_' + n, lossdict[n]) for n in self.loss_names]) tnow = time.time() info['tps'] = self.nsteps * self.I.nenvs / (tnow - self.I.t_last_update) info['time_elapsed'] = time.time() - self.t0 self.I.t_last_update = tnow self.stochpol.update_normalization( # Necessary for continuous control tasks with odd obs ranges, only implemented in mlp policy, ob=self.I.buf_obs # NOTE: not shared via MPI ) return info
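# update() above first scales the intrinsic rewards by the running standard deviation of a
# discounted "reward forward filter", then computes separate GAE advantages for the intrinsic
# head (episode boundaries ignored when use_news is False) and the extrinsic head (boundaries
# respected), before mixing them with int_coeff/ext_coeff. A minimal NumPy sketch of the filter
# and the GAE recursion for a single environment; shapes and the running-std update are
# simplified, illustrative stand-ins.
import numpy as np

class RewardFilterSketch:
    """Discounted running sum of rewards, used only to scale the intrinsic reward."""
    def __init__(self, gamma):
        self.gamma, self.acc = gamma, 0.0

    def update(self, rew):
        self.acc = self.acc * self.gamma + rew
        return self.acc

def gae(rews, vpreds, vpred_last, news, new_last, gamma, lam):
    nsteps = len(rews)
    advs = np.zeros(nsteps)
    lastgaelam = 0.0
    for t in range(nsteps - 1, -1, -1):
        nextnew = news[t + 1] if t + 1 < nsteps else new_last
        nextval = vpreds[t + 1] if t + 1 < nsteps else vpred_last
        nextnotnew = 1.0 - nextnew
        delta = rews[t] + gamma * nextval * nextnotnew - vpreds[t]
        advs[t] = lastgaelam = delta + gamma * lam * nextnotnew * lastgaelam
    return advs, advs + vpreds                    # advantages, returns

if __name__ == '__main__':
    rng = np.random.RandomState(0)
    T = 8
    rews_int = rng.rand(T)
    filt = RewardFilterSketch(gamma=0.99)
    running = np.array([filt.update(r) for r in rews_int])
    rews_int_norm = rews_int / (running.std() + 1e-8)         # stand-in for rff_rms_int
    advs, rets = gae(rews_int_norm, rng.randn(T), rng.randn(), np.zeros(T), 0.0,
                     gamma=0.99, lam=0.95)
    print(advs.round(3), rets.round(3))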
def __init__(self, sess, prefix, env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, render, param_noise, actor, critic, normalize_returns, normalize_observations, critic_l2_reg, actor_lr, actor_dis_lr, critic_lr, exp_scale, action_noise, popart, gamma, clip_norm, nb_train_steps, nb_dis_train_steps, nb_rollout_steps, nb_eval_steps, batch_size, dis_batch_size, memory, tau=0.01, eval_env=None, param_noise_adaption_interval=50): self.sess = sess self.prefix = prefix self.env = env self.nb_epochs = nb_epochs self.nb_epoch_cycles = nb_epoch_cycles self.render_eval = render_eval self.reward_scale = reward_scale self.render = render self.param_noise = param_noise self.actor = actor self.critic = critic self.normalize_returns = normalize_returns self.normalize_observations = normalize_observations self.critic_l2_reg = critic_l2_reg self.actor_lr = actor_lr self.actor_dis_lr = actor_dis_lr self.critic_lr = critic_lr self.exp_scale = exp_scale self.action_noise = action_noise self.popart = popart self.gamma = gamma self.clip_norm = clip_norm self.nb_train_steps = nb_train_steps self.nb_dis_train_steps = nb_dis_train_steps self.nb_rollout_steps = nb_rollout_steps self.nb_eval_steps = nb_eval_steps self.batch_size = batch_size self.dis_batch_size = dis_batch_size self.memory = memory self.tau = tau self.eval_env = eval_env self.param_noise_adaption_interval = param_noise_adaption_interval self.rank = MPI.COMM_WORLD.Get_rank() assert (np.abs(self.env.action_space.low) == self.env.action_space.high ).all() # we assume symmetric actions. self.max_action = self.env.action_space.high logger.info('scaling actions by {} before executing in env'.format( self.max_action)) self.agent = DDPG(self.prefix, self.actor, self.critic, self.memory, self.env.observation_space.shape, self.env.action_space.shape, gamma=self.gamma, tau=self.tau, normalize_returns=self.normalize_returns, normalize_observations=self.normalize_observations, batch_size=self.batch_size, dis_batch_size=self.dis_batch_size, action_noise=self.action_noise, param_noise=self.param_noise, critic_l2_reg=self.critic_l2_reg, actor_lr=self.actor_lr, actor_dis_lr=self.actor_dis_lr, critic_lr=self.critic_lr, exp_scale=self.exp_scale, enable_popart=self.popart, clip_norm=self.clip_norm, reward_scale=self.reward_scale) logger.info('Using agent with the following configuration:') logger.info(str(self.agent.__dict__.items())) # Set up logging stuff only for a single worker. if self.rank == 0: self.saver = tf.train.Saver() else: self.saver = None self.step = 0 self.episode = 0 self.eval_episode_rewards_history = deque(maxlen=100) self.episode_rewards_history = deque(maxlen=100) #with U.single_threaded_session() as sess: #self.sess = U.single_threaded_session() # Prepare everything. self.agent.initialize(self.sess) #self.sess.graph.finalize() self.agent.reset() self.obs = self.env.reset() if self.eval_env is not None: self.eval_obs = eval_env.reset() self.done = False self.episode_reward = 0. self.episode_step = 0 self.episodes = 0 self.t = 0 self.epoch = 0 self.start_time = time.time() self.epoch_episode_rewards = [] self.epoch_episode_steps = [] self.epoch_episode_eval_rewards = [] self.epoch_episode_eval_steps = [] self.epoch_start_time = time.time() self.epoch_actions = [] self.epoch_qs = [] self.epoch_episodes = 0 self.epoch_actor_losses = [] self.epoch_critic_losses = [] self.epoch_adaptive_distances = [] self.eval_episode_rewards = [] self.eval_qs = []
parser.add_argument('--nb-train-steps', type=int, default=50) # per epoch cycle and MPI worker parser.add_argument('--nb-eval-steps', type=int, default=100) # per epoch cycle and MPI worker parser.add_argument('--nb-rollout-steps', type=int, default=100) # per epoch cycle and MPI worker parser.add_argument('--noise-type', type=str, default='adaptive-param_0.2') # choices are adaptive-param_xx, ou_xx, normal_xx, none parser.add_argument('--logdir', type=str, default=None) boolean_flag(parser, 'gym-monitor', default=False) boolean_flag(parser, 'evaluation', default=True) boolean_flag(parser, 'bind-to-core', default=False) return vars(parser.parse_args()) if __name__ == '__main__': args = parse_args() # Figure out what logdir to use. if args['logdir'] is None: args['logdir'] = os.getenv('OPENAI_LOGDIR') # Print and save arguments. logger.info('Arguments:') for key in sorted(args.keys()): logger.info('{}: {}'.format(key, args[key])) logger.info('') if args['logdir']: with open(os.path.join(args['logdir'], 'args.json'), 'w') as f: json.dump(args, f) # Run actual script. run(**args)
def _create_network(self, reuse=False): logger.info("Creating a DDPG agent with action space %d x %s..." % (self.dimu, self.max_u)) self.sess = tf.get_default_session() if self.sess is None: self.sess = tf.InteractiveSession() # running averages with tf.variable_scope('o_stats') as vs: if reuse: vs.reuse_variables() self.o_stats = Normalizer(self.dimo, self.norm_eps, self.norm_clip, sess=self.sess) with tf.variable_scope('g_stats') as vs: if reuse: vs.reuse_variables() self.g_stats = Normalizer(self.dimg, self.norm_eps, self.norm_clip, sess=self.sess) with tf.variable_scope('uo_stats') as vs: if reuse: vs.reuse_variables() self.uo_stats = Normalizer(self.dimo + self.dimg, self.norm_eps, self.norm_clip, sess=self.sess) with tf.variable_scope('lo_stats') as vs: if reuse: vs.reuse_variables() self.lo_stats = Normalizer(2 * self.dimo, self.norm_eps, self.norm_clip, sess=self.sess) ''' # mini-batch sampling. batch = self.staging_tf.get() batch_tf = OrderedDict([(key, batch[i]) for i, key in enumerate(self.stage_shapes.keys())]) batch_tf['r'] = tf.reshape(batch_tf['r'], [-1, 1]) #choose only the demo buffer samples mask = np.concatenate((np.zeros(self.batch_size - self.demo_batch_size), np.ones(self.demo_batch_size)), axis = 0) # networks with tf.variable_scope('main') as vs: if reuse: vs.reuse_variables() self.main = self.create_actor_critic(batch_tf, net_type='main', **self.__dict__) vs.reuse_variables() with tf.variable_scope('target') as vs: if reuse: vs.reuse_variables() target_batch_tf = batch_tf.copy() target_batch_tf['o'] = batch_tf['o_2'] target_batch_tf['g'] = batch_tf['g_2'] self.target = self.create_actor_critic( target_batch_tf, net_type='target', **self.__dict__) vs.reuse_variables() assert len(self._vars("main")) == len(self._vars("target")) # loss functions target_Q_pi_tf = self.target.Q_pi_tf clip_range = (-self.clip_return, 0. if self.clip_pos_returns else np.inf) target_tf = tf.clip_by_value(batch_tf['r'] + self.gamma * target_Q_pi_tf, *clip_range) self.Q_loss_tf = tf.reduce_mean(tf.square(tf.stop_gradient(target_tf) - self.main.Q_tf)) if self.bc_loss ==1 and self.q_filter == 1 : # train with demonstrations and use bc_loss and q_filter both maskMain = tf.reshape(tf.boolean_mask(self.main.Q_tf > self.main.Q_pi_tf, mask), [-1]) #where is the demonstrator action better than actor action according to the critic? 
choose those samples only #define the cloning loss on the actor's actions only on the samples which adhere to the above masks self.cloning_loss_tf = tf.reduce_sum(tf.square(tf.boolean_mask(tf.boolean_mask((self.main.pi_tf), mask), maskMain, axis=0) - tf.boolean_mask(tf.boolean_mask((batch_tf['u']), mask), maskMain, axis=0))) self.pi_loss_tf = -self.prm_loss_weight * tf.reduce_mean(self.main.Q_pi_tf) #primary loss scaled by it's respective weight prm_loss_weight self.pi_loss_tf += self.prm_loss_weight * self.action_l2 * tf.reduce_mean(tf.square(self.main.pi_tf / self.max_u)) #L2 loss on action values scaled by the same weight prm_loss_weight self.pi_loss_tf += self.aux_loss_weight * self.cloning_loss_tf #adding the cloning loss to the actor loss as an auxilliary loss scaled by its weight aux_loss_weight elif self.bc_loss == 1 and self.q_filter == 0: # train with demonstrations without q_filter self.cloning_loss_tf = tf.reduce_sum(tf.square(tf.boolean_mask((self.main.pi_tf), mask) - tf.boolean_mask((batch_tf['u']), mask))) self.pi_loss_tf = -self.prm_loss_weight * tf.reduce_mean(self.main.Q_pi_tf) self.pi_loss_tf += self.prm_loss_weight * self.action_l2 * tf.reduce_mean(tf.square(self.main.pi_tf / self.max_u)) self.pi_loss_tf += self.aux_loss_weight * self.cloning_loss_tf else: #If not training with demonstrations self.pi_loss_tf = -tf.reduce_mean(self.main.Q_pi_tf) self.pi_loss_tf += self.action_l2 * tf.reduce_mean(tf.square(self.main.pi_tf / self.max_u)) Q_grads_tf = tf.gradients(self.Q_loss_tf, self._vars('main/Q')) pi_grads_tf = tf.gradients(self.pi_loss_tf, self._vars('main/pi')) assert len(self._vars('main/Q')) == len(Q_grads_tf) assert len(self._vars('main/pi')) == len(pi_grads_tf) self.Q_grads_vars_tf = zip(Q_grads_tf, self._vars('main/Q')) self.pi_grads_vars_tf = zip(pi_grads_tf, self._vars('main/pi')) self.Q_grad_tf = flatten_grads(grads=Q_grads_tf, var_list=self._vars('main/Q')) self.pi_grad_tf = flatten_grads(grads=pi_grads_tf, var_list=self._vars('main/pi')) # optimizers self.Q_adam = MpiAdam(self._vars('main/Q'), scale_grad_by_procs=False) self.pi_adam = MpiAdam(self._vars('main/pi'), scale_grad_by_procs=False) ''' # polyak averaging #self.main_vars = self._vars('main/Q') + self._vars('main/pi') #self.target_vars = self._vars('target/Q') + self._vars('target/pi') self.stats_vars = self._global_vars('o_stats') + self._global_vars( 'g_stats') #self.init_target_net_op = list( # map(lambda v: v[0].assign(v[1]), zip(self.target_vars, self.main_vars))) #self.update_target_net_op = list( # map(lambda v: v[0].assign(self.polyak * v[0] + (1. - self.polyak) * v[1]), zip(self.target_vars, self.main_vars))) # initialize all variables tf.variables_initializer(self._global_vars('')).run()
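# _create_network() above builds Normalizer instances (o_stats, g_stats, ...) that keep running
# statistics of observations and goals so inputs can be standardised and clipped before entering
# the networks. A minimal NumPy sketch of such a normaliser based on running sums; the MPI
# synchronisation done by the real class is omitted and the eps/clip values are illustrative.
import numpy as np

class RunningNormalizerSketch:
    def __init__(self, size, eps=1e-2, clip=5.0):
        self.sum = np.zeros(size)
        self.sumsq = np.zeros(size)
        self.count = 0.0
        self.eps, self.clip = eps, clip

    def update(self, v):
        v = np.atleast_2d(v)
        self.sum += v.sum(axis=0)
        self.sumsq += np.square(v).sum(axis=0)
        self.count += v.shape[0]

    def normalize(self, v):
        mean = self.sum / max(self.count, 1.0)
        std = np.sqrt(np.maximum(self.sumsq / max(self.count, 1.0) - np.square(mean),
                                 np.square(self.eps)))
        return np.clip((v - mean) / std, -self.clip, self.clip)

if __name__ == '__main__':
    norm = RunningNormalizerSketch(size=3)
    norm.update(np.random.RandomState(0).randn(100, 3) * 4.0 + 1.0)
    print(norm.normalize(np.array([1.0, 1.0, 1.0])))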
def learn(*, network, env, total_timesteps, eval_env=None, seed=None, nsteps=2048, ent_coef=0.0, lr=3e-4, vf_coef=0.5, max_grad_norm=0.5, gamma=0.99, lam=0.95, log_interval=10, nminibatches=4, noptepochs=4, cliprange=0.2, save_interval=0, load_path=None, model_fn=None, update_fn=None, use_demo=False, demos=None, render_env=False, init_fn=None, mpi_rank_weight=1, comm=None, base_path=None, **network_kwargs): ''' Learn a policy using the PPO algorithm (https://arxiv.org/abs/1707.06347) Parameters: ---------- network: policy network architecture. Either a string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for the full list) specifying a standard network architecture, or a function that takes a tensorflow tensor as input and returns a tuple (output_tensor, extra_feed), where output_tensor is the last network layer output, extra_feed is None for feed-forward neural nets, and extra_feed is a dictionary describing how to feed state into the network for recurrent neural nets. See common/models.py/lstm for more details on using recurrent nets in policies. env: baselines.common.vec_env.VecEnv environment. Needs to be vectorized for parallel environment simulation. The environments produced by gym.make can be wrapped using the baselines.common.vec_env.DummyVecEnv class. nsteps: int number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where nenv is the number of environment copies simulated in parallel) total_timesteps: int number of timesteps (i.e. number of actions taken in the environment) ent_coef: float policy entropy coefficient in the optimization objective lr: float or function learning rate, constant or a schedule function [0,1] -> R+ where 1 is the beginning of the training and 0 is the end of the training. vf_coef: float value function loss coefficient in the optimization objective max_grad_norm: float or None gradient norm clipping coefficient gamma: float discounting factor lam: float advantage estimation discounting factor (lambda in the paper) log_interval: int number of updates between logging events nminibatches: int number of training minibatches per update. For recurrent policies, should be smaller than or equal to the number of environments run in parallel. noptepochs: int number of training epochs per update cliprange: float or function clipping range, constant or schedule function [0,1] -> R+ where 1 is the beginning of the training and 0 is the end of the training save_interval: int number of updates between model checkpoints load_path: str path to load the model from eval_env: baselines.common.vec_env.VecEnv or None optional evaluation environment; when provided, evaluation rollouts are collected and eval_eprewmean / eval_eplenmean are logged model_fn: callable or None optional model constructor; defaults to baselines.ppo2.model.Model update_fn: callable or None optional callback invoked with the update index after every optimization update init_fn: callable or None optional callback invoked once before training starts use_demo, demos, render_env: forwarded to the Runner (demonstration data and rendering) mpi_rank_weight, comm: forwarded to the model constructor for MPI-aware training base_path: str or None directory under which the models/ folder and performance.p are written; defaults to the directory of this file **network_kwargs: keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network. For instance, the 'mlp' network architecture has arguments num_hidden and num_layers. Returns a dict mapping 'reward' and each model loss name to the values collected during training. 
''' set_global_seeds(seed) if isinstance(lr, float): lr = constfn(lr) else: assert callable(lr) if isinstance(cliprange, float): cliprange = constfn(cliprange) else: assert callable(cliprange) total_timesteps = int(total_timesteps) policy = build_policy(env, network, **network_kwargs) # Get the number of environments nenvs = env.num_envs # Get state_space and action_space ob_space = env.observation_space ac_space = env.action_space # Calculate the batch size nbatch = nenvs * nsteps nbatch_train = nbatch // nminibatches is_mpi_root = (MPI is None or MPI.COMM_WORLD.Get_rank() == 0) # Instantiate the model object (that creates act_model and train_model) if model_fn is None: from baselines.ppo2.model import Model model_fn = Model model = model_fn(policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=nenvs, nbatch_train=nbatch_train, nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef, max_grad_norm=max_grad_norm, comm=comm, mpi_rank_weight=mpi_rank_weight) if save_interval and logger.get_dir(): if base_path is None: base_path = os.path.dirname(os.path.abspath(__file__)) if not os.path.isdir(osp.join(base_path, "models")): os.mkdir(osp.join(base_path, "models")) # with open(osp.join(base_path, "models", 'model.pkl'), 'wb') as fh: # fh.write(cloudpickle.dumps(model)) if load_path is not None: model.load(load_path) # Instantiate the runner object runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam, use_demo=use_demo, demos=demos, render_env=render_env) if eval_env is not None: eval_runner = Runner(env=eval_env, model=model, nsteps=nsteps, gamma=gamma, lam=lam) epinfobuf = deque(maxlen=100) if eval_env is not None: eval_epinfobuf = deque(maxlen=100) if init_fn is not None: init_fn() # Start total timer tfirststart = time.perf_counter() nupdates = total_timesteps // nbatch # initialize dictionary for data saving performance = {"reward": []} for name in model.loss_names: performance[name] = [] for update in range(1, nupdates + 1): assert nbatch % nminibatches == 0 # Start timer tstart = time.perf_counter() frac = 1.0 - (update - 1.0) / nupdates # Calculate the learning rate lrnow = lr(frac) # Calculate the cliprange cliprangenow = cliprange(frac) if update % log_interval == 0 and is_mpi_root: logger.info('Stepping environment...') # Get minibatch obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run() #pylint: disable=E0632 rewards = runner.mb_rewards # save reward data performance["reward"].extend(rewards) if eval_env is not None: eval_obs, eval_returns, eval_masks, eval_actions, eval_values, eval_neglogpacs, eval_states, eval_epinfos = eval_runner.run() #pylint: disable=E0632 if update % log_interval == 0 and is_mpi_root: logger.info('Done.') epinfobuf.extend(epinfos) if eval_env is not None: eval_epinfobuf.extend(eval_epinfos) # For each minibatch, compute the loss and append it. 
mblossvals = [] if states is None: # nonrecurrent version # Index of each element of batch_size # Create the indices array inds = np.arange(nbatch) for _ in range(noptepochs): # Randomize the indexes np.random.shuffle(inds) # 0 to batch_size with batch_train_size step for start in range(0, nbatch, nbatch_train): end = start + nbatch_train mbinds = inds[start:end] slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mblossvals.append(model.train(lrnow, cliprangenow, *slices)) else: # recurrent version assert nenvs % nminibatches == 0 envsperbatch = nenvs // nminibatches envinds = np.arange(nenvs) flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps) for _ in range(noptepochs): np.random.shuffle(envinds) for start in range(0, nenvs, envsperbatch): end = start + envsperbatch mbenvinds = envinds[start:end] mbflatinds = flatinds[mbenvinds].ravel() slices = (arr[mbflatinds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mbstates = states[mbenvinds] mblossvals.append(model.train(lrnow, cliprangenow, *slices, mbstates)) # Feedforward --> get losses --> update lossvals = np.mean(mblossvals, axis=0) # End timer tnow = time.perf_counter() # Calculate the fps (frames per second) fps = int(nbatch / (tnow - tstart)) if update_fn is not None: update_fn(update) if update % log_interval == 0 or update == 1: # Check whether the value function is a good predictor of the returns (ev close to 1) # or worse than predicting nothing (ev <= 0) ev = explained_variance(values, returns) logger.logkv("misc/serial_timesteps", update * nsteps) logger.logkv("misc/nupdates", update) logger.logkv("misc/total_timesteps", update * nbatch) logger.logkv("fps", fps) logger.logkv("misc/explained_variance", float(ev)) logger.logkv('eprewmean', safemean([epinfo['r'] for epinfo in epinfobuf])) logger.logkv('eplenmean', safemean([epinfo['l'] for epinfo in epinfobuf])) if eval_env is not None: logger.logkv('eval_eprewmean', safemean([epinfo['r'] for epinfo in eval_epinfobuf])) logger.logkv('eval_eplenmean', safemean([epinfo['l'] for epinfo in eval_epinfobuf])) logger.logkv('misc/time_elapsed', tnow - tfirststart) for (lossval, lossname) in zip(lossvals, model.loss_names): logger.logkv('loss/' + lossname, lossval) logger.dumpkvs() # save loss data for i in range(len(model.loss_names)): loss_name = model.loss_names[i] performance[loss_name].extend([lossvals[i]]) if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir() and is_mpi_root: model_dir = osp.join(base_path, "models") os.makedirs(model_dir, exist_ok=True) savepath = osp.join(model_dir, '%.5i' % update) print('Saving to', savepath) model.save(savepath) print("Saved model successfully.") performance_fname = os.path.join(base_path, "performance.p") with open(performance_fname, "wb") as f: pickle.dump(performance, f) model_dir = osp.join(base_path, "models") os.makedirs(model_dir, exist_ok=True) savepath = osp.join(model_dir, '%.5i' % update) print('Saving to', savepath) model.save(savepath) print("Saved model successfully.") performance_fname = os.path.join(base_path, "performance.p") with open(performance_fname, "wb") as f: pickle.dump(performance, f) return performance
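# Hypothetical usage sketch for learn() above, wrapped in a function so nothing runs on import. The env id,
# hyper-parameters and base_path are illustrative only; gym and baselines' DummyVecEnv are assumed installed.
def _example_ppo_usage():
    import gym
    from baselines.common.vec_env import DummyVecEnv

    # a single CartPole env wrapped into a VecEnv, as suggested by the docstring above
    venv = DummyVecEnv([lambda: gym.make('CartPole-v1')])
    performance = learn(network='mlp', env=venv, total_timesteps=10_000,
                        nsteps=128, nminibatches=4, log_interval=1,
                        save_interval=0, base_path='.')
    # performance holds per-update 'reward' data plus one entry per model loss name
    print(sorted(performance.keys()))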
def learn( network, env, seed=None, total_timesteps=None, nb_epochs=None, # with default settings, perform 1M steps total nb_epoch_cycles=20, nb_rollout_steps=100, reward_scale=1.0, render=False, render_eval=False, noise_type='adaptive-param_0.2', normalize_returns=False, normalize_observations=True, critic_l2_reg=1e-2, actor_lr=1e-4, critic_lr=1e-3, popart=False, gamma=0.99, clip_norm=None, nb_train_steps=50, # per epoch cycle and MPI worker, nb_eval_steps=100, batch_size=64, # per MPI worker tau=0.01, eval_env=None, param_noise_adaption_interval=50, **network_kwargs): set_global_seeds(seed) if total_timesteps is not None: assert nb_epochs is None nb_epochs = int(total_timesteps) // (nb_epoch_cycles * nb_rollout_steps) else: nb_epochs = 500 if MPI is not None: rank = MPI.COMM_WORLD.Get_rank() else: rank = 0 nb_actions = env.action_space.shape[-1] assert (np.abs(env.action_space.low) == env.action_space.high ).all() # we assume symmetric actions. memory = Memory(limit=int(1e6), action_shape=env.action_space.shape, observation_shape=env.observation_space.shape) critic = Critic(network=network, **network_kwargs) actor = Actor(nb_actions, network=network, **network_kwargs) action_noise = None param_noise = None if noise_type is not None: for current_noise_type in noise_type.split(','): current_noise_type = current_noise_type.strip() if current_noise_type == 'none': pass elif 'adaptive-param' in current_noise_type: _, stddev = current_noise_type.split('_') param_noise = AdaptiveParamNoiseSpec( initial_stddev=float(stddev), desired_action_stddev=float(stddev)) elif 'normal' in current_noise_type: _, stddev = current_noise_type.split('_') action_noise = NormalActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions)) elif 'ou' in current_noise_type: _, stddev = current_noise_type.split('_') action_noise = OrnsteinUhlenbeckActionNoise( mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions)) else: raise RuntimeError( 'unknown noise type "{}"'.format(current_noise_type)) max_action = env.action_space.high logger.info( 'scaling actions by {} before executing in env'.format(max_action)) agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape, gamma=gamma, tau=tau, normalize_returns=normalize_returns, normalize_observations=normalize_observations, batch_size=batch_size, action_noise=action_noise, param_noise=param_noise, critic_l2_reg=critic_l2_reg, actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart, clip_norm=clip_norm, reward_scale=reward_scale) logger.info('Using agent with the following configuration:') logger.info(str(agent.__dict__.items())) eval_episode_rewards_history = deque(maxlen=100) episode_rewards_history = deque(maxlen=100) sess = U.get_session() # Prepare everything. agent.initialize(sess) sess.graph.finalize() agent.reset() obs = env.reset() if eval_env is not None: eval_obs = eval_env.reset() nenvs = obs.shape[0] episode_reward = np.zeros(nenvs, dtype=np.float32) #vector episode_step = np.zeros(nenvs, dtype=int) # vector episodes = 0 #scalar t = 0 # scalar epoch = 0 start_time = time.time() epoch_episode_rewards = [] epoch_episode_steps = [] epoch_actions = [] epoch_qs = [] epoch_episodes = 0 for epoch in range(nb_epochs): for cycle in range(nb_epoch_cycles): # Perform rollouts. 
if nenvs > 1: # if simulating multiple envs in parallel, impossible to reset agent at the end of the episode in each # of the environments, so resetting here instead agent.reset() for t_rollout in range(nb_rollout_steps): # Predict next action. action, q, _, _ = agent.step(obs, apply_noise=True, compute_Q=True) # Execute next action. if rank == 0 and render: env.render() # max_action is of dimension A, whereas action is dimension (nenvs, A) - the multiplication gets broadcasted to the batch new_obs, r, done, info = env.step( max_action * action ) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) # note these outputs are batched from vecenv t += 1 if rank == 0 and render: env.render() episode_reward += r episode_step += 1 # Book-keeping. epoch_actions.append(action) epoch_qs.append(q) agent.store_transition( obs, action, r, new_obs, done ) #the batched data will be unrolled in memory.py's append. obs = new_obs for d in range(len(done)): if done[d]: # Episode done. epoch_episode_rewards.append(episode_reward[d]) episode_rewards_history.append(episode_reward[d]) epoch_episode_steps.append(episode_step[d]) episode_reward[d] = 0. episode_step[d] = 0 epoch_episodes += 1 episodes += 1 if nenvs == 1: agent.reset() # Train. epoch_actor_losses = [] epoch_critic_losses = [] epoch_adaptive_distances = [] for t_train in range(nb_train_steps): # Adapt param noise, if necessary. if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0: distance = agent.adapt_param_noise() epoch_adaptive_distances.append(distance) cl, al = agent.train() epoch_critic_losses.append(cl) epoch_actor_losses.append(al) agent.update_target_net() # Evaluate. eval_episode_rewards = [] eval_qs = [] if eval_env is not None: nenvs_eval = eval_obs.shape[0] eval_episode_reward = np.zeros(nenvs_eval, dtype=np.float32) for t_rollout in range(nb_eval_steps): eval_action, eval_q, _, _ = agent.step(eval_obs, apply_noise=False, compute_Q=True) eval_obs, eval_r, eval_done, eval_info = eval_env.step( max_action * eval_action ) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) if render_eval: eval_env.render() eval_episode_reward += eval_r eval_qs.append(eval_q) for d in range(len(eval_done)): if eval_done[d]: eval_episode_rewards.append(eval_episode_reward[d]) eval_episode_rewards_history.append( eval_episode_reward[d]) eval_episode_reward[d] = 0.0 if MPI is not None: mpi_size = MPI.COMM_WORLD.Get_size() else: mpi_size = 1 # Log stats. 
# XXX shouldn't call np.mean on variable length lists duration = time.time() - start_time stats = agent.get_stats() combined_stats = stats.copy() combined_stats['rollout/return'] = np.mean(epoch_episode_rewards) combined_stats['rollout/return_std'] = np.std(epoch_episode_rewards) combined_stats['rollout/return_history'] = np.mean( episode_rewards_history) combined_stats['rollout/return_history_std'] = np.std( episode_rewards_history) combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps) combined_stats['rollout/actions_mean'] = np.mean(epoch_actions) combined_stats['rollout/Q_mean'] = np.mean(epoch_qs) combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses) combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses) combined_stats['train/param_noise_distance'] = np.mean( epoch_adaptive_distances) combined_stats['total/duration'] = duration combined_stats['total/steps_per_second'] = float(t) / float(duration) combined_stats['total/episodes'] = episodes combined_stats['rollout/episodes'] = epoch_episodes combined_stats['rollout/actions_std'] = np.std(epoch_actions) # Evaluation statistics. if eval_env is not None: combined_stats['eval/return'] = eval_episode_rewards combined_stats['eval/return_history'] = np.mean( eval_episode_rewards_history) combined_stats['eval/Q'] = eval_qs combined_stats['eval/episodes'] = len(eval_episode_rewards) def as_scalar(x): if isinstance(x, np.ndarray): assert x.size == 1 return x[0] elif np.isscalar(x): return x else: raise ValueError('expected scalar, got %s' % x) combined_stats_sums = np.array( [np.array(x).flatten()[0] for x in combined_stats.values()]) if MPI is not None: combined_stats_sums = MPI.COMM_WORLD.allreduce(combined_stats_sums) combined_stats = { k: v / mpi_size for (k, v) in zip(combined_stats.keys(), combined_stats_sums) } # Total statistics. combined_stats['total/epochs'] = epoch + 1 combined_stats['total/steps'] = t for key in sorted(combined_stats.keys()): logger.record_tabular(key, combined_stats[key]) if rank == 0: logger.dump_tabular() logger.info('') logdir = logger.get_dir() if rank == 0 and logdir: if hasattr(env, 'get_state'): with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f: pickle.dump(env.get_state(), f) if eval_env and hasattr(eval_env, 'get_state'): with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f: pickle.dump(eval_env.get_state(), f) return agent
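# Quick sanity check (standalone arithmetic, not part of learn() above) of how nb_epochs is derived when
# total_timesteps is given: each epoch runs nb_epoch_cycles cycles of nb_rollout_steps vectorized env steps,
# so the defaults of 20 cycles x 100 steps and a 1M-step budget give 500 epochs.
example_total_timesteps = 1_000_000
example_nb_epoch_cycles, example_nb_rollout_steps = 20, 100
example_nb_epochs = example_total_timesteps // (example_nb_epoch_cycles * example_nb_rollout_steps)
assert example_nb_epochs == 500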
def view(env, seed=None, total_timesteps=None, reward_scale=1.0, render=False, render_eval=False, noise_type='adaptive-param_0.2', normalize_returns=False, normalize_observations=True, critic_l2_reg=1e-2, actor_lr=1e-4, critic_lr=1e-3, popart=False, gamma=0.99, clip_norm=None, nb_train_steps=50, # per epoch cycle and MPI worker, nb_eval_steps=100, nb_save_epochs=None, batch_size=64, # per MPI worker tau=0.01, action_range=(-250.0, 250.0), observation_range=(-5.0, 5.0), eval_env=None, load_path=None, save_dir=None, param_noise_adaption_interval=50, **network_kwargs): set_global_seeds(seed) if MPI is not None: rank = MPI.COMM_WORLD.Get_rank() else: rank = 0 memory = Memory(limit=int(1e6)) network_spec = [ { 'layer_type': 'dense', 'units': int (256), 'activation': 'relu', 'nodes_in': ['main'], 'nodes_out': ['main'] }, { 'layer_type': 'dense', 'units': int (128), 'activation': 'relu', 'nodes_in': ['main'], 'nodes_out': ['main'] }, { 'layer_type': 'dense', 'units': int (1), 'activation': 'tanh', 'nodes_in': ['main'], 'nodes_out': ['main'] } ] vnetwork_spec = [ { 'layer_type': 'concat', 'nodes_in': ['action_movement', 'observation_self'], 'nodes_out': ['main'] }, { 'layer_type': 'dense', 'units': int (256), 'activation': 'relu', 'nodes_in': ['main'], 'nodes_out': ['main'] }, { 'layer_type': 'dense', 'units': int (128), 'activation': 'relu', 'nodes_in': ['main'], 'nodes_out': ['main'] }, { 'layer_type': 'dense', 'units': int (1), 'activation': '', 'nodes_in': ['main'], 'nodes_out': ['main'] } ] network = DdpgPolicy(scope="ddpg", ob_space=env.observation_space, ac_space=env.action_space, network_spec=network_spec, v_network_spec=vnetwork_spec, stochastic=False, reuse=False, build_act=True, trainable_vars=None, not_trainable_vars=None, gaussian_fixed_var=False, weight_decay=0.0, ema_beta=0.99999, normalize_observations=normalize_observations, normalize_returns=normalize_returns, observation_range=observation_range) target_network = DdpgPolicy(scope="target", ob_space=env.observation_space, ac_space=env.action_space, network_spec=network_spec, v_network_spec=vnetwork_spec, stochastic=False, reuse=False, build_act=True, trainable_vars=None, not_trainable_vars=None, gaussian_fixed_var=False, weight_decay=0.0, ema_beta=0.99999, normalize_observations=normalize_observations, normalize_returns=normalize_returns, observation_range=observation_range) max_action = action_range[1] logger.info('scaling actions by {} before executing in env'.format(max_action)) agent = DDPG(network, target_network, memory, env.observation_space, env.action_space, gamma=gamma, tau=tau, normalize_returns=normalize_returns, normalize_observations=normalize_observations, batch_size=batch_size, action_noise=None, param_noise=None, critic_l2_reg=critic_l2_reg, actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart, clip_norm=clip_norm, reward_scale=reward_scale) logger.info('Using agent with the following configuration:') logger.info(str(agent.__dict__.items())) sess = U.get_session() loader = functools.partial(load_variables, sess=sess) if load_path != None: loader(load_path) # Prepare everything. 
agent.initialize(sess) sess.graph.finalize() nenvs = env.num_envs obs = env.reset() n_agents = obs['observation_self'].shape[0] for epoch in range(total_timesteps): agent.reset() obs = env.reset() while True: action, q, _, _ = agent.step(obs, apply_noise=False, compute_Q=False) # max_action is of dimension A, whereas action is dimension (nenvs, A) - the multiplication gets broadcasted to the batch for k, v in action.items(): action[k] *= max_action nenvs_actions = [] for i in range(nenvs): nenv_action = {'action_movement' : action['action_movement'][i*n_agents:(i + 1)*n_agents]} nenvs_actions.append(nenv_action) print(nenvs_actions) obs, r, done, info = env.step(nenvs_actions) env.render() if True in done: break
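# Toy illustration (hypothetical shapes) of the per-env regrouping done in view() above: the policy returns
# one 'action_movement' row per agent stacked across all envs, while env.step() expects a list containing
# one action dict per env.
import numpy as np

example_nenvs, example_n_agents, example_act_dim = 2, 3, 2
flat_actions = np.arange(example_nenvs * example_n_agents * example_act_dim).reshape(-1, example_act_dim)
per_env_actions = [{'action_movement': flat_actions[i * example_n_agents:(i + 1) * example_n_agents]}
                   for i in range(example_nenvs)]
assert len(per_env_actions) == example_nenvs
assert per_env_actions[0]['action_movement'].shape == (example_n_agents, example_act_dim)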
def run(env_id, seed, noise_type, layer_norm, evaluation, **kwargs): # Configure things. rank = MPI.COMM_WORLD.Get_rank() if rank != 0: logger.set_level(logger.DISABLED) # Create envs. env = gym.make(env_id) env = bench.Monitor( env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank))) if evaluation and rank == 0: eval_env = gym.make(env_id) eval_env = bench.Monitor(eval_env, os.path.join(logger.get_dir(), 'gym_eval')) env = bench.Monitor(env, None) else: eval_env = None # Parse noise_type action_noise = None param_noise = None nb_actions = env.action_space.shape[-1] for current_noise_type in noise_type.split(','): current_noise_type = current_noise_type.strip() if current_noise_type == 'none': pass elif 'adaptive-param' in current_noise_type: _, stddev = current_noise_type.split('_') param_noise = AdaptiveParamNoiseSpec( initial_stddev=float(stddev), desired_action_stddev=float(stddev)) elif 'normal' in current_noise_type: _, stddev = current_noise_type.split('_') action_noise = NormalActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions)) elif 'ou' in current_noise_type: _, stddev = current_noise_type.split('_') action_noise = OrnsteinUhlenbeckActionNoise( mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions)) else: raise RuntimeError( 'unknown noise type "{}"'.format(current_noise_type)) # Configure components. memory = Memory(limit=int(1e6), action_shape=env.action_space.shape, observation_shape=env.observation_space.shape) critic = Critic(layer_norm=layer_norm) actor = Actor(nb_actions, layer_norm=layer_norm) # Seed everything to make things reproducible. seed = seed + 1000000 * rank logger.info('rank {}: seed={}, logdir={}'.format(rank, seed, logger.get_dir())) tf.reset_default_graph() set_global_seeds(seed) env.seed(seed) if eval_env is not None: eval_env.seed(seed) # Disable logging for rank != 0 to avoid noise. if rank == 0: start_time = time.time() training.train(env=env, eval_env=eval_env, param_noise=param_noise, action_noise=action_noise, actor=actor, critic=critic, memory=memory, **kwargs) env.close() if eval_env is not None: eval_env.close() if rank == 0: logger.info('total runtime: {}s'.format(time.time() - start_time))
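# Standalone sketch of what the noise_type strings accepted by run() parse into; the example string and
# stddev values below are illustrative only.
example_noise_type = 'ou_0.2,adaptive-param_0.2'
parsed_noise_specs = []
for spec in example_noise_type.split(','):
    kind, stddev = spec.strip().split('_')   # e.g. 'adaptive-param_0.2' -> ('adaptive-param', 0.2)
    parsed_noise_specs.append((kind, float(stddev)))
assert parsed_noise_specs == [('ou', 0.2), ('adaptive-param', 0.2)]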
def main(): num_envs = 64 learning_rate = 5e-4 ent_coef = .01 gamma = .999 lam = .95 nsteps = 256 nminibatches = 8 ppo_epochs = 3 clip_range = .2 total_timesteps = 1_000_000 ## now this counts steps in testing runs use_vf_clipping = True ## From random_ppo.py max_grad_norm = 0.5 vf_coef = 0.5 L2_WEIGHT = 10e-4 FM_COEFF = 0.002 REAL_THRES = 0.1 parser = argparse.ArgumentParser(description='Process procgen testing arguments.') parser.add_argument('--env_name', type=str, default='fruitbot') parser.add_argument('--distribution_mode', type=str, default='easy', choices=["easy", "hard", "exploration", "memory", "extreme"]) parser.add_argument('--num_levels', type=int, default=1000) ## default start_level is set past the training levels so testing runs on unseen levels parser.add_argument('--start_level', type=int, default=1000) parser.add_argument('--run_id', '-id', type=int, default=0) parser.add_argument('--load_id', type=int, default=0) parser.add_argument('--nrollouts', '-nroll', type=int, default=0) args = parser.parse_args() args.total_timesteps = total_timesteps if args.nrollouts: total_timesteps = int(args.nrollouts * num_envs * nsteps) run_ID = 'run_' + str(args.run_id).zfill(2) run_ID += '_load{}'.format(args.load_id) comm = MPI.COMM_WORLD rank = comm.Get_rank() mpi_rank_weight = 0 num_levels = args.num_levels log_comm = comm.Split(0, 0) format_strs = ['csv', 'stdout', 'log'] if log_comm.Get_rank() == 0 else [] logpath = join(LOG_DIR, run_ID) if not os.path.exists(logpath): os.system("mkdir -p %s" % logpath) fpath = join(logpath, 'args_{}.json'.format(run_ID)) with open(fpath, 'w') as fh: json.dump(vars(args), fh, indent=4, sort_keys=True) print("\nSaved args at:\n\t{}\n".format(fpath)) logger.configure(dir=logpath, format_strs=format_strs) logger.info("creating environment") venv = ProcgenEnv(num_envs=num_envs, env_name=args.env_name, num_levels=num_levels, start_level=args.start_level, distribution_mode=args.distribution_mode) venv = VecExtractDictObs(venv, "rgb") venv = VecMonitor(venv=venv, filename=None, keep_buf=100) venv = VecNormalize(venv=venv, ob=False) logger.info("creating tf session") setup_mpi_gpus() config = tf.compat.v1.ConfigProto() config.gpu_options.allow_growth = True #pylint: disable=E1101 sess = tf.compat.v1.Session(config=config) sess.__enter__() logger.info("Testing") ## Modified based on random_ppo.learn env = venv nenvs = env.num_envs ob_space = env.observation_space ac_space = env.action_space nbatch = nenvs * nsteps nbatch_train = nbatch // nminibatches nrollouts = total_timesteps // nbatch network = lambda x: build_impala_cnn(x, depths=[16, 32, 32], emb_size=256) policy = build_policy(env, network) model = Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=nenvs, nbatch_train=nbatch_train, nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef, max_grad_norm=max_grad_norm) LOAD_PATH = "log/vanilla/saved_vanilla_v{}.tar".format(args.load_id) model.load(LOAD_PATH) logger.info("Model params loaded from save") runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam) epinfobuf10 = deque(maxlen=10) epinfobuf100 = deque(maxlen=100) # tfirststart = time.time() ## Not doing timing yet # active_ep_buf = epinfobuf100 mean_rewards = [] datapoints = [] for rollout in range(1, nrollouts + 1): logger.info('collecting rollouts {}...'.format(rollout)) obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run() ## different from random_ppo! 
epinfobuf10.extend(epinfos) epinfobuf100.extend(epinfos) rew_mean_10 = safemean([epinfo['r'] for epinfo in epinfobuf10]) rew_mean_100 = safemean([epinfo['r'] for epinfo in epinfobuf100]) ep_len_mean_10 = np.nanmean([epinfo['l'] for epinfo in epinfobuf10]) ep_len_mean_100 = np.nanmean([epinfo['l'] for epinfo in epinfobuf100]) logger.info('\n----', rollout) mean_rewards.append(rew_mean_10) logger.logkv('eprew10', rew_mean_10) logger.logkv('eprew100', rew_mean_100) logger.logkv('eplenmean10', ep_len_mean_10) logger.logkv('eplenmean100', ep_len_mean_100) logger.logkv("misc/total_timesteps", rollout * nbatch) logger.info('----\n') logger.dumpkvs() env.close() print("Rewards history: ", mean_rewards) return mean_rewards
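# Rollout accounting used by main() above (illustrative arithmetic only): with 64 envs and 256 steps per
# rollout, one rollout covers 16384 timesteps, so a 1_000_000-step test budget rounds down to 61 rollouts.
test_num_envs, test_nsteps, test_total_timesteps = 64, 256, 1_000_000
test_nbatch = test_num_envs * test_nsteps
test_nrollouts = test_total_timesteps // test_nbatch
assert (test_nbatch, test_nrollouts) == (16384, 61)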
def train(env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, render, param_noise, actor, critics, normalize_returns, normalize_observations, critic_l2_reg, actor_lr, critic_lr, action_noise, popart, gamma, clip_norm, nb_train_steps, nb_rollout_steps, nb_eval_steps, batch_size, memory, tau=0.005, eval_env=None, param_noise_adaption_interval=50, initial_random_steps=1e4, policy_and_target_update_period=1, use_mpi_adam=False, stop_actor_steps=None, stop_critic_steps=None, **kwargs): if kwargs: logger.info("Warning: redundant hyper-parameters: " + str(kwargs)) rank = MPI.COMM_WORLD.Get_rank() assert (np.abs(env.action_space.low) == env.action_space.high).all() # we assume symmetric actions. max_action = env.action_space.high logger.info('scaling actions by {} before executing in env'.format(max_action)) agent = EnsembleDDPG(actor, critics, memory, env.observation_space.shape, env.action_space.shape, gamma=gamma, tau=tau, normalize_returns=normalize_returns, normalize_observations=normalize_observations, batch_size=batch_size, action_noise=action_noise, param_noise=param_noise, critic_l2_reg=critic_l2_reg, actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart, clip_norm=clip_norm, reward_scale=reward_scale, use_mpi_adam=use_mpi_adam) logger.info('Using agent with the following configuration:') logger.info(str(agent.__dict__.items())) # Set up logging stuff only for a single worker. if rank == 0: saver = tf.train.Saver() else: saver = None step = 0 episode = 0 eval_episode_rewards_history = deque(maxlen=100) episode_rewards_history = deque(maxlen=100) with U.single_threaded_session() as sess: # Prepare everything. agent.initialize(sess) sess.graph.finalize() agent.reset() obs = env.reset() if eval_env is not None: eval_obs = eval_env.reset() done = False episode_reward = 0. episode_step = 0 episodes = 0 t = 0 epoch = 0 start_time = time.time() epoch_episode_rewards = [] epoch_episode_steps = [] epoch_episode_eval_rewards = [] epoch_episode_eval_steps = [] epoch_start_time = time.time() epoch_actions = [] epoch_qs = [] epoch_episodes = 0 for epoch in range(nb_epochs): for cycle in range(nb_epoch_cycles): # Perform rollouts. for t_rollout in range(nb_rollout_steps): # Predict next action. if t < initial_random_steps: action = np.random.uniform(low=-1, high=1, size=env.action_space.shape) else: action, _ = agent.pi(obs, apply_noise=True, compute_Q=False) assert action.shape == env.action_space.shape # Execute next action. if rank == 0 and render: env.render() assert max_action.shape == action.shape new_obs, r, done, info = env.step(max_action * action) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) t += 1 if rank == 0 and render: env.render() episode_reward += r episode_step += 1 # Book-keeping. epoch_actions.append(action) agent.store_transition(obs, action, r, new_obs, done) obs = new_obs if done: # Episode done. epoch_episode_rewards.append(episode_reward) episode_rewards_history.append(episode_reward) epoch_episode_steps.append(episode_step) episode_reward = 0. episode_step = 0 epoch_episodes += 1 episodes += 1 agent.reset() obs = env.reset() # Train. epoch_actor_losses = [] epoch_critic_losses = [] epoch_adaptive_distances = [] for t_train in range(nb_train_steps): # Adapt param noise, if necessary. 
if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0: distance = agent.adapt_param_noise() epoch_adaptive_distances.append(distance) # use take_update flag to control actor training take_update = (t_train % policy_and_target_update_period == 0) stop_actor_training = False stop_critic_training = False if stop_actor_steps is not None and t > stop_actor_steps: stop_actor_training = True if stop_critic_steps is not None and t > stop_critic_steps: stop_critic_training = True cl, al = agent.train(take_update, stop_critic_training, stop_actor_training) epoch_critic_losses.append(cl) # use take_update flag to control target updating if take_update: epoch_actor_losses.append(al) agent.update_target_net() # Evaluate. eval_episode_rewards = [] eval_qs = [] if eval_env is not None: eval_episode_reward = 0. for t_rollout in range(nb_eval_steps): eval_action, eval_q = agent.pi(eval_obs, apply_noise=False, compute_Q=True) eval_obs, eval_r, eval_done, eval_info = eval_env.step( max_action * eval_action ) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) if render_eval: eval_env.render() eval_episode_reward += eval_r eval_qs.append(eval_q) if eval_done: eval_obs = eval_env.reset() eval_episode_rewards.append(eval_episode_reward) eval_episode_rewards_history.append( eval_episode_reward) eval_episode_reward = 0. mpi_size = MPI.COMM_WORLD.Get_size() # Log stats. # XXX shouldn't call np.mean on variable length lists duration = time.time() - start_time stats = agent.get_stats() combined_stats = stats.copy() combined_stats['rollout/return'] = np.mean(epoch_episode_rewards) combined_stats['rollout/return_history'] = np.mean( episode_rewards_history) combined_stats['rollout/return_latest'] = np.mean( epoch_episode_rewards[-1]) combined_stats['rollout/episode_steps'] = np.mean( epoch_episode_steps) combined_stats['rollout/actions_mean'] = np.mean(epoch_actions) combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses) combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses) combined_stats['train/param_noise_distance'] = np.mean( epoch_adaptive_distances) combined_stats['total/duration'] = duration combined_stats['total/steps_per_second'] = float(t) / float( duration) combined_stats['total/episodes'] = episodes combined_stats['rollout/episodes'] = epoch_episodes combined_stats['rollout/actions_std'] = np.std(epoch_actions) # Evaluation statistics. if eval_env is not None: combined_stats['eval/return'] = eval_episode_rewards combined_stats['eval/return_history'] = np.mean( eval_episode_rewards_history) combined_stats['eval/episodes'] = len(eval_episode_rewards) def as_scalar(x): if isinstance(x, np.ndarray): assert x.size == 1 return x[0] elif np.isscalar(x): return x else: raise ValueError('expected scalar, got %s' % x) combined_stats_sums = MPI.COMM_WORLD.allreduce( np.array([as_scalar(x) for x in combined_stats.values()])) combined_stats = { k: v / mpi_size for (k, v) in zip(combined_stats.keys(), combined_stats_sums) } # Total statistics. 
combined_stats['total/epochs'] = epoch + 1 combined_stats['total/steps'] = t for key in sorted(combined_stats.keys()): logger.record_tabular(key, combined_stats[key]) logger.dump_tabular() logger.info('') logdir = logger.get_dir() if rank == 0 and logdir: if hasattr(env, 'get_state'): with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f: pickle.dump(env.get_state(), f) if eval_env and hasattr(eval_env, 'get_state'): with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f: pickle.dump(eval_env.get_state(), f)
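# Standalone sketch (hypothetical helper, not part of train() above) of the update gating used in the
# training loop: the actor and target networks are updated every policy_and_target_update_period critic
# updates, and actor/critic training can be frozen once t passes an optional step budget.
def _update_gates(t_train, t, policy_and_target_update_period=2,
                  stop_actor_steps=None, stop_critic_steps=None):
    take_update = (t_train % policy_and_target_update_period == 0)
    stop_actor_training = stop_actor_steps is not None and t > stop_actor_steps
    stop_critic_training = stop_critic_steps is not None and t > stop_critic_steps
    return take_update, stop_actor_training, stop_critic_training

assert _update_gates(0, 0) == (True, False, False)
assert _update_gates(1, 10, stop_actor_steps=5) == (False, True, False)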
def run(env_id, seed, noise_type, layer_norm, evaluation, **kwargs): # Configure things. rank = MPI.COMM_WORLD.Get_rank() if rank != 0: logger.set_level(logger.DISABLED) # Create envs. env = gym.make(env_id) env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank))) if evaluation and rank==0: eval_env = gym.make(env_id) eval_env = bench.Monitor(eval_env, os.path.join(logger.get_dir(), 'gym_eval')) env = bench.Monitor(env, None) else: eval_env = None # Parse noise_type action_noise = None param_noise = None nb_actions = env.action_space.shape[-1] for current_noise_type in noise_type.split(','): current_noise_type = current_noise_type.strip() if current_noise_type == 'none': pass elif 'adaptive-param' in current_noise_type: _, stddev = current_noise_type.split('_') param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev), desired_action_stddev=float(stddev)) elif 'normal' in current_noise_type: _, stddev = current_noise_type.split('_') action_noise = NormalActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions)) elif 'ou' in current_noise_type: _, stddev = current_noise_type.split('_') action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions)) else: raise RuntimeError('unknown noise type "{}"'.format(current_noise_type)) # Configure components. memory = Memory(limit=int(1e6), action_shape=env.action_space.shape, observation_shape=env.observation_space.shape) critic = Critic(layer_norm=layer_norm) actor = Actor(nb_actions, layer_norm=layer_norm) # Seed everything to make things reproducible. seed = seed + 1000000 * rank logger.info('rank {}: seed={}, logdir={}'.format(rank, seed, logger.get_dir())) tf.reset_default_graph() set_global_seeds(seed) env.seed(seed) if eval_env is not None: eval_env.seed(seed) # Disable logging for rank != 0 to avoid noise. if rank == 0: start_time = time.time() training.train(env=env, eval_env=eval_env, param_noise=param_noise, action_noise=action_noise, actor=actor, critic=critic, memory=memory, **kwargs) env.close() if eval_env is not None: eval_env.close() if rank == 0: logger.info('total runtime: {}s'.format(time.time() - start_time))
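# Per-rank seeding used in run() above: each MPI worker offsets the base seed so parallel workers do not
# generate identical rollouts. The base seed and world size below are illustrative only.
example_base_seed = 42
example_seeds = [example_base_seed + 1000000 * rank for rank in range(4)]
assert example_seeds == [42, 1000042, 2000042, 3000042]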