def test_microbatches():
    def env_fn():
        env = gym.make('CartPole-v0')
        env.seed(0)
        return env

    learn_fn = partial(learn, network='mlp', nsteps=32, total_timesteps=32, seed=0)

    env_ref = DummyVecEnv([env_fn])
    sess_ref = make_session(make_default=True, graph=tf.Graph())
    learn_fn(env=env_ref)
    vars_ref = {v.name: sess_ref.run(v) for v in tf.trainable_variables()}

    env_test = DummyVecEnv([env_fn])
    sess_test = make_session(make_default=True, graph=tf.Graph())
    learn_fn(env=env_test, model_fn=partial(MicrobatchedModel, microbatch_size=2))
    # learn_fn(env=env_test)
    vars_test = {v.name: sess_test.run(v) for v in tf.trainable_variables()}

    for v in vars_ref:
        np.testing.assert_allclose(vars_ref[v], vars_test[v], atol=3e-3)
def test_serialization(learn_fn, network_fn):
    '''
    Test if the trained model can be serialized
    '''

    if network_fn.endswith('lstm') and learn_fn in ['acer', 'acktr', 'trpo_mpi', 'deepq']:
        # TODO make acktr work with recurrent policies
        # and test
        # github issue: https://github.com/openai/baselines/issues/660
        return

    def make_env():
        env = MnistEnv(episode_len=100)
        env.seed(10)
        return env

    env = DummyVecEnv([make_env])
    ob = env.reset().copy()
    learn = get_learn_function(learn_fn)

    kwargs = {}
    kwargs.update(network_kwargs[network_fn])
    kwargs.update(learn_kwargs[learn_fn])

    learn = partial(learn, env=env, network=network_fn, seed=0, **kwargs)

    with tempfile.TemporaryDirectory() as td:
        model_path = os.path.join(td, 'serialization_test_model')

        with tf.Graph().as_default(), make_session().as_default():
            model = learn(total_timesteps=100)
            model.save(model_path)
            mean1, std1 = _get_action_stats(model, ob)
            variables_dict1 = _serialize_variables()

        with tf.Graph().as_default(), make_session().as_default():
            model = learn(total_timesteps=0, load_path=model_path)
            mean2, std2 = _get_action_stats(model, ob)
            variables_dict2 = _serialize_variables()

        for k, v in variables_dict1.items():
            np.testing.assert_allclose(v, variables_dict2[k], atol=0.01,
                                       err_msg='saved and loaded variable {} value mismatch'.format(k))

        np.testing.assert_allclose(mean1, mean2, atol=0.5)
        np.testing.assert_allclose(std1, std2, atol=0.5)
def test_env_after_learn(algo):
    def make_env():
        # acktr requires too much RAM, fails on travis
        env = gym.make('CartPole-v1' if algo == 'acktr' else 'PongNoFrameskip-v4')
        return env

    make_session(make_default=True, graph=tf.Graph())
    env = SubprocVecEnv([make_env])

    learn = get_learn_function(algo)

    # Commenting out the following line resolves the issue, though crash happens at env.reset().
    learn(network='mlp', env=env, total_timesteps=0, load_path=None, seed=None)

    env.reset()
    env.close()
def train(args):
    from model.encoder import bi_direction_lstm
    from model.action_decoder import MlpPolicy
    from model.mlp_state_decoder import MlpPolicy_state

    U.make_session(num_cpu=1).__enter__()
    env = humanoid_CMU.stand()
    obs_space = env.physics.data.qpos
    ac_space = env.action_spec()

    def encoder(name):
        return bi_direction_lstm(name=name, obs_space=obs_space, batch_size=args.lstm_batch,
                                 time_steps=args.time_steps, LSTM_size=args.LSTM_size,
                                 laten_size=args.laten_size)

    def action_decorder(name):
        return MlpPolicy(name=name, obs_space=obs_space, ac_space=ac_space,
                         embedding_shape=args.laten_size, hid_size=args.pol_hid_size,
                         num_hid_layers=args.pol_layers)

    def state_decorder(name):
        return MlpPolicy_state(name=name, obs_space=obs_space, embedding_shape=args.laten_size,
                               hid_size=args.state_de_hid_size, num_hid_layers=args.state_de_hid_num)

    state_dataset = load_state_dataset(args.state_dir_path, env, args.control_timestep)

    learn(encoder=encoder, action_decorder=action_decorder, state_decorder=state_decorder,
          embedding_shape=args.laten_size, dataset=state_dataset, logdir=args.logdir,
          batch_size=args.lstm_batch, time_steps=args.time_steps, epsilon=args.epsilon,
          lr_rate=args.lr_rate)
def train(args):
    from model.encoder import bi_direction_lstm
    from model.action_decoder import MlpPolicy
    from model.WaveNet import WaveNetModel

    U.make_session(num_cpu=1).__enter__()
    env = humanoid_CMU.stand()
    obs_space = env.physics.data.qpos
    ac_space = env.action_spec()

    def encoder(name):
        return bi_direction_lstm(name=name, obs_space=obs_space, batch_size=args.lstm_batch,
                                 time_steps=args.time_steps, LSTM_size=args.LSTM_size,
                                 laten_size=args.laten_size)

    def action_decorder(name):
        return MlpPolicy(name=name, obs_space=obs_space, ac_space=ac_space,
                         embedding_shape=args.laten_size, hid_size=args.pol_hid_size,
                         num_hid_layers=args.pol_layers)

    with open(args.wavenet_params, 'r') as f:
        wavenet_params = json.load(f)

    def state_decorder(name):  # the state decoder also needs a name
        return WaveNetModel(
            name=name,
            obs_shape=obs_space,
            embedding_shape=args.laten_size,
            batch_size=args.time_steps,
            dilations=wavenet_params["dilations"],
            filter_width=wavenet_params["filter_width"],
            residual_channels=wavenet_params["residual_channels"],
            dilation_channels=wavenet_params["dilation_channels"],
            skip_channels=wavenet_params["skip_channels"],
            quantization_channels=wavenet_params["quantization_channels"],
            use_biases=wavenet_params["use_biases"],
            scalar_input=wavenet_params["scalar_input"],
            initial_filter_width=wavenet_params["initial_filter_width"],
            histograms=args.histograms,
            global_condition_channels=args.gc_channels)

    # The dataset may be a bit small; consider adding more walking motion data.
    state_dataset = load_state_dataset(args.state_dir_path, env, args.control_timestep)

    optimizer = optimizer_factory[args.optimizer](learning_rate=args.learning_rate,
                                                  momentum=args.momentum)

    learn(env=env, encoder=encoder, action_decorder=action_decorder, state_decorder=state_decorder,
          embedding_shape=args.laten_size, dataset=state_dataset, optimizer=optimizer,
          logdir=args.logdir, batch_size=args.lstm_batch, time_steps=args.time_steps)
def test_coexistence(learn_fn, network_fn):
    '''
    Test if more than one model can exist at a time
    '''

    if learn_fn == 'deepq':
        # TODO enable multiple DQN models to be useable at the same time
        # github issue https://github.com/openai/baselines/issues/656
        return
    if network_fn.endswith('lstm') and learn_fn in ['acktr', 'trpo_mpi', 'deepq']:
        # TODO make acktr work with recurrent policies
        # and test
        # github issue: https://github.com/openai/baselines/issues/660
        return

    env = DummyVecEnv([lambda: gym.make('CartPole-v0')])
    learn = get_learn_function(learn_fn)

    kwargs = {}
    kwargs.update(network_kwargs[network_fn])
    kwargs.update(learn_kwargs[learn_fn])

    learn = partial(learn, env=env, network=network_fn, total_timesteps=0, **kwargs)
    make_session(make_default=True, graph=tf.Graph())
    model1 = learn(seed=1)
    make_session(make_default=True, graph=tf.Graph())
    model2 = learn(seed=2)

    model1.step(env.observation_space.sample())
    model2.step(env.observation_space.sample())
def load(path, num_cpu=16):
    with open(path, "rb") as f:
        model_data, act_params = dill.load(f)
    act = deepq.build_act(**act_params)
    sess = U.make_session(num_cpu=num_cpu)
    sess.__enter__()
    with tempfile.TemporaryDirectory() as td:
        arc_path = os.path.join(td, "packed.zip")
        with open(arc_path, "wb") as f:
            f.write(model_data)
        zipfile.ZipFile(arc_path, 'r', zipfile.ZIP_DEFLATED).extractall(td)
        U.load_state(os.path.join(td, "model"))
    return ActWrapper(act, act_params)
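# A minimal usage sketch of load() above (an assumption, not part of the original module):
# it presumes the pickle at the given path was produced by the matching ActWrapper.save()
# helper, and that "cartpole_model.pkl" / CartPole-v0 are only illustrative choices.
if __name__ == '__main__':
    env = gym.make("CartPole-v0")
    act = load("cartpole_model.pkl")
    obs, done = env.reset(), False
    while not done:
        # act expects a batch of observations and returns a batch of actions
        obs, rew, done, _ = env.step(act(obs[None])[0])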
def setup_model(self):
    with SetVerbosity(self.verbose):
        self.graph = tf.Graph()
        with self.graph.as_default():
            self.set_random_seed(self.seed)
            self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess, graph=self.graph)

            self.replay_buffer = ReplayBuffer(self.buffer_size)

            with tf.compat.v1.variable_scope("input", reuse=False):
                # Create policy and target TF objects
                self.policy_tf = self.policy(self.sess, self.observation_space, self.action_space,
                                             **self.policy_kwargs)
                self.target_policy = self.policy(self.sess, self.observation_space, self.action_space,
                                                 **self.policy_kwargs)

                # Initialize Placeholders
                self.observations_ph = self.policy_tf.obs_ph
                # Normalized observation for pixels
                self.processed_obs_ph = self.policy_tf.processed_obs
                self.next_observations_ph = self.target_policy.obs_ph
                self.processed_next_obs_ph = self.target_policy.processed_obs
                self.action_target = self.target_policy.action_ph
                self.terminals_ph = tf.compat.v1.placeholder(tf.float32, shape=(None, 1), name='terminals')
                self.rewards_ph = tf.compat.v1.placeholder(tf.float32, shape=(None, 1), name='rewards')
                self.actions_ph = tf.compat.v1.placeholder(tf.float32,
                                                           shape=(None,) + self.action_space.shape,
                                                           name='actions')
                self.learning_rate_ph = tf.compat.v1.placeholder(tf.float32, [], name="learning_rate_ph")

            with tf.compat.v1.variable_scope("model", reuse=False):
                # Create the policy
                # first return value corresponds to deterministic actions
                # policy_out corresponds to stochastic actions, used for training
                # logp_pi is the log probability of actions taken by the policy
                self.deterministic_action, policy_out, logp_pi = self.policy_tf.make_actor(self.processed_obs_ph)
                # Monitor the entropy of the policy,
                # this is not used for training
                self.entropy = tf.reduce_mean(input_tensor=self.policy_tf.entropy)
                # Use two Q-functions to improve performance by reducing overestimation bias.
                qf1, qf2, value_fn = self.policy_tf.make_critics(self.processed_obs_ph, self.actions_ph,
                                                                 create_qf=True, create_vf=True)
                qf1_pi, qf2_pi, _ = self.policy_tf.make_critics(self.processed_obs_ph, policy_out,
                                                                create_qf=True, create_vf=False,
                                                                reuse=True)

                # Target entropy is used when learning the entropy coefficient
                if self.target_entropy == 'auto':
                    # automatically set target entropy if needed
                    self.target_entropy = -np.prod(self.action_space.shape).astype(np.float32)
                else:
                    # Force conversion
                    # this will also throw an error for unexpected string
                    self.target_entropy = float(self.target_entropy)

                # The entropy coefficient or entropy can be learned automatically
                # see Automating Entropy Adjustment for Maximum Entropy RL section
                # of https://arxiv.org/abs/1812.05905
                if isinstance(self.ent_coef, str) and self.ent_coef.startswith('auto'):
                    # Default initial value of ent_coef when learned
                    init_value = 1.0
                    if '_' in self.ent_coef:
                        init_value = float(self.ent_coef.split('_')[1])
                        assert init_value > 0., "The initial value of ent_coef must be greater than 0"

                    self.log_ent_coef = tf.compat.v1.get_variable('log_ent_coef', dtype=tf.float32,
                                                                  initializer=np.log(init_value).astype(np.float32))
                    self.ent_coef = tf.exp(self.log_ent_coef)
                else:
                    # Force conversion to float
                    # this will throw an error if a malformed string (different from 'auto')
                    # is passed
                    self.ent_coef = float(self.ent_coef)

            with tf.compat.v1.variable_scope("target", reuse=False):
                # Create the value network
                _, _, value_target = self.target_policy.make_critics(self.processed_next_obs_ph,
                                                                     create_qf=False, create_vf=True)
                self.value_target = value_target

            with tf.compat.v1.variable_scope("loss", reuse=False):
                # Take the min of the two Q-Values (Double-Q Learning)
                min_qf_pi = tf.minimum(qf1_pi, qf2_pi)

                # Target for Q value regression
                q_backup = tf.stop_gradient(self.rewards_ph +
                                            (1 - self.terminals_ph) * self.gamma * self.value_target)

                # Compute Q-Function loss
                # TODO: test with huber loss (it would avoid too high values)
                qf1_loss = 0.5 * tf.reduce_mean(input_tensor=(q_backup - qf1) ** 2)
                qf2_loss = 0.5 * tf.reduce_mean(input_tensor=(q_backup - qf2) ** 2)

                # Compute the entropy temperature loss
                # it is used when the entropy coefficient is learned
                ent_coef_loss, entropy_optimizer = None, None
                if not isinstance(self.ent_coef, float):
                    ent_coef_loss = -tf.reduce_mean(
                        input_tensor=self.log_ent_coef * tf.stop_gradient(logp_pi + self.target_entropy))
                    entropy_optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=self.learning_rate_ph)

                # Compute the policy loss
                # Alternative: policy_kl_loss = tf.reduce_mean(logp_pi - min_qf_pi)
                policy_kl_loss = tf.reduce_mean(input_tensor=self.ent_coef * logp_pi - qf1_pi)

                # NOTE: in the original implementation, they have an additional
                # regularization loss for the Gaussian parameters
                # this is not used for now
                # policy_loss = (policy_kl_loss + policy_regularization_loss)
                policy_loss = policy_kl_loss

                # Target for value fn regression
                # We update the vf towards the min of two Q-functions in order to
                # reduce overestimation bias from function approximation error.
                v_backup = tf.stop_gradient(min_qf_pi - self.ent_coef * logp_pi)
                value_loss = 0.5 * tf.reduce_mean(input_tensor=(value_fn - v_backup) ** 2)

                values_losses = qf1_loss + qf2_loss + value_loss

                # Policy train op
                # (has to be separate from value train op, because min_qf_pi appears in policy_loss)
                policy_optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=self.learning_rate_ph)
                policy_train_op = policy_optimizer.minimize(policy_loss,
                                                            var_list=tf_util.get_trainable_vars('model/pi'))

                # Value train op
                value_optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=self.learning_rate_ph)
                values_params = tf_util.get_trainable_vars('model/values_fn')

                source_params = tf_util.get_trainable_vars("model/values_fn")
                target_params = tf_util.get_trainable_vars("target/values_fn")

                # Polyak averaging for target variables
                self.target_update_op = [
                    tf.compat.v1.assign(target, (1 - self.tau) * target + self.tau * source)
                    for target, source in zip(target_params, source_params)
                ]
                # Initializing target to match source variables
                target_init_op = [
                    tf.compat.v1.assign(target, source)
                    for target, source in zip(target_params, source_params)
                ]

                # Control flow is used because sess.run otherwise evaluates in nondeterministic order
                # and we first need to compute the policy action before computing q values losses
                with tf.control_dependencies([policy_train_op]):
                    train_values_op = value_optimizer.minimize(values_losses, var_list=values_params)

                    self.infos_names = ['policy_loss', 'qf1_loss', 'qf2_loss', 'value_loss', 'entropy']
                    # All ops to call during one training step
                    self.step_ops = [policy_loss, qf1_loss, qf2_loss, value_loss,
                                     qf1, qf2, value_fn, logp_pi, self.entropy,
                                     policy_train_op, train_values_op]

                    # Add entropy coefficient optimization operation if needed
                    if ent_coef_loss is not None:
                        with tf.control_dependencies([train_values_op]):
                            ent_coef_op = entropy_optimizer.minimize(ent_coef_loss, var_list=self.log_ent_coef)
                            self.infos_names += ['ent_coef_loss', 'ent_coef']
                            self.step_ops += [ent_coef_op, ent_coef_loss, self.ent_coef]

                # Monitor losses and entropy in tensorboard
                tf.compat.v1.summary.scalar('policy_loss', policy_loss)
                tf.compat.v1.summary.scalar('qf1_loss', qf1_loss)
                tf.compat.v1.summary.scalar('qf2_loss', qf2_loss)
                tf.compat.v1.summary.scalar('value_loss', value_loss)
                tf.compat.v1.summary.scalar('entropy', self.entropy)
                if ent_coef_loss is not None:
                    tf.compat.v1.summary.scalar('ent_coef_loss', ent_coef_loss)
                    tf.compat.v1.summary.scalar('ent_coef', self.ent_coef)
                tf.compat.v1.summary.scalar('learning_rate',
                                            tf.reduce_mean(input_tensor=self.learning_rate_ph))

            # Retrieve parameters that must be saved
            self.params = tf_util.get_trainable_vars("model")
            self.target_params = tf_util.get_trainable_vars("target/values_fn")

            # Initialize Variables and target network
            with self.sess.as_default():
                self.sess.run(tf.compat.v1.global_variables_initializer())
                self.sess.run(target_init_op)

            self.summary = tf.compat.v1.summary.merge_all()
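# A hedged sketch of how the ops built above are typically consumed during one gradient step
# (it mirrors the _train_step pattern of stable-baselines' SAC; the batch_* names and
# current_lr are illustrative and would come from self.replay_buffer in a full implementation):
def _train_step_sketch(self, batch, current_lr):
    batch_obs, batch_actions, batch_rewards, batch_next_obs, batch_dones = batch
    feed_dict = {
        self.observations_ph: batch_obs,
        self.actions_ph: batch_actions,
        self.next_observations_ph: batch_next_obs,
        self.rewards_ph: batch_rewards.reshape(-1, 1),
        self.terminals_ph: batch_dones.reshape(-1, 1),
        self.learning_rate_ph: current_lr,
    }
    # One call runs the policy, value and (if enabled) entropy-coefficient updates;
    # the first four outputs correspond to the losses listed in self.infos_names.
    out = self.sess.run(self.step_ops, feed_dict)
    policy_loss, qf1_loss, qf2_loss, value_loss = out[:4]
    # Polyak-average the target value network (done every few steps in the full code)
    self.sess.run(self.target_update_op)
    return policy_loss, qf1_loss, qf2_loss, value_loss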
def main(args):
    from ppo1 import mlp_policy
    U.make_session(num_cpu=args.num_cpu).__enter__()
    set_global_seeds(args.seed)
    env = gym.make(args.env_id)

    def policy_fn(name, ob_space, ac_space, reuse=False):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    reuse=reuse, hid_size=64, num_hid_layers=2)

    env = bench.Monitor(env, logger.get_dir() and osp.join(logger.get_dir(), "monitor.json"))
    env.seed(args.seed)
    gym.logger.setLevel(logging.WARN)
    task_name = get_task_name(args)
    args.checkpoint_dir = osp.join(args.checkpoint_dir, task_name)
    args.log_dir = osp.join(args.log_dir, task_name)
    dataset = Mujoco_Dset(expert_path=args.expert_path, ret_threshold=args.ret_threshold,
                          traj_limitation=args.traj_limitation)
    pretrained_weight = None
    if (args.pretrained and args.task == 'train') or args.algo == 'bc':
        # Pretrain with behavior cloning
        from gailtf.algo import behavior_clone
        if args.algo == 'bc' and args.task == 'evaluate':
            behavior_clone.evaluate(env, policy_fn, args.load_model_path,
                                    stochastic_policy=args.stochastic_policy)
            sys.exit()
        pretrained_weight = behavior_clone.learn(env, policy_fn, dataset,
                                                 max_iters=args.BC_max_iter,
                                                 pretrained=args.pretrained,
                                                 ckpt_dir=args.checkpoint_dir,
                                                 log_dir=args.log_dir, task_name=task_name)
        if args.algo == 'bc':
            sys.exit()

    from gailtf.network.adversary import TransitionClassifier
    # discriminator
    discriminator = TransitionClassifier(env, args.adversary_hidden_size,
                                         entcoeff=args.adversary_entcoeff)
    if args.algo == 'trpo':
        # Set up for MPI seed
        from mpi4py import MPI
        rank = MPI.COMM_WORLD.Get_rank()
        if rank != 0:
            logger.set_level(logger.DISABLED)
        workerseed = args.seed + 10000 * MPI.COMM_WORLD.Get_rank()
        set_global_seeds(workerseed)
        env.seed(workerseed)
        from gailtf.algo import trpo_mpi
        if args.task == 'train':
            trpo_mpi.learn(env, policy_fn, discriminator, dataset,
                           pretrained=args.pretrained, pretrained_weight=pretrained_weight,
                           g_step=args.g_step, d_step=args.d_step,
                           timesteps_per_batch=1024,
                           max_kl=args.max_kl, cg_iters=10, cg_damping=0.1,
                           max_timesteps=args.num_timesteps,
                           entcoeff=args.policy_entcoeff, gamma=0.995, lam=0.97,
                           vf_iters=5, vf_stepsize=1e-3,
                           ckpt_dir=args.checkpoint_dir, log_dir=args.log_dir,
                           save_per_iter=args.save_per_iter,
                           load_model_path=args.load_model_path, task_name=task_name)
        elif args.task == 'evaluate':
            trpo_mpi.evaluate(env, policy_fn, args.load_model_path,
                              timesteps_per_batch=1024, number_trajs=10,
                              stochastic_policy=args.stochastic_policy)
        else:
            raise NotImplementedError
    elif args.algo == 'ppo':
        # Set up for MPI seed
        from mpi4py import MPI
        rank = MPI.COMM_WORLD.Get_rank()
        if rank != 0:
            logger.set_level(logger.DISABLED)
        workerseed = args.seed + 10000 * MPI.COMM_WORLD.Get_rank()
        set_global_seeds(workerseed)
        env.seed(workerseed)
        from gailtf.algo import ppo_mpi
        if args.task == 'train':
            ppo_mpi.learn(env, policy_fn, discriminator, dataset,
                          # pretrained=args.pretrained,
                          pretrained_weight=pretrained_weight,
                          timesteps_per_batch=1024,
                          g_step=args.g_step, d_step=args.d_step,
                          # max_kl=args.max_kl, cg_iters=10, cg_damping=0.1,
                          clip_param=0.2, entcoeff=args.policy_entcoeff,
                          max_timesteps=args.num_timesteps,
                          gamma=0.99, lam=0.95,
                          # vf_iters=5, vf_stepsize=1e-3,
                          optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64,
                          d_stepsize=3e-4, schedule='linear',
                          ckpt_dir=args.checkpoint_dir, save_per_iter=100,
                          task=args.task, sample_stochastic=args.stochastic_policy,
                          load_model_path=args.load_model_path, task_name=task_name)
        elif args.task == 'evaluate':
            ppo_mpi.evaluate(env, policy_fn, args.load_model_path,
                             timesteps_per_batch=1024, number_trajs=10,
                             stochastic_policy=args.stochastic_policy)
        else:
            raise NotImplementedError
    else:
        raise NotImplementedError
    env.close()
def setup_model(self):
    with SetVerbosity(self.verbose):

        assert issubclass(self.policy, ActorCriticPolicy), "Error: the input policy for the PPO2 model must be " \
                                                           "an instance of common.policies.ActorCriticPolicy."

        self.n_batch = self.n_envs * self.n_steps

        self.graph = tf.Graph()
        with self.graph.as_default():
            self.set_random_seed(self.seed)
            self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess, graph=self.graph)

            n_batch_step = None
            n_batch_train = None
            if issubclass(self.policy, RecurrentActorCriticPolicy):
                assert self.n_envs % self.nminibatches == 0, "For recurrent policies, " \
                    "the number of environments run in parallel should be a multiple of nminibatches."
                n_batch_step = self.n_envs
                n_batch_train = self.n_batch // self.nminibatches

            act_model = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1,
                                    n_batch_step, reuse=False, **self.policy_kwargs)
            with tf.variable_scope("train_model", reuse=True,
                                   custom_getter=tf_util.outer_scope_getter("train_model")):
                train_model = self.policy(self.sess, self.observation_space, self.action_space,
                                          self.n_envs // self.nminibatches, self.n_steps, n_batch_train,
                                          reuse=True, **self.policy_kwargs)

            with tf.variable_scope("loss", reuse=False):
                self.action_ph = train_model.pdtype.sample_placeholder([None], name="action_ph")
                self.advs_ph = tf.placeholder(tf.float32, [None], name="advs_ph")
                self.rewards_ph = tf.placeholder(tf.float32, [None], name="rewards_ph")
                self.old_neglog_pac_ph = tf.placeholder(tf.float32, [None], name="old_neglog_pac_ph")
                self.old_vpred_ph = tf.placeholder(tf.float32, [None], name="old_vpred_ph")
                self.learning_rate_ph = tf.placeholder(tf.float32, [], name="learning_rate_ph")
                self.clip_range_ph = tf.placeholder(tf.float32, [], name="clip_range_ph")

                neglogpac = train_model.proba_distribution.neglogp(self.action_ph)
                self.entropy = tf.reduce_mean(train_model.proba_distribution.entropy())

                vpred = train_model.value_flat

                # Value function clipping: not present in the original PPO
                if self.cliprange_vf is None:
                    # Default behavior (legacy from OpenAI baselines):
                    # use the same clipping as for the policy
                    self.clip_range_vf_ph = self.clip_range_ph
                    self.cliprange_vf = self.cliprange
                elif isinstance(self.cliprange_vf, (float, int)) and self.cliprange_vf < 0:
                    # Original PPO implementation: no value function clipping
                    self.clip_range_vf_ph = None
                else:
                    # Last possible behavior: clipping range
                    # specific to the value function
                    self.clip_range_vf_ph = tf.placeholder(tf.float32, [], name="clip_range_vf_ph")

                if self.clip_range_vf_ph is None:
                    # No clipping
                    vpred_clipped = train_model.value_flat
                else:
                    # Clip the difference between old and new value
                    # NOTE: this depends on the reward scaling
                    vpred_clipped = self.old_vpred_ph + \
                        tf.clip_by_value(train_model.value_flat - self.old_vpred_ph,
                                         - self.clip_range_vf_ph, self.clip_range_vf_ph)

                vf_losses1 = tf.square(vpred - self.rewards_ph)
                vf_losses2 = tf.square(vpred_clipped - self.rewards_ph)
                self.vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2))

                ratio = tf.exp(self.old_neglog_pac_ph - neglogpac)
                pg_losses = -self.advs_ph * ratio
                pg_losses2 = -self.advs_ph * tf.clip_by_value(ratio, 1.0 - self.clip_range_ph,
                                                              1.0 + self.clip_range_ph)
                self.pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2))
                self.approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - self.old_neglog_pac_ph))
                self.clipfrac = tf.reduce_mean(tf.cast(tf.greater(tf.abs(ratio - 1.0),
                                                                  self.clip_range_ph), tf.float32))
                loss = self.pg_loss - self.entropy * self.ent_coef + self.vf_loss * self.vf_coef

                tf.summary.scalar('entropy_loss', self.entropy)
                tf.summary.scalar('policy_gradient_loss', self.pg_loss)
                tf.summary.scalar('value_function_loss', self.vf_loss)
                tf.summary.scalar('approximate_kullback-leibler', self.approxkl)
                tf.summary.scalar('clip_factor', self.clipfrac)
                tf.summary.scalar('loss', loss)

                with tf.variable_scope('model'):
                    self.params = tf.trainable_variables()
                    if self.full_tensorboard_log:
                        for var in self.params:
                            tf.summary.histogram(var.name, var)
                grads = tf.gradients(loss, self.params)
                if self.max_grad_norm is not None:
                    grads, _grad_norm = tf.clip_by_global_norm(grads, self.max_grad_norm)
                grads = list(zip(grads, self.params))
            trainer = tf.train.AdamOptimizer(learning_rate=self.learning_rate_ph, epsilon=1e-5)
            self._train = trainer.apply_gradients(grads)

            self.loss_names = ['policy_loss', 'value_loss', 'policy_entropy', 'approxkl', 'clipfrac']

            with tf.variable_scope("input_info", reuse=False):
                tf.summary.scalar('discounted_rewards', tf.reduce_mean(self.rewards_ph))
                tf.summary.scalar('learning_rate', tf.reduce_mean(self.learning_rate_ph))
                tf.summary.scalar('advantage', tf.reduce_mean(self.advs_ph))
                tf.summary.scalar('clip_range', tf.reduce_mean(self.clip_range_ph))
                if self.clip_range_vf_ph is not None:
                    tf.summary.scalar('clip_range_vf', tf.reduce_mean(self.clip_range_vf_ph))

                tf.summary.scalar('old_neglog_action_probability', tf.reduce_mean(self.old_neglog_pac_ph))
                tf.summary.scalar('old_value_pred', tf.reduce_mean(self.old_vpred_ph))

                if self.full_tensorboard_log:
                    tf.summary.histogram('discounted_rewards', self.rewards_ph)
                    tf.summary.histogram('learning_rate', self.learning_rate_ph)
                    tf.summary.histogram('advantage', self.advs_ph)
                    tf.summary.histogram('clip_range', self.clip_range_ph)
                    tf.summary.histogram('old_neglog_action_probability', self.old_neglog_pac_ph)
                    tf.summary.histogram('old_value_pred', self.old_vpred_ph)
                    if tf_util.is_image(self.observation_space):
                        tf.summary.image('observation', train_model.obs_ph)
                    else:
                        tf.summary.histogram('observation', train_model.obs_ph)

            self.train_model = train_model
            self.act_model = act_model
            self.step = act_model.step
            self.proba_step = act_model.proba_step
            self.value = act_model.value
            self.initial_state = act_model.initial_state
            tf.global_variables_initializer().run(session=self.sess)  # pylint: disable=E1101

            self.summary = tf.summary.merge_all()
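# A hedged sketch of the minibatch update that consumes the graph built above
# (it follows the _train_step pattern of stable-baselines' PPO2; the argument names are
# illustrative and would come from the runner's rollout in a full implementation):
def _train_step_sketch(self, learning_rate, cliprange, obs, returns, actions, values, neglogpacs):
    # Advantages are the empirical returns minus the old value predictions, normalized
    advs = returns - values
    advs = (advs - advs.mean()) / (advs.std() + 1e-8)
    td_map = {
        self.train_model.obs_ph: obs,
        self.action_ph: actions,
        self.advs_ph: advs,
        self.rewards_ph: returns,
        self.learning_rate_ph: learning_rate,
        self.clip_range_ph: cliprange,
        self.old_neglog_pac_ph: neglogpacs,
        self.old_vpred_ph: values,
    }
    # Run the losses and the Adam update in one session call
    policy_loss, value_loss, policy_entropy, approxkl, clipfrac, _ = self.sess.run(
        [self.pg_loss, self.vf_loss, self.entropy, self.approxkl, self.clipfrac, self._train],
        td_map)
    return policy_loss, value_loss, policy_entropy, approxkl, clipfrac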
from deepq.replay_buffer import ReplayBuffer
from deepq.utils import ObservationInput
from common.schedules import LinearSchedule


def model(inpt, num_actions, scope, reuse=False):
    """This model takes as input an observation and returns values of all actions."""
    with tf.variable_scope(scope, reuse=reuse):
        out = inpt
        out = layers.fully_connected(out, num_outputs=64, activation_fn=tf.nn.tanh)
        out = layers.fully_connected(out, num_outputs=num_actions, activation_fn=None)
        return out


if __name__ == '__main__':
    with U.make_session(num_cpu=8):
        # Create the environment
        env = gym.make("CartPole-v0")
        # Create all the functions necessary to train the model
        act, train, update_target, debug = deepq.build_train(
            make_obs_ph=lambda name: ObservationInput(env.observation_space, name=name),
            q_func=model,
            num_actions=env.action_space.n,
            optimizer=tf.train.AdamOptimizer(learning_rate=5e-4),
        )
        # Create the replay buffer
        replay_buffer = ReplayBuffer(50000)
        # Create the schedule for exploration starting from 1 (every action is random) down to
        # 0.02 (98% of actions are selected according to values predicted by the model).
        exploration = LinearSchedule(schedule_timesteps=10000, initial_p=1.0, final_p=0.02)
def model(inpt, num_actions, scope, reuse=False):
    """This model takes as input an observation and returns values of all actions."""
    with tf.variable_scope(scope, reuse=reuse):
        out = inpt
        out = layers.fully_connected(out, num_outputs=64, activation_fn=tf.nn.tanh)
        out = layers.fully_connected(out, num_outputs=num_actions, activation_fn=None)
        return out


if __name__ == '__main__':
    with U.make_session(8):
        # Create the environment
        env = gym.make("CartPole-v0")
        # Create all the functions necessary to train the model
        act, train, update_target, debug = deepq.build_train(
            make_obs_ph=lambda name: ObservationInput(env.observation_space, name=name),
            q_func=model,
            num_actions=env.action_space.n,
            optimizer=tf.train.AdamOptimizer(learning_rate=5e-4),
        )
        # Create the replay buffer
        replay_buffer = ReplayBuffer(50000)
        # Create the schedule for exploration starting from 1 (every action is random) down to
        # 0.02 (98% of actions are selected according to values predicted by the model).
        exploration = LinearSchedule(schedule_timesteps=10000, initial_p=1.0, final_p=0.02)
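        # A hedged sketch of the training loop that typically follows this setup (it mirrors
        # the upstream baselines custom-cartpole example; thresholds and intervals are
        # illustrative, and itertools/np are assumed to be imported at module level):
        import itertools

        U.initialize()
        update_target()

        episode_rewards = [0.0]
        obs = env.reset()
        for t in itertools.count():
            # Take an epsilon-greedy action using the current exploration schedule
            action = act(obs[None], update_eps=exploration.value(t))[0]
            new_obs, rew, done, _ = env.step(action)
            # Store the transition in the replay buffer
            replay_buffer.add(obs, action, rew, new_obs, float(done))
            obs = new_obs

            episode_rewards[-1] += rew
            if done:
                obs = env.reset()
                episode_rewards.append(0)

            # Consider the task solved once the last 100 episodes average a high enough return
            if t > 100 and np.mean(episode_rewards[-101:-1]) >= 199:
                break

            if t > 1000:
                # Minimize the Bellman error on a batch sampled from the replay buffer
                obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(32)
                train(obses_t, actions, rewards, obses_tp1, dones, np.ones_like(rewards))
            if t % 1000 == 0:
                # Periodically copy the online network weights to the target network
                update_target()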
def main(args):
    from ppo1 import mlp_policy  # for the policy
    from model.encoder import bi_direction_lstm
    from dm_control.suite import humanoid_CMU

    U.make_session(num_cpu=args.num_cpu).__enter__()
    set_global_seeds(args.seed)
    env = humanoid_CMU.stand()
    obs_space = env.physics.data.qpos
    ac_space = env.action_spec()

    def policy_fn(name, ob_space, ac_space, reuse=False):
        # MLP policy; we do not reuse a previously trained policy here
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space, reuse=reuse,
                                    hid_size=[300, 200, 100], num_hid_layers=3)

    def encoder(name):
        return bi_direction_lstm(name=name, obs_space=obs_space, batch_size=args.lstm_batch,
                                 time_steps=args.time_steps, LSTM_size=args.LSTM_size,
                                 laten_size=args.laten_size)

    lstm_encoder = encoder("lstm_encoder")
    saver = lstm_encoder.get_trainable_variables()
    # Load the pretrained encoder parameters
    load(saver=saver, sess=tf.get_default_session(), logdir=args.encoder_load_path)

    # env = bench.Monitor(env, logger.get_dir() and
    #                     osp.join(logger.get_dir(), "monitor.json"))
    # env.seed(args.seed)
    # gym.logger.setLevel(logging.WARN)
    # task_name = get_task_name(args)
    task_name = "Humanoid-CMU"
    args.checkpoint_dir = osp.join(args.checkpoint_dir, task_name)
    args.log_dir = osp.join(args.log_dir, task_name)
    # dataset = Mujoco_Dset(expert_path=args.expert_path, ret_threshold=args.ret_threshold,
    #                       traj_limitation=args.traj_limitation)

    # ================ Sample trajectory tau_j from the demonstration =================
    # This acts as the expert dataset; only observations are needed.
    from model.VAE import load_state_dataset
    dataset = load_state_dataset(data_dir_path=args.expert_data_dir, env=env,
                                 control_timestep=args.control_timestep)
    pretrained_weight = None
    if (args.pretrained and args.task == 'train') or args.algo == 'bc':
        # Pretrain with behavior cloning
        from gail import behavior_clone
        if args.algo == 'bc' and args.task == 'evaluate':
            behavior_clone.evaluate(env, policy_fn, args.load_model_path,
                                    stochastic_policy=args.stochastic_policy)
            sys.exit()
        pretrained_weight = behavior_clone.learn(env, policy_fn, dataset,
                                                 max_iters=args.BC_max_iter,
                                                 pretrained=args.pretrained,
                                                 ckpt_dir=args.checkpoint_dir,
                                                 log_dir=args.log_dir, task_name=task_name)
        if args.algo == 'bc':
            sys.exit()

    from network.adversary import TransitionClassifier
    # discriminator; conditioning on embedding_z is not handled yet
    discriminator = TransitionClassifier(env, args.adversary_hidden_size,
                                         hidden_layers=args.adversary_hidden_layers,
                                         lr_rate=args.adversary_learning_rate,
                                         entcoeff=args.adversary_entcoeff,
                                         embedding_shape=args.laten_size)
    # !!!! this part is still a bit messy
    observations = dataset.get_next_batch(batch_size=128)[0].transpose((1, 0))
    embedding_z = lstm_encoder.get_laten_vector(observations)

    if args.algo == 'trpo':
        # Set up for MPI seed
        from mpi4py import MPI
        rank = MPI.COMM_WORLD.Get_rank()
        if rank != 0:
            logger.set_level(logger.DISABLED)
        workerseed = args.seed + 10000 * MPI.COMM_WORLD.Get_rank()
        set_global_seeds(workerseed)
        env.seed(workerseed)
        from gail import trpo_mpi
        if args.task == 'train':
            trpo_mpi.learn(env, policy_fn, discriminator, dataset,
                           embedding_z=None,  # how to pass embedding_z is still undecided
                           pretrained=args.pretrained, pretrained_weight=pretrained_weight,
                           g_step=args.g_step, d_step=args.d_step,
                           timesteps_per_batch=1024,
                           max_kl=args.max_kl, cg_iters=10, cg_damping=0.1,
                           max_timesteps=args.num_timesteps,
                           entcoeff=args.policy_entcoeff, gamma=0.995, lam=0.97,
                           vf_iters=5, vf_stepsize=1e-3,
                           ckpt_dir=args.checkpoint_dir, log_dir=args.log_dir,
                           save_per_iter=args.save_per_iter,
                           load_model_path=args.load_model_path, task_name=task_name)
        elif args.task == 'evaluate':
            trpo_mpi.evaluate(env, policy_fn, args.load_model_path,
                              timesteps_per_batch=1024, number_trajs=10,
                              stochastic_policy=args.stochastic_policy)
        else:
            raise NotImplementedError
    elif args.algo == 'ppo':
        # Set up for MPI seed
        from mpi4py import MPI
        rank = MPI.COMM_WORLD.Get_rank()
        if rank != 0:
            logger.set_level(logger.DISABLED)
        workerseed = args.seed + 10000 * MPI.COMM_WORLD.Get_rank()
        set_global_seeds(workerseed)
        env.seed(workerseed)
        from gail import ppo_mpi
        if args.task == 'train':
            ppo_mpi.learn(env, policy_fn, discriminator, dataset,
                          # pretrained=args.pretrained,
                          pretrained_weight=pretrained_weight,
                          timesteps_per_batch=1024,
                          g_step=args.g_step, d_step=args.d_step,
                          # max_kl=args.max_kl, cg_iters=10, cg_damping=0.1,
                          clip_param=0.2, entcoeff=args.policy_entcoeff,
                          max_timesteps=args.num_timesteps,
                          gamma=0.99, lam=0.95,
                          # vf_iters=5, vf_stepsize=1e-3,
                          optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64,
                          d_stepsize=3e-4, schedule='linear',
                          ckpt_dir=args.checkpoint_dir, save_per_iter=100,
                          task=args.task, sample_stochastic=args.stochastic_policy,
                          load_model_path=args.load_model_path, task_name=task_name)
        elif args.task == 'evaluate':
            ppo_mpi.evaluate(env, policy_fn, args.load_model_path,
                             timesteps_per_batch=1024, number_trajs=10,
                             stochastic_policy=args.stochastic_policy)
        else:
            raise NotImplementedError
    else:
        raise NotImplementedError
    env.close()
rollouts["vpreds"] = np.array(vpreds) rollouts["op_vpred"] = np.array(op_vpreds) f = open("results/MOAC/exp_5/data/rollout_data.pkl", "rb") p = pickle.load(f) f.close() horizon = 150 rolloutSize = 75 modes = 3 num_options = 9 queueSize = 5000 env = gym.make('BlockSlide2D-v1') env.seed(1) U.make_session(num_cpu=1).__enter__() np.random.seed(1) tf1.set_random_seed(1) ob_space = env.observation_space ac_space = env.action_space # Initialize the model model = partialHybridModel(env, model_learning_params, svm_grid_params, svm_params_interest, svm_params_guard, horizon, modes, num_options, rolloutSize) pi = policy_fn("pi", ob_space, ac_space, model, num_options) # Construct network for new policy policy_path = "results/MOAC/exp_5/model/"
def learn_continuous_tasks(env, q_func, env_name, dir_path, time_stamp, total_num_episodes,
                           num_actions_pad=33, lr=1e-4, grad_norm_clipping=10,
                           max_timesteps=int(1e8), buffer_size=int(1e6), train_freq=1,
                           batch_size=64, print_freq=10, learning_starts=1000, gamma=0.99,
                           target_network_update_freq=500, prioritized_replay=False,
                           prioritized_replay_alpha=0.6, prioritized_replay_beta0=0.4,
                           prioritized_replay_beta_iters=None, prioritized_replay_eps=int(1e8),
                           num_cpu=16, epsilon_greedy=False, timesteps_std=1e6,
                           initial_std=0.4, final_std=0.05, eval_freq=100, n_eval_episodes=10,
                           eval_std=0.01, log_index=0, log_prefix='q', loss_type="L2",
                           model_file='./', callback=None):
    """Train a branching deepq model to solve continuous control tasks via discretization.

    Current assumptions in the implementation:
    - for solving continuous control domains via discretization (can be adjusted to be
      compatible with naturally discrete-action domains using 'env.action_space.n')
    - uniform number of sub-actions per action dimension (can be generalized to a
      heterogeneous number of sub-actions across branches)

    Parameters
    -------
    env : gym.Env
        environment to train on
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    num_actions_pad: int
        number of sub-actions per action dimension (= num of discretization grains/bars + 1)
    lr: float
        learning rate for adam optimizer
    max_timesteps: int
        number of env steps to optimize for
    buffer_size: int
        size of the replay buffer
    exploration_fraction: float
        fraction of entire training period over which the exploration rate is annealed
        0.1 for dqn-baselines
    exploration_final_eps: float
        final value of random action probability
        0.02 for dqn-baselines
    train_freq: int
        update the model every `train_freq` steps.
    batch_size: int
        size of a batch sampled from replay buffer for training
    print_freq: int
        how often to print out training progress
        set to None to disable printing
    learning_starts: int
        how many steps of the model to collect transitions for before learning starts
    gamma: float
        discount factor
    grad_norm_clipping: int
        set None for no clipping
    target_network_update_freq: int
        update the target network every `target_network_update_freq` steps.
    prioritized_replay: True
        if True prioritized replay buffer will be used.
    prioritized_replay_alpha: float
        alpha parameter for prioritized replay buffer
    prioritized_replay_beta0: float
        initial value of beta for prioritized replay buffer
    prioritized_replay_beta_iters: int
        number of iterations over which beta will be annealed from initial value
        to 1.0. If set to None equals to max_timesteps.
    prioritized_replay_eps: float
        epsilon to add to the unified TD error for updating priorities.
        Erratum: The camera-ready copy of this paper incorrectly reported 1e-8.
        The value used to produce the results is 1e8.
    num_cpu: int
        number of cpus to use for training
    dir_path: str
        path for logs and results to be stored in
    callback: (locals, globals) -> None
        function called at every step with state of the algorithm.
        If callback returns true training stops.

    Returns
    -------
    act: ActWrapper
        Wrapper over act function. Adds ability to save it and load it.
        See header of baselines/deepq/categorical.py for details on the act function.
    """
    sess = U.make_session(num_cpu=num_cpu)
    sess.__enter__()

    def make_obs_ph(name):
        return U.BatchInput(env.observation_space.shape, name=name)

    print('Observation shape:' + str(env.observation_space.shape))

    num_action_grains = num_actions_pad - 1
    num_action_dims = env.action_space.shape[0]
    num_action_streams = num_action_dims
    # total number of network outputs for action branching with one action dimension per branch
    num_actions = num_actions_pad * num_action_streams
    print('Number of actions in total:' + str(num_actions))

    act, q_val, train, update_target, debug = deepq.build_train(
        make_obs_ph=make_obs_ph,
        q_func=q_func,
        num_actions=num_actions,
        num_action_streams=num_action_streams,
        batch_size=batch_size,
        optimizer_name="Adam",
        learning_rate=lr,
        grad_norm_clipping=grad_norm_clipping,
        gamma=gamma,
        double_q=True,
        scope="deepq",
        reuse=None,
        loss_type="L2")
    print('TRAIN VARS:')
    print(tf.trainable_variables())

    act_params = {
        'make_obs_ph': make_obs_ph,
        'q_func': q_func,
        'num_actions': num_actions,
        'num_action_streams': num_action_streams,
    }

    print('Create the log writer for TensorBoard visualizations.')
    log_dir = "{}/tensorboard_logs/{}".format(dir_path, env_name)
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)
    score_placeholder = tf.placeholder(tf.float32, [], name='score_placeholder')
    tf.summary.scalar('score', score_placeholder)
    lr_constant = tf.constant(lr, name='lr_constant')
    tf.summary.scalar('learning_rate', lr_constant)

    eval_placeholder = tf.placeholder(tf.float32, [], name='eval_placeholder')
    eval_summary = tf.summary.scalar('evaluation', eval_placeholder)

    # Create the replay buffer
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = max_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0, final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None

    if epsilon_greedy:
        approximate_num_iters = 2e6 / 4
        exploration = PiecewiseSchedule([(0, 1.0),
                                         (approximate_num_iters / 50, 0.1),
                                         (approximate_num_iters / 5, 0.01)],
                                        outside_value=0.01)
    else:
        exploration = ConstantSchedule(value=0.0)  # greedy policy
        std_schedule = LinearSchedule(schedule_timesteps=timesteps_std,
                                      initial_p=initial_std, final_p=final_std)

    # Initialize the parameters and copy them to the target network.
    U.initialize()
    update_target()

    # Initialize the parameters used for converting branching, discrete action indices to continuous actions
    low = env.action_space.low
    high = env.action_space.high
    actions_range = np.subtract(high, low)
    print('###################################')
    print(low)
    print(high)
    print('###################################')

    episode_rewards = []
    reward_sum = 0.0
    time_steps = [0]
    time_spent_exploring = [0]

    prev_time = time.time()
    n_trainings = 0

    # Open a directory for recording results
    results_dir = "{}/results/{}".format(dir_path, env_name)
    if not os.path.exists(results_dir):
        os.makedirs(results_dir)

    displayed_mean_reward = None
    score_timesteps = []
    game_scores = []

    def evaluate(step, episode_number):
        global max_eval_reward_mean, model_saved
        print('Evaluate...')
        eval_reward_sum = 0.0
        # Run evaluation episodes
        for eval_episode in range(n_eval_episodes):
            obs = env.reset()
            done = False
            while not done:
                # Choose action
                action_idxes = np.array(act(np.array(obs)[None], stochastic=False))  # deterministic
                actions_greedy = action_idxes / num_action_grains * actions_range + low

                if eval_std == 0.0:
                    action = actions_greedy
                else:
                    action = []
                    for index in range(len(actions_greedy)):
                        a_greedy = actions_greedy[index]
                        out_of_range_action = True
                        while out_of_range_action:
                            a_stoch = np.random.normal(loc=a_greedy, scale=eval_std)
                            a_idx_stoch = np.rint((a_stoch + high[index]) / actions_range[index] * num_action_grains)
                            if a_idx_stoch >= 0 and a_idx_stoch < num_actions_pad:
                                action.append(a_stoch)
                                out_of_range_action = False

                # Step
                obs, rew, done, _ = env.step(action)
                eval_reward_sum += rew

        # Average the rewards and log
        eval_reward_mean = eval_reward_sum / n_eval_episodes
        print(eval_reward_mean, 'over', n_eval_episodes, 'episodes')

        game_scores.append(eval_reward_mean)
        score_timesteps.append(step)

        if max_eval_reward_mean is None or eval_reward_mean > max_eval_reward_mean:
            logger.log("Saving model due to mean eval increase: {} -> {}".format(
                max_eval_reward_mean, eval_reward_mean))
            U.save_state(model_file)
            model_saved = True
            max_eval_reward_mean = eval_reward_mean
            intact = ActWrapper(act, act_params)
            intact.save(model_file + "_" + str(episode_number) + "_" + str(int(np.round(max_eval_reward_mean))))
            print('Act saved to ' + model_file + "_" + str(episode_number) + "_" +
                  str(int(np.round(max_eval_reward_mean))))

    with tempfile.TemporaryDirectory() as td:
        td = './logs'

        evaluate(0, 0)
        obs = env.reset()

        t = -1
        all_means = []
        q_stats = []
        current_qs = []
        training_game_scores = []
        training_timesteps = []
        while True:
            t += 1

            # Select action and update exploration probability
            action_idxes = np.array(act(np.array(obs)[None], update_eps=exploration.value(t)))
            qs = np.array(q_val(np.array(obs)[None], stochastic=False))  # deterministic
            tt = []
            for val in qs:
                tt.append(np.std(val))
            current_qs.append(tt)

            # Convert sub-action indices (discrete sub-actions) to continuous controls
            action = action_idxes / num_action_grains * actions_range + low

            if not epsilon_greedy:  # Gaussian noise
                actions_greedy = action
                action_idx_stoch = []
                action = []
                for index in range(len(actions_greedy)):
                    a_greedy = actions_greedy[index]
                    out_of_range_action = True
                    while out_of_range_action:
                        # Sample from a Gaussian with mean at the greedy action and a std
                        # following a schedule of choice
                        a_stoch = np.random.normal(loc=a_greedy, scale=std_schedule.value(t))

                        # Convert sampled cont action to an action idx
                        a_idx_stoch = np.rint((a_stoch + high[index]) / actions_range[index] * num_action_grains)

                        # Check if action is in range
                        if a_idx_stoch >= 0 and a_idx_stoch < num_actions_pad:
                            action_idx_stoch.append(a_idx_stoch)
                            action.append(a_stoch)
                            out_of_range_action = False

                action_idxes = action_idx_stoch

            new_obs, rew, done, _ = env.step(np.array(action))

            # Store transition in the replay buffer
            replay_buffer.add(obs, action_idxes, rew, new_obs, float(done))
            obs = new_obs

            reward_sum += rew
            if done:
                obs = env.reset()
                time_spent_exploring[-1] = int(100 * exploration.value(t))
                time_spent_exploring.append(0)
                episode_rewards.append(reward_sum)
                training_game_scores.append(reward_sum)
                training_timesteps.append(t)
                time_steps[-1] = t
                reward_sum = 0.0
                time_steps.append(0)
                q_stats.append(np.mean(current_qs, 0))
                current_qs = []

            if t > learning_starts and t % train_freq == 0:
                # Minimize the error in Bellman's equation on a batch sampled from replay buffer
                if prioritized_replay:
                    experience = replay_buffer.sample(batch_size, beta=beta_schedule.value(t))
                    (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience
                else:
                    obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size)
                    weights, batch_idxes = np.ones_like(rewards), None
                td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights)  # np.ones_like(rewards))  # TEMP AT NEW
                if prioritized_replay:
                    new_priorities = np.abs(td_errors) + prioritized_replay_eps
                    replay_buffer.update_priorities(batch_idxes, new_priorities)

                n_trainings += 1

            if t > learning_starts and t % target_network_update_freq == 0:
                # Update target network periodically
                update_target()

            if len(episode_rewards) == 0:
                mean_100ep_reward = 0
            elif len(episode_rewards) < 100:
                mean_100ep_reward = np.mean(episode_rewards)
            else:
                mean_100ep_reward = np.mean(episode_rewards[-100:])
            all_means.append(mean_100ep_reward)

            num_episodes = len(episode_rewards)

            if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
                logger.record_tabular("steps", t)
                logger.record_tabular("episodes", num_episodes)
                logger.record_tabular("mean 100 episode reward", mean_100ep_reward)
                logger.record_tabular("% time spent exploring", int(100 * exploration.value(t)))
                current_time = time.time()
                logger.record_tabular("trainings per second", n_trainings / (current_time - prev_time))
                logger.dump_tabular()
                n_trainings = 0
                prev_time = current_time

            if t > learning_starts and num_episodes > 100:
                if displayed_mean_reward is None or mean_100ep_reward > displayed_mean_reward:
                    if print_freq is not None:
                        logger.log("Mean reward increase: {} -> {}".format(
                            displayed_mean_reward, mean_100ep_reward))
                    displayed_mean_reward = mean_100ep_reward

            # Performance evaluation with a greedy policy
            if done and num_episodes % eval_freq == 0:
                evaluate(t + 1, num_episodes)
                obs = env.reset()

            # STOP training
            if num_episodes >= total_num_episodes:
                break

        pickle.dump(q_stats,
                    open(str(log_index) + "q_stat_stds99_" + log_prefix + ".pkl", 'wb'),
                    protocol=pickle.HIGHEST_PROTOCOL)
        pickle.dump(game_scores,
                    open(str(log_index) + "q_stat_scores99_" + log_prefix + ".pkl", 'wb'),
                    protocol=pickle.HIGHEST_PROTOCOL)

    return ActWrapper(act, act_params)
def learn_continuous_tasks(env, q_func, env_name, time_stamp, total_num_episodes,
                           num_actions_pad=33, lr=1e-4, grad_norm_clipping=10,
                           max_timesteps=int(1e8), buffer_size=int(1e6), train_freq=1,
                           batch_size=64, print_freq=10, learning_starts=1000, gamma=0.99,
                           target_network_update_freq=500, prioritized_replay_alpha=0.6,
                           prioritized_replay_beta0=0.4, prioritized_replay_beta_iters=2e6,
                           prioritized_replay_eps=int(1e8), num_cpu=16, timesteps_std=1e6,
                           initial_std=0.4, final_std=0.05, eval_freq=100, n_eval_episodes=10,
                           eval_std=0.01, callback=None):
    """Train a branching deepq model to solve continuous control tasks via discretization.

    Current assumptions in the implementation:
    - for solving continuous control domains via discretization (can be adjusted to be
      compatible with naturally discrete-action domains using 'env.action_space.n')
    - uniform number of sub-actions per action dimension (can be generalized to a
      heterogeneous number of sub-actions across branches)

    Parameters
    -------
    env : gym.Env
        environment to train on
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    num_actions_pad: int
        number of sub-actions per action dimension (= num of discretization grains/bars + 1)
    lr: float
        learning rate for adam optimizer
    max_timesteps: int
        number of env steps to optimize for
    buffer_size: int
        size of the replay buffer
    exploration_fraction: float
        fraction of entire training period over which the exploration rate is annealed
        0.1 for dqn-baselines
    exploration_final_eps: float
        final value of random action probability
        0.02 for dqn-baselines
    train_freq: int
        update the model every `train_freq` steps.
    batch_size: int
        size of a batch sampled from replay buffer for training
    print_freq: int
        how often to print out training progress
        set to None to disable printing
    learning_starts: int
        how many steps of the model to collect transitions for before learning starts
    gamma: float
        discount factor
    grad_norm_clipping: int
        set None for no clipping
    target_network_update_freq: int
        update the target network every `target_network_update_freq` steps.
    prioritized_replay: True
        if True prioritized replay buffer will be used.
    prioritized_replay_alpha: float
        alpha parameter for prioritized replay buffer
    prioritized_replay_beta0: float
        initial value of beta for prioritized replay buffer
    prioritized_replay_beta_iters: int
        number of iterations over which beta will be annealed from initial value
        to 1.0. If set to None equals to max_timesteps.
    prioritized_replay_eps: float
        epsilon to add to the unified TD error for updating priorities.
        Erratum: The camera-ready copy of this paper incorrectly reported 1e-8.
        The value used to produce the results is 1e8.
    num_cpu: int
        number of cpus to use for training
    losses_version: int
        optimization version number
    dir_path: str
        path for logs and results to be stored in
    callback: (locals, globals) -> None
        function called at every step with state of the algorithm.
        If callback returns true training stops.

    Returns
    -------
    act: ActWrapper
        Wrapper over act function. Adds ability to save it and load it.
        See header of baselines/deepq/categorical.py for details on the act function.
    """
    sess = U.make_session(num_cpu=num_cpu)
    sess.__enter__()

    def make_obs_ph(name):
        return U.BatchInput(env.observation_space.shape, name=name)

    num_action_grains = num_actions_pad - 1
    num_action_dims = env.action_space.shape[0]
    num_action_streams = num_action_dims
    # total number of network outputs for action branching with one action dimension per branch
    num_actions = num_actions_pad * num_action_streams

    act, train, update_target, debug = deepq.build_train(
        make_obs_ph=make_obs_ph,
        q_func=q_func,
        num_actions=num_actions,
        num_action_streams=num_action_streams,
        batch_size=batch_size,
        learning_rate=lr,
        grad_norm_clipping=grad_norm_clipping,
        gamma=gamma,
        scope="deepq",
        reuse=None)

    act_params = {
        'make_obs_ph': make_obs_ph,
        'q_func': q_func,
        'num_actions': num_actions,
        'num_action_streams': num_action_streams,
    }

    # prioritized_replay: create the replay buffer
    replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha)
    beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                   initial_p=prioritized_replay_beta0, final_p=1.0)

    # epsilon_greedy = False: just greedy policy
    exploration = ConstantSchedule(value=0.0)  # greedy policy
    std_schedule = LinearSchedule(schedule_timesteps=timesteps_std,
                                  initial_p=initial_std, final_p=final_std)

    # Initialize the parameters and copy them to the target network.
    U.initialize()
    update_target()

    # Initialize the parameters used for converting branching, discrete action indices to continuous actions
    low = env.action_space.low
    high = env.action_space.high
    actions_range = np.subtract(high, low)

    episode_rewards = []
    reward_sum = 0.0
    num_episodes = 0
    time_steps = [0]
    time_spent_exploring = [0]

    prev_time = time.time()
    n_trainings = 0

    # Set up on-demand rendering of Gym environments using keyboard controls: 'r'ender or 's'top
    import termios, fcntl, sys
    fd = sys.stdin.fileno()
    oldterm = termios.tcgetattr(fd)
    newattr = termios.tcgetattr(fd)
    newattr[3] = newattr[3] & ~termios.ICANON & ~termios.ECHO
    render = False

    displayed_mean_reward = None

    def evaluate(step, episode_number):
        global max_eval_reward_mean, model_saved
        print('Evaluate...')
        eval_reward_sum = 0.0
        # Run evaluation episodes
        for eval_episode in range(n_eval_episodes):
            obs = env.reset()
            done = False
            while not done:
                # Choose action
                action_idxes = np.array(act(np.array(obs)[None], stochastic=False))  # deterministic
                actions_greedy = action_idxes / num_action_grains * actions_range + low

                if eval_std == 0.0:
                    action = actions_greedy
                else:
                    action = []
                    for index in range(len(actions_greedy)):
                        a_greedy = actions_greedy[index]
                        out_of_range_action = True
                        while out_of_range_action:
                            a_stoch = np.random.normal(loc=a_greedy, scale=eval_std)
                            a_idx_stoch = np.rint((a_stoch + high[index]) / actions_range[index] * num_action_grains)
                            if a_idx_stoch >= 0 and a_idx_stoch < num_actions_pad:
                                action.append(a_stoch)
                                out_of_range_action = False

                # Step
                obs, rew, done, _ = env.step(action)
                eval_reward_sum += rew

        # Average the rewards and log
        eval_reward_mean = eval_reward_sum / n_eval_episodes
        print(eval_reward_mean, 'over', n_eval_episodes, 'episodes')

        with open("results/{}_{}_eval.csv".format(time_stamp, env_name), "a") as eval_fw:
            eval_writer = csv.writer(eval_fw, delimiter="\t", lineterminator="\n",)
            eval_writer.writerow([episode_number, step, eval_reward_mean])

        if max_eval_reward_mean is None or eval_reward_mean > max_eval_reward_mean:
            logger.log("Saving model due to mean eval increase: {} -> {}".format(
                max_eval_reward_mean, eval_reward_mean))
            U.save_state(model_file)
            model_saved = True
            max_eval_reward_mean = eval_reward_mean

    with tempfile.TemporaryDirectory() as td:
        model_file = os.path.join(td, "model")

        evaluate(0, 0)
        obs = env.reset()

        with open("results/{}_{}.csv".format(time_stamp, env_name), "w") as fw:
            writer = csv.writer(fw, delimiter="\t", lineterminator="\n",)

            t = -1
            while True:
                t += 1

                # Select action and update exploration probability
                action_idxes = np.array(act(np.array(obs)[None], update_eps=exploration.value(t)))

                # Convert sub-action indices (discrete sub-actions) to continuous controls
                action = action_idxes / num_action_grains * actions_range + low

                # epsilon_greedy = False: use Gaussian noise
                actions_greedy = action
                action_idx_stoch = []
                action = []
                for index in range(len(actions_greedy)):
                    a_greedy = actions_greedy[index]
                    out_of_range_action = True
                    while out_of_range_action:
                        # Sample from a Gaussian with mean at the greedy action and a std
                        # following a schedule of choice
                        a_stoch = np.random.normal(loc=a_greedy, scale=std_schedule.value(t))

                        # Convert sampled cont action to an action idx
                        a_idx_stoch = np.rint((a_stoch + high[index]) / actions_range[index] * num_action_grains)

                        # Check if action is in range
                        if a_idx_stoch >= 0 and a_idx_stoch < num_actions_pad:
                            action_idx_stoch.append(a_idx_stoch)
                            action.append(a_stoch)
                            out_of_range_action = False

                action_idxes = action_idx_stoch

                new_obs, rew, done, _ = env.step(action)

                # On-demand rendering
                if (t + 1) % 100 == 0:
                    # TO DO better?
                    termios.tcsetattr(fd, termios.TCSANOW, newattr)
                    oldflags = fcntl.fcntl(fd, fcntl.F_GETFL)
                    fcntl.fcntl(fd, fcntl.F_SETFL, oldflags | os.O_NONBLOCK)
                    try:
                        try:
                            c = sys.stdin.read(1)
                            if c == 'r':
                                print()
                                print('Rendering begins...')
                                render = True
                            elif c == 's':
                                print()
                                print('Stop rendering!')
                                render = False
                                env.render(close=True)
                        except IOError:
                            pass
                    finally:
                        termios.tcsetattr(fd, termios.TCSAFLUSH, oldterm)
                        fcntl.fcntl(fd, fcntl.F_SETFL, oldflags)

                # Visualize Gym environment on render
                if render:
                    env.render()

                # Store transition in the replay buffer
                replay_buffer.add(obs, action_idxes, rew, new_obs, float(done))
                obs = new_obs

                reward_sum += rew
                if done:
                    obs = env.reset()
                    time_spent_exploring[-1] = int(100 * exploration.value(t))
                    time_spent_exploring.append(0)
                    episode_rewards.append(reward_sum)
                    time_steps[-1] = t
                    reward_sum = 0.0
                    time_steps.append(0)

                    # Frequently log to file
                    writer.writerow([len(episode_rewards), t, episode_rewards[-1]])

                if t > learning_starts and t % train_freq == 0:
                    # Minimize the error in Bellman's equation on a batch sampled from replay buffer
                    # prioritized_replay
                    experience = replay_buffer.sample(batch_size, beta=beta_schedule.value(t))
                    (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience
                    td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights)  # np.ones_like(rewards))  # TEMP AT NEW

                    # prioritized_replay
                    new_priorities = np.abs(td_errors) + prioritized_replay_eps
                    replay_buffer.update_priorities(batch_idxes, new_priorities)

                    n_trainings += 1

                if t > learning_starts and t % target_network_update_freq == 0:
                    # Update target network periodically
                    update_target()

                if len(episode_rewards) == 0:
                    mean_100ep_reward = 0
                elif len(episode_rewards) < 100:
                    mean_100ep_reward = np.mean(episode_rewards)
                else:
                    mean_100ep_reward = np.mean(episode_rewards[-100:])

                num_episodes = len(episode_rewards)

                if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
                    logger.record_tabular("steps", t)
                    logger.record_tabular("episodes", num_episodes)
                    logger.record_tabular("mean 100 episode reward", mean_100ep_reward)
                    logger.record_tabular("% time spent exploring", int(100 * exploration.value(t)))
                    current_time = time.time()
                    logger.record_tabular("trainings per second", n_trainings / (current_time - prev_time))
                    logger.dump_tabular()
                    n_trainings = 0
                    prev_time = current_time

                if t > learning_starts and num_episodes > 100:
                    if displayed_mean_reward is None or mean_100ep_reward > displayed_mean_reward:
                        if print_freq is not None:
                            logger.log("Mean reward increase: {} -> {}".format(
                                displayed_mean_reward, mean_100ep_reward))
                        displayed_mean_reward = mean_100ep_reward

                # Performance evaluation with a greedy policy
                if done and num_episodes % eval_freq == 0:
                    evaluate(t + 1, num_episodes)
                    obs = env.reset()

                # STOP training
                if num_episodes >= total_num_episodes:
                    break

        if model_saved:
            logger.log("Restore model with mean eval: {}".format(max_eval_reward_mean))
            U.load_state(model_file)

    data_to_log = {
        'time_steps': time_steps,
        'episode_rewards': episode_rewards,
        'time_spent_exploring': time_spent_exploring
    }

    # Write to file the episodic rewards, number of steps, and the time spent exploring
    with open("results/{}_{}.txt".format(time_stamp, env_name), 'wb') as fp:
        pickle.dump(data_to_log, fp)

    return ActWrapper(act, act_params)