def build_env(args):
    ncpu = multiprocessing.cpu_count()
    if sys.platform == 'darwin':
        ncpu //= 2
    nenv = args.num_env or ncpu
    alg = args.alg
    seed = args.seed

    env_type, env_id = get_env_type(args)

    if env_type in {'atari', 'retro'}:
        if alg == 'deepq':
            env = make_env(env_id, env_type, seed=seed, wrapper_kwargs={'frame_stack': True})
        elif alg == 'trpo_mpi':
            env = make_env(env_id, env_type, seed=seed)
        else:
            frame_stack_size = 4
            env = make_vec_env(env_id, env_type, nenv, seed,
                               gamestate=args.gamestate,
                               reward_scale=args.reward_scale)
            env = VecFrameStack(env, frame_stack_size)
    else:
        config = tf.ConfigProto(allow_soft_placement=True,
                                intra_op_parallelism_threads=1,
                                inter_op_parallelism_threads=1)
        config.gpu_options.allow_growth = True
        get_session(config=config)

        flatten_dict_observations = alg not in {'her'}
        env = make_vec_env(env_id, env_type, args.num_env or 1, seed,
                           reward_scale=args.reward_scale,
                           flatten_dict_observations=flatten_dict_observations)

        if env_type == 'mujoco':
            env = VecNormalize(env, use_tf=True)

    return env
def profile_tf_runningmeanstd():
    import time
    from common import tf_util

    tf_util.get_session(config=tf.ConfigProto(
        inter_op_parallelism_threads=1,
        intra_op_parallelism_threads=1,
        allow_soft_placement=True))

    x = np.random.random((376,))
    n_trials = 10000
    rms = RunningMeanStd()
    tfrms = TfRunningMeanStd()

    tic1 = time.time()
    for _ in range(n_trials):
        rms.update(x)

    tic2 = time.time()
    for _ in range(n_trials):
        tfrms.update(x)

    tic3 = time.time()

    print('rms update time ({} trials): {} s'.format(n_trials, tic2 - tic1))
    print('tfrms update time ({} trials): {} s'.format(n_trials, tic3 - tic2))

    tic1 = time.time()
    for _ in range(n_trials):
        z1 = rms.mean

    tic2 = time.time()
    for _ in range(n_trials):
        z2 = tfrms.mean

    assert z1 == z2

    tic3 = time.time()

    print('rms get mean time ({} trials): {} s'.format(n_trials, tic2 - tic1))
    print('tfrms get mean time ({} trials): {} s'.format(n_trials, tic3 - tic2))
def __init__(self, epsilon=1e-4, shape=(), scope=''):
    sess = get_session()

    self._new_mean = tf.placeholder(shape=shape, dtype=tf.float64)
    self._new_var = tf.placeholder(shape=shape, dtype=tf.float64)
    self._new_count = tf.placeholder(shape=(), dtype=tf.float64)

    with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
        self._mean = tf.get_variable('mean', initializer=np.zeros(shape, 'float64'), dtype=tf.float64)
        # note: despite the 'std' variable name, this variable stores the running variance
        self._var = tf.get_variable('std', initializer=np.ones(shape, 'float64'), dtype=tf.float64)
        self._count = tf.get_variable('count', initializer=np.full((), epsilon, 'float64'), dtype=tf.float64)

    self.update_ops = tf.group([
        self._var.assign(self._new_var),
        self._mean.assign(self._new_mean),
        self._count.assign(self._new_count)
    ])

    sess.run(tf.variables_initializer([self._mean, self._var, self._count]))
    self.sess = sess
    self._set_mean_var_count()
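# Illustrative sketch (not part of the original modules above): the parallel mean/variance
# update that RunningMeanStd / TfRunningMeanStd-style trackers typically use to fold a new
# batch into the running moments before assigning them via update_ops. The helper name and
# example values below are assumptions introduced only for this sketch.
import numpy as np

def update_mean_var_count_from_moments(mean, var, count, batch_mean, batch_var, batch_count):
    """Combine running moments with the moments of a new batch (parallel algorithm)."""
    delta = batch_mean - mean
    tot_count = count + batch_count

    new_mean = mean + delta * batch_count / tot_count
    m_a = var * count
    m_b = batch_var * batch_count
    m2 = m_a + m_b + np.square(delta) * count * batch_count / tot_count
    new_var = m2 / tot_count
    return new_mean, new_var, tot_count

# Example: folding one batch of observations into the running statistics.
if __name__ == '__main__':
    mean, var, count = np.zeros(3), np.ones(3), 1e-4
    batch = np.random.randn(32, 3)
    mean, var, count = update_mean_var_count_from_moments(
        mean, var, count, batch.mean(axis=0), batch.var(axis=0), batch.shape[0])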
def get_reward(self, obs, embedding):
    sess = U.get_session()
    if len(obs.shape) == 1:
        obs = np.expand_dims(obs, 0)
    if len(embedding.shape) == 1:
        embedding = np.expand_dims(embedding, 0)
    feed_dict = {self.generator_obs_ph: obs, self.embedding_ph: embedding}
    reward = sess.run(self.reward_op, feed_dict)
    return reward
def add_all_summary(self, writer, values, iter):
    # Note that the order of the incoming ```values``` should be the same as that of the
    # ```scalar_keys``` given in ```__init__```
    if np.sum(np.isnan(values) + 0) != 0:
        return
    sess = U.get_session()
    keys = self.scalar_summaries_ph + self.histogram_summaries_ph
    feed_dict = {}
    for k, v in zip(keys, values):
        feed_dict.update({k: v})
    summaries_str = sess.run(self.summaries, feed_dict)
    writer.add_summary(summaries_str, iter)
def test_nonfreeze():
    np.random.seed(0)
    tf.set_random_seed(0)

    a = tf.Variable(np.random.randn(3).astype('float32'))
    b = tf.Variable(np.random.randn(2, 5).astype('float32'))
    loss = tf.reduce_sum(tf.square(a)) + tf.reduce_sum(tf.sin(b))

    stepsize = 1e-2
    # for some reason the session config with inter_op_parallelism_threads was causing
    # nested sess.run calls to freeze
    config = tf.ConfigProto(inter_op_parallelism_threads=1)
    sess = U.get_session(config=config)
    update_op = MpiAdamOptimizer(comm=MPI.COMM_WORLD, learning_rate=stepsize).minimize(loss)
    sess.run(tf.global_variables_initializer())
    losslist_ref = []
    for i in range(100):
        l, _ = sess.run([loss, update_op])
        print(i, l)
        losslist_ref.append(l)
def __init__( self, create_embedding_layer_fn, use_chacacter_embedding, rnn_layer_num, hidden_state_size, learning_rate, dropout, keep_prob, l2, l2_decay, output_dense_num, output_dense_size, decay_rate, decay_steps, sample_num, ): """ :param create_embedding_layer_fn: the function used to create the embedding layer :param use_chacacter_embedding: boolean indicates whether use character embedding :param rnn_layer_num: the rnn layer number :param hidden_state_size: the rnn hidden state size :param learning_rate: the learning_rate :param dropout: boolean, indicate whether to use dropout :param keep_prob: the dropout keep rate :param l2: boolean indicate whether use l2 :param l2_decay: l2 parameter :param output_dense_num: the layer number of the output network :param output_dense_size: the output network hidden layer size :param decay_rate: the exponential learning rate decay rate :param decay_steps: the exponential learning rate decay steps """ super().__init__(learning_rate=learning_rate) self.embedding_layer_fn = create_embedding_layer_fn() self.vocabulary_size = len(create_embedding_layer_fn) self.use_chacacter_embedding = use_chacacter_embedding self.input_seq = tf.placeholder(dtype=tf.int32, shape=(None, None), name="input_seq") self.input_seq_length = tf.placeholder(dtype=tf.int32, shape=(None, ), name="input_seq_length") input_placeholder = [self.input_seq, self.input_seq_length] if self.use_chacacter_embedding: self.character_input_seq = tf.placeholder( dtype=tf.int32, shape=(None, None, None), name="character_input_seq") self.character_input_seq_length = tf.placeholder( dtype=tf.int32, shape=(None, None), name="character_input_seq_length") input_placeholder += [ self.character_input_seq, self.character_input_seq_length ] self._input_placeholder = input_placeholder self.rnn_layer_num = rnn_layer_num self.hidden_state_size = hidden_state_size self.dropout = dropout self.keep_prob = keep_prob self.l2 = l2 self.l2_decay = l2_decay self.output_dense_num = output_dense_num self.output_dense_size = output_dense_size self.decay_rate = decay_rate self.decay_steps = decay_steps self.sample_num = sample_num # tf_util.init_all_op(self) tf_util.add_summary_scalar("loss", self.loss_op) self._summary_op = tf_util.merge_op() sess = tf_util.get_session() init = tf.global_variables_initializer() sess.run(init)
def _create_network(self, reuse=False): logger.info("Creating a DDPG agent with action space %d x %s..." % (self.dimu, self.max_u)) self.sess = tf_util.get_session() # running averages with tf.variable_scope('o_stats') as vs: if reuse: vs.reuse_variables() self.o_stats = Normalizer(self.dimo, self.norm_eps, self.norm_clip, sess=self.sess) with tf.variable_scope('g_stats') as vs: if reuse: vs.reuse_variables() self.g_stats = Normalizer(self.dimg, self.norm_eps, self.norm_clip, sess=self.sess) # mini-batch sampling. batch = self.staging_tf.get() batch_tf = OrderedDict([ (key, batch[i]) for i, key in enumerate(self.stage_shapes.keys()) ]) batch_tf['r'] = tf.reshape(batch_tf['r'], [-1, 1]) #choose only the demo buffer samples mask = np.concatenate( (np.zeros(self.batch_size - self.demo_batch_size), np.ones(self.demo_batch_size)), axis=0) # networks with tf.variable_scope('main') as vs: if reuse: vs.reuse_variables() self.main = self.create_actor_critic(batch_tf, net_type='main', **self.__dict__) vs.reuse_variables() with tf.variable_scope('target') as vs: if reuse: vs.reuse_variables() target_batch_tf = batch_tf.copy() target_batch_tf['o'] = batch_tf['o_2'] target_batch_tf['g'] = batch_tf['g_2'] self.target = self.create_actor_critic(target_batch_tf, net_type='target', **self.__dict__) vs.reuse_variables() assert len(self._vars("main")) == len(self._vars("target")) # loss functions target_Q_pi_tf = self.target.Q_pi_tf clip_range = (-self.clip_return, 0. if self.clip_pos_returns else np.inf) target_tf = tf.clip_by_value( batch_tf['r'] + self.gamma * target_Q_pi_tf, *clip_range) self.Q_loss_tf = tf.reduce_mean( tf.square(tf.stop_gradient(target_tf) - self.main.Q_tf)) if self.bc_loss == 1 and self.q_filter == 1: # train with demonstrations and use bc_loss and q_filter both maskMain = tf.reshape( tf.boolean_mask(self.main.Q_tf > self.main.Q_pi_tf, mask), [-1] ) #where is the demonstrator action better than actor action according to the critic? 
choose those samples only #define the cloning loss on the actor's actions only on the samples which adhere to the above masks self.cloning_loss_tf = tf.reduce_sum( tf.square( tf.boolean_mask(tf.boolean_mask((self.main.pi_tf), mask), maskMain, axis=0) - tf.boolean_mask(tf.boolean_mask((batch_tf['u']), mask), maskMain, axis=0))) self.pi_loss_tf = -self.prm_loss_weight * tf.reduce_mean( self.main.Q_pi_tf ) #primary loss scaled by it's respective weight prm_loss_weight self.pi_loss_tf += self.prm_loss_weight * self.action_l2 * tf.reduce_mean( tf.square(self.main.pi_tf / self.max_u) ) #L2 loss on action values scaled by the same weight prm_loss_weight self.pi_loss_tf += self.aux_loss_weight * self.cloning_loss_tf #adding the cloning loss to the actor loss as an auxilliary loss scaled by its weight aux_loss_weight elif self.bc_loss == 1 and self.q_filter == 0: # train with demonstrations without q_filter self.cloning_loss_tf = tf.reduce_sum( tf.square( tf.boolean_mask((self.main.pi_tf), mask) - tf.boolean_mask((batch_tf['u']), mask))) self.pi_loss_tf = -self.prm_loss_weight * tf.reduce_mean( self.main.Q_pi_tf) self.pi_loss_tf += self.prm_loss_weight * self.action_l2 * tf.reduce_mean( tf.square(self.main.pi_tf / self.max_u)) self.pi_loss_tf += self.aux_loss_weight * self.cloning_loss_tf else: #If not training with demonstrations self.pi_loss_tf = -tf.reduce_mean(self.main.Q_pi_tf) self.pi_loss_tf += self.action_l2 * tf.reduce_mean( tf.square(self.main.pi_tf / self.max_u)) Q_grads_tf = tf.gradients(self.Q_loss_tf, self._vars('main/Q')) pi_grads_tf = tf.gradients(self.pi_loss_tf, self._vars('main/pi')) assert len(self._vars('main/Q')) == len(Q_grads_tf) assert len(self._vars('main/pi')) == len(pi_grads_tf) self.Q_grads_vars_tf = zip(Q_grads_tf, self._vars('main/Q')) self.pi_grads_vars_tf = zip(pi_grads_tf, self._vars('main/pi')) self.Q_grad_tf = flatten_grads(grads=Q_grads_tf, var_list=self._vars('main/Q')) self.pi_grad_tf = flatten_grads(grads=pi_grads_tf, var_list=self._vars('main/pi')) # optimizers self.Q_adam = MpiAdam(self._vars('main/Q'), scale_grad_by_procs=False) self.pi_adam = MpiAdam(self._vars('main/pi'), scale_grad_by_procs=False) # polyak averaging self.main_vars = self._vars('main/Q') + self._vars('main/pi') self.target_vars = self._vars('target/Q') + self._vars('target/pi') self.stats_vars = self._global_vars('o_stats') + self._global_vars( 'g_stats') self.init_target_net_op = list( map(lambda v: v[0].assign(v[1]), zip(self.target_vars, self.main_vars))) self.update_target_net_op = list( map( lambda v: v[0].assign(self.polyak * v[0] + (1. - self.polyak) * v[1]), zip(self.target_vars, self.main_vars))) # initialize all variables tf.variables_initializer(self._global_vars('')).run() self._sync_optimizers() self._init_target_net()
def learn_att(env, q_func, seed=None, lr=5e-4, total_timesteps=100000,
              buffer_size=50000, exploration_fraction=0.1, exploration_final_eps=0.02,
              train_freq=1, batch_size=32, print_freq=100, checkpoint_freq=10000,
              checkpoint_path=None, learning_starts=1000, gamma=1.0,
              target_network_update_freq=500, prioritized_replay=False,
              prioritized_replay_alpha=0.6, prioritized_replay_beta0=0.4,
              prioritized_replay_beta_iters=None, prioritized_replay_eps=1e-6,
              param_noise=False, callback=None, load_path=None, **network_kwargs):
    # Create all the functions necessary to train the model
    sess = get_session()
    set_global_seeds(seed)
    # q_func = build_q_func(network, **network_kwargs) since no network setting

    # capture the shape outside the closure so that the env object is not serialized
    # by cloudpickle when serializing make_obs_ph
    observation_space = env.observation_space

    def make_obs_ph(name):
        return ObservationInput(observation_space, name=name)

    act, train, update_target, debug = build_train_att(
        make_obs_ph=make_obs_ph,
        q_func=q_func,
        num_actions=env.action_space.n,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        gamma=gamma,
        grad_norm_clipping=10,
        # add a mask function for the choice of actions
        # NOTE: the original source left this argument blank; None is only a placeholder
        # so the call parses -- the actual action-mask function must be supplied here.
        mask_func=None,
    )

    act_params = {
        'make_obs_ph': make_obs_ph,
        'q_func': q_func,
        'num_actions': env.action_space.n,
    }

    act = ActWrapper(act, act_params)

    # Create the replay buffer
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = total_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None
    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * total_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    # Initialize the parameters and copy them to the target network.
    U.initialize()
    update_target()

    episode_rewards = [0.0]
    saved_mean_reward = None
    obs = env.reset()
    reset = True

    with tempfile.TemporaryDirectory() as td:
        td = checkpoint_path or td

        model_file = os.path.join(td, "model")
        model_saved = False

        if tf.train.latest_checkpoint(td) is not None:
            load_variables(model_file)
            logger.log('Loaded model from {}'.format(model_file))
            model_saved = True
        elif load_path is not None:
            load_variables(load_path)
            logger.log('Loaded model from {}'.format(load_path))

        for t in range(total_timesteps):
            if callback is not None:
                if callback(locals(), globals()):
                    break
            # Take action and update exploration to the newest value
            kwargs = {}
            if not param_noise:
                update_eps = exploration.value(t)
                update_param_noise_threshold = 0.
            else:
                update_eps = 0.
                # Compute the threshold such that the KL divergence between perturbed and non-perturbed
                # policy is comparable to eps-greedy exploration with eps = exploration.value(t).
                # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017
                # for detailed explanation.
                update_param_noise_threshold = -np.log(1. - exploration.value(t) +
                                                       exploration.value(t) / float(env.action_space.n))
                kwargs['reset'] = reset
                kwargs['update_param_noise_threshold'] = update_param_noise_threshold
                kwargs['update_param_noise_scale'] = True
            action = act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0]
            env_action = action
            reset = False
            new_obs, rew, done, _ = env.step(env_action)
            # Store transition in the replay buffer.
            replay_buffer.add(obs, action, rew, new_obs, float(done))
            obs = new_obs

            episode_rewards[-1] += rew
            if done:
                obs = env.reset()
                episode_rewards.append(0.0)
                reset = True

            if t > learning_starts and t % train_freq == 0:
                # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                if prioritized_replay:
                    experience = replay_buffer.sample(batch_size, beta=beta_schedule.value(t))
                    (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience
                else:
                    obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size)
                    weights, batch_idxes = np.ones_like(rewards), None
                td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights)
                if prioritized_replay:
                    new_priorities = np.abs(td_errors) + prioritized_replay_eps
                    replay_buffer.update_priorities(batch_idxes, new_priorities)

            if t > learning_starts and t % target_network_update_freq == 0:
                # Update target network periodically.
                update_target()

            mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
            num_episodes = len(episode_rewards)
            if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
                logger.record_tabular("steps", t)
                logger.record_tabular("episodes", num_episodes)
                logger.record_tabular("mean 100 episode reward", mean_100ep_reward)
                logger.record_tabular("% time spent exploring", int(100 * exploration.value(t)))
                logger.dump_tabular()

            if (checkpoint_freq is not None and t > learning_starts and
                    num_episodes > 100 and t % checkpoint_freq == 0):
                if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward:
                    if print_freq is not None:
                        logger.log("Saving model due to mean reward increase: {} -> {}".format(
                            saved_mean_reward, mean_100ep_reward))
                    save_variables(model_file)
                    model_saved = True
                    saved_mean_reward = mean_100ep_reward
        if model_saved:
            if print_freq is not None:
                logger.log("Restored model with mean reward: {}".format(saved_mean_reward))
            load_variables(model_file)
    return act
def build_env(cloth_cfg_path=None, render_path=None, start_state_path=None, num_env=1, seed=1, alg='ddpg'): """Daniel: actually construct the env, using 'vector envs' for parallelism. For now our cloth env can follow the non-atari and non-retro stuff, because I don't think we need a similar kind of 'wrapping' that they do. Note that `VecFrameStack` is needed to stack frames, e.g., in Atari we do 4 frame stacking. Without that, the states would be size (84,84,1). The non-`args` parameters here are for the cloth env. """ #Adi: Need to modify the next section because no 'args' parameter ncpu = multiprocessing.cpu_count() if sys.platform == 'darwin': ncpu //= 2 #nenv = args.num_env or ncpu #alg = args.alg #seed = args.seed #env_type, env_id = get_env_type(args) env_type = 'cloth' env_id = 'cloth' if env_type in {'atari', 'retro'}: if alg == 'deepq': env = make_env(env_id, env_type, seed=seed, wrapper_kwargs={'frame_stack': True}) elif alg == 'trpo_mpi': env = make_env(env_id, env_type, seed=seed) else: frame_stack_size = 4 env = make_vec_env(env_id, env_type, nenv, seed, gamestate=args.gamestate, reward_scale=args.reward_scale) env = VecFrameStack(env, frame_stack_size) else: config = tf.ConfigProto(allow_soft_placement=True, intra_op_parallelism_threads=1, inter_op_parallelism_threads=1) config.gpu_options.allow_growth = True get_session(config=config) flatten_dict_observations = alg not in {'her'} #Adi: I don't think we want to make a vector environment for now because it's causing a lot of trouble temporarily.. let's just start with a single non-vec env #env = make_vec_env(env_id, env_type, num_env or 1, seed, # reward_scale=1, # flatten_dict_observations=flatten_dict_observations, # cloth_cfg_path=cloth_cfg_path, # render_path=render_path, # start_state_path=start_state_path) #Adi: I have to directly define a few more variables because we are now making a single environment instead of a vector environment #Adi: These values are subject to change mpi_rank = 0 subrank = 0 reward_scale = 1.0 gamestate = None wrapper_kwargs = None logger_dir = logger.get_dir() env = make_env(env_id=env_id, env_type=env_type, mpi_rank=mpi_rank, subrank=subrank, seed=seed, reward_scale=reward_scale, gamestate=gamestate, flatten_dict_observations=flatten_dict_observations, wrapper_kwargs=wrapper_kwargs, logger_dir=logger_dir, cloth_cfg_path=cloth_cfg_path, render_path=render_path, start_state_path=start_state_path) if env_type == 'mujoco': env = VecNormalize(env) return env
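# Hedged sketch (not part of build_env above): the frame-stacking wrapping the docstring
# refers to, shown for an Atari env. The env id 'PongNoFrameskip-v4' is an assumption;
# make_vec_env and VecFrameStack are the same helpers already used in build_env.
def _example_frame_stacked_env(nenv=4, seed=0):
    venv = make_vec_env('PongNoFrameskip-v4', 'atari', nenv, seed)
    # without VecFrameStack observations would be (84, 84, 1); stacking 4 frames gives (84, 84, 4)
    return VecFrameStack(venv, 4)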
def __init__(self, policy, env, nsteps, ent_coef=0.01, vf_coef=0.5, max_grad_norm=0.5, lr=7e-4, alpha=0.99, epsilon=1e-5, total_timesteps=int(80e6), lrschedule='linear'): sess = tf_util.get_session() nenvs = env.num_envs nbatch = nenvs * nsteps with tf.variable_scope('a2c_model', reuse=tf.AUTO_REUSE): # step_model is used for sampling step_model = policy(nenvs, 1, sess) # train_model is used to train our network train_model = policy(nbatch, nsteps, sess) A = tf.placeholder(train_model.action.dtype, train_model.action.shape) ADV = tf.placeholder(tf.float32, [nbatch]) R = tf.placeholder(tf.float32, [nbatch]) LR = tf.placeholder(tf.float32, []) # Calculate the loss # Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss # Policy loss neglogpac = train_model.pd.neglogp(A) # L = A(s,a) * -logpi(a|s) pg_loss = tf.reduce_mean(ADV * neglogpac) # Entropy is used to improve exploration by limiting the premature convergence to suboptimal policy. entropy = tf.reduce_mean(train_model.pd.entropy()) # Value loss vf_loss = losses.mean_squared_error(tf.squeeze(train_model.vf), R) loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef # Update parameters using loss # 1. Get the model parameters params = find_trainable_variables("a2c_model") # 2. Calculate the gradients grads = tf.gradients(loss, params) if max_grad_norm is not None: # Clip the gradients (normalize) grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm) grads = list(zip(grads, params)) # zip aggregate each gradient with parameters associated # For instance zip(ABCD, xyza) => Ax, By, Cz, Da # 3. Make op for one policy and value update step of A2C trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=alpha, epsilon=epsilon) _train = trainer.apply_gradients(grads) lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule) def train(obs, states, rewards, masks, actions, values): # Here we calculate advantage A(s,a) = R + yV(s') - V(s) # rewards = R + yV(s') advs = rewards - values for step in range(len(obs)): cur_lr = lr.value() td_map = { train_model.X: obs, A: actions, ADV: advs, R: rewards, LR: cur_lr } if states is not None: td_map[train_model.S] = states td_map[train_model.M] = masks policy_loss, value_loss, policy_entropy, _ = sess.run( [pg_loss, vf_loss, entropy, _train], td_map) return policy_loss, value_loss, policy_entropy self.train = train self.train_model = train_model self.step_model = step_model self.step = step_model.step self.value = step_model.value self.initial_state = step_model.initial_state self.save = functools.partial(tf_util.save_variables, sess=sess) self.load = functools.partial(tf_util.load_variables, sess=sess) tf.global_variables_initializer().run(session=sess)
def __init__(self, *, policy, ob_space, ac_space, nbatch_act, nbatch_train, nsteps, ent_coef, vf_coef, max_grad_norm): sess = get_session() with tf.variable_scope('ppo2_model', reuse=tf.AUTO_REUSE): # CREATE OUR TWO MODELS # act_model that is used for sampling act_model = policy(nbatch_act, 1, sess) # Train model for training train_model = policy(nbatch_train, nsteps, sess) # CREATE THE PLACEHOLDERS A = train_model.pdtype.sample_placeholder([None]) ADV = tf.placeholder(tf.float32, [None]) R = tf.placeholder(tf.float32, [None]) # Keep track of old actor OLDNEGLOGPAC = tf.placeholder(tf.float32, [None]) # Keep track of old critic OLDVPRED = tf.placeholder(tf.float32, [None]) LR = tf.placeholder(tf.float32, []) # Cliprange CLIPRANGE = tf.placeholder(tf.float32, []) neglogpac = train_model.pd.neglogp(A) # Calculate the entropy # Entropy is used to improve exploration by limiting the premature convergence to suboptimal policy. entropy = tf.reduce_mean(train_model.pd.entropy()) # CALCULATE THE LOSS # Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss # Clip the value to reduce variability during Critic training # Get the predicted value vpred = train_model.vf vpredclipped = OLDVPRED + tf.clip_by_value(train_model.vf - OLDVPRED, -CLIPRANGE, CLIPRANGE) # Unclipped value vf_losses1 = tf.square(vpred - R) # Clipped value vf_losses2 = tf.square(vpredclipped - R) vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2)) # Calculate ratio (pi current policy / pi old policy) ratio = tf.exp(OLDNEGLOGPAC - neglogpac) # Defining Loss = - J is equivalent to max J pg_losses = -ADV * ratio pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE) # Final PG loss pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2)) approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC)) clipfrac = tf.reduce_mean( tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE))) # Total loss loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef # UPDATE THE PARAMETERS USING LOSS # 1. Get the model parameters params = tf.trainable_variables('ppo2_model') # 2. Build our trainer trainer = MpiAdamOptimizer(MPI.COMM_WORLD, learning_rate=LR, epsilon=1e-5) # 3. 
Calculate the gradients grads_and_var = trainer.compute_gradients(loss, params) grads, var = zip(*grads_and_var) if max_grad_norm is not None: # Clip the gradients (normalize) grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm) grads_and_var = list(zip(grads, var)) # zip aggregate each gradient with parameters associated # For instance zip(ABCD, xyza) => Ax, By, Cz, Da _train = trainer.apply_gradients(grads_and_var) def train(lr, cliprange, obs, returns, masks, actions, values, neglogpacs, states=None): # Here we calculate advantage A(s,a) = R + yV(s') - V(s) # Returns = R + yV(s') advs = returns - values # Normalize the advantages advs = (advs - advs.mean()) / (advs.std() + 1e-8) td_map = { train_model.X: obs, A: actions, ADV: advs, R: returns, LR: lr, CLIPRANGE: cliprange, OLDNEGLOGPAC: neglogpacs, OLDVPRED: values } if states is not None: td_map[train_model.S] = states td_map[train_model.M] = masks return sess.run( [pg_loss, vf_loss, entropy, approxkl, clipfrac, _train], td_map)[:-1] self.loss_names = [ 'policy_loss', 'value_loss', 'policy_entropy', 'approxkl', 'clipfrac' ] '''declare the defined function in init''' self.train = train self.train_model = train_model self.act_model = act_model self.step = act_model.step self.value = act_model.value self.initial_state = act_model.initial_state self.save = functools.partial(save_variables, sess=sess) self.load = functools.partial(load_variables, sess=sess) def load_ini(load_path): """ Load the model """ # variables = tf.contrib.framework.get_variables_to_restore() # non_actor = [v for v in variables if v.name.split('/')[0]!='actor'] # saver = tf.train.Saver(non_actor) # print('Loading ' + load_path) # saver.restore(sess, load_path) for v in tf.get_default_graph().as_graph_def().node: print(v.name) '''Initialize actor policy with supervised policy!''' try: # from the ddpg tensor graph: actor, critic, target_actor, target_critic actor_var_list = tf.contrib.framework.get_variables( 'ppo2_model/pi') except: print('Cannot get variables list!') print('actor_var:', actor_var_list) try: actor_saver = tf.train.Saver(actor_var_list) actor_saver.restore(sess, load_path) print('Actor Load Succeed!') except: print('Actor Load Failed!') # sess.run(self.target_init_updates) self.load_ini = load_ini if MPI.COMM_WORLD.Get_rank() == 0: initialize() global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="") sync_from_root(sess, global_variables) #pylint: disable=E1101
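# Illustrative sketch (not from the original class): the clipped surrogate policy loss
# built in the graph above, reproduced in NumPy for a single batch so the clipping
# behaviour is easy to inspect. All names below are local assumptions for this example.
import numpy as np

def ppo_clipped_policy_loss(neglogpac, old_neglogpac, adv, cliprange):
    """Return the PPO policy loss for one batch of samples."""
    ratio = np.exp(old_neglogpac - neglogpac)        # pi_new(a|s) / pi_old(a|s)
    unclipped = -adv * ratio
    clipped = -adv * np.clip(ratio, 1.0 - cliprange, 1.0 + cliprange)
    return np.mean(np.maximum(unclipped, clipped))   # pessimistic bound (max of the two losses)

# Example: a sample whose ratio moved far outside the clip range contributes the clipped
# value, so its gradient through the ratio would be zero.
loss = ppo_clipped_policy_loss(neglogpac=np.array([0.5]), old_neglogpac=np.array([2.0]),
                               adv=np.array([1.0]), cliprange=0.2)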
def learn(env, network, seed=None, lr=5e-4, total_timesteps=100000, buffer_size=50000, exploration_fraction=0.1, exploration_final_eps=0.02, train_freq=1, batch_size=32, print_freq=100, checkpoint_freq=10000, checkpoint_path=None, learning_starts=1000, gamma=1.0, target_network_update_freq=500, prioritized_replay=False, prioritized_replay_alpha=0.6, prioritized_replay_beta0=0.4, prioritized_replay_beta_iters=None, prioritized_replay_eps=1e-6, param_noise=False, callback=None, load_path=None, **network_kwargs): """Train a deepq model. Parameters ------- env: gym.Env environment to train on network: string or a function neural network to use as a q function approximator. If string, has to be one of the names of registered models in baselines.common.models (mlp, cnn, conv_only). If a function, should take an observation tensor and return a latent variable tensor, which will be mapped to the Q function heads (see build_q_func in baselines.deepq.models for details on that) seed: int or None prng seed. The runs with the same seed "should" give the same results. If None, no seeding is used. lr: float learning rate for adam optimizer total_timesteps: int number of env steps to optimizer for buffer_size: int size of the replay buffer exploration_fraction: float fraction of entire training period over which the exploration rate is annealed exploration_final_eps: float final value of random action probability train_freq: int update the model every `train_freq` steps. set to None to disable printing batch_size: int size of a batched sampled from replay buffer for training print_freq: int how often to print out training progress set to None to disable printing checkpoint_freq: int how often to save the model. This is so that the best version is restored at the end of the training. If you do not wish to restore the best version at the end of the training set this variable to None. learning_starts: int how many steps of the model to collect transitions for before learning starts gamma: float discount factor target_network_update_freq: int update the target network every `target_network_update_freq` steps. prioritized_replay: True if True prioritized replay buffer will be used. prioritized_replay_alpha: float alpha parameter for prioritized replay buffer prioritized_replay_beta0: float initial value of beta for prioritized replay buffer prioritized_replay_beta_iters: int number of iterations over which beta will be annealed from initial value to 1.0. If set to None equals to total_timesteps. prioritized_replay_eps: float epsilon to add to the TD errors when updating priorities. param_noise: bool whether or not to use parameter space noise (https://arxiv.org/abs/1706.01905) callback: (locals, globals) -> None function called at every steps with state of the algorithm. If callback returns true training stops. load_path: str path to load the model from. (default: None) **network_kwargs additional keyword arguments to pass to the network builder. Returns ------- act: ActWrapper Wrapper over act function. Adds ability to save it and load it. See header of baselines/deepq/categorical.py for details on the act function. 
""" # Create all the functions necessary to train the model sess = get_session() set_global_seeds(seed) q_func = build_q_func(network, **network_kwargs) # capture the shape outside the closure so that the env object is not serialized # by cloudpickle when serializing make_obs_ph observation_space = env.observation_space def make_obs_ph(name): return ObservationInput(observation_space, name=name) act, train, update_target, debug = deepq.build_train( make_obs_ph=make_obs_ph, q_func=q_func, num_actions=env.action_space.n, optimizer=tf.train.AdamOptimizer(learning_rate=lr), gamma=gamma, grad_norm_clipping=10, param_noise=param_noise) act_params = { 'make_obs_ph': make_obs_ph, 'q_func': q_func, 'num_actions': env.action_space.n, } act = ActWrapper(act, act_params) # Create the replay buffer if prioritized_replay: replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha) if prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = total_timesteps beta_schedule = LinearSchedule(prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) else: replay_buffer = ReplayBuffer(buffer_size) beta_schedule = None # Create the schedule for exploration starting from 1. exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * total_timesteps), initial_p=1.0, final_p=exploration_final_eps) # Initialize the parameters and copy them to the target network. U.initialize() update_target() episode_rewards = [0.0] saved_mean_reward = None obs = env.reset() reset = True with tempfile.TemporaryDirectory() as td: td = checkpoint_path or td model_file = os.path.join(td, "model") model_saved = False if tf.train.latest_checkpoint(td) is not None: load_variables(model_file) logger.log('Loaded model from {}'.format(model_file)) model_saved = True elif load_path is not None: load_variables(load_path) logger.log('Loaded model from {}'.format(load_path)) for t in range(total_timesteps): if callback is not None: if callback(locals(), globals()): break # Take action and update exploration to the newest value kwargs = {} if not param_noise: update_eps = exploration.value(t) update_param_noise_threshold = 0. else: update_eps = 0. # Compute the threshold such that the KL divergence between perturbed and non-perturbed # policy is comparable to eps-greedy exploration with eps = exploration.value(t). # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017 # for detailed explanation. update_param_noise_threshold = -np.log(1. - exploration.value( t) + exploration.value(t) / float(env.action_space.n)) kwargs['reset'] = reset kwargs[ 'update_param_noise_threshold'] = update_param_noise_threshold kwargs['update_param_noise_scale'] = True action = act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0] env_action = action reset = False new_obs, rew, done, _ = env.step(env_action) # Store transition in the replay buffer. replay_buffer.add(obs, action, rew, new_obs, float(done)) obs = new_obs episode_rewards[-1] += rew if done: obs = env.reset() episode_rewards.append(0.0) reset = True if t > learning_starts and t % train_freq == 0: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. 
if prioritized_replay: experience = replay_buffer.sample( batch_size, beta=beta_schedule.value(t)) (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience else: obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample( batch_size) weights, batch_idxes = np.ones_like(rewards), None td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights) if prioritized_replay: new_priorities = np.abs(td_errors) + prioritized_replay_eps replay_buffer.update_priorities(batch_idxes, new_priorities) if t > learning_starts and t % target_network_update_freq == 0: # Update target network periodically. update_target() mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1) num_episodes = len(episode_rewards) if done and print_freq is not None and len( episode_rewards) % print_freq == 0: logger.record_tabular("steps", t) logger.record_tabular("episodes", num_episodes) logger.record_tabular("mean 100 episode reward", mean_100ep_reward) logger.record_tabular("% time spent exploring", int(100 * exploration.value(t))) logger.dump_tabular() if (checkpoint_freq is not None and t > learning_starts and num_episodes > 100 and t % checkpoint_freq == 0): if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward: if print_freq is not None: logger.log( "Saving model due to mean reward increase: {} -> {}" .format(saved_mean_reward, mean_100ep_reward)) save_variables(model_file) model_saved = True saved_mean_reward = mean_100ep_reward if model_saved: if print_freq is not None: logger.log("Restored model with mean reward: {}".format( saved_mean_reward)) load_variables(model_file) return act
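# Hedged usage sketch (not part of the original module): how a learn() call of this form
# is typically invoked on a Gym environment. The gym import, the 'CartPole-v0' id, and the
# 'mlp' network name are assumptions; learn() itself is the function defined above.
import gym

def _example_deepq_run():
    env = gym.make('CartPole-v0')
    act = learn(env,
                network='mlp',
                seed=0,
                total_timesteps=10000,
                buffer_size=5000,
                exploration_fraction=0.1,
                exploration_final_eps=0.02,
                print_freq=10)
    env.close()
    return act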
def _create_network(self, reuse=False): logger.info("Creating a DDPG agent with action space %d x %s..." % (self.dimu, self.max_u)) self.sess = tf_util.get_session() # running averages with tf.variable_scope('o_stats') as vs: if reuse: vs.reuse_variables() self.o_stats = Normalizer(self.dimo, self.norm_eps, self.norm_clip, sess=self.sess) with tf.variable_scope('g_stats') as vs: if reuse: vs.reuse_variables() self.g_stats = Normalizer(self.dimg, self.norm_eps, self.norm_clip, sess=self.sess) # mini-batch sampling. batch = self.staging_tf.get() batch_tf = OrderedDict([ (key, batch[i]) for i, key in enumerate(self.stage_shapes.keys()) ]) batch_tf['r'] = tf.reshape(batch_tf['r'], [-1, 1]) #choose only the demo buffer samples mask = np.concatenate( (np.zeros(self.batch_size - self.demo_batch_size), np.ones(self.demo_batch_size)), axis=0) # networks with tf.variable_scope('main') as vs: if reuse: vs.reuse_variables() self.main = self.create_naf_network(batch_tf, net_type='main', **self.__dict__) vs.reuse_variables() with tf.variable_scope('target') as vs: if reuse: vs.reuse_variables() target_batch_tf = batch_tf.copy() target_batch_tf['o'] = batch_tf['o_2'] target_batch_tf['g'] = batch_tf['g_2'] self.target = self.create_naf_network(target_batch_tf, net_type='target', **self.__dict__) vs.reuse_variables() assert len(self._vars("main")) == len(self._vars("target")) # loss functions target_value = self.target.value clip_range = (-self.clip_return, 0. if self.clip_pos_returns else np.inf) target_tf = tf.clip_by_value(batch_tf['r'] + self.gamma * target_value, *clip_range) self.Q_loss_tf = tf.reduce_mean(tf.square(target_tf - self.main.Q)) tf.summary.histogram("Q_loss", self.Q_loss_tf) Q_grads_tf = tf.gradients(self.Q_loss_tf, self._vars('main')) assert len(self._vars('main')) == len(Q_grads_tf) self.Q_grads_vars_tf = zip(Q_grads_tf, self._vars('main')) self.Q_grad_tf = flatten_grads(grads=Q_grads_tf, var_list=self._vars('main')) # optimizers self.adam = MpiAdam(self._vars('main'), scale_grad_by_procs=False) # polyak averaging self.main_vars = self._vars('main') self.target_vars = self._vars('target') self.stats_vars = self._global_vars('o_stats') + self._global_vars( 'g_stats') self.init_target_net_op = list( map(lambda v: v[0].assign(v[1]), zip(self.target_vars, self.main_vars))) self.update_target_net_op = list( map( lambda v: v[0].assign(self.polyak * v[0] + (1. - self.polyak) * v[1]), zip(self.target_vars, self.main_vars))) # initialize all variables tf.variables_initializer(self._global_vars('')).run() self._sync_optimizers() self._init_target_net() visualize = True if visualize: writer = tf.summary.FileWriter("output", self.sess.graph) writer.close() saver = tf.train.Saver() saver.save(self.sess, "./models/model.ckpt")
def learn( save_path, network, env, seed=None, total_timesteps=None, nb_epochs=None, # with default settings, perform 1M steps total nb_epoch_cycles=7, #50 nb_rollout_steps=3, #100 reward_scale=1.0, render=False, render_eval=False, # noise_type='adaptive-param_0.2', # noise_type='normal_0.2', # large noise # noise_type='normal_0.02', # small noise noise_type='normal_2.0', # action ranges 360, so noise scale should be chosen properly # noise_type='normal_5', # large noise # noise_type='normal_0.2', # small noise # noise_type='normal_0.00001', # no noise # noise_type='ou_0.9', normalize_returns=False, normalize_observations=True, critic_l2_reg=1e-2, actor_lr=1e-4, # large lr critic_lr=1e-3, # large lr # actor_lr=1e-7, # small lr # critic_lr=1e-3, # small lr # actor_lr = 1e-10, # no lr # critic_lr=1e-10, # no lr popart=False, gamma=0.99, clip_norm=None, nb_train_steps=3, # per epoch cycle and MPI worker, 50 nb_eval_steps=1, #100 batch_size=640, # per MPI worker tau=0.01, eval_env=None, param_noise_adaption_interval=3, #50 **network_kwargs): if total_timesteps is not None: assert nb_epochs is None nb_epochs = int(total_timesteps) // (nb_epoch_cycles * nb_rollout_steps) else: nb_epochs = 500 rank = MPI.COMM_WORLD.Get_rank() nb_actions = env.num_actions action_shape = np.array(nb_actions * [0]).shape #4 pairs pos + 3 link length # nb_features = 2*(env.num_actions+1)+env.num_actions #4 pairs pos + 1 pair target pos nb_features = 2 * (env.num_actions + 2) observation_shape = np.array(nb_features * [0]).shape # assert (np.abs(env.action_space.low) == env.action_space.high).all() # we assume symmetric actions. # memory = Memory(limit=int(1e6), action_shape=env.action_space.shape, observation_shape=env.observation_space.shape) memory = Memory(limit=int(1e6), action_shape=action_shape, observation_shape=observation_shape) critic = Critic(network=network, **network_kwargs) actor = Actor(nb_actions, network=network, **network_kwargs) action_noise = None param_noise = None # nb_actions = env.action_space.shape[-1] if noise_type is not None: for current_noise_type in noise_type.split(','): current_noise_type = current_noise_type.strip() if current_noise_type == 'none': pass elif 'adaptive-param' in current_noise_type: _, stddev = current_noise_type.split('_') param_noise = AdaptiveParamNoiseSpec( initial_stddev=float(stddev), desired_action_stddev=float(stddev)) elif 'normal' in current_noise_type: _, stddev = current_noise_type.split('_') action_noise = NormalActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions)) elif 'ou' in current_noise_type: _, stddev = current_noise_type.split('_') action_noise = OrnsteinUhlenbeckActionNoise( mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions)) else: raise RuntimeError( 'unknown noise type "{}"'.format(current_noise_type)) # max_action = env.action_space.high # logger.info('scaling actions by {} before executing in env'.format(max_action)) # agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape, agent = DDPG(actor, critic, memory, observation_shape, action_shape, gamma=gamma, tau=tau, normalize_returns=normalize_returns, normalize_observations=normalize_observations, batch_size=batch_size, action_noise=action_noise, param_noise=param_noise, critic_l2_reg=critic_l2_reg, actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart, clip_norm=clip_norm, reward_scale=reward_scale) logger.info('Using agent with the following configuration:') logger.info(str(agent.__dict__.items())) 
eval_episode_rewards_history = deque(maxlen=100) episode_rewards_history = deque(maxlen=100) sess = U.get_session() # Prepare everything. agent.initialize(sess) # sess.graph.finalize() agent.reset() obs = env.reset() if eval_env is not None: eval_obs = eval_env.reset() nenvs = obs.shape[0] episode_reward = np.zeros(nenvs, dtype=np.float32) #vector episode_step = np.zeros(nenvs, dtype=int) # vector episodes = 0 #scalar t = 0 # scalar step_set = [] reward_set = [] epoch = 0 start_time = time.time() epoch_episode_rewards = [] mean_epoch_episode_rewards = [] epoch_episode_steps = [] epoch_actions = [] epoch_qs = [] episode_end_distance = [] epoch_episodes = 0 SPARSE_REWARD = False '''add this line to make non-initialized to be initialized''' agent.load_ini(sess, save_path) for epoch in range(nb_epochs): print('epochs: ', epoch) obs = env.reset() agent.save(save_path) epoch_episode_rewards = [] for cycle in range(nb_epoch_cycles): # Perform rollouts. if nenvs > 1: # if simulating multiple envs in parallel, impossible to reset agent at the end of the episode in each # of the environments, so resetting here instead agent.reset() for t_rollout in range(nb_rollout_steps): # Predict next action. action, q, _, _ = agent.step(obs, apply_noise=True, compute_Q=True) # print('action:', action) if SPARSE_REWARD: new_obs, r, done, end_distance = env.step( action, SPARSE_REWARD) else: new_obs, r, done = env.step(action, SPARSE_REWARD) t += 1 episode_reward += r episode_step += 1 # print('episode_re: ', episode_reward) #[1.] # Book-keeping. epoch_actions.append(action) epoch_qs.append(q) b = 1. agent.store_transition( obs, action, r, new_obs, done ) #the batched data will be unrolled in memory.py's append. # print('r: ', r) # '''r shape: (1,)''' obs = new_obs epoch_episode_rewards.append(episode_reward) if cycle == nb_epoch_cycles - 1: # record the distance from the end position of reacher to the goal for the last step of each episode if SPARSE_REWARD: episode_end_distance.append(end_distance) else: end_distance = 100.0 / r - 1 episode_end_distance.append(end_distance[0]) episode_reward = np.zeros(nenvs, dtype=np.float32) #vector # Train. epoch_actor_losses = [] epoch_critic_losses = [] epoch_adaptive_distances = [] # filling memory with noised initialized policy & preupdate the critic networks preheating_step = 30 #50 episode = 600 steps, 12 steps per episode if epoch > preheating_step: # print('memory_entries: ',memory.nb_entries) for t_train in range(nb_train_steps): # Adapt param noise, if necessary. if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0: distance = agent.adapt_param_noise() epoch_adaptive_distances.append(distance) # print('Train!') cl, al = agent.train() epoch_critic_losses.append(cl) epoch_actor_losses.append(al) agent.update_target_net() else: # update two critic networks at start cl = agent.update_critic() epoch_critic_losses.append(cl) print('critic loss in initial training: ', cl) pass # Evaluate. 
eval_episode_rewards = [] eval_qs = [] if eval_env is not None: nenvs_eval = eval_obs.shape[0] eval_episode_reward = np.zeros(nenvs_eval, dtype=np.float32) for t_rollout in range(nb_eval_steps): eval_action, eval_q, _, _ = agent.step(eval_obs, apply_noise=False, compute_Q=True) # eval_obs, eval_r, eval_done, eval_info = eval_env.step(max_action * eval_action) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) eval_obs, eval_r, eval_done, eval_info = eval_env.step( eval_action) if render_eval: eval_env.render() eval_episode_reward += eval_r eval_qs.append(eval_q) for d in range(len(eval_done)): if eval_done[d]: eval_episode_rewards.append(eval_episode_reward[d]) eval_episode_rewards_history.append( eval_episode_reward[d]) eval_episode_reward[d] = 0.0 mpi_size = MPI.COMM_WORLD.Get_size() # Log stats. # XXX shouldn't call np.mean on variable length lists duration = time.time() - start_time stats = agent.get_stats() combined_stats = stats.copy() combined_stats['rollout/return'] = np.mean(epoch_episode_rewards) combined_stats['rollout/return_history'] = np.mean( episode_rewards_history) combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps) combined_stats['rollout/actions_mean'] = np.mean(epoch_actions) combined_stats['rollout/Q_mean'] = np.mean(epoch_qs) combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses) combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses) combined_stats['train/param_noise_distance'] = np.mean( epoch_adaptive_distances) combined_stats['total/duration'] = duration combined_stats['total/steps_per_second'] = float(t) / float(duration) combined_stats['total/episodes'] = episodes combined_stats['rollout/episodes'] = epoch_episodes combined_stats['rollout/actions_std'] = np.std(epoch_actions) mean_epoch_episode_rewards.append(np.mean(epoch_episode_rewards)) # print(step_set,mean_epoch_episode_rewards) step_set.append(t) plt.figure(1) plt.plot(step_set, mean_epoch_episode_rewards) plt.xlabel('Steps') plt.ylabel('Mean Episode Reward') plt.savefig('ddpg_mean.png') plt.figure(2) plt.plot(step_set, episode_end_distance) plt.xlabel('Steps') plt.ylabel('Distance to Target') plt.savefig('ddpgini_distance.png') # plt.show() # Evaluation statistics. if eval_env is not None: combined_stats['eval/return'] = eval_episode_rewards combined_stats['eval/return_history'] = np.mean( eval_episode_rewards_history) combined_stats['eval/Q'] = eval_qs combined_stats['eval/episodes'] = len(eval_episode_rewards) def as_scalar(x): if isinstance(x, np.ndarray): assert x.size == 1 return x[0] elif np.isscalar(x): return x else: raise ValueError('expected scalar, got %s' % x) combined_stats_sums = MPI.COMM_WORLD.allreduce( np.array( [np.array(x).flatten()[0] for x in combined_stats.values()])) combined_stats = { k: v / mpi_size for (k, v) in zip(combined_stats.keys(), combined_stats_sums) } # Total statistics. 
combined_stats['total/epochs'] = epoch + 1 combined_stats['total/steps'] = t for key in sorted(combined_stats.keys()): logger.record_tabular(key, combined_stats[key]) if rank == 0: logger.dump_tabular() logger.info('') logdir = logger.get_dir() if rank == 0 and logdir: if hasattr(env, 'get_state'): with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f: pickle.dump(env.get_state(), f) if eval_env and hasattr(eval_env, 'get_state'): with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f: pickle.dump(eval_env.get_state(), f) print('stepset: ', step_set) print('rewards: ', mean_epoch_episode_rewards) print('distances: ', episode_end_distance) return agent
def testing( save_path, network, env, seed=None, total_timesteps=None, nb_epochs=None, # with default settings, perform 1M steps total nb_epoch_cycles=50, nb_rollout_steps=3, reward_scale=1.0, render=False, render_eval=False, # no noise for test # noise_type='adaptive-param_0.2', # noise_type='normal_0.9', # noise_type='ou_0.9', normalize_returns=False, normalize_observations=True, critic_l2_reg=1e-2, actor_lr=1e-4, critic_lr=1e-3, # actor_lr=1e-6, # critic_lr=1e-5, popart=False, gamma=0.99, clip_norm=None, nb_train_steps=3, # per epoch cycle and MPI worker, 50 nb_eval_steps=1, batch_size=64, # per MPI worker tau=0.01, eval_env=None, param_noise_adaption_interval=3, # **network_kwargs): if total_timesteps is not None: assert nb_epochs is None nb_epochs = int(total_timesteps) // (nb_epoch_cycles * nb_rollout_steps) else: nb_epochs = 500 rank = MPI.COMM_WORLD.Get_rank() # nb_actions = env.action_space.shape[-1] # nb_actions = 2*env.grid_size nb_actions = env.grid_size action_shape = np.array(nb_actions * [0]).shape nb_features = (4 + 1) * env.grid_size observation_shape = np.array(nb_features * [0]).shape grid_x = env.grid_x grid_y = env.grid_y x = [] y = [] for i in range(grid_x): x.append(i + 1) for i in range(grid_y): y.append(i + 1) # assert (np.abs(env.action_space.low) == env.action_space.high).all() # we assume symmetric actions. # memory = Memory(limit=int(1e6), action_shape=env.action_space.shape, observation_shape=env.observation_space.shape) memory = Memory(limit=int(1e6), action_shape=action_shape, observation_shape=observation_shape) critic = Critic(network=network, **network_kwargs) actor = Actor(nb_actions, network=network, **network_kwargs) action_noise = None param_noise = None '''no noise for test''' # if noise_type is not None: # for current_noise_type in noise_type.split(','): # current_noise_type = current_noise_type.strip() # if current_noise_type == 'none': # pass # elif 'adaptive-param' in current_noise_type: # _, stddev = current_noise_type.split('_') # param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev), desired_action_stddev=float(stddev)) # elif 'normal' in current_noise_type: # _, stddev = current_noise_type.split('_') # action_noise = NormalActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions)) # elif 'ou' in current_noise_type: # _, stddev = current_noise_type.split('_') # action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions)) # else: # raise RuntimeError('unknown noise type "{}"'.format(current_noise_type)) # max_action = env.action_space.high # logger.info('scaling actions by {} before executing in env'.format(max_action)) # agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape, agent = DDPG(actor, critic, memory, observation_shape, action_shape, gamma=gamma, tau=tau, normalize_returns=normalize_returns, normalize_observations=normalize_observations, batch_size=batch_size, action_noise=action_noise, param_noise=param_noise, critic_l2_reg=critic_l2_reg, actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart, clip_norm=clip_norm, reward_scale=reward_scale) logger.info('Using agent with the following configuration:') logger.info(str(agent.__dict__.items())) eval_episode_rewards_history = deque(maxlen=100) episode_rewards_history = deque(maxlen=100) sess = U.get_session() # Prepare everything. 
# agent.initialize(sess) # sess.graph.finalize() agent.load(sess, save_path) agent.reset() obs, env_state = env.reset() if eval_env is not None: eval_obs = eval_env.reset() nenvs = obs.shape[0] episode_reward = np.zeros(nenvs, dtype=np.float32) #vector episode_step = np.zeros(nenvs, dtype=int) # vector episodes = 0 #scalar t = 0 # scalar step_set = [] reward_set = [] epoch = 0 start_time = time.time() epoch_episode_rewards = [] average_reward = [] mean_epoch_episode_rewards = [] epoch_episode_steps = [] epoch_actions = [] epoch_qs = [] epoch_state = [] epoch_episodes = 0 #record the car numbers in each step car_num_set = {} t_set = [i for i in range(total_timesteps)] for xx in x: for yy in y: lab = str(xx) + str(yy) car_num_set[lab] = [[0 for i in range(total_timesteps)] for j in range(4)] for epoch in range(nb_epochs): obs, env_state = env.reset() epoch_actions = [] epoch_state = [] average_car_num_set = [] last_action = 1 for cycle in range(nb_epoch_cycles): # Perform rollouts. action, q, _, _ = agent.step(obs, apply_noise=False, compute_Q=True) '''random action''' # if np.random.rand()>0.5: # action=[1] # else: # action=[0] '''cycle light state''' # action=[0] '''cycle action (should cycle state instead of action)''' # if last_action==1: # action=[0] # else: # action=[1] # last_action=action[0] if nenvs > 1: # if simulating multiple envs in parallel, impossible to reset agent at the end of the episode in each # of the environments, so resetting here instead agent.reset() for t_rollout in range(nb_rollout_steps): new_obs, r, env_state, done = env.step(action, env_state) epoch_state.append(env_state['11'].light_state) for xx in x: for yy in y: lab = str(xx) + str(yy) for i in range(4): car_num_set[lab][i][t] = ( env_state['11'].car_nums[i]) t += 1 episode_reward += r episode_step += 1 # Book-keeping. epoch_actions.append(action) epoch_qs.append(q) b = 1. agent.store_transition( obs, action, r, new_obs, done ) #the batched data will be unrolled in memory.py's append. obs = new_obs for d in range(len(done)): if done[d]: print('done') # Episode done. epoch_episode_rewards.append(episode_reward[d]) episode_rewards_history.append(episode_reward[d]) epoch_episode_steps.append(episode_step[d]) episode_reward[d] = 0. episode_step[d] = 0 epoch_episodes += 1 episodes += 1 if nenvs == 1: agent.reset() epoch_episode_rewards.append(episode_reward) average_reward.append(episode_reward / nb_rollout_steps) episode_reward = np.zeros(nenvs, dtype=np.float32) #vector # Train. epoch_actor_losses = [] epoch_critic_losses = [] epoch_adaptive_distances = [] # for t_train in range(nb_train_steps): # # Adapt param noise, if necessary. # if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0: # distance = agent.adapt_param_noise() # epoch_adaptive_distances.append(distance) # # print('Train!') # cl, al = agent.train() # epoch_critic_losses.append(cl) # epoch_actor_losses.append(al) # agent.update_target_net() # Evaluate. 
eval_episode_rewards = [] eval_qs = [] if eval_env is not None: nenvs_eval = eval_obs.shape[0] eval_episode_reward = np.zeros(nenvs_eval, dtype=np.float32) for t_rollout in range(nb_eval_steps): eval_action, eval_q, _, _ = agent.step(eval_obs, apply_noise=False, compute_Q=True) # eval_obs, eval_r, eval_done, eval_info = eval_env.step(max_action * eval_action) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) eval_obs, eval_r, eval_done, eval_info = eval_env.step( eval_action) if render_eval: eval_env.render() eval_episode_reward += eval_r eval_qs.append(eval_q) for d in range(len(eval_done)): if eval_done[d]: eval_episode_rewards.append(eval_episode_reward[d]) eval_episode_rewards_history.append( eval_episode_reward[d]) eval_episode_reward[d] = 0.0 step_set.append(t) mpi_size = MPI.COMM_WORLD.Get_size() # Log stats. # XXX shouldn't call np.mean on variable length lists duration = time.time() - start_time stats = agent.get_stats() combined_stats = stats.copy() combined_stats['rollout/return'] = np.mean(epoch_episode_rewards) combined_stats['rollout/return_history'] = np.mean( episode_rewards_history) combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps) combined_stats['rollout/actions_mean'] = np.mean(epoch_actions) combined_stats['rollout/Q_mean'] = np.mean(epoch_qs) combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses) combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses) combined_stats['train/param_noise_distance'] = np.mean( epoch_adaptive_distances) combined_stats['total/duration'] = duration combined_stats['total/steps_per_second'] = float(t) / float(duration) combined_stats['total/episodes'] = episodes combined_stats['rollout/episodes'] = epoch_episodes combined_stats['rollout/actions_std'] = np.std(epoch_actions) mean_epoch_episode_rewards.append(np.mean(epoch_episode_rewards)) # print(step_set,mean_epoch_episode_rewards) # plt.figure(figsize=(8,5)) '''plot rewards-steps''' ax1 = plt.subplot(2, 1, 1) plt.sca(ax1) plt.plot(step_set, average_reward, color='b') # plt.xlabel('Steps') plt.ylabel('Mean Reward', fontsize=12) # plt.ylim(-15000,0) '''plot queueing car numbers-steps''' ax2 = plt.subplot(2, 1, 2) plt.sca(ax2) print(np.shape(t_set), np.shape(car_num_set['11'][i])) for i in range(4): if i == 0: plt.plot(t_set, car_num_set['11'][i], '--', label=i, color='b') elif i == 1: plt.plot(t_set, car_num_set['11'][i], '--', label=i, color='orange') elif i == 2: plt.plot(t_set, car_num_set['11'][i], label=i, color='g') else: plt.plot(t_set, car_num_set['11'][i], label=i, color='r') plt.ylim(0, 100) #sum among roads sum_car_num = np.sum(car_num_set['11'], axis=0) #average among time steps average_car_num = np.average(sum_car_num) average_car_num_set.append(average_car_num) plt.xlabel('Steps', fontsize=12) plt.ylabel('Cars Numbers', fontsize=12) # set legend handles, labels = plt.gca().get_legend_handles_labels() by_label = OrderedDict(zip(labels, handles)) leg = plt.legend(by_label.values(), by_label.keys(), loc=1) # leg = plt.legend(loc=4) legfm = leg.get_frame() legfm.set_edgecolor('black') # set legend fame color legfm.set_linewidth(0.5) # set legend fame linewidth plt.savefig('ddpg_mean_test.pdf') plt.show() print(epoch_state) # Evaluation statistics. 
if eval_env is not None: combined_stats['eval/return'] = eval_episode_rewards combined_stats['eval/return_history'] = np.mean( eval_episode_rewards_history) combined_stats['eval/Q'] = eval_qs combined_stats['eval/episodes'] = len(eval_episode_rewards) def as_scalar(x): if isinstance(x, np.ndarray): assert x.size == 1 return x[0] elif np.isscalar(x): return x else: raise ValueError('expected scalar, got %s' % x) combined_stats_sums = MPI.COMM_WORLD.allreduce( np.array( [np.array(x).flatten()[0] for x in combined_stats.values()])) combined_stats = { k: v / mpi_size for (k, v) in zip(combined_stats.keys(), combined_stats_sums) } # Total statistics. combined_stats['total/epochs'] = epoch + 1 combined_stats['total/steps'] = t for key in sorted(combined_stats.keys()): logger.record_tabular(key, combined_stats[key]) if rank == 0: logger.dump_tabular() logger.info('') logdir = logger.get_dir() if rank == 0 and logdir: if hasattr(env, 'get_state'): with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f: pickle.dump(env.get_state(), f) if eval_env and hasattr(eval_env, 'get_state'): with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f: pickle.dump(eval_env.get_state(), f) print('average queueing car numbers: ', np.average(average_car_num_set)) return agent
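# The block above averages the logged statistics across MPI workers by summing
# with allreduce and dividing by the world size. A minimal self-contained sketch
# of that reduction, assuming mpi4py and scalar-like stat values (the function
# and variable names here are illustrative, not part of this file):
import numpy as np
from mpi4py import MPI


def average_stats_across_workers(stats):
    comm = MPI.COMM_WORLD
    # one scalar per stat, in dict order (lists/arrays contribute their first element)
    local = np.array([np.array(v).flatten()[0] for v in stats.values()],
                     dtype=np.float64)
    summed = comm.allreduce(local)  # element-wise sum over all workers
    return {k: s / comm.Get_size() for k, s in zip(stats.keys(), summed)}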
def _create_network(self, reuse=False): logger.info("Creating a DDPG agent with action space %d x %s..." % (self.dimu, self.max_u)) self.sess = tf_util.get_session() # running averages with tf.variable_scope('o_stats') as vs: if reuse: vs.reuse_variables() self.o_stats = Normalizer(self.dimo, self.norm_eps, self.norm_clip, sess=self.sess) with tf.variable_scope('g_stats') as vs: if reuse: vs.reuse_variables() self.g_stats = Normalizer(self.dimg, self.norm_eps, self.norm_clip, sess=self.sess) # mini-batch sampling. batch = self.staging_tf.get() batch_tf = OrderedDict([ (key, batch[i]) for i, key in enumerate(self.stage_shapes.keys()) ]) batch_tf['r'] = tf.reshape(batch_tf['r'], [-1, 1]) # networks with tf.variable_scope('main') as vs: if reuse: vs.reuse_variables() self.main = self.create_actor_critic(batch_tf, net_type='main', **self.__dict__) vs.reuse_variables() with tf.variable_scope('target') as vs: if reuse: vs.reuse_variables() target_batch_tf = batch_tf.copy() target_batch_tf['o'] = batch_tf['o_2'] target_batch_tf['g'] = batch_tf['g_2'] self.target = self.create_actor_critic(target_batch_tf, net_type='target', **self.__dict__) vs.reuse_variables() assert len(self._vars("main")) == len(self._vars("target")) # loss functions target_Q_pi_tf = self.target.Q_pi_tf clip_range = (-self.clip_return, 0. if self.clip_pos_returns else np.inf) target_tf = tf.clip_by_value( batch_tf['r'] + self.gamma * target_Q_pi_tf, *clip_range) self.Q_loss_tf = tf.reduce_mean( tf.square(tf.stop_gradient(target_tf) - self.main.Q_tf)) self.pi_loss_tf = -tf.reduce_mean(self.main.Q_pi_tf) self.pi_loss_tf += self.action_l2 * tf.reduce_mean( tf.square(self.main.pi_tf / self.max_u)) Q_grads_tf = tf.gradients(self.Q_loss_tf, self._vars('main/Q')) pi_grads_tf = tf.gradients(self.pi_loss_tf, self._vars('main/pi')) assert len(self._vars('main/Q')) == len(Q_grads_tf) assert len(self._vars('main/pi')) == len(pi_grads_tf) self.Q_grads_vars_tf = zip(Q_grads_tf, self._vars('main/Q')) self.pi_grads_vars_tf = zip(pi_grads_tf, self._vars('main/pi')) self.Q_grad_tf = flatten_grads(grads=Q_grads_tf, var_list=self._vars('main/Q')) self.pi_grad_tf = flatten_grads(grads=pi_grads_tf, var_list=self._vars('main/pi')) # optimizers self.Q_adam = MpiAdam(self._vars('main/Q'), scale_grad_by_procs=False) self.pi_adam = MpiAdam(self._vars('main/pi'), scale_grad_by_procs=False) # polyak averaging self.main_vars = self._vars('main/Q') + self._vars('main/pi') self.target_vars = self._vars('target/Q') + self._vars('target/pi') self.stats_vars = self._global_vars('o_stats') + self._global_vars( 'g_stats') self.init_target_net_op = list( map(lambda v: v[0].assign(v[1]), zip(self.target_vars, self.main_vars))) self.update_target_net_op = list( map( lambda v: v[0].assign(self.polyak * v[0] + (1. - self.polyak) * v[1]), zip(self.target_vars, self.main_vars))) # initialize all variables tf.variables_initializer(self._global_vars('')).run() self._sync_optimizers() self._init_target_net()
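# The init/update ops above implement Polyak (soft) target-network updates: the
# target weights are copied from the main network once, then nudged toward it by
# a factor (1 - polyak) on every update. A small numpy sketch of the same rule,
# using hypothetical lists of weight arrays rather than the TF variables above:
import numpy as np


def polyak_update(target_weights, main_weights, polyak=0.95):
    # target <- polyak * target + (1 - polyak) * main, applied per tensor
    return [polyak * t + (1.0 - polyak) * m
            for t, m in zip(target_weights, main_weights)]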
def _create_network(self): self.sess = U.get_session() self.inp_src = tf.placeholder(shape=[None, 1, self.inp_dim], dtype=tf.float32, name='input_src') self.inp_dest = tf.placeholder(shape=[None, 1, self.out_dim], dtype=tf.float32, name='input_dest') self.labels = tf.placeholder(shape=[None, self.seq_len, self.out_dim], dtype=tf.float32, name='label') self.src_seq_len = tf.placeholder(tf.int32, (None, ), name='source_sequence_length') self.tar_seq_len = tf.placeholder(tf.int32, (None, ), name='target_sequence_length') # running averages # with tf.variable_scope('goal_stats_src'): # self.goal_stats_src = Normalizer(self.inp_dim, self.norm_eps, self.norm_clip, sess=self.sess) with tf.variable_scope('goal_stats_dest'): self.goal_stats_dest = Normalizer(self.out_dim, self.norm_eps, self.norm_clip, sess=self.sess, PLN=True) # normalize inp_src, and goals labels inp_src = self.goal_stats_dest.normalize(self.inp_src) inp_dest = self.goal_stats_dest.normalize(self.inp_dest) goal_labels = self.goal_stats_dest.normalize(self.labels) with tf.variable_scope('goal_gen'): encoder_cell = tf.nn.rnn_cell.LSTMCell(self.hid_size) encoder_outputs, encoder_state = tf.nn.dynamic_rnn( encoder_cell, inp_src, sequence_length=self.src_seq_len, dtype=tf.float32) decoder_cell = tf.nn.rnn_cell.LSTMCell(self.hid_size) project_layer = tf.layers.Dense(self.out_dim) with tf.variable_scope("decode"): train_inp = tf.concat([inp_dest, goal_labels[:, :-1, :]], axis=-2) train_helper = tf.contrib.seq2seq.TrainingHelper( train_inp, sequence_length=self.tar_seq_len) train_decoder = tf.contrib.seq2seq.BasicDecoder( decoder_cell, train_helper, encoder_state, output_layer=project_layer) train_outputs, _, final_seq_len = tf.contrib.seq2seq.dynamic_decode( train_decoder, maximum_iterations=self.seq_len) self.train_outputs = train_outputs.rnn_output with tf.variable_scope("decode", reuse=True): infer_helper = ContinousInferHelper(inp_dest[:, 0, :], self.tar_seq_len) infer_decoder = tf.contrib.seq2seq.BasicDecoder( decoder_cell, infer_helper, encoder_state, output_layer=project_layer) infer_outputs, _, final_seq_len = tf.contrib.seq2seq.dynamic_decode( infer_decoder, maximum_iterations=self.seq_len) self.infer_outputs = self.goal_stats_dest.denormalize( infer_outputs.rnn_output) log_sigma = tf.get_variable(name="logstd", shape=[1, self.out_dim], initializer=U.normc_initializer(0.1)) goals = train_outputs.rnn_output loss = 0.5 * tf.reduce_sum(tf.square((goal_labels - goals)/tf.exp(log_sigma)), axis=-1) \ + 0.5 * np.log(2*np.pi) * tf.to_float(tf.shape(self.labels)[-1]) \ + tf.reduce_sum(log_sigma, axis=-1) self.loss = tf.reduce_mean(loss) self.tr_outputs = self.goal_stats_dest.denormalize( self.train_outputs ) # just for inspect the correctness of training var_list = self._vars('') self.grads = U.flatgrad(self.loss, var_list) self.adam = MpiAdam(var_list, epsilon=self.adamepsilon) tf.variables_initializer(self._global_vars('')).run() self.adam.sync()
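# The sequence loss above is the negative log-likelihood of the normalized goal
# labels under a diagonal Gaussian whose mean is the decoder output and whose
# log-std is the learned `logstd` variable. A numpy sketch of the same formula,
# assuming `labels` and `goals` of shape (batch, seq_len, out_dim) and
# `log_sigma` of shape (1, out_dim):
import numpy as np


def gaussian_nll(labels, goals, log_sigma):
    d = labels.shape[-1]
    quad = 0.5 * np.sum(np.square((labels - goals) / np.exp(log_sigma)), axis=-1)
    const = 0.5 * np.log(2.0 * np.pi) * d
    return np.mean(quad + const + np.sum(log_sigma, axis=-1))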
def retraining( save_path, network, env, seed=None, total_timesteps=None, nb_epochs=None, # with default settings, perform 1M steps total nb_epoch_cycles=4, #50 nb_rollout_steps=3, #100 reward_scale=1.0, render=False, render_eval=False, # noise_type='adaptive-param_0.2', noise_type='normal_0.2', # noise_type='ou_0.9', normalize_returns=False, normalize_observations=True, critic_l2_reg=1e-2, actor_lr=1e-4, critic_lr=1e-4, # actor_lr=1e-6, # critic_lr=1e-5, popart=False, gamma=0.99, clip_norm=None, nb_train_steps=3, # per epoch cycle and MPI worker, 50 nb_eval_steps=1, #100 batch_size=640, # per MPI worker tau=0.01, eval_env=None, param_noise_adaption_interval=3, #50 **network_kwargs): if total_timesteps is not None: assert nb_epochs is None nb_epochs = int(total_timesteps) // (nb_epoch_cycles * nb_rollout_steps) else: nb_epochs = 500 rank = MPI.COMM_WORLD.Get_rank() # nb_actions = env.action_space.shape[-1] nb_actions = env.num_actions # nb_actions=3 # print(nb_actions) action_shape = np.array(nb_actions * [0]).shape #4 pairs pos + 3 link length # nb_features = 2*(env.num_actions+1)+env.num_actions #4 pairs pos + 1 pair target pos nb_features = 2 * (env.num_actions + 2) observation_shape = np.array(nb_features * [0]).shape # assert (np.abs(env.action_space.low) == env.action_space.high).all() # we assume symmetric actions. # memory = Memory(limit=int(1e6), action_shape=env.action_space.shape, observation_shape=env.observation_space.shape) memory = Memory(limit=int(1e6), action_shape=action_shape, observation_shape=observation_shape) critic = Critic(network=network, **network_kwargs) actor = Actor(nb_actions, network=network, **network_kwargs) action_noise = None param_noise = None # nb_actions = env.action_space.shape[-1] if noise_type is not None: for current_noise_type in noise_type.split(','): current_noise_type = current_noise_type.strip() if current_noise_type == 'none': pass elif 'adaptive-param' in current_noise_type: _, stddev = current_noise_type.split('_') param_noise = AdaptiveParamNoiseSpec( initial_stddev=float(stddev), desired_action_stddev=float(stddev)) elif 'normal' in current_noise_type: _, stddev = current_noise_type.split('_') action_noise = NormalActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions)) elif 'ou' in current_noise_type: _, stddev = current_noise_type.split('_') action_noise = OrnsteinUhlenbeckActionNoise( mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions)) else: raise RuntimeError( 'unknown noise type "{}"'.format(current_noise_type)) # agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape, agent = DDPG(actor, critic, memory, observation_shape, action_shape, gamma=gamma, tau=tau, normalize_returns=normalize_returns, normalize_observations=normalize_observations, batch_size=batch_size, action_noise=action_noise, param_noise=param_noise, critic_l2_reg=critic_l2_reg, actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart, clip_norm=clip_norm, reward_scale=reward_scale) logger.info('Using agent with the following configuration:') logger.info(str(agent.__dict__.items())) eval_episode_rewards_history = deque(maxlen=100) episode_rewards_history = deque(maxlen=100) sess = U.get_session() # Prepare everything. 
agent.initialize(sess) # sess.graph.finalize() agent.reset() obs = env.reset() if eval_env is not None: eval_obs = eval_env.reset() nenvs = obs.shape[0] episode_reward = np.zeros(nenvs, dtype=np.float32) #vector episode_step = np.zeros(nenvs, dtype=int) # vector episodes = 0 #scalar t = 0 # scalar step_set = [] reward_set = [] epoch = 0 start_time = time.time() epoch_episode_rewards = [] mean_epoch_episode_rewards = [] epoch_episode_steps = [] epoch_actions = [] epoch_qs = [] epoch_episodes = 0 #load the initialization policy agent.load_ini(sess, save_path) # agent.memory.clear(limit=int(1e6), action_shape=action_shape, observation_shape=observation_shape) for epoch in range(nb_epochs): print(nb_epochs) # obs, env_state = env.reset() obs = env.reset() agent.save(save_path) epoch_episode_rewards = [] '''check if the actor initialization policy has been loaded correctly, i.e. equal to directly ouput values in checkpoint files ''' # loaded_weights=tf.get_default_graph().get_tensor_by_name('target_actor/mlp_fc0/w:0') # print('loaded_weights:', sess.run(loaded_weights)) for cycle in range(nb_epoch_cycles): # Perform rollouts. for t_rollout in range(nb_rollout_steps): # Predict next action action, q, _, _ = agent.step(obs, apply_noise=True, compute_Q=True) print('action:', action) new_obs, r, done = env.step(action) # time.sleep(0.2) t += 1 episode_reward += r episode_step += 1 # print('episode_re: ', episode_reward) #[1.] # Book-keeping. epoch_actions.append(action) epoch_qs.append(q) b = 1. agent.store_transition( obs, action, r, new_obs, done ) #the batched data will be unrolled in memory.py's append. obs = new_obs epoch_episode_rewards.append(episode_reward) episode_reward = np.zeros(nenvs, dtype=np.float32) #vector # Train. epoch_actor_losses = [] epoch_critic_losses = [] epoch_adaptive_distances = [] for t_train in range(nb_train_steps): # Adapt param noise, if necessary. if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0: distance = agent.adapt_param_noise() epoch_adaptive_distances.append(distance) # print('Train!') cl, al = agent.train() epoch_critic_losses.append(cl) epoch_actor_losses.append(al) agent.update_target_net() # Evaluate. eval_episode_rewards = [] eval_qs = [] if eval_env is not None: nenvs_eval = eval_obs.shape[0] eval_episode_reward = np.zeros(nenvs_eval, dtype=np.float32) for t_rollout in range(nb_eval_steps): eval_action, eval_q, _, _ = agent.step(eval_obs, apply_noise=False, compute_Q=True) # eval_obs, eval_r, eval_done, eval_info = eval_env.step(max_action * eval_action) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) eval_obs, eval_r, eval_done, eval_info = eval_env.step( eval_action) if render_eval: eval_env.render() eval_episode_reward += eval_r eval_qs.append(eval_q) for d in range(len(eval_done)): if eval_done[d]: eval_episode_rewards.append(eval_episode_reward[d]) eval_episode_rewards_history.append( eval_episode_reward[d]) eval_episode_reward[d] = 0.0 mpi_size = MPI.COMM_WORLD.Get_size() # Log stats. 
# XXX shouldn't call np.mean on variable length lists duration = time.time() - start_time stats = agent.get_stats() combined_stats = stats.copy() combined_stats['rollout/return'] = np.mean(epoch_episode_rewards) combined_stats['rollout/return_history'] = np.mean( episode_rewards_history) combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps) combined_stats['rollout/actions_mean'] = np.mean(epoch_actions) combined_stats['rollout/Q_mean'] = np.mean(epoch_qs) combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses) combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses) combined_stats['train/param_noise_distance'] = np.mean( epoch_adaptive_distances) combined_stats['total/duration'] = duration combined_stats['total/steps_per_second'] = float(t) / float(duration) combined_stats['total/episodes'] = episodes combined_stats['rollout/episodes'] = epoch_episodes combined_stats['rollout/actions_std'] = np.std(epoch_actions) mean_epoch_episode_rewards.append(np.mean(epoch_episode_rewards)) # print(step_set,mean_epoch_episode_rewards) step_set.append(t) plt.plot(step_set, mean_epoch_episode_rewards, color='r', label='Initialization') plt.xlabel('Steps') plt.ylabel('Mean Episode Reward') plt.savefig('ddpg_mean_retrain.png') # plt.show() # Evaluation statistics. if eval_env is not None: combined_stats['eval/return'] = eval_episode_rewards combined_stats['eval/return_history'] = np.mean( eval_episode_rewards_history) combined_stats['eval/Q'] = eval_qs combined_stats['eval/episodes'] = len(eval_episode_rewards) def as_scalar(x): if isinstance(x, np.ndarray): assert x.size == 1 return x[0] elif np.isscalar(x): return x else: raise ValueError('expected scalar, got %s' % x) combined_stats_sums = MPI.COMM_WORLD.allreduce( np.array( [np.array(x).flatten()[0] for x in combined_stats.values()])) combined_stats = { k: v / mpi_size for (k, v) in zip(combined_stats.keys(), combined_stats_sums) } # Total statistics. combined_stats['total/epochs'] = epoch + 1 combined_stats['total/steps'] = t for key in sorted(combined_stats.keys()): logger.record_tabular(key, combined_stats[key]) if rank == 0: logger.dump_tabular() logger.info('') logdir = logger.get_dir() if rank == 0 and logdir: if hasattr(env, 'get_state'): with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f: pickle.dump(env.get_state(), f) if eval_env and hasattr(eval_env, 'get_state'): with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f: pickle.dump(eval_env.get_state(), f) print('stepset: ', step_set) print('rewards: ', mean_epoch_episode_rewards) return agent
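# `retraining` (above) and `testing` (below) accept exploration noise as a
# string such as 'normal_0.2', 'ou_0.9', or 'adaptive-param_0.2'; the real
# branches build NormalActionNoise, OrnsteinUhlenbeckActionNoise, or
# AdaptiveParamNoiseSpec. A minimal sketch of just the parsing convention
# (the helper name is illustrative):
def parse_noise_spec(noise_type):
    specs = []
    for token in noise_type.split(','):
        token = token.strip()
        if token == 'none':
            continue
        kind, stddev = token.rsplit('_', 1)
        specs.append((kind, float(stddev)))
    return specs


# e.g. parse_noise_spec('adaptive-param_0.2,normal_0.1')
#      -> [('adaptive-param', 0.2), ('normal', 0.1)]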
def testing(save_path, network, env, seed=None, total_timesteps=None, nb_epochs=None, # with default settings, perform 1M steps total nb_epoch_cycles=50, nb_rollout_steps=3, #100 reward_scale=1.0, render=False, render_eval=False, # no noise for test # noise_type='adaptive-param_0.2', # noise_type='normal_0.9', # noise_type='ou_0.9', normalize_returns=False, normalize_observations=True, critic_l2_reg=1e-2, actor_lr=1e-4, critic_lr=1e-3, # actor_lr=1e-6, # critic_lr=1e-5, popart=False, gamma=0.99, clip_norm=None, nb_train_steps=3, # per epoch cycle and MPI worker, 50 nb_eval_steps=1, #100 batch_size=640, # per MPI worker tau=0.01, eval_env=None, param_noise_adaption_interval=3, #50 **network_kwargs): if total_timesteps is not None: assert nb_epochs is None nb_epochs = int(total_timesteps) // (nb_epoch_cycles * nb_rollout_steps) else: nb_epochs = 500 rank = MPI.COMM_WORLD.Get_rank() # nb_actions = env.action_space.shape[-1] nb_actions = env.num_actions # nb_actions=3 # print(nb_actions) action_shape=np.array(nb_actions*[0]).shape nb_features = 2*(env.num_actions+1)+env.num_actions observation_shape=np.array(nb_features*[0]).shape # assert (np.abs(env.action_space.low) == env.action_space.high).all() # we assume symmetric actions. # memory = Memory(limit=int(1e6), action_shape=env.action_space.shape, observation_shape=env.observation_space.shape) memory = Memory(limit=int(1e6), action_shape=action_shape, observation_shape=observation_shape) critic = Critic(network=network, **network_kwargs) actor = Actor(nb_actions, network=network, **network_kwargs) action_noise = None param_noise = None # nb_actions = env.action_space.shape[-1] '''no noise for test''' # if noise_type is not None: # for current_noise_type in noise_type.split(','): # current_noise_type = current_noise_type.strip() # if current_noise_type == 'none': # pass # elif 'adaptive-param' in current_noise_type: # _, stddev = current_noise_type.split('_') # param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev), desired_action_stddev=float(stddev)) # elif 'normal' in current_noise_type: # _, stddev = current_noise_type.split('_') # action_noise = NormalActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions)) # elif 'ou' in current_noise_type: # _, stddev = current_noise_type.split('_') # action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions)) # else: # raise RuntimeError('unknown noise type "{}"'.format(current_noise_type)) # max_action = env.action_space.high # logger.info('scaling actions by {} before executing in env'.format(max_action)) # agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape, agent = DDPG(actor, critic, memory, observation_shape, action_shape, gamma=gamma, tau=tau, normalize_returns=normalize_returns, normalize_observations=normalize_observations, batch_size=batch_size, action_noise=action_noise, param_noise=param_noise, critic_l2_reg=critic_l2_reg, actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart, clip_norm=clip_norm, reward_scale=reward_scale) logger.info('Using agent with the following configuration:') logger.info(str(agent.__dict__.items())) eval_episode_rewards_history = deque(maxlen=100) episode_rewards_history = deque(maxlen=100) sess = U.get_session() # Prepare everything. agent.load(sess,save_path) # sess.graph.finalize() # cannot save sess if its finalized! 
agent.reset() obs = env.reset() if eval_env is not None: eval_obs = eval_env.reset() nenvs = obs.shape[0] episode_reward = np.zeros(nenvs, dtype = np.float32) #vector episode_step = np.zeros(nenvs, dtype = int) # vector episodes = 0 #scalar t = 0 # scalar step_set=[] reward_set=[] epoch = 0 start_time = time.time() epoch_episode_rewards = [] mean_epoch_episode_rewards = [] epoch_episode_steps = [] epoch_actions = [] epoch_qs = [] epoch_episodes = 0 for epoch in range(nb_epochs): print(nb_epochs) # obs, env_state = env.reset() obs = env.reset() for cycle in range(nb_epoch_cycles): # Perform rollouts. if nenvs > 1: # if simulating multiple envs in parallel, impossible to reset agent at the end of the episode in each # of the environments, so resetting here instead agent.reset() for t_rollout in range(nb_rollout_steps): # Predict next action. '''no noise for test''' action, q, _, _ = agent.step(obs, apply_noise=False, compute_Q=True) # print('action:', action) # Execute next action. # if rank == 0 and render: # env.render() # max_action is of dimension A, whereas action is dimension (nenvs, A) - the multiplication gets broadcasted to the batch # new_obs, r, done, info = env.step(max_action * action) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) # new_obs, r, env_state,done = env.step(action, env_state) '''actually no need for env_state: in or out''' new_obs, r, done = env.step(action) # print('reward:', r) # note these outputs are batched from vecenv # print('obs: ',obs.shape,obs, 'action: ', action.shape, action ) '''obs shape: (1,17), action shape: (1,6)''' # print('maxaction: ', max_action.shape) '''max_action shape: (6,) , max_action*action shape: (1,6)''' t += 1 # if rank == 0 and render: # env.render() # print('r:', r) episode_reward += r episode_step += 1 # print('episode_re: ', episode_reward) #[1.] # Book-keeping. epoch_actions.append(action) epoch_qs.append(q) b=1. agent.store_transition(obs, action, r, new_obs, done) #the batched data will be unrolled in memory.py's append. # print('r: ', r) # '''r shape: (1,)''' obs = new_obs # for d in range(len(done)): # if done[d]: # print('done') # # Episode done. # epoch_episode_rewards.append(episode_reward[d]) # episode_rewards_history.append(episode_reward[d]) # epoch_episode_steps.append(episode_step[d]) # episode_reward[d] = 0. # episode_step[d] = 0 # epoch_episodes += 1 # episodes += 1 # if nenvs == 1: # agent.reset() '''added''' epoch_episode_rewards.append(episode_reward) ''' step_set.append(t) reward_set=np.concatenate((reward_set,episode_reward)) # print(step_set,reward_set) # print(t, episode_reward) plt.plot(step_set,reward_set) plt.xlabel('Steps') plt.ylabel('Episode Reward') plt.savefig('ddpg.png') plt.show() ''' episode_reward = np.zeros(nenvs, dtype = np.float32) #vector # Train. epoch_actor_losses = [] epoch_critic_losses = [] epoch_adaptive_distances = [] '''no training for test''' # for t_train in range(nb_train_steps): # Adapt param noise, if necessary. no noise for test! # if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0: # distance = agent.adapt_param_noise() # epoch_adaptive_distances.append(distance) # cl, al = agent.train() # epoch_critic_losses.append(cl) # epoch_actor_losses.append(al) # agent.update_target_net() # Evaluate. 
eval_episode_rewards = [] eval_qs = [] if eval_env is not None: nenvs_eval = eval_obs.shape[0] eval_episode_reward = np.zeros(nenvs_eval, dtype = np.float32) for t_rollout in range(nb_eval_steps): eval_action, eval_q, _, _ = agent.step(eval_obs, apply_noise=False, compute_Q=True) # eval_obs, eval_r, eval_done, eval_info = eval_env.step(max_action * eval_action) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) eval_obs, eval_r, eval_done, eval_info = eval_env.step( eval_action) if render_eval: eval_env.render() eval_episode_reward += eval_r eval_qs.append(eval_q) for d in range(len(eval_done)): if eval_done[d]: eval_episode_rewards.append(eval_episode_reward[d]) eval_episode_rewards_history.append(eval_episode_reward[d]) eval_episode_reward[d] = 0.0 mpi_size = MPI.COMM_WORLD.Get_size() # Log stats. # XXX shouldn't call np.mean on variable length lists duration = time.time() - start_time stats = agent.get_stats() combined_stats = stats.copy() combined_stats['rollout/return'] = np.mean(epoch_episode_rewards) combined_stats['rollout/return_history'] = np.mean(episode_rewards_history) combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps) combined_stats['rollout/actions_mean'] = np.mean(epoch_actions) combined_stats['rollout/Q_mean'] = np.mean(epoch_qs) combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses) combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses) combined_stats['train/param_noise_distance'] = np.mean(epoch_adaptive_distances) combined_stats['total/duration'] = duration combined_stats['total/steps_per_second'] = float(t) / float(duration) combined_stats['total/episodes'] = episodes combined_stats['rollout/episodes'] = epoch_episodes combined_stats['rollout/actions_std'] = np.std(epoch_actions) mean_epoch_episode_rewards.append(np.mean(epoch_episode_rewards)) # print(step_set,mean_epoch_episode_rewards) step_set.append(t) plt.plot(step_set,mean_epoch_episode_rewards) plt.xlabel('Steps') plt.ylabel('Mean Episode Reward') plt.savefig('ddpg_mean_test.png') # plt.show() # Evaluation statistics. if eval_env is not None: combined_stats['eval/return'] = eval_episode_rewards combined_stats['eval/return_history'] = np.mean(eval_episode_rewards_history) combined_stats['eval/Q'] = eval_qs combined_stats['eval/episodes'] = len(eval_episode_rewards) def as_scalar(x): if isinstance(x, np.ndarray): assert x.size == 1 return x[0] elif np.isscalar(x): return x else: raise ValueError('expected scalar, got %s'%x) combined_stats_sums = MPI.COMM_WORLD.allreduce(np.array([ np.array(x).flatten()[0] for x in combined_stats.values()])) combined_stats = {k : v / mpi_size for (k,v) in zip(combined_stats.keys(), combined_stats_sums)} # Total statistics. combined_stats['total/epochs'] = epoch + 1 combined_stats['total/steps'] = t for key in sorted(combined_stats.keys()): logger.record_tabular(key, combined_stats[key]) if rank == 0: logger.dump_tabular() logger.info('') logdir = logger.get_dir() if rank == 0 and logdir: if hasattr(env, 'get_state'): with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f: pickle.dump(env.get_state(), f) if eval_env and hasattr(eval_env, 'get_state'): with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f: pickle.dump(eval_env.get_state(), f) return agent
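# The rollout loops above share one bookkeeping pattern for vectorized envs:
# `episode_reward` keeps one accumulator per parallel env and is zeroed whenever
# that env reports `done`. A compact sketch of the pattern, assuming the same
# batched `agent.step`/`env.step` signatures used in `testing` (the helper name
# is illustrative):
import numpy as np


def run_rollout(env, agent, obs, nsteps):
    episode_reward = np.zeros(obs.shape[0], dtype=np.float32)
    finished = []
    for _ in range(nsteps):
        action, q, _, _ = agent.step(obs, apply_noise=False, compute_Q=True)
        obs, r, done = env.step(action)
        episode_reward += r
        for d in range(len(done)):
            if done[d]:
                finished.append(float(episode_reward[d]))
                episode_reward[d] = 0.0
    return obs, finished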
def __init__(self, *, policy, ob_space, ac_space, nbatch_act, nbatch_train,
             nsteps, ent_coef, vf_coef, lf_coef, max_grad_norm, init_labda=1.,
             microbatch_size=None, threshold=1.):
    self.sess = sess = get_session()

    with tf.variable_scope('ppo2_lyapunov_model', reuse=tf.AUTO_REUSE):
        # CREATE OUR TWO MODELS
        # act_model is used for sampling
        act_model = policy(nbatch_act, 1, sess)

        # Train model for training
        if microbatch_size is None:
            train_model = policy(nbatch_train, nsteps, sess)
        else:
            train_model = policy(microbatch_size, nsteps, sess)

    # CREATE THE PLACEHOLDERS
    self.A = A = train_model.pdtype.sample_placeholder([None])
    self.ADV = ADV = tf.placeholder(tf.float32, [None])
    self.l_ADV = l_ADV = tf.placeholder(tf.float32, [None])
    # Both of these returns (R and v_l) are discounted returns.
    self.R = R = tf.placeholder(tf.float32, [None])
    self.v_l = v_l = tf.placeholder(tf.float32, [None])
    log_labda = tf.get_variable('ppo2_lyapunov_model/Labda', None, tf.float32,
                                initializer=tf.log(init_labda))
    self.labda = tf.exp(log_labda)
    self.safety_threshold = tf.placeholder(tf.float32, None, 'threshold')
    self.threshold = threshold
    # self.log_labda = tf.placeholder(tf.float32, None, 'Labda')
    # self.labda = tf.constant(10.)
    # self.Lam=10.

    # Keep track of old actor
    self.OLDNEGLOGPAC = OLDNEGLOGPAC = tf.placeholder(tf.float32, [None])
    # Keep track of old critic
    self.OLDVPRED = OLDVPRED = tf.placeholder(tf.float32, [None])
    self.OLDLPRED = OLDLPRED = tf.placeholder(tf.float32, [None])
    self.LR = LR = tf.placeholder(tf.float32, [])
    # Cliprange
    self.CLIPRANGE = CLIPRANGE = tf.placeholder(tf.float32, [])

    neglogpac = train_model.pd.neglogp(A)

    # Calculate the entropy
    # Entropy is used to improve exploration by limiting premature convergence to a suboptimal policy.
    entropy = tf.reduce_mean(train_model.pd.entropy())

    # CALCULATE THE LOSS
    # Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss

    # Clip the value to reduce variability during Critic training
    # Get the predicted value
    vpred = train_model.vf
    vpredclipped = OLDVPRED + tf.clip_by_value(train_model.vf - OLDVPRED,
                                               -CLIPRANGE, CLIPRANGE)
    # Unclipped value
    vf_losses1 = tf.square(vpred - R)
    # Clipped value
    vf_losses2 = tf.square(vpredclipped - R)
    vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2))

    # Get the predicted safety (Lyapunov) value
    lpred = train_model.lf
    lpredclipped = OLDLPRED + tf.clip_by_value(train_model.lf - OLDLPRED,
                                               -CLIPRANGE, CLIPRANGE)
    # Unclipped safety value
    lf_losses1 = tf.square(lpred - v_l)
    # Clipped safety value
    lf_losses2 = tf.square(lpredclipped - v_l)
    lf_loss = .5 * tf.reduce_mean(tf.maximum(lf_losses1, lf_losses2))

    # Calculate ratio (pi current policy / pi old policy)
    ratio = tf.exp(OLDNEGLOGPAC - neglogpac)

    # Defining safety loss
    lpred = train_model.lf
    lpred_ = train_model.lf_
    # self.l_lambda = tf.reduce_mean(ratio * tf.stop_gradient(lpred_) - tf.stop_gradient(lpred))
    l_lambda1 = tf.reduce_mean(ratio * l_ADV + v_l - self.safety_threshold)
    l_lambda2 = tf.reduce_mean(
        tf.clip_by_value(ratio, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE) * l_ADV +
        v_l - self.safety_threshold)
    l_lambda = tf.maximum(l_lambda1, l_lambda2)

    # Defining Loss = - J is equivalent to max J
    pg_losses = -ADV * ratio
    pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE,
                                         1.0 + CLIPRANGE)

    # Final PG loss
    pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2)) + \
        l_lambda * tf.stop_gradient(self.labda) - \
        tf.stop_gradient(l_lambda) * log_labda
    # pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2)+ self.l_lambda * self.labda)
    approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC))
    clipfrac = tf.reduce_mean(
        tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE)))

    # Total loss
    loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef + lf_loss * lf_coef

    # UPDATE THE PARAMETERS USING LOSS
    # 1. Get the model parameters
    params = tf.trainable_variables('ppo2_lyapunov_model')

    # 2. Build our trainer
    if MPI is not None:
        self.trainer = MpiAdamOptimizer(MPI.COMM_WORLD, learning_rate=LR,
                                        epsilon=1e-5)
    else:
        self.trainer = tf.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5)

    # 3. Calculate the gradients
    grads_and_var = self.trainer.compute_gradients(loss, params)
    grads, var = zip(*grads_and_var)
    if max_grad_norm is not None:
        # Clip the gradients (normalize)
        grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
    grads_and_var = list(zip(grads, var))
    # zip pairs each gradient with its associated parameter,
    # e.g. zip(ABCD, xyza) => Ax, By, Cz, Da

    self.grads = grads
    self.var = var
    self._train_op = self.trainer.apply_gradients(grads_and_var)
    self.loss_names = [
        'policy_loss', 'value_loss', 'safety_value_loss', 'policy_entropy',
        'approxkl', 'clipfrac', 'lagrangian'
    ]
    self.stats_list = [
        pg_loss, vf_loss, lf_loss, entropy, approxkl, clipfrac, self.labda
    ]

    self.train_model = train_model
    self.act_model = act_model
    self.step = act_model.step
    self.eval_step = act_model.eval_step
    self.value = act_model.value
    self.l_value = act_model.l_value
    self.l_value_ = act_model.l_value_
    self.initial_state = act_model.initial_state

    self.save = functools.partial(save_variables, sess=sess)
    self.load = functools.partial(load_variables, sess=sess)

    initialize()
    global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                         scope="")
    if MPI is not None:
        sync_from_root(sess, global_variables)  # pylint: disable=E1101
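# The policy loss above is the standard PPO clipped surrogate plus a Lagrangian
# term for the Lyapunov/safety constraint. A small numpy sketch of just the
# clipped-surrogate part, with the ratio recovered from stored negative
# log-probabilities (1-D arrays; names are illustrative):
import numpy as np


def clipped_surrogate(adv, old_neglogp, new_neglogp, cliprange):
    ratio = np.exp(old_neglogp - new_neglogp)   # pi_new / pi_old
    unclipped = -adv * ratio
    clipped = -adv * np.clip(ratio, 1.0 - cliprange, 1.0 + cliprange)
    return np.mean(np.maximum(unclipped, clipped))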
def _serialize_variables():
    # Snapshot all trainable variables as a {variable_name: numpy_value} dict.
    sess = get_session()
    variables = tf.trainable_variables()
    values = sess.run(variables)
    return {var.name: value for var, value in zip(variables, values)}
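# A possible inverse of `_serialize_variables`, sketched here as an assumption
# rather than an API of this codebase: push saved numpy values back into the
# graph by variable name, using the same TF1-style session helper.
import tensorflow as tf


def _deserialize_variables(values_by_name):
    sess = get_session()  # session helper used throughout this file
    for var in tf.trainable_variables():
        if var.name in values_by_name:
            # note: assign() adds an op to the graph; acceptable for a one-off restore
            sess.run(var.assign(values_by_name[var.name]))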