class Expert:
    def __init__(self, limit, env):
        self.limit = limit
        self.env = env
        self.memory = Memory(
            limit=self.limit,
            action_shape=self.env.action_space.shape,
            observation_shape=self.env.observation_space.shape)
        self.file_dir = None

    def load_file(self, file_dir):
        self.file_dir = file_dir
        with open(self.file_dir, 'rb') as expert_file:
            expert_data = pickle.load(expert_file)
        for episode_sample in expert_data:
            for step_sample in episode_sample:
                # Each step is stored as (obs0, action, reward, obs1, terminal1).
                self.memory.append(step_sample[0], step_sample[1],
                                   step_sample[2], step_sample[3],
                                   step_sample[4])

    def sample(self, batch_size):
        return self.memory.sample(batch_size)

    def set_tf(self, actor, critic, obs_rms, ret_rms, observation_range,
               return_range):
        self.expert_state = tf.placeholder(
            tf.float32,
            shape=(None,) + self.env.observation_space.shape,
            name='expert_state')
        self.expert_action = tf.placeholder(
            tf.float32,
            shape=(None,) + self.env.action_space.shape,
            name='expert_action')
        normalized_state = tf.clip_by_value(
            normalize(self.expert_state, obs_rms),
            observation_range[0], observation_range[1])
        expert_actor = actor(normalized_state, reuse=True)
        normalized_q_with_expert_data = critic(
            normalized_state, self.expert_action, reuse=True)
        normalized_q_with_expert_actor = critic(
            normalized_state, expert_actor, reuse=True)
        self.Q_with_expert_data = denormalize(
            tf.clip_by_value(normalized_q_with_expert_data,
                             return_range[0], return_range[1]), ret_rms)
        self.Q_with_expert_actor = denormalize(
            tf.clip_by_value(normalized_q_with_expert_actor,
                             return_range[0], return_range[1]), ret_rms)
        # Softplus margin: penalize the critic whenever the policy's action
        # scores above the demonstrated action on expert states.
        self.critic_loss = tf.reduce_mean(
            tf.nn.softplus(self.Q_with_expert_actor - self.Q_with_expert_data))
        self.actor_loss = -tf.reduce_mean(self.Q_with_expert_actor)
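# Hypothetical usage sketch (`agent`, the pickle path, and the ranges below are
# assumptions for illustration, not part of this module): load demonstrations,
# build the imitation losses against an existing actor/critic pair, then
# evaluate them on an expert minibatch.
#
#   expert = Expert(limit=int(1e5), env=env)
#   expert.load_file('demos/expert_trajectories.pkl')
#   expert.set_tf(agent.actor, agent.critic, agent.obs_rms, agent.ret_rms,
#                 observation_range=(-5., 5.), return_range=(-np.inf, np.inf))
#   batch = expert.sample(batch_size=64)
#   c_loss, a_loss = sess.run(
#       [expert.critic_loss, expert.actor_loss],
#       feed_dict={expert.expert_state: batch['obs0'],
#                  expert.expert_action: batch['actions']})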
class DQNModel(TensorflowBasedModel):
    key_list = Config.load_json(file_path=None)

    def __init__(self, config, action_bound):
        super(DQNModel, self).__init__(config=config)
        self.proposed_action_list = []
        self.action_bound = action_bound
        # Discretize every continuous action dimension, then enumerate the
        # cartesian product so each joint action gets its own Q head.
        action_list = []
        for i in range(len(action_bound[0])):
            low = action_bound[0][i]
            high = action_bound[1][i]
            action_list.append(
                np.arange(start=low,
                          stop=high,
                          step=(high - low) /
                          self.config.config_dict['ACTION_SPLIT_COUNT']))
        action_iterator = itertools.product(*action_list)
        self.action_selection_list = []
        for action_sample in action_iterator:
            self.action_selection_list.append(tf.constant(action_sample))

        self.reward_input = tf.placeholder(shape=[None, 1], dtype=tf.float32)
        self.state_input = tf.placeholder(
            shape=[None] + list(self.config.config_dict['STATE_SPACE']),
            dtype=tf.float32)
        self.next_state_input = tf.placeholder(
            shape=[None] + list(self.config.config_dict['STATE_SPACE']),
            dtype=tf.float32)
        self.action_input = tf.placeholder(
            shape=[None] + list(self.config.config_dict['ACTION_SPACE']),
            dtype=tf.float32)
        self.done_input = tf.placeholder(shape=[None, 1], dtype=tf.bool)
        self.input = tf.concat([self.state_input, self.action_input], axis=1)
        self.done = tf.cast(self.done_input, dtype=tf.float32)

        def tile_to_batch(action_sample):
            # Broadcast a constant joint action to the batch dimension so it
            # can be concatenated with the batched state input (the original
            # tf.concat call was missing both the list argument and the axis).
            batch_size = tf.shape(self.state_input)[0]
            action_2d = tf.reshape(tf.cast(action_sample, tf.float32), (1, -1))
            return tf.tile(action_2d, [batch_size, 1])

        self.q_value_list = []
        var_list = None
        for action_sample in self.action_selection_list:
            q_net, q_output, var_list = NetworkCreator.create_network(
                input=tf.concat(
                    [self.state_input, tile_to_batch(action_sample)], axis=1),
                network_config=self.config.config_dict['NET_CONFIG'],
                net_name=self.config.config_dict['NAME'])
            self.q_value_list.append(q_output)
        self.var_list = var_list
        self.target_q_value_list = []
        for action_sample in self.action_selection_list:
            q_net, q_output, var_list = NetworkCreator.create_network(
                input=tf.concat(
                    [self.next_state_input, tile_to_batch(action_sample)],
                    axis=1),
                network_config=self.config.config_dict['NET_CONFIG'],
                net_name='TARGET' + self.config.config_dict['NAME'])
            self.target_q_value_list.append(q_output)
        self.target_var_list = var_list
        # Assumption: the online/target Q estimates used for training take the
        # max over the discretized heads (the original never defined q_output
        # or target_q_output); selecting the head that matches `action_input`
        # would be the stricter DQN formulation.
        self.q_output = tf.reduce_max(
            tf.concat(self.q_value_list, axis=1), axis=1, keepdims=True)
        self.target_q_output = tf.reduce_max(
            tf.concat(self.target_q_value_list, axis=1), axis=1, keepdims=True)
        self.predict_q_value = self.create_predict_q_value_op()
        self.loss, self.optimizer, self.optimize = self.create_training_method()
        self.update_target_q_op = self.create_target_q_update()
        self.memory = Memory(
            limit=int(1e6),  # the original 1e100 cannot be preallocated
            action_shape=self.config.config_dict['ACTION_SPACE'],
            observation_shape=self.config.config_dict['STATE_SPACE'])
        self.sess = tf.get_default_session()

    def update(self):
        for i in range(self.config.config_dict['ITERATION_EVER_EPOCH']):
            batch_data = self.memory.sample(
                batch_size=self.config.config_dict['BATCH_SIZE'])
            loss, _ = self.sess.run(
                fetches=[self.loss, self.optimize],
                feed_dict={
                    self.reward_input: batch_data['rewards'],
                    self.action_input: batch_data['actions'],
                    self.state_input: batch_data['obs0'],
                    # obs1 feeds the target network used in the TD target.
                    self.next_state_input: batch_data['obs1'],
                    self.done_input: batch_data['terminals1']
                })

    def predict(self, obs, q_value):
        pass

    def print_log_queue(self, status):
        self.status = status
        while self.log_queue.qsize() > 0:
            log = self.log_queue.get()
            print("%s: Critic loss %f: " %
                  (self.name, log[self.name + '_CRITIC']))
            log['INDEX'] = self.log_print_count
            self.log_file_content.append(log)
            self.log_print_count += 1

    def create_training_method(self):
        l1_l2 = tf.contrib.layers.l1_l2_regularizer()
        loss = tf.reduce_sum((self.predict_q_value - self.q_output) ** 2) + \
            tf.contrib.layers.apply_regularization(
                l1_l2, weights_list=self.var_list)
        optimizer = tf.train.AdadeltaOptimizer(
            learning_rate=self.config.config_dict['LEARNING_RATE'])
        optimize_op = optimizer.minimize(loss=loss, var_list=self.var_list)
        return loss, optimizer, optimize_op

    def create_predict_q_value_op(self):
        predict_q_value = (1. - self.done) * \
            self.config.config_dict['DISCOUNT'] * self.target_q_output \
            + self.reward_input
        return predict_q_value

    def create_target_q_update(self):
        op = []
        for var, target_var in zip(self.var_list, self.target_var_list):
            # Polyak averaging: write the blended value into the target
            # variable (the original assigned into the blended tensor, which
            # is not a variable).
            new_val = self.config.config_dict['DECAY'] * target_var + \
                (1.0 - self.config.config_dict['DECAY']) * var
            op.append(tf.assign(target_var, new_val))
        return op

    def store_one_sample(self, state, next_state, action, reward, done,
                         *arg, **kwargs):
        self.memory.append(obs0=state,
                           obs1=next_state,
                           action=action,
                           reward=reward,
                           terminal1=done)
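# A minimal, self-contained sketch of the discretization trick DQNModel builds
# on (the bounds and split count here are made up): each continuous dimension
# is divided into a grid, and itertools.product enumerates the joint actions
# that each get a Q head above.
#
#   import itertools
#   import numpy as np
#
#   low, high = np.array([-1., -2.]), np.array([1., 2.])
#   splits = 4
#   grids = [np.arange(l, h, (h - l) / splits) for l, h in zip(low, high)]
#   joint_actions = list(itertools.product(*grids))
#   assert len(joint_actions) == splits ** len(low)  # 16 joint actions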
class Model(object):
    def __init__(self, network, env, gamma=1, tau=0.01, total_timesteps=1e6,
                 normalize_observations=True, normalize_returns=False,
                 enable_popart=False, noise_type='adaptive-param_0.2',
                 clip_norm=None, reward_scale=1., batch_size=128,
                 l2_reg_coef=0.2, actor_lr=1e-4, critic_lr=1e-3,
                 observation_range=(-5., 5.), action_range=(-1., 1.),
                 return_range=(-np.inf, np.inf), **network_kwargs):
        # logger.info('Using agent with the following configuration:')
        # logger.info(str(self.__dict__.items()))
        observation_shape = env.observation_space.shape
        action_shape = env.action_space.shape

        # Inputs.
        self.obs0 = tf.placeholder(tf.float32, shape=(None,) + observation_shape, name='obs0')
        self.obs1 = tf.placeholder(tf.float32, shape=(None,) + observation_shape, name='obs1')
        self.terminals1 = tf.placeholder(tf.float32, shape=(None, 1), name='terminals1')
        self.rewards = tf.placeholder(tf.float32, shape=(None, 1), name='rewards')
        self.actions = tf.placeholder(tf.float32, shape=(None,) + action_shape, name='actions')
        self.critic_target = tf.placeholder(tf.float32, shape=(None, 1), name='critic_target')
        self.param_noise_stddev = tf.placeholder(tf.float32, shape=(), name='param_noise_stddev')

        # Parameters.
        self.env = env
        self.gamma = gamma
        self.tau = tau
        self.total_timesteps = total_timesteps
        self.normalize_observations = normalize_observations
        self.normalize_returns = normalize_returns
        self.enable_popart = enable_popart
        self.clip_norm = clip_norm
        self.reward_scale = reward_scale
        self.action_range = action_range
        self.return_range = return_range
        self.observation_range = observation_range
        self.batch_size = batch_size
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr
        self.l2_reg_coef = l2_reg_coef
        self.stats_sample = None
        self.action_noise = None
        self.param_noise = None

        nb_actions = self.env.action_space.shape[-1]
        if noise_type is not None:
            for current_noise_type in noise_type.split(','):
                current_noise_type = current_noise_type.strip()
                if current_noise_type == 'none':
                    pass
                elif 'adaptive-param' in current_noise_type:
                    _, stddev = current_noise_type.split('_')
                    self.param_noise = AdaptiveParamNoiseSpec(
                        initial_stddev=float(stddev),
                        desired_action_stddev=float(stddev))
                elif 'normal' in current_noise_type:
                    _, stddev = current_noise_type.split('_')
                    self.action_noise = NormalActionNoise(
                        mu=np.zeros(nb_actions),
                        sigma=float(stddev) * np.ones(nb_actions))
                elif 'ou' in current_noise_type:
                    _, stddev = current_noise_type.split('_')
                    self.action_noise = OrnsteinUhlenbeckActionNoise(
                        mu=np.zeros(nb_actions),
                        sigma=float(stddev) * np.ones(nb_actions))
                else:
                    raise RuntimeError('unknown noise type "{}"'.format(current_noise_type))

        assert (np.abs(env.action_space.low) == env.action_space.high).all()  # we assume symmetric actions.
        self.memory = Memory(limit=int(1e6),
                             action_shape=env.action_space.shape,
                             observation_shape=env.observation_space.shape)
        self.critic = Critic(network=network, **network_kwargs)
        self.actor = Actor(nb_actions, network=network, **network_kwargs)

        # Observation normalization.
        if self.normalize_observations:
            with tf.variable_scope('obs_rms'):
                self.obs_rms = RunningMeanStd(shape=observation_shape)
        else:
            self.obs_rms = None
        normalized_obs0 = tf.clip_by_value(normalize(self.obs0, self.obs_rms),
                                           self.observation_range[0], self.observation_range[1])
        normalized_obs1 = tf.clip_by_value(normalize(self.obs1, self.obs_rms),
                                           self.observation_range[0], self.observation_range[1])

        # Return normalization.
        if self.normalize_returns:
            with tf.variable_scope('ret_rms'):
                self.ret_rms = RunningMeanStd()
        else:
            self.ret_rms = None

        # Create target networks.
        target_actor = copy(self.actor)
        target_actor.name = 'target_actor'
        self.target_actor = target_actor
        target_critic = copy(self.critic)
        target_critic.name = 'target_critic'
        self.target_critic = target_critic

        # Create networks and core TF parts that are shared across setup parts.
        self.actor_tf = self.actor(normalized_obs0)
        self.normalized_critic_tf = self.critic(normalized_obs0, self.actions)
        self.critic_tf = denormalize(
            tf.clip_by_value(self.normalized_critic_tf, self.return_range[0], self.return_range[1]),
            self.ret_rms)
        self.normalized_critic_with_actor_tf = self.critic(normalized_obs0, self.actor_tf, reuse=True)
        self.critic_with_actor_tf = denormalize(
            tf.clip_by_value(self.normalized_critic_with_actor_tf, self.return_range[0], self.return_range[1]),
            self.ret_rms)
        Q_obs1 = denormalize(target_critic(normalized_obs1, target_actor(normalized_obs1)), self.ret_rms)
        self.target_Q = self.rewards + (1. - self.terminals1) * gamma * Q_obs1

        # Set up parts.
        if self.param_noise is not None:
            self.setup_param_noise(normalized_obs0)
        self.setup_actor_optimizer()
        self.setup_critic_optimizer()
        if self.normalize_returns and self.enable_popart:
            self.setup_popart()
        self.setup_stats()
        self.setup_target_network_updates()
        self.initial_state = None  # recurrent architectures not supported yet
        self.def_path_pre = os.path.dirname(os.path.abspath(__file__)) + '/tmp/'

    def setup_target_network_updates(self):
        actor_init_updates, actor_soft_updates = get_target_updates(
            self.actor.vars, self.target_actor.vars, self.tau)
        critic_init_updates, critic_soft_updates = get_target_updates(
            self.critic.vars, self.target_critic.vars, self.tau)
        self.target_init_updates = [actor_init_updates, critic_init_updates]
        self.target_soft_updates = [actor_soft_updates, critic_soft_updates]

    def setup_param_noise(self, normalized_obs0):
        assert self.param_noise is not None

        # Configure perturbed actor.
        param_noise_actor = copy(self.actor)
        param_noise_actor.name = 'param_noise_actor'
        self.perturbed_actor_tf = param_noise_actor(normalized_obs0)
        logger.info('setting up param noise')
        self.perturb_policy_ops = get_perturbed_actor_updates(
            self.actor, param_noise_actor, self.param_noise_stddev)

        # Configure separate copy for stddev adaptation.
        adaptive_param_noise_actor = copy(self.actor)
        adaptive_param_noise_actor.name = 'adaptive_param_noise_actor'
        adaptive_actor_tf = adaptive_param_noise_actor(normalized_obs0)
        self.perturb_adaptive_policy_ops = get_perturbed_actor_updates(
            self.actor, adaptive_param_noise_actor, self.param_noise_stddev)
        self.adaptive_policy_distance = tf.sqrt(
            tf.reduce_mean(tf.square(self.actor_tf - adaptive_actor_tf)))

    def setup_actor_optimizer(self):
        logger.info('setting up actor optimizer')
        self.actor_loss = -tf.reduce_mean(self.critic_with_actor_tf)
        actor_shapes = [var.get_shape().as_list() for var in self.actor.trainable_vars]
        actor_nb_params = sum([reduce(lambda x, y: x * y, shape) for shape in actor_shapes])
        logger.info('  actor shapes: {}'.format(actor_shapes))
        logger.info('  actor params: {}'.format(actor_nb_params))
        self.actor_grads = U.flatgrad(self.actor_loss, self.actor.trainable_vars,
                                      clip_norm=self.clip_norm)
        self.actor_optimizer = MpiAdam(var_list=self.actor.trainable_vars,
                                       beta1=0.9, beta2=0.999, epsilon=1e-08)

    def setup_critic_optimizer(self):
        logger.info('setting up critic optimizer')
        normalized_critic_target_tf = tf.clip_by_value(
            normalize(self.critic_target, self.ret_rms),
            self.return_range[0], self.return_range[1])
        self.critic_loss = tf.reduce_mean(
            tf.square(self.normalized_critic_tf - normalized_critic_target_tf))
        if self.l2_reg_coef > 0.:
            critic_reg_vars = [var for var in self.critic.trainable_vars
                               if var.name.endswith('/w:0') and 'output' not in var.name]
            for var in critic_reg_vars:
                logger.info('  regularizing: {}'.format(var.name))
            logger.info('  applying l2 regularization with {}'.format(self.l2_reg_coef))
            critic_reg = tc.layers.apply_regularization(
                tc.layers.l2_regularizer(self.l2_reg_coef),
                weights_list=critic_reg_vars
            )
            self.critic_loss += critic_reg
        critic_shapes = [var.get_shape().as_list() for var in self.critic.trainable_vars]
        critic_nb_params = sum([reduce(lambda x, y: x * y, shape) for shape in critic_shapes])
        logger.info('  critic shapes: {}'.format(critic_shapes))
        logger.info('  critic params: {}'.format(critic_nb_params))
        self.critic_grads = U.flatgrad(self.critic_loss, self.critic.trainable_vars,
                                       clip_norm=self.clip_norm)
        self.critic_optimizer = MpiAdam(var_list=self.critic.trainable_vars,
                                        beta1=0.9, beta2=0.999, epsilon=1e-08)

    def setup_popart(self):
        # See https://arxiv.org/pdf/1602.07714.pdf for details.
        self.old_std = tf.placeholder(tf.float32, shape=[1], name='old_std')
        new_std = self.ret_rms.std
        self.old_mean = tf.placeholder(tf.float32, shape=[1], name='old_mean')
        new_mean = self.ret_rms.mean

        self.renormalize_Q_outputs_op = []
        for vs in [self.critic.output_vars, self.target_critic.output_vars]:
            assert len(vs) == 2
            M, b = vs
            assert 'kernel' in M.name
            assert 'bias' in b.name
            assert M.get_shape()[-1] == 1
            assert b.get_shape()[-1] == 1
            self.renormalize_Q_outputs_op += [M.assign(M * self.old_std / new_std)]
            self.renormalize_Q_outputs_op += [b.assign((b * self.old_std + self.old_mean - new_mean) / new_std)]

    def setup_stats(self):
        ops = []
        names = []

        if self.normalize_returns:
            ops += [self.ret_rms.mean, self.ret_rms.std]
            names += ['ret_rms_mean', 'ret_rms_std']

        if self.normalize_observations:
            ops += [tf.reduce_mean(self.obs_rms.mean), tf.reduce_mean(self.obs_rms.std)]
            names += ['obs_rms_mean', 'obs_rms_std']

        ops += [tf.reduce_mean(self.critic_tf)]
        names += ['reference_Q_mean']
        ops += [reduce_std(self.critic_tf)]
        names += ['reference_Q_std']

        ops += [tf.reduce_mean(self.critic_with_actor_tf)]
        names += ['reference_actor_Q_mean']
        ops += [reduce_std(self.critic_with_actor_tf)]
        names += ['reference_actor_Q_std']

        ops += [tf.reduce_mean(self.actor_tf)]
        names += ['reference_action_mean']
        ops += [reduce_std(self.actor_tf)]
        names += ['reference_action_std']

        if self.param_noise:
            ops += [tf.reduce_mean(self.perturbed_actor_tf)]
            names += ['reference_perturbed_action_mean']
            ops += [reduce_std(self.perturbed_actor_tf)]
            names += ['reference_perturbed_action_std']

        self.stats_ops = ops
        self.stats_names = names

    def train_step(self, obs, apply_noise=True, compute_Q=True):
        if self.param_noise is not None and apply_noise:
            actor_tf = self.perturbed_actor_tf
        else:
            actor_tf = self.actor_tf
        feed_dict = {self.obs0: U.adjust_shape(self.obs0, [obs])}
        if compute_Q:
            action, q = self.sess.run([actor_tf, self.critic_with_actor_tf], feed_dict=feed_dict)
        else:
            action = self.sess.run(actor_tf, feed_dict=feed_dict)
            q = None

        if self.action_noise is not None and apply_noise:
            noise = self.action_noise()
            assert noise.shape == action[0].shape
            action += noise
        action = np.clip(action, self.action_range[0], self.action_range[1])

        return action, q, None, None

    def step(self, obs, compute_Q=True):
        feed_dict = {self.obs0: U.adjust_shape(self.obs0, [obs])}
        if compute_Q:
            action, q = self.sess.run([self.actor_tf, self.critic_with_actor_tf], feed_dict=feed_dict)
        else:
            action = self.sess.run(self.actor_tf, feed_dict=feed_dict)
            q = None
        action = np.clip(action, self.action_range[0], self.action_range[1])
        return action, q, None, None

    def store_transition(self, obs0, action, reward, obs1, terminal1):
        reward *= self.reward_scale
        B = obs0.shape[0]
        for b in range(B):
            self.memory.append(obs0[b], action[b], reward[b], obs1[b], terminal1[b])
            if self.normalize_observations:
                self.obs_rms.update(np.array([obs0[b]]))

    def train(self):
        # Get a batch.
        batch = self.memory.sample(batch_size=self.batch_size)

        if self.normalize_returns and self.enable_popart:
            old_mean, old_std, target_Q = self.sess.run(
                [self.ret_rms.mean, self.ret_rms.std, self.target_Q],
                feed_dict={
                    self.obs1: batch['obs1'],
                    self.rewards: batch['rewards'],
                    self.terminals1: batch['terminals1'].astype('float32'),
                })
            self.ret_rms.update(target_Q.flatten())
            self.sess.run(self.renormalize_Q_outputs_op, feed_dict={
                self.old_std: np.array([old_std]),
                self.old_mean: np.array([old_mean]),
            })

            # Run sanity check. Disabled by default since it slows down things considerably.
            # print('running sanity check')
            # target_Q_new, new_mean, new_std = self.sess.run([self.target_Q, self.ret_rms.mean, self.ret_rms.std], feed_dict={
            #     self.obs1: batch['obs1'],
            #     self.rewards: batch['rewards'],
            #     self.terminals1: batch['terminals1'].astype('float32'),
            # })
            # print(target_Q_new, target_Q, new_mean, new_std)
            # assert (np.abs(target_Q - target_Q_new) < 1e-3).all()
        else:
            target_Q = self.sess.run(self.target_Q, feed_dict={
                self.obs1: batch['obs1'],
                self.rewards: batch['rewards'],
                self.terminals1: batch['terminals1'].astype('float32'),
            })

        # Get all gradients and perform a synced update.
        ops = [self.actor_grads, self.actor_loss, self.critic_grads, self.critic_loss]
        actor_grads, actor_loss, critic_grads, critic_loss = self.sess.run(ops, feed_dict={
            self.obs0: batch['obs0'],
            self.actions: batch['actions'],
            self.critic_target: target_Q,
        })
        self.actor_optimizer.update(actor_grads, stepsize=self.actor_lr)
        self.critic_optimizer.update(critic_grads, stepsize=self.critic_lr)

        return critic_loss, actor_loss

    def initialize(self, sess):
        self.sess = sess
        self.sess.run(tf.global_variables_initializer())
        self.actor_optimizer.sync()
        self.critic_optimizer.sync()
        self.sess.run(self.target_init_updates)

    def update_target_net(self):
        self.sess.run(self.target_soft_updates)

    def get_stats(self):
        if self.stats_sample is None:
            # Get a sample and keep that fixed for all further computations.
            # This allows us to estimate the change in value for the same set of inputs.
            self.stats_sample = self.memory.sample(batch_size=self.batch_size)
        values = self.sess.run(self.stats_ops, feed_dict={
            self.obs0: self.stats_sample['obs0'],
            self.actions: self.stats_sample['actions'],
        })

        names = self.stats_names[:]
        assert len(names) == len(values)
        stats = dict(zip(names, values))

        if self.param_noise is not None:
            stats = {**stats, **self.param_noise.get_stats()}

        return stats

    def adapt_param_noise(self):
        try:
            from mpi4py import MPI
        except ImportError:
            MPI = None

        if self.param_noise is None:
            return 0.

        # Perturb a separate copy of the policy to adjust the scale for the next "real" perturbation.
        batch = self.memory.sample(batch_size=self.batch_size)
        self.sess.run(self.perturb_adaptive_policy_ops, feed_dict={
            self.param_noise_stddev: self.param_noise.current_stddev,
        })
        distance = self.sess.run(self.adaptive_policy_distance, feed_dict={
            self.obs0: batch['obs0'],
            self.param_noise_stddev: self.param_noise.current_stddev,
        })

        if MPI is not None:
            mean_distance = MPI.COMM_WORLD.allreduce(distance, op=MPI.SUM) / MPI.COMM_WORLD.Get_size()
        else:
            mean_distance = distance
        self.param_noise.adapt(mean_distance)
        return mean_distance

    def reset(self):
        # Reset internal state after an episode is complete.
        if self.action_noise is not None:
            self.action_noise.reset()
        if self.param_noise is not None:
            self.sess.run(self.perturb_policy_ops, feed_dict={
                self.param_noise_stddev: self.param_noise.current_stddev,
            })

    def learn(self,
              total_timesteps=None,
              seed=None,
              nb_epochs=None,  # with default settings, perform 1M steps total
              nb_epoch_cycles=20,
              nb_rollout_steps=100,
              render=False,
              nb_train_steps=50,  # per epoch cycle and MPI worker
              batch_size=64,  # per MPI worker
              param_noise_adaption_interval=50):
        set_global_seeds(seed)

        if total_timesteps is not None:
            assert nb_epochs is None
            nb_epochs = int(total_timesteps) // (nb_epoch_cycles * nb_rollout_steps)
        else:
            nb_epochs = 500

        if MPI is not None:
            rank = MPI.COMM_WORLD.Get_rank()
        else:
            rank = 0

        # eval_episode_rewards_history = deque(maxlen=100)
        episode_rewards_history = deque(maxlen=100)
        sess = U.get_session()

        # Prepare everything.
        self.initialize(sess)
        sess.graph.finalize()
        self.reset()

        obs = self.env.reset()
        # if eval_env is not None:
        #     eval_obs = eval_env.reset()
        nenvs = obs.shape[0]

        episode_reward = np.zeros(nenvs, dtype=np.float32)  # vector
        episode_step = np.zeros(nenvs, dtype=int)  # vector
        episodes = 0  # scalar
        t = 0  # scalar

        start_time = time.time()

        epoch_episode_rewards = []
        epoch_episode_steps = []
        epoch_actions = []
        epoch_qs = []
        epoch_episodes = 0
        for epoch in range(nb_epochs):
            for cycle in range(nb_epoch_cycles):
                # Perform rollouts.
                if nenvs > 1:
                    # When simulating multiple envs in parallel, it is impossible to reset the
                    # agent at the end of the episode in each environment, so reset here instead.
                    self.reset()
                for t_rollout in range(nb_rollout_steps):
                    # Predict next action.
                    action, q, _, _ = self.train_step(obs, apply_noise=True, compute_Q=True)

                    # Execute next action.
                    if rank == 0 and render:
                        self.env.render()

                    # max_action is of dimension A, whereas action is of dimension (nenvs, A);
                    # the multiplication gets broadcasted to the batch.
                    # new_obs, r, done, info = self.env.step(max_action * action)  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                    new_obs, r, done, info = self.env.step(action)  # note these outputs are batched from vecenv
                    t += 1
                    if rank == 0 and render:
                        self.env.render()
                    episode_reward += r
                    episode_step += 1

                    # Book-keeping.
                    epoch_actions.append(action)
                    epoch_qs.append(q)
                    # The batched data will be unrolled in memory.py's append.
                    self.store_transition(obs, action, r, new_obs, done)

                    obs = new_obs

                    for d in range(len(done)):
                        if done[d]:
                            # Episode done.
                            epoch_episode_rewards.append(episode_reward[d])
                            episode_rewards_history.append(episode_reward[d])
                            epoch_episode_steps.append(episode_step[d])
                            episode_reward[d] = 0.
                            episode_step[d] = 0
                            epoch_episodes += 1
                            episodes += 1
                            if nenvs == 1:
                                self.reset()

                # Train.
                epoch_actor_losses = []
                epoch_critic_losses = []
                epoch_adaptive_distances = []
                for t_train in range(nb_train_steps):
                    # Adapt param noise, if necessary.
                    if self.memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0:
                        distance = self.adapt_param_noise()
                        epoch_adaptive_distances.append(distance)

                    cl, al = self.train()
                    epoch_critic_losses.append(cl)
                    epoch_actor_losses.append(al)
                    self.update_target_net()

                # Evaluate.
                # eval_episode_rewards = []
                # eval_qs = []
                # if eval_env is not None:
                #     eval_obs = eval_env.reset()
                #     nenvs_eval = eval_obs.shape[0]
                #     eval_episode_reward = np.zeros(nenvs_eval, dtype=np.float32)
                #     for t_rollout in range(nb_eval_steps):
                #         eval_action, eval_q, _, _ = self.train_step(eval_obs, apply_noise=False, compute_Q=True)
                #         # eval_obs, eval_r, eval_done, eval_info = eval_env.step(max_action * eval_action)  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                #         eval_obs, eval_r, eval_done, eval_info = eval_env.step(eval_action)
                #         if render_eval:
                #             eval_env.render()
                #         eval_episode_reward += eval_r
                #         eval_qs.append(eval_q)
                #         for d in range(len(eval_done)):
                #             if eval_done[d]:
                #                 eval_episode_rewards.append(eval_episode_reward[d])
                #                 eval_episode_rewards_history.append(eval_episode_reward[d])
                #                 eval_episode_reward[d] = 0.0

            if MPI is not None:
                mpi_size = MPI.COMM_WORLD.Get_size()
            else:
                mpi_size = 1

            # Save trainable variables.
            file_name = time.strftime('Y%YM%mD%d_h%Hm%Ms%S', time.localtime(time.time()))
            model_save_path = self.def_path_pre + file_name
            self.save(model_save_path)

            # Log stats.
            # XXX shouldn't call np.mean on variable length lists
            duration = time.time() - start_time
            stats = self.get_stats()
            combined_stats = stats.copy()
            combined_stats['rollout/return'] = np.mean(epoch_episode_rewards)
            combined_stats['rollout/return_std'] = np.std(epoch_episode_rewards)
            combined_stats['rollout/return_history'] = np.mean(episode_rewards_history)
            combined_stats['rollout/return_history_std'] = np.std(episode_rewards_history)
            combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps)
            combined_stats['rollout/actions_mean'] = np.mean(epoch_actions)
            combined_stats['rollout/Q_mean'] = np.mean(epoch_qs)
            # combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses)
            # combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses)
            # combined_stats['train/param_noise_distance'] = np.mean(epoch_adaptive_distances)
            combined_stats['total/duration'] = duration
            combined_stats['total/steps_per_second'] = float(t) / float(duration)
            combined_stats['total/episodes'] = episodes
            combined_stats['rollout/episodes'] = epoch_episodes
            combined_stats['rollout/actions_std'] = np.std(epoch_actions)

            # Evaluation statistics.
            # if eval_env is not None:
            #     combined_stats['eval/return'] = eval_episode_rewards
            #     combined_stats['eval/return_history'] = np.mean(eval_episode_rewards_history)
            #     combined_stats['eval/Q'] = eval_qs
            #     combined_stats['eval/episodes'] = len(eval_episode_rewards)

            combined_stats_sums = np.array([np.array(x).flatten()[0] for x in combined_stats.values()])
            if MPI is not None:
                combined_stats_sums = MPI.COMM_WORLD.allreduce(combined_stats_sums)
            combined_stats = {k: v / mpi_size for (k, v) in zip(combined_stats.keys(), combined_stats_sums)}

            # Total statistics.
            combined_stats['total/epochs'] = epoch + 1
            combined_stats['total/steps'] = t

            for key in sorted(combined_stats.keys()):
                logger.record_tabular(key, combined_stats[key])

            if rank == 0:
                logger.dump_tabular()
            logger.info('')
            logdir = logger.get_dir()
            if rank == 0 and logdir:
                if hasattr(self.env, 'get_state'):
                    with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f:
                        pickle.dump(self.env.get_state(), f)
                # if eval_env and hasattr(eval_env, 'get_state'):
                #     with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f:
                #         pickle.dump(eval_env.get_state(), f)

        self.sess.graph._unsafe_unfinalize()
        return self

    def save(self, save_path=None):
        save_variables(save_path=save_path, sess=self.sess)
        print('save model variables to', save_path)

    def load_newest(self, load_path=None):
        file_list = os.listdir(self.def_path_pre)
        file_list.sort(key=lambda x: os.path.getmtime(os.path.join(self.def_path_pre, x)))
        if load_path is None:
            load_path = os.path.join(self.def_path_pre, file_list[-1])
        load_variables(load_path=load_path, sess=self.sess)
        print('load_path: ', load_path)

    def load_index(self, index, load_path=None):
        file_list = os.listdir(self.def_path_pre)
        file_list.sort(key=lambda x: os.path.getmtime(os.path.join(self.def_path_pre, x)), reverse=True)
        if load_path is None:
            load_path = os.path.join(self.def_path_pre, file_list[index])
        load_variables(load_path=load_path, sess=self.sess)
        print('load_path: ', load_path)
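# Hypothetical usage sketch (the env constructor and network name are
# assumptions; `Model` expects a baselines-style vectorized env whose reset()
# returns a batch of observations):
#
#   env = DummyVecEnv([make_env])
#   model = Model(network='mlp', env=env, noise_type='ou_0.2',
#                 batch_size=64, actor_lr=1e-4, critic_lr=1e-3)
#   model.learn(total_timesteps=int(1e5))  # trains and checkpoints into ./tmp/
#   action, q, _, _ = model.step(env.reset())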
class Expert:
    def __init__(self, limit, env):
        self.limit = limit
        self.env = env
        self.memory = Memory(
            limit=self.limit,
            action_shape=self.env.action_space.shape,
            observation_shape=self.env.observation_space.shape)
        self.file_dir = None

    def load_file(self, file_dir, print_reward=False):
        self.file_dir = file_dir
        with open(self.file_dir, 'rb') as expert_file:
            expert_data = pickle.load(expert_file)
        k = 0
        if print_reward:
            total_rew = 0.
            ep_rew = 0.
            nep = 0.  # count episodes as their terminal flags are seen (was off by one, starting at 1)
        for episode_sample in expert_data:
            for step_sample in episode_sample:
                k = k + 1
                if k <= self.limit:
                    if print_reward:
                        ep_rew += step_sample[2]
                        if step_sample[4]:
                            nep += 1
                            total_rew += ep_rew
                            ep_rew = 0
                    self.memory.append(step_sample[0], step_sample[1],
                                       step_sample[2], step_sample[3],
                                       step_sample[4])
                else:
                    if print_reward:
                        print('Successfully loaded expert files, average reward ',
                              total_rew / max(nep, 1.))
                    return
        if print_reward:
            print('Successfully loaded expert files, average reward ',
                  total_rew / max(nep, 1.))

    def load_file_trpo(self, file_dir):
        self.file_dir = file_dir
        traj_data = np.load(file_dir)
        if self.limit is None:
            obs = traj_data["obs"][:]
            acs = traj_data["acs"][:]
        else:
            obs = traj_data["obs"][:self.limit]
            acs = traj_data["acs"][:self.limit]
        episode_num = len(acs)
        '''
        step_num = 0
        for i in range(episode_num):
            step_num += len(acs[i])
        print("Total Step is:", step_num, "\nTotal_Episode is:", episode_num)
        '''
        for i in range(episode_num):
            episode_len = len(acs[i])
            for j in range(episode_len):
                done = True if (j == episode_len - 1) else False
                # Rewards and next observations are unused here, so zeros are
                # stored as placeholders.
                self.memory.append(obs[i][j], acs[i][j], 0., 0., done)

    def sample(self, batch_size):
        return self.memory.sample(batch_size)

    def set_tf(self, actor, critic, obs0, actions, obs_rms, ret_rms,
               observation_range, return_range, supervise=False,
               critic_only=False, actor_only=False, both_ours_sup=False,
               gail=False, pofd=False):
        self.expert_state = tf.placeholder(
            tf.float32,
            shape=(None,) + self.env.observation_space.shape,
            name='expert_state')
        self.expert_action = tf.placeholder(
            tf.float32,
            shape=(None,) + self.env.action_space.shape,
            name='expert_action')
        normalized_state = tf.clip_by_value(
            normalize(self.expert_state, obs_rms),
            observation_range[0], observation_range[1])
        expert_actor = actor(normalized_state, reuse=True)
        normalized_q_with_expert_data = critic(
            normalized_state, self.expert_action, reuse=True)
        normalized_q_with_expert_actor = critic(
            normalized_state, expert_actor, reuse=True)
        self.Q_with_expert_data = denormalize(
            tf.clip_by_value(normalized_q_with_expert_data,
                             return_range[0], return_range[1]), ret_rms)
        self.Q_with_expert_actor = denormalize(
            tf.clip_by_value(normalized_q_with_expert_actor,
                             return_range[0], return_range[1]), ret_rms)
        if supervise:
            # Behavior cloning: regress the actor onto the demonstrated actions.
            self.actor_loss = tf.nn.l2_loss(self.expert_action - expert_actor)
            self.critic_loss = 0
        else:
            # Hinge-style margin on expert states, plus the usual DDPG actor loss.
            self.critic_loss = tf.reduce_mean(
                tf.nn.relu(self.Q_with_expert_actor - self.Q_with_expert_data))
            self.actor_loss = -tf.reduce_mean(self.Q_with_expert_actor)
            if critic_only:
                self.actor_loss = 0
            if actor_only:
                self.critic_loss = 0
            # self.dist = tf.reduce_mean(self.Q_with_expert_data - self.Q_with_expert_actor)
        if both_ours_sup:
            self.actor_loss = tf.nn.l2_loss(self.expert_action - expert_actor) \
                - tf.reduce_mean(self.Q_with_expert_actor)
            self.critic_loss = tf.reduce_mean(
                tf.nn.relu(self.Q_with_expert_actor - self.Q_with_expert_data))
        if gail or pofd:
            # Convention here: minimizing discriminator_loss drives D -> 1 on
            # expert data and D -> 0 on generated data; the actor then
            # maximizes log D on its own samples to fool the discriminator.
            discriminator = Discriminator()
            d_with_expert_data = discriminator(normalized_state, self.expert_action)
            d_with_gen_data = discriminator(obs0, actions, reuse=True)
            self.discriminator_loss = tf.reduce_mean(
                tf.log(d_with_gen_data)) + tf.reduce_mean(
                    tf.log(1 - d_with_expert_data))
            self.actor_loss = -tf.reduce_mean(tf.log(d_with_gen_data))
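# Hypothetical wiring sketch (`agent` and `lambda_bc` are assumptions): the
# imitation losses defined above are typically added to the agent's own DDPG
# losses before gradients are taken.
#
#   expert.set_tf(agent.actor, agent.critic, agent.obs0, agent.actions,
#                 agent.obs_rms, agent.ret_rms,
#                 observation_range=(-5., 5.),
#                 return_range=(-np.inf, np.inf))
#   total_actor_loss = agent.actor_loss + lambda_bc * expert.actor_loss
#   total_critic_loss = agent.critic_loss + expert.critic_loss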
def learn(
        network,
        env,
        data_path='',
        model_path='./model/',
        model_name='ddpg_none_fuzzy_150',
        file_name='test',
        model_based=False,
        memory_extend=False,
        model_type='linear',
        restore=False,
        dyna_learning=False,
        seed=None,
        nb_epochs=5,  # with default settings, perform 1M steps total
        nb_sample_cycle=5,
        nb_epoch_cycles=150,
        nb_rollout_steps=400,
        nb_model_learning=10,
        nb_sample_steps=50,
        nb_samples_extend=5,
        reward_scale=1.0,
        noise_type='normal_0.2',  # 'adaptive-param_0.2', 'ou_0.2', 'normal_0.2'
        normalize_returns=False,
        normalize_observations=True,
        critic_l2_reg=1e-2,
        actor_lr=1e-4,
        critic_lr=1e-3,
        popart=False,
        gamma=0.99,
        clip_norm=None,
        nb_train_steps=50,  # per epoch cycle and MPI worker
        batch_size=32,  # per MPI worker
        tau=0.01,
        param_noise_adaption_interval=50,
        algorithm_name='ddpg',  # assumption: used in the save paths below but undefined in the original
        **network_kwargs):

    nb_actions = env.action_space.shape[0]
    memory = Memory(limit=int(1e5),
                    action_shape=env.action_space.shape,  # was .shape[0]; Memory expects a shape tuple
                    observation_shape=env.observation_space.shape)

    if model_based:
        # Store fake (model-generated) data.
        fake_memory = Memory(limit=int(1e5),
                             action_shape=env.action_space.shape,
                             observation_shape=env.observation_space.shape)

        # Select the regressor type for the dynamics and reward models.
        if model_type == 'gp':
            kernel = ConstantKernel(1.0, (1e-3, 1e3)) * RBF(10, (1e-2, 1e2))
            dynamic_model = GaussianProcessRegressor(kernel=kernel)
            reward_model = GaussianProcessRegressor(kernel=kernel)
        elif model_type == 'linear':
            dynamic_model = LinearRegression()
            reward_model = LinearRegression()
        elif model_type == 'mlp':
            dynamic_model = MLPRegressor(hidden_layer_sizes=(100,), activation='relu',
                                         solver='adam', alpha=0.0001, batch_size='auto',
                                         learning_rate='constant', learning_rate_init=0.001,
                                         power_t=0.5, max_iter=200, shuffle=True,
                                         random_state=None, tol=0.0001, verbose=False,
                                         warm_start=False, momentum=0.9,
                                         nesterovs_momentum=True, early_stopping=False,
                                         validation_fraction=0.1, beta_1=0.9,
                                         beta_2=0.999, epsilon=1e-08)
            reward_model = MLPRegressor(hidden_layer_sizes=(100,), activation='relu',
                                        solver='adam', alpha=0.0001, batch_size='auto',
                                        learning_rate='constant', learning_rate_init=0.001,
                                        power_t=0.5, max_iter=200, shuffle=True,
                                        random_state=None, tol=0.0001, verbose=False,
                                        warm_start=False, momentum=0.9,
                                        nesterovs_momentum=True, early_stopping=False,
                                        validation_fraction=0.1, beta_1=0.9,
                                        beta_2=0.999, epsilon=1e-08)
        else:
            logger.info("You need to give the model_type to fit the dynamics and reward!!!")
    critic = Critic(network=network, **network_kwargs)
    actor = Actor(nb_actions, network=network, **network_kwargs)

    # Set noise.
    action_noise = None
    param_noise = None
    stddev = 0.2  # assumption: default exploration stddev; overwritten by the parsed noise spec below
    if noise_type is not None:
        for current_noise_type in noise_type.split(','):
            current_noise_type = current_noise_type.strip()
            if current_noise_type == 'none':
                pass
            elif 'adaptive-param' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                param_noise = AdaptiveParamNoiseSpec(
                    initial_stddev=float(stddev),
                    desired_action_stddev=float(stddev))
            elif 'normal' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                action_noise = NormalActionNoise(mu=np.zeros(nb_actions),
                                                 sigma=float(stddev) * np.ones(nb_actions))
            elif 'ou' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                action_noise = OrnsteinUhlenbeckActionNoise(
                    mu=np.zeros(nb_actions),
                    sigma=float(stddev) * np.ones(nb_actions))
            else:
                raise RuntimeError('unknown noise type "{}"'.format(current_noise_type))
    stddev = float(stddev)  # the rollout loop below passes stddev to agent.step and decays it

    # Action scale.
    max_action = env.action_high_bound
    logger.info('scaling actions by {} before executing in env'.format(max_action))

    # DDPG agent.
    agent = DDPG(actor, critic, memory, env.observation_space.shape,
                 env.action_space.shape[0], gamma=gamma, tau=tau,
                 normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations,
                 batch_size=batch_size, action_noise=action_noise,
                 param_noise=param_noise, critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart,
                 clip_norm=clip_norm, reward_scale=reward_scale)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    sess = U.get_session()
    if restore:
        agent.restore(sess, model_path, model_name)
    else:
        agent.initialize(sess)
        sess.graph.finalize()
    agent.reset()

    episodes = 0
    epochs_rewards = np.zeros((nb_epochs, nb_epoch_cycles), dtype=np.float32)
    epochs_times = np.zeros((nb_epochs, nb_epoch_cycles), dtype=np.float32)
    epochs_steps = np.zeros((nb_epochs, nb_epoch_cycles), dtype=np.float32)
    epochs_states = []
    for epoch in range(nb_epochs):
        logger.info("======================== The {} epoch start !!! ========================="
                    .format(epoch))
        epoch_episode_rewards = []
        epoch_episode_steps = []
        epoch_episode_times = []
        epoch_actions = []
        epoch_episode_states = []
        epoch_qs = []
        epoch_episodes = 0
        for cycle in range(nb_epoch_cycles):
            start_time = time.time()
            obs, state, done = env.reset()
            obs_reset = cp.deepcopy(obs)
            episode_reward = 0.
            episode_step = 0
            episode_states = []
            logger.info("================== The {} episode start !!! ==================="
                        .format(cycle))
            for t_rollout in range(nb_rollout_steps):
===================" .format(t_rollout)) """ Predict next action """ action, q, _, _ = agent.step(obs, stddev, apply_noise=True, compute_Q=True) new_obs, next_state, r, done, safe_or_not, final_action = env.step( max_action * action, t_rollout) if safe_or_not is False: break episode_reward += r episode_step += 1 episode_states.append([ cp.deepcopy(state), cp.deepcopy(final_action), np.array(cp.deepcopy(r)), cp.deepcopy(next_state) ]) epoch_actions.append(action) epoch_qs.append(q) agent.store_transition(obs, action, r, new_obs, done) obs = new_obs state = next_state if done: break """ extend the memory """ if model_based and cycle > (nb_model_learning + 1) and memory_extend: pred_x = np.zeros((1, 18), dtype=np.float32) for j in range(nb_samples_extend): m_action, _, _, _ = agent.step(obs, stddev, apply_noise=True, compute_Q=False) pred_x[:, :12] = obs pred_x[:, 12:] = m_action m_new_obs = dynamic_model.predict(pred_x)[0] """ get real reward """ # state = env.inverse_state(m_new_obs) # m_reward = env.get_reward(state, m_action) m_reward = reward_model.predict(pred_x)[0] agent.store_transition(obs, m_action, m_reward, m_new_obs, done) """ generate new data and fit model""" if model_based and cycle > nb_model_learning: logger.info( "============================== Model Fit !!! ===============================" ) input_x = np.concatenate( (memory.observations0.data[:memory.nb_entries], memory.actions.data[:memory.nb_entries]), axis=1) input_y_obs = memory.observations1.data[:memory.nb_entries] input_y_reward = memory.rewards.data[:memory.nb_entries] dynamic_model.fit(input_x, input_y_obs) reward_model.fit(input_x, input_y_reward) if dyna_learning: logger.info( "========================= Collect data !!! =================================" ) pred_obs = np.zeros((1, 18), dtype=np.float32) for sample_index in range(nb_sample_cycle): fake_obs = obs_reset for t_episode in range(nb_sample_steps): fake_action, _, _, _ = agent.step(fake_obs, stddev, apply_noise=True, compute_Q=False) pred_obs[:, :12] = fake_obs pred_obs[:, 12:] = fake_action next_fake_obs = dynamic_model.predict(pred_obs)[0] fake_reward = reward_model.predict(pred_obs)[0] # next_fake_obs = dynamic_model.predict(np.concatenate((fake_obs, fake_action)))[0] # fake_reward = reward_model.predict(np.concatenate((fake_obs, fake_action)))[0] fake_obs = next_fake_obs fake_terminals = False fake_memory.append(fake_obs, fake_action, fake_reward, next_fake_obs, fake_terminals) """ noise decay """ stddev = float(stddev) * 0.95 duration = time.time() - start_time epoch_episode_rewards.append(episode_reward) epoch_episode_steps.append(episode_step) epoch_episode_times.append(cp.deepcopy(duration)) epoch_episode_states.append(cp.deepcopy(episode_states)) epochs_rewards[epoch, cycle] = episode_reward epochs_steps[epoch, cycle] = episode_step epochs_times[epoch, cycle] = cp.deepcopy(duration) logger.info( "============================= The Episode_Times:: {}!!! ============================" .format(epoch_episode_rewards)) logger.info( "============================= The Episode_Times:: {}!!! ============================" .format(epoch_episode_times)) epoch_episodes += 1 episodes += 1 """ Training process """ epoch_actor_losses = [] epoch_critic_losses = [] epoch_adaptive_distances = [] for t_train in range(nb_train_steps): logger.info("") # Adapt param noise, if necessary. 
                if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0:
                    distance = agent.adapt_param_noise()
                    epoch_adaptive_distances.append(distance)
                cl, al = agent.train()
                epoch_critic_losses.append(cl)
                epoch_actor_losses.append(al)
                agent.update_target_net()

            # Planning training.
            if model_based and cycle > (nb_model_learning + 1) and dyna_learning:
                for t_train in range(nb_train_steps):
                    # Setting for adapting param noise, if necessary.
                    if fake_memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0:
                        distance = agent.adapt_param_noise()
                        epoch_adaptive_distances.append(distance)
                    batch = fake_memory.sample(batch_size=batch_size)
                    fake_cl, fake_al = agent.train_fake_data(batch)
                    epoch_critic_losses.append(fake_cl)
                    epoch_actor_losses.append(fake_al)
                    agent.update_target_net()

        epochs_states.append(cp.deepcopy(epoch_episode_states))

        # Save data.
        np.save(data_path + 'train_reward_' + algorithm_name + '_' + noise_type + file_name,
                epochs_rewards)
        np.save(data_path + 'train_step_' + algorithm_name + '_' + noise_type + file_name,
                epochs_steps)
        np.save(data_path + 'train_states_' + algorithm_name + '_' + noise_type + file_name,
                epochs_states)
        np.save(data_path + 'train_times_' + algorithm_name + '_' + noise_type + file_name,
                epochs_times)

    # Save the agent.
    agent.store(model_path + 'train_model_' + algorithm_name + '_' + noise_type + file_name)
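# Hypothetical invocation sketch (the environment class name is invented, and
# its extended reset()/step() signatures are assumptions specific to this
# project's custom env):
#
#   env = FuzzyRobotEnv()  # must return (obs, state, done) from reset()
#   learn(network='mlp', env=env,
#         model_based=True, model_type='linear',
#         dyna_learning=True, memory_extend=True,
#         noise_type='normal_0.2', nb_epochs=5, nb_epoch_cycles=150)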
class DDPG(object):
    def __init__(self, **params):
        for k in params:
            setattr(self, k, params[k])
        self.init_args = copy(params)
        self.stats_sample = None  # assumption: get_stats() below expects this attribute to exist
        if self.her:
            # self.obs_to_goal = None
            # self.goal_idx = None
            # self.reward_fn = None
            self.memory = HERBuffer(limit=int(self.buffer_size),
                                    action_shape=self.action_shape,
                                    observation_shape=self.observation_shape,
                                    obs_to_goal=self.obs_to_goal,
                                    goal_slice=self.goal_idx,
                                    reward_fn=self.reward_fn)
        else:
            self.memory = Memory(limit=int(self.buffer_size),
                                 action_shape=self.action_shape,
                                 observation_shape=self.observation_shape)
        self.critic = Critic(layer_norm=self.layer_norm)
        self.actor = Actor(self.action_shape[-1], layer_norm=self.layer_norm)
        self.action_noise = NormalActionNoise(
            mu=np.zeros(self.action_shape),
            sigma=float(self.noise_sigma) * np.ones(self.action_shape))
        self.param_noise = None

        # Inputs.
        self.obs0 = tf.placeholder(tf.float32, shape=(None,) + self.observation_shape, name='obs0')
        self.obs1 = tf.placeholder(tf.float32, shape=(None,) + self.observation_shape, name='obs1')
        # self.terminals1 = tf.placeholder(tf.float32, shape=(None, 1), name='terminals1')
        self.rewards = tf.placeholder(tf.float32, shape=(None, 1), name='rewards')
        self.actions = tf.placeholder(tf.float32, shape=(None,) + self.action_shape, name='actions')
        self.critic_target = tf.placeholder(tf.float32, shape=(None, 1), name='critic_target')
        self.param_noise_stddev = tf.placeholder(tf.float32, shape=(), name='param_noise_stddev')

        # Observation normalization.
        if self.normalize_observations:
            with tf.variable_scope('obs_rms'):
                self.obs_rms = RunningMeanStd(shape=self.observation_shape)
        else:
            self.obs_rms = None
        normalized_obs0 = tf.clip_by_value(normalize(self.obs0, self.obs_rms),
                                           self.observation_range[0], self.observation_range[1])
        normalized_obs1 = tf.clip_by_value(normalize(self.obs1, self.obs_rms),
                                           self.observation_range[0], self.observation_range[1])

        # Return normalization.
        if self.normalize_returns:
            with tf.variable_scope('ret_rms'):
                self.ret_rms = RunningMeanStd()
        else:
            self.ret_rms = None

        # Create target networks.
        target_actor = copy(self.actor)
        target_actor.name = 'target_actor'
        self.target_actor = target_actor
        target_critic = copy(self.critic)
        target_critic.name = 'target_critic'
        self.target_critic = target_critic

        # Create networks and core TF parts that are shared across setup parts.
        self.actor_tf = self.actor(normalized_obs0)
        self.normalized_critic_tf = self.critic(normalized_obs0, self.actions)
        self.critic_tf = denormalize(
            tf.clip_by_value(self.normalized_critic_tf, self.return_range[0], self.return_range[1]),
            self.ret_rms)
        self.normalized_critic_with_actor_tf = self.critic(normalized_obs0, self.actor_tf, reuse=True)
        self.critic_with_actor_tf = denormalize(
            tf.clip_by_value(self.normalized_critic_with_actor_tf, self.return_range[0], self.return_range[1]),
            self.ret_rms)
        Q_obs1 = denormalize(
            target_critic(normalized_obs1, target_actor(normalized_obs1)), self.ret_rms)
        # Note: the terminal mask is omitted from the TD target here (its
        # placeholder above is commented out as well).
        # self.target_Q = self.rewards + (1. - self.terminals1) * self.gamma * Q_obs1
        self.target_Q = self.rewards + self.gamma * Q_obs1

        # Set up parts.
        if self.param_noise is not None:
            self.setup_param_noise(normalized_obs0)
        self.setup_actor_optimizer()
        self.setup_critic_optimizer()
        if self.normalize_returns and self.enable_popart:
            self.setup_popart()
        self.setup_stats()
        self.setup_target_network_updates()

    def setup_target_network_updates(self):
        actor_init_updates, actor_soft_updates = get_target_updates(
            self.actor.vars, self.target_actor.vars, self.tau)
        critic_init_updates, critic_soft_updates = get_target_updates(
            self.critic.vars, self.target_critic.vars, self.tau)
        self.target_init_updates = [actor_init_updates, critic_init_updates]
        self.target_soft_updates = [actor_soft_updates, critic_soft_updates]

    def setup_actor_optimizer(self):
        logger.info('setting up actor optimizer')
        self.actor_loss = -tf.reduce_mean(self.critic_with_actor_tf)
        actor_shapes = [var.get_shape().as_list() for var in self.actor.trainable_vars]
        actor_nb_params = sum([reduce(lambda x, y: x * y, shape) for shape in actor_shapes])
        logger.info('  actor shapes: {}'.format(actor_shapes))
        logger.info('  actor params: {}'.format(actor_nb_params))
        self.actor_grads = U.flatgrad(self.actor_loss, self.actor.trainable_vars,
                                      clip_norm=self.clip_norm)
        self.actor_optimizer = MpiAdam(var_list=self.actor.trainable_vars,
                                       beta1=0.9, beta2=0.999, epsilon=1e-08)

    def setup_critic_optimizer(self):
        logger.info('setting up critic optimizer')
        normalized_critic_target_tf = tf.clip_by_value(
            normalize(self.critic_target, self.ret_rms),
            self.return_range[0], self.return_range[1])
        self.critic_loss = tf.reduce_mean(
            tf.square(self.normalized_critic_tf - normalized_critic_target_tf))
        if self.critic_l2_reg > 0.:
            critic_reg_vars = [var for var in self.critic.trainable_vars
                               if 'kernel' in var.name and 'output' not in var.name]
            for var in critic_reg_vars:
                logger.info('  regularizing: {}'.format(var.name))
            logger.info('  applying l2 regularization with {}'.format(self.critic_l2_reg))
            critic_reg = tc.layers.apply_regularization(
                tc.layers.l2_regularizer(self.critic_l2_reg),
                weights_list=critic_reg_vars)
            self.critic_loss += critic_reg
        critic_shapes = [var.get_shape().as_list() for var in self.critic.trainable_vars]
        critic_nb_params = sum([reduce(lambda x, y: x * y, shape) for shape in critic_shapes])
        logger.info('  critic shapes: {}'.format(critic_shapes))
        logger.info('  critic params: {}'.format(critic_nb_params))
        self.critic_grads = U.flatgrad(self.critic_loss, self.critic.trainable_vars,
                                       clip_norm=self.clip_norm)
        self.critic_optimizer = MpiAdam(var_list=self.critic.trainable_vars,
                                        beta1=0.9, beta2=0.999, epsilon=1e-08)

    def setup_popart(self):
        # See https://arxiv.org/pdf/1602.07714.pdf for details.
        self.old_std = tf.placeholder(tf.float32, shape=[1], name='old_std')
        new_std = self.ret_rms.std
        self.old_mean = tf.placeholder(tf.float32, shape=[1], name='old_mean')
        new_mean = self.ret_rms.mean

        self.renormalize_Q_outputs_op = []
        for vs in [self.critic.output_vars, self.target_critic.output_vars]:
            assert len(vs) == 2
            M, b = vs
            assert 'kernel' in M.name
            assert 'bias' in b.name
            assert M.get_shape()[-1] == 1
            assert b.get_shape()[-1] == 1
            self.renormalize_Q_outputs_op += [M.assign(M * self.old_std / new_std)]
            self.renormalize_Q_outputs_op += [b.assign((b * self.old_std + self.old_mean - new_mean) / new_std)]

    def setup_stats(self):
        ops = []
        names = []

        if self.normalize_returns:
            ops += [self.ret_rms.mean, self.ret_rms.std]
            names += ['ret_rms_mean', 'ret_rms_std']

        if self.normalize_observations:
            ops += [tf.reduce_mean(self.obs_rms.mean), tf.reduce_mean(self.obs_rms.std)]
            names += ['obs_rms_mean', 'obs_rms_std']

        ops += [tf.reduce_mean(self.critic_tf)]
        names += ['reference_Q_mean']
        ops += [reduce_std(self.critic_tf)]
        names += ['reference_Q_std']

        ops += [tf.reduce_mean(self.critic_with_actor_tf)]
        names += ['reference_actor_Q_mean']
        ops += [reduce_std(self.critic_with_actor_tf)]
        names += ['reference_actor_Q_std']

        ops += [tf.reduce_mean(self.actor_tf)]
        names += ['reference_action_mean']
        ops += [reduce_std(self.actor_tf)]
        names += ['reference_action_std']

        if self.param_noise:
            ops += [tf.reduce_mean(self.perturbed_actor_tf)]
            names += ['reference_perturbed_action_mean']
            ops += [reduce_std(self.perturbed_actor_tf)]
            names += ['reference_perturbed_action_std']

        self.stats_ops = ops
        self.stats_names = names

    def pi(self, obs, apply_noise=True, compute_Q=True):
        if self.param_noise is not None and apply_noise:
            actor_tf = self.perturbed_actor_tf
        else:
            actor_tf = self.actor_tf
        feed_dict = {self.obs0: [obs]}
        if compute_Q:
            action, q = self.sess.run([actor_tf, self.critic_with_actor_tf], feed_dict=feed_dict)
        else:
            action = self.sess.run(actor_tf, feed_dict=feed_dict)
            q = None
        action = action.flatten()
        if self.action_noise is not None and apply_noise:
            noise = self.action_noise()
            assert noise.shape == action.shape
            action += noise
        action = np.clip(action, self.action_range[0], self.action_range[1])
        return action, q

    def store_transition(self, obs0, action, reward, obs1, terminal1):
        reward *= self.reward_scale
        self.memory.append(obs0, action, reward, obs1, terminal1)
        if self.normalize_observations:
            self.obs_rms.update(np.array([obs0]))

    def train(self):
        # Get a batch.
        batch = self.memory.sample(batch_size=self.batch_size)

        if self.normalize_returns and self.enable_popart:
            old_mean, old_std, target_Q = self.sess.run(
                [self.ret_rms.mean, self.ret_rms.std, self.target_Q],
                feed_dict={
                    self.obs1: batch['obs1'],
                    self.rewards: batch['rewards'],
                    # self.terminals1: batch['terminals1'].astype('float32'),
                })
            self.ret_rms.update(target_Q.flatten())
            self.sess.run(self.renormalize_Q_outputs_op, feed_dict={
                self.old_std: np.array([old_std]),
                self.old_mean: np.array([old_mean]),
            })

            # Run sanity check. Disabled by default since it slows down things considerably.
            # print('running sanity check')
            # target_Q_new, new_mean, new_std = self.sess.run([self.target_Q, self.ret_rms.mean, self.ret_rms.std], feed_dict={
            #     self.obs1: batch['obs1'],
            #     self.rewards: batch['rewards'],
            #     self.terminals1: batch['terminals1'].astype('float32'),
            # })
            # print(target_Q_new, target_Q, new_mean, new_std)
            # assert (np.abs(target_Q - target_Q_new) < 1e-3).all()
        else:
            target_Q = self.sess.run(
                self.target_Q,
                feed_dict={
                    self.obs1: batch['obs1'],
                    self.rewards: batch['rewards'],
                    # self.terminals1: batch['terminals1'].astype('float32'),
                })

        # Get all gradients and perform a synced update.
        ops = [self.actor_grads, self.actor_loss, self.critic_grads, self.critic_loss]
        actor_grads, actor_loss, critic_grads, critic_loss = self.sess.run(
            ops,
            feed_dict={
                self.obs0: batch['obs0'],
                self.actions: batch['actions'],
                self.critic_target: target_Q,
            })
        self.actor_optimizer.update(actor_grads, stepsize=self.actor_lr)
        self.critic_optimizer.update(critic_grads, stepsize=self.critic_lr)

        return critic_loss, actor_loss

    def initialize(self, sess):
        self.sess = sess
        self.sess.run(tf.global_variables_initializer())
        self.actor_optimizer.sync()
        self.critic_optimizer.sync()
        self.sess.run(self.target_init_updates)

    def update_target_net(self):
        self.sess.run(self.target_soft_updates)

    def get_stats(self):
        if self.stats_sample is None:
            # Get a sample and keep that fixed for all further computations.
            # This allows us to estimate the change in value for the same set of inputs.
            self.stats_sample = self.memory.sample(batch_size=self.batch_size)
        values = self.sess.run(self.stats_ops, feed_dict={
            self.obs0: self.stats_sample['obs0'],
            self.actions: self.stats_sample['actions'],
        })

        names = self.stats_names[:]
        assert len(names) == len(values)
        stats = dict(zip(names, values))

        if self.param_noise is not None:
            stats = {**stats, **self.param_noise.get_stats()}

        return stats

    def adapt_param_noise(self):
        if self.param_noise is None:
            return 0.

        # Perturb a separate copy of the policy to adjust the scale for the next "real" perturbation.
        batch = self.memory.sample(batch_size=self.batch_size)
        self.sess.run(self.perturb_adaptive_policy_ops, feed_dict={
            self.param_noise_stddev: self.param_noise.current_stddev,
        })
        distance = self.sess.run(self.adaptive_policy_distance, feed_dict={
            self.obs0: batch['obs0'],
            self.param_noise_stddev: self.param_noise.current_stddev,
        })

        mean_distance = mpi_mean(distance)
        self.param_noise.adapt(mean_distance)
        return mean_distance

    def reset(self):
        # Reset internal state after an episode is complete.
        if self.action_noise is not None:
            self.action_noise.reset()
        if self.param_noise is not None:
            self.sess.run(self.perturb_policy_ops, feed_dict={
                self.param_noise_stddev: self.param_noise.current_stddev,
            })
        self.flush()

    def flush(self):
        if self.her:
            self.memory.flush()

    def get_save_tf(self):
        all_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
        return self.sess.run(all_variables)

    def restore_tf(self, save):
        all_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
        restore_ops = []
        for x, y in zip(all_variables, save):
            restore_ops.append(tf.assign(x, y))
        self.sess.run(restore_ops)

    def __getstate__(self):
        # Pickle the constructor arguments (minus the env) plus the current
        # values of all trainable variables.
        exclude_vars = set(["env"])
        args = {}
        for k in self.init_args:
            if k not in exclude_vars:
                args[k] = self.init_args[k]
        return {'tf': self.get_save_tf(), 'init': args}

    def __setstate__(self, state):
        self.__init__(**state['init'])
        self.sess = tf.InteractiveSession()  # for now just make ourself a session
        self.sess.run(tf.global_variables_initializer())
        self.restore_tf(state['tf'])
        self.actor_optimizer.sync()
        self.critic_optimizer.sync()
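# Round-trip sketch (`params` stands in for the full keyword set this class
# expects via **params, e.g. her, buffer_size, action_shape, ...): because
# __getstate__/__setstate__ capture the init args and the trainable variables,
# a trained agent can be checkpointed with plain pickle.
#
#   import pickle
#
#   agent = DDPG(**params)
#   ...  # train
#   with open('agent.pkl', 'wb') as f:
#       pickle.dump(agent, f)
#   with open('agent.pkl', 'rb') as f:
#       agent2 = pickle.load(f)  # rebuilds the graph and restores weights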