def get_target_updates(vars, target_vars, tau):
    logger.info('setting up target updates ...')
    soft_updates = []
    init_updates = []
    assert len(vars) == len(target_vars)
    for var, target_var in zip(vars, target_vars):
        logger.info('  {} <- {}'.format(target_var.name, var.name))
        init_updates.append(tf.assign(target_var, var))
        soft_updates.append(tf.assign(target_var, (1. - tau) * target_var + tau * var))
    assert len(init_updates) == len(vars)
    assert len(soft_updates) == len(vars)
    return tf.group(*init_updates), tf.group(*soft_updates)
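# Illustrative usage sketch (not part of the module): `actor` and `target_actor`
# are assumed to expose matching `.vars` lists, as in the DDPG setup below.
#
#   init_updates, soft_updates = get_target_updates(actor.vars, target_actor.vars, tau=0.01)
#   sess.run(init_updates)  # hard copy: target <- source, once at initialization
#   sess.run(soft_updates)  # Polyak averaging: target <- (1 - tau) * target + tau * source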
def get_perturbed_actor_updates(actor, perturbed_actor, param_noise_stddev):
    assert len(actor.vars) == len(perturbed_actor.vars)
    assert len(actor.perturbable_vars) == len(perturbed_actor.perturbable_vars)
    updates = []
    for var, perturbed_var in zip(actor.vars, perturbed_actor.vars):
        if var in actor.perturbable_vars:
            logger.info('  {} <- {} + noise'.format(perturbed_var.name, var.name))
            updates.append(tf.assign(perturbed_var,
                                     var + tf.random_normal(tf.shape(var), mean=0., stddev=param_noise_stddev)))
        else:
            logger.info('  {} <- {}'.format(perturbed_var.name, var.name))
            updates.append(tf.assign(perturbed_var, var))
    assert len(updates) == len(actor.vars)
    return tf.group(*updates)
def setup_param_noise(self, normalized_obs0):
    assert self.param_noise is not None

    # Configure perturbed actor.
    param_noise_actor = copy(self.actor)
    param_noise_actor.name = 'param_noise_actor'
    self.perturbed_actor_tf = param_noise_actor(normalized_obs0)
    logger.info('setting up param noise')
    self.perturb_policy_ops = get_perturbed_actor_updates(self.actor, param_noise_actor, self.param_noise_stddev)

    # Configure a separate copy for stddev adaptation.
    adaptive_param_noise_actor = copy(self.actor)
    adaptive_param_noise_actor.name = 'adaptive_param_noise_actor'
    adaptive_actor_tf = adaptive_param_noise_actor(normalized_obs0)
    self.perturb_adaptive_policy_ops = get_perturbed_actor_updates(self.actor, adaptive_param_noise_actor, self.param_noise_stddev)
    self.adaptive_policy_distance = tf.sqrt(tf.reduce_mean(tf.square(self.actor_tf - adaptive_actor_tf)))
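# Illustrative sketch of how `adaptive_policy_distance` is typically consumed
# (assuming the baselines-style AdaptiveParamNoiseSpec and an `obs0` placeholder,
# as in the `learn` function below):
#
#   feed = {self.obs0: batch_obs, self.param_noise_stddev: self.param_noise.current_stddev}
#   self.sess.run(self.perturb_adaptive_policy_ops, feed_dict=feed)
#   distance = self.sess.run(self.adaptive_policy_distance, feed_dict=feed)
#   self.param_noise.adapt(distance)  # nudges stddev toward desired_action_stddev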
def setup_critic_optimizer(self):
    logger.info('setting up critic optimizer')
    normalized_critic_target_tf = tf.clip_by_value(normalize(self.critic_target, self.ret_rms),
                                                   self.return_range[0], self.return_range[1])
    self.critic_loss = tf.reduce_mean(tf.square(self.normalized_critic_tf - normalized_critic_target_tf))
    if self.critic_l2_reg > 0.:
        critic_reg_vars = [var for var in self.critic.trainable_vars
                           if 'kernel' in var.name and 'output' not in var.name]
        for var in critic_reg_vars:
            logger.info('  regularizing: {}'.format(var.name))
        logger.info('  applying l2 regularization with {}'.format(self.critic_l2_reg))
        # Equivalent to tc.layers.apply_regularization(tc.layers.l2_regularizer(...))
        # without depending on tf.contrib; tf.nn.l2_loss(v) == sum(v ** 2) / 2.
        critic_reg = self.critic_l2_reg * tf.add_n([tf.nn.l2_loss(var) for var in critic_reg_vars])
        self.critic_loss += critic_reg
    critic_shapes = [var.get_shape().as_list() for var in self.critic.trainable_vars]
    critic_nb_params = sum([reduce(lambda x, y: x * y, shape) for shape in critic_shapes])  # functools.reduce in Python 3
    logger.info('  critic shapes: {}'.format(critic_shapes))
    logger.info('  critic params: {}'.format(critic_nb_params))
    self.critic_grads = U.flatgrad(self.critic_loss, self.critic.trainable_vars, clip_norm=self.clip_norm)
    self.critic_optimizer = MpiAdam(var_list=self.critic.trainable_vars,
                                    beta1=0.9, beta2=0.999, epsilon=1e-08)
def setup_actor_optimizer(self):
    logger.info('setting up actor optimizer')
    self.actor_loss = -tf.reduce_mean(self.critic_with_actor_tf)
    actor_shapes = [var.get_shape().as_list() for var in self.actor.trainable_vars]
    actor_nb_params = sum([reduce(lambda x, y: x * y, shape) for shape in actor_shapes])
    logger.info('  actor shapes: {}'.format(actor_shapes))
    logger.info('  actor params: {}'.format(actor_nb_params))
    self.actor_grads = U.flatgrad(self.actor_loss, self.actor.trainable_vars, clip_norm=self.clip_norm)
    self.actor_optimizer = MpiAdam(var_list=self.actor.trainable_vars,
                                   beta1=0.9, beta2=0.999, epsilon=1e-08)
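# Illustrative train-step sketch pairing flatgrad with MpiAdam (assuming the
# baselines MpiAdam API, where update(flat_grads, stepsize) applies an
# MPI-averaged Adam step):
#
#   actor_grads, actor_loss = self.sess.run([self.actor_grads, self.actor_loss], feed_dict=feed)
#   self.actor_optimizer.update(actor_grads, stepsize=self.actor_lr)
#   critic_grads, critic_loss = self.sess.run([self.critic_grads, self.critic_loss], feed_dict=feed)
#   self.critic_optimizer.update(critic_grads, stepsize=self.critic_lr)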
def learn(controller,
          env,
          nb_epochs=5,  # with the defaults, at most 5 * 150 * 400 = 300k env steps
          nb_epoch_cycles=150,
          nb_rollout_steps=400,
          data_path_reward="",
          data_path_steps="",
          data_path_states="",
          data_path_times=""):
    epochs_rewards = np.zeros((nb_epochs, nb_epoch_cycles), dtype=np.float32)
    epochs_times = np.zeros((nb_epochs, nb_epoch_cycles), dtype=np.float32)
    epochs_steps = np.zeros((nb_epochs, nb_epoch_cycles), dtype=np.float32)
    epochs_states = []
    for epoch in range(nb_epochs):
        logger.info("======================== epoch {} started ========================".format(epoch))
        epoch_episode_rewards = []
        epoch_episode_steps = []
        epoch_episode_times = []
        epoch_episode_states = []
        for i in range(nb_epoch_cycles):
            start_time = time.time()
            s = env.reset()
            episode_reward = 0.
            episode_step = 0
            episode_states = []
            for j in range(nb_rollout_steps):
                print('state', s[:6])
                a = 0  # placeholder action; this variant only records zero-action rollouts
                s_, us, r, done, safe = env.step([(0, 0, 0, 0, 0, 0), ""])
                episode_reward += r
                episode_step += 1
                episode_states.append([cp.deepcopy(s), cp.deepcopy(a),
                                       np.array(cp.deepcopy(r)), cp.deepcopy(s_)])
                if done or j == nb_rollout_steps - 1 or safe is False:
                    print('Ep: %i | %s | %s | step: %i' % (i, '---' if not done else 'done',
                                                           'unsafe' if not safe else 'safe', j))
                    break
                s = s_

            # Store data.
            duration = time.time() - start_time
            epoch_episode_rewards.append(episode_reward)
            epoch_episode_steps.append(episode_step)
            epoch_episode_times.append(cp.deepcopy(duration))
            epoch_episode_states.append(cp.deepcopy(episode_states))
            epochs_rewards[epoch, i] = episode_reward
            epochs_steps[epoch, i] = episode_step
            epochs_times[epoch, i] = cp.deepcopy(duration)
        epochs_states.append(cp.deepcopy(epoch_episode_states))

        # Save data.
        np.save(data_path_reward, epochs_rewards)
        np.save(data_path_steps, epochs_steps)
        np.save(data_path_states, epochs_states)
        np.save(data_path_times, epochs_times)
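# Illustrative invocation (a sketch; `env` is assumed to implement the custom
# interface used above, where step takes an (action_tuple, tag) pair and
# returns (s_, us, r, done, safe)):
#
#   learn(controller=None, env=env,
#         data_path_reward='rewards.npy', data_path_steps='steps.npy',
#         data_path_states='states.npy', data_path_times='times.npy')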
def learn(network,
          env,
          data_path_reward="",
          data_path_steps="",
          data_path_states="",
          data_path_times="",
          model_path="",
          model_name="",
          restore=False,
          seed=None,
          nb_epochs=5,  # with the defaults, at most 5 * 150 * 400 = 300k env steps
          nb_epoch_cycles=150,
          nb_rollout_steps=400,
          reward_scale=1.0,
          noise_type='normal_0.2',  # alternatives: 'adaptive-param_0.2', 'ou_0.2'
          normalize_returns=False,
          normalize_observations=True,
          critic_l2_reg=1e-2,
          actor_lr=1e-4,
          critic_lr=1e-3,
          popart=False,
          gamma=0.99,
          clip_norm=None,
          nb_train_steps=50,  # per epoch cycle and MPI worker
          batch_size=32,  # per MPI worker
          tau=0.01,
          param_noise_adaption_interval=50,
          **network_kwargs):
    nb_actions = env.action_space.shape[0]
    memory = Memory(limit=int(1e5),
                    action_shape=env.action_space.shape[0],
                    observation_shape=env.observation_space.shape)
    critic = Critic(network=network, **network_kwargs)
    actor = Actor(nb_actions, network=network, **network_kwargs)

    # Set up exploration noise.
    action_noise = None
    param_noise = None
    if noise_type is not None:
        for current_noise_type in noise_type.split(','):
            current_noise_type = current_noise_type.strip()
            if current_noise_type == 'none':
                pass
            elif 'adaptive-param' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev),
                                                     desired_action_stddev=float(stddev))
            elif 'normal' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                action_noise = NormalActionNoise(mu=np.zeros(nb_actions),
                                                 sigma=float(stddev) * np.ones(nb_actions))
            elif 'ou' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions),
                                                            sigma=float(stddev) * np.ones(nb_actions))
            else:
                raise RuntimeError('unknown noise type "{}"'.format(current_noise_type))
    # The rollout loop below needs a numeric stddev; this relies on noise_type
    # naming one (e.g. 'normal_0.2'), which the default guarantees.
    stddev = float(stddev)

    # Action scaling.
    max_action = env.action_high_bound
    logger.info('scaling actions by {} before executing in env'.format(max_action))

    # DDPG agent.
    agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape[0],
                 gamma=gamma, tau=tau, normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations, batch_size=batch_size,
                 action_noise=action_noise, param_noise=param_noise, critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart,
                 clip_norm=clip_norm, reward_scale=reward_scale)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    sess = U.get_session()
    if restore:
        agent.restore(sess, model_path, model_name)
    else:
        agent.initialize(sess)
    sess.graph.finalize()
    agent.reset()

    episodes = 0
    epochs_rewards = np.zeros((nb_epochs, nb_epoch_cycles), dtype=np.float32)
    epochs_times = np.zeros((nb_epochs, nb_epoch_cycles), dtype=np.float32)
    epochs_steps = np.zeros((nb_epochs, nb_epoch_cycles), dtype=np.float32)
    epochs_states = []
    for epoch in range(nb_epochs):
        logger.info("======================== epoch {} started ========================".format(epoch))
        epoch_episode_rewards = []
        epoch_episode_steps = []
        epoch_episode_times = []
        epoch_actions = []
        epoch_episode_states = []
        epoch_qs = []
        epoch_episodes = 0
        for cycle in range(nb_epoch_cycles):
            start_time = time.time()
            obs, state, done = env.reset()
            episode_reward = 0.
            episode_step = 0
            episode_states = []
            logger.info("================== episode {} started ==================".format(cycle))
            for t_rollout in range(nb_rollout_steps):
===================".format(t_rollout)) """ choose next action """ action, q, _, _ = agent.step(obs, stddev, apply_noise=True, compute_Q=True) new_obs, next_state, r, done, safe_or_not = env.step( max_action * action) """ normalize state """ print("\nReward", r) if safe_or_not is False: break episode_reward += r episode_step += 1 episode_states.append([ cp.deepcopy(state), cp.deepcopy(action), np.array(cp.deepcopy(r)), cp.deepcopy(next_state) ]) epoch_actions.append(action) epoch_qs.append(q) agent.store_transition(obs, action, r, new_obs, done) obs = new_obs state = next_state if done: break """ noise decay """ stddev = float(stddev) * 0.95 """ store data """ duration = time.time() - start_time epoch_episode_rewards.append(episode_reward) epoch_episode_steps.append(episode_step) epoch_episode_times.append(cp.deepcopy(duration)) epoch_episode_states.append(cp.deepcopy(episode_states)) epochs_rewards[epoch, cycle] = episode_reward epochs_steps[epoch, cycle] = episode_step epochs_times[epoch, cycle] = cp.deepcopy(duration) logger.info( "============================= The Episode_Reward:: {}!!! ============================" .format(epoch_episode_rewards)) logger.info( "============================= The Episode_Times:: {}!!! ============================" .format(epoch_episode_times)) epoch_episodes += 1 episodes += 1 """ Training process """ epoch_actor_losses = [] epoch_critic_losses = [] epoch_adaptive_distances = [] for t_train in range(nb_train_steps): logger.info("") # Adapt param noise, if necessary. if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0: distance = agent.adapt_param_noise() epoch_adaptive_distances.append(distance) cl, al = agent.train() epoch_critic_losses.append(cl) epoch_actor_losses.append(al) agent.update_target_net() epochs_states.append(cp.deepcopy(epoch_episode_states)) # # save data np.save(data_path_reward, epochs_rewards) np.save(data_path_steps, epochs_steps) np.save(data_path_states, epochs_states) np.save(data_path_times, epochs_times) # np.save(data_path + 'train_reward_' + "DDPG" + '_' + file_name + "_" + noise_type, epochs_rewards) # np.save(data_path + 'train_step_' + "DDPG" + '_' + file_name + "_" + noise_type, epochs_steps) # np.save(data_path + 'train_states_' + "DDPG" + '_' + file_name + "_" + noise_type, epochs_states) # np.save(data_path + 'train_times_' + "DDPG" + '_' + file_name + "_" + noise_type, epochs_times) # # agent save agent.store(model_path + model_name)