def main():
    env = gym.make(args.env)
    env = ActionMappingWrapper(env)

    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]
    obs_dim += 1  # add 1 to obs dim for time step feature

    scaler = Scaler(obs_dim)

    model = MujocoModel(obs_dim, act_dim)
    alg = parl.algorithms.PPO(
        model,
        act_dim=act_dim,
        policy_lr=model.policy_lr,
        value_lr=model.value_lr)
    agent = MujocoAgent(
        alg, obs_dim, act_dim, args.kl_targ, loss_type=args.loss_type)

    # run a few episodes to initialize scaler
    collect_trajectories(env, agent, scaler, episodes=5)

    test_flag = 0
    total_steps = 0
    while total_steps < args.train_total_steps:
        trajectories = collect_trajectories(
            env, agent, scaler, episodes=args.episodes_per_batch)
        total_steps += sum([t['obs'].shape[0] for t in trajectories])
        total_train_rewards = sum(
            [np.sum(t['rewards']) for t in trajectories])

        train_obs, train_actions, train_advantages, train_discount_sum_rewards = build_train_data(
            trajectories, agent)

        policy_loss, kl = agent.policy_learn(train_obs, train_actions,
                                             train_advantages)
        value_loss = agent.value_learn(train_obs, train_discount_sum_rewards)

        logger.info(
            'Steps {}, Train reward: {}, Policy loss: {}, KL: {}, Value loss: {}'
            .format(total_steps,
                    total_train_rewards / args.episodes_per_batch,
                    policy_loss, kl, value_loss))

        if total_steps // args.test_every_steps >= test_flag:
            while total_steps // args.test_every_steps >= test_flag:
                test_flag += 1
            eval_reward = run_evaluate_episode(env, agent, scaler)
            logger.info('Steps {}, Evaluate reward: {}'.format(
                total_steps, eval_reward))
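# The PPO script above relies on a `Scaler` that is warmed up for a few
# episodes before training and then used to normalize observations. The class
# below is a minimal sketch of such a running mean/std tracker (Welford's
# algorithm); the name `RunningScaler` and its exact interface are
# illustrative assumptions, not the Scaler implementation used here.
import numpy as np


class RunningScaler(object):
    """Tracks a running mean and standard deviation of observations."""

    def __init__(self, obs_dim):
        self.n = 0
        self.mean = np.zeros(obs_dim)
        self.m2 = np.zeros(obs_dim)  # running sum of squared deviations

    def update(self, batch_obs):
        """batch_obs: array of shape (num_samples, obs_dim)."""
        for x in batch_obs:
            self.n += 1
            delta = x - self.mean
            self.mean += delta / self.n
            self.m2 += delta * (x - self.mean)

    def stats(self):
        """Return (mean, std); normalize with (obs - mean) / (std + 1e-8)."""
        var = self.m2 / max(self.n - 1, 1)
        return self.mean, np.sqrt(var)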
def main():
    env = gym.make(args.env)
    env.seed(ENV_SEED)
    env = ActionMappingWrapper(env)

    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    model = MujocoModel(act_dim)
    algorithm = parl.algorithms.DDPG(
        model, gamma=GAMMA, tau=TAU, actor_lr=ACTOR_LR, critic_lr=CRITIC_LR)
    agent = MujocoAgent(algorithm, obs_dim, act_dim)

    rpm = ReplayMemory(MEMORY_SIZE, obs_dim, act_dim)

    # warm up the replay memory before training
    while rpm.size() < MEMORY_WARMUP_SIZE:
        run_train_episode(env, agent, rpm)

    episode = 0
    while episode < args.train_total_episode:
        for i in range(50):
            train_reward = run_train_episode(env, agent, rpm)
            episode += 1
            logger.info('Episode: {} Reward: {}'.format(
                episode, train_reward))

        evaluate_reward = run_evaluate_episode(env, agent)
        logger.info('Episode {}, Evaluate reward: {}'.format(
            episode, evaluate_reward))
def main():
    env = gym.make(args.env)
    env.seed(ENV_SEED)

    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    model = MujocoModel(act_dim, max_action)
    algorithm = parl.algorithms.TD3(
        model,
        max_action=max_action,
        gamma=GAMMA,
        tau=TAU,
        actor_lr=ACTOR_LR,
        critic_lr=CRITIC_LR)
    agent = MujocoAgent(algorithm, obs_dim, act_dim)

    rpm = ReplayMemory(MEMORY_SIZE, obs_dim, act_dim)

    test_flag = 0
    total_steps = 0
    while total_steps < args.train_total_steps:
        train_reward, steps = run_train_episode(env, agent, rpm)
        total_steps += steps
        logger.info('Steps: {} Reward: {}'.format(total_steps, train_reward))
        summary.add_scalar('train/episode_reward', train_reward, total_steps)

        if total_steps // args.test_every_steps >= test_flag:
            while total_steps // args.test_every_steps >= test_flag:
                test_flag += 1
            evaluate_reward = run_evaluate_episode(env, agent)
            logger.info('Steps {}, Evaluate reward: {}'.format(
                total_steps, evaluate_reward))
            summary.add_scalar('eval/episode_reward', evaluate_reward,
                               total_steps)
def main():
    env = gym.make(args.env)
    env.seed(args.seed)

    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    model = MujocoModel(obs_dim, act_dim, max_action)
    algorithm = ADER(
        model,
        max_action=max_action,
        gamma=GAMMA,
        tau=TAU,
        actor_lr=ACTOR_LR,
        critic_lr=CRITIC_LR,
        kappa=args.kappa,
        epoch=args.epoch,
        alpha=args.alpha)
    agent = MujocoAgent(algorithm, obs_dim, act_dim)

    rpm = ReplayMemory(MEMORY_SIZE, obs_dim, act_dim)

    test_flag = 0
    total_steps = 0
    while total_steps < args.train_total_steps:
        train_reward, steps = run_train_episode(env, agent, rpm)
        total_steps += steps
        logger.info('Steps: {} Reward: {}'.format(total_steps, train_reward))

        if total_steps // args.test_every_steps >= test_flag:
            while total_steps // args.test_every_steps >= test_flag:
                test_flag += 1
            evaluate_reward, evaluate_fall_rate, total_steps_list = run_evaluate_episode(
                env, agent)
            mean_steps = np.mean(total_steps_list)
            logger.info('Steps {}, Evaluate reward: {}, Fall rate: {}'.format(
                total_steps, evaluate_reward, evaluate_fall_rate))
            logger.info(
                'Steps {}, Mean episode steps: {}, Steps list: {}'.format(
                    total_steps, mean_steps, total_steps_list))
            res = {
                'eval_step': mean_steps,
                'fall_rate': evaluate_fall_rate,
                'Step': total_steps,
                'Value': evaluate_reward
            }
            csv_logger.log_dict(res)
class Learner(object):
    def __init__(self, config):
        self.config = config

        env = gym.make(self.config['env_name'])
        self.config['obs_dim'] = env.observation_space.shape[0]
        self.config['act_dim'] = env.action_space.shape[0]

        self.obs_filter = MeanStdFilter(self.config['obs_dim'])
        self.noise = SharedNoiseTable(self.config['noise_size'])

        model = MujocoModel(self.config['act_dim'])
        algorithm = ES(model)
        self.agent = MujocoAgent(algorithm, self.config)

        self.latest_flat_weights = self.agent.get_flat_weights()
        self.latest_obs_filter = self.obs_filter.as_serializable()

        self.sample_total_episodes = 0
        self.sample_total_steps = 0

        self.actors_signal_input_queues = []
        self.actors_output_queues = []

        self.create_actors()

        self.eval_rewards_stat = WindowStat(self.config['report_window_size'])
        self.eval_lengths_stat = WindowStat(self.config['report_window_size'])

    def create_actors(self):
        """Create actors for parallel training."""
        parl.connect(self.config['master_address'])
        self.remote_count = 0

        for i in range(self.config['actor_num']):
            signal_queue = queue.Queue()
            output_queue = queue.Queue()
            self.actors_signal_input_queues.append(signal_queue)
            self.actors_output_queues.append(output_queue)

            self.remote_count += 1

            remote_thread = threading.Thread(
                target=self.run_remote_sample,
                args=(signal_queue, output_queue))
            remote_thread.setDaemon(True)
            remote_thread.start()

        logger.info('All remote actors are ready, begin to learn.')

    def run_remote_sample(self, signal_queue, output_queue):
        """Sample data from a remote actor, or exchange observation filters with it."""
        remote_actor = Actor(self.config)
        while True:
            info = signal_queue.get()
            if info['signal'] == 'sample':
                result = remote_actor.sample(self.latest_flat_weights)
                output_queue.put(result)
            elif info['signal'] == 'get_filter':
                actor_filter = remote_actor.get_filter(flush_after=True)
                output_queue.put(actor_filter)
            elif info['signal'] == 'set_filter':
                remote_actor.set_filter(self.latest_obs_filter)
            else:
                raise NotImplementedError

    def step(self):
        """Run one step of ES.

        1. Kick off all actors to synchronize weights and sample data.
        2. Update the parameters of the model based on the sampled data.
        3. Update the global observation filter based on the local filters
           of all actors, and synchronize the global filter back to them.
        """
        num_episodes, num_timesteps = 0, 0
        results = []

        while num_episodes < self.config['min_episodes_per_batch'] or \
                num_timesteps < self.config['min_steps_per_batch']:
            # Send sample signal to all actors
            for q in self.actors_signal_input_queues:
                q.put({'signal': 'sample'})

            # Collect results from all actors
            for q in self.actors_output_queues:
                result = q.get()
                results.append(result)
                # result['noisy_lengths'] is a list of lists, where the
                # inner lists have length 2.
                num_episodes += sum(
                    len(pair) for pair in result['noisy_lengths'])
                num_timesteps += sum(
                    sum(pair) for pair in result['noisy_lengths'])

        all_noise_indices = []
        all_training_rewards = []
        all_training_lengths = []
        all_eval_rewards = []
        all_eval_lengths = []

        for result in results:
            all_eval_rewards.extend(result['eval_rewards'])
            all_eval_lengths.extend(result['eval_lengths'])

            all_noise_indices.extend(result['noise_indices'])
            all_training_rewards.extend(result['noisy_rewards'])
            all_training_lengths.extend(result['noisy_lengths'])

        assert len(all_eval_rewards) == len(all_eval_lengths)
        assert (len(all_noise_indices) == len(all_training_rewards) ==
                len(all_training_lengths))

        self.sample_total_episodes += num_episodes
        self.sample_total_steps += num_timesteps

        eval_rewards = np.array(all_eval_rewards)
        eval_lengths = np.array(all_eval_lengths)
        noise_indices = np.array(all_noise_indices)
        noisy_rewards = np.array(all_training_rewards)
        noisy_lengths = np.array(all_training_lengths)

        # normalize rewards to (-0.5, 0.5)
        proc_noisy_rewards = utils.compute_centered_ranks(noisy_rewards)

        noises = [
            self.noise.get(index, self.agent.weights_total_size)
            for index in noise_indices
        ]

        # Update the parameters of the model.
        self.agent.learn(proc_noisy_rewards, noises)
        self.latest_flat_weights = self.agent.get_flat_weights()

        # Update obs filter
        self._update_filter()

        # Store the evaluate rewards
        if len(all_eval_rewards) > 0:
            self.eval_rewards_stat.add(np.mean(eval_rewards))
            self.eval_lengths_stat.add(np.mean(eval_lengths))

        metrics = {
            'episodes_this_iter': noisy_lengths.size,
            'sample_total_episodes': self.sample_total_episodes,
            'sample_total_steps': self.sample_total_steps,
            'evaluate_rewards_mean': self.eval_rewards_stat.mean,
            'evaluate_steps_mean': self.eval_lengths_stat.mean,
            'timesteps_this_iter': noisy_lengths.sum(),
        }
        self.log_metrics(metrics)
        return metrics

    def _update_filter(self):
        # Send get_filter signal to all actors
        for q in self.actors_signal_input_queues:
            q.put({'signal': 'get_filter'})

        # Collect filters from all actors and update the global filter
        for q in self.actors_output_queues:
            actor_filter = q.get()
            self.obs_filter.apply_changes(actor_filter)

        # Send set_filter signal to all actors
        self.latest_obs_filter = self.obs_filter.as_serializable()
        for q in self.actors_signal_input_queues:
            q.put({'signal': 'set_filter'})

    def log_metrics(self, metrics):
        logger.info(metrics)
        for k, v in metrics.items():
            if v is not None:
                summary.add_scalar(k, v, self.sample_total_steps)
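# `utils.compute_centered_ranks` above maps the raw episode returns to values
# in (-0.5, 0.5), so the ES update is invariant to the scale of the rewards.
# The snippet below is a minimal sketch of that rank transformation, following
# the OpenAI ES recipe; it is illustrative and may differ in detail from the
# utils module used in this repo.
import numpy as np


def compute_ranks(x):
    """Return ranks in [0, len(x) - 1], i.e. the inverse permutation of argsort."""
    ranks = np.empty(len(x), dtype=int)
    ranks[x.argsort()] = np.arange(len(x))
    return ranks


def compute_centered_ranks(x):
    """Rank-transform x (any shape) and rescale the ranks to (-0.5, 0.5)."""
    y = compute_ranks(x.ravel()).reshape(x.shape).astype(np.float32)
    y /= (x.size - 1)
    return y - 0.5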
class Actor(object):
    def __init__(self, config):
        self.config = config

        self.env = gym.make(self.config['env_name'])
        self.config['obs_dim'] = self.env.observation_space.shape[0]
        self.config['act_dim'] = self.env.action_space.shape[0]

        self.obs_filter = MeanStdFilter(self.config['obs_dim'])
        self.noise = SharedNoiseTable(self.config['noise_size'])

        model = MujocoModel(self.config['act_dim'])
        algorithm = ES(model)
        self.agent = MujocoAgent(algorithm, self.config)

    def _play_one_episode(self, add_noise=False):
        episode_reward = 0
        episode_step = 0
        obs = self.env.reset()
        while True:
            if np.random.uniform() < self.config['filter_update_prob']:
                obs = self.obs_filter(obs[None], update=True)
            else:
                obs = self.obs_filter(obs[None], update=False)
            action = self.agent.predict(obs)
            if add_noise:
                action += np.random.randn(
                    *action.shape) * self.config['action_noise_std']
            obs, reward, done, _ = self.env.step(action)
            episode_reward += reward
            episode_step += 1
            if done:
                break
        return episode_reward, episode_step

    def sample(self, flat_weights):
        noise_indices, rewards, lengths = [], [], []
        eval_rewards, eval_lengths = [], []

        # Perform some rollouts with noise.
        task_tstart = time.time()
        while (len(noise_indices) == 0 or
               time.time() - task_tstart < self.config['min_task_runtime']):
            if np.random.uniform() < self.config['eval_prob']:
                # Do an evaluation run with no perturbation.
                self.agent.set_flat_weights(flat_weights)
                episode_reward, episode_step = self._play_one_episode(
                    add_noise=False)
                eval_rewards.append(episode_reward)
                eval_lengths.append(episode_step)
            else:
                # Do a regular run with parameter perturbations.
                noise_index = self.noise.sample_index(
                    self.agent.weights_total_size)

                perturbation = self.config['noise_stdev'] * self.noise.get(
                    noise_index, self.agent.weights_total_size)

                # Mirrored sampling: evaluate the pair of perturbations
                # +epsilon and -epsilon.
                self.agent.set_flat_weights(flat_weights + perturbation)
                episode_reward_pos, episode_step_pos = self._play_one_episode(
                    add_noise=True)

                self.agent.set_flat_weights(flat_weights - perturbation)
                episode_reward_neg, episode_step_neg = self._play_one_episode(
                    add_noise=True)

                noise_indices.append(noise_index)
                rewards.append([episode_reward_pos, episode_reward_neg])
                lengths.append([episode_step_pos, episode_step_neg])

        return {
            'noise_indices': noise_indices,
            'noisy_rewards': rewards,
            'noisy_lengths': lengths,
            'eval_rewards': eval_rewards,
            'eval_lengths': eval_lengths
        }

    def get_filter(self, flush_after=False):
        return_filter = self.obs_filter.as_serializable()
        if flush_after:
            self.obs_filter.clear_buffer()
        return return_filter

    def set_filter(self, new_filter):
        self.obs_filter.sync(new_filter)
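# The Learner and every Actor above build a `SharedNoiseTable` of the same
# size, so a perturbation can be communicated as a single integer index
# instead of a full parameter-sized vector. Below is a minimal sketch of such
# a table; the seed value, class name, and exact API are assumptions for
# illustration and need not match the implementation used in this repo.
import numpy as np


class SimpleNoiseTable(object):
    def __init__(self, noise_size, seed=1024):
        # One large block of Gaussian noise; identical on every process
        # that constructs it with the same seed and size.
        self.noise = np.random.RandomState(seed).randn(noise_size).astype(
            np.float32)

    def get(self, index, size):
        # A perturbation is just a contiguous slice of the shared table.
        return self.noise[index:index + size]

    def sample_index(self, size):
        # Any start index that leaves room for a full slice is valid.
        return np.random.randint(0, len(self.noise) - size + 1)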
def main():
    args = get_args()
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    envs = make_env(args.env_name, args.seed, args.gamma)

    model = MujocoModel(envs.observation_space.shape[0],
                        envs.action_space.shape[0])
    model.to(device)

    algorithm = PPO(
        model,
        args.clip_param,
        args.value_loss_coef,
        args.entropy_coef,
        initial_lr=args.lr,
        eps=args.eps,
        max_grad_norm=args.max_grad_norm)

    agent = MujocoAgent(algorithm, device)

    rollouts = RolloutStorage(args.num_steps, envs.observation_space.shape[0],
                              envs.action_space.shape[0])

    obs = envs.reset()
    rollouts.obs[0] = np.copy(obs)

    episode_rewards = deque(maxlen=10)

    num_updates = int(args.num_env_steps) // args.num_steps
    for j in range(num_updates):
        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            utils.update_linear_schedule(algorithm.optimizer, j, num_updates,
                                         args.lr)

        for step in range(args.num_steps):
            # Sample actions conditioned on the latest observation stored in rollouts
            with torch.no_grad():
                value, action, action_log_prob = agent.sample(
                    rollouts.obs[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            rollouts.append(obs, action, action_log_prob, value, reward,
                            masks, bad_masks)

        with torch.no_grad():
            next_value = agent.value(rollouts.obs[-1])

        value_loss, action_loss, dist_entropy = agent.learn(
            next_value, args.gamma, args.gae_lambda, args.ppo_epoch,
            args.num_mini_batch, rollouts)

        rollouts.after_update()

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args.num_steps
            print(
                "Updates {}, num timesteps {},\n Last {} training episodes: "
                "mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n"
                .format(j, total_num_steps, len(episode_rewards),
                        np.mean(episode_rewards), np.median(episode_rewards),
                        np.min(episode_rewards), np.max(episode_rewards)))

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            ob_rms = utils.get_vec_normalize(envs).ob_rms
            eval_mean_reward = evaluate(agent, ob_rms, args.env_name,
                                        args.seed, device)
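# `agent.learn` above consumes args.gamma and args.gae_lambda, i.e. it relies
# on Generalized Advantage Estimation to turn the stored rewards, values, and
# masks into advantages. The function below is a minimal stand-alone sketch of
# that computation using the usual GAE recursion; the argument layout is an
# assumption and does not mirror RolloutStorage exactly.
import numpy as np


def compute_gae(rewards, values, masks, next_value, gamma=0.99,
                gae_lambda=0.95):
    """rewards, values, masks: arrays of length T; next_value: V(s_T)."""
    T = len(rewards)
    advantages = np.zeros(T, dtype=np.float32)
    gae = 0.0
    for t in reversed(range(T)):
        v_next = next_value if t == T - 1 else values[t + 1]
        # TD residual: r_t + gamma * V(s_{t+1}) * mask_t - V(s_t)
        delta = rewards[t] + gamma * v_next * masks[t] - values[t]
        # Discounted, lambda-weighted sum of future TD residuals
        gae = delta + gamma * gae_lambda * masks[t] * gae
        advantages[t] = gae
    returns = advantages + values  # targets for the value function
    return advantages, returns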