def main():
    print("#######")
    print("WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards")
    print("#######")

    model_dir = Path('./models') / args.env_name / args.log_dir
    if not model_dir.exists():
        curr_run = 'run1'
    else:
        exst_run_nums = [int(str(folder.name).split('run')[1]) for folder in model_dir.iterdir()
                         if str(folder.name).startswith('run')]
        if len(exst_run_nums) == 0:
            curr_run = 'run1'
        else:
            curr_run = 'run%i' % (max(exst_run_nums) + 1)
    run_dir = model_dir / curr_run
    log_dir = run_dir / 'logs'
    os.makedirs(str(log_dir))
    args.log_dir = str(log_dir)
    print('saving to', args.log_dir)

    os.environ['OMP_NUM_THREADS'] = '1'

    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None

    envs = [make_env(args.env_name, args.seed, i, args.log_dir, args.add_timestep)
            for i in range(args.num_processes)]

    if args.num_processes > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)

    obs_shape = (envs.observation_space.n,)

    actor_critic = Policy(obs_shape, envs.action_space, args.dual_type, args.dual_rank, args.dual_emb_dim)

    if args.cuda:
        actor_critic.cuda()

    agent = algo.A2C_ACKTR(actor_critic=actor_critic, value_loss_coef=args.value_loss_coef,
                           entropy_coef=args.entropy_coef, dual_act_coef=args.dual_act_coef,
                           dual_state_coef=args.dual_state_coef, dual_sup_coef=args.dual_sup_coef,
                           policy_coef=args.policy_coef, emb_coef=args.dual_emb_coef,
                           demo_eta=args.demo_eta, demo_eps=args.demo_eps,
                           lr=args.lr, eps=args.eps, alpha=args.alpha,
                           max_grad_norm=args.max_grad_norm)

    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape, envs.action_space,
                              actor_critic.base.state_size)
    current_obs = torch.zeros(args.num_processes)

    def update_current_obs(obs):
        obs = torch.from_numpy(obs).float()
        current_obs[:] = obs

    obs = envs.reset()
    update_current_obs(obs)

    rollouts.observations[0].copy_(current_obs)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])

    if args.cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()

    start = time.time()
    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, states = actor_critic.act(
                    rollouts.observations[step], rollouts.states[step], rollouts.masks[step])
            cpu_actions = action.data.squeeze(1).cpu().numpy()

            # Observe reward and next obs
            obs, reward, done, info = envs.step(cpu_actions)
            reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float()
            episode_rewards += reward

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            if args.cuda:
                masks = masks.cuda()

            current_obs *= masks.squeeze(1)
            update_current_obs(obs)
            rollouts.insert(current_obs, states.data, action.data, action_log_prob.data,
                            value.data, reward, masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(rollouts.observations[-1], rollouts.states[-1],
                                                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau)

        value_loss, action_loss, dist_entropy, \
            dual_act_loss, dual_state_loss, dual_sup, emb_loss, \
            state_acc, action_acc, sup_acc, miss_rate = agent.update(rollouts)

        rollouts.after_update()

        if j % args.save_interval == 0:
            save_path = run_dir / 'incremental'
            if not save_path.exists():
                os.makedirs(str(save_path))

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [save_model, hasattr(envs, 'ob_rms') and envs.ob_rms or None]
            torch.save(save_model, str(save_path / ("model_ep_%i.pt" % j)))

        if j % args.log_interval == 0:
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            print("Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f},"
                  "\t entropy {:.3f}, v {:.3f}, p {:.3f}, d-act {:.3f}/{:.3f}, d-state {:.3f}/{:.3f}, d-sup {:.3f}/{:.3f}/{:.3f}, emb {:.3f}".
                  format(j, total_num_steps, int(total_num_steps / (end - start)),
                         final_rewards.mean(), final_rewards.median(),
                         final_rewards.min(), final_rewards.max(),
                         dist_entropy, value_loss, action_loss,
                         dual_act_loss, action_acc, dual_state_loss, state_acc,
                         dual_sup, sup_acc, miss_rate, emb_loss))

        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name, args.algo, args.num_frames)
            except IOError:
                pass
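# A minimal sketch of the make_env factory these scripts import from envs.py.
# It is an assumption, not the repository's implementation: it only shows the
# usual pattern of wrapping each worker env in baselines' bench.Monitor so that
# the true (unclipped, unnormalized) episode rewards mentioned in the warning
# above get written to the log directory. The add_timestep wrapper is omitted.
import os
import gym
from baselines import bench

def make_env(env_id, seed, rank, log_dir, add_timestep=False):
    def _thunk():
        env = gym.make(env_id)
        env.seed(seed + rank)
        if log_dir is not None:
            # Monitor writes <rank>.monitor.csv with the raw episode returns.
            env = bench.Monitor(env, os.path.join(log_dir, str(rank)))
        return env
    return _thunk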
def main():
    args = get_args()
    config, checkpoint = get_config_and_checkpoint(args)

    set_random_seeds(args, config)
    eval_log_dir = args.save_dir + "_eval"
    try:
        os.makedirs(args.save_dir)
        os.makedirs(eval_log_dir)
    except OSError:
        pass

    now = datetime.datetime.now()
    experiment_name = args.experiment_name + '_' + now.strftime("%Y-%m-%d_%H-%M-%S")

    # Create checkpoint file
    save_dir_model = os.path.join(args.save_dir, 'model', experiment_name)
    save_dir_config = os.path.join(args.save_dir, 'config', experiment_name)
    try:
        os.makedirs(save_dir_model)
        os.makedirs(save_dir_config)
    except OSError as e:
        # The CARLA logger is not set up yet at this point, so report directly.
        print(e)
        exit()

    if args.config:
        shutil.copy2(args.config, save_dir_config)

    curriculum = args.follow_curriculum
    if args.follow_curriculum:
        print('Using preset curriculum')

    # Tensorboard Logging
    writer = SummaryWriter(os.path.join(args.save_dir, 'tensorboard', experiment_name))

    # Logger that writes to STDOUT and a file in the save_dir
    logger = setup_carla_logger(args.save_dir, experiment_name)

    device = torch.device("cuda:0" if args.cuda else "cpu")
    norm_reward = not config.no_reward_norm
    norm_obs = not config.no_obs_norm

    assert not (config.num_virtual_goals > 0) or (config.reward_class == 'SparseReward'), \
        "Can't use HER with dense reward"

    obs_converter = CarlaObservationConverter(h=84, w=84, rel_coord_system=config.rel_coord_system)
    action_converter = CarlaActionsConverter(config.action_type)

    envs = make_vec_envs(obs_converter, action_converter, args.starting_port, config.seed,
                         config.num_processes, config.gamma, device, config.reward_class,
                         num_frame_stack=1, subset=config.experiments_subset,
                         norm_reward=norm_reward, norm_obs=norm_obs,
                         apply_her=config.num_virtual_goals > 0,
                         video_every=args.video_interval,
                         video_dir=os.path.join(args.save_dir, 'video', experiment_name),
                         curriculum=curriculum)

    if config.agent == 'forward':
        agent = agents.ForwardCarla()

    if config.agent == 'vpg':
        agent = agents.VPGCarla(obs_converter, action_converter, config.value_loss_coef,
                                config.entropy_coef, lr=config.lr, eps=config.eps,
                                alpha=config.alpha, gamma=config.gamma,
                                max_grad_norm=config.max_grad_norm)

    if config.agent == 'a2c':
        agent = agents.A2CCarla(obs_converter, action_converter, config.value_loss_coef,
                                config.entropy_coef, lr=config.lr, eps=config.eps,
                                alpha=config.alpha, max_grad_norm=config.max_grad_norm)
    elif config.agent == 'acktr':
        agent = agents.A2CCarla(obs_converter, action_converter, config.value_loss_coef,
                                config.entropy_coef, lr=config.lr, eps=config.eps,
                                alpha=config.alpha, max_grad_norm=config.max_grad_norm,
                                acktr=True)
    elif config.agent == 'ppo':
        agent = agents.PPOCarla(obs_converter, action_converter, config.clip_param,
                                config.ppo_epoch, config.num_mini_batch, config.value_loss_coef,
                                config.entropy_coef, lr=config.lr, eps=config.eps,
                                max_grad_norm=config.max_grad_norm)

    if checkpoint is not None:
        load_modules(agent.optimizer, agent.model, checkpoint)

    rollouts = RolloutStorage(config.num_steps, config.num_processes, envs.observation_space,
                              envs.action_space, 20, config.num_virtual_goals,
                              config.rel_coord_system, obs_converter)

    obs = envs.reset()

    # Save the first observation
    obs = obs_to_dict(obs)
    rollouts.obs = obs_to_dict(rollouts.obs)
    for k in rollouts.obs:
        rollouts.obs[k][rollouts.step + 1].copy_(obs[k])
    rollouts.obs = dict_to_obs(rollouts.obs)
    rollouts.to(device)

    start = time.time()
    total_steps = 0
    total_episodes = 0
    total_reward = 0

    episode_reward = torch.zeros(config.num_processes)

    for j in range(config.num_updates):

        for step in range(config.num_steps):

            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = agent.act(
                    rollouts.get_obs(step), rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, info = envs.step(action)

            # For logging purposes
            carla_rewards = torch.tensor([i['carla-reward'] for i in info], dtype=torch.float)
            episode_reward += carla_rewards
            total_reward += carla_rewards.sum().item()
            total_steps += config.num_processes * config.num_steps

            if done.any():
                total_episodes += done.sum()
                torch_done = torch.tensor(done.astype(int)).byte()
                mean_episode_reward = episode_reward[torch_done].mean().item()
                logger.info('{} episode(s) finished with reward {}'.format(done.sum(), mean_episode_reward))
                writer.add_scalar('train/mean_ep_reward_vs_steps', mean_episode_reward, total_steps)
                writer.add_scalar('train/mean_ep_reward_vs_episodes', mean_episode_reward, total_episodes)
                episode_reward[torch_done] = 0

            # If done then clean the history of observations.
            masks = torch.FloatTensor(1 - done)

            rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, reward,
                            masks.unsqueeze(-1))

        if config.num_virtual_goals > 0:
            rollouts.apply_her(config.num_virtual_goals, device, beta=config.beta)

        with torch.no_grad():
            next_value = agent.get_value(rollouts.get_obs(-1),  # Get last observation
                                         rollouts.recurrent_hidden_states[-1],
                                         rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, config.use_gae, config.gamma, config.tau)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "" and config.agent != 'forward':
            save_path = os.path.join(save_dir_model, str(j) + '.pth.tar')
            save_modules(agent.optimizer, agent.model, args, config, save_path)

        total_num_steps = (j + 1) * config.num_processes * config.num_steps

        if j % args.log_interval == 0:

            # Logging to the stdout/our logs
            end = time.time()
            logger.info('------------------------------------')
            logger.info('Episodes {}, Updates {}, num timesteps {}, FPS {}'
                        .format(total_episodes, j + 1, total_num_steps, total_num_steps / (end - start)))
            logger.info('------------------------------------')

            # Logging to tensorboard
            writer.add_scalar('train/cum_reward_vs_steps', total_reward, total_steps)
            writer.add_scalar('train/cum_reward_vs_updates', total_reward, j + 1)

            if config.agent in ['a2c', 'acktr', 'ppo']:
                writer.add_scalar('debug/value_loss_vs_steps', value_loss, total_steps)
                writer.add_scalar('debug/value_loss_vs_updates', value_loss, j + 1)
                writer.add_scalar('debug/action_loss_vs_steps', action_loss, total_steps)
                writer.add_scalar('debug/action_loss_vs_updates', action_loss, j + 1)
                writer.add_scalar('debug/dist_entropy_vs_steps', dist_entropy, total_steps)
                writer.add_scalar('debug/dist_entropy_vs_updates', dist_entropy, j + 1)

            # Sample the last reward
            writer.add_scalar('debug/sampled_normalized_reward_vs_steps', reward.mean(), total_steps)
            writer.add_scalar('debug/sampled_normalized_reward_vs_updates', reward.mean(), j + 1)
            writer.add_scalar('debug/sampled_carla_reward_vs_steps', carla_rewards.mean(), total_steps)
            writer.add_scalar('debug/sampled_carla_reward_vs_updates', carla_rewards.mean(), j + 1)

        if (args.eval_interval is not None and j % args.eval_interval == 0):
            eval_envs = make_vec_envs(args.env_name, args.starting_port, obs_converter,
                                      config.seed + config.num_processes, config.num_processes,
                                      config.gamma, eval_log_dir, config.add_timestep, device,
                                      True, curriculum)

            vec_norm = get_vec_normalize(eval_envs)
            if vec_norm is not None:
                vec_norm.ob_rms = get_vec_normalize(envs).ob_rms

            eval_episode_rewards = []

            obs = eval_envs.reset()
            eval_recurrent_hidden_states = torch.zeros(config.num_processes, 20, device=device)
            eval_masks = torch.zeros(config.num_processes, 1, device=device)

            while len(eval_episode_rewards) < 10:
                with torch.no_grad():
                    _, action, _, eval_recurrent_hidden_states = agent.act(
                        obs, eval_recurrent_hidden_states, eval_masks, deterministic=True)

                # Observe reward and next obs
                carla_obs, reward, done, infos = eval_envs.step(action)

                eval_masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])

                for info in infos:
                    if 'episode' in info.keys():
                        eval_episode_rewards.append(info['episode']['r'])

            eval_envs.close()

            logger.info(" Evaluation using {} episodes: mean reward {:.5f}\n"
                        .format(len(eval_episode_rewards), np.mean(eval_episode_rewards)))
def main():
    torch.set_num_threads(1)

    envs = make_vec_envs(args_env_name, args_seed, args_num_processes)

    actor_critic = Policy(envs.observation_space.shape, envs.action_space)

    agent = PPO(actor_critic, args_clip_param, args_ppo_epoch, args_num_mini_batch,
                args_value_loss_coef, args_entropy_coef, lr=args_lr, eps=args_eps,
                max_grad_norm=args_max_grad_norm)

    rollouts = RolloutStorage(args_num_steps, args_num_processes, envs.observation_space.shape)

    obs = envs.reset()
    np.copyto(rollouts.obs[0], obs)

    num_updates = int(args_num_env_steps) // args_num_steps // args_num_processes

    episode_rewards = deque(maxlen=10)
    start = time.time()
    sum_re = np.zeros(shape=(args_num_processes, 1))

    for j in range(num_updates):
        for step in range(args_num_steps):
            with torch.no_grad():
                value, action, action_log_prob = actor_critic.act(rollouts.obs[step])

            # Debugging aids (print every action / hard stop); leave commented out
            # for full training runs.
            # print(action)
            # ss('hoho')

            obs, reward, done, infos = envs.step(action)
            sum_re += reward

            if any(done):
                for i in range(len(done)):
                    if done[i]:
                        episode_rewards.append(sum_re[i].item())
                        sum_re[i] *= 0

            masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])
            bad_masks = torch.FloatTensor([[0.0] if 'bad_transition' in info.keys() else [1.0]
                                           for info in infos])
            rollouts.insert(obs, action, action_log_prob, value, reward, masks, bad_masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(rollouts.obs[-1])

        rollouts.compute_returns(next_value, args_gamma)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        if j % args_log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args_num_processes * args_num_steps
            end = time.time()
            print("E {}, N_steps {}, FPS {}"
                  " mean/median {:.1f}/{:.1f}, min/max {:.1f}/{:.1f} Ent {:.4f},V {:.4f},A {:.4f}"
                  .format(j, total_num_steps, int(total_num_steps / (end - start)),
                          np.mean(episode_rewards), np.median(episode_rewards),
                          np.min(episode_rewards), np.max(episode_rewards),
                          dist_entropy, value_loss, action_loss))
def main():
    print("#######")
    print("WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards")
    print("#######")

    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None

    envs = [make_env(args.env_name, args.seed, i, args.log_dir, args.add_timestep)
            for i in range(args.num_processes)]

    if args.num_processes > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)

    if len(envs.observation_space.shape) == 1:
        envs = VecNormalize(envs, gamma=args.gamma)

    obs_shape = envs.observation_space.shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])

    actor_critic = Policy(obs_shape, envs.action_space,
                          base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)

    if envs.action_space.__class__.__name__ == "Discrete":
        action_shape = 1
    else:
        action_shape = envs.action_space.shape[0]

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef,
                               lr=args.lr, eps=args.eps, alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch,
                         args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, acktr=True)

    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)
    current_obs = torch.zeros(args.num_processes, *obs_shape)

    obs = envs.reset()
    update_current_obs(obs, current_obs, obs_shape, args.num_stack)

    rollouts.obs[0].copy_(current_obs)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])

    current_obs = current_obs.to(device)
    rollouts.to(device)

    start = time.time()
    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])
            cpu_actions = action.squeeze(1).cpu().numpy()

            # Observe reward and next obs
            obs, reward, done, info = envs.step(cpu_actions)
            reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float()
            episode_rewards += reward

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            masks = masks.to(device)

            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks

            update_current_obs(obs, current_obs, obs_shape, args.num_stack)
            rollouts.insert(current_obs, recurrent_hidden_states, action, action_log_prob,
                            value, reward, masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(rollouts.obs[-1],
                                                rollouts.recurrent_hidden_states[-1],
                                                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [save_model, hasattr(envs, 'ob_rms') and envs.ob_rms or None]
            torch.save(save_model, os.path.join(save_path, args.env_name + ".pt"))

        if j % args.log_interval == 0:
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            print("Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, "
                  "min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}"
                  .format(j, total_num_steps, int(total_num_steps / (end - start)),
                          final_rewards.mean(), final_rewards.median(),
                          final_rewards.min(), final_rewards.max(),
                          dist_entropy, value_loss, action_loss))

        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name, args.algo, args.num_frames)
            except IOError:
                pass
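# Hedged sketch of the frame-stacking helper used by the loop above. The other
# scripts in this file define the same logic inline (with current_obs captured
# from the enclosing scope); this standalone variant that takes the buffer and
# stack size explicitly is an assumption to match the call
# update_current_obs(obs, current_obs, obs_shape, args.num_stack).
import torch

def update_current_obs(obs, current_obs, obs_shape, num_stack):
    # Channels of a single (unstacked) frame.
    shape_dim0 = obs_shape[0] // num_stack
    obs = torch.from_numpy(obs).float()
    if num_stack > 1:
        # Shift the old frames left and append the newest one.
        current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
    current_obs[:, -shape_dim0:] = obs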
def main():
    torch.set_num_threads(1)
    device = torch.device("cuda:1" if args.cuda else "cpu")

    ##
    UID = 'exp_{}'.format(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
    step_log = []
    reward_log = []

    ## To be used to select environment
    mode = 'normal'

    # encoder type
    encoder = 'sym_VAE'

    if encoder == 'symbolic':
        embedding_size = (18, )
    elif encoder == 'AE':
        embedding_size = (200, )
    elif encoder == 'VAE':
        embedding_size = (100, )
    elif encoder == 'sym_VAE':
        embedding_size = (118, )
    else:
        raise NotImplementedError('fff')

    # load pre-trained AE
    #AE = VAEU([128,128])
    #model_path = '/hdd_c/data/miniWorld/trained_models/VAE/dataset_4/VAEU.pth'
    #AE = torch.load(model_path)
    #AE.eval()

    # load pre-trained VAE
    VAE = VAER([128, 128])
    model_path = '/hdd_c/data/miniWorld/trained_models/VAE/dataset_5/VAER.pth'
    VAE = torch.load(model_path).to(device)
    VAE.eval()

    # load pre-trained detector
    Detector_model = Detector
    model_path = '/hdd_c/data/miniWorld/trained_models/Detector/dataset_5/Detector_resnet18_e14.pth'
    Detector_model = torch.load(model_path).to(device)

    # load pre-trained RNN
    RNN_model = RNN(200, 128)
    model_path = '/hdd_c/data/miniWorld/trained_models/RNN/RNN1.pth'
    RNN_model = torch.load(model_path).to(device)
    RNN_model.eval()

    """
    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None
    """

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, args.add_timestep, device, False)
    print(envs.observation_space.shape)

    #actor_critic = Policy(envs.observation_space.shape, envs.action_space,
    #                      base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic = Policy(embedding_size, envs.action_space,
                          base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef,
                               lr=args.lr, eps=args.eps, alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch,
                         args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, acktr=True)

    #rollouts = RolloutStorage(args.num_steps, args.num_processes,
    #                          envs.observation_space.shape, envs.action_space,
    #                          actor_critic.recurrent_hidden_state_size)
    rollouts = RolloutStorage(args.num_steps, args.num_processes, embedding_size,
                              envs.action_space, actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    #print(obs.size())
    #obs = make_var(obs)
    print(obs.size())

    with torch.no_grad():
        if encoder == 'symbolic':
            z = Detector_model(obs)
            print(z.size())
            z = Detector_to_symbolic(z)
            rollouts.obs[0].copy_(z)
        elif encoder == 'AE':
            z = AE.encode(obs)
            rollouts.obs[0].copy_(z)
        elif encoder == 'VAE':
            z = VAE.encode(obs)[0]
            rollouts.obs[0].copy_(z)
        elif encoder == 'sym_VAE':
            z_vae = VAE.encode(obs)[0]
            z_sym = Detector_model(obs)
            z_sym = Detector_to_symbolic(z_sym)
            z = torch.cat((z_vae, z_sym), dim=1)
            rollouts.obs[0].copy_(z)
        else:
            raise NotImplementedError('fff')

    #rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=100)

    start = time.time()
    for j in range(num_updates):
        #print(j)
        for step in range(args.num_steps):
            # Sample actions
            #print(step)
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs
            #print(action)
            with torch.no_grad():
                obs, reward, done, infos = envs.step(action)

                if encoder == 'symbolic':
                    #print(obs.size())
                    np.save('/hdd_c/data/miniWorld/training_obs_{}.npy'.format(step),
                            obs.detach().cpu().numpy())
                    z = Detector_model(obs / 255.0)
                    z = Detector_to_symbolic(z)
                    #print(z)
                    np.save('/hdd_c/data/miniWorld/training_z_{}.npy'.format(step),
                            z.detach().cpu().numpy())
                elif encoder == 'AE':
                    z = AE.encode(obs)
                elif encoder == 'VAE':
                    z = VAE.encode(obs)[0]
                elif encoder == 'sym_VAE':
                    z_vae = VAE.encode(obs)[0]
                    z_sym = Detector_model(obs)
                    z_sym = Detector_to_symbolic(z_sym)
                    z = torch.cat((z_vae, z_sym), dim=1)
                else:
                    raise NotImplementedError('fff')

            #obs = make_var(obs)

            """
            for info in infos:
                if 'episode' in info.keys():
                    print(reward)
                    episode_rewards.append(info['episode']['r'])
            """

            # # FIXME: works only for environments with sparse rewards
            # for idx, eps_done in enumerate(done):
            #     if eps_done:
            #         episode_rewards.append(reward[idx])

            # FIXME: works only for environments with sparse rewards
            for idx, eps_done in enumerate(done):
                if eps_done:
                    #print('done')
                    episode_rewards.append(infos[idx]['accumulated_reward'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])

            #rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks)
            rollouts.insert(z, recurrent_hidden_states, action, action_log_prob, value, reward, masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(rollouts.obs[-1],
                                                rollouts.recurrent_hidden_states[-1],
                                                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "":
            print('Saving model')
            print()

            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [save_model, hasattr(envs.venv, 'ob_rms') and envs.venv.ob_rms or None]

            torch.save(save_model, os.path.join(save_path, args.env_name + ".pt"))

        total_num_steps = (j + 1) * args.num_processes * args.num_steps

        #print(len(episode_rewards))
        step_log.append(total_num_steps)
        reward_log.append(np.mean(episode_rewards))
        step_log_np = np.asarray(step_log)
        reward_log_np = np.asarray(reward_log)
        np.savez_compressed('/hdd_c/data/miniWorld/log/{}.npz'.format(UID),
                            step=step_log_np, reward=reward_log_np)

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            end = time.time()
            print("Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: "
                  "mean/median reward {:.2f}/{:.2f}, min/max reward {:.2f}/{:.2f}, success rate {:.2f}\n"
                  .format(j, total_num_steps, int(total_num_steps / (end - start)),
                          len(episode_rewards), np.mean(episode_rewards), np.median(episode_rewards),
                          np.min(episode_rewards), np.max(episode_rewards),
                          np.count_nonzero(np.greater(episode_rewards, 0)) / len(episode_rewards)))

        if args.eval_interval is not None and len(episode_rewards) > 1 and j % args.eval_interval == 0:
            eval_envs = make_vec_envs(args.env_name, args.seed + args.num_processes,
                                      args.num_processes, args.gamma, eval_log_dir,
                                      args.add_timestep, device, True)

            if eval_envs.venv.__class__.__name__ == "VecNormalize":
                eval_envs.venv.ob_rms = envs.venv.ob_rms

                # An ugly hack to remove updates
                def _obfilt(self, obs):
                    if self.ob_rms:
                        obs = np.clip((obs - self.ob_rms.mean) / np.sqrt(self.ob_rms.var + self.epsilon),
                                      -self.clipob, self.clipob)
                        return obs
                    else:
                        return obs

                eval_envs.venv._obfilt = types.MethodType(_obfilt, envs.venv)

            eval_episode_rewards = []

            obs = eval_envs.reset()
            eval_recurrent_hidden_states = torch.zeros(args.num_processes,
                                                       actor_critic.recurrent_hidden_state_size,
                                                       device=device)
            eval_masks = torch.zeros(args.num_processes, 1, device=device)

            while len(eval_episode_rewards) < 10:
                with torch.no_grad():
                    _, action, _, eval_recurrent_hidden_states = actor_critic.act(
                        obs, eval_recurrent_hidden_states, eval_masks, deterministic=True)

                # Observe reward and next obs
                obs, reward, done, infos = eval_envs.step(action)

                eval_masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])

                for info in infos:
                    if 'episode' in info.keys():
                        eval_episode_rewards.append(info['episode']['r'])

            eval_envs.close()

            print(" Evaluation using {} episodes: mean reward {:.5f}\n".format(
                len(eval_episode_rewards), np.mean(eval_episode_rewards)))

        """
        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name, args.algo, args.num_frames)
            except IOError:
                pass
        """

    envs.close()
def main():
    import matplotlib.pyplot as plt

    # You probably won't need this if you're embedding things in a tkinter plot...
    plt.ion()

    x = np.linspace(0, 6 * np.pi, 100)
    y = np.sin(x)

    fig = plt.figure()
    ax = fig.add_subplot(111)
    import time
    line1, = ax.plot([0, 1, 2], [0, 1, 1], 'r-')  # Returns a tuple of line objects, thus the comma
    time.sleep(0.01)

    torch.set_num_threads(1)
    args.num_processes = 1
    # device = torch.device("cuda:0" if args.cuda else "cpu")
    device = torch.device("cpu")

    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None

    envs = TorchRunner(acc=0.005)
    ob_shape = envs.reset().shape
    # envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
    #                      args.gamma, args.log_dir, args.add_timestep, device, False)

    actor_critic = Policy(ob_shape, envs.action_space, base_kwargs={'recurrent': args.recurrent_policy})

    # # try to load the previous policy
    # data = torch.load(
    #     r"C:\Users\clvco\URA_F18\pytorch-a2c-ppo-acktr\trained_models\ppo\weight_positiverev_test.pt")
    #
    # print(data)
    # actor_critic.load_state_dict(data[0].state_dict())

    actor_critic.to(device)

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef,
                               lr=args.lr, eps=args.eps, alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch,
                         args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, acktr=True)

    obs = envs.reset()
    ob_shape = obs.shape

    rollouts = RolloutStorage(args.num_steps, args.num_processes, ob_shape, envs.action_space,
                              (agent.actor_critic.base.output_size), (1),
                              actor_critic.recurrent_hidden_state_size)

    print(args.num_processes)
    print(envs.observation_space.shape)
    print(obs.shape)
    print(rollouts.obs[0].shape)
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = list()
    ep_reward = 0

    import tqdm
    start = time.time()
    print(args)
    print(int(args.num_frames) // args.num_steps // args.num_processes)
    print('NUM', num_updates)

    timestep = 0
    ep_ends = []
    for j in range(num_updates):
        if j == 0:
            print("UPDATING SYNERGY")
            actor_critic.adjust_synergy(0.0)

        for step in tqdm.tqdm(range(args.num_steps)):
            # Sample actions
            timestep += 1
            with torch.no_grad():
                value, action, synergy, q, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)
            ep_reward += reward[0]

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])

            if done[0]:
                obs = envs.reset()
                episode_rewards.append(ep_reward)
                ep_ends.append(timestep)
                ep_reward = 0

            # print(action)
            rollouts.insert(obs, recurrent_hidden_states, action, synergy, q, action_log_prob,
                            value, reward, masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(rollouts.obs[-1],
                                                rollouts.recurrent_hidden_states[-1],
                                                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [save_model]
            print("Saving model")
            torch.save(save_model, os.path.join(save_path, args.env_name + ".pt"))
            print("Saved model to: ", os.path.join(save_path, args.env_name + ".pt"))

        total_num_steps = (j + 1) * args.num_processes * args.num_steps

        print("update time", print(len(episode_rewards)))

        if True:
            end = time.time()
            print("Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: "
                  "mean/median reward {:.5f}/{:.5f}, min/max reward {:.5f}/{:.5f}\n"
                  .format(j, total_num_steps, int(total_num_steps / (end - start)),
                          len(episode_rewards),
                          np.mean(episode_rewards[-10:]), np.median(episode_rewards[-10:]),
                          np.min(episode_rewards[-10:]), np.max(episode_rewards[-10:]),
                          dist_entropy, value_loss, action_loss))

            import time
            ydata = np.convolve(episode_rewards, np.ones(10) / 10, mode='valid')
            line1.set_xdata(np.arange(0, len(ydata)))
            line1.set_ydata(ydata)
            ax.set_xlim(0, len(ydata))
            ax.set_ylim(min(ydata), max(ydata))
            fig.canvas.draw()
            fig.canvas.flush_events()
            time.sleep(0.01)

        # save the returns
        xdata = np.array(ep_ends)
        ret_dir = 'returns_weight_experiments'
        os.makedirs(ret_dir, exist_ok=True)
        ret_path = ret_dir + '/' + args.env_name + '_' + str(args.seed) + '.npy'
        ep_path = ret_dir + '/' + "x_data-" + args.env_name + '_' + str(args.seed) + '.npy'
        np.save(ret_path, np.array(np.array(episode_rewards)))
        np.save(ep_path, ep_ends)
def main():
    os.environ['OMP_NUM_THREADS'] = '1'

    if args.vis:
        from visdom import Visdom
        viz = Visdom()
        win = None

    envs = [make_env(args.env_name, args.seed, i, args.log_dir, args.start_container)
            for i in range(args.num_processes)]

    if args.num_processes > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)

    obs_shape = envs.observation_space.shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])

    obs_numel = reduce(operator.mul, obs_shape, 1)

    if len(obs_shape) == 3 and obs_numel > 1024:
        actor_critic = CNNPolicy(obs_shape[0], envs.action_space, args.recurrent_policy)
    else:
        assert not args.recurrent_policy, \
            "Recurrent policy is not implemented for the MLP controller"
        actor_critic = MLPPolicy(obs_numel, envs.action_space)

    modelSize = 0
    for p in actor_critic.parameters():
        pSize = reduce(operator.mul, p.size(), 1)
        modelSize += pSize
    print(str(actor_critic))
    print('Total model size: %d' % modelSize)

    if envs.action_space.__class__.__name__ == "Discrete":
        action_shape = 1
    else:
        action_shape = envs.action_space.shape[0]

    if args.cuda:
        actor_critic.cuda()

    if args.algo == 'a2c':
        optimizer = optim.RMSprop(actor_critic.parameters(), args.lr, eps=args.eps, alpha=args.alpha)
    elif args.algo == 'ppo':
        optimizer = optim.Adam(actor_critic.parameters(), args.lr, eps=args.eps)
    elif args.algo == 'acktr':
        optimizer = KFACOptimizer(actor_critic)

    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape, envs.action_space,
                              actor_critic.state_size)
    current_obs = torch.zeros(args.num_processes, *obs_shape)

    def update_current_obs(obs):
        shape_dim0 = envs.observation_space.shape[0]
        obs = torch.from_numpy(obs).float()
        if args.num_stack > 1:
            current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
        current_obs[:, -shape_dim0:] = obs

    obs = envs.reset()
    update_current_obs(obs)

    rollouts.observations[0].copy_(current_obs)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])

    reward_avg = 0

    if args.cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()

    start = time.time()
    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions
            value, action, action_log_prob, states = actor_critic.act(
                Variable(rollouts.observations[step]),
                Variable(rollouts.states[step]),
                Variable(rollouts.masks[step]))
            cpu_actions = action.data.squeeze(1).cpu().numpy()

            # Observation, reward and next obs
            obs, reward, done, info = envs.step(cpu_actions)

            # Maxime: clip the reward within [0,1] for more reliable training
            # This code deals poorly with large reward values
            reward = np.clip(reward, a_min=0, a_max=None) / 400

            reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float()
            episode_rewards += reward

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            if args.cuda:
                masks = masks.cuda()

            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks

            update_current_obs(obs)
            rollouts.insert(step, current_obs, states.data, action.data, action_log_prob.data,
                            value.data, reward, masks)

        next_value = actor_critic(Variable(rollouts.observations[-1]),
                                  Variable(rollouts.states[-1]),
                                  Variable(rollouts.masks[-1]))[0].data

        rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau)

        if args.algo in ['a2c', 'acktr']:
            values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions(
                Variable(rollouts.observations[:-1].view(-1, *obs_shape)),
                Variable(rollouts.states[0].view(-1, actor_critic.state_size)),
                Variable(rollouts.masks[:-1].view(-1, 1)),
                Variable(rollouts.actions.view(-1, action_shape)))

            values = values.view(args.num_steps, args.num_processes, 1)
            action_log_probs = action_log_probs.view(args.num_steps, args.num_processes, 1)

            advantages = Variable(rollouts.returns[:-1]) - values
            value_loss = advantages.pow(2).mean()
            action_loss = -(Variable(advantages.data) * action_log_probs).mean()

            if args.algo == 'acktr' and optimizer.steps % optimizer.Ts == 0:
                # Sampled fisher, see Martens 2014
                actor_critic.zero_grad()
                pg_fisher_loss = -action_log_probs.mean()

                value_noise = Variable(torch.randn(values.size()))
                if args.cuda:
                    value_noise = value_noise.cuda()

                sample_values = values + value_noise
                vf_fisher_loss = -(values - Variable(sample_values.data)).pow(2).mean()

                fisher_loss = pg_fisher_loss + vf_fisher_loss
                optimizer.acc_stats = True
                fisher_loss.backward(retain_graph=True)
                optimizer.acc_stats = False

            optimizer.zero_grad()
            (value_loss * args.value_loss_coef + action_loss - dist_entropy * args.entropy_coef).backward()

            if args.algo == 'a2c':
                nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm)

            optimizer.step()
        elif args.algo == 'ppo':
            advantages = rollouts.returns[:-1] - rollouts.value_preds[:-1]
            advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-5)

            for e in range(args.ppo_epoch):
                if args.recurrent_policy:
                    data_generator = rollouts.recurrent_generator(advantages, args.num_mini_batch)
                else:
                    data_generator = rollouts.feed_forward_generator(advantages, args.num_mini_batch)

                for sample in data_generator:
                    observations_batch, states_batch, actions_batch, \
                        return_batch, masks_batch, old_action_log_probs_batch, \
                        adv_targ = sample

                    # Reshape to do in a single forward pass for all steps
                    values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions(
                        Variable(observations_batch),
                        Variable(states_batch),
                        Variable(masks_batch),
                        Variable(actions_batch))

                    adv_targ = Variable(adv_targ)
                    ratio = torch.exp(action_log_probs - Variable(old_action_log_probs_batch))
                    surr1 = ratio * adv_targ
                    surr2 = torch.clamp(ratio, 1.0 - args.clip_param, 1.0 + args.clip_param) * adv_targ
                    action_loss = -torch.min(surr1, surr2).mean()  # PPO's pessimistic surrogate (L^CLIP)

                    value_loss = (Variable(return_batch) - values).pow(2).mean()

                    optimizer.zero_grad()
                    (value_loss + action_loss - dist_entropy * args.entropy_coef).backward()
                    nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm)
                    optimizer.step()

        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [save_model, hasattr(envs, 'ob_rms') and envs.ob_rms or None]

            torch.save(save_model, os.path.join(save_path, args.env_name + ".pt"))

        if j % args.log_interval == 0:
            reward_avg = 0.99 * reward_avg + 0.01 * final_rewards.mean()
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps

            print("Updates {}, num timesteps {}, FPS {}, running avg reward {:.3f}, "
                  "entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}"
                  .format(j, total_num_steps, int(total_num_steps / (end - start)),
                          reward_avg, dist_entropy.data[0], value_loss.data[0], action_loss.data[0]))

            """
            print("Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}".
                format(
                    j,
                    total_num_steps,
                    int(total_num_steps / (end - start)),
                    final_rewards.mean(),
                    final_rewards.median(),
                    final_rewards.min(),
                    final_rewards.max(),
                    dist_entropy.data[0],
                    value_loss.data[0],
                    action_loss.data[0]
                )
            )
            """

        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name, args.algo)
            except IOError:
                pass
def main(): print("config:\n") print("activation:", args.activation) print("evaluation:", args.evaluation) print("evaluation mode:", args.evaluation_mode) print("evaluation layer:", args.evaluation_layer) writer = SummaryWriter() torch.set_num_threads(1) device = torch.device("cuda:0" if args.cuda else "cpu") if args.vis: from visdom import Visdom viz = Visdom(port=args.port) win = None envs = make_vec_envs(args.env_name, args.seed, args.num_processes, args.gamma, args.log_dir, args.add_timestep, device, False) actor_critic = Policy(envs.observation_space.shape, envs.action_space, base_kwargs={'recurrent': args.recurrent_policy}, activation = args.activation, modulation = args.evaluation) # load trained model if args.load_model_path != None: state_dicts = torch.load(args.load_model_path) actor_critic.load_nets(state_dicts) actor_critic.to(device) if args.algo == 'a2c': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, alpha=args.alpha, max_grad_norm=args.max_grad_norm) # elif args.algo == 'ppo': # agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, # args.value_loss_coef, args.entropy_coef, lr=args.lr, # eps=args.eps, # max_grad_norm=args.max_grad_norm) # elif args.algo == 'acktr': # agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, # args.entropy_coef, acktr=True) tonic_g = 1 phasic_g = 1 if args.evaluation and args.evaluation_layer == 1: # f1 modulation tonic_g = args.f1_tonic_g phasic_g = args.f1_phasic_g if args.evaluation and args.evaluation_layer == 0: # input activation tonic_g = args.input_tonic_g phasic_g = args.input_phasic_g g = torch.ones(args.num_processes,1)*tonic_g g_device = (torch.ones(args.num_processes,1)*tonic_g).to(device) rollouts = RolloutStorage(args.num_steps, args.num_processes, envs.observation_space.shape, envs.action_space, actor_critic.recurrent_hidden_state_size, tonic_g) obs = envs.reset() rollouts.obs[0].copy_(obs) rollouts.to(device) episode_rewards = deque(maxlen=10) start = time.time() pre_value = [None for i in range(args.num_processes)] evaluations = [0 for i in range(args.num_processes)] ## to calculate next_value and update g next_recurrent_hidden_states = torch.zeros(args.num_processes, actor_critic.recurrent_hidden_state_size).to(device) next_g = torch.zeros(args.num_processes,1).to(device) next_masks = torch.zeros(args.num_processes,1).to(device) next_obs = torch.zeros(args.num_processes, *envs.observation_space.shape).to(device) for j in range(num_updates): for step in range(args.num_steps): # Sample actions with torch.no_grad(): value, action, action_log_prob, recurrent_hidden_states = actor_critic.act( rollouts.obs[step], rollouts.g[step], rollouts.recurrent_hidden_states[step], rollouts.masks[step]) # Obser reward and next obs obs, reward, done, infos = envs.step(action) # If done then clean the history of observations. 
masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) # calculate next value with old g and decide new g if args.evaluation: if args.evaluation_layer == 0: next_obs.copy_(neural_activity(obs,g_device)) else: next_obs.copy_(obs/255) next_recurrent_hidden_states.copy_(recurrent_hidden_states) next_g.copy_(g) next_masks.copy_(masks) with torch.no_grad(): next_value = actor_critic.get_value(next_obs, next_g, next_recurrent_hidden_states, next_masks).detach() evaluations, g, pre_value = calc_modes(reward, next_value, pre_value, evaluations, args.evaluation_mode, tonic_g, phasic_g, masks) g_device.copy_(g) # observation processing with new g if args.evaluation and args.evaluation_layer == 0: obs = neural_activity(obs, g_device) else: obs = obs/255.0 for idx in range(len(infos)): info = infos[idx] if 'episode' in info.keys(): episode_rewards.append(info['episode']['r']) steps_done = j*args.num_steps*args.num_processes + step*args.num_processes + idx writer.add_scalar('data/reward', info['episode']['r'], steps_done ) rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks, g) # record evaluation value to help decide parameters to switch modes if args.evaluation_log: writer.add_scalar('data/evaluations', evaluations[0], j*args.num_steps*args.num_processes + step*args.num_processes) with torch.no_grad(): next_value = actor_critic.get_value(rollouts.obs[-1], rollouts.g[-1], rollouts.recurrent_hidden_states[-1], rollouts.masks[-1]).detach() rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau) value_loss, action_loss, dist_entropy = agent.update(rollouts) rollouts.after_update() if j % args.save_interval == 0 and args.save_dir != "": save_path = os.path.join(args.save_dir, args.algo) try: os.makedirs(save_path) except OSError: pass state_dicts = actor_critic.save_nets() torch.save(state_dicts, os.path.join(save_path, args.env_name + ".pt")) total_num_steps = (j + 1) * args.num_processes * args.num_steps writer.export_scalars_to_json("./all_scalars.json") writer.close()
def main():
    envs = [make_env(env_name, seed, rank, log_dir) for rank in range(num_processes)]
    envs = SubprocVecEnv(envs)

    obs_shape = envs.observation_space.shape
    obs_shape = [obs_shape[0] * num_stack, *obs_shape[1:]]

    actor_critic = CNNPolicy(obs_shape[0], envs.action_space, False)

    if cuda:
        actor_critic.cuda()

    optimizer = optim.RMSprop(actor_critic.parameters(), lr, eps=eps, alpha=alpha)

    rollouts = RolloutStorage(num_steps, num_processes, obs_shape, envs.action_space,
                              actor_critic.state_size)
    current_obs = torch.zeros(num_processes, *obs_shape)

    def update_current_obs(obs):
        shape_dim0 = envs.observation_space.shape[0]
        obs = torch.from_numpy(obs).float()
        if num_stack > 1:
            current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
        current_obs[:, -shape_dim0:] = obs

    obs = envs.reset()
    update_current_obs(obs)

    rollouts.observations[0].copy_(current_obs)

    episode_rewards = torch.zeros([num_processes, 1])
    final_rewards = torch.zeros([num_processes, 1])

    if cuda:
        rollouts.cuda()
        current_obs = current_obs.cuda()

    if envs.action_space.__class__.__name__ == "Discrete":
        action_shape = 1
    else:
        action_shape = envs.action_space.shape[0]

    # test
    start = time.time()
    for j in range(num_updates):
        for step in range(num_steps):
            # Sample actions
            value, action, action_log_prob, states = actor_critic.act(
                Variable(rollouts.observations[step], volatile=True),
                Variable(rollouts.states[step], volatile=True),
                Variable(rollouts.masks[step], volatile=True))
            cpu_actions = action.data.squeeze().cpu().numpy()
            #print(cpu_actions)

            # Observe reward and next obs
            obs, reward, done, info = envs.step(cpu_actions)

            # stack: make sure that reward is a numpy array (convert list to ndarray)
            reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float()
            episode_rewards += reward

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            if cuda:
                masks = masks.cuda()

            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks

            # update obs and rollouts
            update_current_obs(obs)
            rollouts.insert(step, current_obs, states.data, action.data, action_log_prob.data,
                            value.data, reward, masks)

        # compute current update's return
        next_value = actor_critic(Variable(rollouts.observations[-1], volatile=True),
                                  Variable(rollouts.states[-1], volatile=True),
                                  Variable(rollouts.masks[-1], volatile=True))[0].data

        rollouts.compute_returns(next_value, False, gamma, tau)

        # in a2c the values were calculated twice
        # the data in rollouts must be viewed, because the shape in rollouts is
        # [num_steps, num_processes, x] which is [num, x] in actor_critic
        values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions(
            Variable(rollouts.observations[:-1].view(-1, *obs_shape)),
            Variable(rollouts.states[0].view(-1, actor_critic.state_size)),
            Variable(rollouts.masks[:-1].view(-1, 1)),
            Variable(rollouts.actions.view(-1, action_shape)))

        # compute the loss
        values = values.view(num_steps, num_processes, 1)
        action_log_probs = action_log_probs.view(num_steps, num_processes, 1)

        advantages = Variable(rollouts.returns[:-1]) - values
        value_loss = advantages.pow(2).mean()
        action_loss = -(Variable(advantages.data) * action_log_probs).mean()

        # update model
        optimizer.zero_grad()
        loss = value_loss * value_loss_coef + action_loss - dist_entropy * entropy_coef
        loss.backward()
        nn.utils.clip_grad_norm(actor_critic.parameters(), max_grad_norm)
        optimizer.step()

        rollouts.after_update()

        if j % log_interval == 0:
            end = time.time()
            total_num_steps = (j + 1) * num_processes * num_steps
            print("Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, "
                  "min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}"
                  .format(j, total_num_steps, int(total_num_steps / (end - start)),
                          final_rewards.mean(), final_rewards.median(),
                          final_rewards.min(), final_rewards.max(),
                          dist_entropy.data[0], value_loss.data[0], action_loss.data[0]))

    # todo: test save_url
    torch.save(actor_critic, save_url)
def main():
    train_log = Log(log_name + '_train_log')
    evl_log = Log(log_name + '_evaluation_log')

    torch.set_num_threads(1)

    envs = make_vec_envs(args_env_name, args_seed, args_num_processes)

    actor_critic = Policy(envs.observation_space.shape, envs.action_space)

    agent = PPO(actor_critic, args_clip_param, args_ppo_epoch, args_num_mini_batch,
                args_value_loss_coef, args_entropy_coef, lr=args_lr, eps=args_eps,
                max_grad_norm=args_max_grad_norm)

    rollouts = RolloutStorage(args_num_steps, args_num_processes,
                              envs.observation_space.shape, envs.action_space)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    # print(obs)
    # ss('i am over it')

    num_updates = int(args_num_env_steps) // args_num_steps // args_num_processes

    episode_rewards = deque(maxlen=10)
    start = time.time()
    sum_re = torch.zeros(args_num_processes, 1)

    for j in range(num_updates):
        for step in range(args_num_steps):
            with torch.no_grad():
                value, action, action_log_prob = actor_critic.act(rollouts.obs[step])

            obs, reward, done, infos = envs.step(action)
            sum_re += reward

            if any(done):
                for i in range(len(done)):
                    if done[i]:
                        episode_rewards.append(sum_re[i].item())
                        sum_re[i] *= 0

            masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])
            bad_masks = torch.FloatTensor([[0.0] if 'bad_transition' in info.keys() else [1.0]
                                           for info in infos])
            rollouts.insert(obs, action, action_log_prob, value, reward, masks, bad_masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(rollouts.obs[-1])

        rollouts.compute_returns(next_value, args_gamma)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        if j % args_log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args_num_processes * args_num_steps
            end = time.time()
            logstring = "E {}, N_steps {}, FPS {} mean/median" \
                        " {:.1f}/{:.1f}, min/max {:.1f}/{:.1f}" \
                        " Entropy {:.5f},V {:.5f},Action {:.5f}".format(
                            j, total_num_steps, int(total_num_steps / (end - start)),
                            np.mean(episode_rewards), np.median(episode_rewards),
                            np.min(episode_rewards), np.max(episode_rewards),
                            dist_entropy, value_loss, action_loss)
            # print(logstring)
            train_log.log(logstring)

        # if True:
        if (args_eval_interval is not None and len(episode_rewards) > 1
                and j % args_eval_interval == 0):
            total_num_steps = (j + 1) * args_num_processes * args_num_steps
            ob_rms = get_vec_normalize(envs).ob_rms
            ev_result = evaluate(actor_critic, ob_rms, args_env_name, args_seed, args_num_processes)
            ev_log_string = 'steps:' + str(total_num_steps) + '. ' + ev_result
            evl_log.log(ev_log_string)
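# Hypothetical sketch of the evaluate() helper called above. Only the call
# signature and the fact that it returns a printable string are taken from this
# script; the body is an assumption modelled on the evaluation loops used by the
# other training scripts in this file, and it relies on the same make_vec_envs /
# get_vec_normalize helpers this script already imports.
def evaluate(actor_critic, ob_rms, env_name, seed, num_processes):
    eval_envs = make_vec_envs(env_name, seed + num_processes, num_processes)

    # Reuse the training-time observation normalization statistics, if any.
    vec_norm = get_vec_normalize(eval_envs)
    if vec_norm is not None:
        vec_norm.eval()
        vec_norm.ob_rms = ob_rms

    eval_episode_rewards = []
    obs = eval_envs.reset()
    sum_re = torch.zeros(num_processes, 1)

    while len(eval_episode_rewards) < 10:
        with torch.no_grad():
            _, action, _ = actor_critic.act(obs)
        obs, reward, done, infos = eval_envs.step(action)
        sum_re += reward
        for i in range(len(done)):
            if done[i]:
                eval_episode_rewards.append(sum_re[i].item())
                sum_re[i] *= 0

    eval_envs.close()
    return "evaluation over {} episodes: mean reward {:.5f}".format(
        len(eval_episode_rewards), np.mean(eval_episode_rewards))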
class Runner(): def __init__(self, **args): cuda = not args['no_cuda'] and torch.cuda.is_available() self.device = torch.device("cuda:0" if cuda else "cpu") print("Model running on device: {}".format(self.device)) torch.set_num_threads(1) self.env_name = args['env_name'] self.epochs = args['epochs'] self.num_processes = args['num_processes'] self.num_steps = args['num_steps'] self.num_test_episodes = args['num_test_episodes'] self.test_every_n_epochs = args['test_every_n_epochs'] self.use_deterministic_policy_while_testing = args['use_deterministic_policy_while_testing'] self.grayscale = args['grayscale'] self.skip_frame = args['skip_frame'] self.num_frame_stack = args['num_frame_stack'] self.num_updates_per_epoch = args['num_updates_per_epoch'] self.num_steps = args['num_steps'] self.use_gae = args['use_gae'] self.gamma = args['gamma'] self.tau = args['tau'] self.reward_scaling = args['reward_scaling'] self.seed = args['seed'] self.log_dir = args['log_dir'] self.save_dir = args['save_dir'] try: os.makedirs(args['log_dir']) files = glob.glob(os.path.join(args['log_dir'], '*.manifest.json')) for f in files: os.remove(f) except OSError: files = glob.glob(os.path.join(args['log_dir'], '*.monitor.csv')) for f in files: os.remove(f) self.eval_log_dir = args['log_dir'] + "_eval" try: os.makedirs(self.eval_log_dir) except OSError: files = glob.glob(os.path.join(self.eval_log_dir, '*.monitor.csv')) for f in files: os.remove(f) self.envs = make_vec_envs(self.env_name, self.seed, self.num_processes, self.gamma, self.log_dir, self.device, False, self.grayscale, self.skip_frame, self.reward_scaling, num_frame_stack=self.num_frame_stack) self.algorithm = args['algorithm'] # Decreasing LR scheduler self.scheduler = None if self.algorithm == 'A2C': actor_critic = ActorCriticNetwork(self.envs.observation_space.shape, self.envs.action_space, base_kwargs=args['policy_parameters']) actor_critic.to(self.device) self.policy = actor_critic self.agent = A2C(actor_critic, **args['algorithm_parameters']) elif self.algorithm == 'PPO': if(args['decreasing_lr']): def lambdalr(epoch): return ((float(self.epochs - epoch)) / float(self.epochs) * args['algorithm_parameters']['lr']) # noqa: E704 actor_critic = ActorCriticNetwork(self.envs.observation_space.shape, self.envs.action_space, base_kwargs=args['policy_parameters']) actor_critic.to(self.device) self.policy = actor_critic self.agent = PPO(actor_critic, lambdalr, ** args['algorithm_parameters']) self.scheduler = self.agent.scheduler else: actor_critic = ActorCriticNetwork(self.envs.observation_space.shape, self.envs.action_space, base_kwargs=args['policy_parameters']) actor_critic.to(self.device) self.policy = actor_critic self.agent = PPO(actor_critic, None, ** args['algorithm_parameters']) self.rollouts = RolloutStorage(self.num_steps, self.num_processes, self.envs.observation_space.shape, self.envs.action_space, actor_critic.recurrent_hidden_state_size) obs = self.envs.reset() self.rollouts.obs[0].copy_(obs) self.rollouts.to(self.device) self.episode_rewards = deque(maxlen=50) self.writer = SummaryWriter( comment="{}-{}".format(self.env_name, self.algorithm)) def run(self): start = time.time() for epoch in range(self.epochs): value_losses, action_losses, dist_entropies = [], [], [] print("\nEpoch %d\n-------" % (epoch + 1)) for j in trange(self.num_updates_per_epoch, leave=False): for step in range(self.num_steps): # Sample actions with torch.no_grad(): value, action, action_log_prob, recurrent_hidden_states = self.policy.act( self.rollouts.obs[step], 
self.rollouts.recurrent_hidden_states[step], self.rollouts.masks[step]) # Observe reward and next obs obs, reward, done, infos = self.envs.step(action) for info in infos: if 'episode' in info.keys(): print("New episode") self.episode_rewards.append(info['episode']['r']) # If done then clean the history of observations. masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) self.rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks) with torch.no_grad(): next_value = self.policy.get_value(self.rollouts.obs[-1], self.rollouts.recurrent_hidden_states[-1], self.rollouts.masks[-1]).detach() self.rollouts.compute_returns( next_value, self.use_gae, self.gamma, self.tau) value_loss, action_loss, dist_entropy = self.agent.update( self.rollouts) value_losses.append(value_loss) action_losses.append(action_loss) dist_entropies.append(dist_entropy) self.rollouts.after_update() total_num_steps = (epoch + 1) * (j + 1) * \ self.num_processes * self.num_steps end = time.time() print("Total timesteps: {}, FPS: {}".format( total_num_steps, int(total_num_steps / (end - start)))) print("Statistic of the last %d episodes played" % len(self.episode_rewards)) if(len(self.episode_rewards) < 1): self.episode_rewards.append(0) episode_rewards_np = np.array(self.episode_rewards) value_losses = np.array(value_losses) action_losses = np.array(action_losses) dist_entropies = np.array(dist_entropies) print("Mean value loss: {}, Mean action loss: {}, Mean entropy: {}".format( value_losses.mean(), action_losses.mean(), dist_entropies.mean())) print(episode_rewards_np) print("Results: mean: {} +/- {}".format(np.mean(episode_rewards_np), np.std(episode_rewards_np))) print("Min: {}, Max: {}, Median: {}".format(np.min(episode_rewards_np), np.max(episode_rewards_np), np.median(episode_rewards_np))) self.writer.add_scalar( 'value_loss/mean', value_losses.mean(), epoch) self.writer.add_scalar( 'action_loss/mean', action_losses.mean(), epoch) self.writer.add_scalar( 'dist_entropy/mean', dist_entropies.mean(), epoch) self.writer.add_scalar( 'reward/mean', episode_rewards_np.mean(), epoch) self.writer.add_scalar( 'reward/max', episode_rewards_np.max(), epoch) self.writer.add_scalar( 'reward/min', episode_rewards_np.min(), epoch) if (epoch + 1) % self.test_every_n_epochs == 0: print("\nTesting...") bar = tqdm(total=self.num_test_episodes, leave=False) eval_envs = make_vec_envs(self.env_name, self.seed + self.num_processes, self.num_processes, self.gamma, self.eval_log_dir, self.device, True, self.grayscale, self.skip_frame, self.reward_scaling, num_frame_stack=self.num_frame_stack) vec_norm = get_vec_normalize(eval_envs) if vec_norm is not None: vec_norm.eval() vec_norm.ob_rms = get_vec_normalize(self.envs).ob_rm eval_episode_rewards = [] obs = eval_envs.reset() eval_recurrent_hidden_states = torch.zeros(self.num_processes, self.policy.recurrent_hidden_state_size, device=self.device) eval_masks = torch.zeros( self.num_processes, 1, device=self.device) while len(eval_episode_rewards) < self.num_test_episodes: with torch.no_grad(): _, action, _, eval_recurrent_hidden_states = self.policy.act( obs, eval_recurrent_hidden_states, eval_masks, deterministic=self.use_deterministic_policy_while_testing) # Obser reward and next obs obs, reward, done, infos = eval_envs.step(action) eval_masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) for info in infos: if 'episode' in info.keys(): bar.update(1) eval_episode_rewards.append( info['episode']['r']) eval_envs.close() 
bar.close() print(eval_episode_rewards) print(" Evaluation using {} episodes: mean reward {:.5f}, min/max {}/{}\n". format(len(eval_episode_rewards), np.mean(eval_episode_rewards), np.min(eval_episode_rewards), np.max(eval_episode_rewards))) print("Total elapsed time: %.2f minutes" % ((time.time() - start) / 60.0)) if self.scheduler is not None: print("Decreasing the learning rate...") self.scheduler.step() print("Saving the model...") save_path = os.path.join(self.save_dir, self.algorithm) try: os.makedirs(save_path) except OSError: pass save_model = self.policy if self.device.type == "cuda": save_model = copy.deepcopy(self.policy).cpu() save_model = [save_model, getattr(get_vec_normalize(self.envs), 'ob_rms', None)] torch.save(save_model, os.path.join( save_path, self.env_name + ".pt"))
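# The runner above bundles the policy with the VecNormalize observation statistics when it saves,
# so a reloaded model normalises inputs the same way it did during training. Below is a minimal
# sketch of that pattern; SmallPolicy, the file path, and the choice to store a state_dict
# (instead of pickling the whole module) are assumptions made for this illustration only.
import copy
import os
import torch
import torch.nn as nn


class SmallPolicy(nn.Module):
    """Placeholder policy network used only for this sketch."""

    def __init__(self, obs_dim=4, n_actions=2):
        super().__init__()
        self.body = nn.Sequential(nn.Linear(obs_dim, 64), nn.Tanh(), nn.Linear(64, n_actions))

    def forward(self, obs):
        return self.body(obs)


def save_checkpoint(policy, ob_rms, path):
    # Copy to CPU first so the checkpoint can be loaded on machines without a GPU.
    state = {"policy": copy.deepcopy(policy).cpu().state_dict(), "ob_rms": ob_rms}
    torch.save(state, path)


def load_checkpoint(path, policy, device="cpu"):
    state = torch.load(path, map_location=device)
    policy.load_state_dict(state["policy"])
    return policy.to(device), state["ob_rms"]


if __name__ == "__main__":
    path = os.path.join("/tmp", "policy_demo.pt")
    save_checkpoint(SmallPolicy(), None, path)
    restored, ob_rms = load_checkpoint(path, SmallPolicy())
    print(restored(torch.zeros(1, 4)).shape, ob_rms)  # torch.Size([1, 2]) None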
class Runner(object): def __init__(self, net, env, params, is_cuda=True, seed=42, log_dir=abspath("/data/patrik")): super().__init__() # constants self.timestamp = strftime("%Y-%m-%d %H_%M_%S", gmtime()) self.seed = seed self.is_cuda = torch.cuda.is_available() and is_cuda # parameters self.params = params """Logger""" self.logger = TemporalLogger(self.params.env_name, self.timestamp, log_dir, *["rewards", "features"]) self.checkpointer = AgentCheckpointer(self.params.env_name, self.params.num_updates, self.timestamp) """Environment""" self.env = env self.storage = RolloutStorage(self.params.rollout_size, self.params.num_envs, self.env.observation_space.shape[0:-1], self.params.n_stack, is_cuda=self.is_cuda) """Network""" self.net = net if self.is_cuda: self.net = self.net.cuda() def train(self): """Environment reset""" obs = self.env.reset() self.storage.states[0].copy_(self.storage.obs2tensor(obs)) for num_update in range(self.params.num_updates): final_value, entropy = self.episode_rollout() self.net.optimizer.zero_grad() """ICM prediction """ # tensors for the curiosity-based loss # feature, feature_pred: fwd_loss # a_t_pred: inv_loss icm_loss = self.net.icm( self.params.num_envs, self.storage.states.view(-1, self.params.n_stack, *self.storage.frame_shape), self.storage.actions.view(-1)) """Assemble loss""" a2c_loss, rewards = self.storage.a2c_loss( final_value, entropy, self.params.value_coeff, self.params.entropy_coeff) loss = a2c_loss + icm_loss loss.backward(retain_graph=False) # gradient clipping nn.utils.clip_grad_norm_(self.net.parameters(), self.params.max_grad_norm) """Log rewards & features""" if len(self.storage.episode_rewards) > 1: self.logger.log( **{ "rewards": np.array(self.storage.episode_rewards), "features": self.storage.features[-1].detach().cpu().numpy() }) self.net.optimizer.step() # it stores a lot of data which let's the graph # grow out of memory, so it is crucial to reset self.storage.after_update() if len(self.storage.episode_rewards) > 1: self.checkpointer.checkpoint(loss, self.storage.episode_rewards, self.net) if num_update % 1000 == 0: print("current loss: ", loss.item(), " at update #", num_update) self.storage.print_reward_stats() # torch.save(self.net.state_dict(), "a2c_time_log_no_norm") self.env.close() self.logger.save(*["rewards", "features"]) self.params.save(self.logger.data_dir, self.timestamp) def episode_rollout(self): episode_entropy = 0 for step in range(self.params.rollout_size): """Interact with the environments """ # call A2C a_t, log_p_a_t, entropy, value, a2c_features = self.net.a2c.get_action( self.storage.get_state(step)) # accumulate episode entropy episode_entropy += entropy # interact obs, rewards, dones, infos = self.env.step(a_t.cpu().numpy()) # save episode reward self.storage.log_episode_rewards(infos) self.storage.insert(step, rewards, obs, a_t, log_p_a_t, value, dones, a2c_features) self.net.a2c.reset_recurrent_buffers(reset_indices=dones) # Note: # get the estimate of the final reward # that's why we have the CRITIC --> estimate final reward # detach, as the final value will only be used as a with torch.no_grad(): _, _, _, final_value, final_features = self.net.a2c.get_action( self.storage.get_state(step + 1)) self.storage.features[step + 1].copy_(final_features) return final_value, episode_entropy
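# The ICM call above combines a forward-model loss (predict the next feature embedding from the
# current features and the action) with an inverse-model loss (predict the taken action from
# consecutive features). The sketch below isolates those two terms, assuming flat feature vectors
# and discrete actions; the module names, sizes, and the beta weighting are placeholders, not the
# network used in this script.
import torch
import torch.nn as nn
import torch.nn.functional as F


class ForwardHead(nn.Module):
    """Predicts phi(s_{t+1}) from phi(s_t) and a one-hot action (placeholder sizes)."""

    def __init__(self, feat_dim=32, n_actions=4):
        super().__init__()
        self.net = nn.Linear(feat_dim + n_actions, feat_dim)

    def forward(self, feat, action_onehot):
        return self.net(torch.cat([feat, action_onehot], dim=1))


class InverseHead(nn.Module):
    """Predicts action logits from phi(s_t) and phi(s_{t+1})."""

    def __init__(self, feat_dim=32, n_actions=4):
        super().__init__()
        self.net = nn.Linear(2 * feat_dim, n_actions)

    def forward(self, feat, next_feat):
        return self.net(torch.cat([feat, next_feat], dim=1))


def icm_loss(feat, next_feat, actions, fwd, inv, beta=0.2):
    onehot = F.one_hot(actions, num_classes=inv.net.out_features).float()
    fwd_loss = F.mse_loss(fwd(feat, onehot), next_feat.detach())   # curiosity / prediction error
    inv_loss = F.cross_entropy(inv(feat, next_feat), actions)      # action inference
    return beta * fwd_loss + (1.0 - beta) * inv_loss


if __name__ == "__main__":
    feat, next_feat = torch.randn(8, 32), torch.randn(8, 32)
    actions = torch.randint(0, 4, (8,))
    print(icm_loss(feat, next_feat, actions, ForwardHead(), InverseHead()).item())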
def train_a_gym_model(env, config): """We train gym-type RL problem using ppo given environment and configuration""" torch.set_num_threads(1) seed = config.get('seed', None) log_dir = config.get('log_dir', '/tmp/gym') log_interval = config.get('log_interval', 10) save_interval = config.get('save_interval', 100) save_dir = config.get('save_dir', 'trained_models/ppo') add_timestep = config.get('add_timestep', False) num_processes = config.get('num_processes', 4) gamma = config.get('gamma', 0.99) num_stack = config.get('num_stack', 1) recurrent_policy = config.get('recurrent_policy', False) cuda = config.get('cuda', True) vis = config.get('vis', True) vis_interval = config.get('vis_interval', 100) env_name = config['env_name'] save_step = config.get('save_step', None) warm_model = config.get('warm_model', None) if save_step is not None: next_save_step = save_step # clean the log folder, if necessary try: os.makedirs(log_dir) except OSError: files = glob.glob(os.path.join(log_dir, '*.monitor.csv')) for f in files: os.remove(f) torch.manual_seed(seed) if cuda: torch.cuda.manual_seed(seed) if vis: from visdom import Visdom port = config.get('port', 8097) viz = Visdom(port=port) win = None envs = [ make_env(env, seed, i, log_dir, add_timestep) for i in range(num_processes) ] if num_processes > 1: envs = SubprocVecEnv(envs) else: envs = DummyVecEnv(envs) if len(envs.observation_space.shape) == 1: envs = VecNormalize(envs, gamma=gamma) obs_shape = envs.observation_space.shape obs_shape = (obs_shape[0] * num_stack, *obs_shape[1:]) if warm_model is None: actor_critic = Policy(obs_shape, envs.action_space, recurrent_policy) else: actor_critic, ob_rms, ret_rms = torch.load(warm_model) envs.ob_rms = ob_rms # also use previous existing observation rms envs.ret_rms = ret_rms if envs.action_space.__class__.__name__ == "Discrete": action_shape = 1 else: action_shape = envs.action_space.shape[0] if cuda: actor_critic.cuda() clip_param = config.get('clip_param', 0.2) ppo_epoch = config.get('ppo_epoch', 4) num_mini_batch = config.get('num_mini_batch', 32) value_loss_coef = config.get('value_loss_coef', 0.5) entropy_coef = config.get('entropy_coef', 0.01) lr = config.get('lr', 1e-3) eps = config.get('eps', 1e-5) max_grad_norm = config.get('max_grad_norm', 0.5) use_gae = config.get('use_gae', False) tau = config.get('tau', 0.95) num_steps = config.get('num_steps', 100) num_frames = config.get('num_frames', 1e6) num_updates = int(num_frames) // num_steps // num_processes agent = algo.PPO(actor_critic, clip_param, ppo_epoch, num_mini_batch, value_loss_coef, entropy_coef, lr=lr, eps=eps, max_grad_norm=max_grad_norm) rollouts = RolloutStorage(num_steps, num_processes, obs_shape, envs.action_space, actor_critic.state_size) current_obs = torch.zeros(num_processes, *obs_shape) obs = envs.reset() update_current_obs(obs, current_obs, obs_shape, num_stack) rollouts.observations[0].copy_(current_obs) # These variables are used to compute average rewards for all processes. 
episode_rewards = torch.zeros([num_processes, 1]) final_rewards = torch.zeros([num_processes, 1]) if cuda: current_obs = current_obs.cuda() rollouts.cuda() def save_the_model(num=None): """num is additional information""" # save it after training save_path = save_dir try: os.makedirs(save_path) except OSError: pass # A really ugly way to save a model to CPU save_model = actor_critic if cuda: save_model = copy.deepcopy(actor_critic).cpu() save_model = [ save_model, hasattr(envs, 'ob_rms') and envs.ob_rms or None, hasattr(envs, 'ret_rms') and envs.ret_rms or None ] if num is None: save_name = '%s.pt' % env_name else: save_name = '%s_at_%d.pt' % (env_name, int(num)) torch.save(save_model, os.path.join(save_path, save_name)) start = time.time() for j in range(1, 1 + num_updates): for step in range(num_steps): # Sample actions with torch.no_grad(): value, action, action_log_prob, states = actor_critic.act( rollouts.observations[step], rollouts.states[step], rollouts.masks[step]) cpu_actions = action.squeeze(1).cpu().numpy() # Obser reward and next obs obs, reward, done, info = envs.step(cpu_actions) reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float() episode_rewards += reward # If done then clean the history of observations. masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) final_rewards *= masks final_rewards += (1 - masks) * episode_rewards episode_rewards *= masks if cuda: masks = masks.cuda() if current_obs.dim() == 4: current_obs *= masks.unsqueeze(2).unsqueeze(2) else: current_obs *= masks update_current_obs(obs, current_obs, obs_shape, num_stack) rollouts.insert(current_obs, states, action, action_log_prob, value, reward, masks) with torch.no_grad(): next_value = actor_critic.get_value(rollouts.observations[-1], rollouts.states[-1], rollouts.masks[-1]).detach() rollouts.compute_returns(next_value, use_gae, gamma, tau) value_loss, action_loss, dist_entropy = agent.update(rollouts) rollouts.after_update() if j % save_interval == 0 and save_dir != "": save_the_model() if save_step is not None: total_num_steps = j * num_processes * num_steps if total_num_steps > next_save_step: save_the_model(total_num_steps) next_save_step += save_step if j % log_interval == 0: end = time.time() total_num_steps = j * num_processes * num_steps print( "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}" .format(j, total_num_steps, int(total_num_steps / (end - start)), final_rewards.mean(), final_rewards.median(), final_rewards.min(), final_rewards.max(), dist_entropy, value_loss, action_loss)) if vis and j % vis_interval == 0: try: # Sometimes monitor doesn't properly flush the outputs win = visdom_plot(viz, win, log_dir, env_name, 'ppo', num_frames) except IOError: pass # finally save model again save_the_model()
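# Every script in this file calls rollouts.compute_returns(next_value, use_gae, gamma, tau) before
# the policy update. The sketch below shows what that computation typically looks like with
# generalised advantage estimation, assuming tensors shaped [num_steps, num_processes, 1]; it is an
# illustration of the recurrence, not the RolloutStorage class used here.
import torch


def gae_returns(rewards, values, masks, next_value, gamma=0.99, tau=0.95):
    """rewards/values/masks: [T, N, 1]; next_value: [N, 1].

    delta[t]     = r[t] + gamma * mask[t] * V(s_{t+1}) - V(s_t)
    advantage[t] = delta[t] + gamma * tau * mask[t] * advantage[t+1]
    returns[t]   = advantage[t] + V(s_t)
    """
    T = rewards.size(0)
    values = torch.cat([values, next_value.unsqueeze(0)], dim=0)  # append bootstrap value -> [T+1, N, 1]
    returns = torch.zeros_like(rewards)
    gae = torch.zeros_like(next_value)
    for t in reversed(range(T)):
        delta = rewards[t] + gamma * masks[t] * values[t + 1] - values[t]
        gae = delta + gamma * tau * masks[t] * gae
        returns[t] = gae + values[t]
    return returns


if __name__ == "__main__":
    T, N = 5, 2
    r, v, m = torch.rand(T, N, 1), torch.rand(T, N, 1), torch.ones(T, N, 1)
    print(gae_returns(r, v, m, torch.rand(N, 1)).shape)  # torch.Size([5, 2, 1])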
def main(): torch.set_num_threads(1) device = torch.device("cuda:0" if args.cuda else "cpu") tbwriter = SummaryWriter(log_dir=args.save_dir) if args.vis: from visdom import Visdom viz = Visdom(port=args.port) win = None envs = make_vec_envs(args.env_name, args.seed, args.num_processes, args.gamma, args.log_dir, args.add_timestep, device, False) args.obs_mean, args.obs_std = get_env_mean_std(args.env_name, args.seed) actor_critic = Policy(envs.observation_space.shape, envs.action_space, base_kwargs={ 'recurrent': args.recurrent_policy, 'obs_mean': args.obs_mean, 'obs_std': args.obs_std }) if args.use_curiosity: # Works only for discrete actions currently fwd_model = ForwardModel(envs.action_space.n, state_size=512, hidden_size=256) inv_model = InverseModel(envs.action_space.n, state_size=512, hidden_size=256) fwd_model.to(device) inv_model.to(device) else: fwd_model = None inv_model = None actor_critic.to(device) if args.algo == 'a2c': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, alpha=args.alpha, max_grad_norm=args.max_grad_norm, acktr=False, norm_adv=args.norm_adv, use_curiosity=args.use_curiosity, fwd_model=fwd_model, inv_model=inv_model, curiosity_beta=args.curiosity_beta, curiosity_lambda=args.curiosity_lambda) elif args.algo == 'ppo': agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm, use_curiosity=args.use_curiosity, fwd_model=fwd_model, inv_model=inv_model, curiosity_beta=args.curiosity_beta, curiosity_lambda=args.curiosity_lambda) elif args.algo == 'acktr': if args.use_curiosity: raise NotImplementedError agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, acktr=True) rollouts = RolloutStorage(args.num_steps, args.num_processes, envs.observation_space.shape, envs.action_space, actor_critic.recurrent_hidden_state_size, norm_rew=args.norm_rew) obs = envs.reset() rollouts.obs[0].copy_(obs) rollouts.to(device) episode_rewards = deque(maxlen=10) start = time.time() for j in range(num_updates): for step in range(args.num_steps): # Sample actions with torch.no_grad(): value, action, action_log_prob, recurrent_hidden_states, actor_features = actor_critic.act_curiosity( rollouts.obs[step], rollouts.recurrent_hidden_states[step], rollouts.masks[step]) # Obser reward and next obs obs, reward, done, infos = envs.step(action) reward = reward.to(device) # If done then clean the history of observations. 
masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]).to(device) if args.use_curiosity: with torch.no_grad(): next_actor_features = actor_critic.get_features( obs, recurrent_hidden_states, masks).detach() # Augment reward with curiosity rewards action_onehot = torch.zeros(args.num_processes, envs.action_space.n, device=device) action_onehot.scatter_(1, action.view(-1, 1).long(), 1) with torch.no_grad(): pred_actor_features = fwd_model(actor_features, action_onehot).detach() curiosity_rewards = 0.5 * torch.mean(F.mse_loss( pred_actor_features, next_actor_features, reduce=False), dim=1).view(-1, 1) reward = reward + args.curiosity_eta * curiosity_rewards for info in infos: if 'episode' in info.keys(): episode_rewards.append(info['episode']['r']) rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks) with torch.no_grad(): next_value = actor_critic.get_value( rollouts.obs[-1], rollouts.recurrent_hidden_states[-1], rollouts.masks[-1]).detach() rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau) if not args.use_curiosity: value_loss, action_loss, dist_entropy = agent.update(rollouts, device=device) else: value_loss, action_loss, dist_entropy, fwd_loss, inv_loss = agent.update( rollouts, device=device) rollouts.after_update() if j % args.save_interval == 0 and args.save_dir != "": save_path = os.path.join(args.save_dir, args.algo) try: os.makedirs(save_path) except OSError: pass # A really ugly way to save a model to CPU save_model = actor_critic if args.cuda: save_model = copy.deepcopy(actor_critic).cpu() save_model = [ save_model, hasattr(envs.venv, 'ob_rms') and envs.venv.ob_rms or None ] torch.save(save_model, os.path.join(save_path, args.env_name + ".pt")) total_num_steps = (j + 1) * args.num_processes * args.num_steps if j % args.log_interval == 0 and len(episode_rewards) > 1: end = time.time() print( "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n" .format(j, total_num_steps, int(total_num_steps / (end - start)), len(episode_rewards), np.mean(episode_rewards), np.median(episode_rewards), np.min(episode_rewards), np.max(episode_rewards), dist_entropy, value_loss, action_loss)) tbwriter.add_scalar('mean reward', np.mean(episode_rewards), total_num_steps) tbwriter.add_scalar('median reward', np.median(episode_rewards), total_num_steps) tbwriter.add_scalar('dist_entropy', dist_entropy, total_num_steps) tbwriter.add_scalar('value_loss', value_loss, total_num_steps) tbwriter.add_scalar('action_loss', action_loss, total_num_steps) if args.use_curiosity: print("fwd loss: {:.5f}, inv loss: {:.5f}".format( fwd_loss, inv_loss)) tbwriter.add_scalar('fwd_loss', fwd_loss, total_num_steps) tbwriter.add_scalar('inv_loss', inv_loss, total_num_steps) if args.eval_interval is not None and len( episode_rewards) > 1 and j % args.eval_interval == 0: eval_envs = make_vec_envs(args.env_name, args.seed + args.num_processes, args.num_processes, args.gamma, eval_log_dir, args.add_timestep, device, True) if eval_envs.venv.__class__.__name__ == "VecNormalize": eval_envs.venv.ob_rms = envs.venv.ob_rms # An ugly hack to remove updates def _obfilt(self, obs): if self.ob_rms: obs = np.clip((obs - self.ob_rms.mean) / np.sqrt(self.ob_rms.var + self.epsilon), -self.clipob, self.clipob) return obs else: return obs eval_envs.venv._obfilt = types.MethodType(_obfilt, envs.venv) eval_episode_rewards = [] obs = eval_envs.reset() eval_recurrent_hidden_states = 
torch.zeros( args.num_processes, actor_critic.recurrent_hidden_state_size, device=device) eval_masks = torch.zeros(args.num_processes, 1, device=device) while len(eval_episode_rewards) < 10: with torch.no_grad(): _, action, _, eval_recurrent_hidden_states = actor_critic.act( obs, eval_recurrent_hidden_states, eval_masks, deterministic=True) # Obser reward and next obs obs, reward, done, infos = eval_envs.step(action) eval_masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) for info in infos: if 'episode' in info.keys(): eval_episode_rewards.append(info['episode']['r']) eval_envs.close() print(" Evaluation using {} episodes: mean reward {:.5f}\n".format( len(eval_episode_rewards), np.mean(eval_episode_rewards))) if args.vis and j % args.vis_interval == 0: try: # Sometimes monitor doesn't properly flush the outputs win = visdom_plot(viz, win, args.log_dir, args.env_name, args.algo, args.num_frames) except IOError: pass
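# In the curiosity variant above, the extrinsic reward is augmented with the forward model's
# prediction error before it is stored in the rollouts. A minimal sketch of that bonus follows;
# the placeholder forward model and the eta scale are assumptions for illustration only.
import torch
import torch.nn.functional as F


def curiosity_bonus(fwd_model, features, next_features, action_onehot, eta=0.01):
    """Half the mean squared forward-model error per environment, scaled by eta."""
    with torch.no_grad():
        pred = fwd_model(features, action_onehot)
        error = 0.5 * F.mse_loss(pred, next_features, reduction="none").mean(dim=1, keepdim=True)
    return eta * error  # shape [num_processes, 1]; added to the extrinsic reward


if __name__ == "__main__":
    fwd = lambda f, a: torch.cat([f, a], dim=1) @ torch.randn(36, 32)  # toy stand-in model
    feats, nxt = torch.randn(4, 32), torch.randn(4, 32)
    onehot = F.one_hot(torch.randint(0, 4, (4,)), num_classes=4).float()
    reward = torch.randn(4, 1)
    print((reward + curiosity_bonus(fwd, feats, nxt, onehot)).shape)  # torch.Size([4, 1])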
def main(args): env = GymEnvironment(args, gamma) env.env = env.env.unwrapped actor_critic = Policy(obs_shape, env.action_size, base_kwargs={'recurrent': False}) actor_critic.to(device) agent = PPO(actor_critic, clip_param, ppo_epoch, num_mini_batch, value_loss_coef, entropy_coef, lr, eps, max_grad_norm) rollouts = RolloutStorage(num_steps, num_processes, obs_shape, env.action_space, actor_critic.recurrent_hidden_state_size) current_obs = torch.zeros(num_processes, *obs_shape) obs, _, _, _ = env.new_expt() obs = obs[np.newaxis, ...] current_obs[:, -1] = torch.from_numpy(obs) rollouts.obs[0].copy_(current_obs) current_obs = current_obs.to(device) rollouts.to(device) num_updates = math.ceil(args.max_timesteps / (num_processes * num_steps)) n_goal_reached = 0 n_episodes = 0 for j in range(num_updates): for step in range(num_steps): with torch.no_grad(): value, action, action_log_prob, recurrent_hidden_states = actor_critic.act( rollouts.obs[step], rollouts.recurrent_hidden_states[step], rollouts.masks[step]) cpu_actions = action.squeeze(1).cpu().numpy() (obs, reward, done), goal_reached = env.act(action) reward = torch.from_numpy(np.expand_dims(np.stack([reward]), 1)).float() masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in [done]]) masks = masks.to(device) current_obs[:, :-1] = current_obs[:, 1:] if done: current_obs[:] = 0 current_obs[:, -1] = torch.from_numpy(obs) rollouts.insert(current_obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks) if done: n_episodes += 1 env.new_expt() if goal_reached: n_goal_reached += 1 with torch.no_grad(): next_value = actor_critic.get_value( rollouts.obs[step], rollouts.recurrent_hidden_states[step], rollouts.masks[step]).detach() rollouts.compute_returns(next_value, use_gae, gamma, tau, step) value_loss, action_loss, dist_entropy = agent.update(rollouts, step) rollouts.after_update() if j % log_interval == 0: total_num_steps = (j + 1) * num_processes * num_steps try: success = float(n_goal_reached) / n_episodes except ZeroDivisionError: success = 0. print( "Timesteps: {}, Goal reached : {} / {}, Success %: {}".format( total_num_steps, n_goal_reached, n_episodes, success)) if args.lang_coeff > 0: av_list = np.array(env.action_vectors_list) for k in range(len(spearman_corr_coeff_actions)): sr, _ = spearmanr(env.rewards_list, av_list[:, k]) print(k, sr)
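# The single-process script above keeps a small frame stack in current_obs: it shifts older frames
# out, zeroes the stack on episode boundaries, and writes the newest observation into the last
# slot. A minimal sketch of that buffer update is below; the shapes are illustrative only.
import numpy as np
import torch


def push_frame(current_obs, obs, done):
    """current_obs: [N, stack, ...]; obs: newest frame as a numpy array; done: bool."""
    current_obs[:, :-1] = current_obs[:, 1:]      # shift the stack left by one frame
    if done:
        current_obs[:] = 0                        # forget history across episode boundaries
    current_obs[:, -1] = torch.from_numpy(obs)    # write the newest frame
    return current_obs


if __name__ == "__main__":
    buf = torch.zeros(1, 4, 84, 84)
    frame = np.random.rand(84, 84).astype(np.float32)
    buf = push_frame(buf, frame, done=False)
    print(buf[:, -1].abs().sum().item() > 0)  # True: newest slot now holds the frame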
def main(): torch.set_num_threads(1) device = torch.device("cuda:0" if args.cuda else "cpu") """ if args.vis: from visdom import Visdom viz = Visdom(port=args.port) win = None """ envs = make_vec_envs(args.env_name, args.seed, args.num_processes, args.gamma, args.log_dir, args.add_timestep, device, False) actor_critic = Policy(envs.observation_space.shape, envs.action_space, base_kwargs={'recurrent': args.recurrent_policy}) actor_critic.to(device) if args.algo == 'a2c': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, alpha=args.alpha, max_grad_norm=args.max_grad_norm) elif args.algo == 'ppo': agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm) elif args.algo == 'acktr': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, acktr=True) rollouts = RolloutStorage(args.num_steps, args.num_processes, envs.observation_space.shape, envs.action_space, actor_critic.recurrent_hidden_state_size) obs = envs.reset() rollouts.obs[0].copy_(obs) rollouts.to(device) episode_rewards = deque(maxlen=100) start = time.time() for j in range(num_updates): for step in range(args.num_steps): # Sample actions with torch.no_grad(): value, action, action_log_prob, recurrent_hidden_states = actor_critic.act( rollouts.obs[step], rollouts.recurrent_hidden_states[step], rollouts.masks[step]) # Obser reward and next obs obs, reward, done, infos = envs.step(action) """ for info in infos: if 'episode' in info.keys(): print(reward) episode_rewards.append(info['episode']['r']) """ # FIXME: works only for environments with sparse rewards for idx, eps_done in enumerate(done): if eps_done: episode_rewards.append(reward[idx]) # If done then clean the history of observations. masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks) with torch.no_grad(): next_value = actor_critic.get_value(rollouts.obs[-1], rollouts.recurrent_hidden_states[-1], rollouts.masks[-1]).detach() rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau) value_loss, action_loss, dist_entropy = agent.update(rollouts) rollouts.after_update() if j % args.save_interval == 0 and args.save_dir != "": print('Saving model') print() save_path = os.path.join(args.save_dir, args.algo) try: os.makedirs(save_path) except OSError: pass # A really ugly way to save a model to CPU save_model = actor_critic if args.cuda: save_model = copy.deepcopy(actor_critic).cpu() save_model = [save_model, hasattr(envs.venv, 'ob_rms') and envs.venv.ob_rms or None] torch.save(save_model, os.path.join(save_path, args.env_name + ".pt")) total_num_steps = (j + 1) * args.num_processes * args.num_steps if j % args.log_interval == 0 and len(episode_rewards) > 1: end = time.time() print("Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.2f}/{:.2f}, min/max reward {:.2f}/{:.2f}, success rate {:.2f}\n". 
format( j, total_num_steps, int(total_num_steps / (end - start)), len(episode_rewards), np.mean(episode_rewards), np.median(episode_rewards), np.min(episode_rewards), np.max(episode_rewards), np.count_nonzero(episode_rewards) / len(episode_rewards) ) ) if args.eval_interval is not None and len(episode_rewards) > 1 and j % args.eval_interval == 0: eval_envs = make_vec_envs(args.env_name, args.seed + args.num_processes, args.num_processes, args.gamma, eval_log_dir, args.add_timestep, device, True) if eval_envs.venv.__class__.__name__ == "VecNormalize": eval_envs.venv.ob_rms = envs.venv.ob_rms # An ugly hack to remove updates def _obfilt(self, obs): if self.ob_rms: obs = np.clip((obs - self.ob_rms.mean) / np.sqrt(self.ob_rms.var + self.epsilon), -self.clipob, self.clipob) return obs else: return obs eval_envs.venv._obfilt = types.MethodType(_obfilt, envs.venv) eval_episode_rewards = [] obs = eval_envs.reset() eval_recurrent_hidden_states = torch.zeros(args.num_processes, actor_critic.recurrent_hidden_state_size, device=device) eval_masks = torch.zeros(args.num_processes, 1, device=device) while len(eval_episode_rewards) < 10: with torch.no_grad(): _, action, _, eval_recurrent_hidden_states = actor_critic.act( obs, eval_recurrent_hidden_states, eval_masks, deterministic=True) # Obser reward and next obs obs, reward, done, infos = eval_envs.step(action) eval_masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) for info in infos: if 'episode' in info.keys(): eval_episode_rewards.append(info['episode']['r']) eval_envs.close() print(" Evaluation using {} episodes: mean reward {:.5f}\n".format( len(eval_episode_rewards), np.mean(eval_episode_rewards) )) """
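# The evaluation block above copies the training environments' running observation statistics into
# the evaluation environments and freezes them, so evaluation does not shift the normalisation.
# The sketch below shows the idea with a toy running filter rather than the real VecNormalize
# wrapper; the update rule and class name are assumptions for illustration.
import numpy as np


class RunningObsFilter:
    """Toy stand-in for VecNormalize's observation filter."""

    def __init__(self, shape, clip=10.0, eps=1e-8):
        self.mean = np.zeros(shape)
        self.var = np.ones(shape)
        self.clip = clip
        self.eps = eps
        self.training = True

    def __call__(self, obs):
        if self.training:  # update statistics only while training
            self.mean = 0.99 * self.mean + 0.01 * obs
            self.var = 0.99 * self.var + 0.01 * (obs - self.mean) ** 2
        return np.clip((obs - self.mean) / np.sqrt(self.var + self.eps), -self.clip, self.clip)


if __name__ == "__main__":
    train_filter = RunningObsFilter(shape=(4,))
    eval_filter = RunningObsFilter(shape=(4,))
    eval_filter.mean, eval_filter.var = train_filter.mean.copy(), train_filter.var.copy()
    eval_filter.training = False  # freeze the statistics, mirroring vec_norm.eval()
    print(eval_filter(np.ones(4)).shape)  # (4,)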
class VecEnvAgent(object): def __init__(self, envs, args): self.envs = envs self.args = args obs_shape = self.envs.observation_space.shape self.obs_shape = (obs_shape[0] * self.args.num_stack, *obs_shape[1:]) self.actor_critic = self.select_network() self.optimizer = self.select_optimizer() if self.args.cuda: self.actor_critic.cuda() self.action_shape = 1 if self.envs.action_space.__class__.__name__ == "Discrete" \ else self.envs.action_space.shape[0] self.current_obs = torch.zeros(self.args.num_processes, *self.obs_shape) obs = self.envs.reset() self.update_current_obs(obs) self.rollouts = RolloutStorage(self.args.num_steps, self.args.num_processes, self.obs_shape, self.envs.action_space, self.actor_critic.state_size) self.rollouts.observations[0].copy_(self.current_obs) # These variables are used to compute average rewards for all processes. self.episode_rewards = torch.zeros([self.args.num_processes, 1]) self.final_rewards = torch.zeros([self.args.num_processes, 1]) if self.args.cuda: self.current_obs = self.current_obs.cuda() self.rollouts.cuda() if self.args.vis: from visdom import Visdom self.viz = Visdom(port=args.port) self.win = None def select_network(self): if len(self.envs.observation_space.shape) == 3: actor_critic = CNNPolicy(self.obs_shape[0], self.envs.action_space, self.args.recurrent_policy) else: assert not self.args.recurrent_policy, \ "Recurrent policy is not implemented for the MLP controller" actor_critic = MLPPolicy(self.obs_shape[0], self.envs.action_space) #actor_critic = BPW_MLPPolicy(obs_shape[0], self.envs.action_space) return actor_critic def select_optimizer(self): if self.args.algo == 'a2c' and not self.args.use_adam: optimizer = optim.RMSprop(self.actor_critic.parameters(), self.args.lr, eps=self.args.eps, alpha=self.args.alpha) elif self.args.algo == 'ppo' or self.args.algo == 'a2c': optimizer = optim.Adam(self.actor_critic.parameters(), self.args.lr, eps=self.args.eps) self.meta_optimizer = Adam_Custom(self.actor_critic.parameters(), lr=self.args.lr,eps=self.args.eps) elif self.args.algo == 'acktr': optimizer = KFACOptimizer(self.actor_critic) else: raise TypeError("Optimizer should be any one from {a2c, ppo, acktr}") return optimizer def update_current_obs(self, obs): shape_dim0 = self.envs.observation_space.shape[0] obs = torch.from_numpy(obs).float() if self.args.num_stack > 1: self.current_obs[:, :-shape_dim0] = self.current_obs[:, shape_dim0:] self.current_obs[:, -shape_dim0:] = obs def run(self): for step in range(self.args.num_steps): value, action, action_log_prob, states = self.actor_critic.act( Variable(self.rollouts.observations[step], volatile=True), Variable(self.rollouts.states[step], volatile=True), Variable(self.rollouts.masks[step], volatile=True) ) cpu_actions = action.data.squeeze(1).cpu().numpy() #print (cpu_actions) #input() # Obser reward and next obs obs, reward, done, info = self.envs.step(cpu_actions) reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float() self.episode_rewards += reward # If done then clean the history of observations. 
masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) self.final_rewards *= masks self.final_rewards += (1 - masks) * self.episode_rewards self.episode_rewards *= masks if self.args.cuda: masks = masks.cuda() if self.current_obs.dim() == 4: self.current_obs *= masks.unsqueeze(2).unsqueeze(2) else: self.current_obs *= masks self.update_current_obs(obs) self.rollouts.insert(step, self.current_obs, states.data, action.data, action_log_prob.data, value.data, reward, masks) next_value = self.actor_critic( Variable(self.rollouts.observations[-1], volatile=True), Variable(self.rollouts.states[-1], volatile=True), Variable(self.rollouts.masks[-1], volatile=True) )[0].data self.rollouts.compute_returns(next_value, self.args.use_gae, self.args.gamma, self.args.tau) dist_entropy, value_loss, action_loss = update(self) self.rollouts.after_update() return dist_entropy, value_loss, action_loss def meta_run(self,theta_loss,theta_grad): for step in range(self.args.num_steps): value, action, action_log_prob, states = self.actor_critic.act( Variable(self.rollouts.observations[step], volatile=True), Variable(self.rollouts.states[step], volatile=True), Variable(self.rollouts.masks[step], volatile=True) ) cpu_actions = action.data.squeeze(1).cpu().numpy() #print (cpu_actions) #input() # Obser reward and next obs obs, reward, done, info = self.envs.step(cpu_actions) reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float() self.episode_rewards += reward # If done then clean the history of observations. masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) self.final_rewards *= masks self.final_rewards += (1 - masks) * self.episode_rewards self.episode_rewards *= masks if self.args.cuda: masks = masks.cuda() if self.current_obs.dim() == 4: self.current_obs *= masks.unsqueeze(2).unsqueeze(2) else: self.current_obs *= masks self.update_current_obs(obs) self.rollouts.insert(step, self.current_obs, states.data, action.data, action_log_prob.data, value.data, reward, masks) next_value = self.actor_critic( Variable(self.rollouts.observations[-1], volatile=True), Variable(self.rollouts.states[-1], volatile=True), Variable(self.rollouts.masks[-1], volatile=True) )[0].data self.rollouts.compute_returns(next_value, self.args.use_gae, self.args.gamma, self.args.tau) dist_entropy, value_loss, action_loss = meta_update(self,theta_loss,theta_grad) self.rollouts.after_update() return dist_entropy, value_loss, action_loss # def update_net(self,dist_entropy,value_loss,action_loss): # # self.optimizer.zero_grad() # # (value_loss + action_loss - dist_entropy * 0.01).backward() # # nn.utils.clip_grad_norm(self.actor_critic.parameters(), 0.2) # # self.optimizer.step() # update_network(self,dist_entropy,value_loss,action_loss) def evaluate(self,j,dist_entropy,value_loss,action_loss,model_file=None): end = time.time() total_num_steps = (j + 1) * self.args.num_processes * self.args.num_steps print("Updates {}, num timesteps {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}". 
format(j, total_num_steps, self.final_rewards.mean(), self.final_rewards.median(), self.final_rewards.min(), self.final_rewards.max(), dist_entropy.data[0], value_loss.data[0], action_loss.data[0])) try: # Sometimes monitor doesn't properly flush the outputs self.win = visdom_plot(self.viz, self.win, self.args.log_dir, self.args.env_name, self.args.algo) except IOError: pass def train(self, num_updates): start = time.time() for j in range(num_updates): dist_entropy, value_loss, action_loss = self.run() if j % self.args.save_interval == 0 and self.args.save_dir != "": save_path = os.path.join(self.args.save_dir, self.args.algo) try: os.makedirs(save_path) except OSError: pass # A really ugly way to save a model to CPU save_model = self.actor_critic if self.args.cuda: save_model = copy.deepcopy(self.actor_critic).cpu() save_model = [save_model, hasattr(self.envs, 'ob_rms') and self.envs.ob_rms or None] torch.save(save_model, os.path.join(save_path, self.args.env_name + ".pt")) if j % self.args.log_interval == 0: end = time.time() total_num_steps = (j + 1) * self.args.num_processes * self.args.num_steps print("Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}". format(j, total_num_steps, int(total_num_steps / (end - start)), self.final_rewards.mean(), self.final_rewards.median(), self.final_rewards.min(), self.final_rewards.max(), dist_entropy.data[0], value_loss.data[0], action_loss.data[0])) if self.args.vis and j % self.args.vis_interval == 0: try: # Sometimes monitor doesn't properly flush the outputs self.win = visdom_plot(self.viz, self.win, self.args.log_dir, self.args.env_name, self.args.algo) except IOError: pass def train_maml(self, num_updates): start = time.time() theta_list = [] num_tasks = 1000 sample_size = 10 # episode_id: episode_id%10==0) # env = gym.wrappers.Monitor(self.envs, self.args.save_dir, video_callable=lambda episode_id: episode_id%10==0) # Create the variations needed task_list = [] for i in range(num_tasks): friction = np.random.randint(low=1, high=10, size=3).astype('float32')/10. 
friction_1 = np.random.uniform(low=0.1, high=0.8, size=3).astype('float32') task = {'default/geom': ['', 'friction', '{0:.1f} {1:.1f} {2:.1f}'.format( friction[0], friction[1], friction[2])], 'worldbody/body/body/geom': [[['name', 'fthigh'], ['type', 'capsule']], 'friction', '{0:.1f} {1:.1f} {2:.1f}'.format( friction_1[0], friction_1[1], friction_1[2])] } # task2 = {'option': ['gravity', '{0:.2f} {1:.2f} {2:.2f}'.format(0,0,gravity_z)]} task_list.append(task) for j in range(num_updates): sample_indexes = np.random.randint(0, num_tasks, size=sample_size) # Get the theta if j == 0: theta = self.get_weights() # Inner loop # First gradient for i, sample_index in enumerate(sample_indexes): # Get the task task = task_list[sample_index] env = self.envs.venv.envs[0] # env = gym.wrappers.Monitor(env.env, './videos2/', video_callable=lambda episode_id: episode_id%10==0) _tag_names = [] _tag_identifiers = [] _attributes = [] _values = [] for k in task.keys(): v = task[k] _tag_names.append(k) _tag_identifiers.append(v[0]) _attributes.append(v[1]) _values.append(v[2]) env.env.env.my_init(_tag_names, \ _tag_identifiers, _attributes, \ _values, None) # Set the model weights to theta before training self.set_weights(theta) dist_entropy, value_loss, action_loss = self.run() if j == 0: theta_list.append(self.get_weights()) else: print(i) theta_list[i] = self.get_weights() # Second gradiet theta_copy = deepcopy(theta) for k1, sample_index in enumerate(sample_indexes): # Get the task task = task_list[sample_index] env = self.envs.venv.envs[0] _tag_names = [] _tag_identifiers = [] _attributes = [] _values = [] for k in task.keys(): v = task[k] _tag_names.append(k) _tag_identifiers.append(v[0]) _attributes.append(v[1]) _values.append(v[2]) env.env.env.my_init(_tag_names, \ _tag_identifiers, _attributes, \ _values, None) # Get the network loss for this task for 1 episode # TODO: There should be no while loop # while self.a2c.n_episodes < 1: dist_entropy, value_loss, action_loss = self.meta_run(theta_list[k1],theta_copy) theta = self.get_weights() # Set the model weights to theta # self.set_weights(theta) # Update theta # Change the update network function # theta['state_dict'] = self.agent.update_net(theta['state_dict'],dist_entropy,value_loss,action_loss) # env = gym.wrappers.Monitor(env, './videos/', video_callable=lambda episode_id: episode_id%10==0,force=True) if j % self.args.save_interval == 0 and self.args.save_dir != "": save_path = os.path.join(self.args.save_dir, self.args.algo) try: os.makedirs(save_path) except OSError: pass model_state = {'num_updates': j, 'state_dict': self.actor_critic.state_dict(), 'optimizer': self.meta_optimizer.state_dict() } model_state = [model_state,hasattr(self.envs, 'ob_rms') and self.envs.ob_rms or None] torch.save(model_state, os.path.join(save_path, self.args.env_name + 'update_'+ str(j) +".pt")) # # A really ugly way to save a model to CPU # save_model = self.actor_critic # if self.args.cuda: # save_model = copy.deepcopy(self.actor_critic).cpu() # save_model = [save_model, # hasattr(self.envs, 'ob_rms') and self.envs.ob_rms or None] # torch.save(save_model, os.path.join(save_path, self.args.env_name + ".pt")) if j % self.args.log_interval == 0: end = time.time() total_num_steps = (j + 1) * self.args.num_processes * self.args.num_steps print("Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}". 
format(j, total_num_steps, int(total_num_steps / (end - start)), self.final_rewards.mean(), self.final_rewards.median(), self.final_rewards.min(), self.final_rewards.max(), dist_entropy.data[0], value_loss.data[0], action_loss.data[0])) if self.args.vis and j % self.args.vis_interval == 0: try: # Sometimes monitor doesn't properly flush the outputs self.win = visdom_plot(self.viz, self.win, self.args.log_dir, self.args.env_name, self.args.algo) except IOError: pass def get_weights(self): # state_dicts = {'id': id, # 'state_dict': self.actor_critic.state_dict(), # } return self.actor_critic.state_dict() def set_weights(self, state_dicts): checkpoint = state_dicts self.actor_critic.load_state_dict(checkpoint)
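# train_maml above samples task variations, adapts the weights on each task, and then folds the
# adapted weights back into the shared initialisation via meta_run/meta_update. Below is a
# simplified first-order stand-in for that outer step (a Reptile-style interpolation over
# state_dicts); it is an assumption-laden sketch, not the exact meta update used in this class.
import copy
import torch
import torch.nn as nn


def reptile_outer_step(theta, adapted_thetas, outer_lr=0.1):
    """Move the shared initialisation a fraction of the way toward the mean adapted solution."""
    new_theta = copy.deepcopy(theta)
    for name in new_theta:
        mean_adapted = torch.stack([t[name].float() for t in adapted_thetas]).mean(dim=0)
        new_theta[name] = theta[name] + outer_lr * (mean_adapted - theta[name])
    return new_theta


if __name__ == "__main__":
    net = nn.Linear(3, 2)
    theta = copy.deepcopy(net.state_dict())
    # Pretend two tasks produced slightly different adapted weights.
    adapted = [{k: v + 0.05 * torch.randn_like(v) for k, v in theta.items()} for _ in range(2)]
    net.load_state_dict(reptile_outer_step(theta, adapted))
    print(net.weight.shape)  # torch.Size([2, 3])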
def main(): print("#######") print( "WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards" ) print("#######") os.environ['OMP_NUM_THREADS'] = '1' if args.vis: from visdom import Visdom viz = Visdom() win = None envs = [ make_env(args.env_name, args.seed, i, args.log_dir) for i in range(args.num_processes) ] if args.num_processes > 1: envs = SubprocVecEnv(envs) else: envs = DummyVecEnv(envs) if len(envs.observation_space.shape) == 1: envs = VecNormalize(envs) obs_shape = envs.observation_space.shape obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:] ) # I guess the obs_shape[0] is channel number if len(envs.observation_space.shape) == 3: actor_critic = CNNPolicy(obs_shape[0], envs.action_space, args.recurrent_policy) else: assert not args.recurrent_policy, \ "Recurrent policy is not implemented for the MLP controller" actor_critic = MLPPolicy(obs_shape[0], envs.action_space) if envs.action_space.__class__.__name__ == "Discrete": action_shape = 1 else: action_shape = envs.action_space.shape[0] if args.cuda: actor_critic.cuda() if args.algo == 'a2c': optimizer = optim.RMSprop(actor_critic.parameters(), args.lr, eps=args.eps, alpha=args.alpha) elif args.algo == 'ppo': optimizer = optim.Adam(actor_critic.parameters(), args.lr, eps=args.eps) elif args.algo == 'acktr': optimizer = KFACOptimizer(actor_critic) rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape, envs.action_space, actor_critic.state_size) current_obs = torch.zeros(args.num_processes, *obs_shape) def update_current_obs(obs): shape_dim0 = envs.observation_space.shape[0] obs = torch.from_numpy(obs).float() if args.num_stack > 1: current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:] current_obs[:, -shape_dim0:] = obs obs = envs.reset() update_current_obs(obs) rollouts.observations[0].copy_(current_obs) # These variables are used to compute average rewards for all processes. episode_rewards = torch.zeros([args.num_processes, 1]) final_rewards = torch.zeros([args.num_processes, 1]) if args.cuda: current_obs = current_obs.cuda() rollouts.cuda() start = time.time() for j in range(num_updates): for step in range(args.num_steps): # args.num_steps should be the length of interactions before each updating/training # Sample actions value, action, action_log_prob, states = actor_critic.act( Variable(rollouts.observations[step], volatile=True), Variable(rollouts.states[step], volatile=True), Variable(rollouts.masks[step], volatile=True)) cpu_actions = action.data.squeeze(1).cpu().numpy( ) # returns are state value, sampled action, act_log_prob, hidden states # Obser reward and next obs obs, reward, done, info = envs.step(cpu_actions) reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float() episode_rewards += reward # If done then clean the history of observations. 
masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) final_rewards *= masks final_rewards += (1 - masks) * episode_rewards episode_rewards *= masks if args.cuda: masks = masks.cuda() if current_obs.dim() == 4: current_obs *= masks.unsqueeze(2).unsqueeze(2) else: current_obs *= masks update_current_obs(obs) rollouts.insert( step, current_obs, states.data, action.data, action_log_prob.data, value.data, reward, masks ) # so the rollout stores one batch of interaction sequences, each sequence has length of args.num_steps next_value = actor_critic( Variable(rollouts.observations[-1], volatile=True), Variable(rollouts.states[-1], volatile=True), Variable(rollouts.masks[-1], volatile=True))[0].data rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau) if args.algo in ['a2c', 'acktr']: values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions( Variable(rollouts.observations[:-1].view(-1, *obs_shape)), Variable(rollouts.states[0].view(-1, actor_critic.state_size)), Variable(rollouts.masks[:-1].view(-1, 1)), Variable(rollouts.actions.view(-1, action_shape))) # values should be values of observations, states are the hidden states used in rnn module, by pwang8 values = values.view( args.num_steps, args.num_processes, 1) # values are estimated current state values action_log_probs = action_log_probs.view(args.num_steps, args.num_processes, 1) # rollouts.returns are current "Action" value calculted following Bellmans' eqaution gamma * State_value(t+1) + reward(t) advantages = Variable( rollouts.returns[:-1] ) - values # This is also the definition of advantage value (action_value - state_value). value_loss = advantages.pow( 2).mean() # values are estimated current state_value(t) action_loss = -(Variable(advantages.data) * action_log_probs).mean() # If ACKTR is utilized, it is not only a different optimizer is used, they also added some new loss source if args.algo == 'acktr' and optimizer.steps % optimizer.Ts == 0: # Sampled fisher, see Martens 2014 actor_critic.zero_grad() pg_fisher_loss = -action_log_probs.mean() value_noise = Variable(torch.randn(values.size())) if args.cuda: value_noise = value_noise.cuda() sample_values = values + value_noise vf_fisher_loss = -( values - Variable(sample_values.data) ).pow(2).mean( ) # don't know what is the difference between this and just randomly sample some noise fisher_loss = pg_fisher_loss + vf_fisher_loss optimizer.acc_stats = True fisher_loss.backward(retain_graph=True) optimizer.acc_stats = False optimizer.zero_grad() (value_loss * args.value_loss_coef + action_loss - dist_entropy * args.entropy_coef).backward() if args.algo == 'a2c': nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm) optimizer.step() elif args.algo == 'ppo': advantages = rollouts.returns[:-1] - rollouts.value_preds[: -1] # calculating the advantage value of an action advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-5) # The difference from this ppo optimization to the optimization above is that: it updates params for # multiple epochs in ppo optimization. Because of this, it samples from the rollouts storage a minibatch # every time to calculate gradient. Sampling is conducted for optimization purpose. 
for e in range(args.ppo_epoch): if args.recurrent_policy: data_generator = rollouts.recurrent_generator( advantages, args.num_mini_batch) else: data_generator = rollouts.feed_forward_generator( advantages, args.num_mini_batch) for sample in data_generator: observations_batch, states_batch, actions_batch, \ return_batch, masks_batch, old_action_log_probs_batch, \ adv_targ = sample # Reshape to do in a single forward pass for all steps values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions( Variable(observations_batch), Variable(states_batch), Variable(masks_batch), Variable(actions_batch)) # For the 1st epoch of updating, I guess the action_log_probls is the same as old_action_log_probs_batch # because params of the NN have not been updated at that time. But later, in other updating epochs, # this ratio will generate some error. The old_action_log_probs_batch will not be updated during # these param updating epochs. # action_log_probs is the log prob of that action taken by the agent. So it's one value here, not # log_prob for all actions with certain input observation/state. By pwang8, Dec 31, 2017 adv_targ = Variable(adv_targ) ratio = torch.exp(action_log_probs - Variable(old_action_log_probs_batch)) surr1 = ratio * adv_targ surr2 = torch.clamp(ratio, 1.0 - args.clip_param, 1.0 + args.clip_param) * adv_targ action_loss = -torch.min( surr1, surr2).mean() # PPO's pessimistic surrogate (L^CLIP) # compared to a2c, the major difference for ppo is that action_loss is calculated in controlled way value_loss = (Variable(return_batch) - values).pow(2).mean() optimizer.zero_grad() (value_loss + action_loss - dist_entropy * args.entropy_coef).backward() nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm) optimizer.step() rollouts.after_update() if j % args.save_interval == 0 and args.save_dir != "": save_path = os.path.join(args.save_dir, args.algo) try: os.makedirs(save_path) except OSError: pass # A really ugly way to save a model to CPU save_model = actor_critic if args.cuda: save_model = copy.deepcopy(actor_critic).cpu() save_model = [ save_model, hasattr(envs, 'ob_rms') and envs.ob_rms or None ] torch.save(save_model, os.path.join(save_path, args.env_name + ".pt")) if j % args.log_interval == 0: end = time.time() total_num_steps = (j + 1) * args.num_processes * args.num_steps print( "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}" .format(j, total_num_steps, int(total_num_steps / (end - start)), final_rewards.mean(), final_rewards.median(), final_rewards.min(), final_rewards.max(), dist_entropy.data[0], value_loss.data[0], action_loss.data[0])) if args.vis and j % args.vis_interval == 0: try: # Sometimes monitor doesn't properly flush the outputs win = visdom_plot(viz, win, args.log_dir, args.env_name, args.algo) except IOError: pass
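# The comments in the PPO branch above describe the probability ratio and the clipped
# ("pessimistic") surrogate L^CLIP. The sketch below isolates that loss for one minibatch,
# assuming the log-probabilities and advantages have already been computed.
import torch


def ppo_clip_loss(new_log_probs, old_log_probs, advantages, clip_param=0.2):
    """Take the worse of the unclipped and clipped surrogate, then negate for minimisation."""
    ratio = torch.exp(new_log_probs - old_log_probs)  # pi_new(a|s) / pi_old(a|s)
    surr1 = ratio * advantages
    surr2 = torch.clamp(ratio, 1.0 - clip_param, 1.0 + clip_param) * advantages
    return -torch.min(surr1, surr2).mean()


if __name__ == "__main__":
    new_lp = torch.randn(32, 1, requires_grad=True)
    old_lp = new_lp.detach() + 0.1 * torch.randn(32, 1)
    adv = torch.randn(32, 1)
    loss = ppo_clip_loss(new_lp, old_lp, adv)
    loss.backward()
    print(loss.item(), new_lp.grad.shape)  # scalar loss, torch.Size([32, 1])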
def main(): print("#######") print("WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards") print("#######") os.environ['OMP_NUM_THREADS'] = '1' if args.vis: from visdom import Visdom viz = Visdom(port=args.port) win = None # envs = [make_env(args.env_name, args.seed, i, args.log_dir) for i in range(args.num_processes)] # env = get_test_env("001") envs = [lambda: get_test_env("000") for _ in range(args.num_processes)] # num_states = len(env.all_possible_states()) if args.num_processes > 1: envs = SubprocVecEnv(envs) else: envs = DummyVecEnv(envs) if len(envs.observation_space.shape) == 1: envs = VecNormalize(envs) obs_shape = envs.observation_space.shape obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:]) if len(envs.observation_space.shape) == 3: actor_critic = OptionCritic(num_options, obs_shape[0], envs.action_space, args.recurrent_policy) else: # assert not args.recurrent_policy, \ # "Recurrent policy is not implemented for the MLP controller" # actor_critic = MLPPolicy(obs_shape[0], envs.action_space) raise NotImplementedError() if envs.action_space.__class__.__name__ == "Discrete": action_shape = 1 else: action_shape = envs.action_space.shape[0] if args.cuda: actor_critic.cuda() if args.algo == 'a2c': # optimizer = optim.RMSprop(actor_critic.parameters(), args.lr, eps=args.eps, alpha=args.alpha) raise NotImplementedError() elif args.algo == 'ppo': optimizer = optim.Adam(actor_critic.parameters(), args.lr, eps = args.eps) elif args.algo == 'acktr': # optimizer = KFACOptimizer(actor_critic) raise NotImplementedError() rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape, envs.action_space, actor_critic.state_size, num_options) current_obs = torch.zeros(args.num_processes, *obs_shape) def update_current_obs(obs): shape_dim0 = envs.observation_space.shape[0] obs = torch.from_numpy(obs).float() if args.num_stack > 1: current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:] current_obs[:, -shape_dim0:] = obs obs = envs.reset() update_current_obs(obs) rollouts.observations[0].copy_(current_obs) # These variables are used to compute average rewards for all processes. 
episode_rewards = torch.zeros([args.num_processes, 1]) final_rewards = torch.zeros([args.num_processes, 1]) optionSelection = 0 if args.cuda: current_obs = current_obs.cuda() rollouts.cuda() start = time.time() #print(options) #print(options[0]) for j in range(num_updates): options = [-1] * args.num_processes for step in range(args.num_steps): # Choose Option t0 = time.time() selection_value, new_option, option_log_prob, states = actor_critic.get_option(Variable(rollouts.observations[step], volatile=True), Variable(rollouts.states[step], volatile=True), Variable(rollouts.masks[step], volatile=True)) # print(new_option) for i in range(args.num_processes): if options[i] == -1: options[i] = new_option[i].data[0] #print("option is:") #print(options) t1 = time.time() # Sample actions value, action, action_log_prob, states = actor_critic.get_output( options, Variable(rollouts.observations[step], volatile=True), Variable(rollouts.states[step], volatile=True), Variable(rollouts.masks[step], volatile=True)) cpu_actions = action.data.squeeze(1).cpu().numpy() t2 = time.time() # Termination term_value, termination, termination_log_prob, _ = actor_critic.get_termination( options, Variable(rollouts.observations[step], volatile=True), Variable(rollouts.states[step], volatile=True), Variable(rollouts.masks[step], volatile=True)) termination = torch.LongTensor([termination[i].data[0] for i in range(termination.shape[0])]) t3 = time.time() # Obser reward and next obs obs, reward, done, info = envs.step(cpu_actions) reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float() episode_rewards += reward # newIndex = obs_to_int(obs) # If done then clean the history of observations. masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) final_rewards *= masks final_rewards += (1 - masks) * episode_rewards episode_rewards *= masks if args.cuda: masks = masks.cuda() if current_obs.dim() == 4: current_obs *= masks.unsqueeze(2).unsqueeze(2) else: current_obs *= masks update_current_obs(obs) rollouts.insert(step, current_obs, states.data, action.data, action_log_prob.data, value.data, reward, masks, options, termination) for i in range(termination.shape[0]): if termination[i] == 1: options[i] = -1 t4 = time.time() #print("part1") #print(t1 - t0) #print("part2") #print(t2-t1) #print("part3") #print(t3-t2) #print("part4") #print(t4-t3) for i in range(args.num_processes): if options[i]== -1: selection_value, new_option, option_log_prob, states = actor_critic.get_option(Variable(rollouts.observations[step], volatile=True), Variable(rollouts.states[step], volatile=True), Variable(rollouts.masks[step], volatile=True)) # print(new_option) options[i] = new_option[i].data[0] rollouts.options[step+1].copy_(torch.LongTensor(options)) next_value = actor_critic.get_output(options,Variable(rollouts.observations[-1], volatile=True), Variable(rollouts.states[-1], volatile=True), Variable(rollouts.masks[-1], volatile=True))[0].data rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau) if args.algo in ['a2c', 'acktr']: raise NotImplementedError() elif args.algo == 'ppo': advantages = rollouts.returns[:-1] - rollouts.value_preds[:-1] advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-5) for e in range(args.ppo_epoch): for i in range(args.num_steps): # Get the ith step during exploration options = rollouts.options[i] #print(options) adv_targ = Variable(advantages[i]) old_action_log_probs = rollouts.action_log_probs[i] termination = rollouts.optionSelection[i] 
#print(termination) # Use critic value of option nn to update option parameters values, action_log_probs, dist_entropy, states = actor_critic.evaluate_option( Variable(rollouts.observations[i]), Variable(rollouts.states[i]), Variable(rollouts.masks[i]), Variable(rollouts.actions[i]), options) #print(action_log_probs) ratio = torch.exp(action_log_probs - Variable(old_action_log_probs)) surr1 = ratio * adv_targ surr2 = torch.clamp(ratio, 1.0 - args.clip_param, 1.0 + args.clip_param) * adv_targ action_loss = -torch.min(surr1, surr2).mean() # PPO's pessimistic surrogate (L^CLIP) value_loss = (Variable(rollouts.returns[i]) - values).pow(2).mean() selection_log_prob = actor_critic.evaluate_selection( Variable(rollouts.observations[i]), Variable(rollouts.states[i]), Variable(rollouts.masks[i]), Variable(termination), Variable(rollouts.options[i].type(torch.cuda.LongTensor))) V_Omega = selection_log_prob * values # Update termination parameters termination_log_prob = actor_critic.evaluate_termination( Variable(rollouts.observations[i]), Variable(rollouts.states[i]), Variable(rollouts.masks[i]), Variable(termination.type(torch.cuda.LongTensor)), rollouts.options[i+1]) left_values = [] right_values = [] for i in range(args.num_processes): if int(termination[i]) == 1: left_values.append(V_Omega[i]) right_values.append(values[i]) elif int(termination[i]) == 0: left_values.append(values[i]) right_values.append(V_Omega[i]) left_values = torch.cat(left_values) right_values = torch.cat(right_values) termination_loss = (- torch.exp(termination_log_prob) * left_values - (1 - torch.exp(termination_log_prob)) * right_values).mean() optimizer.zero_grad() (action_loss + value_loss+ termination_loss - V_Omega.mean()).backward() nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm) rollouts.after_update() if j % args.save_interval == 0 and args.save_dir != "": save_path = os.path.join(args.save_dir, args.algo) try: os.makedirs(save_path) except OSError: pass # A really ugly way to save a model to CPU save_model = actor_critic if args.cuda: save_model = copy.deepcopy(actor_critic).cpu() save_model = [save_model, hasattr(envs, 'ob_rms') and envs.ob_rms or None] torch.save(save_model, os.path.join(save_path, args.env_name + ".pt")) if j % args.log_interval == 0: end = time.time() total_num_steps = (j + 1) * args.num_processes * args.num_steps print("Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}". format(j, total_num_steps, int(total_num_steps / (end - start)), final_rewards.mean(), final_rewards.median(), final_rewards.min(), final_rewards.max(), dist_entropy.data[0], value_loss.data[0], action_loss.data[0])) writer.add_scaler("final_reward_max", final_rewards.max(), plot_index) plot_index += 1 if args.vis and j % args.vis_interval == 0: try: # Sometimes monitor doesn't properly flush the outputs print("hit") win = visdom_plot(viz, win, args.log_dir, args.env_name, args.algo, args.num_frames) except IOError: pass
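# Note that the option-critic update above backpropagates and clips gradients but never calls
# optimizer.step(), so the parameters are not actually changed. The usual order is zero-grad,
# backward, clip (with the in-place clip_grad_norm_), then step, as in this small sketch with a
# placeholder model and loss.
import torch
import torch.nn as nn

model = nn.Linear(8, 2)
optimizer = torch.optim.Adam(model.parameters(), lr=3e-4)

loss = model(torch.randn(16, 8)).pow(2).mean()     # placeholder loss

optimizer.zero_grad()                              # 1. clear old gradients
loss.backward()                                    # 2. backpropagate the assembled loss
nn.utils.clip_grad_norm_(model.parameters(), 0.5)  # 3. clip the global gradient norm
optimizer.step()                                   # 4. apply the update
print(loss.item())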
def main(): ''' Train PPO policies on each of the training environments. ''' args = get_args() try: os.makedirs(args.log_dir) except OSError: pass torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) if args.cuda and torch.cuda.is_available() and args.cuda_deterministic: torch.backends.cudnn.benchmark = False torch.backends.cudnn.deterministic = True torch.set_num_threads(1) device = torch.device("cuda:0" if args.cuda else "cpu") envs = make_vec_envs(args, device) actor_critic = Policy(envs.observation_space.shape, envs.action_space, base_kwargs={'recurrent': False}) actor_critic.to(device) agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm) rollouts = RolloutStorage(args.num_steps, args.num_processes, envs.observation_space.shape, envs.action_space, actor_critic.recurrent_hidden_state_size) obs = envs.reset() rollouts.obs[0].copy_(obs) rollouts.to(device) ep_reward = np.zeros(args.num_processes) episode_rewards = deque(maxlen=100) num_updates = int( args.num_env_steps) // args.num_steps // args.num_processes for j in range(num_updates): # decrease learning rate linearly utils.update_linear_schedule(agent.optimizer, j, num_updates, args.lr) for step in range(args.num_steps): # Sample actions with torch.no_grad(): value, action, action_log_prob, recurrent_hidden_states = actor_critic.act( rollouts.obs[step], rollouts.recurrent_hidden_states[step], rollouts.masks[step]) # Obs reward and next obs obs, reward, done, infos = envs.step(action) if 'spaceship' in args.env_name: # spaceship, swimmer for i in range(len(done)): if done[i]: episode_rewards.append(reward[i].item()) # elif 'swimmer' in args.env_name: else: for i in range(len(done)): ep_reward[i] += reward[i].numpy().item() if done[i]: episode_rewards.append(ep_reward[i]) ep_reward[i] = 0 # if 'ant' in args.env_name: # for info in infos: # if 'episode' in info.keys(): # episode_rewards.append(info['episode']['r']) # If done then clean the history of observations. 
masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) bad_masks = torch.FloatTensor( [[0.0] if 'bad_transition' in info.keys() else [1.0] for info in infos]) rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks, bad_masks) with torch.no_grad(): next_value = actor_critic.get_value( rollouts.obs[-1], rollouts.recurrent_hidden_states[-1], rollouts.masks[-1]).detach() rollouts.compute_returns(next_value, True, args.gamma, args.gae_lambda, True) value_loss, action_loss, dist_entropy = agent.update(rollouts) rollouts.after_update() if (j % args.save_interval == 0 or j == num_updates - 1) and args.save_dir != "": try: os.makedirs(args.save_dir) except OSError: pass torch.save( actor_critic.state_dict(), os.path.join(args.save_dir, "ppo.{}.env{}.seed{}.pt"\ .format(args.env_name, args.default_ind, args.seed)) ) if j % args.log_interval == 0 and len(episode_rewards) > 1: total_num_steps = (j + 1) * args.num_processes * args.num_steps print("\nUpdates {}, num timesteps {}, Last {} training episodes: \ \n mean/median reward {:.2f}/{:.2f}, min/max reward {:.2f}/{:.2f}" .format(j, total_num_steps, len(episode_rewards), np.mean(episode_rewards), np.median(episode_rewards), np.min(episode_rewards), np.max(episode_rewards))) if (args.eval_interval is not None and len(episode_rewards) > 1 and j % args.eval_interval == 0): ob_rms = utils.get_vec_normalize(envs).ob_rms evaluate(actor_critic, ob_rms, args.env_name, args.seed, args.num_processes, device) envs.close()
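# The loop above decreases the learning rate linearly over training via
# utils.update_linear_schedule. A minimal version of such a helper, under the assumption that it
# simply rescales the optimizer's lr each update, looks like this.
import torch


def update_linear_schedule(optimizer, update_idx, total_updates, initial_lr):
    """Anneal the learning rate linearly from initial_lr toward 0 over total_updates."""
    lr = initial_lr - initial_lr * (update_idx / float(total_updates))
    for param_group in optimizer.param_groups:
        param_group["lr"] = lr


if __name__ == "__main__":
    opt = torch.optim.Adam([torch.nn.Parameter(torch.zeros(1))], lr=2.5e-4)
    update_linear_schedule(opt, update_idx=50, total_updates=100, initial_lr=2.5e-4)
    print(opt.param_groups[0]["lr"])  # 0.000125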
def main(): print("#######") print( "WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards" ) print("#######") torch.set_num_threads(1) if args.vis: from visdom import Visdom viz = Visdom(port=args.port) win = None if args.num_processes > 1: if args.retro_contest == True: import json sonic_env_confs = json.load(open(args.sonic_config_file, 'r')) sonic_env_confs = sonic_env_confs['Train'] sonic_env_confs = [v for _, v in sonic_env_confs.items()] envs = SubprocVecSonicEnv(sonic_env_confs, args.num_processes) else: envs = [ make_env(args.env_name, args.seed, i, args.log_dir, args.add_timestep) for i in range(args.num_processes) ] envs = SubprocVecEnv(envs) else: envs = [ make_env(args.env_name, args.seed, i, args.log_dir, args.add_timestep) for i in range(args.num_processes) ] envs = DummyVecEnv(envs) if len(envs.observation_space.shape) == 1: envs = VecNormalize(envs) obs_shape = envs.observation_space.shape obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:]) prev_saved_rew_median = float('-inf') actor_critic = Policy(obs_shape, envs.action_space, args.recurrent_policy) if args.load_model: model_path = os.path.join(args.save_dir, args.algo, args.env_name) + ".pt" actor_critic, ob_rms, prev_saved_rew_median = torch.load(model_path) print("Loaded actor_critic model from:", model_path, "which got a median score of:", prev_saved_rew_median) if envs.action_space.__class__.__name__ == "Discrete": action_shape = 1 else: action_shape = envs.action_space.shape[0] if args.cuda: actor_critic.cuda() if args.algo == 'a2c': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, alpha=args.alpha, max_grad_norm=args.max_grad_norm) elif args.algo == 'ppo': agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm) elif args.algo == 'acktr': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, acktr=True) rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape, envs.action_space, actor_critic.state_size) current_obs = torch.zeros(args.num_processes, *obs_shape) def update_current_obs(obs): shape_dim0 = envs.observation_space.shape[0] obs = torch.from_numpy(obs).float() if args.num_stack > 1: current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:] current_obs[:, -shape_dim0:] = obs obs = envs.reset() update_current_obs(obs) rollouts.observations[0].copy_(current_obs) # These variables are used to compute average rewards for all processes. episode_rewards = torch.zeros([args.num_processes, 1]) final_rewards = torch.zeros([args.num_processes, 1]) if args.cuda: current_obs = current_obs.cuda() rollouts.cuda() prev_reward = 0.0 start = time.time() for j in range(num_updates): for step in range(args.num_steps): # Sample actions with torch.no_grad(): value, action, action_log_prob, states = actor_critic.act( rollouts.observations[step], rollouts.states[step], rollouts.masks[step]) cpu_actions = action.squeeze(1).cpu().numpy() # Obser reward and next obs obs, reward, done, info = envs.step(cpu_actions) reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float() episode_rewards += reward # If done then clean the history of observations. 
masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) final_rewards *= masks final_rewards += (1 - masks) * episode_rewards episode_rewards *= masks if args.cuda: masks = masks.cuda() if current_obs.dim() == 4: current_obs *= masks.unsqueeze(2).unsqueeze(2) else: current_obs *= masks update_current_obs(obs) rollouts.insert(current_obs, states, action, action_log_prob, value, reward, masks) with torch.no_grad(): next_value = actor_critic.get_value(rollouts.observations[-1], rollouts.states[-1], rollouts.masks[-1]).detach() rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau) value_loss, action_loss, dist_entropy = agent.update(rollouts) rollouts.after_update() if j % args.save_interval == 0 and final_rewards.median( ) > prev_saved_rew_median and args.save_dir != "": save_path = os.path.join(args.save_dir, args.algo) try: os.makedirs(save_path) except OSError: pass # A really ugly way to save a model to CPU save_model = actor_critic if args.cuda: save_model = copy.deepcopy(actor_critic).cpu() save_model = [ save_model, hasattr(envs, 'ob_rms') and envs.ob_rms or None, final_rewards.median() ] torch.save(save_model, os.path.join(save_path, args.env_name + ".pt")) prev_saved_rew_median = final_rewards.median() # Save a separate copy just in case the main saved model ends up being worser. # Helps to have a few saved models to choose from at test/runtime torch.save( save_model, os.path.join( save_path, args.env_name + str(final_rewards.median()) + '.pt')) print("Saved the state which got a median reward of", prev_saved_rew_median) if j % args.log_interval == 0: end = time.time() total_num_steps = (j + 1) * args.num_processes * args.num_steps print( "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}" .format(j, total_num_steps, int(total_num_steps / (end - start)), final_rewards.mean(), final_rewards.median(), final_rewards.min(), final_rewards.max(), dist_entropy, value_loss, action_loss)) if args.vis and j % args.vis_interval == 0: try: # Sometimes monitor doesn't properly flush the outputs win = visdom_plot(viz, win, args.log_dir, args.env_name, args.algo, args.num_frames) except IOError: pass
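# Small worked example of the episode_rewards / final_rewards bookkeeping used in the loop above:
# episode_rewards accumulates the return of the episode currently running in each worker, while
# final_rewards keeps the return of the most recently finished episode (used for the log line and
# the median-based checkpointing). The numbers are made up purely for illustration.
import torch

episode_rewards = torch.tensor([[3.0], [5.0]])   # running sums for two workers
final_rewards = torch.tensor([[9.0], [2.0]])     # last finished episode per worker
reward = torch.tensor([[1.0], [1.0]])
done = [True, False]                             # worker 0 just finished its episode

episode_rewards += reward
masks = torch.FloatTensor([[0.0] if d else [1.0] for d in done])
final_rewards *= masks                           # keep the old value only where the episode continues
final_rewards += (1 - masks) * episode_rewards   # worker 0's finished return becomes 4.0
episode_rewards *= masks                         # reset the finished worker's accumulator
print(final_rewards)                             # tensor([[4.], [2.]])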
def main(args): env = GymEnvironment(args, gamma) env.env = env.env.unwrapped actor_critic = Policy(obs_shape, env.action_size, base_kwargs={'recurrent': False}) actor_critic.to(device) agent = PPO(actor_critic, clip_param, ppo_epoch, num_mini_batch, value_loss_coef, entropy_coef, lr, eps, max_grad_norm) rollouts = RolloutStorage(num_steps, num_processes, obs_shape, env.action_space, actor_critic.recurrent_hidden_state_size) current_obs = torch.zeros(num_processes, *obs_shape) obs, _, _, _ = env.new_expt() obs = obs[np.newaxis, ...] current_obs[:, -1] = torch.from_numpy(obs) rollouts.obs[0].copy_(current_obs) current_obs = current_obs.to(device) rollouts.to(device) num_updates = math.ceil(args.max_timesteps / (num_processes * num_steps)) n_goal_reached = 0 n_episodes = 0 for j in tqdm(range(num_updates), ascii=True): for step in range(num_steps): with torch.no_grad(): value, action, action_log_prob, recurrent_hidden_states = actor_critic.act( rollouts.obs[step], rollouts.recurrent_hidden_states[step], rollouts.masks[step]) cpu_actions = action.squeeze(1).cpu().numpy() (obs, reward, done), goal_reached = env.act(action) reward = torch.from_numpy(np.expand_dims(np.stack([reward]), 1)).float() masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in [done]]) masks = masks.to(device) current_obs[:, :-1] = current_obs[:, 1:] if done: current_obs[:] = 0 current_obs[:, -1] = torch.from_numpy(obs) rollouts.insert(current_obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks) if done: n_episodes += 1 env.new_expt() if goal_reached: n_goal_reached += 1 with torch.no_grad(): next_value = actor_critic.get_value( rollouts.obs[step], rollouts.recurrent_hidden_states[step], rollouts.masks[step]).detach() rollouts.compute_returns(next_value, use_gae, gamma, tau, step) value_loss, action_loss, dist_entropy = agent.update(rollouts, step) rollouts.after_update() torch.save(agent.actor_critic.state_dict(), 'log/model.pt')
def main(): device = 'cpu' acc_steps = [] acc_scores = [] torch.set_num_threads(1) print('here') if args.env_name == 'Reacher-v2': rbf1 = build_features_reacher2(.2, 5, 2) len_rbf = rbf1._K len_features = len_rbf + 1 if args.env_name == 'Hopper-v2': len_features = 3 envs = make_vec_envs(args.env_name, args.seed, args.num_processes, args.gamma, args.log_dir, args.add_timestep, device, False) actor_critic = Policy(envs.observation_space.shape, envs.action_space) actor_critic.to(device) agent = PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm) rollouts = RolloutStorage(args.num_steps, args.num_processes, envs.observation_space.shape, envs.action_space, len_features) print('here2') obs = envs.reset() rollouts.obs[0].copy_(obs) rollouts.to(device) episode_rewards = collections.deque(maxlen=10) num_updates = 20 for j in range(num_updates): if args.use_linear_lr_decay: # decrease learning rate linearly update_linear_schedule(agent.optimizer, j, num_updates, args.lr) agent.clip_param = args.clip_param * (1 - j / float(num_updates)) # Prepare demos demo_actions = np.zeros( (1, args.num_processes, envs.action_space.shape[0])) demo_states = np.zeros( (1, args.num_processes, envs.observation_space.shape[0])) demo_features = np.zeros((1, args.num_processes, len_features)) for step in range(args.num_steps): # Sample actions with torch.no_grad(): value, action, action_log_prob = actor_critic.act( rollouts.obs[step], rollouts.masks[step]) # obs, reward and next obs demo_actions = np.concatenate( [demo_actions, action.reshape(1, args.num_processes, -1)], 0) demo_states = np.concatenate([ demo_states, rollouts.obs[step].reshape( 1, args.num_processes, -1) ], 0) feat_rewards = np.zeros((args.num_processes, len_features)) if args.env_name == 'Hopper-v2': if args.num_processes > 1: pos_before = envs.get_sim_data() obs, reward, done, infos = envs.step(action) if args.env_name == 'Hopper-v2': if args.num_processes > 1: pos_after = envs.get_sim_data() for num_p in range(args.num_processes): feat_1 = pos_after[num_p] - pos_before[num_p] feat_2 = 0 if not done[num_p]: feat_2 = 1 # feat_2 = np.array([1 for _ in range(args.num_processes)]) feat_3 = np.array( [np.linalg.norm(action[num_p], ord=2)**2]).flatten() feat_rewards[num_p] = np.array( [feat_1, feat_2, feat_3]) if args.env_name == 'Reacher-v2': if args.num_processes > 1: body_data = envs.get_body_data() for num_p in range(args.num_processes): rbf1_ = rbf1(body_data[num_p][:-1]) rbf4_ = np.array( [np.linalg.norm(action[num_p], ord=2)**2]) feat_rewards[num_p] = np.concatenate( (rbf1_.reshape(-1), rbf4_)) else: rbf1_ = rbf1( (envs.envs[0].env.env.get_body_com("fingertip") - envs.envs[0].env.env.get_body_com("target"))[:-1]) rbf4_ = np.array([-np.square(action[0]).sum()]) feat_rewards[0] = np.concatenate( (rbf1_.reshape(-1), rbf4_)) demo_features = np.concatenate([ demo_features, feat_rewards.reshape(1, args.num_processes, -1) ], 0) if step > 1 and step % 1000 == 0: done = [True for _ in range(args.num_processes)] for info in infos: if 'episode' in info.keys(): episode_rewards.append(info['episode']['r']) # If done then clean the history of observations. 
masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) rollouts.insert(obs, action, action_log_prob, \ value, reward, masks, feat_rewards) # Save demos: action_file_name = demos_expe_dir + '/actions_step_' + str(j) + '.npy' state_file_name = demos_expe_dir + '/states_step_' + str(j) + '.npy' rew_feat_file_name = demos_expe_dir + '/rew_feat_step_' + str( j) + '.npy' policy_file_name = demos_expe_dir + '/policy_step_' + str(j) + '.pth' np.save(action_file_name, demo_actions) np.save(state_file_name, demo_states) np.save(rew_feat_file_name, demo_features) torch.save(actor_critic.state_dict(), policy_file_name) with torch.no_grad(): next_value = actor_critic.get_value(rollouts.obs[-1], rollouts.masks[-1]).detach() rollouts.compute_returns(next_value, args.gamma, args.tau) value_loss, action_loss, dist_entropy = agent.update(rollouts) rollouts.after_update() # save for every interval-th episode or for the last epoch if (j % args.save_interval == 0 or j == num_updates - 1) and args.save_dir: save_path = os.path.join(args.save_dir, 'ppo') try: os.makedirs(save_path) except OSError: pass # A really ugly way to save a model to CPU save_model = actor_critic save_model = [ save_model, getattr(get_vec_normalize(envs), 'ob_rms', None) ] torch.save(save_model, os.path.join(save_path, args.env_name + '.pt')) total_num_steps = (j + 1) * args.num_processes * args.num_steps if j % args.log_interval == 0 and len(episode_rewards) > 1: print('Updates', j, 'num timesteps', len(episode_rewards), '\n Last training episodes: mean/median reward', '{:.1f}'.format(np.mean(episode_rewards)), '/{:.1f}'.format(np.median(episode_rewards)), 'min/max reward', '{:.1f}'.format(np.min(episode_rewards)), '/{:.1f}'.format(np.max(episode_rewards)), 'dist entropy', dist_entropy, 'value loss', value_loss, 'action loss', action_loss) if len(episode_rewards) > 1: acc_steps.append(total_num_steps) acc_scores.append(np.mean(episode_rewards)) #print(acc_scores) if (args.eval_interval is not None and len(episode_rewards) > 1 and j % args.eval_interval == 0): eval_envs = make_vec_envs(args.env_name, args.seed + args.num_processes, args.num_processes, args.gamma, eval_log_dir, args.add_timestep, device, True) vec_norm = get_vec_normalize(eval_envs) if vec_norm is not None: vec_norm.eval() vec_norm.ob_rms = get_vec_normalize(envs).ob_rms eval_episode_rewards = [] obs = eval_envs.reset() eval_masks = torch.zeros(args.num_processes, 1, device=device) while len(eval_episode_rewards) < 10: with torch.no_grad(): _, action, _ = actor_critic.act(obs, eval_masks, deterministic=True) # Obser reward and next obs obs, reward, done, infos = eval_envs.step(action) eval_masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) for info in infos: if 'episode' in info.keys(): eval_episode_rewards.append(info['episode']['r']) eval_envs.close() print('Evaluation using', len(eval_episode_rewards), 'episodes: mean reward', '{:.5f}\n'.format(np.mean(eval_episode_rewards))) scores_file_name = args.scores_dir + '/learner_scores_' + args.env_name + '_' + args.expe + '.npy' steps_file_name = args.scores_dir + '/learner_steps_' + args.env_name + '_' + args.expe + '.npy' np.save(scores_file_name, np.array(acc_scores)) np.save(steps_file_name, np.array(acc_steps))
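# Hypothetical stand-in for the build_features_reacher2(.2, 5, 2) helper used above; the project's
# real implementation is not shown here. It is assumed to return a radial-basis featurizer that is
# callable on a low-dimensional position and exposes its number of centers as `_K`. The grid
# bounds (low/high) below are invented defaults for illustration only.
import numpy as np

class RBFFeatures:
    def __init__(self, sigma, n_per_dim, dim, low=-0.2, high=0.2):
        axes = [np.linspace(low, high, n_per_dim) for _ in range(dim)]
        self.centers = np.array(np.meshgrid(*axes)).reshape(dim, -1).T  # (n_per_dim**dim, dim)
        self.sigma = sigma
        self._K = len(self.centers)  # number of features, read via rbf1._K above

    def __call__(self, x):
        d2 = np.sum((self.centers - np.asarray(x)) ** 2, axis=1)
        return np.exp(-d2 / (2.0 * self.sigma ** 2))

def build_features_reacher2(sigma, n_per_dim, dim):
    return RBFFeatures(sigma, n_per_dim, dim)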
def main(): args = get_args() torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) if args.cuda and torch.cuda.is_available() and args.cuda_deterministic: torch.backends.cudnn.benchmark = False torch.backends.cudnn.deterministic = True log_dir = os.path.expanduser(args.log_dir) utils.cleanup_log_dir(log_dir) with open(log_dir + 'extras.csv', "w") as file: file.write("n, value_loss\n") torch.set_num_threads(1) device = torch.device("cuda:0" if args.cuda else "cpu") envs = make_vec_envs(args.env_name, args.seed, args.num_processes, args.gamma, args.log_dir, device, False) model = Policy(envs.observation_space.shape, envs.action_space.n, extra_kwargs={'use_backpack': args.algo == 'tdprop'}) model.to(device) if args.algo == 'tdprop': from algo.sarsa_tdprop import SARSA agent = SARSA(model, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm, beta_1=args.beta_1, beta_2=args.beta_2, n=args.num_steps, num_processes=args.num_processes, gamma=args.gamma) else: from algo.sarsa import SARSA agent = SARSA(model, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm, beta_1=args.beta_1, beta_2=args.beta_2, algo=args.algo) explore_policy = utils.eps_greedy rollouts = RolloutStorage(args.num_steps, args.num_processes, envs.observation_space.shape, envs.action_space, model.recurrent_hidden_state_size) obs = envs.reset() rollouts.obs[0].copy_(obs) rollouts.to(device) episode_rewards = deque(maxlen=10) start = time.time() num_updates = int( args.num_env_steps) // args.num_steps // args.num_processes for j in range(num_updates): for step in range(args.num_steps): # Sample actions with torch.no_grad(): qs = model(rollouts.obs[step]) _, dist = explore_policy(qs, args.exploration) actions = dist.sample().unsqueeze(-1) value = qs.gather(-1, actions) # Obser reward and next obs obs, reward, done, infos = envs.step(actions) for info in infos: if 'episode' in info.keys(): episode_rewards.append(info['episode']['r']) # If done then clean the history of observations. 
masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) bad_masks = torch.FloatTensor( [[0.0] if 'bad_transition' in info.keys() else [1.0] for info in infos]) rollouts.insert(obs, torch.FloatTensor([0.0]), actions, value, value, reward, masks, bad_masks) with torch.no_grad(): next_qs = model(rollouts.obs[-1]) next_probs, _ = explore_policy(next_qs, args.exploration) next_value = (next_probs * next_qs).sum(-1).unsqueeze(-1) rollouts.compute_returns(next_value, args.gamma) value_loss = agent.update(rollouts, explore_policy, args.exploration) rollouts.after_update() # save for every interval-th episode or for the last epoch if (j % args.save_interval == 0 or j == num_updates - 1): save_path = os.path.join(args.log_dir, args.algo) try: os.makedirs(save_path) except OSError: pass torch.save([ list(model.parameters()), getattr(utils.get_vec_normalize(envs), 'ob_rms', None) ], os.path.join(save_path, args.env_name + ".pt")) if j % args.log_interval == 0 and len(episode_rewards) > 1: total_num_steps = (j + 1) * args.num_processes * args.num_steps end = time.time() print( ("Updates {}, num timesteps {}, FPS {}\n" + \ "Last {} training episodes: mean/median reward {:.1f}/{:.1f}" + \ ", min/max reward {:.1f}/{:.1f}\n" + \ "entropy {:.2f}, value loss {:.4f}") .format(j, total_num_steps, int(total_num_steps / (end - start)), len(episode_rewards), np.mean(episode_rewards), np.median(episode_rewards), np.min(episode_rewards), np.max(episode_rewards), dist.entropy().mean().item(), value_loss)) with open(log_dir + 'extras.csv', "a") as file: file.write( str(total_num_steps) + ", " + str(value_loss) + "\n")
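# Hedged sketch of the utils.eps_greedy interface assumed by the SARSA loop above: given a batch
# of Q-values it returns (action probabilities, a Categorical distribution), so the caller can
# both sample exploratory actions and form the expected-SARSA backup (probs * qs).sum(-1).
import torch
from torch.distributions import Categorical

def eps_greedy(qs, epsilon):
    # qs: [batch, num_actions]
    num_actions = qs.size(-1)
    greedy = qs.argmax(dim=-1, keepdim=True)
    probs = torch.full_like(qs, epsilon / num_actions)
    probs.scatter_add_(-1, greedy,
                       torch.full_like(greedy, 1.0 - epsilon, dtype=qs.dtype))
    return probs, Categorical(probs=probs)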
def main(): is_limit_action = True # is_limit_action = False args_cuda = True # args_cuda = False torch.manual_seed(args_seed) torch.cuda.manual_seed_all(args_seed) device = torch.device("cuda:0" if args_cuda else "cpu") train_log = Log(log_name + '_train_log') evl_log = Log(log_name + '_evaluation_log') torch.set_num_threads(1) envs = make_vec_envs(args_env_name, args_seed, args_num_processes, device, gamma=args_gamma) if is_limit_action: envs.action_space.n = 3 print('Number of Actions:', envs.action_space.n) actor_critic = Policy(envs.observation_space.shape, envs.action_space, base_kwargs={'recurrent': args_recurrent_policy}) actor_critic.to(device) # print(actor_critic.is_recurrent) # print(actor_critic.gru) # ss('hi') agent = PPO(actor_critic, args_clip_param, args_ppo_epoch, args_num_mini_batch, args_value_loss_coef, args_entropy_coef, lr=args_lr, eps=args_eps, max_grad_norm=args_max_grad_norm, use_clipped_value_loss=args_use_clipped_value_loss) rollouts = RolloutStorage(args_num_steps, args_num_processes, envs.observation_space.shape, envs.action_space, actor_critic.recurrent_hidden_state_size) obs = envs.reset() rollouts.obs[0].copy_(obs) rollouts.to(device) # print(obs) # ss('i am over it') num_updates = int( args_num_env_steps) // args_num_steps // args_num_processes episode_rewards = deque(maxlen=10) start = time.time() sum_re = torch.zeros(args_num_processes, 1) for j in range(num_updates): for step in range(args_num_steps): with torch.no_grad(): value, action, action_log_prob, recurrent_hidden_states = actor_critic.act( rollouts.obs[step], rollouts.recurrent_hidden_states[step], rollouts.masks[step]) # ss('dissecting actor critic. act') # print(action) # print() # action = action + 1 # print(action) # ss('hoiohasdfhioas') if is_limit_action: obs, reward, done, infos = envs.step(action + 1) else: obs, reward, done, infos = envs.step(action) sum_re += reward if any(done): for i in range(len(done)): if done[i]: episode_rewards.append(sum_re[i].item()) # print(done) # print(sum_re[i]) sum_re[i] *= 0 masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) bad_masks = torch.FloatTensor( [[0.0] if 'bad_transition' in info.keys() else [1.0] for info in infos]) rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks, bad_masks) with torch.no_grad(): next_value = actor_critic.get_value( rollouts.obs[-1], rollouts.recurrent_hidden_states[-1], rollouts.masks[-1]).detach() rollouts.compute_returns(next_value, args_gamma, args_use_gae, args_gae_lambda) value_loss, action_loss, dist_entropy = agent.update(rollouts) rollouts.after_update() if j % args_log_interval == 0 and len(episode_rewards) > 1: total_num_steps = (j + 1) * args_num_processes * args_num_steps end = time.time() logstring = "E {}, N_steps {}, FPS {} mean/median" \ " {:.1f}/{:.1f}, min/max {:.1f}/{:.1f}" \ " Entropy {:.5f},V {:.5f},Action {:.5f}".format( j, total_num_steps, int(total_num_steps / (end - start)), np.mean(episode_rewards), np.median(episode_rewards), np.min(episode_rewards), np.max(episode_rewards), dist_entropy, value_loss, action_loss) # print(logstring) train_log.log(logstring) # if True: if (args_eval_interval is not None and len(episode_rewards) > 1 and j % args_eval_interval == 0): total_num_steps = (j + 1) * args_num_processes * args_num_steps ob_rms = get_vec_normalize(envs).ob_rms ev_result = evaluate(actor_critic, ob_rms, args_env_name, args_seed, args_num_processes, device, is_limit_action=is_limit_action) ev_log_string = 'steps:' + 
str(total_num_steps) + '. ' + ev_result evl_log.log(ev_log_string)
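# The is_limit_action flag above restricts the agent to 3 actions and shifts them by +1 before
# env.step, i.e. the policy's actions 0..2 map to the environment's actions 1..3. A hypothetical
# wrapper expressing the same mapping (illustration only, not part of the original script):
import gym

class OffsetActionWrapper(gym.ActionWrapper):
    def __init__(self, env, offset=1, n=3):
        super().__init__(env)
        self.offset = offset
        self.action_space = gym.spaces.Discrete(n)

    def action(self, act):
        return act + self.offset  # agent action k -> underlying env action k + offset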
surr2 = torch.clamp(ratio, 1.0 - args.clip_param, 1.0 + args.clip_param) * adv_targ action_loss = -torch.min( surr1, surr2).mean() # PPO's pessimistic surrogate (L^CLIP) value_loss = (Variable(return_batch) - values).pow(2).mean() optimizer.zero_grad() (value_loss + action_loss - dist_entropy.mean() * args.entropy_coef).backward() nn.utils.clip_grad_norm(agent.parameters(), args.max_grad_norm) optimizer.step() ppo_update += 1 if ppo_update // args.ppo_epoch % 5 == 0: writer.add_scalar('value_loss', value_loss.data.cpu().numpy(), ppo_update) writer.add_scalar('action_loss', action_loss.data.cpu().numpy(), ppo_update) writer.add_scalar('entropy_loss', dist_entropy.mean().data.cpu().numpy(), ppo_update) # Save model torch.save( agent.model.state_dict(), "saved_weights/saved_model_ppo_epoch_" + str(ppo_update)) rollouts.after_update()
def main(): print("#######") print("WARNING: All rewards are clipped or normalized so you need to use a monit`or (see envs.py) or visdom plot to get true rewards") print("#######") os.environ['OMP_NUM_THREADS'] = '1' logger = Logger(algorithm_name = args.algo, environment_name = args.env_name, folder = args.folder) logger.save_args(args) print ("---------------------------------------") print ('Saving to', logger.save_folder) print ("---------------------------------------") if args.vis: from visdom import Visdom viz = Visdom(port=args.port) win = None envs = [make_env(args.env_name, args.seed, i, args.log_dir) for i in range(args.num_processes)] if args.num_processes > 1: envs = SubprocVecEnv(envs) else: envs = DummyVecEnv(envs) if len(envs.observation_space.shape) == 1: envs = VecNormalize(envs) obs_shape = envs.observation_space.shape obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:]) if len(envs.observation_space.shape) == 3: actor_critic = CNNPolicy(obs_shape[0], envs.action_space, args.recurrent_policy) else: assert not args.recurrent_policy, \ "Recurrent policy is not implemented for the MLP controller" actor_critic = MLPPolicy(obs_shape[0], envs.action_space) if envs.action_space.__class__.__name__ == "Discrete": action_shape = 1 else: action_shape = envs.action_space.shape[0] if args.cuda: actor_critic.cuda() optimizer = optim.Adam(actor_critic.parameters(), args.lr, eps=args.eps) rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape, envs.action_space, actor_critic.state_size) current_obs = torch.zeros(args.num_processes, *obs_shape) def update_current_obs(obs): shape_dim0 = envs.observation_space.shape[0] obs = torch.from_numpy(obs).float() if args.num_stack > 1: current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:] current_obs[:, -shape_dim0:] = obs obs = envs.reset() update_current_obs(obs) rollouts.observations[0].copy_(current_obs) # These variables are used to compute average rewards for all processes. episode_rewards = torch.zeros([args.num_processes, 1]) final_rewards = torch.zeros([args.num_processes, 1]) if args.cuda: current_obs = current_obs.cuda() rollouts.cuda() start = time.time() for j in range(num_updates): for step in range(args.num_steps): # Sample actions value, action, action_log_prob, states = actor_critic.act(Variable(rollouts.observations[step], volatile=True), Variable(rollouts.states[step], volatile=True), Variable(rollouts.masks[step], volatile=True)) cpu_actions = action.data.squeeze(1).cpu().numpy() # Obser reward and next obs obs, reward, done, info = envs.step(cpu_actions) reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float() episode_rewards += reward # If done then clean the history of observations. 
masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) final_rewards *= masks final_rewards += (1 - masks) * episode_rewards episode_rewards *= masks if args.cuda: masks = masks.cuda() if current_obs.dim() == 4: current_obs *= masks.unsqueeze(2).unsqueeze(2) else: current_obs *= masks update_current_obs(obs) rollouts.insert(step, current_obs, states.data, action.data, action_log_prob.data, value.data, reward, masks) next_value = actor_critic(Variable(rollouts.observations[-1], volatile=True), Variable(rollouts.states[-1], volatile=True), Variable(rollouts.masks[-1], volatile=True))[0].data rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau) advantages = rollouts.returns[:-1] - rollouts.value_preds[:-1] advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-5) for e in range(args.ppo_epoch): data_generator = rollouts.feed_forward_generator(advantages, args.num_mini_batch) for sample in data_generator: observations_batch, states_batch, actions_batch, \ return_batch, masks_batch, old_action_log_probs_batch, \ adv_targ = sample # Reshape to do in a single forward pass for all steps values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions(Variable(observations_batch), Variable(states_batch), Variable(masks_batch), Variable(actions_batch)) adv_targ = Variable(adv_targ) ratio = torch.exp(action_log_probs - Variable(old_action_log_probs_batch)) surr1 = ratio * adv_targ surr2 = torch.clamp(ratio, 1.0 - args.clip_param, 1.0 + args.clip_param) * adv_targ action_loss = -torch.min(surr1, surr2).mean() # PPO's pessimistic surrogate (L^CLIP) value_loss = (Variable(return_batch) - values).pow(2).mean() optimizer.zero_grad() (value_loss + action_loss - dist_entropy * args.entropy_coef).backward() nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm) optimizer.step() rollouts.after_update() if j % args.save_interval == 0 and args.save_dir != "": save_path = os.path.join(args.save_dir, args.algo) try: os.makedirs(save_path) except OSError: pass # A really ugly way to save a model to CPU save_model = actor_critic if args.cuda: save_model = copy.deepcopy(actor_critic).cpu() save_model = [save_model, hasattr(envs, 'ob_rms') and envs.ob_rms or None] torch.save(save_model, os.path.join(save_path, args.env_name + ".pt")) if j % args.log_interval == 0: end = time.time() total_num_steps = (j + 1) * args.num_processes * args.num_steps print("Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}". format(j, total_num_steps, int(total_num_steps / (end - start)), final_rewards.mean(), final_rewards.median(), final_rewards.min(), final_rewards.max(), dist_entropy.data[0], value_loss.data[0], action_loss.data[0])) final_rewards_mean = [final_rewards.mean()] final_rewards_median = [final_rewards.median()] final_rewards_min = [final_rewards.min()] final_rewards_max = [final_rewards.max()] logger.record_data(final_rewards_mean, final_rewards_median, final_rewards_min, final_rewards_max) logger.save() if args.vis and j % args.vis_interval == 0: try: # Sometimes monitor doesn't properly flush the outputs win = visdom_plot(viz, win, args.log_dir, args.env_name, args.algo) except IOError: pass
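# The update above predates torch.no_grad() and uses the old Variable(..., volatile=True) API.
# A minimal sketch of the same clipped PPO objective in current PyTorch, under the assumption
# that advantages are already normalized as in the loop above:
import torch

def ppo_losses(new_log_probs, old_log_probs, advantages, returns, values, clip_param):
    ratio = torch.exp(new_log_probs - old_log_probs)
    surr1 = ratio * advantages
    surr2 = torch.clamp(ratio, 1.0 - clip_param, 1.0 + clip_param) * advantages
    action_loss = -torch.min(surr1, surr2).mean()   # pessimistic surrogate (L^CLIP)
    value_loss = (returns - values).pow(2).mean()   # plain MSE value loss
    return action_loss, value_loss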
def main(): """ 主程序 :return: """ num_cls = args.wave_num * args.k + 1 # 所有的路由和波长选择组合,加上啥都不选 action_shape = 1 # action的维度,默认是1. num_updates = int( args.steps) // args.workers // args.num_steps # 梯度一共需要更新的次数 if args.append_route.startswith("True"): channel_num = args.wave_num + args.k else: channel_num = args.wave_num # 解析weight if args.weight.startswith('None'): weight = None else: weight = args.weight # 创建actor_critic if args.mode.startswith('alg'): # ksp(args, weight) return elif args.mode.startswith('learning'): # CNN学习模式下,osb的shape应该是CHW obs_shape = (channel_num, args.img_height, args.img_width) if args.cnn.startswith('mobilenetv2'): actor_critic = MobileNetV2(in_channels=channel_num, num_classes=num_cls, t=6) elif args.cnn.startswith('simplenet'): actor_critic = SimpleNet(in_channels=channel_num, num_classes=num_cls) elif args.cnn.startswith('simplestnet'): actor_critic = SimplestNet(in_channels=channel_num, num_classes=num_cls) elif args.cnn.startswith('alexnet'): actor_critic = AlexNet(in_channels=channel_num, num_classes=num_cls) elif args.cnn.startswith('squeezenet'): actor_critic = SqueezeNet(in_channels=channel_num, num_classes=num_cls, version=1.0) elif args.cnn.startswith('expandsimplenet'): actor_critic = ExpandSimpleNet(in_channels=channel_num, num_classes=num_cls, expand_factor=args.expand_factor) elif args.cnn.startswith('deepersimplenet'): actor_critic = DeeperSimpleNet(in_channels=channel_num, num_classes=num_cls, expand_factor=args.expand_factor) else: raise NotImplementedError # 创建optimizer if args.algo.startswith("a2c"): optimizer = optim.RMSprop(actor_critic.parameters(), lr=args.base_lr, eps=args.epsilon, alpha=args.alpha) elif args.algo.startswith("ppo"): optimizer = optim.Adam(actor_critic.parameters(), lr=args.base_lr, eps=args.epsilon) else: raise NotImplementedError else: raise NotImplementedError if args.cuda.startswith("True"): # 如果要使用cuda进行计算 actor_critic.cuda() # actor_critic = DistModule(actor_critic) # 判断是否是评估模式 if args.evaluate: print("evaluate mode") models = {} times = 1 prefix = "trained_models" directory = os.path.join(prefix, 'a2c', args.cnn, args.step_over) env = RwaGame(net_config=args.net, wave_num=args.wave_num, rou=args.rou, miu=args.miu, max_iter=args.max_iter, k=args.k, mode=args.mode, img_width=args.img_width, img_height=args.img_height, weight=weight, step_over=args.step_over) for model_file in reversed( sorted(os.listdir(directory), key=lambda item: int(item.split('.')[0]))): model_file = os.path.join(directory, model_file) print("evaluate model {}".format(model_file)) params = torch.load(model_file) actor_critic.load_state_dict(params['state_dict']) actor_critic.eval() models[params['update_i']] = {} print("model loading is finished") for t in range(times): total_reward, total_services, allocated_services = 0, 0, 0 obs, reward, done, info = env.reset() while not done: inp = Variable(torch.Tensor(obs).unsqueeze(0), volatile=True) # 禁止梯度更新 value, action, action_log_prob = actor_critic.act( inputs=inp, deterministic=True) # 确定性决策 action = action.data.numpy()[0] obs, reward, done, info = env.step(action=action[0]) total_reward += reward if reward == ARRIVAL_NEWPORT or reward == ARRIVAL_NOPORT: allocated_services += 1 if args.step_over.startswith('one_time'): if info: total_services += 1 elif args.step_over.startswith('one_service'): total_services += 1 else: raise NotImplementedError models[params['update_i']]['time'] = t models[params['update_i']]['reward'] = total_reward models[params['update_i']]['total_services'] = total_services 
models[params['update_i']][ 'allocated_services'] = allocated_services models[params['update_i']]['bp'] = ( total_services - allocated_services) / total_services # 输出仿真结果 # print("|updated model|test index|reward|bp|total services|allocated services|") # print("|:-----|:-----|:-----|:-----|:-----|:-----|") # for m in sorted(models): for i in range(times): print("|{up}|{id}|{r}|{bp:.4f}|{ts}|{als}|".format( up=params['update_i'], id=models[params['update_i']]['time'], r=models[params['update_i']]['reward'], bp=models[params['update_i']]['bp'], ts=models[params['update_i']]['total_services'], als=models[params['update_i']]['allocated_services'])) return # 创建游戏环境 envs = [ make_env(net_config=args.net, wave_num=args.wave_num, k=args.k, mode=args.mode, img_width=args.img_width, img_height=args.img_height, weight=weight, step_over=args.step_over) for _ in range(args.workers) ] envs = SubprocEnv(envs) # 创建游戏运行过程中相关变量存储更新的容器 rollout = RolloutStorage(num_steps=args.num_steps, num_processes=args.workers, obs_shape=obs_shape, action_shape=action_shape) current_obs = torch.zeros(args.workers, *obs_shape) observation, _, _, _ = envs.reset() update_current_obs(current_obs, observation, channel_num) rollout.observations[0].copy_(current_obs) # These variables are used to compute average rewards for all processes. episode_rewards = torch.zeros([args.workers, 1]) final_rewards = torch.zeros([args.workers, 1]) if args.cuda.startswith("True"): current_obs = current_obs.cuda() rollout.cuda() start = time.time() log_start = time.time() total_services = 0 # log_interval期间一共有多少个业务到达 allocated_services = 0 # log_interval期间一共有多少个业务被分配成功 update_begin = 0 # 判断是否是接续之前的训练 if args.resume: pms = torch.load(args.resume) actor_critic.load_state_dict(pms['state_dict']) optimizer.load_state_dict(pms['optimizer']) update_begin = pms['update_i'] print("resume process from update_i {}, with base_lr {}".format( update_begin, args.base_lr)) for updata_i in range(update_begin, num_updates): update_start = time.time() for step in range(args.num_steps): # 选择行为 inp = Variable(rollout.observations[step], volatile=True) # 禁止梯度更新 value, action, action_log_prob = actor_critic.act( inputs=inp, deterministic=False) # print(action) # 压缩维度,放到cpu上执行。因为没有用到GPU,所以并没有什么卵用,权当提示 cpu_actions = action.data.squeeze(1).cpu().numpy() # 观察observation,以及下一个observation envs.step_async(cpu_actions) obs, reward, done, info = envs.step_wait( ) # reward和done都是(n,)的numpy.ndarray向量 # if reward == ARRIVAL_NEWPORT_NEWPORT or reward == ARRIVAL_NOPORT_NEWPORT or reward == ARRIVAL_NOPORT_NOPORT: # allocated_services += 1 print(reward) for i in reward: if i == ARRIVAL_NEWPORT or i == ARRIVAL_NOPORT: allocated_services += 1 # allocated_services += (reward==ARRIVAL_NEWPORT_NEWPORT or reward==ARRIVAL_NOPORT_NEWPORT or reward==ARRIVAL_NOPORT_NOPORT).any().sum() # 计算分配成功的reward的次数 # TODO 未解决 if args.step_over.startswith('one_service'): total_services += (info == True).sum() # 计算本次step中包含多少个业务到达事件 # elif args.step_over.startswith('one_service'): # total_services += args.workers else: raise NotImplementedError reward = torch.from_numpy(np.expand_dims(reward, 1)).float() episode_rewards += reward # 累加reward分数 # 如果游戏结束,则重新开始计算episode_rewards和final_rewards,并且以返回的reward为初始值重新进行累加。 masks = torch.FloatTensor([[0.0] if d else [1.0] for d in done ]) # True --> 0, False --> 1 final_rewards *= masks final_rewards += (1 - masks) * episode_rewards episode_rewards *= masks # if done[len(done)-1]: # print('游戏结束最终端口数量:',envs.get_all_edges_port()) if args.cuda.startswith("True"): masks = 
masks.cuda() # 给masks扩充2个维度,与current_obs相乘。则运行结束的游戏进程对应的obs值会变成0,图像上表示全黑,即游戏结束的画面。 current_obs *= masks.unsqueeze(2).unsqueeze(2) update_current_obs(current_obs=current_obs, obs=obs, channel_num=channel_num) # 把本步骤得到的结果存储起来 rollout.insert(step=step, current_obs=current_obs, action=action.data, action_log_prob=action_log_prob.data, value_pred=value.data, reward=reward, mask=masks) # TODO 强行停止 # envs.close() # return # 注意不要引用上述for循环定义的变量。下面变量的命名和使用都要注意。 next_inp = Variable(rollout.observations[-1], volatile=True) # 禁止梯度更新 next_value = actor_critic(next_inp)[0].data # 获取下一步的value值 rollout.compute_returns(next_value=next_value, use_gae=False, gamma=args.gamma, tau=None) if args.algo.startswith('a2c'): # 下面进行A2C算法梯度更新 inps = Variable(rollout.observations[:-1].view(-1, *obs_shape)) acts = Variable(rollout.actions.view(-1, action_shape)) # print("a2cs's acts size is {}".format(acts.size())) value, action_log_probs, cls_entropy = actor_critic.evaluate_actions( inputs=inps, actions=acts) print(cls_entropy.data) # print("inputs' shape is {}".format(inps.size())) # print("value's shape is {}".format(value.size())) value = value.view(args.num_steps, args.workers, 1) # print("action_log_probs's shape is {}".format(action_log_probs.size())) action_log_probs = action_log_probs.view(args.num_steps, args.workers, 1) # 计算loss advantages = Variable(rollout.returns[:-1]) - value value_loss = advantages.pow(2).mean() # L2Loss or MSE Loss action_loss = -(Variable(advantages.data) * action_log_probs).mean() total_loss = value_loss * args.value_loss_coef + action_loss - cls_entropy * args.entropy_coef optimizer.zero_grad() total_loss.backward() # 下面进行迷之操作。。梯度裁剪(https://www.cnblogs.com/lindaxin/p/7998196.html) nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm) # average_gradients(actor_critic) optimizer.step() elif args.algo.startswith('ppo'): # 下面进行PPO算法梯度更新 advantages = rollout.returns[:-1] - rollout.value_preds[:-1] advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-5) for e in range(args.ppo_epoch): data_generator = rollout.feed_forward_generator( advantages, args.num_mini_batch) for sample in data_generator: observations_batch, actions_batch, \ return_batch, masks_batch, old_action_log_probs_batch, \ adv_targ = sample # Reshape to do in a single forward pass for all steps values, action_log_probs, cls_entropy = actor_critic.evaluate_actions( Variable(observations_batch), Variable(actions_batch)) adv_targ = Variable(adv_targ) ratio = torch.exp(action_log_probs - Variable(old_action_log_probs_batch)) surr1 = ratio * adv_targ surr2 = torch.clamp(ratio, 1.0 - args.clip_param, 1.0 + args.clip_param) * adv_targ action_loss = -torch.min( surr1, surr2).mean() # PPO's pessimistic surrogate (L^CLIP) value_loss = (Variable(return_batch) - values).pow(2).mean() # 事后一支烟 rollout.after_update() update_time = time.time() - update_start print("updates {} finished, cost time {}:{}".format( updata_i, update_time // 60, update_time % 60)) # print("total services is {}".format(total_services)) # 存储模型 if updata_i % args.save_interval == 0: save_path = os.path.join(args.save_dir, 'a2c') save_path = os.path.join(save_path, args.cnn) save_path = os.path.join(save_path, args.step_over) save_path = os.path.join(save_path, args.parameter) if os.path.exists(save_path) and os.path.isdir(save_path): pass else: os.makedirs(save_path) save_file = os.path.join(save_path, str(updata_i) + '.tar') save_content = { 'update_i': updata_i, 'state_dict': actor_critic.state_dict(), 'optimizer': 
optimizer.state_dict(), 'mean_reward': final_rewards.mean() } torch.save(save_content, save_file) # 输出日志 if updata_i % args.log_interval == 0: end = time.time() interval = end - log_start remaining_seconds = (num_updates - updata_i - 1) / args.log_interval * interval remaining_hours = int(remaining_seconds // 3600) remaining_minutes = int((remaining_seconds % 3600) / 60) total_num_steps = (updata_i + 1) * args.workers * args.num_steps blocked_services = total_services - allocated_services bp = blocked_services / total_services wave_port_num, total_port_num = envs.get_all_edges_port() wave_occ_sum, resource_utilization_rate = envs.get_resourceUtilization( ) print( "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, \ entropy {:.5f}, value loss {:.5f}, policy loss {:.8f}, remaining time {}:{}, 阻塞率为{}/{}={}, \ 各个波长端口数量为{}, 总的端口数量为{}, 带宽占用情况为{}, 资源占用率为{}".format( updata_i, total_num_steps, int(total_num_steps / (end - start)), final_rewards.mean(), final_rewards.median(), final_rewards.min(), final_rewards.max(), cls_entropy.data, value_loss.data, action_loss.data, remaining_hours, remaining_minutes, blocked_services, total_services, bp, wave_port_num, total_port_num, wave_occ_sum, resource_utilization_rate)) # raise NotImplementedError total_services = 0 allocated_services = 0 log_start = time.time() envs.close()
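# The blocking-probability figure logged above is the share of services that arrived during the
# logging window but could not be allocated. A small helper for the same arithmetic
# (hypothetical, with a guard for an empty window):
def blocking_probability(total_services, allocated_services):
    blocked = total_services - allocated_services
    return blocked / max(total_services, 1)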
def train(self, dict_model, config, training_target):
    self.NUM_AGENTS = len(dict_model)
    # print("train", dict_model)
    # actor_critics = []
    # local_brains = []
    # rollouts = []
    actor_critic = dict_model[training_target]
    global_brain = Brain(actor_critic, config)
    rollout = RolloutStorage(self.NUM_ADVANCED_STEP, self.NUM_PARALLEL,
                             self.obs_shape, self.device)
    current_obs = torch.zeros(self.NUM_PARALLEL, self.obs_shape).to(self.device)
    episode_rewards = torch.zeros([self.NUM_PARALLEL, 1])
    final_rewards = torch.zeros([self.NUM_PARALLEL, 1])
    episode = np.zeros(self.NUM_PARALLEL)

    obs = self.envs.reset()
    obs = np.array(obs)
    obs = torch.from_numpy(obs).float()
    current_obs = obs
    rollout.observations[0].copy_(current_obs)

    while True:
        for step in range(self.NUM_ADVANCED_STEP):
            with torch.no_grad():
                # action = actor_critic.act(rollouts.observations[step])  # decide the action here
                # One action per agent for every parallel observation.
                action = torch.zeros(self.NUM_PARALLEL,
                                     self.NUM_AGENTS).long().to(self.device)
                if DEBUG:
                    print("action size", self.NUM_PARALLEL, self.NUM_AGENTS)
                for i, (k, v) in enumerate(dict_model.items()):
                    if k == training_target:
                        tmp_action = v.act(current_obs)
                        target_action = copy.deepcopy(tmp_action)
                    else:
                        tmp_action = v.act_greedy(current_obs)
                    action[:, i] = tmp_action.squeeze()
            if DEBUG:
                print("here, before step?", action.shape)
            obs, reward, done, infos = self.envs.step(action)  # advance the environment one step
            episode_rewards += reward

            # if done then clean the history of observation
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            if DEBUG:
                print("done.shape", done.shape)
            if DEBUG:
                print("masks.shape", masks.shape)
            if DEBUG:
                print("obs.shape", obs.shape)

            with open(self.resdir + "/episode_reward.txt", "a") as f:
                for i, info in enumerate(infos):
                    if 'episode' in info:
                        f.write("{:}\t{:}\t{:}\t{:}\n".format(
                            training_target, episode[i], info['env_id'],
                            info['episode']['r']))
                        print(training_target, episode[i], info['env_id'],
                              info['episode']['r'])
                        episode[i] += 1

            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks
            current_obs *= masks
            current_obs = obs  # update the observation here
            rollout.insert(current_obs, target_action.data, reward, masks,
                           self.NUM_ADVANCED_STEP)

            with open(self.resdir + "/reward_log.txt", "a") as f:
                # This log is only needed once an episode finishes -> to be fixed.
                f.write("{:}\t{:}\t{:}\t{:}\t{:}\t{:}\t{:}\t{:}\t{:}\t{:}\n".format(
                    self.loop_i, training_target, episode.mean(), step,
                    reward.max().numpy(), reward.min().numpy(),
                    reward.mean().numpy(), episode_rewards.max().numpy(),
                    episode_rewards.min().numpy(),
                    episode_rewards.mean().numpy()))
            print(self.loop_i, training_target, episode.mean(), step,
                  reward.mean().numpy(), episode_rewards.mean().numpy())

        with torch.no_grad():
            next_value = actor_critic.get_value(rollout.observations[-1]).detach()

        rollout.compute_returns(next_value, self.gamma)
        value_loss, action_loss, total_loss, entropy = global_brain.update(rollout)

        with open(self.resdir + "/loss_log.txt", "a") as f:
            f.write("{:}\t{:}\t{:}\t{:}\t{:}\t{:}\t{:}\n".format(
                self.loop_i, training_target, episode.mean(), value_loss,
                action_loss, entropy, total_loss))
        print("value_loss {:.4f}\taction_loss {:.4f}\tentropy {:.4f}\ttotal_loss {:.4f}"
              .format(value_loss, action_loss, entropy, total_loss))

        rollout.after_update()

        if int(episode.mean()) + 1 > self.NUM_EPISODES:
            # print("exit the loop")
            break

    # The best model used to be saved here (note to self).
    print("finished training agent %s" % training_target)
    dict_model[training_target] = actor_critic  # {}
    return dict_model
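# Hedged sketch of the outer loop implied by train(dict_model, config, training_target): one agent
# is updated while the others act greedily with their current policies, and the roles rotate.
# `trainer` and `num_rounds` are assumed names; the original driver code is not shown here.
def train_round_robin(trainer, dict_model, config, num_rounds):
    for _ in range(num_rounds):
        for agent_id in list(dict_model.keys()):
            dict_model = trainer.train(dict_model, config, training_target=agent_id)
    return dict_model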
def main():
    torch.set_num_threads(1)
    device = torch.device("cpu")

    # if args.vis:
    #     from visdom import Visdom
    #     viz = Visdom(port=args.port)
    #     win = None

    # envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
    #                      args.gamma, args.log_dir, args.add_timestep, device, False)
    observation_space = Box(low=0, high=10000, shape=(19,), dtype=np.float32)  # Box(84,84,4)
    action_space = Discrete(7)  # Discrete(4)

    actor_critic = Policy(observation_space.shape, action_space,
                          base_kwargs={'recurrent': None})
    actor_critic.to(device)

    # if args.algo == 'a2c':
    #     agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef,
    #                            args.entropy_coef, lr=args.lr,
    #                            eps=args.eps, alpha=args.alpha,
    #                            max_grad_norm=args.max_grad_norm)
    # elif args.algo == 'ppo':
    #     agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch,
    #                      args.value_loss_coef, args.entropy_coef, lr=args.lr,
    #                      eps=args.eps,
    #                      max_grad_norm=args.max_grad_norm)
    # elif args.algo == 'acktr':
    agent = algo.A2C_ACKTR(actor_critic, value_loss_coef=0.1,
                           entropy_coef=0.01, acktr=True)

    rollouts = RolloutStorage(8000, 1, observation_space.shape, action_space,
                              actor_critic.recurrent_hidden_state_size)
    obs = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
    rollouts.obs[0].copy_(torch.Tensor(obs))
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)
    f = open('poktr_20_origin_2.txt', 'a')
    f.write("\noriginal loss(schedule 6 packets):")
    start = time.time()

    for j in range(num_updates):  # num_updates
        net = Net()
        node_list, path_list = net.read_graph(net.node_list, net.path_list)
        startnode = node_list[0]  # start node
        net.get_data(startnode)
        count = 0
        remove_count = 0  # counts the number of dropped packets
        end_time = startnode.messages[0].end_time
        s = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, end_time]
        states = [[0], [], [], [], [], [], [], [], [], [], [], [], [], [],
                  [], [], [], [], [], []]  # stores the state of every node
        ep_r = 0
        ep_acc_r = 0
        obs[:] = s
        reward_ten = torch.Tensor(1, 1)

        for step in range(8000):
            # Sample actions
            count += 1
            old_action_log_prob = torch.Tensor([[0]])
            # print(rollouts, rollouts.obs[step], rollouts.recurrent_hidden_states[step], rollouts.masks[step])
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])
            action_item = action.item()  # convert the action Tensor to a Python int

            # Obser reward and next obs
            obs, reward, done, states, remove_count, acc_r, su_packets = net.schedule(
                action_item, count, states, node_list, path_list, remove_count)
            ep_r += reward
            ep_acc_r += acc_r
            reward_ten[[0]] = reward

            # for info in infos:
            #     if 'episode' in info.keys():
            #         episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done else [1.0]])
            # print((obs), recurrent_hidden_states, torch.Tensor(action), type(action_log_prob), type(value), type(reward), type(masks))
            rollouts.insert(torch.Tensor(obs), recurrent_hidden_states, action,
                            action_log_prob, value, reward_ten, masks)
            old_action_log_prob = action_log_prob
            # print(action_log_prob, action_log_prob.shape)

        f.write("\ntime:" + str(time.strftime('%H:%M:%S', time.localtime(time.time()))) +
                "|" + str(j) + "|ep_r:" + str(ep_r) + "|packets:" + str(su_packets) +
                "|remove:" + str(remove_count) + "|ep_acc_r:" + str(ep_acc_r / 8000))

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, False, 0.99, 0.95)
        value_loss, action_loss, dist_entropy = agent.update(rollouts)
        print("time:", time.strftime('%H:%M:%S', time.localtime(time.time())), "|", j,
              "|ep_r:", ep_r, "|packets:", su_packets, "|remove:", remove_count,
              "|ep_acc_r:", ep_acc_r / 8000, "|value_loss:", value_loss,
              "|action_loss:", action_loss, "|entropy:", dist_entropy)
        rollouts.after_update()

        # if j % 100 == 0:
        #     save_path = os.path.join('./trained_models/', 'acktr')
        #     try:
        #         os.makedirs(save_path)
        #     except OSError:
        #         pass
        #     # A really ugly way to save a model to CPU
        #     save_model = actor_critic
        #     if args.cuda:
        #         save_model = copy.deepcopy(actor_critic).cpu()
        #     save_model = [save_model,
        #                   getattr(get_vec_normalize(envs), 'ob_rms', None)]
        #     torch.save(save_model, os.path.join(save_path, "acktr" + ".pt"))

        total_num_steps = (j + 1) * num_processes * num_steps
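# Hedged sketch of the return computation requested above with use_gae=False: a plain discounted
# backup bootstrapped from next_value (gamma=0.99); the 0.95 argument is assumed to be the unused
# GAE lambda in this branch. Shapes below are assumptions for the single-process case.
import torch

def discounted_returns(rewards, masks, next_value, gamma=0.99):
    # rewards, masks: [T, 1, 1]; next_value: [1, 1]
    returns = torch.zeros(rewards.size(0) + 1, *next_value.size())
    returns[-1] = next_value
    for t in reversed(range(rewards.size(0))):
        returns[t] = returns[t + 1] * gamma * masks[t] + rewards[t]
    return returns[:-1]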