def main(env, scene_path):
    """Train a (residual) policy on `env`/`scene_path` and return total steps taken.

    Relies on module-level `args` (parsed CLI flags), `use_metric`, and the
    a2c-ppo-acktr helpers (`make_vec_envs`, `Policy`, `algo`, `RolloutStorage`,
    `get_vec_normalize`, `update_linear_schedule`, `visdom_plot`, `copy_tree`).

    Training stops either after a fixed number of updates (use_metric False) or
    once the evaluated success rate reaches `args.trg_succ_rate` / patience runs
    out (use_metric True). Checkpoints and eval curves are written under
    `args.save_dir`.
    """
    # Clear out stale monitor files if the log dir already exists.
    try:
        os.makedirs(args.log_dir)
    except OSError:
        files = glob.glob(os.path.join(args.log_dir, '*.monitor.csv'))
        for f in files:
            os.remove(f)

    save_path = os.path.join(args.save_dir, args.algo)
    eval_x = []
    eval_y = []

    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    initial_policies = torch.load(
        os.path.join(args.load_dir, args.algo, args.initial_policy + ".pt")) \
        if args.initial_policy else None
    if args.reuse_residual:
        # A reused residual checkpoint bundles (policy, obs-normalizer stats,
        # the initial policies it was trained on top of).
        residual, ob_rms, initial_policies = initial_policies
    else:
        residual = None
        ob_rms = None
    pose_estimator = torch.load(
        os.path.join(args.load_dir, "pe", args.pose_estimator + ".pt")) \
        if args.pose_estimator else None

    envs = make_vec_envs(env, scene_path, args.seed, args.num_processes,
                         args.gamma, args.log_dir, device, False,
                         initial_policies, pose_estimator=pose_estimator,
                         init_control=not args.dense_ip)

    if args.reuse_residual:
        # Freeze the normalizer and restore its running stats so observations
        # are scaled the same way they were when the residual was trained.
        vec_norm = get_vec_normalize(envs)
        if vec_norm is not None:
            vec_norm.eval()
            vec_norm.ob_rms = ob_rms

    base_kwargs = {'recurrent': args.recurrent_policy}
    base = residual.base if args.reuse_residual else None
    dist = residual.dist if args.reuse_residual else None
    actor_critic = Policy(envs.observation_space.shape, envs.action_space,
                          base_kwargs=base_kwargs, zero_last_layer=True,
                          base=base, dist=dist)
    actor_critic.to(device)

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef,
                               args.entropy_coef, lr=args.lr, eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch,
                         args.num_mini_batch, args.value_loss_coef,
                         args.entropy_coef, lr=args.lr, eps=args.eps,
                         max_grad_norm=args.max_grad_norm,
                         burn_in=initial_policies is not None
                         and not args.reuse_residual)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef,
                               args.entropy_coef, acktr=True)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=64)
    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes

    total_num_steps = 0
    j = 0
    max_succ = -1
    max_mean_rew = -math.inf
    mean_ep_rew = -math.inf
    evals_without_improv = 0
    # Patience: stop after this many evaluations without improvement.
    patience = 10

    start = time.time()
    start_update = start
    while (not use_metric and j < num_updates) or \
            (use_metric and max_succ < args.trg_succ_rate):
        if args.eval_interval is not None and j % args.eval_interval == 0:
            print("Evaluating current policy...")
            i = 0
            total_successes = 0
            max_trials = 50
            eval_recurrent_hidden_states = torch.zeros(
                args.num_processes, actor_critic.recurrent_hidden_state_size,
                device=device)
            eval_masks = torch.zeros(args.num_processes, 1, device=device)
            while i + args.num_processes <= max_trials:
                with torch.no_grad():
                    _, action, _, eval_recurrent_hidden_states = actor_critic.act(
                        obs, eval_recurrent_hidden_states, eval_masks,
                        deterministic=True)
                obs, _, dones, infos = envs.step(action)
                if np.all(dones):
                    # Rigid - assumes episodes are fixed length
                    rews = []
                    for info in infos:
                        rews.append(info['rew_success'])
                    i += args.num_processes
                    rew = sum([int(rew > 0) for rew in rews])
                    total_successes += rew
            p_succ = (100 * total_successes / i)
            eval_x += [total_num_steps]
            eval_y += [p_succ]
            end = time.time()
            print(
                f"Evaluation: {total_successes} successful out of {i} episodes - "
                f"{p_succ:.2f}% successful. Eval length: {end - start_update}")
            torch.save([eval_x, eval_y],
                       os.path.join(args.save_as + "_eval.pt"))
            start_update = end

            if p_succ > max_succ:
                max_succ = p_succ
                max_mean_rew = mean_ep_rew
                evals_without_improv = 0
            elif mean_ep_rew > max_mean_rew:
                print("Unimproved success rate, higher reward")
                max_mean_rew = mean_ep_rew
                evals_without_improv = 0
            else:
                evals_without_improv += 1

            if evals_without_improv == patience or max_succ >= args.trg_succ_rate:
                save_model = actor_critic
                if args.cuda:
                    save_model = copy.deepcopy(actor_critic).cpu()
                save_model = [
                    save_model,
                    getattr(get_vec_normalize(envs), 'ob_rms', None),
                    initial_policies
                ]
                # BUGFIX: previously compared against a stale patience value
                # (== 5) while the break above triggers at 10, so the
                # "_final" suffix was never applied on the patience-exhausted
                # path. Use the same patience constant for both.
                extra = "_final" if evals_without_improv == patience else ""
                torch.save(
                    save_model,
                    os.path.join(save_path, args.save_as + f"{extra}.pt"))
                break

        # save for every interval-th episode or for the last epoch
        if ((not use_metric and
             (j % args.save_interval == 0 or j == num_updates - 1)) or
                (use_metric and evals_without_improv == 0)) and args.save_dir != "":
            os.makedirs(save_path, exist_ok=True)
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()
            if pose_estimator is not None:
                save_model = [save_model, pose_estimator, initial_policies]
            else:
                save_model = [
                    save_model,
                    getattr(get_vec_normalize(envs), 'ob_rms', None),
                    initial_policies
                ]
            torch.save(save_model,
                       os.path.join(save_path, args.save_as + ".pt"))

        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            if args.algo == "acktr":
                # use optimizer's learning rate since it's hard-coded in kfac.py
                update_linear_schedule(agent.optimizer, j, num_updates,
                                       agent.optimizer.lr)
            else:
                update_linear_schedule(agent.optimizer, j, num_updates,
                                       args.lr)

        if args.algo == 'ppo' and args.use_linear_clip_decay:
            agent.clip_param = args.clip_param * (1 - j / float(num_updates))

        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step],
                    rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)
        value_loss, action_loss, dist_entropy = agent.update(rollouts)
        rollouts.after_update()

        total_num_steps = (j + 1) * args.num_processes * args.num_steps

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            mean_ep_rew = np.mean(episode_rewards)
            if mean_ep_rew > max_mean_rew:
                print("Improved max mean reward")
                max_mean_rew = mean_ep_rew
                evals_without_improv = 0
            end = time.time()
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), mean_ep_rew,
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist_entropy, value_loss,
                        action_loss))
            print("Update length: ", end - start_update)
            start_update = end

        if args.vis and (j % args.vis_interval == 0 or
                         (not use_metric and j == num_updates - 1)):
            try:
                # Sometimes monitor doesn't properly flush the outputs
                visdom_plot(args.log_dir, args.save_as, args.algo,
                            total_num_steps)
            except IOError:
                pass
        j += 1

    if use_metric:
        if max_succ >= args.trg_succ_rate:
            print(
                f"Achieved greater than {args.trg_succ_rate}% success, advancing curriculum."
            )
        else:
            print(
                f"Policy converged with max success rate < {args.trg_succ_rate}%"
            )
    # Copy logs to permanent location so new graphs can be drawn.
    copy_tree(args.log_dir, os.path.join('logs', args.save_as))
    envs.close()
    return total_num_steps
def main():
    """Standard a2c/ppo/acktr training entry point.

    Relies on module-level `args`, `num_updates`, `eval_log_dir` and the
    a2c-ppo-acktr helpers (`make_vec_envs`, `Policy`, `algo`,
    `RolloutStorage`, `get_vec_normalize`, `update_linear_schedule`,
    `visdom_plot`). Trains, periodically checkpoints to
    `args.save_dir/args.algo/<env_name>.pt`, logs, evaluates, and optionally
    plots with visdom.
    """
    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, args.add_timestep, device,
                         False)

    actor_critic = Policy(envs.observation_space.shape, envs.action_space,
                          base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef,
                               args.entropy_coef, lr=args.lr, eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch,
                         args.num_mini_batch, args.value_loss_coef,
                         args.entropy_coef, lr=args.lr, eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef,
                               args.entropy_coef, acktr=True)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)

    start = time.time()
    for j in range(num_updates):
        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            if args.algo == "acktr":
                # use optimizer's learning rate since it's hard-coded in kfac.py
                update_linear_schedule(agent.optimizer, j, num_updates,
                                       agent.optimizer.lr)
            else:
                update_linear_schedule(agent.optimizer, j, num_updates,
                                       args.lr)

        if args.algo == 'ppo' and args.use_linear_clip_decay:
            agent.clip_param = args.clip_param * (1 - j / float(num_updates))

        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step],
                    rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)
        value_loss, action_loss, dist_entropy = agent.update(rollouts)
        rollouts.after_update()

        # save for every interval-th episode or for the last epoch
        if (j % args.save_interval == 0
                or j == num_updates - 1) and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            # exist_ok replaces the old try/except OSError: pass, which also
            # swallowed real failures (e.g. permission errors).
            os.makedirs(save_path, exist_ok=True)

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()
            save_model = [
                save_model,
                getattr(get_vec_normalize(envs), 'ob_rms', None)
            ]
            torch.save(save_model,
                       os.path.join(save_path, args.env_name + ".pt"))

        total_num_steps = (j + 1) * args.num_processes * args.num_steps

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            end = time.time()
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist_entropy, value_loss,
                        action_loss))

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            eval_envs = make_vec_envs(args.env_name,
                                      args.seed + args.num_processes,
                                      args.num_processes, args.gamma,
                                      eval_log_dir, args.add_timestep, device,
                                      True)

            # Evaluate with the training normalizer's statistics, frozen.
            vec_norm = get_vec_normalize(eval_envs)
            if vec_norm is not None:
                vec_norm.eval()
                vec_norm.ob_rms = get_vec_normalize(envs).ob_rms

            eval_episode_rewards = []

            obs = eval_envs.reset()
            eval_recurrent_hidden_states = torch.zeros(
                args.num_processes, actor_critic.recurrent_hidden_state_size,
                device=device)
            eval_masks = torch.zeros(args.num_processes, 1, device=device)

            while len(eval_episode_rewards) < 10:
                with torch.no_grad():
                    _, action, _, eval_recurrent_hidden_states = actor_critic.act(
                        obs, eval_recurrent_hidden_states, eval_masks,
                        deterministic=True)

                # Observe reward and next obs
                obs, reward, done, infos = eval_envs.step(action)
                eval_masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                                for done_ in done])
                for info in infos:
                    if 'episode' in info.keys():
                        eval_episode_rewards.append(info['episode']['r'])

            eval_envs.close()

            print(" Evaluation using {} episodes: mean reward {:.5f}\n".format(
                len(eval_episode_rewards), np.mean(eval_episode_rewards)))

        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name,
                                  args.algo, args.num_env_steps)
            except IOError:
                pass
def onpolicy_main():
    """On-policy (a2c/ppo) training for doorenv-style environments.

    Relies on module-level `args`, `env_kwargs`, `pretrained_policy_load`,
    `knob_noisy`, `eval_log_dir` and helpers (`make_vec_envs`, `Policy`,
    `load_visionmodel`, `VisionModelXYZ`, `add_noise`, `utils`, `algo`,
    `RolloutStorage`, `evaluate`). Logs losses/rewards to TensorBoard and,
    with domain randomization enabled, rebuilds the vectorized envs after
    every update to resample the world.
    """
    print("onpolicy main")
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    summary_name = args.log_dir + '{0}_{1}'
    writer = SummaryWriter(summary_name.format(args.env_name, args.save_name))

    # Make vector env
    envs = make_vec_envs(
        args.env_name,
        args.seed,
        args.num_processes,
        args.gamma,
        args.log_dir,
        device,
        False,
        env_kwargs=env_kwargs,
    )

    # Ugly way to access the wrapped environment's attributes through the
    # VecEnv wrapper stack (depth differs for single vs. multi process).
    if args.env_name.find('doorenv') > -1:
        if args.num_processes > 1:
            visionnet_input = envs.venv.venv.visionnet_input
            nn = envs.venv.venv.nn
            env_name = envs.venv.venv.xml_path
        else:
            visionnet_input = envs.venv.venv.envs[
                0].env.env.env.visionnet_input
            nn = envs.venv.venv.envs[0].env.env.env.nn
            env_name = envs.venv.venv.envs[0].env.env.env.xml_path
        # Observation: nn joints (pos+vel) plus 3 knob-position values.
        dummy_obs = np.zeros(nn * 2 + 3)
    else:
        dummy_obs = envs.observation_space
        visionnet_input = None
        nn = None

    if pretrained_policy_load:
        print("loading", pretrained_policy_load)
        actor_critic, ob_rms = torch.load(pretrained_policy_load)
    else:
        actor_critic = Policy(dummy_obs.shape,
                              envs.action_space,
                              base_kwargs={'recurrent': args.recurrent_policy})

    if visionnet_input:
        visionmodel = load_visionmodel(env_name, args.visionmodel_path,
                                       VisionModelXYZ())
        actor_critic.visionmodel = visionmodel.eval()
    actor_critic.nn = nn
    actor_critic.to(device)

    # disable normalizer
    vec_norm = get_vec_normalize(envs)
    vec_norm.eval()

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef,
                               args.entropy_coef, lr=args.lr, eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch,
                         args.num_mini_batch, args.value_loss_coef,
                         args.entropy_coef, lr=args.lr, eps=args.eps,
                         max_grad_norm=args.max_grad_norm)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              dummy_obs.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    full_obs = envs.reset()
    initial_state = full_obs[:, :envs.action_space.shape[0]]

    if args.env_name.find('doorenv') > -1 and visionnet_input:
        obs = actor_critic.obs2inputs(full_obs, 0)
    else:
        if knob_noisy:
            obs = add_noise(full_obs, 0)
        else:
            obs = full_obs

    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)

    start = time.time()
    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes
    for j in range(num_updates):
        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            utils.update_linear_schedule(agent.optimizer, j, num_updates,
                                         args.lr)

        pos_control = False
        total_switches = 0
        prev_selection = ""
        for step in range(args.num_steps):
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step],
                    rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            next_action = action
            if pos_control:
                frame_skip = 2
                # BUGFIX: use integer division for the step-index modulus
                # (512 / frame_skip produced a float period; same value, but
                # a float where an int is meant).
                if step % (512 // frame_skip - 1) == 0:
                    current_state = initial_state
                next_action = current_state + next_action
                for kk in range(frame_skip):
                    full_obs, reward, done, infos = envs.step(next_action)
                current_state = full_obs[:, :envs.action_space.shape[0]]
            else:
                full_obs, reward, done, infos = envs.step(next_action)

            # convert img to obs if door_env and using visionnet
            if args.env_name.find('doorenv') > -1 and visionnet_input:
                obs = actor_critic.obs2inputs(full_obs, j)
            else:
                if knob_noisy:
                    obs = add_noise(full_obs, j)
                else:
                    obs = full_obs

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.gae_lambda,
                                 args.use_proper_time_limits)
        value_loss, action_loss, dist_entropy = agent.update(rollouts)
        rollouts.after_update()

        writer.add_scalar("Value loss", value_loss, j)
        writer.add_scalar("action loss", action_loss, j)
        writer.add_scalar("dist entropy loss", dist_entropy, j)
        writer.add_scalar("Episode rewards", np.mean(episode_rewards), j)

        # save for every interval-th episode or for the last epoch
        if (j % args.save_interval == 0
                or j == num_updates - 1) and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            # exist_ok replaces try/except OSError: pass and does not hide
            # unrelated OS errors.
            os.makedirs(save_path, exist_ok=True)
            torch.save([
                actor_critic,
                getattr(utils.get_vec_normalize(envs), 'ob_rms', None)
            ], os.path.join(
                save_path,
                args.env_name + "_{}.{}.pt".format(args.save_name, j)))

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            end = time.time()
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist_entropy, value_loss,
                        action_loss))

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            ob_rms = utils.get_vec_normalize(envs).ob_rms
            evaluate(actor_critic, ob_rms, args.env_name, args.seed,
                     args.num_processes, eval_log_dir, device)

        DR = True  # Domain Randomization
        ################## for multiprocess world change ######################
        if DR:
            # Rebuild the vec envs so each update sees a freshly randomized
            # world. NOTE(review): the new obs is not copied into
            # rollouts.obs[0] here — this mirrors the original behavior;
            # confirm whether that is intentional.
            print("changing world")
            envs.close_extras()
            envs.close()
            del envs

            envs = make_vec_envs(
                args.env_name,
                args.seed,
                args.num_processes,
                args.gamma,
                args.log_dir,
                device,
                False,
                env_kwargs=env_kwargs,
            )

            full_obs = envs.reset()
            if args.env_name.find('doorenv') > -1 and visionnet_input:
                obs = actor_critic.obs2inputs(full_obs, j)
            else:
                obs = full_obs
def train_model(self):
    """Main training loop for the navigation agent (pointnav/exploration/flee).

    Collects rollouts from `self.envs`, updates the policy via
    `self.optimizer`, logs to TensorBoard through `self.logger`, optionally
    records videos, periodically evaluates, and always writes a final
    checkpoint on exit (success, failure, or interrupt) via `finally`.

    NOTE(review): reconstructed from whitespace-mangled source; the statement
    order follows the original text, but nesting of some deeply indented
    branches should be verified against the project's history.
    """
    episode_rewards = deque(maxlen=10)
    current_episode_rewards = np.zeros(self.shell_args.num_processes)
    episode_lengths = deque(maxlen=10)
    current_episode_lengths = np.zeros(self.shell_args.num_processes)
    current_rewards = np.zeros(self.shell_args.num_processes)
    total_num_steps = self.start_iter
    fps_timer = [time.time(), total_num_steps]
    # timers[0]=env step, timers[1]=forward pass, timers[2]=backward pass.
    timers = np.zeros(3)
    egomotion_loss = 0
    video_frames = []
    num_episodes = 0

    obs = self.envs.reset()
    if self.compute_surface_normals:
        obs["surface_normals"] = pt_util.depth_to_surface_normals(
            obs["depth"].to(self.device))
    obs["prev_action_one_hot"] = obs[
        "prev_action_one_hot"][:, ACTION_SPACE].to(torch.float32)
    if self.shell_args.algo == "supervised":
        obs["best_next_action"] = pt_util.from_numpy(
            obs["best_next_action"][:, ACTION_SPACE])
    self.rollouts.copy_obs(obs, 0)
    distances = pt_util.to_numpy_array(obs["goal_geodesic_distance"])
    self.train_stats["start_geodesic_distance"][:] = distances
    previous_visual_features = None
    egomotion_pred = None
    prev_action = None
    prev_action_probs = None
    num_updates = (int(self.shell_args.num_env_steps) //
                   self.shell_args.num_forward_rollout_steps
                   ) // self.shell_args.num_processes
    try:
        for iter_count in range(num_updates):
            if self.shell_args.tensorboard:
                if iter_count % 500 == 0:
                    print("Logging conv summaries")
                    self.logger.network_conv_summary(self.agent,
                                                     total_num_steps)
                elif iter_count % 100 == 0:
                    print("Logging variable summaries")
                    self.logger.network_variable_summary(
                        self.agent, total_num_steps)

            if self.shell_args.use_linear_lr_decay:
                # decrease learning rate linearly
                update_linear_schedule(self.optimizer.optimizer, iter_count,
                                       num_updates, self.shell_args.lr)

            if self.shell_args.algo == "ppo" and self.shell_args.use_linear_clip_decay:
                self.optimizer.clip_param = self.shell_args.clip_param * (
                    1 - iter_count / float(num_updates))

            if hasattr(self.agent.base, "enable_decoder"):
                # Decoder outputs are only needed for video drawing.
                if self.shell_args.record_video:
                    self.agent.base.enable_decoder()
                else:
                    self.agent.base.disable_decoder()

            for step in range(self.shell_args.num_forward_rollout_steps):
                with torch.no_grad():
                    start_t = time.time()
                    value, action, action_log_prob, recurrent_hidden_states = self.agent.act(
                        {
                            "images":
                            self.rollouts.obs[step],
                            "target_vector":
                            self.rollouts.additional_observations_dict[
                                "pointgoal"][step],
                            "prev_action_one_hot":
                            self.rollouts.additional_observations_dict[
                                "prev_action_one_hot"][step],
                        },
                        self.rollouts.recurrent_hidden_states[step],
                        self.rollouts.masks[step],
                    )
                    action_cpu = pt_util.to_numpy_array(action.squeeze(1))
                    translated_action_space = ACTION_SPACE[action_cpu]

                    if not self.shell_args.end_to_end:
                        self.rollouts.additional_observations_dict[
                            "visual_encoder_features"][
                                self.rollouts.step].copy_(
                                    self.agent.base.visual_encoder_features)

                    if self.shell_args.use_motion_loss:
                        if self.shell_args.record_video:
                            if previous_visual_features is not None:
                                egomotion_pred = self.agent.base.predict_egomotion(
                                    self.agent.base.visual_features,
                                    previous_visual_features)
                        previous_visual_features = self.agent.base.visual_features.detach()

                    timers[1] += time.time() - start_t

                    if self.shell_args.record_video:
                        # Copy so we don't mess with obs itself
                        draw_obs = OrderedDict()
                        for key, val in obs.items():
                            draw_obs[key] = pt_util.to_numpy_array(val).copy()
                        best_next_action = draw_obs.pop(
                            "best_next_action", None)

                        if prev_action is not None:
                            draw_obs["action_taken"] = pt_util.to_numpy_array(
                                self.agent.last_dist.probs).copy()
                            draw_obs["action_taken"][:] = 0
                            draw_obs["action_taken"][
                                np.arange(self.shell_args.num_processes),
                                prev_action] = 1
                            draw_obs["action_taken_name"] = SIM_ACTION_TO_NAME[
                                ACTION_SPACE_TO_SIM_ACTION[ACTION_SPACE[
                                    prev_action.squeeze()]]]
                            draw_obs["action_prob"] = pt_util.to_numpy_array(
                                prev_action_probs).copy()
                        else:
                            draw_obs["action_taken"] = None
                            draw_obs["action_taken_name"] = SIM_ACTION_TO_NAME[
                                SimulatorActions.STOP]
                            draw_obs["action_prob"] = None
                        prev_action = action_cpu
                        prev_action_probs = self.agent.last_dist.probs.detach()

                        if (hasattr(self.agent.base, "decoder_outputs")
                                and self.agent.base.decoder_outputs is not None):
                            min_channel = 0
                            for key, num_channels in self.agent.base.decoder_output_info:
                                outputs = self.agent.base.decoder_outputs[:,
                                                                          min_channel:
                                                                          min_channel
                                                                          +
                                                                          num_channels,
                                                                          ...]
                                draw_obs["output_" + key] = pt_util.to_numpy_array(
                                    outputs).copy()
                                min_channel += num_channels

                        draw_obs["rewards"] = current_rewards.copy()
                        draw_obs["step"] = current_episode_lengths.copy()
                        draw_obs["method"] = self.shell_args.method_name
                        if best_next_action is not None:
                            draw_obs["best_next_action"] = best_next_action
                        if self.shell_args.use_motion_loss:
                            if egomotion_pred is not None:
                                draw_obs["egomotion_pred"] = pt_util.to_numpy_array(
                                    F.softmax(egomotion_pred, dim=1)).copy()
                            else:
                                draw_obs["egomotion_pred"] = None
                        images, titles, normalize = draw_outputs.obs_to_images(
                            draw_obs)
                        if self.shell_args.algo == "supervised":
                            im_inds = [0, 2, 3, 1, 9, 6, 7, 8, 5, 4]
                        else:
                            im_inds = [0, 2, 3, 1, 6, 7, 8, 5]
                        height, width = images[0].shape[:2]
                        subplot_image = drawing.subplot(
                            images,
                            2,
                            5,
                            titles=titles,
                            normalize=normalize,
                            order=im_inds,
                            output_width=max(width, 320),
                            output_height=max(height, 320),
                        )
                        video_frames.append(subplot_image)

                    # save dists from previous step or else on reset they will be overwritten
                    distances = pt_util.to_numpy_array(
                        obs["goal_geodesic_distance"])

                    start_t = time.time()
                    obs, rewards, dones, infos = self.envs.step(
                        translated_action_space)
                    timers[0] += time.time() - start_t
                    obs["reward"] = rewards
                    if self.shell_args.algo == "supervised":
                        obs["best_next_action"] = pt_util.from_numpy(
                            obs["best_next_action"][:, ACTION_SPACE]).to(
                                torch.float32)
                    obs["prev_action_one_hot"] = obs[
                        "prev_action_one_hot"][:, ACTION_SPACE].to(
                            torch.float32)
                    rewards *= REWARD_SCALAR
                    rewards = np.clip(rewards, -10, 10)

                    if self.shell_args.record_video and not dones[0]:
                        obs["top_down_map"] = infos[0]["top_down_map"]

                    if self.compute_surface_normals:
                        obs["surface_normals"] = pt_util.depth_to_surface_normals(
                            obs["depth"].to(self.device))

                    current_rewards = pt_util.to_numpy_array(rewards)
                    current_episode_rewards += pt_util.to_numpy_array(
                        rewards).squeeze()
                    current_episode_lengths += 1
                    for ii, done_e in enumerate(dones):
                        if done_e:
                            num_episodes += 1
                            if self.shell_args.record_video:
                                final_rgb = draw_obs["rgb"].transpose(
                                    0, 2, 3, 1).squeeze(0)
                                if self.shell_args.task == "pointnav":
                                    if infos[ii]["spl"] > 0:
                                        draw_obs[
                                            "action_taken_name"] = "Stop. Success"
                                        draw_obs["reward"] = [
                                            self.configs[0].TASK.SUCCESS_REWARD
                                        ]
                                        # Tint the final frame green on success.
                                        final_rgb[:] = final_rgb * np.float32(
                                            0.5) + np.tile(
                                                np.array([0, 128, 0],
                                                         dtype=np.uint8),
                                                (final_rgb.shape[0],
                                                 final_rgb.shape[1], 1),
                                            )
                                    else:
                                        draw_obs[
                                            "action_taken_name"] = "Timeout. Failed"
                                        # Tint the final frame red on failure.
                                        final_rgb[:] = final_rgb * np.float32(
                                            0.5) + np.tile(
                                                np.array([128, 0, 0],
                                                         dtype=np.uint8),
                                                (final_rgb.shape[0],
                                                 final_rgb.shape[1], 1),
                                            )
                                elif self.shell_args.task == "exploration" or self.shell_args.task == "flee":
                                    draw_obs[
                                        "action_taken_name"] = "End of episode."
                                final_rgb = final_rgb[np.newaxis, ...].transpose(
                                    0, 3, 1, 2)
                                draw_obs["rgb"] = final_rgb

                                images, titles, normalize = draw_outputs.obs_to_images(
                                    draw_obs)
                                im_inds = [0, 2, 3, 1, 6, 7, 8, 5]
                                height, width = images[0].shape[:2]
                                subplot_image = drawing.subplot(
                                    images,
                                    2,
                                    5,
                                    titles=titles,
                                    normalize=normalize,
                                    order=im_inds,
                                    output_width=max(width, 320),
                                    output_height=max(height, 320),
                                )
                                # Pad the video to a fixed length.
                                video_frames.extend(
                                    [subplot_image] *
                                    (self.configs[0].ENVIRONMENT.
                                     MAX_EPISODE_STEPS + 30 -
                                     len(video_frames)))

                                if "top_down_map" in infos[0]:
                                    video_dir = os.path.join(
                                        self.shell_args.log_prefix, "videos")
                                    if not os.path.exists(video_dir):
                                        os.makedirs(video_dir)
                                    im_path = os.path.join(
                                        self.shell_args.log_prefix, "videos",
                                        "total_steps_%d.png" % total_num_steps)
                                    from habitat.utils.visualizations import maps
                                    import imageio
                                    top_down_map = maps.colorize_topdown_map(
                                        infos[0]["top_down_map"]["map"])
                                    imageio.imsave(im_path, top_down_map)

                                images_to_video(
                                    video_frames,
                                    os.path.join(self.shell_args.log_prefix,
                                                 "videos"),
                                    "total_steps_%d" % total_num_steps,
                                )
                                video_frames = []

                            if self.shell_args.task == "pointnav":
                                print(
                                    "FINISHED EPISODE %d Length %d Reward %.3f SPL %.4f"
                                    % (
                                        num_episodes,
                                        current_episode_lengths[ii],
                                        current_episode_rewards[ii],
                                        infos[ii]["spl"],
                                    ))
                                self.train_stats["spl"][ii] = infos[ii]["spl"]
                                self.train_stats["success"][
                                    ii] = self.train_stats["spl"][ii] > 0
                                self.train_stats["end_geodesic_distance"][
                                    ii] = (distances[ii] - self.configs[0].
                                           SIMULATOR.FORWARD_STEP_SIZE)
                                self.train_stats["delta_geodesic_distance"][
                                    ii] = (
                                        self.train_stats[
                                            "start_geodesic_distance"][ii] -
                                        self.train_stats[
                                            "end_geodesic_distance"][ii])
                                self.train_stats["num_steps"][
                                    ii] = current_episode_lengths[ii]
                            elif self.shell_args.task == "exploration":
                                print(
                                    "FINISHED EPISODE %d Reward %.3f States Visited %d"
                                    % (num_episodes,
                                       current_episode_rewards[ii],
                                       infos[ii]["visited_states"]))
                                self.train_stats["visited_states"][
                                    ii] = infos[ii]["visited_states"]
                            elif self.shell_args.task == "flee":
                                print(
                                    "FINISHED EPISODE %d Reward %.3f Distance from start %.4f"
                                    % (num_episodes,
                                       current_episode_rewards[ii],
                                       infos[ii]["distance_from_start"]))
                                self.train_stats["distance_from_start"][
                                    ii] = infos[ii]["distance_from_start"]

                            self.train_stats["num_episodes"][ii] += 1
                            self.train_stats["reward"][
                                ii] = current_episode_rewards[ii]

                            if self.shell_args.tensorboard:
                                log_dict = {
                                    "single_episode/reward":
                                    self.train_stats["reward"][ii]
                                }
                                if self.shell_args.task == "pointnav":
                                    log_dict.update({
                                        "single_episode/num_steps":
                                        self.train_stats["num_steps"][ii],
                                        "single_episode/spl":
                                        self.train_stats["spl"][ii],
                                        "single_episode/success":
                                        self.train_stats["success"][ii],
                                        "single_episode/start_geodesic_distance":
                                        self.train_stats[
                                            "start_geodesic_distance"][ii],
                                        "single_episode/end_geodesic_distance":
                                        self.train_stats[
                                            "end_geodesic_distance"][ii],
                                        "single_episode/delta_geodesic_distance":
                                        self.train_stats[
                                            "delta_geodesic_distance"][ii],
                                    })
                                elif self.shell_args.task == "exploration":
                                    log_dict[
                                        "single_episode/visited_states"] = self.train_stats[
                                            "visited_states"][ii]
                                elif self.shell_args.task == "flee":
                                    log_dict[
                                        "single_episode/distance_from_start"] = self.train_stats[
                                            "distance_from_start"][ii]
                                self.logger.dict_log(
                                    log_dict,
                                    step=(total_num_steps +
                                          self.shell_args.num_processes * step
                                          + ii))

                            episode_rewards.append(
                                current_episode_rewards[ii])
                            current_episode_rewards[ii] = 0
                            episode_lengths.append(
                                current_episode_lengths[ii])
                            current_episode_lengths[ii] = 0
                            self.train_stats["start_geodesic_distance"][
                                ii] = obs["goal_geodesic_distance"][ii]

                    # If done then clean the history of observations.
                    masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                               for done_ in dones])
                    bad_masks = torch.FloatTensor(
                        [[0.0] if "bad_transition" in info.keys() else [1.0]
                         for info in infos])
                    self.rollouts.insert(obs, recurrent_hidden_states, action,
                                         action_log_prob, value, rewards,
                                         masks, bad_masks)

            with torch.no_grad():
                start_t = time.time()
                next_value = self.agent.get_value(
                    {
                        "images":
                        self.rollouts.obs[-1],
                        "target_vector":
                        self.rollouts.
                        additional_observations_dict["pointgoal"][-1],
                        "prev_action_one_hot":
                        self.rollouts.additional_observations_dict[
                            "prev_action_one_hot"][-1],
                    },
                    self.rollouts.recurrent_hidden_states[-1],
                    self.rollouts.masks[-1],
                ).detach()
                timers[1] += time.time() - start_t

            self.rollouts.compute_returns(next_value,
                                          self.shell_args.use_gae,
                                          self.shell_args.gamma,
                                          self.shell_args.tau)

            if not self.shell_args.no_weight_update:
                start_t = time.time()
                # Supervised updates return no value/entropy terms.
                if self.shell_args.algo == "supervised":
                    (
                        total_loss,
                        action_loss,
                        visual_loss_total,
                        visual_loss_dict,
                        egomotion_loss,
                        forward_model_loss,
                    ) = self.optimizer.update(self.rollouts, self.shell_args)
                else:
                    (
                        total_loss,
                        value_loss,
                        action_loss,
                        dist_entropy,
                        visual_loss_total,
                        visual_loss_dict,
                        egomotion_loss,
                        forward_model_loss,
                    ) = self.optimizer.update(self.rollouts, self.shell_args)
                timers[2] += time.time() - start_t

            self.rollouts.after_update()

            # save for every interval-th episode or for the last epoch
            if iter_count % self.shell_args.save_interval == 0 or iter_count == num_updates - 1:
                self.save_checkpoint(5, total_num_steps)

            total_num_steps += self.shell_args.num_processes * self.shell_args.num_forward_rollout_steps

            if not self.shell_args.no_weight_update and iter_count % self.shell_args.log_interval == 0:
                log_dict = {}
                if len(episode_rewards) > 1:
                    end = time.time()
                    nsteps = total_num_steps - fps_timer[1]
                    fps = int((total_num_steps - fps_timer[1]) /
                              (end - fps_timer[0]))
                    timers /= nsteps
                    env_spf = timers[0]
                    forward_spf = timers[1]
                    backward_spf = timers[2]
                    print((
                        "{} Updates {}, num timesteps {}, FPS {}, Env FPS "
                        "{}, \n Last {} training episodes: mean/median reward "
                        "{:.3f}/{:.3f}, min/max reward {:.3f}/{:.3f}\n"
                    ).format(
                        datetime.datetime.now(),
                        iter_count,
                        total_num_steps,
                        fps,
                        int(1.0 / env_spf),
                        len(episode_rewards),
                        np.mean(episode_rewards),
                        np.median(episode_rewards),
                        np.min(episode_rewards),
                        np.max(episode_rewards),
                    ))
                    if self.shell_args.tensorboard:
                        log_dict.update({
                            "stats/full_spf":
                            1.0 / (fps + 1e-10),
                            "stats/env_spf":
                            env_spf,
                            "stats/forward_spf":
                            forward_spf,
                            "stats/backward_spf":
                            backward_spf,
                            "stats/full_fps":
                            fps,
                            "stats/env_fps":
                            1.0 / (env_spf + 1e-10),
                            "stats/forward_fps":
                            1.0 / (forward_spf + 1e-10),
                            "stats/backward_fps":
                            1.0 / (backward_spf + 1e-10),
                            "episode/mean_rewards":
                            np.mean(episode_rewards),
                            "episode/median_rewards":
                            np.median(episode_rewards),
                            "episode/min_rewards":
                            np.min(episode_rewards),
                            "episode/max_rewards":
                            np.max(episode_rewards),
                            "episode/mean_lengths":
                            np.mean(episode_lengths),
                            "episode/median_lengths":
                            np.median(episode_lengths),
                            "episode/min_lengths":
                            np.min(episode_lengths),
                            "episode/max_lengths":
                            np.max(episode_lengths),
                        })
                    fps_timer[0] = time.time()
                    fps_timer[1] = total_num_steps
                    timers[:] = 0
                if self.shell_args.tensorboard:
                    log_dict.update({
                        "loss/action":
                        action_loss,
                        "loss/0_total":
                        total_loss,
                        "loss/visual/0_total":
                        visual_loss_total,
                        "loss/exploration/egomotion":
                        egomotion_loss,
                        "loss/exploration/forward_model":
                        forward_model_loss,
                    })
                    if self.shell_args.algo != "supervised":
                        log_dict.update({
                            "loss/entropy": dist_entropy,
                            "loss/value": value_loss
                        })
                    for key, val in visual_loss_dict.items():
                        log_dict["loss/visual/" + key] = val
                    self.logger.dict_log(log_dict, step=total_num_steps)

            # Fire evaluation when total_num_steps crosses an eval_interval
            # boundary (steps advance by num_processes * rollout_steps, so an
            # exact modulo-zero test could skip intervals).
            if self.shell_args.eval_interval is not None and total_num_steps % self.shell_args.eval_interval < (
                    self.shell_args.num_processes *
                    self.shell_args.num_forward_rollout_steps):
                self.save_checkpoint(-1, total_num_steps)
                self.set_log_iter(total_num_steps)
                self.evaluate_model()
                # reset the env datasets
                self.envs.unwrapped.call(
                    ["switch_dataset"] * self.shell_args.num_processes,
                    [("train", )] * self.shell_args.num_processes)
                obs = self.envs.reset()
                if self.compute_surface_normals:
                    obs["surface_normals"] = pt_util.depth_to_surface_normals(
                        obs["depth"].to(self.device))
                obs["prev_action_one_hot"] = obs[
                    "prev_action_one_hot"][:, ACTION_SPACE].to(torch.float32)
                if self.shell_args.algo == "supervised":
                    obs["best_next_action"] = pt_util.from_numpy(
                        obs["best_next_action"][:, ACTION_SPACE])
                self.rollouts.copy_obs(obs, 0)
                distances = pt_util.to_numpy_array(
                    obs["goal_geodesic_distance"])
                self.train_stats["start_geodesic_distance"][:] = distances
                previous_visual_features = None
                egomotion_pred = None
                prev_action = None
                prev_action_probs = None
    except Exception:
        # Catch exceptions so a final save can be performed.
        # BUGFIX: was a bare `except:`, which also swallowed
        # KeyboardInterrupt/SystemExit and made the run un-interruptible;
        # the `finally` below still guarantees the final checkpoint.
        import traceback
        traceback.print_exc()
    finally:
        self.save_checkpoint(-1, total_num_steps)
def main():
    """Train an A2C/PPO/ACKTR agent, optionally with an intrinsic-reward bonus
    from a temporal-difference module (TDM).

    Reads configuration from the module-level ``args`` namespace (argparse
    result) — NOTE(review): ``args`` is not a parameter; confirm it is defined
    at module scope in the original file.

    Side effects: creates vectorized envs, writes TensorBoard logs via
    ``Logger``, and saves model checkpoints under ``args.save_dir``.
    """
    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, args.add_timestep, device,
                         True)
    frame_skip = 4  # frame skip used by the TDM when measuring temporal distance

    if args.tb_dir[-1] != '/':
        args.tb_dir = args.tb_dir + '/'
    logger = Logger(args.tb_dir)
    logger.write_settings(args)

    if args.use_tdm:
        # Schedule for the intrinsic-reward weight beta(t).
        if args.beta_schedule == 'const':
            beta_func = lambda x: float(args.beta_int)
        elif args.beta_schedule == 'sqrt':
            beta_func = lambda x: 1. / np.sqrt(x + 2)
        elif args.beta_schedule == 'log':
            beta_func = lambda x: 1. / np.log(x + 2)
        elif args.beta_schedule == 'linear':
            beta_func = lambda x: 1. / (x + 2)

        # Bonus shaping applied to the predicted temporal distance.
        if args.bonus_func == 'linear':
            bonus_func = lambda x: x + 1
        elif args.bonus_func == 'square':
            bonus_func = lambda x: (x + 1)**2
        elif args.bonus_func == 'sqrt':
            bonus_func = lambda x: (x + 1)**(1 / 2)
        elif args.bonus_func == 'log':
            bonus_func = lambda x: np.log(x + 1)

        # Temporal-difference module: predicts how far apart two observations
        # are in time; its output drives the exploration bonus.
        tdm = TemporalDifferenceModule(
            inputSize=2 * int(envs.observation_space.shape[0]),
            outputSize=args.time_intervals,
            num_fc_layers=int(args.num_layers),
            depth_fc_layers=int(args.fc_width),
            lr=float(args.opt_lr),
            buffer_max_length=args.buffer_max_length,
            buffer_RL_ratio=args.buffer_RL_ratio,
            frame_skip=frame_skip,
            tdm_epoch=args.tdm_epoch,
            tdm_batchsize=args.tdm_batchsize,
            logger=logger,
            bonus_func=bonus_func).to(device)

        # Collect random trajectories to seed the TDM replay buffer.
        sample_collector = CollectSamples(envs, args.num_processes,
                                          initial=True)
        tdm.buffer_rand = sample_collector.collect_trajectories(
            args.num_rollouts, args.steps_per_rollout)

        # Initial TDM training on the random data.
        tdm.update()

    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)

    start = time.time()
    # FIX: num_updates was referenced below but never defined in this
    # function; compute it the same way as the sibling training scripts.
    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes
    for j in range(num_updates):

        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            if args.algo == "acktr":
                # use optimizer's learning rate since it's hard-coded in kfac.py
                update_linear_schedule(agent.optimizer, j, num_updates,
                                       agent.optimizer.lr)
            else:
                update_linear_schedule(agent.optimizer, j, num_updates,
                                       args.lr)

        if args.algo == 'ppo' and args.use_linear_clip_decay:
            agent.clip_param = args.clip_param * (1 - j / float(num_updates))

        # --- acting: collect one rollout of num_steps transitions ---
        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step],
                    rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs
            # envs.render()
            obs_old = obs.clone()
            obs, reward, done, infos = envs.step(action)

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])

            # Add intrinsic bonus: beta(t) * TDM-predicted temporal distance.
            if args.use_tdm:
                tdm.symm_eval = True if step == args.num_steps - 1 else False
                reward_int = tdm.compute_bonus(obs_old, obs).float()
                reward += beta_func(
                    step + j * args.num_steps) * reward_int.cpu().unsqueeze(1)

                if (j % args.log_interval == 0) and (step
                                                     == args.num_steps - 1):
                    logger.add_reward_intrinsic(
                        reward_int, (j + 1) * args.num_steps *
                        args.num_processes)

            # Save the transition into rollout storage.
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks)

        # Feed the fresh rollout to the TDM buffer and retrain it periodically.
        if (args.use_tdm):
            tdm.buffer_RL_temp.append((rollouts.obs, rollouts.masks))
            if (j % args.num_steps == 0 and j > 0):
                tdm.update()

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        # --- checkpointing: every 1M environment steps and at the end ---
        if (((j + 1) * args.num_steps * args.num_processes) % 1e6 == 0
                or j == num_updates - 1) and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [
                save_model,
                getattr(get_vec_normalize(envs), 'ob_rms', None)
            ]

            # FIX: the two branches were swapped — the final update now writes
            # "<env>_final.pt" and intermediate million-step checkpoints write
            # "<env>_step_<N>M.pt".  Integer division (// 10**6) avoids names
            # like "_step_1.0M.pt".
            if j == num_updates - 1:
                save_here = os.path.join(save_path,
                                         args.env_name + "_final.pt")
            else:
                save_here = os.path.join(
                    save_path, args.env_name + "_step_{}M.pt".format(
                        (j + 1) * args.num_steps * args.num_processes //
                        10**6))
            torch.save(save_model, save_here)  # saved policy.

        total_num_steps = (j + 1) * args.num_processes * args.num_steps

        # --- console + TensorBoard logging ---
        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            end = time.time()
            # NOTE: the trailing positional args (dist_entropy, value_loss,
            # action_loss) are not consumed by the format string; str.format
            # ignores extras, so this is harmless.
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist_entropy, value_loss,
                        action_loss))
            logger.add_reward(episode_rewards,
                              (j + 1) * args.num_steps * args.num_processes)

        # Persist the logger state every update so progress survives crashes.
        logger.save()
avg_win_rate = deque(maxlen=10) avg_cubes_placed_total = deque(maxlen=10) avg_player_dist_to_ref = deque(maxlen=10) avg_opponent_dist_to_ref = deque(maxlen=10) opponnet_policies = deque(maxlen=10) cached_stats = None start = time.time() num_updates = int(args.num_env_steps) // args.num_steps // args.num_processes for j in range(num_updates): if args.use_linear_lr_decay: # decrease learning rate linearly utils.update_linear_schedule( agent.optimizer, j, num_updates, agent.optimizer.lr if args.algo == "acktr" else args.lr ) for step in range(args.num_steps): # Sample actions with torch.no_grad(): value, action, action_log_prob, recurrent_hidden_states = actor_critic.act( rollouts.obs[step], rollouts.recurrent_hidden_states[step], rollouts.masks[step], deterministic=args.det, ) # Observe reward and next obs obs, reward, done, infos = envs.step(action) for idx, info in enumerate(infos):
def main():
    """Train an RL agent (A2C/PPO/ACKTR with a FILM-conditioned base) to
    control SIMCIM Ising-machine pump schedules for Max-Cut.

    Reads G-set benchmark graphs from ./data, logs JSON results next to the
    checkpoints, and writes diagnostic plots under ./plots.
    """
    # Output directory for the matplotlib diagnostics written each vis_interval.
    if not os.path.exists("./plots"):
        os.makedirs("./plots")
    # Reference best-known cut values for the benchmark graphs.
    gbench = read_gbench('./data/gbench.txt')

    args = my_get_args()
    print(args)

    # Simulator configuration assembled from CLI flags.
    # NOTE(review): lag=1000 // num_steps appears to tie the simulator lag to
    # episode length — confirm against the SIMCIM implementation.
    config = dict(sigma=args.sim_sigma,
                  momentum=args.sim_momentum,
                  pump_bins=args.sim_bins,
                  lag=1000 // args.num_steps,
                  rshift=args.sim_rshift,
                  pump_scale=args.sim_scale,
                  reward_kind=args.sim_reward,
                  continuous=args.sim_continuous,
                  span=args.sim_span,
                  percentile=args.sim_percentile,
                  last_runs=args.sim_perc_len,
                  add_linear=not args.sim_no_linear,
                  start_pump=args.sim_start,
                  static_features=not args.sim_no_static,
                  extra_features=not args.sim_no_extra,
                  curiosity_num=args.curiosity)

    # film_size is zeroed when static features are disabled (800 presumably
    # matches the problem size used below — TODO confirm).
    base_kwargs = {
        'hidden_size': args.hidden_size,
        'film_size': 800 * (not args.sim_no_static)
    }
    if args.relu:
        base_kwargs['activation'] = 'relu'
    base = FILMBase  #FILMBase

    # Either train on a single chosen G-set graph or on graphs 1-5.
    if args.gset > 0:
        test_graphs = [args.gset]
    else:
        test_graphs = [1, 2, 3, 4, 5]

    #---------------------------------------------------------
    assert args.algo in ['a2c', 'ppo', 'acktr']
    if args.recurrent_policy:
        assert args.algo in ['a2c', 'ppo'
                             ], 'Recurrent policy is not implemented for ACKTR'

    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes
    print('Num updates: ', num_updates)
    if args.dry_run:
        # Config/print only; no training.
        return

    # Seed every RNG for reproducibility.
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    # All scalar/series logging accumulates here and is dumped as JSON.
    logdata = defaultdict(list)

    # Training environments: fixed benchmark graphs, or randomly generated
    # 800-node instances when no specific gset was requested.
    if args.gset > 0:
        envs = []
        for g in test_graphs:
            g_ = read_gset('./data/G{}.txt'.format(g), negate=True)
            s = SIMCIM(g_,
                       device=device,
                       batch_size=args.num_processes,
                       **config)
            s.runpump()
            envs.append(s)
        envs = SIMCollection(envs, [gbench[g] for g in test_graphs])
        logdata['bls_bench'] = [gbench[g] for g in test_graphs]
    else:
        envs = SIMGeneratorRandom(800,
                                  0.06,
                                  args.num_processes,
                                  config,
                                  keep=args.sim_keep,
                                  n_sims=args.sim_nsim,
                                  device=device)

    # Fresh policy, or resume from a saved (actor_critic, ob_rms) snapshot.
    if args.snapshot is None:
        actor_critic = Policy(envs.observation_space.shape,
                              envs.action_space,
                              base=base,
                              base_kwargs=base_kwargs)
    else:
        actor_critic, _ = torch.load(
            os.path.join(args.save_dir, args.algo, args.snapshot + ".pt"))
    actor_critic.to(device)
    print(actor_critic)

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    print(rollouts.obs.shape, obs.shape)
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    # Separate evaluation environments always built on the benchmark graphs.
    eval_envs = []
    for g in test_graphs:
        g_ = read_gset('./data/G{}.txt'.format(g), negate=True)
        s = SIMCIM(g_,
                   device=device,
                   batch_size=args.num_val_processes,
                   **config)
        s.runpump()
        eval_envs.append(s)
    eval_envs = SIMCollection(eval_envs, [gbench[g] for g in test_graphs])

    # Baseline cut distributions from the initial pump run, for comparison.
    ref_cuts = [s.lastcuts for s in eval_envs.envs]
    logdata['ref_cuts'] = [e.tolist() for e in ref_cuts]
    stoch_cuts = None

    start = time.time()
    for j in range(num_updates):

        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            if args.algo == "acktr":
                # use optimizer's learning rate since it's hard-coded in kfac.py
                update_linear_schedule(agent.optimizer, j, num_updates,
                                       agent.optimizer.lr)
            else:
                update_linear_schedule(agent.optimizer, j, num_updates,
                                       args.lr)

        if args.algo == 'ppo' and args.use_linear_clip_decay:
            agent.clip_param = args.clip_param * (1 - j / float(num_updates))

        # ROLLOUT DATA
        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step],
                    rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            # Episodes in this vec env finish in lockstep, so checking the
            # first info is sufficient.
            if 'episode' in infos[0].keys():
                rw = np.mean([e['episode']['r'] for e in infos])
                logdata['episode_rewards'].append(rw.item())
                if args.gset > 0:
                    cuts = [e.lastcuts for e in envs.envs]
                    logdata['train_median'].append(
                        [np.median(e).item() for e in cuts])
                    logdata['train_max'].append(
                        [np.max(e).item() for e in cuts])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks)

        #UPDATE AGENT
        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        value_loss, action_loss, _ = agent.update(rollouts)
        logdata['alosses'].append(action_loss)
        logdata['vlosses'].append(value_loss)
        logdata['train_percentiles'].append(envs.perc.tolist())

        rollouts.after_update()

        #CHECKPOINTS
        # save for every interval-th episode or for the last epoch
        if (j % args.save_interval == 0
                or j == num_updates - 1) and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [
                save_model,
                getattr(get_vec_normalize(envs), 'ob_rms', None)
            ]

            torch.save(
                save_model,
                os.path.join(save_path,
                             args.env_name + '-' + str(j) + ".pt"))

        total_num_steps = (j + 1) * args.num_processes * args.num_steps

        #LOGGING
        if j % args.log_interval == 0 and len(logdata['episode_rewards']) > 1:
            end = time.time()
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: \
mean/median reward {:.3f}/{:.3f}, min/max reward {:.3f}/{:.3f}\n"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(logdata['episode_rewards']),
                        np.mean(logdata['episode_rewards'][-10:]),
                        np.median(logdata['episode_rewards'][-10:]),
                        np.min(logdata['episode_rewards'][-10:]),
                        np.max(logdata['episode_rewards'][-10:])))

        #EVALUATION
        if (args.eval_interval is not None and j % args.eval_interval == 0):
            # Pump-schedule trace is overwritten on each evaluation pass.
            logdata['spumps'] = []
            vec_norm = get_vec_normalize(eval_envs)
            if vec_norm is not None:
                # Freeze normalization stats and copy them from training envs.
                vec_norm.eval()
                vec_norm.ob_rms = get_vec_normalize(envs).ob_rms

            obs = eval_envs.reset()
            eval_recurrent_hidden_states = torch.zeros(
                args.num_val_processes,
                actor_critic.recurrent_hidden_state_size,
                device=device)
            eval_masks = torch.zeros(args.num_val_processes, 1, device=device)

            eval_done = False
            while not eval_done:
                # Record the first eval env's pump values for plotting.
                p = eval_envs.envs[0].old_p
                logdata['spumps'].append(p[:10].cpu().numpy().tolist())
                with torch.no_grad():
                    # NOTE(review): deterministic=False — evaluation is
                    # intentionally stochastic here ("stoch_cuts").
                    _, action, _, eval_recurrent_hidden_states = actor_critic.act(
                        obs,
                        eval_recurrent_hidden_states,
                        eval_masks,
                        deterministic=False)

                # Observe reward and next obs
                obs, reward, done, infos = eval_envs.step(action)
                eval_done = np.all(done)

                eval_masks = torch.tensor([[0.0] if done_ else [1.0]
                                           for done_ in done],
                                          dtype=torch.float32,
                                          device=device)

            stoch_cuts = [e.lastcuts for e in eval_envs.envs]
            logdata['stoch_cuts'] = [e.tolist() for e in stoch_cuts]
            logdata['eval_median'].append(
                [np.median(e).item() for e in stoch_cuts])
            logdata['eval_max'].append(
                [np.max(e).item() for e in stoch_cuts])
            logdata['test_percentiles'].append(eval_envs.perc.tolist())

            rw = np.mean([e['episode']['r'] for e in infos])
            logdata['eval_episode_rewards'].append(rw.item())

            print(" Evaluation using {} episodes: mean reward {:.5f}\n".format(
                len(logdata['eval_episode_rewards']),
                np.mean(logdata['eval_episode_rewards'])))

        # Dump accumulated metrics as JSON next to the checkpoints.
        # NOTE(review): save_path is first assigned inside the checkpoint
        # branch above; this relies on that branch running at j == 0.
        if j % args.log_interval == 0:
            fn = os.path.join(save_path, args.env_name + ".res")
            with open(fn, 'w') as f:
                json.dump(logdata, f, sort_keys=True, indent=2)

        #VISUALIZATION
        if j % args.vis_interval == 0:
            #if False:
            plt.figure(figsize=(15, 10))
            plt.subplot(231)
            plt.title('Rewards')
            plt.xlabel('SIM runs')
            plt.plot(logdata['episode_rewards'], c='r', label='mean train')
            plt.plot(np.linspace(0, len(logdata['episode_rewards']),
                                 len(logdata['eval_episode_rewards'])),
                     logdata['eval_episode_rewards'],
                     'b',
                     label='mean eval')
            plt.legend()
            plt.subplot(232)
            plt.plot(logdata['alosses'])
            plt.title('Policy loss')
            plt.subplot(233)
            plt.plot(logdata['vlosses'])
            plt.title('Value loss')
            plt.subplot(234)
            plt.title('Pumps')
            plt.xlabel('SIM iterations / 10')
            plt.plot(np.array(logdata['spumps']))
            plt.ylim(-0.05, 1.1)
            plt.subplot(235)
            plt.plot(logdata['train_percentiles'])
            plt.title('Train average percentile')
            plt.subplot(236)
            plt.title('Test percentiles')
            plt.plot(logdata['test_percentiles'])
            plt.legend([str(e) for e in test_graphs])
            plt.tight_layout()
            plt.savefig('./plots/agent_' + args.env_name + '.pdf')
            plt.clf()
            plt.close()
            gc.collect()
            #plt.show()

            # Histogram comparison: baseline pump cuts vs. the agent's
            # stochastic-eval cuts (only the part above the baseline minimum).
            if stoch_cuts is not None:
                fig, axs = plt.subplots(len(ref_cuts),
                                        1,
                                        sharex=False,
                                        tight_layout=True)
                if len(ref_cuts) == 1:
                    axs = [axs]
                for gi in range(len(ref_cuts)):
                    mn = min(ref_cuts[gi])
                    axs[gi].hist(ref_cuts[gi], bins=100, alpha=0.7)
                    dc = stoch_cuts[gi][stoch_cuts[gi] >= mn]
                    if dc.size > 0:
                        axs[gi].hist(dc, bins=100, alpha=0.7)
                plt.savefig('./plots/cuts_' + args.env_name + '.pdf')
                plt.clf()
                plt.close()
                gc.collect()
def main():
    """Train an A2C/PPO/ACKTR agent while optionally applying training-time
    adversarial attacks (white-box, black-box, random, targeted, FGSM) to
    rewards, observations, or actions, and optionally GAIL reward learning.

    Writes per-run logs, reward traces, stability radii, and target-policy
    agreement to text files under ``args.resdir``/``args.logdir``.
    """
    args = get_args()

    # Reproducibility / cuDNN determinism.
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    log_dir = os.path.expanduser(args.log_dir)
    eval_log_dir = log_dir + "_eval"
    utils.cleanup_log_dir(log_dir)
    utils.cleanup_log_dir(eval_log_dir)

    torch.set_num_threads(1)
    device = torch.device("cuda:" + str(args.cuda_id) if args.cuda else "cpu")

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, device, False)

    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)

    if args.gail:
        # GAIL discriminator needs a flat observation vector.
        assert len(envs.observation_space.shape) == 1
        discr = gail.Discriminator(
            envs.observation_space.shape[0] + envs.action_space.shape[0], 100,
            device)
        file_name = os.path.join(
            args.gail_experts_dir,
            "trajs_{}.pt".format(args.env_name.split('-')[0].lower()))

        expert_dataset = gail.ExpertDataset(file_name,
                                            num_trajectories=4,
                                            subsample_frequency=20)
        drop_last = len(expert_dataset) > args.gail_batch_size
        gail_train_loader = torch.utils.data.DataLoader(
            dataset=expert_dataset,
            batch_size=args.gail_batch_size,
            shuffle=True,
            drop_last=drop_last)

    ########## file related
    # Build a run-identifying file stem from env/algo/attack parameters.
    filename = args.env_name + "_" + args.algo + "_n" + str(args.max_episodes)
    if args.attack:
        filename += "_" + args.type + "_" + args.aim
        filename += "_s" + str(args.stepsize) + "_m" + str(
            args.maxiter) + "_r" + str(args.radius) + "_f" + str(args.frac)
    if args.run >= 0:
        filename += "_run" + str(args.run)

    logger = get_log(args.logdir + filename + "_" + current_time)
    logger.info(args)

    rew_file = open(args.resdir + filename + ".txt", "w")
    if args.compute:
        radius_file = open(
            args.resdir + filename + "_radius" + "_s" + str(args.stepsize) +
            "_m" + str(args.maxiter) + "_th" + str(args.dist_thres) + ".txt",
            "w")
    if args.type == "targ" or args.type == "fgsm":
        targ_file = open(args.resdir + filename + "_targ.txt", "w")

    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes

    # Attack budget is a fraction of the total updates: int(frac*num_updates).
    # NOTE(review): the first two branches are both plain `if` (not elif) —
    # harmless because the types are mutually exclusive, but worth confirming.
    if args.type == "wb":
        attack_net = WbAttacker(agent,
                                envs,
                                int(args.frac * num_updates),
                                num_updates,
                                args,
                                device=device)
    if args.type == "bb":
        attack_net = BbAttacker(agent,
                                envs,
                                int(args.frac * num_updates),
                                num_updates,
                                args,
                                device=device)
    elif args.type == "rand":
        attack_net = RandAttacker(envs,
                                  radius=args.radius,
                                  frac=args.frac,
                                  maxat=int(args.frac * num_updates),
                                  device=device)
    elif args.type == "semirand":
        attack_net = WbAttacker(agent,
                                envs,
                                int(args.frac * num_updates),
                                num_updates,
                                args,
                                device,
                                rand_select=True)
    elif args.type == "targ":
        # Target: last discrete action, or the zero vector for continuous.
        if isinstance(envs.action_space, Discrete):
            action_dim = envs.action_space.n
            target_policy = action_dim - 1
        elif isinstance(envs.action_space, Box):
            action_dim = envs.action_space.shape[0]
            target_policy = torch.zeros(action_dim)
        # target_policy[-1] = 1
        print("target policy is", target_policy)
        attack_net = TargAttacker(agent,
                                  envs,
                                  int(args.frac * num_updates),
                                  num_updates,
                                  target_policy,
                                  args,
                                  device=device)
    elif args.type == "fgsm":
        if isinstance(envs.action_space, Discrete):
            action_dim = envs.action_space.n
            target_policy = action_dim - 1
        elif isinstance(envs.action_space, Box):
            action_dim = envs.action_space.shape[0]
            target_policy = torch.zeros(action_dim)

        # FGSM attacker expects a callable policy; here it is constant.
        def targ_policy(obs):
            return target_policy

        attack_net = FGSMAttacker(envs,
                                  agent,
                                  targ_policy,
                                  radius=args.radius,
                                  frac=args.frac,
                                  maxat=int(args.frac * num_updates),
                                  device=device)
    # if args.aim == "obs" or aim == "hybrid":
    #     obs_space = gym.make(args.env_name).observation_space
    #     attack_net.set_obs_range(obs_space.low, obs_space.high)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)
    episode = 0

    start = time.time()
    for j in range(num_updates):

        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            utils.update_linear_schedule(
                agent.optimizer, j, num_updates,
                agent.optimizer.lr if args.algo == "acktr" else args.lr)

        for step in range(args.num_steps):
            # Sample actions
            if args.type == "fgsm":
                # FGSM perturbs the stored observation in place before acting.
                # print("before", rollouts.obs[step])
                rollouts.obs[step] = attack_net.attack(
                    rollouts.obs[step],
                    rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step]).clone()
                # print("after", rollouts.obs[step])
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step],
                    rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Track how closely the agent follows the attack's target policy.
            if args.type == "targ" or args.type == "fgsm":
                if isinstance(envs.action_space, Discrete):
                    num_target = (
                        action == target_policy).nonzero()[:, 0].size()[0]
                    targ_file.write(
                        str(num_target / args.num_processes) + "\n")
                    print("percentage of target:",
                          num_target / args.num_processes)
                elif isinstance(envs.action_space, Box):
                    target_action = target_policy.repeat(action.size()[0], 1)
                    targ_file.write(
                        str(
                            torch.norm(action - target_action).item() /
                            args.num_processes) + "\n")
                    print("percentage of target:",
                          torch.sum(action).item() / args.num_processes)

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action.cpu())

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])
                    # rew_file.write("episode: {}, total reward: {}\n".format(episode, info['episode']['r']))
                    episode += 1

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        if args.gail:
            if j >= 10:
                envs.venv.eval()

            gail_epoch = args.gail_epoch
            if j < 10:
                gail_epoch = 100  # Warm up
            for _ in range(gail_epoch):
                discr.update(gail_train_loader, rollouts,
                             utils.get_vec_normalize(envs)._obfilt)

            # Replace environment rewards with discriminator rewards.
            for step in range(args.num_steps):
                rollouts.rewards[step] = discr.predict_reward(
                    rollouts.obs[step], rollouts.actions[step], args.gamma,
                    rollouts.masks[step])

        # Non-FGSM attacks perturb the collected rollout (reward/obs/action)
        # before the policy update; before/after values go to the log.
        if args.attack and args.type != "fgsm":
            if args.aim == "reward":
                logger.info(rollouts.rewards.flatten())
                rollouts.rewards = attack_net.attack_r_general(
                    rollouts, next_value).clone().detach()
                logger.info("after attack")
                logger.info(rollouts.rewards.flatten())
            elif args.aim == "obs":
                origin = rollouts.obs.clone()
                rollouts.obs = attack_net.attack_s_general(
                    rollouts, next_value).clone().detach()
                logger.info(origin)
                logger.info("after")
                logger.info(rollouts.obs)
            elif args.aim == "action":
                origin = torch.flatten(rollouts.actions).clone()
                rollouts.actions = attack_net.attack_a_general(
                    rollouts, next_value).clone().detach()
                logger.info("attack value")
                logger.info(torch.flatten(rollouts.actions) - origin)
            elif args.aim == "hybrid":
                # Hybrid attacker decides per update which quantity to hit.
                res_aim, attack = attack_net.attack_hybrid(
                    rollouts, next_value, args.radius_s, args.radius_a,
                    args.radius_r)
                print("attack ", res_aim)
                if res_aim == "obs":
                    origin = rollouts.obs.clone()
                    rollouts.obs = attack.clone().detach()
                    logger.info(origin)
                    logger.info("attack obs")
                    logger.info(rollouts.obs)
                elif res_aim == "action":
                    origin = torch.flatten(rollouts.actions).clone()
                    rollouts.actions = attack.clone().detach()
                    logger.info("attack action")
                    logger.info(torch.flatten(rollouts.actions) - origin)
                elif res_aim == "reward":
                    logger.info(rollouts.rewards.flatten())
                    rollouts.rewards = attack.clone().detach()
                    logger.info("attack reward")
                    logger.info(rollouts.rewards.flatten())

        if args.compute:
            # Certified stability radius of the current update w.r.t. attack.
            stable_radius = attack_net.compute_radius(rollouts, next_value)
            print("stable radius:", stable_radius)
            radius_file.write("update: {}, radius: {}\n".format(
                j, np.round(stable_radius, decimals=3)))

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.gae_lambda, args.use_proper_time_limits)

        # Black-box attacker learns from the (possibly perturbed) rollout.
        if args.attack and args.type == "bb":
            attack_net.learning(rollouts)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        # save for every interval-th episode or for the last epoch
        if (j % args.save_interval == 0
                or j == num_updates - 1) and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            torch.save([
                actor_critic,
                getattr(utils.get_vec_normalize(envs), 'ob_rms', None)
            ], os.path.join(save_path, args.env_name + ".pt"))

        if j % args.log_interval == 0 and len(episode_rewards) >= 1:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            end = time.time()
            # NOTE: extra positional args after max(episode_rewards) are not
            # referenced by the format string; str.format ignores them.
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist_entropy, value_loss,
                        action_loss))
            rew_file.write("updates: {}, mean reward: {}\n".format(
                j, np.mean(episode_rewards)))

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            ob_rms = utils.get_vec_normalize(envs).ob_rms
            evaluate(actor_critic, ob_rms, args.env_name, args.seed,
                     args.num_processes, eval_log_dir, device)

        # if episode > args.max_episodes:
        #     print("reach episodes limit")
        #     break

    if args.attack:
        logger.info("total attacks: {}\n".format(attack_net.attack_num))
        print("total attacks: {}\n".format(attack_net.attack_num))

    # Close all result files opened above.
    rew_file.close()
    if args.compute:
        radius_file.close()
    if args.type == "targ" or args.type == "fgsm":
        targ_file.close()
def learn(env, max_timesteps, timesteps_per_batch, clip_param):
    """Train a PPO policy on `env` with mostly hard-coded hyperparameters.

    Args:
        env: environment id passed through to ``make_vec_envs``.
        max_timesteps: total number of environment steps to train for.
        timesteps_per_batch: rollout length per update (``num_step``).
        clip_param: PPO clipping parameter.

    Saves the model as ``./trained_models/ppo/UniversalPolicy.pt`` every
    ``save_interval`` updates and at the end of training.
    """
    # Fixed training hyperparameters (8 parallel processes throughout).
    ppo_epoch = 5
    num_step = timesteps_per_batch
    save_interval = 100
    seed = 1000
    batch_size = 64

    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    log_dir = os.path.expanduser('/tmp/gym/')
    eval_log_dir = log_dir + "_eval"
    utils.cleanup_log_dir(log_dir)
    utils.cleanup_log_dir(eval_log_dir)

    torch.set_num_threads(1)
    # NOTE(review): requires CUDA unconditionally — confirm intended.
    device = torch.device("cuda")

    envs = make_vec_envs(env, seed, 8, 0.95, log_dir, device, False)

    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          base_kwargs={'recurrent': False})
    actor_critic.to(device)

    agent = algo.PPO(actor_critic,
                     clip_param,
                     ppo_epoch,
                     batch_size,
                     0.5,
                     0.01,
                     lr=0.00025,
                     eps=1e-05,
                     max_grad_norm=0.5)

    rollouts = RolloutStorage(num_step, 8, envs.observation_space.shape,
                              envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(torch.tensor(obs))
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)

    start = time.time()
    num_updates = int(max_timesteps) // num_step // 8
    for j in range(num_updates):
        # decrease learning rate linearly
        utils.update_linear_schedule(agent.optimizer, j, num_updates, 0.00025)

        for step in range(num_step):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step],
                    rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        # GAE with gamma=0.99, lambda=0.95, no proper-time-limit handling.
        rollouts.compute_returns(next_value, True, 0.99, 0.95, False)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        # save for every interval-th episode or for the last epoch
        if (j % save_interval == 0
                or j == num_updates - 1) and "./trained_models/" != "":
            save_path = os.path.join("./trained_models/", 'ppo')
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            torch.save([
                actor_critic,
                getattr(utils.get_vec_normalize(envs), 'ob_rms', None)
            ], os.path.join(save_path, 'UniversalPolicy' + ".pt"))

        # Log every update (j % 1 == 0 is always true).
        if j % 1 == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * 8 * num_step
            end = time.time()
            # NOTE: trailing args beyond the format fields are ignored by
            # str.format.
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist_entropy, value_loss,
                        action_loss))

        # NOTE(review): `args` is not defined in this function — this branch
        # would raise NameError if eval_interval were set; it looks like this
        # evaluation block was pasted from a script-level main. Confirm.
        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            ob_rms = utils.get_vec_normalize(envs).ob_rms
            evaluate(actor_critic, ob_rms, args.env_name, args.seed,
                     args.num_processes, eval_log_dir, device)

    # NOTE(review): stray triple-quote below — likely the closing delimiter of
    # a commented-out block whose opener was lost in formatting; confirm
    # against the original source before removing.
    '''
def main():
    """Train a PPO agent on `args.env_name` with logging, periodic checkpoints,
    and optional convergence-based early stopping.

    Reads all hyperparameters from the command line via ``get_args()``.
    Side effects: creates log/save directories, writes checkpoint ``.pth``
    files and per-interval JSON logs, and prints progress to stdout.

    Fixes vs. the previous revision:
      * "save on last epoch" used ``j == epochs - 1`` although the loop runs
        ``range(1, epochs + 1)``, so the final epoch ``j == epochs`` was never
        treated as "last" — corrected to ``j == epochs``.
      * the save and convergence blocks computed ``np.mean``/``np.median`` on a
        possibly-empty ``episode_rewards`` deque (NaN filenames, and spurious
        ``since_improve`` increments before any episode finished) — both are
        now guarded by ``len(episode_rewards) > 0``.
    """
    args = get_args()

    # set seeds and devices
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    # Directory setup; default_args_init persists the run configuration.
    log_dir = utils.default_log_init(args.log_dir, args.env_name)
    save_dir = utils.default_save_init(log_dir, args.save_dir)
    args_file = utils.default_args_init(log_dir, args)
    threads_dir = log_dir + "threads/"
    os.makedirs(threads_dir)
    logger.configure(log_dir)
    print(log_dir)
    eval_log_dir = log_dir + "_eval"

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, threads_dir, device, False)
    # A sample action is used below to detect scalar (discrete) action spaces.
    action_sample = envs.action_space.sample()

    def init_alg():
        # Build the policy network, the PPO optimizer wrapper and the rollout
        # buffer in one place so they are always constructed consistently.
        actor_critic = Policy(envs.observation_space.shape,
                              envs.action_space,
                              base_kwargs={
                                  'recurrent': args.recurrent_policy
                              }).to(device)
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm,
                         weight_decay=args.weight_decay)
        rollouts = RolloutStorage(args.num_steps, args.num_processes,
                                  envs.observation_space.shape,
                                  envs.action_space,
                                  actor_critic.recurrent_hidden_state_size)
        return actor_critic, agent, rollouts

    actor_critic, agent, rollouts = init_alg()
    print(actor_critic)
    print(actor_critic.num_params)

    # init observations and rollouts
    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    # init train loggers (not useful for the actual training, but for analysis)
    episode_rewards = deque(maxlen=args.average_over)
    start_time = time.time()
    abs_start = start_time
    min_rewards = []
    max_rewards = []
    mean_rewards = []
    median_rewards = []
    nr_episodes = []
    times = []
    num_total_steps = []
    # log_dict holds references to the lists above, so appending to them
    # automatically keeps the JSON dump up to date.
    log_dict = {
        "min_rewards": min_rewards,
        "max_rewards": max_rewards,
        "mean_rewards": mean_rewards,
        "median_rewards": median_rewards,
        "nr_episodes": nr_episodes,
        "times": times,
        "num_total_steps": num_total_steps
    }

    # init convergence checks and other useful variables
    best_avg = -1e6
    best_med = -1e6
    since_improve = 0
    epochs = int(args.num_env_steps) // args.num_steps // args.num_processes
    for j in range(1, epochs + 1):
        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            utils.update_linear_schedule(
                agent.optimizer, j, epochs,
                agent.optimizer.lr if args.algo == "acktr" else args.lr)

        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs; discrete action spaces expect a
            # scalar action, hence the squeeze.
            if type(action_sample) is int:
                obs, reward, done, infos = envs.step(action.squeeze())
            else:
                obs, reward, done, infos = envs.step(action)

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.gae_lambda, args.use_proper_time_limits)
        value_loss, action_loss, dist_entropy = agent.update(rollouts)
        rollouts.after_update()

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            end_time = time.time()
            s_total = end_time - abs_start
            # NOTE: the format string consumes only the first 11 arguments;
            # dist_entropy/value_loss/action_loss are passed but not printed.
            print(
                "Updates(epochs) {}, num timesteps {}, elapsed {:01}:{:02}:{:02.2f} epoch seconds {} \n Last {} training episodes: "
                "mean/median reward {:.1f}/{:.1f},min/max reward {:.1f}/{:.1f}\n "
                .format(j, total_num_steps, int(s_total // 3600),
                        int(s_total % 3600 // 60), s_total % 60,
                        end_time - start_time, len(episode_rewards),
                        np.mean(episode_rewards), np.median(episode_rewards),
                        np.min(episode_rewards), np.max(episode_rewards),
                        dist_entropy, value_loss, action_loss),
                flush=True)
            min_rewards.append(np.min(episode_rewards))
            max_rewards.append(np.max(episode_rewards))
            mean_rewards.append(np.mean(episode_rewards))
            median_rewards.append(np.median(episode_rewards))
            nr_episodes.append(total_num_steps)
            times.append(end_time - start_time)
            num_total_steps.append(total_num_steps)
            start_time = end_time

        # Save every save_interval epochs and on the final epoch.
        # (fix: was `j == epochs - 1`, unreachable as "last" with this range)
        if (j % args.save_interval == 0
                or j == epochs) and save_dir != "" and len(episode_rewards) > 0:
            save_path = "{}it{}_val{:.1f}.pth".format(save_dir, j,
                                                      np.mean(episode_rewards))
            torch.save([
                actor_critic,
                getattr(utils.get_vec_normalize(envs), 'ob_rms', None)
            ], save_path)
            print("-------Saved at path {}-------\n".format(save_path))
            with open(save_dir + "it_{}_log.json".format(j), "w") as file:
                json.dump(log_dict, file)

        # Early stopping: stop when neither mean nor median improved for
        # `convergence_its` consecutive epochs. Guarded so that epochs before
        # the first finished episode do not count as "no improvement".
        if args.convergence_its != 0 and len(episode_rewards) > 0:
            worse = True
            if best_avg < np.mean(episode_rewards):
                best_avg = np.mean(episode_rewards)
                since_improve = 0
                worse = False
            if best_med < np.median(episode_rewards):
                best_med = np.median(episode_rewards)
                since_improve = 0
                worse = False
            if worse:
                since_improve += 1
            if since_improve > args.convergence_its:
                print(
                    "No improvements in {} iterations, best average is {}, best median is {}, stopping training"
                    .format(since_improve, best_avg, best_med))
                save_path = "{}it{}_val{:.1f}_c.pth".format(
                    save_dir, j, np.mean(episode_rewards))
                print("Saved final model at {}".format(save_path))
                torch.save([
                    actor_critic,
                    getattr(utils.get_vec_normalize(envs), 'ob_rms', None)
                ], save_path)
                return

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            ob_rms = utils.get_vec_normalize(envs).ob_rms
            evaluate(actor_critic, ob_rms, args.env_name, args.seed,
                     args.num_processes, eval_log_dir, device)
def main(base=IAMBase, num_frame_stack=None):
    """Train a PPO agent on the Warehouse-v0 environment with a configurable
    policy base (IAM by default).

    Args:
        base: policy base-network class passed to ``Policy``; when it is
            ``IAMBase`` the ``dset`` index list below is forwarded as a kwarg.
        num_frame_stack: optional frame-stacking depth forwarded to
            ``make_vec_envs`` (None leaves the wrapper's default).

    All remaining hyperparameters are hard-coded locals below.
    """
    # --- experiment / infrastructure settings ---
    seed = 1
    env_name = "Warehouse-v0"
    num_processes = 32
    log_dir = './logs/'
    eval_interval = None
    log_interval = 10
    use_linear_lr_decay = False
    use_proper_time_limits = False
    save_dir = './trained_models/'
    use_cuda = True
    # PPO
    gamma = 0.99  # reward discount factor
    clip_param = 0.1  #0.2
    ppo_epoch = 3  #4
    num_mini_batch = 32
    value_loss_coef = 1  #0.5
    entropy_coef = 0.01
    lr = 2.5e-4  #7e-4
    eps = 1e-5
    max_grad_norm = float('inf')  # effectively disables gradient clipping
    use_gae = True
    gae_lambda = 0.95
    num_steps = 8  #5 # Store
    num_env_steps = 4e6
    save_interval = 100
    # IAM
    # Observation indices handled by the IAM base.
    # NOTE(review): presumably index 0 plus the 49-72 slice of the flattened
    # observation — confirm against the Warehouse observation layout.
    dset = [
        0, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65,
        66, 67, 68, 69, 70, 71, 72
    ]
    #gym.envs.register(env_name, entry_point="environments.warehouse.warehouse:Warehouse",
    #                  kwargs={'seed': seed, 'parameters': {"num_frames": 1}})

    # Seeding and device selection.
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    log_dir = os.path.expanduser(log_dir)
    eval_log_dir = log_dir + "_eval"
    utils.cleanup_log_dir(log_dir)
    utils.cleanup_log_dir(eval_log_dir)
    torch.set_num_threads(1)
    device = torch.device("cuda:0" if use_cuda else "cpu")

    envs = make_vec_envs(env_name,
                         seed,
                         num_processes,
                         gamma,
                         log_dir,
                         device,
                         False,
                         num_frame_stack=num_frame_stack)

    # The dset kwarg is only meaningful for IAMBase; other bases get no kwargs.
    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          base=base,
                          base_kwargs=({
                              'dset': dset
                          } if base == IAMBase else {}))
    actor_critic.to(device)

    agent = algo.PPO(actor_critic,
                     clip_param,
                     ppo_epoch,
                     num_mini_batch,
                     value_loss_coef,
                     entropy_coef,
                     lr=lr,
                     eps=eps,
                     max_grad_norm=max_grad_norm)

    rollouts = RolloutStorage(num_steps, num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    # Rolling window of the most recent finished-episode returns.
    episode_rewards = deque(maxlen=10)
    start = time.time()
    num_updates = int(num_env_steps) // num_steps // num_processes
    for j in range(num_updates):
        if use_linear_lr_decay:
            # decrease learning rate linearly
            utils.update_linear_schedule(agent.optimizer, j, num_updates, lr)

        for step in range(num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Obser reward and next obs
            obs, reward, done, infos = envs.step(action)

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            # bad_masks flags episodes cut off by a time limit rather than a
            # true terminal state.
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks)

        # Bootstrap value for the state after the last collected step.
        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, use_gae, gamma, gae_lambda,
                                 use_proper_time_limits)
        value_loss, action_loss, dist_entropy = agent.update(rollouts)
        rollouts.after_update()

        # save for every interval-th episode or for the last epoch
        if (j % save_interval == 0 or j == num_updates - 1) and save_dir != "":
            save_path = os.path.join(save_dir, 'PPO')
            try:
                os.makedirs(save_path)
            except OSError:
                pass  # directory already exists
            torch.save([
                actor_critic,
                getattr(utils.get_vec_normalize(envs), 'obs_rms', None)
            ], os.path.join(save_path, env_name + ".pt"))

        if j % log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * num_processes * num_steps
            end = time.time()
            # NOTE: the format string consumes the first 8 arguments only;
            # dist_entropy/value_loss/action_loss are passed but not printed.
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist_entropy, value_loss,
                        action_loss))

        if (eval_interval is not None and len(episode_rewards) > 1
                and j % eval_interval == 0):
            obs_rms = utils.get_vec_normalize(envs).obs_rms
            evaluate(actor_critic, obs_rms, env_name, seed, num_processes,
                     eval_log_dir, device)
def train_loop(agent,
               envs,
               env_name,
               num_updates,
               num_steps,
               curiosity_module=None,
               save_interval=None,
               eval_interval=None,
               log_interval=None,
               time_limit=1000,
               curiosity_rew_after=0,
               curiosity_rew_before=None,
               use_linear_lr_decay=True,
               lr_decay_horizon=None,
               callbacks=None,
               use_gae=True,
               gamma=0.99,
               gae_lambda=0.95,
               use_proper_time_limits=False,
               save_dir=""):
    """Generic on-policy training loop with optional curiosity-driven
    intrinsic rewards.

    Args:
        agent: wrapper exposing ``actor_critic``, ``optimizer`` and
            ``update(rollouts)``.
        envs: vectorized environments (``num_envs`` parallel workers).
        env_name: used only to name the checkpoint file.
        num_updates / num_steps: number of policy updates and env steps
            collected per update.
        curiosity_module: optional module providing ``get_reward`` and
            ``update``; when set, intrinsic rewards are added to the extrinsic
            ones for updates in ``[curiosity_rew_after, curiosity_rew_before]``.
        save_interval / eval_interval / log_interval: periods in updates;
            ``None`` disables the corresponding action.
        use_gae, gamma, gae_lambda, use_proper_time_limits, save_dir: return
            computation settings and checkpoint directory. These were
            previously free (undefined) names inside the loop — a latent
            NameError — and are now proper keyword parameters with the usual
            defaults.

    Returns:
        The populated ``TrainStatistics`` object.
    """
    # Create rollout storage
    num_processes = envs.num_envs
    rollouts = RolloutStorage(num_steps, num_processes,
                              envs.observation_space.shape, envs.action_space,
                              agent.actor_critic.recurrent_hidden_state_size)
    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    device = next(agent.actor_critic.parameters()).device
    rollouts.to(device)

    # Create curiosity statistics saver
    if curiosity_module is not None:
        curiosity_stats = CuriosityStatistics(num_processes=num_processes,
                                              time_limit=time_limit)
    else:
        curiosity_stats = None

    # Create train statistics saver
    stats = TrainStatistics(log_interval,
                            with_curiosity=(curiosity_module is not None))

    # Possibility to use curiosity only for a few epochs
    if curiosity_rew_before is None:
        curiosity_rew_before = num_updates

    # Train loop
    start = time.time()
    for j in range(num_updates):
        if use_linear_lr_decay:
            if lr_decay_horizon is None:
                lr_decay_horizon = num_updates
            utils.update_linear_schedule(agent.optimizer, j, lr_decay_horizon,
                                         agent.optimizer.defaults['lr'])

        curiosity_loss = 0
        for step in range(num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = agent.actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe extrinsic reward and next obs
            obs, reward, done, infos = envs.step(action)

            # Compute intrinsic rewards
            if curiosity_module is not None:
                # Episodes ended by the time limit carry 'bad_transition';
                # optionally exclude them from the curiosity "done" signal.
                time_limit_mask = np.array([
                    0 if 'bad_transition' in info.keys() else 1
                    for info in infos
                ])
                if use_proper_time_limits:
                    curiosity_done = done * time_limit_mask
                else:
                    curiosity_done = done
                curiosity_reward = curiosity_module.get_reward(
                    rollouts.obs[step], action, obs, curiosity_done)
                curiosity_loss += curiosity_module.update(
                    rollouts.obs[step], action, obs, curiosity_done)

            # Update current reward statistics
            stats.update_extrinsic_reward(infos)
            if curiosity_module is not None:
                curiosity_stats.update(curiosity_reward.cpu().numpy().ravel(),
                                       done)

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])

            # Mix intrinsic reward into the training signal only inside the
            # configured update window.
            if (curiosity_module is not None) and (
                    j >= curiosity_rew_after and j <= curiosity_rew_before):
                reward = reward + curiosity_reward
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks)

        with torch.no_grad():
            next_value = agent.actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, use_gae, gamma, gae_lambda,
                                 use_proper_time_limits)
        value_loss, action_loss, dist_entropy = agent.update(rollouts)
        rollouts.after_update()

        if curiosity_module is not None:
            curiosity_loss /= num_steps
        else:
            curiosity_loss = None
        stats.update_losses(value_loss, action_loss, dist_entropy,
                            curiosity_loss)

        # Save for every interval-th episode or for the last epoch.
        # (fix: guard against save_interval=None, the default)
        if save_interval is not None and (
                j % save_interval == 0
                or j == num_updates - 1) and save_dir != "":
            save_path = os.path.join(save_dir, "ppo")
            try:
                os.makedirs(save_path)
            except OSError:
                pass  # directory already exists
            torch.save([
                agent.actor_critic,
                getattr(utils.get_vec_normalize(envs), 'ob_rms', None)
            ], os.path.join(save_path, env_name + ".pt"))

        # (fix: guard against eval_interval=None, the default)
        if eval_interval is not None and j % eval_interval == 0:
            try:
                ob_rms = utils.get_vec_normalize(envs).ob_rms
            except AttributeError:
                # envs are not wrapped in VecNormalize
                ob_rms = None
            stats.eval_episode_rewards.append(
                evaluate(agent.actor_critic, env_name, device, ob_rms))

        # (fix: guard against log_interval=None, the default)
        if log_interval is not None and j % log_interval == 0 and len(
                stats.episode_rewards) > 1:
            total_num_steps = (j + 1) * num_processes * num_steps
            end = time.time()
            stats.update_log(total_num_steps, curiosity_stats)
            if callbacks is not None:
                for callback in callbacks:
                    callback(stats,
                             agent,
                             n_updates=j,
                             total_n_steps=total_num_steps,
                             fps=int(total_num_steps / (end - start)))
    return stats
def run(self):
    """Run the full training loop configured by ``self.args`` and
    ``self.config_parameters``.

    Builds the vectorized envs and IAM model, selects the algorithm
    (a2c / ppo / acktr), optionally sets up GAIL imitation, then alternates
    rollout collection and policy updates, periodically saving checkpoints,
    appending CSV-style rows to ``self.data_saver`` and running evaluation.
    """
    args = self.args
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    print("CUDA is available: ", torch.cuda.is_available())
    if args.cuda:
        print("CUDA enabled")
        # Deterministic cuDNN for reproducibility.
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True
    else:
        if args.cuda_deterministic:
            print("Warning CUDA is requested but is not available")
        else:
            print("CUDA disabled")
    log_dir = os.path.expanduser(args.log_dir)
    eval_log_dir = log_dir + "_eval"
    utils.cleanup_log_dir(log_dir)
    utils.cleanup_log_dir(eval_log_dir)
    print("get_num_thread", torch.get_num_threads())
    device = torch.device("cuda:0" if args.cuda else "cpu")

    envs = make_vec_envs(args.env_name, self.config_parameters, args.seed,
                         args.num_processes, args.gamma, args.log_dir, device,
                         False)
    actor_critic = create_IAM_model(envs, args, self.config_parameters)
    actor_critic.to(device)

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    # This algorithm should be used for the reproduction project.
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)

    if args.gail:
        # GAIL only supports flat (1-D) observations here.
        assert len(envs.observation_space.shape) == 1
        discr = gail.Discriminator(
            envs.observation_space.shape[0] + envs.action_space.shape[0], 100,
            device)
        file_name = os.path.join(
            args.gail_experts_dir,
            "trajs_{}.pt".format(args.env_name.split('-')[0].lower()))
        expert_dataset = gail.ExpertDataset(file_name,
                                            num_trajectories=4,
                                            subsample_frequency=20)
        drop_last = len(expert_dataset) > args.gail_batch_size
        gail_train_loader = torch.utils.data.DataLoader(
            dataset=expert_dataset,
            batch_size=args.gail_batch_size,
            shuffle=True,
            drop_last=drop_last)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    # Always return the average of the last 100 steps. This means the average is sampled.
    episode_rewards = deque(maxlen=100)

    start = time.time()
    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes
    for j in range(num_updates):

        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            utils.update_linear_schedule(
                agent.optimizer, j, num_updates,
                agent.optimizer.lr if args.algo == "acktr" else args.lr)

        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Obser reward and next obs
            obs, reward, done, infos = envs.step(action)

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            # bad_masks marks time-limit truncations ('bad_transition').
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        if args.gail:
            # After a warm-up period, freeze the obs normalizer statistics.
            if j >= 10:
                envs.venv.eval()
            gail_epoch = args.gail_epoch
            if j < 10:
                gail_epoch = 100  # Warm up
            for _ in range(gail_epoch):
                discr.update(gail_train_loader, rollouts,
                             utils.get_vec_normalize(envs)._obfilt)
            # Replace environment rewards with discriminator rewards.
            for step in range(args.num_steps):
                rollouts.rewards[step] = discr.predict_reward(
                    rollouts.obs[step], rollouts.actions[step], args.gamma,
                    rollouts.masks[step])

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.gae_lambda, args.use_proper_time_limits)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        # save for every interval-th episode or for the last epoch
        if (j % args.save_interval == 0
                or j == num_updates - 1) and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass  # directory already exists
            torch.save([
                actor_critic,
                getattr(utils.get_vec_normalize(envs), 'obs_rms', None)
            ], os.path.join(save_path, self.model_file_name))

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            end = time.time()
            elapsed_time = end - start
            data = [
                j,  # Updates
                total_num_steps,  # timesteps
                int(total_num_steps / elapsed_time),  # FPS
                len(episode_rewards),  # Only useful for print statement
                np.mean(episode_rewards),  # mean of rewards
                np.median(episode_rewards),  # median of rewards
                np.min(episode_rewards),  # min rewards
                np.max(episode_rewards),  # max rewards
                dist_entropy,
                value_loss,
                action_loss,
                elapsed_time
            ]
            # Persist one comma-separated row per log interval.
            output = ''.join([str(x) + ',' for x in data])
            self.data_saver.append(output)
            print(
                f"Updates {data[0]}, num timesteps {data[1]}, FPS {data[2]}, elapsed time {int(data[11])} sec. Last {data[3]} training episodes: mean/median reward {data[4]:.2f}/{data[5]:.2f}, min/max reward {data[6]:.1f}/{data[7]:.1f}",
                end="\r")

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            obs_rms = utils.get_vec_normalize(envs).obs_rms
            evaluate(actor_critic, obs_rms, args.env_name, args.seed,
                     args.num_processes, eval_log_dir, device)
def main():
    """Train a GNN-based policy on MiniWoB web tasks.

    Observations are stored as integer receipts (via ``StorageReceipt``) and
    redeemed to graph observations only when fed to the network. Optionally
    bootstraps the DOM encoder from a pretrained autoencoder and supports GAIL
    from replayed demonstrations.
    """
    args = get_args()
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    log_dir = os.path.expanduser(args.log_dir)
    eval_log_dir = log_dir + "_eval"
    utils.cleanup_log_dir(log_dir)
    utils.cleanup_log_dir(eval_log_dir)

    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    # Maps integer receipts to full graph observations.
    receipts = StorageReceipt()
    make_env = lambda tasks: MiniWoBGraphEnvironment(
        base_url=os.environ.get("BASE_URL", f"file://{MINIWOB_HTML}/"),
        levels=tasks,
        level_tracker=LevelTracker(tasks),
        wait_ms=500,
    )

    task = args.env_name
    # Default Atari env name is repurposed as an alias for click-button.
    if args.env_name == "PongNoFrameskip-v4":
        args.env_name = "clickbutton"
        task = "miniwob/click-button.html"
    if task == "levels":
        tasks = MINIWOB_CHALLENGES
    else:
        tasks = [[task]]
    print("Selected tasks:", tasks)
    # The policy emits a single node-selection objective, not the raw
    # environment action space.
    NUM_ACTIONS = 1
    envs = make_vec_envs(
        [make_env(tasks[i % len(tasks)]) for i in range(args.num_processes)],
        receipts)

    # Optionally reuse a pretrained, frozen DOM encoder.
    if os.path.exists("./datadir/autoencoder.pt"):
        dom_autoencoder = torch.load("./datadir/autoencoder.pt")
        dom_encoder = dom_autoencoder.encoder
        for param in dom_encoder.parameters():
            param.requires_grad = False
    else:
        print("No dom encoder")
        dom_encoder = None
    actor_critic = Policy(
        envs.observation_space.shape,
        gym.spaces.Discrete(NUM_ACTIONS),  # envs.action_space,
        base=GNNBase,
        base_kwargs={
            "dom_encoder": dom_encoder,
            "recurrent": args.recurrent_policy
        },
    )
    # Replace the default distribution head with node-level selection.
    actor_critic.dist = NodeObjective()
    actor_critic.to(device)

    if args.algo == "a2c":
        agent = algo.A2C_ACKTR(
            actor_critic,
            args.value_loss_coef,
            args.entropy_coef,
            lr=args.lr,
            eps=args.eps,
            alpha=args.alpha,
            max_grad_norm=args.max_grad_norm,
        )
    elif args.algo == "ppo":
        agent = algo.PPO(
            actor_critic,
            args.clip_param,
            args.ppo_epoch,
            args.num_mini_batch,
            args.value_loss_coef,
            args.entropy_coef,
            lr=args.lr,
            eps=args.eps,
            max_grad_norm=args.max_grad_norm,
        )
    elif args.algo == "acktr":
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)

    if args.gail:
        assert len(envs.observation_space.shape) == 1
        discr = gail.Discriminator(envs.observation_space.shape[0], 100,
                                   device)
        rr = ReplayRepository("/code/miniwob-plusplus-demos/*turk/*")
        ds = rr.get_dataset()
        print("GAIL Replay Dataset", ds)
        gail_train_loader = torch_geometric.data.DataLoader(
            ds, batch_size=args.gail_batch_size, shuffle=True, drop_last=True)

    from tensorboardX import SummaryWriter
    import datetime
    ts_str = datetime.datetime.fromtimestamp(
        time.time()).strftime("%Y-%m-%d_%H-%M-%S")
    tensorboard_writer = SummaryWriter(
        log_dir=os.path.join("/tmp/log", ts_str))

    # Rollouts store 1-element receipts instead of full observations.
    rollouts = ReceiptRolloutStorage(
        args.num_steps,
        args.num_processes,
        (1, ),  # envs.observation_space.shape,
        envs.action_space,
        actor_critic.recurrent_hidden_state_size,
        receipts,
    )

    # resume from last save
    # NOTE(review): the `if False and ...` guard permanently disables model
    # resuming; only the save directory is created here.
    if args.save_dir != "":
        save_path = os.path.join(args.save_dir, args.algo)
        try:
            os.makedirs(save_path)
        except OSError:
            pass  # directory already exists
        model_path = os.path.join(save_path, args.env_name + ".pt")
        if False and os.path.exists(model_path):
            print("Loadng previous model:", model_path)
            actor_critic = torch.load(model_path)
            actor_critic.train()

    obs = envs.reset()
    rollouts.obs[0].copy_(torch.tensor(obs))
    rollouts.to(device)

    start = time.time()
    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes
    print("Iterations:", num_updates, args.num_steps)
    for j in range(num_updates):
        # Per-update reward window (reset each update on purpose).
        episode_rewards = deque(maxlen=args.num_steps * args.num_processes)
        # If the browser workers stalled for >5s since the last action,
        # assume a timeout and hard-reset the episode state.
        if j and last_action_time + 5 < time.time():
            # task likely timed out
            print("Reseting tasks")
            obs = envs.reset()
            rollouts.obs[0].copy_(torch.tensor(obs))
            rollouts.recurrent_hidden_states[0].copy_(
                torch.zeros_like(rollouts.recurrent_hidden_states[0]))
            rollouts.masks[0].copy_(torch.zeros_like(rollouts.masks[0]))

        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            utils.update_linear_schedule(
                agent.optimizer,
                j,
                num_updates,
                agent.optimizer.lr if args.algo == "acktr" else args.lr,
            )

        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    receipts.redeem(rollouts.obs[step]),
                    rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step],
                )

            # Obser reward and next obs
            last_action_time = time.time()
            obs, reward, done, infos = envs.step(action)

            # The env may substitute the executed action; store what actually
            # ran so the update uses consistent (s, a) pairs.
            for e, i in enumerate(infos):
                if i.get("real_action") is not None:
                    action[e] = i["real_action"]
                if i.get("bad_transition"):
                    action[e] = torch.zeros_like(action[e])

            for info in infos:
                if "episode" in info.keys():
                    episode_rewards.append(info["episode"]["r"])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if "bad_transition" in info.keys() else [1.0]
                 for info in infos])
            rollouts.insert(
                torch.tensor(obs),
                recurrent_hidden_states,
                action,
                action_log_prob,
                value,
                torch.tensor(reward).unsqueeze(1),
                masks,
                bad_masks,
            )

        with torch.no_grad():
            next_value = actor_critic.get_value(
                receipts.redeem(rollouts.obs[-1]),
                rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1],
            ).detach()

        if args.gail:
            # if j >= 10:
            #     envs.venv.eval()
            gail_epoch = args.gail_epoch
            if j < 10:
                gail_epoch = 100  # Warm up
            for _ in range(gail_epoch):
                # Identity obs filter: receipts are not normalizable tensors.
                obsfilt = lambda x, update: x  # utils.get_vec_normalize(envs)._obfilt
                gl = discr.update(gail_train_loader, rollouts, obsfilt)
                print("Gail loss:", gl)
            for step in range(args.num_steps):
                rollouts.rewards[step] = discr.predict_reward(
                    receipts.redeem(rollouts.obs[step]),
                    rollouts.actions[step],
                    args.gamma,
                    rollouts.masks[step],
                )

        rollouts.compute_returns(
            next_value,
            args.use_gae,
            args.gamma,
            args.gae_lambda,
            args.use_proper_time_limits,
        )

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        # NOTE(review): this random 32-receipt subsample of `obs` is never
        # used afterwards — looks like leftover debugging code; confirm.
        obs_shape = rollouts.obs.size()[2:]
        obs = rollouts.obs[:-1].view(-1, *obs_shape)
        obs = obs[torch.randint(0, obs.size(0), (1, 32))]

        rollouts.after_update()
        # Drop stored observations whose receipts are no longer referenced.
        receipts.prune(rollouts.obs)

        # save for every interval-th episode or for the last epoch
        if (j % args.save_interval == 0
                or j == num_updates - 1) and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass  # directory already exists
            model_path = os.path.join(save_path, args.env_name + ".pt")
            torch.save(actor_critic, model_path)
            print("Saved model:", model_path)

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            end = time.time()
            # NOTE: format string consumes the first 8 args only; the loss
            # values are passed but not printed.
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n"
                .format(
                    j,
                    total_num_steps,
                    int(total_num_steps / (end - start)),
                    len(episode_rewards),
                    np.mean(episode_rewards),
                    np.median(episode_rewards),
                    np.min(episode_rewards),
                    np.max(episode_rewards),
                    dist_entropy,
                    value_loss,
                    action_loss,
                ))
            from pprint import pprint
            pprint(LevelTracker.global_scoreboard)

            # tensorboard_writer.add_histogram(
            #     "task_ranks", torch.tensor(predictor._difficulty_rank), total_num_steps
            # )
            tensorboard_writer.add_histogram("value", value, total_num_steps)
            tensorboard_writer.add_histogram("x", actor_critic.base.last_x,
                                             total_num_steps)
            tensorboard_writer.add_histogram("query",
                                             actor_critic.base.last_query,
                                             total_num_steps)
            tensorboard_writer.add_histogram("inputs_at",
                                             actor_critic.base.last_inputs_at,
                                             total_num_steps)
            tensorboard_writer.add_scalar("mean_reward",
                                          np.mean(episode_rewards),
                                          total_num_steps)
            tensorboard_writer.add_scalar("median_reward",
                                          np.median(episode_rewards),
                                          total_num_steps)
            tensorboard_writer.add_scalar("min_reward",
                                          np.min(episode_rewards),
                                          total_num_steps)
            tensorboard_writer.add_scalar("max_reward",
                                          np.max(episode_rewards),
                                          total_num_steps)
            tensorboard_writer.add_scalar("dist_entropy", dist_entropy,
                                          total_num_steps)
            tensorboard_writer.add_scalar("value_loss", value_loss,
                                          total_num_steps)
            tensorboard_writer.add_scalar("action_loss", action_loss,
                                          total_num_steps)

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            ob_rms = utils.get_vec_normalize(envs).ob_rms
            evaluate(
                actor_critic,
                ob_rms,
                args.env_name,
                args.seed,
                args.num_processes,
                eval_log_dir,
                device,
            )
def main():
    """Train an actor-critic agent whose policy additionally consumes a
    tokenized execution trace ("tobs") alongside the regular observation.

    Traces come from ``info['trace']`` produced by the environment; each trace
    is tokenized and padded/truncated to ``args.trace_size`` before being
    stacked into a (num_processes, trace_size) long tensor.
    """
    args = get_args()
    trace_size = args.trace_size
    toke = tokenizer()
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    log_dir = os.path.expanduser(args.log_dir)
    eval_log_dir = log_dir + "_eval"
    utils.cleanup_log_dir(log_dir)
    utils.cleanup_log_dir(eval_log_dir)

    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, device, False)

    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    # Initial trace observation: all zeros until the first real traces arrive.
    tobs = torch.zeros((args.num_processes, trace_size), dtype=torch.long)
    #print (tobs.dtype)
    rollouts.obs[0].copy_(obs)
    rollouts.tobs[0].copy_(tobs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)

    start = time.time()
    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes
    for j in range(num_updates):

        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            utils.update_linear_schedule(
                agent.optimizer, j, num_updates,
                agent.optimizer.lr if args.algo == "acktr" else args.lr)

        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.tobs[step],
                    rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Obser reward and next obs
            obs, reward, done, infos = envs.step(action)
            tobs = []
            envs.render()
            # NOTE(review): indentation reconstructed from a collapsed
            # source. Trace handling must run for every info (one seq per
            # process) or the torch.stack below would fail — confirm against
            # the original file.
            for info in infos:
                if 'episode' in info.keys():
                    #print ("episode ", info['episode'])
                    episode_rewards.append(info['episode']['r'])
                trace = info['trace'][0:trace_size]
                trace = [x[2] for x in trace]
                word_to_ix = toke.tokenize(trace)
                seq = prepare_sequence(trace, word_to_ix)
                # NOTE(review): short traces are replaced by an all-zero
                # sequence (the actual tokens are discarded), not padded.
                if len(seq) < trace_size:
                    seq = torch.zeros((trace_size), dtype=torch.long)
                seq = seq[:trace_size]
                #print (seq.dtype)
                tobs.append(seq)
            tobs = torch.stack(tobs)
            #print (tobs)
            #print (tobs.size())

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            rollouts.insert(obs, tobs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.tobs[-1],
                rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.gae_lambda, args.use_proper_time_limits)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        # save for every interval-th episode or for the last epoch
        if (j % args.save_interval == 0
                or j == num_updates - 1) and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass  # directory already exists
            torch.save([
                actor_critic,
                getattr(utils.get_vec_normalize(envs), 'ob_rms', None)
            ], os.path.join(save_path, args.env_name + ".pt"))

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            end = time.time()
            # NOTE: format string consumes the first 8 args only; the loss
            # values are passed but not printed.
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist_entropy, value_loss,
                        action_loss))

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            ob_rms = utils.get_vec_normalize(envs).ob_rms
            evaluate(actor_critic, ob_rms, args.env_name, args.seed,
                     args.num_processes, eval_log_dir, device)
def main():
    """Pretrain an RL policy (A2C/PPO/ACKTR), optionally replacing env rewards
    with intrinsic rewards from a previously trained ICM or VAE module.

    Reads all configuration from ``get_args()``; saves the policy and the
    observation-normalization stats to ``models_dir`` and logs progress to
    stdout. No return value.
    """
    args = get_args()

    # Seed both CPU and all CUDA devices for reproducibility.
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
        # Trade cudnn autotuning speed for deterministic kernels.
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    args_dir, logs_dir, models_dir, samples_dir = get_all_save_paths(
        args, 'pretrain', combine_action=args.combine_action)
    eval_log_dir = logs_dir + "_eval"
    utils.cleanup_log_dir(logs_dir)
    utils.cleanup_log_dir(eval_log_dir)

    # Locate the pretrained intrinsic-reward model saved by the
    # 'learn_reward' phase.
    _, _, intrinsic_models_dir, _ = get_all_save_paths(args,
                                                       'learn_reward',
                                                       load_only=True)
    if args.load_iter != 'final':
        intrinsic_model_file_name = os.path.join(
            intrinsic_models_dir,
            args.env_name + '_{}.pt'.format(args.load_iter))
    else:
        # NOTE(review): '.pt'.format(args.load_iter) is a no-op — '.pt' has no
        # placeholder. Presumably the plain '.pt' suffix is intended; confirm.
        intrinsic_model_file_name = os.path.join(
            intrinsic_models_dir, args.env_name + '.pt'.format(args.load_iter))
    intrinsic_arg_file_name = os.path.join(args_dir, 'command.txt')

    # save args to arg_file
    with open(intrinsic_arg_file_name, 'w') as f:
        json.dump(args.__dict__, f, indent=2)

    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, logs_dir, device, False)

    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)

    # Build the learner for the selected algorithm.
    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)
    else:
        raise NotImplementedError

    if args.use_intrinsic:
        obs_shape = envs.observation_space.shape
        # NOTE(review): action_dim is computed for both image (discrete) and
        # vector (continuous) observations but is not used later in this
        # function; it is unbound for other obs ranks — confirm intent.
        if len(obs_shape) == 3:
            action_dim = envs.action_space.n
        elif len(obs_shape) == 1:
            action_dim = envs.action_space.shape[0]

        # Expert-trajectory file name follows the env naming convention
        # (Atari "NoFrameskip" suffix stripped for the file name).
        if 'NoFrameskip' in args.env_name:
            file_name = os.path.join(
                args.experts_dir, "trajs_ppo_{}.pt".format(
                    args.env_name.split('-')[0].replace('NoFrameskip',
                                                        '').lower()))
        else:
            file_name = os.path.join(
                args.experts_dir,
                "trajs_ppo_{}.pt".format(args.env_name.split('-')[0].lower()))

        # Running statistics used to standardize intrinsic rewards.
        rff = RewardForwardFilter(args.gamma)
        intrinsic_rms = RunningMeanStd(shape=())

        if args.intrinsic_module == 'icm':
            print('Loading pretrained intrinsic module: %s' %
                  intrinsic_model_file_name)
            inverse_model, forward_dynamics_model, encoder = torch.load(
                intrinsic_model_file_name)
            icm = IntrinsicCuriosityModule(envs, device, inverse_model, forward_dynamics_model, \
                            inverse_lr=args.intrinsic_lr, forward_lr=args.intrinsic_lr,\
                            )
        if args.intrinsic_module == 'vae':
            print('Loading pretrained intrinsic module: %s' %
                  intrinsic_model_file_name)
            vae = torch.load(intrinsic_model_file_name)
            icm = GenerativeIntrinsicRewardModule(envs, device, \
                            vae, lr=args.intrinsic_lr, \
                            )

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)

    start = time.time()
    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes
    for j in range(num_updates):
        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            utils.update_linear_schedule(
                agent.optimizer, j, num_updates,
                agent.optimizer.lr if args.algo == "acktr" else args.lr)

        # Collect one rollout of args.num_steps transitions.
        for step in range(args.num_steps):
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            obs, reward, done, infos = envs.step(action)
            # Keep the successor observation so the intrinsic module can score
            # (state, action, next_state) transitions below.
            next_obs = obs
            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            rollouts.insert(obs, next_obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        if args.use_intrinsic:
            # Overwrite environment rewards with intrinsic rewards computed
            # from the pretrained module.
            for step in range(args.num_steps):
                state = rollouts.obs[step]
                action = rollouts.actions[step]
                next_state = rollouts.next_obs[step]
                if args.intrinsic_module == 'icm':
                    # ICM scores transitions in its learned feature space.
                    state = encoder(state)
                    next_state = encoder(next_state)
                with torch.no_grad():
                    rollouts.rewards[
                        step], pred_next_state = icm.calculate_intrinsic_reward(
                            state, action, next_state, args.lambda_true_action)
            if args.standardize == 'True':
                # Standardize intrinsic rewards by the running std of
                # discounted returns (MPI-aggregated moments).
                buf_rews = rollouts.rewards.cpu().numpy()
                intrinsic_rffs = np.array(
                    [rff.update(rew) for rew in buf_rews.T])
                rffs_mean, rffs_std, rffs_count = mpi_moments(
                    intrinsic_rffs.ravel())
                intrinsic_rms.update_from_moments(rffs_mean, rffs_std**2,
                                                  rffs_count)
                mean = intrinsic_rms.mean
                std = np.asarray(np.sqrt(intrinsic_rms.var))
                rollouts.rewards = rollouts.rewards / torch.from_numpy(std).to(
                    device)

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.gae_lambda, args.use_proper_time_limits)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        # save for every interval-th episode or for the last epoch
        if (j % args.save_interval == 0
                or j == num_updates - 1) and args.save_dir != "":
            save_path = os.path.join(models_dir, args.algo)
            policy_file_name = os.path.join(save_path, args.env_name + '.pt')
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # Save the policy together with the obs-normalization stats so the
            # model can be evaluated with the same normalization.
            torch.save([
                actor_critic,
                getattr(utils.get_vec_normalize(envs), 'ob_rms', None)
            ], policy_file_name)

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            end = time.time()
            # NOTE(review): the format string has 8 placeholders but 11 args;
            # dist_entropy/value_loss/action_loss are silently dropped.
            print(
                "{} Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n"
                .format(args.env_name, j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist_entropy, value_loss,
                        action_loss))

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            ob_rms = utils.get_vec_normalize(envs).ob_rms
            evaluate(actor_critic, ob_rms, args.env_name, args.seed,
                     args.num_processes, eval_log_dir, device)
def main():
    """Train a policy with PPO or one of its trust-region/rollback variants
    (PPO_RB, TR_PPO, TR_PPO_RB), optionally reverting to vanilla PPO when the
    mean episode reward drops (``args.revert_to_ppo``).

    Configuration comes from ``get_args()``; checkpoints go to
    ``args.save_dir`` and progress is printed to stdout. No return value.
    """
    args = get_args()
    # Tracks whether the agent update currently runs in plain-PPO mode; may be
    # flipped to True mid-training by the revert-to-PPO heuristic below.
    use_ppo = args.algo == 'ppo'
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
        # Deterministic cudnn kernels (slower but reproducible).
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    log_dir = os.path.expanduser(args.log_dir)
    eval_log_dir = log_dir + "_eval"
    utils.cleanup_log_dir(log_dir)
    utils.cleanup_log_dir(eval_log_dir)

    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, device, False)

    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          base_kwargs={
                              'recurrent': args.recurrent_policy,
                              'share_parameter': args.share_parameter
                          })
    actor_critic.to(device)

    # TR_* variants need the old action distribution parameters stored in the
    # rollout buffer; plain PPO/PPO_RB do not.
    return_distributions = False
    if args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo_rb':
        agent = algo.PPO_RB(actor_critic,
                            args.clip_param,
                            args.ppo_epoch,
                            args.num_mini_batch,
                            args.value_loss_coef,
                            args.entropy_coef,
                            args.rb_alpha,
                            lr=args.lr,
                            eps=args.eps,
                            max_grad_norm=args.max_grad_norm)
    elif args.algo == 'tr_ppo':
        agent = algo.TR_PPO(actor_critic,
                            args.clip_param,
                            args.ppo_epoch,
                            args.num_mini_batch,
                            args.value_loss_coef,
                            args.entropy_coef,
                            lr=args.lr,
                            eps=args.eps,
                            max_grad_norm=args.max_grad_norm,
                            ppo_clip_param=args.ppo_clip_param)
        return_distributions = True
    elif args.algo == 'tr_ppo_rb':
        agent = algo.TR_PPO_RB(actor_critic,
                               args.clip_param,
                               args.ppo_epoch,
                               args.num_mini_batch,
                               args.value_loss_coef,
                               args.entropy_coef,
                               args.rb_alpha,
                               lr=args.lr,
                               eps=args.eps,
                               max_grad_norm=args.max_grad_norm,
                               ppo_clip_param=args.ppo_clip_param)
        return_distributions = True

    if not return_distributions:
        rollouts = RolloutStorage(args.num_steps, args.num_processes,
                                  envs.observation_space.shape,
                                  envs.action_space,
                                  actor_critic.recurrent_hidden_state_size)
    else:
        # Size the stored distribution parameters by distribution type:
        # DiagGaussian stores mean+std per action dim, Bernoulli/Categorical
        # store one logit per action.
        if actor_critic.dist_name == 'DiagGaussian':
            rollouts = RolloutStorage(
                args.num_steps, args.num_processes,
                envs.observation_space.shape, envs.action_space,
                actor_critic.recurrent_hidden_state_size,
                distribution_param_dim=envs.action_space.shape[0]*2)
        elif actor_critic.dist_name == 'Bernoulli' or actor_critic.dist_name == 'Categorical':
            rollouts = RolloutStorage(
                args.num_steps, args.num_processes,
                envs.observation_space.shape, envs.action_space,
                actor_critic.recurrent_hidden_state_size,
                distribution_param_dim=envs.action_space.n)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)

    start = time.time()
    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes
    prev_mean_reward = None
    for j in range(num_updates):
        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            utils.update_linear_schedule(
                agent.optimizer, j, num_updates,
                agent.optimizer.lr if args.algo == "acktr" else args.lr)

        for step in range(args.num_steps):
            # Sample actions (and the distribution parameters for TR_* algos).
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states, parameters = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step], return_distribution=True)

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor(
                [[0.0] if done_ else [1.0] for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks,
                            parameters)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.gae_lambda, args.use_proper_time_limits)

        # use_ppo tells the variant agents whether to fall back to the plain
        # PPO objective for this update.
        value_loss, action_loss, dist_entropy = agent.update(rollouts, use_ppo)

        rollouts.after_update()

        # save for every interval-th episode or for the last epoch
        if (j % args.save_interval == 0
                or j == num_updates - 1) and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            torch.save([
                actor_critic,
                getattr(utils.get_vec_normalize(envs), 'ob_rms', None)
            ], os.path.join(save_path, args.env_name + ".pt"))

        # Heuristic: if the rolling mean reward regressed while running a
        # variant objective, permanently fall back to plain PPO updates.
        mean_rewards = np.mean(episode_rewards)
        if (prev_mean_reward is not None) and (mean_rewards < prev_mean_reward) and \
                (use_ppo == False) and args.revert_to_ppo and j > 3:
            use_ppo = True
            print('Revert Back to PPO Training')
            # args.lr = 3e-4
        prev_mean_reward = mean_rewards

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            end = time.time()
            # NOTE(review): format string has 8 placeholders but 11 args;
            # dist_entropy/value_loss/action_loss are silently dropped.
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist_entropy, value_loss,
                        action_loss))

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            ob_rms = utils.get_vec_normalize(envs).ob_rms
            evaluate(actor_critic, ob_rms, args.env_name, args.seed,
                     args.num_processes, eval_log_dir, device)
def main():
    """Benchmark-style training entry point: runs ``args.repeat`` timed outer
    epochs of ``args.number`` A2C/PPO/ACKTR updates each, reporting losses and
    timing through the ``exp`` experiment tracker.

    NOTE(review): relies on module-level ``args``, ``device`` and ``exp``
    (they are not defined in this function) — confirm they are set up by the
    surrounding script before ``main()`` is called.
    """
    chrono = exp.chrono()

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, device, False)

    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)

    # Select the learner for the requested algorithm.
    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)

    start = time.time()
    # num_updates only feeds the lr schedule; the actual loop bounds are
    # args.repeat (outer, timed) x args.number (inner updates).
    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes
    for j in range(args.repeat):
        with chrono.time('train') as t:
            for n in range(args.number):
                if args.use_linear_lr_decay:
                    utils.update_linear_schedule(
                        agent.optimizer, j, num_updates,
                        agent.optimizer.lr if args.algo == "acktr" else args.lr)

                # Collect one rollout of args.num_steps transitions.
                for step in range(args.num_steps):
                    # Sample actions
                    with torch.no_grad():
                        value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                            rollouts.obs[step],
                            rollouts.recurrent_hidden_states[step],
                            rollouts.masks[step])

                    # Observe reward and next obs
                    obs, reward, done, infos = envs.step(action)

                    for info in infos:
                        if 'episode' in info.keys():
                            episode_rewards.append(info['episode']['r'])

                    # If done then clean the history of observations.
                    masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                               for done_ in done])
                    bad_masks = torch.FloatTensor(
                        [[0.0] if 'bad_transition' in info.keys() else [1.0]
                         for info in infos])
                    rollouts.insert(obs, recurrent_hidden_states, action,
                                    action_log_prob, value, reward, masks,
                                    bad_masks)

                with torch.no_grad():
                    next_value = actor_critic.get_value(
                        rollouts.obs[-1],
                        rollouts.recurrent_hidden_states[-1],
                        rollouts.masks[-1]).detach()

                # ---
                rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                         args.gae_lambda,
                                         args.use_proper_time_limits)

                value_loss, action_loss, dist_entropy = agent.update(rollouts)

                # Report losses to the experiment tracker each update.
                exp.log_batch_loss(action_loss)
                exp.log_metric('value_loss', value_loss)

                rollouts.after_update()

                total_num_steps = (j + 1) * args.num_processes * args.num_steps

                if j % args.log_interval == 0 and len(episode_rewards) > 1:
                    end = time.time()
                    # NOTE(review): 8 placeholders, 11 args; the trailing loss
                    # values are silently dropped by .format.
                    print(
                        "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n"
                        .format(j, total_num_steps,
                                int(total_num_steps / (end - start)),
                                len(episode_rewards), np.mean(episode_rewards),
                                np.median(episode_rewards),
                                np.min(episode_rewards),
                                np.max(episode_rewards), dist_entropy,
                                value_loss, action_loss))
            # -- number
        # -- chrono
        exp.show_eta(j, t)
    # -- epoch
    exp.report()

    envs.close()
def main():
    """Behavioral-cloning training loop (GAIL-flavored scaffolding).

    Despite the GAIL setup, the discriminator update and the RL policy update
    are commented out below: each iteration collects a rollout, then fits the
    policy to expert (state, action) pairs via ``agent.update_bc`` and early
    stops on rising validation loss. Can also just record trajectories and
    exit when ``args.record_trajectories`` is set.
    """
    args = get_args()

    # Record trajectories
    if args.record_trajectories:
        record_trajectories()
        return

    print(args)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    # Append the model name
    log_dir = os.path.expanduser(args.log_dir)
    log_dir = os.path.join(log_dir, args.model_name, str(args.seed))
    eval_log_dir = log_dir + "_eval"
    utils.cleanup_log_dir(log_dir)
    utils.cleanup_log_dir(eval_log_dir)

    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, log_dir, device, False)

    # Take activation for carracing
    print("Loaded env...")
    activation = None
    if args.env_name == 'CarRacing-v0' and args.use_activation:
        activation = torch.tanh
    print(activation)

    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          base_kwargs={
                              'recurrent': args.recurrent_policy,
                              'env': args.env_name
                          },
                          activation=activation)
    actor_critic.to(device)

    # Load from previous model
    if args.load_model_name:
        state = torch.load(
            os.path.join(args.save_dir, args.load_model_name,
                         args.load_model_name + '_{}.pt'.format(args.seed)))[0]
        # Checkpoints may contain either a state_dict or a whole module;
        # fall back to using the loaded object directly.
        try:
            actor_critic.load_state_dict(state)
        except:
            actor_critic = state

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)

    if args.gail:
        if len(envs.observation_space.shape) == 1:
            # Vector observations: MLP discriminator over concatenated
            # (obs, action); expert data stored as .pt trajectories.
            discr = gail.Discriminator(
                envs.observation_space.shape[0] +
                envs.action_space.shape[0], 100, device)
            file_name = os.path.join(
                args.gail_experts_dir,
                "trajs_{}.pt".format(args.env_name.split('-')[0].lower()))
            # Train on the first 3 trajectories, hold out the 4th for
            # validation.
            expert_dataset = gail.ExpertDataset(file_name,
                                                num_trajectories=3,
                                                subsample_frequency=1)
            expert_dataset_test = gail.ExpertDataset(file_name,
                                                     num_trajectories=1,
                                                     start=3,
                                                     subsample_frequency=1)
            drop_last = len(expert_dataset) > args.gail_batch_size
            gail_train_loader = torch.utils.data.DataLoader(
                dataset=expert_dataset,
                batch_size=args.gail_batch_size,
                shuffle=True,
                drop_last=drop_last)
            gail_test_loader = torch.utils.data.DataLoader(
                dataset=expert_dataset_test,
                batch_size=args.gail_batch_size,
                shuffle=False,
                drop_last=False)
            print(len(expert_dataset), len(expert_dataset_test))
        else:
            # env observation shape is 3 => its an image
            assert len(envs.observation_space.shape) == 3
            discr = gail.CNNDiscriminator(envs.observation_space.shape,
                                          envs.action_space, 100, device)
            file_name = os.path.join(args.gail_experts_dir, 'expert_data.pkl')
            expert_dataset = gail.ExpertImageDataset(file_name, train=True)
            test_dataset = gail.ExpertImageDataset(file_name, train=False)
            gail_train_loader = torch.utils.data.DataLoader(
                dataset=expert_dataset,
                batch_size=args.gail_batch_size,
                shuffle=True,
                drop_last=len(expert_dataset) > args.gail_batch_size,
            )
            gail_test_loader = torch.utils.data.DataLoader(
                dataset=test_dataset,
                batch_size=args.gail_batch_size,
                shuffle=False,
                drop_last=len(test_dataset) > args.gail_batch_size,
            )
            print('Dataloader size', len(gail_train_loader))

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)

    start = time.time()
    #num_updates = int(
    #args.num_env_steps) // args.num_steps // args.num_processes
    # NOTE(review): num_updates is deliberately(?) overridden to the rollout
    # length rather than the usual env-steps computation — confirm intent.
    num_updates = args.num_steps
    print(num_updates)

    # count the number of times validation loss increases
    val_loss_increase = 0
    prev_val_action = np.inf
    best_val_loss = np.inf

    for j in range(num_updates):
        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            utils.update_linear_schedule(
                agent.optimizer, j, num_updates,
                agent.optimizer.lr if args.algo == "acktr" else args.lr)

        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        if args.gail:
            if j >= 10:
                # Freeze obs normalization once warm-up iterations are done;
                # best-effort since not every wrapper exposes eval().
                try:
                    envs.venv.eval()
                except:
                    pass
            gail_epoch = args.gail_epoch
            #if j < 10:
            #gail_epoch = 100  # Warm up
            # NOTE(review): the discriminator update is commented out — this
            # loop currently does nothing and rewards come from an untrained
            # discriminator below. Confirm this BC-only mode is intended.
            for _ in range(gail_epoch):
                #discr.update(gail_train_loader, rollouts,
                #None)
                pass

            for step in range(args.num_steps):
                rollouts.rewards[step] = discr.predict_reward(
                    rollouts.obs[step], rollouts.actions[step], args.gamma,
                    rollouts.masks[step])

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.gae_lambda, args.use_proper_time_limits)

        #value_loss, action_loss, dist_entropy = agent.update(rollouts)
        # RL update disabled; losses zeroed so the log line below still works.
        value_loss = 0
        dist_entropy = 0

        # Behavioral cloning: fit the policy to expert (state, action) pairs.
        for data in gail_train_loader:
            expert_states, expert_actions = data
            expert_states = Variable(expert_states).to(device)
            expert_actions = Variable(expert_actions).to(device)
            loss = agent.update_bc(expert_states, expert_actions)
            action_loss = loss.data.cpu().numpy()
        print("Epoch: {}, Loss: {}".format(j, action_loss))

        # Validation pass on held-out expert data (no gradients).
        with torch.no_grad():
            cnt = 0
            val_action_loss = 0
            for data in gail_test_loader:
                expert_states, expert_actions = data
                expert_states = Variable(expert_states).to(device)
                expert_actions = Variable(expert_actions).to(device)
                loss = agent.get_action_loss(expert_states, expert_actions)
                val_action_loss += loss.data.cpu().numpy()
                cnt += 1
            val_action_loss /= cnt
            print("Val Loss: {}".format(val_action_loss))

        #rollouts.after_update()

        # save for every interval-th episode or for the last epoch
        if (j % args.save_interval == 0
                or j == num_updates - 1) and args.save_dir != "":
            # Checkpoint only on validation improvement; early-stop after the
            # validation loss has risen 10 times.
            if val_action_loss < best_val_loss:
                val_loss_increase = 0
                best_val_loss = val_action_loss
                save_path = os.path.join(args.save_dir, args.model_name)
                try:
                    os.makedirs(save_path)
                except OSError:
                    pass

                torch.save([
                    actor_critic.state_dict(),
                    getattr(utils.get_vec_normalize(envs), 'ob_rms', None),
                    getattr(utils.get_vec_normalize(envs), 'ret_rms', None)
                ], os.path.join(
                    save_path,
                    args.model_name + "_{}.pt".format(args.seed)))
            elif val_action_loss > prev_val_action:
                val_loss_increase += 1
                if val_loss_increase == 10:
                    print("Val loss increasing too much, breaking here...")
                    break
            elif val_action_loss < prev_val_action:
                val_loss_increase = 0

            # Update prev val action
            prev_val_action = val_action_loss

        # log interval
        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            end = time.time()
            # NOTE(review): 8 placeholders, 11 args; the trailing loss values
            # are silently dropped by .format.
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist_entropy, value_loss,
                        action_loss))

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            ob_rms = utils.get_vec_normalize(envs).ob_rms
            evaluate(actor_critic, ob_rms, args.env_name, args.seed,
                     args.num_processes, eval_log_dir, device)
def main():
    """Train an imitation-learning agent with GAIL, no-regret GAIL, or
    correlated GAIL (cor-GAIL), with special handling for CoinRun-family
    environments.

    In cor-GAIL mode a ``Correlator`` proposes embedding (mediator) actions
    alongside the policy's actions, and both the discriminator and the agent
    maintain bounded strategy queues updated every iteration. Checkpoints and
    logs go to ``args.save_dir`` / stdout. No return value.
    """
    args = get_args()

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    log_dir = os.path.expanduser(args.log_dir)
    eval_log_dir = log_dir + "_eval"
    utils.cleanup_log_dir(log_dir)
    utils.cleanup_log_dir(eval_log_dir)

    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    # coinrun environments need to be treated differently.
    coinrun_envs = {
        'CoinRun': 'standard',
        'CoinRun-Platforms': 'platform',
        'Random-Mazes': 'maze'
    }

    envs = make_vec_envs(args.env_name,
                         args.seed,
                         args.num_processes,
                         args.gamma,
                         args.log_dir,
                         device,
                         False,
                         coin_run_level=args.num_levels,
                         difficulty=args.high_difficulty,
                         coin_run_seed=args.seed)
    if args.env_name in coinrun_envs.keys():
        # CoinRun observations are fixed-size RGB frames.
        observation_space_shape = (3, 64, 64)
        args.save_dir = args.save_dir + "/NUM_LEVELS_{}".format(
            args.num_levels)  # Save the level info in the
    else:
        observation_space_shape = envs.observation_space.shape

    # trained model name
    if args.continue_ppo_training:
        # Resume from a previously saved [actor_critic, ob_rms] checkpoint.
        actor_critic, _ = torch.load(os.path.join(args.check_point,
                                                  args.env_name + ".pt"),
                                     map_location=torch.device(device))
    elif args.cor_gail:
        # cor-GAIL: the policy is conditioned on a mediator embedding, and a
        # separate Correlator network produces those embeddings.
        embed_size = args.embed_size
        actor_critic = Policy(observation_space_shape,
                              envs.action_space,
                              hidden_size=args.hidden_size,
                              embed_size=embed_size,
                              base_kwargs={'recurrent': args.recurrent_policy})
        actor_critic.to(device)
        correlator = Correlator(observation_space_shape,
                                envs.action_space,
                                hidden_dim=args.hidden_size,
                                embed_dim=embed_size,
                                lr=args.lr,
                                device=device)
        correlator.to(device)
        # Initial (zero) embedding for the first rollout step.
        # NOTE(review): shaped (1, embed_size) while rollouts run
        # args.num_processes envs — confirm broadcasting in RolloutStorage.
        embeds = torch.zeros(1, embed_size)
    else:
        embed_size = 0
        actor_critic = Policy(observation_space_shape,
                              envs.action_space,
                              hidden_size=args.hidden_size,
                              base_kwargs={'recurrent': args.recurrent_policy})
        actor_critic.to(device)
        embeds = None

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        # ftrl_mode enables follow-the-regularized-leader updates for the
        # no-regret/correlated variants.
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm,
                         use_clipped_value_loss=True,
                         ftrl_mode=args.cor_gail or args.no_regret_gail,
                         correlated_mode=args.cor_gail)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)

    if args.gail or args.no_regret_gail or args.cor_gail:
        file_name = os.path.join(
            args.gail_experts_dir,
            "trajs_{}.pt".format(args.env_name.split('-')[0].lower()))
        expert_dataset = gail.ExpertDataset(
            file_name, num_trajectories=50,
            subsample_frequency=1)  #if subsample set to a different number,
        # grad_pen might need adjustment
        drop_last = len(expert_dataset) > args.gail_batch_size
        gail_train_loader = torch.utils.data.DataLoader(
            dataset=expert_dataset,
            batch_size=args.gail_batch_size,
            shuffle=True,
            drop_last=drop_last)
        if args.gail:
            discr = gail.Discriminator(observation_space_shape,
                                       envs.action_space,
                                       device=device)
        if args.no_regret_gail or args.cor_gail:
            queue = deque(
                maxlen=args.queue_size
            )  # Strategy Queues: Each element of a queue is a dicr strategy
            agent_queue = deque(
                maxlen=args.queue_size
            )  # Strategy Queues: Each element of a queue is an agent strategy
            pruning_frequency = 1
        if args.no_regret_gail:
            discr = regret_gail.NoRegretDiscriminator(observation_space_shape,
                                                      envs.action_space,
                                                      device=device)
        if args.cor_gail:
            discr = cor_gail.CorDiscriminator(observation_space_shape,
                                              envs.action_space,
                                              hidden_size=args.hidden_size,
                                              embed_size=embed_size,
                                              device=device)
        discr.to(device)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              observation_space_shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size,
                              embed_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    if args.cor_gail:
        rollouts.embeds[0].copy_(embeds)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)
    start = time.time()
    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes
    for j in range(num_updates):
        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            utils.update_linear_schedule(
                agent.optimizer, j, num_updates,
                agent.optimizer.lr if args.algo == "acktr" else args.lr)

        for step in range(args.num_steps):
            # Sample actions
            # Roll-out
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step], rollouts.embeds[step])

            obs, reward, done, infos = envs.step(action.to('cpu'))
            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks)

            # Sample mediating/correlating actions
            # Correlated Roll-out
            if args.cor_gail:
                embeds, embeds_log_prob, mean = correlator.act(
                    rollouts.obs[step], rollouts.actions[step])
                rollouts.insert_embedding(embeds, embeds_log_prob)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1], rollouts.embeds[-1]).detach()

        if args.gail or args.no_regret_gail or args.cor_gail:
            if args.env_name not in {'CoinRun', 'Random-Mazes'}:
                # Freeze obs normalization after the warm-up iterations.
                if j >= 10:
                    envs.venv.eval()
            gail_epoch = args.gail_epoch
            if args.gail:
                if j < 10:
                    gail_epoch = 100  # Warm up
            # no need for gail epoch or warm up in the no-regret case and cor_gail.
            for _ in range(gail_epoch):
                # Pass the VecNormalize obs filter so expert data is
                # normalized like the rollout observations.
                if utils.get_vec_normalize(envs):
                    obfilt = utils.get_vec_normalize(envs)._obfilt
                else:
                    obfilt = None
                if args.gail:
                    discr.update(gail_train_loader, rollouts, obfilt)
                if args.no_regret_gail or args.cor_gail:
                    last_strategy = discr.update(gail_train_loader, rollouts,
                                                 queue, args.max_grad_norm,
                                                 obfilt, j)

            # Replace env rewards with discriminator-based rewards.
            for step in range(args.num_steps):
                if args.gail:
                    rollouts.rewards[step] = discr.predict_reward(
                        rollouts.obs[step], rollouts.actions[step], args.gamma,
                        rollouts.masks[step])
                if args.no_regret_gail:
                    rollouts.rewards[step] = discr.predict_reward(
                        rollouts.obs[step], rollouts.actions[step], args.gamma,
                        rollouts.masks[step], queue)
                if args.cor_gail:
                    # cor-GAIL also produces a separate reward signal for the
                    # correlator network.
                    rollouts.rewards[
                        step], correlator_reward = discr.predict_reward(
                            rollouts.obs[step], rollouts.actions[step],
                            rollouts.embeds[step], args.gamma,
                            rollouts.masks[step], queue)
                    rollouts.correlated_reward[step] = correlator_reward

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.gae_lambda, args.use_proper_time_limits)

        if args.gail:
            value_loss, action_loss, dist_entropy = agent.update(rollouts, j)
        elif args.no_regret_gail or args.cor_gail:
            value_loss, action_loss, dist_entropy, agent_gains, agent_strategy = \
                agent.mixed_update(rollouts, agent_queue, j)

        if args.cor_gail:
            correlator.update(rollouts, agent_gains, args.max_grad_norm)

        if args.no_regret_gail or args.cor_gail:
            # Keep both strategy queues bounded; pruning_frequency is adapted
            # by the agent-side queue update.
            queue, _ = utils.queue_update(queue, pruning_frequency,
                                          args.queue_size, j, last_strategy)
            agent_queue, pruning_frequency = utils.queue_update(
                agent_queue, pruning_frequency, args.queue_size, j,
                agent_strategy)

        rollouts.after_update()

        # save for every interval-th episode or for the last epoch
        if (j % args.save_interval == 0
                or j == num_updates - 1) and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            if not args.cor_gail:
                torch.save([
                    actor_critic,
                    getattr(utils.get_vec_normalize(envs), 'ob_rms', None)
                ], os.path.join(save_path, args.env_name + ".pt"))
            else:
                # cor-GAIL saves the correlator and the actor separately,
                # both as state_dicts.
                print("saving models in {}".format(
                    os.path.join(save_path, args.env_name)))
                torch.save(
                    correlator.state_dict(),
                    os.path.join(save_path, args.env_name + "correlator.pt"))
                torch.save([
                    actor_critic.state_dict(),
                    getattr(utils.get_vec_normalize(envs), 'ob_rms', None)
                ], os.path.join(save_path, args.env_name + "actor.pt"))

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            end = time.time()
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f},"
                " value loss/action loss {:.1f}/{}".format(
                    j, total_num_steps, int(total_num_steps / (end - start)),
                    len(episode_rewards), np.mean(episode_rewards),
                    np.median(episode_rewards), value_loss, action_loss))

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            ob_rms = utils.get_vec_normalize(envs).ob_rms
            evaluate(actor_critic, ob_rms, args.env_name, args.seed,
                     args.num_processes, eval_log_dir, device)
def train():
    """Run the RL^2 meta-RL PPO training loop.

    Builds the meta-episode ("trial") environment interface, a recurrent
    policy, and a PPO agent, then alternates between collecting one trial
    per update and updating the policy. Logs timing breakdowns and
    entropy/reward metrics via `logger`, periodically saves the model,
    refits the rewarder, and optionally launches evaluation/visualization
    subprocesses.

    Relies on module-level `args` and project types (`RL2EnvInterface`,
    `Looker`, `Policy`, `RL2Base`, `RolloutStorage`, `algo.PPO`, `logger`).
    """
    processes = []  # child processes spawned for plotting; never joined here
    # Guard against clobbering a previous run's logs.
    if os.path.isdir(args.log_dir):
        ans = input('{} exists\ncontinue and overwrite? y/n: '.format(args.log_dir))
        if ans == 'n':
            return
    logger.configure(dir=args.log_dir, format_strs=['stdout', 'log', 'csv'])
    logger.log(args)
    # NOTE(review): the file handle is never closed; consider a `with` block.
    json.dump(vars(args), open(os.path.join(args.log_dir, 'params.json'), 'w'))
    torch.set_num_threads(2)
    start = time.time()
    # Wall-clock accumulators for the per-update timing breakdown logged below.
    policy_update_time, policy_forward_time = 0, 0
    step_time_env, step_time_total, step_time_rewarder = 0, 0, 0
    visualize_time = 0
    rewarder_fit_time = 0
    envs = RL2EnvInterface(args)
    if args.look:
        looker = Looker(args.log_dir)
    actor_critic = Policy(envs.obs_shape, envs.action_space,
                          base=RL2Base,
                          base_kwargs={'recurrent': True,
                                       'num_act_dim': envs.action_space.shape[0]})
    actor_critic.to(args.device)
    agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch,
                     args.num_mini_batch, args.value_loss_coef,
                     args.entropy_coef, lr=args.lr, eps=args.eps,
                     max_grad_norm=args.max_grad_norm)
    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.obs_shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)
    rollouts.to(args.device)

    def copy_obs_into_beginning_of_storage(obs):
        # RL2 observations are a 4-tuple (raw obs, previous action, previous
        # reward, episode-boundary flag); seed slot 0 of each storage tensor.
        obs_raw, obs_act, obs_rew, obs_flag = obs
        rollouts.obs[0].copy_(obs_raw)
        rollouts.obs_act[0].copy_(obs_act)
        rollouts.obs_rew[0].copy_(obs_rew)
        rollouts.obs_flag[0].copy_(obs_flag)

    for j in range(args.num_updates):
        obs = envs.reset()
        copy_obs_into_beginning_of_storage(obs)
        if args.use_linear_lr_decay:
            update_linear_schedule(agent.optimizer, j, args.num_updates, args.lr)
        if args.algo == 'ppo' and args.use_linear_clip_decay:
            # Linearly anneal the PPO clipping range to 0 over training.
            agent.clip_param = args.clip_param * (1 - j / float(args.num_updates))
        # Per-episode-within-trial accumulators (one slot per episode index).
        episode_returns = [0 for i in range(args.trial_length)]
        episode_final_reward = [0 for i in range(args.trial_length)]
        i_episode = 0
        log_marginal = 0
        lambda_log_s_given_z = 0
        for step in range(args.num_steps):
            # Sample actions
            policy_forward_start = time.time()
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.get_obs(step),
                    rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])
            policy_forward_time += time.time() - policy_forward_start
            # Obser reward and next obs
            step_total_start = time.time()
            obs, reward, done, info = envs.step(action)
            step_time_total += time.time() - step_total_start
            step_time_env += info['step_time_env']
            step_time_rewarder += info['reward_time']
            log_marginal += info['log_marginal'].sum().item()
            lambda_log_s_given_z += info['lambda_log_s_given_z'].sum().item()
            episode_returns[i_episode] += reward.sum().item()
            # `done` carries separate 'episode' and 'trial' termination flags.
            if all(done['episode']):
                # Last step of an episode: its reward is the "final reward".
                episode_final_reward[i_episode] += reward.sum().item()
                i_episode = (i_episode + 1) % args.trial_length
            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done['trial']])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks)
        # Each update is assumed to collect exactly one full trial.
        assert all(done['trial'])
        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.get_obs(-1),
                rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()
        rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau)
        policy_update_start = time.time()
        # Skip the policy update until the (unsupervised) rewarder has been
        # fit at least once, unless a pretrained VAE was loaded.
        if args.rewarder != 'supervised' and envs.rewarder.fit_counter == 0 and not args.vae_load:
            value_loss, action_loss, dist_entropy = 0, 0, 0
        else:
            value_loss, action_loss, dist_entropy = agent.update(rollouts)
        policy_update_time += time.time() - policy_update_start
        rollouts.after_update()

        # metrics
        trajectories_pre = envs.trajectories_pre_current_update
        state_entropy_pre = calculate_state_entropy(args, trajectories_pre)
        trajectories_post = envs.trajectories_post_current_update
        state_entropy_post = calculate_state_entropy(args, trajectories_post)
        return_avg = rollouts.rewards.sum() / args.trials_per_update
        reward_avg = return_avg / (args.trial_length * args.episode_length)
        log_marginal_avg = log_marginal / args.trials_per_update / (args.trial_length * args.episode_length)
        lambda_log_s_given_z_avg = lambda_log_s_given_z / args.trials_per_update / (args.trial_length * args.episode_length)
        num_steps = (j + 1) * args.num_steps * args.num_processes
        num_episodes = num_steps // args.episode_length
        num_trials = num_episodes // args.trial_length
        logger.logkv('state_entropy_pre', state_entropy_pre)
        logger.logkv('state_entropy_post', state_entropy_post)
        logger.logkv('value_loss', value_loss)
        logger.logkv('action_loss', action_loss)
        logger.logkv('dist_entropy', dist_entropy)
        logger.logkv('return_avg', return_avg.item())
        logger.logkv('reward_avg', reward_avg.item())
        logger.logkv('steps', (j + 1) * args.num_steps * args.num_processes)
        logger.logkv('episodes', num_episodes)
        logger.logkv('trials', num_trials)
        logger.logkv('policy_updates', (j + 1))
        logger.logkv('time', time.time() - start)
        logger.logkv('policy_forward_time', policy_forward_time)
        logger.logkv('policy_update_time', policy_update_time)
        logger.logkv('step_time_rewarder', step_time_rewarder)
        logger.logkv('step_time_env', step_time_env)
        logger.logkv('step_time_total', step_time_total)
        logger.logkv('visualize_time', visualize_time)
        logger.logkv('rewarder_fit_time', rewarder_fit_time)
        logger.logkv('log_marginal_avg', log_marginal_avg)
        logger.logkv('lambda_log_s_given_z_avg', lambda_log_s_given_z_avg)
        # NOTE(review): this reuses `i_episode` from the collection loop above;
        # harmless here but easy to confuse — a distinct name would be clearer.
        for i_episode in range(args.trial_length):
            logger.logkv('episode_return_avg_{}'.format(i_episode),
                         episode_returns[i_episode] / args.trials_per_update)
            logger.logkv('episode_final_reward_{}'.format(i_episode),
                         episode_final_reward[i_episode] / args.trials_per_update)
        if (j % args.save_period == 0 or j == args.num_updates - 1) and args.log_dir != '':
            save_model(args, actor_critic, envs, iteration=j)
        if not args.vae_freeze and j % args.rewarder_fit_period == 0:
            rewarder_fit_start = time.time()
            envs.fit_rewarder()
            rewarder_fit_time += time.time() - rewarder_fit_start
        if (j % args.vis_period == 0 or j == args.num_updates - 1) and args.log_dir != '':
            visualize_start = time.time()
            if args.look:
                eval_return_avg, eval_episode_returns, eval_episode_final_reward = looker.look(iteration=j)
                logger.logkv('eval_return_avg', eval_return_avg)
                for i_episode in range(args.trial_length):
                    logger.logkv('eval_episode_return_avg_{}'.format(i_episode),
                                 eval_episode_returns[i_episode] / args.trials_per_update)
                    logger.logkv('eval_episode_final_reward_{}'.format(i_episode),
                                 eval_episode_final_reward[i_episode] / args.trials_per_update)
            if args.plot:
                # Fire-and-forget plotting subprocess; collected in `processes`
                # but never waited on.
                p = Popen('python visualize.py --log-dir {}'.format(args.log_dir),
                          shell=True)
                processes.append(p)
            visualize_time += time.time() - visualize_start
        logger.dumpkvs()
def main():
    """Train a PPO policy for the QuadruppedWalk-v1 robot environment.

    Parses extra CLI flags on top of `get_args`, then hard-codes most PPO
    hyperparameters. A `trainType` selector (from `--action-type`) picks the
    environment factory and file-name suffix for the many task variants
    (balance, left/right, rotate, compound/multi-network, sample-based
    pretraining, ...). Loads a previous best checkpoint when present, then
    runs the PPO collect/update loop, writing TensorBoard scalars and saving
    a new "best" checkpoint whenever evaluation (or mean training reward)
    improves.
    """
    realEval = True  #False
    # Debugger detection: under a debugger (e.g. PyCharm) use 1 process and
    # separate debug model/log directories.
    # NOTE(review): `gettrace()` is called unconditionally below even though
    # it is fetched with a None default — would raise TypeError if
    # sys.gettrace were actually missing.
    gettrace = getattr(sys, 'gettrace', None)
    parser = argparse.ArgumentParser(description='RL')
    parser.add_argument('--action-type', type=int, default=-1,
                        help='action type to play (default: -1)')
    parser.add_argument('--tasks-difficulty-from', type=int, default=0,
                        help='tasks_difficulty_from')
    parser.add_argument('--tasks-difficulty-to', type=int, default=100000,
                        help='tasks-difficulty-to')
    parser.add_argument('--verboseLevel', type=int, default=5,
                        help='verboseLevel')
    parser.add_argument('--filesNamesSuffix', default="",
                        help='filesNamesSuffix')
    parser.add_argument('--nobest-exit', type=int, default=10000,
                        help='nobest_exit')
    args = get_args(parser)
    # Hard-coded experiment configuration (overrides whatever get_args parsed).
    args.algo = 'ppo'
    args.env_name = 'QuadruppedWalk-v1'  #'RoboschoolAnt-v1' #'QuadruppedWalk-v1' #'RoboschoolAnt-v1' #'QuadruppedWalk-v1'
    args.use_gae = True
    args.num_steps = 2048
    #args.num_processes = 4
    args.num_processes = 4
    if gettrace():
        args.num_processes = 1
    args.lr = 0.0001
    args.entropy_coef = 0.0
    args.value_loss_coef = 0.5
    args.ppo_epoch = 4
    args.num_mini_batch = 256
    args.gamma = 0.99
    args.gae_lambda = 0.95
    args.clip_param = 0.2
    args.use_linear_lr_decay = True  #True #True #True #True
    args.use_proper_time_limits = True
    args.save_dir = "./trained_models/" + args.env_name + "/"
    args.load_dir = "./trained_models/" + args.env_name + "/"
    args.log_dir = "./logs/robot"
    if gettrace():
        args.save_dir = "./trained_models/" + args.env_name + "debug/"
        args.load_dir = "./trained_models/" + args.env_name + "debug/"
        args.log_dir = "./logs/robot_d"
    args.log_interval = 30
    args.hidden_size = 64
    args.last_hidden_size = args.hidden_size
    args.recurrent_policy = False  #True
    args.save_interval = 20
    #args.seed = 1
    reward_shaping = 0.01  # scale applied to env rewards before storage
    allowMutate = False  # enables weight-mutation escape when eval stalls
    if args.seed == -1:
        # Seed from the wall clock when no explicit seed was requested.
        args.seed = time.clock_gettime_ns(time.CLOCK_REALTIME)
    quadruppedEnv.settings.tasks_difficulty_from = args.tasks_difficulty_from
    quadruppedEnv.settings.tasks_difficulty_to = args.tasks_difficulty_to
    # 0 is a walk
    # 1 is a balance
    # 2 multitasks
    # 3 multitask experiments
    trainType = 14
    filesNamesSuffix = ""
    if args.action_type >= 0:
        trainType = args.action_type
    # Each train type selects an env factory and a checkpoint name suffix.
    makeEnvFunction = makeEnv.make_env_with_best_settings
    if trainType == 1:
        filesNamesSuffix = "balance_"
        makeEnvFunction = makeEnv.make_env_for_balance
    if trainType == 2:
        filesNamesSuffix = "analytical_"
        makeEnvFunction = makeEnv.make_env_with_best_settings_for_analytical
    if trainType == 3:
        filesNamesSuffix = "analytical2_"
        makeEnvFunction = makeEnv.make_env_with_best_settings_for_analytical2
    if trainType == 4:
        filesNamesSuffix = "frontback_"
        makeEnvFunction = makeEnv.make_env_with_best_settings_for_front_back
    if trainType == 5:
        filesNamesSuffix = "leftright_"
        makeEnvFunction = makeEnv.make_env_with_best_settings_for_left_right
    if trainType == 6:
        filesNamesSuffix = "all_"
        makeEnvFunction = makeEnv.make_env_with_best_settings_for_all
    if trainType == 7:
        filesNamesSuffix = "rotate_"
        makeEnvFunction = makeEnv.make_env_with_best_settings_for_rotate
    if trainType == 8:
        # Compound policy built from several pretrained sub-networks.
        filesNamesSuffix = "compound_"
        makeEnvFunction = make_env_multinetwork
    if trainType == 9:
        import pickle
        realEval = False
        allowMutate = False
        args.use_linear_lr_decay = True  #False
        args.num_env_steps = 5000000
        filesNamesSuffix = "test_"
        makeEnvFunction = makeEnv.make_env_with_best_settings_for_test
    if trainType == 10:
        import pickle
        realEval = False
        allowMutate = False
        args.use_linear_lr_decay = True  #False
        args.num_env_steps = 5000000
        filesNamesSuffix = "zoo_"
        makeEnvFunction = makeEnv.make_env_with_best_settings_for_test_zoo
    if trainType == 11:
        # Behavior-cloning-style training on recorded samples.
        args.hidden_size = 128  #64 #128
        args.last_hidden_size = args.hidden_size
        import pickle
        if gettrace():
            args.num_processes = 1
        else:
            args.num_processes = 8
        realEval = False
        allowMutate = False
        args.lr = 0.00001
        args.use_linear_lr_decay = True  #False
        args.num_env_steps = 10000000
        filesNamesSuffix = "zigote2_updown_"
        print("Samples preload")
        global samplesEnvData
        samplesEnvData = pickle.load(
            open("./QuadruppedWalk-v1_MoveNoPhys.samples", "rb"))
        # samplesEnvData = pickle.load( open( "./QuadruppedWalk-v1.samples", "rb" ) )
        makeEnvFunction = makeSamplesEnv
    if trainType == 12:
        import pickle
        args.lr = 0.00001
        args.hidden_size = 64
        args.last_hidden_size = args.hidden_size
        filesNamesSuffix = "zigote2_front_back_"
        args.clip_param = 0.9
        args.value_loss_coef = 0.9
        makeEnvFunction = makeEnv.make_env_with_best_settings_for_train
        #makeEnvFunction = makeEnv.make_env_with_best_settings_for_record
        #makeEnv.samplesEnvData = pickle.load( open( "./QuadruppedWalk-v1_MoveNoPhys.samples", "rb" ) )
    if trainType == 13:
        filesNamesSuffix = "all_bytasks_"
        makeEnvFunction = makeEnv.make_env_with_best_settings_for_all
    if trainType == 14:
        # Default training config; the commented lines are leftovers from a
        # random hyperparameter search.
        #args.lr = 0.00001
        #args.num_env_steps = 000000
        #args.clip_param = 0.5
        #args.value_loss_coef =0.8
        #random.seed(time.clock_gettime_ns(time.CLOCK_REALTIME))
        #args.num_steps = random.choice([256,512,1024,2048,4096])
        #args.num_mini_batch = random.choice([32,64,256,512])
        #args.ppo_epoch = random.choice([2,4,8,10])
        #args.clip_param = random.choice([0.2,0.4,0.6,0.8])
        #args.value_loss_coef =random.choice([0.4,0.5,0.6,0.8])
        #args.lr = random.choice([0.00001,0.0001,0.00005,0.0005])
        args.num_steps = 2048
        args.num_mini_batch = 64
        args.ppo_epoch = 8
        args.lr = 0.0001
        args.hidden_size = 64
        args.last_hidden_size = args.hidden_size
        # filesNamesSuffix = args.filesNamesSuffix
        makeEnvFunction = makeEnv.make_env_with_best_settings_for_all
        '''
        num_steps: 1024 num_mini_batch 64 ppo_epoch 2
        clip_param: 0.2 value_loss_coef 0.6 lr 0.0001
        '''
    if trainType == 15:
        args.num_env_steps = 5000000
        filesNamesSuffix = "zigote_updown_"
        makeEnvFunction = makeEnv.make_env_with_best_settings_for_train_analytic
    if trainType == 16:
        args.lr = 0.00001
        filesNamesSuffix = "compound_tasks_"
        makeEnvFunction = make_env_multinetwork
    reward_shaper = DefaultRewardsShaper(scale_value=reward_shaping)
    print("ActionType ", trainType, " ", filesNamesSuffix, "seed", args.seed,
          "num env steps:", args.num_env_steps, " tasks_dif",
          args.tasks_difficulty_from, args.tasks_difficulty_to)
    print("Num processes:", args.num_processes)
    print("num_steps:", args.num_steps, "num_mini_batch", args.num_mini_batch,
          "ppo_epoch", args.ppo_epoch)
    print("clip_param:", args.clip_param, "value_loss_coef",
          args.value_loss_coef, "lr", args.lr)
    random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True
    # NOTE(review): log_dir is overwritten here, discarding the values set
    # above — presumably intentional for TensorBoard, but worth confirming.
    args.log_dir = "/tmp/tensorboard/"
    #TesnorboardX
    writer = SummaryWriter(log_dir=args.log_dir + 'runs/{}_PPO_{}_{}'.format(
        datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S"),
        args.env_name, "ppo"))
    # Record the hyperparameters as scalars so runs are comparable.
    writer.add_scalar('options/num_steps', args.num_steps, 0)
    writer.add_scalar('options/num_mini_batch', args.num_mini_batch, 0)
    writer.add_scalar('options/ppo_epoch', args.ppo_epoch, 0)
    writer.add_scalar('options/clip_param', args.clip_param, 0)
    writer.add_scalar('options/value_loss_coef', args.value_loss_coef, 0)
    writer.add_scalar('options/lr', args.lr, 0)
    device = torch.device("cuda:0" if args.cuda else "cpu")
    torch.set_num_threads(1)
    load_dir = os.path.join(args.load_dir, args.algo)
    # Sub-policies composed by the compound (multi-network) env variants.
    multiNetworkName = ["frontback_", "all_", "leftright_", "rotate_"]
    if trainType == 8:
        for net in multiNetworkName:
            bestFilename = os.path.join(
                load_dir,
                "{}_{}{}_best.pt".format(args.env_name, net, args.hidden_size))
            ac, _ = torch.load(bestFilename)
            policies.append(PPOPlayer(ac, device))
            print("Policy multi loaded: ", bestFilename)
    multiNetworkName2 = [
        "all_bytasks_0_",
        "all_bytasks_1_",
        "all_bytasks_2_",
        "all_bytasks_3_",
        "all_bytasks_4_",
        "all_bytasks_5_",
        "all_bytasks_6_",
        "all_bytasks_7_",
        "all_bytasks_8_",
        "all_bytasks_9_",
        "all_bytasks_10_",
        "all_bytasks_11_",
        "all_bytasks_12_",
    ]
    if trainType == 16:
        for net in multiNetworkName2:
            bestFilename = os.path.join(
                load_dir,
                "{}_{}{}_best.pt".format(args.env_name, net, args.hidden_size))
            ac, _ = torch.load(bestFilename)
            policies.append(PPOPlayer(ac, device))
            print("Policy multi loaded: ", bestFilename)
    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, None, device, False,
                         normalizeOb=False, normalizeReturns=False,
                         max_episode_steps=args.num_steps,
                         makeEnvFunc=makeEnvFunction,
                         num_frame_stack=1,
                         info_keywords=(
                             'episode_steps',
                             'episode_reward',
                             'progress',
                             'servo',
                             'distToTarget',
                         ))
    #print(envs.observation_space.shape,envs.action_space)
    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          base_kwargs={
                              'recurrent': args.recurrent_policy,
                              'hidden_size': args.hidden_size,
                              'last_hidden_size': args.last_hidden_size,
                              'activation_layers_type': "Tanh"
                          })
    '''
    # if args.load_dir not None:
    load_path = os.path.join(args.load_dir, args.algo)
    actor_critic, ob_rms = torch.load(os.path.join(load_path, args.env_name + ".pt"))
    '''
    load_path = os.path.join(
        load_dir,
        "{}_{}{}_best.pt".format(args.env_name, filesNamesSuffix,
                                 args.hidden_size))
    #load_path = os.path.join(load_path, "{}_{}{}.pt".format(args.env_name,filesNamesSuffix,args.hidden_size))
    preptrained_path = "../Train/trained_models/QuadruppedWalk-v1/Train_QuadruppedWalk-v1_256.pth"
    loadPretrained = False
    if loadPretrained and os.path.isfile(preptrained_path):
        print("Load preptrained")
        abj = torch.load(preptrained_path)
        print(abj)
        print(actor_critic.base)
        # NOTE(review): load_state_dict() is called without a state dict —
        # this raises TypeError if loadPretrained is ever enabled; it likely
        # should be actor_critic.base.load_state_dict(abj).
        actor_critic.base.load_state_dict()
        actor_critic.base.eval()
    # Prefer the regular "best" checkpoint; fall back to the pretrain one.
    if os.path.isfile(load_path) and not loadPretrained:
        actor_critic, ob_rms = torch.load(load_path)
        actor_critic.eval()
        print("----NN loaded: ", load_path, " -----")
    else:
        bestFilename = os.path.join(
            load_dir,
            "{}_{}{}_best_pretrain.pt".format(args.env_name, filesNamesSuffix,
                                              args.hidden_size))
        if os.path.isfile(bestFilename):
            actor_critic, ob_rms = torch.load(bestFilename)
            actor_critic.eval()
            print("----NN loaded: ", bestFilename, " -----")
    maxReward = -10000.0
    maxSteps = 0
    minDistance = 50000.0
    actor_critic.to(device)
    agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch,
                     args.num_mini_batch, args.value_loss_coef,
                     args.entropy_coef, lr=args.lr, eps=args.eps,
                     max_grad_norm=args.max_grad_norm)
    if args.gail:
        assert len(envs.observation_space.shape) == 1
        discr = gail.Discriminator(
            envs.observation_space.shape[0] + envs.action_space.shape[0], 100,
            device)
        file_name = os.path.join(
            args.gail_experts_dir,
            "trajs_{}.pt".format(args.env_name.split('-')[0].lower()))
        gail_train_loader = torch.utils.data.DataLoader(
            gail.ExpertDataset(file_name, num_trajectories=4,
                               subsample_frequency=20),
            batch_size=args.gail_batch_size, shuffle=True, drop_last=True)
    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)
    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)
    # Rolling windows over the most recent episodes for logging.
    deque_maxLen = 10
    episode_rewards = deque(maxlen=deque_maxLen)
    episode_steps = deque(maxlen=deque_maxLen)
    episode_rewards_alive = deque(maxlen=deque_maxLen)
    episode_rewards_progress = deque(maxlen=deque_maxLen)
    episode_rewards_servo = deque(maxlen=deque_maxLen)
    episode_dist_to_target = deque(maxlen=deque_maxLen)
    '''
    load_path = os.path.join(args.load_dir, args.algo)
    load_path = os.path.join(load_path, args.env_name + ".pt")
    actor_critic, ob_rms = torch.load(load_path)
    actor_critic.to(device)
    actor_critic.eval()
    #ob_rms.eval()
    '''
    '''
    args.use_gym_monitor = 1
    args.monitor_dir = "./results/"
    monitor_path = os.path.join(args.monitor_dir, args.algo)
    monitor_path = os.path.join(monitor_path, args.env_name)
    args.
    if args.use_gym_monitor:
        env = wrappers.Monitor(
            env, monitor_path, video_callable=False, force=True)
    '''
    i_episode = 0
    save_path = os.path.join(args.save_dir, args.algo)
    try:
        os.makedirs(save_path)
    except OSError:
        pass
    # Optional supervised pretraining on recorded state/action samples;
    # saves a "_best_pretrain" checkpoint and exits.
    trainOnSamplesAndExit = False  #False
    if trainOnSamplesAndExit:
        import pickle
        print("---------------------------------------")
        print("Samples preload")
        data = pickle.load(open("./QuadruppedWalk-v1_UpDown.samples", "rb"))
        #data = pickle.load( open( "../QuadruppedWalk-v1_NN.samples", "rb" ) )
        learning_rate = 0.0001
        max_episodes = 100
        max_timesteps = 4000
        betas = (0.9, 0.999)
        log_interval = 1
        envSamples = SamplesEnv(data)
        envSamples.numSteps = max_timesteps
        # create a stochastic gradient descent optimizer
        optimizer = torch.optim.Adam(actor_critic.base.actor.parameters(),
                                     lr=learning_rate, betas=betas)
        #optimizer = optim.SGD(net.parameters(), lr=learning_rate, momentum=0.9)
        # create a loss function
        criterion = nn.MSELoss(reduction="sum")
        # run the main training loop
        for epoch in range(max_episodes):
            state = envSamples.reset()
            time_step = 0
            testReward = 0
            testSteps = 0
            loss_sum = 0
            loss_max = 0
            for t in range(max_timesteps):
                time_step += 1
                nn_state = torch.FloatTensor((state).reshape(1, -1)).to(device)
                optimizer.zero_grad()
                # Regress the deterministic action mean onto the recorded action.
                net_out = actor_critic.base.forwardActor(nn_state)
                net_out = actor_critic.dist.fc_mean(net_out)
                state, reward, done, info = envSamples.step(
                    net_out.detach().numpy())
                sim_action = envSamples.recordedActions
                sim_action_t = torch.FloatTensor([sim_action]).to(device)
                loss = criterion(net_out, sim_action_t)
                loss.backward()
                optimizer.step()
                loss_sum += loss.mean()
                loss_max = max(loss_max, loss.max())
                testReward += reward
                testSteps += 1
                if done:
                    if epoch % log_interval == 0:
                        #print(best_action_t*scaleActions-net_out*scaleActions)
                        if args.verboseLevel > 0:
                            print(
                                'Train Episode: {} t:{} Reward:{} Loss: mean:{:.6f} max: {:.6f}'
                                .format(epoch, t, testReward, loss_sum / t,
                                        loss_max))
                            print(info)
                    reward = 0
                    break
        bestFilename = os.path.join(
            save_path,
            "{}_{}{}_best_pretrain.pt".format(args.env_name, filesNamesSuffix,
                                              args.hidden_size))
        torch.save([
            actor_critic,
            getattr(utils.get_vec_normalize(envs), 'ob_rms', None)
        ], bestFilename)
        exit(0)
    # Skip the first "best" save so a freshly loaded model is re-evaluated
    # before overwriting the checkpoint.
    skipWriteBest = True
    if args.verboseLevel > 0:
        printNetwork(actor_critic.base.actor)
    lock(actor_critic, first=False, last=False)
    #if trainType==9:
    #allowMutate = False
    #lock(actor_critic,first=True,last=False)
    #mutate(actor_critic,power=0.00,powerLast=0.3)
    if args.verboseLevel > 0:
        printNetwork(actor_critic.base.actor)
    #from torchsummary import summary
    #summary(actor_critic.base.actor, (1, 48, 64))
    start = time.time()
    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes
    episodeBucketIndex = 0
    maxReward = -10000000000
    numEval = 10
    if realEval:
        # Dedicated (non-vectorized) env for deterministic evaluation;
        # establish the baseline reward of the loaded model.
        envEval = makeEnvFunction(args.env_name)
        if hasattr(envEval.env, "tasks") and len(envEval.env.tasks):
            numEval = max(numEval, len(envEval.env.tasks))
        maxReward = evaluate_policy(envEval, actor_critic, numEval * 2,
                                    render=False, device=device,
                                    verbose=args.verboseLevel)
        print("MaxReward on start", maxReward)
    noMaxRewardCount = 0
    updateIndex = 0
    for j in range(num_updates):
        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            utils.update_linear_schedule(
                agent.optimizer, j, num_updates,
                agent.optimizer.lr if args.algo == "acktr" else args.lr)
        episode_r = 0.0
        stepsDone = 0
        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step],
                    rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])
            # Obser reward and next obs
            obs, reward, done, infos = envs.step(action)
            #envs.venv.venv.venv.envs[0].render()
            if args.verboseLevel > 0:
                index = 0
                for d in done:
                    if d:
                        print(infos[index], flush=True)
                    index += 1
            episodeDone = False
            '''
            index = 0
            for d in done:
                if d:
                    print("")
                    print(infos[index])
                index+=1
            '''
            # Harvest per-episode stats the env reports through `info`.
            for info in infos:
                if 'reward' in info.keys():
                    episodeDone = True
                    i_episode += 1
                    episode_rewards.append(info['reward'])
                    writer.add_scalar('reward/episode', info['reward'],
                                      i_episode)
                    #print("E:",i_episode," T:",info['episode_steps'], " R:", info['episode_reward'], " D:",info['distToTarget'])
                if 'steps' in info.keys():
                    episode_steps.append(info['steps'])
                    writer.add_scalar('reward/steps', info['steps'], i_episode)
                if 'alive' in info.keys():
                    episode_rewards_alive.append(info['alive'])
                    writer.add_scalar('reward/alive', info['alive'], i_episode)
                if 'prog' in info.keys():
                    episode_rewards_progress.append(info['prog'])
                    writer.add_scalar('reward/progress', info['prog'],
                                      i_episode)
                if 'servo' in info.keys():
                    episode_rewards_servo.append(info['servo'])
                    writer.add_scalar('reward/servo', info['servo'], i_episode)
                if 'd2T' in info.keys():
                    episode_dist_to_target.append(info['d2T'])
                    writer.add_scalar('reward/distToTarget', info['d2T'],
                                      i_episode)
                # Forward any other reported metric verbatim to TensorBoard.
                for val in info.keys():
                    if val not in [
                            "reward", "steps", "alive", "prog", "servo",
                            "d2T", 'epos', 't'
                    ]:
                        writer.add_scalar('reward/' + val, info[val],
                                          i_episode)
            #if episodeDone and i_episode%10==0:
            #    print(i_episode,"({:.1f}/{}/{:.2f}) ".format(episode_rewards[-1],episode_steps[-1],episode_dist_to_target[-1]),end='',flush=True)
            if episodeDone:
                episodeBucketIndex += 1
                if args.verboseLevel > 0:
                    print("Mean:", Fore.WHITE, np.mean(episode_rewards),
                          Style.RESET_ALL, " Median:", Fore.WHITE,
                          np.median(episode_rewards), Style.RESET_ALL,
                          " max reward:", maxReward)
                #'''len(episode_rewards) and np.mean(episode_rewards)>maxReward and'''
                if realEval:
                    # Periodically run a deterministic evaluation and keep the
                    # checkpoint with the best eval reward.
                    if episodeBucketIndex % args.log_interval == 0 and episodeBucketIndex > args.log_interval:
                        print("Step:",
                              (j + 1) * args.num_processes * args.num_steps)
                        if skipWriteBest == False:
                            evalReward = evaluate_policy(
                                envEval, actor_critic, numEval,
                                device=device, verbose=args.verboseLevel)
                            writer.add_scalar('reward/eval', evalReward,
                                              i_episode)
                            if evalReward > maxReward:
                                maxReward = evalReward
                                #maxReward = np.mean(episode_rewards)
                                bestFilename = os.path.join(
                                    save_path,
                                    "{}_{}{}_best.pt".format(
                                        args.env_name, filesNamesSuffix,
                                        args.hidden_size))
                                print(
                                    "Writing best reward:", Fore.GREEN,
                                    "({:.1f}/{:.1f}/{:.1f}/{}/{:.2f}) ".format(
                                        maxReward, np.mean(episode_rewards),
                                        np.median(episode_rewards),
                                        np.mean(episode_steps),
                                        episode_dist_to_target[-1]),
                                    Style.RESET_ALL, bestFilename)
                                torch.save([
                                    actor_critic,
                                    getattr(utils.get_vec_normalize(envs),
                                            'ob_rms', None)
                                ], bestFilename)
                                noMaxRewardCount = 0
                            else:
                                noMaxRewardCount += 1
                                # Optional escape from plateaus by perturbing
                                # weights with increasing aggressiveness.
                                if allowMutate:
                                    if noMaxRewardCount == 5:
                                        print("Mutation low last layer")
                                        lock(actor_critic, first=False,
                                             last=False)
                                        mutate(actor_critic, power=0.00,
                                               powerLast=0.01)
                                    if noMaxRewardCount == 8:
                                        print("Mutation low non last")
                                        lock(actor_critic, first=False,
                                             last=False)
                                        mutate(actor_critic, power=0.01,
                                               powerLast=0.0)
                                    if noMaxRewardCount == 11:
                                        print("Mutation low all")
                                        lock(actor_critic, first=False,
                                             last=False)
                                        mutate(actor_critic, power=0.02,
                                               powerLast=0.2)
                                    if noMaxRewardCount == 14:
                                        print("Mutation hi all")
                                        lock(actor_critic, first=False,
                                             last=False)
                                        mutate(actor_critic, power=0.03,
                                               powerLast=0.03)
                                        noMaxRewardCount = 0
                                if noMaxRewardCount == args.nobest_exit:
                                    exit(0)
                        else:
                            skipWriteBest = False
                else:
                    # No separate eval env: track best by mean training reward.
                    if len(episode_rewards) and np.mean(
                            episode_rewards
                    ) > maxReward and j > args.log_interval:
                        if skipWriteBest == False:
                            maxReward = np.mean(episode_rewards)
                            writer.add_scalar('reward/maxReward', maxReward,
                                              i_episode)
                            bestFilename = os.path.join(
                                save_path,
                                "{}_{}{}_best.pt".format(
                                    args.env_name, filesNamesSuffix,
                                    args.hidden_size))
                            if len(episode_dist_to_target):
                                print(
                                    "Writing best reward:", Fore.GREEN,
                                    "({:.1f}/{:.1f}/{}/{:.2f}) ".format(
                                        np.mean(episode_rewards),
                                        np.median(episode_rewards),
                                        np.mean(episode_steps),
                                        episode_dist_to_target[-1]),
                                    Style.RESET_ALL, bestFilename)
                            else:
                                print(
                                    "Writing best reward:", Fore.GREEN,
                                    "({:.1f}/{:.1f}/{}) ".format(
                                        np.mean(episode_rewards),
                                        np.median(episode_rewards),
                                        np.mean(episode_steps)),
                                    Style.RESET_ALL, bestFilename)
                            torch.save([
                                actor_critic,
                                getattr(utils.get_vec_normalize(envs),
                                        'ob_rms', None)
                            ], bestFilename)
                        else:
                            skipWriteBest = False
            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            # Apply the reward-shaping scale before storing the transition.
            shaped_reward = reward_shaper(reward)
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, shaped_reward, masks,
                            bad_masks)
        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()
        if args.gail:
            if j >= 10:
                envs.venv.eval()
            gail_epoch = args.gail_epoch
            if j < 10:
                gail_epoch = 100  # Warm up
            for _ in range(gail_epoch):
                discr.update(gail_train_loader, rollouts,
                             utils.get_vec_normalize(envs)._obfilt)
            # Replace env rewards with discriminator rewards for the update.
            for step in range(args.num_steps):
                rollouts.rewards[step] = discr.predict_reward(
                    rollouts.obs[step], rollouts.actions[step], args.gamma,
                    rollouts.masks[step])
        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.gae_lambda,
                                 args.use_proper_time_limits)
        value_loss, action_loss, dist_entropy = agent.update(rollouts)
        writer.add_scalar('reward/value_loss', value_loss, updateIndex)
        writer.add_scalar('reward/action_loss', action_loss, updateIndex)
        writer.add_scalar('reward/dist_entropy', dist_entropy, updateIndex)
        updateIndex += 1
        rollouts.after_update()
        # save for every interval-th episode or for the last epoch
        # NOTE(review): the periodic save body is entirely commented out —
        # only the "best" checkpoints above are actually written.
        if (j % args.save_interval == 0
                or j == num_updates - 1) and args.save_dir != "":
            '''
            fileName = os.path.join(save_path, "{}_{}{}.pt".format(args.env_name,filesNamesSuffix,args.hidden_size))
            torch.save([
                actor_critic,
                getattr(utils.get_vec_normalize(envs), 'ob_rms', None)
            ], fileName)
            print("Saved:",fileName, " cur avg rewards:",np.mean(episode_rewards))
            fileName = os.path.join(save_path, "{}_{}{}_actor.pt".format(args.env_name,filesNamesSuffix,args.hidden_size))
            torch.save(actor_critic.state_dict, fileName)
            print("Saved:",fileName)
            '''
        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            end = time.time()
            if args.verboseLevel > 0:
                print("")
                print("Updates {}, num timesteps {}, FPS {}".format(
                    j, total_num_steps, int(total_num_steps / (end - start))))
                print(" Last {} training episodes:".format(
                    len(episode_rewards)))
                print(
                    " reward mean/median {:.1f}/{:.1f} min/max {:.1f}/{:.1f}".
                    format(np.mean(episode_rewards),
                           np.median(episode_rewards),
                           np.min(episode_rewards), np.max(episode_rewards)))
                print(" steps mean/median {:.1f}/{:.1f} min/max {:.1f}/{:.1f}".
                      format(np.mean(episode_steps), np.median(episode_steps),
                             np.min(episode_steps), np.max(episode_steps)))
                if len(episode_rewards_alive):
                    print(
                        " alive mean/median {:.1f}/{:.1f} min/max {:.1f}/{:.1f}"
                        .format(np.mean(episode_rewards_alive),
                                np.median(episode_rewards_alive),
                                np.min(episode_rewards_alive),
                                np.max(episode_rewards_alive)))
                if len(episode_rewards_progress):
                    print(
                        " progress mean/median {:.1f}/{:.1f} min/max {:.1f}/{:.1f}"
                        .format(np.mean(episode_rewards_progress),
                                np.median(episode_rewards_progress),
                                np.min(episode_rewards_progress),
                                np.max(episode_rewards_progress)))
                if len(episode_rewards_servo):
                    print(
                        " servo mean/median {:.1f}/{:.1f} min/max {:.1f}/{:.1f}"
                        .format(np.mean(episode_rewards_servo),
                                np.median(episode_rewards_servo),
                                np.min(episode_rewards_servo),
                                np.max(episode_rewards_servo)))
                if len(episode_dist_to_target):
                    print(
                        " dist to target mean/median {:.3f}/{:.3f} min/max {:.3f}/{:.3f}"
                        .format(np.mean(episode_dist_to_target),
                                np.median(episode_dist_to_target),
                                np.min(episode_dist_to_target),
                                np.max(episode_dist_to_target)))
                print(
                    " Reward/Steps {:.3f} Progress/Steps: {:.3f} entropy {:.1f} value_loss {:.5f} action_loss {:.5f}\n"
                    .format(
                        np.mean(episode_rewards) / np.mean(episode_steps),
                        (0 if len(episode_rewards_progress) == 0 else
                         np.mean(episode_rewards_progress) /
                         np.mean(episode_steps)), dist_entropy, value_loss,
                        action_loss))
def train(train_states, run_dir, num_env_steps, eval_env_steps, writer,
          writer_name, args, init_model=None):
    """Train a policy with PPO/A2C/ACKTR on vectorized environments.

    Args:
        train_states: environment states/levels passed to make_vec_envs.
        run_dir: run directory; videos and checkpoints are written under it.
        num_env_steps: total number of environment steps to train for.
        eval_env_steps: environment steps per evaluation run (<= 0 disables
            evaluation entirely).
        writer: tensorboard SummaryWriter used for all scalar logging.
        writer_name: tag suffix appended to the scalar names.
        args: hyperparameter namespace (algo, lr, intervals, device, ...).
        init_model: optional (actor_critic, env_step, model_name) tuple to
            resume training from a checkpoint.

    Returns:
        ((actor_critic, env_step, run_name), eval_score, eval_dict) where the
        eval values are None when eval_env_steps <= 0.
    """
    envs = make_vec_envs(train_states, args.seed, args.num_processes,
                         args.gamma, 'cpu', 'train', args)

    if init_model:
        # Resume from checkpoint: keep the policy and its step counter.
        actor_critic, env_step, model_name = init_model
        obs_space = actor_critic.obs_space
        obs_process = actor_critic.obs_process
        obs_module = actor_critic.obs_module
        print(f" [load] Loaded model {model_name} at step {env_step}")
    else:
        obs_space = envs.observation_space
        actor_critic = Policy(obs_space, args.obs_process, args.obs_module,
                              envs.action_space,
                              base_kwargs={'recurrent': args.recurrent_policy})
        env_step = 0
    actor_critic.to(args.device)

    run_name = run_dir.replace('/', '_')
    vid_save_dir = f"{run_dir}/videos/"
    os.makedirs(vid_save_dir, exist_ok=True)
    ckpt_save_dir = f"{run_dir}/ckpts/"
    os.makedirs(ckpt_save_dir, exist_ok=True)

    if args.algo == 'ppo':
        agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch,
                         args.num_mini_batch, args.value_loss_coef,
                         args.entropy_coef, args.device, lr=args.lr,
                         eps=args.eps, max_grad_norm=args.max_grad_norm)
    elif args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef,
                               args.entropy_coef, lr=args.lr, eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm, acktr=False)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef,
                               args.entropy_coef, lr=args.lr, eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm, acktr=True)
    else:
        raise NotImplementedError

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    # NOTE(review): only the first slot (obs[k][0]) of each observation key
    # is copied into the rollout buffer — confirm RolloutStorage's dict
    # layout; rollouts.obs[k][0] may be intended to hold all processes.
    for k in rollouts.obs.keys():
        rollouts.obs[k][0].copy_(obs[k][0])

    # NOTE(review): nothing in this loop appends to episode_rewards, so the
    # per-episode console log below never fires — confirm whether episode
    # rewards should be harvested from `infos`.
    episode_rewards = deque(maxlen=10)
    num_updates = num_env_steps // args.num_steps // args.num_processes
    batch_size = args.num_steps * args.num_processes  # env steps per update
    start = time.time()

    while env_step < num_env_steps:
        s = time.time()
        # Index of the current update. BUGFIX: the original referenced an
        # undefined name `j` here, raising NameError whenever
        # args.use_linear_lr_decay was enabled.
        update_idx = env_step // batch_size
        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            utils.update_linear_schedule(
                agent.optimizer, update_idx, num_updates,
                agent.optimizer.lr if args.algo == "acktr" else args.lr)

        for step in range(args.num_steps):
            # Sample actions. Storage lives on CPU; tensors are moved to the
            # model's device only for the forward pass, then brought back.
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states, _ = actor_critic.act(
                    {
                        k: rollouts.obs[k][step].float().to(args.device)
                        for k in rollouts.obs.keys()
                    },
                    rollouts.recurrent_hidden_states[step].to(args.device),
                    rollouts.masks[step].to(args.device))
            value = value.cpu()
            action = action.cpu()
            action_log_prob = action_log_prob.cpu()
            recurrent_hidden_states = recurrent_hidden_states.cpu()

            # Observe reward and next obs
            obs, reward, dones, infos = envs.step(action)
            for done, info in zip(dones, infos):
                env_state = info['env_state'][1]
                if done:
                    # Per-episode progress metrics, tagged by env state.
                    writer.add_scalar(f'train_episode_x/{env_state}',
                                      info['max_x'], env_step)
                    writer.add_scalar(f'train_episode_%/{env_state}',
                                      info['max_x'] / info['lvl_max_x'] * 100,
                                      env_step)
                    writer.add_scalar(f'train_episode_r/{env_state}',
                                      info['sum_r'], env_step)

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done else [1.0]
                                       for done in dones])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks)

        # Bootstrap value for the state following the last rollout step.
        with torch.no_grad():
            next_value = actor_critic.get_value(
                {
                    k: rollouts.obs[k][-1].float().to(args.device)
                    for k in rollouts.obs.keys()
                },
                rollouts.recurrent_hidden_states[-1].to(args.device),
                rollouts.masks[-1].to(args.device)).detach().cpu()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.gae_lambda, args.use_proper_time_limits)
        value_loss, action_loss, dist_entropy = agent.update(rollouts)
        rollouts.after_update()

        env_step += batch_size
        fps = batch_size / (time.time() - s)

        # Global gradient L2 norm over all policy parameters.
        total_norm = 0
        for p in filter(lambda p: p.grad is not None,
                        actor_critic.parameters()):
            param_norm = p.grad.data.norm(2)
            total_norm += param_norm.item()**2
        total_norm = total_norm**(1. / 2)

        # Per-modality gradient norms for the video/audio encoder submodules.
        obs_norm = {}
        for obs_name in args.obs_keys:
            t_norm = 0
            if obs_name == 'video':
                md = actor_critic.base.video_module
            elif obs_name == 'audio':
                md = actor_critic.base.audio_module
            else:
                raise NotImplementedError
            for p in filter(lambda p: p.grad is not None, md.parameters()):
                param_norm = p.grad.data.norm(2)
                t_norm += param_norm.item()**2
            obs_norm[obs_name] = t_norm**(1. / 2)

        # Interval checks compare integer buckets before/after this update so
        # each interval fires exactly once even though env_step advances
        # batch_size steps at a time.
        prev_env_step = max(0, env_step + 1 - batch_size)

        # write training metrics for this batch, usually takes 0.003s
        if (env_step + 1) // args.write_interval > prev_env_step // args.write_interval:
            writer.add_scalar(f'grad_norm/{writer_name}', total_norm,
                              env_step)
            writer.add_scalar(f'fps/{writer_name}', fps, env_step)
            writer.add_scalar(f'value_loss/{writer_name}',
                              value_loss / batch_size, env_step)
            writer.add_scalar(f'action_loss/{writer_name}',
                              action_loss / batch_size, env_step)
            writer.add_scalar(f'dist_entropy/{writer_name}',
                              dist_entropy / batch_size, env_step)
            writer.add_scalar(f'cpu_usage/{writer_name}',
                              psutil.cpu_percent(), env_step)
            writer.add_scalar(f'cpu_mem/{writer_name}',
                              psutil.virtual_memory()._asdict()['percent'],
                              env_step)
            for obs_name in args.obs_keys:
                writer.add_scalar(f'grad_norm_{obs_name}/{writer_name}',
                                  obs_norm[obs_name], env_step)

        # print log to console
        if (env_step + 1) // args.log_interval > prev_env_step // args.log_interval:
            end = time.time()
            print(" [log] Env step {} of {}: {:.1f}s, {:.1f}fps".format(
                env_step + 1, num_env_steps, end - start, fps))
            if len(episode_rewards) > 0:
                print(
                    " Last {} episodes: mean/med reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}"
                    .format(len(episode_rewards), np.mean(episode_rewards),
                            np.median(episode_rewards),
                            np.min(episode_rewards),
                            np.max(episode_rewards)))
            print(
                " dist_entropy {:.5f}, value_loss {:.6f}, action_loss {:.6f}, grad_norm {:.6f}"
                .format(dist_entropy, value_loss, action_loss, total_norm))
            start = time.time()

        # save model to ckpt
        if (env_step + 1) // args.save_interval > prev_env_step // args.save_interval:
            torch.save([
                actor_critic,
                env_step,
                run_name,
            ], os.path.join(ckpt_save_dir, f"{run_name}-{env_step}.pt"))
            print(f" [save] Saved model at step {env_step+1}.")

        # save model to ckpt and run evaluation if eval_interval and not
        # final iteration in training loop
        if ((env_step + 1) // args.eval_interval >
                prev_env_step // args.eval_interval) and \
                env_step < num_env_steps and eval_env_steps > 0:
            torch.save([
                actor_critic,
                env_step,
                run_name,
            ], os.path.join(ckpt_save_dir, f"{run_name}-{env_step}.pt"))
            print(f" [save] Saved model at step {env_step+1}.")
            envs.close()
            del envs  # close does not actually get rid of envs, need to del
            actor_critic.eval()
            eval_score, e_dict = evaluate(train_states, actor_critic,
                                          eval_env_steps, env_step, writer,
                                          vid_save_dir, args.vid_tb_steps,
                                          args.vid_file_steps,
                                          args.obs_viz_layer, args)
            print(f" [eval] Evaluation score: {eval_score}")
            writer.add_scalar('eval_score', eval_score, env_step)
            actor_critic.train()
            # Rebuild the training envs that evaluation tore down.
            envs = make_vec_envs(train_states, args.seed, args.num_processes,
                                 args.gamma, 'cpu', 'train', args)
            obs = envs.reset()
            # TODO(review): verify that re-seeding only slot 0 of the
            # observation buffer (and leaving recurrent hidden states as-is)
            # is correct after an eval restart.
            for k in rollouts.obs.keys():
                rollouts.obs[k][0].copy_(obs[k][0])

    # final model save
    final_model_path = os.path.join(ckpt_save_dir,
                                    f"{run_name}-{env_step}.pt")
    torch.save([
        actor_critic,
        env_step,
        run_name,
    ], final_model_path)
    print(
        f" [save] Final model saved at step {env_step+1} to {final_model_path}"
    )

    # final model eval
    envs.close()
    del envs
    eval_score = None
    eval_dict = None
    if eval_env_steps > 0:
        eval_score, eval_dict = evaluate(train_states, actor_critic,
                                         eval_env_steps, env_step, writer,
                                         vid_save_dir, args.vid_tb_steps,
                                         args.vid_file_steps,
                                         args.obs_viz_layer, args)
        print(f" [eval] Final model evaluation score: {eval_score:.3f}")
    return (actor_critic, env_step, run_name), eval_score, eval_dict
def pg(envs, printout, use_gail=False):
    """Run PPO training (optionally with GAIL rewards) on pre-built envs.

    Relies on module-level globals: `ppo_args` (hyperparameters), `device`,
    and `env_name` (used in the checkpoint filename) — presumably defined at
    file scope; TODO confirm.

    Args:
        envs: vectorized environments, already constructed by the caller.
        printout: callable used to emit the periodic progress string.
        use_gail: when True, replace environment rewards with rewards
            predicted by a GAIL discriminator trained on expert data.
    """
    if use_gail:
        # The GAIL discriminator only handles flat (1-D) observations.
        assert len(envs.observation_space.shape) == 1
        discr = gail_util.Discriminator(
            envs.observation_space.shape[0] + envs.action_space.shape[0], 100,
            device)
        # NOTE(review): hard-coded absolute path to the expert trajectories.
        file_name = os.path.join(
            '/home/paperspace/repos/pytorch-a2c-ppo-acktr-gail/gail_experts',
            "trajs_reacher.pt")
        gail_train_loader = torch.utils.data.DataLoader(
            gail_util.ExpertDataset(file_name,
                                    num_trajectories=4,
                                    subsample_step=4),
            batch_size=ppo_args.gail_batchsize,
            shuffle=True,
            drop_last=True)
    actor_critic = Policy(envs.observation_space.shape, envs.action_space)
    actor_critic.to(device)
    agent = algo.PPO(actor_critic=actor_critic,
                     clip_param=ppo_args.clip_param,
                     ppo_epoch=ppo_args.ppo_epoch,
                     num_mini_batch=ppo_args.num_mb,
                     value_loss_coef=ppo_args.vloss_coef,
                     entropy_coef=ppo_args.entropy_coef,
                     lr=ppo_args.lr,
                     eps=ppo_args.adam_eps,
                     max_grad_norm=.5)
    rollouts = storage.RolloutStorage(
        ppo_args.num_steps, ppo_args.num_processes,
        envs.observation_space.shape, envs.action_space,
        actor_critic.recurrent_hidden_state_size)
    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)
    num_updates = int(
        ppo_args.total_steps) // ppo_args.num_steps // ppo_args.num_processes
    episode_rewards = deque(maxlen=10)
    # NOTE(review): `scores`/`final_scores` are allocated but never written
    # to in this function.
    scores = np.zeros((ppo_args.num_envs, 1))
    final_scores = np.zeros((ppo_args.num_envs, 1))
    start = timer()
    for j in range(num_updates):
        # Anneal the learning rate linearly over training.
        utils.update_linear_schedule(agent.optimizer, j, num_updates,
                                     ppo_args.lr)
        for step in range(ppo_args.num_steps):
            # Sample an action from the current policy without grads.
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step],
                    rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])
            obs, reward, done, infos = envs.step(action)
            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])
            # A zero mask resets the recurrent state at episode boundaries.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            # No time-limit handling: every transition is treated as genuine.
            bad_masks = torch.ones_like(masks)
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks,
                            bad_masks)
        if use_gail:
            if j >= 10:
                envs.venv.eval()
            gail_epoch = ppo_args.gail_epoch
            if j < 10:
                gail_epoch = 100  # Warm up
            for _ in range(gail_epoch):
                discr.update(gail_train_loader, rollouts,
                             utils.get_vec_normalize(envs)._obfilt)
            # Overwrite the stored env rewards with discriminator rewards.
            for step in range(ppo_args.num_steps):
                rollouts.rewards[step] = discr.predict_reward(
                    rollouts.obs[step], rollouts.actions[step],
                    ppo_args.gamma, rollouts.masks[step])
        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()
        rollouts.compute_returns(next_value, ppo_args.use_gae,
                                 ppo_args.gamma, ppo_args.gae_lambda,
                                 ppo_args.time_limits)
        value_loss, action_loss, dist_entropy = agent.update(rollouts)
        rollouts.after_update()
        save_path = 'saved_models'
        save_interval = 100
        # save for every interval-th update or for the last epoch
        if (j % save_interval == 0 or j == num_updates - 1):
            torch.save([
                actor_critic,
                getattr(utils.get_vec_normalize(envs), 'ob_rms', None)
            ], os.path.join(save_path, "ppo" + env_name + ".pt"))
        log_interval = 10
        if j % log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j +
                               1) * ppo_args.num_processes * ppo_args.num_steps
            end = timer()
            printout(
                f'Updates {j}, num timesteps {total_num_steps}, FPS { int(total_num_steps / (end - start))} \n '
                f'Last {len(episode_rewards)} training episodes: mean/median reward {np.mean(episode_rewards):.1f}/{ np.median(episode_rewards):.1f}, '
                f'min/max reward {np.min(episode_rewards):.1f}/{np.max(episode_rewards):.1f}'
            )
def main():
    """Train a policy (A2C/PPO/ACKTR, optional GAIL) while recording
    expert-style trajectories to HDF5 files under ./data/<env_name>.

    Trajectories are only recorded after 40% of the updates have elapsed and
    are flushed to a new .h5 file every 100 recorded update-batches when
    --save-expert is set.
    """
    all_episode_rewards = []  ### logging (added 6/29)
    all_temp_rewards = []  ### logging (added 6/29)
    args = get_args()
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True
    log_dir = os.path.expanduser(args.log_dir)
    eval_log_dir = log_dir + "_eval"
    utils.cleanup_log_dir(log_dir)
    utils.cleanup_log_dir(eval_log_dir)
    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")
    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, device, False)
    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)
    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)
    if args.gail:
        # GAIL discriminator requires flat (1-D) observations.
        assert len(envs.observation_space.shape) == 1
        discr = gail.Discriminator(
            envs.observation_space.shape[0] + envs.action_space.shape[0],
            100, device)
        file_name = os.path.join(
            args.gail_experts_dir,
            "trajs_{}.pt".format(args.env_name.split('-')[0].lower()))
        gail_train_loader = torch.utils.data.DataLoader(
            gail.ExpertDataset(file_name,
                               num_trajectories=4,
                               subsample_frequency=20),
            batch_size=args.gail_batch_size,
            shuffle=True,
            drop_last=True)
    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape,
                              envs.action_space,
                              actor_critic.recurrent_hidden_state_size)
    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)
    episode_rewards = deque(maxlen=10)
    start = time.time()
    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes
    print('num_updates ', num_updates)
    print('num_steps ', args.num_steps)
    # Rolling HDF5 output for the recorded trajectories.
    count = 0
    h5_path = './data/' + args.env_name
    if not os.path.exists(h5_path):
        os.makedirs(h5_path)
    h5_filename = h5_path + '/trajs_' + args.env_name + '_%05d.h5' % (count)
    data = {}
    data['states'] = []
    data['actions'] = []
    data['rewards'] = []
    data['done'] = []
    data['lengths'] = []
    episode_step = 0
    for j in range(num_updates):  ### num-steps
        # Per-update trajectory buffers (one entry per env step).
        temp_states = []
        temp_actions = []
        temp_rewards = []
        temp_done = []
        temp_lenthgs = []  # NOTE(review): unused; typo of temp_lengths?
        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            utils.update_linear_schedule(
                agent.optimizer, j, num_updates,
                agent.optimizer.lr if args.algo == "acktr" else args.lr)
        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step],
                    rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])
            if j == 0 and step == 0:
                # One-off type/shape debug dump on the very first step.
                print('obs ', type(rollouts.obs[step]),
                      rollouts.obs[step].shape)
                print('hidden_states ',
                      type(rollouts.recurrent_hidden_states[step]),
                      rollouts.recurrent_hidden_states[step].shape)
                print('action ', type(action), action.shape)
                print('action prob ', type(action_log_prob),
                      action_log_prob.shape)
                print('-' * 20)
            # Obser reward and next obs
            obs, reward, done, infos = envs.step(action)
            #print(infos)
            #print(reward)
            temp_states += [np.array(rollouts.obs[step].cpu())]
            temp_actions += [np.array(action.cpu())]
            #temp_rewards += [np.array(reward.cpu())]
            temp_rewards += [
                np.array([infos[0]['myrewards']])
            ]  ### for halfcheetah: cannot use `reward` directly !! 6/29
            temp_done += [np.array(done)]
            if j == 0 and step == 0:
                print('obs ', type(obs), obs.shape)
                print('reward ', type(reward), reward.shape)
                print('done ', type(done), done.shape)
                print('infos ', len(infos))
                for k, v in infos[0].items():
                    print(k, v.shape)
                print()
            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])
                    all_episode_rewards += [info['episode']['r']
                                            ]  ### logging (added 6/29)
            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks,
                            bad_masks)
        temp_lengths = len(temp_states)
        temp_states = np.concatenate(temp_states)
        temp_actions = np.concatenate(temp_actions)
        temp_rewards = np.concatenate(temp_rewards)
        temp_done = np.concatenate(temp_done)
        #print('temp_lengths',temp_lengths)
        #print('temp_states', temp_states.shape)
        #print('temp_actions', temp_actions.shape)
        #print('temp_rewards', temp_rewards.shape)
        # Only record trajectories from the last 60% of training, once the
        # policy is closer to expert quality.
        if j > int(0.4 * num_updates):
            data['states'] += [temp_states]
            data['actions'] += [temp_actions]
            data['rewards'] += [temp_rewards]
            data['lengths'] += [temp_lengths]
            data['done'] += [temp_done]
            #print('temp_lengths',data['lengths'].shape)
            #print('temp_states', data['states'].shape)
            #print('temp_actions', data['actions'].shape)
            #print('temp_rewards', data['rewards'].shape)
        # Flush a full chunk of 100 recorded batches into a fresh .h5 file.
        if args.save_expert and len(data['states']) >= 100:
            with h5py.File(h5_filename, 'w') as f:
                f['states'] = np.array(data['states'])
                f['actions'] = np.array(data['actions'])
                f['rewards'] = np.array(data['rewards'])
                f['done'] = np.array(data['done'])
                f['lengths'] = np.array(data['lengths'])
                #print('f_lengths',f['lengths'].shape)
                #print('f_states', f['states'].shape)
                #print('f_actions', f['actions'].shape)
                #print('f_rewards', f['rewards'].shape)
            count += 1
            h5_filename = h5_path + '/trajs_' + args.env_name + '_%05d.h5' % (
                count)
            data['states'] = []
            data['actions'] = []
            data['rewards'] = []
            data['done'] = []
            data['lengths'] = []
        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()
        if args.gail:
            if j >= 10:
                envs.venv.eval()
            gail_epoch = args.gail_epoch
            if j < 10:
                gail_epoch = 100  # Warm up
            for _ in range(gail_epoch):
                discr.update(gail_train_loader, rollouts,
                             utils.get_vec_normalize(envs)._obfilt)
            for step in range(args.num_steps):
                rollouts.rewards[step] = discr.predict_reward(
                    rollouts.obs[step], rollouts.actions[step], args.gamma,
                    rollouts.masks[step])
        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.gae_lambda,
                                 args.use_proper_time_limits)
        value_loss, action_loss, dist_entropy = agent.update(rollouts)
        rollouts.after_update()
        # save for every interval-th episode or for the last epoch
        if (j % args.save_interval == 0
                or j == num_updates - 1) and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass
            torch.save([
                actor_critic,
                getattr(utils.get_vec_normalize(envs), 'ob_rms', None)
            ], os.path.join(save_path,
                            args.env_name + "_%d.pt" % (args.seed)))
        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            end = time.time()
            # NOTE(review): the format string has fewer placeholders than
            # arguments; the trailing dist_entropy/value_loss/action_loss
            # are silently ignored by str.format.
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist_entropy, value_loss,
                        action_loss))
            #np.save(os.path.join(save_path, args.env_name+"_%d"%(args.seed)), all_episode_rewards) ### save the record 6/29
            #print(temp_rewards)
            # NOTE(review): `save_path` is only bound after the save branch
            # above runs at least once (true at j == 0 when save_dir != "").
            print("temp rewards size", temp_rewards.shape, "mean",
                  np.mean(temp_rewards), "min", np.min(temp_rewards), "max",
                  np.max(temp_rewards))
            all_temp_rewards += [temp_rewards]
            np.savez(os.path.join(save_path,
                                  args.env_name + "_%d" % (args.seed)),
                     episode=all_episode_rewards,
                     timestep=all_temp_rewards)
        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            ob_rms = utils.get_vec_normalize(envs).ob_rms
            evaluate(actor_critic, ob_rms, args.env_name, args.seed,
                     args.num_processes, eval_log_dir, device)
    # NOTE(review): the line below opens a triple-quoted (commented-out)
    # block whose closing quotes are not present in this chunk of the file.
    '''data['states'] = np.array(data['states'])
def main():
    """Benchmarking entry point: times each phase of PPO training with
    `chrono` context managers.

    Relies on module-level globals `args`, `chrono`, `Policy`,
    `ProximalPolicyOptimization` and `generate_rollouts` — presumably
    defined elsewhere in this file; TODO confirm.
    """
    # Function-local imports, kept exactly as in the original script.
    import copy
    import glob
    import os
    import time
    from collections import deque
    import gym
    import numpy as np
    import torch
    import torch.nn as nn
    import torch.nn.functional as F
    import torch.optim as optim
    from a2c_ppo_acktr import algo
    from a2c_ppo_acktr.envs import make_vec_envs
    from a2c_ppo_acktr.storage import RolloutStorage
    from a2c_ppo_acktr.utils import get_vec_normalize, update_linear_schedule
    from a2c_ppo_acktr.visualize import visdom_plot
    device = torch.device('cuda')
    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, args.add_timestep, device,
                         False)
    print(envs.observation_space.shape)
    actor_critic = Policy(envs.observation_space.shape, envs.action_space)
    actor_critic.to(device)
    agent = ProximalPolicyOptimization(actor_critic,
                                       args.clip_param,
                                       args.ppo_epoch,
                                       args.num_mini_batch,
                                       args.value_loss_coef,
                                       args.entropy_coef,
                                       lr=args.lr,
                                       eps=args.eps,
                                       max_grad_norm=args.max_grad_norm,
                                       chrono=chrono)
    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape,
                              envs.action_space,
                              actor_critic.recurrent_hidden_state_size)
    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)
    episode_rewards = deque(maxlen=10)
    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes
    start = time.time()
    # Benchmark structure: args.repeat timed groups of args.number batches.
    for j in range(args.repeat):
        with chrono.time('train', verbose=True) as t:
            for n in range(args.number):
                with chrono.time('one_batch', verbose=True):
                    if args.use_linear_lr_decay:
                        # decrease learning rate linearly
                        if args.algo == "acktr":
                            # use optimizer's learning rate since it's hard-coded in kfac.py
                            update_linear_schedule(agent.optimizer, j,
                                                   num_updates,
                                                   agent.optimizer.lr)
                        else:
                            update_linear_schedule(agent.optimizer, j,
                                                   num_updates, args.lr)
                    if args.algo == 'ppo' and args.use_linear_clip_decay:
                        agent.clip_param = args.clip_param * (
                            1 - j / float(num_updates))
                    with chrono.time('generate_rollouts', verbose=True):
                        # Rollout generation lives in a sibling helper that
                        # consumes this scope via **locals().
                        generate_rollouts(**locals())
                        with torch.no_grad():
                            next_value = actor_critic.get_value(
                                rollouts.obs[-1],
                                rollouts.recurrent_hidden_states[-1],
                                rollouts.masks[-1]).detach()
                    # ---
                    with chrono.time('compute_returns', verbose=True):
                        rollouts.compute_returns(next_value, args.use_gae,
                                                 args.gamma, args.tau)
                    with chrono.time('agent.update',
                                     verbose=True):  # 11.147009023304644
                        value_loss, action_loss, dist_entropy = agent.update(
                            rollouts)
                        #exp.log_batch_loss(action_loss)
                        #exp.log_metric('value_loss', value_loss)
                    with chrono.time('after_update', verbose=True):
                        rollouts.after_update()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
def train_ppo_fine_tune_joint(args):
    """Fine-tune a behaviorally-cloned pusher policy with PPO plus a joint
    behavioral-cloning loss against expert data.

    Steps:
      1. Train a PusherPolicyModel by behavioral cloning (20 epochs).
      2. Copy its fc1/fc2 weights into both the actor and critic trunks,
         and fc3 into the Gaussian mean head of the PPO policy.
      3. Run PPO (PPOJointLoss) with an auxiliary loss on ./expert.npz.

    Returns:
        (episode_reward_means, episode_reward_times): mean rewards logged at
        each log interval and the env-step counts at which they were logged.
    """
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True
    log_dir = os.path.expanduser(args.log_dir)
    eval_log_dir = log_dir + "_eval"
    utils.cleanup_log_dir(log_dir)
    utils.cleanup_log_dir(eval_log_dir)
    torch.set_num_threads(2)
    device = torch.device("cuda:0" if args.cuda else "cpu")
    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, device, True)
    actor_critic = Policy(  # 2-layer fully connected network
        envs.observation_space.shape,
        envs.action_space,
        base_kwargs={
            'recurrent': False,
            'hidden_size': 32
        })
    # behavioral cloning
    model = PusherPolicyModel()
    num_epochs = 20
    model.train(num_epochs=num_epochs)
    # Seed both the actor and critic trunks (and the action-mean head) with
    # the cloned network's weights so PPO starts from the BC solution.
    actor_critic.base.actor[0].weight.data.copy_(model.net.fc1.weight.data)
    actor_critic.base.actor[0].bias.data.copy_(model.net.fc1.bias.data)
    actor_critic.base.actor[2].weight.data.copy_(model.net.fc2.weight.data)
    actor_critic.base.actor[2].bias.data.copy_(model.net.fc2.bias.data)
    actor_critic.base.critic[0].weight.data.copy_(model.net.fc1.weight.data)
    actor_critic.base.critic[0].bias.data.copy_(model.net.fc1.bias.data)
    actor_critic.base.critic[2].weight.data.copy_(model.net.fc2.weight.data)
    actor_critic.base.critic[2].bias.data.copy_(model.net.fc2.bias.data)
    actor_critic.dist.fc_mean.weight.data.copy_(model.net.fc3.weight.data)
    actor_critic.dist.fc_mean.bias.data.copy_(model.net.fc3.bias.data)
    actor_critic.to(device)
    dataset = np.load('./expert.npz')
    obs_expert = torch.Tensor(dataset['obs'])
    actions_expert = torch.Tensor(dataset['action'])
    # NOTE(review): Tensor.to is not in-place; these two calls have no
    # effect unless their results are reassigned.
    obs_expert.to(device)
    actions_expert.to(device)
    joint_loss_coef = 0.03
    agent = PPOJointLoss(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         joint_loss_coef=joint_loss_coef,
                         obs_expert=obs_expert,
                         actions_expert=actions_expert,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape,
                              envs.action_space,
                              actor_critic.recurrent_hidden_state_size)
    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)
    episode_rewards = deque(maxlen=10)
    start = time.time()
    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes
    episode_reward_means = []
    episode_reward_times = []
    for j in range(num_updates):
        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            utils.update_linear_schedule(agent.optimizer, j, num_updates,
                                         args.lr)
        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step],
                    rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])
            # Obser reward and next obs
            obs, reward, done, infos = envs.step(action)
            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])
            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks,
                            bad_masks)
        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()
        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.gae_lambda,
                                 args.use_proper_time_limits)
        value_loss, action_loss, dist_entropy = agent.update(rollouts)
        rollouts.after_update()
        # save for every interval-th episode or for the last epoch
        if (j % args.save_interval == 0
                or j == num_updates - 1) and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass
            torch.save([
                actor_critic,
                getattr(utils.get_vec_normalize(envs), 'ob_rms', None)
            ], os.path.join(save_path, args.env_name + ".pt"))
        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            end = time.time()
            # NOTE(review): extra positional args past the placeholders
            # (dist_entropy, value_loss, action_loss) are ignored by format.
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist_entropy, value_loss,
                        action_loss))
            episode_reward_means.append(np.mean(episode_rewards))
            episode_reward_times.append(total_num_steps)
        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            ob_rms = utils.get_vec_normalize(envs).ob_rms
            evaluate(actor_critic, ob_rms, args.env_name, args.seed,
                     args.num_processes, eval_log_dir, device)
    print(episode_reward_means, episode_reward_times)
    return episode_reward_means, episode_reward_times
def train_agent(self, num_env_steps):
    """Run the on-policy training loop (A2C/PPO/ACKTR, optional GAIL).

    Args:
        num_env_steps: total number of environment steps to train for.

    Side effects: advances self.total_steps, logs scalars to self.writer
    every update, and calls self.report on the save interval.
    """
    obs = self.envs.reset()
    self.rollouts.obs[0].copy_(obs)
    self.rollouts.to(self.device)

    # Rolling windows over the last n finished episodes.
    n = 30
    episode_rewards = deque(maxlen=n)
    episode_values = deque(maxlen=n)      # value estimate at episode start
    episode_end_values = deque(maxlen=n)  # Q estimate at episode end
    episode_end_probs = deque(maxlen=n)   # action log-prob at episode end
    episode_lengths = deque(maxlen=n)
    compile_est = deque(maxlen=n)         # start values of r < -1 episodes
    first_steps = [True for i in range(self.num_processes)]

    start = time.time()
    num_updates = int(
        num_env_steps) // self.num_steps // self.num_processes
    for j in range(num_updates):
        if self.use_linear_lr_decay:
            # decrease learning rate linearly
            utils.update_linear_schedule(
                self.agent.optimizer, j, num_updates,
                self.agent.optimizer.lr if self.algo == "acktr" else self.lr)

        for step in range(self.num_steps):
            # Sample actions
            with torch.no_grad():
                value, Q, action, action_prob, action_log_prob, recurrent_hidden_states = \
                    self.actor_critic.act(
                        self.rollouts.obs[step],
                        self.rollouts.recurrent_hidden_states[step],
                        self.rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = self.envs.step(action)

            # Track start-of-episode value and end-of-episode Q / log-prob.
            # BUGFIX: the inner loop variable was previously named `step`,
            # shadowing the enclosing rollout-step counter.
            for i, is_first in enumerate(first_steps):
                if is_first:
                    episode_values.append(value[i].item())
                elif (done[i]):
                    episode_end_values.append(Q[i].item())
                    episode_end_probs.append(action_log_prob[i].item())
            first_steps = done

            for worker, info in enumerate(infos):
                if 'episode' in info.keys():
                    r = info['episode']['r']
                    l = info['episode']['l']
                    episode_rewards.append(r)
                    episode_lengths.append(l)
                    if (r < -1):
                        compile_est.append(value[worker].item())

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            self.rollouts.insert(obs, recurrent_hidden_states, action,
                                 action_prob, action_log_prob, value, Q,
                                 reward, masks, bad_masks)

        with torch.no_grad():
            next_value = self.actor_critic.get_value(
                self.rollouts.obs[-1],
                self.rollouts.recurrent_hidden_states[-1],
                self.rollouts.masks[-1]).detach()

        if self.gail:
            if j >= 10:
                self.envs.venv.eval()
            gail_epoch = self.gail_epoch
            if j < 10:
                gail_epoch = 100  # Warm up
            for _ in range(gail_epoch):
                self.gail_discr.update(
                    self.gail_train_loader, self.rollouts,
                    utils.get_vec_normalize(self.envs)._obfilt)
            # Replace env rewards with discriminator-predicted rewards.
            for step in range(self.num_steps):
                self.rollouts.rewards[
                    step] = self.gail_discr.predict_reward(
                        self.rollouts.obs[step],
                        self.rollouts.actions[step], self.gamma,
                        self.rollouts.masks[step])

        self.rollouts.compute_returns(next_value, self.use_gae, self.gamma,
                                      self.gae_lambda,
                                      self.use_proper_time_limits)
        value_loss, action_loss, dist_entropy = self.agent.update(
            self.rollouts)
        if (self.reconstruct):
            recon_loss = self.update_reconstruction(self.rollouts)
            self.writer.add_scalar('generator/Reconstruction Loss',
                                   recon_loss.item(), self.total_steps)
        self.rollouts.after_update()

        # Tensorboard Reporting
        # NOTE(review): on early updates these deques may still be empty,
        # in which case np.mean returns nan (with a RuntimeWarning).
        self.total_steps += self.num_processes * self.num_steps
        self.writer.add_scalar('value/Mean Reward',
                               np.mean(episode_rewards), self.total_steps)
        self.writer.add_scalar('value/Episode Mean Length',
                               np.mean(episode_lengths), self.total_steps)
        self.writer.add_scalar('policy/Action Loss', action_loss,
                               self.total_steps)
        self.writer.add_scalar('value/Value Loss', value_loss,
                               self.total_steps)
        self.writer.add_scalar('policy/Distribution Entropy', dist_entropy,
                               self.total_steps)
        self.writer.add_scalar('value/Win Probability',
                               np.mean(np.array(episode_rewards) > 0),
                               self.total_steps)
        self.writer.add_scalar('value/Starting Value',
                               np.mean(episode_values), self.total_steps)
        #self.writer.add_scalar('value/Ending Value', np.mean(episode_end_values), self.total_steps)
        self.writer.add_scalar('value/Log Probs',
                               np.mean(episode_end_probs), self.total_steps)
        if (len(compile_est) > 0):
            self.writer.add_scalar('value/Compile Estimate',
                                   np.mean(compile_est), self.total_steps)

        # save for every interval-th episode or for the last epoch
        total_num_steps = (j + 1) * self.num_processes * self.num_steps
        end = time.time()
        if (j % self.save_interval == 0
                or j == num_updates - 1) and self.save_dir != "":
            self.version += 1
            #self.save(self.version)
            self.report(self.version, total_num_steps,
                        int(total_num_steps / (end - start)),
                        episode_rewards)
        if j % self.log_interval == 0 and len(episode_rewards) > 1:
            # NOTE(review): the format string has fewer placeholders than
            # arguments; the trailing dist_entropy/value_loss/action_loss
            # are silently ignored by str.format.
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards),
                        np.min(episode_rewards), np.max(episode_rewards),
                        dist_entropy, value_loss, action_loss))
def main():
    """Entry point: train an A2C / PPO / ACKTR agent, optionally with GAIL.

    All configuration comes from ``get_args()``. Builds the vectorized
    environments and the policy, then runs the standard on-policy loop:
    collect ``num_steps`` transitions per process, (optionally) relabel
    rewards with a GAIL discriminator, compute returns, update the agent,
    and periodically save / log / evaluate.
    """
    args = get_args()

    # Seed CPU and every CUDA device for reproducibility.
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
        # Trade cuDNN autotuning speed for deterministic kernels.
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    # Wipe stale monitor files so this run's logs start clean.
    log_dir = os.path.expanduser(args.log_dir)
    eval_log_dir = log_dir + "_eval"
    utils.cleanup_log_dir(log_dir)
    utils.cleanup_log_dir(eval_log_dir)

    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, device, False)

    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          base_kwargs={
                              'recurrent': args.recurrent_policy,
                              'hidden_size': args.hidden_size
                          })
    actor_critic.to(device)

    # Select the learner; A2C and ACKTR share one class, distinguished by
    # the ``acktr`` flag.
    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)

    if args.gail:
        # GAIL requires a flat (1-D) observation space: the discriminator
        # consumes concatenated (obs, action) vectors.
        assert len(envs.observation_space.shape) == 1
        discr = gail.Discriminator(
            envs.observation_space.shape[0] + envs.action_space.shape[0],
            100, device)
        # Expert file name is derived from the env id, e.g. "HalfCheetah-v2"
        # -> "trajs_halfcheetah.pt".
        file_name = os.path.join(
            args.gail_experts_dir,
            "trajs_{}.pt".format(args.env_name.split('-')[0].lower()))
        gail_train_loader = torch.utils.data.DataLoader(
            gail.ExpertDataset(file_name,
                               num_trajectories=4,
                               subsample_frequency=20),
            batch_size=args.gail_batch_size,
            shuffle=True,
            drop_last=True)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape,
                              envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    # Sliding window of the most recent per-episode stats used for logging.
    episode_rewards = deque(maxlen=10)
    # NOTE(review): ``rewards`` is extended every step and cleared on each
    # log, but never read — appears to be dead accumulation; confirm before
    # removing.
    rewards = []
    start = time.time()
    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes
    for j in range(num_updates):

        if args.use_linear_lr_decay:
            # decrease learning rate linearly (ACKTR's lr lives on its
            # optimizer, hence the special case).
            utils.update_linear_schedule(
                agent.optimizer, j, num_updates,
                agent.optimizer.lr if args.algo == "acktr" else args.lr)

        # ---- Collect one rollout of num_steps transitions per process ----
        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step],
                    rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Obser reward and next obs
            obs, reward, done, infos = envs.step(action)
            rewards.extend(reward)

            for info in infos:
                if 'episode' in info.keys():
                    # print( info['episode'] )
                    # Logged quantity is reward-per-step, not total episode
                    # reward (r divided by episode length l).
                    episode_rewards.append(info['episode']['r'] /
                                           info['episode']['l'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            # bad_masks distinguishes true terminations from time-limit
            # truncations ('bad_transition') for proper-time-limit returns.
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks)

        # Bootstrap value for the state following the last stored step.
        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        if args.gail:
            # After a warm-up period, freeze the env's obs-normalization
            # statistics so the discriminator sees a stationary input space.
            if j >= 10:
                envs.venv.eval()

            gail_epoch = args.gail_epoch
            if j < 10:
                gail_epoch = 100  # Warm up
            for _ in range(gail_epoch):
                discr.update(gail_train_loader, rollouts,
                             utils.get_vec_normalize(envs)._obfilt)

            # Replace environment rewards with discriminator rewards.
            for step in range(args.num_steps):
                rollouts.rewards[step] = discr.predict_reward(
                    rollouts.obs[step], rollouts.actions[step], args.gamma,
                    rollouts.masks[step])

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.gae_lambda, args.use_proper_time_limits)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        # save for every interval-th episode or for the last epoch
        if (j % args.save_interval == 0
                or j == num_updates - 1) and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                # Directory already exists — nothing to do.
                pass

            # Save the policy together with the obs-normalization stats
            # needed to run it later.
            torch.save([
                actor_critic,
                getattr(utils.get_vec_normalize(envs), 'ob_rms', None)
            ], os.path.join(save_path, args.env_name + ".pt"))

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            end = time.time()
            # print( episode_rewards )
            # NOTE(review): the format string has 8 placeholders but 11
            # arguments are passed; str.format silently ignores the extras
            # (dist_entropy, value_loss, action_loss are never printed).
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.3f}/{:.3f}, min/max reward {:.3f}/{:.3f}\n"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist_entropy, value_loss,
                        action_loss))
            # print( list( map( lambda x : x["score"] , infos ) ) )
            rewards = []

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            ob_rms = utils.get_vec_normalize(envs).ob_rms
            evaluate(actor_critic, ob_rms, args.env_name, args.seed,
                     args.num_processes, eval_log_dir, device)
def main():
    """Benchmark entry point: time one A2C / PPO / ACKTR training run.

    Same on-policy training structure as the standard script, but each phase
    (rollout generation, return computation, agent update, post-update) is
    wrapped in ``chrono.time`` sections and metrics are reported through
    ``exp``.

    NOTE(review): ``args``, ``device``, ``num_updates`` and ``exp`` are not
    defined in this function — presumably module-level globals; verify
    against the rest of the file.
    """
    chrono = exp.chrono()

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, args.add_timestep, device,
                         False)

    actor_critic = Policy(envs.observation_space.shape, envs.action_space,
                          base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)

    # Select the learner; note PPO here is the local (chrono-instrumented)
    # class, unlike A2C/ACKTR which come from ``algo``.
    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = PPO(actor_critic,
                    args.clip_param,
                    args.ppo_epoch,
                    args.num_mini_batch,
                    args.value_loss_coef,
                    args.entropy_coef,
                    lr=args.lr,
                    eps=args.eps,
                    max_grad_norm=args.max_grad_norm,
                    chrono=chrono)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    # NOTE(review): nothing in this function appends to episode_rewards;
    # presumably generate_rollouts(**locals()) mutates it — confirm,
    # otherwise the eval branch below never fires.
    episode_rewards = deque(maxlen=10)
    start = time.time()

    # Outer loop: args.repeat timed epochs of args.number batches each.
    for j in range(args.repeat):
        with chrono.time('train') as t:
            for n in range(args.number):
                with chrono.time('one_batch'):
                    if args.use_linear_lr_decay:
                        # decrease learning rate linearly
                        if args.algo == "acktr":
                            # use optimizer's learning rate since it's hard-coded in kfac.py
                            update_linear_schedule(agent.optimizer, j,
                                                   num_updates,
                                                   agent.optimizer.lr)
                        else:
                            update_linear_schedule(agent.optimizer, j,
                                                   num_updates, args.lr)

                    if args.algo == 'ppo' and args.use_linear_clip_decay:
                        # Anneal PPO's clip range to zero over num_updates.
                        agent.clip_param = args.clip_param * (
                            1 - j / float(num_updates))

                    with chrono.time('generate_rollouts'):
                        # Fills ``rollouts`` in place; passing locals() hands
                        # it every name in this scope.
                        generate_rollouts(**locals())

                        with torch.no_grad():
                            # Bootstrap value for the state after the last
                            # stored step.
                            next_value = actor_critic.get_value(
                                rollouts.obs[-1],
                                rollouts.recurrent_hidden_states[-1],
                                rollouts.masks[-1]).detach()
                        # ---

                    with chrono.time('compute_returns'):
                        rollouts.compute_returns(next_value, args.use_gae,
                                                 args.gamma, args.tau)

                    with chrono.time('agent.update'):  # 11.147009023304644
                        value_loss, action_loss, dist_entropy = agent.update(
                            rollouts)

                        exp.log_batch_loss(action_loss)
                        exp.log_metric('value_loss', value_loss)

                    with chrono.time('after_update'):
                        rollouts.after_update()

                    total_num_steps = (j + 1) * args.num_processes * args.num_steps

                    if args.eval_interval is not None and len(
                            episode_rewards
                    ) > 1 and j % args.eval_interval == 0:
                        eval_model(**locals())
                # -- number
            # -- chrono
            exp.show_eta(j, t)
        # -- epoch
    exp.report()