def main():
    os.environ['OMP_NUM_THREADS'] = '1'

    envs = [make_env(args.env_name, args.seed, i, args.log_dir)
            for i in range(args.num_processes)]

    if args.num_processes > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)

    obs_shape = envs.observation_space.shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])
    obs_numel = reduce(operator.mul, obs_shape, 1)

    actor_critic = Policy(obs_numel, envs.action_space)

    # Maxime: log some info about the model and its size
    modelSize = 0
    for p in actor_critic.parameters():
        pSize = reduce(operator.mul, p.size(), 1)
        modelSize += pSize
    print(str(actor_critic))
    print('Total model size: %d' % modelSize)

    if envs.action_space.__class__.__name__ == "Discrete":
        action_shape = 1
    else:
        action_shape = envs.action_space.shape[0]

    if args.cuda:
        actor_critic.cuda()

    if args.algo == 'a2c':
        optimizer = optim.RMSprop(actor_critic.parameters(), args.lr,
                                  eps=args.eps, alpha=args.alpha)
    elif args.algo == 'ppo':
        optimizer = optim.Adam(actor_critic.parameters(), args.lr, eps=args.eps)
    elif args.algo == 'acktr':
        optimizer = KFACOptimizer(actor_critic)

    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape,
                              envs.action_space, actor_critic.state_size)
    current_obs = torch.zeros(args.num_processes, *obs_shape)

    def update_current_obs(obs):
        shape_dim0 = envs.observation_space.shape[0]
        obs = torch.from_numpy(obs).float()
        if args.num_stack > 1:
            current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
        current_obs[:, -shape_dim0:] = obs

    obs = envs.reset()
    update_current_obs(obs)

    rollouts.observations[0].copy_(current_obs)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])

    if args.cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()

    start = time.time()
    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions
            value, action, action_log_prob, states = actor_critic.act(
                Variable(rollouts.observations[step], volatile=True),
                Variable(rollouts.states[step], volatile=True),
                Variable(rollouts.masks[step], volatile=True))
            cpu_actions = action.data.squeeze(1).cpu().numpy()

            # Observe reward and next obs
            obs, reward, done, info = envs.step(cpu_actions)
            reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float()
            episode_rewards += reward

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            if args.cuda:
                masks = masks.cuda()

            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            elif current_obs.dim() == 3:
                current_obs *= masks.unsqueeze(2)
            else:
                current_obs *= masks

            update_current_obs(obs)
            rollouts.insert(step, current_obs, states.data, action.data,
                            action_log_prob.data, value.data, reward, masks)

        next_value = actor_critic(
            Variable(rollouts.observations[-1], volatile=True),
            Variable(rollouts.states[-1], volatile=True),
            Variable(rollouts.masks[-1], volatile=True))[0].data

        rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau)

        if args.algo in ['a2c', 'acktr']:
            values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions(
                Variable(rollouts.observations[:-1].view(-1, *obs_shape)),
                Variable(rollouts.states[:-1].view(-1, actor_critic.state_size)),
                Variable(rollouts.masks[:-1].view(-1, 1)),
                Variable(rollouts.actions.view(-1, action_shape)))

            values = values.view(args.num_steps, args.num_processes, 1)
            action_log_probs = action_log_probs.view(args.num_steps, args.num_processes, 1)

            advantages = Variable(rollouts.returns[:-1]) - values
            value_loss = advantages.pow(2).mean()
            action_loss = -(Variable(advantages.data) * action_log_probs).mean()

            if args.algo == 'acktr' and optimizer.steps % optimizer.Ts == 0:
                # Sampled fisher, see Martens 2014
                actor_critic.zero_grad()
                pg_fisher_loss = -action_log_probs.mean()

                value_noise = Variable(torch.randn(values.size()))
                if args.cuda:
                    value_noise = value_noise.cuda()

                sample_values = values + value_noise
                vf_fisher_loss = -(values - Variable(sample_values.data)).pow(2).mean()

                fisher_loss = pg_fisher_loss + vf_fisher_loss
                optimizer.acc_stats = True
                fisher_loss.backward(retain_graph=True)
                optimizer.acc_stats = False

            optimizer.zero_grad()
            (value_loss * args.value_loss_coef + action_loss -
             dist_entropy * args.entropy_coef).backward()

            if args.algo == 'a2c':
                nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm)

            optimizer.step()
        elif args.algo == 'ppo':
            advantages = rollouts.returns[:-1] - rollouts.value_preds[:-1]
            advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-5)

            for e in range(args.ppo_epoch):
                if args.recurrent_policy:
                    data_generator = rollouts.recurrent_generator(advantages, args.num_mini_batch)
                else:
                    data_generator = rollouts.feed_forward_generator(advantages, args.num_mini_batch)

                for sample in data_generator:
                    observations_batch, states_batch, actions_batch, \
                        return_batch, masks_batch, old_action_log_probs_batch, \
                        adv_targ = sample

                    # Reshape to do in a single forward pass for all steps
                    values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions(
                        Variable(observations_batch),
                        Variable(states_batch),
                        Variable(masks_batch),
                        Variable(actions_batch))

                    adv_targ = Variable(adv_targ)
                    ratio = torch.exp(action_log_probs - Variable(old_action_log_probs_batch))
                    surr1 = ratio * adv_targ
                    surr2 = torch.clamp(ratio, 1.0 - args.clip_param, 1.0 + args.clip_param) * adv_targ
                    action_loss = -torch.min(surr1, surr2).mean()  # PPO's pessimistic surrogate (L^CLIP)

                    value_loss = (Variable(return_batch) - values).pow(2).mean()

                    optimizer.zero_grad()
                    (value_loss + action_loss - dist_entropy * args.entropy_coef).backward()
                    nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm)
                    optimizer.step()

        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [save_model,
                          hasattr(envs, 'ob_rms') and envs.ob_rms or None]

            torch.save(save_model, os.path.join(save_path, args.env_name + ".pt"))

        if j % args.log_interval == 0:
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            print("Updates {}, num timesteps {}, FPS {}, mean/median reward {:.2f}/{:.2f}, "
                  "min/max reward {:.2f}/{:.2f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}".
                  format(j, total_num_steps,
                         int(total_num_steps / (end - start)),
                         final_rewards.mean(),
                         final_rewards.median(),
                         final_rewards.min(),
                         final_rewards.max(),
                         dist_entropy.data[0],
                         value_loss.data[0],
                         action_loss.data[0]))

        if args.vis and j % args.vis_interval == 0:
            win = visdom_plot(total_num_steps, final_rewards.mean())
def main():
    device = 'cpu'
    acc_steps = []
    acc_scores = []

    torch.set_num_threads(1)

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, args.add_timestep, device, False)

    actor_critic = Policy(envs.observation_space.shape, envs.action_space)
    actor_critic.to(device)

    agent = PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch,
                args.value_loss_coef, args.entropy_coef, lr=args.lr,
                eps=args.eps, max_grad_norm=args.max_grad_norm)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = collections.deque(maxlen=10)

    for j in range(num_updates):
        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            update_linear_schedule(agent.optimizer, j, num_updates, args.lr)
            agent.clip_param = args.clip_param * (1 - j / float(num_updates))

        # Prepare demos
        demo_actions = np.zeros((1, args.num_processes, envs.action_space.shape[0]))
        demo_states = np.zeros((1, args.num_processes, envs.observation_space.shape[0]))

        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob = actor_critic.act(
                    rollouts.obs[step], rollouts.masks[step])

            # record obs and action for the demos
            demo_actions = np.concatenate(
                [demo_actions, action.reshape(1, args.num_processes, -1)], 0)
            demo_states = np.concatenate(
                [demo_states, rollouts.obs[step].reshape(1, args.num_processes, -1)], 0)

            # do one step
            obs, reward, done, infos = envs.step(action)

            if step > 1 and step % 1000 == 0:
                done = [True for _ in range(args.num_processes)]

            for info in infos:
                # if 'episode' in info.keys():
                #     episode_rewards.append(info['episode']['r'])
                r = 0
                for key, val in info.items():
                    if 'reward' in key:
                        r += val
                episode_rewards.append(r)

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])
            rollouts.insert(obs, action, action_log_prob, value, reward, masks)

        # Save demos
        action_file_name = args.demos_dir + '/actions_step_' + str(j) + '.npy'
        state_file_name = args.demos_dir + '/states_step_' + str(j) + '.npy'
        policy_file_name = args.demos_dir + '/policy_step_' + str(j) + '.pth'
        np.save(action_file_name, demo_actions[1:])
        np.save(state_file_name, demo_states[1:])
        torch.save(actor_critic.state_dict(), policy_file_name)

        with torch.no_grad():
            next_value = actor_critic.get_value(rollouts.obs[-1],
                                                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.gamma, args.tau)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        # save for every interval-th episode or for the last epoch
        if (j % args.save_interval == 0 or j == num_updates - 1) and args.save_dir:
            save_path = os.path.join(args.save_dir, 'ppo')
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            save_model = [save_model,
                          getattr(get_vec_normalize(envs), 'ob_rms', None)]

            torch.save(save_model, os.path.join(save_path, args.env_name + '.pt'))

        total_num_steps = (j + 1) * args.num_processes * args.num_steps

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            print('Updates', j,
                  'num timesteps', len(episode_rewards),
                  '\n Last training episodes: mean/median reward',
                  '{:.1f}'.format(np.mean(episode_rewards)),
                  '/{:.1f}'.format(np.median(episode_rewards)),
                  'min/max reward',
                  '{:.1f}'.format(np.min(episode_rewards)),
                  '/{:.1f}'.format(np.max(episode_rewards)),
                  'dist entropy', dist_entropy,
                  'value loss', value_loss,
                  'action loss', action_loss)

        if len(episode_rewards) > 1:
            acc_steps.append(total_num_steps)
            acc_scores.append(np.mean(episode_rewards))

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            eval_envs = make_vec_envs(args.env_name, args.seed + args.num_processes,
                                      args.num_processes, args.gamma, eval_log_dir,
                                      args.add_timestep, device, True)

            vec_norm = get_vec_normalize(eval_envs)
            if vec_norm is not None:
                vec_norm.eval()
                vec_norm.ob_rms = get_vec_normalize(envs).ob_rms

            eval_episode_rewards = []

            obs = eval_envs.reset()
            eval_masks = torch.zeros(args.num_processes, 1, device=device)

            while len(eval_episode_rewards) < 10:
                with torch.no_grad():
                    _, action, _ = actor_critic.act(obs, eval_masks, deterministic=True)

                # Observe reward and next obs
                obs, reward, done, infos = eval_envs.step(action)

                eval_masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                                for done_ in done])
                for info in infos:
                    if 'episode' in info.keys():
                        eval_episode_rewards.append(info['episode']['r'])

            eval_envs.close()

            print('Evaluation using', len(eval_episode_rewards),
                  'episodes: mean reward',
                  '{:.5f}\n'.format(np.mean(eval_episode_rewards)))

    scores_file_name = args.scores_dir + '/learner_scores_' + args.expe + '.npy'
    steps_file_name = args.scores_dir + '/learner_steps_' + args.expe + '.npy'
    np.save(scores_file_name, np.array(acc_scores))
    np.save(steps_file_name, np.array(acc_steps))
def main():
    is_limit_action = True
    # is_limit_action = False
    args_cuda = True
    # args_cuda = False
    torch.manual_seed(args_seed)
    torch.cuda.manual_seed_all(args_seed)
    device = torch.device("cuda:0" if args_cuda else "cpu")

    train_log = Log(log_name + '_train_log')
    evl_log = Log(log_name + '_evaluation_log')

    torch.set_num_threads(1)

    envs = make_vec_envs(args_env_name, args_seed, args_num_processes,
                         device, gamma=args_gamma)
    if is_limit_action:
        envs.action_space.n = 3
    print('Number of Actions:', envs.action_space.n)

    actor_critic = Policy(envs.observation_space.shape, envs.action_space)
    actor_critic.to(device)

    agent = PPO(actor_critic, args_clip_param, args_ppo_epoch, args_num_mini_batch,
                args_value_loss_coef, args_entropy_coef, lr=args_lr, eps=args_eps,
                max_grad_norm=args_max_grad_norm,
                use_clipped_value_loss=args_use_clipped_value_loss)

    rollouts = RolloutStorage(args_num_steps, args_num_processes,
                              envs.observation_space.shape, envs.action_space)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)
    # print(obs)
    # ss('i am over it')

    num_updates = int(args_num_env_steps) // args_num_steps // args_num_processes

    episode_rewards = deque(maxlen=10)
    start = time.time()
    sum_re = torch.zeros(args_num_processes, 1)

    for j in range(num_updates):
        for step in range(args_num_steps):
            with torch.no_grad():
                value, action, action_log_prob = actor_critic.act(rollouts.obs[step])
            # print(action)
            # action = action + 1
            # ss('hoiohasdfhioas')

            if is_limit_action:
                obs, reward, done, infos = envs.step(action + 1)
            else:
                obs, reward, done, infos = envs.step(action)

            sum_re += reward
            if any(done):
                for i in range(len(done)):
                    if done[i]:
                        episode_rewards.append(sum_re[i].item())
                        # print(done)
                        # print(sum_re[i])
                        sum_re[i] *= 0

            masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])
            bad_masks = torch.FloatTensor([[0.0] if 'bad_transition' in info.keys() else [1.0]
                                           for info in infos])
            rollouts.insert(obs, action, action_log_prob, value, reward, masks, bad_masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(rollouts.obs[-1])

        rollouts.compute_returns(next_value, args_gamma, args_use_gae, args_gae_lambda)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        if j % args_log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args_num_processes * args_num_steps
            end = time.time()
            logstring = "E {}, N_steps {}, FPS {} mean/median" \
                        " {:.1f}/{:.1f}, min/max {:.1f}/{:.1f}" \
                        " Entropy {:.5f},V {:.5f},Action {:.5f}".format(
                            j, total_num_steps,
                            int(total_num_steps / (end - start)),
                            np.mean(episode_rewards), np.median(episode_rewards),
                            np.min(episode_rewards), np.max(episode_rewards),
                            dist_entropy, value_loss, action_loss)
            # print(logstring)
            train_log.log(logstring)

        # if True:
        if (args_eval_interval is not None and len(episode_rewards) > 1
                and j % args_eval_interval == 0):
            total_num_steps = (j + 1) * args_num_processes * args_num_steps
            ob_rms = get_vec_normalize(envs).ob_rms
            ev_result = evaluate(actor_critic, ob_rms, args_env_name, args_seed,
                                 args_num_processes, device,
                                 is_limit_action=is_limit_action)
            ev_log_string = 'steps:' + str(total_num_steps) + '. ' + ev_result
            evl_log.log(ev_log_string)
def make_env():
    return gym.make(ENV_NAME)

# Parallelize environments
envs = [make_env for i in range(N_ENVS)]
envs = SubprocVecEnv(envs)
envs = VecNormalize(envs, gamma=GAMMA)

obs_shape = envs.observation_space.shape
# Print observation space so we know what we are dealing with.
print('Obs shape', obs_shape)

policy = Policy(obs_shape, envs.action_space)
optimizer = optim.Adam(policy.parameters(), lr=LR, eps=EPS)

# Initialize the tensor we will use every time for the observation. See the note
# in update_current_obs for more.
current_obs = torch.zeros(N_ENVS, *obs_shape)
obs = envs.reset()

def update_current_obs(obs):
    # we want to use the same tensor every time so just copy it over.
    obs = torch.from_numpy(obs).float()
    current_obs[:, :] = obs
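# Note (illustrative, not part of the original script): the other training loops in this
# collection stack the last few frames along the channel axis instead of keeping a single
# frame. A minimal sketch of that variant, assuming a hypothetical NUM_STACK constant and a
# current_obs buffer allocated with NUM_STACK * obs_shape[0] channels:
NUM_STACK = 4  # hypothetical stack depth

def update_current_obs_stacked(obs):
    shape_dim0 = envs.observation_space.shape[0]  # channels of a single frame
    obs = torch.from_numpy(obs).float()
    if NUM_STACK > 1:
        # shift the old frames left, dropping the oldest one
        current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
    # write the newest frame into the last slot
    current_obs[:, -shape_dim0:] = obs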
def main():
    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, args.add_timestep, device, False)

    actor_critic = Policy(envs.observation_space.shape, envs.action_space,
                          base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef,
                               args.entropy_coef, lr=args.lr,
                               eps=args.eps, alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch,
                         args.num_mini_batch, args.value_loss_coef,
                         args.entropy_coef, lr=args.lr, eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef,
                               args.entropy_coef, acktr=True)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)

    start = time.time()
    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step],
                    rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(rollouts.obs[-1],
                                                rollouts.recurrent_hidden_states[-1],
                                                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [save_model,
                          getattr(get_vec_normalize(envs), 'ob_rms', None)]

            torch.save(save_model, os.path.join(save_path, args.env_name + ".pt"))

        total_num_steps = (j + 1) * args.num_processes * args.num_steps

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            end = time.time()
            print("Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: "
                  "mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n".
                  format(j, total_num_steps,
                         int(total_num_steps / (end - start)),
                         len(episode_rewards),
                         np.mean(episode_rewards),
                         np.median(episode_rewards),
                         np.min(episode_rewards),
                         np.max(episode_rewards),
                         dist_entropy, value_loss, action_loss))

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            eval_envs = make_vec_envs(args.env_name, args.seed + args.num_processes,
                                      args.num_processes, args.gamma, eval_log_dir,
                                      args.add_timestep, device, True)

            vec_norm = get_vec_normalize(eval_envs)
            if vec_norm is not None:
                vec_norm.eval()
                vec_norm.ob_rms = get_vec_normalize(envs).ob_rms

            eval_episode_rewards = []

            obs = eval_envs.reset()
            eval_recurrent_hidden_states = torch.zeros(
                args.num_processes, actor_critic.recurrent_hidden_state_size, device=device)
            eval_masks = torch.zeros(args.num_processes, 1, device=device)

            while len(eval_episode_rewards) < 10:
                with torch.no_grad():
                    _, action, _, eval_recurrent_hidden_states = actor_critic.act(
                        obs, eval_recurrent_hidden_states, eval_masks, deterministic=True)

                # Observe reward and next obs
                obs, reward, done, infos = eval_envs.step(action)

                eval_masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                                for done_ in done])
                for info in infos:
                    if 'episode' in info.keys():
                        eval_episode_rewards.append(info['episode']['r'])

            eval_envs.close()

            print(" Evaluation using {} episodes: mean reward {:.5f}\n".
                  format(len(eval_episode_rewards), np.mean(eval_episode_rewards)))

        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name,
                                  args.algo, args.num_frames)
            except IOError:
                pass
env = make_vec_envs(args.env_name, args.seed + 1000, 1, None, None,
                    args.add_timestep, device='cpu',
                    allow_early_resets=False, args=args)

# Get a render function
# render_func = get_render_func(env)

# We need to use the same statistics for normalization as used in training
actor_critic = Policy(env.observation_space.shape, env.action_space, args=args)
torch.nn.Module.dump_patches = True
actor_critic, ob_rms = \
    torch.load(os.path.join(args.load_dir, args.env_name + ".pt"))

if args.active_column is not None:
    actor_critic.base.active_column = args.active_column
    actor_critic.base.global_drop = True

vec_norm = get_vec_normalize(env)
if vec_norm is not None:
    vec_norm.eval()
    vec_norm.ob_rms = ob_rms

recurrent_hidden_states = torch.zeros(1, actor_critic.recurrent_hidden_state_size)
masks = torch.zeros(1, 1)
class PPO_agent:
    def __init__(self, params):
        self.params = params
        self.net = Policy(4, 2)  # state_size, action_size
        # self.net = Policy(params.state_size, params.action_size)
        if self.params.cuda:
            print("network is moved to cuda")
            self.net.cuda()
        self.optimizer = Adam(self.net.parameters(), lr=params.lr)

    # This method selects an action for the given input state.
    # It returns the action, log_prob and value for that state
    # according to the current policy.
    def select_action(self, state):
        state = torch.FloatTensor(state).to(device)
        action, log_prob, value = self.net.select_action(state)
        return action, log_prob, value

    # Calculating the surrogate function for PPO:
    # - normalizing the advantage
    # - calculating log_prob(s,a)
    # - calculating the ratio log_prob/old_log_prob
    # - clipping the ratio
    # - taking the min of the clipped/unclipped terms (policy_loss)
    # - calculating the remaining losses: value_loss/entropy_loss
    # - backpropagating the combined loss
    # - taking one optimizer step
    def evaluate_data(self, experience):
        # unpacking given experience data
        states, actions, rewards, dones, old_log_probs, values, gae_returns = experience

        # normalizing the advantage against the baseline (value)
        advantages = gae_returns - values
        advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-5)

        # changing all data into tensors of size (-1, 1)
        states = torch.FloatTensor(states).view(-1, 1).to(device)
        actions = torch.FloatTensor(actions).view(-1, 1).to(device)
        # old_log_probs = torch.FloatTensor(old_log_probs).view(-1, 1).to(device)
        advantages = torch.FloatTensor(advantages).view(-1, 1).to(device)
        gae_returns = torch.FloatTensor(gae_returns).view(-1, 1).to(device)
        values = torch.FloatTensor(values).view(-1, 1).to(device)

        # calculating new log_prob for the given (s, a)
        new_log_probs, new_values, entropys = self.net.evaluate_inputs(states, actions)

        # calculating ratio
        ratio = torch.exp(new_log_probs - old_log_probs)
        ratio_without_clipping = ratio * advantages
        # clipping ratio
        clipped_ratio = torch.clamp(ratio,
                                    1.0 - self.params.clipping_value,
                                    1.0 + self.params.clipping_value) * advantages
        # taking the min of both terms; negated so that minimizing the loss
        # maximizes the clipped surrogate objective (policy_loss)
        policy_loss = -torch.min(ratio_without_clipping, clipped_ratio).mean()

        # value loss: (returns - values)^2
        value_loss = (gae_returns - new_values).pow(2).mean()
        # entropy bonus, negated and scaled so that minimizing the loss encourages exploration
        entropy_loss = -entropys.mean() * self.params.entropy_beta

        # backpropagation
        # zeroing gradient
        self.optimizer.zero_grad()
        # backpropagation
        (policy_loss + value_loss + entropy_loss).backward()
        # clipping gradient ( TO DO )
        # optimizer step to apply gradient
        self.optimizer.step()
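# The "clipping gradient (TO DO)" step above could be filled in with PyTorch's built-in
# global-norm clipping. A minimal, self-contained sketch (max_grad_norm would be a
# hypothetical hyper-parameter, not defined in PPO_agent); inside evaluate_data the call
# would sit between backward() and self.optimizer.step(), e.g.
# nn.utils.clip_grad_norm_(self.net.parameters(), self.params.max_grad_norm).
import torch
import torch.nn as nn

net = nn.Linear(4, 2)
loss = net(torch.randn(8, 4)).pow(2).mean()
loss.backward()
# rescales all gradients in place so their global L2 norm does not exceed max_norm,
# and returns the norm measured before clipping
total_norm = nn.utils.clip_grad_norm_(net.parameters(), max_norm=0.5)
print('grad norm before clipping:', float(total_norm))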
def main():
    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        vizz = Visdom(port=args.port)
        win = None
        winloss = None

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, args.add_timestep, device, False)

    actor_critic = Policy(envs.observation_space.shape, envs.action_space,
                          base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef,
                               args.entropy_coef, lr=args.lr,
                               eps=args.eps, alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch,
                         args.num_mini_batch, args.value_loss_coef,
                         args.entropy_coef, lr=args.lr, eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef,
                               args.entropy_coef, acktr=True)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    # Initialize bw model
    if args.bw:
        bw_model = bw_module(actor_critic, args, agent.optimizer,
                             envs.action_space, envs.observation_space)

    vis_timesteps = []
    vis_loss = []

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)

    start = time.time()
    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step],
                    rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks)

            # Add stuff to the buffer
            if args.bw:
                bw_model.step(rollouts.obs[step].detach().cpu().numpy(),
                              action.detach().cpu().numpy(),
                              reward.detach().cpu().numpy(),
                              done, obs.detach().cpu().numpy())

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        # Do BW steps
        if args.bw and (j % args.n_a2c == 0):
            if not args.consistency:
                l_bw, l_imi = 0.0, 0.0
                for _ in range(args.n_bw):
                    l_bw += bw_model.train_bw_model(j)
                l_bw /= args.n_bw
                for _ in range(args.n_imi):
                    l_imi += bw_model.train_imitation(j)
                l_imi /= args.n_imi
            else:
                l_bw, l_fw = 0.0, 0.0
                for _ in range(args.n_bw):
                    l_bw_, l_fw_ = bw_model.train_bw_model(j)
                    l_bw += l_bw_
                    l_fw += l_fw_
                l_bw /= args.n_bw
                l_fw /= args.n_bw

                l_imi, l_cons = 0.0, 0.0
                for _ in range(args.n_imi):
                    l_imi_, l_cons_ = bw_model.train_imitation(j)
                    l_imi += l_imi_
                    l_cons += l_cons_
                l_imi /= args.n_imi
                l_cons /= args.n_imi

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [save_model,
                          getattr(get_vec_normalize(envs), 'ob_rms', None)]

            torch.save(save_model, os.path.join(save_path, args.env_name + ".pt"))

        total_num_steps = (j + 1) * args.num_processes * args.num_steps

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            end = time.time()
            print("Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: "
                  "mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n".
                  format(j, total_num_steps,
                         int(total_num_steps / (end - start)),
                         len(episode_rewards),
                         np.mean(episode_rewards),
                         np.median(episode_rewards),
                         np.min(episode_rewards),
                         np.max(episode_rewards),
                         dist_entropy, value_loss, action_loss))

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            eval_envs = make_vec_envs(args.env_name, args.seed + args.num_processes,
                                      args.num_processes, args.gamma, eval_log_dir,
                                      args.add_timestep, device, True)

            vec_norm = get_vec_normalize(eval_envs)
            if vec_norm is not None:
                vec_norm.eval()
                vec_norm.ob_rms = get_vec_normalize(envs).ob_rms

            eval_episode_rewards = []

            obs = eval_envs.reset()
            eval_recurrent_hidden_states = torch.zeros(
                args.num_processes, actor_critic.recurrent_hidden_state_size, device=device)
            eval_masks = torch.zeros(args.num_processes, 1, device=device)

            while len(eval_episode_rewards) < 10:
                with torch.no_grad():
                    _, action, _, eval_recurrent_hidden_states = actor_critic.act(
                        obs, eval_recurrent_hidden_states, eval_masks, deterministic=True)

                # Observe reward and next obs
                obs, reward, done, infos = eval_envs.step(action)

                eval_masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                                for done_ in done])
                for info in infos:
                    if 'episode' in info.keys():
                        eval_episode_rewards.append(info['episode']['r'])

            eval_envs.close()

            print(" Evaluation using {} episodes: mean reward {:.5f}\n".format(
                len(eval_episode_rewards), np.mean(eval_episode_rewards)))

        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                env_name = args.env_name
                if args.bw:
                    env_name += 'BW'
                win = visdom_plot(viz, win, args.log_dir, env_name,
                                  args.algo, args.num_frames)
            except IOError:
                pass

        # Save to Visdom plots
        if args.vis and (j % args.vis_interval == 0):
            if args.bw and args.consistency:
                vis_loss.append([value_loss, action_loss, l_bw, l_imi, l_fw, l_cons])
                legend = ['Value loss', 'Action loss', 'BW Loss', 'IMI loss',
                          'FW Loss', 'CONST loss']
                title = args.env_name + '-' + 'bw' + '-' + 'consistency' + args.title
            elif args.bw:
                vis_loss.append([value_loss, action_loss, l_bw, l_imi])
                legend = ['Value loss', 'Action loss', 'BW Loss', 'IMI loss']
                title = args.env_name + '-' + 'bw' + args.title
            else:
                vis_loss.append([value_loss, action_loss])
                legend = ['Value loss', 'Action loss']
                title = args.env_name + '-' + 'vanilla'

            vis_timesteps.append((j + 1) * (args.num_processes * args.num_steps))
            # vis_rewards.append(final_rewards.mean())
            # vis_rewards.append(np.mean(reward_queue))
            # if win is None:
            #     win = vizz.line(Y=np.array(vis_rewards), X=np.array(vis_timesteps),
            #                     opts=dict(title=title, xlabel='Timesteps', ylabel='Avg Rewards'))
            # vizz.line(Y=np.array(vis_rewards), X=np.array(vis_timesteps), win=win, update='replace',
            #           opts=dict(title=title, xlabel='Timesteps', ylabel='Avg Rewards'))

            if winloss is None:
                winloss = vizz.line(Y=np.array(vis_loss), X=np.array(vis_timesteps),
                                    opts=dict(title=title, xlabel='Timesteps',
                                              ylabel='Losses', legend=legend))
            vizz.line(Y=np.array(vis_loss), X=np.array(vis_timesteps),
                      win=winloss, update='replace',
                      opts=dict(title=title, xlabel='Timesteps',
                                ylabel='Losses', legend=legend))
def main():
    torch.set_num_threads(1)

    envs = make_vec_envs(args_env_name, args_seed, args_num_processes)

    actor_critic = Policy(envs.observation_space.shape, envs.action_space)

    agent = PPO(actor_critic, args_clip_param, args_ppo_epoch, args_num_mini_batch,
                args_value_loss_coef, args_entropy_coef, lr=args_lr, eps=args_eps,
                max_grad_norm=args_max_grad_norm)

    rollouts = RolloutStorage(args_num_steps, args_num_processes,
                              envs.observation_space.shape, envs.action_space)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)

    num_updates = int(args_num_env_steps) // args_num_steps // args_num_processes

    episode_rewards = deque(maxlen=10)
    start = time.time()
    sum_re = torch.zeros(args_num_processes, 1)

    for j in range(num_updates):
        for step in range(args_num_steps):
            with torch.no_grad():
                value, action, action_log_prob = actor_critic.act(rollouts.obs[step])

            obs, reward, done, infos = envs.step(action)

            sum_re += reward
            if any(done):
                for i in range(len(done)):
                    if done[i]:
                        episode_rewards.append(sum_re[i].item())
                        sum_re[i] *= 0

            masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])
            bad_masks = torch.FloatTensor([[0.0] if 'bad_transition' in info.keys() else [1.0]
                                           for info in infos])
            rollouts.insert(obs, action, action_log_prob, value, reward, masks, bad_masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(rollouts.obs[-1])

        rollouts.compute_returns(next_value, args_gamma)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        if j % args_log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args_num_processes * args_num_steps
            end = time.time()
            print("E {}, N_steps {}, FPS {}"
                  " mean/median {:.1f}/{:.1f}, min/max {:.1f}/{:.1f}"
                  " Ent {:.4f},V {:.4f},A {:.4f}"
                  .format(j, total_num_steps,
                          int(total_num_steps / (end - start)),
                          np.mean(episode_rewards), np.median(episode_rewards),
                          np.min(episode_rewards), np.max(episode_rewards),
                          dist_entropy, value_loss, action_loss))
#############################################
test_pf = True
action = np.full((parallel_env), 1)
total_gym_time = 0
total_vitis_time = 0
total_openCL_time = 0
test_data = RL_data()
list_rewards = []
list_episodes = []
rew_list = []
obs_list = []
iteration = 0

policy = Policy()
policy.load_state_dict(torch.load('params_6_15.ckpt'))
policy.eval()
opt = torch.optim.Adam(policy.parameters(), lr=1e-3)

w1nparray = policy.layers[0].weight.detach().numpy()[:, :]
w1nparray_part = w1nparray.reshape((4, 128, 12000))
w2nparray = policy.layers[2].weight.detach().numpy()

obs = env.reset()
# policy = policy.to(device)
test_data.doneVec = np.full((parallel_env), False)
rew = np.full(shape=(parallel_env), fill_value=0, dtype=float)
def main():
    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    run_id = "alpha{}".format(args.gcn_alpha)
    if args.use_logger:
        from utils import Logger
        folder = "{}/{}".format(args.folder, run_id)
        logger = Logger(algo_name=args.algo, environment_name=args.env_name,
                        folder=folder, seed=args.seed)
        logger.save_args(args)
        print("---------------------------------------")
        print('Saving to', logger.save_folder)
        print("---------------------------------------")
    else:
        print("---------------------------------------")
        print('NOTE : NOT SAVING RESULTS')
        print("---------------------------------------")
    all_rewards = []

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, args.add_timestep, device, False)

    actor_critic = Policy(envs.observation_space.shape, envs.action_space,
                          args.env_name,
                          base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef,
                               args.entropy_coef, lr=args.lr,
                               eps=args.eps, alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch,
                         args.num_mini_batch, args.value_loss_coef,
                         args.entropy_coef, lr=args.lr, eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef,
                               args.entropy_coef, acktr=True)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size,
                              actor_critic.base.output_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    ############################
    # GCN Model and optimizer
    from pygcn.train import update_graph
    from pygcn.models import GCN
    gcn_model = GCN(nfeat=actor_critic.base.output_size, nhid=args.gcn_hidden)
    gcn_model.to(device)
    gcn_optimizer = optim.Adam(gcn_model.parameters(), lr=args.gcn_lr,
                               weight_decay=args.gcn_weight_decay)
    gcn_loss = nn.NLLLoss()
    gcn_states = [[] for _ in range(args.num_processes)]
    Gs = [nx.Graph() for _ in range(args.num_processes)]
    node_ptrs = [0 for _ in range(args.num_processes)]
    rew_states = [[] for _ in range(args.num_processes)]
    ############################

    episode_rewards = deque(maxlen=100)
    avg_fwdloss = deque(maxlen=100)
    rew_rms = RunningMeanStd(shape=())
    delay_rew = torch.zeros([args.num_processes, 1])
    delay_step = torch.zeros([args.num_processes])

    start = time.time()
    for j in range(num_updates):
        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            update_linear_schedule(
                agent.optimizer, j, num_updates,
                agent.optimizer.lr if args.algo == "acktr" else args.lr)

        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, \
                    recurrent_hidden_states, hidden_states = actor_critic.act(
                        rollouts.obs[step],
                        rollouts.recurrent_hidden_states[step],
                        rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)
            delay_rew += reward
            delay_step += 1

            for idx, (info, hid, eps_done) in enumerate(zip(infos, hidden_states, done)):
                if eps_done or delay_step[idx] == args.reward_freq:
                    reward[idx] = delay_rew[idx]
                    delay_rew[idx] = delay_step[idx] = 0
                else:
                    reward[idx] = 0

                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

                if args.gcn_alpha < 1.0:
                    gcn_states[idx].append(hid)
                    node_ptrs[idx] += 1
                    if not eps_done:
                        Gs[idx].add_edge(node_ptrs[idx] - 1, node_ptrs[idx])
                    if reward[idx] != 0. or eps_done:
                        rew_states[idx].append([node_ptrs[idx] - 1, reward[idx]])
                    if eps_done:
                        adj = nx.adjacency_matrix(Gs[idx]) if len(Gs[idx].nodes) \
                            else sp.csr_matrix(np.eye(1, dtype='int64'))
                        update_graph(gcn_model, gcn_optimizer,
                                     torch.stack(gcn_states[idx]), adj,
                                     rew_states[idx], gcn_loss, args, envs)
                        gcn_states[idx] = []
                        Gs[idx] = nx.Graph()
                        node_ptrs[idx] = 0
                        rew_states[idx] = []

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, hidden_states)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau, gcn_model, args.gcn_alpha)
        agent.update(rollouts)
        rollouts.after_update()

        ####################### Saving and book-keeping #######################
        if (j % int(num_updates / 5.) == 0 or j == num_updates - 1) and args.save_dir != "":
            print('Saving model')
            print()

            save_dir = "{}/{}/{}".format(args.save_dir, args.folder, run_id)
            save_path = os.path.join(save_dir, args.algo,
                                     'seed' + str(args.seed)) + '_iter' + str(j)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            save_gcn = gcn_model
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()
                save_gcn = copy.deepcopy(gcn_model).cpu()

            save_model = [save_gcn, save_model,
                          hasattr(envs.venv, 'ob_rms') and envs.venv.ob_rms or None]

            torch.save(save_model, os.path.join(save_path, args.env_name + "ac.pt"))

        total_num_steps = (j + 1) * args.num_processes * args.num_steps

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            end = time.time()
            print("Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: "
                  "mean/median reward {:.2f}/{:.2f}, min/max reward {:.2f}/{:.2f}, "
                  "success rate {:.2f}, avg fwdloss {:.2f}\n"
                  .format(j, total_num_steps,
                          int(total_num_steps / (end - start)),
                          len(episode_rewards),
                          np.mean(episode_rewards),
                          np.median(episode_rewards),
                          np.min(episode_rewards),
                          np.max(episode_rewards),
                          np.count_nonzero(np.greater(episode_rewards, 0)) / len(episode_rewards),
                          np.mean(avg_fwdloss)))
            all_rewards.append(np.mean(episode_rewards))
            if args.use_logger:
                logger.save_task_results(all_rewards)
        ####################### Saving and book-keeping #######################

    envs.close()
class TD3_agent(object):
    def __init__(self, config: Dict) -> None:
        super().__init__()
        self.lr = config['lr']
        self.gamma = config['gamma']
        self.tau = config['tau']
        self.noise_std = config['noise_std']
        self.noise_clip = config['noise_clip']
        self.a_max = config['a_max']
        self.a_min = config['a_min']
        self.batch_size = config['batch_size']
        self.update_delay = config['update_delay']
        self.device = torch.device(config['device'])

        self.policy = Policy(config).to(self.device)
        self.policy_target = Policy(config).to(self.device)
        self.twin_q = Twin_Q(config).to(self.device)
        self.twin_q_target = Twin_Q(config).to(self.device)

        self.optimizer_pi = optim.Adam(self.policy.parameters(), lr=self.lr)
        self.optimizer_q = optim.Adam(self.twin_q.parameters(), lr=self.lr)

        self.policy_target.load_state_dict(self.policy.state_dict())
        self.twin_q_target.load_state_dict(self.twin_q.state_dict())

        self.memory = Memory(config['memory_size'])

    def choose_action(self, obs: np.ndarray, use_noise: bool = True) -> np.ndarray:
        obs = torch.from_numpy(obs).to(self.device).float()
        with torch.no_grad():
            action = self.policy(obs)
        if use_noise:
            noise = torch.randn_like(action, dtype=torch.float,
                                     device=self.device) * self.noise_std
            action = action + noise
        action = np.clip(action.cpu().numpy(), self.a_min, self.a_max)
        return action

    def get_target_action(self, next_obs_batch: torch.Tensor) -> torch.Tensor:
        target_action = self.policy_target(next_obs_batch)
        noise = (torch.randn_like(target_action) * self.noise_std).clamp(
            -self.noise_clip, self.noise_clip)
        return target_action + noise

    def update_critic(self, obs_batch: torch.Tensor, a_batch: torch.Tensor,
                      next_obs_batch: torch.Tensor, r_batch: torch.Tensor,
                      done_batch: torch.Tensor) -> float:
        next_action_target = self.get_target_action(next_obs_batch)
        Q1_next, Q2_next = self.twin_q_target(next_obs_batch, next_action_target)
        Q_target = r_batch + (1 - done_batch) * self.gamma * torch.min(Q1_next, Q2_next)
        Q1_predict, Q2_predict = self.twin_q(obs_batch, a_batch)
        loss_critic = F.mse_loss(Q1_predict, Q_target) + F.mse_loss(Q2_predict, Q_target)
        self.optimizer_q.zero_grad()
        loss_critic.backward()
        self.optimizer_q.step()
        return loss_critic.item()

    def update_actor(self, obs_batch: torch.Tensor) -> float:
        loss_actor = -self.twin_q.Q1_value(obs_batch, self.policy(obs_batch)).mean()
        self.optimizer_pi.zero_grad()
        loss_actor.backward()
        self.optimizer_pi.step()
        return loss_actor.item()

    def update_target(self) -> None:
        soft_update(self.policy, self.policy_target, self.tau)
        soft_update(self.twin_q, self.twin_q_target, self.tau)

    def update(self, step: int):
        batch = self.memory.sample(batch_size=self.batch_size)
        o, a, r, o_, done = batch
        o = torch.from_numpy(np.array(o)).to(self.device).float()
        a = torch.from_numpy(np.array(a)).to(self.device).float()
        r = torch.from_numpy(np.array(r)).to(self.device).float()
        o_ = torch.from_numpy(np.array(o_)).to(self.device).float()
        done = torch.from_numpy(np.array(done)).to(self.device).int()

        loss_critic = self.update_critic(o, a, r, o_, done)
        loss_actor = 0.
        # delayed policy update
        if step % self.update_delay == 0:
            loss_actor = self.update_actor(o)
            self.update_target()
        return loss_actor, loss_critic

    def save_transition(self, transition: List) -> None:
        self.memory.save_trans(transition)
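# TD3_agent.update_target relies on a soft_update helper that is not shown in this snippet.
# A minimal sketch of the Polyak averaging it appears to assume
# (target <- tau * source + (1 - tau) * target); illustrative, not the project's actual helper.
import torch
import torch.nn as nn

def soft_update(source: nn.Module, target: nn.Module, tau: float) -> None:
    with torch.no_grad():
        for src_param, tgt_param in zip(source.parameters(), target.parameters()):
            # blend each target parameter toward the online network
            tgt_param.data.mul_(1.0 - tau).add_(tau * src_param.data)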
def main():
    import copy
    import glob
    import os
    import time
    import matplotlib.pyplot as plt
    import gym
    import numpy as np
    import torch
    torch.multiprocessing.set_start_method('spawn')
    import torch.nn as nn
    import torch.nn.functional as F
    import torch.optim as optim
    from gym.spaces import Discrete
    from arguments import get_args
    from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
    from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv
    from baselines.common.vec_env.vec_normalize import VecNormalize
    from envs import make_env
    from img_env_corner import ImgEnv, IMG_ENVS
    from model import Policy
    from storage import RolloutStorage
    from utils import update_current_obs, eval_episode
    from torchvision import transforms
    from visdom import Visdom
    import algo

    viz = Visdom(port=8097)

    print("#######")
    print("WARNING: All rewards are clipped or normalized so you need to use a monitor "
          "(see envs.py) or visdom plot to get true rewards")
    print("#######")

    plot_rewards = []
    plot_policy_loss = []
    plot_value_loss = []
    # x = np.array([0])
    # y = np.array([0])
    # counter = 0
    # win = viz.line(X=x, Y=y, win="test1", name='Line1', opts=dict(title='Reward'))
    # win2 = viz.line(X=x, Y=y, win="test2", name='Line2', opts=dict(title='Policy Loss'))
    # win3 = viz.line(X=x, Y=y, win="test3", name='Line3', opts=dict(title='Value Loss'))

    args = get_args()
    if args.no_cuda:
        args.cuda = False
    print(args)

    assert args.algo in ['a2c', 'ppo', 'acktr']
    if args.recurrent_policy:
        assert args.algo in ['a2c', 'ppo'], \
            'Recurrent policy is not implemented for ACKTR'

    num_updates = int(args.num_frames) // args.num_steps // args.num_processes

    torch.manual_seed(args.seed)
    if args.cuda:
        torch.cuda.manual_seed(args.seed)

    toprint = ['seed', 'lr', 'nat', 'resnet']
    if args.env_name in IMG_ENVS:
        toprint += ['window', 'max_steps']
    toprint.sort()
    name = args.tag
    args_param = vars(args)
    os.makedirs(os.path.join(args.out_dir, args.env_name), exist_ok=True)
    for arg in toprint:
        if arg in args_param and (args_param[arg] or arg in ['gamma', 'seed']):
            if args_param[arg] is True:
                name += '{}_'.format(arg)
            else:
                name += '{}{}_'.format(arg, args_param[arg])
    model_dir = os.path.join(args.out_dir, args.env_name, args.algo)
    os.makedirs(model_dir, exist_ok=True)

    results_dict = {
        'episodes': [],
        'rewards': [],
        'args': args
    }
    torch.set_num_threads(1)

    eval_env = make_env(args, 'cifar10', args.seed, 1, None,
                        args.add_timestep, natural=args.nat, train=False)
    envs = make_env(args, 'cifar10', args.seed, 1, None,
                    args.add_timestep, natural=args.nat, train=True)
    # print(envs)
    # envs = envs[0]
    # if args.num_processes > 1:
    #     envs = SubprocVecEnv(envs)
    # else:
    #     envs = DummyVecEnv(envs)
    # eval_env = DummyVecEnv(eval_env)
    # if len(envs.observation_space.shape) == 1:
    #     envs = VecNormalize(envs, gamma=args.gamma)

    obs_shape = envs.observation_space.shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])
    actor_critic = Policy(obs_shape, envs.action_space, args.recurrent_policy,
                          dataset=args.env_name, resnet=args.resnet,
                          pretrained=args.pretrained)

    if envs.action_space.__class__.__name__ == "Discrete":
        action_shape = 1
    else:
        action_shape = envs.action_space.shape[0]

    if args.cuda:
        actor_critic.cuda()

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef,
                               args.entropy_coef, lr=args.lr,
                               eps=args.eps, alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch,
                         args.num_mini_batch, args.value_loss_coef,
                         args.entropy_coef, lr=args.lr, eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef,
                               args.entropy_coef, acktr=True)

    action_space = envs.action_space
    if args.env_name in IMG_ENVS:
        action_space = np.zeros(2)
    # obs_shape = envs.observation_space.shape
    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape,
                              action_space, actor_critic.state_size)
    current_obs = torch.zeros(args.num_processes, *obs_shape)

    obs = envs.reset()
    update_current_obs(obs, current_obs, obs_shape, args.num_stack)
    rollouts.observations[0].copy_(current_obs)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])

    if args.cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()

    start = time.time()
    for j in range(num_updates):
        # envs.display_original(j)
        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, states = actor_critic.act(
                    rollouts.observations[step],
                    rollouts.states[step],
                    rollouts.masks[step])
            cpu_actions = action.squeeze(1).cpu().numpy()

            # Observe reward and next obs
            obs, reward, done, info = envs.step(cpu_actions)
            # envs.display_step(step, j)
            # print("OBS", obs)
            # print("REWARD", reward)
            # print("DONE", done)
            # print("INFO", info)
            reward = torch.from_numpy(np.expand_dims(np.stack([reward]), 1)).float()
            episode_rewards += reward

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in [done]])
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            if args.cuda:
                masks = masks.cuda()

            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks

            update_current_obs(obs, current_obs, obs_shape, args.num_stack)
            rollouts.insert(current_obs, states, action, action_log_prob,
                            value, reward, masks)
            # print("envs.curr_img SHAPE: ", envs.curr_img.shape)
            # display_state = envs.curr_img
            # display_state[:, envs.pos[0]:envs.pos[0]+envs.window, envs.pos[1]:envs.pos[1]+envs.window] = 5
            # display_state = custom_replace(display_state, 1, 0)
            # display_state[:, envs.pos[0]:envs.pos[0]+envs.window, envs.pos[1]:envs.pos[1]+envs.window] = \
            #     envs.curr_img[:, envs.pos[0]:envs.pos[0]+envs.window, envs.pos[1]:envs.pos[1]+envs.window]
            # img = transforms.ToPILImage()(display_state)
            # img.save("state_cifar/" + "state" + str(j) + "_" + str(step) + ".png")

        with torch.no_grad():
            next_value = actor_critic.get_value(rollouts.observations[-1],
                                                rollouts.states[-1],
                                                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        if j % args.save_interval == 0:
            torch.save((actor_critic.state_dict(), results_dict),
                       os.path.join(model_dir, name + 'cifar_model_ppo_ex1_corner.pt'))

        if j % args.log_interval == 0:
            end = time.time()
            total_reward = eval_episode(eval_env, actor_critic, args)
            results_dict['rewards'].append(total_reward)
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            print("Updates {}, num timesteps {}, FPS {}, reward {:.1f} entropy {:.5f}, "
                  "value loss {:.5f}, policy loss {:.5f}".
                  format(j, total_num_steps,
                         int(total_num_steps / (end - start)),
                         np.mean(results_dict['rewards'][-10:]),
                         dist_entropy, value_loss, action_loss))

            plot_rewards.append(np.mean(results_dict['rewards'][-10:]))
            plot_policy_loss.append(action_loss)
            plot_value_loss.append(value_loss)

            plt.plot(range(len(plot_rewards)), plot_rewards)
            plt.savefig("rewards_corner.png")
            plt.close()
            plt.plot(range(len(plot_policy_loss)), plot_policy_loss)
            plt.savefig("policyloss_corner.png")
            plt.close()
            plt.plot(range(len(plot_value_loss)), plot_value_loss)
            plt.savefig("valueloss_corner.png")
            plt.close()
def main():
    torch.set_num_threads(1)
    device = torch.device("cpu")

    # if args.vis:
    #     from visdom import Visdom
    #     viz = Visdom(port=args.port)
    #     win = None

    # envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
    #                      args.gamma, args.log_dir, args.add_timestep, device, False)
    observation_space = Box(low=0, high=10000, shape=(26,), dtype=np.float32)  # Box(84,84,4)
    action_space = Discrete(7)  # Discrete(4)

    actor_critic = Policy(observation_space.shape, action_space,
                          base_kwargs={'recurrent': None})
    actor_critic.to(device)

    # if args.algo == 'a2c':
    #     agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef,
    #                            args.entropy_coef, lr=args.lr,
    #                            eps=args.eps, alpha=args.alpha,
    #                            max_grad_norm=args.max_grad_norm)
    # elif args.algo == 'ppo':
    #     agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch,
    #                      args.value_loss_coef, args.entropy_coef, lr=args.lr,
    #                      eps=args.eps, max_grad_norm=args.max_grad_norm)
    # elif args.algo == 'acktr':
    agent = algo.A2C_ACKTR(actor_critic, value_loss_coef=0.1,
                           entropy_coef=0.01, acktr=True)

    rollouts = RolloutStorage(8000, 1, observation_space.shape, action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
           0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0]
    rollouts.obs[0].copy_(torch.Tensor(obs))
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)

    f = open('poktr_rtmdp_20_2.txt', 'w')
    f.write("\noriginal loss(schedule 6 packets):")

    start = time.time()
    for j in range(num_updates):  # num_updates
        net = Net()
        node_list, path_list = net.read_graph(net.node_list, net.path_list)
        startnode = node_list[0]  # start node
        net.get_data(startnode)
        count = 0
        remove_count = 0  # counts the dropped packets
        end_time = startnode.messages[0].end_time
        pre_action_item = random.randint(0, 6)
        pre_action_item_oh = convert_one_hot(pre_action_item, 7)
        s = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
             end_time, pre_action_item_oh]
        # stores the states of all nodes
        states = [[0], [], [], [], [], [], [], [], [], [],
                  [], [], [], [], [], [], [], [], [], []]
        ep_r = 0
        ep_acc_r = 0
        obs[:] = s
        reward_ten = torch.Tensor(1, 1)
        pre_value = torch.FloatTensor([[0.1]])
        pre_action = torch.Tensor([[random.randint(0, 6)]])
        pre_action_log_prob = torch.FloatTensor([[-1.]])
        pre_recurrent_hidden_states = torch.FloatTensor([[0.]])
        pre_masks = torch.FloatTensor([[0.]])

        for step in range(8000):
            # Sample actions
            count += 1
            old_action_log_prob = torch.Tensor([[0]])
            # print(rollouts, rollouts.obs[step], rollouts.recurrent_hidden_states[step], rollouts.masks[step])
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step],
                    rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])
            action_item = action.item()  # convert the tensor to a Python int
            action_item_oh = convert_one_hot(action_item, 7)

            # Observe reward and next obs
            obs, reward, done, states, remove_count, acc_r, su_packets = net.schedule(
                pre_action_item, count, states, node_list, path_list, remove_count)
            ep_r += reward
            ep_acc_r += acc_r
            reward_ten[[0]] = reward
            # for info in infos:
            #     if 'episode' in info.keys():
            #         episode_rewards.append(info['episode']['r'])
            obs.extend(pre_action_item_oh)

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done else [1.0]])
            # print((obs), recurrent_hidden_states, torch.Tensor(action), type(action_log_prob), type(value), type(reward), type(masks))
            rollouts.insert(torch.Tensor(obs), recurrent_hidden_states, action,
                            action_log_prob, value, reward_ten, masks)
            # rollouts.insert(torch.Tensor(obs), pre_recurrent_hidden_states, pre_action,
            #                 pre_action_log_prob, pre_value, reward_ten, pre_masks)
            pre_action = action
            pre_action_item = action_item
            pre_action_log_prob = action_log_prob
            pre_recurrent_hidden_states = recurrent_hidden_states
            pre_value = value
            pre_action_item_oh = convert_one_hot(pre_action_item, 7)

        f.write("\ntime:" + str(time.strftime('%H:%M:%S', time.localtime(time.time()))) +
                "|" + str(j) + "|ep_r:" + str(ep_r) + "|packets:" + str(su_packets) +
                "|remove:" + str(remove_count) + "|ep_acc_r:" + str(ep_acc_r / 8000))
        f.flush()

        with torch.no_grad():
            next_value = actor_critic.get_value(rollouts.obs[-1],
                                                rollouts.recurrent_hidden_states[-1],
                                                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, False, 0.99, 0.95)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        print("time:", time.strftime('%H:%M:%S', time.localtime(time.time())),
              "|", j, "|ep_r:", ep_r, "|packets:", su_packets,
              "|remove:", remove_count, "|ep_acc_r:", ep_acc_r / 8000,
              "|value_loss:", value_loss, "|action_loss:", action_loss,
              "|entropy:", dist_entropy)

        rollouts.after_update()
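# convert_one_hot is called above but not defined in this snippet. A minimal sketch of what
# it appears to do (return a length-n one-hot list for an integer index, so it can be
# appended to the flat observation via obs.extend(...)); an illustrative guess, not the
# project's actual helper.
def convert_one_hot(index, n):
    one_hot = [0] * n
    one_hot[index] = 1
    return one_hot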
def main(): torch.set_num_threads(1) if args.vis: from visdom import Visdom viz = Visdom(port=args.port) win = None # Environment stuffs envs = [] for i in range(args.num_processes): if args.scene_dir: scene_dir = os.path.join(args.scene_dir, "seed{}".format(args.seed + i)) assert os.path.exists(scene_dir) else: scene_dir = None envs.append( make_env(args.env_name, args.seed, i, log_path, args.add_timestep, scene_dir)) # Hack infomation of gym environment tmp_env = envs[0]() sensor_type = tmp_env.unwrapped.hp_sensing_mode num_agent = tmp_env.unwrapped.hp_uav_n dim = tmp_env.unwrapped.hp_dim # Shape of o_env for each agent, required by the observation feature extraction module of the model if sensor_type == "lidar": atom_o_env_shape = tmp_env.unwrapped.hp_lidar_n + dim elif sensor_type == "pos": atom_o_env_shape = (dim + 1) * tmp_env.unwrapped.hp_n_nearest_obs else: raise Exception( "No implementation for sensing mode {}".format(sensor_type)) if args.num_processes > 1: envs = SubprocVecEnv(envs) else: envs = DummyVecEnv(envs) if len(envs.observation_space.shape) == 1: if not args.unordered: envs = VecNormalize( envs, gamma=args.gamma ) # Different observation normalization factors for different agents else: envs = VecNormalize(envs, gamma=args.gamma, num_agent=num_agent) num_subagents = num_agent if args.indep else 1 # The way you view the robot team (i.e., a virtual structure or many robots) obs_shape = envs.observation_space.shape atom_obs_shape = (obs_shape[0] // num_subagents * args.num_stack, *obs_shape[1:]) # Shape for each logical agent action_shape = envs.action_space.shape atom_action_shape = (action_shape[0] // num_subagents, *action_shape[1:]) # Agent stuffs (core elements of PPO) if args.load_dir: # Resume from breakpoint print("Loading model parameters from: " + args.load_dir) actor_critic, ob_rms, ret_rms = torch.load(args.load_dir) assert envs.ob_rms.mean.shape == ob_rms.mean.shape, "Mismatched observation shape, which may be induced by wrong flags (e.g., --unordered / --num_stack)" envs.ob_rms = ob_rms envs.ret_rms = ret_rms else: actor_critic = Policy(atom_obs_shape, atom_action_shape, sensor_type, atom_o_env_shape, dim, num_agent, args.unordered, args.indep, args.sigmoid, args.share, args.no_rnn) if args.cuda: actor_critic.cuda() agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm) rollouts = [ RolloutStorage(args.num_steps, args.num_processes, atom_obs_shape, atom_action_shape, actor_critic.state_size) for _ in range(num_subagents) ] # Auxiliary stuffs current_obs = [ torch.zeros(args.num_processes, *atom_obs_shape) for _ in range(num_subagents) ] # Stack sequent observations to get current_obs, using the trick of reshaping. # # current_obs # Index |1 |2 |3 # Observation |a1 a2 a3 |b1 b2 b3 |c1 c2 c3 def update_current_obs(obs, idx): nonlocal current_obs shape_dim0 = atom_obs_shape[0] // args.num_stack obs = torch.from_numpy(obs).float() if args.num_stack > 1: current_obs[idx][:, :-shape_dim0] = current_obs[idx][:, shape_dim0:] current_obs[idx][:, -shape_dim0:] = obs obs = envs.reset() for i in range(num_subagents): update_current_obs( obs[:, i * atom_obs_shape[0]:(i + 1) * atom_obs_shape[0]], i) rollouts[i].observations[0].copy_(current_obs[i]) # These variables are used to compute average rewards for all processes. 
episode_rewards = torch.zeros([args.num_processes, 1]) final_rewards = torch.zeros([args.num_processes, 1]) if args.cuda: for i in range(num_subagents): current_obs[i] = current_obs[i].cuda() rollouts[i].cuda() # Main loop train_start = datetime.datetime.now() print("Training starts at: {}".format(train_start)) env_time = 0. # time cost of interaction with environment env_compute_time = 0. env_step_time = 0. env_rollout_time = 0. update_time = 0. # time cost of updating parameters log_time = 0. # time cost of logging for j in range(num_updates): # Interact with the environment start_env_time = time.time() # Timer for step in range(args.num_steps): start_env_compute_time = time.time() # Sample actions with torch.no_grad(): l_value, l_action, l_action_log_prob, l_states = [], [], [], [] for i in range(num_subagents): value, action, action_log_prob, states = actor_critic.act( rollouts[i].observations[step], rollouts[i].states[step], rollouts[i].masks[step]) l_value.append(value) l_action.append(action) l_action_log_prob.append(action_log_prob) l_states.append(states) action = torch.cat(l_action, dim=1) cpu_actions = action.squeeze(1).cpu().numpy() env_compute_time += time.time() - start_env_compute_time start_env_step_time = time.time() obs, reward, done, info = envs.step(cpu_actions) env_step_time += time.time() - start_env_step_time start_env_rollout_time = time.time() reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float() episode_rewards += reward # If done then clean the history of observations. masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) # final_rewards is the accumulated reward of the last trajectory, episode_rewards is an auxuliary variable. # The motivation is to enable logging in arbitrary time step. 
final_rewards *= masks final_rewards += ( 1 - masks ) * episode_rewards # If not done, mask=1, final_rewards doesn't change episode_rewards *= masks if args.cuda: masks = masks.cuda() for i in range(num_subagents): current_obs[i] *= masks # Useful when args.num_stack > 1 update_current_obs( obs[:, i * atom_obs_shape[0]:(i + 1) * atom_obs_shape[0]], i) rollouts[i].insert(current_obs[i], l_states[i], l_action[i], l_action_log_prob[i], l_value[i], reward, masks) env_rollout_time += time.time() - start_env_rollout_time env_time += time.time() - start_env_time # Update parameters start_update_time = time.time() # Timer for i in range(num_subagents): with torch.no_grad(): next_value = actor_critic.get_value( rollouts[i].observations[-1], rollouts[i].states[-1], rollouts[i].masks[-1]).detach() rollouts[i].compute_returns(next_value, args.use_gae, args.gamma, args.tau) value_loss, action_loss, dist_entropy = agent.update(rollouts[i]) rollouts[i].after_update() update_time += time.time() - start_update_time # Logging start_log_time = time.time() # Timer # Save models if j % args.save_interval == 0 or j == num_updates - 1: # A really ugly way to save a model to CPU save_model = actor_critic if args.cuda: save_model = copy.deepcopy(actor_critic).cpu() save_model = [ save_model, hasattr(envs, 'ob_rms') and envs.ob_rms or None, hasattr(envs, 'ret_rms') and envs.ret_rms or None ] torch.save(save_model, os.path.join(model_path, "model" + str(j) + ".pt")) # For logging training information if j % args.log_interval == 0 or j == num_updates - 1: log_env_time = [] for i, info_i in enumerate(info): log_reset_i = " Average reset time for env{}: {:.1f}ms = {:.1f}h / {}".format( i, info_i['reset_time'] * 1000 / info_i['reset_num'], info_i['reset_time'] / 3600, info_i['reset_num']) log_step_i = " Average step time for env{}: {:.1f}ms = {:.1f}h / {}".format( i, info_i['step_time'] * 1000 / info_i['step_num'], info_i['step_time'] / 3600, info_i['step_num']) log_env_time.append(log_reset_i) log_env_time.append(log_step_i) log_env_time = '\n'.join(log_env_time) current_time = datetime.datetime.now() summary = '\n'.join([ "Training starts at: {}".format(train_start), "Current time: {}".format(current_time), "Elapsed time: {}".format(current_time - train_start), " Environment interaction: {:.1f}h".format( env_time / 3600), " Compute action: {:.1f}h".format( env_compute_time / 3600), " Rollout: {:.1f}h".format(env_rollout_time / 3600), " Interaction with gym: {:.1f}h".format( env_step_time / 3600), log_env_time, " Parameters update: {:.1f}h".format(update_time / 3600), " logging: {:.1f}h".format(log_time / 3600) ]) + '\n' # Write down summary of the training with open(os.path.join(root_path, "summary.txt"), 'w') as f: f.write(summary) # For Visdom visualization if args.vis and (j % args.vis_interval == 0 or j == num_updates - 1): # Sometimes monitor doesn't properly flush the outputs win = visdom_plot(viz, win, args.vis_env, log_path, title, args.algo, args.num_frames, save_dir=root_path) viz.save([args.vis_env]) log_time += time.time() - start_log_time print(summary)
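# Illustrative sketch (not part of the original scripts): how the masks /
# episode_rewards / final_rewards bookkeeping above lets us log per-process
# returns at an arbitrary update step. Variable names follow the code above;
# the toy rewards and masks are assumptions for the example only.
import torch

num_processes = 2
episode_rewards = torch.zeros(num_processes, 1)  # running return of the current episode
final_rewards = torch.zeros(num_processes, 1)    # return of the last finished episode

step_rewards = [torch.tensor([[1.0], [2.0]]),    # both processes receive reward
                torch.tensor([[1.0], [3.0]])]    # process 1 finishes at this step
step_masks = [torch.tensor([[1.0], [1.0]]),      # mask = 1.0 means "not done"
              torch.tensor([[1.0], [0.0]])]      # mask = 0.0 means "done"

for reward, masks in zip(step_rewards, step_masks):
    episode_rewards += reward
    final_rewards *= masks                           # keep old value while an episode is running
    final_rewards += (1 - masks) * episode_rewards   # copy the finished return
    episode_rewards *= masks                         # reset the accumulator where done
print(final_rewards)  # process 0: still 0.0 (unfinished), process 1: 5.0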
def main(): print("#######") print( "WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards" ) print("#######") torch.set_num_threads(1) if args.vis: from visdom import Visdom viz = Visdom(port=args.port) win = None envs = [ make_env(args.env_name, args.seed, i, args.log_dir, args.add_timestep) for i in range(args.num_processes) ] if args.num_processes > 1: envs = SubprocVecEnv(envs) else: envs = DummyVecEnv(envs) if len(envs.observation_space.shape) == 1: envs = VecNormalize(envs) obs_shape = envs.observation_space.shape obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:]) actor_critic = Policy(obs_shape, envs.action_space, args.recurrent_policy) if envs.action_space.__class__.__name__ == "Discrete": action_shape = 1 else: action_shape = envs.action_space.shape[0] if args.cuda: actor_critic.cuda() if args.algo == 'a2c': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, alpha=args.alpha, max_grad_norm=args.max_grad_norm) elif args.algo == 'ppo': agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm) elif args.algo == 'acktr': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, acktr=True) rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape, envs.action_space, actor_critic.state_size) current_obs = torch.zeros(args.num_processes, *obs_shape) def update_current_obs(obs): shape_dim0 = envs.observation_space.shape[0] obs = torch.from_numpy(obs).float() if args.num_stack > 1: current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:] current_obs[:, -shape_dim0:] = obs obs = envs.reset() update_current_obs(obs) rollouts.observations[0].copy_(current_obs) # These variables are used to compute average rewards for all processes. episode_rewards = torch.zeros([args.num_processes, 1]) final_rewards = torch.zeros([args.num_processes, 1]) if args.cuda: current_obs = current_obs.cuda() rollouts.cuda() start = time.time() for j in range(num_updates): for step in range(args.num_steps): # Sample actions with torch.no_grad(): value, action, action_log_prob, states = actor_critic.act( rollouts.observations[step], rollouts.states[step], rollouts.masks[step]) cpu_actions = action.squeeze(1).cpu().numpy() # Obser reward and next obs obs, reward, done, info = envs.step(cpu_actions) reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float() episode_rewards += reward # If done then clean the history of observations. 
masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) final_rewards *= masks final_rewards += (1 - masks) * episode_rewards episode_rewards *= masks if args.cuda: masks = masks.cuda() if current_obs.dim() == 4: current_obs *= masks.unsqueeze(2).unsqueeze(2) else: current_obs *= masks update_current_obs(obs) rollouts.insert(current_obs, states, action, action_log_prob, value, reward, masks) with torch.no_grad(): next_value = actor_critic.get_value(rollouts.observations[-1], rollouts.states[-1], rollouts.masks[-1]).detach() rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau) value_loss, action_loss, dist_entropy = agent.update(rollouts) rollouts.after_update() if j % args.save_interval == 0 and args.save_dir != "": save_path = os.path.join(args.save_dir, args.algo) try: os.makedirs(save_path) except OSError: pass # A really ugly way to save a model to CPU save_model = actor_critic if args.cuda: save_model = copy.deepcopy(actor_critic).cpu() save_model = [ save_model, hasattr(envs, 'ob_rms') and envs.ob_rms or None ] torch.save(save_model, os.path.join(save_path, args.env_name + ".pt")) if j % args.log_interval == 0: end = time.time() total_num_steps = (j + 1) * args.num_processes * args.num_steps print( "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}" .format(j, total_num_steps, int(total_num_steps / (end - start)), final_rewards.mean(), final_rewards.median(), final_rewards.min(), final_rewards.max(), dist_entropy, value_loss, action_loss)) if args.vis and j % args.vis_interval == 0: try: # Sometimes monitor doesn't properly flush the outputs win = visdom_plot(viz, win, args.log_dir, args.env_name, args.algo, args.num_frames) except IOError: pass
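# Illustrative sketch (assumption, not the repository's RolloutStorage code):
# what compute_returns(next_value, use_gae, gamma, tau) typically does when
# use_gae is True -- Generalized Advantage Estimation over a stored rollout,
# with tau playing the role of the GAE lambda.
import torch

def compute_gae_returns(rewards, value_preds, masks, next_value, gamma, tau):
    """rewards/value_preds/masks: [T, N, 1]; next_value: [N, 1]; returns [T, N, 1].
    masks[t] is 0.0 where the episode ended after step t, else 1.0."""
    T = rewards.size(0)
    values = torch.cat([value_preds, next_value.unsqueeze(0)], dim=0)
    returns = torch.zeros_like(rewards)
    gae = torch.zeros_like(next_value)
    for step in reversed(range(T)):
        delta = rewards[step] + gamma * values[step + 1] * masks[step] - values[step]
        gae = delta + gamma * tau * masks[step] * gae
        returns[step] = gae + values[step]
    return returns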
def __init__(self): import random import gym_city import game_of_life self.fieldnames = self.get_fieldnames() args = get_args() args.log_dir = args.save_dir + '/logs' assert args.algo in ['a2c', 'ppo', 'acktr'] if args.recurrent_policy: assert args.algo in ['a2c', 'ppo'], \ 'Recurrent policy is not implemented for ACKTR' num_updates = int(args.num_frames) // args.num_steps // args.num_processes torch.manual_seed(args.seed) if args.cuda: print('CUDA ENABLED') torch.cuda.manual_seed(args.seed) graph_name = args.save_dir.split('trained_models/')[1].replace('/', ' ') self.graph_name = graph_name actor_critic = False agent = False past_steps = 0 try: os.makedirs(args.log_dir) except OSError: files = glob.glob(os.path.join(args.log_dir, '*.monitor.csv')) for f in files: if args.overwrite: os.remove(f) else: pass torch.set_num_threads(1) device = torch.device("cuda:0" if args.cuda else "cpu") self.device = device if args.vis: from visdom import Visdom viz = Visdom(port=args.port) self.viz = viz win = None self.win = win win_eval = None self.win_eval = win_eval if 'GameOfLife' in args.env_name: print('env name: {}'.format(args.env_name)) num_actions = 1 envs = make_vec_envs(args.env_name, args.seed, args.num_processes, args.gamma, args.log_dir, args.add_timestep, device, False, None, args=args) if isinstance(envs.observation_space, gym.spaces.Discrete): num_inputs = envs.observation_space.n elif isinstance(envs.observation_space, gym.spaces.Box): if 'golmulti' in args.env_name.lower(): multi_env = True observation_space_shape = envs.observation_space.shape[1:] else: multi_env = False observation_space_shape = envs.observation_space.shape self.multi_env = multi_env if len(observation_space_shape) == 3: in_w = observation_space_shape[1] in_h = observation_space_shape[2] else: in_w = 1 in_h = 1 num_inputs = observation_space_shape[0] if isinstance(envs.action_space, gym.spaces.Discrete) or\ isinstance(envs.action_space, gym.spaces.Box): out_w = args.map_width out_h = args.map_width if 'Micropolis' in args.env_name: #otherwise it's set if args.power_puzzle: num_actions = 1 else: num_actions = 19 # TODO: have this already from env elif 'GameOfLife' in args.env_name: num_actions = 1 else: num_actions = envs.action_space.n elif isinstance(envs.action_space, gym.spaces.Box): if len(envs.action_space.shape) == 3: out_w = envs.action_space.shape[1] out_h = envs.action_space.shape[2] elif len(envs.action_space.shape) == 1: out_w = 1 out_h = 1 num_actions = envs.action_space.shape[-1] print('num actions {}'.format(num_actions)) if args.auto_expand: args.n_recs -= 1 actor_critic = Policy(envs.observation_space.shape, envs.action_space, base_kwargs={'map_width': args.map_width, 'num_actions': num_actions, 'recurrent': args.recurrent_policy, 'prebuild': args.prebuild, 'in_w': in_w, 'in_h': in_h, 'num_inputs': num_inputs, 'out_w': out_w, 'out_h': out_h}, curiosity=args.curiosity, algo=args.algo, model=args.model, args=args) if args.auto_expand: args.n_recs += 1 evaluator = None self.evaluator = evaluator if not agent: agent = init_agent(actor_critic, args) vec_norm = get_vec_normalize(envs) self.vec_norm = vec_norm #saved_model = os.path.join(args.save_dir, args.env_name + '.pt') if args.load_dir: saved_model = os.path.join(args.load_dir, args.env_name + '.tar') else: saved_model = os.path.join(args.save_dir, args.env_name + '.tar') self.checkpoint = None if os.path.exists(saved_model) and not args.overwrite: checkpoint = torch.load(saved_model) self.checkpoint = checkpoint saved_args = checkpoint['args'] 
actor_critic.load_state_dict(checkpoint['model_state_dict']) #for o, l in zip(agent.optimizer.state_dict, checkpoint['optimizer_state_dict']): # print(o, l) #print(agent.optimizer.state_dict()['param_groups']) #print('\n') #print(checkpoint['model_state_dict']) actor_critic.to(self.device) #actor_critic.cuda() #agent = init_agent(actor_critic, saved_args) agent.optimizer.load_state_dict(checkpoint['optimizer_state_dict']) if args.auto_expand: if not args.n_recs - saved_args.n_recs == 1: print('can expand by 1 rec only from saved model, not {}'.format(args.n_recs - saved_args.n_recs)) raise Exception actor_critic.base.auto_expand() print('expanded net: \n{}'.format(actor_critic.base)) past_steps = checkpoint['past_steps'] ob_rms = checkpoint['ob_rms'] past_steps = next(iter(agent.optimizer.state_dict()['state'].values()))['step'] print('Resuming from step {}'.format(past_steps)) #print(type(next(iter((torch.load(saved_model)))))) #actor_critic, ob_rms = \ # torch.load(saved_model) #agent = \ # torch.load(os.path.join(args.save_dir, args.env_name + '_agent.pt')) #if not agent.optimizer.state_dict()['state'].values(): # past_steps = 0 #else: # raise Exception if vec_norm is not None: vec_norm.eval() vec_norm.ob_rms = ob_rms saved_args.num_frames = args.num_frames saved_args.vis_interval = args.vis_interval saved_args.eval_interval = args.eval_interval saved_args.overwrite = args.overwrite saved_args.n_recs = args.n_recs saved_args.intra_shr = args.intra_shr saved_args.inter_shr = args.inter_shr saved_args.map_width = args.map_width saved_args.render = args.render saved_args.print_map = args.print_map saved_args.load_dir = args.load_dir saved_args.experiment_name = args.experiment_name saved_args.log_dir = args.log_dir saved_args.save_dir = args.save_dir saved_args.num_processes = args.num_processes saved_args.n_chan = args.n_chan saved_args.prebuild = args.prebuild args = saved_args actor_critic.to(device) if 'LSTM' in args.model: recurrent_hidden_state_size = actor_critic.base.get_recurrent_state_size() else: recurrent_hidden_state_size = actor_critic.recurrent_hidden_state_size if args.curiosity: rollouts = CuriosityRolloutStorage(args.num_steps, args.num_processes, envs.observation_space.shape, envs.action_space, recurrent_hidden_state_size, actor_critic.base.feature_state_size(), args=args) else: rollouts = RolloutStorage(args.num_steps, args.num_processes, envs.observation_space.shape, envs.action_space, recurrent_hidden_state_size, args=args) obs = envs.reset() rollouts.obs[0].copy_(obs) rollouts.to(device) episode_rewards = deque(maxlen=10) start = time.time() self.model = model = actor_critic.base self.reset_eval = False plotter = None env_param_bounds = envs.get_param_bounds() # in case we want to change this dynamically in the future (e.g., we may # not know how much traffic the agent can possibly produce in Micropolis) envs.set_param_bounds(env_param_bounds) # start with default bounds if args.model == 'FractalNet' or args.model == 'fractal': n_cols = model.n_cols if args.rule == 'wide1' and args.n_recs > 3: col_step = 3 else: col_step = 1 else: n_cols = 0 col_step = 1 self.col_step = col_step env_param_bounds = envs.get_param_bounds() # in case we want to change this dynamically in the future (e.g., we may # not know how much traffic the agent can possibly produce in Micropolis) envs.set_param_bounds(env_param_bounds) # start with default bounds self.past_steps = past_steps self.num_updates = num_updates self.envs = envs self.start = start self.rollouts = rollouts self.args = args 
        self.actor_critic = actor_critic
        self.plotter = plotter
        self.agent = agent
        self.episode_rewards = episode_rewards
        self.n_cols = n_cols
class Agent(): def __init__(self, args): self.buffer_size = int(1e5) self.batch_size = 32 self.num_agents = 0 self.num_of_actions = 9 self.model = [] self.buffer = [] self.time = 0 self.gamma = 0.95 self.episode_length = 10000 self.args = args self.time_now = datetime.datetime.now().strftime('%Y-%m-%d') try: os.mkdir(self.time_now) except: pass def init(self, obs): self.device = 'cuda:0' if torch.cuda.is_available() else 'cpu' self.device = torch.device(self.device) self.num_agents = len(obs['image']) self.buffer = ReplayBuffer(self.buffer_size, self.batch_size, self.num_agents) self.model = Policy(self.num_of_actions).to(self.device) if self.args.model != 'None': self.load_model(self.args.model) # self.load_model('2019-07-09/30_160600') self.target = Policy(self.num_of_actions).to(self.device) self.update_target() self.optimizer = optim.Adam(self.model.parameters()) self.last_state_cnn = np.zeros((self.num_agents,3,128,128)) self.last_state_oth = np.zeros((self.num_agents, 11)) self.last_action = np.zeros((self.num_agents, 1)) def get_obs_cnn(self, obs): temp = [] for i in range(len(obs["image"])): temp.append(np.r_[obs["image"][i]]) temp = np.r_[temp] t = np.transpose(temp, (0,3,1,2)) # t /= 255.0 return t def get_obs_oth(self, obs): temp = [] # change in another network structure for i in range(len(obs["ir"])): temp.append(np.r_[obs["ir"][i], obs["gyro"][i], obs["target"][i]]) t = np.r_[temp] return t def get_new_cnn(self, t): t = np.concatenate((self.last_state_cnn, t), axis=1) return t def get_new_oth(self,t): t = np.concatenate((self.last_state_oth, t), axis=1) return t def update_target(self): self.target.load_state_dict(self.model.state_dict()) def get_action(self, obs, epsilon, done): if self.num_agents == 0: self.init(obs) state_cnn = self.get_obs_cnn(obs) state_oth = self.get_obs_oth(obs) cat_cnn = self.get_new_cnn(state_cnn) cat_oth = self.get_new_oth(state_oth) q = self.model(cat_cnn,cat_oth) actions = q.max(1)[1] index_action = np.zeros((self.num_agents,), dtype=np.uint8) for i in range(self.num_agents): if random.random() > epsilon: index_action[i] = random.randint(0, self.num_of_actions - 1) else: index_action[i] = actions[i].item() if done.item(0) != True: self.last_state_cnn = state_cnn self.last_state_oth = state_oth self.last_action = index_action elif done.item(0) == True: self.last_state_cnn = np.zeros((self.num_agents,3, 128, 128)) self.last_state_oth = np.zeros((self.num_agents, 11)) self.last_action = np.zeros((self.num_agents, 1)) return index_action def learn(self): self.time += 1 if len(self.buffer) < self.batch_size*self.num_agents: return state_cnn, state_oth, action, reward, next_cnn, next_oth, done = self.buffer.sample() # max_q = self.target(next_cnn, next_oth).max(1)[0].unsqueeze(1) pred_q = self.model(state_cnn, state_oth) pred_q = pred_q.gather(1, action.view(-1).unsqueeze(1).long()) target_chosen_actions = self.model(next_cnn, next_oth).max(1)[1].unsqueeze(1) max_q = self.target(next_cnn, next_oth).gather(1, target_chosen_actions) reward = reward.view(-1,1) true_q = reward + (1 - done) * self.gamma * max_q.detach() criterion = nn.MSELoss() loss = criterion(pred_q, true_q) self.optimizer.zero_grad() loss.backward() self.optimizer.step() self.model.reset_noise() self.target.reset_noise() if self.time % 10 == 0: self.update_target() if self.time % 100 == 0: self.save_model(self.time_now + '/' + str(self.num_agents) + '_' + str(self.time)) def store_experience(self, obs, action, reward, done): state_cnn = self.get_obs_cnn(obs) state_oth = 
self.get_obs_oth(obs)
        self.buffer.add(state_cnn, state_oth, action, reward, done)

    def save_model(self, filename):
        # filename = './' + str(self.num_agents)
        torch.save(self.model.state_dict(), filename)

    def load_model(self, filename):
        # filename = './' + self.num_agents
        self.model.load_state_dict(torch.load(filename))
        self.model.eval()
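# Illustrative sketch (not the Agent class above): the Double-DQN target that
# learn() builds -- the online network chooses the next action, the target
# network scores it. q_online_next/q_target_next stand in for
# self.model(next_cnn, next_oth) and self.target(next_cnn, next_oth).
import torch

def double_dqn_target(q_online_next, q_target_next, reward, done, gamma=0.95):
    """q_*_next: [B, num_actions]; reward/done: [B, 1]; returns a [B, 1] target."""
    next_actions = q_online_next.max(1)[1].unsqueeze(1)   # argmax under the online net
    next_q = q_target_next.gather(1, next_actions)        # evaluated by the target net
    return reward + (1 - done) * gamma * next_q.detach()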
class PPOCarla(Agent): def __init__(self, obs_converter, action_converter, clip_param, ppo_epoch, num_mini_batch, value_loss_coef, entropy_coef, lr=None, eps=None, max_grad_norm=None, use_clipped_value_loss=False): self.obs_converter = obs_converter self.action_converter = action_converter self.model = Policy( self.obs_converter.get_observation_space(), self.action_converter.get_action_space()).to("cuda:0") self.clip_param = clip_param self.ppo_epoch = ppo_epoch self.num_mini_batch = num_mini_batch self.value_loss_coef = value_loss_coef self.entropy_coef = entropy_coef self.max_grad_norm = max_grad_norm self.use_clipped_value_loss = use_clipped_value_loss self.optimizer = optim.Adam(self.model.parameters(), lr=lr, eps=eps) def update(self, rollouts): advantages = rollouts.returns[:-1] - rollouts.value_preds[:-1] advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-5) value_loss_epoch = 0 action_loss_epoch = 0 dist_entropy_epoch = 0 for e in range(self.ppo_epoch): if self.model.is_recurrent: data_generator = rollouts.recurrent_generator( advantages, self.num_mini_batch) else: data_generator = rollouts.feed_forward_generator( advantages, self.num_mini_batch) for sample in data_generator: obs_batch, recurrent_hidden_states_batch, actions_batch, \ value_preds_batch, return_batch, masks_batch, old_action_log_probs_batch, \ adv_targ = sample # Reshape to do in a single forward pass for all steps values, action_log_probs, dist_entropy, _ = self.model.evaluate_actions( obs_batch['img'], obs_batch['v'], recurrent_hidden_states_batch, masks_batch, actions_batch) ratio = torch.exp(action_log_probs - old_action_log_probs_batch) surr1 = ratio * adv_targ surr2 = torch.clamp(ratio, 1.0 - self.clip_param, 1.0 + self.clip_param) * adv_targ action_loss = -torch.min(surr1, surr2).mean() if self.use_clipped_value_loss: value_pred_clipped = value_preds_batch + \ (values - value_preds_batch).clamp(-self.clip_param, self.clip_param) value_losses = (values - return_batch).pow(2) value_losses_clipped = (value_pred_clipped - return_batch).pow(2) value_loss = .5 * torch.max(value_losses, value_losses_clipped).mean() else: value_loss = 0.5 * F.mse_loss(return_batch, values) self.optimizer.zero_grad() (value_loss * self.value_loss_coef + action_loss - dist_entropy * self.entropy_coef).backward() nn.utils.clip_grad_norm_(self.model.parameters(), self.max_grad_norm) self.optimizer.step() value_loss_epoch += value_loss.item() action_loss_epoch += action_loss.item() dist_entropy_epoch += dist_entropy.item() num_updates = self.ppo_epoch * self.num_mini_batch value_loss_epoch /= num_updates action_loss_epoch /= num_updates dist_entropy_epoch /= num_updates return value_loss_epoch, action_loss_epoch, dist_entropy_epoch def act(self, inputs, rnn_hxs, masks, deterministic=False): eps_curr = 0. # TODO: Change if you want to implement epsilon-greedy return self.model.act(inputs['img'], inputs['v'], rnn_hxs, masks, eps_curr, deterministic=False) def get_value(self, inputs, rnn_hxs, masks): return self.model.get_value(inputs['img'], inputs['v'], rnn_hxs, masks)
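# Illustrative sketch (toy numbers are assumptions, not from PPOCarla): how the
# clipped surrogate in update() behaves. With clip_param = 0.2, probability
# ratios beyond 1.2 stop improving the objective for positive advantages.
import torch

clip_param = 0.2
ratio = torch.tensor([0.5, 1.0, 1.5])      # new_prob / old_prob per sample
adv_targ = torch.tensor([1.0, 1.0, 1.0])   # positive advantages
surr1 = ratio * adv_targ
surr2 = torch.clamp(ratio, 1.0 - clip_param, 1.0 + clip_param) * adv_targ
action_loss = -torch.min(surr1, surr2).mean()
print(surr2)        # tensor([0.8000, 1.0000, 1.2000]) -- the 1.5 ratio is capped at 1.2
print(action_loss)  # -(0.5 + 1.0 + 1.2) / 3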
def main(): torch.set_num_threads(1) device = torch.device("cuda:0" if args.cuda else "cpu") """ if args.vis: from visdom import Visdom viz = Visdom(port=args.port) win = None """ envs = make_vec_envs(args.env_name, args.seed, args.num_processes, args.gamma, args.log_dir, args.add_timestep, device, False, args.ep_max_step) actor_critic = Policy(envs.observation_space.shape, envs.action_space, base_kwargs={'recurrent': args.recurrent_policy}) actor_critic.to(device) if args.useNeural: #FLAGS = update_tf_wrapper_args(args,) tf_config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False) tf_config.gpu_options.allow_growth = True sess = tf.Session(config=tf_config) pixel_bonus = PixelBonus(FLAGS, sess) tf.initialize_all_variables().run(session=sess) if args.loadNeural is not None: pixel_bonus.load_model(args.loadNeural) #with tf.variable_scope('step'): # self.step_op = tf.Variable(0, trainable=False, name='step') # self.step_input = tf.placeholder('int32', None, name='step_input') # self.step_assign_op = self.step_op.assign(self.step_input) if args.algo == 'a2c': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, alpha=args.alpha, max_grad_norm=args.max_grad_norm) elif args.algo == 'ppo': agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm) elif args.algo == 'acktr': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, acktr=True) rollouts = RolloutStorage(args.num_steps, args.num_processes, envs.observation_space.shape, envs.action_space, actor_critic.recurrent_hidden_state_size) obs = envs.reset() rollouts.obs[0].copy_(obs) rollouts.to(device) episode_rewards = deque(maxlen=100) steper =0 img_scale = 1 psc_weight = float(args.pscWeight) psc_rollout=list() start = time.time() for j in range(num_updates): step_counter = 0 psc_tot=list() for step in range(args.num_steps): # Sample actions with torch.no_grad(): value, action, action_log_prob, recurrent_hidden_states = actor_critic.act( rollouts.obs[step], rollouts.recurrent_hidden_states[step], rollouts.masks[step]) # Obser reward and next obs obs, reward, done, infos = envs.step(action) psc_add = 0 if args.useNeural: for i in obs[0]: frame = imresize((i / img_scale).cpu().numpy(), (42, 42), order=1) psc_add += pixel_bonus.bonus(i, steper) steper += 1 psc_add = psc_add / 12 else: useNeural = 0 psc_tot.append(psc_add) """ for info in infos: if 'episode' in info.keys(): print(reward) episode_rewards.append(info['episode']['r']) """ # FIXME: works only for environments with sparse rewards for idx, eps_done in enumerate(done): if eps_done: episode_rewards.append(reward[idx]) psc_add=torch.tensor(psc_add,requires_grad=True, dtype = torch.float) # If done then clean the history of observations. 
masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks, psc=psc_add) with torch.no_grad(): next_value = actor_critic.get_value(rollouts.obs[-1], rollouts.recurrent_hidden_states[-1], rollouts.masks[-1]).detach() rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau) value_loss, action_loss, dist_entropy = agent.update(rollouts, psc_tot, psc_weight) rollouts.after_update() if j % args.save_interval == 0 and args.save_dir != "": print('Saving model') print() save_path = os.path.join(args.save_dir, args.algo) try: os.makedirs(save_path) except OSError: pass # A really ugly way to save a model to CPU save_model = actor_critic if args.cuda: save_model = copy.deepcopy(actor_critic).cpu() save_model = [save_model, hasattr(envs.venv, 'ob_rms') and envs.venv.ob_rms or None] torch.save(save_model, os.path.join(save_path, args.env_name + ".pt")) total_num_steps = (j + 1) * args.num_processes * args.num_steps if j % args.log_interval == 0 and len(episode_rewards) > 1: end = time.time() print("Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.2f}/{:.2f}, min/max reward {:.2f}/{:.2f}, success rate {:.2f}\n". format( j, total_num_steps, int(total_num_steps / (end - start)), len(episode_rewards), np.mean(episode_rewards), np.median(episode_rewards), np.min(episode_rewards), np.max(episode_rewards), np.count_nonzero(np.greater(episode_rewards, 0)) / len(episode_rewards) ) ) if args.eval_interval is not None and len(episode_rewards) > 1 and j % args.eval_interval == 0: eval_envs = make_vec_envs(args.env_name, args.seed + args.num_processes, args.num_processes, args.gamma, eval_log_dir, args.add_timestep, device, True) if eval_envs.venv.__class__.__name__ == "VecNormalize": eval_envs.venv.ob_rms = envs.venv.ob_rms # An ugly hack to remove updates def _obfilt(self, obs): if self.ob_rms: obs = np.clip((obs - self.ob_rms.mean) / np.sqrt(self.ob_rms.var + self.epsilon), -self.clipob, self.clipob) return obs else: return obs eval_envs.venv._obfilt = types.MethodType(_obfilt, envs.venv) eval_episode_rewards = [] obs = eval_envs.reset() eval_recurrent_hidden_states = torch.zeros(args.num_processes, actor_critic.recurrent_hidden_state_size, device=device) eval_masks = torch.zeros(args.num_processes, 1, device=device) while len(eval_episode_rewards) < 10: with torch.no_grad(): _, action, _, eval_recurrent_hidden_states = actor_critic.act( obs, eval_recurrent_hidden_states, eval_masks, deterministic=True) # Obser reward and next obs obs, reward, done, infos = eval_envs.step(action) eval_masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) for info in infos: if 'episode' in info.keys(): eval_episode_rewards.append(info['episode']['r']) eval_envs.close() print(" Evaluation using {} episodes: mean reward {:.5f}\n".format( len(eval_episode_rewards), np.mean(eval_episode_rewards) )) if useNeural: pixel_bonus.save_model(str(args.nameDemonstrator) + "neural", step) """
def main(): torch.set_num_threads(1) device = torch.device("cuda:0" if args.cuda else "cpu") """ if args.vis: from visdom import Visdom viz = Visdom(port=args.port) win = None """ envs = make_vec_envs(args.env_name, args.seed, args.num_processes, args.gamma, args.log_dir, args.add_timestep, device, False) actor_critic = Policy(envs.observation_space.shape, envs.action_space, base_kwargs={'recurrent': args.recurrent_policy}) actor_critic.to(device) if args.algo == 'a2c': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, alpha=args.alpha, max_grad_norm=args.max_grad_norm) elif args.algo == 'ppo': agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm) elif args.algo == 'acktr': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, acktr=True) rollouts = RolloutStorage(args.num_steps, args.num_processes, envs.observation_space.shape, envs.action_space, actor_critic.recurrent_hidden_state_size) obs = envs.reset() rollouts.obs[0].copy_(obs) rollouts.to(device) episode_rewards = deque(maxlen=100) start = time.time() for j in range(num_updates): for step in range(args.num_steps): # Sample actions with torch.no_grad(): value, action, action_log_prob, recurrent_hidden_states = actor_critic.act( rollouts.obs[step], rollouts.recurrent_hidden_states[step], rollouts.masks[step]) # Obser reward and next obs obs, reward, done, infos = envs.step(action) """ for info in infos: if 'episode' in info.keys(): print(reward) episode_rewards.append(info['episode']['r']) """ # FIXME: works only for environments with sparse rewards for idx, eps_done in enumerate(done): if eps_done: episode_rewards.append(reward[idx]) # If done then clean the history of observations. masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks) with torch.no_grad(): next_value = actor_critic.get_value(rollouts.obs[-1], rollouts.recurrent_hidden_states[-1], rollouts.masks[-1]).detach() rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau) value_loss, action_loss, dist_entropy = agent.update(rollouts) rollouts.after_update() if j % args.save_interval == 0 and args.save_dir != "": print('Saving model') print() save_path = os.path.join(args.save_dir, args.algo) try: os.makedirs(save_path) except OSError: pass # A really ugly way to save a model to CPU save_model = actor_critic if args.cuda: save_model = copy.deepcopy(actor_critic).cpu() save_model = [save_model, hasattr(envs.venv, 'ob_rms') and envs.venv.ob_rms or None] torch.save(save_model, os.path.join(save_path, args.env_name + ".pt")) total_num_steps = (j + 1) * args.num_processes * args.num_steps if j % args.log_interval == 0 and len(episode_rewards) > 1: end = time.time() print("Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.2f}/{:.2f}, min/max reward {:.2f}/{:.2f}, success rate {:.2f}\n". 
format( j, total_num_steps, int(total_num_steps / (end - start)), len(episode_rewards), np.mean(episode_rewards), np.median(episode_rewards), np.min(episode_rewards), np.max(episode_rewards), np.count_nonzero(np.greater(episode_rewards, 0)) / len(episode_rewards) ) ) with open(save_path+'/TrainingStats_file.csv', mode='a') as train_file: train_writer = csv.writer(train_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL) train_writer.writerow([j, total_num_steps, int(total_num_steps / (end - start)), len(episode_rewards), np.mean(episode_rewards), np.median(episode_rewards), np.min(episode_rewards), np.max(episode_rewards), np.count_nonzero(np.greater(episode_rewards, 0)) / len(episode_rewards)]) if args.eval_interval is not None and len(episode_rewards) > 1 and j % args.eval_interval == 0: eval_envs = make_vec_envs(args.env_name, args.seed + args.num_processes, args.num_processes, args.gamma, eval_log_dir, args.add_timestep, device, True) if eval_envs.venv.__class__.__name__ == "VecNormalize": eval_envs.venv.ob_rms = envs.venv.ob_rms # An ugly hack to remove updates def _obfilt(self, obs): if self.ob_rms: obs = np.clip((obs - self.ob_rms.mean) / np.sqrt(self.ob_rms.var + self.epsilon), -self.clipob, self.clipob) return obs else: return obs eval_envs.venv._obfilt = types.MethodType(_obfilt, envs.venv) eval_episode_rewards = [] obs = eval_envs.reset() eval_recurrent_hidden_states = torch.zeros(args.num_processes, actor_critic.recurrent_hidden_state_size, device=device) eval_masks = torch.zeros(args.num_processes, 1, device=device) while len(eval_episode_rewards) < 10: with torch.no_grad(): _, action, _, eval_recurrent_hidden_states = actor_critic.act( obs, eval_recurrent_hidden_states, eval_masks, deterministic=True) # Obser reward and next obs obs, reward, done, infos = eval_envs.step(action) eval_masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) for info in infos: if 'episode' in info.keys(): eval_episode_rewards.append(info['episode']['r']) eval_envs.close() print(" Evaluation using {} episodes: mean reward {:.5f}\n".format( len(eval_episode_rewards), np.mean(eval_episode_rewards) )) """ if args.vis and j % args.vis_interval == 0: try: # Sometimes monitor doesn't properly flush the outputs win = visdom_plot(viz, win, args.log_dir, args.env_name, args.algo, args.num_frames) except IOError: pass """ envs.close()
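# Illustrative sketch (assumption about VecNormalize's internals): why the
# evaluation code above copies ob_rms from the training envs and monkey-patches
# _obfilt -- evaluation should reuse the training normalization statistics
# without continuing to update them.
import numpy as np

class FrozenObsFilter:
    """Normalize observations with fixed running statistics."""
    def __init__(self, ob_rms, clipob=10.0, epsilon=1e-8):
        self.ob_rms, self.clipob, self.epsilon = ob_rms, clipob, epsilon

    def __call__(self, obs):
        obs = (obs - self.ob_rms.mean) / np.sqrt(self.ob_rms.var + self.epsilon)
        return np.clip(obs, -self.clipob, self.clipob)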
def __init__(self, args, im_log_dir): self.im_log_dir = im_log_dir self.log_dir = args.load_dir env_name = args.env_name if torch.cuda.is_available() and not args.no_cuda: args.cuda = True device = torch.device('cuda') map_location = torch.device('cuda') else: args.cuda = False device = torch.device('cpu') map_location = torch.device('cpu') try: checkpoint = torch.load(os.path.join(args.load_dir, env_name + '.tar'), map_location=map_location) except FileNotFoundError: print('load-dir does not start with valid gym environment id, using command line args') env_name = args.env_name checkpoint = torch.load(os.path.join(args.load_dir, env_name + '.tar'), map_location=map_location) saved_args = checkpoint['args'] past_frames = checkpoint['n_frames'] args.past_frames = past_frames env_name = saved_args.env_name if 'Micropolis' in env_name: args.power_puzzle = saved_args.power_puzzle if not args.evaluate and not 'GoLMulti' in env_name: # assume we just want to observe/interact w/ a single env. args.num_proc = 1 dummy_args = args envs = make_vec_envs(env_name, args.seed + 1000, args.num_processes, None, args.load_dir, args.add_timestep, device=device, allow_early_resets=False, args=dummy_args) print(args.load_dir) if isinstance(envs.observation_space, gym.spaces.Discrete): in_width = 1 num_inputs = envs.observation_space.n elif isinstance(envs.observation_space, gym.spaces.Box): if len(envs.observation_space.shape) == 3: in_w = envs.observation_space.shape[1] in_h = envs.observation_space.shape[2] else: in_w = 1 in_h = 1 num_inputs = envs.observation_space.shape[0] if isinstance(envs.action_space, gym.spaces.Discrete): out_w = 1 out_h = 1 num_actions = int(envs.action_space.n // (in_w * in_h)) #if 'Micropolis' in env_name: # num_actions = env.venv.venv.envs[0].num_tools #elif 'GameOfLife' in env_name: # num_actions = 1 #else: # num_actions = env.action_space.n elif isinstance(envs.action_space, gym.spaces.Box): out_w = envs.action_space.shape[0] out_h = envs.action_space.shape[1] num_actions = envs.action_space.shape[-1] # We need to use the same statistics for normalization as used in training #actor_critic, ob_rms = \ # torch.load(os.path.join(args.load_dir, args.env_name + ".pt")) if saved_args.model == 'fractal': saved_args.model = 'FractalNet' actor_critic = Policy(envs.observation_space.shape, envs.action_space, base_kwargs={'map_width': args.map_width, 'recurrent': args.recurrent_policy, 'in_w': in_w, 'in_h': in_h, 'num_inputs': num_inputs, 'out_w': out_w, 'out_h': out_h }, curiosity=args.curiosity, algo=saved_args.algo, model=saved_args.model, args=saved_args) actor_critic.to(device) torch.nn.Module.dump_patches = True actor_critic.load_state_dict(checkpoint['model_state_dict']) ob_rms = checkpoint['ob_rms'] if 'fractal' in args.model.lower(): new_recs = args.n_recs - saved_args.n_recs for nr in range(new_recs): actor_critic.base.auto_expand() print('expanded network:\n', actor_critic.base) if args.active_column is not None \ and hasattr(actor_critic.base, 'set_active_column'): actor_critic.base.set_active_column(args.active_column) vec_norm = get_vec_normalize(envs) if vec_norm is not None: vec_norm.eval() vec_norm.ob_rms = ob_rms self.actor_critic = actor_critic self.envs = envs self.args = args
def gen_frequencies():
    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")
    envs = make_vec_envs(args.env_name, args.seed, 1, args.gammas[-1], None,
                         args.add_timestep, device, False)
    actor_critic = Policy(envs.observation_space.shape, envs.action_space,
                          base_kwargs={
                              'recurrent': args.recurrent_policy,
                              'num_values': args.num_values,
                              'sum_values': args.sum_values
                          })
    state_dict = torch.load(args.log_dir + '/ppo/' + args.env_name + '.pt')
    actor_critic.load_state_dict(state_dict[0].state_dict())
    actor_critic.to(device)

    rollouts = RolloutStorage(1, 1, envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size,
                              tau=args.tau, gammas=args.gammas,
                              use_delta_gamma=args.use_delta_gamma,
                              use_capped_bias=args.use_capped_bias)
    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = []
    values = []
    rewards = []
    NUM_STEPS = 10000
    total_num_rewards = 0
    for step in range(NUM_STEPS):
        with torch.no_grad():
            value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                obs, rollouts.recurrent_hidden_states[0], rollouts.masks[0])
        obs, reward, done, infos = envs.step(action)
        r = reward.item()
        if r != 0:
            total_num_rewards += 1
        # If done then clean the history of observations.
        masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])
        rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob,
                        value, reward, masks)

    with open('learned_frequencies/' + args.env_name[:-14] +
              '_learned_reward_frequency.pkl', 'wb') as handle:
        pickle.dump(total_num_rewards / NUM_STEPS, handle)
            print(pr_y[i])
            break
    print("test auc_local: ", auc)
    print("p_4", p_4)
    return auc, p_4, pr_x, pr_y, test_result


if __name__ == "__main__":
    conf = config.Config()
    os.environ['CUDA_VISIBLE_DEVICES'] = conf.gpu
    conf.load_train_data()
    conf.load_test_data()
    tree = Tree(conf)
    conf.global_num_classes = tree.n_class
    base_model = PCNN_ATT(conf)
    policy = Policy(conf, tree.n_class, base_model)
    policy.cuda()
    policy_optimizer = torch.optim.SGD(policy.parameters(), lr=conf.policy_lr,
                                       weight_decay=conf.policy_weight_decay)
    for name, parameters in policy.named_parameters():
        print(name, parameters.size())
    criterion = torch.nn.CrossEntropyLoss()
    if conf.is_training:
        train()
    else:
        test()