def __init__(self, envs, hparams): # self.use_gae = hparams['use_gae'] self.gamma = hparams['gamma'] # self.tau = hparams['tau'] self.obs_shape = hparams['obs_shape'] self.num_steps = hparams['num_steps'] self.num_processes = hparams['num_processes'] self.value_loss_coef = hparams['value_loss_coef'] self.entropy_coef = hparams['entropy_coef'] self.cuda = hparams['cuda'] self.opt = hparams['opt'] if hparams['dropout'] == True: print ('CNNPolicy_dropout2') actor_critic = CNNPolicy_dropout2(self.obs_shape[0], envs.action_space) # actor_critic = CNNPolicy_dropout(self.obs_shape[0], envs.action_space) elif len(envs.observation_space.shape) == 3: print ('CNNPolicy2') actor_critic = CNNPolicy2(self.obs_shape[0], envs.action_space) # actor_critic = CNNPolicy(self.obs_shape[0], envs.action_space) else: actor_critic = MLPPolicy(self.obs_shape[0], envs.action_space) if envs.action_space.__class__.__name__ == "Discrete": action_shape = 1 else: action_shape = envs.action_space.shape[0] self.action_shape = action_shape rollouts = RolloutStorage_list() #self.num_steps, self.num_processes, self.obs_shape, envs.action_space) #it has a self.state that is [steps, processes, obs] #steps is used to compute expected reward if self.cuda: actor_critic.cuda() # rollouts.cuda() if self.opt == 'rms': self.optimizer = optim.RMSprop(params=actor_critic.parameters(), lr=hparams['lr'], eps=hparams['eps'], alpha=hparams['alpha']) elif self.opt == 'adam': self.optimizer = optim.Adam(params=actor_critic.parameters(), lr=hparams['lr'], eps=hparams['eps']) else: print ('no opt specified') self.actor_critic = actor_critic self.rollouts = rollouts
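The constructor above is driven entirely by an hparams dict; here is a minimal wiring sketch (hedged: the enclosing class name a2c_agent and the concrete values are assumptions, only the keys mirror what this __init__ actually reads):

# Hypothetical usage sketch; class name and values are illustrative only.
hparams = {
    'gamma': 0.99,              # discount factor
    'obs_shape': (4, 84, 84),   # stacked-frame observation shape
    'num_steps': 5,             # rollout length per update
    'num_processes': 16,        # parallel environments
    'value_loss_coef': 0.5,
    'entropy_coef': 0.01,
    'cuda': False,
    'opt': 'rms', 'lr': 7e-4, 'eps': 1e-5, 'alpha': 0.99,
    'dropout': False,
}
agent = a2c_agent(envs, hparams)  # envs assumed to expose observation_space / action_space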
def select_network(self): if len(self.envs.observation_space.shape) == 3: actor_critic = CNNPolicy(self.obs_shape[0], self.envs.action_space, self.args.recurrent_policy) else: assert not self.args.recurrent_policy, \ "Recurrent policy is not implemented for the MLP controller" actor_critic = MLPPolicy(self.obs_shape[0], self.envs.action_space) #actor_critic = BPW_MLPPolicy(obs_shape[0], self.envs.action_space) return actor_critic
def __init__(self, envs, hparams): self.use_gae = hparams['use_gae'] self.gamma = hparams['gamma'] self.tau = hparams['tau'] self.obs_shape = hparams['obs_shape'] self.num_steps = hparams['num_steps'] self.num_processes = hparams['num_processes'] self.value_loss_coef = hparams['value_loss_coef'] self.entropy_coef = hparams['entropy_coef'] self.cuda = hparams['cuda'] self.ppo_epoch = hparams['ppo_epoch'] self.batch_size = hparams['batch_size'] self.clip_param = hparams['clip_param'] if len(envs.observation_space.shape) == 3: actor_critic = CNNPolicy(self.obs_shape[0], envs.action_space) else: actor_critic = MLPPolicy(self.obs_shape[0], envs.action_space) if envs.action_space.__class__.__name__ == "Discrete": action_shape = 1 else: action_shape = envs.action_space.shape[0] self.action_shape = action_shape rollouts = RolloutStorage(self.num_steps, self.num_processes, self.obs_shape, envs.action_space) #it has a self.state that is [steps, processes, obs] #steps is used to compute expected reward if self.cuda: actor_critic.cuda() rollouts.cuda() self.eps = hparams['eps'] # self.optimizer = optim.RMSprop(params=actor_critic.parameters(), lr=hparams['lr'], eps=hparams['eps'], alpha=hparams['alpha']) self.optimizer = optim.Adam(params=actor_critic.parameters(), lr=hparams['lr'], eps=hparams['eps']) # if hparams['lr_schedule'] == 'linear': self.init_lr = hparams['lr'] self.final_lr = hparams['final_lr'] # lr_func = lambda epoch: max( init_lr*(1.-(epoch/500.)), final_lr) # self.optimizer2 = lr_scheduler.LambdaLR(self.optimizer, lr_lambda=lr_func) # self.current_lr = hparams['lr'] self.actor_critic = actor_critic self.rollouts = rollouts self.old_model = copy.deepcopy(self.actor_critic)
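The deepcopy into self.old_model only matters at update time, where the old policy's log-probabilities anchor the clipped PPO objective. A minimal sketch of that objective, assuming tensors shaped like those produced by evaluate_actions elsewhere in this file:

import torch

def ppo_loss_sketch(action_log_probs, old_action_log_probs, adv_targ, returns, values,
                    dist_entropy, clip_param, value_loss_coef, entropy_coef):
    # Clipped surrogate (L^CLIP) plus value and entropy terms, as in the PPO update
    # branches later in this document.
    ratio = torch.exp(action_log_probs - old_action_log_probs.detach())
    surr1 = ratio * adv_targ
    surr2 = torch.clamp(ratio, 1.0 - clip_param, 1.0 + clip_param) * adv_targ
    action_loss = -torch.min(surr1, surr2).mean()
    value_loss = (returns - values).pow(2).mean()
    return value_loss * value_loss_coef + action_loss - dist_entropy * entropy_coef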
def __init__(self, envs, hparams): self.use_gae = hparams['use_gae'] self.gamma = hparams['gamma'] self.tau = hparams['tau'] self.obs_shape = hparams['obs_shape'] self.num_steps = hparams['num_steps'] self.num_processes = hparams['num_processes'] self.value_loss_coef = hparams['value_loss_coef'] self.entropy_coef = hparams['entropy_coef'] self.cuda = hparams['cuda'] self.opt = hparams['opt'] self.grad_clip = hparams['grad_clip'] if hparams['dropout'] == True: print ('CNNPolicy_dropout2') actor_critic = CNNPolicy_dropout2(self.obs_shape[0], envs.action_space) # actor_critic = CNNPolicy_dropout(self.obs_shape[0], envs.action_space) elif len(envs.observation_space.shape) == 3: print ('CNNPolicy2') actor_critic = CNNPolicy2(self.obs_shape[0], envs.action_space) # actor_critic = CNNPolicy(self.obs_shape[0], envs.action_space) else: actor_critic = MLPPolicy(self.obs_shape[0], envs.action_space) if envs.action_space.__class__.__name__ == "Discrete": action_shape = 1 else: action_shape = envs.action_space.shape[0] self.action_shape = action_shape rollouts = RolloutStorage(self.num_steps, self.num_processes, self.obs_shape, envs.action_space) #it has a self.state that is [steps, processes, obs] #steps is used to compute expected reward if self.cuda: actor_critic.cuda() rollouts.cuda() if self.opt == 'rms': self.optimizer = optim.RMSprop(params=actor_critic.parameters(), lr=hparams['lr'], eps=hparams['eps'], alpha=hparams['alpha']) elif self.opt == 'adam': self.optimizer = optim.Adam(params=actor_critic.parameters(), lr=hparams['lr'], eps=hparams['eps']) elif self.opt == 'sgd': self.optimizer = optim.SGD(params=actor_critic.parameters(), lr=hparams['lr'], momentum=hparams['mom']) else: print ('no opt specified') self.actor_critic = actor_critic self.rollouts = rollouts self.rollouts_list = RolloutStorage_list()
def load_policy(self): actor_critic = MLPPolicy(self.args.obs_shape[1], self.args.full_state_shape[1], self.env.robot.action_space, symm_policy=self.args.symm_policy) print(os.path.join(self.args.load_dir + self.args.algo, self.args.phase, self.args.env_name, self.args.env_name + self.args.tr_itr + ".pt")) state_dict, ob_rms, st_rms, ret_rms = \ torch.load( os.path.join(self.args.load_dir + self.args.algo, self.args.phase, self.args.env_name, self.args.env_name + self.args.tr_itr + ".pt"), map_location='cpu') actor_critic.load_state_dict(state_dict) actor_critic.train(False) actor_critic.eval() self.env.robot.ob_rms = ob_rms return actor_critic
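load_policy unpacks a 4-tuple (state_dict, ob_rms, st_rms, ret_rms) from torch.load; a hedged sketch of the saving side that would produce such a file. The helper name, args.save_dir, and the st_rms/ret_rms attributes on env.robot are assumptions inferred from what the loader reads, not confirmed by the source:

import os
import torch

def save_policy(actor_critic, env, args, itr):
    # Hypothetical counterpart to load_policy above: writes the same 4-tuple the loader unpacks.
    save_dir = os.path.join(args.save_dir + args.algo, args.phase, args.env_name)
    os.makedirs(save_dir, exist_ok=True)
    payload = (actor_critic.state_dict(),
               getattr(env.robot, 'ob_rms', None),
               getattr(env.robot, 'st_rms', None),
               getattr(env.robot, 'ret_rms', None))
    torch.save(payload, os.path.join(save_dir, args.env_name + itr + ".pt"))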
def __init__(self, envs, cuda, num_steps, num_processes, obs_shape, lr, eps, alpha, use_gae, gamma, tau, value_loss_coef, entropy_coef): if len(envs.observation_space.shape) == 3: actor_critic = CNNPolicy(obs_shape[0], envs.action_space) else: actor_critic = MLPPolicy(obs_shape[0], envs.action_space) if envs.action_space.__class__.__name__ == "Discrete": action_shape = 1 else: action_shape = envs.action_space.shape[0] if cuda: actor_critic.cuda() # if args.algo == 'a2c': self.optimizer = optim.RMSprop(actor_critic.parameters(), lr, eps=eps, alpha=alpha) # keyword args: RMSprop's positional order is (params, lr, alpha, eps), so passing eps and alpha positionally would swap them rollouts = RolloutStorage(num_steps, num_processes, obs_shape, envs.action_space) #it has a self.state that is [steps, processes, obs] #steps is used to compute expected reward if cuda: rollouts.cuda() self.actor_critic = actor_critic self.rollouts = rollouts self.use_gae = use_gae self.gamma = gamma self.tau = tau self.obs_shape = obs_shape self.action_shape = action_shape self.num_steps = num_steps self.num_processes = num_processes self.value_loss_coef = value_loss_coef self.entropy_coef = entropy_coef
def main(): print("#######") print("WARNING: All rewards are clipped so you need to use a monitor (see envs.py) or visdom plot to get true rewards") print("#######") os.environ['OMP_NUM_THREADS'] = '1' print (args.cuda) print (args.num_steps) print (args.num_processes) print (args.lr) print (args.eps) print (args.alpha) print (args.use_gae) print (args.gamma) print (args.tau) print (args.value_loss_coef) print (args.entropy_coef) fdsafasd # if args.vis: # from visdom import Visdom # viz = Visdom() # win = None envs = SubprocVecEnv([ make_env(args.env_name, args.seed, i, args.log_dir) for i in range(args.num_processes) ]) # print('here3') # fdasf obs_shape = envs.observation_space.shape obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:]) if len(envs.observation_space.shape) == 3: actor_critic = CNNPolicy(obs_shape[0], envs.action_space) else: actor_critic = MLPPolicy(obs_shape[0], envs.action_space) if envs.action_space.__class__.__name__ == "Discrete": action_shape = 1 else: action_shape = envs.action_space.shape[0] if args.cuda: actor_critic.cuda() if args.algo == 'a2c': optimizer = optim.RMSprop(actor_critic.parameters(), args.lr, eps=args.eps, alpha=args.alpha) # elif args.algo == 'ppo': # optimizer = optim.Adam(actor_critic.parameters(), args.lr, eps=args.eps) # elif args.algo == 'acktr': # optimizer = KFACOptimizer(actor_critic) rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape, envs.action_space) #it has a self.state that is [steps, processes, obs] #steps is used to compute expected reward current_state = torch.zeros(args.num_processes, *obs_shape) def update_current_state(state): shape_dim0 = envs.observation_space.shape[0] state = torch.from_numpy(state).float() if args.num_stack > 1: current_state[:, :-shape_dim0] = current_state[:, shape_dim0:] current_state[:, -shape_dim0:] = state state = envs.reset() update_current_state(state) rollouts.states[0].copy_(current_state) #set the first state to current state # These variables are used to compute average rewards for all processes. episode_rewards = torch.zeros([args.num_processes, 1]) final_rewards = torch.zeros([args.num_processes, 1]) if args.cuda: current_state = current_state.cuda() rollouts.cuda() # if args.algo == 'ppo': # old_model = copy.deepcopy(actor_critic) start = time.time() for j in range(num_updates): for step in range(args.num_steps): # Sample actions value, action = actor_critic.act(Variable(rollouts.states[step], volatile=True)) # make prediction using state that you put into rollouts cpu_actions = action.data.squeeze(1).cpu().numpy() # Obser reward and next state state, reward, done, info = envs.step(cpu_actions) # print (state.shape) # [nProcesss, ndims, height, width] # fsdf reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float() episode_rewards += reward # If done then clean the history of observations. # these final rewards are only used for printing. 
but the mask is used in the storage, dont know why yet # oh its just clearing the env that finished, and resetting its episode_reward masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) #if an env is done final_rewards *= masks final_rewards += (1 - masks) * episode_rewards episode_rewards *= masks if args.cuda: masks = masks.cuda() if current_state.dim() == 4: current_state *= masks.unsqueeze(2).unsqueeze(2) else: current_state *= masks update_current_state(state) rollouts.insert(step, current_state, action.data, value.data, reward, masks) # insert all that info into current step # not exactly why next_value = actor_critic(Variable(rollouts.states[-1], volatile=True))[0].data # use last state to make prediction of next value if hasattr(actor_critic, 'obs_filter'): actor_critic.obs_filter.update(rollouts.states[:-1].view(-1, *obs_shape)) #not sure what this is rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau) # this computes R = r + r+ ...+ V(t) for each step if args.algo in ['a2c', 'acktr']: values, action_log_probs, dist_entropy = actor_critic.evaluate_actions(Variable(rollouts.states[:-1].view(-1, *obs_shape)), Variable(rollouts.actions.view(-1, action_shape))) # I think this aciton log prob could have been computed and stored earlier # and didnt we already store the value prediction??? values = values.view(args.num_steps, args.num_processes, 1) action_log_probs = action_log_probs.view(args.num_steps, args.num_processes, 1) advantages = Variable(rollouts.returns[:-1]) - values value_loss = advantages.pow(2).mean() action_loss = -(Variable(advantages.data) * action_log_probs).mean() # if args.algo == 'acktr' and optimizer.steps % optimizer.Ts == 0: # # Sampled fisher, see Martens 2014 # actor_critic.zero_grad() # pg_fisher_loss = -action_log_probs.mean() # value_noise = Variable(torch.randn(values.size())) # if args.cuda: # value_noise = value_noise.cuda() # sample_values = values + value_noise # vf_fisher_loss = -(values - Variable(sample_values.data)).pow(2).mean() # fisher_loss = pg_fisher_loss + vf_fisher_loss # optimizer.acc_stats = True # fisher_loss.backward(retain_graph=True) # optimizer.acc_stats = False optimizer.zero_grad() (value_loss * args.value_loss_coef + action_loss - dist_entropy * args.entropy_coef).backward() # if args.algo == 'a2c': # nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm) optimizer.step() # elif args.algo == 'ppo': # advantages = rollouts.returns[:-1] - rollouts.value_preds[:-1] # advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-5) # old_model.load_state_dict(actor_critic.state_dict()) # if hasattr(actor_critic, 'obs_filter'): # old_model.obs_filter = actor_critic.obs_filter # for _ in range(args.ppo_epoch): # sampler = BatchSampler(SubsetRandomSampler(range(args.num_processes * args.num_steps)), args.batch_size * args.num_processes, drop_last=False) # for indices in sampler: # indices = torch.LongTensor(indices) # if args.cuda: # indices = indices.cuda() # states_batch = rollouts.states[:-1].view(-1, *obs_shape)[indices] # actions_batch = rollouts.actions.view(-1, action_shape)[indices] # return_batch = rollouts.returns[:-1].view(-1, 1)[indices] # # Reshape to do in a single forward pass for all steps # values, action_log_probs, dist_entropy = actor_critic.evaluate_actions(Variable(states_batch), Variable(actions_batch)) # _, old_action_log_probs, _ = old_model.evaluate_actions(Variable(states_batch, volatile=True), Variable(actions_batch, volatile=True)) # ratio = 
torch.exp(action_log_probs - Variable(old_action_log_probs.data)) # adv_targ = Variable(advantages.view(-1, 1)[indices]) # surr1 = ratio * adv_targ # surr2 = torch.clamp(ratio, 1.0 - args.clip_param, 1.0 + args.clip_param) * adv_targ # action_loss = -torch.min(surr1, surr2).mean() # PPO's pessimistic surrogate (L^CLIP) # value_loss = (Variable(return_batch) - values).pow(2).mean() # optimizer.zero_grad() # (value_loss + action_loss - dist_entropy * args.entropy_coef).backward() # optimizer.step() rollouts.states[0].copy_(rollouts.states[-1]) # the first state is now the last state of the previous # if j % args.save_interval == 0 and args.save_dir != "": # save_path = os.path.join(args.save_dir, args.algo) # try: # os.makedirs(save_path) # except OSError: # pass # # A really ugly way to save a model to CPU # save_model = actor_critic # if args.cuda: # save_model = copy.deepcopy(actor_critic).cpu() # torch.save(save_model, os.path.join(save_path, args.env_name + ".pt")) if j % args.log_interval == 0: end = time.time() total_num_steps = (j + 1) * args.num_processes * args.num_steps print("Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}". format(j, total_num_steps, int(total_num_steps / (end - start)), final_rewards.mean(), final_rewards.median(), final_rewards.min(), final_rewards.max(), -dist_entropy.data[0], value_loss.data[0], action_loss.data[0]))
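The call rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau) above is what turns the stored rewards into the bootstrapped targets R_t = r_t + gamma * R_{t+1} used by the A2C loss. A minimal sketch of the non-GAE case, assuming rewards and masks laid out as [num_steps, num_processes, 1]; this illustrates the idea rather than reproducing the storage class's exact code:

import torch

def compute_returns_sketch(rewards, masks, next_value, gamma):
    # rewards, masks: [num_steps, num_processes, 1]; next_value: [num_processes, 1]
    # Discounted, bootstrapped returns; masks zero the bootstrap across episode ends.
    num_steps = rewards.size(0)
    returns = torch.zeros(num_steps + 1, *next_value.size())
    returns[-1] = next_value
    for step in reversed(range(num_steps)):
        returns[step] = rewards[step] + gamma * returns[step + 1] * masks[step]
    return returns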
def main(): print("#######") print( "WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards" ) print("#######") print(args) try: os.makedirs(args.log_dir) except OSError: files = glob.glob(os.path.join(args.log_dir, '*.monitor.csv')) for f in files: os.remove(f) torch.cuda.manual_seed(args.seed) torch.manual_seed(args.seed) np.random.seed(args.seed) random.seed(args.seed) for gamma in args.gamma: with open(args.log_dir + '/MSE_' + str(gamma) + '_monitor.csv', "wt") as monitor_file: monitor = csv.writer(monitor_file) monitor.writerow([ 'update', 'error', str(int(args.num_frames) // args.num_steps) ]) os.environ['OMP_NUM_THREADS'] = '1' print("Using env {}".format(args.env_name)) envs = [ make_env(args.env_name, args.seed, i, args.log_dir) for i in range(args.num_processes) ] if args.num_processes > 1: envs = SubprocVecEnv(envs) else: envs = DummyVecEnv(envs) if len(envs.observation_space.shape) == 1: envs = VecNormalize(envs) obs_shape = envs.observation_space.shape obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:]) num_heads = len( args.gamma) if not args.reward_predictor else len(args.gamma) - 1 if len(envs.observation_space.shape) == 3: actor_critic = CNNPolicy(obs_shape[0], envs.action_space, num_heads=num_heads, hidden_size=args.hidden_size) else: actor_critic = MLPPolicy(obs_shape[0], envs.action_space, num_heads=num_heads, reward_predictor=args.reward_predictor, use_s=args.use_s, use_s_a=args.use_s_a, use_s_a_sprime=args.use_s_a_sprime) if envs.action_space.__class__.__name__ == "Discrete": action_shape = 1 else: action_shape = envs.action_space.shape[0] if args.cuda: actor_critic.cuda() lrs = [args.lr] * len(actor_critic.param_groups) if not args.reward_predictor: assert len(actor_critic.param_groups) == len(lrs) model_params = [{ 'params': model_p, 'lr': args.lr } for model_p, lr in zip(actor_critic.param_groups, lrs)] else: model_params = [{ 'params': model_p, 'lr': p_lr } for model_p, p_lr in zip(actor_critic.param_groups[:-1], lrs)] model_params.append({ 'params': actor_critic.param_groups[-1], 'lr': args.lr_rp }) if args.algo == 'a2c': optimizer = optim.RMSprop(model_params, args.lr, eps=args.eps, alpha=args.alpha) elif args.algo == 'ppo': optimizer = optim.Adam(model_params, args.lr, eps=args.eps) rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape, envs.action_space, actor_critic.state_size, gamma=args.gamma, use_rp=args.reward_predictor) current_obs = torch.zeros(args.num_processes, *obs_shape) def update_current_obs(obs, obs_tensor): shape_dim0 = envs.observation_space.shape[0] obs = torch.from_numpy(obs).float() if args.num_stack > 1: obs_tensor[:, :-shape_dim0] = obs_tensor[:, shape_dim0:] obs_tensor[:, -shape_dim0:] = obs obs = envs.reset() update_current_obs(obs, current_obs) rollouts.observations[0].copy_(current_obs) # These variables are used to compute average rewards for all processes. 
episode_rewards = torch.zeros([args.num_processes, 1]) final_rewards = torch.zeros([args.num_processes, 1]) advantages_list = [] if args.cuda: current_obs = current_obs.cuda() rollouts.cuda() start = time.time() for j in range(num_updates): for step in range(args.num_steps): # Sample actions value, action, action_log_prob, states = actor_critic.act( Variable(rollouts.observations[step], volatile=True), Variable(rollouts.states[step], volatile=True), Variable(rollouts.masks[step], volatile=True)) cpu_actions = action.data.squeeze(1).cpu().numpy() cpu_actions = add_gaussian_noise(cpu_actions, args.action_noise) # Obser reward and next obs obs, raw_reward, done, info = envs.step(cpu_actions) reward = np.copy(raw_reward) reward = add_gaussian_noise(reward, args.reward_noise) reward = epsilon_greedy(reward, args.reward_epsilon, args.reward_high, args.reward_low) raw_reward = torch.from_numpy( np.expand_dims(np.stack(raw_reward), 1)).float() episode_rewards += raw_reward reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float() # If done then clean the history of observations. masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) final_rewards *= masks final_rewards += (1 - masks) * episode_rewards episode_rewards *= masks if args.cuda: masks = masks.cuda() if current_obs.dim() == 4: current_obs *= masks.unsqueeze(2).unsqueeze(2) else: current_obs *= masks update_current_obs(obs, current_obs) if args.reward_predictor: r_hat = actor_critic.predict_reward( Variable(rollouts.observations[step], volatile=True), action, Variable(current_obs, volatile=True)) p_hat = min(args.rp_burn_in, j) / args.rp_burn_in estimate_reward = (1 - p_hat) * reward + p_hat * r_hat.data.cpu() reward = torch.cat([reward, estimate_reward], dim=-1) value = torch.cat([r_hat, value], dim=-1).data else: value = value.data rollouts.insert(step, current_obs, states.data, action.data, action_log_prob.data, value, reward, masks, raw_reward) next_value = actor_critic( Variable(rollouts.observations[-1], volatile=True), Variable(rollouts.states[-1], volatile=True), Variable(rollouts.masks[-1], volatile=True))[0].data if args.reward_predictor: if args.use_s or args.use_s_a: r_hat = actor_critic.predict_reward( Variable(rollouts.observations[-1], volatile=True), Variable(rollouts.actions[-1], volatile=True), None).data next_value = torch.cat([r_hat, next_value], dim=-1) else: next_value = torch.cat([ torch.zeros(list(next_value.size())[:-1] + [1]), next_value ], dim=-1) rollouts.compute_returns(next_value, args.use_gae, args.tau) if args.algo in ['a2c']: batch_states = Variable(rollouts.states[0].view( -1, actor_critic.state_size)) batch_masks = Variable(rollouts.masks[:-1].view(-1, 1)) batch_obs = Variable(rollouts.observations[:-1].view( -1, *obs_shape)) batch_actions = Variable(rollouts.actions.view(-1, action_shape)) values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions( batch_obs, batch_states, batch_masks, batch_actions) if args.reward_predictor: batch_obs_prime = Variable(rollouts.observations[1:].view( -1, *obs_shape)) values = torch.cat([ actor_critic.predict_reward(batch_obs, batch_actions, batch_obs_prime), values ], dim=-1) returns_as_variable = Variable(rollouts.returns[:-1]) batched_v_loss = 0 values = values.view(returns_as_variable.size()) action_log_probs = action_log_probs.view(args.num_steps, args.num_processes, 1) advantages = returns_as_variable - values value_loss = advantages.pow(2).sum(-1).mean() action_loss = -(Variable(advantages[:, :, 
-1].unsqueeze(-1).data) * action_log_probs).mean() if args.reward_predictor: rp_error = (values[:, :, 0].data - rollouts.raw_rewards).pow(2).mean() advantages_list.append([ rp_error, advantages[:, :, -1].pow(2).mean().data.cpu().numpy()[0] ]) else: advantages_list.append( advantages[:, :, -1].pow(2).mean().data.cpu().numpy()[0]) optimizer.zero_grad() (batched_v_loss + value_loss * args.value_loss_coef + action_loss - dist_entropy * args.entropy_coef).backward() if args.algo == 'a2c': nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm) optimizer.step() elif args.algo == 'ppo': advantages = rollouts.returns[:-1] - rollouts.value_preds[:-1] advantages = advantages[:, :, -1] advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-5) for e in range(args.ppo_epoch): data_generator = rollouts.feed_forward_generator( advantages, args.num_mini_batch) for sample in data_generator: observations_batch, states_batch, actions_batch, \ return_batch, masks_batch, old_action_log_probs_batch, \ adv_targ, observations_batch_prime, true_rewards_batch, \ noisy_observations_batch, true_observations_batch = sample # Reshape to do in a single forward pass for all steps values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions( Variable(observations_batch), Variable(states_batch), Variable(masks_batch), Variable(actions_batch)) if args.reward_predictor: values = torch.cat([ actor_critic.predict_reward( Variable(observations_batch), Variable(actions_batch), Variable(observations_batch_prime)), values ], dim=-1) adv_targ = Variable(adv_targ) ratio = torch.exp(action_log_probs - Variable(old_action_log_probs_batch)) surr1 = ratio * adv_targ surr2 = torch.clamp(ratio, 1.0 - args.clip_param, 1.0 + args.clip_param) * adv_targ action_loss = -torch.min( surr1, surr2).mean() # PPO's pessimistic surrogate (L^CLIP) td = (Variable(return_batch) - values).pow(2) value_loss = td.sum(-1).mean() if args.reward_predictor: rp_error = (values[:, 0].data - true_rewards_batch).pow(2).mean() advantages_list.append( [rp_error, td[:, -1].mean(0).data.cpu().numpy()]) else: advantages_list.append( td[:, -1].mean(0).data.cpu().numpy()) optimizer.zero_grad() (value_loss + action_loss - dist_entropy * args.entropy_coef).backward() nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm) optimizer.step() rollouts.after_update() if j % args.save_interval == 0 and args.save_dir != "": save_path = os.path.join(args.save_dir, args.algo) try: os.makedirs(save_path) except OSError: pass # A really ugly way to save a model to CPU save_model = actor_critic if args.cuda: save_model = copy.deepcopy(actor_critic).cpu() save_model = [ save_model, hasattr(envs, 'ob_rms') and envs.ob_rms or None ] torch.save(save_model, os.path.join(save_path, args.env_name + ".pt")) if j % args.log_interval == 0: end = time.time() total_num_steps = (j + 1) * args.num_processes * args.num_steps print( "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, " "entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}".format( j, total_num_steps, int(total_num_steps / (end - start)), final_rewards.mean(), final_rewards.median(), final_rewards.min(), final_rewards.max(), dist_entropy.data[0], value_loss.data[0], action_loss.data[0])) if len(advantages_list) > 2: advantages_array = np.array(advantages_list).reshape( -1, len(args.gamma)).T for g, gamma in enumerate(args.gamma): with open( args.log_dir + '/MSE_' + str(gamma) + '_monitor.csv', "a") as monitor_file: 
monitor = csv.writer(monitor_file) monitor.writerow( [total_num_steps, np.mean(advantages_array[g])]) advantages_list = []
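When args.use_gae is set, compute_returns instead builds its targets from generalized advantage estimation, with args.tau playing the role of the GAE lambda. A hedged sketch under the same tensor-layout assumption as the sketch above:

import torch

def compute_gae_returns_sketch(rewards, masks, value_preds, next_value, gamma, tau):
    # value_preds: [num_steps, num_processes, 1] predicted V(s_t); tau is the GAE lambda.
    num_steps = rewards.size(0)
    values = torch.cat([value_preds, next_value.unsqueeze(0)], dim=0)
    returns = torch.zeros_like(values)
    gae = torch.zeros_like(next_value)
    for step in reversed(range(num_steps)):
        delta = rewards[step] + gamma * values[step + 1] * masks[step] - values[step]
        gae = delta + gamma * tau * masks[step] * gae
        returns[step] = gae + values[step]
    return returns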
def main(): print("#######") print( "WARNING: All rewards are clipped or normalized so you need to use a monit`or (see envs.py) or visdom plot to get true rewards" ) print("#######") os.environ['OMP_NUM_THREADS'] = '1' # logger = Logger(algorithm_name = args.algo, environment_name = args.env_name, folder = args.folder) # logger.save_args(args) # print ("---------------------------------------") # print ('Saving to', logger.save_folder) # print ("---------------------------------------") if args.vis: from visdom import Visdom viz = Visdom(port=args.port) win = None envs = [ make_env(args.env_name, args.seed, i, args.log_dir) for i in range(args.num_processes) ] if args.num_processes > 1: envs = SubprocVecEnv(envs) else: envs = DummyVecEnv(envs) if len(envs.observation_space.shape) == 1: envs = VecNormalize(envs) obs_shape = envs.observation_space.shape obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:]) if len(envs.observation_space.shape) == 3: actor_critic = CNNPolicy(obs_shape[0], envs.action_space, args.recurrent_policy) target_actor_critic = CNNPolicy(obs_shape[0], envs.action_space, args.recurrent_policy) else: actor_critic = MLPPolicy(obs_shape[0], envs.action_space) target_actor_critic = MLPPolicy(obs_shape[0], envs.action_space) for param, target_param in zip(actor_critic.parameters(), target_actor_critic.parameters()): target_param.data.copy_(param.data) if envs.action_space.__class__.__name__ == "Discrete": action_shape = 1 else: action_shape = envs.action_space.shape[0] if args.cuda: actor_critic.cuda() actor_regularizer_criterion = nn.KLDivLoss() optimizer = optim.RMSprop(actor_critic.parameters(), args.lr, eps=args.eps, alpha=args.alpha) rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape, envs.action_space, actor_critic.state_size) current_obs = torch.zeros(args.num_processes, *obs_shape) def update_current_obs(obs): shape_dim0 = envs.observation_space.shape[0] obs = torch.from_numpy(obs).float() if args.num_stack > 1: current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:] current_obs[:, -shape_dim0:] = obs obs = envs.reset() update_current_obs(obs) rollouts.observations[0].copy_(current_obs) # These variables are used to compute average rewards for all processes. episode_rewards = torch.zeros([args.num_processes, 1]) final_rewards = torch.zeros([args.num_processes, 1]) if args.cuda: current_obs = current_obs.cuda() rollouts.cuda() start = time.time() for j in range(num_updates): for step in range(args.num_steps): # Sample actions value, action, action_log_prob, states = actor_critic.act( Variable(rollouts.observations[step], volatile=True), Variable(rollouts.states[step], volatile=True), Variable(rollouts.masks[step], volatile=True)) cpu_actions = action.data.squeeze(1).cpu().numpy() # Obser reward and next obs obs, reward, done, info = envs.step(cpu_actions) reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float() episode_rewards += reward # If done then clean the history of observations. 
masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) final_rewards *= masks final_rewards += (1 - masks) * episode_rewards episode_rewards *= masks if args.cuda: masks = masks.cuda() if current_obs.dim() == 4: current_obs *= masks.unsqueeze(2).unsqueeze(2) else: current_obs *= masks update_current_obs(obs) rollouts.insert(step, current_obs, states.data, action.data, action_log_prob.data, value.data, reward, masks) next_value = actor_critic( Variable(rollouts.observations[-1], volatile=True), Variable(rollouts.states[-1], volatile=True), Variable(rollouts.masks[-1], volatile=True))[0].data rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau) values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions( Variable(rollouts.observations[:-1].view(-1, *obs_shape)), Variable(rollouts.states[0].view(-1, actor_critic.state_size)), Variable(rollouts.masks[:-1].view(-1, 1)), Variable(rollouts.actions.view(-1, action_shape))) """ Used for KL Constraint in case of Continuous Action Stochastic Policies """ # target_values, target_action_log_probs, target_dist_entropy, target_states, target_action_mean, target_action_std = target_actor_critic.evaluate_actions_mean_and_std(Variable(rollouts.observations[:-1].view(-1, *obs_shape)), # Variable(rollouts.states[0].view(-1, actor_critic.state_size)), # Variable(rollouts.masks[:-1].view(-1, 1)), # Variable(rollouts.actions.view(-1, action_shape))) # actor_regularizer_loss = (torch.log(action_std/target_action_std) + (action_std.pow(2) + (action_mean - target_action_mean).pow(2))/(2*target_action_std.pow(2)) - 0.5) values = values.view(args.num_steps, args.num_processes, 1) action_log_probs = action_log_probs.view(args.num_steps, args.num_processes, 1) advantages = Variable(rollouts.returns[:-1]) - values value_loss = advantages.pow(2).mean() ### Loss with regularizer added ##action_loss = -(Variable(advantages.data) * action_log_probs).mean() + args.actor_lambda * actor_regularizer_loss.mean(0).sum() action_loss = -(Variable(advantages.data) * action_log_probs).mean() optimizer.zero_grad() total_loss = value_loss * args.value_loss_coef + action_loss - dist_entropy * args.entropy_coef total_loss.backward() nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm) optimizer.step() ## Exponential average for target updates #if (j%args.target_update_interval == 0): # for param, target_param in zip(actor_critic.parameters(), target_actor_critic.parameters()): # target_param.data.copy_(args.target_tau * param.data + (1 - args.target_tau) * target_param.data) rollouts.after_update() if j % args.save_interval == 0 and args.save_dir != "": save_path = os.path.join(args.save_dir, args.algo) try: os.makedirs(save_path) except OSError: pass # A really ugly way to save a model to CPU save_model = actor_critic if args.cuda: save_model = copy.deepcopy(actor_critic).cpu() save_model = [ save_model, hasattr(envs, 'ob_rms') and envs.ob_rms or None ] torch.save(save_model, os.path.join(save_path, args.env_name + ".pt")) if j % args.log_interval == 0: end = time.time() total_num_steps = (j + 1) * args.num_processes * args.num_steps print( "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}" .format(j, total_num_steps, int(total_num_steps / (end - start)), final_rewards.mean(), final_rewards.median(), final_rewards.min(), final_rewards.max(), dist_entropy.data[0], value_loss.data[0], action_loss.data[0])) 
final_rewards_mean = [final_rewards.mean()] final_rewards_median = [final_rewards.median()] final_rewards_min = [final_rewards.min()] final_rewards_max = [final_rewards.max()] # logger.record_data(final_rewards_mean, final_rewards_median, final_rewards_min, final_rewards_max) # logger.save() if args.vis and j % args.vis_interval == 0: try: # Sometimes monitor doesn't properly flush the outputs win = visdom_plot(viz, win, args.log_dir, args.env_name, args.algo) except IOError: pass
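The commented-out actor_regularizer_loss in the update above is the closed-form, per-dimension KL divergence between two diagonal Gaussian policies. Below is a small helper in the standard orientation KL(N(mean, std) || N(target_mean, target_std)); note the commented code writes the log ratio the other way around, so treat this as the textbook form rather than a drop-in replacement:

import torch

def gaussian_kl(mean, std, target_mean, target_std):
    # Per-dimension KL( N(mean, std) || N(target_mean, target_std) ) for diagonal Gaussians.
    return (torch.log(target_std / std)
            + (std.pow(2) + (mean - target_mean).pow(2)) / (2.0 * target_std.pow(2))
            - 0.5)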
def main(): print("#######") print( "WARNING: All rewards are clipped so you need to use a monitor (see envs.py) or visdom plot to get true rewards" ) print("#######") os.environ['OMP_NUM_THREADS'] = '1' # T choose whetehr to visualize if args.vis: from visdom import Visdom viz = Visdom() win = None envs = SubprocVecEnv([ make_env(args.env_name, args.seed, i, args.log_dir) for i in range(args.num_processes) ]) # T get shape of observation array of the environment obs_shape = envs.observation_space.shape # T adjusting the shape; not sure what the * is obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:]) #T initialize the actor critic; MLP and CNN classes imported from model.py if len(envs.observation_space.shape) == 3: actor_critic = CNNPolicy(obs_shape[0], envs.action_space) else: actor_critic = MLPPolicy(obs_shape[0], envs.action_space) #T - some kind of setup with the actor_critic if args.finetune: checkpoint_path = save_path = os.path.join(args.save_dir, args.algo, args.checkpoint) state_dict = torch.load(checkpoint_path) print("Finetuning from checkpoint: %s, at step: %d" % (checkpoint_path, state_dict['update'])) actor_critic.load_state_dict(state_dict['model_state_dict']) keep_layers = [ 'v_fc3.weight', 'v_fc3.bias', 'a_fc2.weight', 'a_fc2.bias', 'dist.fc_mean.weight', 'dist.fc_mean.bias', 'dist.logstd._bias' ] for name, param in actor_critic.named_parameters(): if name not in keep_layers: param.requires_grad = False for name, param in actor_critic.named_parameters(): print('Param name: %s, requires_grad: %d' % (name, param.requires_grad)) # T set up dimensions of the action space if envs.action_space.__class__.__name__ == "Discrete": action_shape = 1 else: action_shape = envs.action_space.shape[0] # T all arguments imported from arguments.py # T enable cuda pythorch tensor support if args.cuda: actor_critic.cuda() # T - pull arguments and choose algorithm and optimizer if args.algo == 'a2c': optimizer = optim.RMSprop(filter(lambda p: p.requires_grad, actor_critic.parameters()), args.lr, eps=args.eps, alpha=args.alpha) elif args.algo == 'ppo': optimizer = optim.Adam(actor_critic.parameters(), args.lr, eps=args.eps) elif args.algo == 'acktr': optimizer = KFACOptimizer(actor_critic) #TO-DO figure out how to restore optimizer parameters when freezing some weights rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape, envs.action_space) # return all zeros, so nothing observed current_obs = torch.zeros(args.num_processes, *obs_shape) # T-not sure what this function is doing?? def update_current_obs(obs): shape_dim0 = envs.observation_space.shape[0] obs = torch.from_numpy(obs).float() if args.num_stack > 1: current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:] current_obs[:, -shape_dim0:] = obs # T - reset the environment; call function to update observation obs = envs.reset() update_current_obs(obs) rollouts.observations[0].copy_(current_obs) # These variables are used to compute average rewards for all processes. 
# T - initialize rewards to be zero episode_rewards = torch.zeros([args.num_processes, 1]) final_rewards = torch.zeros([args.num_processes, 1]) if args.cuda: current_obs = current_obs.cuda() rollouts.cuda() if args.algo == 'ppo': old_model = copy.deepcopy(actor_critic) start = time.time() # T - begin iterative loop for j in range(num_updates): # T - take steps through single instance # T - this is the loop where action/critic happens for step in range(args.num_steps): # Sample actions # T - buried by the action method ultimately comes from torch.nn.Module value, action = actor_critic.act( Variable(rollouts.observations[step], volatile=True)) cpu_actions = action.data.squeeze(1).cpu().numpy() # Obser reward and next obs obs, reward, done, info = envs.step(cpu_actions) reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float() episode_rewards += reward # T done bool returned by steps; indicates if failure occurred (done) # If done then clean the history of observations. masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) final_rewards *= masks final_rewards += (1 - masks) * episode_rewards episode_rewards *= masks if args.cuda: masks = masks.cuda() if current_obs.dim() == 4: current_obs *= masks.unsqueeze(2).unsqueeze(2) else: current_obs *= masks #T - now update the observation matrix update_current_obs(obs) #T - store what happened in this step rollouts.insert(step, current_obs, action.data, value.data, reward, masks) next_value = actor_critic( Variable(rollouts.observations[-1], volatile=True))[0].data if hasattr(actor_critic, 'obs_filter'): actor_critic.obs_filter.update(rollouts.observations[:-1].view( -1, *obs_shape)) rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau) if args.algo in ['a2c', 'acktr']: values, action_log_probs, dist_entropy = actor_critic.evaluate_actions( Variable(rollouts.observations[:-1].view(-1, *obs_shape)), Variable(rollouts.actions.view(-1, action_shape))) values = values.view(args.num_steps, args.num_processes, 1) action_log_probs = action_log_probs.view(args.num_steps, args.num_processes, 1) advantages = Variable(rollouts.returns[:-1]) - values value_loss = advantages.pow(2).mean() action_loss = -(Variable(advantages.data) * action_log_probs).mean() if args.algo == 'acktr' and optimizer.steps % optimizer.Ts == 0: # Sampled fisher, see Martens 2014 actor_critic.zero_grad() pg_fisher_loss = -action_log_probs.mean() value_noise = Variable(torch.randn(values.size())) if args.cuda: value_noise = value_noise.cuda() sample_values = values + value_noise vf_fisher_loss = -(values - Variable(sample_values.data)).pow(2).mean() fisher_loss = pg_fisher_loss + vf_fisher_loss optimizer.acc_stats = True fisher_loss.backward(retain_graph=True) optimizer.acc_stats = False optimizer.zero_grad() (value_loss * args.value_loss_coef + action_loss - dist_entropy * args.entropy_coef).backward() if args.algo == 'a2c': nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm) optimizer.step() elif args.algo == 'ppo': advantages = rollouts.returns[:-1] - rollouts.value_preds[:-1] advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-5) old_model.load_state_dict(actor_critic.state_dict()) for _ in range(args.ppo_epoch): sampler = BatchSampler(SubsetRandomSampler( range(args.num_processes * args.num_steps)), args.batch_size * args.num_processes, drop_last=False) for indices in sampler: indices = torch.LongTensor(indices) if args.cuda: indices = indices.cuda() observations_batch = 
rollouts.observations[:-1].view( -1, *obs_shape)[indices] actions_batch = rollouts.actions.view( -1, action_shape)[indices] return_batch = rollouts.returns[:-1].view(-1, 1)[indices] # Reshape to do in a single forward pass for all steps values, action_log_probs, dist_entropy = actor_critic.evaluate_actions( Variable(observations_batch), Variable(actions_batch)) _, old_action_log_probs, _ = old_model.evaluate_actions( Variable(observations_batch, volatile=True), Variable(actions_batch, volatile=True)) ratio = torch.exp(action_log_probs - Variable(old_action_log_probs.data)) adv_targ = Variable(advantages.view(-1, 1)[indices]) surr1 = ratio * adv_targ surr2 = torch.clamp(ratio, 1.0 - args.clip_param, 1.0 + args.clip_param) * adv_targ action_loss = -torch.min( surr1, surr2).mean() # PPO's pessimistic surrogate (L^CLIP) value_loss = (Variable(return_batch) - values).pow(2).mean() optimizer.zero_grad() (value_loss + action_loss - dist_entropy * args.entropy_coef).backward() optimizer.step() rollouts.observations[0].copy_(rollouts.observations[-1]) if j % args.save_interval == 0 and args.save_dir != "": save_path = os.path.join(args.save_dir, args.algo) try: os.makedirs(save_path) except OSError: pass # A really ugly way to save a model to CPU save_model = actor_critic if args.cuda: save_model = copy.deepcopy(actor_critic).cpu() file_name = FILE_PREFIX + '.pt' #torch.save(save_model, os.path.join(save_path, file_name)) data = { 'update': j, 'model_state_dict': save_model.state_dict(), 'optim_state_dict': optimizer.state_dict() } torch.save(data, os.path.join(save_path, file_name)) # T - write out some log information (not important for us) if j % args.log_interval == 0: end = time.time() total_num_steps = (j + 1) * args.num_processes * args.num_steps print( "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}" .format(j, total_num_steps, int(total_num_steps / (end - start)), final_rewards.mean(), final_rewards.median(), final_rewards.min(), final_rewards.max(), -dist_entropy.data[0], value_loss.data[0], action_loss.data[0])) if args.vis and j % args.vis_interval == 0: try: # Sometimes monitor doesn't properly flush the outputs win = visdom_plot(viz, win, args.log_dir, args.env_name, args.algo) except IOError: pass
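The fine-tuning branch above restores only model_state_dict and leaves a TODO about restoring the optimizer. A hedged sketch of what that restore could look like, given the checkpoint dict written by the save block ('update', 'model_state_dict', 'optim_state_dict'). Note that if layers were frozen after the original run, the saved optimizer state may no longer match the filtered parameter groups, which is presumably why the TODO exists:

import torch

def restore_checkpoint(actor_critic, optimizer, checkpoint_path):
    # Assumes the checkpoint layout used by the save block above and that the optimizer
    # was rebuilt over the same parameter groups before loading its state.
    checkpoint = torch.load(checkpoint_path, map_location='cpu')
    actor_critic.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optim_state_dict'])
    return checkpoint.get('update', 0) + 1  # update index to resume from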
viz = Visdom(port=args.port) win = None env = blt.bullet_env('DIRECT', args.env_name, 50, args.log_dir) obs_shape = (env.robot_dict[args.env_name].observation_space.shape[0], ) obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:]) # load or new if args.cont_learning == True: print('#### continue learning... ####') actor_critic, ob_rms = \ torch.load(os.path.join(args.save_dir + args.algo + '/', args.env_name + ".pt")) env.robot_dict[args.env_name].ob_rms = ob_rms else: print('$$$$ new learning... $$$$') actor_critic = MLPPolicy(obs_shape[0], env.robot_dict[args.env_name].action_space) print('model desc: ', actor_critic) if args.cuda: print('cuda is available..2') actor_critic.cuda() # algorithm selection if args.algo == 'a2c': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, alpha=args.alpha, max_grad_norm=args.max_grad_norm)
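This fragment builds an agent object rather than an inline optimizer. A hedged usage sketch follows, assuming the agent follows the interface of the codebase family this snippet resembles, i.e. an update(rollouts) method that performs one A2C/ACKTR step and returns (value_loss, action_loss, dist_entropy); rollout collection is elided:

# Illustrative only: num_updates, rollouts, next_value and agent are assumed to exist
# as in the other training loops in this document.
for j in range(num_updates):
    # ... collect args.num_steps transitions into rollouts, as in the surrounding mains ...
    rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau)
    value_loss, action_loss, dist_entropy = agent.update(rollouts)
    rollouts.after_update()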
def main(): print("#######") print( "WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards" ) print("#######") os.environ['OMP_NUM_THREADS'] = '1' if args.vis: from visdom import Visdom viz = Visdom(port=args.port) win = None # Instantiate the environment config = getattr(configs, args.config)() # We make this in order to get the shapes. dummy_env = make_env(args, config, -1, [config['agent'](game_type=config['game_type'])])() envs_shape = dummy_env.observation_space.shape[1:] obs_shape = (envs_shape[0], *envs_shape[1:]) action_space = dummy_env.action_space if len(envs_shape) == 3: if args.model == 'convnet': actor_critic = lambda saved_model: PommeCNNPolicySmall( obs_shape[0], action_space, args) elif args.model == 'resnet': actor_critic = lambda saved_model: PommeResnetPolicy( obs_shape[0], action_space, args) else: actor_critic = lambda saved_model: MLPPolicy(obs_shape[0], action_space ) # We need to get the agent = config.agent(agent_id, config.game_type) and then # pass that agent into the agent.PPOAgent training_agents = [] saved_models = args.saved_models saved_models = saved_models.split( ',') if saved_models else [None] * args.nagents assert (len(saved_models)) == args.nagents for saved_model in saved_models: # TODO: implement the model loading. model = actor_critic(saved_model) agent = config['agent'](game_type=config['game_type']) agent = ppo_agent.PPOAgent(agent, model) training_agents.append(agent) if args.how_train == 'simple': # Simple trains a single agent against three SimpleAgents. assert ( args.nagents == 1), "Simple training should have a single agent." num_training_per_episode = 1 elif args.how_train == 'homogenous': # Homogenous trains a single agent against itself (self-play). assert (args.nagents == 1 ), "Homogenous toraining should have a single agent." num_training_per_episode = 4 elif args.how_train == 'heterogenous': assert (args.nagents > 1), "Heterogenous training should have more than one agent." print("Heterogenous training is not implemented yet.") return # NOTE: Does this work correctly? Will the threads operate independently? envs = [ make_env(args, config, i, training_agents) for i in range(args.num_processes) ] envs = SubprocVecEnv(envs) if args.num_processes > 1 else DummyVecEnv(envs) # TODO: Figure out how to render this for testing purposes. The following link may help: # https://github.com/MG2033/A2C/blob/master/envs/subproc_vec_env.py for agent in training_agents: agent.initialize(args, obs_shape, action_space, num_training_per_episode) current_obs = torch.zeros(num_training_per_episode, args.num_processes, *obs_shape) def update_current_obs(obs): current_obs = torch.from_numpy(obs).float() obs = envs.reset() update_current_obs(obs) if args.how_train == 'simple': training_agents[0].update_rollouts(obs=current_obs, timestep=0) elif args.how_train == 'homogenous': training_agents[0].update_rollouts(obs=current_obs, timestep=0) # These variables are used to compute average rewards for all processes. 
episode_rewards = torch.zeros( [num_training_per_episode, args.num_processes, 1]) final_rewards = torch.zeros( [num_training_per_episode, args.num_processes, 1]) if args.cuda: current_obs = current_obs.cuda() for agent in training_agents: agent.cuda() stats = utils.init_stats(args) start = time.time() for j in range(num_updates): for step in range(args.num_steps): value_agents = [] action_agents = [] action_log_prob_agents = [] states_agents = [] episode_reward = [] cpu_actions_agents = [] if args.how_train == 'simple': value, action, action_log_prob, states = training_agents[ 0].act_pytorch(step, 0) value_agents.append(value) action_agents.append(action) action_log_prob_agents.append(action_log_prob) states_agents.append(states) cpu_actions = action.data.squeeze(1).cpu().numpy() cpu_actions_agents = cpu_actions elif args.how_train == 'homogenous': cpu_actions_agents = [[] for _ in range(args.num_processes)] for i in range(4): value, action, action_log_prob, states = training_agents[ 0].act_pytorch(step, i) value_agents.append(value) action_agents.append(action) action_log_prob_agents.append(action_log_prob) states_agents.append(states) cpu_actions = action.data.squeeze(1).cpu().numpy() for num_process in range(args.num_processes): cpu_actions_agents[num_process].append( cpu_actions[num_process]) obs, reward, done, info = envs.step(cpu_actions_agents) reward = torch.from_numpy(np.stack(reward)).float().transpose(0, 1) episode_rewards += reward # import pdb; pdb.set_trace() if args.how_train == 'simple': masks = torch.FloatTensor( [[0.0] * num_training_per_episode if done_ else [1.0] * num_training_per_episode for done_ in done]) elif args.how_train == 'homogenous': masks = torch.FloatTensor( [[0.0] * num_training_per_episode if done_ else [1.0] * num_training_per_episode for done_ in done]).transpose(0, 1) final_rewards *= masks # nagents x nprocesses x 1 final_rewards += (1 - masks) * episode_rewards episode_rewards *= masks if args.cuda: masks = masks.cuda() reward_all = reward.unsqueeze(2) if args.how_train == 'simple': masks_all = masks.transpose(0, 1).unsqueeze(2) elif args.how_train == 'homogenous': masks_all = masks.unsqueeze(2) current_obs *= masks_all.unsqueeze(2).unsqueeze(2) update_current_obs(obs) states_all = torch.from_numpy( np.stack([x.data for x in states_agents])).float() action_all = torch.from_numpy( np.stack([x.data for x in action_agents])).float() action_log_prob_all = torch.from_numpy( np.stack([x.data for x in action_log_prob_agents])).float() value_all = torch.from_numpy( np.stack([x.data for x in value_agents])).float() if args.how_train in ['simple', 'homogenous']: training_agents[0].insert_rollouts(step, current_obs, states_all, action_all, action_log_prob_all, value_all, reward_all, masks_all) next_value_agents = [] if args.how_train == 'simple': agent = training_agents[0] next_value_agents.append(agent.run_actor_critic(-1, 0)) advantages = [ agent.compute_advantages(next_value_agents, args.use_gae, args.gamma, args.tau) ] elif args.how_train == 'homogenous': agent = training_agents[0] next_value_agents = [ agent.run_actor_critic(-1, num_agent) for num_agent in range(4) ] advantages = [ agent.compute_advantages(next_value_agents, args.use_gae, args.gamma, args.tau) ] final_action_losses = [] final_value_losses = [] final_dist_entropies = [] for num_agent, agent in enumerate(training_agents): for _ in range(args.ppo_epoch): data_generator = agent.feed_forward_generator( advantages[num_agent], args) for sample in data_generator: observations_batch, states_batch, 
actions_batch, \ return_batch, masks_batch, old_action_log_probs_batch, \ adv_targ = sample # Reshape to do in a single forward pass for all steps values, action_log_probs, dist_entropy, states = agent.evaluate_actions( Variable(observations_batch), Variable(states_batch), Variable(masks_batch), Variable(actions_batch)) adv_targ = Variable(adv_targ) ratio = torch.exp(action_log_probs - Variable(old_action_log_probs_batch)) surr1 = ratio * adv_targ surr2 = torch.clamp(ratio, 1.0 - args.clip_param, 1.0 + args.clip_param) * adv_targ action_loss = -torch.min(surr1, surr2).mean() value_loss = (Variable(return_batch) - values).pow(2).mean() agent.optimize(value_loss, action_loss, dist_entropy, args.entropy_coef, args.max_grad_norm) final_action_losses.append(action_loss) final_value_losses.append(value_loss) final_dist_entropies.append(dist_entropy) agent.after_update() ##### # Save model. ##### if j % args.save_interval == 0 and args.save_dir != "": save_path = os.path.join(args.save_dir) try: os.makedirs(save_path) except OSError: pass # XXX: new way for saving model # XXX: we should also add the optimizer along with the state_dict for num_agent, agent in enumerate(training_agents): save_model = agent.get_model() save_optimizer = agent.get_optimizer() torch.save( { 'epoch': j, 'arch': args.model, 'state_dict': save_model.state_dict(), 'optimizer': save_optimizer.state_dict(), }, os.path.join( save_path, "train={}-config={}-model={}-agent={}.pt".format( args.how_train, args.config, args.model, num_agent))) ##### # Log to console. ##### if j % args.log_interval == 0: end = time.time() total_num_steps = (j + 1) * args.num_processes * args.num_steps print( "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, avg entropy {:.5f}, avg value loss {:.5f}, avg policy loss {:.5f}" .format( j, total_num_steps, int(total_num_steps / (end - start)), final_rewards.mean(), final_rewards.median(), final_rewards.min(), final_rewards.max(), np.mean([ dist_entropy.data[0] for dist_entropy in final_dist_entropies ]), np.mean([ value_loss.data[0] for value_loss in final_value_losses ]), np.mean([ action_loss.data[0] for action_loss in final_action_losses ]))) # save stats to h5 file # TODO: need to fix this error # stats = utils.log_stats(args, stats, j, int(total_num_steps / (end - start)), \ # final_rewards.mean(), final_rewards.median(), final_rewards.min(), final_rewards.max(), \ # np.mean([action_loss.data[0] for action_loss in final_action_losses]), \ # np.mean([value_loss.data[0] for value_loss in final_value_losses]), \ # np.mean([dist_entropy.data[0] for dist_entropy in final_dist_entropies])) # # log_path = os.path.join(args.log_dir) # filename_stats = '%s/stats.h5' % log_path # utils.save_dict(filename_stats, stats) ##### # Log to Visdom. ##### if args.vis and j % args.vis_interval == 0: try: # Sometimes monitor doesn't properly flush the outputs win = visdom_plot(viz, win, args.log_dir, args.env_name, 'ppo') except IOError: pass
def main(): print("#######") print( "WARNING: All rewards are clipped so you need to use a monitor (see envs.py) or visdom plot to get true rewards" ) print("#######") os.environ['OMP_NUM_THREADS'] = '1' if args.vis: from visdom import Visdom viz = Visdom() win = None envs = SubprocVecEnv([ make_env(args.env_name, args.seed, i, args.log_dir) for i in range(args.num_processes) ]) obs_shape = envs.observation_space.shape obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:]) if len(envs.observation_space.shape) == 3: actor_critic = CNNPolicy(obs_shape[0], envs.action_space) else: actor_critic = MLPPolicy(obs_shape[0], envs.action_space) if envs.action_space.__class__.__name__ == "Discrete": action_shape = 1 else: action_shape = envs.action_space.shape[0] if args.cuda: actor_critic.cuda() if args.algo == 'a2c': optimizer = optim.RMSprop(actor_critic.parameters(), args.lr, eps=args.eps, alpha=args.alpha) elif args.algo == 'ppo': optimizer = optim.Adam(actor_critic.parameters(), args.lr, eps=args.eps) elif args.algo == 'acktr': optimizer = KFACOptimizer(actor_critic) rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape, envs.action_space) current_state = torch.zeros(args.num_processes, *obs_shape) def update_current_state(state): shape_dim0 = envs.observation_space.shape[0] state = torch.from_numpy(state).float() if args.num_stack > 1: current_state[:, :-shape_dim0] = current_state[:, shape_dim0:] current_state[:, -shape_dim0:] = state state = envs.reset() update_current_state(state) rollouts.states[0].copy_(current_state) # These variables are used to compute average rewards for all processes. episode_rewards = torch.zeros([args.num_processes, 1]) final_rewards = torch.zeros([args.num_processes, 1]) if args.cuda: current_state = current_state.cuda() rollouts.cuda() if args.algo == 'ppo': old_model = copy.deepcopy(actor_critic) for j in range(num_updates): for step in range(args.num_steps): # Sample actions value, action = actor_critic.act( Variable(rollouts.states[step], volatile=True)) cpu_actions = action.data.squeeze(1).cpu().numpy() # Obser reward and next state state, reward, done, info = envs.step(cpu_actions) reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float() episode_rewards += reward # If done then clean the history of observations. 
masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) final_rewards *= masks final_rewards += (1 - masks) * episode_rewards episode_rewards *= masks if args.cuda: masks = masks.cuda() if current_state.dim() == 4: current_state *= masks.unsqueeze(2).unsqueeze(2) else: current_state *= masks update_current_state(state) rollouts.insert(step, current_state, action.data, value.data, reward, masks) next_value = actor_critic(Variable(rollouts.states[-1], volatile=True))[0].data if hasattr(actor_critic, 'obs_filter'): actor_critic.obs_filter.update(rollouts.states[:-1].view( -1, *obs_shape)) rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau) if args.algo in ['a2c', 'acktr']: values, action_log_probs, dist_entropy = actor_critic.evaluate_actions( Variable(rollouts.states[:-1].view(-1, *obs_shape)), Variable(rollouts.actions.view(-1, action_shape))) values = values.view(args.num_steps, args.num_processes, 1) action_log_probs = action_log_probs.view(args.num_steps, args.num_processes, 1) advantages = Variable(rollouts.returns[:-1]) - values value_loss = advantages.pow(2).mean() action_loss = -(Variable(advantages.data) * action_log_probs).mean() if args.algo == 'acktr' and optimizer.steps % optimizer.Ts == 0: # Sampled fisher, see Martens 2014 actor_critic.zero_grad() pg_fisher_loss = -action_log_probs.mean() value_noise = Variable(torch.randn(values.size())) if args.cuda: value_noise = value_noise.cuda() sample_values = values + value_noise vf_fisher_loss = -(values - Variable(sample_values.data)).pow(2).mean() fisher_loss = pg_fisher_loss + vf_fisher_loss optimizer.acc_stats = True fisher_loss.backward(retain_graph=True) optimizer.acc_stats = False optimizer.zero_grad() (value_loss * args.value_loss_coef + action_loss - dist_entropy * args.entropy_coef).backward() if args.algo == 'a2c': nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm) optimizer.step() elif args.algo == 'ppo': advantages = rollouts.returns[:-1] - rollouts.value_preds[:-1] advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-5) old_model.load_state_dict(actor_critic.state_dict()) if hasattr(actor_critic, 'obs_filter'): old_model.obs_filter = actor_critic.obs_filter for _ in range(args.ppo_epoch): sampler = BatchSampler(SubsetRandomSampler( range(args.num_processes * args.num_steps)), args.batch_size * args.num_processes, drop_last=False) for indices in sampler: indices = torch.LongTensor(indices) if args.cuda: indices = indices.cuda() states_batch = rollouts.states[:-1].view( -1, *obs_shape)[indices] actions_batch = rollouts.actions.view( -1, action_shape)[indices] return_batch = rollouts.returns[:-1].view(-1, 1)[indices] # Reshape to do in a single forward pass for all steps values, action_log_probs, dist_entropy = actor_critic.evaluate_actions( Variable(states_batch), Variable(actions_batch)) _, old_action_log_probs, _ = old_model.evaluate_actions( Variable(states_batch, volatile=True), Variable(actions_batch, volatile=True)) ratio = torch.exp(action_log_probs - Variable(old_action_log_probs.data)) adv_targ = Variable(advantages.view(-1, 1)[indices]) surr1 = ratio * adv_targ surr2 = torch.clamp(ratio, 1.0 - args.clip_param, 1.0 + args.clip_param) * adv_targ action_loss = -torch.min( surr1, surr2).mean() # PPO's pessimistic surrogate (L^CLIP) value_loss = (Variable(return_batch) - values).pow(2).mean() optimizer.zero_grad() (value_loss + action_loss - dist_entropy * args.entropy_coef).backward() optimizer.step() 
rollouts.states[0].copy_(rollouts.states[-1]) if j % args.save_interval == 0 and args.save_dir != "": save_path = os.path.join(args.save_dir, args.algo) try: os.makedirs(save_path) except OSError: pass # A really ugly way to save a model to CPU save_model = actor_critic if args.cuda: save_model = copy.deepcopy(actor_critic).cpu() torch.save(save_model, os.path.join(save_path, args.env_name + ".pt")) if j % args.log_interval == 0: print( "Updates {}, num frames {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}" .format(j, (j + 1) * args.num_processes * args.num_steps, final_rewards.mean(), final_rewards.median(), final_rewards.min(), final_rewards.max(), -dist_entropy.data[0], value_loss.data[0], action_loss.data[0])) if j % args.vis_interval == 0: win = visdom_plot(viz, win, args.log_dir, args.env_name, args.algo)
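# --- Hedged illustration (not part of the original script) ---------------------
# A minimal, self-contained sketch of the clipped surrogate computed in the PPO
# branch above, written against a recent PyTorch; `clip_param` mirrors
# args.clip_param and all tensors are dummy data.
import torch

def ppo_action_loss(action_log_probs, old_action_log_probs, adv_targ, clip_param=0.2):
    # Probability ratio between the current policy and the "old" snapshot.
    ratio = torch.exp(action_log_probs - old_action_log_probs)
    surr1 = ratio * adv_targ
    surr2 = torch.clamp(ratio, 1.0 - clip_param, 1.0 + clip_param) * adv_targ
    # Pessimistic (elementwise minimum) surrogate: L^CLIP from the PPO paper.
    return -torch.min(surr1, surr2).mean()

new_logp = torch.tensor([-0.9, -1.1, -0.7])
old_logp = torch.tensor([-1.0, -1.0, -1.0])
advantages = torch.tensor([1.0, -0.5, 2.0])
print(ppo_action_loss(new_logp, old_logp, advantages))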
def main(): os.environ['OMP_NUM_THREADS'] = '1' envs = UsbCamEnv(ENV_IMG_W, ENV_IMG_H, env_done_reward) obs_shape = envs.observation_space.shape obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:]) actor_critic = MLPPolicy(obs_shape[0], envs.action_space) action_shape = envs.action_space.shape[0] print('+++++++++++++++++++++++++++++++++++++') print('obs_shape:', obs_shape) print('action_shape:', action_shape) print('+++++++++++++++++++++++++++++++++++++') if args.cuda: actor_critic.cuda() optimizer = optim.Adam(actor_critic.parameters(), args.lr, eps=args.eps) rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape, envs.action_space) current_state = torch.zeros(args.num_processes, *obs_shape) def update_current_state(state): shape_dim0 = envs.observation_space.shape[0] state = torch.from_numpy(state).float() if args.num_stack > 1: current_state[:, :-shape_dim0] = current_state[:, shape_dim0:] current_state[:, -shape_dim0:] = state state = envs.reset() update_current_state(state) rollouts.states[0].copy_(current_state) # These variables are used to compute average rewards for all processes. episode_rewards = torch.zeros([args.num_processes, 1]) final_rewards = torch.zeros([args.num_processes, 1]) if args.cuda: current_state = current_state.cuda() rollouts.cuda() old_model = copy.deepcopy(actor_critic) for j in range(num_updates): for step in range(args.num_steps): # Sample actions value, action = actor_critic.act(Variable(rollouts.states[step], volatile=True)) cpu_actions = action.data.cpu().numpy() # Obser reward and next state state, reward, done, info = envs.step(cpu_actions) print('%3d [%3d %3d %3d %3d] %3d' % (step, int(envs.convert_2_real_action(cpu_actions)[0, 0]), int(envs.convert_2_real_action(cpu_actions)[0, 1]), int(envs.convert_2_real_action(cpu_actions)[0, 2]), int(envs.convert_2_real_action(cpu_actions)[0, 3]), reward[0])) if reward[0] >= search_done_reward: sys.exit() reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float() episode_rewards += reward # If done then clean the history of observations. 
masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) final_rewards *= masks final_rewards += (1 - masks) * episode_rewards episode_rewards *= masks if args.cuda: masks = masks.cuda() if current_state.dim() == 4: current_state *= masks.unsqueeze(2).unsqueeze(2) else: current_state *= masks update_current_state(state) rollouts.insert(step, current_state, action.data, value.data, reward, masks) next_value = actor_critic(Variable(rollouts.states[-1], volatile=True))[0].data if hasattr(actor_critic, 'obs_filter'): actor_critic.obs_filter.update(rollouts.states[:-1].view(-1, *obs_shape)) rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau) advantages = rollouts.returns[:-1] - rollouts.value_preds[:-1] advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-5) old_model.load_state_dict(actor_critic.state_dict()) if hasattr(actor_critic, 'obs_filter'): old_model.obs_filter = actor_critic.obs_filter for _ in range(args.ppo_epoch): sampler = BatchSampler(SubsetRandomSampler(range(args.num_processes * args.num_steps)), args.batch_size * args.num_processes, drop_last=False) for indices in sampler: indices = torch.LongTensor(indices) if args.cuda: indices = indices.cuda() states_batch = rollouts.states[:-1].view(-1, *obs_shape)[indices] actions_batch = rollouts.actions.view(-1, action_shape)[indices] return_batch = rollouts.returns[:-1].view(-1, 1)[indices] # Reshape to do in a single forward pass for all steps values, action_log_probs, dist_entropy = actor_critic.evaluate_actions(Variable(states_batch), Variable(actions_batch)) _, old_action_log_probs, _ = old_model.evaluate_actions(Variable(states_batch, volatile=True), Variable(actions_batch, volatile=True)) ratio = torch.exp(action_log_probs - Variable(old_action_log_probs.data)) adv_targ = Variable(advantages.view(-1, 1)[indices]) surr1 = ratio * adv_targ surr2 = torch.clamp(ratio, 1.0 - args.clip_param, 1.0 + args.clip_param) * adv_targ action_loss = -torch.min(surr1, surr2).mean() # PPO's pessimistic surrogate (L^CLIP) value_loss = (Variable(return_batch) - values).pow(2).mean() optimizer.zero_grad() (value_loss + action_loss - dist_entropy * args.entropy_coef).backward() optimizer.step() rollouts.states[0].copy_(rollouts.states[-1]) if j % args.save_interval == 0 and args.save_dir != "": save_path = os.path.join(args.save_dir, args.algo) try: os.makedirs(save_path) except OSError: pass # A really ugly way to save a model to CPU save_model = actor_critic if args.cuda: save_model = copy.deepcopy(actor_critic).cpu() torch.save(save_model, os.path.join(save_path, args.env_name + ".pt")) if j % args.log_interval == 0: print("Updates {}, num frames {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}". format(j, j * args.num_processes * args.num_steps, final_rewards.mean(), final_rewards.median(), final_rewards.min(), final_rewards.max(), -dist_entropy.data[0], value_loss.data[0], action_loss.data[0]))
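# --- Hedged illustration (not part of the original script) ---------------------
# Sketch of what rollouts.compute_returns(next_value, use_gae, gamma, tau) is
# expected to do: standard discounted returns, or GAE when use_gae is set. Written
# against plain tensors of shape [T, N, 1]; the real RolloutStorage may index its
# masks slightly differently.
import torch

def compute_returns(rewards, masks, value_preds, next_value, use_gae, gamma, tau):
    # masks[t] is 0.0 when the episode ended at step t, 1.0 otherwise.
    num_steps = rewards.size(0)
    returns = torch.zeros(num_steps + 1, *rewards.size()[1:])
    if use_gae:
        values = torch.cat([value_preds, next_value.unsqueeze(0)], dim=0)
        gae = torch.zeros_like(next_value)
        for step in reversed(range(num_steps)):
            delta = rewards[step] + gamma * values[step + 1] * masks[step] - values[step]
            gae = delta + gamma * tau * masks[step] * gae
            returns[step] = gae + values[step]
    else:
        returns[-1] = next_value
        for step in reversed(range(num_steps)):
            returns[step] = returns[step + 1] * gamma * masks[step] + rewards[step]
    return returns[:-1]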
SubprocVecEnv([ make_env(args.env_name[j], args.seed, i, log_dir_teacher[j]) for i in range(args.num_processes) ]) for j in range(args.num_heads) ] obs_shape = envs_teacher[0].observation_space.shape obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:]) if len(envs_teacher[0].observation_space.shape) == 3: teacher = CNNPolicy(obs_shape[0], envs_teacher[0].action_space) # TODO: change student student = CNNPolicy(obs_shape[0], envs_student_train[0].action_space) else: teacher = [ MLPPolicy(obs_shape[0], envs_teacher[i].action_space) for i in range(args.num_heads) ] # TODO: change student student = MultiHead_MLPPolicy(obs_shape[0], envs_student_train[0].action_space, num_heads=args.num_heads) # load teacher model from checkpoint for i in range(args.num_heads): assert os.path.exists(args.checkpoint[i]) state_dict = torch.load(args.checkpoint[i]) print('Loading teacher network from : %s' % args.checkpoint[i]) teacher[i].load_state_dict(state_dict['model_state_dict']) if args.cuda:
writer = SummaryWriter() # Hyper-parameters BATCH_SIZE = 512 MEMORY_SIZE = 5000 LR = 0.001 test_interval = 1000 test_episodes = 100 TIMESTEPS = 10000 EPSILON_ENDT = 3000 env = gym.make('CartPole-v0') device = torch.device("cuda" if torch.cuda.is_available() else "cpu") agent = DQNAgent(d_actions=env.action_space.n, device=device, batch_size=BATCH_SIZE, memory_size=MEMORY_SIZE, lr=LR, epsilon_endt=EPSILON_ENDT) agent.policy_net = MLPPolicy(d_state=env.observation_space.shape[0], d_hidden=20, d_action=env.action_space.n).to(device) init = time.time() print("Init time {}".format(init-start)) num_episode = 0 episode_t = 0 state = env.reset() state = torch.from_numpy(state).unsqueeze_(0).to(device=device, dtype=torch.float) while agent.time_step < TIMESTEPS: action = agent.act(state) next_state, reward, done, _ = env.step(action.item()) episode_t += 1 if not done:
def select_network(self):
    actor_critic = MLPPolicy(self.obs_shape[0], self.env.action_space)
    return actor_critic
def main(): print("#######") print("WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards") print("#######") os.environ['OMP_NUM_THREADS'] = '1' if args.vis: from visdom import Visdom viz = Visdom(port=args.port) win = None names = getListOfGames("train") envs = [make_env_train(names[i], args.seed, i, args.log_dir) for i in range(len(names))] # TODO TODO TODO TODO TODO TODO TODO TODO TODO TODO TODO TODO TODO TODO TODO TODO TODO TODO args.num_processes = len(envs) # REMEMBER YOU CHENGED IT if len(envs) > 1: envs = SubprocVecEnv(envs) else: envs = DummyVecEnv(envs) if len(envs.observation_space.shape) == 1: envs = VecNormalize(envs) obs_shape = envs.observation_space.shape #print(obs_shape) obs_shape = (obs_shape[0], *obs_shape[1:]) #print(obs_shape) if len(envs.observation_space.shape) == 3: actor_critic = CNNPolicy(obs_shape[0], envs.action_space, args.recurrent_policy) else: assert not args.recurrent_policy, \ "Recurrent policy is not implemented for the MLP controller" actor_critic = MLPPolicy(obs_shape[0], envs.action_space) # Making it paralel actor_critic = torch.nn.parallel.DataParallel(actor_critic).module if envs.action_space.__class__.__name__ == "Discrete": action_shape = 1 else: action_shape = envs.action_space.shape[0] if args.cuda: actor_critic.cuda() if args.algo == 'a2c': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, alpha=args.alpha, max_grad_norm=args.max_grad_norm) elif args.algo == 'ppo': agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm) # Make agent DataParallel agent = torch.nn.parallel.DataParallel(agent).module elif args.algo == 'acktr': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, acktr=True) # Make rollouts DataParallel rollouts = torch.nn.parallel.DataParallel(RolloutStorage(args.num_steps, args.num_processes, obs_shape, envs.action_space, actor_critic.state_size)).module current_obs = torch.nn.parallel.DataParallel(torch.zeros(envs.nenvs, *obs_shape)).module def update_current_obs(obs): shape_dim0 = envs.observation_space.shape[0] obs = torch.from_numpy(obs).float() # if args.num_stack > 1: # current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:] current_obs[:, -shape_dim0:] = obs obs = envs.reset() update_current_obs(obs) rollouts.observations[0].copy_(current_obs) # These variables are used to compute average rewards for all processes. episode_rewards = torch.zeros([args.num_processes, 1]) final_rewards = torch.zeros([args.num_processes, 1]) if args.cuda: current_obs = current_obs.cuda() rollouts.cuda() start = time.time() for j in range(num_updates): for step in range(args.num_steps): # Sample actions value, action, action_log_prob, states = actor_critic.act( Variable(rollouts.observations[step], volatile=True), Variable(rollouts.states[step], volatile=True), Variable(rollouts.masks[step], volatile=True)) cpu_actions = action.data.squeeze(1).cpu().numpy() # Obser reward and next obs obs, reward, done, info = envs.step(cpu_actions) reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float() episode_rewards += reward # If done then clean the history of observations. 
masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) final_rewards *= masks final_rewards += (1 - masks) * episode_rewards episode_rewards *= masks if args.cuda: masks = masks.cuda() if current_obs.dim() == 4: current_obs *= masks.unsqueeze(2).unsqueeze(2) else: current_obs *= masks update_current_obs(obs) rollouts.insert(current_obs, states.data, action.data, action_log_prob.data, value.data, reward, masks) next_value = actor_critic.get_value(Variable(rollouts.observations[-1], volatile=True), Variable(rollouts.states[-1], volatile=True), Variable(rollouts.masks[-1], volatile=True)).data rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau) value_loss, action_loss, dist_entropy = agent.update(rollouts) rollouts.after_update() if j % args.save_interval == 0 and args.save_dir != "": save_path = os.path.join(args.save_dir, args.algo) try: os.makedirs(save_path) except OSError: pass # A really ugly way to save a model to CPU save_model = actor_critic if args.cuda: save_model = copy.deepcopy(actor_critic).cpu() save_model = [save_model, hasattr(envs, 'ob_rms') and envs.ob_rms or None] torch.save(save_model, os.path.join(save_path, args.env_name + ".pt")) if j % args.log_interval == 0: end = time.time() total_num_steps = (j + 1) * args.num_processes * args.num_steps print("Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}". format(j, total_num_steps, int(total_num_steps / (end - start)), final_rewards.mean(), final_rewards.median(), final_rewards.min(), final_rewards.max(), dist_entropy.data[0], value_loss.data[0], action_loss.data[0])) if args.vis and j % args.vis_interval == 0: try: # Sometimes monitor doesn't properly flush the outputs win = visdom_plot(viz, win, args.log_dir, args.env_name, args.algo, args.num_frames) except IOError: pass
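# --- Hedged illustration (not part of the original script) ---------------------
# The A2C/ACKTR path above delegates the optimization to agent.update(rollouts); a
# minimal sketch of the loss it is expected to minimize (advantage actor-critic with
# an entropy bonus), using plain tensors rather than the algo.A2C_ACKTR class.
import torch

def a2c_losses(values, action_log_probs, returns, dist_entropy,
               value_loss_coef=0.5, entropy_coef=0.01):
    advantages = returns - values
    value_loss = advantages.pow(2).mean()
    # The advantage acts as a fixed weight on the log-probabilities, hence detach().
    action_loss = -(advantages.detach() * action_log_probs).mean()
    total = value_loss * value_loss_coef + action_loss - dist_entropy * entropy_coef
    return total, value_loss, action_loss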
def main(): print("#######") print( "WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards" ) print("#######") os.environ['OMP_NUM_THREADS'] = '1' if args.vis: from visdom import Visdom viz = Visdom() viz_1 = Visdom() win = None win1 = None env_name_1 = 'HalfCheetahSmallFoot-v0' args.env_name = 'HalfCheetahSmallLeg-v0' envs = [ make_env(args.env_name, args.seed, i, args.log_dir) for i in range(args.num_processes) ] envs_1 = [ make_env(env_name_1, args.seed, i, args.log_dir_1) for i in range(args.num_processes) ] if args.num_processes > 1: envs = SubprocVecEnv(envs) envs_1 = SubprocVecEnv(envs_1) else: envs = DummyVecEnv(envs) envs_1 = DummyVecEnv(envs_1) if len(envs.observation_space.shape) == 1: envs = VecNormalize(envs) envs_1 = VecNormalize(envs_1) #same for both tasks obs_shape = envs.observation_space.shape obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:]) actor_critic = MLPPolicy(obs_shape[0], envs.action_space) actor_critic_1 = MLPPolicy(obs_shape[0], envs_1.action_space) #same for both tasks action_shape = envs.action_space.shape[0] if args.cuda: actor_critic.cuda() actor_critic_1.cuda() optimizer = optim.RMSprop(actor_critic.parameters(), args.lr, eps=args.eps, alpha=args.alpha) optimizer_1 = optim.RMSprop(actor_critic_1.parameters(), args.lr, eps=args.eps, alpha=args.alpha) #Different for both tasks rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape, envs.action_space, actor_critic.state_size) current_obs = torch.zeros(args.num_processes, *obs_shape) rollouts_1 = RolloutStorage(args.num_steps, args.num_processes, obs_shape, envs_1.action_space, actor_critic_1.state_size) current_obs_1 = torch.zeros(args.num_processes, *obs_shape) #Different update functions def update_current_obs(obs): shape_dim0 = envs.observation_space.shape[0] obs = torch.from_numpy(obs).float() if args.num_stack > 1: current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:] current_obs[:, -shape_dim0:] = obs def update_current_obs_1(obs): shape_dim0 = envs_1.observation_space.shape[0] obs = torch.from_numpy(obs).float() if args.num_stack > 1: current_obs_1[:, :-shape_dim0] = current_obs_1[:, shape_dim0:] current_obs_1[:, -shape_dim0:] = obs obs = envs.reset() update_current_obs(obs) obs_1 = envs_1.reset() update_current_obs_1(obs_1) rollouts.observations[0].copy_(current_obs) rollouts_1.observations[0].copy_(current_obs_1) episode_rewards = torch.zeros([args.num_processes, 1]) final_rewards = torch.zeros([args.num_processes, 1]) episode_rewards_1 = torch.zeros([args.num_processes, 1]) final_rewards_1 = torch.zeros([args.num_processes, 1]) if args.cuda: current_obs = current_obs.cuda() rollouts.cuda() current_obs_1 = current_obs_1.cuda() rollouts_1.cuda() start = time.time() for j in range(num_updates): for step in range(args.num_steps): # Sample actions from branch 1 value, action, action_log_prob, states = actor_critic.act( Variable(rollouts.observations[step], volatile=True), Variable(rollouts.states[step], volatile=True), Variable(rollouts.masks[step], volatile=True)) cpu_actions = action.data.squeeze(1).cpu().numpy() obs, reward, done, info = envs.step(cpu_actions) reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float() episode_rewards += reward masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) final_rewards *= masks final_rewards += (1 - masks) * episode_rewards episode_rewards *= masks if args.cuda: masks = masks.cuda() if current_obs.dim() == 4: current_obs *= 
masks.unsqueeze(2).unsqueeze(2) else: current_obs *= masks update_current_obs(obs) rollouts.insert(step, current_obs, states.data, action.data, action_log_prob.data, value.data, reward, masks) #Sample actions from branch 2 value_1, action_1, action_log_prob_1, states_1 = actor_critic_1.act( Variable(rollouts_1.observations[step], volatile=True), Variable(rollouts_1.states[step], volatile=True), Variable(rollouts_1.masks[step], volatile=True)) cpu_actions_1 = action_1.data.squeeze(1).cpu().numpy() obs_1, reward_1, done_1, info_1 = envs_1.step(cpu_actions_1) reward_1 = torch.from_numpy(np.expand_dims(np.stack(reward_1), 1)).float() episode_rewards_1 += reward_1 masks_1 = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done_1]) final_rewards_1 *= masks_1 final_rewards_1 += (1 - masks_1) * episode_rewards_1 episode_rewards_1 *= masks_1 if args.cuda: masks_1 = masks_1.cuda() if current_obs_1.dim() == 4: current_obs_1 *= masks_1.unsqueeze(2).unsqueeze(2) else: current_obs_1 *= masks_1 update_current_obs_1(obs_1) rollouts_1.insert(step, current_obs_1, states_1.data, action_1.data, action_log_prob_1.data, value_1.data, reward_1, masks_1) #Update for branch 1 next_value = actor_critic( Variable(rollouts.observations[-1], volatile=True), Variable(rollouts.states[-1], volatile=True), Variable(rollouts.masks[-1], volatile=True))[0].data rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau) values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions( Variable(rollouts.observations[:-1].view(-1, *obs_shape)), Variable(rollouts.states[0].view(-1, actor_critic.state_size)), Variable(rollouts.masks[:-1].view(-1, 1)), Variable(rollouts.actions.view(-1, action_shape))) values = values.view(args.num_steps, args.num_processes, 1) action_log_probs = action_log_probs.view(args.num_steps, args.num_processes, 1) advantages = Variable(rollouts.returns[:-1]) - values value_loss = advantages.pow(2).mean() action_loss = -(Variable(advantages.data) * action_log_probs).mean() optimizer.zero_grad() (value_loss * args.value_loss_coef + action_loss - dist_entropy * args.entropy_coef).backward() nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm) optimizer.step() rollouts.after_update() #share params branch 1 -> branch 2 actor_critic_1.a_fc1.weight.data = copy.deepcopy( actor_critic.a_fc1.weight.data) actor_critic_1.a_fc1.bias.data = copy.deepcopy( actor_critic.a_fc1.bias.data) actor_critic_1.v_fc1.weight.data = copy.deepcopy( actor_critic.v_fc1.weight.data) actor_critic_1.v_fc1.bias.data = copy.deepcopy( actor_critic.v_fc1.bias.data) #Update for branch 2 next_value_1 = actor_critic_1( Variable(rollouts_1.observations[-1], volatile=True), Variable(rollouts_1.states[-1], volatile=True), Variable(rollouts_1.masks[-1], volatile=True))[0].data rollouts_1.compute_returns(next_value_1, args.use_gae, args.gamma, args.tau) values_1, action_log_probs_1, dist_entropy_1, states_1 = actor_critic_1.evaluate_actions( Variable(rollouts_1.observations[:-1].view(-1, *obs_shape)), Variable(rollouts_1.states[0].view(-1, actor_critic_1.state_size)), Variable(rollouts_1.masks[:-1].view(-1, 1)), Variable(rollouts_1.actions.view(-1, action_shape))) values_1 = values_1.view(args.num_steps, args.num_processes, 1) action_log_probs_1 = action_log_probs_1.view(args.num_steps, args.num_processes, 1) advantages_1 = Variable(rollouts_1.returns[:-1]) - values_1 value_loss_1 = advantages_1.pow(2).mean() action_loss_1 = -(Variable(advantages_1.data) * action_log_probs_1).mean() 
optimizer_1.zero_grad() (value_loss_1 * args.value_loss_coef + action_loss_1 - dist_entropy_1 * args.entropy_coef).backward() nn.utils.clip_grad_norm(actor_critic_1.parameters(), args.max_grad_norm) optimizer_1.step() rollouts_1.after_update() #share params branch 2 -> branch 1 actor_critic.a_fc1.weight.data = copy.deepcopy( actor_critic_1.a_fc1.weight.data) actor_critic.a_fc1.bias.data = copy.deepcopy( actor_critic_1.a_fc1.bias.data) actor_critic.v_fc1.weight.data = copy.deepcopy( actor_critic_1.v_fc1.weight.data) actor_critic.v_fc1.bias.data = copy.deepcopy( actor_critic_1.v_fc1.bias.data) if j % args.save_interval == 0 and args.save_dir != "": save_path = os.path.join(args.save_dir, args.algo, args.env_name + '_' + env_name_1) try: os.makedirs(save_path) except OSError: pass # A really ugly way to save a model to CPU save_model = actor_critic save_model_1 = actor_critic_1 if args.cuda: save_model = copy.deepcopy(actor_critic).cpu() save_model_1 = copy.deepcopy(actor_critic_1).cpu() save_model = [ save_model, hasattr(envs, 'ob_rms') and envs.ob_rms or None ] save_model_1 = [ save_model_1, hasattr(envs_1, 'ob_rms') and envs_1.ob_rms or None ] torch.save(save_model, os.path.join(save_path, args.env_name + ".pt")) torch.save(save_model_1, os.path.join(save_path, env_name_1 + ".pt")) if j % args.log_interval == 0: end = time.time() total_num_steps = (j + 1) * args.num_processes * args.num_steps print( "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}" .format(j, total_num_steps, int(total_num_steps / (end - start)), final_rewards.mean(), final_rewards.median(), final_rewards.min(), final_rewards.max(), dist_entropy.data[0], value_loss.data[0], action_loss.data[0])) print( "Updates_1 {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}" .format(j, total_num_steps, int(total_num_steps / (end - start)), final_rewards_1.mean(), final_rewards_1.median(), final_rewards_1.min(), final_rewards_1.max(), dist_entropy_1.data[0], value_loss_1.data[0], action_loss_1.data[0])) if args.vis and j % args.vis_interval == 0: try: # Sometimes monitor doesn't properly flush the outputs win = visdom_plot(viz, win, args.log_dir, args.env_name, args.algo) win1 = visdom_plot(viz_1, win1, args.log_dir_1, env_name_1, args.algo) except IOError: pass
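# --- Hedged illustration (not part of the original script) ---------------------
# The "share params" blocks above copy the first actor/critic layer between the two
# branch policies after each update, keeping the task-specific heads separate. A
# small sketch of that copy on generic layers (a_fc1 / v_fc1 are the MLPPolicy layer
# names the script assumes).
import copy
import torch
import torch.nn as nn

src_layer = nn.Linear(4, 8)
dst_layer = nn.Linear(4, 8)
dst_layer.weight.data = copy.deepcopy(src_layer.weight.data)
dst_layer.bias.data = copy.deepcopy(src_layer.bias.data)
# Equivalent, and closer to current PyTorch style:
with torch.no_grad():
    dst_layer.weight.copy_(src_layer.weight)
    dst_layer.bias.copy_(src_layer.bias)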
obs_shape = env.robot.observation_space.shape obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:]) state_shape = env.robot.state_space.shape state_shape = (state_shape[0] * args.num_stack, *state_shape[1:]) action_shape = env.robot.action_space.shape full_state_shape = (state_shape[0] * args.num_stack, obs_shape[1] + state_shape[1]) if args.symm_policy: full_state_shape = state_shape """ ----[ Load Policy ]---- """ actor_critic = MLPPolicy(obs_shape[1], full_state_shape[1], env.robot.action_space, symm_policy=args.symm_policy) print( os.path.join(args.load_dir + args.algo, args.phase, args.env_name, args.env_name + args.tr_itr + ".pt")) # state_dict, ob_rms = \ state_dict, ob_rms, st_rms, ret_rms = \ torch.load(os.path.join(args.load_dir + args.algo, args.phase, args.env_name, args.env_name + args.tr_itr + ".pt"), map_location='cpu') actor_critic.load_state_dict(state_dict) actor_critic.train(False) actor_critic.eval() # TODO print('ob_rms: ', ob_rms)
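# --- Hedged illustration (not part of the original script) ---------------------
# The loader above expects checkpoints saved as a 4-tuple of
# (state_dict, ob_rms, st_rms, ret_rms); a sketch of writing and restoring that
# layout with a toy module and placeholder normalization statistics.
import torch
import torch.nn as nn

model = nn.Linear(10, 2)
ob_rms, st_rms, ret_rms = None, None, None   # stand-ins for running mean/std objects
torch.save((model.state_dict(), ob_rms, st_rms, ret_rms), "example_checkpoint.pt")

state_dict, ob_rms, st_rms, ret_rms = torch.load("example_checkpoint.pt", map_location='cpu')
model.load_state_dict(state_dict)
model.train(False)   # same effect as model.eval(): disable dropout / batch-norm updates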
def main(): print("######") print("HELLO! Returns start with infinity values") print("######") os.environ['OMP_NUM_THREADS'] = '1' if args.random_task: env_params = { 'wt': np.round(np.random.uniform(0.5, 1.0), 2), 'x': np.round(np.random.uniform(-0.1, 0.1), 2), 'y': np.round(np.random.uniform(-0.1, 0.1), 2), 'z': np.round(np.random.uniform(0.15, 0.2), 2), } else: env_params = { 'wt': args.euclidean_weight, 'x': args.goal_x, 'y': args.goal_y, 'z': args.goal_z, } envs = [make_env(args.env_name, args.seed, i, args.log_dir, **env_params) for i in range(args.num_processes)] if args.num_processes > 1: envs = SubprocVecEnv(envs) else: envs = DummyVecEnv(envs) envs = VecNormalize(envs, ob=False) obs_shape = envs.observation_space.shape obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:]) if len(envs.observation_space.shape) == 3: actor_critic = CNNPolicy(obs_shape[0], envs.action_space, args.recurrent_policy) else: assert not args.recurrent_policy, \ "Recurrent policy is not implemented for the MLP controller" actor_critic = MLPPolicy(obs_shape[0], envs.action_space) if envs.action_space.__class__.__name__ == "Discrete": action_shape = 1 else: action_shape = envs.action_space.shape[0] if args.cuda: actor_critic.cuda() if args.algo == 'a2c': optimizer = optim.RMSprop(actor_critic.parameters(), args.lr, eps=args.eps, alpha=args.alpha) elif args.algo == 'ppo': optimizer = optim.Adam(actor_critic.parameters(), args.lr, eps=args.eps) elif args.algo == 'acktr': optimizer = KFACOptimizer(actor_critic) rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape, envs.action_space, actor_critic.state_size) current_obs = torch.zeros(args.num_processes, *obs_shape) def update_current_obs(obs): shape_dim0 = envs.observation_space.shape[0] obs = torch.from_numpy(obs).float() if args.num_stack > 1: current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:] current_obs[:, -shape_dim0:] = obs obs = envs.reset() update_current_obs(obs) rollouts.observations[0].copy_(current_obs) # These variables are used to compute average rewards for all processes. episode_rewards = torch.zeros([args.num_processes, 1]) final_rewards = torch.zeros([args.num_processes, 1]) if args.cuda: current_obs = current_obs.cuda() rollouts.cuda() actor_critic.input_norm.update(rollouts.observations[0]) last_return = -np.inf best_return = -np.inf best_models = None start = time.time() for j in range(num_updates): for step in range(args.num_steps): # Sample actions value, action, action_log_prob, states = actor_critic.act(Variable(rollouts.observations[step], volatile=True), Variable(rollouts.states[step], volatile=True), Variable(rollouts.masks[step], volatile=True)) cpu_actions = action.data.squeeze(1).cpu().numpy() # Obser reward and next obs obs, reward, done, info = envs.step(cpu_actions) reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float() episode_rewards += reward # If done then clean the history of observations. 
masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) final_rewards *= masks final_rewards += (1 - masks) * episode_rewards episode_rewards *= masks if args.cuda: masks = masks.cuda() if current_obs.dim() == 4: current_obs *= masks.unsqueeze(2).unsqueeze(2) else: current_obs *= masks update_current_obs(obs) rollouts.insert(step, current_obs, states.data, action.data, action_log_prob.data, value.data, reward, masks) actor_critic.input_norm.update(rollouts.observations[step + 1]) next_value = actor_critic(Variable(rollouts.observations[-1], volatile=True), Variable(rollouts.states[-1], volatile=True), Variable(rollouts.masks[-1], volatile=True))[0].data rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau) if args.algo in ['a2c', 'acktr']: values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions(Variable(rollouts.observations[:-1].view(-1, *obs_shape)), Variable(rollouts.states[0].view(-1, actor_critic.state_size)), Variable(rollouts.masks[:-1].view(-1, 1)), Variable(rollouts.actions.view(-1, action_shape))) values = values.view(args.num_steps, args.num_processes, 1) action_log_probs = action_log_probs.view(args.num_steps, args.num_processes, 1) advantages = Variable(rollouts.returns[:-1]) - values value_loss = advantages.pow(2).mean() action_loss = -(Variable(advantages.data) * action_log_probs).mean() if args.algo == 'acktr' and optimizer.steps % optimizer.Ts == 0: # Sampled fisher, see Martens 2014 actor_critic.zero_grad() pg_fisher_loss = -action_log_probs.mean() value_noise = Variable(torch.randn(values.size())) if args.cuda: value_noise = value_noise.cuda() sample_values = values + value_noise vf_fisher_loss = -(values - Variable(sample_values.data)).pow(2).mean() fisher_loss = pg_fisher_loss + vf_fisher_loss optimizer.acc_stats = True fisher_loss.backward(retain_graph=True) optimizer.acc_stats = False optimizer.zero_grad() (value_loss * args.value_loss_coef + action_loss - dist_entropy * args.entropy_coef).backward() if args.algo == 'a2c': nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm) optimizer.step() elif args.algo == 'ppo': advantages = rollouts.returns[:-1] - rollouts.value_preds[:-1] advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-5) for e in range(args.ppo_epoch): if args.recurrent_policy: data_generator = rollouts.recurrent_generator(advantages, args.num_mini_batch) else: data_generator = rollouts.feed_forward_generator(advantages, args.num_mini_batch) for sample in data_generator: observations_batch, states_batch, actions_batch, \ return_batch, masks_batch, old_action_log_probs_batch, \ adv_targ = sample # Reshape to do in a single forward pass for all steps values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions(Variable(observations_batch), Variable(states_batch), Variable(masks_batch), Variable(actions_batch)) adv_targ = Variable(adv_targ) ratio = torch.exp(action_log_probs - Variable(old_action_log_probs_batch)) surr1 = ratio * adv_targ surr2 = torch.clamp(ratio, 1.0 - args.clip_param, 1.0 + args.clip_param) * adv_targ action_loss = -torch.min(surr1, surr2).mean() # PPO's pessimistic surrogate (L^CLIP) value_loss = (Variable(return_batch) - values).pow(2).mean() optimizer.zero_grad() (value_loss + action_loss - dist_entropy * args.entropy_coef).backward() nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm) optimizer.step() rollouts.after_update() if args.vis and j % args.vis_interval == 0: last_return = plot(logger, 
args.log_dir) if last_return > best_return: best_return = last_return try: os.makedirs(os.path.dirname(args.save_path)) except OSError: pass info = { 'return': best_return, 'reward_norm': np.sqrt(envs.ret_rms.var + envs.epsilon) } # A really ugly way to save a model to CPU save_model = actor_critic if args.cuda: save_model = copy.deepcopy(actor_critic).cpu() torch.save((save_model, env_params, info), args.save_path) if j % args.log_interval == 0: end = time.time() total_num_steps = (j + 1) * args.num_processes * args.num_steps print("Updates {}, num timesteps {}, FPS {}, average return {:.5f}, best_return {:.5f}, value loss {:.5f}, policy loss {:.5f}". format(j, total_num_steps, int(total_num_steps / (end - start)), last_return, best_return, value_loss.data[0], action_loss.data[0]))
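# --- Hedged illustration (not part of the original script) ---------------------
# The checkpoint above records 'reward_norm' = sqrt(ret_rms.var + epsilon), i.e. the
# factor VecNormalize divides returns by; presumably it is stored so that normalized
# returns can later be mapped back to the environment's scale. Sketch with dummy values:
import numpy as np

reward_norm = np.sqrt(0.25 + 1e-8)   # example value of sqrt(ret_rms.var + epsilon)
normalized_return = 1.7              # return as reported under VecNormalize
approx_true_return = normalized_return * reward_norm
print(approx_true_return)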
def main(): print("#######") print("WARNING: All rewards are not clipped or normalized ") print("#######") os.environ['OMP_NUM_THREADS'] = '1' envs = rafiki.Envs(args.num_processes, args.num_models, args.policy, args.beta, args.obs_size, args.max_latency, args.tau, args.cycle_len) obs_shape = envs.observation_space.shape actor_critic = MLPPolicy(obs_shape[0], envs.action_space) if envs.action_space.__class__.__name__ == "Discrete": action_shape = 1 else: action_shape = envs.action_space.shape[0] if args.cuda: actor_critic.cuda() if args.algo == 'a2c': optimizer = optim.RMSprop(actor_critic.parameters(), args.lr, eps=args.eps, alpha=args.alpha) elif args.algo == 'ppo': optimizer = optim.Adam(actor_critic.parameters(), args.lr, eps=args.eps) rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape, envs.action_space) current_obs = torch.zeros(args.num_processes, *obs_shape) def update_current_obs(obs): shape_dim0 = envs.observation_space.shape[0] obs = torch.from_numpy(obs).float() current_obs[:, -shape_dim0:] = obs obs = envs.reset() update_current_obs(obs) rollouts.observations[0].copy_(current_obs) if args.cuda: current_obs = current_obs.cuda() rollouts.cuda() info_set = Info(args) for j in range(num_updates): for step in range(args.num_steps): logger.info('------------%d----------------' % j) # Sample actions with torch.no_grad(): action, probs, action_log_prob = actor_critic.act( Variable(rollouts.observations[step])) cpu_actions = action.data.squeeze(1).cpu().numpy() # Obser reward and next obs logger.info(probs) obs, reward, info = envs.step(cpu_actions) info_set.insert(info) reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float() update_current_obs(obs) rollouts.insert(step, current_obs, action.data, action_log_prob.data, reward) if args.algo in ['a2c', 'ppo']: action_log_probs, dist_entropy = actor_critic.evaluate_actions( Variable(rollouts.observations[:-1].view(-1, *obs_shape)), Variable(rollouts.actions.view(-1, action_shape))) R = rollouts.rewards.detach() optimizer.zero_grad() policy_loss = -R.reshape(args.num_steps, args.num_processes).mul(action_log_probs) policy_loss = sum(policy_loss) / len(policy_loss) policy_loss.backward() # nn.utils.clip_grad_norm_(actor_critic.parameters(), args.max_grad_norm) optimizer.step() with torch.no_grad(): action, probs, action_log_prob = actor_critic.act( Variable(rollouts.observations[-1])) logger.info(probs) rollouts.after_update() if j % args.log_interval == 0: total_num_steps = (j + 1) * args.num_processes * args.num_steps print("Updates {}, num timesteps {}, reward {}, policy loss {}". format(j, total_num_steps, R.data, policy_loss.reshape(-1).data)) logger.info(args) info_set.show()
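# --- Hedged illustration (not part of the original script) ---------------------
# The update above is a score-function (REINFORCE-style) step: detached rewards
# directly weight the action log-probabilities. Minimal sketch with dummy tensors
# (the original averages over steps; a plain mean is used here).
import torch

T, N = 5, 2                                   # num_steps, num_processes
rewards = torch.rand(T, N)                    # detached reward signal R
action_log_probs = torch.randn(T, N, requires_grad=True)
policy_loss = -(rewards * action_log_probs).mean()
policy_loss.backward()                        # gradients flow only through the log-probs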
def main(): print("#######") print( "WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards" ) print("#######") os.environ['OMP_NUM_THREADS'] = '1' if args.vis: from visdom import Visdom viz = Visdom() win = None envs = [ make_env(args.env_name, args.seed, i, args.log_dir) for i in range(args.num_processes) ] if args.num_processes > 1: envs = SubprocVecEnv(envs) else: envs = DummyVecEnv(envs) if len(envs.observation_space.shape) == 1: envs = VecNormalize(envs) obs_shape = envs.observation_space.shape obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:] ) # I guess the obs_shape[0] is channel number if len(envs.observation_space.shape) == 3: actor_critic = CNNPolicy(obs_shape[0], envs.action_space, args.recurrent_policy) else: assert not args.recurrent_policy, \ "Recurrent policy is not implemented for the MLP controller" actor_critic = MLPPolicy(obs_shape[0], envs.action_space) if envs.action_space.__class__.__name__ == "Discrete": action_shape = 1 else: action_shape = envs.action_space.shape[0] if args.cuda: actor_critic.cuda() if args.algo == 'a2c': optimizer = optim.RMSprop(actor_critic.parameters(), args.lr, eps=args.eps, alpha=args.alpha) elif args.algo == 'ppo': optimizer = optim.Adam(actor_critic.parameters(), args.lr, eps=args.eps) elif args.algo == 'acktr': optimizer = KFACOptimizer(actor_critic) rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape, envs.action_space, actor_critic.state_size) current_obs = torch.zeros(args.num_processes, *obs_shape) def update_current_obs(obs): shape_dim0 = envs.observation_space.shape[0] obs = torch.from_numpy(obs).float() if args.num_stack > 1: current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:] current_obs[:, -shape_dim0:] = obs obs = envs.reset() update_current_obs(obs) rollouts.observations[0].copy_(current_obs) # These variables are used to compute average rewards for all processes. episode_rewards = torch.zeros([args.num_processes, 1]) final_rewards = torch.zeros([args.num_processes, 1]) if args.cuda: current_obs = current_obs.cuda() rollouts.cuda() start = time.time() for j in range(num_updates): for step in range(args.num_steps): # args.num_steps should be the length of interactions before each updating/training # Sample actions value, action, action_log_prob, states = actor_critic.act( Variable(rollouts.observations[step], volatile=True), Variable(rollouts.states[step], volatile=True), Variable(rollouts.masks[step], volatile=True)) cpu_actions = action.data.squeeze(1).cpu().numpy( ) # returns are state value, sampled action, act_log_prob, hidden states # Obser reward and next obs obs, reward, done, info = envs.step(cpu_actions) reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float() episode_rewards += reward # If done then clean the history of observations. 
masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) final_rewards *= masks final_rewards += (1 - masks) * episode_rewards episode_rewards *= masks if args.cuda: masks = masks.cuda() if current_obs.dim() == 4: current_obs *= masks.unsqueeze(2).unsqueeze(2) else: current_obs *= masks update_current_obs(obs) rollouts.insert( step, current_obs, states.data, action.data, action_log_prob.data, value.data, reward, masks ) # so the rollout stores one batch of interaction sequences, each sequence has length of args.num_steps next_value = actor_critic( Variable(rollouts.observations[-1], volatile=True), Variable(rollouts.states[-1], volatile=True), Variable(rollouts.masks[-1], volatile=True))[0].data rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau) if args.algo in ['a2c', 'acktr']: values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions( Variable(rollouts.observations[:-1].view(-1, *obs_shape)), Variable(rollouts.states[0].view(-1, actor_critic.state_size)), Variable(rollouts.masks[:-1].view(-1, 1)), Variable(rollouts.actions.view(-1, action_shape))) # values should be values of observations, states are the hidden states used in rnn module, by pwang8 values = values.view( args.num_steps, args.num_processes, 1) # values are estimated current state values action_log_probs = action_log_probs.view(args.num_steps, args.num_processes, 1) # rollouts.returns are current "Action" value calculted following Bellmans' eqaution gamma * State_value(t+1) + reward(t) advantages = Variable( rollouts.returns[:-1] ) - values # This is also the definition of advantage value (action_value - state_value). value_loss = advantages.pow( 2).mean() # values are estimated current state_value(t) action_loss = -(Variable(advantages.data) * action_log_probs).mean() # If ACKTR is utilized, it is not only a different optimizer is used, they also added some new loss source if args.algo == 'acktr' and optimizer.steps % optimizer.Ts == 0: # Sampled fisher, see Martens 2014 actor_critic.zero_grad() pg_fisher_loss = -action_log_probs.mean() value_noise = Variable(torch.randn(values.size())) if args.cuda: value_noise = value_noise.cuda() sample_values = values + value_noise vf_fisher_loss = -( values - Variable(sample_values.data) ).pow(2).mean( ) # don't know what is the difference between this and just randomly sample some noise fisher_loss = pg_fisher_loss + vf_fisher_loss optimizer.acc_stats = True fisher_loss.backward(retain_graph=True) optimizer.acc_stats = False optimizer.zero_grad() (value_loss * args.value_loss_coef + action_loss - dist_entropy * args.entropy_coef).backward() if args.algo == 'a2c': nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm) optimizer.step() elif args.algo == 'ppo': advantages = rollouts.returns[:-1] - rollouts.value_preds[: -1] # calculating the advantage value of an action advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-5) # The difference from this ppo optimization to the optimization above is that: it updates params for # multiple epochs in ppo optimization. Because of this, it samples from the rollouts storage a minibatch # every time to calculate gradient. Sampling is conducted for optimization purpose. 
for e in range(args.ppo_epoch): if args.recurrent_policy: data_generator = rollouts.recurrent_generator( advantages, args.num_mini_batch) else: data_generator = rollouts.feed_forward_generator( advantages, args.num_mini_batch) for sample in data_generator: observations_batch, states_batch, actions_batch, \ return_batch, masks_batch, old_action_log_probs_batch, \ adv_targ = sample # Reshape to do in a single forward pass for all steps values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions( Variable(observations_batch), Variable(states_batch), Variable(masks_batch), Variable(actions_batch)) # For the 1st epoch of updating, I guess the action_log_probls is the same as old_action_log_probs_batch # because params of the NN have not been updated at that time. But later, in other updating epochs, # this ratio will generate some error. The old_action_log_probs_batch will not be updated during # these param updating epochs. # action_log_probs is the log prob of that action taken by the agent. So it's one value here, not # log_prob for all actions with certain input observation/state. By pwang8, Dec 31, 2017 adv_targ = Variable(adv_targ) ratio = torch.exp(action_log_probs - Variable(old_action_log_probs_batch)) surr1 = ratio * adv_targ surr2 = torch.clamp(ratio, 1.0 - args.clip_param, 1.0 + args.clip_param) * adv_targ action_loss = -torch.min( surr1, surr2).mean() # PPO's pessimistic surrogate (L^CLIP) # compared to a2c, the major difference for ppo is that action_loss is calculated in controlled way value_loss = (Variable(return_batch) - values).pow(2).mean() optimizer.zero_grad() (value_loss + action_loss - dist_entropy * args.entropy_coef).backward() nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm) optimizer.step() rollouts.after_update() if j % args.save_interval == 0 and args.save_dir != "": save_path = os.path.join(args.save_dir, args.algo) try: os.makedirs(save_path) except OSError: pass # A really ugly way to save a model to CPU save_model = actor_critic if args.cuda: save_model = copy.deepcopy(actor_critic).cpu() save_model = [ save_model, hasattr(envs, 'ob_rms') and envs.ob_rms or None ] torch.save(save_model, os.path.join(save_path, args.env_name + ".pt")) if j % args.log_interval == 0: end = time.time() total_num_steps = (j + 1) * args.num_processes * args.num_steps print( "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}" .format(j, total_num_steps, int(total_num_steps / (end - start)), final_rewards.mean(), final_rewards.median(), final_rewards.min(), final_rewards.max(), dist_entropy.data[0], value_loss.data[0], action_loss.data[0])) if args.vis and j % args.vis_interval == 0: try: # Sometimes monitor doesn't properly flush the outputs win = visdom_plot(viz, win, args.log_dir, args.env_name, args.algo) except IOError: pass
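# --- Hedged illustration (not part of the original script) ---------------------
# Quick check of the remark in the comments above: before any parameter update the
# current and "old" policies assign identical log-probs, so the importance ratio is
# exactly 1 and the clipping has no effect on the first PPO pass.
import torch

logp_new = torch.tensor([-1.2, -0.4, -2.3])
logp_old = logp_new.clone()
ratio = torch.exp(logp_new - logp_old)
print(ratio)   # tensor([1., 1., 1.])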
def main(): print("#######") print("WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards") print("#######") os.environ['OMP_NUM_THREADS'] = '1' #os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" #os.environ['CUDA_VISIBLE_DEVICES'] = "9" if args.vis: from visdom import Visdom viz = Visdom(port=args.port) win = None envs = [make_env(args.env_name, args.seed, i, args.log_dir, args.add_timestep) for i in range(args.num_processes)] if args.num_processes > 1: envs = SubprocVecEnv(envs) else: envs = DummyVecEnv(envs) if len(envs.observation_space.shape) == 1: envs = VecNormalize(envs) obs_shape = envs.observation_space.shape obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:]) if len(envs.observation_space.shape) == 3: actor_critic = CNNPolicy(obs_shape[0], envs.action_space,args.hid_size, args.feat_size,args.recurrent_policy) else: assert not args.recurrent_policy, \ "Recurrent policy is not implemented for the MLP controller" actor_critic = MLPPolicy(obs_shape[0], envs.action_space) if envs.action_space.__class__.__name__ == "Discrete": action_shape = 1 else: action_shape = envs.action_space.shape[0] if args.use_cell: hs = HistoryCell(obs_shape[0], actor_critic.feat_size, 2*actor_critic.hidden_size, 1) ft = FutureCell(obs_shape[0], actor_critic.feat_size, 2 * actor_critic.hidden_size, 1) else: hs = History(obs_shape[0], actor_critic.feat_size, actor_critic.hidden_size, 2, 1) ft = Future(obs_shape[0], actor_critic.feat_size, actor_critic.hidden_size, 2, 1) if args.cuda: actor_critic=actor_critic.cuda() hs = hs.cuda() ft = ft.cuda() if args.algo == 'a2c': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, alpha=args.alpha, max_grad_norm=args.max_grad_norm) elif args.algo == 'ppo': agent = algo.PPO(actor_critic, hs,ft,args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, args.hf_loss_coef,ac_lr=args.lr,hs_lr=args.lr,ft_lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm, num_processes=args.num_processes, num_steps=args.num_steps, use_cell=args.use_cell, lenhs=args.lenhs,lenft=args.lenft, plan=args.plan, ac_intv=args.ac_interval, hs_intv=args.hs_interval, ft_intv=args.ft_interval ) elif args.algo == 'acktr': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, acktr=True) rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape, envs.action_space, actor_critic.state_size, feat_size=512) current_obs = torch.zeros(args.num_processes, *obs_shape) def update_current_obs(obs): shape_dim0 = envs.observation_space.shape[0] obs = torch.from_numpy(obs).float() if args.num_stack > 1: current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:] current_obs[:, -shape_dim0:] = obs obs = envs.reset() update_current_obs(obs) rollouts.observations[0].copy_(current_obs) if args.cuda: current_obs = current_obs.cuda() rollouts.cuda() rec_x = [] rec_y = [] file = open('./rec/' + args.env_name + '_' + args.method_name + '.txt', 'w') hs_info = torch.zeros(args.num_processes, 2 * actor_critic.hidden_size).cuda() hs_ind = torch.IntTensor(args.num_processes, 1).zero_() epinfobuf = deque(maxlen=100) start_time = time.time() for j in range(num_updates): print('begin sample, time {}'.format(time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - start_time)))) for step in range(args.num_steps): # Sample actions with torch.no_grad(): rollouts.feat[step]=actor_critic.get_feat(rollouts.observations[step]) if 
args.use_cell: for i in range(args.num_processes): h = torch.zeros(1, 2 * actor_critic.hid_size).cuda() c = torch.zeros(1, 2 * actor_critic.hid_size).cuda() start_ind = max(hs_ind[i],step+1-args.lenhs) for ind in range(start_ind,step+1): h,c=hs(rollouts.feat[ind,i].unsqueeze(0),h,c) hs_info[i,:]=h.view(1,2*actor_critic.hid_size) del h,c gc.collect() else: for i in range(args.num_processes): start_ind = max(hs_ind[i], step + 1 - args.lenhs) hs_info[i,:]=hs(rollouts.feat[start_ind:step+1,i]) hidden_feat=actor_critic.cat(rollouts.feat[step],hs_info) value, action, action_log_prob, states = actor_critic.act( hidden_feat, rollouts.states[step]) cpu_actions = action.data.squeeze(1).cpu().numpy() # Obser reward and next obs obs, reward, done, infos = envs.step(cpu_actions) for info in infos: maybeepinfo = info.get('episode') if maybeepinfo: epinfobuf.extend([maybeepinfo['r']]) reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float() masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) hs_ind = ((1-masks)*(step+1)+masks*hs_ind.float()).int() if args.cuda: masks = masks.cuda() if current_obs.dim() == 4: current_obs *= masks.unsqueeze(2).unsqueeze(2) else: current_obs *= masks update_current_obs(obs) rollouts.insert(current_obs, hs_ind,states.data, action.data, action_log_prob.data, value.data, reward, masks) with torch.no_grad(): rollouts.feat[-1] = actor_critic.get_feat(rollouts.observations[-1]) if args.use_cell: for i in range(args.num_processes): h = torch.zeros(1, 2 * actor_critic.hid_size).cuda() c = torch.zeros(1, 2 * actor_critic.hid_size).cuda() start = max(hs_ind[i], step + 1 - args.lenhs) for ind in range(start, step + 1): h, c = hs(rollouts.feat[ind, i].unsqueeze(0), h, c) hs_info[i, :] = h.view(1, 2 * actor_critic.hid_size) del h,c else: for i in range(args.num_processes): start_ind = max(hs_ind[i], step + 1 - args.lenhs) hs_info[i, :] = hs(rollouts.feat[start_ind:step + 1, i]) hidden_feat = actor_critic.cat(rollouts.feat[-1],hs_info) next_value = actor_critic.get_value(hidden_feat).detach() rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau) rollouts.compute_ft_ind() print('begin update, time {}'.format(time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - start_time)))) value_loss, action_loss, dist_entropy = agent.update(rollouts) print('end update, time {}'.format(time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - start_time)))) rollouts.after_update() if j % args.save_interval == 0 and args.save_dir != "": save_path = os.path.join(args.save_dir, args.algo) try: os.makedirs(save_path) except OSError: pass # A really ugly way to save a model to CPU save_model = actor_critic if args.cuda: save_model = copy.deepcopy(actor_critic).cpu() save_model = [save_model, hasattr(envs, 'ob_rms') and envs.ob_rms or None] torch.save(save_model, os.path.join(save_path, args.env_name + ".pt")) if j % args.log_interval == 0: end = time.time() total_num_steps = (j + 1) * args.num_processes * args.num_steps v_mean,v_median,v_min,v_max = safe(epinfobuf) print("Updates {}, num timesteps {},time {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}". 
format(j, total_num_steps, time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - start_time)), int(total_num_steps / (end - start_time)), v_mean, v_median, v_min, v_max, dist_entropy, value_loss, action_loss)) if not np.isnan(v_mean): rec_x.append(total_num_steps) rec_y.append(v_mean) file.write(str(total_num_steps)) file.write(' ') file.writelines(str(v_mean)) file.write('\n') if args.vis and j % args.vis_interval == 0: try: # Sometimes monitor doesn't properly flush the outputs win = visdom_plot(viz, win, args.log_dir, args.env_name, args.algo, args.num_frames) except IOError: pass plot_line(rec_x, rec_y, './imgs/' + args.env_name + '_' + args.method_name + '.png', args.method_name, args.env_name, args.num_frames) file.close()
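# --- Hedged illustration (not part of the original script) ---------------------
# safe(epinfobuf) is called above but not defined in this excerpt; a plausible
# (hypothetical, not the author's) helper returning nan-safe summary statistics over
# the recent-episode reward buffer:
import numpy as np

def safe(xs):
    xs = np.asarray(list(xs), dtype=np.float64)
    if xs.size == 0:
        return np.nan, np.nan, np.nan, np.nan
    return np.mean(xs), np.median(xs), np.min(xs), np.max(xs)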
def eval_pomme( saved_models='train=simple-config=ffa_v0-model=convnet-agent=0.pt'): os.environ['OMP_NUM_THREADS'] = '1' if args.vis: from visdom import Visdom viz = Visdom(server=args.server, port=8097) # viz = Visdom(port=args.port) win = None # Instantiate the environment config = getattr(configs, args.config)() # We make this in order to get the shapes. dummy_env = make_env(args, config, -1, [config['agent'](game_type=config['game_type'])])() envs_shape = dummy_env.observation_space.shape[1:] obs_shape = (envs_shape[0], *envs_shape[1:]) action_space = dummy_env.action_space if len(envs_shape) == 3: if args.model == 'convnet': actor_critic = lambda saved_model: PommeCNNPolicySmall( obs_shape[0], action_space, args) elif args.model == 'resnet': actor_critic = lambda saved_model: PommeResnetPolicy( obs_shape[0], action_space, args) else: actor_critic = lambda saved_model: MLPPolicy(obs_shape[0], action_space ) # TODO: this only works for simple - need a list of checkpoints for self-play # We need to get the agent = config.agent(agent_id, config.game_type) and then # pass that agent into the agent.PPOAgent training_agents = [] # TODO: this is a bit hacky and doesn't work for more than 1 model # saved_models = args.saved_models save_path = os.path.join(args.save_dir) saved_models = [os.path.join(save_path, saved_models)] # saved_models = saved_models.split(',') if saved_models else [None]*args.nagents assert (len(saved_models)) == args.nagents if len(envs_shape) == 3: if args.model == 'convnet': actor_critic_model = PommeCNNPolicySmall(obs_shape[0], action_space, args) elif args.model == 'resnet': actor_critic_model = PommeResnetPolicy(obs_shape[0], action_space, args) else: assert not args.recurrent_policy, \ "Recurrent policy is not implemented for the MLP controller" actor_critic_model = MLPPolicy(obs_shape[0], action_space) print("****") for saved_model in saved_models: # TODO: implement the model loading. loaded_model = torch.load(saved_model) print("epoch of model {} is: {}".format(saved_model, loaded_model['epoch'])) loaded_actor_critic_model = actor_critic_model.load_state_dict( loaded_model['state_dict']) model = actor_critic(loaded_actor_critic_model) model.eval() agent = config['agent'](game_type=config['game_type']) agent = ppo_agent.PPOAgent(agent, model) training_agents.append(agent) print("****") if args.how_train == 'simple': # Simple trains a single agent against three SimpleAgents. assert ( args.nagents == 1), "Simple training should have a single agent." num_training_per_episode = 1 elif args.how_train == 'homogenous': # Homogenous trains a single agent against itself (self-play). assert (args.nagents == 1 ), "Homogenous toraining should have a single agent." num_training_per_episode = 4 elif args.how_train == 'heterogenous': assert (args.nagents > 1), "Heterogenous training should have more than one agent." print("Heterogenous training is not implemented yet.") return # NOTE: Does this work correctly? Will the threads operate independently? 
envs = [ make_env(args, config, i, training_agents) for i in range(args.num_processes) ] envs = SubprocVecEnv(envs) if args.num_processes > 1 else DummyVecEnv(envs) for agent in training_agents: agent.initialize(args, obs_shape, action_space, num_training_per_episode) current_obs = torch.zeros(num_training_per_episode, args.num_processes, *obs_shape) def update_current_obs(obs): current_obs = torch.from_numpy(obs).float() obs = envs.reset() update_current_obs(obs) if args.how_train == 'simple': training_agents[0].update_rollouts(obs=current_obs, timestep=0) elif args.how_train == 'homogenous': training_agents[0].update_rollouts(obs=current_obs, timestep=0) # These variables are used to compute average rewards for all processes. episode_rewards = torch.zeros( [num_training_per_episode, args.num_processes, 1]) final_rewards = torch.zeros( [num_training_per_episode, args.num_processes, 1]) if args.cuda: current_obs = current_obs.cuda() for agent in training_agents: agent.cuda() start = time.time() for j in range(args.num_steps_eval): for step in range(args.num_steps): value_agents = [] action_agents = [] action_log_prob_agents = [] states_agents = [] episode_reward = [] cpu_actions_agents = [] if args.how_train == 'simple': value, action, action_log_prob, states = training_agents[ 0].act_pytorch(step, 0) value_agents.append(value) action_agents.append(action) action_log_prob_agents.append(action_log_prob) states_agents.append(states) cpu_actions = action.data.squeeze(1).cpu().numpy() cpu_actions_agents = cpu_actions elif args.how_train == 'homogenous': cpu_actions_agents = [[] for _ in range(args.num_processes)] for i in range(4): value, action, action_log_prob, states = training_agents[ 0].act_pytorch(step, i) value_agents.append(value) action_agents.append(action) action_log_prob_agents.append(action_log_prob) states_agents.append(states) cpu_actions = action.data.squeeze(1).cpu().numpy() for num_process in range(args.num_processes): cpu_actions_agents[num_process].append( cpu_actions[num_process]) obs, reward, done, info = envs.step(cpu_actions_agents) reward = torch.from_numpy(np.stack(reward)).float().transpose(0, 1) episode_rewards += reward if args.how_train == 'simple': masks = torch.FloatTensor( [[0.0] * num_training_per_episode if done_ else [1.0] * num_training_per_episode for done_ in done]) elif args.how_train == 'homogenous': masks = torch.FloatTensor( [[0.0] * num_training_per_episode if done_ else [1.0] * num_training_per_episode for done_ in done]).transpose(0, 1) masks = torch.FloatTensor( [[0.0] * num_training_per_episode if done_ else [1.0] * num_training_per_episode for done_ in done]).transpose(0, 1) final_rewards *= masks final_rewards += (1 - masks) * episode_rewards episode_rewards *= masks if args.cuda: masks = masks.cuda() reward_all = reward.unsqueeze(2) masks_all = masks.unsqueeze(2) if args.how_train == 'simple': masks_all = masks.transpose(0, 1).unsqueeze(2) elif args.how_train == 'homogenous': masks_all = masks.unsqueeze(2) current_obs *= masks_all.unsqueeze(2).unsqueeze(2) update_current_obs(obs) states_all = torch.from_numpy( np.stack([x.data for x in states_agents])).float() action_all = torch.from_numpy( np.stack([x.data for x in action_agents])).float() action_log_prob_all = torch.from_numpy( np.stack([x.data for x in action_log_prob_agents])).float() value_all = torch.from_numpy( np.stack([x.data for x in value_agents])).float() if args.how_train in ['simple', 'homogenous']: training_agents[0].insert_rollouts(step, current_obs, states_all, action_all, 
action_log_prob_all, value_all, reward_all, masks_all) if step % args.log_interval == 0: print("step ", step) end = time.time() total_num_steps = (step + 1) * args.num_processes * args.num_steps_eval final_rewards_tr = torch.zeros( [args.num_processes, args.nagents, 1]) final_rewards_tr.copy_(final_rewards) final_rewards_tr = final_rewards_tr.view(args.num_processes, args.nagents).transpose( 0, 1) for i in range(args.nagents): print("agent # ", i) print( "Updates {}, Agent {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}" .format(step, i, total_num_steps, int(total_num_steps / (end - start)), final_rewards_tr[i].mean(), final_rewards_tr[i].median(), final_rewards_tr[i].min(), final_rewards_tr[i].max()), "\n") print("\n") if args.vis and step % args.vis_interval == 0: try: # Sometimes monitor doesn't properly flush the outputs win = visdom_plot(viz, win, args.log_dir, args.env_name) except IOError: pass
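# The loading loop in eval_pomme above is still marked TODO and uses the return value of
# load_state_dict() as if it were a model. A minimal sketch of the usual pattern for
# loading an evaluation policy, assuming checkpoints saved as dicts with 'epoch' and
# 'state_dict' keys; `build_model` is a hypothetical factory that constructs the policy
# network (e.g. one of the lambdas defined above).
import torch

def load_eval_model(build_model, checkpoint_path, use_cuda=False):
    # Construct the network first, then load the weights in place;
    # load_state_dict() mutates the module and does not return it.
    model = build_model()
    checkpoint = torch.load(checkpoint_path)
    model.load_state_dict(checkpoint['state_dict'])
    if use_cuda:
        model.cuda()
    model.eval()  # disable dropout and other training-only behaviour
    return model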
str(args.seed)) if os.path.exists(log_path): shutil.rmtree(log_path) # clean directory each time os.makedirs(log_path) # Logging settings log_str_set = '# Settings: [T={:d}, Num_iter_algo={:d}, Num_iter_policy={:d}, lr_dynamics={:.4f}, lr_policy={:.4f}, c_sigma={:.2f}]\n' print( log_str_set.format(args.T, args.num_iter_algo, args.num_iter_policy, args.lr_dynamics, args.lr_policy, args.c_sigma)) # Create policy and its optimizer if args.policy_type == 'LinearPolicy': policy = LinearPolicy(env).cuda() elif args.policy_type == 'MLPPolicy': policy = MLPPolicy(env, hidden_size=args.hidden_size).cuda() else: raise TypeError('Policy type must be either LinearPolicy or MLPPolicy') # Initialize policy parameters to ensure small values for param in policy.parameters(): nn.init.normal(param, mean=0, std=1e-5) policy_optimizer = optim.Adam(policy.parameters(), lr=args.lr_policy) # 1e-2, RMSprop # Create dynamics and its optimizer dynamics = DynamicsModel(env, hidden_size=200, drop_prob=args.drop_p).cuda() dynamics_optimizer = optim.Adam(dynamics.parameters(), lr=args.lr_dynamics, weight_decay=1e-4)
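# The near-zero initialisation above (nn.init.normal with std=1e-5) presumably keeps the
# untrained policy's outputs tiny so the first dynamics-model rollouts are not dominated
# by policy noise. A self-contained sketch of the same idea; note that nn.init.normal
# was renamed nn.init.normal_ in later PyTorch releases.
import torch.nn as nn

def init_small_weights(module, std=1e-5):
    # In-place normal initialisation of every parameter of `module`.
    for param in module.parameters():
        nn.init.normal_(param, mean=0.0, std=std)

# Example: init_small_weights(nn.Linear(4, 2)) leaves all weights on the order of 1e-5.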
def main(): print("#######") print("WARNING: All rewards are clipped so you need to use a monitor (see envs.py) or visdom plot to get true rewards") print("#######") os.environ['OMP_NUM_THREADS'] = '1' if args.vis: from visdom import Visdom viz = Visdom() win = [] win_dic = {} for i in range(len(mt_env_id_dic_selected)): win += [None] win_afs_per_m = None win_afs_loss = None win_basic_loss = None plot_dic = {} envs = [] ''' The original program ran only one game per model, so Song added this loop over environments: whatever environments you want to run are all wrapped together in SubprocVecEnvMt. ''' for i in range(len(mt_env_id_dic_selected)): log_dir = args.log_dir+mt_env_id_dic_selected[i]+'/' for j in range(args.num_processes): envs += [make_env(mt_env_id_dic_selected[i], args.seed, j, log_dir)] ''' This envs object aggregates all of the running environments ''' envs = SubprocVecEnvMt(envs) num_processes_total = args.num_processes * len(mt_env_id_dic_selected) '''(1,128,128)''' obs_shape = envs.observation_space.shape # num_stack: number of frames to stack obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:]) from arguments import is_restore if is_restore and args.save_dir: load_path = os.path.join(args.save_dir, args.algo) actor_critic = torch.load(os.path.join(load_path, args.env_name + ".pt")) # print ("restored previous model!") # print (actor_critic.Variable) # print (sss) else: if len(envs.observation_space.shape) == 3: actor_critic = CNNPolicy(obs_shape[0], envs.action_space) else: actor_critic = MLPPolicy(obs_shape[0], envs.action_space) if envs.action_space.__class__.__name__ == "Discrete": action_shape = 1 else: action_shape = envs.action_space.shape[0] if args.cuda: actor_critic.cuda() if args.algo == 'a2c': optimizer = optim.RMSprop(actor_critic.parameters(), args.lr, eps=args.eps, alpha=args.alpha) elif args.algo == 'ppo': optimizer = optim.Adam(actor_critic.parameters(), args.lr, eps=args.eps) elif args.algo == 'acktr': optimizer = KFACOptimizer(actor_critic) # args.num_steps: number of forward steps in A2C # rollouts aggregates states, rewards, next states, actions, and so on rollouts = RolloutStorage(args.num_steps, num_processes_total, obs_shape, envs.action_space) current_state = torch.zeros(num_processes_total, *obs_shape) ''' not sure about it''' def update_current_state(state): shape_dim0 = envs.observation_space.shape[0] # print (shape_dim0) # print (sss) state = torch.from_numpy(state).float() if args.num_stack > 1: current_state[:, :-shape_dim0] = current_state[:, shape_dim0:] current_state[:, -shape_dim0:] = state state = envs.reset() update_current_state(state) rollouts.states[0].copy_(current_state) # These variables are used to compute average rewards for all processes. 
episode_rewards = torch.zeros([num_processes_total, 1]) final_rewards = torch.zeros([num_processes_total, 1]) if args.cuda: current_state = current_state.cuda() rollouts.cuda() if args.algo == 'ppo': old_model = copy.deepcopy(actor_critic) from arguments import ewc, ewc_lambda, ewc_interval afs_per_m = [] afs_offset = [0.0]*gtn_M afs_loss_list = [] basic_loss_list = [] episode_reward_rec = 0.0 one = torch.FloatTensor([1]).cuda() mone = one * -1 '''for one whole game ''' for j in range(num_updates): for step in range(args.num_steps): if ewc == 1: try: states_store = torch.cat([states_store, rollouts.states[step].clone()], 0) except Exception as e: states_store = rollouts.states[step].clone() # Sample actions '''act fun refer to "observe it!"''' value, action = actor_critic.act(Variable(rollouts.states[step], volatile=True)) cpu_actions = action.data.squeeze(1).cpu().numpy() # Obser reward and next state state, reward, done = envs.step(cpu_actions) '''record the last 100 episodes rewards''' episode_reward_rec += reward episode_reward_rec = rec_last_100_epi_reward(episode_reward_rec,done) reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float() '''reward is shape of process_num_total, not batch-size''' # print ((reward).size()) # print (done) # print (sss) episode_rewards += reward ################ # rec_last_100_epi_reward(reward,done) # episode_reward_ppo += reward[0] # If done then clean the history of observations. final_rewards is used for compute after one whole num_step masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) final_rewards *= masks final_rewards += (1 - masks) * episode_rewards episode_rewards *= masks if args.cuda: masks = masks.cuda() if current_state.dim() == 4: current_state *= masks.unsqueeze(2).unsqueeze(2) else: current_state *= masks update_current_state(state) rollouts.insert(step, current_state, action.data, value.data, reward, masks) next_value = actor_critic(Variable(rollouts.states[-1], volatile=True))[0].data if hasattr(actor_critic, 'obs_filter'): actor_critic.obs_filter.update(rollouts.states[:-1].view(-1, *obs_shape)) rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau) if args.algo in ['a2c', 'acktr']: # reset gradient optimizer.zero_grad() # forward values, action_log_probs, dist_entropy, conv_list = actor_critic.evaluate_actions(Variable(rollouts.states[:-1].view(-1, *obs_shape)), Variable(rollouts.actions.view(-1, action_shape))) # pre-process values = values.view(args.num_steps, num_processes_total, 1) action_log_probs = action_log_probs.view(args.num_steps, num_processes_total, 1) # compute afs loss afs_per_m_temp, afs_loss = actor_critic.get_afs_per_m( action_log_probs=action_log_probs, conv_list=conv_list, ) if len(afs_per_m_temp)>0: afs_per_m += [afs_per_m_temp] if (afs_loss is not None) and (afs_loss.data.cpu().numpy()[0]!=0.0): afs_loss.backward(mone, retain_graph=True) afs_loss_list += [afs_loss.data.cpu().numpy()[0]] advantages = Variable(rollouts.returns[:-1]) - values value_loss = advantages.pow(2).mean() action_loss = -(Variable(advantages.data) * action_log_probs).mean() final_loss_basic = value_loss * args.value_loss_coef + action_loss - dist_entropy * args.entropy_coef ewc_loss = None if j != 0: if ewc == 1: ewc_loss = actor_critic.get_ewc_loss(lam=ewc_lambda) if ewc_loss is None: final_loss = final_loss_basic else: final_loss = final_loss_basic + ewc_loss # print (final_loss_basic.data.cpu().numpy()[0]) # final_loss_basic basic_loss_list += [final_loss_basic.data.cpu().numpy()[0]] 
final_loss.backward() if args.algo == 'a2c': nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm) optimizer.step() elif args.algo == 'ppo': advantages = rollouts.returns[:-1] - rollouts.value_preds[:-1] advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-5) old_model.load_state_dict(actor_critic.state_dict()) if hasattr(actor_critic, 'obs_filter'): old_model.obs_filter = actor_critic.obs_filter for _ in range(args.ppo_epoch): sampler = BatchSampler(SubsetRandomSampler(range(num_processes_total * args.num_steps)), args.batch_size * num_processes_total, drop_last=False) for indices in sampler: indices = torch.LongTensor(indices) if args.cuda: indices = indices.cuda() states_batch = rollouts.states[:-1].view(-1, *obs_shape)[indices] actions_batch = rollouts.actions.view(-1, action_shape)[indices] return_batch = rollouts.returns[:-1].view(-1, 1)[indices] # Reshape to do in a single forward pass for all steps values, action_log_probs, dist_entropy, conv_list = actor_critic.evaluate_actions(Variable(states_batch), Variable(actions_batch)) _, old_action_log_probs, _, old_conv_list= old_model.evaluate_actions(Variable(states_batch, volatile=True), Variable(actions_batch, volatile=True)) ratio = torch.exp(action_log_probs - Variable(old_action_log_probs.data)) adv_targ = Variable(advantages.view(-1, 1)[indices]) surr1 = ratio * adv_targ surr2 = torch.clamp(ratio, 1.0 - args.clip_param, 1.0 + args.clip_param) * adv_targ action_loss = -torch.min(surr1, surr2).mean() # PPO's pessimistic surrogate (L^CLIP) value_loss = (Variable(return_batch) - values).pow(2).mean() optimizer.zero_grad() final_loss_basic = (value_loss + action_loss - dist_entropy * args.entropy_coef) basic_loss_list += [final_loss_basic.data.cpu().numpy()[0]] final_loss_basic.backward() optimizer.step() rollouts.states[0].copy_(rollouts.states[-1]) # if j % int(num_updates/2-10) == 0 and args.save_dir != "": if j % args.save_interval == 0 and args.save_dir != "": save_path = os.path.join(args.save_dir, args.algo) try: os.makedirs(save_path) except OSError: pass # A really ugly way to save a model to CPU save_model = actor_critic if args.cuda: save_model = copy.deepcopy(actor_critic).cpu() torch.save(save_model, os.path.join(save_path, args.env_name + ".pt")) import pickle with open(os.path.join(save_path, args.env_name + "_last_100_reward"), "wb") as f: pickle.dump(reward_dict, f) if j % args.log_interval == 0: print("Updates {}, num frames {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}". format(j, (j + 1) * args.num_processes * args.num_steps, final_rewards.mean(), final_rewards.median(), final_rewards.min(), final_rewards.max(), -dist_entropy.data[0], value_loss.data[0], action_loss.data[0])) try: print("ewc loss {:.5f}". 
format(ewc_loss.data.cpu().numpy()[0])) except Exception as e: pass if j > 5 and j % args.vis_interval == 0 and args.vis: ''' load from the folder''' for ii in range(len(mt_env_id_dic_selected)): log_dir = args.log_dir+mt_env_id_dic_selected[ii]+'/' win[ii] = visdom_plot(viz, win[ii], log_dir, mt_env_id_dic_selected[ii], args.algo) plot_dic = reward_dict for plot_name in plot_dic.keys(): # if plot_name not in win_dic: # win_dic[plot_name] = None if plot_name in win_dic.keys(): if len(plot_dic[plot_name]) > 0: win_dic[plot_name] = viz.line( torch.from_numpy(np.asarray(plot_dic[plot_name])), win=win_dic[plot_name], opts=dict(title=break_line_html(exp+'>>'+plot_name)) ) else: win_dic[plot_name] = None if len(afs_per_m)>0: win_afs_per_m = viz.line( torch.from_numpy(np.asarray(afs_per_m)), win=win_afs_per_m, opts=dict(title=title_html+'>>afs') ) # print (basic_loss_list) '''a2c:len(basic_loss_list) is vis_interval+1. because j start from 0 ppo:len(basic_loss_list) is (vis_interval+1)*ppo_epoch_4*len(BatchSampler) ''' # print (len(basic_loss_list)) # print (ss) win_basic_loss = viz.line( torch.from_numpy(np.asarray(basic_loss_list)), win=win_basic_loss, opts=dict(title=title_html+'>>basic_loss') ) if len(afs_loss_list) > 0: win_afs_loss = viz.line( torch.from_numpy(np.asarray(afs_loss_list)), win=win_afs_loss, opts=dict(title=title_html+'>>afs_loss') ) from arguments import parameter_noise, parameter_noise_interval if parameter_noise == 1: if j % parameter_noise_interval == 0: actor_critic.parameter_noise() if ewc == 1: if j % ewc_interval == 0 or j==0: actor_critic.compute_fisher(states_store) states_store = None actor_critic.star()
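# The multi-task loop above calls compute_fisher(), star() and get_ewc_loss() on the
# policy without showing them. A minimal sketch of the elastic-weight-consolidation
# penalty those methods are assumed to implement: `fisher` and `star_params` are
# hypothetical dicts, keyed by parameter name, holding the diagonal Fisher estimate and
# the consolidated parameter values from earlier tasks.
def ewc_penalty(model, fisher, star_params, lam):
    # lam/2 * sum_i F_i * (theta_i - theta*_i)^2 -- penalises drifting away from
    # parameters that were important for previously learned tasks.
    loss = 0.0
    for name, param in model.named_parameters():
        if name in fisher:
            loss = loss + (fisher[name] * (param - star_params[name]).pow(2)).sum()
    return (lam / 2.0) * loss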
def main(): print("#######") print( "WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards" ) print("#######") os.environ['OMP_NUM_THREADS'] = '1' if args.vis: from visdom import Visdom viz = Visdom() win = None envs = [ make_env(args.env_name, args.seed, i, args.log_dir) for i in range(args.num_processes) ] if args.num_processes > 1: envs = SubprocVecEnv(envs) else: envs = DummyVecEnv(envs) if len(envs.observation_space.shape) == 1: envs = VecNormalize(envs) obs_shape = envs.observation_space.shape obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:]) obs_numel = reduce(operator.mul, obs_shape, 1) if len(obs_shape) == 3 and obs_numel > 1024: actor_critic = CNNPolicy(obs_shape[0], envs.action_space, args.recurrent_policy) else: assert not args.recurrent_policy, \ "Recurrent policy is not implemented for the MLP controller" actor_critic = MLPPolicy(obs_numel, envs.action_space) if envs.action_space.__class__.__name__ == "Discrete": action_shape = 1 else: action_shape = envs.action_space.shape[0] if args.cuda: actor_critic.cuda() if args.algo == 'a2c': optimizer = optim.RMSprop(actor_critic.parameters(), args.lr, eps=args.eps, alpha=args.alpha) elif args.algo == 'ppo': optimizer = optim.Adam(actor_critic.parameters(), args.lr, eps=args.eps) elif args.algo == 'acktr': optimizer = KFACOptimizer(actor_critic) rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape, envs.action_space, actor_critic.state_size) current_obs = torch.zeros(args.num_processes, *obs_shape) def update_current_obs(obs): shape_dim0 = envs.observation_space.shape[0] obs = torch.from_numpy(obs).float() if args.num_stack > 1: current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:] current_obs[:, -shape_dim0:] = obs obs = envs.reset() update_current_obs(obs) rollouts.observations[0].copy_(current_obs) # These variables are used to compute average rewards for all processes. episode_rewards = torch.zeros([args.num_processes, 1]) final_rewards = torch.zeros([args.num_processes, 1]) if args.cuda: current_obs = current_obs.cuda() rollouts.cuda() start = time.time() for j in range(num_updates): for step in range(args.num_steps): # Sample actions value, action, action_log_prob, states = actor_critic.act( Variable(rollouts.observations[step], volatile=True), Variable(rollouts.states[step], volatile=True), Variable(rollouts.masks[step], volatile=True)) cpu_actions = action.data.squeeze(1).cpu().numpy() # Obser reward and next obs obs, reward, done, info = envs.step(cpu_actions) reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float() episode_rewards += reward # If done then clean the history of observations. 
masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) final_rewards *= masks final_rewards += (1 - masks) * episode_rewards episode_rewards *= masks if args.cuda: masks = masks.cuda() if current_obs.dim() == 4: current_obs *= masks.unsqueeze(2).unsqueeze(2) else: current_obs *= masks update_current_obs(obs) rollouts.insert(step, current_obs, states.data, action.data, action_log_prob.data, value.data, reward, masks) next_value = actor_critic( Variable(rollouts.observations[-1], volatile=True), Variable(rollouts.states[-1], volatile=True), Variable(rollouts.masks[-1], volatile=True))[0].data rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau) if args.algo in ['a2c', 'acktr']: values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions( Variable(rollouts.observations[:-1].view(-1, *obs_shape)), Variable(rollouts.states[0].view(-1, actor_critic.state_size)), Variable(rollouts.masks[:-1].view(-1, 1)), Variable(rollouts.actions.view(-1, action_shape))) values = values.view(args.num_steps, args.num_processes, 1) action_log_probs = action_log_probs.view(args.num_steps, args.num_processes, 1) advantages = Variable(rollouts.returns[:-1]) - values value_loss = advantages.pow(2).mean() action_loss = -(Variable(advantages.data) * action_log_probs).mean() if args.algo == 'acktr' and optimizer.steps % optimizer.Ts == 0: # Sampled fisher, see Martens 2014 actor_critic.zero_grad() pg_fisher_loss = -action_log_probs.mean() value_noise = Variable(torch.randn(values.size())) if args.cuda: value_noise = value_noise.cuda() sample_values = values + value_noise vf_fisher_loss = -(values - Variable(sample_values.data)).pow(2).mean() fisher_loss = pg_fisher_loss + vf_fisher_loss optimizer.acc_stats = True fisher_loss.backward(retain_graph=True) optimizer.acc_stats = False optimizer.zero_grad() (value_loss * args.value_loss_coef + action_loss - dist_entropy * args.entropy_coef).backward() if args.algo == 'a2c': nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm) optimizer.step() elif args.algo == 'ppo': advantages = rollouts.returns[:-1] - rollouts.value_preds[:-1] advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-5) for e in range(args.ppo_epoch): if args.recurrent_policy: data_generator = rollouts.recurrent_generator( advantages, args.num_mini_batch) else: data_generator = rollouts.feed_forward_generator( advantages, args.num_mini_batch) for sample in data_generator: observations_batch, states_batch, actions_batch, \ return_batch, masks_batch, old_action_log_probs_batch, \ adv_targ = sample # Reshape to do in a single forward pass for all steps values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions( Variable(observations_batch), Variable(states_batch), Variable(masks_batch), Variable(actions_batch)) adv_targ = Variable(adv_targ) ratio = torch.exp(action_log_probs - Variable(old_action_log_probs_batch)) surr1 = ratio * adv_targ surr2 = torch.clamp(ratio, 1.0 - args.clip_param, 1.0 + args.clip_param) * adv_targ action_loss = -torch.min( surr1, surr2).mean() # PPO's pessimistic surrogate (L^CLIP) value_loss = (Variable(return_batch) - values).pow(2).mean() optimizer.zero_grad() (value_loss + action_loss - dist_entropy * args.entropy_coef).backward() nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm) optimizer.step() rollouts.after_update() if j % args.save_interval == 0 and args.save_dir != "": save_path = os.path.join(args.save_dir, args.algo) try: os.makedirs(save_path) 
except OSError: pass # A really ugly way to save a model to CPU save_model = actor_critic if args.cuda: save_model = copy.deepcopy(actor_critic).cpu() save_model = [ save_model, hasattr(envs, 'ob_rms') and envs.ob_rms or None ] torch.save(save_model, os.path.join(save_path, args.env_name + ".pt")) if j % args.log_interval == 0: end = time.time() total_num_steps = (j + 1) * args.num_processes * args.num_steps print( "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}" .format(j, total_num_steps, int(total_num_steps / (end - start)), final_rewards.mean(), final_rewards.median(), final_rewards.min(), final_rewards.max(), dist_entropy.data[0], value_loss.data[0], action_loss.data[0])) if args.vis and j % args.vis_interval == 0: try: # Sometimes monitor doesn't properly flush the outputs win = visdom_plot(viz, win, args.log_dir, args.env_name, args.algo) except IOError: pass
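# Every variant above delegates the return computation to rollouts.compute_returns().
# A stand-alone sketch of generalised advantage estimation under one explicit
# convention: rewards[t], values[t] and masks[t] are per-step values, masks[t] is 0.0
# when the episode terminated on step t (so the following state is a reset state), and
# next_value is the bootstrap estimate V(s_T) for the state after the final step.
def compute_gae_returns(rewards, values, masks, next_value, gamma, tau):
    returns = [None] * len(rewards)
    gae = 0.0
    next_v = next_value
    for t in reversed(range(len(rewards))):
        # TD error, with the bootstrap term zeroed across episode boundaries.
        delta = rewards[t] + gamma * next_v * masks[t] - values[t]
        gae = delta + gamma * tau * masks[t] * gae
        returns[t] = gae + values[t]  # advantage + value estimate
        next_v = values[t]
    return returns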
def main(): print("#######") print( "WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards" ) print("#######") os.environ['OMP_NUM_THREADS'] = '1' if args.vis: from visdom import Visdom viz = Visdom() win = None envs = [ make_env(args.env_name, args.seed, i, args.log_dir) for i in range(args.num_processes) ] if args.num_processes > 1: envs = SubprocVecEnv(envs) else: envs = DummyVecEnv(envs) if len(envs.observation_space.shape) == 1: envs = VecNormalize(envs) obs_shape = envs.observation_space.shape obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:]) if envs.action_space.__class__.__name__ == "Discrete": action_shape = 1 else: action_shape = envs.action_space.shape[0] if len(envs.observation_space.shape) == 3: actor_critic = Actor(obs_shape[0], envs.action_space, args.recurrent_policy, envs.action_space.n) target_actor = Actor(obs_shape[0], envs.action_space, args.recurrent_policy, envs.action_space.n) critic = Critic(in_channels=4, num_actions=envs.action_space.n) critic_target = Critic(in_channels=4, num_actions=envs.action_space.n) else: assert not args.recurrent_policy, \ "Recurrent policy is not implemented for the MLP controller" actor_critic = MLPPolicy(obs_shape[0], envs.action_space) if args.cuda: actor_critic.cuda() critic.cuda() critic_target.cuda() target_actor.cuda() if args.algo == 'a2c': optimizer = optim.RMSprop(actor_critic.parameters(), args.lr, eps=args.eps, alpha=args.alpha) critic_optim = optim.Adam(critic.parameters(), lr=1e-4) gamma = 0.99 tau = 0.001 #memory = SequentialMemory(limit=args.rmsize, window_length=args.window_length) mem_buffer = ReplayBuffer() rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape, envs.action_space, actor_critic.state_size, envs.action_space.n) current_obs = torch.zeros(args.num_processes, *obs_shape) def update_current_obs(obs): shape_dim0 = envs.observation_space.shape[0] obs = torch.from_numpy(obs).float() if args.num_stack > 1: current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:] current_obs[:, -shape_dim0:] = obs obs = envs.reset() update_current_obs(obs) rollouts.observations[0].copy_(current_obs) # These variables are used to compute average rewards for all processes. episode_rewards = torch.zeros([args.num_processes, 1]) final_rewards = torch.zeros([args.num_processes, 1]) if args.cuda: current_obs = current_obs.cuda() rollouts.cuda() start = time.time() for j in range(num_updates): for step in range(args.num_steps): # Sample actions action, action_log_prob, states = actor_critic.act( Variable(rollouts.observations[step], volatile=True), Variable(rollouts.states[step], volatile=True), Variable(rollouts.masks[step], volatile=True)) value = critic.forward( Variable(rollouts.observations[step], volatile=True), action_log_prob) cpu_actions = action.data.squeeze(1).cpu().numpy() # Obser reward and next obs obs, reward, done, info = envs.step(cpu_actions) reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float() episode_rewards += reward # If done then clean the history of observations. 
masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) final_rewards *= masks final_rewards += (1 - masks) * episode_rewards episode_rewards *= masks if args.cuda: masks = masks.cuda() if current_obs.dim() == 4: current_obs *= masks.unsqueeze(2).unsqueeze(2) else: current_obs *= masks pre_state = rollouts.observations[step].cpu().numpy() update_current_obs(obs) mem_buffer.add((pre_state, current_obs, action_log_prob.data.cpu().numpy(), reward, done)) rollouts.insert(step, current_obs, states.data, action.data, action_log_prob.data, value.data, reward, masks) action, action_log_prob, states = actor_critic.act( Variable(rollouts.observations[-1], volatile=True), Variable(rollouts.states[-1], volatile=True), Variable(rollouts.masks[-1], volatile=True)) #[0].data next_value = critic.forward( Variable(rollouts.observations[-1], volatile=True), action_log_prob).data rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau) if True: state, next_state, action, reward, done = mem_buffer.sample(5) next_state = next_state.reshape([-1, *obs_shape]) state = state.reshape([-1, *obs_shape]) action = action.reshape([-1, 6]) next_q_values = critic_target( to_tensor(next_state, volatile=True), target_actor(to_tensor(next_state, volatile=True), to_tensor(next_state, volatile=True), to_tensor(next_state, volatile=True))[0]) next_q_values.volatile = False target_q_batch = to_tensor(reward) + args.gamma * to_tensor( done.astype(np.float)) * next_q_values critic.zero_grad() q_batch = critic(to_tensor(state), to_tensor(action)) value_loss = criterion(q_batch, target_q_batch) value_loss.backward() critic_optim.step() actor_critic.zero_grad() policy_loss = -critic( to_tensor(state), actor_critic(to_tensor(state), to_tensor(state), to_tensor(state))[0]) policy_loss = policy_loss.mean() policy_loss.backward() if args.algo == 'a2c': nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm) optimizer.step() soft_update(target_actor, actor_critic, tau) soft_update(critic_target, critic, tau) ''' if args.algo in ['a2c', 'acktr']: action_log_probs, probs, dist_entropy, states = actor_critic.evaluate_actions(Variable(rollouts.observations[:-1].view(-1, *obs_shape)), Variable(rollouts.states[0].view(-1, actor_critic.state_size)), Variable(rollouts.masks[:-1].view(-1, 1)), Variable(rollouts.actions.view(-1, action_shape))) values = critic.forward(Variable(rollouts.observations[:-1].view(-1, *obs_shape)), probs).data values = values.view(args.num_steps, args.num_processes, 1) action_log_probs = action_log_probs.view(args.num_steps, args.num_processes, 1) #advantages = Variable(rollouts.returns[:-1]) - values advantages = rollouts.returns[:-1] - values value_loss = advantages.pow(2).mean() action_loss = -(Variable(advantages) * action_log_probs).mean() #action_loss = -(Variable(advantages.data) * action_log_probs).mean() optimizer.zero_grad() critic_optim.zero_grad() (value_loss * args.value_loss_coef + action_loss - dist_entropy * args.entropy_coef).backward() if args.algo == 'a2c': nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm) optimizer.step() critic_optim.step() ''' rollouts.after_update() if j % args.save_interval == 0 and args.save_dir != "": save_path = os.path.join(args.save_dir, args.algo) try: os.makedirs(save_path) except OSError: pass # A really ugly way to save a model to CPU save_model = actor_critic if args.cuda: save_model = copy.deepcopy(actor_critic).cpu() save_model = [ save_model, hasattr(envs, 'ob_rms') and envs.ob_rms or None ] 
torch.save(save_model, os.path.join(save_path, args.env_name + ".pt")) if j % args.log_interval == 0: end = time.time() total_num_steps = (j + 1) * args.num_processes * args.num_steps print( "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, value loss {:.5f}, policy loss {:.5f}" .format(j, total_num_steps, int(total_num_steps / (end - start)), final_rewards.mean(), final_rewards.median(), final_rewards.min(), final_rewards.max(), value_loss.data.cpu().numpy()[0], policy_loss.data.cpu().numpy()[0])) if args.vis and j % args.vis_interval == 0: try: # Sometimes monitor doesn't properly flush the outputs win = visdom_plot(viz, win, args.log_dir, args.env_name, args.algo) except IOError: pass
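# soft_update() above is not defined in this file; it is assumed to perform the usual
# Polyak averaging of the target networks. A minimal sketch matching the call signature
# soft_update(target, source, tau) used in the loop:
def soft_update(target, source, tau):
    # target <- (1 - tau) * target + tau * source, parameter by parameter.
    for target_param, source_param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(
            (1.0 - tau) * target_param.data + tau * source_param.data)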
def main(): print("#######") print("WARNING: All rewards are clipped so you need to use a monitor (see envs.py) or visdom plot to get true rewards") print("#######") os.environ['OMP_NUM_THREADS'] = '1' print (args.cuda) print (args.num_steps) print (args.num_processes) print (args.lr) print (args.eps) print (args.alpha) print (args.use_gae) print (args.gamma) print (args.tau) print (args.value_loss_coef) print (args.entropy_coef) # fsdaf # Create environment envs = SubprocVecEnv([ make_env(args.env_name, args.seed, i, args.log_dir) for i in range(args.num_processes) ]) obs_shape = envs.observation_space.shape obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:]) if len(envs.observation_space.shape) == 3: actor_critic = CNNPolicy(obs_shape[0], envs.action_space) else: actor_critic = MLPPolicy(obs_shape[0], envs.action_space) if envs.action_space.__class__.__name__ == "Discrete": action_shape = 1 else: action_shape = envs.action_space.shape[0] # action_shape = action_shape # shape_dim0 = envs.observation_space.shape[0] # if args.cuda: # dtype = torch.cuda.FloatTensor # else: # dtype = torch.FloatTensor hparams = {'cuda':args.cuda, 'num_steps':args.num_steps, 'num_processes':args.num_processes, 'obs_shape':obs_shape, 'lr':args.lr, 'eps':args.eps, 'alpha':args.alpha, 'use_gae':args.use_gae, 'gamma':args.gamma, 'tau':args.tau, 'value_loss_coef':args.value_loss_coef, 'entropy_coef':args.entropy_coef} # Create agent # agent = a2c(envs, hparams) # rollouts = RolloutStorage(self.num_steps, self.num_processes, self.obs_shape, envs.action_space) #it has a self.state that is [steps, processes, obs] #steps is used to compute expected reward if args.cuda: actor_critic.cuda() # rollouts.cuda() optimizer = optim.RMSprop(actor_critic.parameters(), hparams['lr'], eps=hparams['eps'], alpha=hparams['alpha']) rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape, envs.action_space) # Init state current_state = torch.zeros(args.num_processes, *obs_shape)#.type(dtype) def update_current_state(state):#, shape_dim0): shape_dim0 = envs.observation_space.shape[0] state = torch.from_numpy(state).float() if args.num_stack > 1: current_state[:, :-shape_dim0] = current_state[:, shape_dim0:] current_state[:, -shape_dim0:] = state # return current_state state = envs.reset() update_current_state(state)#, shape_dim0) # agent.insert_first_state(current_state) rollouts.states[0].copy_(current_state) #set the first state to current state # These are used to compute average rewards for all processes. episode_rewards = torch.zeros([args.num_processes, 1]) final_rewards = torch.zeros([args.num_processes, 1]) if args.cuda: current_state = current_state.cuda()#type(dtype) # if args.cuda: rollouts.cuda() #Begin training start = time.time() for j in range(num_updates): for step in range(args.num_steps): # Act # action, value = agent.act(Variable(agent.rollouts.states[step], volatile=True)) value, action = actor_critic.act(Variable(rollouts.states[step], volatile=True)) cpu_actions = action.data.squeeze(1).cpu().numpy() # Observe reward and next state state, reward, done, info = envs.step(cpu_actions) # state:[nProcesss, ndims, height, width] # Record rewards # reward, masks, final_rewards, episode_rewards, current_state = update_rewards(reward, done, final_rewards, episode_rewards, current_state) reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float() episode_rewards += reward # If done then clean the history of observations. # these final rewards are only used for printing. 
but the mask is used in the storage, dont know why yet # oh its just clearing the env that finished, and resetting its episode_reward masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) #if an env is done final_rewards *= masks final_rewards += (1 - masks) * episode_rewards episode_rewards *= masks if args.cuda: masks = masks.cuda() if current_state.dim() == 4: current_state *= masks.unsqueeze(2).unsqueeze(2) else: current_state *= masks # return reward, masks, final_rewards, episode_rewards, current_state # Update state update_current_state(state)#, shape_dim0) # Agent record step # agent.insert_data(step, current_state, action.data, value.data, reward, masks) rollouts.insert(step, current_state, action.data, value.data, reward, masks) #Optimize agent # agent.update() next_value = actor_critic(Variable(rollouts.states[-1], volatile=True))[0].data # use last state to make prediction of next value if hasattr(actor_critic, 'obs_filter'): actor_critic.obs_filter.update(rollouts.states[:-1].view(-1, *obs_shape)) #not sure what this is rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau) # this computes R = r + r+ ...+ V(t) for each step values, action_log_probs, dist_entropy = actor_critic.evaluate_actions( Variable(rollouts.states[:-1].view(-1, *obs_shape)), Variable(rollouts.actions.view(-1, action_shape))) # I think this aciton log prob could have been computed and stored earlier # and didnt we already store the value prediction??? values = values.view(args.num_steps, args.num_processes, 1) action_log_probs = action_log_probs.view(args.num_steps, args.num_processes, 1) advantages = Variable(rollouts.returns[:-1]) - values value_loss = advantages.pow(2).mean() action_loss = -(Variable(advantages.data) * action_log_probs).mean() optimizer.zero_grad() (value_loss * args.value_loss_coef + action_loss - dist_entropy * args.entropy_coef).backward() optimizer.step() rollouts.states[0].copy_(rollouts.states[-1]) # the first state is now the last state of the previous # #Save model # if j % args.save_interval == 0 and args.save_dir != "": # save_path = os.path.join(args.save_dir, args.algo) # try: # os.makedirs(save_path) # except OSError: # pass # # A really ugly way to save a model to CPU # save_model = actor_critic # if args.cuda: # save_model = copy.deepcopy(actor_critic).cpu() # torch.save(save_model, os.path.join(save_path, args.env_name + ".pt")) #Print updates if j % args.log_interval == 0: end = time.time() total_num_steps = (j + 1) * args.num_processes * args.num_steps # print("Updates {}, n_timesteps {}, FPS {}, mean/median R {:.1f}/{:.1f}, min/max R {:.1f}/{:.1f}, T:{:.4f}".#, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}". # format(j, total_num_steps, # int(total_num_steps / (end - start)), # final_rewards.mean(), # final_rewards.median(), # final_rewards.min(), # final_rewards.max(), # end - start))#, -dist_entropy.data[0], # # value_loss.data[0], action_loss.data[0])) # print("Upts {}, n_timesteps {}, min/med/mean/max {:.1f}/{:.1f}/{:.1f}/{:.1f}, FPS {}, T:{:.1f}". # format(j, total_num_steps, # final_rewards.min(), # final_rewards.median(), # final_rewards.mean(), # final_rewards.max(), # int(total_num_steps / (end - start)), # end - start)) if j % (args.log_interval*30) == 0: print("Upts, n_timesteps, min/med/mean/max, FPS, Time") print("{}, {}, {:.1f}/{:.1f}/{:.1f}/{:.1f}, {}, {:.1f}". 
format(j, total_num_steps, final_rewards.min(), final_rewards.median(), final_rewards.mean(), final_rewards.max(), int(total_num_steps / (end - start)), end - start))
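# The three-line mask dance ("final_rewards *= masks", "final_rewards += (1 - masks) *
# episode_rewards", "episode_rewards *= masks") recurs in every training loop above and
# is what the comment "oh its just clearing the env that finished" refers to. A small
# sketch of the same bookkeeping as a helper (hypothetical name), operating in place on
# tensors of shape [num_processes, 1]:
def update_reward_trackers(episode_rewards, final_rewards, reward, masks):
    episode_rewards += reward                        # accumulate the running return per worker
    final_rewards *= masks                           # keep previous totals for unfinished workers
    final_rewards += (1 - masks) * episode_rewards   # log the total of workers that just finished
    episode_rewards *= masks                         # reset accumulators for finished workers
    return final_rewards, episode_rewards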
def main(): os.environ['OMP_NUM_THREADS'] = '1' if args.vis: from visdom import Visdom viz = Visdom() win = None envs = [ make_env(args.env_name, args.seed, i, args.log_dir, args.start_container) for i in range(args.num_processes) ] test_envs = [ make_env(args.env_name, args.seed, i, args.log_dir, args.start_container) for i in range(args.num_processes) ] if args.num_processes > 1: envs = SubprocVecEnv(envs) test_envs = SubprocVecEnv(test_envs) else: envs = DummyVecEnv(envs) test_envs = DummyVecEnv(test_envs) obs_shape = envs.observation_space.shape obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:]) if args.saved_encoder_model: obs_shape = (args.num_stack, args.latent_space_size) obs_numel = reduce(operator.mul, obs_shape, 1) if len(obs_shape) == 3 and obs_numel > 1024: actor_critic = CNNPolicy(obs_shape[0], envs.action_space, args.recurrent_policy) else: assert not args.recurrent_policy, \ "Recurrent policy is not implemented for the MLP controller" actor_critic = MLPPolicy(obs_numel, envs.action_space) modelSize = 0 for p in actor_critic.parameters(): pSize = reduce(operator.mul, p.size(), 1) modelSize += pSize print(str(actor_critic)) print('Total model size: %d' % modelSize) if envs.action_space.__class__.__name__ == "Discrete": action_shape = 1 else: action_shape = envs.action_space.shape[0] if args.resume_experiment: print("\n############## Loading saved model ##############\n") actor_critic, ob_rms = torch.load( os.path.join(save_path, args.env_name + args.save_tag + ".pt")) tr.load(os.path.join(log_path, args.env_name + args.save_tag + ".p")) if args.cuda: actor_critic.cuda() if args.algo == 'a2c': optimizer = optim.RMSprop(actor_critic.parameters(), args.lr, eps=args.eps, alpha=args.alpha) elif args.algo == 'ppo': optimizer = optim.Adam(actor_critic.parameters(), args.lr, eps=args.eps) elif args.algo == 'acktr': optimizer = KFACOptimizer(actor_critic) print(obs_shape) rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape, envs.action_space, actor_critic.state_size) rollouts_test = RolloutStorage(args.num_steps_test, args.num_processes, obs_shape, envs.action_space, actor_critic.state_size) current_obs = torch.zeros(args.num_processes, *obs_shape) current_obs_test = torch.zeros(args.num_processes, *obs_shape) def update_current_obs(obs, test=False): shape_dim0 = envs.observation_space.shape[0] if args.saved_encoder_model: shape_dim0 = 1 obs, _ = vae.encode(Variable(torch.cuda.FloatTensor(obs))) obs = obs.data.cpu().numpy() obs = torch.from_numpy(obs).float() if not test: if args.num_stack > 1: current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:] current_obs[:, -shape_dim0:] = obs else: if args.num_stack > 1: current_obs_test[:, : -shape_dim0] = current_obs_test[:, shape_dim0:] current_obs_test[:, -shape_dim0:] = obs obs = envs.reset() update_current_obs(obs) rollouts.observations[0].copy_(current_obs) # These variables are used to compute average rewards for all processes. 
episode_rewards = torch.zeros([args.num_processes, 1]) final_rewards = torch.zeros([args.num_processes, 1]) reward_avg = 0 if args.cuda: current_obs = current_obs.cuda() current_obs_test = current_obs_test.cuda() rollouts.cuda() rollouts_test.cuda() start = time.time() for j in range(num_updates): for step in range(args.num_steps): # Sample actions value, action, action_log_prob, states = actor_critic.act( Variable(rollouts.observations[step], volatile=True), Variable(rollouts.states[step], volatile=True), Variable(rollouts.masks[step], volatile=True)) cpu_actions = action.data.squeeze(1).cpu().numpy() # Observation, reward and next obs obs, reward, done, info = envs.step(cpu_actions) # Maxime: clip the reward within [0,1] for more reliable training # This code deals poorly with large reward values reward = np.clip(reward, a_min=0, a_max=None) / 400 reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float() episode_rewards += reward # If done then clean the history of observations. masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) final_rewards *= masks final_rewards += (1 - masks) * episode_rewards episode_rewards *= masks tr.episodes_done += args.num_processes - masks.sum() if args.cuda: masks = masks.cuda() if current_obs.dim() == 4: current_obs *= masks.unsqueeze(2).unsqueeze(2) else: current_obs *= masks update_current_obs(obs) rollouts.insert(step, current_obs, states.data, action.data, action_log_prob.data, value.data, reward, masks) next_value = actor_critic( Variable(rollouts.observations[-1], volatile=True), Variable(rollouts.states[-1], volatile=True), Variable(rollouts.masks[-1], volatile=True))[0].data rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau) tr.iterations_done += 1 if args.algo in ['a2c', 'acktr']: values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions( Variable(rollouts.observations[:-1].view(-1, *obs_shape)), Variable(rollouts.states[0].view(-1, actor_critic.state_size)), Variable(rollouts.masks[:-1].view(-1, 1)), Variable(rollouts.actions.view(-1, action_shape))) values = values.view(args.num_steps, args.num_processes, 1) action_log_probs = action_log_probs.view(args.num_steps, args.num_processes, 1) advantages = Variable(rollouts.returns[:-1]) - values value_loss = advantages.pow(2).mean() action_loss = -(Variable(advantages.data) * action_log_probs).mean() if args.algo == 'acktr' and optimizer.steps % optimizer.Ts == 0: # Sampled fisher, see Martens 2014 actor_critic.zero_grad() pg_fisher_loss = -action_log_probs.mean() value_noise = Variable(torch.randn(values.size())) if args.cuda: value_noise = value_noise.cuda() sample_values = values + value_noise vf_fisher_loss = -(values - Variable(sample_values.data)).pow(2).mean() fisher_loss = pg_fisher_loss + vf_fisher_loss optimizer.acc_stats = True fisher_loss.backward(retain_graph=True) optimizer.acc_stats = False optimizer.zero_grad() (value_loss * args.value_loss_coef + action_loss - dist_entropy * args.entropy_coef).backward() if args.algo == 'a2c': nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm) optimizer.step() elif args.algo == 'ppo': advantages = rollouts.returns[:-1] - rollouts.value_preds[:-1] advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-5) for e in range(args.ppo_epoch): if args.recurrent_policy: data_generator = rollouts.recurrent_generator( advantages, args.num_mini_batch) else: data_generator = rollouts.feed_forward_generator( advantages, args.num_mini_batch) for sample in 
data_generator: observations_batch, states_batch, actions_batch, \ return_batch, masks_batch, old_action_log_probs_batch, \ adv_targ = sample # Reshape to do in a single forward pass for all steps values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions( Variable(observations_batch), Variable(states_batch), Variable(masks_batch), Variable(actions_batch)) adv_targ = Variable(adv_targ) ratio = torch.exp(action_log_probs - Variable(old_action_log_probs_batch)) surr1 = ratio * adv_targ surr2 = torch.clamp(ratio, 1.0 - args.clip_param, 1.0 + args.clip_param) * adv_targ action_loss = -torch.min( surr1, surr2).mean() # PPO's pessimistic surrogate (L^CLIP) value_loss = (Variable(return_batch) - values).pow(2).mean() optimizer.zero_grad() (value_loss + action_loss - dist_entropy * args.entropy_coef).backward() nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm) optimizer.step() rollouts.after_update() if j % args.save_interval == 0 and args.save_dir != "": # A really ugly way to save a model to CPU save_model = actor_critic if args.cuda: save_model = copy.deepcopy(actor_critic).cpu() save_model = [ save_model, hasattr(envs, 'ob_rms') and envs.ob_rms or None ] torch.save( save_model, os.path.join(save_path, args.env_name + args.save_tag + ".pt")) total_test_reward_list = [] step_test_list = [] for _ in range(args.num_tests): test_obs = test_envs.reset() update_current_obs(test_obs, test=True) rollouts_test.observations[0].copy_(current_obs_test) step_test = 0 total_test_reward = 0 while step_test < args.num_steps_test: value_test, action_test, action_log_prob_test, states_test = actor_critic.act( Variable(rollouts_test.observations[step_test], volatile=True), Variable(rollouts_test.states[step_test], volatile=True), Variable(rollouts_test.masks[step_test], volatile=True)) cpu_actions_test = action_test.data.squeeze( 1).cpu().numpy() # Observation, reward and next obs obs_test, reward_test, done_test, info_test = test_envs.step( cpu_actions_test) # masks here doesn't really matter, but still masks_test = torch.FloatTensor( [[0.0] if done_test_ else [1.0] for done_test_ in done_test]) # Maxime: clip the reward within [0,1] for more reliable training # This code deals poorly with large reward values reward_test = np.clip(reward_test, a_min=0, a_max=None) / 400 total_test_reward += reward_test[0] reward_test = torch.from_numpy( np.expand_dims(np.stack(reward_test), 1)).float() update_current_obs(obs_test) rollouts_test.insert(step_test, current_obs_test, states_test.data, action_test.data, action_log_prob_test.data,\ value_test.data, reward_test, masks_test) step_test += 1 if done_test: break #rollouts_test.reset() # Need to reinitialise with .cuda(); don't forget total_test_reward_list.append(total_test_reward) step_test_list.append(step_test) append_to(tr.test_reward, tr, sum(total_test_reward_list) / args.num_tests) append_to(tr.test_episode_len, tr, sum(step_test_list) / args.num_tests) logger.log_scalar_rl( "test_reward", tr.test_reward[0], args.sliding_wsize, [tr.episodes_done, tr.global_steps_done, tr.iterations_done]) logger.log_scalar_rl( "test_episode_len", tr.test_episode_len[0], args.sliding_wsize, [tr.episodes_done, tr.global_steps_done, tr.iterations_done]) # Saving all the MyContainer variables tr.save( os.path.join(log_path, args.env_name + args.save_tag + ".p")) if j % args.log_interval == 0: reward_avg = 0.99 * reward_avg + 0.01 * final_rewards.mean() end = time.time() tr.global_steps_done = (j + 1) * args.num_processes * args.num_steps print( 
"Updates {}, num timesteps {}, FPS {}, running avg reward {:.3f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}" .format(j, tr.global_steps_done, int(tr.global_steps_done / (end - start)), reward_avg, dist_entropy.data[0], value_loss.data[0], action_loss.data[0])) append_to(tr.pg_loss, tr, action_loss.data[0]) append_to(tr.val_loss, tr, value_loss.data[0]) append_to(tr.entropy_loss, tr, dist_entropy.data[0]) append_to(tr.train_reward_avg, tr, reward_avg) logger.log_scalar_rl( "train_pg_loss", tr.pg_loss[0], args.sliding_wsize, [tr.episodes_done, tr.global_steps_done, tr.iterations_done]) logger.log_scalar_rl( "train_val_loss", tr.val_loss[0], args.sliding_wsize, [tr.episodes_done, tr.global_steps_done, tr.iterations_done]) logger.log_scalar_rl( "train_entropy_loss", tr.entropy_loss[0], args.sliding_wsize, [tr.episodes_done, tr.global_steps_done, tr.iterations_done]) logger.log_scalar_rl( "train_reward_avg", tr.train_reward_avg[0], args.sliding_wsize, [tr.episodes_done, tr.global_steps_done, tr.iterations_done]) """ print("Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}". format( j, total_num_steps, int(total_num_steps / (end - start)), final_rewards.mean(), final_rewards.median(), final_rewards.min(), final_rewards.max(), dist_entropy.data[0], value_loss.data[0], action_loss.data[0]) ) """ if args.vis and j % args.vis_interval == 0: try: # Sometimes monitor doesn't properly flush the outputs win = visdom_plot(viz, win, args.log_dir, args.env_name, args.algo) except IOError: pass