import gym
import pandas as pd
import yaml

# FILENAME, ActorCritic and plot_graphs are defined elsewhere in this module.


def run_tests():
    """
    Runs the tests defined in the .yaml file, saves result plots and a .csv file.

    Args:
        None.

    Returns:
        results: Test results dataframe.
    """
    with open(FILENAME) as file:
        # Load the test hyper-parameters as a list of dictionaries.
        tests = yaml.safe_load(file)

    # Create a dataframe to keep the results.
    test_dict = tests['Tests']
    results = pd.DataFrame(test_dict)
    results['Episode'] = ""
    results['Max average score'] = ""

    for i, test in enumerate(tests['Tests']):
        env = gym.make(test['env'])
        env.reset()
        actor_critic = ActorCritic(env, test['episodes'], test['max_score'],
                                   test['hidden_size'], test['gamma'],
                                   test['save'])

        # Run training and record the best average score and the episode it was reached.
        best_score, episode, rew_hist = actor_critic.train()
        results.loc[i, 'Episode'] = episode
        results.loc[i, 'Max average score'] = best_score
        plot_graphs(test, rew_hist)

    # Save results to a csv file.
    filename = 'results/test_table.csv'
    results.to_csv(filename)

    return results
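# A minimal usage sketch for run_tests(). Assumptions: FILENAME points at a
# YAML file shaped like EXAMPLE_CONFIG below (the schema is inferred from the
# keys accessed above), and the environment id and hyper-parameter values are
# hypothetical placeholders.
import yaml

EXAMPLE_CONFIG = """
Tests:
  - env: CartPole-v1        # hypothetical Gym environment id
    episodes: 1000
    max_score: 475
    hidden_size: 128
    gamma: 0.99
    save: true
"""


def preview_tests(config_text=EXAMPLE_CONFIG):
    """Parse a test config and print the hyper-parameters run_tests() would use."""
    tests = yaml.safe_load(config_text)
    for test in tests['Tests']:
        print(test['env'], test['episodes'], test['gamma'])

# preview_tests()
# results = run_tests()   # reads FILENAME and runs every configured test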
import math

import torch
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable

# librarian, learning_env and ActorCritic are provided by the surrounding project.


def train(args, dynet):
    torch.manual_seed(args.seed)

    embedding_size = args.embedding_size
    lstm_size = args.lstm_size
    num_modules = len(dynet.library) + 1

    libr = librarian.SimpleLibrarian(num_modules, embedding_size)
    print(type(libr))
    model = ActorCritic(num_modules, libr, lstm_size)
    env = learning_env.Environment(args, dynet, libr)

    optimizer = optim.Adam(model.parameters(), lr=args.ac_lr)

    model.train()

    values = []
    log_probs = []

    state = env.reset()
    # state = torch.from_numpy(state)
    done = True

    episode_length = 0
    while True:
        episode_length += 1
        # Sync with the shared model
        # model.load_state_dict(shared_model.state_dict())
        if done:
            cx = Variable(torch.zeros(1, lstm_size))
            hx = Variable(torch.zeros(1, lstm_size))
        else:
            cx = Variable(cx.data)
            hx = Variable(hx.data)

        values = []
        log_probs = []
        rewards = []
        entropies = []

        # Roll out up to num_steps actions with the current policy.
        for step in range(args.num_steps):
            value, logit, (hx, cx) = model((state.unsqueeze(0)), (hx, cx))
            prob = F.softmax(logit, dim=-1)
            log_prob = F.log_softmax(logit, dim=-1)
            entropy = -(log_prob * prob).sum(1)
            entropies.append(entropy)

            action = prob.multinomial(num_samples=1).data
            log_prob = log_prob.gather(1, Variable(action))

            state, reward, done, _ = env.step(action.numpy()[0, 0])
            done = done or episode_length >= args.num_steps
            reward = max(min(reward, 1), -1)  # clip the reward to [-1, 1]

            if done:
                episode_length = 0
                state = env.reset()
                # state = torch.from_numpy(state)

            values.append(value)
            log_probs.append(log_prob)
            rewards.append(reward)

            if done:
                break

        # Bootstrap the return from the last state if the episode did not end.
        R = torch.zeros(1, 1)
        if not done:
            value, _, _ = model((state.unsqueeze(0)), (hx, cx))
            R = value.data

        values.append(Variable(R))
        policy_loss = 0
        value_loss = 0
        R = Variable(R)
        gae = torch.zeros(1, 1)
        for i in reversed(range(len(rewards))):
            R = args.gamma * R + rewards[i]
            advantage = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = rewards[i] + args.gamma * \
                values[i + 1].data - values[i].data
            gae = gae * args.gamma * args.tau + delta_t

            policy_loss = policy_loss - \
                log_probs[i] * Variable(gae) - 0.01 * entropies[i]

        optimizer.zero_grad()

        (policy_loss + 0.5 * value_loss).backward()

        # Global gradient-norm clipping at a maximum norm of 40.
        global_norm = 0
        for param in model.parameters():
            global_norm += param.grad.data.pow(2).sum()
        global_norm = math.sqrt(global_norm)
        ratio = 40 / global_norm
        if ratio < 1:
            for param in model.parameters():
                param.grad.data.mul_(ratio)

        optimizer.step()
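# A minimal invocation sketch for train(). Assumptions: the attribute names
# below are exactly the hyper-parameters train() reads from `args`; the values
# are hypothetical defaults, and `dynet` (which must expose a .library
# sequence) comes from the surrounding project, so the call at the bottom is
# illustrative only.
from types import SimpleNamespace


def make_default_args():
    """Collect the hyper-parameters train() expects into one namespace."""
    return SimpleNamespace(
        seed=1,              # torch.manual_seed(args.seed)
        embedding_size=32,   # module-embedding width for the librarian
        lstm_size=128,       # hidden size of the actor-critic LSTM
        ac_lr=1e-4,          # Adam learning rate
        num_steps=20,        # rollout length before each update
        gamma=0.99,          # discount factor
        tau=1.00,            # GAE decay parameter
    )

# args = make_default_args()
# train(args, dynet)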
import torch
import torch.nn as nn

# ActorCritic is defined elsewhere in this project and returns
# (action distribution, state value) for a batch of states.


class PPO(nn.Module):
    def __init__(self, state_dim, action_dim, eps=0.2, gamma=0.99,
                 lambda_=0.95, K_epoch=80, batch_size=64):
        super(PPO, self).__init__()
        self.eps = eps
        self.gamma = gamma
        self.lambda_ = lambda_
        self.K_epoch = K_epoch
        self.batch_size = batch_size

        self.model = ActorCritic(state_dim, action_dim)
        self.model_old = ActorCritic(state_dim, action_dim)
        for param in self.model_old.parameters():
            param.requires_grad = False
        self.copy_weights()

    def forward(self, x):
        self.pi, self.v = self.model_old(x)
        return self.pi, self.v

    def copy_weights(self):
        # Sync the frozen old policy with the current policy.
        self.model_old.load_state_dict(self.model.state_dict())

    def update(self, buffer, optimizer):
        self.model.train()
        self.model_old.eval()

        self.advantage_fcn(buffer.data)
        batch_loss, batch_clip_loss, batch_vf_loss = [], [], []
        for epoch in range(self.K_epoch):
            for state, action, next_s, reward, log_prob_old, entropy, advantage in \
                    buffer.get_data(self.batch_size):
                pi, v = self.model(state)
                log_prob_pi = pi.log_prob(action)
                prob_ratio = torch.exp(log_prob_pi - log_prob_old)

                first_term = prob_ratio * advantage
                second_term = self.clip_by_value(prob_ratio) * advantage
                loss_clip = (torch.min(first_term, second_term)).mean()

                _, v_next = self.model_old(next_s)
                v_target = reward + self.gamma * v_next
                loss_vf = ((v - v_target)**2).mean()  # squared error loss: (v(s_t) - v_target)**2

                loss = -(loss_clip - loss_vf)  # -(loss_clip - 0.5*loss_vf + 0.01*entropy.mean())

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                batch_loss.append(loss.detach().numpy())
                batch_clip_loss.append(loss_clip.detach().numpy())
                batch_vf_loss.append(loss_vf.detach().numpy())

        self.copy_weights()
        buffer.reset()

    def advantage_fcn(self, buffer, normalize=True):
        _, v_st1 = self.model(torch.stack(buffer['next_s']))
        _, v_s = self.model(torch.stack(buffer['s']))
        deltas = torch.stack(buffer['r']) + self.gamma * v_st1 - v_s

        # Accumulate GAE by running over the TD errors in reverse time order.
        advantage, temp = [], 0
        idxs = torch.tensor(range(len(deltas) - 1, -1, -1))  # reverse
        reverse_deltas = deltas.index_select(0, idxs)
        for delta_t in reverse_deltas:
            temp = delta_t + self.lambda_ * self.gamma * temp
            advantage.append(temp)

        advantage = torch.as_tensor(advantage[::-1])  # re-reverse
        if normalize:
            advantage = (advantage - advantage.mean()) / advantage.std()

        buffer['advantage'] = advantage.unsqueeze(1)

    def clip_by_value(self, x):
        return x.clamp(1 - self.eps, 1 + self.eps)  # clamp(min, max)
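# A minimal, illustrative training loop around PPO.update(). Assumptions: the
# environment follows the classic gym API (reset() -> obs, step() -> 4-tuple),
# the action space is discrete, and `buffer` is the project's rollout buffer;
# its `store` method here is hypothetical and only stands in for however the
# surrounding code fills buffer.data ('s', 'next_s', 'r', old log-probs, ...).
import torch
import torch.optim as optim


def train_ppo(env, ppo, buffer, n_updates=100, horizon=2048):
    optimizer = optim.Adam(ppo.model.parameters(), lr=3e-4)
    state = env.reset()
    for _ in range(n_updates):
        # Collect one horizon of experience with the frozen old policy.
        for _ in range(horizon):
            obs = torch.as_tensor(state, dtype=torch.float32)
            pi, v = ppo(obs)                      # forward() uses model_old
            action = pi.sample()
            next_state, reward, done, _ = env.step(action.item())
            buffer.store(obs, action, next_state, reward,
                         pi.log_prob(action), pi.entropy())
            state = env.reset() if done else next_state
        # One PPO update over the collected rollout; update() also re-syncs
        # model_old to model and resets the buffer.
        ppo.update(buffer, optimizer)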