Example no. 1
import gym
import pandas as pd
import yaml

# FILENAME, ActorCritic and plot_graphs are defined or imported elsewhere in the project.


def run_tests():
    """ Runs tests from .yaml file, saves results plots and .csv file.

        Args:
            None.

        Returns:
            results: Test results dataframe. 

    """
    with open(FILENAME) as file:

        # Loads the test hyper-parameters as dictionaries.
        tests = yaml.safe_load(file)

    # Create a dataframe to hold the results.
    test_dict = tests['Tests']
    results = pd.DataFrame(test_dict)
    results["Episode"] = ""
    results['Max average score'] = ""

    for i, test in enumerate(tests['Tests']):

        env = gym.make(test['env'])
        env.reset()

        actor_critic = ActorCritic(env, test['episodes'], test['max_score'],
                                   test['hidden_size'], test['gamma'],
                                   test['save'])

        # Run training.
        best_score, episode, rew_hist = actor_critic.train()

        results.loc[i, 'Episode'] = episode
        results.loc[i, 'Max average score'] = best_score

        plot_graphs(test, rew_hist)

        # Save results to a csv file after every test.
        filename = 'results/test_table.csv'
        results.to_csv(filename)

    return results
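
For reference, here is a minimal sketch of the .yaml structure that run_tests() expects, reconstructed from the keys the function reads ('Tests', 'env', 'episodes', 'max_score', 'hidden_size', 'gamma', 'save'); the environment id and the numbers below are placeholder assumptions, not values from the source.

import yaml

example_config = """
Tests:
  - env: CartPole-v1   # placeholder Gym environment id
    episodes: 1000     # the numeric values here are placeholders
    max_score: 475
    hidden_size: 128
    gamma: 0.99
    save: False
"""

tests = yaml.safe_load(example_config)
print(tests['Tests'][0]['env'])  # -> CartPole-v1
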
Example no. 2
import math

import torch
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable

# Project-local modules referenced below; ActorCritic is defined elsewhere in the project.
import learning_env
import librarian


def train(args, dynet):
    torch.manual_seed(args.seed)

    embedding_size = args.embedding_size

    lstm_size = args.lstm_size

    num_modules = len(dynet.library) + 1

    libr = librarian.SimpleLibrarian(num_modules, embedding_size)
    print(type(libr))
    model = ActorCritic(num_modules, libr, lstm_size)

    env = learning_env.Environment(args, dynet, libr)

    optimizer = optim.Adam(model.parameters(), lr=args.ac_lr)

    model.train()

    values = []
    log_probs = []

    state = env.reset()
    #state = torch.from_numpy(state)
    done = True

    episode_length = 0
    while True:
        episode_length += 1
        # Sync with the shared model
        # model.load_state_dict(shared_model.state_dict())
        if done:
            cx = Variable(torch.zeros(1, lstm_size))
            hx = Variable(torch.zeros(1, lstm_size))
        else:
            cx = Variable(cx.data)
            hx = Variable(hx.data)

        values = []
        log_probs = []
        rewards = []
        entropies = []

        for step in range(args.num_steps):
            value, logit, (hx, cx) = model((state.unsqueeze(0)), (hx, cx))
            prob = F.softmax(logit, dim=1)
            log_prob = F.log_softmax(logit, dim=1)
            entropy = -(log_prob * prob).sum(1)
            entropies.append(entropy)

            action = prob.multinomial(num_samples=1).data
            log_prob = log_prob.gather(1, Variable(action))

            state, reward, done, _ = env.step(action.numpy()[0, 0])
            done = done or episode_length >= args.num_steps
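            # Clip the reward to [-1, 1].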
            reward = max(min(reward, 1), -1)

            if done:
                episode_length = 0
                state = env.reset()

            #state = torch.from_numpy(state)
            values.append(value)
            log_probs.append(log_prob)
            rewards.append(reward)

            if done:
                break

        R = torch.zeros(1, 1)
        if not done:
            value, _, _ = model((state.unsqueeze(0)), (hx, cx))
            R = value.data

        values.append(Variable(R))
        policy_loss = 0
        value_loss = 0
        R = Variable(R)
        gae = torch.zeros(1, 1)
        for i in reversed(range(len(rewards))):
            R = args.gamma * R + rewards[i]
            advantage = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation:
            # delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)
            # gae_t = delta_t + gamma * tau * gae_{t+1}
            delta_t = rewards[i] + args.gamma * \
                values[i + 1].data - values[i].data
            gae = gae * args.gamma * args.tau + delta_t

            policy_loss = policy_loss - \
                log_probs[i] * Variable(gae) - 0.01 * entropies[i]

        optimizer.zero_grad()
        (policy_loss + 0.5 * value_loss).backward()

        # Clip gradients to a maximum global L2 norm of 40.
        global_norm = 0
        for param in model.parameters():
            global_norm += param.grad.data.pow(2).sum()
        global_norm = math.sqrt(global_norm)
        ratio = 40 / global_norm
        if ratio < 1:
            for param in model.parameters():
                param.grad.data.mul_(ratio)
        optimizer.step()
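
The manual global-norm block at the end of train() does the same job as torch.nn.utils.clip_grad_norm_. A minimal self-contained sketch of the equivalent call, assuming the same maximum norm of 40 (the toy network and loss are illustrative only):

import torch
import torch.nn as nn

net = nn.Linear(4, 2)
loss = net(torch.randn(1, 4)).sum()
loss.backward()
# Rescales all gradients in place so their combined L2 norm is at most 40,
# mirroring the manual global_norm / ratio logic above.
torch.nn.utils.clip_grad_norm_(net.parameters(), max_norm=40)
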
Example no. 3
import torch
import torch.nn as nn

# ActorCritic is defined or imported elsewhere in the project.


class PPO(nn.Module):
    def __init__(self,
                 state_dim,
                 action_dim,
                 eps=0.2,
                 gamma=0.99,
                 lambda_=0.95,
                 K_epoch=80,
                 batch_size=64):
        super(PPO, self).__init__()
        self.eps = eps
        self.gamma = gamma
        self.lambda_ = lambda_
        self.K_epoch = K_epoch
        self.batch_size = batch_size

        self.model = ActorCritic(state_dim, action_dim)
        self.model_old = ActorCritic(state_dim, action_dim)
        for param in self.model_old.parameters():
            param.requires_grad = False
        self.copy_weights()

    def forward(self, x):
        self.pi, self.v = self.model_old(x)

        return self.pi, self.v

    def copy_weights(self):
        self.model_old.load_state_dict(self.model.state_dict())

    def update(self, buffer, optimizer):
        self.model.train()
        self.model_old.eval()
        self.advantage_fcn(buffer.data)

        batch_loss, batch_clip_loss, batch_vf_loss = [], [], []
        for epoch in range(self.K_epoch):
            for state, action, next_s, reward, log_prob_old, entropy, advantage in buffer.get_data(
                    self.batch_size):
                pi, v = self.model(state)
                log_prob_pi = pi.log_prob(action)

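                # Importance ratio pi_theta(a|s) / pi_theta_old(a|s), computed in log space.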
                prob_ratio = torch.exp(log_prob_pi - log_prob_old)

                first_term = prob_ratio * advantage
                second_term = self.clip_by_value(prob_ratio) * advantage
                loss_clip = (torch.min(first_term, second_term)).mean()

                _, v_next = self.model_old(next_s)
                v_target = reward + self.gamma * v_next
                # Squared error loss: (v(s_t) - v_target)**2
                loss_vf = ((v - v_target)**2).mean()

                # Alternative: -(loss_clip - 0.5*loss_vf + 0.01*entropy.mean())
                loss = -(loss_clip - loss_vf)

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                batch_loss.append(loss.detach().numpy())
                batch_clip_loss.append(loss_clip.detach().numpy())
                batch_vf_loss.append(loss_vf.detach().numpy())

        self.copy_weights()
        buffer.reset()

    def advantage_fcn(self, buffer, normalize=True):
        _, v_st1 = self.model(torch.stack(buffer['next_s']))
        _, v_s = self.model(torch.stack(buffer['s']))
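        # TD residuals: delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)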
        deltas = torch.stack(buffer['r']) + self.gamma * v_st1 - v_s

        advantage, temp = [], 0
        idxs = torch.tensor(range(len(deltas) - 1, -1, -1))  # reverse order
        reverse_deltas = deltas.index_select(0, idxs)
        for delta_t in reverse_deltas:
            temp = delta_t + self.lambda_ * self.gamma * temp
            advantage.append(temp)

        advantage = torch.as_tensor(advantage[::-1])  # re-reverse to chronological order
        if normalize:
            advantage = (advantage - advantage.mean()) / advantage.std()

        buffer['advantage'] = advantage.unsqueeze(1)

    def clip_by_value(self, x):
        return x.clamp(1 - self.eps, 1 + self.eps)  # clamp(min, max)
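
A usage sketch, not taken from the source: it assumes the ActorCritic(state_dim, action_dim) module referenced above is importable, and a rollout buffer exposing .data (a dict keyed by 's', 'next_s', 'r', ...), .get_data(batch_size) and .reset(), which is the interface update() and advantage_fcn() rely on; the dimensions and learning rate are placeholders.

import torch.optim as optim

ppo = PPO(state_dim=4, action_dim=2)
optimizer = optim.Adam(ppo.model.parameters(), lr=3e-4)

# Collect a rollout with the old policy (forward() evaluates model_old):
#   pi, v = ppo(state)
#   action = pi.sample()
#   log_prob_old = pi.log_prob(action)
# store the transitions in the buffer, then run the clipped update:
#   ppo.update(buffer, optimizer)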