Example 1
def adversarial_discriminator(discriminator, policy, adversarial_step, d_steps,
                              d_epochs):
    discriminator.reset()
    discriminator.train()
    policy.eval()

    store.set('Discriminator Loss', [],
              attributes=[store.PLOTTABLE],
              if_exists=False)
    store.set('Discriminator Accuracy', [],
              attributes=[store.PLOTTABLE],
              if_exists=False)

    num_samples = config.num_real_samples * 2 * d_steps  # equal amount of generated data
    data_loader = loader.prepare_loader(num_samples, policy)

    for epoch in range(d_epochs):
        store.set('Discriminator Loss Per Batch', [])
        store.set('Discriminator Accuracy Per Batch', [])

        print('Global Step {} - Discriminator Epoch {}'.format(
            adversarial_step, epoch))

        for images, labels in data_loader:
            images = images.to(config.device)
            labels = labels.to(config.device)

            discriminator.optimizer.zero_grad()
            outputs = discriminator(images)

            # output[:,0] P(x ~ real)
            # output[:,1] P(x ~ synthetic)

            loss = discriminator.criterion(outputs, labels.float())
            loss.backward()
            discriminator.optimizer.step()

            store.get('Discriminator Loss Per Batch').append(loss.item())
            store.get('Discriminator Accuracy Per Batch').append(
                torch.sum((outputs > 0.5) == (labels == 1)).item() /
                outputs.shape[0])

        loss = store.get('Discriminator Loss Per Batch')
        acc = store.get('Discriminator Accuracy Per Batch')
        store.get('Discriminator Loss').append(sum(loss) / len(loss))
        store.get('Discriminator Accuracy').append(sum(acc) / len(acc))
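
The bookkeeping above depends on the project's store, config, and loader modules, which are not shown here. As a rough, self-contained sketch of a single discriminator update with the same loss and accuracy bookkeeping (the TinyDiscriminator model and the toy data are made up for illustration, not part of the original code):

import torch
import torch.nn as nn

class TinyDiscriminator(nn.Module):  # hypothetical stand-in model
    def __init__(self, input_size=16):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(input_size, 32), nn.ReLU(),
                                 nn.Linear(32, 1), nn.Sigmoid())

    def forward(self, x):
        return self.net(x).squeeze(1)  # P(x ~ real)

discriminator = TinyDiscriminator()
optimizer = torch.optim.Adam(discriminator.parameters(), lr=1e-3)
criterion = nn.BCELoss()

# toy batch: 8 "real" and 8 "synthetic" samples
images = torch.randn(16, 16)
labels = torch.cat([torch.ones(8), torch.zeros(8)])

optimizer.zero_grad()
outputs = discriminator(images)
loss = criterion(outputs, labels)
loss.backward()
optimizer.step()

# same accuracy bookkeeping as in the example above
accuracy = torch.sum((outputs > 0.5) == (labels == 1)).item() / outputs.shape[0]
print(loss.item(), accuracy)
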
Example 2
def finish(policy, discriminator):
    """
    This function creates a directory with the current timestamp at the application path set in the configurations.
    All experimental result data of a run such as weight parameters and log files will be saved. Additionally 100
    example images and sequences will be saved in this directory.

    :param policy: The policy net used in the experiment and for which the example data should be generated.
    :param discriminator: The discriminating net used in the experiment.
    """

    folder = store.folder

    policy.save(folder + '/policy-net.pt')
    discriminator.save(folder + '/discriminator-net.pt')

    save_policy_examples(folder, policy)

    for tag, value in [(t, v) for (t, v) in store
                       if store.PLOTTABLE in store.attributes(t)]:
        path = '{}/{}'.format(folder, tag)
        plot_simple(path, value, tag, 'Steps', '', 'plot')

    os.makedirs(folder + '/policy_steps')

    for step, mean_policy in enumerate(
            store.get('List: Mean Policies Per Generator Step')):
        if step % 10 == 0:
            path = '{}/policy_steps/policy_step_{}'.format(folder, step)
            plot_simple(path, mean_policy,
                        'Generator Policy Step {}'.format(step), 'Tokens',
                        'Probabilities', 'bar')

    action_infos = store.get('List: Action Info Dicts')

    plot_action_infos(folder, action_infos, 10)
    plot_action_deltas(folder, action_infos, 10)

    plot_action_infos(folder, action_infos, 10, without_count=True)
    plot_action_deltas(folder, action_infos, 10, without_count=True)
    plot_action_deltas(folder,
                       action_infos,
                       10,
                       without_count=True,
                       with_last_reward=True)

    store.save()
    ray.shutdown()
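
plot_simple and save_policy_examples are project helpers that are not shown in these examples. Purely to illustrate the call signature used above (path, values, title, xlabel, ylabel, kind), a minimal matplotlib-based sketch could look like this; the actual implementation may differ:

import matplotlib.pyplot as plt

def plot_simple(path, values, title, xlabel, ylabel, kind):
    """Hypothetical sketch of the plotting helper; not the original code."""
    fig, ax = plt.subplots()
    if kind == 'bar':
        ax.bar(range(len(values)), values)
    else:  # 'plot'
        ax.plot(values)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    fig.savefig(path + '.png')
    plt.close(fig)
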
Example 3
def policy_gradient(policy):
    policy.optimizer.zero_grad()

    # weight state action values by log probability of action
    total = torch.zeros(config.batch_size, device=config.device)
    reward_with_log_prob = torch.zeros(config.batch_size, device=config.device)
    reward_without_log_prob = torch.zeros(config.batch_size,
                                          device=config.device)

    log_probs = store.get('List: Log Probabilites Per Actions Of Single Step')
    rewards = store.get('List: Rewards Per Single Step')

    assert len(rewards) == len(log_probs)
    assert all(tensor.size() == (config.batch_size, ) for tensor in log_probs)
    assert all(tensor.size() == (config.batch_size, ) for tensor in rewards)

    for log_prob, reward in zip(log_probs, rewards):
        total = total + (reward - config.g_baseline)
        reward_with_log_prob = reward_with_log_prob + (log_prob * total)
        reward_without_log_prob = reward_without_log_prob + total

    # average over batchsize
    reward_without_log_prob = torch.sum(
        reward_without_log_prob) / config.batch_size
    reward_with_log_prob = torch.sum(reward_with_log_prob) / config.batch_size

    # negate for gradient descent and subtract the entropy bonus
    entropies = store.get('List: Mean Entropies Per Single Step')
    entropy = 0.01 * sum(entropies) / len(entropies)

    loss = -(reward_with_log_prob + entropy)
    loss.backward()
    policy.optimizer.step()

    prediction = sum(rewards[-1]) / config.batch_size
    store_results(loss.item(), reward_without_log_prob.item(), entropy.item(),
                  prediction.item(), policy)
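
The update above is a REINFORCE-style policy gradient: per-step rewards minus a baseline are accumulated, weighted by the log probabilities of the sampled actions, averaged over the batch, and combined with an entropy bonus before the sign is flipped for gradient descent. A self-contained sketch of the same computation on toy tensors (the batch size, baseline, and entropy values are arbitrary; the 0.01 entropy weight matches the code above):

import torch

batch_size, num_steps, baseline = 4, 5, 0.5

# toy per-step data in the same shapes as the stored lists above
log_probs = [torch.randn(batch_size, requires_grad=True) for _ in range(num_steps)]
rewards = [torch.rand(batch_size) for _ in range(num_steps)]
entropies = [0.9, 0.8, 0.7, 0.6, 0.5]  # toy mean entropies per step

total = torch.zeros(batch_size)
reward_with_log_prob = torch.zeros(batch_size)
for log_prob, reward in zip(log_probs, rewards):
    total = total + (reward - baseline)  # running return minus baseline
    reward_with_log_prob = reward_with_log_prob + log_prob * total

reward_with_log_prob = torch.sum(reward_with_log_prob) / batch_size  # average over the batch
entropy = 0.01 * sum(entropies) / len(entropies)  # entropy bonus

loss = -(reward_with_log_prob + entropy)  # negate for gradient descent
loss.backward()
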
Example 4
def step(policy, batch, hidden, save_prob=False):
    """
    This function performs a single step on the given policy net give a batch of unfinished subsequences.

    :param policy: The policy net which guides the decision making process.
    :param batch: The batch of input sequences size (batch size, sequence length, onehot length).
    :param hidden: The hidden state of the policy net.
    :param save_prob: If true, the probabilities for the chosen action will be saved for the policy net. Should be true
        if it is a step in the policy net training and false if it is a rollout or sample step.
    :return: Returns batch, hidden with the new encoding tensors for the chosen actions.
    """

    # avoid feeding whole sequences redundantly
    state = batch[:, -1, :][:, None, :]
    policies, hidden = policy(state, hidden)

    # sample next actions
    distributions = torch.distributions.Categorical(policies)
    actions = distributions.sample()

    # save log probabilities for gradient computation
    if save_prob:
        store.get('List: Mean Policies Per Single Step').append(
            torch.mean(policies, dim=0))
        store.get('List: Mean Entropies Per Single Step').append(
            torch.mean(distributions.entropy(), dim=0))
        store.get('List: Sampled Actions Per Single Step').append(actions)
        store.get('List: Log Probabilites Per Actions Of Single Step').append(
            distributions.log_prob(actions))

    # concat onehot tokens with the batch of sequences
    encodings = torch.tensor([tokens.onehot(id) for id in actions],
                             device=config.device)
    encodings = encodings[:, None, :].float()
    batch = torch.cat((batch, encodings), dim=1)

    # if batch still has the empty start token remove it
    if torch.sum(batch[:, 0, :]) == 0:
        batch = batch[:, 1:, :]

    return batch, hidden
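
Token sampling and the growing sequence batch rely on the project's tokens.onehot helper and config module. A self-contained sketch of the same pattern, using torch.distributions.Categorical together with torch.nn.functional.one_hot instead of tokens.onehot (the vocabulary size and shapes here are arbitrary):

import torch
import torch.nn.functional as F

batch_size, vocab_size = 4, 10

# batch of sequences generated so far: (batch size, sequence length, onehot length)
batch = F.one_hot(torch.randint(vocab_size, (batch_size, 3)), vocab_size).float()

# pretend the policy produced these action probabilities for the last state
policies = torch.softmax(torch.randn(batch_size, vocab_size), dim=-1)

# sample one next token per sequence and keep its log probability
distributions = torch.distributions.Categorical(policies)
actions = distributions.sample()             # shape: (batch size,)
log_probs = distributions.log_prob(actions)  # used later in the policy gradient

# append the one-hot encodings of the sampled tokens to each sequence
encodings = F.one_hot(actions, vocab_size).float()[:, None, :]
batch = torch.cat((batch, encodings), dim=1)  # (batch size, sequence length + 1, onehot length)
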
Example 5
def adversarial_generator(policy, rollout, discriminator, adversarial_step,
                          g_steps):
    rollout.set_parameters_to(policy)
    policy.train()
    rollout.eval()
    discriminator.eval()

    # results of a training step
    store.set('List: Mean Losses Per Generator Step', [],
              attributes=[store.PLOTTABLE],
              if_exists=False)
    store.set('List: Mean Rewards Per Generator Step', [],
              attributes=[store.PLOTTABLE],
              if_exists=False)
    store.set('List: Mean Entropies Per Generator Step', [],
              attributes=[store.PLOTTABLE],
              if_exists=False)
    store.set('List: Mean Predictions Per Generator Step', [],
              attributes=[store.PLOTTABLE],
              if_exists=False)

    store.set('List: Action Info Dicts', [], if_exists=False)
    store.set('List: Mean Policies Per Generator Step', [], if_exists=False)
    store.set('List: Action Counts Per Generator Step', [], if_exists=False)
    store.set('List: Formular Examples', [], if_exists=False)

    for step in range(g_steps):

        print('Global Step {} - Generator Step {}'.format(
            adversarial_step, step))

        # temporary store - necessary for loss calculation - should be overwritten each step
        store.set('List: Log Probabilites Per Actions Of Single Step', [])
        store.set('List: Rewards Per Single Step', [])
        store.set('List: Mean Entropies Per Single Step', [])

        # temporary store - not necessary for loss calculation - should be overwritten each step
        store.set('List: Sampled Actions Per Single Step', [])
        store.set('List: Mean Policies Per Single Step', [])

        batch, hidden = policy.initial()

        for length in range(config.sequence_length):

            # generate a single next token given the sequences generated so far
            batch, hidden = generator.step(policy,
                                           batch,
                                           hidden,
                                           save_prob=True)
            q_values = torch.empty([config.batch_size, 0],
                                   device=config.device)

            # compute the Q(token,subsequence) values with monte carlo approximation
            if batch.shape[1] >= config.sequence_length:
                for _ in range(config.montecarlo_trials):
                    samples = generator.rollout(rollout, batch, hidden)
                    reward = collect_reward(discriminator, samples)
                    q_values = torch.cat([q_values, reward], dim=1)
            else:
                reward = collect_reward(discriminator, batch)
                q_values = torch.cat([q_values, reward], dim=1)

            # average the reward over all trials
            q_values = torch.mean(q_values, dim=1)
            store.get('List: Rewards Per Single Step').append(q_values)

            # generator.policy_gradient_update(policy)  # TODO comment out to reward like in SeqGAN
            # batch, hidden = (batch.detach(), hidden.detach())  # TODO comment out to reward like in SeqGAN

        store.get('List: Formular Examples').append(', '.join(
            tree.to_latex(batch[-3:].tolist())))
        policy_gradient(policy)
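
The Q-values above are estimated SeqGAN-style: a batch of (sub)sequences is completed by rollouts, each completed batch is scored by the discriminator, and the rewards are averaged over the Monte Carlo trials. A self-contained sketch of that averaging step, with dummy rollout_fn and reward_fn standing in for generator.rollout and collect_reward:

import torch

batch_size, montecarlo_trials = 4, 8

def rollout_fn(batch):
    """Dummy stand-in: would complete each subsequence to full length."""
    return batch

def reward_fn(batch):
    """Dummy stand-in: would return the discriminator's reward per sample."""
    return torch.rand(batch.shape[0], 1)

batch = torch.zeros(batch_size, 3, 10)  # toy batch of partial sequences

q_values = torch.empty(batch_size, 0)
for _ in range(montecarlo_trials):
    samples = rollout_fn(batch)
    reward = reward_fn(samples)  # shape: (batch size, 1)
    q_values = torch.cat([q_values, reward], dim=1)

q_values = torch.mean(q_values, dim=1)  # average the reward over all trials
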
Example 6
def store_results(loss, reward_without_log_prob, entropy, prediction, policy):
    store.get('List: Mean Losses Per Generator Step').append(loss)
    store.get('List: Mean Rewards Per Generator Step').append(
        reward_without_log_prob)
    store.get('List: Mean Entropies Per Generator Step').append(entropy)
    store.get('List: Mean Predictions Per Generator Step').append(prediction)

    mean_policies = store.get('List: Mean Policies Per Single Step')
    mean_policies = torch.mean(torch.stack(mean_policies, dim=0),
                               dim=0).cpu().detach()
    store.get('List: Mean Policies Per Generator Step').append(mean_policies)

    # calculate tuples of (action_id, count, average log probability, average reward)
    sampled_actions = store.get('List: Sampled Actions Per Single Step')
    log_probs = store.get('List: Log Probabilites Per Actions Of Single Step')
    rewards = store.get('List: Rewards Per Single Step')

    action_counts = {}
    action_probs = {}
    action_rewards = {}

    assert len(sampled_actions) == len(log_probs) == len(rewards)
    batchsize = sampled_actions[0].shape[0]

    for step in range(len(log_probs)):
        for sample_id in range(batchsize):
            # use plain Python ints as dict keys so equal actions share one entry
            action = sampled_actions[step][sample_id].item()
            log_prob = log_probs[step][sample_id]
            reward = rewards[step][sample_id]

            if action in action_probs:
                action_probs[action].append(log_prob)
                action_rewards[action].append(reward)
                action_counts[action] += 1
            else:
                action_probs[action] = [log_prob]
                action_rewards[action] = [reward]
                action_counts[action] = 1

    for action in action_counts.keys():
        action_probs[action] = (sum(action_probs[action]) /
                                len(action_probs[action])).item()
        action_rewards[action] = (sum(action_rewards[action]) /
                                  len(action_rewards[action])).item()

    tuples = {
        a: (action_counts[a], action_probs[a], action_rewards[a])
        for a in action_counts.keys()
    }
    store.get('List: Action Info Dicts').append(tuples)

    step = store.get('Policy Step')
    store.set('Policy Step', step + 1)

    if step % 10 == 0:
        policy.save('{}/policies/{}'.format(store.folder, step))
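
The per-action bookkeeping above groups every sampled action across steps and batch entries and averages its log probabilities and rewards. The same aggregation expressed on plain Python data with toy inputs:

from collections import defaultdict

# toy (action, log_prob, reward) triples as they would accumulate over several steps
samples = [(2, -0.7, 0.4), (5, -1.2, 0.9), (2, -0.5, 0.6), (5, -0.9, 0.7)]

action_counts = defaultdict(int)
action_probs = defaultdict(list)
action_rewards = defaultdict(list)

for action, log_prob, reward in samples:
    action_counts[action] += 1
    action_probs[action].append(log_prob)
    action_rewards[action].append(reward)

# dict of action -> (count, average log probability, average reward)
info = {
    a: (action_counts[a],
        sum(action_probs[a]) / action_counts[a],
        sum(action_rewards[a]) / action_counts[a])
    for a in action_counts
}
print(info)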