def adversarial_discriminator(discriminator, policy, adversarial_step, d_steps, d_epochs):
    """
    This function trains the discriminator for d_epochs epochs on a data set of real
    samples and an equal amount of sequences generated by the policy net, logging the
    loss and accuracy per epoch.

    :param discriminator: The discriminating net to be trained.
    :param policy: The policy net used to generate the synthetic half of the data.
    :param adversarial_step: The current global adversarial step (used for logging).
    :param d_steps: The number of discriminator steps, scaling the amount of data.
    :param d_epochs: The number of epochs to train on the prepared data set.
    """
    discriminator.reset()
    discriminator.train()
    policy.eval()

    store.set('Discriminator Loss', [], attributes=[store.PLOTTABLE], if_exists=False)
    store.set('Discriminator Accuracy', [], attributes=[store.PLOTTABLE], if_exists=False)

    # equal amount of real and generated data
    num_samples = config.num_real_samples * 2 * d_steps
    data_loader = loader.prepare_loader(num_samples, policy)

    for epoch in range(d_epochs):
        store.set('Discriminator Loss Per Batch', [])
        store.set('Discriminator Accuracy Per Batch', [])
        print('Global Step {} - Discriminator Epoch {}'.format(adversarial_step, epoch))

        for images, labels in data_loader:
            images = images.to(config.device)
            labels = labels.to(config.device)

            discriminator.optimizer.zero_grad()
            outputs = discriminator(images)
            # output[:,0] P(x ~ real)
            # output[:,1] P(x ~ synthetic)
            loss = discriminator.criterion(outputs, labels.float())
            loss.backward()
            discriminator.optimizer.step()

            store.get('Discriminator Loss Per Batch').append(loss.item())
            store.get('Discriminator Accuracy Per Batch').append(
                torch.sum((outputs > 0.5) == (labels == 1)).item() / outputs.shape[0])

        loss = store.get('Discriminator Loss Per Batch')
        acc = store.get('Discriminator Accuracy Per Batch')
        store.get('Discriminator Loss').append(sum(loss) / len(loss))
        store.get('Discriminator Accuracy').append(sum(acc) / len(acc))


def finish(policy, discriminator):
    """
    This function creates a directory with the current timestamp at the application path
    set in the configurations. All experimental result data of a run such as weight
    parameters and log files will be saved. Additionally, 100 example images and
    sequences will be saved in this directory.

    :param policy: The policy net used in the experiment and for which the example data
        should be generated.
    :param discriminator: The discriminating net used in the experiment.
    """
    folder = store.folder
    policy.save(folder + '/policy-net.pt')
    discriminator.save(folder + '/discriminator-net.pt')
    save_policy_examples(folder, policy)

    # plot every stored value that is marked as plottable
    for tag, value in [(t, v) for (t, v) in store if store.PLOTTABLE in store.attributes(t)]:
        path = '{}/{}'.format(folder, tag)
        plot_simple(path, value, tag, 'Steps', '', 'plot')

    # plot every tenth snapshot of the mean policy distribution
    os.makedirs(folder + '/policy_steps')
    for step, mean_policy in enumerate(store.get('List: Mean Policies Per Generator Step')):
        if step % 10 == 0:
            path = '{}/policy_steps/policy_step_{}'.format(folder, step)
            plot_simple(path, mean_policy, 'Generator Policy Step {}'.format(step),
                        'Tokens', 'Probabilities', 'bar')

    action_infos = store.get('List: Action Info Dicts')
    plot_action_infos(folder, action_infos, 10)
    plot_action_deltas(folder, action_infos, 10)
    plot_action_infos(folder, action_infos, 10, without_count=True)
    plot_action_deltas(folder, action_infos, 10, without_count=True)
    plot_action_deltas(folder, action_infos, 10, without_count=True, with_last_reward=True)

    store.save()
    ray.shutdown()


def policy_gradient(policy):
    """
    This function performs a single REINFORCE update on the policy net, weighting the
    cumulative baseline-corrected rewards of each step by the log probabilities of the
    sampled actions and adding an entropy bonus.

    :param policy: The policy net to be updated.
    """
    policy.optimizer.zero_grad()

    total = torch.zeros(config.batch_size, device=config.device)
    reward_with_log_prob = torch.zeros(config.batch_size, device=config.device)
    reward_without_log_prob = torch.zeros(config.batch_size, device=config.device)

    log_probs = store.get('List: Log Probabilities Per Actions Of Single Step')
    rewards = store.get('List: Rewards Per Single Step')

    assert len(rewards) == len(log_probs)
    assert all(tensor.size() == (config.batch_size, ) for tensor in log_probs)
    assert all(tensor.size() == (config.batch_size, ) for tensor in rewards)

    # weight state action values by log probability of action
    for log_prob, reward in zip(log_probs, rewards):
        total = total + (reward - config.g_baseline)
        reward_with_log_prob = reward_with_log_prob + (log_prob * total)
        reward_without_log_prob = reward_without_log_prob + total

    # average over batch size
    reward_without_log_prob = torch.sum(reward_without_log_prob) / config.batch_size
    reward_with_log_prob = torch.sum(reward_with_log_prob) / config.batch_size

    # negate for gradient descent and subtract entropy
    entropies = store.get('List: Mean Entropies Per Single Step')
    entropy = 0.01 * sum(entropies) / len(entropies)
    loss = -(reward_with_log_prob + entropy)

    loss.backward()
    policy.optimizer.step()

    prediction = sum(rewards[-1]) / config.batch_size
    store_results(loss.item(), reward_without_log_prob.item(), entropy.item(),
                  prediction.item(), policy)


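# Hedged sketch (not part of the original module): the objective computed in
# policy_gradient, written over explicit tensors so it can be inspected in isolation.
# `baseline` and `entropy_weight` are hypothetical stand-ins for config.g_baseline and
# the fixed 0.01 factor used above; log_probs and rewards are lists with one entry per
# generated token, each of shape (batch_size,), and entropies is a list of scalars.
def _example_reinforce_loss(log_probs, rewards, entropies, baseline, entropy_weight=0.01):
    returns = torch.zeros_like(rewards[0])
    weighted = torch.zeros_like(rewards[0])
    for log_prob, reward in zip(log_probs, rewards):
        returns = returns + (reward - baseline)      # accumulated, baseline-corrected reward
        weighted = weighted + log_prob * returns     # log-prob weighted return
    entropy_bonus = entropy_weight * sum(entropies) / len(entropies)
    # negate so that gradient descent maximizes expected return plus entropy
    return -(weighted.mean() + entropy_bonus)

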
def step(policy, batch, hidden, save_prob=False):
    """
    This function performs a single step on the given policy net given a batch of
    unfinished subsequences.

    :param policy: The policy net which guides the decision making process.
    :param batch: The batch of input sequences of size (batch size, sequence length,
        onehot length).
    :param hidden: The hidden state of the policy net.
    :param save_prob: If true, the probabilities for the chosen action will be saved for
        the policy net. Should be true if it is a step in the policy net training and
        false if it is a rollout or sample step.
    :return: Returns batch, hidden with the new encoding tensors for the chosen actions.
    """
    # avoid feeding whole sequences redundantly
    state = batch[:, -1, :][:, None, :]
    policies, hidden = policy(state, hidden)

    # sample next actions
    distributions = torch.distributions.Categorical(policies)
    actions = distributions.sample()

    # save log probabilities for gradient computation
    if save_prob:
        store.get('List: Mean Policies Per Single Step').append(torch.mean(policies, dim=0))
        store.get('List: Mean Entropies Per Single Step').append(
            torch.mean(distributions.entropy(), dim=0))
        store.get('List: Sampled Actions Per Single Step').append(actions)
        store.get('List: Log Probabilities Per Actions Of Single Step').append(
            distributions.log_prob(actions))

    # concat onehot tokens with the batch of sequences
    encodings = torch.tensor([tokens.onehot(id) for id in actions], device=config.device)
    encodings = encodings[:, None, :].float()
    batch = torch.cat((batch, encodings), dim=1)

    # if the batch still has the empty start token remove it
    if torch.sum(batch[:, 0, :]) == 0:
        batch = batch[:, 1:, :]

    return batch, hidden


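# Hedged usage sketch (not part of the original module): sampling a complete batch of
# sequences with `step`, mirroring the generation loop in adversarial_generator but
# without the gradient bookkeeping. The helper name `_example_sample_sequences` is
# hypothetical.
def _example_sample_sequences(policy):
    batch, hidden = policy.initial()
    for _ in range(config.sequence_length):
        # save_prob=False: no log probabilities need to be stored for plain sampling
        batch, hidden = step(policy, batch, hidden, save_prob=False)
    return batch

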
def adversarial_generator(policy, rollout, discriminator, adversarial_step, g_steps):
    """
    This function performs g_steps policy gradient updates on the policy net. Rewards are
    estimated with the discriminator, partly via Monte Carlo rollouts using the rollout
    net, whose parameters are synchronized with the policy net beforehand.

    :param policy: The policy net to be trained.
    :param rollout: The rollout net used for the Monte Carlo approximation.
    :param discriminator: The discriminating net providing the rewards.
    :param adversarial_step: The current global adversarial step (used for logging).
    :param g_steps: The number of generator steps to perform.
    """
    rollout.set_parameters_to(policy)
    policy.train()
    rollout.eval()
    discriminator.eval()

    # results of a training step
    store.set('List: Mean Losses Per Generator Step', [], attributes=[store.PLOTTABLE], if_exists=False)
    store.set('List: Mean Rewards Per Generator Step', [], attributes=[store.PLOTTABLE], if_exists=False)
    store.set('List: Mean Entropies Per Generator Step', [], attributes=[store.PLOTTABLE], if_exists=False)
    store.set('List: Mean Predictions Per Generator Step', [], attributes=[store.PLOTTABLE], if_exists=False)
    store.set('List: Action Info Dicts', [], if_exists=False)
    store.set('List: Mean Policies Per Generator Step', [], if_exists=False)
    store.set('List: Action Counts Per Generator Step', [], if_exists=False)
    store.set('List: Formula Examples', [], if_exists=False)

    for step in range(g_steps):
        print('Global Step {} - Generator Step {}'.format(adversarial_step, step))

        # temporary store - necessary for loss calculation - should be overwritten each step
        store.set('List: Log Probabilities Per Actions Of Single Step', [])
        store.set('List: Rewards Per Single Step', [])
        store.set('List: Mean Entropies Per Single Step', [])

        # temporary store - not necessary for loss calculation - should be overwritten each step
        store.set('List: Sampled Actions Per Single Step', [])
        store.set('List: Mean Policies Per Single Step', [])

        batch, hidden = policy.initial()

        for length in range(config.sequence_length):
            # generate a single next token given the sequences generated so far
            batch, hidden = generator.step(policy, batch, hidden, save_prob=True)
            q_values = torch.empty([config.batch_size, 0], device=config.device)

            # compute the Q(token, subsequence) values with monte carlo approximation
            if batch.shape[1] >= config.sequence_length:
                for _ in range(config.montecarlo_trials):
                    samples = generator.rollout(rollout, batch, hidden)
                    reward = collect_reward(discriminator, samples)
                    q_values = torch.cat([q_values, reward], dim=1)
            else:
                reward = collect_reward(discriminator, batch)
                q_values = torch.cat([q_values, reward], dim=1)

            # average the reward over all trials
            q_values = torch.mean(q_values, dim=1)
            store.get('List: Rewards Per Single Step').append(q_values)

            # generator.policy_gradient_update(policy)  # TODO comment out to reward like in SeqGAN
            # batch, hidden = (batch.detach(), hidden.detach())  # TODO comment out to reward like in SeqGAN

        store.get('List: Formula Examples').append(', '.join(tree.to_latex(batch[-3:].tolist())))
        policy_gradient(policy)


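# Hedged sketch (not part of the original module): how the generator and discriminator
# phases above are typically alternated in an outer adversarial loop. The actual driver
# script is not shown here and `num_adversarial_steps` is a hypothetical name.
def _example_adversarial_loop(policy, rollout, discriminator,
                              num_adversarial_steps, g_steps, d_steps, d_epochs):
    for adversarial_step in range(num_adversarial_steps):
        adversarial_generator(policy, rollout, discriminator, adversarial_step, g_steps)
        adversarial_discriminator(discriminator, policy, adversarial_step, d_steps, d_epochs)
    finish(policy, discriminator)

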
def store_results(loss, reward_without_log_prob, entropy, prediction, policy):
    """
    This function logs the results of a single generator step, aggregates per-action
    statistics, and saves a snapshot of the policy net every tenth step.

    :param loss: The policy gradient loss of the step.
    :param reward_without_log_prob: The mean cumulative baseline-corrected reward.
    :param entropy: The mean entropy bonus of the step.
    :param prediction: The mean discriminator prediction for the finished sequences.
    :param policy: The policy net used in the step.
    """
    store.get('List: Mean Losses Per Generator Step').append(loss)
    store.get('List: Mean Rewards Per Generator Step').append(reward_without_log_prob)
    store.get('List: Mean Entropies Per Generator Step').append(entropy)
    store.get('List: Mean Predictions Per Generator Step').append(prediction)

    mean_policies = store.get('List: Mean Policies Per Single Step')
    mean_policies = torch.mean(torch.stack(mean_policies, dim=0), dim=0).cpu().detach()
    store.get('List: Mean Policies Per Generator Step').append(mean_policies)

    # calculate tuples of (count, average log probability, average reward) per action id
    sampled_actions = store.get('List: Sampled Actions Per Single Step')
    log_probs = store.get('List: Log Probabilities Per Actions Of Single Step')
    rewards = store.get('List: Rewards Per Single Step')

    action_counts = {}
    action_probs = {}
    action_rewards = {}

    assert len(sampled_actions) == len(log_probs) == len(rewards)
    batchsize = sampled_actions[0].shape[0]

    for step in range(len(log_probs)):
        for sample_id in range(batchsize):
            # use plain python ints as keys; tensors hash by identity and would never match
            action = sampled_actions[step][sample_id].item()
            log_prob = log_probs[step][sample_id]
            reward = rewards[step][sample_id]

            if action in action_probs:
                action_probs[action].append(log_prob)
                action_rewards[action].append(reward)
                action_counts[action] += 1
            else:
                action_probs[action] = [log_prob]
                action_rewards[action] = [reward]
                action_counts[action] = 1

    for action in action_counts.keys():
        action_probs[action] = (sum(action_probs[action]) / len(action_probs[action])).item()
        action_rewards[action] = (sum(action_rewards[action]) / len(action_rewards[action])).item()

    tuples = {
        a: (action_counts[a], action_probs[a], action_rewards[a])
        for a in action_counts.keys()
    }
    store.get('List: Action Info Dicts').append(tuples)

    # save a snapshot of the policy net every tenth step
    step = store.get('Policy Step')
    store.set('Policy Step', step + 1)
    if step % 10 == 0:
        policy.save('{}/policies/{}'.format(store.folder, step))