Code example #1
def optimizer_step(optimizer, before, change1, max_part, change2, after,
                   pos_embeddings, sentence_embeddings, attention_mask,
                   temperature, embedding_map, model):
  """Optimize the sentence towards the target activation.

  Args:
    optimizer: The optimizer to be used.
    before: The tensor for everything before the modifiable content.
    change1: Modifiable content before the word to be maximized.
    max_part: The static tensor around the word to be maximized.
    change2: Modifiable content after the word to be maximized.
    after: The tensor for everything after the modifiable content.
    pos_embeddings: The positional embeddings used for inference.
    sentence_embeddings: The sentence embeddings for inference.
    attention_mask: The attention mask used for inference.
    temperature: The temperature used for making the softmax spike.
    embedding_map: Holding all the token embeddings for BERT.
    model: Model to run inference on.

  Returns:
    max_values: The maximal values for the current token representations.
    token_ids: The token ids of the current representation.
    prediction: The current prediction score of the word to be maximized.
  """
  # Reset the gradient
  optimizer.zero_grad()
  # Softmax over the one-hots
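  # The temperature-scaled softmax (with optional Gumbel noise) keeps the token
  # choice differentiable, so the one-hot parameters can be optimized directly.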
  one_hots_sm_1 = one_hots_helper.softmax_one_hots(change1, temperature,
                                                   FLAGS.gumbel)
  one_hots_sm_2 = one_hots_helper.softmax_one_hots(change2, temperature,
                                                   FLAGS.gumbel)
  fused_one_hots = torch.cat([before, one_hots_sm_1, max_part, one_hots_sm_2,
                              after], dim=1)
  # Get the prediction
  prediction_score = inference_helper.run_inference_mlm(
      fused_one_hots, pos_embeddings, sentence_embeddings, attention_mask,
      embedding_map, model)
  prediction = get_prediction(prediction_score, FLAGS.maximize_word,
                              FLAGS.maximize_id, FLAGS.normalize)
  # Use the negated prediction score as the loss
  # (Adam minimizes the loss, so this maximizes the prediction).
  loss = -prediction
  # Backpropagate the loss
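  # retain_graph=True keeps the autograd graph alive after this backward pass.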
  loss.backward(retain_graph=True)
  # Optimize the word vector based on that loss
  optimizer.step()
  # After the update, recompute the softmax and read off the highest-scoring
  # tokens
  one_hots_sm_1 = one_hots_helper.softmax_one_hots(change1, temperature,
                                                   FLAGS.gumbel)
  one_hots_sm_2 = one_hots_helper.softmax_one_hots(change2, temperature,
                                                   FLAGS.gumbel)
  fused_one_hots = torch.cat([before, one_hots_sm_1, max_part, one_hots_sm_2,
                              after], dim=1)
  max_values, token_ids = one_hots_helper.get_tokens_from_one_hots(
      fused_one_hots)
  return max_values, token_ids, prediction
Code example #2
def step_towards_activation(optimizer, before, modify, after, pos_embeddings,
                            sentence_embeddings, att_mask, temperature,
                            iteration, gumbel, write_top_k, k_value, data,
                            word_id, neuron_id, layer_id, modify_start,
                            modify_end, tokenizer, embedding_map, model,
                            target_activation):
    """Optimize the sentence towards the target activation.

  Args:
    optimizer: The optimizer to be used.
    before: The tensor for everything before the modifiable content.
    modify: The tensor of the modifiable content.
    after: The tensor for everything after the modifiable content.
    pos_embeddings: The positional embeddings used for inference.
    sentence_embeddings: The sentence embeddings for inference.
    att_mask: The attention mask used for inference.
    temperature: The temperature used for making the softmax spike.
    iteration: Current iteration number of the optimization process.
    gumbel: Whether to use gumbel noise.
    write_top_k: Whether to write the top-rated tokens per iteration.
    k_value: How many tokens to write to top_k.
    data: Placeholder for the top_k data.
    word_id: Word to get the activation for.
    neuron_id: Neuron to get the activation for.
    layer_id: Layer to get the activation for.
    modify_start: The start index of the modifiable content.
    modify_end: The end index of the modifiable content.
    tokenizer: Used for converting between tokens and ids.
    embedding_map: Holding all the token embeddings for BERT.
    model: Model to run inference on.
    target_activation: The activation we are aiming towards.

  Returns:
    max_values: The maximal values for the current token representations.
    token_ids: The token ids of the current representation.
    loss: The current loss towards the target activation.
  """
    # Reset the gradient
    optimizer.zero_grad()
    # Softmax over the one-hots
    one_hots_sm = one_hots_helper.softmax_one_hots(modify, temperature, gumbel)
    fused_one_hots = torch.cat([before, one_hots_sm, after], dim=1)
    # Check if top_k should be written
    if write_top_k:
        output_helper.write_top_ks(fused_one_hots, k_value, iteration, data,
                                   modify_start, modify_end, tokenizer)
    # Get the activation
    layer_activations = inference_helper.run_inference(before, one_hots_sm,
                                                       after, pos_embeddings,
                                                       sentence_embeddings,
                                                       att_mask, embedding_map,
                                                       model)
    activation = activation_helper.get_activations(layer_activations, word_id,
                                                   neuron_id, layer_id)
    # Calculate the loss as the mean squared error between the current
    # activation and the target activation (Adam minimizes this distance,
    # pulling the activation towards the target).
    loss = F.mse_loss(activation, target_activation)
    # Backpropagate the loss
    loss.backward(retain_graph=True)
    # Optimize the word vector based on that loss
    optimizer.step()
    # After the update, recompute the softmax and read off the highest-scoring
    # tokens
    one_hots_sm = one_hots_helper.softmax_one_hots(modify, temperature, gumbel)
    fused_one_hots = torch.cat([before, one_hots_sm, after], dim=1)
    max_values, token_ids = one_hots_helper.get_tokens_from_one_hots(
        fused_one_hots)
    return max_values, token_ids, loss
Code example #3
def deep_dream(data, results, params, device, tokenizer, embedding_map, model):
    """Iteratively modifying the embedding using gradient descent.

  Args:
    data: Holds the top-k values.
    results: Holds the results of the run.
    params: Holds the parameters of the run.
    device: The device to store the variables on.
    tokenizer: The tokenizer to transform the input.
    embedding_map: Holding all token embeddings.
    model: The model that should dream.
  """
    # An embedding for the tokens is obtained
    tokens = tokenization_helper.tokenize_input_sentence(
        tokenizer, FLAGS.sentence, FLAGS.sentence2)
    tokens_tensor, segments_tensor = tokenization_helper.tensors_from_tokens(
        tokenizer, tokens, device)
    _, pos_embeddings, sentence_embeddings = embeddings_helper.get_embeddings(
        tokens_tensor, segments_tensor, model)
    # Correct the end of the dream if necessary
    if FLAGS.dream_end == 0:
        FLAGS.dream_end = len(tokens) - 2
    # Write the parameters to a file
    output_helper.get_params(params,
                             FLAGS,
                             tokens,
                             embedding_ana=FLAGS.embedding_analysis)
    # Get the smooth one-hot vector that is to be optimized, split into static and
    # modifiable parts
    before, modify, after = one_hots_helper.get_one_hots(
        tokens_tensor.data.cpu().numpy(), FLAGS.dream_start, FLAGS.dream_end,
        device)
    # Obtain the default attention mask to be able to run the model
    attention_mask = attention_mask_helper.get_attention_mask(tokens_tensor)
    # The optimizer used to modify the input embedding
    optimizer = torch.optim.Adam([modify], lr=FLAGS.learning_rate)
    # Init temperature for Gumbel
    temperature = torch.tensor(FLAGS.start_temp,
                               device=device,
                               requires_grad=False)

    # Obtain the properties of the initial embedding
    one_hots_sm = one_hots_helper.softmax_one_hots(modify, temperature,
                                                   FLAGS.gumbel)
    max_values, tokens_ids = one_hots_helper.get_tokens_from_one_hots(
        torch.cat([before, one_hots_sm, after], dim=1))
    numpy_max_values = max_values.data.cpu().numpy()
    ids = tokens_ids.data.cpu().numpy()[0]
    tokens = tokenizer.convert_ids_to_tokens(ids)
    ids_activation = activation_helper.get_ids_activation(ids,
                                                          pos_embeddings,
                                                          sentence_embeddings,
                                                          attention_mask,
                                                          FLAGS.dream_start,
                                                          FLAGS.dream_end,
                                                          FLAGS.word_id,
                                                          FLAGS.neuron_id,
                                                          FLAGS.layer_id,
                                                          FLAGS.normalize,
                                                          embedding_map,
                                                          model,
                                                          device,
                                                          average=True)
    output_helper.init_results(results)

    # Optimize the embedding for i iterations and update the properties to
    # evaluate the result in each step
    for i in range(FLAGS.num_iterations):
        max_vals, tokens_ids, activation, emb_tok, emb_act = optimizer_step(
            optimizer, before, modify, after, pos_embeddings,
            sentence_embeddings, attention_mask, temperature, i, data,
            tokenizer, embedding_map, model, device)
        # Write the properties of the last step
        if (i % FLAGS.metrics_frequency) == 0:
            output_helper.get_metrics(tokens,
                                      i,
                                      temperature,
                                      numpy_max_values,
                                      results,
                                      activation=activation,
                                      ids_activation=ids_activation,
                                      emb_tokens=emb_tok,
                                      emb_activation=emb_act,
                                      emb_ana=FLAGS.embedding_analysis,
                                      iterations=FLAGS.num_iterations)
        # Set the numpy max values
        numpy_max_values = max_vals.data.cpu().numpy()
        # Obtain the activation property for the id-array that would result from the
        # optimization
        ids = tokens_ids.data.cpu().numpy()[0]
        tokens = tokenizer.convert_ids_to_tokens(ids)
        # Calculate the activation using the highest scoring words
        ids_activation = activation_helper.get_ids_activation(
            ids,
            pos_embeddings,
            sentence_embeddings,
            attention_mask,
            FLAGS.dream_start,
            FLAGS.dream_end,
            FLAGS.word_id,
            FLAGS.neuron_id,
            FLAGS.layer_id,
            FLAGS.normalize,
            embedding_map,
            model,
            device,
            average=True)
        # Check if the temperature needs to decrease
        if i > FLAGS.warmup:
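            # clamp's second positional argument is the minimum, so the
            # temperature is annealed but never drops below FLAGS.end_temp.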
            temperature = torch.clamp(temperature * FLAGS.anneal,
                                      FLAGS.end_temp)
    # Calculate the final activation just as before, but without backprop
    if (FLAGS.num_iterations % FLAGS.metrics_frequency) == 0:
        with torch.no_grad():
            one_hots_sm = one_hots_helper.softmax_one_hots(
                modify, temperature, FLAGS.gumbel)
            fused_one_hots = torch.cat([before, one_hots_sm, after], dim=1)
            if FLAGS.write_top_k:
                output_helper.get_top_ks(fused_one_hots,
                                         FLAGS.k,
                                         FLAGS.num_iterations,
                                         data,
                                         FLAGS.dream_start,
                                         FLAGS.dream_end,
                                         tokenizer,
                                         activation=activation)
            layer_activations = inference_helper.run_inference(
                before, one_hots_sm, after, pos_embeddings,
                sentence_embeddings, attention_mask, embedding_map, model)
            activation = activation_helper.get_activation(
                layer_activations, FLAGS.word_id, FLAGS.neuron_id,
                FLAGS.layer_id, FLAGS.normalize)
            emb_tok, emb_act = embeddings_helper.analyze_current_embedding(
                fused_one_hots, embedding_map, FLAGS.dream_start,
                FLAGS.dream_end, device, pos_embeddings, sentence_embeddings,
                attention_mask, model, FLAGS.word_id, FLAGS.neuron_id,
                FLAGS.layer_id, FLAGS.normalize, tokenizer)
            output_helper.get_metrics(tokens,
                                      FLAGS.num_iterations,
                                      temperature,
                                      numpy_max_values,
                                      results,
                                      activation=activation,
                                      ids_activation=ids_activation,
                                      emb_tokens=emb_tok,
                                      emb_activation=emb_act,
                                      emb_ana=FLAGS.embedding_analysis,
                                      iterations=FLAGS.num_iterations)
Code example #4
def optimizer_step(optimizer, before, modify, after, pos_embeddings,
                   sentence_embeddings, attention_mask, temperature, iteration,
                   data, tokenizer, embedding_map, model, device):
    """Make a step along the gradient of the optimizer.

  Args:
    optimizer: The optimizer that is used for gradient descent.
    before: Embeddings of everything up to the modifiable content.
    modify: Embeddings of the modifiable content.
    after: Embeddings of everything after the modifiable content.
    pos_embeddings: Positional embeddings of the current sequence.
    sentence_embeddings: Sentence embeddings of the current sequence.
    attention_mask: Attention mask to be used with the current sequence.
    temperature: Current temperature of the softmax function.
    iteration: Current iteration of the optimization.
    data: Top-k data to be written after optimization.
    tokenizer: Converts between tokens and their ids.
    embedding_map: Holding the embeddings for each token.
    model: The model to be used with this optimization.
    device: Where to store the variables.

  Returns:
    max_values: The values of the tokens with the highest softmax value.
    token_ids: The ids of the tokens with the highest softmax value.
    activation: The activation of the current input representation.
    emb_tokens: The tokens of the closest embedding representing real tokens.
    emb_activation: Activation for closest embedding representing real tokens.
  """
    # Reset the gradient
    optimizer.zero_grad()
    # Softmax over the one-hots
    one_hots_sm = one_hots_helper.softmax_one_hots(modify, temperature,
                                                   FLAGS.gumbel)
    fused_one_hots = torch.cat([before, one_hots_sm, after], dim=1)
    # Check if the embedding analysis is to be done
    emb_tokens = None
    emb_activation = None
    if FLAGS.embedding_analysis != 0:
        if iteration % FLAGS.embedding_analysis == 0:
            tok, act = embeddings_helper.analyze_current_embedding(
                fused_one_hots, embedding_map, FLAGS.dream_start,
                FLAGS.dream_end, device, pos_embeddings, sentence_embeddings,
                attention_mask, model, FLAGS.word_id, FLAGS.neuron_id,
                FLAGS.layer_id, FLAGS.normalize, tokenizer)
            emb_tokens = tok
            emb_activation = act
    # Get the activation
    layer_activations = inference_helper.run_inference(before, one_hots_sm,
                                                       after, pos_embeddings,
                                                       sentence_embeddings,
                                                       attention_mask,
                                                       embedding_map, model)
    activation = activation_helper.get_activation(layer_activations,
                                                  FLAGS.word_id,
                                                  FLAGS.neuron_id,
                                                  FLAGS.layer_id,
                                                  FLAGS.normalize)
    # Check if top_k should be written
    if FLAGS.write_top_k:
        output_helper.get_top_ks(fused_one_hots,
                                 FLAGS.k,
                                 iteration,
                                 data,
                                 FLAGS.dream_start,
                                 FLAGS.dream_end,
                                 tokenizer,
                                 activation=activation)
    # Use the negated activation as the loss
    # (Adam minimizes the loss, so this maximizes the activation).
    loss = -activation
    # Backpropagate the loss
    loss.backward(retain_graph=True)
    # Optimize the word vector based on that loss
    optimizer.step()
    # After the update, recompute the softmax and read off the highest-scoring
    # tokens
    one_hots_sm = one_hots_helper.softmax_one_hots(modify, temperature,
                                                   FLAGS.gumbel)
    fused_one_hots = torch.cat([before, one_hots_sm, after], dim=1)
    max_values, token_ids = one_hots_helper.get_tokens_from_one_hots(
        fused_one_hots)
    return max_values, token_ids, activation, emb_tokens, emb_activation
Code example #5
def deep_dream(data, results, params, device, tokenizer, embedding_map, model):
    """Deep dream to a target activation.

  Args:
    data: Holds the top-k values.
    results: Holds the results of the run.
    params: Holds the parameters of the run.
    device: Where to place new variables.
    tokenizer: Used to convert between ids and tokens.
    embedding_map: Holding all BERT token embeddings.
    model: The model used for this dream.
  """
    # An embedding for the tokens is obtained
    tokens = tokenization_helper.tokenize_input_sentence(
        tokenizer, FLAGS.sentence, FLAGS.sentence2)
    tokens_tensor, segments_tensor = tokenization_helper.tensors_from_tokens(
        tokenizer, tokens, device)
    _, pos_embeddings, sentence_embeddings = embeddings_helper.get_embeddings(
        tokens_tensor, segments_tensor, model)
    # Correct the end of the dream if necessary
    if FLAGS.dream_end == 0:
        FLAGS.dream_end = len(tokens) - 2
    # Write the parameters to a file
    output_helper.get_params(params, FLAGS, tokens)
    # Get the smooth one-hot vector that is to be optimized, split into static and
    # modifiable parts
    before, modify, after = one_hots_helper.get_one_hots(
        tokens_tensor.data.cpu().numpy(), FLAGS.dream_start, FLAGS.dream_end,
        device)
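    # Re-initialize the modifiable part with random values so the dream starts
    # from noise rather than from the one-hots of the original tokens.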
    modify = torch.randn(modify.shape, device=device, requires_grad=True)
    # Obtain the default attention mask to be able to run the model
    att_mask = attention_mask_helper.get_attention_mask(tokens_tensor)
    # The optimizer used to modify the input embedding
    optimizer = torch.optim.Adam([modify], lr=FLAGS.learning_rate)
    # Init temperature for Gumbel
    temperature = torch.tensor(FLAGS.start_temp,
                               device=device,
                               requires_grad=False)
    # Obtain the target activation we try to optimize towards.
    target_ids = tokens_tensor.data.cpu().numpy()[0]
    target_activation = activation_helper.get_ids_activation(
        target_ids, pos_embeddings, sentence_embeddings, att_mask,
        FLAGS.dream_start, FLAGS.dream_end, FLAGS.word_id, FLAGS.neuron_id,
        FLAGS.layer_id, False, embedding_map, model, device)
    target_activation = change_target_activation(target_activation, device)
    target_activation = target_activation.clone().detach().requires_grad_(
        False)
    # Obtain the properties of the initial embedding
    one_hots_sm = one_hots_helper.softmax_one_hots(modify, temperature,
                                                   FLAGS.gumbel)
    max_values, token_ids = one_hots_helper.get_tokens_from_one_hots(
        torch.cat([before, one_hots_sm, after], dim=1))
    numpy_max_values = max_values.data.cpu().numpy()
    ids = token_ids.data.cpu().numpy()[0]
    tokens = tokenizer.convert_ids_to_tokens(ids)
    ids_activation = activation_helper.get_ids_activation(
        ids, pos_embeddings, sentence_embeddings, att_mask, FLAGS.dream_start,
        FLAGS.dream_end, FLAGS.word_id, FLAGS.neuron_id, FLAGS.layer_id, False,
        embedding_map, model, device)
    # Write the initial stuff for the results file
    output_helper.init_results(results)

    # Optimize the embedding for i iterations and update the properties to
    # evaluate the result in each step
    for i in range(FLAGS.num_iterations):
        # Do an optimization step
        max_vals, token_ids, loss = optimization_helper.step_towards_activation(
            optimizer, before, modify, after, pos_embeddings,
            sentence_embeddings, att_mask, temperature, i, FLAGS.gumbel,
            FLAGS.write_top_k, FLAGS.k, data, FLAGS.word_id, FLAGS.neuron_id,
            FLAGS.layer_id, FLAGS.dream_start, FLAGS.dream_end, tokenizer,
            embedding_map, model, target_activation)
        # Write the properties of the last step
        ids_loss = F.mse_loss(ids_activation, target_activation)
        if (i % FLAGS.metrics_frequency) == 0:
            output_helper.get_metrics(tokens,
                                      i,
                                      temperature,
                                      numpy_max_values,
                                      results,
                                      loss=loss,
                                      ids_loss=ids_loss)
        # Set the numpy max values
        numpy_max_values = max_vals.data.cpu().numpy()
        # Obtain the activation property for the id-array that would result from the
        # optimization
        ids = token_ids.data.cpu().numpy()[0]
        tokens = tokenizer.convert_ids_to_tokens(ids)
        # Calculate the activation using the highest scoring words
        ids_activation = activation_helper.get_ids_activation(
            ids, pos_embeddings, sentence_embeddings, att_mask,
            FLAGS.dream_start, FLAGS.dream_end, FLAGS.word_id, FLAGS.neuron_id,
            FLAGS.layer_id, False, embedding_map, model, device)
        # Check if the temperature needs to decrease
        if i > FLAGS.warmup:
            temperature = torch.clamp(temperature * FLAGS.anneal,
                                      FLAGS.end_temp)

    # Calculate the final activation just as before, but without backprop
    if (FLAGS.num_iterations % FLAGS.metrics_frequency) == 0:
        with torch.no_grad():
            one_hots_sm = one_hots_helper.softmax_one_hots(
                modify, temperature, FLAGS.gumbel)
            fused_one_hots = torch.cat([before, one_hots_sm, after], dim=1)
            if FLAGS.write_top_k:
                output_helper.write_top_ks(fused_one_hots, FLAGS.k,
                                           FLAGS.num_iterations, data,
                                           FLAGS.dream_start, FLAGS.dream_end,
                                           tokenizer)
            layers = inference_helper.run_inference(before, one_hots_sm, after,
                                                    pos_embeddings,
                                                    sentence_embeddings,
                                                    att_mask, embedding_map,
                                                    model)
            activation = activation_helper.get_activations(
                layers, FLAGS.word_id, FLAGS.neuron_id, FLAGS.layer_id)
            loss = F.mse_loss(activation, target_activation)
            ids_loss = F.mse_loss(ids_activation, target_activation)
            output_helper.get_metrics(tokens,
                                      FLAGS.num_iterations,
                                      temperature,
                                      numpy_max_values,
                                      results,
                                      loss=loss,
                                      ids_loss=ids_loss)
Code example #6
def deep_dream(results, params, device, tokenizer, embedding_map, model):
  """Deep dream to maximally activate the class probability for a token.

  Args:
    results: Holds the results of the run.
    params: Holds the parameters of the run.
    device: The device to store the variables on.
    tokenizer: The tokenizer to transform the input.
    embedding_map: Holding all token embeddings.
    model: The model that should dream.
  """
  # An embedding for the tokens is obtained
  tokens = tokenization_helper.tokenize_input_sentence(
      tokenizer, FLAGS.sentence, FLAGS.sentence2,
      mask_word=FLAGS.maximize_word)
  tokens_tensor, segments_tensor = tokenization_helper.tensors_from_tokens(
      tokenizer, tokens, device)
  _, pos_embeddings, sentence_embeddings = embeddings_helper.get_embeddings(
      tokens_tensor, segments_tensor, model.bert)
  # Write the parameters to a file
  output_helper.get_params_mlm(params, FLAGS, tokens)
  # Get the smooth one-hot vector that is to be optimized, split into static and
  # modifiable parts
  before, change1, max_part, change2, after = one_hots_helper.get_one_hots_mlm(
      tokens_tensor.data.cpu().numpy(), FLAGS.dream_before_start,
      FLAGS.dream_before_end, FLAGS.dream_after_start, FLAGS.dream_after_end,
      device)
  # Obtain the default attention mask to be able to run the model
  attention_mask = attention_mask_helper.get_attention_mask(tokens_tensor)
  # The optimizer used to modify the input embedding
  optimizer = torch.optim.Adam([change1, change2], lr=FLAGS.learning_rate)
  # Init temperature for Gumbel
  temperature = torch.tensor(FLAGS.start_temp, device=device,
                             requires_grad=False)

  # Obtain the properties of the initial embedding
  one_hots_sm_1 = one_hots_helper.softmax_one_hots(change1, temperature,
                                                   FLAGS.gumbel)
  one_hots_sm_2 = one_hots_helper.softmax_one_hots(change2, temperature,
                                                   FLAGS.gumbel)
  max_values, tokens_ids = one_hots_helper.get_tokens_from_one_hots(
      torch.cat([before, one_hots_sm_1, max_part, one_hots_sm_2, after], dim=1))
  numpy_max_values = max_values.data.cpu().numpy()
  ids = tokens_ids.data.cpu().numpy()[0]
  tokens = tokenizer.convert_ids_to_tokens(ids)
  ids_prediction = get_ids_prediction(
      ids, pos_embeddings, sentence_embeddings, attention_mask,
      FLAGS.maximize_word, FLAGS.maximize_id, FLAGS.normalize, embedding_map,
      model, device, FLAGS.dream_before_start, FLAGS.dream_before_end,
      FLAGS.dream_after_start, FLAGS.dream_after_end)
  output_helper.init_results(results)

  # Optimize the embedding for i iterations and update the properties to
  # evaluate the result in each step
  for i in range(FLAGS.num_iterations):
    max_vals, tokens_ids, prediction = optimizer_step(
        optimizer, before, change1, max_part, change2, after, pos_embeddings,
        sentence_embeddings, attention_mask, temperature, embedding_map, model)
    # Write the properties of the last step
    if (i % FLAGS.metrics_frequency) == 0:
      output_helper.get_metrics_mlm(
          tokens, prediction, ids_prediction, i, temperature, numpy_max_values,
          results)
    # Set the numpy max values
    numpy_max_values = max_vals.data.cpu().numpy()
    # Obtain the prediction property for the id-array that would result from the
    # optimization
    ids = tokens_ids.data.cpu().numpy()[0]
    tokens = tokenizer.convert_ids_to_tokens(ids)
    # Calculate the prediction using the highest scoring words
    ids_prediction = get_ids_prediction(
        ids, pos_embeddings, sentence_embeddings, attention_mask,
        FLAGS.maximize_word, FLAGS.maximize_id, FLAGS.normalize, embedding_map,
        model, device, FLAGS.dream_before_start, FLAGS.dream_before_end,
        FLAGS.dream_after_start, FLAGS.dream_after_end)
    # Check if the temperature needs to decrease
    if i > FLAGS.warmup:
      temperature = torch.clamp(temperature * FLAGS.anneal, FLAGS.end_temp)

  # Calculate the final prediction just as before, but without backprop
  if (FLAGS.num_iterations % FLAGS.metrics_frequency) == 0:
    with torch.no_grad():
      one_hots_sm_1 = one_hots_helper.softmax_one_hots(change1, temperature,
                                                       FLAGS.gumbel)
      one_hots_sm_2 = one_hots_helper.softmax_one_hots(change2, temperature,
                                                       FLAGS.gumbel)

      fused = torch.cat([before, one_hots_sm_1, max_part, one_hots_sm_2, after],
                        dim=1)
      prediction_score = inference_helper.run_inference_mlm(
          fused, pos_embeddings, sentence_embeddings, attention_mask,
          embedding_map, model)
      prediction = get_prediction(prediction_score, FLAGS.maximize_word,
                                  FLAGS.maximize_id, FLAGS.normalize)
      output_helper.get_metrics_mlm(
          tokens, prediction, ids_prediction, FLAGS.num_iterations, temperature,
          numpy_max_values, results)
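
All six listings rely on the same relaxation: learnable per-position parameters over the vocabulary are pushed through a temperature-scaled softmax (optionally with Gumbel noise) so the token choice stays differentiable, and the resulting soft one-hots are combined with the embedding table before inference. The standalone sketch below illustrates that mechanic in plain PyTorch; the names and sizes are illustrative assumptions, not the repo's actual softmax_one_hots / get_tokens_from_one_hots helpers.

import torch
import torch.nn.functional as F

# Toy sizes standing in for BERT's vocabulary and hidden dimension.
vocab_size, seq_len, hidden = 30522, 4, 768
# Learnable values over the vocabulary for the modifiable tokens
# (playing the role of `modify` / `change1` / `change2` above).
modify = torch.randn(1, seq_len, vocab_size, requires_grad=True)
# Random matrix standing in for the BERT word-embedding table.
embedding_matrix = torch.randn(vocab_size, hidden)

temperature = 2.0
# A lower temperature makes the distribution spikier, closer to a hard one-hot.
one_hots_sm = F.softmax(modify / temperature, dim=-1)
# Soft token embeddings: a convex combination of real token embeddings, which
# is what the listings feed into the model in place of discrete token ids.
soft_embeddings = one_hots_sm @ embedding_matrix  # shape (1, seq_len, hidden)
# Read off the currently winning tokens, like get_tokens_from_one_hots above.
max_values, token_ids = one_hots_sm.max(dim=-1)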