Example #1
0
 def reward_fn(code_string):
   """Build a RewardInfo whose rewards are the code points of `code_string`.

   Args:
     code_string: String to score. Every character yields one reward equal to
         `float(ord(character))`.

   Returns:
     misc.RewardInfo with one episode reward per character, empty input /
     expected-output / code-output lists, integer input and output types, and
     the reason 'none'.
   """
   # One reward per character: its code point as a float.
   char_rewards = list(map(float, map(ord, code_string)))
   integer_type = misc.IOType.integer
   return misc.RewardInfo(
       episode_rewards=char_rewards,
       input_case=[],
       correct_output=[],
       code_output=[],
       input_type=integer_type,
       output_type=integer_type,
       reason='none')
Example #2
0
 def _score_string(self, string):
     """Score `string` with `self.reward_fn` and package the result.

     Args:
       string: BF string to score. It is converted to tokens with
           `misc.bf_string_to_tokens` before being handed to `self.reward_fn`.

     Returns:
       misc.RewardInfo whose episode rewards are 0.0 everywhere except the last
       entry, which holds the reward returned by `self.reward_fn`. The tokens
       are reported as the code output, and the reason is 'correct' or 'wrong'
       according to the flag returned by `self.reward_fn`.
     """
     tokens = misc.bf_string_to_tokens(string)
     final_reward, is_correct = self.reward_fn(tokens)

     if is_correct:
         outcome = 'correct'
     else:
         outcome = 'wrong'

     # Zero reward for every position but the last; the last carries the score.
     per_char_rewards = [0.0] * (len(string) - 1)
     per_char_rewards.append(final_reward)

     return misc.RewardInfo(
         episode_rewards=per_char_rewards,
         input_case=None,
         correct_output=None,
         code_output=tokens,
         input_type=None,
         output_type=misc.IOType.integer,
         reason=outcome)
  def _score_code(self, code):
    """Execute `code` on the task's test cases and convert the outcome to rewards.

    Every (input, expected output) pair from `self.task.make_io_set()` is run
    through `bf.evaluate`. A successful run adds `self.reward_fn(...)` to the
    terminal reward; an exact match additionally adds `self.correct_bonus` and
    a code-length bonus. The first unsuccessful run stops the evaluation,
    replaces the accumulated reward with `self.failure_reward` and discards the
    outputs gathered so far. The terminal reward is divided by
    `self.best_reward` before it is returned.

    Args:
      code: A single BF code string.

    Returns:
      misc.RewardInfo namedtuple instance containing reward and code execution
          information, including inputs, expected outputs, code outputs, input
          and output types, and reason for the reward obtained.
    """
    # List of 2-tuples: (input sequence, expected output sequence).
    io_pairs = self.task.make_io_set()
    total = 0.0
    outputs = []
    verdict = 'correct'

    for case_input, expected in io_pairs:
      outcome = bf.evaluate(
          code,
          input_buffer=case_input,
          timeout=0.1,
          max_steps=self.max_execution_steps,
          base=self.task.base,
          require_correct_syntax=self.require_correct_syntax)
      produced = outcome.output
      succeeded = outcome.success

      if not succeeded:
        # Execution failed (the evaluator reports why). Drop everything
        # accumulated so far and stop testing further cases.
        total = self.failure_reward
        outputs = []
        verdict = outcome.failure_reason
        break

      total += self.reward_fn(produced, expected, self.task.base)
      if produced == expected:
        total += self.correct_bonus  # Bonus for correct answer.

        # Shorter code only ever earns extra reward; it is never penalized,
        # because subtracting reward interferes with the main objective. Length
        # is thus only optimized once a solution has been found.
        if self.min_code_length == self.max_code_length:
          length_bonus = self.code_length_bonus
        else:
          length_bonus = self.code_length_bonus * clipped_linear(
              x=len(code), x0=self.min_code_length, y0=1.0,
              slope=-self.time_penalty, y_range=(0.0, 1.0))
        total += length_bonus
      elif verdict == 'correct':
        # First mismatching case: downgrade the verdict. It stays 'wrong' for
        # the remaining cases unless a later run fails.
        verdict = 'wrong'

      outputs.append(produced)

    # One reward per char in the code; all are 0 except the terminal reward.
    total /= self.best_reward
    rewards = [0.0] * (len(code) - 1)
    rewards.append(total)

    return misc.RewardInfo(
        episode_rewards=rewards,
        input_case=misc.IOTuple(given for given, _ in io_pairs),
        correct_output=misc.IOTuple(wanted for _, wanted in io_pairs),
        code_output=misc.IOTuple(outputs),
        input_type=self.input_type,
        output_type=self.output_type,
        reason=verdict)