def _process_step_inputs(self, inputs, maxlen=None):
  """Turn a batch of sequences of (MemoryInputTuple, features) pairs into one batched MemoryInputTuple plus a batched feature array.

  Args:
    inputs: a list of sequences, each a list of
      (MemoryInputTuple, output_features) pairs, e.g.
      [[(MemTuple(1, 2, [1, 2, 3]), feats), ...], ...].
    maxlen: maximum length of a program.

  Returns:
    A pair ((processed_inputs, sequence_length),
    (output_feature_batch, sequence_length)), where processed_inputs is a
    MemoryInputTuple of batched arrays like
    MemTuple(np.array([1, 1, ...]), np.array([2, 2, ...]),
             np.array([[1, 2, 3, -1, ...], [1, 2, 3, -1, ...]])).
  """
  read_ind = np.array([[x[0].read_ind for x in seq] for seq in inputs])
  write_ind = np.array([[x[0].write_ind for x in seq] for seq in inputs])
  valid_indices = np.array(
      [[_pad_list(x[0].valid_indices, -1, self.max_n_valid_indices)
        for x in seq] for seq in inputs])
  output_features = np.array(
      [[_pad_list(x[1], [0], self.max_n_valid_indices) for x in seq]
       for seq in inputs])

  read_ind_batch, sequence_length = data_utils.convert_seqs_to_batch(
      read_ind, maxlen)
  output_feature_batch, _ = data_utils.convert_seqs_to_batch(
      output_features, maxlen)
  write_ind_batch, _ = data_utils.convert_seqs_to_batch(write_ind, maxlen)
  valid_indices_batch, _ = data_utils.convert_seqs_to_batch(
      valid_indices, maxlen)

  processed_inputs = tf_utils.MemoryInputTuple(
      read_ind_batch, write_ind_batch, valid_indices_batch)
  return ((processed_inputs, sequence_length),
          (output_feature_batch, sequence_length))
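# A minimal usage sketch (hypothetical shapes and values; assumes `env` is an
# instance of this class and `_pad_list`/`data_utils` behave as used above):
#
#   mem = tf_utils.MemoryInputTuple(read_ind=1, write_ind=2,
#                                   valid_indices=[1, 2, 3])
#   seqs = [[(mem, [[0], [0], [0]])], [(mem, [[0], [0], [0]])]]
#   (ins, seq_len), (feats, _) = env._process_step_inputs(seqs)
#   # ins.read_ind has shape [batch, time]; ins.valid_indices is padded
#   # with -1 to [batch, time, max_n_valid_indices].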
def reset(self):
  """Reset the environment to the start of a new episode."""
  self.actions = []
  self.mapped_actions = []
  self.rewards = []
  self.done = False
  valid_actions = self.de_vocab.lookup(self.interpreter.valid_tokens())
  if self.use_cache:
    # Filter out actions whose resulting partial program has already been
    # explored and saved in the cache.
    new_valid_actions = []
    for ma in valid_actions:
      partial_program = self.de_vocab.lookup(
          self.mapped_actions + [ma], reverse=True)
      if not self.cache.check(partial_program):
        new_valid_actions.append(ma)
    valid_actions = new_valid_actions
  self.valid_actions = valid_actions
  self.start_ob = (
      tf_utils.MemoryInputTuple(self.de_vocab.decode_id, -1, valid_actions),
      [self.id_feature_dict[a] for a in valid_actions])
  self.obs = [self.start_ob]
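# A hedged usage sketch (the driver code is hypothetical): after reset(),
# the first observation pairs a MemoryInputTuple reading the decode-start id
# with one feature vector per valid action.
#
#   env.reset()
#   mem_input, action_features = env.start_ob
#   assert len(action_features) == len(env.valid_actions)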
def step(self, action, debug=False):
  self.actions.append(action)
  if debug:
    print('-' * 50)
    print(self.de_vocab.lookup(self.valid_actions, reverse=True))
    print('pick #{} valid action'.format(action))
    print('history:')
    print(self.de_vocab.lookup(self.mapped_actions, reverse=True))
    # print('env: {}, cache size: {}'.format(self.name, len(self.cache._set)))
    print('obs')
    pprint.pprint(self.obs)

  if 0 <= action < len(self.valid_actions):
    mapped_action = self.valid_actions[action]
  else:
    # Dump the full context before failing: the lookup below will raise
    # an IndexError for an out-of-range action.
    print('-' * 50)
    # print('env: {}, cache size: {}'.format(self.name, len(self.cache._set)))
    print('action out of range.')
    print('action:')
    print(action)
    print('valid actions:')
    print(self.de_vocab.lookup(self.valid_actions, reverse=True))
    print('pick #{} valid action'.format(action))
    print('history:')
    print(self.de_vocab.lookup(self.mapped_actions, reverse=True))
    print('obs')
    pprint.pprint(self.obs)
    print('-' * 50)
    mapped_action = self.valid_actions[action]

  self.mapped_actions.append(mapped_action)

  result = self.interpreter.read_token(
      self.de_vocab.lookup(mapped_action, reverse=True))

  self.done = self.interpreter.done
  # The result is scored (and the score used as the reward) only when the
  # program is finished and either it has no extra work or we don't care
  # about extra work.
  if self.done and not (self.punish_extra_work and
                        self.interpreter.has_extra_work()):
    reward = self.score_fn(self.interpreter.result, self.answer)
  else:
    reward = 0.0

  if self.done and self.interpreter.result == [computer_factory.ERROR_TK]:
    self.error = True

  if result is None or self.done:
    new_var_id = -1
  else:
    new_var_id = self.de_vocab.lookup(self.interpreter.namespace.last_var)

  valid_tokens = self.interpreter.valid_tokens()
  valid_actions = self.de_vocab.lookup(valid_tokens)

  # For each action, check the cache for the resulting program; if it has
  # already been tried, the action is no longer valid.
  if self.use_cache:
    new_valid_actions = []
    cached_actions = []
    partial_program = self.de_vocab.lookup(self.mapped_actions, reverse=True)
    for ma in valid_actions:
      new_program = partial_program + [
          self.de_vocab.lookup(ma, reverse=True)
      ]
      if not self.cache.check(new_program):
        new_valid_actions.append(ma)
      else:
        cached_actions.append(ma)
    valid_actions = new_valid_actions

  self.valid_actions = valid_actions
  self.rewards.append(reward)
  ob = (tf_utils.MemoryInputTuple(
      read_ind=mapped_action,
      write_ind=new_var_id,
      valid_indices=self.valid_actions),
        [self.id_feature_dict[a] for a in valid_actions])

  # If no valid actions are available, then stop.
  if not self.valid_actions:
    self.done = True
    self.error = True

  # If the program is not finished yet, collect the observation.
  if not self.done:
    # Add the actions filtered out by the cache back into the training
    # example, because at test time they will be present (no cache is
    # available then).
    if self.use_cache:
      valid_actions = self.valid_actions + cached_actions
      true_ob = (tf_utils.MemoryInputTuple(
          read_ind=mapped_action,
          write_ind=new_var_id,
          valid_indices=valid_actions),
                 [self.id_feature_dict[a] for a in valid_actions])
      self.obs.append(true_ob)
    else:
      self.obs.append(ob)
  elif self.use_cache:
    # If the program is already finished, save it in the cache.
    self.cache.save(self.de_vocab.lookup(self.mapped_actions, reverse=True))

  return ob, reward, self.done, {}
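# An illustrative rollout loop (not from this codebase; a real agent would
# score env.valid_actions with a model instead of sampling uniformly):
#
#   import random
#   env.reset()
#   while not env.done:
#     action = random.randrange(len(env.valid_actions))
#     ob, reward, done, _ = env.step(action)
#   total_reward = sum(env.rewards)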