def consistent_programs(self, action, state, timestep):
    """Returns the WeightedPrograms consistent with the action at
    (state, timestep), along with the probability of the action given
    each program.

    Args:
        action (MiniWoBAction)
        state (MiniWoBState)
        timestep (int)

    Returns:
        a_given_p ({WeightedProgram: float}): the keys are all the
            consistent programs; each value is the probability of the
            action given that program.
    """
    assert timestep < len(self._demo)
    w_programs = self._demo.programs(timestep)
    env = ExecutionEnvironment(state)

    a_given_p = {}
    for w_program in w_programs:
        program = w_program.program
        if program is None:
            # Skip actions (None programs) are never counted as consistent.
            continue

        if isinstance(action, MiniWoBTerminate):
            # Non-executable programs are consistent with MiniWoBTerminate:
            # either zero execution paths or a raised execution error.
            try:
                if program.execution_paths(env) == 0:
                    a_given_p[w_program] = 1.
            except ProgramExecutionException:
                a_given_p[w_program] = 1.
        else:
            # Regular action was played. `program` is guaranteed non-None
            # here (skips were filtered above), so the original redundant
            # None check is dropped.
            try:
                # Sometimes programs cannot execute
                if program.consistent(env, action):
                    num_execution_paths = program.execution_paths(env)
                    # Guard against a consistent program reporting zero
                    # paths, which would otherwise raise an uncaught
                    # ZeroDivisionError.
                    if num_execution_paths > 0:
                        # Uniform probability over execution paths.
                        a_given_p[w_program] = 1. / num_execution_paths
            except ProgramExecutionException as e:
                logging.info(
                    "consistent_programs({}, {}, {}, {}): {}".format(
                        self, action, state, timestep, e))
    return a_given_p
def _init_weights(self, labeled_demo):
    """Initializes the weight of every program in the labeled demo.

    With a zero initial weight, all programs get weight 0. Otherwise the
    initial weight goes only to the most "precise" programs at each
    timestep (fewest execution paths); all other programs get weight 0.
    Finally, programs are pruned down to at most max_programs per
    timestep and the critics are initialized.

    Args:
        labeled_demo (LabeledDemonstration)
    """
    weight = float(self._config.weight_init)
    for i in range(len(labeled_demo)):
        weighted_programs = labeled_demo.programs(i)
        if not weighted_programs:
            continue

        if weight == 0.:
            for weighted_program in weighted_programs:
                weighted_program.set_weight(0.)
        else:
            # Clipping-based: give high weight to precise programs
            env = ExecutionEnvironment(labeled_demo.state(i))
            num_results = []
            for weighted_program in weighted_programs:
                program = weighted_program.program
                if program is None:
                    # Skip is good: treat as maximally precise.
                    num_result = 1
                else:
                    try:
                        num_result = program.execution_paths(env)
                        assert num_result >= 0
                    except ProgramExecutionException:
                        # Sentinel: a failing program must never win the
                        # min() below.
                        num_result = 999
                num_results.append(num_result)

            # Find programs with minimal number of matches.
            min_result = min(num_results)
            assert len(weighted_programs) == len(num_results)
            # NOTE: each item here is a WeightedProgram (not a raw
            # Program) -- renamed from the original shadowing `program`.
            for weighted_program, num_result in zip(
                    weighted_programs, num_results):
                if num_result == min_result:
                    weighted_program.set_weight(weight)
                else:
                    weighted_program.set_weight(0.)

        # TODO: This prunes programs randomly after choosing the most
        # restrictive
        pruned_programs = sorted(
            weighted_programs, reverse=True,
            key=lambda x: x.weight)[:self._config.max_programs]
        labeled_demo.set_programs(i, pruned_programs)
    labeled_demo.initialize_critics(float(self._config.init_v))
def _edges_to_programs(vertex):
    """Lists all WeightedPrograms that could execute to the actions on the
    ActionEdges leaving the given StateVertex.

    Args:
        vertex (StateVertex)

    Returns:
        list[WeightedProgram]
    """
    env = ExecutionEnvironment(vertex.state)
    results = []
    for edge in vertex.action_edges:
        action = edge.action
        state_incr = edge.end - edge.start
        if action is None:
            # A None action is a skip edge: represent it with a None program.
            results.append(WeightedProgram(None, 1., state_incr))
            continue

        # String-producing tokens: literal strings, then field selectors.
        fields = env.fields
        fields_tokens = [
            FieldsValueSelectorToken(i) for i in range(len(fields.keys))
        ]
        string_tokens = [StringToken(s) for s in env.valid_strings]
        string_tokens += fields_tokens

        # Element-set tokens: tags, then string matchers over all strings.
        # TODO: Support last. Hard because it depends on the actual exec
        # env.
        element_tokens = [TagToken(tag) for tag in env.tags]
        element_tokens += [LikeToken(tok) for tok in string_tokens]
        element_tokens += [ExactMatchToken(tok) for tok in string_tokens]

        # Max one-level of spatial relations (Near, SameRow, SameCol)
        # built on the base element sets.
        classes = action.element.classes
        spatial_tokens = [
            NearToken(tok, classes) for tok in element_tokens
        ]
        spatial_tokens += [
            SameRowToken(tok, classes) for tok in element_tokens
        ]
        spatial_tokens += [
            SameColToken(tok, classes) for tok in element_tokens
        ]
        element_tokens += spatial_tokens

        # Candidate action tokens of both kinds.
        click_tokens = [ClickToken(tok) for tok in element_tokens]
        type_tokens = [
            FocusAndTypeToken(elem_tok, str_tok)
            for elem_tok, str_tok in itertools.product(
                element_tokens, fields_tokens)
        ]
        # Random typing actions
        type_tokens += [
            FocusAndRandomFieldTypeToken(tok) for tok in element_tokens
        ]

        if isinstance(action, MiniWoBElementClick):
            candidates = click_tokens
        elif isinstance(action, MiniWoBFocusAndType):
            candidates = type_tokens
        else:
            raise ValueError("Action: {} not supported.".format(action))

        # Keep only the candidates consistent with the observed action.
        results.extend(
            WeightedProgram(candidate, 1., state_incr)
            for candidate in candidates
            if candidate.consistent(env, action))
    return results
def next_action(self, state): """Returns a next sampled action from following this demonstration. If demonstration is already played through, returns FAIL. Args: state (MiniWoBState): the current state Returns: action (MiniWoBAction) """ # Update environment if self._env is None: self._env = ExecutionEnvironment(state) else: self._env.observe(state) # Greedy: choose the best action that executes if self._test: # NOTE: selected_programs and candidate_programs are not updated # because you should not be taking gradient steps on test. action, new_cursor = self._get_best_action(state, self._cursor) self._cursor = new_cursor return action else: # Sample until you get a concrete action justifications = [] while True: selected_w_program = self._sample_program(state, self._cursor) # Update book-keeping weighted_programs, probs = self._programs_and_probs( self._cursor) if len(weighted_programs) > 0: self._trajectory_cursors.append(self._cursor) self._selected_programs.append(selected_w_program) self._candidate_programs.append(weighted_programs) state_value = self._demo.critics[self._cursor] else: # Sampled action is a terminate state_value = None self._cursor += selected_w_program.state_incr program = selected_w_program.program if program is None: # Skip action justifications.append( DemoJustification(weighted_programs, probs, selected_w_program, ElementSet.EMPTY, state_value)) else: # Regular weighted program elem_set = ElementSet.EMPTY try: action = program.execute(self._env) elem_set = program.possible_elements(self._env) except ProgramExecutionException as e: logging.info("DemoPlayer: %s", e) action = MiniWoBTerminate() justifications.append( DemoJustification(weighted_programs, probs, selected_w_program, elem_set, state_value)) action.justification = DemoJustificationList( justifications) return action
class DemoPlayer(object):
    """Wraps a demo, execution env, and cursor inside of a demonstration.

    Args:
        demo (LabeledDemonstration)
        policy (ProgramPolicy)
        test (bool)
    """

    def __init__(self, demo, policy, test=False):
        self._demo = demo
        self._policy = policy
        self._test = test
        self._env = None
        self._cursor = 0
        # list[int]: cursor position at the time of each program selection
        self._trajectory_cursors = []
        # list[WeightedProgram]: programs selected so far, in order
        self._selected_programs = []
        # list[list[WeightedProgram]]: candidates at each selection
        self._candidate_programs = []

    def next_action(self, state):
        """Returns a next sampled action from following this demonstration.

        If demonstration is already played through, returns FAIL.

        Args:
            state (MiniWoBState): the current state

        Returns:
            action (MiniWoBAction)
        """
        # Update environment
        if self._env is None:
            self._env = ExecutionEnvironment(state)
        else:
            self._env.observe(state)

        # Greedy: choose the best action that executes
        if self._test:
            # NOTE: selected_programs and candidate_programs are not updated
            # because you should not be taking gradient steps on test.
            action, new_cursor = self._get_best_action(state, self._cursor)
            self._cursor = new_cursor
            return action
        else:
            # Sample until you get a concrete action
            justifications = []
            while True:
                selected_w_program = self._sample_program(
                    state, self._cursor)
                # Update book-keeping
                weighted_programs, probs = self._programs_and_probs(
                    self._cursor)
                if len(weighted_programs) > 0:
                    self._trajectory_cursors.append(self._cursor)
                    self._selected_programs.append(selected_w_program)
                    self._candidate_programs.append(weighted_programs)
                    state_value = self._demo.critics[self._cursor]
                else:
                    # Sampled action is a terminate
                    state_value = None
                # Advance the cursor before executing the program.
                self._cursor += selected_w_program.state_incr
                program = selected_w_program.program
                if program is None:
                    # Skip action: record a justification and keep sampling.
                    justifications.append(
                        DemoJustification(weighted_programs, probs,
                                          selected_w_program,
                                          ElementSet.EMPTY, state_value))
                else:
                    # Regular weighted program
                    elem_set = ElementSet.EMPTY
                    try:
                        action = program.execute(self._env)
                        elem_set = program.possible_elements(self._env)
                    except ProgramExecutionException as e:
                        # Execution failed: fall back to terminating.
                        logging.info("DemoPlayer: %s", e)
                        action = MiniWoBTerminate()
                    justifications.append(
                        DemoJustification(weighted_programs, probs,
                                          selected_w_program, elem_set,
                                          state_value))
                    action.justification = DemoJustificationList(
                        justifications)
                    return action

    # TODO: Define a SkipToken?
    def _sample_program(self, state, cursor):
        """Returns a WeightedProgram sampled at the current cursor.

        The program in the WeightedProgram may be None, indicating a skip
        action.

        Args:
            state (MiniWoBState): concrete state
            cursor (int): index of the current demo state

        Returns:
            WeightedProgram: If the WeightedProgram is None, no programs
            were available for sampling.
        """
        weighted_programs, probs = self._programs_and_probs(cursor)
        if not weighted_programs:
            # No programs available for sampling.
            # NOTE(review): state_incr is omitted here, unlike other
            # WeightedProgram constructions in this file -- confirm the
            # constructor's default is the intended increment.
            return WeightedProgram(TerminateToken(), 0.)
        weighted_program = np.random.choice(weighted_programs, p=probs)
        return weighted_program

    def _get_best_action(self, state, cursor):
        """Execute the highest scoring program that executes to produce an
        action.

        The justification for the action includes zero or more
        justifications for skip actions, which just advance the cursor.

        Args:
            state (MiniWoBState): concrete state
            cursor (int): index of the current demo state

        Returns:
            action (ProgramAction)
            new_cursor (int): the new cursor position
        """
        def helper(state, cursor, justifications):
            """Returns action, new cursor position keeping track of
            justifications in a list.
            """
            weighted_programs, probs = self._programs_and_probs(cursor)
            assert len(weighted_programs) == len(probs)
            ranked = sorted(zip(weighted_programs, probs),
                            key=lambda x: x[1], reverse=True)
            # Bug fix: index critics with the local cursor, not
            # self._cursor -- after recursing through skip edges the two
            # diverge, and the critic must match the state being examined.
            state_value = self._demo.critics[cursor] \
                if len(weighted_programs) > 0 else None
            for weighted_program, prob in ranked:
                program = weighted_program.program
                if program is not None:
                    # Regular program: see if the program executes.
                    try:
                        action = program.execute(self._env)
                    except ProgramExecutionException as e:
                        logging.info("DemoPlayer: %s", e)
                        continue
                    new_cursor = cursor + weighted_program.state_incr
                    # Compute justification
                    element_set = program.possible_elements(self._env)
                    justifications.append(
                        DemoJustification(weighted_programs, probs,
                                          weighted_program, element_set,
                                          state_value))
                    action.justification = DemoJustificationList(
                        justifications)
                    return action, new_cursor
                else:
                    # Skip edge: advance the cursor and recurse.
                    new_cursor = cursor + weighted_program.state_incr
                    # Compute justification
                    justifications.append(
                        DemoJustification(weighted_programs, probs,
                                          weighted_program,
                                          ElementSet.EMPTY, state_value))
                    return helper(state, new_cursor, justifications)
            # No program executed successfully: terminate.
            action = MiniWoBTerminate()
            justifications.append(
                DemoJustification(weighted_programs, probs, None,
                                  ElementSet.EMPTY, state_value))
            action.justification = DemoJustificationList(justifications)
            return action, cursor

        return helper(state, cursor, [])

    def _programs_and_probs(self, cursor):
        """Returns two parallel lists of weighted programs and their
        probabilities at the current cursor.

        Args:
            cursor (int)

        Returns:
            list[WeightedProgram]
            list[float]
        """
        # Past the end of the demo
        if cursor >= len(self._demo):
            return [], []
        weighted_programs = self._demo.programs(cursor)
        if not weighted_programs:
            return [], []
        probs = self._policy.compute_program_probs(weighted_programs)
        return weighted_programs, probs

    def consistent_programs(self, action, state, timestep):
        """Returns the WeightedPrograms consistent with the action at
        (state, timestep), along with the probability of the action given
        each program.

        Args:
            action (MiniWoBAction)
            state (MiniWoBState)
            timestep (int)

        Returns:
            a_given_p ({WeightedProgram: float}): the keys are all the
                consistent programs; each value is the probability of the
                action given that program.
        """
        assert timestep < len(self._demo)
        w_programs = self._demo.programs(timestep)
        env = ExecutionEnvironment(state)

        a_given_p = {}
        for w_program in w_programs:
            program = w_program.program
            if program is None:
                # Skip actions are never counted as consistent.
                continue

            if isinstance(action, MiniWoBTerminate):
                # Non-executable programs are consistent with
                # MiniWoBTerminate: zero paths or an execution error.
                try:
                    if program.execution_paths(env) == 0:
                        a_given_p[w_program] = 1.
                except ProgramExecutionException:
                    a_given_p[w_program] = 1.
            else:
                # Regular action was played. `program` is guaranteed
                # non-None here, so the redundant None check is dropped.
                try:
                    # Sometimes programs cannot execute
                    if program.consistent(env, action):
                        num_execution_paths = program.execution_paths(env)
                        # Guard against ZeroDivisionError, which the
                        # except clause below would not catch.
                        if num_execution_paths > 0:
                            a_given_p[w_program] = 1. / num_execution_paths
                except ProgramExecutionException as e:
                    logging.info(
                        "consistent_programs({}, {}, {}, {}): {}".format(
                            self, action, state, timestep, e))
        return a_given_p

    @property
    def demo(self):
        """Returns the LabeledDemonstration object."""
        return self._demo

    @property
    def trajectory_cursors(self):
        """Returns the list[int] of cursors at each selected program."""
        return self._trajectory_cursors

    @property
    def selected_programs(self):
        """Returns the list[WeightedPrograms] that were played in order."""
        return self._selected_programs

    @property
    def candidate_programs(self):
        """Returns the list[list[WeightedPrograms]] of candidate programs."""
        return self._candidate_programs

    @property
    def fields(self):
        """Returns the Fields associated with this demo."""
        return self._demo.fields