Example #1
    def reconfigureConditionalInitialValues(self):
        """
        @brief Redo initial configuration calculations based on properties.

        Useful for derived classes that want to initialize some values later in the __init__ method.
        """
        self.num_actions = len(self.action_list)
        self.updatePrimitiveActionIndices()
        if not self.states:
            self.num_agents = 0
        else:
            if type(self.states[0]) is tuple:
                self.num_agents = len(self.states[0])
            else:
                self.num_agents = 1
        self.num_states = len(self.states)
        self.setObservableStates(self.states)
        if self.grid_map is not None:
            self.num_cells = self.grid_map.size
            self.grid_dtype = DataHelp.getSmallestNumpyUnsignedIntType(self.num_cells)
            self.grid_cell_vec = self.grid_map.ravel()
        else:
            self.grid_dtype = self.prob_dtype
            self.grid_cell_vec = np.array([], self.grid_dtype)
        self.act_prob_row_idx_of_grid_cell = dict.fromkeys(self.grid_cell_vec.tolist())
        self.precomputeGridCellActProbMatRows()

        # Only rebuild 'prob' if it's an empty dictionary and we have information to build it from the action
        # probabilities ('act_prob').
        if not any(self.prob) and any(self.act_prob):
            self.buildProbDict()

        if self.num_states > 0:
            self.setInitialProbDist(self.init if self.init_set is None else self.init_set)
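
# A minimal sketch (assumption: the method above belongs to a class named MDP; GridWorldMDP below is
# purely hypothetical) of what the docstring describes: a derived class fills in properties after the
# base constructor has run and then asks the base class to redo the dependent calculations.
class GridWorldMDP(MDP):
    def __init__(self, grid_map=None, **kwargs):
        super(GridWorldMDP, self).__init__(**kwargs)
        # Properties that were not known when the base __init__ ran.
        self.grid_map = grid_map
        self.action_list = ['0_Empty', 'North', 'South', 'East', 'West']
        # Recompute num_actions, num_agents, the grid bookkeeping, and the initial distribution.
        self.reconfigureConditionalInitialValues()
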
def rolloutInferSolve(arena_mdp,
                      robot_idx,
                      env_idx,
                      num_batches=10,
                      num_trajectories_per_batch=100,
                      num_steps_per_traj=15,
                      inference_method='gradientAscentGaussianTheta',
                      infer_dtype=np.float64,
                      num_theta_samples=2000,
                      SGA_eps=0.00001,
                      SGA_log_prob_thresh=np.log(0.8)):

    # Create a reference to the mdp used for inference.
    infer_mdp = arena_mdp.infer_env_mdp
    true_env_policy_vec = infer_mdp.getPolicyAsVec(
        policy_to_convert=arena_mdp.env_policy[env_idx])
    infer_mdp.theta = np.zeros(len(infer_mdp.phi))
    infer_mdp.theta_std_dev = np.ones(infer_mdp.theta.size)

    winning_reward = {act: 0.0 for act in arena_mdp.action_list}
    winning_reward['0_Empty'] = 1.0

    # Data types are constant for every batch.
    hist_dtype = DataHelper.getSmallestNumpyUnsignedIntType(
        arena_mdp.num_observable_states)
    observation_dtype = DataHelper.getSmallestNumpyUnsignedIntType(
        arena_mdp.num_actions)

    # Create a list of policy keys (states paired with their DRA transitions) for printing.
    policy_keys_to_print = deepcopy([
        (state[0], arena_mdp.dra.get_transition(arena_mdp.L[state], state[1]))
        for state in arena_mdp.states if 'q0' in state
    ])
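    # Based on the comprehension above: each key pairs the grid portion of a state whose automaton
    # component is 'q0' with the DRA state reached from 'q0' under that state's label.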

    # Variables for logging data
    inferred_policy = {}
    inferred_policy_L1_norm = {}
    inferred_policy_variance = {}

    for batch in range(num_batches):
        batch_start_time = time.time()
        ### Roll Out ###
        run_histories = np.zeros(
            [num_trajectories_per_batch, num_steps_per_traj], dtype=hist_dtype)
        observed_action_indeces = np.empty(
            [num_trajectories_per_batch, num_steps_per_traj],
            dtype=observation_dtype)
        for episode in range(num_trajectories_per_batch):
            # Create time-history for this episode.
            _, run_histories[episode, 0] = arena_mdp.resetState()
            for t_step in range(1, num_steps_per_traj):
                # Take step
                _, run_histories[episode, t_step] = arena_mdp.step()
                # Record observed action.
                prev_state_idx = run_histories[episode, t_step - 1]
                prev_state = arena_mdp.observable_states[prev_state_idx]
                this_state_idx = run_histories[episode, t_step]
                this_state = arena_mdp.observable_states[this_state_idx]
                observed_action_indeces[episode, t_step] = (
                    infer_mdp.graph.getObservedAction(prev_state, this_state))

        ### Infer ###
        theta_vec = infer_mdp.inferPolicy(
            method=inference_method,
            histories=run_histories,
            do_print=False,
            reference_policy_vec=true_env_policy_vec,
            use_precomputed_phi=True,
            monte_carlo_size=num_theta_samples,
            precomputed_observed_action_indeces=observed_action_indeces,
            print_iterations=True,
            eps=SGA_eps,
            thresh_prob=0.3,
            theta_0=infer_mdp.theta)
        # Print the inference error.
        inferred_policy_L1_norm_error = MDP.getPolicyL1Norm(
            true_env_policy_vec, infer_mdp.getPolicyAsVec())
        print('Batch {}: L1-norm from ref to inferred policy: {}.'.format(
            batch, inferred_policy_L1_norm_error))
        print('L1-norm as a fraction of max error: {}.'.format(
            inferred_policy_L1_norm_error / 2 / len(true_env_policy_vec)))

        # Go through and pop keys from policy_uncertainty into a dict built from policy_keys_to_print.
        arena_mdp.configureReward(winning_reward,
                                  bonus_reward_at_state=makeBonusReward(
                                      infer_mdp.policy_uncertainty))

        arena_mdp.solve(do_print=False,
                        method='valueIteration',
                        write_video=False,
                        policy_keys_to_print=policy_keys_to_print)
        batch_stop_time = time.time()
        print('Batch {} runtime {} sec.'.format(
            batch, batch_stop_time - batch_start_time))
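
# Hedged usage sketch: everything below is illustrative. `build_arena_mdp` stands in for whatever
# constructor produces a multi-agent MDP with an attached `infer_env_mdp`; it is not part of this
# module. rolloutInferSolve then alternates roll-out, inference, and re-solving for each batch.
if __name__ == '__main__':
    arena_mdp = build_arena_mdp()  # hypothetical factory, named here only for illustration
    rolloutInferSolve(arena_mdp,
                      robot_idx=0,
                      env_idx=1,
                      num_batches=5,
                      num_trajectories_per_batch=50,
                      num_steps_per_traj=15)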
Example #3
# by the robot. (Repulsive factors are buried in the method below.) The call below updates the VI_mdp.env_policy
# dictionary.
ExperimentConfigs.convertSingleAgentEnvPolicyToMultiAgent(VI_mdp, labels, state_env_idx=env_idx,
                                                          new_kernel_weight=1.0, new_phi_sigma=1.0, plot_policies=False,
                                                          alphabet_dict=alphabet_dict,
                                                          fixed_obstacle_labels=fixed_obs_labels)

########################################################################################################################
# Demonstrate Trajectories
########################################################################################################################
demo_mdp = VI_mdp
if gather_new_data:
    # Use policy to simulate and record results.
    #
    # Current policy E{T|R} 6.7. Start by simulating 10 steps each episode.
    hist_dtype = DataHelper.getSmallestNumpyUnsignedIntType(demo_mdp.num_observable_states)
    run_histories = np.zeros([num_episodes, steps_per_episode], dtype=hist_dtype)
    for episode in range(num_episodes):
        # Create time-history for this episode.
        _, run_histories[episode, 0] = demo_mdp.resetState()
        for t_step in range(1, steps_per_episode):
            _, run_histories[episode, t_step] = demo_mdp.step()
    pickled_episodes_file = DataHelper.pickleEpisodes(variables_to_save=[run_histories], name_prefix=pickled_mdp_file,
                                                      num_episodes=num_episodes, steps_per_episode=steps_per_episode)
else:
    # Load pickled episodes. Note that trailing comma on assignment automatically unpacks run_histories from a list.
    (run_histories, pickled_episodes_file) = DataHelper.loadPickledEpisodes(pickled_episodes_file_to_load)
    num_episodes = run_histories.shape[0]
    steps_per_episode = run_histories.shape[1]

if print_history_analysis:
Example #4
    def __init__(self, init=None, action_list=[], states=[], prob=dict([]), gamma=.9, AP=set([]), L=dict([]),
                 reward=dict([]), grid_map=None, act_prob=dict([]), sink_action=None, sink_list=[], init_set=None,
                 prob_dtype=np.float64):
        self.prob_dtype = prob_dtype
        self.init = init  # Initial state.
        self.action_list = action_list
        self.controllable_agent_idx = 0
        self.executable_action_dict = {self.controllable_agent_idx: self.action_list}
        self.num_actions = len(self.action_list)
        self.updatePrimitiveActionIndices()
        self.states = states
        # Determine the number of agents based on the length of the state elements.
        if not self.states:
            self.num_agents = 0
        else:
            if type(self.states[0]) is tuple:
                self.num_agents = len(self.states[0])
            else:
                self.num_agents = 1
        self.num_states = len(self.states)
        self.current_state = None

        # cell_state_slicer: used to extract indices from the tuple of states. The joint-state tuples are used as
        # dictionary keys and derived classes should change this value as necessary. E.g., an MDPxDRA with multiple
        # agents might represent a state as ((s_r, s_e), q_i). This slice extracts ((s_r, s_e),).
        self.state_slice_length = None
        self.cell_state_slicer = slice(None, self.state_slice_length, None)
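        # Illustrative example (an assumption, not from the source): with state_slice_length = 1 the
        # slicer becomes slice(None, 1, None), so for a joint state ((s_r, s_e), q_i),
        # state[self.cell_state_slicer] evaluates to ((s_r, s_e),). With the default of None, the
        # slice keeps the whole state tuple.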

        self.gamma = gamma
        self.reward = reward
        self.grid_map = grid_map
        if self.grid_map is not None:
            self.num_cells = self.grid_map.size
            self.grid_dtype = DataHelp.getSmallestNumpyUnsignedIntType(self.num_cells)
            self.grid_cell_vec = self.grid_map.ravel()
        else:
            self.grid_dtype = self.prob_dtype
            self.grid_cell_vec = np.array([], self.grid_dtype)
        self.act_prob_row_idx_of_grid_cell = dict.fromkeys(self.grid_cell_vec.tolist())
        self.precomputeGridCellActProbMatRows()
        self.neighbor_dict = None
        self.prob = prob
        if not any(prob) and any(act_prob):
            # Build 'prob' now from the action probabilities. Assume the format of act_prob is
            # {act: [rows(location-class) x cols(act_list_order)]}.
            self.act_prob = act_prob
            self.buildProbDict()
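            # Hypothetical shape example (not from the source): with action_list = ['0_Empty', 'North']
            # and two location classes, act_prob could be
            #     {'0_Empty': np.array([[1.0, 0.0], [1.0, 0.0]]),
            #      'North':   np.array([[0.1, 0.9], [0.2, 0.8]])},
            # i.e. one row per location class and one column per action, in action_list order.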
        self.AP = AP  # Atomic propositions.
        self.L = L  # Labels of states.
        self.S = None  # Initial probability distribution.
        # For EM Solving
        if self.num_actions > 0:
            self.makeUniformPolicy()
        self.init_set = init_set
        self.sink_action = sink_action
        self.sink_list = []
        self.setSinks(sink_list)
        # Configure a uniform distribution across the states listed in init_set if that input is not None; otherwise
        # MDP.resetState will always reset `current_state` to self.init.
        self.setInitialProbDist(self.init if self.init_set is None else self.init_set)
        self.setObservableStates(self.states)
        DataHelp.writePolicyToCSV(EM_mdp.policy,
                                  policy_keys_to_print,
                                  file_name=pickled_mdp_file + '_EM_Policy')
        DataHelp.writePolicyToCSV(VI_mdp.policy,
                                  policy_keys_to_print,
                                  file_name=pickled_mdp_file + '_VI_Policy')

    # Choose which policy to use for demonstration.
    mdp = VI_mdp
    reference_policy_vec = mdp.getPolicyAsVec(policy_keys_to_print)

    if gather_new_data:
        # Use policy to simulate and record results.
        #
        # Current policy E{T|R} 6.7. Start by simulating 10 steps each episode.
        hist_dtype = DataHelp.getSmallestNumpyUnsignedIntType(mdp.num_states)
        run_histories = np.zeros([num_episodes, steps_per_episode],
                                 dtype=hist_dtype)
        for episode in range(num_episodes):
            # Create time-history for this episode.
            _, run_histories[episode, 0] = mdp.resetState()
            for t_step in range(1, steps_per_episode):
                _, run_histories[episode, t_step] = mdp.step()
        pickled_episodes_file = DataHelp.pickleEpisodes(
            variables_to_save=[run_histories],
            name_prefix=pickled_mdp_file,
            num_episodes=num_episodes,
            steps_per_episode=steps_per_episode)
    else:
        # Load pickled episodes. Note that trailing comma on assignment automatically unpacks run_histories from a list.
        (run_histories, pickled_episodes_file