Example #1
def build_subgoal_option_agent(mdp,
                               subgoals,
                               init_region,
                               agent=QLearningAgent,
                               vectors=None,
                               name='-abstr',
                               n_trajs=50,
                               n_steps=100,
                               classifier='list',
                               policy='vi'):
    """Wrap `agent` with options that drive it toward the given subgoals.

    Each entry of `subgoals` yields one option whose policy targets those goal
    states; `init_region` is the set of states from which options may be initiated.
    """
    goal_based_options = aa_helpers.make_subgoal_options(mdp,
                                                         subgoals,
                                                         init_region,
                                                         vectors=vectors,
                                                         n_trajs=n_trajs,
                                                         n_steps=n_steps,
                                                         classifier=classifier,
                                                         policy=policy)
    goal_based_aa = ActionAbstraction(prim_actions=mdp.get_actions(),
                                      options=goal_based_options,
                                      use_prims=True)

    # Wrap the base agent so it can act over both primitive actions and the options.
    option_agent = AbstractionWrapper(
        agent,
        agent_params={"actions": mdp.get_actions()},
        action_abstr=goal_based_aa,
        name_ext=name)

    return option_agent
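
A minimal usage sketch (not taken from the original source): it assumes this project sits on top of simple_rl, which provides GridWorldMDP and GridWorldState as well as the QLearningAgent and AbstractionWrapper used above. The subgoal states below are purely illustrative; the exact state type expected by aa_helpers.make_subgoal_options depends on the rest of the project.

# Usage sketch under the assumptions stated above.
from simple_rl.tasks import GridWorldMDP
from simple_rl.tasks.grid_world.GridWorldStateClass import GridWorldState

mdp = GridWorldMDP(width=5, height=5, init_loc=(1, 1), goal_locs=[(5, 5)])

# Two subgoal "regions" (each a list of states) and the region options may start from.
subgoals = [[GridWorldState(3, 3)], [GridWorldState(5, 1)]]
init_region = [GridWorldState(x, y) for x in range(1, 6) for y in range(1, 6)]

option_agent = build_subgoal_option_agent(mdp, subgoals, init_region, name='-subgoal')
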
    def generate_options(self):
        """Discover options from a graph over the known (visited) states and fit their policies."""
        # TODO: Train the policy using the experience replay buffer instead of sampling new trajectories.

        A, intToS = self.generate_matrix()
        known_region = list(intToS.values())
        if self.method == 'eigen':
            # TODO: how is A represented?
            _, options, vectors = Eigenoptions(A, self.n_ops)
        elif self.method == 'fiedler':
            _, options, _, vectors = FiedlerOptions(A, self.n_ops)
        elif self.method == 'bet':
            _, options, vectors = BetweennessOptions(A, self.n_ops)
        else:
            assert False, 'unknown option-generation method: ' + str(self.method)

        print('generated options:')
        for o in options:
            # An option is either a pair of index lists (init set, goal set) or a
            # single (init index, goal index) pair.
            if isinstance(o[0], list):
                print('inits:')
                for ss in o[0]:
                    print(intToS[ss])
                print('goals:')
                for ss in o[1]:
                    print(intToS[ss])
            else:
                print('init:', intToS[o[0]])
                print('goal:', intToS[o[1]])

        # Use a comprehension here: [[]] * n would alias one shared list, so every
        # append below would land in the same object.
        egoal_list = [[] for _ in range(len(options) * 2)]
        for i, o in enumerate(options):
            if type(o[0]) is list:
                for ss in o[0]:
                    egoal_list[i * 2].append(intToS[ss])
                for ss in o[1]:
                    egoal_list[i * 2 + 1].append(intToS[ss])
            else:
                egoal_list[i * 2] = [intToS[o[0]]]
                egoal_list[i * 2 + 1] = [intToS[o[1]]]

        # The same aliasing caveat applies to the per-option vector dicts.
        evector_list = [dict() for _ in range(len(options) * 2)]
        for i in range(len(options)):
            for j in intToS.keys():
                # The two directions of option i use the same vector with opposite signs.
                evector_list[i * 2][hash(intToS[j])] = -vectors[i][j]
                evector_list[i * 2 + 1][hash(intToS[j])] = vectors[i][j]

        # TODO: policy is computed using vi right now.
        goal_options = aa_helpers.make_subgoal_options(
            self.mdp,
            egoal_list,
            known_region,
            vectors=evector_list,
            n_trajs=self.op_n_episodes,
            n_steps=self.op_n_steps,
            classifier='list',
            policy='vi')

        return goal_options
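
For context, generate_options is an instance method; the skeleton below is a hypothetical sketch of the attributes and helper it relies on. Everything not referenced by the method itself, including the class name and default values, is an assumption, not the class from the original source.

class OptionGeneratorSketch(object):
    # Hypothetical skeleton; only the attribute names used by generate_options()
    # above are taken from the source.
    def __init__(self, mdp, method='fiedler', n_ops=4,
                 op_n_episodes=50, op_n_steps=100):
        self.mdp = mdp                      # simple_rl-style MDP being solved
        self.method = method                # 'eigen', 'fiedler', or 'bet'
        self.n_ops = n_ops                  # number of options to discover
        self.op_n_episodes = op_n_episodes  # trajectories used to fit option policies
        self.op_n_steps = op_n_steps        # max steps per trajectory

    def generate_matrix(self):
        # Must return (A, intToS): an adjacency matrix over visited states and a
        # mapping from matrix indices back to the corresponding state objects.
        raise NotImplementedError

The goal_options returned by generate_options can then be bundled into an ActionAbstraction in the same way build_subgoal_option_agent does above.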