import numpy as np

from simple_rl.agents import RandomAgent


def collect_dataset(mdp, samples=10000, learning_agent=None):
    '''
    Args:
        mdp (simple_rl.MDP)
        samples (int)
        learning_agent (simple_rl.Agent): If None, a random agent is used.
            Otherwise, data is collected while the given agent learns.

    Returns:
        (set)
    '''
    if learning_agent is None:
        learning_agent = RandomAgent(mdp.get_actions())

    cur_state = mdp.get_init_state()
    reward = 0
    visited_states = set([cur_state])

    # Set initial state params; perturb the starting x position with Gaussian noise.
    init_state_params = {}
    last_x = np.random.randn(1)[0]
    init_state_params["x"] = last_x
    init_state_params["x_dot"] = 0
    init_state_params["theta"] = 0
    init_state_params["theta_dot"] = 0

    for _ in range(samples):
        action = learning_agent.act(cur_state, reward)
        reward, next_state = mdp.execute_agent_action(action)

        visited_states.add(next_state)
        if next_state.is_terminal():
            init_state_params["x"] = np.random.randn(1)[0]
            mdp.reset(init_state_params)
            learning_agent.end_of_episode()
            cur_state = mdp.get_init_state()
            reward = 0
        else:
            cur_state = next_state

    return visited_states
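
# A minimal usage sketch (illustrative only): it assumes `mdp` follows the simple_rl
# MDP interface and that its reset() accepts the init-state params dict built above
# (e.g. a cart-pole-style wrapper); QLearningAgent is just one possible learning agent.
#
#     from simple_rl.agents import QLearningAgent
#
#     agent = QLearningAgent(actions=mdp.get_actions())
#     visited = collect_dataset(mdp, samples=5000, learning_agent=agent)
#     print('collected', len(visited), 'distinct states')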
Example #2
import os

import numpy as np
import tensorflow as tf

# NOTE: the project-specific classes used below (OptionWrapper, State, Fourier, Subset,
# Monte, AgentPos, SpectrumFourier, SpectrumNetwork, LinearQAgent, DDPGAgent, DQNAgent,
# RandomAgent, RandomContAgent) come from the surrounding repository; their import
# paths are omitted here.
class CoveringOption(OptionWrapper):
    """
    Wrapper to describe options
    """
    def __init__(self,
                 sess=None,
                 experience_buffer=None,
                 option_b_size=None,
                 sp_training_steps=100,
                 obs_dim=None,
                 obs_bound=None,
                 action_dim=None,
                 action_bound=None,
                 num_actions=None,
                 low_method='linear',
                 f_func='fourier',
                 n_units=16,
                 init_all=True,
                 reversed_dir=False,
                 init_around_goal=False,
                 init_dist=0.9,
                 term_dist=0.1,
                 restore=None,
                 name=None):
        self.init_dist = init_dist
        self.term_dist = term_dist

        if sess is None:
            config = tf.ConfigProto()
            config.gpu_options.allow_growth = True  # TODO: the conv network errors out without this
            self.sess = tf.Session(config=config)
        else:
            self.sess = sess
        self.option_b_size = option_b_size
        self.sp_training_steps = sp_training_steps

        self.low_method = low_method
        self.f_func = f_func
        self.n_units = n_units
        self.init_all = init_all
        self.reversed_dir = reversed_dir
        self.name = name

        self.obs_dim = obs_dim
        self.obs_bound = obs_bound
        self.action_dim = action_dim
        self.action_bound = action_bound
        self.num_actions = num_actions

        self.init_around_goal = init_around_goal

        self.init_fn = None
        self.term_fn = None

        if restore is None:
            self.setup_networks()
        if experience_buffer is not None:
            self.train_f_function(experience_buffer)

    def setup_networks(self):
        print('f_func=', self.f_func)
        if self.f_func == 'fourier':
            # low_bound = np.asarray([0.0, 0.0, -2.0, -2.0])
            # up_bound = np.asarray([1.0, 1.0, 2.0, 2.0])
            features = Fourier(state_dim=self.obs_dim,
                               bound=self.obs_bound,
                               order=4)
            self.f_function = SpectrumFourier(obs_dim=self.obs_dim,
                                              feature=features,
                                              name=self.name)
        elif self.f_func == 'nn':
            self.f_function = SpectrumNetwork(self.sess,
                                              obs_dim=self.obs_dim,
                                              n_units=self.n_units,
                                              name=self.name)
        elif self.f_func == 'nnf':
            features = Monte()
            self.f_function = SpectrumNetwork(self.sess,
                                              obs_dim=self.obs_dim,
                                              feature=features,
                                              n_units=self.n_units,
                                              name=self.name)
        elif self.f_func == 'nns':
            features = Subset(state_dim=self.obs_dim,
                              feature_indices=[0, 1])  # TODO: parameterize
            self.f_function = SpectrumNetwork(self.sess,
                                              obs_dim=self.obs_dim,
                                              feature=features,
                                              n_units=self.n_units,
                                              name=self.name)
        elif self.f_func == 'nnc':
            # Convolutions
            self.f_function = SpectrumNetwork(self.sess,
                                              obs_dim=self.obs_dim,
                                              n_units=self.n_units,
                                              conv=True,
                                              name=self.name)
        elif self.f_func == 'rand':
            self.f_function = None
        elif self.f_func == 'agent':
            features = AgentPos(game='Freeway')
            self.f_function = SpectrumFourier(obs_dim=self.obs_dim,
                                              feature=features,
                                              name=self.name)
        else:
            raise ValueError('unknown f_func: {}'.format(self.f_func))

        if self.f_function is not None:
            self.f_function.initialize()

        if self.low_method == 'linear':
            # low_bound = np.asarray([0.0, 0.0, -2.0, -2.0])
            # up_bound = np.asarray([1.0, 1.0, 2.0, 2.0])
            features = Fourier(state_dim=self.obs_dim,
                               bound=self.obs_bound,
                               order=3)
            self.agent = LinearQAgent(actions=range(self.num_actions),
                                      feature=features,
                                      name=self.name)
        elif self.low_method == 'ddpg':
            # TODO: an on-policy method may be a poor fit for option policies; DDPG is off-policy, so it should be usable here.
            self.agent = DDPGAgent(self.sess,
                                   obs_dim=self.obs_dim,
                                   action_dim=self.action_dim,
                                   action_bound=self.action_bound,
                                   name=self.name)
        elif self.low_method == 'dqn':
            self.agent = DQNAgent(self.sess,
                                  obs_dim=self.obs_dim,
                                  num_actions=self.num_actions,
                                  gamma=0.99,
                                  name=self.name)
        elif self.low_method == 'rand':
            if self.num_actions is None:
                self.agent = RandomContAgent(action_dim=self.action_dim,
                                             action_bound=self.action_bound,
                                             name=self.name)
            else:
                self.agent = RandomAgent(range(self.num_actions),
                                         name=self.name)
        else:
            raise ValueError('unknown low_method: {}'.format(self.low_method))
        self.agent.reset()

    def is_initiation(self, state):
        assert (isinstance(state, State))
        if self.init_fn is None:
            return True
        elif self.init_all:
            # The option can be initiated anywhere except at its termination states.
            return not self.is_terminal(state)
        else:
            # TODO: we would like this to check f_value > (min f + epsilon).
            f_value = self.f_function(state)[0][0]
            return self.init_fn(f_value)

    def is_terminal(self, state):
        assert (isinstance(state, State))

        if self.term_fn is None:
            return True
        else:
            f_value = self.f_function(state)[0][0]
            return self.term_fn(f_value)

    def act(self, state):
        return self.agent.act(state, 0, learning=False)

    def train_f_function(self, experience_buffer):
        assert (self.option_b_size is not None)

        self.f_function.initialize()

        for _ in range(self.sp_training_steps):
            s, a, r, s2, t = experience_buffer.sample(self.option_b_size)

            # Even if we switch the order of s and s2, we get the same eigenfunction.
            # next_f_value = self.f_function(s)
            # self.f_function.train(s2, next_f_value)

            next_f_value = self.f_function(s2)
            self.f_function.train(s, next_f_value)

        self.upper_th, self.lower_th = self.sample_f_val(
            experience_buffer, self.init_dist, self.term_dist)

        if self.reversed_dir:
            self.term_fn = lambda x: x > self.upper_th
            if self.init_around_goal:
                self.init_fn = lambda x: x > self.lower_th
            else:
                self.init_fn = lambda x: x < self.lower_th
        else:
            self.term_fn = lambda x: x < self.lower_th
            if self.init_around_goal:
                self.init_fn = lambda x: x < self.lower_th
            else:
                self.init_fn = lambda x: x > self.upper_th
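        # Descriptive summary of the threshold functions just set (derived from the
        # branches above):
        #   reversed_dir=False: terminate where f < lower_th; by default initiate
        #       where f > upper_th (the policy descends the learned f-function).
        #   reversed_dir=True:  terminate where f > upper_th; by default initiate
        #       where f < lower_th (the direction is flipped).
        #   init_around_goal=True instead places the initiation set on the goal side
        #       of lower_th (f < lower_th when not reversed, f > lower_th when reversed).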

    def sample_f_val(self, experience_buffer, upper, lower):
        buf_size = experience_buffer.size()

        # n_samples = min(buf_size, 1024)
        n_samples = buf_size

        s = [
            experience_buffer.buffer[i][0]
            for i in range(experience_buffer.size())
        ]

        # s, _, _, _, _ = experience_buffer.sample(n_samples)
        f_values = self.f_function(s)
        if type(f_values) is list:
            f_values = np.asarray(f_values)
        f_values = f_values.flatten()

        f_srt = np.sort(f_values)

        print('f_srt=', f_srt)

        init_th = f_srt[int(n_samples * upper)]
        term_th = f_srt[int(n_samples * lower)]

        print('init_th, term_th=', init_th, term_th)

        assert (init_th > term_th)
        return init_th, term_th
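    # A minimal sketch (an approximate equivalence, not part of the original
    # computation): the two thresholds above are empirical percentiles of the learned
    # f-values, so with init_dist=0.9 and term_dist=0.1 they could also be obtained as
    #
    #     init_th = np.percentile(f_values, 90)
    #     term_th = np.percentile(f_values, 10)
    #
    # which presumes init_dist > term_dist so that the assert above holds.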

    def train(self, experience_buffer, batch_size):
        # Train the option's internal policy on a sampled batch.
        s, a, r, s2, t = experience_buffer.sample(batch_size)

        if self.f_function is None:
            self.agent.train_batch(s, a, r, s2, t, batch_size=batch_size)
        else:
            r_shaped = []

            for i in range(batch_size):
                # Shaped reward: positive when the transition decreases the f-value
                # (or increases it, when reversed_dir is set).

                if self.reversed_dir:
                    r_s = self.f_function(s2[i]) - self.f_function(s[i]) + r[i]
                else:
                    r_s = self.f_function(s[i]) - self.f_function(s2[i]) + r[i]
                r_shaped.append(r_s)
                # print('reward=',  r[i] ,' shaped-reward=', r_s)
            self.agent.train_batch(s,
                                   a,
                                   r_shaped,
                                   s2,
                                   t,
                                   batch_size=batch_size)

    def restore(self, directory):
        # Restore
        # 1. f function
        # 2. init threshold, term threshold
        # 3. agent
        with open(os.path.join(directory, 'meta'), 'r') as f:
            self.f_func = f.readline().split(' ')[1].strip()
            self.upper_th = float(f.readline().split(' ')[1].strip())
            self.lower_th = float(f.readline().split(' ')[1].strip())
            self.low_method = f.readline().split(' ')[1].strip()
            self.init_all = f.readline().split(' ')[1].strip() == 'True'
            self.reversed_dir = f.readline().split(' ')[1].strip() == 'True'

        if self.reversed_dir:
            print('restored reversed direction')
            self.init_fn = lambda x: x < self.lower_th
            self.term_fn = lambda x: x > self.upper_th
        else:
            self.init_fn = lambda x: x > self.upper_th
            self.term_fn = lambda x: x < self.lower_th

        # print('f_func=', self.f_func)

        self.setup_networks()

        self.f_function.restore(directory)
        # self.agent.restore(directory, rev=self.reversed_dir)
        self.agent.restore(directory)

        # print(self.f_function)

    def save(self, directory, rev=False):
        if not os.path.exists(directory):
            os.mkdir(directory)
        with open(os.path.join(directory, 'meta'), 'w') as f:
            f.write('f_func: ' + self.f_func + '\n')
            f.write('upper_th: ' + str(self.upper_th) + '\n')
            f.write('lower_th: ' + str(self.lower_th) + '\n')
            f.write('low_method: ' + self.low_method + '\n')
            f.write('init_all: ' + str(self.init_all) + '\n')
            f.write('reversed_dir: ' + str(self.reversed_dir) + '\n')

        # Save f-function
        self.f_function.save(directory)
        # Save agent policy
        if rev:
            self.agent.save(directory, name=self.name + 'rev')
        else:
            self.agent.save(directory)
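
# A minimal usage sketch (illustrative only; `make_buffer()` and `env_spec` are
# hypothetical placeholders for however the surrounding repository builds an
# experience buffer of (s, a, r, s2, t) tuples and exposes environment dimensions):
#
#     buffer = make_buffer()
#     option = CoveringOption(experience_buffer=buffer,
#                             option_b_size=64,
#                             sp_training_steps=100,
#                             obs_dim=env_spec.obs_dim,
#                             obs_bound=env_spec.obs_bound,
#                             num_actions=env_spec.num_actions,
#                             low_method='dqn',
#                             f_func='nn',
#                             name='covering_opt')
#
#     # Train the option's internal policy with f-shaped rewards, then query it
#     # (given some State instance `state`):
#     option.train(buffer, batch_size=64)
#     if option.is_initiation(state):
#         action = option.act(state)
#     option.save('./covering_opt_ckpt')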