Example 1
    def run(self, num_epochs=1, num_episodes=1):
        num_tasks = len(self.tasks)
        for epoch in range(num_epochs):
            # choose a task: with probability mab_gamma explore uniformly,
            # otherwise sample in proportion to exp(log_weights).
            ti = -1
            if npr.rand() < self.mab_gamma:
                ti = npr.choice(range(num_tasks), 1)[0]
            else:
                p = np.exp(prob.normalize_log(self.log_weights))
                ti = npr.choice(range(num_tasks), 1, replace=True, p=p)[0]
            task = self.tasks[ti]

            # TODO: this breaks the abstraction.
            self.deepQlearn.task = task
            self.dqn.task = task

            # run training.
            self.deepQlearn.run(num_episodes)

            # update weights.
            self.cumulative_epochs += 1
            if self.cumulative_epochs >= self.mab_batch_size:
                self.log_weights[:] = 0.
            else:
                for ti, task in enumerate(self.tasks):
                    performance_gain = eval_policy_reward(self.dqn, task, num_episodes=10000)
                    self.log_weights[ti] += self.mab_gamma * self.mab_scale * performance_gain / num_tasks
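
The task-selection step above mixes uniform exploration with weight-proportional exploitation, in the style of an Exp3 bandit. Below is a minimal standalone sketch of that sampling rule, assuming prob.normalize_log behaves like a numerically stable log-softmax; the function and argument names here are illustrative, not taken from the original code.

import numpy as np

def sample_task_index(log_weights, gamma, rng=np.random):
    # Illustrative helper mirroring the selection logic above:
    # with probability gamma pick a task uniformly at random,
    # otherwise sample in proportion to exp(log_weights).
    num_tasks = len(log_weights)
    if rng.rand() < gamma:
        return rng.randint(num_tasks)
    log_p = np.asarray(log_weights, dtype=float)
    log_p = log_p - np.logaddexp.reduce(log_p)   # stable log-softmax
    return rng.choice(num_tasks, p=np.exp(log_p))
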
Example 2
    def run(self, task=None, num_epochs=10, num_episodes=100, tol=1e-4):
        if task:
            self.reset(task)

        task = self.last_task
        for ei in range(num_epochs):
            # run DQN on task for #episodes.
            self.run_task(task, num_episodes=num_episodes, tol=tol)
            task.reset()

            # compute average td error after learning.
            ex_buffer = self._filter_experience_by_task(task)
            td = self._average_td_error(ex_buffer)

            # learn the meta-model.
            feat = self.feat_func(task)
            self.meta_model.learn(feat, td)

            # sample a new task based on the meta-model.
            task_nb = self.edit_func(task)
            task_nb.append(task) # include this task.
            val_nb = []
            for new_task in task_nb:
                new_task_feat = self.feat_func(new_task)
                val_nb.append(self.meta_model.get(new_task_feat))
            print('val_nb', val_nb)

            log_prob = prob.normalize_log(np.array(val_nb) * 1.)
            p = np.exp(log_prob)
            print('probability', p)

            next_task = prob.choice(task_nb, 1, replace=True, p=p)[0]
            print('new_task', next_task)
            task = next_task
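
The final block above turns the meta-model scores of the neighboring tasks into a sampling distribution. Here is a standalone sketch of that step, with the prob helpers replaced by plain numpy; all names are assumed for illustration rather than taken from the original module.

import numpy as np

def sample_next_task(candidate_tasks, scores, rng=np.random):
    # Softmax over the meta-model scores, then draw one candidate task.
    scores = np.asarray(scores, dtype=float)
    log_p = scores - np.logaddexp.reduce(scores)
    return candidate_tasks[rng.choice(len(candidate_tasks), p=np.exp(log_p))]
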
Example 3
 def _get_softmax_action_distribution(self, state, temperature, valid_actions=None):
     if valid_actions is None:
         valid_actions = range(self.num_actions)
     qvals = self.table[state, valid_actions]
     qvals = qvals / temperature
     p = np.exp(prob.normalize_log(qvals))
     return p
Example 4
 def _get_softmax_action_distribution(self, state, temperature, valid_actions=None):
     if valid_actions is None:
         valid_actions = range(self.num_actions)
     state = state.reshape(1, *state.shape)
     qvals = self.fprop(state).reshape(-1)[valid_actions]
     qvals = qvals / temperature
     p = np.exp(prob.normalize_log(qvals))
     return p
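
Examples 3 and 4 both rely on prob.normalize_log to turn temperature-scaled Q-values into a Boltzmann (softmax) action distribution. A minimal sketch of what such a helper presumably computes, namely a numerically stable log-softmax, is shown below; the real prob module may differ.

import numpy as np

def normalize_log(log_values):
    # Subtract the log-sum-exp so that exp(result) sums to one;
    # shifting by the max first keeps the exponentials from overflowing.
    log_values = np.asarray(log_values, dtype=float)
    shifted = log_values - log_values.max()
    return shifted - np.log(np.exp(shifted).sum())

Under that assumption, np.exp(normalize_log(qvals / temperature)) is exactly the softmax distribution over the valid actions at the given temperature.
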
Example 5
def compute_Qfunc_logprob(qfunc, task, softmax_t=1.):
    '''
        Normalize the Q-function into a per-state log-softmax distribution
        over actions (one row of log-probabilities per valid state).
    '''
    table = np.zeros((task.get_num_states(), task.get_num_actions()))
    states = task.get_valid_states()
    for state in states:
        for action in range(task.get_num_actions()):
            table[state, action] = qfunc(state, action) / softmax_t
        table[state, :] = prob.normalize_log(table[state, :])
    return table
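
The returned table holds log-probabilities, so exponentiating a row recovers the softmax policy for that state. A hypothetical usage sketch follows, with a toy task and Q-function both invented here for illustration.

import numpy as np

class _ToyTask(object):
    # Invented stand-in exposing only the interface used by compute_Qfunc_logprob.
    def get_num_states(self): return 3
    def get_num_actions(self): return 2
    def get_valid_states(self): return [0, 1, 2]

toy_qfunc = lambda state, action: float(state + action)   # arbitrary Q-values
log_table = compute_Qfunc_logprob(toy_qfunc, _ToyTask(), softmax_t=0.5)
policy = np.exp(log_table)                    # per-state softmax action probabilities
assert np.allclose(policy.sum(axis=1), 1.0)   # each row is a valid distribution
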
Example 6
 def _get_softmax_action_distribution(self, state, temperature, valid_actions):
     action_values = self.av(state)
     if not action_values:
         return np.ones(len(valid_actions)) / float(len(valid_actions))
     # positions (within valid_actions) of actions that have a recorded value.
     ind = [i for i, action in enumerate(valid_actions) if action in action_values]
     qvals = np.array([action_values[valid_actions[i]] for i in ind])
     qvals = qvals / temperature
     p = np.exp(prob.normalize_log(qvals))
     pv = np.zeros(len(valid_actions))
     pv[ind] = p
     return pv
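
Note the design choice above: valid actions with no recorded value keep probability zero, and the softmax mass is shared only among actions present in action_values. A quick standalone check of that masking pattern, using hypothetical numbers:

import numpy as np

valid_actions = [0, 1, 2]
action_values = {0: 1.0, 2: 2.0}              # no value recorded for action 1
ind = [i for i, a in enumerate(valid_actions) if a in action_values]
qvals = np.array([action_values[valid_actions[i]] for i in ind]) / 1.0   # temperature 1
p = np.exp(qvals - np.logaddexp.reduce(qvals))
pv = np.zeros(len(valid_actions))
pv[ind] = p                                   # action 1 stays at probability 0.0
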