def run(self, num_epochs=1, num_episodes=1):
    num_tasks = len(self.tasks)
    for epoch in range(num_epochs):
        # choose a task: explore uniformly with probability mab_gamma,
        # otherwise sample according to the exponential weights.
        if npr.rand() < self.mab_gamma:
            ti = npr.choice(range(num_tasks), 1)[0]
        else:
            p = np.exp(prob.normalize_log(self.log_weights))
            ti = npr.choice(range(num_tasks), 1, replace=True, p=p)[0]
        task = self.tasks[ti]
        # (TODO) this breaks the abstraction.
        self.deepQlearn.task = task
        self.dqn.task = task
        # run training.
        self.deepQlearn.run(num_episodes)
        # update weights.
        self.cumulative_epochs += 1
        if self.cumulative_epochs >= self.mab_batch_size:
            self.log_weights[:] = 0.
        else:
            for ti, task in enumerate(self.tasks):
                performance_gain = eval_policy_reward(self.dqn, task, num_episodes=10000)
                self.log_weights[ti] += self.mab_gamma * self.mab_scale * performance_gain / num_tasks
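# The task-selection and softmax snippets in this file all go through
# prob.normalize_log. The sketch below is only an assumption about that
# helper: a numerically stable log-softmax, so that
# np.exp(normalize_log(x)) is a valid probability vector. The actual
# implementation in the prob module may differ.
import numpy as np

def normalize_log(log_weights):
    log_weights = np.asarray(log_weights, dtype=float)
    shifted = log_weights - np.max(log_weights)   # guard against overflow in exp.
    log_z = np.log(np.sum(np.exp(shifted)))       # log of the normalizing constant.
    return shifted - log_z                        # exp(.) now sums to 1.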
def run(self, task=None, num_epochs=10, num_episodes=100, tol=1e-4):
    if task:
        self.reset(task)
    task = self.last_task
    for ei in range(num_epochs):
        # run DQN on the task for num_episodes episodes.
        self.run_task(task, num_episodes=num_episodes, tol=tol)
        task.reset()
        # compute the average td error after learning.
        ex_buffer = self._filter_experience_by_task(task)
        td = self._average_td_error(ex_buffer)
        # learn the meta-model.
        feat = self.feat_func(task)
        self.meta_model.learn(feat, td)
        # sample a new task based on the meta-model.
        task_nb = self.edit_func(task)
        task_nb.append(task)  # include the current task.
        val_nb = []
        for new_task in task_nb:
            new_task_feat = self.feat_func(new_task)
            val_nb.append(self.meta_model.get(new_task_feat))
        print 'val_nb', val_nb
        log_prob = prob.normalize_log(np.array(val_nb) * 1.)
        p = np.exp(log_prob)
        print 'probability', p
        next_task = prob.choice(task_nb, 1, replace=True, p=p)[0]
        print 'new_task', next_task
        task = next_task
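# The curriculum loop above samples the next task object with prob.choice.
# The sketch below is only a guess at that helper: it samples positions with
# numpy and returns the corresponding Python objects, which avoids
# np.random.choice coercing the list of task instances into an array.
import numpy.random as npr

def choice(items, size, replace=True, p=None):
    idx = npr.choice(len(items), size=size, replace=replace, p=p)
    return [items[i] for i in idx]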
def _get_softmax_action_distribution(self, state, temperature, valid_actions=None):
    if valid_actions is None:
        valid_actions = range(self.num_actions)
    qvals = self.table[state, valid_actions]
    qvals = qvals / temperature
    p = np.exp(prob.normalize_log(qvals))
    return p
def _get_softmax_action_distribution(self, state, temperature, valid_actions=None):
    if valid_actions is None:
        valid_actions = range(self.num_actions)
    state = state.reshape(1, *state.shape)
    qvals = self.fprop(state).reshape(-1)[valid_actions]
    qvals = qvals / temperature
    p = np.exp(prob.normalize_log(qvals))
    return p
def compute_Qfunc_logprob(qfunc, task, softmax_t=1.):
    '''
    normalize the Q-values of each state into a log-softmax distribution over actions.
    '''
    table = np.zeros((task.get_num_states(), task.get_num_actions()))
    states = task.get_valid_states()
    for state in states:
        for action in range(task.get_num_actions()):
            table[state, action] = qfunc(state, action) / softmax_t
        table[state, :] = prob.normalize_log(table[state, :])
    return table
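# Hypothetical usage of compute_Qfunc_logprob: wrap a tabular Q-function and
# read off per-state log-softmax action distributions. `qtable` and `task`
# are assumed stand-ins; `task` must expose get_num_states / get_num_actions /
# get_valid_states as the function above requires.
qfunc = lambda state, action: qtable[state, action]
log_table = compute_Qfunc_logprob(qfunc, task, softmax_t=0.5)
policy = np.exp(log_table)   # rows for valid states sum to 1.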
def _get_softmax_action_distribution(self, state, temperature, valid_actions):
    action_values = self.av(state)
    if not action_values:
        # no recorded values for this state: fall back to a uniform distribution.
        return np.ones(len(valid_actions)) / float(len(valid_actions))
    # positions within valid_actions of the actions that have recorded values.
    ind = [i for i, action in enumerate(valid_actions) if action in action_values]
    qvals = np.array([action_values[action] for action in valid_actions if action in action_values])
    qvals = qvals / temperature
    p = np.exp(prob.normalize_log(qvals))
    pv = np.zeros(len(valid_actions))
    pv[ind] = p
    return pv