Example #1
    def learn(self, num_iter=100, temperature=1., print_lag=None):
        for it in range(num_iter):
            dqn = self.dqn_mt
            bprop = self.bprop
            samples = prob.choice(self.experiences,
                                    self.minibatch_size, replace=True) # draw with replacement.

            # sample a minibatch.
            is_valids = []
            targets = []
            states = []
            actions = np.zeros(self.minibatch_size, dtype=int)

            for idx, sample in enumerate(samples):
                # randomly choose a goal.
                goal = prob.choice(self.goals, 1)[0]
                dqn = self.dqn_by_goal[goal]

                state, last_action, next_state, reward, meta = sample
                valid_actions = meta['last_valid_actions']
                num_actions = meta['num_actions']
                raw_state = np.array(state['raw_state'])
                raw_state[1, goal[0], goal[1]] = 1.

                states.append(raw_state)

                # mask over all actions: 1 if the action was valid, else 0.
                valid_set = set(valid_actions)
                is_valid = [1. if action in valid_set else 0. for action in range(num_actions)]

                if self.loss == 'KL':
                    target = dqn._get_softmax_action_distribution(raw_state, temperature=temperature, valid_actions=valid_actions)
                elif self.loss == 'l2' or self.loss == 'l1' or self.loss == 'l1-exp':
                    target = dqn.av(raw_state)
                elif self.loss == 'l1-action':
                    target = [dqn.av(raw_state)[last_action]]
                    is_valid  = [is_valid[last_action]]

                is_valids.append(is_valid)
                targets.append(target)
                actions[idx] = last_action

            states = np.array(states)
            targets = np.array(targets)
            is_valids = np.array(is_valids)

            score = self.bprop(states, actions, targets, is_valids)

            if print_lag and print_lag > 0 and it % print_lag == 0:
                print 'iter = ', it, 'score = ', score
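The minibatch sampling above draws with replacement from a list of arbitrary experience tuples via prob.choice. The helper itself is not shown in these examples, so the following is only an assumption about how such a wrapper around numpy.random.choice could look (numpy's own choice only accepts 1-D array-likes, hence sampling index positions):

import numpy as np

def choice(items, size=1, replace=True, p=None):
    # draw index positions, then map them back to the original objects.
    idx = np.random.choice(len(items), size=size, replace=replace, p=p)
    return [items[i] for i in idx]

# usage: batch = choice(experiences, size=minibatch_size, replace=True)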
Example #2
 def reset(self):
     self.ale.reset_game()
     self.frame_id = 0
     self.cum_reward = 0
     if self.skip_frame:
         for frame_i in range(self.skip_frame):
             self.step(choice(self.valid_actions, 1)[0])
Example #3
    def train(self, num_iter=100):
        '''
        supervised learning on the experience buffer.
        '''
        experience = sum(self.experience.values(), [])

        for it in range(num_iter):
            # re-initialize the buffers each iteration; they are overwritten by
            # numpy arrays at the end of the loop body.
            states = [None] * self.minibatch_size
            is_valids = [None] * self.minibatch_size
            probs = [None] * self.minibatch_size

            samples = prob.choice(experience, self.minibatch_size, replace=True)
            for idx, sample in enumerate(samples):
                state, p, is_valid = sample

                states[idx] = state
                is_valids[idx] = is_valid
                probs[idx] = p

            # convert into numpy array.
            states = np.array(states)
            is_valids = np.array(is_valids)
            probs = np.array(probs)

            error = self.bprop(states, probs, is_valids)
            print 'error', error
Example #4
 def __init__(self, rom_path, num_frames=4, live=False, skip_frame=0, mode='normal'):
     self.ale = ALEInterface()
     if live:
         USE_SDL = True
         if USE_SDL:
             if sys.platform == 'darwin':
                 import pygame
                 pygame.init()
                 self.ale.setBool('sound', False) # Sound doesn't work on OSX
             elif sys.platform.startswith('linux'):
                 self.ale.setBool('sound', True)
         self.ale.setBool('display_screen', True)
     self.mode = mode
     self.live = live
     self.ale.loadROM(rom_path)
     self.num_frames = num_frames
     self.frames = []
     self.frame_id = 0
     self.cum_reward = 0
     self.skip_frame = skip_frame
     if mode == 'small':
         img = T.matrix('img')
         self.max_pool = theano.function([img], max_pool_2d(img, [4, 4]))
         self.img_shape = (16, 16)
     else:
         self.img_shape = (84, 84) # image shape according to DQN Nature paper.
     while len(self.frames) < self.num_frames:
         self.step(choice(self.valid_actions, 1)[0])
     self.reset()
Example #5
    def run(self, task=None, num_epochs=10, num_episodes=100, tol=1e-4):
        if task:
            self.reset(task)

        task = self.last_task
        for ei in range(num_epochs):
            # run DQN on task for #episodes.
            self.run_task(task, num_episodes=num_episodes, tol=tol)
            task.reset()

            # compute average td error after learning.
            ex_buffer = self._filter_experience_by_task(task)
            td = self._average_td_error(ex_buffer)

            # learn the meta-model.
            feat = self.feat_func(task)
            self.meta_model.learn(feat, td)

            # sample a new task based on the meta-model.
            task_nb = self.edit_func(task)
            task_nb.append(task) # include this task.
            val_nb = []
            for new_task in task_nb:
                new_task_feat = self.feat_func(new_task)
                val_nb.append(self.meta_model.get(new_task_feat))
            print 'val_nb', val_nb

            log_prob = prob.normalize_log(np.array(val_nb) * 1.)
            p = np.exp(log_prob)
            print 'probability', p

            next_task = prob.choice(task_nb, 1, replace=True, p=p)[0]
            print 'new_task', next_task
            task = next_task
Example #6
    def _update_net(self):
        '''
            sample from the memory dataset and perform gradient descent on
            (target - Q(s, a))^2
        '''
        # don't update the network until sufficient experience has been
        # accumulated
        # removing this check might cause correlation for early samples; doing so is suggested when training with curricula.
        #if len(self.experience) < self.memory_size:
        #    return
        for nn_bi in range(self.nn_num_batch):
            states = [None] * self.minibatch_size
            next_states = [None] * self.minibatch_size
            actions = np.zeros(self.minibatch_size, dtype=int)
            rewards = np.zeros(self.minibatch_size)
            nvas = []

            # sample and process minibatch
            # samples = random.sample(self.experience, self.minibatch_size) # draw without replacement.
            samples = prob.choice(self.experience, self.minibatch_size, replace=True) # draw with replacement.
            terminals = []
            for idx, sample in enumerate(samples):
                state, action, next_state, reward, nva = sample

                states[idx] = state
                actions[idx] = action
                rewards[idx] = reward
                nvas.append(nva)

                if next_state is not None:
                    next_states[idx] = next_state
                else:
                    next_states[idx] = state
                    terminals.append(idx)

            # convert states into tensor.
            states = np.array(states)
            next_states = np.array(next_states)

            # compute target reward + \gamma max_{a'} Q(ns, a')
            # Ensure target = reward when NEXT_STATE is terminal
            next_qvals = self.dqn.fprop(next_states)
            next_vs = np.zeros(self.minibatch_size)
            for idx in range(self.minibatch_size):
                if idx not in terminals:
                    next_vs[idx] = np.max(next_qvals[idx, nvas[idx]])

            targets = rewards + self.gamma * next_vs

            ## diagnostics.
            #print 'targets', targets
            #print 'next_qvals', next_qvals
            #print 'pure prop', self.dqn.fprop(states)
            #print 'prop', self.dqn.fprop(states)[range(states.shape[0]), actions]
            #print 'actions', actions
            nn_error = []
            for nn_it in range(self.nn_num_iter):
                error = self.bprop(states, actions, targets.flatten())
                nn_error.append(float(error))
            self.diagnostics['nn-error'].append(nn_error)
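The loop above builds the standard one-step Q-learning target: reward plus gamma times the maximum Q-value over the next valid actions, with the bootstrap term zeroed for terminal transitions so the target reduces to the reward. A minimal vectorized sketch of the same computation (ignoring the per-sample valid-action restriction used above):

import numpy as np

def q_targets(rewards, next_qvals, terminal_idx, gamma):
    # V(s') = max_a' Q(s', a'), forced to 0 on terminal transitions.
    next_vs = next_qvals.max(axis=1)
    next_vs[list(terminal_idx)] = 0.
    return rewards + gamma * next_vs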
Example #7
    def run(self, tasks, num_epochs=1, num_episodes=1):
        for ei in range(num_epochs):
            t = self.t

            # task selection.
            if t == 0: # no prior experience, choose randomly.
                task = prob.choice(tasks, 1)[0]
            else:
                # GP-t.
                N = len(self.ims)
                KXX = np.zeros((N, N))
                y = np.zeros(N)

                for (t_i, (task_i, im_i)) in enumerate(self.ims):
                    for (t_j, (task_j, im_j)) in enumerate(self.ims):
                        KXX[t_i, t_j] = self.kernel_func(t_i, task_i, t_j, task_j)

                for (ti, (task_i, im_i)) in enumerate(self.ims):
                    y[ti] = im_i

                M = len(tasks)
                KXsX = np.zeros((M, N))
                KXsXs = np.zeros((M, M))

                for (t_i, task_i) in enumerate(tasks):
                    for (t_j, (task_j, im_j)) in enumerate(self.ims):
                        KXsX[t_i, t_j] = self.kernel_func(t, task_i, t_j, task_j)
                    KXsXs[t_i, t_i] = self.kernel_func(t, task_i, t, task_i)

                KXXinv = npla.inv(KXX + self.gpt_sigma ** 2 * np.eye(N))

                pred_mean = np.dot(KXsX, np.dot(KXXinv, y))
                pred_cov = KXsXs - np.dot(KXsX, np.dot(KXXinv, np.transpose(KXsX)))
                pred_sigma = np.sqrt(np.diag(pred_cov))

                pred_ucb = pred_mean + self.gpt_kappa * pred_sigma

                best_ti = np.argmax(pred_ucb)
                task = tasks[best_ti]

                # store information for diagnosis.
                self.diagnostics['mean'] = {str(task): mean for (task, mean) in zip(tasks, pred_mean)}
                self.diagnostics['sigma'] = {str(task): sigma for (task, sigma) in zip(tasks, pred_sigma)}
                self.diagnostics['ucb'] = {str(task): ucb for (task, ucb) in zip(tasks, pred_ucb)}

            score_before = self.eval_func(task)
            self.train_func(task)
            score_after = self.eval_func(task)
            im = score_after - score_before

            self.diagnostics['chosen_task'] = str(task)
            self.diagnostics['im'] = im

            self.ims.append((task, im))
            self.t += 1
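The GP-t block above is plain Gaussian-process regression: KXX is the kernel matrix over the observed (time, task) pairs, KXsX the cross-kernel to the candidate tasks, the posterior mean and covariance follow the usual formulas, and a UCB term drives exploration. A self-contained sketch with illustrative names (k, sigma_n, kappa are placeholders, not the class attributes above):

import numpy as np
import numpy.linalg as npla

def gp_ucb(k, X, y, Xs, sigma_n=0.1, kappa=1.0):
    KXX = np.array([[k(a, b) for b in X] for a in X])
    KXsX = np.array([[k(a, b) for b in X] for a in Xs])
    KXsXs = np.array([[k(a, b) for b in Xs] for a in Xs])
    KXXinv = npla.inv(KXX + sigma_n ** 2 * np.eye(len(X)))
    mean = KXsX.dot(KXXinv).dot(np.asarray(y))
    cov = KXsXs - KXsX.dot(KXXinv).dot(KXsX.T)
    sigma = np.sqrt(np.maximum(np.diag(cov), 0.))  # clip tiny negative variances
    return mean + kappa * sigma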
Example #8
    def learn(self, num_iter=100, temperature=1., print_lag=None):
        for it in range(num_iter):
            dqn = self.dqn_mt
            bprop = self.bprop
            samples = prob.choice(self.experiences,
                                    self.minibatch_size, replace=True) # draw with replacement.

            # sample a minibatch.
            is_valids = []
            probs = []
            states = []

            for idx, sample in enumerate(samples):
                # randomly choose a goal.
                goal = prob.choice(self.goals, 1)[0]
                dqn = self.dqn_by_goal[goal]

                state, action, next_state, reward, meta = sample
                valid_actions = meta['last_valid_actions']
                num_actions = meta['num_actions']
                raw_state = np.array(state['raw_state'])
                raw_state[1, goal[0], goal[1]] = 1.

                states.append(raw_state)

                # mask over all actions: 1 if the action was valid, else 0.
                valid_set = set(valid_actions)
                is_valid = [1. if action in valid_set else 0. for action in range(num_actions)]
                is_valids.append(is_valid)

                prob_vec = dqn._get_softmax_action_distribution(raw_state, temperature=temperature, valid_actions=valid_actions)
                probs.append(prob_vec)

            states = np.array(states)
            probs = np.array(probs)
            is_valids = np.array(is_valids)

            score = self.bprop(states, probs, is_valids)

            if print_lag and print_lag > 0 and it % print_lag == 0:
                print 'iter = ', it, 'score = ', score
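dqn._get_softmax_action_distribution is not shown in these examples. Assuming it is a Boltzmann (temperature-scaled softmax) distribution over Q-values restricted to the valid actions, a sketch could look like this:

import numpy as np

def softmax_action_distribution(q_values, valid_actions, temperature=1.):
    # probability 0 for invalid actions, softmax over the valid ones.
    probs = np.zeros(len(q_values))
    q = np.asarray([q_values[a] for a in valid_actions]) / temperature
    q -= q.max()  # numerical stability
    expq = np.exp(q)
    probs[list(valid_actions)] = expq / expq.sum()
    return probs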
Example #9
    def learn(self, num_iter=10, print_lag=50, dqn_mt=None):
        for it in range(num_iter):
            for (goal, dqn) in self.dqn_by_goal.items():
                bprop = self.bprop_by_goal[goal]
                samples = prob.choice(self.experiences,
                                      self.minibatch_size, replace=True) # draw with replacement.

                # sample a minibatch.
                states = [None] * self.minibatch_size
                next_states = [None] * self.minibatch_size
                actions = np.zeros(self.minibatch_size, dtype=int)
                rewards = np.zeros(self.minibatch_size)
                nvas = []
                terminals = []

                for idx, sample in enumerate(samples):
                    state, action, next_state, reward, meta = sample
                    nva = meta['curr_valid_actions']

                    states[idx] = np.array(state['raw_state'])
                    states[idx][1, goal[0], goal[1]] = 1.
                    actions[idx] = action
                    reward = next_state['pos'][goal[0], goal[1]] # TODO: hack for gridworld.
                    rewards[idx] = reward
                    nvas.append(nva)

                    next_states[idx] = np.array(next_state['raw_state'])
                    next_states[idx][1, goal[0], goal[1]] = 1.
                    if reward > 0.:
                        terminals.append(idx)

                states = np.array(states)
                next_states = np.array(next_states)

                # learn through backpropagation.
                shared_values = dqn_mt.fprop(next_states)[range(len(actions)), actions]
                next_qvals = dqn.fprop(next_states)
                next_vs = np.zeros(self.minibatch_size)
                for idx in range(self.minibatch_size):
                    if idx not in terminals:
                        next_vs[idx] = np.max(next_qvals[idx, nvas[idx]])


                targets = rewards + self.gamma * next_vs


                error = bprop(states, actions, targets.flatten(), shared_values)

            if print_lag and print_lag > 0 and it % print_lag == 0:
                print 'iter = ', it, 'error = ', error
Example #10
    def run(self, tasks, num_epochs=1):
        for ni in range(num_epochs):
            sub_tasks = prob.choice(tasks, size=self.num_sample, replace=False)
            ims = []
            self.diagnostics['im_task'] = {}
            for task in sub_tasks:
                im = self._eval_im(task)
                ims.append(im)
                self.diagnostics['im_task'][task] = im

            max_ind = np.argmax(ims)
            chosen_task = sub_tasks[max_ind]
            self.diagnostics['chosen_task'] = str(chosen_task)

            self.train_func(self.learner, chosen_task)
Example #11
    def sample(self, **kwargs):
        if self.coord_data:
            (_, octopus_x, octopus_y) = prob.choice(self.coord_data, 1)[0]
        else:
            octopus_x = npr.randint(0, SCREEN_WIDTH)
            octopus_y = npr.randint(0, SCREEN_HEIGHT)

        task_id = os.path.join('octopus', str(octopus_x) + '_' + str(octopus_y))
        absolute_task_path = os.path.join(LEVEL_PATH, task_id + '.txt')
        with open(absolute_task_path, 'w') as f:
            for (obj, x, y) in self.data:
                if obj == 'octopus':
                    f.write(','.join([obj, str(octopus_x), str(octopus_y)]) + '\n')
                else:
                    f.write(','.join([obj, str(x), str(y)]) + '\n')
        return OctopusTask(level=task_id, **kwargs)
Example #12
    def learn(self, experience_by_dqn, num_iter=100, temperature=1., print_lag=None):
        experiences = []
        for (dqn, experience) in experience_by_dqn.items():
            experiences.extend([(dqn, ex) for ex in experience])

        for it in range(num_iter):
            bprop = self.bprop
            samples = prob.choice(experiences,
                                    self.minibatch_size, replace=True) # draw with replacement from the merged buffer.

            # sample a minibatch.
            is_valids = []
            targets = []
            states = []
            actions = np.zeros(self.minibatch_size, dtype=int)

            for idx, sample in enumerate(samples):
                (dqn, sample) = sample
                state, last_action, next_state, reward, meta = sample
                valid_actions = meta['last_valid_actions']
                num_actions = meta['num_actions']

                states.append(state)

                # mask over all actions: 1 if the action was valid, else 0.
                valid_set = set(valid_actions)
                is_valid = [1. if action in valid_set else 0. for action in range(num_actions)]

                if self.loss == 'KL':
                    target = dqn._get_softmax_action_distribution(state, temperature=temperature, valid_actions=valid_actions)
                elif self.loss == 'l2' or self.loss == 'l1' or self.loss == 'l1-exp':
                    target = dqn.av(state)
                elif self.loss == 'l1-action':
                    target = [dqn.av(state)[last_action]]
                    is_valid  = [is_valid[last_action]]

                is_valids.append(is_valid)
                targets.append(target)
                actions[idx] = last_action

            states = np.array(states)
            targets = np.array(targets)
            is_valids = np.array(is_valids)

            score = self.bprop(states, actions, targets, is_valids)

            if print_lag and print_lag > 0 and it % print_lag == 0:
                print 'iter = ', it, 'score = ', score
Example #13
    def _learn(self, next_state, reward, next_valid_actions):
        '''
        need next_valid_actions to compute appropriate V = max_a Q(s', a).
        '''
        self._add_to_experience(self.last_state, self.last_action,
                                next_state, reward, next_valid_actions)

        samples = prob.choice(self.experience, self.minibatch_size, replace=True) # draw with replacement.

        for idx, sample in enumerate(samples):
            state, action, next_state, reward, nva = sample

            self.qfunc.table[state, action] *= (1 - self.alpha)

            if next_state is not None:
                self.qfunc.table[state, action] += self.alpha * (reward
                                            + self.gamma * np.max(self.qfunc.table[next_state, nva]))
            else:
                self.qfunc.table[state, action] += self.alpha * reward
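The two-step update above (scale by 1 - alpha, then add alpha times the bootstrapped target) is algebraically the standard tabular Q-learning rule Q(s,a) += alpha * (r + gamma * max_a' Q(s',a') - Q(s,a)). A compact sketch of that equivalent form, assuming an array-backed table:

import numpy as np

def q_update(table, state, action, reward, next_state, next_valid_actions, alpha, gamma):
    if next_state is None:
        target = reward
    else:
        target = reward + gamma * np.max(table[next_state, next_valid_actions])
    table[state, action] += alpha * (target - table[state, action])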
Example #14
    def _on_screen_update(self, _, *args, **kwargs):
        self.total_frames += 1
        is_end = self.is_end()

        if not is_end and (self.total_frames-1) % self.frames_per_action > 0:
            if self.callback: # TODO: callback on skip steps. now callback is only used for videos.
                self.callback()

            return


        score = self.get_score()
        reward = score - self.curr_score
        self.cum_reward += reward
        self.curr_score = score

        if self.state_type == 'pixel':
            self.curr_screen_rgb = pygame.surfarray.array3d(pygame.display.get_surface())

        frame = self._get_frame()
        self.frames.append(frame)

        if len(self.frames) < self.num_frames:
            action = choice(self.valid_actions, 1)[0]
        else:
            if len(self.frames) > self.num_frames:
                self.frames = self.frames[-self.num_frames:]
            curr_state = self._get_state()

            if self.callback:
                self.callback()

            if self.last_action is not None:
                self.learner.send_feedback(reward, curr_state, self.valid_actions, is_end)
            if is_end:
                return
            
            action = self.learner.get_action(curr_state, self.valid_actions)
            self.total_steps += 1
            self.last_action = action

        self._last_keys_pressed = self._keys_pressed
        self._keys_pressed = [self.valid_events[action]]
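The frame bookkeeping above appends every frame and then truncates the list by hand. A collections.deque with maxlen set to the stack size would keep only the most recent frames automatically; a small sketch of that alternative (num_frames assumed to be the intended length):

from collections import deque

frames = deque(maxlen=4)   # maxlen = num_frames
frames.append('frame-1')   # once full, the oldest frame is dropped automatically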
Example #15
    def step(self, actionid):
        assert(actionid >= 0 and actionid < self.num_actions)
        action = self.ACTIONS[actionid]

        if action == 'move eye to hand':
            self.state['eye_pos'] = self.state['hand_pos']
        elif action == 'move eye to marker':
            self.state['eye_pos'] = self.state['mark_pos']
        elif action == 'move eye north':
            if self.state['eye_pos'][0] > 0:
                self.state['eye_pos'][0] -= 1
        elif action == 'move eye south':
            if self.state['eye_pos'][0] < self.size - 1:
                self.state['eye_pos'][0] += 1
        elif action == 'move eye west':
            if self.state['eye_pos'][1] > 0:
                self.state['eye_pos'][1] -= 1
        elif action == 'move eye east':
            if self.state['eye_pos'][1] < self.size - 1:
                self.state['eye_pos'][1] += 1
        elif action == 'move eye to a random object':
            pos = prob.choice(self.object_pos, 1)[0]
            self.state['eye_pos'] = pos
        elif action == 'move hand to eye':
            self.state['hand_pos'] = self.state['eye_pos']
        elif action == 'move marker to eye':
            self.state['mark_pos'] = self.state['eye_pos']
        elif action == 'touch object' and self._can_touch_object():
            if self.state['eye_pos'] == self.state['red_button_pos']:
                self.state['music'] = False
            elif self.state['eye_pos'] == self.state['blue_button_pos']:
                self.state['music'] = True
            elif self.state['eye_pos'] == self.state['switch_pos']:
                self.state['light'] = not self.state['light']
            elif (self.state['eye_pos'] == self.state['ball_pos']
                  and (self.state['mark_pos'][0] == self.state['ball_pos'][0]
                       or self.state['mark_pos'][1] == self.state['ball_pos'][1])
                  ): # kick the ball if ball and mark are on a straight line.
                self.state['ball_pos'] = self.state['mark_pos']

        return 0.
Example #16
    def _get_uct_action(self, state_vector, uct, param_c, valid_actions, debug=False):
        init_count = 1. # initial count for all actions.
        action_values = {action: self.av(state_vector)[action] for action in valid_actions}
        uct_values = {action: uct.count_sa(state_vector, action) for action in valid_actions}
        uct_state_values = {action: uct.count_s(state_vector) for action in valid_actions}
        ucb = {action: action_values[action] + param_c * np.sqrt(np.log((len(valid_actions) * init_count + uct_state_values[action])) \
                            / (init_count + uct_values[action])) for action in valid_actions}
        max_val = -float('inf')
        max_actions = []
        for (action, value) in ucb.items():
            if value > max_val:
                max_val = value
                max_actions = [action]
            elif value == max_val:
                max_actions.append(action)

        if debug:
            print 'action_values', action_values
            print 'uct_values', uct_values
            print 'uct_state_values', uct_state_values
            print 'ucb', ucb

        return prob.choice(max_actions, 1)[0]
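The ucb dictionary above adds a UCB1-style exploration bonus, built from visit counts, to the Q-value estimate. The bonus in isolation, with illustrative names (n_s for the state visit count, n_sa for the state-action count):

import numpy as np

def ucb_bonus(n_s, n_sa, num_valid_actions, param_c, init_count=1.):
    return param_c * np.sqrt(np.log(num_valid_actions * init_count + n_s) / (init_count + n_sa))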
Example #17
    def run(self, tasks, num_epochs=1):
        # set local variables.
        K = self.K
        all_settings = set()
        for task in tasks:
            all_settings.add(self.feat_func(task))
        all_settings = list(all_settings)

        for task in tasks:
            if task not in self.task_score:
                self.task_score[task] = self.eval_func(task)

        for ni in range(num_epochs):
            im_pred = {}
            im_sigma = {}
            im_ucb = {}

            if len(self.task_im) < 1:
                # select task based on prior.
                if not self.init_setting:
                    chosen_task = prob.choice(tasks, 1)[0]
                else:
                    chosen_task = self.sample_func(self.init_setting)
            else:
                # select task based on GP.
                im = [(self.feat_func(task), im) for (task, im) in self.task_im.items()]

                # use Gaussian Process to estimate potential function.
                N = len(im)
                KXX = np.zeros((N, N))
                y = np.zeros(N)
                for (ti, (setting_i, im_i)) in enumerate(im):
                    for (tj, (setting_j, im_j)) in enumerate(im):
                        KXX[ti, tj] = self.kernel_func(setting_i, setting_j)
                for (ti, (setting_i, im_i)) in enumerate(im):
                    y[ti] = im_i

                M = len(all_settings)
                KXsX = np.zeros((M, N))
                KXsXs = np.zeros((M, M))
                for (ti, setting_i) in enumerate(all_settings):
                    for (tj, (setting_j, im_j)) in enumerate(im):
                        KXsX[ti, tj] = self.kernel_func(setting_i, setting_j)
                    KXsXs[ti, ti] = self.kernel_func(setting_i, setting_i)

                KXXinv = npla.inv(KXX + self.sigma_n**2 * np.eye(N))

                pred_mean = np.dot(KXsX, np.dot(KXXinv, y))
                pred_cov = KXsXs - np.dot(KXsX, np.dot(KXXinv, np.transpose(KXsX)))
                pred_sigma = np.sqrt(np.diag(pred_cov))

                for (ti, setting) in enumerate(all_settings):
                    im_pred[setting] = pred_mean[ti]
                    im_sigma[setting] = pred_sigma[ti]
                    im_ucb[setting] = pred_mean[ti] + self.eta * pred_sigma[ti]

                new_settings = sorted(all_settings, key=lambda setting: im_ucb[setting], reverse=True)
                new_setting = new_settings[0]

                chosen_task = self.sample_func(new_setting)

            self.train_func(chosen_task)

            for task in tasks:
                score = self.eval_func(task)
                self.task_im[task] = score - self.task_score[task]
                self.task_score[task] = score

            # collect diagnostics.
            self.diagnostics['task_im'] = self.task_im
            self.diagnostics['task_score'] = self.task_score
            self.diagnostics['pred'] = im_pred
            self.diagnostics['sigma'] = im_sigma
            self.diagnostics['ucb'] = im_ucb
            self.diagnostics['task'] = chosen_task
            self.diagnostics['setting'] = self.feat_func(chosen_task)
Example #18
    def run(self, tasks, num_epochs=1):
        if len(self.active_tasks) == 0: # initial round.
            # choose a set of active tasks uniformly at random.
            self.active_tasks = prob.choice(tasks, size=self.K, replace=True)

        # set local variables.
        active_tasks = self.active_tasks
        passive_tasks = self.passive_tasks
        K = self.K
        K0 = self.K0
        K1 = self.K1

        for ni in range(num_epochs):
            # compute old score if necessary.
            for task in active_tasks:
                if task not in self.task_score:
                    self.task_score[task] = self.eval_func(task)

            # learn on each task.
            for task in active_tasks:
                self.train_func(task)

            if len(passive_tasks) >= K1:
                selected_passive_tasks = prob.choice(self.passive_tasks, size=K1, replace=False)
                for task in selected_passive_tasks:
                    self.train_func(task)

            # evaluate improvement.
            im = {}
            for task in active_tasks:
                new_score = self.eval_func(task)
                im[task] = new_score - self.task_score[task]
                self.task_score[task] = new_score

            # create candidate set.
            candidate_set = set()
            for task in active_tasks:
                candidate_set = candidate_set.union(set(self.expand_func(task)))
            candidate_set = candidate_set.union(set(prob.choice(tasks, size=K0, replace=False)))
            candidate_set = list(candidate_set)

            new_tasks = candidate_set

            # use Gaussian Process to estimate potential function.
            N = len(im)
            KXX = np.zeros((N, N))
            y = np.zeros(N)
            for (ti, (task_i, im_i)) in enumerate(im.items()):
                for (tj, (task_j, im_j)) in enumerate(im.items()):
                    KXX[ti, tj] = self.kernel_func(task_i, task_j)
            for (ti, (task_i, im_i)) in enumerate(im.items()):
                y[ti] = im_i

            M = len(new_tasks)
            KXsX = np.zeros((M, N))
            KXsXs = np.zeros((M, M))
            for (ti, task_i) in enumerate(new_tasks):
                for (tj, (task_j, im_j)) in enumerate(im.items()):
                    KXsX[ti, tj] = self.kernel_func(task_i, task_j)
                KXsXs[ti, ti] = self.kernel_func(task_i, task_i)

            KXXinv = npla.inv(KXX + self.sigma_n**2 * np.eye(N))

            pred_mean = np.dot(KXsX, np.dot(KXXinv, y))
            pred_cov = KXsXs - np.dot(KXsX, np.dot(KXXinv, np.transpose(KXsX)))
            pred_sigma = np.sqrt(np.diag(pred_cov))

            im_pred = {}
            im_sigma = {}
            im_ucb = {}
            for (ti, task) in enumerate(new_tasks):
                im_pred[task] = pred_mean[ti]
                im_sigma[task] = pred_sigma[ti]
                im_ucb[task] = pred_mean[ti] + self.eta * pred_sigma[ti]

            new_tasks = sorted(new_tasks, key=lambda task: im_ucb[task], reverse=True)
            new_tasks_selected = new_tasks[:K]

            self.passive_tasks = self.passive_tasks.union(self.active_tasks).difference(new_tasks_selected)
            self.active_tasks = new_tasks_selected

            # collect diagnostics.
            self.diagnostics['im'] = im
            self.diagnostics['pred'] = im_pred
            self.diagnostics['sigma'] = im_sigma
            self.diagnostics['ucb'] = im_ucb
            self.diagnostics['score'] = self.task_score
            self.diagnostics['new-tasks'] = new_tasks
            self.diagnostics['new-tasks-selected'] = new_tasks_selected
            self.diagnostics['active-tasks'] = active_tasks
Example #19
 def get_action(self, state, valid_actions, **kwargs):
     action = prob.choice(valid_actions, 1)[0]
     return action
Example #20
def generate_experience_mt(policy, tasks, budget_experience, budget_per_episode=None, state_attr='curr_state'):
    experiences = []
    while len(experiences) < budget_experience:
        task = prob.choice(tasks, 1)[0]
        experiences.extend(generate_experience(policy, task, budget_experience - len(experiences), budget_per_episode, budget_episodes=1, state_attr=state_attr))
    return experiences
Example #21
    def run(self, tasks, num_epochs=1):
        # set local variables.
        K = self.K
        K0 = self.K0

        # initial round.
        if len(self.active_tasks) == 0:
            # choose a set of active tasks uniformly at random.
            if self.init_tasks:
                self.active_tasks = set(prob.choice(self.init_tasks, size=K0, replace=False))
            else:
                self.active_tasks = set(prob.choice(tasks, size=K0, replace=False))

        for ni in range(num_epochs):
            active_tasks = self.active_tasks
            curr_tasks = active_tasks

            # compute old score if necessary.
            for task in curr_tasks:
                if task not in self.task_score:
                    self.task_score[task] = self.eval_func(task)

            # learn on each task.
            for task in active_tasks:
                self.train_func(task)

            # evaluate improvement.
            im = {}
            for task in curr_tasks:
                new_score = self.eval_func(task)
                improvement = new_score - self.task_score[task]
                self.task_score[task] = new_score
                if task not in self.im_mem:
                    self.im_mem[task] = []
                self.im_mem[task].append((self.time, improvement))

                im[task] = np.mean([i for (t, i) in self.im_mem[task] if self.time-t <= 3])

            # create candidate set.
            new_tasks = tasks
            if len(new_tasks) == 0:
                print 'WARNING: new tasks is empty in GP'

            # use Gaussian Process to estimate potential function.
            N = len(im)
            KXX = np.zeros((N, N))
            y = np.zeros(N)
            for (ti, (task_i, im_i)) in enumerate(im.items()):
                for (tj, (task_j, im_j)) in enumerate(im.items()):
                    KXX[ti, tj] = self.kernel_func(task_i, task_j)
            for (ti, (task_i, im_i)) in enumerate(im.items()):
                y[ti] = im_i

            M = len(new_tasks)
            KXsX = np.zeros((M, N))
            KXsXs = np.zeros((M, M))
            for (ti, task_i) in enumerate(new_tasks):
                for (tj, (task_j, im_j)) in enumerate(im.items()):
                    KXsX[ti, tj] = self.kernel_func(task_i, task_j)
                KXsXs[ti, ti] = self.kernel_func(task_i, task_i)

            KXXinv = npla.inv(KXX + self.sigma_n**2 * np.eye(N))

            pred_mean = np.dot(KXsX, np.dot(KXXinv, y))
            pred_cov = KXsXs - np.dot(KXsX, np.dot(KXXinv, np.transpose(KXsX)))
            pred_sigma = np.sqrt(np.diag(pred_cov))

            im_pred = {}
            im_sigma = {}
            im_ucb = {}
            for (ti, task) in enumerate(new_tasks):
                im_pred[task] = pred_mean[ti]
                im_sigma[task] = pred_sigma[ti]
                im_ucb[task] = pred_mean[ti] + self.eta * pred_sigma[ti]

            new_tasks = sorted(new_tasks, key=lambda task: im_ucb[task], reverse=True)
            new_tasks_selected = new_tasks[:K]

            self.active_tasks = self.active_tasks.union(set(new_tasks_selected))

            self.time += 1

            # collect diagnostics.
            self.diagnostics['im'] = im
            self.diagnostics['pred'] = im_pred
            self.diagnostics['sigma'] = im_sigma
            self.diagnostics['ucb'] = im_ucb
            self.diagnostics['score'] = self.task_score
            self.diagnostics['new-tasks'] = new_tasks
            self.diagnostics['new-tasks-selected'] = new_tasks_selected
            self.diagnostics['active-tasks'] = self.active_tasks
Example #22
    def _update_net(self):
        '''
            sample from the memory dataset and perform gradient descent on
            (target - Q(s, a))^2
        '''
        # don't update the network until sufficient experience has been
        # accumulated
        # removing this check might cause correlation for early samples; doing so is suggested when training with curricula.
        if self.total_exp < self.skip_frame:
            return
        if self.total_exp % self.update_freq:
            return
        #if len(self.experience) < self.memory_size:
        #    return
        for nn_bi in range(self.nn_num_batch):
            states = [None] * self.minibatch_size
            next_states = [None] * self.minibatch_size
            actions = np.zeros(self.minibatch_size, dtype=int)
            rewards = np.zeros(self.minibatch_size)
            nvas = []

            # sample and process minibatch
            # samples = random.sample(self.experience, self.minibatch_size) # draw without replacement.
            samples = prob.choice(self.experience, self.minibatch_size, replace=True) # draw with replacement.
            terminals = []
            for idx, sample in enumerate(samples):
                state, action, next_state, reward, meta = sample
                nva = meta['next_valid_actions']

                states[idx] = state
                actions[idx] = action
                rewards[idx] = reward
                nvas.append(nva)

                if next_state is not None:
                    next_states[idx] = next_state
                else:
                    next_states[idx] = state
                    terminals.append(idx)

            # convert states into tensor.
            states = np.array(states).astype(floatX)
            next_states = np.array(next_states).astype(floatX)

            # compute target reward + \gamma max_{a'} Q(ns, a')
            # Ensure target = reward when NEXT_STATE is terminal
            if self.target_freq > 0:
                next_qvals = self.dqn_frozen.fprop(next_states)
            else:
                next_qvals = self.dqn.fprop(next_states)

            use_DDQN = False
            next_vs = np.zeros(self.minibatch_size).astype(floatX)
            if use_DDQN: # double DQN.
                next_qvals_unfrozen = self.dqn.fprop(next_states)
                for idx in range(self.minibatch_size):
                    if idx not in terminals:
                        next_action_index = np.argmax(next_qvals_unfrozen[idx, nvas[idx]])
                        next_vs[idx] = next_qvals[idx, nvas[idx][next_action_index]]
            else:
                for idx in range(self.minibatch_size):
                    if idx not in terminals:
                        next_vs[idx] = np.max(next_qvals[idx, nvas[idx]])

            targets = rewards + self.gamma * next_vs

            #if (targets > 100.).any():
            #    print 'error, target > 1', targets
            #    print 'rewards', rewards
            #    print 'next_vs', next_vs

            # using regularization.
            reg_vs = []
            reg = self.regularizer.get('dqn-q')
            if reg:
                dqn = reg['dqn']
                #dqn_avs = dqn.fprop(states)
                dqn_avs = self.dqn_frozen.fprop(states)
                #dqn_avs = next_qvals
                #for idx in range(self.minibatch_size):
                #    if idx not in terminals:
                #        dqn_avs[idx, :] = 0.
                reg_vs.append(dqn_avs)



            ## diagnostics.
            #print 'targets', targets
            #print 'next_qvals', next_qvals
            #print 'pure prop', self.dqn.fprop(states)
            #print 'prop', self.dqn.fprop(states)[range(states.shape[0]), actions]
            #print 'actions', actions
            nn_error = []
            for nn_it in range(self.nn_num_iter):
                if debug_flag and self.target_freq and self.total_exp % self.target_freq == 0:
                    print 'value before\n', self.dqn.fprop(states)[range(self.minibatch_size), actions]
                error = self.bprop(states, actions, targets.flatten(), *reg_vs)
                if debug_flag and self.target_freq and self.total_exp % self.target_freq == 0:
                    print 'nn_it', nn_it, 'error', error
                    print 'value after\n', self.dqn.fprop(states)[range(self.minibatch_size), actions]
                    print 'targets\n', targets
                    #print 'dqn vs\n', self.dqn.fprop(states)
                    #print 'dqn avs\n', dqn_avs
                    print 'next_qvals\n', next_qvals
                    print 'rewards', rewards
                    print 'total_exp', self.total_exp
                nn_error.append(float(error))
            self.diagnostics['nn-error'].append(nn_error)
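The use_DDQN branch above is the double-DQN estimator: the online network picks the next action among the valid ones and the frozen target network evaluates it, which reduces the over-estimation bias of taking a plain max. A per-sample sketch:

import numpy as np

def ddqn_next_value(q_online_row, q_frozen_row, valid_actions):
    # argmax over valid actions with the online net, evaluated by the frozen net.
    q_online_row = np.asarray(q_online_row)
    best = valid_actions[int(np.argmax(q_online_row[valid_actions]))]
    return np.asarray(q_frozen_row)[best]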
Example #23
 def get_action(self, state, valid_actions=None):
     if not valid_actions:
         valid_actions = range(self.num_actions)
     action = prob.choice(valid_actions, 1)[0]
     return action
Example #24
from pyrl.tasks.pyale import PythonGame
from pyrl.tasks.pyale.pong import PongGame
from pyrl.utils import Timer
from pyrl.visualize.visualize import *
from pyrl.prob import choice

game = PongGame()

with Timer('valid actions'):
    for it in range(100):
        print 'valid_actions', game.valid_actions

vr = RawVideoRecorder('video.m4v', (640, 480))
for it in range(100):
    action = choice(range(game.num_actions), 1)[0]
    reward = game.step(action)
    print 'state', game.curr_state
    print 'is_end', game.is_end()
    #vr.write_frame(game.visualize_raw())
    print 'action', action, 'reward', reward
vr.stop()
Example #25

# assumed imports (module paths follow Example #24); the module providing
# AtariGame is project-specific and not shown here, so its import is omitted.
from StringIO import StringIO
from pyrl.prob import choice
from pyrl.visualize.visualize import *  # VideoRecorder

def callback(task):
    # dump the current frame to a temporary jpg and feed it to the video recorder.
    task.visualize(fig=1, fname="__cache__.jpg", format="jpg")
    with open("__cache__.jpg", "rb") as f:
        data = f.read()
    vr.write_frame(data)


game = AtariGame("data/roms/pong.bin", live=True, skip_frame=65)
# plt.imshow(game._curr_frame, cmap='Greys_r', interpolation='none')
# plt.show()

vr = VideoRecorder("video.m4v")

buf = StringIO()
count = 0
while not game.is_end():
    count += 1
    a = choice(game.valid_actions, 1)[0]
    print "action", a
    print "game", game.valid_actions
    a = 12
    # game.visualize(fig=1, fname=buf, format='jpg')
    print game.curr_state.shape
    game.step(a)
    callback(game)

vr.stop()
Example #26
    def run(self, task, num_episodes=100, num_steps=float('inf'), tol=1e-4, debug=False):
        '''
        update qval every *num_epoch*
        for every *num_epoch*, run *num_episodes* of MCTS.
        '''
        cum_rewards = []
        total_steps = 0.

        for ei in range(num_episodes):
            count_steps = 0.
            cum_reward = 0.
            factor = 1.
            history = []
            phase_expansion = False

            task.reset()

            while True:
                if total_steps > num_steps or count_steps >= np.log(tol) / np.log(self.gamma) or task.is_end():
                    self.backprop(history)
                    break

                curr_state = task.curr_state
                meta = {}

                unvisited_actions = [action for action in task.valid_actions if self.qval.get(curr_state, action) is None]

                if not phase_expansion and unvisited_actions: # can we switch back to qval if unvisited is empty?
                    phase_expansion = True
                    action = prob.choice(unvisited_actions, 1)[0]
                    meta['phase'] = 'selection'
                elif phase_expansion: # expand.
                    meta['phase'] = 'expansion'
                    if self.default_policy == 'random':
                        action = self.random_policy.get_action(curr_state, valid_actions=task.valid_actions)
                    elif self.default_policy == 'rb-eps':
                        action = self.rb.get_action(curr_state, valid_actions=task.valid_actions, method='eps-greedy', epsilon=0.05)

                else: # select.
                    meta['phase'] = 'selection'
                    action = self.qval.get_action(curr_state, valid_actions=task.valid_actions, method='uct', uct=self.uct, param_c=self.param_c, debug=False)
                    # action = self.qval.get_action(curr_state, valid_actions=task.valid_actions, method='eps-greedy', epsilon=0.05)

                meta['valid_actions'] = task.valid_actions

                reward = task.step(action)
                cum_reward = cum_reward + factor * reward
                factor *= self.gamma

                history.append((curr_state, action, reward, meta))
                count_steps += 1
                total_steps += 1
                self.total_exp += 1

            cum_rewards.append(cum_reward)

            if total_steps > num_steps:
                break

        task.reset()
        print 'ei', ei
        print 'cum', cum_rewards
        return np.mean(cum_rewards)
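self.backprop(history) is not shown in this snippet; whatever update it performs, it consumes a trajectory of (state, action, reward, meta) tuples. The discounted returns such a backup would typically propagate can be computed by scanning the history backwards, as in this sketch:

def discounted_returns(history, gamma):
    # returns[i] = r_i + gamma * r_{i+1} + gamma^2 * r_{i+2} + ...
    returns, ret = [], 0.
    for (_state, _action, reward, _meta) in reversed(history):
        ret = reward + gamma * ret
        returns.append(ret)
    return list(reversed(returns))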
Example #27
    def update_net(self, num_iter=1):
        '''
            sample from the memory dataset and perform gradient descent on
            (target - Q(s, a))^2
        '''
        #if self.total_exp_by_task[task] < self.memory_size:
        #    return

        # merge experience buffer.
        experience = []
        for task in self.ex_task:
            experience.extend(self.ex_task[task])

        errors = []

        for it in range(num_iter):
            # don't update the network until sufficient experience has been
            # accumulated
            states = [None] * self.minibatch_size
            next_states = [None] * self.minibatch_size
            actions = np.zeros(self.minibatch_size, dtype=int)
            rewards = np.zeros(self.minibatch_size)
            nvas = []

            # sample and process minibatch
            # samples = random.sample(self.experience, self.minibatch_size) # draw without replacement.
            samples = prob.choice(experience, self.minibatch_size, replace=True) # draw with replacement.
            terminals = []

            for idx, sample in enumerate(samples):
                state, action, next_state, reward, nva = sample

                states[idx] = state
                actions[idx] = action
                rewards[idx] = reward
                nvas.append(nva)

                if next_state is not None:
                    next_states[idx] = next_state
                else:
                    next_states[idx] = state
                    terminals.append(idx)

            # convert states into tensor.
            states = np.array(states)
            next_states = np.array(next_states)

            # compute target reward + \gamma max_{a'} Q(ns, a')
            # Ensure target = reward when NEXT_STATE is terminal
            next_qvals = self.dqn.fprop(next_states)
            next_vs = np.zeros(self.minibatch_size)
            for idx in range(self.minibatch_size):
                if idx not in terminals:
                    next_vs[idx] = np.max(next_qvals[idx, nvas[idx]])

            targets = rewards + self.gamma * next_vs

            ## diagnostics.
            #print 'targets', targets
            #print 'next_qvals', next_qvals
            #print 'pure prop', self.dqn.fprop(states)
            #print 'prop', self.dqn.fprop(states)[range(states.shape[0]), actions]
            #print 'actions', actions
            #for it in range(10):
            error = self.bprop(states, actions, targets.flatten())
            errors.append(error)

            #print 'it', it, 'error', error
        return np.mean(errors)
Example #28
    def run(self, num_epochs=1, num_episodes=1):
        cov_func = lambda task1, task2, t1, t2: self.gpt_v * np.exp(- (self.dist(task1, task2) ** 2 * self.gpt_r + self.gpt_eta * (t1 - t2) ** 2))
        for ei in range(num_epochs):
            # task selection.
            # complexity max(#task * history, history ** 2.3)
            if len(self.examples) == 0: # no prior experience, choose randomly.
                task = prob.choice(self.tasks, 1)[0]
            else:
                # GP-t.
                mu = np.zeros(self.num_tasks)
                sigma = np.zeros(self.num_tasks)
                ucb = np.zeros(self.num_tasks)
                # Kinv = npla.inv(self.K + self.gpt_sigma ** 2)
                # Kinv_y = np.dot(Kinv, self.y)
                Kinv_y = npla.solve(self.K + np.eye(self.t) * self.gpt_sigma ** 2, self.y)
                for ti, task in enumerate(self.tasks):
                    vec = np.zeros(self.t)
                    for ej in range(self.t):
                        (t_ej, task_ej, _) = self.examples[ej]
                        vec[ej] = cov_func(task, task_ej, self.t, t_ej)
                    mu[ti] = np.dot(vec, Kinv_y)
                    Kinv_vec = npla.solve(self.K + np.eye(self.t) * self.gpt_sigma ** 2, vec)
                    sigma[ti] = self.gpt_v + self.gpt_sigma ** 2 - np.dot(vec, Kinv_vec)
                    ucb[ti] = mu[ti] + self.gpt_kappa * sigma[ti]
                best_ti = np.argmax(ucb)
                task = self.tasks[best_ti]
                # store information for diagnosis.
                self.mu = mu
                self.sigma = sigma
                self.ucb = ucb

            # import pdb; pdb.set_trace()
            # run training.
            self._run_task(task, num_episodes=num_episodes)

            # evaluate performance.
            self.last_task_performance = np.zeros(self.num_tasks)
            for ti in range(self.num_tasks):
                self.last_task_performance[ti] = expected_reward_tabular_normalized(self.dqn, self.tasks[ti], tol=1e-4)
            performance = np.mean(self.last_task_performance)
            progress = performance - self.last_performance
            # update statistics.
            self.examples.append((self.t, task, progress))
            self.t += 1
            t = self.t

            new_K = np.zeros((t, t))
            new_y = np.zeros(t)
            if t > 1:
                new_K[:t - 1, :t - 1] = self.K
                new_y[:t - 1] = self.y
            new_K[t - 1, t - 1] = self.gpt_v
            new_y[t - 1] = progress
            for ej in range(t - 1):
                (t_ej, task_ej, _) = self.examples[ej]
                new_K[t - 1, ej] = cov_func(task_ej, task, t_ej, t - 1)
                new_K[ej, t - 1] = new_K[t - 1, ej] # symmetric.
            self.K = new_K
            self.y = new_y
            self.last_performance = performance
            self.last_progress = progress
            self.last_task = task
            self.last_task_ti = self.tasks.index(task)
Example #29
 def get_action(self, curr_state, valid_actions):
     action = choice(valid_actions, 1)[0]
     return action
Example #30
    def _get_relaxation_action(self, state_vector, dqn, uct, param_c, valid_actions, strategy='wa-state', debug=False):
        init_count = 1. # initial count for all actions.
        action_values = {action: self.av(state_vector)[action] for action in valid_actions}
        uct_values = {action: uct.count_sa(state_vector, action) for action in valid_actions}
        uct_state_values = {action: uct.count_s(state_vector) for action in valid_actions}
        # ucb = upper confidence bound.
        ucb = {action: action_values[action] + param_c * np.sqrt(np.log((len(valid_actions) * init_count + uct_state_values[action])) \
                            / (init_count + uct_values[action])) for action in valid_actions}
        # rb = relaxation bound.
        rb = {action: dqn.av(state_vector)[action] for action in valid_actions}

        print 'strategy', strategy
        # just use rb.
        if strategy == 'rb':
            finalb = rb

        # just use av.
        if strategy == 'av':
            finalb = action_values

        # min of upper bounds.
        if strategy == 'ucb-rb':
            finalb = {action: min(ucb[action], rb[action]) for action in valid_actions}

        #thres = 10
        #finalb = {action: ucb[action] if uct.count_sa(state_vector, action) > thres else rb[action]
        #          for action in valid_actions}

        #thres = 10
        #finalb = {action: ucb[action] if uct.count_s(state_vector) > thres else rb[action]
        #          for action in valid_actions}

        # weighted average.
        if strategy == 'wa-state':
            ratio = 1. / (1. + uct.count_s(state_vector))
            finalb = {action: ucb[action] * (1 - ratio) + rb[action] * ratio for action in valid_actions}

        # weighted average by state action.
        if strategy == 'wa':
            finalb = {}
            for action in valid_actions:
                ratio = 1. / (1. + uct.count_sa(state_vector, action))
                finalb[action] = action_values[action] * (1 - ratio) + rb[action] * ratio

        # duality-gap
        if strategy == 'duality-gap':
            gap2 = sum([(rb[action] - action_values[action]) **2 for action in valid_actions]) / len(valid_actions)
            ratio = max(0, 1 - np.std(rb.values()) **2  / gap2 / uct.count_s(state_vector))
            finalb = {action: ucb[action] * (1 - ratio) + rb[action] * ratio for action in valid_actions}
            if debug:
                print 'std of relaxation', np.std(rb.values())
                print 'mean gap', np.sqrt(gap2)
                print 'ratio', ratio

        # finalb = ucb

        # choose action.
        max_val = -float('inf')
        max_actions = []
        for (action, value) in finalb.items():
            if value > max_val:
                max_val = value
                max_actions = [action]
            elif value == max_val:
                max_actions.append(action)

        if debug:
            print 'action_values', action_values
            print 'uct_values', uct_values
            print 'uct_state_values', uct_state_values
            print 'ucb', ucb
            print 'rb', rb
            print 'finalb', finalb

        return prob.choice(max_actions, 1)[0]