Example #1
    def _get_data(self, index, meters, mode):
        # Fetch the next pre-built batch from the data iterator for this mode.
        feed_dict = self.data_iterator[mode].next()

        meters.update(number=feed_dict['n'].data.numpy().mean())
        if args.use_gpu:
            feed_dict = as_cuda(feed_dict)
        return feed_dict

    def __call__(self, ind):
        raw_img, img, img_embedding, cap, cap_ext = self.dataset[ind]
        img_embedding_precomp = self.model.img_enc(as_cuda(as_variable(img_embedding).unsqueeze(0)))

        img = as_variable(img)
        img.requires_grad = True
        img_embedding_a = img_embedding = self.image_encoder(as_cuda(img.unsqueeze(0)))
        img_embedding = self.model.img_enc(img_embedding)

        txt = [cap]
        txt.extend(cap_ext)
        txt_embeddings, txt_var = self.enc_txt(txt)

        return Record(
                raw_img, cap, cap_ext,
                img, img_embedding, img_embedding_precomp,
                txt_var, txt_embeddings[0], txt_embeddings[1:]
        )
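
These snippets rely on small helper utilities (as_tensor, as_variable, as_cuda) that are not shown on this page. A minimal sketch of dict-aware versions, assuming they simply recurse over dict values and convert the leaves to (GPU) tensors:

import numpy as np
import torch


def as_tensor(obj):
    # Recursively convert dict values, numpy arrays and scalars to torch tensors.
    if isinstance(obj, dict):
        return {k: as_tensor(v) for k, v in obj.items()}
    if torch.is_tensor(obj):
        return obj
    if isinstance(obj, np.ndarray):
        return torch.from_numpy(obj)
    return torch.as_tensor(obj)


def as_cuda(obj):
    # Recursively move dict values and tensors onto the default GPU device.
    if isinstance(obj, dict):
        return {k: as_cuda(v) for k, v in obj.items()}
    if torch.is_tensor(obj):
        return obj.cuda()
    return obj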
Example #3
    def _get_result_given_player(self, index, meters, number, player, mode):
        assert mode in ['train', 'test', 'mining', 'inherit']
        params = dict(eval_only=True,
                      number=number,
                      play_name='{}_epoch{}_episode{}'.format(
                          mode, self.current_epoch, index))
        backup = None
        if mode == 'train':
            params['eval_only'] = False
            params['dataset'] = self.valid_action_dataset
            params['entropy_beta'] = self.entropy_beta
            meters.update(lr=self.lr, entropy_beta=self.entropy_beta)
        elif mode == 'test':
            params['dump'] = True
            params['use_argmax'] = True
        else:
            backup = copy.deepcopy(player)
            params['use_argmax'] = self.is_candidate

        succ, score, traj, length = run_episode(player, self.model, **params)
        meters.update(number=number, succ=succ, score=score, length=length)

        if mode == 'train':
            feed_dict = make_data(traj, args.gamma)
            feed_dict['entropy_beta'] = as_tensor(self.entropy_beta).float()

            # content from valid_move dataset
            states, actions, labels = \
                self.valid_action_dataset.sample_batch(args.batch_size)
            feed_dict['pred_states'] = as_tensor(states)
            feed_dict['pred_actions'] = as_tensor(actions)
            feed_dict['valid'] = as_tensor(labels).float()
            if args.use_gpu:
                feed_dict = as_cuda(feed_dict)
            return feed_dict
        else:
            message = ('> {} iter={iter}, number={number}, succ={succ}, '
                       'score={score:.4f}, length={length}').format(
                           mode, iter=index, **meters.val)
            return message, dict(succ=succ, number=number, backup=backup)
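
make_data(traj, args.gamma) is not shown on this page; judging from how its result is consumed (a feed dict with 'states', 'actions', 'rewards' and 'discount_rewards' entries), it most likely builds REINFORCE-style discounted returns from the trajectory. A hypothetical sketch under that assumption:

import numpy as np
import torch


def make_data(traj, gamma):
    # Discounted return for every step, computed backwards over the rewards.
    returns, running = [], 0.0
    for r in reversed(traj['rewards']):
        running = r + gamma * running
        returns.append(running)
    returns.reverse()
    return dict(
        states=torch.as_tensor(np.array(traj['states'])).float(),
        actions=torch.as_tensor(np.array(traj['actions'])),
        rewards=torch.as_tensor(np.array(traj['rewards'])).float(),
        discount_rewards=torch.as_tensor(np.array(returns)).float(),
    )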
Example #4
    def _get_result_given_player(self, index, meters, number, player, mode):
        assert mode in ['train', 'test', 'mining', 'inherit']
        params = dict(eval_only=True,
                      number=number,
                      play_name='{}_epoch{}_episode{}'.format(
                          mode, self.current_epoch, index))
        backup = None
        if mode == 'train':
            params['eval_only'] = False
            params['entropy_beta'] = self.entropy_beta
            meters.update(lr=self.lr, entropy_beta=self.entropy_beta)
        elif mode == 'test':
            params['dump'] = True
            params['use_argmax'] = True
        else:
            backup = copy.deepcopy(player)
            params['use_argmax'] = self.is_candidate
        succ, score, traj, length, optimal = \
            run_episode(player, self.model, **params)
        meters.update(number=number,
                      succ=succ,
                      score=score,
                      length=length,
                      optimal=optimal)

        if mode == 'train':
            feed_dict = make_data(traj, args.gamma)
            feed_dict['entropy_beta'] = as_tensor(self.entropy_beta).float()

            if args.use_gpu:
                feed_dict = as_cuda(feed_dict)
            return feed_dict
        else:
            message = ('> {} iter={iter}, number={number}, succ={succ}, '
                       'score={score:.4f}, length={length}, '
                       'optimal={optimal}').format(mode, iter=index, **meters.val)
            return message, dict(succ=succ, number=number, backup=backup)
Example #5
    def _get_result_given_player(self, index, meters, number, player, mode):
        assert mode in ['train', 'test', 'mining', 'mining-deter', 'mining-stoch', 'inherit', 'test-inter', 'test-inter-deter', 'test-deter']
        params = dict(
            eval_only=True,
            number=number,
            play_name='{}_epoch{}_episode{}'.format(mode, self.current_epoch, index))
        backup = None
        if mode == 'train':
            params['eval_only'] = False
            params['dataset'] = self.valid_action_dataset
            params['entropy_beta'] = self.entropy_beta
            meters.update(lr=self.lr, entropy_beta=self.entropy_beta)
        elif 'test' in mode:
            params['dump'] = True
            params['use_argmax'] = 'deter' in mode
        else:
            backup = copy.deepcopy(player)
            params['use_argmax'] = index < (args.mining_epoch_size//2)

        if mode == 'train':
            if args.use_gpu:
                self.model.cpu()

            mergedfc = []
            for i in range(args.ntrajectory):
                succ, score, traj, length, optimal = run_episode(player, self.model, mode, need_restart=(i!=0), **params)
                if args.task in ['sort', 'path']:
                    meters.update(number=number, succ=succ, score=score, length=length, optimal=optimal)
                else:
                    meters.update(number=number, succ=succ, score=score, length=length)
                feed_dict = make_data(traj, args.gamma)
                # content from valid_move dataset
                if args.pred_weight != 0.0:
                    states, actions, labels = self.valid_action_dataset.sample_batch(args.batch_size)
                    feed_dict['pred_states'] = as_tensor(states)
                    feed_dict['pred_actions'] = as_tensor(actions)
                    feed_dict['valid'] = as_tensor(labels).float()
                mergedfc.append(feed_dict)

            # Merge the per-trajectory feed dicts into a single batch by
            # concatenating along the batch dimension.
            for k in feed_dict.keys():
                if k not in ['rewards', 'entropy_beta']:  # rewards themselves are not used to update the loss
                    if isinstance(mergedfc[0][k], list):
                        f1 = [j[k][0] for j in mergedfc]
                        f2 = [j[k][1] for j in mergedfc]
                        feed_dict[k] = [torch.cat(f1, dim=0), torch.cat(f2, dim=0)]
                    else:
                        feed_dict[k] = torch.cat([j[k] for j in mergedfc], dim=0)
            feed_dict['entropy_beta'] = as_tensor(self.entropy_beta).float()
            feed_dict['training'] = as_tensor(True)

            if args.norm_rewards:
                if args.accum_grad > 1:
                    feed_dict['discount_rewards'] = self.model.rnorm.obs_filter(feed_dict['discount_rewards'])
                elif feed_dict['discount_rewards'].shape[0] > 1:
                    dr = feed_dict['discount_rewards']
                    feed_dict['discount_rewards'] = (dr - dr.mean()) / (dr.std() + 1e-7)

            # Dirty trick: tell the gradient-accumulating optimizer the size of this batch.
            if args.accum_grad > 1:
                self.optimizer.provide_batch_size(feed_dict['discount_rewards'].shape[0])

            if args.use_gpu:
                feed_dict = as_cuda(feed_dict)
                self.model.cuda()
            self.model.train()
            return feed_dict
        else:
            if args.use_gpu:
                self.model.cpu()
            succ, score, traj, length, optimal = run_episode(player, self.model, mode, **params)
            if args.task in ['sort', 'path']:
                meters.update(number=number, succ=succ, score=score, length=length, optimal=optimal)
                message = ('> {} iter={iter}, number={number}, succ={succ}, '
                           'score={score:.4f}, length={length}, '
                           'optimal={optimal}').format(mode, iter=index, **meters.val)
            else:
                meters.update(number=number, succ=succ, score=score, length=length)
                message = ('> {} iter={iter}, number={number}, succ={succ}, '
                           'score={score:.4f}, length={length}').format(mode, iter=index, **meters.val)
            return message, dict(succ=succ, number=number, backup=backup)
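
When args.norm_rewards is set and gradients are not accumulated, Example #5 standardizes the discounted returns in place (zero mean, unit variance, plus a small epsilon for numerical stability), a common variance-reduction trick for policy gradients. A tiny standalone illustration of that step:

import torch

discount_rewards = torch.tensor([1.0, 0.5, 0.25, 2.0])
normed = (discount_rewards - discount_rewards.mean()) / (discount_rewards.std() + 1e-7)
# normed now has (approximately) zero mean and unit standard deviation.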
Example #6
def run_episode(env,
                model,
                number,
                play_name='',
                dump=False,
                dataset=None,
                eval_only=False,
                use_argmax=False,
                need_restart=False,
                entropy_beta=0.0):
    """Run one episode using the model with $number blocks."""
    is_over = False
    traj = collections.defaultdict(list)
    score = 0
    if need_restart:
        env.restart()
    nr_objects = number + 1
    # If dump_play=True, store the states and actions in a json file
    # for visualization.
    dump_play = args.dump_play and dump
    if dump_play:
        array = env.unwrapped.current_state
        moves, new_pos, policies = [], [], []

    while not is_over:
        state = env.current_state
        feed_dict = dict(states=np.array([state]))
        feed_dict['entropy_beta'] = as_tensor(entropy_beta).float()
        feed_dict = as_tensor(feed_dict)
        if args.use_gpu:
            feed_dict = as_cuda(feed_dict)

        with torch.set_grad_enabled(not eval_only):
            output_dict = model(feed_dict)
        policy = output_dict['policy']
        p = as_numpy(policy.data[0])
        # `random` here is assumed to be a NumPy-compatible RNG (e.g. jacinle.random);
        # the standard-library random.choice does not accept a `p` argument.
        action = p.argmax() if use_argmax else random.choice(len(p), p=p)
        # Need to ensure that the env.utils.MapActionProxy is the outermost class.
        mapped_x, mapped_y = env.mapping[action]
        # env.unwrapped to get the innermost Env class.
        valid = env.unwrapped.world.moveable(mapped_x, mapped_y)
        reward, is_over = env.action(action)
        if dump_play:
            moves.append([mapped_x, mapped_y])
            res = tuple(env.current_state[mapped_x][2:])
            new_pos.append((int(res[0]), int(res[1])))

            logits = as_numpy(output_dict['logits'].data[0])
            tops = np.argsort(p)[-10:][::-1]
            tops = list(
                map(lambda x: (env.mapping[x], float(p[x]), float(logits[x])),
                    tops))
            policies.append(tops)

        # For now, assume reward=1 only when succeed, otherwise reward=0.
        # Manipulate the reward and get success information according to reward.
        if reward == 0 and args.penalty is not None:
            reward = args.penalty
        succ = 1 if is_over and reward > 0.99 else 0

        score += reward
        traj['states'].append(state)
        traj['rewards'].append(reward)
        traj['actions'].append(action)
        if not eval_only and dataset is not None and mapped_x != mapped_y:
            dataset.append(nr_objects, state, action, valid)

    # Dump json file as record of the playing.
    if dump_play and not (args.dump_fail_only and succ):
        array = array[:, 2:].astype('int32').tolist()
        array = [array[:nr_objects], array[nr_objects:]]
        json_str = json.dumps(
            # Let indent=True for an indented view of json files.
            dict(array=array, moves=moves, new_pos=new_pos, policies=policies))
        dump_file = os.path.join(
            args.current_dump_dir,
            '{}_blocks{}.json'.format(play_name, env.unwrapped.nr_blocks))
        with open(dump_file, 'w') as f:
            f.write(json_str)

    length = len(traj['rewards'])
    return succ, score, traj, length
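
The dataset passed into run_episode collects (state, action, valid) tuples during play, and the training code in Examples #3 and #5 later draws from it via valid_action_dataset.sample_batch. The class itself is not shown; a hypothetical minimal buffer with the same append/sample_batch interface could look like this:

import random

import numpy as np


class ValidActionDataset(object):
    # Hypothetical replay-style buffer of (state, action, valid) tuples.

    def __init__(self, maxlen=100000):
        self._data = []
        self._maxlen = maxlen

    def append(self, nr_objects, state, action, valid):
        # nr_objects is accepted only to mirror the call site in run_episode.
        self._data.append((state, action, int(valid)))
        if len(self._data) > self._maxlen:
            self._data.pop(0)

    def sample_batch(self, batch_size):
        # Assumes the buffer is non-empty; returns arrays usable with as_tensor.
        batch = random.sample(self._data, min(batch_size, len(self._data)))
        states, actions, labels = zip(*batch)
        return np.array(states), np.array(actions), np.array(labels)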
Example #7
def run_episode(env,
                model,
                number,
                play_name='',
                dump=False,
                eval_only=False,
                use_argmax=False,
                need_restart=False,
                entropy_beta=0.0):
    """Run one episode using the model with $number nodes/numbers."""
    is_over = False
    traj = collections.defaultdict(list)
    score = 0
    moves = []
    # If dump_play=True, store the states and actions in a json file
    # for visualization.
    dump_play = args.dump_play and dump

    if need_restart:
        env.restart()

    if args.is_path_task:
        optimal = env.unwrapped.dist
        relation = env.unwrapped.graph.get_edges()
        relation = np.stack([relation, relation.T], axis=-1)
        st, ed = env.current_state
        nodes_trajectory = [int(st)]
        destination = int(ed)
        policies = []
    elif args.is_sort_task:
        optimal = env.unwrapped.optimal
        array = [str(i) for i in env.unwrapped.array]

    while not is_over:
        if args.is_path_task:
            st, ed = env.current_state
            state = np.zeros((relation.shape[0], 2))
            state[st, 0] = 1
            state[ed, 1] = 1
            feed_dict = dict(states=np.array([state]),
                             relations=np.array([relation]))
        elif args.is_sort_task:
            state = env.current_state
            feed_dict = dict(states=np.array([state]))
        feed_dict['entropy_beta'] = as_tensor(entropy_beta).float()
        feed_dict = as_tensor(feed_dict)
        if args.use_gpu:
            feed_dict = as_cuda(feed_dict)

        with torch.set_grad_enabled(not eval_only):
            output_dict = model(feed_dict)

        policy = output_dict['policy']
        p = as_numpy(policy.data[0])
        action = p.argmax() if use_argmax else random.choice(len(p), p=p)
        reward, is_over = env.action(action)

        # collect moves information
        if dump_play:
            if args.is_path_task:
                moves.append(int(action))
                nodes_trajectory.append(int(env.current_state[0]))
                logits = as_numpy(output_dict['logits'].data[0])
                tops = np.argsort(p)[-10:][::-1]
                tops = list(
                    map(lambda x: (int(x), float(p[x]), float(logits[x])),
                        tops))
                policies.append(tops)
            if args.is_sort_task:
                # Need to ensure that env.utils.MapActionProxy is the outermost class.
                mapped_x, mapped_y = env.mapping[action]
                moves.append([mapped_x, mapped_y])

        # For now, assume reward=1 only when succeed, otherwise reward=0.
        # Manipulate the reward and get success information according to reward.
        if reward == 0 and args.penalty is not None:
            reward = args.penalty
        succ = 1 if is_over and reward > 0.99 else 0

        score += reward
        traj['states'].append(state)
        if args.is_path_task:
            traj['relations'].append(relation)
        traj['rewards'].append(reward)
        traj['actions'].append(action)

    # dump json file storing information of playing
    if dump_play and not (args.dump_fail_only and succ):
        if args.is_path_task:
            num = env.unwrapped.nr_nodes
            graph = relation[:, :, 0].tolist()
            coordinates = env.unwrapped.graph.get_coordinates().tolist()
            json_str = json.dumps(
                dict(graph=graph,
                     coordinates=coordinates,
                     policies=policies,
                     destination=destination,
                     current=nodes_trajectory,
                     moves=moves))
        if args.is_sort_task:
            num = env.unwrapped.nr_numbers
            json_str = json.dumps(dict(array=array, moves=moves))
        dump_file = os.path.join(args.current_dump_dir,
                                 '{}_size{}.json'.format(play_name, num))
        with open(dump_file, 'w') as f:
            f.write(json_str)

    length = len(traj['rewards'])
    return succ, score, traj, length, optimal
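
For context, a hedged sketch of how a caller might drive run_episode (with Example #7's signature) during evaluation; the env and model objects are assumed to already exist and be compatible with the code above:

n_episodes = 20
n_succ, total_length = 0, 0
for i in range(n_episodes):
    succ, score, traj, length, optimal = run_episode(
        env, model, number=10,
        play_name='eval_episode{}'.format(i),
        eval_only=True, use_argmax=True)
    n_succ += succ
    total_length += length
print('success rate: {:.2%}, average length: {:.1f}'.format(
    n_succ / n_episodes, total_length / n_episodes))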