class CustomAgent:
    def __init__(self, verbose=False, **kwargs) -> None:
        # Load the config file
        config_file = kwargs.get('config_file_path', "config/config.yaml")
        with open(config_file) as reader:
            self.config = yaml.safe_load(reader)
        if kwargs.get('update_config_fun') is not None:
            self.config = kwargs['update_config_fun'](self.config)
        if verbose:
            pprint.pprint(self.config, width=1)

        # choose device
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        if kwargs.get('gpu') is not None:
            self.device = 'cuda:{}'.format(kwargs['gpu'])

        # training settings
        self.batch_size = self.config['training']['batch_size']
        self.max_nb_steps_per_episode = self.config['training']['max_nb_steps_per_episode']
        self.nb_epochs = self.config['training']['nb_epochs']

        # set the statistics
        self._episode_has_started = False
        self.last_done = None
        self.mode = "test"
        self.counter = StepCounter(self.batch_size, self.max_nb_steps_per_episode)

        # Init the models and its optimizer
        self.model = Model(hidden_size=self.config['model']['hidden_size'],
                           device=self.device,
                           bidirectional=self.config['model']['bidirectional'],
                           hidden_linear_size=self.config['model']['hidden_linear_size'])
        self.item_scorer = ItemScorer(device=self.device)
        self.navigation_model = Navigation(device=self.device)
        if 'optimizer' in self.config['training']:
            self.optimizer = optim.Adam(self.model.parameters(),
                                        self.config['training']['optimizer']['learning_rate'])
        self.model_updates = 0
        self.model_loss = 0.

        if verbose:
            print(self.model)
            print('Total Model Parameters: {}'.format(count_parameters(self.model)))

        # agent factory: a fresh HAgent is created for every environment in the batch at the start of each episode
        self.agent = lambda device, model: HAgent(device=device, model=model, item_scorer=self.item_scorer,
                                                  hcp=self.config['general']['hcp'], navigation_model=self.navigation_model)
        # Command Queue
        self.command_q = None

        # Saving and Loading
        self.experiment_tag = self.config['checkpoint'].get('experiment_tag', 'NONAME')
        self.saver = Saver(model=self.model,
                           ckpt_path=self.config['checkpoint'].get('model_checkpoint_path', 'NOPATH'),
                           experiment_tag=self.experiment_tag,
                           load_pretrained=len(self.config['checkpoint']['pretrained_experiment_path']) > 0,
                           pretrained_model_path=os.path.join(_FILE_PREFIX, self.config['checkpoint']['pretrained_experiment_path']),
                           device=self.device,
                           save_frequency=self.config['checkpoint'].get('save_frequency', 1E10))

        # Logging Statistics
        tb_dir = None if 'tensorboard' not in self.config else os.path.join(self.config['tensorboard']['directory'],
                                                                            self.experiment_tag)
        self.statistics = StatisticsTracker(tb_dir=tb_dir)

        # EventHandler
        self.event_handler = EventHandler()
        self.event_handler.add(self.statistics.stats_episode_clear, Event.NEWEPISODE)
        self.event_handler.add(self.counter.new_episode, Event.NEWEPISODE)

    def _init_episode(self):
        """
        Initialize settings for the start of a new game.
        """
        self.event_handler(Event.NEWEPISODE)

        self._episode_has_started = True
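        # one transition buffer per environment in the batch; it accumulates the rollout consumed by the A2C update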
        self.transitions = [[] for _ in range(self.batch_size)]
        self.model.reset_hidden()
        self.last_score = np.array([0] * self.batch_size)
        self.last_done = [False] * self.batch_size
        self.model_updates = 0
        self.model_loss = 0.

        self.agents = [self.agent(device=self.device, model=self.model) for _ in range(self.batch_size)]
        self.command_q = [[] for _ in range(self.batch_size)]

    def act_eval(self, obs: List[str], scores: List[int], dones: List[bool], infos: List[Dict]):
        """
        Agent step when the agent is in test (evaluation) mode.
        """
        if all(dones):
            self._end_episode(obs, scores)
            return

        # individually for every agent in the batch
        for idx, (observation, score, done, info, cmd_q) in enumerate(zip(obs, scores, dones, infos, self.command_q)):
            if done:
                # placeholder command
                self.command_q[idx] = ['look']

            if len(cmd_q) == 0:
                # only request new commands if there is nothing left in the queue for this agent
                new_cmds, _ = self.agents[idx].step(observation=observation, info=info)
                self.command_q[idx].extend(new_cmds)

        self.counter.step()
        return [cmd_q.pop(0) for cmd_q in self.command_q]


    def act(self, obs: List[str], scores: List[int], dones: List[bool], infos: Dict[str, List[Any]]) -> Optional[List[str]]:
        """
        Perform one step of the agent for every environment in the batch.
        """
        # re-structure infos: convert the dict of lists into one info dict per environment in the batch
        infos = [{k: v[i] for k, v in infos.items()} for i in range(len(obs))]

        if not self._episode_has_started:
            self._init_episode()

        if self.mode == 'test':
            return self.act_eval(obs, scores, dones, infos)
        elif self.mode == 'manual_eval':
            return self.manual_eval(obs, scores, dones, infos)

        current_score = []
        # individually for every agent in the batch
        for idx, (observation, score, done, last_done, info, cmd_q) in enumerate(zip(obs, scores, dones, self.last_done, infos, self.command_q)):
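            # 'just_finished' is True on the first step an environment reports done, so its final score is still processed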
            just_finished = (last_done != done)
            if not done or just_finished:
                self.counter.increase_steps_taken(idx)

            if len(cmd_q) > 0:
                # has still commands to fire
                current_score.append(0.)
                continue

            if done and not just_finished:
                self.command_q[idx] = ['look']
                current_score.append(0.)
                continue
            else:
                self.agents[idx].update_score(score)

            # update score
            current_score.append(self.agents[idx].current_score)

            # add new command
            new_cmds, learning_info = self.agents[idx].step(observation=observation, info=info)
            self.command_q[idx].extend(new_cmds)

            # update the model
            self.model_update(done=done,
                              index=learning_info.index,
                              output=learning_info.score,
                              value=learning_info.value,
                              score=self.agents[idx].current_score,
                              batch_idx=idx)

        self.last_done = dones
        self.statistics.stats_episode_append(score=np.mean(current_score))

        if all(dones):
            self._end_episode(obs, scores, cmds=[agent.cmd_memory for agent in self.agents])
            return
        self.saver.save(epoch=self.counter('epoch'), episode=self.counter('episode'))
        self.counter.step()
        return [cmd_q.pop(0) for cmd_q in self.command_q]


    def model_update(self, done, index, output, value, score, batch_idx):
        """
        Store the information for the model update. After it has been invoked 'update_frequency' times for a given
        agent, the A2C update is performed.
        """
        if self.transitions[batch_idx]:
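            # the reward of the previous transition only becomes known after the following step, so it is filled in retroactively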
            self.transitions[batch_idx][-1].reward = torch.tensor(score, dtype=torch.float, device=self.device)

        # trigger an A2C update every 'update_frequency' steps, or when the episode has just finished
        if len(self.transitions[batch_idx]) >= self.config['training']['update_frequency'] or done:
            # do the update
            self._a2c_update(value, batch_idx)
        else:
            # add the transition
            self.transitions[batch_idx].append(Transition(reward=None,
                                                          index=index,
                                                          output=output,
                                                          value=value,
                                                          done=done))

    def _a2c_update(self, value, batch_idx):
        """
        Uses the stored model information from the last 'update_frequency' steps to perform an A2C update.
        """
        # compute the returns and advantages from the last 'update_frequency' model steps
        returns, advantages = self._discount_rewards(value, self.transitions[batch_idx])

        for transition, _return, advantage in zip(self.transitions[batch_idx], returns, advantages):
            reward, index, output, value, done = transition
            if done:
                continue

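            # standard A2C loss terms: a policy-gradient loss weighted by the detached advantage,
            # a squared-error value loss, and an entropy bonus that encourages exploration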
            advantage = advantage.detach()
            probs = F.softmax(output, dim=-1)
            log_probs = torch.log(probs)
            log_action_prob = log_probs[index]
            policy_loss = -log_action_prob * advantage
            value_loss = (.5 * (value - _return)**2)
            entropy = (-log_probs * probs).mean()

            # add up the loss over time
            self.model_loss += policy_loss + 0.5 * value_loss - 0.1 * entropy

            self.statistics.stats_episode_append(
                reward=reward,
                policy=policy_loss.item(),
                value=value_loss.item(),
                entropy=entropy.item(),
                confidence=torch.mean(torch.exp(log_action_prob)).item()
            )
        self.model_updates += 1

        self.transitions[batch_idx] = []

        if self.model_loss == 0 or self.model_updates % self.batch_size != 0:
            # wait until every agent in the batch has contributed an update before backpropagating
            return

        # Backpropagation is only invoked once all agents in the batch have performed their update,
        # which reduces the computational cost.
        self.statistics.stats_episode_append(loss=self.model_loss.item())
        self.optimizer.zero_grad()
        self.model_loss.backward(retain_graph=True)
        nn.utils.clip_grad_norm_(self.model.parameters(), self.config['training']['optimizer']['clip_grad_norm'])
        self.optimizer.step()

        self.model_loss = 0.

    def _discount_rewards(self, last_value, transitions):
        """
        Discounts the rewards of the agent over time to compute the returns and advantages.
        """
        returns, advantages = [], []
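        # bootstrap the return from the critic's estimate of the latest state, then discount backwards through the rollout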
        R = last_value.data
        for t in reversed(range(len(transitions))):
            rewards, _, _, values, done = transitions[t]
            R = rewards + self.config['general']['discount_gamma'] * R
            adv = R - values
            returns.append(R)
            advantages.append(adv)

        return returns[::-1], advantages[::-1]

    def _end_episode(self, observation, scores, **kwargs):
        self._episode_has_started = False

        if self.mode != 'test':
            points, possible_points = self._get_points(observation, scores)
            self.statistics.flush_episode_statistics(possible_points=possible_points,
                                                     episode_no=self.counter('episode'),
                                                     steps=np.mean(self.counter('steps_taken')),
                                                     points=points,
                                                     **kwargs)

    def _get_points(self, obs, scores):
        """
        Parses the obtained points from the last observation.
        """
        batch_size = len(obs)
        points = []
        possible_points = None
        for i in range(batch_size):
            try:
                points.append(int(obs[i].split('You scored ')[1].split(' out of a possible')[0]))
                possible_points = int(obs[i].split('out of a possible ')[1].split(',')[0])
            except (IndexError, ValueError):
                # the observation did not contain a final score message; fall back to the env score
                points.append(scores[i])
        possible_points = possible_points if possible_points is not None else 5
        return points, possible_points

    def train(self) -> None:
        """ Tell the agent it is in training mode. """
        self.mode = 'train'

    def eval(self) -> None:
        """ Tell the agent it is in evaluation mode. """
        self.mode = 'test'
        self.model.reset_hidden()

    def select_additional_infos(self) -> EnvInfos:
        request_infos = EnvInfos()
        request_infos.description = True
        request_infos.inventory = True
        if self.config['general']['hcp'] >= 2:
            request_infos.entities = True
            request_infos.verbs = True
        if self.config['general']['hcp'] >= 4:
            request_infos.extras = ["recipe"]
        if self.config['general']['hcp'] >= 5:
            request_infos.admissible_commands = True


        # TEST: request every info type regardless of the hcp level (this overrides the settings above)
        request_infos.entities = True
        request_infos.verbs = True
        request_infos.extras = ["recipe", "walkthrough"]
        request_infos.admissible_commands = True

        return request_infos

    def started_new_epoch(self):
        """
        Call this function from outside to let the agent know that a new epoch has started.
        """
        self.counter.new_epoch()
Example 2
from slgep_lib import wrap_config
from utils import Saver
from mfea import mfea
import argparse
import yaml

# Load configuration
with open('config.yaml') as f:
    config = yaml.safe_load(f)

# Load benchmark
with open('atari_benchmark/multitask-benchmark.yaml') as f:
    benchmark = yaml.safe_load(f)

# benchmark instances 'multi-1' .. 'multi-40' (the exclusion list is currently a no-op)
excluded = [100]
instances = ['multi-{}'.format(i) for i in range(1, 41) if i not in excluded]

seeds = range(1, 21)

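# run MFEA on every (seed, instance) combination and persist the results through the Saver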
for seed in seeds:
    for instance in instances:
        data = benchmark[instance]
        config.update(data)

        config = wrap_config(config)
        saver = Saver(config, instance, seed)

        mfea(config, saver.append)
        saver.save()
Example 3
def train():
    olp = OneLinePrint()

    logger.info('start building batch data')

    vocab = Vocab(hps.vocab_file, hps.vocab_size)
    batcher = Batcher(hps.data_path, vocab, hps, hps.single_pass)

    logger.info('end building batch data')
    logger.info('vocab size: %s' % vocab.size())

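    # negative log-likelihood loss over the decoder outputs, ignoring padded target positions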
    criterion = nn.NLLLoss(ignore_index=vocab.pad_id())

    model = Model(vocab, hps)
    if hps.use_cuda:
        model = model.cuda()
    if hps.restore:
        model.load_state_dict(torch.load(hps.restore))

    opt = optimzier(hps.opt, model.parameters())

    if hps.ckpt_name != '':
        saver = Saver(hps.ckpt_path, hps.ckpt_name, model)

    # for store summary
    if hps.store_summary:
        writer = SummaryWriter(comment='_' + hps.ckpt_name)

    # loss_sum = 0

    logger.info('----Start training----')
    timer = Timer()
    timer.start()
    for step in range(hps.start_step, hps.num_iters + 1):
        # # Decay learning rate
        # if step % hps.lr_decay_step == 0:
        #     olp.write(
        #         'decay learning rate to %f' % decay_lr(opt, step))

        # Forward -------------------------------------------------------------
        opt.zero_grad()

        batch = batcher.next_batch()
        (inputs, inp_lens, inp_pad, dec_inps, targets, dec_lens,
         dec_pad) = batch.expand(hps.use_cuda)

        outputs = model(dec_inps, dec_lens)  # output: (B*T*(1~3)U)
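        # flatten the batch and time dimensions so every token prediction is scored against its target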
        loss = criterion(outputs.view(-1, vocab.size()), targets.view(-1))

        # Backward ------------------------------------------------------------
        loss.backward()
        # gradient clipping
        global_norm = nn.utils.clip_grad_norm_(model.parameters(), hps.clip)
        opt.step()

        # loss_sum += loss.data[0]

        # Utils ---------------------------------------------------------------
        # save checkpoint
        if step % hps.ckpt_steps == 0 and hps.ckpt_name != '':
            saver.save(step, loss.item())
            olp.write('save checkpoint (step=%d)\n' % step)

        # print the train loss and ppl
        ppl = np.exp(loss.item())
        olp.write('step %s train loss: %f, ppl: %8.2f' %
                  (step, loss.item(), ppl))
        olp.flush()

        # store summary
        if hps.store_summary and (step - 1) % hps.summary_steps == 0:
            writer.add_scalar('loss', loss, step)
            writer.add_scalar('ppl', ppl, step)
            writer.add_scalar('global_norm', global_norm, step)
            if step - 1 != 0:
                lap_time, _ = timer.lap('summary')
                steps = hps.summary_steps
                writer.add_scalar('avg time/step', lap_time / steps, step)

        # print output and target
        # if step % hps.summary_steps == 0:
        #     logger.info('\nstep:%d~%d avg loss: %f', step - hps.summary_steps,
        #                 step, loss_sum / hps.summary_steps)
        #     loss_sum = 0

    if hps.store_summary:
        writer.close()