import os
import pprint
from typing import Any, Dict, List, Optional

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import yaml
from textworld import EnvInfos

# Project-local helpers (Model, ItemScorer, Navigation, HAgent, StepCounter,
# Saver, StatisticsTracker, EventHandler, Event, count_parameters, _FILE_PREFIX)
# are assumed to be provided by the surrounding package; an illustrative sketch
# of the `Transition` container follows the class.


class CustomAgent:

    def __init__(self, verbose=False, **kwargs) -> None:
        # Load the config file
        config_file = kwargs.get('config_file_path', "config/config.yaml")
        with open(config_file) as reader:
            self.config = yaml.safe_load(reader)
        if kwargs.get('update_config_fun') is not None:
            self.config = kwargs['update_config_fun'](self.config)
        if verbose:
            pprint.pprint(self.config, width=1)

        # Choose the device
        self.device = 'cuda' if torch.cuda.device_count() > 0 else 'cpu'
        if kwargs.get('gpu') is not None:
            self.device = 'cuda:{}'.format(kwargs['gpu'])

        # Training settings
        self.batch_size = self.config['training']['batch_size']
        self.max_nb_steps_per_episode = self.config['training']['max_nb_steps_per_episode']
        self.nb_epochs = self.config['training']['nb_epochs']

        # Set up the statistics
        self._episode_has_started = False
        self.last_done = None
        self.mode = "test"
        self.counter = StepCounter(self.batch_size, self.max_nb_steps_per_episode)

        # Init the models and their optimizer
        self.model = Model(hidden_size=self.config['model']['hidden_size'],
                           device=self.device,
                           bidirectional=self.config['model']['bidirectional'],
                           hidden_linear_size=self.config['model']['hidden_linear_size'])
        self.item_scorer = ItemScorer(device=self.device)
        self.navigation_model = Navigation(device=self.device)
        if 'optimizer' in self.config['training']:
            self.optimizer = optim.Adam(self.model.parameters(),
                                        self.config['training']['optimizer']['learning_rate'])
        self.model_updates = 0
        self.model_loss = 0.
        if verbose:
            print(self.model)
            print('Total Model Parameters: {}'.format(count_parameters(self.model)))

        # Choose the agent
        self.agent = lambda device, model: HAgent(device=device, model=model,
                                                  item_scorer=self.item_scorer,
                                                  hcp=self.config['general']['hcp'],
                                                  navigation_model=self.navigation_model)

        # Command queue
        self.command_q = None

        # Saving and loading
        self.experiment_tag = self.config['checkpoint'].get('experiment_tag', 'NONAME')
        self.saver = Saver(model=self.model,
                           ckpt_path=self.config['checkpoint'].get('model_checkpoint_path', 'NOPATH'),
                           experiment_tag=self.experiment_tag,
                           load_pretrained=len(self.config['checkpoint']['pretrained_experiment_path']) > 0,
                           pretrained_model_path=os.path.join(
                               _FILE_PREFIX, self.config['checkpoint']['pretrained_experiment_path']),
                           device=self.device,
                           save_frequency=self.config['checkpoint'].get('save_frequency', 1E10))

        # Logging statistics
        tb_dir = None if 'tensorboard' not in self.config else os.path.join(
            self.config['tensorboard']['directory'], self.experiment_tag)
        self.statistics = StatisticsTracker(tb_dir=tb_dir)

        # EventHandler
        self.event_handler = EventHandler()
        self.event_handler.add(self.statistics.stats_episode_clear, Event.NEWEPISODE)
        self.event_handler.add(self.counter.new_episode, Event.NEWEPISODE)

    def _init_episode(self):
        """ Initialize settings for the start of a new game. """
        self.event_handler(Event.NEWEPISODE)
        self._episode_has_started = True
        self.transitions = [[] for _ in range(self.batch_size)]
        self.model.reset_hidden()
        self.last_score = np.array([0] * self.batch_size)
        self.last_done = [False] * self.batch_size
        self.model_updates = 0
        self.model_loss = 0.
        self.agents = [self.agent(device=self.device, model=self.model)
                       for _ in range(self.batch_size)]
        self.command_q = [[] for _ in range(self.batch_size)]

    def act_eval(self, obs: List[str], scores: List[int], dones: List[bool],
                 infos: List[Dict]):
        """ Agent step when it is in test mode. """
""" if all(dones): self._end_episode(obs, scores) return # individually for every agent in the batch for idx, (observation, score, done, info, cmd_q) in enumerate(zip(obs, scores, dones, infos, self.command_q)): if done: # placeholder command self.command_q[idx] = ['look'] if len(cmd_q) == 0: # only if add new command if there is nothing left in the queue for this agent new_cmds, _ = self.agents[idx].step(observation=observation, info=info) [self.command_q[idx].append(cmd) for cmd in new_cmds] self.counter.step() return [cmd_q.pop(0) for cmd_q in self.command_q] def act(self, obs: List[str], scores: List[int], dones: List[bool], infos: Dict[str, List[Any]]) -> Optional[List[str]]: """ Step of the agent. """ # re-structure infos infos = [{k: v[i] for k, v in infos.items()} for i in range(len(obs))] if not self._episode_has_started: self._init_episode() if self.mode == 'test': return self.act_eval(obs, scores, dones, infos) elif self.mode == 'manual_eval': return self.manual_eval(obs, scores, dones, infos) current_score = [] # individually for every agent in the batch for idx, (observation, score, done, last_done, info, cmd_q) in enumerate(zip(obs, scores, dones, self.last_done, infos, self.command_q)): just_finished = (last_done != done) if not done or just_finished: self.counter.increase_steps_taken(idx) if len(cmd_q) > 0: # has still commands to fire current_score.append(0.) continue if done and not just_finished: self.command_q[idx] = ['look'] current_score.append(0.) continue else: self.agents[idx].update_score(score) # update score current_score.append(self.agents[idx].current_score) # add new command new_cmds, learning_info = self.agents[idx].step(observation=observation, info=info) [self.command_q[idx].append(cmd) for cmd in new_cmds] # update the model self.model_update(done=done, index=learning_info.index, output=learning_info.score, value=learning_info.value, score=self.agents[idx].current_score, batch_idx=idx) self.last_done = dones self.statistics.stats_episode_append(score=np.mean(current_score)) if all(dones): self._end_episode(obs, scores, cmds=[agent.cmd_memory for agent in self.agents]) return self.saver.save(epoch=self.counter('epoch'), episode=self.counter('episode')) self.counter.step() return [cmd_q.pop(0) for cmd_q in self.command_q] def model_update(self, done, index, output, value, score, batch_idx): """ Store the information for the model update. After invoking it 'update_frequency' times for a specific agent the a2c update is performed. """ if self.transitions[batch_idx]: self.transitions[batch_idx][-1].reward = torch.Tensor([score])[0].type(torch.float).to(self.device) if len(self.transitions[batch_idx]) >= self.config['training']['update_frequency'] or done: # done == just_finished # do the update self._a2c_update(value, batch_idx) else: # add the transition self.transitions[batch_idx].append(Transition(reward=None, index=index, output=output, value=value, done=done)) def _a2c_update(self, value, batch_idx): """ Uses the stored model information from the last 'update_frequency' steps to perform an A2C update. 
""" # compute the returns and advantages from the last 'update_frequency' model steps returns, advantages = self._discount_rewards(value, self.transitions[batch_idx]) for transition, _return, advantage in zip(self.transitions[batch_idx], returns, advantages): reward, index, output, value, done = transition if done: continue advantage = advantage.detach() probs = F.softmax(output, dim=-1) log_probs = torch.log(probs) log_action_prob = log_probs[index] policy_loss = -log_action_prob * advantage value_loss = (.5 * (value - _return)**2) entropy = (-log_probs * probs).mean() # add up the loss over time self.model_loss += policy_loss + 0.5 * value_loss - 0.1 * entropy self.statistics.stats_episode_append( reward=reward, policy=policy_loss.item(), value=value_loss.item(), entropy=entropy.item(), confidence=torch.mean(torch.exp(log_action_prob)).item() ) self.model_updates += 1 self.transitions[batch_idx] = [] if self.model_loss == 0 or self.model_updates % self.batch_size != 0: # print('skipped') return # Only if all of the agents in the batch have performed their update the backpropagation is invoked to reduce # computational complexity self.statistics.stats_episode_append(loss=self.model_loss.item()) self.optimizer.zero_grad() self.model_loss.backward(retain_graph=True) nn.utils.clip_grad_norm_(self.model.parameters(), self.config['training']['optimizer']['clip_grad_norm']) self.optimizer.step() self.model_loss = 0. def _discount_rewards(self, last_value, transitions): """ Discounts the rewards of the agent over time to compute the returns and advantages. """ returns, advantages = [], [] R = last_value.data for t in reversed(range(len(transitions))): rewards, _, _, values, done = transitions[t] R = rewards + self.config['general']['discount_gamma'] * R adv = R - values returns.append(R) advantages.append(adv) return returns[::-1], advantages[::-1] def _end_episode(self, observation, scores, **kwargs): self._episode_has_started = False if self.mode != 'test': points, possible_points = self._get_points(observation, scores) self.statistics.flush_episode_statistics(possible_points=possible_points, episode_no=self.counter('episode'), steps=np.mean(self.counter('steps_taken')), points=points, **kwargs) def _get_points(self, obs, scores): """ Parses the obtained points from the last observation. """ batch_size = len(obs) points = [] possible_points = None for i in range(batch_size): try: points.append(int(obs[i].split('You scored ')[1].split(' out of a possible')[0])) possible_points = int(obs[i].split('out of a possible ')[1].split(',')[0]) except: points.append(scores[i]) possible_points = possible_points if possible_points is not None else 5 return points, possible_points def train(self) -> None: """ Tell the agent it is in training mode. """ self.mode = 'train' def eval(self) -> None: """ Tell the agent it is in evaluation mode. 
""" self.mode = 'test' self.model.reset_hidden() def select_additional_infos(self) -> EnvInfos: request_infos = EnvInfos() request_infos.description = True request_infos.inventory = True if self.config['general']['hcp'] >= 2: request_infos.entities = True request_infos.verbs = True if self.config['general']['hcp'] >= 4: request_infos.extras = ["recipe"] if self.config['general']['hcp'] >= 5: request_infos.admissible_commands = True # TEST request_infos.entities = True request_infos.verbs = True request_infos.extras = ["recipe", "walkthrough"] request_infos.admissible_commands = True return request_infos def started_new_epoch(self): """ Call this function from outside to let the agent know that a new epoch has started. """ self.counter.new_epoch()
import yaml

from slgep_lib import wrap_config
from utils import Saver
from mfea import mfea

# Load configuration
with open('config.yaml') as f:
    config = yaml.safe_load(f)

# Load benchmark
with open('atari_benchmark/multitask-benchmark.yaml') as f:
    benchmark = yaml.safe_load(f)

instances = []
for i in range(1, 41):
    if i not in [100]:  # exclusion list; currently excludes nothing
        instances.append('multi-' + str(i))

seeds = range(1, 21)
for seed in seeds:
    for instance in instances:
        data = benchmark[instance]
        config.update(data)
        config = wrap_config(config)
        saver = Saver(config, instance, seed)
        mfea(config, saver.append)
        saver.save()
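
# ---------------------------------------------------------------------------
# A minimal sketch (an assumption, not the actual `utils.Saver`) of a saver
# that fits the usage above: `saver.append` is handed to `mfea` as a
# per-iteration callback, and `saver.save()` writes the collected results out
# once the run finishes. The file layout and format here are illustrative only.
# ---------------------------------------------------------------------------
import json
import os


class ResultSaver:
    def __init__(self, config, instance, seed, out_dir='results'):
        self.config = config
        self.path = os.path.join(out_dir, '%s-seed%02d.json' % (instance, seed))
        self.rows = []

    def append(self, record):
        # Called by the optimizer once per iteration with a result record
        self.rows.append(record)

    def save(self):
        os.makedirs(os.path.dirname(self.path), exist_ok=True)
        with open(self.path, 'w') as f:
            json.dump(self.rows, f, indent=2)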
import numpy as np
import torch
import torch.nn as nn
from tensorboardX import SummaryWriter  # or torch.utils.tensorboard

# Project-local helpers (OneLinePrint, Vocab, Batcher, Model, Saver, Timer,
# hps, logger) are assumed to be defined in the surrounding module; a sketch
# of the `optimizer` helper follows the function.


def train():
    olp = OneLinePrint()

    logger.info('start building batch data')
    vocab = Vocab(hps.vocab_file, hps.vocab_size)
    batcher = Batcher(hps.data_path, vocab, hps, hps.single_pass)
    logger.info('end building batch data')
    logger.info('vocab size: %s' % vocab.size())

    criterion = nn.NLLLoss(ignore_index=vocab.pad_id())
    model = Model(vocab, hps)
    if hps.use_cuda:
        model = model.cuda()
    if hps.restore:
        model.load_state_dict(torch.load(hps.restore))
    opt = optimizer(hps.opt, model.parameters())
    if hps.ckpt_name != '':
        saver = Saver(hps.ckpt_path, hps.ckpt_name, model)
    # Writer for storing summaries
    if hps.store_summary:
        writer = SummaryWriter(comment='_' + hps.ckpt_name)

    logger.info('----Start training----')
    timer = Timer()
    timer.start()
    for step in range(hps.start_step, hps.num_iters + 1):
        # Forward -------------------------------------------------------------
        opt.zero_grad()
        batch = batcher.next_batch()
        (inputs, inp_lens, inp_pad,
         dec_inps, targets, dec_lens, dec_pad) = batch.expand(hps.use_cuda)
        outputs = model(dec_inps, dec_lens)  # outputs: (B * T * (1~3)U)
        loss = criterion(outputs.view(-1, vocab.size()), targets.view(-1))

        # Backward ------------------------------------------------------------
        loss.backward()
        # Gradient clipping
        global_norm = nn.utils.clip_grad_norm_(model.parameters(), hps.clip)
        opt.step()

        # Utils ---------------------------------------------------------------
        # Save a checkpoint
        if step % hps.ckpt_steps == 0 and hps.ckpt_name != '':
            saver.save(step, loss.item())
            olp.write('save checkpoint (step=%d)\n' % step)

        # Print the train loss and perplexity
        ppl = np.exp(loss.item())
        olp.write('step %s train loss: %f, ppl: %8.2f' % (step, loss.item(), ppl))
        olp.flush()

        # Store summaries
        if hps.store_summary and (step - 1) % hps.summary_steps == 0:
            writer.add_scalar('loss', loss.item(), step)
            writer.add_scalar('ppl', ppl, step)
            writer.add_scalar('global_norm', global_norm, step)
            if step - 1 != 0:
                lap_time, _ = timer.lap('summary')
                writer.add_scalar('avg time/step', lap_time / hps.summary_steps, step)

    if hps.store_summary:
        writer.close()
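
# ---------------------------------------------------------------------------
# `optimizer(hps.opt, model.parameters())` is referenced above but not shown;
# a minimal sketch, assuming it maps an option string to a torch optimizer.
# The supported names and the default learning rate are illustrative, not the
# project's actual helper.
# ---------------------------------------------------------------------------
import torch.optim as optim


def optimizer(name, params, lr=1e-3):
    factories = {
        'adam': lambda: optim.Adam(params, lr=lr),
        'adagrad': lambda: optim.Adagrad(params, lr=lr),
        'sgd': lambda: optim.SGD(params, lr=lr, momentum=0.9),
    }
    try:
        return factories[name.lower()]()
    except KeyError:
        raise ValueError('unknown optimizer: %s' % name)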