class BotAgent:
    def __init__(self, env):
        """An agent based on a GOFAI bot."""
        self.env = env
        self.on_reset()

    def on_reset(self):
        self.bot = Bot(self.env)

    def act(self, obs=None, update_internal_state=True, *args, **kwargs):
        action = self.bot.replan()
        return {'action': action}

    def analyze_feedback(self, reward, done):
        pass
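# Usage sketch (not part of the original code): BotAgent can stand in for a
# learned policy in an evaluation loop. This assumes a BabyAI/MiniGrid-style
# gym env; the level id below is only illustrative.

import gym

env = gym.make('BabyAI-GoToRedBall-v0')
obs = env.reset()
agent = BotAgent(env)  # construct after reset() so the mission exists

done = False
episode_return = 0.0
while not done:
    action = agent.act(obs)['action']
    obs, reward, done, info = env.step(action)
    agent.analyze_feedback(reward, done)
    episode_return += reward
print('episode return:', episode_return)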
import gym
import numpy as np
import torch

# Bot, ModelAgent, models, and utils are assumed to be provided by the
# surrounding BabyAI codebase.


class InteractiveIIL:
    def __init__(self, args):
        self.args = args

        # seeding
        utils.seed(args.seed)

        self.env = gym.make(id=args.env)
        self.episodes = 300  # args.episodes
        self.horizon = self.env.max_steps
        self.initial_decay = 0.99  # args.decay

        self.observation_preprocessor = utils.ObssPreprocessor(
            model_name=args.model,
            obs_space=self.env.observation_space,
            load_vocab_from=getattr(self.args, 'pretrained_model', None))

        # TODO: for now I am only running the small model
        self.model = models.ACModel(obs_space=self.env.observation_space,
                                    action_space=self.env.action_space)
        self.learner = ModelAgent(model_or_name=self.model,
                                  obss_preprocessor=self.observation_preprocessor,
                                  argmax=True)
        self.teacher = Bot(self.env)
        self.data = []

        self.observation_preprocessor.vocab.save()
        utils.save_model(self.model, args.model)

        self.model.train()
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        if self.device.type == 'cpu':
            print('running on cpu...')
        self.model.to(self.device)

        self.optimizer = torch.optim.Adam(self.model.parameters(),
                                          self.args.lr,
                                          eps=self.args.optim_eps)
        self.scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer,
                                                         step_size=100,
                                                         gamma=0.9)

    def train(self):
        for episode in range(self.episodes):
            # The probability of letting the teacher act decays over
            # episodes (DAgger-style mixing of expert and learner).
            alpha = self.initial_decay ** episode
            observation = self.env.reset()
            # The bot plans for a single mission, so it must be rebuilt
            # after every reset.
            self.teacher = Bot(self.env)
            last_action = None
            done = False
            while not done:
                active_agent = np.random.choice(
                    a=[self.teacher, self.learner],
                    p=[alpha, 1. - alpha])
                optimal_action = self.teacher.replan(action_taken=last_action)
                if active_agent is self.teacher:
                    action = optimal_action
                else:
                    # ModelAgent.act returns a dict, mirroring BotAgent.act.
                    action = self.learner.act(observation)['action']
                next_observation, reward, done, info = self.env.step(action)
                # Always store the expert's action, regardless of who acted.
                self.data.append([observation, optimal_action, done])
                last_action = action
                observation = next_observation
            self._train_epoch()
            self.scheduler.step()

    def _train_epoch(self):
        batch_size = self.args.batch_size
        data_set_size = len(self.data)
        # Shuffle indices so every pass over the aggregated data
        # produces fresh batches.
        randomized_indexes = np.arange(0, data_set_size)
        np.random.shuffle(randomized_indexes)
        for index in range(0, data_set_size, batch_size):
            batch = [
                self.data[i]
                for i in randomized_indexes[index:index + batch_size]
            ]
            _log = self._train_batch(batch)

    def _train_batch(self, batch):
        pass
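# `_train_batch` above is left as a stub. Below is a minimal
# behavioral-cloning sketch of what it could do, under these assumptions
# (not confirmed by the original code): the preprocessor is callable on a
# list of raw observations, the model exposes a `memory_size` attribute and
# returns a dict with a 'dist' entry, and recurrence is simply ignored by
# feeding a blank memory per sample.

import torch

def _train_batch(self, batch):
    # Unpack the aggregated (observation, optimal_action, done) triples.
    observations = [item[0] for item in batch]
    # Bot actions are IntEnum values, so int() yields the action index.
    actions = torch.tensor([int(item[1]) for item in batch],
                           dtype=torch.long, device=self.device)

    # Turn raw env observations into model-ready tensors.
    preprocessed = self.observation_preprocessor(observations,
                                                 device=self.device)

    # Treat every sample independently: start from a blank memory.
    memory = torch.zeros(len(batch), self.model.memory_size,
                         device=self.device)
    dist = self.model(preprocessed, memory)['dist']

    # Negative log-likelihood of the expert action under the learner policy.
    loss = -dist.log_prob(actions).mean()

    self.optimizer.zero_grad()
    loss.backward()
    self.optimizer.step()
    return {'loss': loss.item()}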
              (run_no + 1, options.num_runs, mission.surface, mission_seed))

    optimal_actions = []
    before_optimal_actions = []
    non_optimal_steps = options.non_optimal_steps or int(
        mission.max_steps // 3)
    rng = Random(mission_seed)

    try:
        episode_steps = 0
        last_action = None
        while True:
            # vis_mask = expert.vis_mask
            # expert = Bot(mission)
            # expert.vis_mask = vis_mask
            action = expert.replan(last_action)

            if options.advise_mode and episode_steps < non_optimal_steps:
                if rng.random() < options.bad_action_proba:
                    while True:
                        action = bad_agent.act(
                            mission.gen_obs())['action'].item()
                        fwd_pos = mission.agent_pos + mission.dir_vec
                        fwd_cell = mission.grid.get(*fwd_pos)
                        # The current bot can't recover from two kinds of behaviour:
                        # - opening a box (because the box just disappears)
                        # - closing a door (because its path-finding mechanism
                        #   gets confused)
                        opening_box = (action == mission.actions.toggle
                                       and fwd_cell and fwd_cell.type == 'box')
                        closing_door = (action == mission.actions.toggle
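# The rejection loop above discards sampled "bad" actions that the bot
# cannot recover from. A hypothetical helper, here called is_recoverable
# (not part of the original code), makes that check self-contained; it
# assumes a MiniGrid-style mission object:

def is_recoverable(mission, action):
    """Return False for actions the GOFAI bot cannot recover from."""
    fwd_pos = mission.agent_pos + mission.dir_vec
    fwd_cell = mission.grid.get(*fwd_pos)
    # Toggling a box destroys it, and toggling an open door closes it,
    # which confuses the bot's path finding.
    opening_box = (action == mission.actions.toggle
                   and fwd_cell and fwd_cell.type == 'box')
    closing_door = (action == mission.actions.toggle
                    and fwd_cell and fwd_cell.type == 'door'
                    and fwd_cell.is_open)
    return not (opening_box or closing_door)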
    def replan(self, action_taken=None):
        # Create an entirely new bot each time we need to plan; a fresh
        # bot has no prior plan to update, so action_taken goes unused.
        bot = Bot(self.mission)
        action = bot.replan()
        return action