class BotAgent:
    def __init__(self, env):
        """An agent based on a GOFAI bot."""
        self.env = env
        self.on_reset()

    def on_reset(self):
        self.bot = Bot(self.env)

    def act(self, obs=None, update_internal_state=True, *args, **kwargs):
        action = self.bot.get_action()
        if update_internal_state:
            self.bot.take_action(action)
        return {'action': action}

    def analyze_feedback(self, reward, done):
        pass
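# Hedged usage sketch (not in the original sources): running BotAgent for a single
# episode. It assumes `env` is a BabyAI-style gym environment with reset()/step(),
# matching the interfaces used elsewhere in these snippets; `run_bot_episode` is a
# hypothetical helper name.
def run_bot_episode(env):
    obs = env.reset()  # reset first so the bot plans against the current mission
    agent = BotAgent(env)
    done = False
    total_reward = 0.0
    while not done:
        action = agent.act(obs)['action']
        obs, reward, done, info = env.step(action)
        agent.analyze_feedback(reward, done)
        total_reward += reward
    return total_reward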
def __init__(self, args):
    self.args = args

    # seeding
    utils.seed(args.seed)

    self.env = gym.make(id=args.env)
    self.episodes = 300  # args.episodes
    self.horizon = self.env.max_steps
    self.initial_decay = 0.99  # args.decay

    self.observation_preprocessor = utils.ObssPreprocessor(
        model_name=args.model,
        obs_space=self.env.observation_space,
        load_vocab_from=getattr(self.args, 'pretrained_model', None))

    # TODO: for now I am only running the small model
    self.model = models.ACModel(obs_space=self.env.observation_space,
                                action_space=self.env.action_space)

    self.learner = ModelAgent(
        model_or_name=self.model,
        obss_preprocessor=self.observation_preprocessor,
        argmax=True)
    self.teacher = Bot(self.env)

    self.data = []

    self.observation_preprocessor.vocab.save()
    utils.save_model(self.model, args.model)

    self.model.train()
    if torch.cuda.is_available():
        self.model.cuda()

    self.optimizer = torch.optim.Adam(self.model.parameters(),
                                      self.args.lr,
                                      eps=self.args.optim_eps)
    self.scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer,
                                                     step_size=100,
                                                     gamma=0.9)

    self.device = torch.device(
        "cuda" if torch.cuda.is_available() else "cpu")
    if self.device.type == 'cpu':
        print('running on cpu...')
def __init__(self,
             start_loc='all',
             include_holdout_obj=True,
             num_meta_tasks=2,
             persist_agent=True,
             persist_goal=True,
             persist_objs=True,
             feedback_type=None,
             feedback_always=False,
             feedback_freq=False,
             intermediate_reward=False,
             cartesian_steps=1,
             **kwargs):
    """
    :param start_loc: which part of the grid to start the agent in. ['top', 'bottom', 'all']
    :param include_holdout_obj: if True, uses all objects; if False, excludes grey objects and boxes
    :param persist_agent: whether to keep the agent position the same across runs within a meta-task
    :param persist_goal: whether to keep the goal (i.e. textual mission string) the same across runs within a meta-task
    :param persist_objs: whether to keep object positions the same across runs within a meta-task
    :param feedback_type: list of teacher feedback type names (strings)
    :param feedback_always: whether to give each feedback type every time (rather than just when the agent needs help)
    :param kwargs: additional arguments passed to the parent class
    """
    assert start_loc in ['top', 'bottom', 'all']
    self.start_loc = start_loc
    self.intermediate_reward = intermediate_reward
    self.include_holdout_obj = include_holdout_obj
    self.persist_agent = persist_agent
    self.persist_goal = persist_goal
    self.persist_objs = persist_objs
    self.num_meta_tasks = num_meta_tasks
    self.task = {}
    self.itr = 0
    self.feedback_type = feedback_type
    super().__init__(**kwargs)
    if feedback_type is not None:
        self.oracle = {}
        teachers = {}
        for ft in feedback_type:
            if ft == 'PostActionAdvice':
                teacher = PostActionAdvice(Bot, self,
                                           feedback_always=feedback_always,
                                           feedback_frequency=feedback_freq,
                                           cartesian_steps=cartesian_steps)
            elif ft == 'PreActionAdvice':
                teacher = PreActionAdvice(Bot, self,
                                          feedback_always=feedback_always,
                                          feedback_frequency=feedback_freq,
                                          cartesian_steps=cartesian_steps)
            elif ft == 'CartesianCorrections':
                teacher = CartesianCorrections(Bot, self,
                                               feedback_always=feedback_always,
                                               feedback_frequency=feedback_freq,
                                               cartesian_steps=cartesian_steps)
            elif ft == 'SubgoalCorrections':
                teacher = SubgoalCorrections(Bot, self,
                                             feedback_always=feedback_always,
                                             feedback_frequency=feedback_freq,
                                             cartesian_steps=cartesian_steps)
            else:
                raise NotImplementedError
            teachers[ft] = teacher
            self.oracle[ft] = Bot(self)
        teacher = BatchTeacher(teachers)
    else:
        teacher = None
    self.teacher = teacher
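# Hedged usage sketch (not in the original sources): constructing a level with two
# teacher feedback types. `Level_GoToLocal` is a hypothetical subclass name; the
# keyword arguments mirror the __init__ signature above.
env = Level_GoToLocal(start_loc='all',
                      feedback_type=['PreActionAdvice', 'CartesianCorrections'],
                      feedback_always=False,
                      feedback_freq=False,
                      cartesian_steps=1)
# env.teacher is now a BatchTeacher wrapping one teacher per requested feedback
# type, and env.oracle maps each feedback type to its own Bot oracle.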
class BotAgent:
    def __init__(self, env, forget=False):
        """An agent based on a GOFAI bot."""
        self.env = env
        self.forget = forget
        self.on_reset()

    def on_reset(self):
        self.bot = Bot(self.env, forget=self.forget)

    def act(self, *args, **kwargs):
        return {'action': self.bot.step()}

    def analyze_feedback(self, reward, done):
        pass
def on_reset(self):
    self.bot = Bot(self.env)
num_success = 0
total_reward = 0
total_steps = []
total_bfs = 0
total_episode_steps = 0
total_bfs_steps = 0
for run_no in range(options.num_runs):
    level = level_dict[level_name]
    mission_seed = options.seed + run_no
    mission = level(seed=mission_seed)
    if not run_no % 1:
        print(run_no, mission.mission)
    expert = Bot(mission)
    if options.verbose:
        print('%s/%s: %s, seed=%d' % (run_no + 1, options.num_runs,
                                      mission.surface, mission_seed))
    optimal_actions = []
    before_optimal_actions = []
    non_optimal_steps = options.non_optimal_steps or int(mission.max_steps // 3)
    rng = Random(mission_seed)
    try:
        episode_steps = 0
        last_action = None
bad_agent = RandomAgent(seed=options.random_agent_seed)

start_time = time.time()

for level_name in level_list:
    num_success = 0
    total_reward = 0
    total_steps = 0
    for run_no in range(options.num_runs):
        level = level_dict[level_name]
        mission_seed = options.seed + run_no
        mission = level(seed=mission_seed)
        expert = Bot(mission)
        if options.verbose:
            print('%s/%s: %s, seed=%d' % (run_no + 1, options.num_runs,
                                          mission.surface, mission_seed))
        optimal_actions = []
        before_optimal_actions = []
        non_optimal_steps = options.non_optimal_steps or int(mission.max_steps // 3)
        rng = Random(mission_seed)
        try:
            episode_steps = 0
            while True:
def on_reset(self):
    self.bot = Bot(self.env, forget=self.forget)
class InteractiveIIL:
    def __init__(self, args):
        self.args = args

        # seeding
        utils.seed(args.seed)

        self.env = gym.make(id=args.env)
        self.episodes = 300  # args.episodes
        self.horizon = self.env.max_steps
        self.initial_decay = 0.99  # args.decay

        self.observation_preprocessor = utils.ObssPreprocessor(
            model_name=args.model,
            obs_space=self.env.observation_space,
            load_vocab_from=getattr(self.args, 'pretrained_model', None))

        # TODO: for now I am only running the small model
        self.model = models.ACModel(obs_space=self.env.observation_space,
                                    action_space=self.env.action_space)

        self.learner = ModelAgent(
            model_or_name=self.model,
            obss_preprocessor=self.observation_preprocessor,
            argmax=True)
        self.teacher = Bot(self.env)

        self.data = []

        self.observation_preprocessor.vocab.save()
        utils.save_model(self.model, args.model)

        self.model.train()
        if torch.cuda.is_available():
            self.model.cuda()

        self.optimizer = torch.optim.Adam(self.model.parameters(),
                                          self.args.lr,
                                          eps=self.args.optim_eps)
        self.scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer,
                                                         step_size=100,
                                                         gamma=0.9)

        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        if self.device.type == 'cpu':
            print('running on cpu...')

    def train(self):
        for episode in range(self.episodes):
            # probability of letting the teacher act decays over episodes
            alpha = self.initial_decay ** episode
            observation = self.env.reset()
            last_action = None
            done = False
            while not done:
                # with probability alpha the teacher acts, otherwise the learner does;
                # the teacher's action is always stored as the imitation label
                active_agent = np.random.choice(a=[self.teacher, self.learner],
                                                p=[alpha, 1. - alpha])
                optimal_action = self.teacher.replan(action_taken=last_action)
                if active_agent == self.teacher:
                    action = optimal_action
                else:
                    action = self.learner.act(observation)
                next_observation, reward, done, info = self.env.step(action)
                self.data.append([observation, optimal_action, done])
                last_action = action
                observation = next_observation
            self._train_epoch()

    def _train_epoch(self):
        batch_size = self.args.batch_size
        data_set_size = len(self.data)
        # shuffle so minibatches are drawn in random order over the aggregated dataset
        randomized_indexes = np.arange(0, len(self.data))
        np.random.shuffle(randomized_indexes)
        for index in range(0, data_set_size, batch_size):
            batch = [
                self.data[i]
                for i in randomized_indexes[index:index + batch_size]
            ]
            _log = self._train_batch(batch)

    def _train_batch(self, batch):
        pass
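# Hedged sketch (not in the original sources) of one way the _train_batch stub could
# be filled in: behavioural cloning against the teacher's action with a cross-entropy
# (negative log-likelihood) loss. It assumes the preprocessor can batch a list of raw
# observations and that the model returns an action distribution as unpacked below;
# both interface details are assumptions, not confirmed by these snippets.
def _train_batch(self, batch):
    observations = [obs for obs, _, _ in batch]
    expert_actions = torch.tensor([int(act) for _, act, _ in batch],
                                  device=self.device)
    preprocessed_obs = self.observation_preprocessor(observations, device=self.device)
    dist, _value, _memory = self.model(preprocessed_obs)  # assumed model output format
    loss = -dist.log_prob(expert_actions).mean()          # imitation loss
    self.optimizer.zero_grad()
    loss.backward()
    self.optimizer.step()
    return {'loss': loss.item()}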
level_list = [options.level]

start_time = time.time()

for level_name in level_list:
    num_success = 0
    total_reward = 0
    total_steps = 0
    for run_no in range(options.num_runs):
        level = level_dict[level_name]
        mission_seed = options.seed + run_no
        mission = level(seed=mission_seed)
        expert = Bot(mission, forget=options.forget)
        if options.verbose:
            print('%s/%s: %s, seed=%d' % (run_no + 1, options.num_runs,
                                          mission.surface, mission_seed))
        try:
            episode_steps = 0
            while True:
                action = expert.step()
                obs, reward, done, info = mission.step(action)
                total_reward += reward
                episode_steps += 1
def replan(self, action_taken=None):
    # Create an entirely new bot each time we need to plan
    bot = Bot(self.mission)
    action = bot.replan()
    return action
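# Note: because a fresh Bot is built on every call, no planning state is carried
# across steps and the action_taken argument is currently unused; the plan is
# recomputed from the current mission state each time replan() is called.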