def stepEnv(self, action=None):
    """Advance the wrapped environment by one action and record the step.

    If `action` is None, a random action is sampled from the environment's
    action space. On a successful episode end (reward > 0) the accumulated
    (mission, packed images, directions, actions) demo is stored, saved to
    `self.demos_path`, and the GUI moves on to the next seed; on failure
    the environment is shifted without saving.
    """
    # If no manual action was specified by the user, act randomly.
    if action is None:
        action = random.randint(0, self.env.action_space.n - 1)

    action = int(action)

    obs, reward, done, info = self.env.step(action)

    # Record the transition taken from the previous observation.
    self.current_actions.append(action)
    self.current_images.append(self.lastObs['image'])
    self.current_directions.append(self.lastObs['direction'])

    self.showEnv(obs)
    self.lastObs = obs

    if done:
        if reward > 0:  # i.e. we did not lose
            # Build the demo tuple once so the overwrite and append branches
            # store the exact same format.
            demo = (self.current_mission,
                    blosc.pack_array(np.array(self.current_images)),
                    self.current_directions,
                    self.current_actions)
            if self.shift < len(self.demos):
                # BUG FIX: this previously assigned
                # `self.current_demo, self.shift` — a 2-tuple referencing an
                # attribute this class never sets — corrupting the demo list.
                self.demos[self.shift] = demo
            else:
                self.demos.append(demo)
            utils.save_demos(self.demos, self.demos_path)
            self.missionBox.append('Demonstrations are saved.')
            utils.synthesize_demos(self.demos)

            self.shift += 1
            self.resetEnv()
        else:
            self.shiftEnv()
def stepEnv(self, action=None):
    """Step the environment once, recording the transition in the current demo.

    With no user-supplied `action`, a random action is drawn from the
    environment's action space. When a successful episode finishes, the
    demo is stored (or overwrites the slot at `self.shift`), saved, and the
    GUI resets; a failed episode just shifts to the next environment.
    """
    # Fall back to a random action when the user did not pick one.
    chosen = random.randint(0, self.env.action_space.n - 1) if action is None else action
    chosen = int(chosen)

    obs, reward, done, info = self.env.step(chosen)

    # Append the transition taken from the previous observation.
    self.current_demo.append((self.lastObs, chosen, reward, done))

    self.showEnv(obs)
    self.lastObs = obs

    if not done:
        return

    if reward <= 0:
        # Episode failed: discard it and move on to the next environment.
        self.shiftEnv()
        return

    # Success: store (or overwrite) the finished demonstration and persist.
    if self.shift < len(self.demos):
        self.demos[self.shift] = self.current_demo
    else:
        self.demos.append(self.current_demo)
    utils.save_demos(self.demos, args.env, "human")
    self.missionBox.append('Demonstrations are saved.')
    utils.synthesize_demos(self.demos)

    self.shift += 1
    self.resetEnv()
def generate_demos_cluster():
    """Generate demos in parallel: spawn one sub-job per shard, wait for all
    shards to fill up, then merge them into a single training demo file."""
    # Split the requested episode count evenly across the jobs.
    demos_per_job = args.episodes // args.jobs
    demos_path = utils.get_demos_path(args.demos, args.env, 'agent')
    # One shard file per job: <demos>.shard0, <demos>.shard1, ...
    job_demo_names = [
        os.path.realpath(demos_path + '.shard{}'.format(i))
        for i in range(args.jobs)
    ]
    # Remove stale shard files left over from a previous run.
    for demo_name in job_demo_names:
        job_demos_path = utils.get_demos_path(demo_name)
        if os.path.exists(job_demos_path):
            os.remove(job_demos_path)

    # Launch one subprocess per shard, reusing this script's CLI arguments
    # but overriding seed/demo path/episode count, and disabling further
    # job spawning and validation-demo generation in the children.
    processes = []
    command = [args.job_script]
    command += sys.argv[1:]
    for i in range(args.jobs):
        cmd_i = list(
            map(
                str, command + ['--seed', args.seed + i] +
                ['--demos', job_demo_names[i]] +
                ['--episodes', demos_per_job] + ['--jobs', 0] +
                ['--valid-episodes', 0]))
        logger.info('LAUNCH COMMAND')
        logger.info(cmd_i)
        process = subprocess.Popen(cmd_i)
        processes += [process]
    for p in processes:
        p.wait()

    # Poll until every shard file can be loaded and holds its full quota.
    job_demos = [None] * args.jobs
    while True:
        jobs_done = 0
        for i in range(args.jobs):
            if job_demos[i] is None or len(job_demos[i]) < demos_per_job:
                try:
                    logger.info("Trying to load shard {}".format(i))
                    job_demos[i] = utils.load_demos(
                        utils.get_demos_path(job_demo_names[i]))
                    logger.info("{} demos ready in shard {}".format(
                        len(job_demos[i]), i))
                except Exception:
                    # Shard may not exist yet or may be mid-write; retry on
                    # the next polling iteration.
                    logger.exception("Failed to load the shard")
            if job_demos[i] and len(job_demos[i]) == demos_per_job:
                jobs_done += 1
        logger.info("{} out of {} shards done".format(jobs_done, args.jobs))
        if jobs_done == args.jobs:
            break
        logger.info("sleep for 60 seconds")
        time.sleep(60)

    # Training demos: concatenate all shards into one file.
    all_demos = []
    for demos in job_demos:
        all_demos.extend(demos)
    utils.save_demos(all_demos, demos_path)
def main(args):
    """Run phased imitation learning: train a fresh model each phase, stop
    when validation success reaches 0.99, otherwise grow the demo set and
    continue. Resumes from 'status.json' when present."""
    args.model = args.model or ImitationLearning.default_model_name(args)
    utils.configure_logging(args.model)
    il_learn = ImitationLearning(args)

    # Define logger and Tensorboard writer
    header = ([
        "update", "frames", "FPS", "duration", "entropy", "policy_loss",
        "train_accuracy"
    ] + [
        "validation_accuracy", "validation_return", "validation_success_rate"
    ])
    writer = None
    if args.tb:
        from tensorboardX import SummaryWriter
        writer = SummaryWriter(utils.get_log_dir(args.model))

    # Define csv writer
    csv_path = os.path.join(utils.get_log_dir(args.model), 'log.csv')
    first_created = not os.path.exists(csv_path)
    # we don't buffer data going in the csv log, cause we assume
    # that one update will take much longer that one write to the log
    csv_writer = csv.writer(open(csv_path, 'a', 1))
    if first_created:
        csv_writer.writerow(header)

    # Log command, availability of CUDA, and model
    logger.info(args)
    logger.info("CUDA available: {}".format(torch.cuda.is_available()))
    logger.info(il_learn.acmodel)

    # Seed at which demo evaluation/generation will begin
    eval_seed = args.seed + len(il_learn.train_demos)

    # Phase at which we start
    cur_phase = 0

    # Try to load the status (if resuming)
    status_path = os.path.join(utils.get_log_dir(args.model), 'status.json')
    if os.path.exists(status_path):
        with open(status_path, 'r') as src:
            status = json.load(src)
        eval_seed = status.get('eval_seed', eval_seed)
        cur_phase = status.get('cur_phase', cur_phase)

    model_name = args.model

    for phase_no in range(cur_phase, args.phases):
        logger.info("Starting phase {} with {} demos, eval_seed={}".format(
            phase_no, len(il_learn.train_demos), eval_seed))

        # Each phase trains a different model from scratch
        args.model = model_name + ('_phase_%d' % phase_no)
        il_learn = ImitationLearning(args)

        # Train the imitation learning agent
        if len(il_learn.train_demos) > 0:
            # Each phase's model keeps its own training status file.
            train_status_path = os.path.join(utils.get_log_dir(args.model),
                                             'status.json')
            il_learn.train(il_learn.train_demos, writer, csv_writer,
                           train_status_path, header)

        # Stopping criterion: fraction of validation episodes with a
        # positive return.
        valid_log = il_learn.validate(args.val_episodes)
        success_rate = np.mean(
            [1 if r > 0 else 0 for r in valid_log[0]['return_per_episode']])

        if success_rate >= 0.99:
            logger.info(
                "Reached target success rate with {} demos, stopping".format(
                    len(il_learn.train_demos)))
            break

        # Not good enough yet: expand the training set for the next phase.
        eval_seed = grow_training_set(il_learn, il_learn.train_demos,
                                      eval_seed, args.demo_grow_factor,
                                      args.num_eval_demos)

        # Save the current demo generation seed (phase_no + 1 so a resume
        # starts at the next phase)
        with open(status_path, 'w') as dst:
            status = {'eval_seed': eval_seed, 'cur_phase': phase_no + 1}
            json.dump(status, dst)

        # Save the demos
        demos_path = utils.get_demos_path(args.demos, args.env,
                                          args.demos_origin, valid=False)
        print('saving demos to:', demos_path)
        utils.save_demos(il_learn.train_demos, demos_path)
def generate_demos(n_episodes, valid, seed, shift=0):
    """Roll out the loaded agent in `args.env` until `n_episodes` successful
    (reward > 0) episodes are recorded, saving demos periodically and at
    the end.

    Each demo is (mission, blosc-packed image array, directions, actions).
    `shift` skips that many initial resets; `valid` selects the validation
    demo path.
    """
    utils.seed(seed)

    # Generate environment
    env = gym.make(args.env)
    env.seed(seed)
    # Burn through `shift` episodes so generation starts at a later mission.
    for i in range(shift):
        env.reset()

    agent = utils.load_agent(env, args.model, args.demos, 'agent',
                             args.argmax, args.env)
    demos_path = utils.get_demos_path(args.demos, args.env, 'agent', valid)
    demos = []

    checkpoint_time = time.time()

    while True:
        # Run the expert for one episode
        done = False
        obs = env.reset()
        agent.on_reset()

        actions = []
        mission = obs["mission"]
        images = []
        directions = []

        try:
            while not done:
                action = agent.act(obs)['action']
                if isinstance(action, torch.Tensor):
                    action = action.item()
                new_obs, reward, done, _ = env.step(action)
                agent.analyze_feedback(reward, done)

                # Store the pre-step observation alongside the action taken.
                actions.append(action)
                images.append(obs['image'])
                directions.append(obs['direction'])

                obs = new_obs
            # Keep only successful episodes, optionally filtered by length.
            if reward > 0 and (args.filter_steps == 0
                               or len(images) <= args.filter_steps):
                demos.append((mission, blosc.pack_array(np.array(images)),
                              directions, actions))
                if len(demos) >= n_episodes:
                    break
            if reward == 0:
                if args.on_exception == 'crash':
                    raise Exception("mission failed")
                logger.info("mission failed")
        except Exception:
            if args.on_exception == 'crash':
                raise
            # Best-effort mode: log the failure and try another episode.
            logger.exception("error while generating demo #{}".format(
                len(demos)))
            continue

        if len(demos) and len(demos) % args.log_interval == 0:
            now = time.time()
            demos_per_second = args.log_interval / (now - checkpoint_time)
            to_go = (n_episodes - len(demos)) / demos_per_second
            logger.info(
                "demo #{}, {:.3f} demos per second, {:.3f} seconds to go".
                format(len(demos), demos_per_second, to_go))
            checkpoint_time = now

        # Save demonstrations
        if args.save_interval > 0 and len(
                demos) < n_episodes and len(demos) % args.save_interval == 0:
            logger.info("Saving demos...")
            utils.save_demos(demos, demos_path)
            logger.info("Demos saved")
            # print statistics for the last 100 demonstrations
            print_demo_lengths(demos[-100:])

    # Save demonstrations
    logger.info("Saving demos...")
    utils.save_demos(demos, demos_path)
    logger.info("Demos saved")
    print_demo_lengths(demos[-100:])
def generate_demos(n_episodes, valid, seed, shift=0):
    """Roll out the loaded agent until `n_episodes` successful demos are
    collected, re-seeding the env per demo so failed/crashed missions are
    retried with a fresh reset.

    Each demo is (mission, blosc-packed image array, directions, actions);
    with `args.pixels` the env is wrapped for RGB observations and the
    direction entries are None. `shift` is accepted but unused here —
    presumably kept for signature compatibility; verify against callers.
    """
    utils.seed(seed)

    # Generate environment
    env = gym.make(args.env)
    use_pixels = args.pixels
    if use_pixels:
        env = RGBImgPartialObsWrapper(env)

    agent = utils.load_agent(env, args.model, args.demos, 'agent',
                             args.argmax, args.env)
    demos_path = utils.get_demos_path(args.demos, args.env, 'agent', valid)
    demos = []

    checkpoint_time = time.time()

    just_crashed = False
    while True:
        if len(demos) == n_episodes:
            break

        done = False
        if just_crashed:
            # After a failure, do an extra un-seeded reset to find a mission
            # the bot can solve.
            logger.info(
                "reset the environment to find a mission that the bot can solve"
            )
            env.reset()
        else:
            # Deterministic seed per collected demo so runs are reproducible.
            env.seed(seed + len(demos))
        obs = env.reset()
        agent.on_reset()

        actions = []
        mission = obs["mission"]
        images = []
        directions = []

        try:
            while not done:
                action = agent.act(obs)['action']
                if isinstance(action, torch.Tensor):
                    action = action.item()
                new_obs, reward, done, _ = env.step(action)
                agent.analyze_feedback(reward, done)

                # Store the pre-step observation alongside the action taken.
                actions.append(action)
                images.append(obs['image'])
                if use_pixels:
                    # Pixel observations carry no 'direction' field.
                    directions.append(None)
                else:
                    directions.append(obs['direction'])

                obs = new_obs
            # Keep only successful episodes, optionally filtered by length.
            if reward > 0 and (args.filter_steps == 0
                               or len(images) <= args.filter_steps):
                demos.append((mission, blosc.pack_array(np.array(images)),
                              directions, actions))
                just_crashed = False

            if reward == 0:
                if args.on_exception == 'crash':
                    raise Exception(
                        "mission failed, the seed is {}".format(seed +
                                                                len(demos)))
                just_crashed = True
                logger.info("mission failed")
        except (Exception, AssertionError):
            if args.on_exception == 'crash':
                raise
            # Best-effort mode: log the failure and retry with a new reset.
            just_crashed = True
            logger.exception("error while generating demo #{}".format(
                len(demos)))
            continue

        if len(demos) and len(demos) % args.log_interval == 0:
            now = time.time()
            demos_per_second = args.log_interval / (now - checkpoint_time)
            to_go = (n_episodes - len(demos)) / demos_per_second
            logger.info(
                "demo #{}, {:.3f} demos per second, {:.3f} seconds to go".
                format(len(demos) - 1, demos_per_second, to_go))
            checkpoint_time = now

        # Save demonstrations
        if args.save_interval > 0 and len(
                demos) < n_episodes and len(demos) % args.save_interval == 0:
            logger.info("Saving demos...")
            utils.save_demos(demos, demos_path)
            logger.info("{} demos saved".format(len(demos)))
            # print statistics for the last 100 demonstrations
            print_demo_lengths(demos[-100:])

    # Save demonstrations
    logger.info("Saving demos...")
    utils.save_demos(demos, demos_path)
    logger.info("{} demos saved".format(len(demos)))
    print_demo_lengths(demos[-100:])
# Resume from any previously collected agent demos and report their stats.
demos = utils.load_demos(args.env, "agent")
utils.synthesize_demos(demos)

for episode in range(1, args.episodes + 1):
    # Roll out the expert for one full episode, recording each transition
    # as (observation, action, reward, done).
    obs = env.reset()
    trajectory = []
    done = False

    while not done:
        action = agent.get_action(obs)
        next_obs, reward, done, _ = env.step(action)
        agent.analyze_feedback(reward, done)
        trajectory.append((obs, action, reward, done))
        obs = next_obs

    demos.append(trajectory)

    # Periodically checkpoint the demos collected so far (skip the final
    # episode — it is saved unconditionally below).
    periodic_save = (args.save_interval > 0
                     and episode < args.episodes
                     and episode % args.save_interval == 0)
    if periodic_save:
        utils.save_demos(demos, args.env, "agent")
        utils.synthesize_demos(demos)

# Final save once all episodes are done.
utils.save_demos(demos, args.env, "agent")
utils.synthesize_demos(demos)
# Poll until every shard file can be loaded and holds its full quota of
# `demos_per_job` demos (the shard-producing jobs run as subprocesses).
while True:
    jobs_done = 0
    for i in range(args.jobs):
        if job_demos[i] is None or len(job_demos[i]) < demos_per_job:
            try:
                logger.info("Trying to load shard {}".format(i))
                job_demos[i] = utils.load_demos(
                    utils.get_demos_path(job_demo_names[i]))
                logger.info("{} demos ready in shard {}".format(
                    len(job_demos[i]), i))
            except Exception:
                # Shard may not exist yet or may be mid-write; retry on the
                # next polling iteration.
                logger.exception("Failed to load the shard")
        if job_demos[i] and len(job_demos[i]) == demos_per_job:
            jobs_done += 1
    logger.info("{} out of {} shards done".format(jobs_done, args.jobs))
    if jobs_done == args.jobs:
        break
    logger.info("sleep for 60 seconds")
    time.sleep(60)

# Training demos: concatenate all shards into a single demo file.
all_demos = []
for demos in job_demos:
    all_demos.extend(demos)
demos_path = utils.get_demos_path(args.demos, args.env, 'agent')
utils.save_demos(all_demos, demos_path)

# Validation demos: generated locally (not sharded), with a fixed seed of 0.
if args.valid_episodes:
    generate_demos(args.valid_episodes, True, 0)