Example #1
    def stepEnv(self, action=None):
        # If no manual action was specified by the user
        if action is None:
            action = random.randint(0, self.env.action_space.n - 1)
        action = int(action)

        obs, reward, done, info = self.env.step(action)

        self.current_actions.append(action)
        self.current_images.append(self.lastObs['image'])
        self.current_directions.append(self.lastObs['direction'])

        self.showEnv(obs)
        self.lastObs = obs

        if done:
            if reward > 0:  # i.e. we did not lose
                if self.shift < len(self.demos):
                    # Overwrite the demo at this index using the same
                    # 4-tuple layout as the append branch below
                    self.demos[self.shift] = (
                        self.current_mission,
                        blosc.pack_array(np.array(self.current_images)),
                        self.current_directions, self.current_actions)
                else:
                    self.demos.append(
                        (self.current_mission,
                         blosc.pack_array(np.array(self.current_images)),
                         self.current_directions, self.current_actions))
                utils.save_demos(self.demos, self.demos_path)
                self.missionBox.append('Demonstrations are saved.')
                utils.synthesize_demos(self.demos)

                self.shift += 1
                self.resetEnv()
            else:
                self.shiftEnv()
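
In Example #1 each finished demo is stored as a 4-tuple whose image stack is compressed with blosc before being saved. A minimal round-trip sketch (the array shape is a made-up placeholder; pack_array and unpack_array are the python-blosc calls used above):

import blosc
import numpy as np

# Hypothetical image stack: 12 steps of 7x7x3 grid observations.
images = np.zeros((12, 7, 7, 3), dtype=np.uint8)

packed = blosc.pack_array(images)      # compressed bytes, cheap to pickle
restored = blosc.unpack_array(packed)  # back to the original ndarray
assert np.array_equal(images, restored)
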
Example #2
    def stepEnv(self, action=None):
        # If no manual action was specified by the user
        if action is None:
            action = random.randint(0, self.env.action_space.n - 1)
        action = int(action)

        obs, reward, done, info = self.env.step(action)

        self.current_demo.append((self.lastObs, action, reward, done))

        self.showEnv(obs)
        self.lastObs = obs

        if done:
            if reward > 0:  # i.e. we did not lose
                if self.shift < len(self.demos):
                    self.demos[self.shift] = self.current_demo
                else:
                    self.demos.append(self.current_demo)
                utils.save_demos(self.demos, args.env, "human")
                self.missionBox.append('Demonstrations are saved.')
                utils.synthesize_demos(self.demos)

                self.shift += 1
                self.resetEnv()
            else:
                self.shiftEnv()
Example #3
def generate_demos_cluster():
    demos_per_job = args.episodes // args.jobs
    demos_path = utils.get_demos_path(args.demos, args.env, 'agent')
    job_demo_names = [
        os.path.realpath(demos_path + '.shard{}'.format(i))
        for i in range(args.jobs)
    ]
    for demo_name in job_demo_names:
        job_demos_path = utils.get_demos_path(demo_name)
        if os.path.exists(job_demos_path):
            os.remove(job_demos_path)

    processes = []

    command = [args.job_script]
    command += sys.argv[1:]
    for i in range(args.jobs):
        cmd_i = list(
            map(
                str, command + ['--seed', args.seed + i] +
                ['--demos', job_demo_names[i]] +
                ['--episodes', demos_per_job] + ['--jobs', 0] +
                ['--valid-episodes', 0]))
        logger.info('LAUNCH COMMAND')
        logger.info(cmd_i)

        process = subprocess.Popen(cmd_i)
        processes += [process]

    for p in processes:
        p.wait()

    job_demos = [None] * args.jobs
    while True:
        jobs_done = 0
        for i in range(args.jobs):
            if job_demos[i] is None or len(job_demos[i]) < demos_per_job:
                try:
                    logger.info("Trying to load shard {}".format(i))
                    job_demos[i] = utils.load_demos(
                        utils.get_demos_path(job_demo_names[i]))
                    logger.info("{} demos ready in shard {}".format(
                        len(job_demos[i]), i))
                except Exception:
                    logger.exception("Failed to load the shard")
            if job_demos[i] and len(job_demos[i]) == demos_per_job:
                jobs_done += 1
        logger.info("{} out of {} shards done".format(jobs_done, args.jobs))
        if jobs_done == args.jobs:
            break
        logger.info("sleep for 60 seconds")
        time.sleep(60)

    # Training demos
    all_demos = []
    for demos in job_demos:
        all_demos.extend(demos)
    utils.save_demos(all_demos, demos_path)
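
All of these examples treat utils.save_demos and utils.load_demos as opaque persistence helpers. A plausible minimal implementation, assuming plain pickle files (the real helpers may differ, e.g. in how directories are created):

import os
import pickle

def save_demos(demos, path):
    # Sketch: make sure the parent directory exists, then pickle the list.
    dirname = os.path.dirname(path)
    if dirname:
        os.makedirs(dirname, exist_ok=True)
    with open(path, "wb") as dst:
        pickle.dump(demos, dst)

def load_demos(path):
    # Sketch: load a previously pickled demo list.
    with open(path, "rb") as src:
        return pickle.load(src)
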
Example #4
def main(args):
    args.model = args.model or ImitationLearning.default_model_name(args)
    utils.configure_logging(args.model)
    il_learn = ImitationLearning(args)

    # Define logger and Tensorboard writer
    header = ([
        "update", "frames", "FPS", "duration", "entropy", "policy_loss",
        "train_accuracy"
    ] + [
        "validation_accuracy", "validation_return", "validation_success_rate"
    ])
    writer = None
    if args.tb:
        from tensorboardX import SummaryWriter
        writer = SummaryWriter(utils.get_log_dir(args.model))

    # Define csv writer
    csv_path = os.path.join(utils.get_log_dir(args.model), 'log.csv')
    first_created = not os.path.exists(csv_path)
    # We don't buffer data going into the csv log, because we assume
    # that one update will take much longer than one write to the log
    csv_writer = csv.writer(open(csv_path, 'a', 1))
    if first_created:
        csv_writer.writerow(header)

    # Log command, availability of CUDA, and model
    logger.info(args)
    logger.info("CUDA available: {}".format(torch.cuda.is_available()))
    logger.info(il_learn.acmodel)

    # Seed at which demo evaluation/generation will begin
    eval_seed = args.seed + len(il_learn.train_demos)

    # Phase at which we start
    cur_phase = 0

    # Try to load the status (if resuming)
    status_path = os.path.join(utils.get_log_dir(args.model), 'status.json')
    if os.path.exists(status_path):
        with open(status_path, 'r') as src:
            status = json.load(src)
            eval_seed = status.get('eval_seed', eval_seed)
            cur_phase = status.get('cur_phase', cur_phase)

    model_name = args.model

    for phase_no in range(cur_phase, args.phases):
        logger.info("Starting phase {} with {} demos, eval_seed={}".format(
            phase_no, len(il_learn.train_demos), eval_seed))

        # Each phase trains a different model from scratch
        args.model = model_name + ('_phase_%d' % phase_no)
        il_learn = ImitationLearning(args)

        # Train the imitation learning agent
        if len(il_learn.train_demos) > 0:
            train_status_path = os.path.join(utils.get_log_dir(args.model),
                                             'status.json')
            il_learn.train(il_learn.train_demos, writer, csv_writer,
                           train_status_path, header)

        # Stopping criterion
        valid_log = il_learn.validate(args.val_episodes)
        success_rate = np.mean(
            [1 if r > 0 else 0 for r in valid_log[0]['return_per_episode']])

        if success_rate >= 0.99:
            logger.info(
                "Reached target success rate with {} demos, stopping".format(
                    len(il_learn.train_demos)))
            break

        eval_seed = grow_training_set(il_learn, il_learn.train_demos,
                                      eval_seed, args.demo_grow_factor,
                                      args.num_eval_demos)

        # Save the current demo generation seed
        with open(status_path, 'w') as dst:
            status = {'eval_seed': eval_seed, 'cur_phase': phase_no + 1}
            json.dump(status, dst)

        # Save the demos
        demos_path = utils.get_demos_path(args.demos,
                                          args.env,
                                          args.demos_origin,
                                          valid=False)
        print('saving demos to:', demos_path)
        utils.save_demos(il_learn.train_demos, demos_path)
Example #5
def generate_demos(n_episodes, valid, seed, shift=0):
    utils.seed(seed)

    # Generate environment
    env = gym.make(args.env)
    env.seed(seed)
    for i in range(shift):
        env.reset()

    agent = utils.load_agent(env, args.model, args.demos, 'agent', args.argmax,
                             args.env)
    demos_path = utils.get_demos_path(args.demos, args.env, 'agent', valid)
    demos = []

    checkpoint_time = time.time()

    while True:
        # Run the expert for one episode

        done = False
        obs = env.reset()
        agent.on_reset()

        actions = []
        mission = obs["mission"]
        images = []
        directions = []

        try:
            while not done:
                action = agent.act(obs)['action']
                if isinstance(action, torch.Tensor):
                    action = action.item()
                new_obs, reward, done, _ = env.step(action)
                agent.analyze_feedback(reward, done)

                actions.append(action)
                images.append(obs['image'])
                directions.append(obs['direction'])

                obs = new_obs
            if reward > 0 and (args.filter_steps == 0
                               or len(images) <= args.filter_steps):
                demos.append((mission, blosc.pack_array(np.array(images)),
                              directions, actions))

            if len(demos) >= n_episodes:
                break
            if reward == 0:
                if args.on_exception == 'crash':
                    raise Exception("mission failed")
                logger.info("mission failed")
        except Exception:
            if args.on_exception == 'crash':
                raise
            logger.exception("error while generating demo #{}".format(
                len(demos)))
            continue

        if len(demos) and len(demos) % args.log_interval == 0:
            now = time.time()
            demos_per_second = args.log_interval / (now - checkpoint_time)
            to_go = (n_episodes - len(demos)) / demos_per_second
            logger.info(
                "demo #{}, {:.3f} demos per second, {:.3f} seconds to go".
                format(len(demos), demos_per_second, to_go))
            checkpoint_time = now

        # Save demonstrations

        if args.save_interval > 0 and len(
                demos) < n_episodes and len(demos) % args.save_interval == 0:
            logger.info("Saving demos...")
            utils.save_demos(demos, demos_path)
            logger.info("Demos saved")
            # print statistics for the last 100 demonstrations
            print_demo_lengths(demos[-100:])

    # Save demonstrations
    logger.info("Saving demos...")
    utils.save_demos(demos, demos_path)
    logger.info("Demos saved")
    print_demo_lengths(demos[-100:])
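
Example #5 (like Example #6 below) finishes by calling print_demo_lengths on the last 100 demos. A hypothetical sketch, assuming each demo is the 4-tuple (mission, packed_images, directions, actions) built above, so the actions list gives the episode length:

import logging
import numpy as np

logger = logging.getLogger(__name__)

def print_demo_lengths(demos):
    # Sketch: index 3 of each demo tuple is the list of recorded actions.
    lengths = [len(demo[3]) for demo in demos]
    logger.info("Demo length: {:.3f} +- {:.3f}".format(
        np.mean(lengths), np.std(lengths)))
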
Example #6
def generate_demos(n_episodes, valid, seed, shift=0):
    utils.seed(seed)

    # Generate environment
    env = gym.make(args.env)
    use_pixels = args.pixels
    if use_pixels:
        env = RGBImgPartialObsWrapper(env)

    agent = utils.load_agent(env, args.model, args.demos, 'agent', args.argmax,
                             args.env)
    demos_path = utils.get_demos_path(args.demos, args.env, 'agent', valid)
    demos = []

    checkpoint_time = time.time()

    just_crashed = False
    while True:
        if len(demos) == n_episodes:
            break

        done = False
        if just_crashed:
            logger.info(
                "reset the environment to find a mission that the bot can solve"
            )
            env.reset()
        else:
            env.seed(seed + len(demos))
        obs = env.reset()
        agent.on_reset()

        actions = []
        mission = obs["mission"]
        images = []
        directions = []

        try:
            while not done:
                action = agent.act(obs)['action']
                if isinstance(action, torch.Tensor):
                    action = action.item()
                new_obs, reward, done, _ = env.step(action)
                agent.analyze_feedback(reward, done)

                actions.append(action)
                images.append(obs['image'])
                if use_pixels:
                    directions.append(None)
                else:
                    directions.append(obs['direction'])

                obs = new_obs
            if reward > 0 and (args.filter_steps == 0
                               or len(images) <= args.filter_steps):
                demos.append((mission, blosc.pack_array(np.array(images)),
                              directions, actions))
                just_crashed = False

            if reward == 0:
                if args.on_exception == 'crash':
                    raise Exception(
                        "mission failed, the seed is {}".format(seed +
                                                                len(demos)))
                just_crashed = True
                logger.info("mission failed")
        except Exception:
            if args.on_exception == 'crash':
                raise
            just_crashed = True
            logger.exception("error while generating demo #{}".format(
                len(demos)))
            continue

        if len(demos) and len(demos) % args.log_interval == 0:
            now = time.time()
            demos_per_second = args.log_interval / (now - checkpoint_time)
            to_go = (n_episodes - len(demos)) / demos_per_second
            logger.info(
                "demo #{}, {:.3f} demos per second, {:.3f} seconds to go".
                format(len(demos) - 1, demos_per_second, to_go))
            checkpoint_time = now

        # Save demonstrations

        if args.save_interval > 0 and len(
                demos) < n_episodes and len(demos) % args.save_interval == 0:
            logger.info("Saving demos...")
            utils.save_demos(demos, demos_path)
            logger.info("{} demos saved".format(len(demos)))
            # print statistics for the last 100 demonstrations
            print_demo_lengths(demos[-100:])

    # Save demonstrations
    logger.info("Saving demos...")
    utils.save_demos(demos, demos_path)
    logger.info("{} demos saved".format(len(demos)))
    print_demo_lengths(demos[-100:])
Example #7
demos = utils.load_demos(args.env, "agent")
utils.synthesize_demos(demos)

for i in range(1, args.episodes+1):
    # Run the expert for one episode

    done = False
    obs = env.reset()
    demo = []

    while not done:
        action = agent.get_action(obs)
        new_obs, reward, done, _ = env.step(action)
        agent.analyze_feedback(reward, done)

        demo.append((obs, action, reward, done))
        obs = new_obs

    demos.append(demo)

    # Save demonstrations

    if args.save_interval > 0 and i < args.episodes and i % args.save_interval == 0:
        utils.save_demos(demos, args.env, "agent")
        utils.synthesize_demos(demos)

# Save demonstrations

utils.save_demos(demos, args.env, "agent")
utils.synthesize_demos(demos)
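
utils.synthesize_demos, called after every save in Example #7, presumably aggregates summary statistics over the collected demos. A minimal sketch under that assumption, for demos stored as lists of (obs, action, reward, done) steps as in this example:

import numpy as np

def synthesize_demos(demos):
    # Sketch: report how many demos exist and how long they are.
    if not demos:
        print("0 demos")
        return
    lengths = [len(demo) for demo in demos]
    print("{} demos; mean length {:.1f}, min {}, max {}".format(
        len(demos), np.mean(lengths), min(lengths), max(lengths)))
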
Example #8
    while True:
        jobs_done = 0
        for i in range(args.jobs):
            if job_demos[i] is None or len(job_demos[i]) < demos_per_job:
                try:
                    logger.info("Trying to load shard {}".format(i))
                    job_demos[i] = utils.load_demos(
                        utils.get_demos_path(job_demo_names[i]))
                    logger.info("{} demos ready in shard {}".format(
                        len(job_demos[i]), i))
                except Exception:
                    logger.exception("Failed to load the shard")
            if job_demos[i] and len(job_demos[i]) == demos_per_job:
                jobs_done += 1
        logger.info("{} out of {} shards done".format(jobs_done, args.jobs))
        if jobs_done == args.jobs:
            break
        logger.info("sleep for 60 seconds")
        time.sleep(60)

    # Training demos
    all_demos = []
    for demos in job_demos:
        all_demos.extend(demos)
    demos_path = utils.get_demos_path(args.demos, args.env, 'agent')
    utils.save_demos(all_demos, demos_path)

    # Validation demos
    if args.valid_episodes:
        generate_demos(args.valid_episodes, True, 0)
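
utils.get_demos_path appears with several call shapes across these examples: a single explicit name, or a name derived from the environment and origin, plus a flag for validation sets. A hypothetical reading of that contract (DEMOS_DIR and the naming scheme are assumptions, not the library's confirmed layout):

import os

DEMOS_DIR = "demos"  # hypothetical storage root

def get_demos_path(demos=None, env=None, origin=None, valid=False):
    # Sketch: an explicit name wins; otherwise derive "<env>_<origin>".
    name = demos if demos else "{}_{}".format(env, origin)
    if valid:
        name += "_valid"
    return os.path.join(DEMOS_DIR, name + ".pkl")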