Example #1
0
    def generate_pretraining_data(self, env_id, make_env, n_pretrain_clips,
                                  clip_length, stacked_frames, workers):
        print(
            "Starting random rollouts to generate pretraining segments. No learning will take place..."
        )
        if self.clip_manager.total_number_of_clips == 0:
            # We need a valid clip for the root node of our search tree.
            # Null actions are more likely to generate a valid clip than a random clip from random actions.
            first_clip = basic_segment_from_null_action(
                env_id, make_env, clip_length, stacked_frames)
            # Add the null-action clip first, so the root is valid.
            self.clip_manager.add(
                first_clip, source="null-action", sync=True
            )  # Make synchronous to ensure this is the first clip.
            # Now add the rest

        desired_clips = n_pretrain_clips - self.clip_manager.total_number_of_clips

        # TODO sampling random rollouts
        random_clips = segments_from_rand_rollout(
            env_id,
            make_env,
            n_desired_segments=desired_clips,
            clip_length_in_seconds=clip_length,
            stacked_frames=stacked_frames,
            workers=workers)

        for clip in random_clips:
            self.clip_manager.add(clip, source="random rollout")
Example #2
0
def test_render_videos():
    env_id = "Hopper-v1"
    env = make_with_torque_removed(env_id)
    segments = segments_from_rand_rollout(env_id,
                                          make_with_torque_removed,
                                          n_desired_segments=1,
                                          clip_length_in_seconds=CLIP_LENGTH)

    for idx, segment in enumerate(segments):
        local_path = osp.join(TEST_RENDER_DIR, 'test-%s.mp4' % idx)
        print("Writing segment to: %s" % local_path)
        write_segment_to_video(segment, fname=local_path, env=env)
Example #3
0
def main():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('-e', '--env_id', required=True)
    parser.add_argument('-p', '--predictor', required=True)
    parser.add_argument('-n', '--name', required=True)
    parser.add_argument('-s', '--seed', default=1, type=int)
    parser.add_argument('-w', '--workers', default=4, type=int)
    parser.add_argument('-l', '--n_labels', default=None, type=int)
    parser.add_argument('-L', '--pretrain_labels', default=None, type=int)
    parser.add_argument('-t', '--num_timesteps', default=5e6, type=int)
    parser.add_argument('-a', '--agent', default="parallel_trpo", type=str)
    parser.add_argument('-i', '--pretrain_iters', default=10000, type=int)
    parser.add_argument('-V', '--no_videos', action="store_true")
    parser.add_argument('-x', '--human_labels', default=1000, type=int)
    args = parser.parse_args()

    print("Setting things up...")

    env_id = args.env_id
    run_name = "%s/%s-%s" % (env_id, args.name, int(time()))
    summary_writer = make_summary_writer(run_name)

    env = make_with_torque_removed(env_id)

    num_timesteps = int(args.num_timesteps)
    experiment_name = slugify(args.name)

    if args.predictor == "rl":
        predictor = TraditionalRLRewardPredictor(summary_writer)
    else:
        agent_logger = AgentLogger(summary_writer)

        pretrain_labels = args.pretrain_labels if args.pretrain_labels else args.n_labels // 4

        if args.n_labels:
            label_schedule = LabelAnnealer(agent_logger,
                                           final_timesteps=num_timesteps,
                                           final_labels=args.n_labels,
                                           pretrain_labels=pretrain_labels)
        else:
            print(
                "No label limit given. We will request one label every few seconds."
            )
            label_schedule = ConstantLabelSchedule(
                pretrain_labels=pretrain_labels)

        if args.predictor == "synth":
            comparison_collector = SyntheticComparisonCollector(
                run_name, args.human_labels)

        elif args.predictor == "human":
            bucket = os.environ.get('RL_TEACHER_GCS_BUCKET')
            assert bucket and bucket.startswith(
                "gs://"
            ), "env variable RL_TEACHER_GCS_BUCKET must start with gs://"
            comparison_collector = HumanComparisonCollector(
                env_id, experiment_name=experiment_name)
        else:
            raise ValueError("Bad value for --predictor: %s" % args.predictor)

        predictor = ComparisonRewardPredictor(
            env,
            summary_writer,
            comparison_collector=comparison_collector,
            agent_logger=agent_logger,
            label_schedule=label_schedule,
        )

        print(
            "Starting random rollouts to generate pretraining segments. No learning will take place..."
        )
        pretrain_segments = segments_from_rand_rollout(
            env_id,
            make_with_torque_removed,
            n_desired_segments=pretrain_labels * 2,
            clip_length_in_seconds=CLIP_LENGTH,
            workers=args.workers)
        for i in range(
                pretrain_labels):  # Turn our random segments into comparisons
            comparison_collector.add_segment_pair(
                pretrain_segments[i], pretrain_segments[i + pretrain_labels])

        # Sleep until the human has labeled most of the pretraining comparisons
        while len(comparison_collector.labeled_comparisons) < int(
                pretrain_labels * 0.75):
            comparison_collector.label_unlabeled_comparisons()
            if args.predictor == "synth":
                print("%s synthetic labels generated... " %
                      (len(comparison_collector.labeled_comparisons)))
            elif args.predictor == "human":
                print(
                    "%s/%s comparisons labeled. Please add labels w/ the human-feedback-api. Sleeping... "
                    % (len(comparison_collector.labeled_comparisons),
                       pretrain_labels))
                sleep(5)

        # Start the actual training
        for i in range(args.pretrain_iters):
            predictor.train_predictor()  # Train on pretraining labels
            if i % 100 == 0:
                print("%s/%s predictor pretraining iters... " %
                      (i, args.pretrain_iters))

    # Wrap the predictor to capture videos every so often:
    if not args.no_videos:
        predictor = SegmentVideoRecorder(predictor,
                                         env,
                                         save_dir=osp.join(
                                             '/tmp/rl_teacher_vids', run_name))

    # We use a vanilla agent from openai/baselines that contains a single change that blinds it to the true reward
    # The single changed section is in `rl_teacher/agent/trpo/core.py`
    print("Starting joint training of predictor and agent")
    if args.agent == "parallel_trpo":
        train_parallel_trpo(
            env_id=env_id,
            make_env=make_with_torque_removed,
            predictor=predictor,
            summary_writer=summary_writer,
            workers=args.workers,
            runtime=(num_timesteps / 1000),
            max_timesteps_per_episode=get_timesteps_per_episode(env),
            timesteps_per_batch=8000,
            max_kl=0.001,
            seed=args.seed,
        )
    elif args.agent == "pposgd_mpi":

        def make_env():
            return make_with_torque_removed(env_id)

        train_pposgd_mpi(make_env,
                         num_timesteps=num_timesteps,
                         seed=args.seed,
                         predictor=predictor)
    else:
        raise ValueError("%s is not a valid choice for args.agent" %
                         args.agent)
Example #4
0
def main():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('-e', '--env_id', required=True)
    parser.add_argument('-p', '--predictor', required=True)
    parser.add_argument('-n', '--name', required=True)
    parser.add_argument('-s', '--seed', default=1, type=int)
    parser.add_argument('-w', '--workers', default=4, type=int)
    parser.add_argument('-l', '--n_labels', default=None, type=int)
    parser.add_argument('-L', '--pretrain_labels', default=None, type=int)
    parser.add_argument('-t', '--num_timesteps', default=5e6, type=int)
    parser.add_argument('-a', '--agent', default="parallel_trpo", type=str)
    parser.add_argument('-i', '--pretrain_iters', default=10000, type=int)
    parser.add_argument('-V', '--no_videos', action="store_true")
    args = parser.parse_args()

    env_id = args.env_id
    run_name = "%s/%s-%s" % (env_id, args.name, int(time()))
    summary_writer = make_summary_writer(run_name)


    num_timesteps = int(args.num_timesteps)
    experiment_name = slugify(args.name)

    ##make torcs envs
    # envs = []
    # for aidx in range(args.workers):
    #     agent = AgentTorcs2(aidx, bots=['scr_server'], track='road/g-track-1', text_mode=False, laps=3,
    #                         torcsIdxOffset=0, screen_capture=True)
    #     # agent = AgentTorcs2(aidx, bots=['scr_server', 'olethros', 'berniw', 'bt', 'damned'], track='road/g-track-1', text_mode=True)
    #     agent.reset()
    #     envs.append(agent)
    #
    #

    if args.predictor == "rl":
        predictor = TraditionalRLRewardPredictor(summary_writer)
    else:
        agent_logger = AgentLogger(summary_writer)

        if args.predictor == "synth":
            comparison_collector = SyntheticComparisonCollector()

        elif args.predictor == "human":
            # bucket = os.environ.get('RL_TEACHER_GCS_BUCKET')
            # assert bucket and bucket.startswith("gs://"), "env variable RL_TEACHER_GCS_BUCKET must start with gs://"
            comparison_collector = HumanComparisonCollector( experiment_name=experiment_name)
        else:
            raise ValueError("Bad value for --predictor: %s" % args.predictor)

        pretrain_labels = args.pretrain_labels if args.pretrain_labels else args.n_labels // 4

        if args.n_labels:
            label_schedule = LabelAnnealer(
                agent_logger,
                final_timesteps=num_timesteps,
                final_labels=args.n_labels,
                pretrain_labels=pretrain_labels)
        else:
            print("No label limit given. We will request one label every few seconds")
            label_schedule = ConstantLabelSchedule(pretrain_labels=pretrain_labels)

        # logger.info("frames = {}".format('start !!!!!!!!!!!!!'))

        print("Starting random rollouts to generate pretraining segments. No learning will take place...")
        pretrain_segments = segments_from_rand_rollout( n_desired_segments=pretrain_labels * 2,
            clip_length_in_seconds=CLIP_LENGTH, workers=args.workers)
        pretrain_segments.sort(key= lambda d :d['maxdistance'])
        for i in range(pretrain_labels):  # Turn our random segments into comparisons
            comparison_collector.add_segment_pair(pretrain_segments[i], pretrain_segments[i + 1])

        # Sleep until the human has labeled most of the pretraining comparisons
        while len(comparison_collector.labeled_comparisons) < int(pretrain_labels * 0.75):
            comparison_collector.label_unlabeled_comparisons()
            if args.predictor == "synth":
                print("%s synthetic labels generated... " % (len(comparison_collector.labeled_comparisons)))
            elif args.predictor == "human":
                print("%s/%s comparisons labeled. Please add labels w/ the human-feedback-api. Sleeping... " % (
                    len(comparison_collector.labeled_comparisons), pretrain_labels))
                sleep(5)

        # Start the actual training
        predictor = ComparisonRewardPredictor(
            summary_writer,
            comparison_collector=comparison_collector,
            agent_logger=agent_logger,
            label_schedule=label_schedule,
        )
        for i in range(args.pretrain_iters):
            predictor.train_predictor()  # Train on pretraining labels
            if i % 100 == 0:
                print("%s/%s predictor pretraining iters... " % (i, args.pretrain_iters))

    # Wrap the predictor to capture videos every so often:
    if not args.no_videos:
        predictor = SegmentVideoRecorder(predictor, save_dir=osp.join('/tmp/rl_teacher_vids', run_name))

    # We use a vanilla agent from openai/baselines that contains a single change that blinds it to the true reward
    # The single changed section is in `rl_teacher/agent/trpo/core.py`
    print("Starting joint training of predictor and agent")
    if args.agent == "parallel_trpo":
        train_parallel_trpo(
            predictor=predictor,
            summary_writer=summary_writer,
            workers=args.workers,
            runtime=(num_timesteps / 1000),
            max_timesteps_per_episode=10000,
            timesteps_per_batch=8000,
            max_kl=0.001,
            seed=args.seed,
        )
    elif args.agent == "pposgd_mpi":
        pass

        # train_pposgd_mpi(num_timesteps=num_timesteps, seed=args.seed, predictor=predictor)
    else:
        raise ValueError("%s is not a valid choice for args.agent" % args.agent)