Ejemplo n.º 1
0
def test_render_videos():
    """Write every segment returned by segments_from_rand_rollout to an mp4.

    A single segment is requested for the "Hopper-v1" environment; each
    returned segment is rendered to ``test-<index>.mp4`` inside
    ``TEST_RENDER_DIR`` using an environment built by
    ``make_with_torque_removed``.
    """
    hopper_id = "Hopper-v1"
    env = make_with_torque_removed(hopper_id)

    rollout_options = dict(n_desired_segments=1,
                           clip_length_in_seconds=CLIP_LENGTH)
    clips = segments_from_rand_rollout(hopper_id, make_with_torque_removed,
                                       **rollout_options)

    for clip_number, clip in enumerate(clips):
        video_name = 'test-%s.mp4' % clip_number
        local_path = osp.join(TEST_RENDER_DIR, video_name)
        print("Writing segment to: %s" % local_path)
        write_segment_to_video(clip, fname=local_path, env=env)
Ejemplo n.º 2
0
def test_render_videos():
    """Collect segments during a short TRPO run and render each one to an mp4.

    Runs ``rl_teacher.agent.trpo.run_trpo_mujoco.train`` for 8000 timesteps
    with ``random_rollout=True`` and a ``RandomRolloutSegmentCollector`` as
    the predictor, then writes every segment held by the collector to a
    uniquely named file under ``/tmp/rl_teacher_media_test``.
    """
    env = make_with_torque_removed("Hopper-v1")
    collector = RandomRolloutSegmentCollector(20000, env=env)

    trpo_settings = dict(
        num_timesteps=8000,
        env=env,
        seed=0,
        predictor=collector,
        random_rollout=True,
    )
    rl_teacher.agent.trpo.run_trpo_mujoco.train(**trpo_settings)

    tmp_media_dir = '/tmp/rl_teacher_media_test'
    for clip in collector.segments:
        # A random UUID keeps file names unique across segments and runs.
        video_name = str(uuid.uuid4()) + '.mp4'
        local_path = osp.join(tmp_media_dir, video_name)
        print("Writing segment to: %s" % local_path)
        write_segment_to_video(clip, fname=local_path, env=env)
Ejemplo n.º 3
0
def _write_and_upload_video(env_id, gcs_path, local_path, segment):
    """Render ``segment`` to ``local_path``, then upload that file to ``gcs_path``.

    The environment handed to the video writer is built from ``env_id`` with
    ``make_with_torque_removed``.
    """
    write_segment_to_video(
        segment,
        fname=local_path,
        env=make_with_torque_removed(env_id),
    )
    upload_to_gcs(local_path, gcs_path)
Ejemplo n.º 4
0
 def make_env():
     """Build an environment for the ``env_id`` visible in the surrounding scope."""
     fresh_env = make_with_torque_removed(env_id)
     return fresh_env
Ejemplo n.º 5
0
def main():
    """Command-line entry point: build a reward predictor and train an agent with it.

    Parses the CLI flags, creates the summary writer and environment, then:

    * ``--predictor rl`` uses ``TraditionalRLRewardPredictor`` directly;
    * ``--predictor synth`` / ``human`` builds a ``ComparisonRewardPredictor``,
      turns random-rollout segments into ``pretrain_labels`` comparison pairs,
      waits until at least 75% of them are labeled, and then runs
      ``--pretrain_iters`` predictor training steps.

    Unless ``--no_videos`` is given the predictor is wrapped in a
    ``SegmentVideoRecorder``; it is finally handed to the trainer selected by
    ``--agent`` (``parallel_trpo`` or ``pposgd_mpi``).

    Raises:
        ValueError: if ``--predictor`` or ``--agent`` has an unsupported value.
        AssertionError: if ``--predictor human`` is used and the
            ``RL_TEACHER_GCS_BUCKET`` environment variable is missing or does
            not start with ``gs://``.
        SystemExit: via ``parser.error`` when a comparison-based predictor is
            requested with neither ``--pretrain_labels`` nor ``--n_labels``.
    """
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('-e', '--env_id', required=True)
    parser.add_argument('-p', '--predictor', required=True)
    parser.add_argument('-n', '--name', required=True)
    parser.add_argument('-s', '--seed', default=1, type=int)
    parser.add_argument('-w', '--workers', default=4, type=int)
    parser.add_argument('-l', '--n_labels', default=None, type=int)
    parser.add_argument('-L', '--pretrain_labels', default=None, type=int)
    # argparse only applies `type` to string defaults, so the default stays
    # the float 5e6 until the int() conversion below.
    parser.add_argument('-t', '--num_timesteps', default=5e6, type=int)
    parser.add_argument('-a', '--agent', default="parallel_trpo", type=str)
    parser.add_argument('-i', '--pretrain_iters', default=10000, type=int)
    parser.add_argument('-V', '--no_videos', action="store_true")
    parser.add_argument('-x', '--human_labels', default=1000, type=int)
    args = parser.parse_args()

    print("Setting things up...")

    env_id = args.env_id
    run_name = "%s/%s-%s" % (env_id, args.name, int(time()))
    summary_writer = make_summary_writer(run_name)

    env = make_with_torque_removed(env_id)

    num_timesteps = int(args.num_timesteps)
    experiment_name = slugify(args.name)

    if args.predictor == "rl":
        predictor = TraditionalRLRewardPredictor(summary_writer)
    else:
        agent_logger = AgentLogger(summary_writer)

        # Default the number of pretraining labels to a quarter of the label
        # budget. Previously, omitting both flags evaluated `None // 4` and
        # died with an opaque TypeError; fail with a usable message instead.
        if args.pretrain_labels:
            pretrain_labels = args.pretrain_labels
        elif args.n_labels is not None:
            pretrain_labels = args.n_labels // 4
        else:
            parser.error(
                "--pretrain_labels is required when --n_labels is not given "
                "(it otherwise defaults to n_labels // 4)")

        if args.n_labels:
            label_schedule = LabelAnnealer(agent_logger,
                                           final_timesteps=num_timesteps,
                                           final_labels=args.n_labels,
                                           pretrain_labels=pretrain_labels)
        else:
            print(
                "No label limit given. We will request one label every few seconds."
            )
            label_schedule = ConstantLabelSchedule(
                pretrain_labels=pretrain_labels)

        if args.predictor == "synth":
            comparison_collector = SyntheticComparisonCollector(
                run_name, args.human_labels)

        elif args.predictor == "human":
            bucket = os.environ.get('RL_TEACHER_GCS_BUCKET')
            assert bucket and bucket.startswith(
                "gs://"
            ), "env variable RL_TEACHER_GCS_BUCKET must start with gs://"
            comparison_collector = HumanComparisonCollector(
                env_id, experiment_name=experiment_name)
        else:
            raise ValueError("Bad value for --predictor: %s" % args.predictor)

        predictor = ComparisonRewardPredictor(
            env,
            summary_writer,
            comparison_collector=comparison_collector,
            agent_logger=agent_logger,
            label_schedule=label_schedule,
        )

        print(
            "Starting random rollouts to generate pretraining segments. No learning will take place..."
        )
        # Two segments are needed per comparison, hence twice the label count.
        pretrain_segments = segments_from_rand_rollout(
            env_id,
            make_with_torque_removed,
            n_desired_segments=pretrain_labels * 2,
            clip_length_in_seconds=CLIP_LENGTH,
            workers=args.workers)
        # Turn our random segments into comparisons: segment i is paired with
        # segment i + pretrain_labels.
        for i in range(pretrain_labels):
            comparison_collector.add_segment_pair(
                pretrain_segments[i], pretrain_segments[i + pretrain_labels])

        # Sleep until the human has labeled most of the pretraining comparisons
        while len(comparison_collector.labeled_comparisons) < int(
                pretrain_labels * 0.75):
            comparison_collector.label_unlabeled_comparisons()
            if args.predictor == "synth":
                print("%s synthetic labels generated... " %
                      (len(comparison_collector.labeled_comparisons)))
            elif args.predictor == "human":
                print(
                    "%s/%s comparisons labeled. Please add labels w/ the human-feedback-api. Sleeping... "
                    % (len(comparison_collector.labeled_comparisons),
                       pretrain_labels))
                sleep(5)

        # Start the actual training
        for i in range(args.pretrain_iters):
            predictor.train_predictor()  # Train on pretraining labels
            if i % 100 == 0:
                print("%s/%s predictor pretraining iters... " %
                      (i, args.pretrain_iters))

    # Wrap the predictor to capture videos every so often:
    if not args.no_videos:
        predictor = SegmentVideoRecorder(predictor,
                                         env,
                                         save_dir=osp.join(
                                             '/tmp/rl_teacher_vids', run_name))

    # We use a vanilla agent from openai/baselines that contains a single change that blinds it to the true reward
    # The single changed section is in `rl_teacher/agent/trpo/core.py`
    print("Starting joint training of predictor and agent")
    if args.agent == "parallel_trpo":
        train_parallel_trpo(
            env_id=env_id,
            make_env=make_with_torque_removed,
            predictor=predictor,
            summary_writer=summary_writer,
            workers=args.workers,
            runtime=(num_timesteps / 1000),
            max_timesteps_per_episode=get_timesteps_per_episode(env),
            timesteps_per_batch=8000,
            max_kl=0.001,
            seed=args.seed,
        )
    elif args.agent == "pposgd_mpi":

        def make_env():
            return make_with_torque_removed(env_id)

        train_pposgd_mpi(make_env,
                         num_timesteps=num_timesteps,
                         seed=args.seed,
                         predictor=predictor)
    else:
        raise ValueError("%s is not a valid choice for args.agent" %
                         args.agent)
Ejemplo n.º 6
0
 def env_fn():
     """Create an environment for the surrounding ``env_id`` and seed it with ``seed``."""
     # Imported lazily, inside the factory, exactly as the original did.
     from rl_teacher.envs import make_with_torque_removed as build_env
     seeded_env = build_env(env_id)
     seeded_env.seed(seed)
     return seeded_env
def main():
    """Command-line entry point: build a reward predictor and train an agent with it.

    Parses the CLI flags (all optional in this variant), creates the summary
    writer and environment, and builds either a ``TraditionalRLRewardPredictor``
    (``--predictor rl``) or a ``ComparisonRewardPredictor`` backed by a
    synthetic or human comparison collector. Unless ``--no_videos`` is given
    the predictor is wrapped in a ``SegmentVideoRecorder``; it is then handed
    to the trainer selected by ``--agent``.

    The random-rollout pretraining stage is disabled in this variant, so
    ``--pretrain_iters`` is parsed but not used.

    Raises:
        ValueError: if ``--predictor`` or ``--agent`` has an unsupported value.
        SystemExit: via ``parser.error`` when a comparison-based predictor is
            requested with a zero ``--pretrain_labels`` and no ``--n_labels``.
    """
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('-e', '--env_id', default="ShortHopper-v1", type=str)
    parser.add_argument('-p', '--predictor', default="human", type=str)
    parser.add_argument('-n', '--name', default="human-175-hopper", type=str)
    parser.add_argument('-s', '--seed', default=6, type=int)
    parser.add_argument('-w', '--workers', default=4, type=int)
    parser.add_argument('-l', '--n_labels', default=None, type=int)
    parser.add_argument('-L', '--pretrain_labels', default=20, type=int)
    # argparse only applies `type` to string defaults, so the default stays
    # the float 5e6 until the int() conversion below.
    parser.add_argument('-t', '--num_timesteps', default=5e6, type=int)
    parser.add_argument('-a', '--agent', default="pposgd_mpi", type=str)
    parser.add_argument('-i', '--pretrain_iters', default=1, type=int)
    parser.add_argument('-V', '--no_videos', action="store_true")
    parser.add_argument('--log_path',
                        help='Directory to save learning curve data.',
                        default='tmp/openaiTest',
                        type=str)
    args = parser.parse_args()

    print("Setting things up...")

    env_id = args.env_id
    run_name = "%s/%s-%s" % (env_id, args.name, int(time()))
    summary_writer = make_summary_writer(run_name)

    env = make_with_torque_removed(env_id)

    num_timesteps = int(args.num_timesteps)
    experiment_name = slugify(args.name)

    if args.predictor == "rl":
        predictor = TraditionalRLRewardPredictor(summary_writer)
    else:
        agent_logger = AgentLogger(summary_writer)

        # Fall back to a quarter of the label budget when --pretrain_labels is
        # zero. Previously `-L 0` without `-l` evaluated `None // 4` and died
        # with an opaque TypeError; fail with a usable message instead.
        if args.pretrain_labels:
            pretrain_labels = args.pretrain_labels
        elif args.n_labels is not None:
            pretrain_labels = args.n_labels // 4
        else:
            parser.error(
                "--pretrain_labels must be non-zero when --n_labels is not "
                "given (it otherwise defaults to n_labels // 4)")

        # online and offline
        if args.n_labels:
            label_schedule = LabelAnnealer(agent_logger,
                                           final_timesteps=num_timesteps,
                                           final_labels=args.n_labels,
                                           pretrain_labels=pretrain_labels)
        else:
            print(
                "No label limit given. We will request one label every few seconds."
            )
            label_schedule = ConstantLabelSchedule(
                pretrain_labels=pretrain_labels)

        if args.predictor == "synth":
            comparison_collector = SyntheticComparisonCollector()

        elif args.predictor == "human":
            # The RL_TEACHER_GCS_BUCKET validation is disabled in this variant.
            # The previous code read the variable, overwrote it with a
            # hard-coded bucket name and never used the result, so those dead
            # assignments were removed.
            comparison_collector = HumanComparisonCollector(
                env_id, experiment_name=experiment_name)
        else:
            raise ValueError("Bad value for --predictor: %s" % args.predictor)

        predictor = ComparisonRewardPredictor(
            env,
            summary_writer,
            comparison_collector=comparison_collector,
            agent_logger=agent_logger,
            label_schedule=label_schedule,
        )

        # NOTE: the pretraining stage (random-rollout segment collection,
        # waiting for labels, predictor pretraining iterations and checkpoint
        # saving) was commented out here; the predictor therefore goes
        # straight to joint training below.

    # Wrap the predictor to capture videos every so often:
    if not args.no_videos:
        predictor = SegmentVideoRecorder(predictor,
                                         env,
                                         save_dir=osp.join(
                                             '/tmp/rl_teacher_vids', run_name))

    # We use a vanilla agent from openai/baselines that contains a single change that blinds it to the true reward
    # The single changed section is in `rl_teacher/agent/trpo/core.py`
    print("Starting joint training of predictor and agent")
    if args.agent == "parallel_trpo":
        train_parallel_trpo(
            env_id=env_id,
            make_env=make_with_torque_removed,
            predictor=predictor,
            summary_writer=summary_writer,
            workers=args.workers,
            runtime=(num_timesteps / 1000),
            max_timesteps_per_episode=get_timesteps_per_episode(env),
            timesteps_per_batch=8000,
            max_kl=0.001,
            seed=args.seed,
        )
    elif args.agent == "pposgd_mpi":

        def make_env():
            return make_with_torque_removed(env_id)

        # mpi4py is optional: without it we behave like a single rank-0 process.
        try:
            from mpi4py import MPI
        except ImportError:
            MPI = None

        def configure_logger(log_path, **kwargs):
            # NOTE(review): kwargs are dropped whenever log_path is not None,
            # so with the default --log_path the format_strs=[] passed for
            # non-zero ranks has no effect -- confirm this is intended.
            if log_path is not None:
                logger.configure(log_path)
            else:
                logger.configure(**kwargs)

        # Only rank 0 gets the default log outputs; other ranks request none.
        rank = 0 if MPI is None else MPI.COMM_WORLD.Get_rank()
        if rank == 0:
            configure_logger(args.log_path)
        else:
            configure_logger(args.log_path, format_strs=[])

        train_pposgd_mpi(make_env,
                         num_timesteps=num_timesteps,
                         seed=args.seed,
                         predictor=predictor)
    else:
        raise ValueError("%s is not a valid choice for args.agent" %
                         args.agent)
Ejemplo n.º 8
0
                tb_logger.log(k, v)
            tb_logger.summary_step += 1

            stats["Time elapsed"] = "%.2f mins" % (
                (time.time() - start_time) / 60.0)
            print("\n********** Iteration {} ************".format(i))
            for k, v in stats.items():
                print(k + ": " + " " * (40 - len(k)) + str(v))
            if entropy != entropy:
                exit(-1)


# Script section: lower the root logger's threshold to DEBUG, build the
# environment and train a TRPO agent on it.
logging.getLogger().setLevel(logging.DEBUG)

# env = envs.make(args.task)
# `args` is defined outside this excerpt; its `task` attribute is used as the
# environment id.
env_id = args.task
env = make_with_torque_removed(env_id)
# env = Monitor(env, '/tmp/trpo_ilyasu')
# def capped_cubic_video_schedule(episode_id):
#     if episode_id < 1000:
#         return int(round(episode_id ** (1. / 3))) ** 3 == episode_id
#     else:
#         return episode_id % 1000 == 0

# Construct the agent on the environment and call its learn() method.
agent = TRPO(env)
agent.learn()

from sys import argv

# Echo the command line of this run; this executes only after agent.learn()
# has returned.
print('python {}'.format(' '.join(argv)))
Ejemplo n.º 9
0
def _write_and_upload_video(env_id, gcs_path, local_path, segment):
    """Render ``segment`` to a video file at ``local_path``.

    Makes sure the scratch directory ``/tmp/233`` exists, builds an
    environment from ``env_id`` with ``make_with_torque_removed`` and passes
    it to ``write_segment_to_video``.
    """
    # NOTE(review): gcs_path is not referenced in the lines visible here --
    # confirm whether an upload step is expected to follow.
    # makedirs(exist_ok=True) rather than mkdir: this function can run more
    # than once per process (and across runs), and os.mkdir raises
    # FileExistsError as soon as the directory is already there.
    os.makedirs('/tmp/233', exist_ok=True)
    env = make_with_torque_removed(env_id)
    write_segment_to_video(segment, fname=local_path, env=env)