Code example #1
def main():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('-e', '--env_id', required=True)
    parser.add_argument('-p', '--predictor', required=True)
    parser.add_argument('-n', '--name', required=True)
    parser.add_argument('-s', '--seed', default=1, type=int)
    parser.add_argument('-w', '--workers', default=4, type=int)
    parser.add_argument('-l', '--n_labels', default=None, type=int)
    parser.add_argument('-L', '--pretrain_labels', default=None, type=int)
    parser.add_argument('-t', '--num_timesteps', default=5e6, type=int)
    parser.add_argument('-a', '--agent', default="parallel_trpo", type=str)
    parser.add_argument('-i', '--pretrain_iters', default=10000, type=int)
    parser.add_argument('-V', '--no_videos', action="store_true")
    parser.add_argument('-x', '--human_labels', default=1000, type=int)
    args = parser.parse_args()

    print("Setting things up...")

    env_id = args.env_id
    run_name = "%s/%s-%s" % (env_id, args.name, int(time()))
    summary_writer = make_summary_writer(run_name)

    env = make_with_torque_removed(env_id)

    num_timesteps = int(args.num_timesteps)
    experiment_name = slugify(args.name)

    if args.predictor == "rl":
        predictor = TraditionalRLRewardPredictor(summary_writer)
    else:
        agent_logger = AgentLogger(summary_writer)

        # Default to a quarter of the total label budget when --pretrain_labels is not given
        # (this fallback requires --n_labels to be set).
        pretrain_labels = args.pretrain_labels if args.pretrain_labels else args.n_labels // 4

        if args.n_labels:
            label_schedule = LabelAnnealer(agent_logger,
                                           final_timesteps=num_timesteps,
                                           final_labels=args.n_labels,
                                           pretrain_labels=pretrain_labels)
        else:
            print(
                "No label limit given. We will request one label every few seconds."
            )
            label_schedule = ConstantLabelSchedule(
                pretrain_labels=pretrain_labels)

        if args.predictor == "synth":
            comparison_collector = SyntheticComparisonCollector(
                run_name, args.human_labels)

        elif args.predictor == "human":
            bucket = os.environ.get('RL_TEACHER_GCS_BUCKET')
            assert bucket and bucket.startswith(
                "gs://"
            ), "env variable RL_TEACHER_GCS_BUCKET must start with gs://"
            comparison_collector = HumanComparisonCollector(
                env_id, experiment_name=experiment_name)
        else:
            raise ValueError("Bad value for --predictor: %s" % args.predictor)

        predictor = ComparisonRewardPredictor(
            env,
            summary_writer,
            comparison_collector=comparison_collector,
            agent_logger=agent_logger,
            label_schedule=label_schedule,
        )

        print(
            "Starting random rollouts to generate pretraining segments. No learning will take place..."
        )
        pretrain_segments = segments_from_rand_rollout(
            env_id,
            make_with_torque_removed,
            n_desired_segments=pretrain_labels * 2,
            clip_length_in_seconds=CLIP_LENGTH,
            workers=args.workers)
        # Turn our random segments into comparisons
        for i in range(pretrain_labels):
            comparison_collector.add_segment_pair(
                pretrain_segments[i], pretrain_segments[i + pretrain_labels])

        # Sleep until the human has labeled most of the pretraining comparisons
        while len(comparison_collector.labeled_comparisons) < int(
                pretrain_labels * 0.75):
            comparison_collector.label_unlabeled_comparisons()
            if args.predictor == "synth":
                print("%s synthetic labels generated... " %
                      (len(comparison_collector.labeled_comparisons)))
            elif args.predictor == "human":
                print(
                    "%s/%s comparisons labeled. Please add labels w/ the human-feedback-api. Sleeping... "
                    % (len(comparison_collector.labeled_comparisons),
                       pretrain_labels))
                sleep(5)

        # Start the actual training
        for i in range(args.pretrain_iters):
            predictor.train_predictor()  # Train on pretraining labels
            if i % 100 == 0:
                print("%s/%s predictor pretraining iters... " %
                      (i, args.pretrain_iters))

    # Wrap the predictor to capture videos every so often:
    if not args.no_videos:
        predictor = SegmentVideoRecorder(predictor,
                                         env,
                                         save_dir=osp.join(
                                             '/tmp/rl_teacher_vids', run_name))

    # We use a vanilla agent from openai/baselines that contains a single change that blinds it to the true reward
    # The single changed section is in `rl_teacher/agent/trpo/core.py`
    print("Starting joint training of predictor and agent")
    if args.agent == "parallel_trpo":
        train_parallel_trpo(
            env_id=env_id,
            make_env=make_with_torque_removed,
            predictor=predictor,
            summary_writer=summary_writer,
            workers=args.workers,
            runtime=(num_timesteps / 1000),
            max_timesteps_per_episode=get_timesteps_per_episode(env),
            timesteps_per_batch=8000,
            max_kl=0.001,
            seed=args.seed,
        )
    elif args.agent == "pposgd_mpi":

        def make_env():
            return make_with_torque_removed(env_id)

        train_pposgd_mpi(make_env,
                         num_timesteps=num_timesteps,
                         seed=args.seed,
                         predictor=predictor)
    else:
        raise ValueError("%s is not a valid choice for args.agent" %
                         args.agent)
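For reference, one hypothetical way to exercise this entry point with the synthetic labeler. This is a sketch only: the script name teach.py and the flag values are assumptions, not taken from the example above.

import sys

if __name__ == '__main__':
    # Equivalent to: python teach.py -e ShortHopper-v1 -p synth -n syn-test -l 200
    sys.argv = [
        'teach.py',              # assumed script name
        '-e', 'ShortHopper-v1',  # env_id handed to make_with_torque_removed
        '-p', 'synth',           # selects SyntheticComparisonCollector (no human in the loop)
        '-n', 'syn-test',        # run name, slugified into experiment_name
        '-l', '200',             # total label budget; pretrain_labels defaults to n_labels // 4
    ]
    main()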
Code example #2
def main():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('-e', '--env_id', required=True)
    parser.add_argument('-p', '--predictor', required=True)
    parser.add_argument('-n', '--name', required=True)
    parser.add_argument('-s', '--seed', default=1, type=int)
    parser.add_argument('-w', '--workers', default=4, type=int)
    parser.add_argument('-l', '--n_labels', default=None, type=int)
    parser.add_argument('-L', '--pretrain_labels', default=None, type=int)
    parser.add_argument('-t', '--num_timesteps', default=5e6, type=int)
    parser.add_argument('-a', '--agent', default="parallel_trpo", type=str)
    parser.add_argument('-i', '--pretrain_iters', default=10000, type=int)
    parser.add_argument('-V', '--no_videos', action="store_true")
    args = parser.parse_args()

    env_id = args.env_id
    run_name = "%s/%s-%s" % (env_id, args.name, int(time()))
    summary_writer = make_summary_writer(run_name)


    num_timesteps = int(args.num_timesteps)
    experiment_name = slugify(args.name)

    ##make torcs envs
    # envs = []
    # for aidx in range(args.workers):
    #     agent = AgentTorcs2(aidx, bots=['scr_server'], track='road/g-track-1', text_mode=False, laps=3,
    #                         torcsIdxOffset=0, screen_capture=True)
    #     # agent = AgentTorcs2(aidx, bots=['scr_server', 'olethros', 'berniw', 'bt', 'damned'], track='road/g-track-1', text_mode=True)
    #     agent.reset()
    #     envs.append(agent)
    #
    #

    if args.predictor == "rl":
        predictor = TraditionalRLRewardPredictor(summary_writer)
    else:
        agent_logger = AgentLogger(summary_writer)

        if args.predictor == "synth":
            comparison_collector = SyntheticComparisonCollector()

        elif args.predictor == "human":
            # bucket = os.environ.get('RL_TEACHER_GCS_BUCKET')
            # assert bucket and bucket.startswith("gs://"), "env variable RL_TEACHER_GCS_BUCKET must start with gs://"
            comparison_collector = HumanComparisonCollector(experiment_name=experiment_name)
        else:
            raise ValueError("Bad value for --predictor: %s" % args.predictor)

        pretrain_labels = args.pretrain_labels if args.pretrain_labels else args.n_labels // 4

        if args.n_labels:
            label_schedule = LabelAnnealer(
                agent_logger,
                final_timesteps=num_timesteps,
                final_labels=args.n_labels,
                pretrain_labels=pretrain_labels)
        else:
            print("No label limit given. We will request one label every few seconds")
            label_schedule = ConstantLabelSchedule(pretrain_labels=pretrain_labels)

        # logger.info("frames = {}".format('start !!!!!!!!!!!!!'))

        print("Starting random rollouts to generate pretraining segments. No learning will take place...")
        pretrain_segments = segments_from_rand_rollout(
            n_desired_segments=pretrain_labels * 2,
            clip_length_in_seconds=CLIP_LENGTH, workers=args.workers)
        pretrain_segments.sort(key=lambda d: d['maxdistance'])
        for i in range(pretrain_labels):  # Turn our random segments into comparisons
            comparison_collector.add_segment_pair(pretrain_segments[i], pretrain_segments[i + 1])

        # Sleep until the human has labeled most of the pretraining comparisons
        while len(comparison_collector.labeled_comparisons) < int(pretrain_labels * 0.75):
            comparison_collector.label_unlabeled_comparisons()
            if args.predictor == "synth":
                print("%s synthetic labels generated... " % (len(comparison_collector.labeled_comparisons)))
            elif args.predictor == "human":
                print("%s/%s comparisons labeled. Please add labels w/ the human-feedback-api. Sleeping... " % (
                    len(comparison_collector.labeled_comparisons), pretrain_labels))
                sleep(5)

        # Start the actual training
        predictor = ComparisonRewardPredictor(
            summary_writer,
            comparison_collector=comparison_collector,
            agent_logger=agent_logger,
            label_schedule=label_schedule,
        )
        for i in range(args.pretrain_iters):
            predictor.train_predictor()  # Train on pretraining labels
            if i % 100 == 0:
                print("%s/%s predictor pretraining iters... " % (i, args.pretrain_iters))

    # Wrap the predictor to capture videos every so often:
    if not args.no_videos:
        predictor = SegmentVideoRecorder(predictor, save_dir=osp.join('/tmp/rl_teacher_vids', run_name))

    # We use a vanilla agent from openai/baselines that contains a single change that blinds it to the true reward
    # The single changed section is in `rl_teacher/agent/trpo/core.py`
    print("Starting joint training of predictor and agent")
    if args.agent == "parallel_trpo":
        train_parallel_trpo(
            predictor=predictor,
            summary_writer=summary_writer,
            workers=args.workers,
            runtime=(num_timesteps / 1000),
            max_timesteps_per_episode=10000,
            timesteps_per_batch=8000,
            max_kl=0.001,
            seed=args.seed,
        )
    elif args.agent == "pposgd_mpi":
        pass

        # train_pposgd_mpi(num_timesteps=num_timesteps, seed=args.seed, predictor=predictor)
    else:
        raise ValueError("%s is not a valid choice for args.agent" % args.agent)
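Both variants fall back to a ConstantLabelSchedule when no label budget is given ("one label every few seconds"). A minimal sketch of what such a schedule could look like follows; the n_desired_labels property name and the 3-second interval are assumptions about the interface the predictor polls, not the library's actual implementation.

from time import time

class SimpleConstantLabelSchedule(object):
    """Sketch of a constant-rate label schedule (assumed interface)."""

    def __init__(self, pretrain_labels, seconds_between_labels=3.0):
        self._start_time = time()
        self._pretrain_labels = pretrain_labels
        self._seconds_between_labels = seconds_between_labels

    @property
    def n_desired_labels(self):
        # Pretraining budget plus one extra label every few seconds of wall-clock time.
        elapsed = time() - self._start_time
        return self._pretrain_labels + int(elapsed / self._seconds_between_labels)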
Code example #3
def main():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('-e', '--env_id', default="ShortHopper-v1", type=str)
    parser.add_argument('-p', '--predictor', default="human", type=str)
    parser.add_argument('-n', '--name', default="human-175-hopper", type=str)
    parser.add_argument('-s', '--seed', default=6, type=int)
    parser.add_argument('-w', '--workers', default=4, type=int)
    parser.add_argument('-l', '--n_labels', default=None, type=int)
    parser.add_argument('-L', '--pretrain_labels', default=20, type=int)
    parser.add_argument('-t', '--num_timesteps', default=5e6, type=int)
    parser.add_argument('-a', '--agent', default="pposgd_mpi", type=str)
    parser.add_argument('-i', '--pretrain_iters', default=1, type=int)
    parser.add_argument('-V', '--no_videos', action="store_true")
    parser.add_argument('--log_path',
                        help='Directory to save learning curve data.',
                        default='tmp/openaiTest',
                        type=str)
    args = parser.parse_args()

    print("Setting things up...")

    env_id = args.env_id
    run_name = "%s/%s-%s" % (env_id, args.name, int(time()))
    summary_writer = make_summary_writer(run_name)

    env = make_with_torque_removed(env_id)

    num_timesteps = int(args.num_timesteps)
    experiment_name = slugify(args.name)

    if args.predictor == "rl":
        predictor = TraditionalRLRewardPredictor(summary_writer)
    else:
        agent_logger = AgentLogger(summary_writer)

        pretrain_labels = args.pretrain_labels if args.pretrain_labels else args.n_labels // 4
        #online and offline
        if args.n_labels:
            label_schedule = LabelAnnealer(agent_logger,
                                           final_timesteps=num_timesteps,
                                           final_labels=args.n_labels,
                                           pretrain_labels=pretrain_labels)
        else:
            print(
                "No label limit given. We will request one label every few seconds."
            )
            label_schedule = ConstantLabelSchedule(
                pretrain_labels=pretrain_labels)

        if args.predictor == "synth":
            comparison_collector = SyntheticComparisonCollector()

        elif args.predictor == "human":
            bucket = os.environ.get('RL_TEACHER_GCS_BUCKET')
            bucket = "gs://rl-teacher-preference"
            #assert bucket and bucket.startswith("gs://"), "env variable RL_TEACHER_GCS_BUCKET must start with gs://"
            comparison_collector = HumanComparisonCollector(
                env_id, experiment_name=experiment_name)
        else:
            raise ValueError("Bad value for --predictor: %s" % args.predictor)

        predictor = ComparisonRewardPredictor(
            env,
            summary_writer,
            comparison_collector=comparison_collector,
            agent_logger=agent_logger,
            label_schedule=label_schedule,
        )

        # print("Starting random rollouts to generate pretraining segments. No learning will take place...")
        # pretrain_segments = segments_from_rand_rollout(
        #     env_id, make_with_torque_removed, n_desired_segments=pretrain_labels * 2,
        #     clip_length_in_seconds=CLIP_LENGTH, workers=args.workers)
        # for i in range(pretrain_labels):  # Turn our random segments into comparisons
        #     comparison_collector.add_segment_pair(pretrain_segments[i], pretrain_segments[i + pretrain_labels])
        #
        # # Sleep until the human has labeled most of the pretraining comparisons
        # while len(comparison_collector.labeled_comparisons) < int(pretrain_labels * 0.75):
        #     comparison_collector.label_unlabeled_comparisons()
        #     if args.predictor == "synth":
        #         print("%s synthetic labels generated... " % (len(comparison_collector.labeled_comparisons)))
        #     elif args.predictor == "human":
        #         print("%s/%s comparisons labeled. Please add labels w/ the human-feedback-api. Sleeping... " % (
        #             len(comparison_collector.labeled_comparisons), pretrain_labels))
        #         sleep(5)
        #
        # # Start the actual training
        #
        # for i in range(args.pretrain_iters):
        #     predictor.train_predictor()  # Train on pretraining labels
        #     if i % 10 == 0:
        #         print("%s/%s predictor pretraining iters... " % (i, args.pretrain_iters))
        #saver = tf.train.Saver(max_to_keep=5)
        #save_path = saver.save(sess, "/tmp/GAN/GAN_preference_based_model.ckpt")
        #print("Model saved in path: %s" % save_path)

    # Wrap the predictor to capture videos every so often:
    if not args.no_videos:
        predictor = SegmentVideoRecorder(predictor,
                                         env,
                                         save_dir=osp.join(
                                             '/tmp/rl_teacher_vids', run_name))

    # We use a vanilla agent from openai/baselines that contains a single change that blinds it to the true reward
    # The single changed section is in `rl_teacher/agent/trpo/core.py`
    print("Starting joint training of predictor and agent")
    if args.agent == "parallel_trpo":
        train_parallel_trpo(
            env_id=env_id,
            make_env=make_with_torque_removed,
            predictor=predictor,
            summary_writer=summary_writer,
            workers=args.workers,
            runtime=(num_timesteps / 1000),
            max_timesteps_per_episode=get_timesteps_per_episode(env),
            timesteps_per_batch=8000,
            max_kl=0.001,
            seed=args.seed,
        )
    elif args.agent == "pposgd_mpi":

        def make_env():
            return make_with_torque_removed(env_id)

        try:
            from mpi4py import MPI
        except ImportError:
            MPI = None

        def configure_logger(log_path, **kwargs):
            if log_path is not None:
                logger.configure(log_path)
            else:
                logger.configure(**kwargs)

        if MPI is None or MPI.COMM_WORLD.Get_rank() == 0:
            rank = 0
            configure_logger(args.log_path)
        else:
            rank = MPI.COMM_WORLD.Get_rank()
            configure_logger(args.log_path, format_strs=[])

        train_pposgd_mpi(make_env,
                         num_timesteps=num_timesteps,
                         seed=args.seed,
                         predictor=predictor)
    else:
        raise ValueError("%s is not a valid choice for args.agent" %
                         args.agent)
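In the examples above the predictor is wrapped in a SegmentVideoRecorder before being handed to the agent, so the wrapper has to expose the same interface as the predictor it decorates. A rough sketch of that decorator idea follows; the predict_reward and path_callback method names and the checkpoint interval are assumptions, not the library's actual code.

class PassthroughVideoRecorder(object):
    """Sketch of a predictor decorator: forward calls, occasionally save a clip."""

    def __init__(self, predictor, env, save_dir, checkpoint_interval=100):
        self.predictor = predictor
        self.env = env
        self.save_dir = save_dir
        self.checkpoint_interval = checkpoint_interval
        self._paths_seen = 0

    def predict_reward(self, path):
        # Delegate reward prediction untouched.
        return self.predictor.predict_reward(path)

    def path_callback(self, path):
        self._paths_seen += 1
        if self._paths_seen % self.checkpoint_interval == 0:
            pass  # render `path` with self.env and write a video under self.save_dir
        self.predictor.path_callback(path)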
Code example #4
    def __init__(self, sess, env, brain_name, trainer_parameters, training, seed,
                 num_timesteps, num_labels, pretrain_labels):
        """
        Responsible for collecting experiences and training PPO model.
        :param sess: Tensorflow session.
        :param env: The UnityEnvironment.
        :param  trainer_parameters: The parameters for the trainer (dictionary).
        :param training: Whether the trainer is set for training.
        :param num_timesteps: timesteps collect segment.
        :param num_labels: .
        """
        self.param_keys = ['batch_size', 'beta', 'buffer_size', 'epsilon', 'gamma', 'hidden_units', 'lambd',
                           'learning_rate', 'max_steps', 'normalize', 'num_epoch', 'num_layers',
                           'time_horizon', 'sequence_length', 'summary_freq', 'use_recurrent',
                           'graph_scope', 'summary_path', 'memory_size', 'use_curiosity', 'curiosity_strength',
                           'curiosity_enc_size']

        for k in self.param_keys:
            if k not in trainer_parameters:
                raise UnityTrainerException("The hyperparameter {0} could not be found for the PPO trainer of "
                                            "brain {1}.".format(k, brain_name))

        super(PPOTrainer, self).__init__(sess, env, brain_name, trainer_parameters, training)

        self.use_recurrent = trainer_parameters["use_recurrent"]
        self.use_curiosity = bool(trainer_parameters['use_curiosity'])
        self.sequence_length = 1
        self.step = 0
        self.has_updated = False
        self.m_size = None
        if self.use_recurrent:
            self.m_size = trainer_parameters["memory_size"]
            self.sequence_length = trainer_parameters["sequence_length"]
            if self.m_size == 0:
                raise UnityTrainerException("The memory size for brain {0} is 0 even though the trainer uses recurrent."
                                            .format(brain_name))
            elif self.m_size % 4 != 0:
                raise UnityTrainerException("The memory size for brain {0} is {1} but it must be divisible by 4."
                                            .format(brain_name, self.m_size))

        self.variable_scope = trainer_parameters['graph_scope']
        with tf.variable_scope(self.variable_scope):
            tf.set_random_seed(seed)
            self.model = PPOModel(env.brains[brain_name],
                                  lr=float(trainer_parameters['learning_rate']),
                                  h_size=int(trainer_parameters['hidden_units']),
                                  epsilon=float(trainer_parameters['epsilon']),
                                  beta=float(trainer_parameters['beta']),
                                  max_step=float(trainer_parameters['max_steps']),
                                  normalize=trainer_parameters['normalize'],
                                  use_recurrent=trainer_parameters['use_recurrent'],
                                  num_layers=int(trainer_parameters['num_layers']),
                                  m_size=self.m_size,
                                  use_curiosity=bool(trainer_parameters['use_curiosity']),
                                  curiosity_strength=float(trainer_parameters['curiosity_strength']),
                                  curiosity_enc_size=float(trainer_parameters['curiosity_enc_size']))

        stats = {'cumulative_reward': [], 'episode_length': [], 'value_estimate': [],
                 'entropy': [], 'value_loss': [], 'policy_loss': [], 'learning_rate': []}
        if self.use_curiosity:
            stats['forward_loss'] = []
            stats['inverse_loss'] = []
            stats['intrinsic_reward'] = []
            self.intrinsic_rewards = {}
        self.stats = stats

        self.training_buffer = Buffer()
        self.cumulative_rewards = {}
        self.episode_steps = {}
        self.is_continuous_action = (env.brains[brain_name].vector_action_space_type == "continuous")
        self.is_continuous_observation = (env.brains[brain_name].vector_observation_space_type == "continuous")
        self.use_visual_obs = (env.brains[brain_name].number_visual_observations > 0)
        self.use_vector_obs = (env.brains[brain_name].vector_observation_space_size > 0)
        self.summary_path = trainer_parameters['summary_path']
        if not os.path.exists(self.summary_path):
            os.makedirs(self.summary_path)

        self.summary_writer = tf.summary.FileWriter(self.summary_path)

        self.inference_run_list = [self.model.output, self.model.all_probs, self.model.value,
                                   self.model.entropy, self.model.learning_rate]
        if self.is_continuous_action:
            self.inference_run_list.append(self.model.output_pre)
        if self.use_recurrent:
            self.inference_run_list.extend([self.model.memory_out])
        if (self.is_training and self.is_continuous_observation and
                self.use_vector_obs and self.trainer_parameters['normalize']):
            self.inference_run_list.extend([self.model.update_mean, self.model.update_variance])

        label_schedule = LabelAnnealer(
            logger,
            final_timesteps=num_timesteps,
            final_labels=num_labels,
            pretrain_labels=pretrain_labels)

        comparison_collector = HumanComparisonCollector(experiment_name=brain_name)
        
        self.predictor = ComparisonRewardPredictor(
            self.brain,
            self.summary_writer,
            comparison_collector=comparison_collector,
            agent_logger=logger,
            label_schedule=label_schedule,
            clip_length=CLIP_LENGTH)
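Once the predictor is trained, the point of the whole setup is that the agent never sees the environment reward: each rollout is re-labeled with the learned reward before the policy update. A hedged sketch of that step is below; the predict_reward signature and the path dictionary keys are assumptions.

def relabel_path_with_predicted_reward(predictor, path):
    """Replace the true rewards in a rollout `path` with the predictor's estimates."""
    predicted = predictor.predict_reward(path)    # assumed: per-timestep reward estimates
    path['original_rewards'] = path['rewards']    # keep the env reward for logging only
    path['rewards'] = predicted
    return path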