def main():
    # TensorFlow is not fork-safe, so we must use spawn instead
    # https://github.com/tensorflow/tensorflow/issues/5448#issuecomment-258934405
    multiprocessing.set_start_method('spawn')

    parser = argparse.ArgumentParser()
    parser.add_argument('-e', '--env_id', required=True)
    parser.add_argument('-p', '--reward_model', required=True)
    parser.add_argument('-n', '--name', required=True)
    parser.add_argument('-s', '--seed', default=1, type=int)
    parser.add_argument('-w', '--workers', default=4, type=int)
    parser.add_argument('-l', '--n_labels', default=None, type=int)
    parser.add_argument('-L', '--pretrain_labels', default=None, type=int)
    parser.add_argument('-t', '--num_timesteps', default=5e6, type=int)
    parser.add_argument('-a', '--agent', default="ga3c", type=str)
    parser.add_argument('-i', '--pretrain_iters', default=5000, type=int)
    parser.add_argument('-b', '--starting_beta', default=0.1, type=float)
    parser.add_argument('-c', '--clip_length', default=1.5, type=float)
    parser.add_argument('-f', '--stacked_frames', default=4, type=int)
    parser.add_argument('-V', '--no_videos', action="store_true")
    parser.add_argument('--force_new_environment_clips', action="store_true")
    parser.add_argument('--force_new_training_labels', action="store_true")
    parser.add_argument('--force_new_reward_model', action="store_true")
    parser.add_argument('--force_new_agent_model', action="store_true")
    args = parser.parse_args()

    env_id = args.env_id
    experiment_name = slugify(args.name)

    # Potentially erase old data
    if args.force_new_environment_clips:
        existing_clips = [x for x in os.listdir('clips') if x.startswith(env_id)]
        if len(existing_clips):
            print("Found {} old clips".format(len(existing_clips)))
            print("Are you sure you want to erase them and start fresh?")
            print("Warning: This will invalidate all training labels made from these clips!")
            if input("> ").lower().startswith('y'):
                for clip in existing_clips:
                    os.remove(os.path.join('clips', clip))
                from human_feedback_api import Clip
                Clip.objects.filter(environment_id=env_id).delete()
                # Also erase all label data for this experiment
                from human_feedback_api import SortTree
                SortTree.objects.filter(experiment_name=experiment_name).delete()
                from human_feedback_api import Comparison
                Comparison.objects.filter(experiment_name=experiment_name).delete()
            else:
                print("Quitting...")
                return

    if args.force_new_training_labels:
        from human_feedback_api import SortTree
        from human_feedback_api import Comparison
        all_tree_nodes = SortTree.objects.filter(experiment_name=experiment_name)
        if all_tree_nodes:
            print("Found a sorting tree with {} nodes".format(len(all_tree_nodes)))
            print("Are you sure you want to erase all the comparison data associated with this tree?")
            if input("> ").lower().startswith('y'):
                all_tree_nodes.delete()
                Comparison.objects.filter(experiment_name=experiment_name).delete()
            else:
                print("Quitting...")
                return

    print("Setting things up...")
    run_name = "%s/%s-%s" % (env_id, experiment_name, int(time()))
    env = make_env(env_id)
    n_pretrain_labels = args.pretrain_labels if args.pretrain_labels else (args.n_labels // 4 if args.n_labels else 0)
    episode_logger = EpisodeLogger(run_name)
    schedule = make_label_schedule(n_pretrain_labels, args.n_labels, args.num_timesteps, episode_logger)

    os.makedirs('checkpoints/reward_model', exist_ok=True)
    os.makedirs('clips', exist_ok=True)

    # Make reward model
    if args.reward_model == "rl":
        reward_model = OriginalEnvironmentReward(episode_logger)
        args.pretrain_iters = 0  # Don't bother pre-training a traditional RL agent
    else:
        reward_model = OrdinalRewardModel(
            args.reward_model, env, env_id, make_env, experiment_name, episode_logger, schedule,
            n_pretrain_labels, args.clip_length, args.stacked_frames, args.workers)
        if not args.force_new_reward_model:
            reward_model.try_to_load_model_from_checkpoint()
        reward_model.train(args.pretrain_iters, report_frequency=25)
        reward_model.save_model_checkpoint()

    # Wrap the reward model to capture videos every so often:
    if not args.no_videos:
        video_path = os.path.join('/tmp/rl_teacher_vids', run_name)
        checkpoint_interval = 20 if args.agent == "ga3c" else 200
        reward_model = SegmentVideoRecorder(
            reward_model, env, save_dir=video_path, checkpoint_interval=checkpoint_interval)

    print("Starting joint training of reward model and agent")
    if args.agent == "ga3c":
        from ga3c.Server import Server as Ga3cServer
        from ga3c.Config import Config as Ga3cConfig
        Ga3cConfig.ATARI_GAME = env_id
        Ga3cConfig.MAKE_ENV_FUNCTION = make_env
        Ga3cConfig.NETWORK_NAME = experiment_name
        Ga3cConfig.SAVE_FREQUENCY = 200
        Ga3cConfig.TENSORBOARD = True
        Ga3cConfig.LOG_WRITER = episode_logger
        Ga3cConfig.AGENTS = args.workers
        Ga3cConfig.LOAD_CHECKPOINT = not args.force_new_agent_model
        Ga3cConfig.STACKED_FRAMES = args.stacked_frames
        Ga3cConfig.BETA_START = args.starting_beta
        Ga3cConfig.BETA_END = args.starting_beta * 0.1
        Ga3cServer(reward_model).main()
    elif args.agent == "parallel_trpo":
        from parallel_trpo.train import train_parallel_trpo
        train_parallel_trpo(
            env_id=env_id,
            make_env=make_env,
            stacked_frames=args.stacked_frames,
            predictor=reward_model,
            summary_writer=episode_logger,
            workers=args.workers,
            runtime=(args.num_timesteps / 1000),
            max_timesteps_per_episode=get_timesteps_per_episode(env),
            timesteps_per_batch=8000,
            max_kl=0.001,
            seed=args.seed,
        )
    elif args.agent == "pposgd_mpi":
        from pposgd_mpi.run_mujoco import train_pposgd_mpi
        train_pposgd_mpi(lambda: make_env(env_id), num_timesteps=args.num_timesteps, seed=args.seed, predictor=reward_model)
    elif args.agent == "ppo_atari":
        from pposgd_mpi.run_atari import train_atari
        # TODO: Add multi-CPU support!
        train_atari(env, num_timesteps=args.num_timesteps, seed=args.seed, num_cpu=1, predictor=reward_model)
    else:
        raise ValueError("%s is not a valid choice for args.agent" % args.agent)
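Every variant in this section calls slugify(args.name) to derive experiment_name, but none of them defines it. As a point of reference only, a minimal helper with the behaviour these call sites appear to rely on (a lower-case, filesystem- and database-safe identifier) might look like the sketch below; it is an illustration, not the project's actual implementation.

import re

def slugify(text):
    """Reduce an experiment name to lower-case letters, digits, and hyphens.

    Illustrative sketch only; the real project may normalize names differently.
    """
    text = text.strip().lower()
    text = re.sub(r'[^a-z0-9]+', '-', text)  # collapse runs of other characters into '-'
    return text.strip('-')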
def main():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('-e', '--env_id', default="ShortHopper-v1", type=str)
    parser.add_argument('-p', '--predictor', default="human", type=str)
    parser.add_argument('-n', '--name', default="human-175-hopper", type=str)
    parser.add_argument('-s', '--seed', default=6, type=int)
    parser.add_argument('-w', '--workers', default=4, type=int)
    parser.add_argument('-l', '--n_labels', default=None, type=int)
    parser.add_argument('-L', '--pretrain_labels', default=20, type=int)
    parser.add_argument('-t', '--num_timesteps', default=5e6, type=int)
    parser.add_argument('-a', '--agent', default="pposgd_mpi", type=str)
    parser.add_argument('-i', '--pretrain_iters', default=1, type=int)
    parser.add_argument('-V', '--no_videos', action="store_true")
    parser.add_argument('--log_path', help='Directory to save learning curve data.', default='tmp/openaiTest', type=str)
    args = parser.parse_args()

    print("Setting things up...")
    env_id = args.env_id
    run_name = "%s/%s-%s" % (env_id, args.name, int(time()))
    summary_writer = make_summary_writer(run_name)
    env = make_with_torque_removed(env_id)
    num_timesteps = int(args.num_timesteps)
    experiment_name = slugify(args.name)

    if args.predictor == "rl":
        predictor = TraditionalRLRewardPredictor(summary_writer)
    else:
        agent_logger = AgentLogger(summary_writer)
        pretrain_labels = args.pretrain_labels if args.pretrain_labels else args.n_labels // 4

        # Online and offline labeling
        if args.n_labels:
            label_schedule = LabelAnnealer(
                agent_logger,
                final_timesteps=num_timesteps,
                final_labels=args.n_labels,
                pretrain_labels=pretrain_labels)
        else:
            print("No label limit given. We will request one label every few seconds.")
            label_schedule = ConstantLabelSchedule(pretrain_labels=pretrain_labels)

        if args.predictor == "synth":
            comparison_collector = SyntheticComparisonCollector()
        elif args.predictor == "human":
            bucket = os.environ.get('RL_TEACHER_GCS_BUCKET')
            bucket = "gs://rl-teacher-preference"
            # assert bucket and bucket.startswith("gs://"), "env variable RL_TEACHER_GCS_BUCKET must start with gs://"
            comparison_collector = HumanComparisonCollector(env_id, experiment_name=experiment_name)
        else:
            raise ValueError("Bad value for --predictor: %s" % args.predictor)

        predictor = ComparisonRewardPredictor(
            env,
            summary_writer,
            comparison_collector=comparison_collector,
            agent_logger=agent_logger,
            label_schedule=label_schedule,
        )

        # print("Starting random rollouts to generate pretraining segments. No learning will take place...")
        # pretrain_segments = segments_from_rand_rollout(
        #     env_id, make_with_torque_removed, n_desired_segments=pretrain_labels * 2,
        #     clip_length_in_seconds=CLIP_LENGTH, workers=args.workers)
        # for i in range(pretrain_labels):  # Turn our random segments into comparisons
        #     comparison_collector.add_segment_pair(pretrain_segments[i], pretrain_segments[i + pretrain_labels])
        #
        # # Sleep until the human has labeled most of the pretraining comparisons
        # while len(comparison_collector.labeled_comparisons) < int(pretrain_labels * 0.75):
        #     comparison_collector.label_unlabeled_comparisons()
        #     if args.predictor == "synth":
        #         print("%s synthetic labels generated... " % (len(comparison_collector.labeled_comparisons)))
        #     elif args.predictor == "human":
        #         print("%s/%s comparisons labeled. Please add labels w/ the human-feedback-api. Sleeping... " % (
        #             len(comparison_collector.labeled_comparisons), pretrain_labels))
        #         sleep(5)
        #
        # # Start the actual training
        # for i in range(args.pretrain_iters):
        #     predictor.train_predictor()  # Train on pretraining labels
        #     if i % 10 == 0:
        #         print("%s/%s predictor pretraining iters... " % (i, args.pretrain_iters))

        # saver = tf.train.Saver(max_to_keep=5)
        # save_path = saver.save(sess, "/tmp/GAN/GAN_preference_based_model.ckpt")
        # print("Model saved in path: %s" % save_path)

    # Wrap the predictor to capture videos every so often:
    if not args.no_videos:
        predictor = SegmentVideoRecorder(predictor, env, save_dir=osp.join('/tmp/rl_teacher_vids', run_name))

    # We use a vanilla agent from openai/baselines that contains a single change that blinds it to the true reward.
    # The single changed section is in `rl_teacher/agent/trpo/core.py`.
    print("Starting joint training of predictor and agent")
    if args.agent == "parallel_trpo":
        train_parallel_trpo(
            env_id=env_id,
            make_env=make_with_torque_removed,
            predictor=predictor,
            summary_writer=summary_writer,
            workers=args.workers,
            runtime=(num_timesteps / 1000),
            max_timesteps_per_episode=get_timesteps_per_episode(env),
            timesteps_per_batch=8000,
            max_kl=0.001,
            seed=args.seed,
        )
    elif args.agent == "pposgd_mpi":
        def make_env():
            return make_with_torque_removed(env_id)

        try:
            from mpi4py import MPI
        except ImportError:
            MPI = None

        def configure_logger(log_path, **kwargs):
            if log_path is not None:
                logger.configure(log_path)
            else:
                logger.configure(**kwargs)

        if MPI is None or MPI.COMM_WORLD.Get_rank() == 0:
            rank = 0
            configure_logger(args.log_path)
        else:
            rank = MPI.COMM_WORLD.Get_rank()
            configure_logger(args.log_path, format_strs=[])

        train_pposgd_mpi(make_env, num_timesteps=num_timesteps, seed=args.seed, predictor=predictor)
    else:
        raise ValueError("%s is not a valid choice for args.agent" % args.agent)
def main():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('-e', '--env_id', required=True)
    parser.add_argument('-p', '--predictor', required=True)
    parser.add_argument('-n', '--name', required=True)
    parser.add_argument('-s', '--seed', default=1, type=int)
    parser.add_argument('-w', '--workers', default=4, type=int)
    parser.add_argument('-l', '--n_labels', default=None, type=int)
    parser.add_argument('-L', '--pretrain_labels', default=None, type=int)
    parser.add_argument('-t', '--num_timesteps', default=5e6, type=int)
    parser.add_argument('-a', '--agent', default="parallel_trpo", type=str)
    parser.add_argument('-i', '--pretrain_iters', default=10000, type=int)
    parser.add_argument('-V', '--no_videos', action="store_true")
    parser.add_argument('-x', '--human_labels', default=1000, type=int)
    args = parser.parse_args()

    print("Setting things up...")
    env_id = args.env_id
    run_name = "%s/%s-%s" % (env_id, args.name, int(time()))
    summary_writer = make_summary_writer(run_name)
    env = make_with_torque_removed(env_id)
    num_timesteps = int(args.num_timesteps)
    experiment_name = slugify(args.name)

    if args.predictor == "rl":
        predictor = TraditionalRLRewardPredictor(summary_writer)
    else:
        agent_logger = AgentLogger(summary_writer)
        pretrain_labels = args.pretrain_labels if args.pretrain_labels else args.n_labels // 4

        if args.n_labels:
            label_schedule = LabelAnnealer(
                agent_logger,
                final_timesteps=num_timesteps,
                final_labels=args.n_labels,
                pretrain_labels=pretrain_labels)
        else:
            print("No label limit given. We will request one label every few seconds.")
            label_schedule = ConstantLabelSchedule(pretrain_labels=pretrain_labels)

        if args.predictor == "synth":
            comparison_collector = SyntheticComparisonCollector(run_name, args.human_labels)
        elif args.predictor == "human":
            bucket = os.environ.get('RL_TEACHER_GCS_BUCKET')
            assert bucket and bucket.startswith("gs://"), "env variable RL_TEACHER_GCS_BUCKET must start with gs://"
            comparison_collector = HumanComparisonCollector(env_id, experiment_name=experiment_name)
        else:
            raise ValueError("Bad value for --predictor: %s" % args.predictor)

        predictor = ComparisonRewardPredictor(
            env,
            summary_writer,
            comparison_collector=comparison_collector,
            agent_logger=agent_logger,
            label_schedule=label_schedule,
        )

        print("Starting random rollouts to generate pretraining segments. No learning will take place...")
        pretrain_segments = segments_from_rand_rollout(
            env_id, make_with_torque_removed, n_desired_segments=pretrain_labels * 2,
            clip_length_in_seconds=CLIP_LENGTH, workers=args.workers)
        for i in range(pretrain_labels):  # Turn our random segments into comparisons
            comparison_collector.add_segment_pair(pretrain_segments[i], pretrain_segments[i + pretrain_labels])

        # Sleep until the human has labeled most of the pretraining comparisons
        while len(comparison_collector.labeled_comparisons) < int(pretrain_labels * 0.75):
            comparison_collector.label_unlabeled_comparisons()
            if args.predictor == "synth":
                print("%s synthetic labels generated... " % (len(comparison_collector.labeled_comparisons)))
            elif args.predictor == "human":
                print("%s/%s comparisons labeled. Please add labels w/ the human-feedback-api. Sleeping... " % (
                    len(comparison_collector.labeled_comparisons), pretrain_labels))
                sleep(5)

        # Start the actual training
        for i in range(args.pretrain_iters):
            predictor.train_predictor()  # Train on pretraining labels
            if i % 100 == 0:
                print("%s/%s predictor pretraining iters... " % (i, args.pretrain_iters))

    # Wrap the predictor to capture videos every so often:
    if not args.no_videos:
        predictor = SegmentVideoRecorder(predictor, env, save_dir=osp.join('/tmp/rl_teacher_vids', run_name))

    # We use a vanilla agent from openai/baselines that contains a single change that blinds it to the true reward.
    # The single changed section is in `rl_teacher/agent/trpo/core.py`.
    print("Starting joint training of predictor and agent")
    if args.agent == "parallel_trpo":
        train_parallel_trpo(
            env_id=env_id,
            make_env=make_with_torque_removed,
            predictor=predictor,
            summary_writer=summary_writer,
            workers=args.workers,
            runtime=(num_timesteps / 1000),
            max_timesteps_per_episode=get_timesteps_per_episode(env),
            timesteps_per_batch=8000,
            max_kl=0.001,
            seed=args.seed,
        )
    elif args.agent == "pposgd_mpi":
        def make_env():
            return make_with_torque_removed(env_id)

        train_pposgd_mpi(make_env, num_timesteps=num_timesteps, seed=args.seed, predictor=predictor)
    else:
        raise ValueError("%s is not a valid choice for args.agent" % args.agent)
def main():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('-e', '--env_id', required=True)
    parser.add_argument('-p', '--predictor', required=True)
    parser.add_argument('-n', '--name', required=True)
    parser.add_argument('-s', '--seed', default=1, type=int)
    parser.add_argument('-w', '--workers', default=4, type=int)
    parser.add_argument('-l', '--n_labels', default=None, type=int)
    parser.add_argument('-L', '--pretrain_labels', default=None, type=int)
    parser.add_argument('-t', '--num_timesteps', default=5e6, type=int)
    parser.add_argument('-a', '--agent', default="parallel_trpo", type=str)
    parser.add_argument('-i', '--pretrain_iters', default=10000, type=int)
    parser.add_argument('-V', '--no_videos', action="store_true")
    args = parser.parse_args()

    env_id = args.env_id
    run_name = "%s/%s-%s" % (env_id, args.name, int(time()))
    summary_writer = make_summary_writer(run_name)
    num_timesteps = int(args.num_timesteps)
    experiment_name = slugify(args.name)

    # Make TORCS envs
    # envs = []
    # for aidx in range(args.workers):
    #     agent = AgentTorcs2(aidx, bots=['scr_server'], track='road/g-track-1', text_mode=False, laps=3,
    #                         torcsIdxOffset=0, screen_capture=True)
    #     # agent = AgentTorcs2(aidx, bots=['scr_server', 'olethros', 'berniw', 'bt', 'damned'], track='road/g-track-1', text_mode=True)
    #     agent.reset()
    #     envs.append(agent)

    if args.predictor == "rl":
        predictor = TraditionalRLRewardPredictor(summary_writer)
    else:
        agent_logger = AgentLogger(summary_writer)

        if args.predictor == "synth":
            comparison_collector = SyntheticComparisonCollector()
        elif args.predictor == "human":
            # bucket = os.environ.get('RL_TEACHER_GCS_BUCKET')
            # assert bucket and bucket.startswith("gs://"), "env variable RL_TEACHER_GCS_BUCKET must start with gs://"
            comparison_collector = HumanComparisonCollector(experiment_name=experiment_name)
        else:
            raise ValueError("Bad value for --predictor: %s" % args.predictor)

        pretrain_labels = args.pretrain_labels if args.pretrain_labels else args.n_labels // 4

        if args.n_labels:
            label_schedule = LabelAnnealer(
                agent_logger,
                final_timesteps=num_timesteps,
                final_labels=args.n_labels,
                pretrain_labels=pretrain_labels)
        else:
            print("No label limit given. We will request one label every few seconds.")
            label_schedule = ConstantLabelSchedule(pretrain_labels=pretrain_labels)

        # logger.info("frames = {}".format('start !!!!!!!!!!!!!'))
        print("Starting random rollouts to generate pretraining segments. No learning will take place...")
        pretrain_segments = segments_from_rand_rollout(
            n_desired_segments=pretrain_labels * 2, clip_length_in_seconds=CLIP_LENGTH, workers=args.workers)
        pretrain_segments.sort(key=lambda d: d['maxdistance'])

        for i in range(pretrain_labels):  # Turn our random segments into comparisons of adjacent (sorted) segments
            comparison_collector.add_segment_pair(pretrain_segments[i], pretrain_segments[i + 1])

        # Sleep until the human has labeled most of the pretraining comparisons
        while len(comparison_collector.labeled_comparisons) < int(pretrain_labels * 0.75):
            comparison_collector.label_unlabeled_comparisons()
            if args.predictor == "synth":
                print("%s synthetic labels generated... " % (len(comparison_collector.labeled_comparisons)))
            elif args.predictor == "human":
                print("%s/%s comparisons labeled. Please add labels w/ the human-feedback-api. Sleeping... " % (
                    len(comparison_collector.labeled_comparisons), pretrain_labels))
                sleep(5)

        # Start the actual training
        predictor = ComparisonRewardPredictor(
            summary_writer,
            comparison_collector=comparison_collector,
            agent_logger=agent_logger,
            label_schedule=label_schedule,
        )
        for i in range(args.pretrain_iters):
            predictor.train_predictor()  # Train on pretraining labels
            if i % 100 == 0:
                print("%s/%s predictor pretraining iters... " % (i, args.pretrain_iters))

    # Wrap the predictor to capture videos every so often:
    if not args.no_videos:
        predictor = SegmentVideoRecorder(predictor, save_dir=osp.join('/tmp/rl_teacher_vids', run_name))

    # We use a vanilla agent from openai/baselines that contains a single change that blinds it to the true reward.
    # The single changed section is in `rl_teacher/agent/trpo/core.py`.
    print("Starting joint training of predictor and agent")
    if args.agent == "parallel_trpo":
        train_parallel_trpo(
            predictor=predictor,
            summary_writer=summary_writer,
            workers=args.workers,
            runtime=(num_timesteps / 1000),
            max_timesteps_per_episode=10000,
            timesteps_per_batch=8000,
            max_kl=0.001,
            seed=args.seed,
        )
    elif args.agent == "pposgd_mpi":
        pass
        # train_pposgd_mpi(num_timesteps=num_timesteps, seed=args.seed, predictor=predictor)
    else:
        raise ValueError("%s is not a valid choice for args.agent" % args.agent)