def run_baselines(env, seed, log_dir):
    """Create the baselines model and train it.

    Replace trpo_mpi and its training call with the algorithm you want to run.

    :param env: Environment of the task.
    :param seed: Random seed for the trial.
    :param log_dir: Log dir path.
    :return: Path to the output csv file.
    """
    with tf.compat.v1.Session().as_default():
        baselines_logger.configure(log_dir)

        def policy_fn(name, ob_space, ac_space):
            return MlpPolicy(name=name,
                             ob_space=ob_space,
                             ac_space=ac_space,
                             hid_size=32,
                             num_hid_layers=2)

        trpo_mpi.learn(env,
                       policy_fn,
                       timesteps_per_batch=1024,
                       max_kl=0.01,
                       cg_iters=10,
                       cg_damping=0.1,
                       max_timesteps=int(1e6),
                       gamma=0.99,
                       lam=0.98,
                       vf_iters=5,
                       vf_stepsize=1e-3)
        env.close()

    return osp.join(log_dir, 'progress.csv')
def train(env_id, num_frames, seed):
    from baselines.trpo_mpi.nosharing_cnn_policy import CnnPolicy
    from baselines.trpo_mpi import trpo_mpi
    import baselines.common.tf_util as U
    rank = MPI.COMM_WORLD.Get_rank()
    sess = U.single_threaded_session()
    sess.__enter__()
    if rank != 0:
        logger.set_level(logger.DISABLED)
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)
    env = gym.make(env_id)

    def policy_fn(name, ob_space, ac_space):  # pylint: disable=W0613
        return CnnPolicy(name=name, ob_space=env.observation_space,
                         ac_space=env.action_space)

    env = bench.Monitor(env, logger.get_dir() and
                        osp.join(logger.get_dir(), "%i.monitor.json" % rank))
    env.seed(workerseed)
    gym.logger.setLevel(logging.WARN)
    env = wrap_train(env)
    num_timesteps = int(num_frames / 4 * 1.1)
    env.seed(workerseed)
    trpo_mpi.learn(env, policy_fn, timesteps_per_batch=512, max_kl=0.001, cg_iters=10,
                   cg_damping=1e-3, max_timesteps=num_timesteps, gamma=0.98, lam=1.0,
                   vf_iters=3, vf_stepsize=1e-4, entcoeff=0.00)
    env.close()
def train(env_id, num_timesteps, seed):
    from baselines.trpo_mpi.nosharing_cnn_policy import CnnPolicy
    from baselines.trpo_mpi import trpo_mpi
    import baselines.common.tf_util as U
    rank = MPI.COMM_WORLD.Get_rank()
    sess = U.single_threaded_session()
    sess.__enter__()
    if rank == 0:
        logger.configure()
    else:
        logger.configure(format_strs=[])
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)
    env = make_atari(env_id)

    def policy_fn(name, ob_space, ac_space):  # pylint: disable=W0613
        return CnnPolicy(name=name, ob_space=env.observation_space,
                         ac_space=env.action_space)

    env = bench.Monitor(env, logger.get_dir() and
                        osp.join(logger.get_dir(), str(rank)))
    env.seed(workerseed)
    env = wrap_deepmind(env)
    env.seed(workerseed)
    trpo_mpi.learn(env, policy_fn, timesteps_per_batch=512, max_kl=0.001, cg_iters=10,
                   cg_damping=1e-3, max_timesteps=int(num_timesteps * 1.1),
                   gamma=0.98, lam=1.0, vf_iters=3, vf_stepsize=1e-4, entcoeff=0.00)
    env.close()
def train(env_id, num_timesteps, seed):
    import baselines.common.tf_util as U
    sess = U.single_threaded_session()
    sess.__enter__()
    rank = MPI.COMM_WORLD.Get_rank()
    if rank == 0:
        logger.configure()
    else:
        logger.configure(format_strs=[])
        logger.set_level(logger.DISABLED)
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()

    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                         hid_size=32, num_hid_layers=2)

    env = make_mujoco_env(env_id, workerseed)
    trpo_mpi.learn(env, policy_fn, timesteps_per_batch=1024, max_kl=0.01, cg_iters=10,
                   cg_damping=0.1, max_timesteps=num_timesteps, gamma=0.99, lam=0.98,
                   vf_iters=5, vf_stepsize=1e-3)
    env.close()
def train(env, seed, policy_entcoeff, num_timesteps, num_iters, checkpoint_dir, gamma,
          task_name=None):
    from baselines.trpo_mpi import trpo_mpi

    # Set up the per-worker MPI seed
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)
    env.seed(workerseed)
    # NOTE: `args` is assumed to be a module-level argparse namespace.
    trpo_mpi.learn(network=args.network, env=env,
                   total_timesteps=num_timesteps,
                   ent_coef=policy_entcoeff,
                   max_iters=num_iters,
                   ckpt_dir=checkpoint_dir,
                   timesteps_per_batch=args.batchsize,
                   max_kl=args.max_kl,
                   cg_iters=args.cg_iters,
                   cg_damping=args.cg_damping,
                   gamma=gamma,
                   lam=0.97,
                   vf_iters=args.vf_iters,
                   vf_stepsize=args.vf_stepsize,
                   task_name=task_name,
                   num_layers=args.policy_hidden_layer,
                   num_hidden=args.policy_hidden_size)
def train(num_timesteps):
    env = GRID(grid_size=36, square_size=4, stochastic=True)
    import baselines.common.tf_util as U
    sess = U.single_threaded_session()
    sess.__enter__()
    rank = MPI.COMM_WORLD.Get_rank()
    if rank == 0:
        logger.configure()
    else:
        logger.configure(format_strs=[])
        logger.set_level(logger.DISABLED)

    def policy_fn(name, ob_space, ac_space):
        return CnnPolicy(name=name, ob_space=env.observation_space,
                         ac_space=env.action_space)

    trpo_mpi.learn(env, policy_fn, timesteps_per_batch=1024, max_kl=0.01, cg_iters=10,
                   cg_damping=0.1, max_timesteps=num_timesteps, gamma=0.99, lam=0.98,
                   vf_iters=5, vf_stepsize=1e-3)
    env.close()
def train(env_id, num_timesteps, seed, dropout_on_V, dropout_tau_v, lengthscale_V,
          V_keep_prob, mc_samples, override_reg, optim_stepsize, vf_hid_size,
          activation_vf, sample_dropout):
    from baselines.ppo1 import mlp_policy
    from baselines.trpo_mpi import trpo_mpi
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(seed)
    env = gym.make(env_id)
    # dropout_on_V decides whether we apply dropout to the value function
    pol_tau = 1.

    def policy_fn(name, ob_space, ac_space):
        # Main changes: dropout-enabled value function
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    hid_size_V=vf_hid_size, hid_size_actor=64,
                                    num_hid_layers=2, V_keep_prob=V_keep_prob,
                                    mc_samples=mc_samples, layer_norm=False,
                                    activation_critic=activation_vf,
                                    activation_actor=tf.nn.relu,
                                    dropout_on_V=dropout_on_V,
                                    sample_dropout=sample_dropout)

    env = bench.Monitor(env, logger.get_dir() and
                        osp.join(logger.get_dir(), "monitor.json"))
    env.seed(seed)
    gym.logger.setLevel(logging.WARN)
    trpo_mpi.learn(env, policy_fn, timesteps_per_batch=1024, max_kl=0.01, cg_iters=10,
                   cg_damping=0.1, max_timesteps=num_timesteps, gamma=0.99, lam=0.98,
                   vf_iters=5, vf_stepsize=optim_stepsize,
                   # Main changes: dropout parameters for the value function
                   dropout_on_V=dropout_on_V, dropout_tau_V=dropout_tau_v,
                   override_reg=override_reg)
    env.close()
def train_trpo(env_id, num_timesteps, seed):
    import baselines.common.tf_util as U
    sess = U.single_threaded_session()
    sess.__enter__()
    rank = MPI.COMM_WORLD.Get_rank()
    if rank == 0:
        logger.configure()
    else:
        logger.configure(format_strs=[])
        logger.set_level(logger.DISABLED)
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()

    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                         hid_size=32, num_hid_layers=2)

    set_global_seeds(workerseed)
    env = gym.make(env_id)
    env.seed(workerseed)
    # timesteps_per_batch = 1024
    timesteps_per_batch = 2048
    # trpo_mpi.learn(network='mlp', env=env, total_timesteps=num_timesteps,
    #                timesteps_per_batch=timesteps_per_batch, max_kl=0.01, cg_iters=10,
    #                cg_damping=0.1, gamma=0.99, lam=0.98, vf_iters=5, vf_stepsize=1e-3,
    #                seed=workerseed, num_layers=2, num_hidden=32)
    trpo_mpi.learn(network='mlp', env=env, seed=workerseed, total_timesteps=num_timesteps)
    env.close()
def train(env_id, num_timesteps, seed, flight_log_dir, ckpt_dir, model_ckpt_path):
    import baselines.common.tf_util as U
    sess = U.single_threaded_session()
    sess.__enter__()
    rank = MPI.COMM_WORLD.Get_rank()
    if rank == 0:
        logger.configure()
    else:
        logger.configure(format_strs=[])
        logger.set_level(logger.DISABLED)
    workerseed = seed + 1000000 * rank

    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                         hid_size=32, num_hid_layers=2)

    flight_log = FlightLog(flight_log_dir)
    env = gym.make(env_id)
    env.seed(workerseed)
    set_global_seeds(workerseed)
    trpo_mpi.learn(env, policy_fn, timesteps_per_batch=1024, max_kl=0.01, cg_iters=10,
                   cg_damping=0.1, max_timesteps=num_timesteps, gamma=0.99, lam=0.98,
                   vf_iters=5, vf_stepsize=1e-3,
                   flight_log=flight_log,
                   ckpt_dir=ckpt_dir,
                   model_ckpt_path=model_ckpt_path)
    env.close()
def train(env_id, rank, environment_args, stacked_obs, num_hidden_units, max_iters,
          checkpoint_dir, log_dir, timesteps_per_batch, render, seed):
    sess = U.single_threaded_session()
    sess.__enter__()
    if rank == 0:
        logger.configure()
    if rank != 0:
        logger.set_level(logger.DISABLED)
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)
    env = gym.make(env_id)
    if environment_args is not None:
        try:
            env.unwrapped.set_environment_config(environment_args)
        except Exception:
            print("Can't set the configuration on the environment!")
        if rank == 0:
            with open(osp.join(checkpoint_dir, "args.txt"), "a") as f:
                f.write("\nEnvironment arguments:\n")
                for k, v in env.unwrapped._config.items():
                    f.write("{}: {}\n".format(k, v))

    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name, ob_space=env.observation_space,
                         ac_space=env.action_space,
                         hid_size=num_hidden_units, num_hid_layers=2)

    env = bench.Monitor(env, logger.get_dir() and
                        osp.join(logger.get_dir(), str(rank)),
                        allow_early_resets=True)
    # Support stacked observation frames
    env = FrameStack_Mujoco(env, stacked_obs)
    env.seed(workerseed)
    gym.logger.setLevel(logging.WARN)
    trpo_mpi.learn(env, policy_fn, checkpoint_dir, log_dir, render=render,
                   timesteps_per_batch=timesteps_per_batch, max_kl=0.01, cg_iters=10,
                   cg_damping=0.1, max_iters=max_iters, gamma=0.99, lam=0.98,
                   vf_iters=5, vf_stepsize=1e-3)
    env.close()
def run_baselines(env, seed, log_dir): """Create Baseline model and training. Args: env (dict): Environment of the task. seed (int): Random positive integer for the trial. log_dir (str): Log dir path. Returns: str: Path to output csv file """ ncpu = max(multiprocessing.cpu_count() // 2, 1) config = tf.ConfigProto(allow_soft_placement=True, intra_op_parallelism_threads=ncpu, inter_op_parallelism_threads=ncpu) tf.compat.v1.Session(config=config).__enter__() # Set up logger for baselines configure(dir=log_dir, format_strs=['stdout', 'log', 'csv', 'tensorboard']) baselines_logger.info('rank {}: seed={}, logdir={}'.format( 0, seed, baselines_logger.get_dir())) set_global_seeds(seed) def policy_fn(name, ob_space, ac_space): """Create policy for baselines. Args: name (str): Policy name. ob_space (gym.spaces.Box) : Observation space. ac_space (gym.spaces.Box) : Action space. Returns: baselines.ppo1.mlp_policy: MLP policy for baselines. """ return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space, hid_size=hyper_parameters['hidden_sizes'][0], num_hid_layers=len(hyper_parameters['hidden_sizes'])) trpo_mpi.learn(env, policy_fn, timesteps_per_batch=hyper_parameters['batch_size'], max_kl=hyper_parameters['max_kl'], cg_iters=10, cg_damping=0.1, max_timesteps=(hyper_parameters['batch_size'] * hyper_parameters['n_epochs']), gamma=hyper_parameters['discount'], lam=hyper_parameters['gae_lambda'], vf_iters=5, vf_stepsize=1e-3) return osp.join(log_dir, 'progress.csv')
def main():
    # Create the Create2 mover environment
    env = Create2MoverEnv(90, port='/dev/ttyUSB0', obs_history=1, dt=0.15)
    env = NormalizedEnv(env)

    # Start environment processes
    env.start()

    # Create the baselines TRPO policy function
    sess = U.single_threaded_session()
    sess.__enter__()

    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                         hid_size=32, num_hid_layers=2)

    # Create and start the plotting process
    plot_running = Value('i', 1)
    shared_returns = Manager().dict({
        "write_lock": False,
        "episodic_returns": [],
        "episodic_lengths": [],
    })
    # Spawn the plotting process
    pp = Process(target=plot_create2_mover,
                 args=(env, 2048, shared_returns, plot_running))
    pp.start()

    # Create a callback for logging data from the baselines TRPO learn loop
    kindred_callback = create_callback(shared_returns)

    # Train baselines TRPO
    learn(env, policy_fn,
          max_timesteps=40000,
          timesteps_per_batch=2048,
          max_kl=0.05,
          cg_iters=10,
          cg_damping=0.1,
          vf_iters=5,
          vf_stepsize=0.001,
          gamma=0.995,
          lam=0.995,
          callback=kindred_callback)

    # Safely terminate the plotting process
    plot_running.value = 0  # shut down the plotting process
    time.sleep(2)
    pp.join()

    env.close()
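The `create_callback` helper above comes from the surrounding project and is not shown. A minimal sketch of what it plausibly does, assuming the baselines TRPO convention of calling `callback(locals_, globals_)` each iteration and the `seg` dict that `trpo_mpi` builds in its training loop; the details are assumptions, not the project's actual code:

# Hypothetical sketch of `create_callback`: returns a baselines-style callback
# that copies per-episode statistics from the learner's locals into the shared
# dict read by the plotting process.
def create_callback(shared_returns):
    def kindred_callback(locals_, globals_):
        seg = locals_.get('seg')  # None on the first iteration
        if seg is not None and not shared_returns['write_lock']:
            shared_returns['write_lock'] = True
            shared_returns['episodic_returns'] += list(seg['ep_rets'])
            shared_returns['episodic_lengths'] += list(seg['ep_lens'])
            shared_returns['write_lock'] = False
    return kindred_callback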
def run_baselines(env, seed, log_dir): """Create Baseline model and training. Args: env (dict): Environment of the task. seed (int): Random positive integer for the trial. log_dir (str): Log dir path. Returns: str: Path to output csv file """ ncpu = max(multiprocessing.cpu_count() // 2, 1) config = tf.compat.v1.ConfigProto(allow_soft_placement=True, intra_op_parallelism_threads=ncpu, inter_op_parallelism_threads=ncpu) tf.compat.v1.Session(config=config).__enter__() # Set up logger for baselines configure(dir=log_dir, format_strs=['stdout', 'log', 'csv', 'tensorboard']) baselines_logger.info('rank {}: seed={}, logdir={}'.format( 0, seed, baselines_logger.get_dir())) set_global_seeds(seed) policy_network = 'mlp' trpo_mpi.learn(network=policy_network, env=env, total_timesteps=hyper_parameters['batch_size'] * hyper_parameters['n_epochs'], timesteps_per_batch=hyper_parameters['batch_size'], gamma=hyper_parameters['discount'], lam=hyper_parameters['gae_lambda'], max_kl=hyper_parameters['max_kl'], cg_iters=10, cg_damping=0.1, vf_iters=5, vf_stepsize=1e-3) log_file_path = osp.join(log_dir, 'progress.csv') with open(log_file_path, 'r') as rf: reader = csv.reader(rf) columns = [[ 'Evaluation/AverageReturn' if c == 'EpRewMean' else c for c in next(reader) ] + ['Evaluation/Iteration']] new_lines = columns + [line + [i] for i, line in enumerate(reader)] with open(log_file_path, 'w') as wf: writer = csv.writer(wf, lineterminator='\n') writer.writerows(new_lines) return log_file_path
def train(env_id, num_timesteps, seed):
    import sys
    import baselines.common.tf_util as U
    sess = U.single_threaded_session()
    sess.__enter__()
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()

    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                         hid_size=32, num_hid_layers=2)

    # Create a new base directory like /tmp/openai-2018-05-21-12-27-22-552435
    log_dir = os.path.join(
        energyplus_logbase_dir(),
        datetime.datetime.now().strftime("openai-%Y-%m-%d-%H-%M-%S-%f"))
    if not os.path.exists(log_dir + '/output'):
        os.makedirs(log_dir + '/output')
    os.environ["ENERGYPLUS_LOG"] = log_dir
    model = os.getenv('ENERGYPLUS_MODEL')
    if model is None:
        print('Environment variable ENERGYPLUS_MODEL is not defined')
        sys.exit(1)
    weather = os.getenv('ENERGYPLUS_WEATHER')
    if weather is None:
        print('Environment variable ENERGYPLUS_WEATHER is not defined')
        sys.exit(1)
    rank = MPI.COMM_WORLD.Get_rank()
    if rank == 0:
        print('train: init logger with dir={}'.format(log_dir))  # XXX
        logger.configure(log_dir)
    else:
        logger.configure(format_strs=[])
        logger.set_level(logger.DISABLED)

    env = make_energyplus_env(env_id, workerseed)

    trpo_mpi.learn(
        env, policy_fn,
        max_timesteps=num_timesteps,
        # timesteps_per_batch=1 * 1024,
        timesteps_per_batch=16 * 1024,
        max_kl=0.01, cg_iters=10, cg_damping=0.1,
        gamma=0.99, lam=0.98, vf_iters=5, vf_stepsize=1e-3)
    env.close()
def run(cfg, num_timesteps, seed, hid_size, **kwargs):
    dir_path = os.path.dirname(os.path.realpath(__file__))
    logger.configure(dir_path, ['stdout'])
    sess = U.single_threaded_session()
    sess.__enter__()
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)
    env = GRLEnv(cfg)
    env.set_test(False)

    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name, ob_space=env.observation_space,
                         ac_space=env.action_space,
                         hid_size=hid_size, num_hid_layers=2)

    env = MyMonitor(env, osp.join(logger.get_dir(), kwargs['output']), report='learn')
    env.seed(workerseed)
    gym.logger.setLevel(logging.WARN)
    if kwargs['evaluation']:
        trpo_mpi.play(sess, env, policy_fn, timesteps_per_batch=1024,
                      load_file=kwargs['load_file'])
    else:
        trpo_mpi.learn(sess, env, policy_fn, timesteps_per_batch=1024, max_kl=0.01,
                       cg_iters=10, cg_damping=0.1, max_timesteps=num_timesteps,
                       gamma=0.99, lam=0.98, vf_iters=5, vf_stepsize=1e-3, **kwargs)
    env.close()
def train(env_id, seed):
    from baselines.trpo_mpi.nosharing_cnn_policy import CnnPolicy
    from baselines.trpo_mpi import trpo_mpi
    import baselines.common.tf_util as U
    rank = MPI.COMM_WORLD.Get_rank()
    sess = U.single_threaded_session()
    sess.__enter__()
    if rank != 0:
        logger.set_level(logger.DISABLED)
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)
    env = gym.make(env_id)

    def policy_fn(name, ob_space, ac_space):  # pylint: disable=W0613
        return CnnPolicy(name=name, ob_space=env.observation_space,
                         ac_space=env.action_space)

    env = bench.Monitor(
        env,
        logger.get_dir() and osp.join(logger.get_dir(), "%i.monitor.json" % rank))
    env.seed(workerseed)
    gym.logger.setLevel(logging.WARN)
    env = wrap_train(env)
    env.seed(workerseed)
    # NOTE: `args` is assumed to be a module-level argparse namespace.
    task_name = "trpo." + args.env.split("-")[0] + "." + ("%.2f" % args.entcoeff)
    args.checkpoint_dir = osp.join(args.checkpoint_dir, task_name)
    trpo_mpi.learn(env, policy_fn, timesteps_per_batch=512, max_kl=0.001, cg_iters=10,
                   cg_damping=1e-3, max_timesteps=args.num_timesteps,
                   gamma=0.98, lam=1.0, vf_iters=3, vf_stepsize=1e-4,
                   entcoeff=args.entcoeff,
                   sample_stochastic=args.sample_stochastic,
                   task_name=task_name,
                   save_per_iter=args.save_per_iter,
                   ckpt_dir=args.checkpoint_dir,
                   load_model_path=args.load_model_path,
                   task=args.task)
    env.close()
def train(env_id, num_timesteps, seed):
    import baselines.common.tf_util as U
    # sess = U.single_threaded_session()
    # sess.__enter__()
    gpu_options = tf.GPUOptions(allow_growth=False,
                                per_process_gpu_memory_fraction=0.2)
    tf_config = tf.ConfigProto(gpu_options=gpu_options)
    sess = tf.Session(config=tf_config)
    sess.__enter__()
    rank = MPI.COMM_WORLD.Get_rank()
    if rank == 0:
        logger.configure()
    else:
        logger.configure(format_strs=[])
        logger.set_level(logger.DISABLED)
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()

    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                         hid_size=32, num_hid_layers=2)

    # env = make_mujoco_env(env_id, workerseed)
    env = normalize(InvertedDoublePendulumEnv(), normalize_obs=False)
    env_t = normalize(InvertedDoublePendulumEnv(), normalize_obs=False)
    # The tester must be constructed before it is passed to learn().
    time_step_holder = TimeStepHolder(0, 0)
    tester = Tester(episodes=100, period=10, env=env_t,
                    time_step_holder=time_step_holder,
                    file='./results', session=sess)
    trpo_mpi.learn(env, policy_fn, timesteps_per_batch=1024, max_kl=0.01, cg_iters=10,
                   cg_damping=0.1, max_timesteps=num_timesteps, gamma=0.99, lam=0.98,
                   vf_iters=5, vf_stepsize=1e-3, tester=tester)
    env.close()
def train_trpo(env_id, num_timesteps, seed):
    import baselines.common.tf_util as U
    sess = U.single_threaded_session()
    sess.__enter__()
    rank = MPI.COMM_WORLD.Get_rank()
    if rank == 0:
        logger.configure()
    else:
        logger.configure(format_strs=[])
        logger.set_level(logger.DISABLED)
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()

    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                         hid_size=32, num_hid_layers=2)

    set_global_seeds(workerseed)
    env = HistoryEnv(env_id, hist_len=1, history_type='fully_observable',
                     kwargs={'board_size': 5, 'num_rocks': 7})
    # env = HistoryEnv(env_id, hist_len=15)
    print("ob_space: " + str(env.observation_space))
    print("ac_space: " + str(env.action_space))
    env.seed(workerseed)
    # timesteps_per_batch = 1024
    # timesteps_per_batch = 2048
    timesteps_per_batch = 5000
    trpo_mpi.learn(env, policy_fn, timesteps_per_batch=timesteps_per_batch, max_kl=0.01,
                   cg_iters=10, cg_damping=0.1, max_timesteps=num_timesteps,
                   gamma=0.99, lam=0.98, vf_iters=5, vf_stepsize=1e-3)
    env.close()
def train(env_id, num_timesteps, hidden_size, num_hidden_layers, seed, rank):
    with U.make_session(3) as sess:
        worker_seed = seed + 10000 * rank
        set_global_seeds(worker_seed)
        # env = bench.Monitor(env, logger.get_dir() and
        #                     osp.join(logger.get_dir(), str(rank)))
        try:
            env = gym.make(env_id)
            env.seed(worker_seed)

            # Rendering and saving callback
            episode = 0

            def episode_callback(locals_, globals_):
                nonlocal episode
                episode += 1
                print("----- Episode {} -----".format(episode))
                env.render()
                if episode % 20 == 0:
                    save(sess)

            # Policy function
            policy_fn = lambda name, ob_space, ac_space: MlpPolicy(
                name=name,
                ob_space=env.observation_space,
                ac_space=env.action_space,
                hid_size=hidden_size,
                num_hid_layers=num_hidden_layers)

            # Learning
            trpo_mpi.learn(
                env, policy_fn,
                timesteps_per_batch=1024,
                max_kl=0.01, cg_iters=10, cg_damping=0.1,
                max_timesteps=num_timesteps,
                gamma=0.99, lam=0.98,
                vf_iters=5, vf_stepsize=1e-3,
                callback=episode_callback)
        finally:
            env.close()
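The `save` helper invoked every 20 episodes above is defined elsewhere in the original project. A minimal sketch of what it might look like, assuming a plain TF1 `Saver` and a hypothetical checkpoint path:

# Hypothetical `save` helper assumed by the callback above; the original
# project's version is not shown. Writes the session's variables to an
# illustrative checkpoint path.
import tensorflow as tf

_saver = None

def save(sess, path='./checkpoints/trpo'):
    global _saver
    if _saver is None:
        _saver = tf.train.Saver(max_to_keep=5)
    _saver.save(sess, path)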
def main():
    # Unpause the simulation so that the robot receives data on all topics
    gazebo_connection.GazeboConnection().unpauseSim()
    # Create the node
    rospy.init_node('pickbot_gym', anonymous=True, log_level=rospy.FATAL)

    env = make_vec_env(env_id, env_type, num_env, seed,
                       wrapper_kwargs=Monitor,
                       start_index=0,
                       reward_scale=1.0,
                       flatten_dict_observations=True,
                       gamestate=None)

    act = trpo_mpi.learn(env=env, network='mlp', total_timesteps=0,
                         load_path=modelsdir + "model")

    obs, done = env.reset(), False
    episode_rew = 0
    while True:
        obs, rew, done, _ = env.step(act.step(obs)[0])
        episode_rew += rew[0] if isinstance(env, VecEnv) else rew
        done = done.any() if isinstance(done, np.ndarray) else done
        if done:
            print('episode_rew={}'.format(episode_rew))
            episode_rew = 0
            obs = env.reset()
def main():
    args = parse_args()

    format_strs = ['log', 'csv', 'stdout']
    if args.tensorboard:
        format_strs.append('tensorboard')

    config = parse_config(args.config)
    outdir = os.path.join(args.outdir,
                          os.path.splitext(os.path.basename(args.config))[0])
    logger.configure(dir=outdir, format_strs=format_strs)

    env_type, env_id = get_env_type(GAME_ENVIRONMENT)
    env = make_vec_env(env_id, env_type, 1, args.seed)

    model = trpo_mpi.learn(env=env,
                           network=NETWORK_ARCHITECTURE,
                           total_timesteps=args.total_timesteps,
                           **config)
    env.close()

    if args.save:
        model.save(os.path.join(outdir, 'model'))
def train(env_id, num_timesteps, seed): """ Train TRPO model for the atari environment, for testing purposes :param env_id: (str) Environment ID :param num_timesteps: (int) The total number of samples :param seed: (int) The initial seed for training """ rank = MPI.COMM_WORLD.Get_rank() if rank == 0: logger.configure() else: logger.configure(format_strs=[]) workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank() set_global_seeds(workerseed) env = make_atari(env_id) def policy_fn(name, ob_space, ac_space, sess=None, placeholders=None): # pylint: disable=W0613 return CnnPolicy(name=name, ob_space=ob_space, ac_space=ac_space, sess=sess, placeholders=placeholders) env = bench.Monitor( env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank))) env.seed(workerseed) env = wrap_deepmind(env) env.seed(workerseed) trpo_mpi.learn(env, policy_fn, timesteps_per_batch=512, max_kl=0.001, cg_iters=10, cg_damping=1e-3, max_timesteps=int(num_timesteps * 1.1), gamma=0.98, lam=1.0, vf_iters=3, vf_stepsize=1e-4, entcoeff=0.00) env.close()
def train(args):
    import baselines.common.tf_util as U
    sess = U.single_threaded_session()
    sess.__enter__()
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)
    workerseed = args.seed + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)
    env = gym.make(args.env_id)

    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name, ob_space=env.observation_space,
                         ac_space=env.action_space,
                         hid_size=32, num_hid_layers=2)

    env = bench.Monitor(
        env,
        logger.get_dir() and osp.join(logger.get_dir(), "%i.monitor.json" % rank))
    env.seed(workerseed)
    gym.logger.setLevel(logging.WARN)

    task_name = "trpo." + args.env_id.split("-")[0] + "." + ("%.2f" % args.entcoeff)
    args.checkpoint_dir = osp.join(args.checkpoint_dir, task_name)

    trpo_mpi.learn(env, policy_fn, timesteps_per_batch=1024, max_kl=0.01, cg_iters=10,
                   cg_damping=0.1, max_timesteps=args.num_timesteps,
                   gamma=0.99, lam=0.98, vf_iters=5, vf_stepsize=1e-3,
                   sample_stochastic=args.sample_stochastic,
                   task_name=task_name,
                   save_per_iter=args.save_per_iter,
                   ckpt_dir=args.checkpoint_dir,
                   load_model_path=args.load_model_path)
    env.close()
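This GAIL-style snippet (and the similar `train(env_id, seed)` one above) reads flags from an argparse namespace. A hypothetical parser covering the attributes the snippet touches; defaults are illustrative, not the project's:

# Hypothetical argument parser; names match the attributes referenced above
# (args.env_id, args.seed, args.entcoeff, ...), values are placeholders.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--env_id', default='Hopper-v2')
parser.add_argument('--seed', type=int, default=0)
parser.add_argument('--num_timesteps', type=int, default=int(1e6))
parser.add_argument('--entcoeff', type=float, default=0.0)
parser.add_argument('--sample_stochastic', action='store_true')
parser.add_argument('--save_per_iter', type=int, default=50)
parser.add_argument('--checkpoint_dir', default='checkpoint')
parser.add_argument('--load_model_path', default=None)
args = parser.parse_args()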
def train(env_id, num_timesteps, task, seed):
    U.make_session(num_cpu=1).__enter__()

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    hid_size=64, num_hid_layers=2)

    logger.configure()
    if task == "grid":
        env = bench.Monitor(gym.make("target-v0"), logger.get_dir())
    elif task == "dynamics":
        env = bench.Monitor(gym.make("target-dynamics-v0"), logger.get_dir())
    else:
        raise ValueError("task should be either grid or dynamics; instead " +
                         task + " was given.")

    trpo_mpi.learn(
        env, policy_fn,
        max_timesteps=num_timesteps,
        gamma=1.0,
        timesteps_per_batch=5000,
        max_kl=0.01,
        entcoeff=0.02,
        vf_iters=5,
        vf_stepsize=1e-3,
        lam=0.95,
        cg_iters=10,
        cg_damping=0.1,
    )

    out_dir = logger.get_dir()
    with open("scripts/data/data_directories.txt", 'a') as file:
        file.write(out_dir + "\n")
    with open(out_dir + "/metadata.txt", 'w') as file:
        file.write(task + "," + str(num_timesteps))
    # Plotting needs at least 99 episodes: see rolling_window (line 20) in
    # results_plotter.py from OpenAI baselines. With too few samples, the
    # window dimension becomes negative and numpy crashes.
    results_plotter_terminal.plot_results([out_dir], num_timesteps,
                                          results_plotter_terminal.X_TIMESTEPS,
                                          "Target Set with Dynamics")
    env.close()
def train(env_id, num_timesteps, seed, outdir):
    import baselines.common.tf_util as U
    sess = U.single_threaded_session()
    sess.__enter__()
    logdir = os.path.join(outdir, env_id)
    os.makedirs(logdir, exist_ok=True)
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)
    logger.configure(logdir)
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)
    env = gym.make(env_id)

    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name, ob_space=env.observation_space,
                         ac_space=env.action_space,
                         hid_size=32, num_hid_layers=2)

    env = bench.Monitor(
        env,
        logger.get_dir() and osp.join(logger.get_dir(), "%i.monitor.json" % rank))
    env.seed(workerseed)
    gym.logger.setLevel(logging.WARN)
    trpo_mpi.learn(env, policy_fn, timesteps_per_batch=1024, max_kl=0.01, cg_iters=10,
                   cg_damping=0.1, max_timesteps=num_timesteps, gamma=0.99, lam=0.98,
                   vf_iters=5, vf_stepsize=1e-3)
    env.close()
def train(env_id, num_timesteps, seed):
    import baselines.common.tf_util as U
    sess = U.single_threaded_session()
    sess.__enter__()
    rank = MPI.COMM_WORLD.Get_rank()
    if rank == 0:
        logger.configure()
    else:
        logger.configure(format_strs=[])
        logger.set_level(logger.DISABLED)
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()

    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                         hid_size=32, num_hid_layers=2)

    env = make_mujoco_env(env_id, workerseed)
    trpo_mpi.learn(env, policy_fn, timesteps_per_batch=1024, max_kl=0.01, cg_iters=10,
                   cg_damping=0.1, max_timesteps=num_timesteps, gamma=0.99, lam=0.98,
                   vf_iters=5, vf_stepsize=1e-3)
    env.close()
def train(env_id, num_timesteps, seed):
    import baselines.common.tf_util as U
    sess = U.single_threaded_session()
    sess.__enter__()
    rank = MPI.COMM_WORLD.Get_rank()  # for parallel execution, each process has a rank
    if rank == 0:
        logger.configure()
    else:
        logger.configure(format_strs=[])
        logger.set_level(logger.DISABLED)
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()

    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                         hid_size=32, num_hid_layers=2)

    # Create the environment.
    env = gym.make(env_id)
    env = bench.Monitor(env, logger.get_dir() and
                        os.path.join(logger.get_dir(), str(rank)))
    gym.logger.setLevel(logging.WARN)
    set_global_seeds(workerseed)
    env.seed(workerseed)
    trpo_mpi.learn(env, policy_fn, timesteps_per_batch=1024, max_kl=0.01, cg_iters=10,
                   cg_damping=0.1, max_timesteps=num_timesteps, gamma=0.99, lam=0.98,
                   vf_iters=5, vf_stepsize=1e-3)
    env.close()
def train_trpo(env_id, num_timesteps, seed, hist_len, block_high, nsteps, hid_size,
               give_state):
    import baselines.common.tf_util as U
    sess = U.single_threaded_session()
    sess.__enter__()
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()

    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                         hid_size=hid_size, num_hid_layers=2)

    set_global_seeds(workerseed)
    env = make_control_env(env_id, workerseed, hist_len=hist_len,
                           block_high=block_high, not_guided=True,
                           give_state=give_state)
    env.seed(workerseed)
    timesteps_per_batch = nsteps
    trpo_mpi.learn(env, policy_fn, timesteps_per_batch=timesteps_per_batch, max_kl=0.01,
                   cg_iters=10, cg_damping=0.1, max_timesteps=num_timesteps,
                   gamma=0.99, lam=0.98, vf_iters=5, vf_stepsize=1e-3)
    env.close()
def train(env_id, num_timesteps, seed):
    import baselines.common.tf_util as U
    sess = U.single_threaded_session()
    sess.__enter__()
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)
    env = gym.make(env_id)

    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name, ob_space=env.observation_space,
                         ac_space=env.action_space,
                         hid_size=32, num_hid_layers=2)

    env = bench.Monitor(env, logger.get_dir() and
                        osp.join(logger.get_dir(), "%i.monitor.json" % rank))
    env.seed(workerseed)
    gym.logger.setLevel(logging.WARN)
    trpo_mpi.learn(env, policy_fn, timesteps_per_batch=1024, max_kl=0.01, cg_iters=10,
                   cg_damping=0.1, max_timesteps=num_timesteps, gamma=0.99, lam=0.98,
                   vf_iters=5, vf_stepsize=1e-3)
    env.close()
def train(env_id, num_timesteps, seed): """ Train TRPO model for the mujoco environment, for testing purposes :param env_id: (str) Environment ID :param num_timesteps: (int) The total number of samples :param seed: (int) The initial seed for training """ with tf_util.single_threaded_session(): rank = MPI.COMM_WORLD.Get_rank() if rank == 0: logger.configure() else: logger.configure(format_strs=[]) logger.set_level(logger.DISABLED) workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank() def policy_fn(name, ob_space, ac_space, sess=None, placeholders=None): return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space, hid_size=32, num_hid_layers=2, sess=sess, placeholders=placeholders) env = make_mujoco_env(env_id, workerseed) trpo_mpi.learn(env, policy_fn, timesteps_per_batch=1024, max_kl=0.01, cg_iters=10, cg_damping=0.1, max_timesteps=num_timesteps, gamma=0.99, lam=0.98, vf_iters=5, vf_stepsize=1e-3) env.close()
def trpo_baselines(log_dir, env_id, seed):
    """Create the baselines model and train it.

    Args:
        log_dir (str): Experiment log directory.
        env_id (str): Environment id of the task.
        seed (int): Random positive integer for the trial.
    """
    # Set up the TF session
    ncpu = max(multiprocessing.cpu_count() // 2, 1)
    config = tf.compat.v1.ConfigProto(allow_soft_placement=True,
                                      intra_op_parallelism_threads=ncpu,
                                      inter_op_parallelism_threads=ncpu)
    tf.compat.v1.Session(config=config).__enter__()

    # Set up logger for baselines
    configure(dir=log_dir, format_strs=['stdout', 'log', 'csv', 'tensorboard'])
    baselines_logger.info('rank {}: seed={}, logdir={}'.format(
        0, seed, baselines_logger.get_dir()))

    set_global_seeds(seed)

    env = AutoStopEnv(env_name=env_id, max_path_length=100)

    trpo_mpi.learn(network='mlp',
                   env=env,
                   total_timesteps=hyper_parameters['batch_size'] *
                   hyper_parameters['n_epochs'],
                   timesteps_per_batch=hyper_parameters['batch_size'],
                   gamma=hyper_parameters['discount'],
                   lam=hyper_parameters['gae_lambda'],
                   max_kl=hyper_parameters['max_kl'],
                   cg_iters=10,
                   cg_damping=0.1,
                   vf_iters=5,
                   vf_stepsize=1e-3)
def main():
    num_env = 1
    env_id = "CartPole-v0"
    env_type = "classic_control"
    seed = None
    env = make_vec_env(env_id, env_type, num_env, seed,
                       wrapper_kwargs=None,
                       start_index=0,
                       reward_scale=1.0,
                       flatten_dict_observations=True,
                       gamestate=None)
    act = trpo_mpi.learn(env=env, network='mlp', total_timesteps=500000)
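The returned `act` object can be persisted and reloaded, matching the `model.save(...)` and `load_path=` usage in the earlier vec-env snippets. A minimal sketch, with an illustrative './model' path:

# Minimal sketch: persist the trained policy, then reload it without training.
act.save('./model')  # as in the main() that calls model.save(...)
act = trpo_mpi.learn(env=env, network='mlp',
                     total_timesteps=0,     # skip training
                     load_path='./model')   # as in the pickbot snippet above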
args = parser.parse_args()

sess = U.single_threaded_session()
sess.__enter__()

rank = MPI.COMM_WORLD.Get_rank()
if rank != 0:
    logger.set_level(logger.DISABLED)
workerseed = args.seed + 10000 * MPI.COMM_WORLD.Get_rank()
set_global_seeds(workerseed)

# Create the environment
env = gym.make(str(args.environment))
# initial_observation = env.reset()

def policy_fn(name, ob_space, ac_space):
    return MlpPolicy(name=name, ob_space=env.observation_space,
                     ac_space=env.action_space,
                     hid_size=32, num_hid_layers=2)

# env = bench.Monitor(env, logger.get_dir() and
#                     osp.join(logger.get_dir(), str(rank)))
env.seed(workerseed)
# gym.logger.setLevel(logging.WARN)

# The single-threaded session entered above is already the default session,
# so no second tf.Session() is opened here.
trpo_mpi.learn(env, policy_fn, timesteps_per_batch=1024, max_kl=0.01, cg_iters=10,
               cg_damping=0.1, max_timesteps=args.num_timesteps,
               gamma=0.99, lam=0.98, vf_iters=5, vf_stepsize=1e-3,
               save_model_with_prefix="",
               outdir="/tmp/experiments/" + str(args.environment) + "/TRPO/")