def train(env_id, num_timesteps, seed, total_gen):
    import mlp_policy, pposgd_simple
    U.make_session(num_cpu=1).__enter__()

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    hid_size=64, num_hid_layers=2)

    env = make_mujoco_env(env_id, seed)
    pposgd_simple.learn(
        env, policy_fn,
        max_timesteps=num_timesteps,
        timesteps_per_actorbatch=2048,
        clip_param=0.2, entcoeff=0.0,
        optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64,
        gamma=0.99, lam=0.95,
        schedule='constant',
        total_gen=total_gen,
    )
    env.close()
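# A minimal, hypothetical command-line entry point for the train() function above.
# The flag names and defaults are assumptions for illustration, not part of the
# original script.
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--env', default='Hopper-v2')            # assumed env id
    parser.add_argument('--num-timesteps', type=int, default=int(1e6))
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--total-gen', type=int, default=1)
    cli_args = parser.parse_args()
    train(cli_args.env, cli_args.num_timesteps, cli_args.seed, cli_args.total_gen)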
def train(num_timesteps, seed):
    import mlp_policy, pposgd_simple
    import baselines.common.tf_util as U

    rank = MPI.COMM_WORLD.Get_rank()
    sess = U.single_threaded_session()
    sess.__enter__()

    workerseed = seed + 10000 * rank
    set_global_seeds(workerseed)

    env = RobotPath.env(render=False, max_step=2000)

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    hid_size=512, num_hid_layers=3)

    pposgd_simple.learn(
        sess, env, policy_fn,
        max_timesteps=num_timesteps,
        timesteps_per_actorbatch=2048,
        clip_param=0.2, entcoeff=0.0,
        optim_epochs=5, optim_stepsize=3e-4, optim_batchsize=256,
        gamma=0.99, lam=0.95,
        schedule='linear')
    env.close()
def train(env_id, num_timesteps, seed):
    import mlp_policy, pposgd_simple
    U.make_session(num_cpu=1, num_gpu=0).__enter__()
    env = gym.make(env_id)

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    hid_size=128, num_hid_layers=2)

    env.seed(seed)
    pposgd_simple.learn(
        env, policy_fn,
        max_timesteps=num_timesteps,
        timesteps_per_batch=2048,
        clip_param=0.2, entcoeff=0.0,
        optim_epochs=10, optim_stepsize=1e-4, optim_batchsize=64,
        gamma=0.99, lam=0.95,
        schedule='constant',
    )
    env.close()
def train(env_id, num_timesteps, seed):
    from baselines.pposgd import mlp_policy
    import pposgd_simple

    sess = U.make_session(num_cpu=1)
    sess.__enter__()
    logger.session().__enter__()
    set_global_seeds(seed)

    env = gym.make(env_id)

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    hid_size=64, num_hid_layers=2)

    env = bench.Monitor(env, "monitor.json")
    env.seed(seed)
    gym.logger.setLevel(logging.WARN)

    pposgd_simple.learn(
        env, policy_fn,
        max_timesteps=num_timesteps,
        timesteps_per_batch=2048,
        clip_param=0.2, entcoeff=0.0,
        optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64,
        gamma=0.99, lam=0.95,
        schedule='linear')
    env.close()

    # save model
    saver = tf.train.Saver()
    saver.save(sess, "model/model.ckpt")
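# A minimal sketch (an assumption, not part of the original code) of how the
# checkpoint written above could be reloaded for evaluation. The policy graph
# must be rebuilt with the same architecture before tf.train.Saver can restore
# it; the scope name "pi" is an assumption about how pposgd_simple names its
# policy network.
def load_policy(env_id, checkpoint_path="model/model.ckpt"):
    from baselines.pposgd import mlp_policy
    sess = U.make_session(num_cpu=1)
    sess.__enter__()
    env = gym.make(env_id)
    # rebuild the policy with the same sizes used during training
    pi = mlp_policy.MlpPolicy(name="pi", ob_space=env.observation_space,
                              ac_space=env.action_space,
                              hid_size=64, num_hid_layers=2)
    tf.train.Saver().restore(sess, checkpoint_path)
    return pi, env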
def train(env_id, num_timesteps, seed):
    import mlp_policy, pposgd_simple
    U.make_session(num_cpu=1).__enter__()

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    hid_size=32, num_hid_layers=2)

    env = make_mujoco_env(env_id, seed)
    logger.log("========observation_space %s action_space %s"
               % (str(env.observation_space), str(env.action_space)))
    pposgd_simple.learn(
        env, policy_fn,
        max_timesteps=num_timesteps,
        timesteps_per_actorbatch=1024,
        clip_param=0.2, entcoeff=0.0,
        optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64,
        gamma=0.99, lam=0.95,
        schedule='linear')
    env.close()
def train(env_id, num_timesteps, seed, num_options, app, saves, wsaves, epoch, dc, method, mpath):
    # from baselines.ppo1 import mlp_policy
    import pposgd_simple
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(seed)
    env = gym.make(env_id)

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    hid_size=64, num_hid_layers=2,
                                    num_options=num_options, dc=dc)

    env = bench.Monitor(env, logger.get_dir() and osp.join(logger.get_dir(), "monitor.json"))
    env.seed(seed)
    gym.logger.setLevel(logging.WARN)

    if num_options == 1:
        optimsize = 64
    elif num_options == 2:
        optimsize = 32
    else:
        print("Only two options or primitive actions are currently supported.")
        sys.exit()

    assert method in METHODS, "Method should be one of " + str(METHODS)

    pposgd_simple.learn(
        env, policy_fn,
        max_timesteps=num_timesteps,
        # timesteps_per_batch=2048,
        timesteps_per_batch=(2048 * 5),  # enlarged from 2048 for more stable learning (2019/01/31)
        clip_param=0.2, entcoeff=0.0,
        optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=optimsize,
        gamma=0.99, lam=0.95,
        schedule='constant',
        num_options=num_options,
        app=app, saves=saves, wsaves=wsaves,
        epoch=epoch, seed=seed, dc=dc,
        method=method, mpath=mpath)
    env.close()
def train():
    env = RLCube()
    num_timesteps = 10000
    timesteps_per_actorbatch = 1000

    import mlp_policy, pposgd_simple
    U.make_session(num_cpu=1).__enter__()

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    hid_size=64, num_hid_layers=2)

    pi = pposgd_simple.learn(
        env, policy_fn,
        max_timesteps=num_timesteps,
        timesteps_per_actorbatch=timesteps_per_actorbatch,
        clip_param=0.2, entcoeff=0.0,
        optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64,
        gamma=0.99, lam=0.95,
        schedule='linear',
    )
    # return the trained policy so it can be evaluated after training
    return pi
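# A short usage sketch (assumed, not from the original code): roll the policy
# returned by train() through one RLCube episode. It assumes the standard
# baselines-style policy interface pi.act(stochastic, ob) -> (action, value).
def rollout(pi, env, max_steps=1000):
    ob = env.reset()
    total_reward = 0.0
    for _ in range(max_steps):
        ac, _vpred = pi.act(False, ob)  # deterministic action for evaluation
        ob, rew, done, _info = env.step(ac)
        total_reward += rew
        if done:
            break
    return total_reward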
def train(num_timesteps, seed, resume):
    map_folder = "."
    map_name = "empty"
    map_ = Map2D(map_folder, map_name)
    print("Map '{}' loaded.".format(map_name))

    # RL multi-agent simulator
    import pposgd_simple
    import cnn_policy
    import baselines.common.tf_util as U

    rank = MPI.COMM_WORLD.Get_rank()
    sess = U.single_threaded_session()
    sess.__enter__()
    if rank == 0:
        logger.configure()
    else:
        logger.configure(format_strs=[])

    env = PepperRLEnv(args)
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank() if seed is not None else None
    set_global_seeds(workerseed)

    def policy_fn(name, ob_space, ac_space):  # pylint: disable=W0613
        return cnn_policy.CnnPolicy(name=name, ob_space=ob_space, ac_space=ac_space)

    pposgd_simple.learn(
        env, policy_fn,
        max_timesteps=int(num_timesteps * 1.1),
        timesteps_per_actorbatch=256,
        clip_param=0.2, entcoeff=0.01,
        optim_epochs=4,
        optim_stepsize=1e-3,  # original 1e-3
        optim_batchsize=64,
        gamma=0.99, lam=0.95,
        schedule='linear',
        resume_training=resume,
    )
    env.close()
def train(env, num_timesteps, seed):
    rank = MPI.COMM_WORLD.Get_rank()
    sess = U.single_threaded_session()
    sess.__enter__()
    # mujoco_py.ignore_mujoco_warnings().__enter__()

    workerseed = seed + 10000 * rank
    set_global_seeds(workerseed)
    # env = make_robotics_env(env_id, workerseed, rank=rank)

    # def policy_fn(name, ob_space, ac_space):
    #     return CnnPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
    #                      hid_size=256, num_hid_layers=3)
    # def policy_fn(name, ob_space, ac_space):
    #     return CnnPolicy(name=name, ob_space=ob_space, ac_space=ac_space)
    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                         hid_dims_p=[64, 64], hid_dims_v=[64, 64])

    pposgd_simple.learn(
        env, policy_fn,
        max_timesteps=num_timesteps,
        timesteps_per_actorbatch=2048,
        clip_param=0.2, entcoeff=0.01,
        optim_epochs=5, optim_stepsize=3e-4, optim_batchsize=256,
        gamma=0.99, lam=0.95,
        schedule='linear',
    )
    env.close()
def train(args):
    rank = MPI.COMM_WORLD.Get_rank()
    ncpu = get_cpu_per_task()
    ncpu //= 8
    sys.stdout.flush()

    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=ncpu,
                            inter_op_parallelism_threads=ncpu)
    tf.Session(config=config).__enter__()

    if rank == 0:
        logger.configure()
    else:
        logger.configure(format_strs=[])

    workerseed = int(args.seed)
    set_global_seeds(workerseed)

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    hid_size=64, num_hid_layers=2)

    env = make_mujoco_env(args.env, args.seed)
    running_scores = pposgd_simple.learn(
        env, policy_fn,
        timesteps_per_actorbatch=2048,
        optim_stepsize=3e-4, optim_batchsize=64,
        gamma=0.99, lam=0.95,
        schedule='linear',
        args=args)
    env.close()

    # Save result for run
    if MPI.COMM_WORLD.Get_rank() == 0:
        pkl_res(running_scores, args)
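# pkl_res() is referenced above but not defined here. Below is a plausible sketch
# under the assumption that it simply pickles the running scores to disk; the
# file-naming scheme is hypothetical.
def pkl_res(running_scores, args):
    import pickle
    fname = "results_{}_{}.pkl".format(args.env, args.seed)  # assumed naming
    with open(fname, "wb") as f:
        pickle.dump(running_scores, f)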
def train(self, num_timesteps, seed, model_path=None, model_iter=None, argtype=None):
    sess = U.single_threaded_session()
    sess.__enter__()

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    hid_size=128, num_hid_layers=3)

    self.set_seed(seed)
    pi = pposgd.learn(
        self, policy_fn,
        max_timesteps=num_timesteps,
        timesteps_per_actorbatch=512,
        clip_param=0.2, entcoeff=0.0,
        optim_epochs=10, optim_stepsize=1e-4, optim_batchsize=64,
        gamma=0.99, lam=0.95,
        schedule='linear',
        model_path=model_path,
        model_iter=model_iter,
        mode=argtype,
    )
    return pi
def main(args):
    # mpi communicator.
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()

    # seed.
    workerseed = args.seed + 10000 * comm.Get_rank() if args.seed is not None else None
    if workerseed is not None:
        tc.manual_seed(workerseed % 2 ** 32)
        np.random.seed(workerseed % 2 ** 32)
        random.seed(workerseed % 2 ** 32)

    # logger.
    if rank == 0:
        logger.configure()
    else:
        logger.configure(format_strs=[])

    # env.
    env = make_atari(args.env_name)
    env.seed(workerseed)
    env = Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
    print(f"frame_stacking: {args.frame_stacking}")
    env = wrap_deepmind(env,
                        frame_stack=args.frame_stacking,
                        clip_rewards=(args.mode == 'train'),
                        episode_life=(args.mode == 'train'))  # See Mnih et al., 2015 -> Methods -> Training Details.
    env.seed(workerseed)

    # agent.
    agent = CnnPolicy(
        img_channels=env.observation_space.shape[-1],
        num_actions=env.action_space.n,
        kind=args.model_type)

    # optimizer and scheduler.
    max_grad_steps = args.optim_epochs * args.env_steps // (comm.Get_size() * args.optim_batchsize)
    optimizer = tc.optim.Adam(agent.parameters(), lr=args.optim_stepsize, eps=1e-5)
    scheduler = tc.optim.lr_scheduler.OneCycleLR(
        optimizer=optimizer,
        max_lr=args.optim_stepsize,
        total_steps=max_grad_steps,
        pct_start=0.0,
        anneal_strategy='linear',
        cycle_momentum=False,
        div_factor=1.0)

    # checkpoint.
    if rank == 0:
        try:
            state_dict = tc.load(os.path.join(args.checkpoint_dir, args.model_name, 'model.pth'))
            agent.load_state_dict(state_dict)
            print(f"Continuing from checkpoint found at {os.path.join(args.checkpoint_dir, args.model_name, 'model.pth')}")
        except FileNotFoundError:
            print("Bad checkpoint or none on process 0. Continuing from scratch.")

    # sync: broadcast rank-0 parameters so all workers start from the same weights.
    with tc.no_grad():
        for p in agent.parameters():
            p_data = p.data.numpy()
            comm.Bcast(p_data, root=0)
            p.data.copy_(tc.tensor(p_data).float())

    # operations.
    if args.mode == 'train':
        learn(env=env, agent=agent, optimizer=optimizer, scheduler=scheduler, comm=comm,
              timesteps_per_actorbatch=args.timesteps_per_actorbatch,
              max_timesteps=args.env_steps,
              optim_epochs=args.optim_epochs,
              optim_batchsize=args.optim_batchsize,
              gamma=args.gamma, lam=args.lam,
              clip_param=args.epsilon, entcoeff=args.ent_coef,
              checkpoint_dir=args.checkpoint_dir,
              model_name=args.model_name)
        env.close()
    elif args.mode == 'play':
        if comm.Get_rank() == 0:
            play(env=env, agent=agent, args=args)
            env.close()
    elif args.mode == 'movie':
        if comm.Get_rank() == 0:
            movie(env=env, agent=agent, args=args)
            env.close()
    else:
        raise NotImplementedError("Mode of operation not supported!")
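# A hedged sketch of a command-line entry point for main(). The flag names mirror
# the attributes read from `args` above; the defaults and choices are assumptions
# for illustration, not part of the original code.
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--mode', choices=['train', 'play', 'movie'], default='train')
    parser.add_argument('--env_name', default='PongNoFrameskip-v4')   # assumed Atari env
    parser.add_argument('--model_type', default='nature')            # assumed CnnPolicy kind
    parser.add_argument('--model_name', default='ppo_cnn')
    parser.add_argument('--checkpoint_dir', default='checkpoints')
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--env_steps', type=int, default=int(10e6))
    parser.add_argument('--timesteps_per_actorbatch', type=int, default=256)
    parser.add_argument('--optim_epochs', type=int, default=4)
    parser.add_argument('--optim_batchsize', type=int, default=64)
    parser.add_argument('--optim_stepsize', type=float, default=2.5e-4)
    parser.add_argument('--gamma', type=float, default=0.99)
    parser.add_argument('--lam', type=float, default=0.95)
    parser.add_argument('--epsilon', type=float, default=0.1)
    parser.add_argument('--ent_coef', type=float, default=0.01)
    parser.add_argument('--frame_stacking', action='store_true')
    main(parser.parse_args())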