def main(config):
    # wandb.init(project='rl', config=config)
    # wandb.save(str(pathlib.Path(wandb.run.dir) / '*.t7'))
    # wandb.run.summary['step'] = 0

    trainer = PPO(**config)
    sampler = RaySampler(config['track'])
    replay = ReplayBuffer(config['max_frames'])

    for epoch in range(config['max_epoch'] + 1):
        # wandb.run.summary['epoch'] = epoch

        for rollout_batch in sampler.get_samples(trainer.get_policy(epoch), **config):
            for rollout, _ in rollout_batch:
                for data in rollout:
                    replay.add(data)
                print([x.r[0] for x in rollout])

        metrics = trainer.train(replay)
        # wandb.log(metrics, step=wandb.run.summary['step'])

        if epoch % 50 == 0:
            # NOTE: saving under wandb.run.dir requires the wandb.init call above
            # to be uncommented; otherwise wandb.run is None and this line fails.
            torch.save(trainer.actor.state_dict(),
                       pathlib.Path(wandb.run.dir) / ('model_%03d.t7' % epoch))

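# --- Illustrative only: a minimal sketch of the ReplayBuffer interface main()
# --- above assumes (capped capacity plus add()); the real class in that
# --- repository may store and sample rollout data differently.
import collections
import random

class ReplayBuffer:
    def __init__(self, max_frames):
        # Oldest transitions are evicted once capacity is reached.
        self.buffer = collections.deque(maxlen=max_frames)

    def add(self, data):
        self.buffer.append(data)

    def sample(self, batch_size):
        # Uniform sampling; trainer.train(replay) presumably draws minibatches.
        return random.sample(self.buffer, batch_size)

    def __len__(self):
        return len(self.buffer)
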
def main():
    # env = FourRoomsEnv(goal_pos=(12, 16))
    from gym_minigrid.envs import EmptyEnv5x5
    env = EmptyEnv5x5()
    # env = GridWorldMDP()
    torch.manual_seed(config.seed)
    print(config.agent)
    # env.seed(config.seed)
    env = MiniGridWrapper(env)
    model = PPO(action_space=env.action_space.n,
                observation_space=env.observation_space.shape[0],
                h_dim=config.h_dim)
    # dtm = datetime.now().strftime("%d-%H-%M-%S-%f")
    # writer = tb.SummaryWriter(log_dir=f"logs/{dtm}_as_ppo:{config.as_ppo}")
    for global_step in itertools.count():
        batch, info = gather_trajectories(env, model, config.horizon)
        config.tb.add_scalar("return", info["env/returns"], global_step=global_step)
        losses = model.train_net(batch)
        model.data.clear()
        for k, v in losses.items():
            config.tb.add_scalar(k, v, global_step=global_step)
        if global_step % config.save_interval == 0:
            log_dir = config.tb.add_object('model', model, global_step=global_step)
            # eval_policy(log_dir=log_dir)
        if (global_step * config.horizon) > config.max_steps:
            break
    env.close()

def train(env, hyperparameters, actor_model, critic_model):
    """
    Trains the model.

    Parameters:
        env - the environment to train on
        hyperparameters - a dict of hyperparameters to use, defined in main
        actor_model - the actor model to load in if we want to continue training
        critic_model - the critic model to load in if we want to continue training

    Return:
        None
    """
    print("Training", flush=True)

    # Create a model for PPO.
    model = PPO(policy_class=FeedForwardNN, env=env, **hyperparameters)

    # Try to load an existing actor/critic model to continue training on.
    if actor_model != '' and critic_model != '':
        print(f"Loading in {actor_model} and {critic_model}...", flush=True)
        model.actor.load_state_dict(torch.load(actor_model))
        model.critic.load_state_dict(torch.load(critic_model))
        print("Successfully loaded.", flush=True)
    elif actor_model != '' or critic_model != '':
        # Don't train from scratch if the user accidentally forgets one of the models.
        print("Error: Either specify both actor/critic models or none at all. "
              "We don't want to accidentally override anything!")
        sys.exit(0)
    else:
        print("Training from scratch.", flush=True)

    # Train the PPO model for a specified number of total timesteps.
    # NOTE: You can change the total timesteps here; it is a big number just because
    # you can kill the process whenever you feel like PPO is converging.
    model.learn(total_timesteps=200_000_000)

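# --- Illustrative only: one way a main() might call train() above. The
# --- hyperparameter keys below are assumptions about what this PPO
# --- constructor accepts, not confirmed names from the original source;
# --- treat every key and the environment choice as placeholders.
if __name__ == '__main__':
    import gym
    hyperparameters = {
        'timesteps_per_batch': 2048,        # assumed rollout size per update
        'max_timesteps_per_episode': 200,   # assumed episode cap
        'gamma': 0.99,                      # assumed discount factor
        'n_updates_per_iteration': 10,      # assumed PPO epochs per batch
        'lr': 3e-4,                         # assumed learning rate
        'clip': 0.2,                        # assumed PPO clip range
    }
    train(env=gym.make('Pendulum-v0'),
          hyperparameters=hyperparameters,
          actor_model='', critic_model='')
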
def main(env_id, dim_latent, render, num_process, lr_p, lr_v, gamma, tau,
         epsilon, batch_size, ppo_mini_batch_size, ppo_epochs, max_iter,
         eval_iter, save_iter, model_path, log_path, seed):
    base_dir = log_path + env_id + "/PPO_encoder_exp{}".format(seed)
    writer = SummaryWriter(base_dir)

    ppo = PPO(env_id=env_id,
              dim_latent=dim_latent,
              render=render,
              num_process=20,  # cpu_count(); hard-coded, ignores the num_process argument
              min_batch_size=batch_size,
              lr_p=lr_p,
              lr_v=lr_v,
              gamma=gamma,
              tau=tau,
              clip_epsilon=epsilon,
              ppo_epochs=ppo_epochs,
              ppo_mini_batch_size=ppo_mini_batch_size,
              seed=seed)

    for i_iter in range(1, max_iter + 1):
        ppo.learn(writer, i_iter)

        if i_iter % eval_iter == 0:
            ppo.eval(i_iter, render=render)

        if i_iter % save_iter == 0:
            ppo.save(model_path)
            pickle.dump(ppo,
                        open('{}/{}_ppo_encoder.p'.format(model_path, env_id), 'wb'))

        torch.cuda.empty_cache()

def main():
    env_name = 'BreakoutNoFrameskip-v4'
    env = atari_wrappers.wrap_deepmind(atari_wrappers.make_atari(env_name),
                                       episode_life=True,
                                       clip_rewards=True,
                                       frame_stack=True,
                                       scale=True)
    output_size = env.action_space.n
    input_shape = env.observation_space.shape
    with tf.Session() as sess:
        with tf.variable_scope('Breakout_lr'):
            input = tf.placeholder(tf.float32, [None, *input_shape])
            model = PPO(sess, input, models.nature_cnn(input),
                        actiontype.Discrete, output_size,
                        learning_rate=lambda f: 2.5e-4 * (1 - f),
                        epochs=4, minibatch_size=4, gamma=0.99, beta2=0.01,
                        name='Breakout_lr')
        train(sess, model, env_name, 1e7, 256,
              log_interval=5, num_envs=16, atari=True)
        # run_only(sess, model, env, render=True)
        env.close()

def __init__(self, observation_space, action_space):
    self.k = 10
    self.actions = HUMAN_ACTIONS
    self.action_space = gym.spaces.Discrete(NUM_ACTIONS)

    shape = observation_space.shape
    # NOTE: float bounds (0.0, 1.0) with dtype=np.uint8 come from the original
    # code; the combination is unusual and may be unintended.
    self.observation_space = gym.spaces.Box(low=0.0,
                                            high=1.0,
                                            shape=(1, shape[0], shape[1]),
                                            dtype=np.uint8)
    env_shape = self.observation_space.shape
    state_dim = np.prod(env_shape)
    self.state_dim = state_dim
    self.action_dim = self.action_space.n

    self.agent = PPO(
        self.state_dim * self.k,  # k stacked frames form the agent's input
        self.action_dim,
        n_latent_var=600,
        betas=(0.9, 0.999),
        lr=1e-4,
        K_epochs=8,
        gamma=0.99,
        eps_clip=0.2,
    )
    self.agent.policy.load_state_dict(
        torch.load("results/experiment_1/checkpoint_210_eps.pth",
                   map_location=torch.device(device)))
    self.framestack = None

def main():
    env = gym.make('CartPole-v1')
    model = PPO()
    saved_model = torch.load('models/ppo_model9500.pt')
    model.load_state_dict(saved_model)

    while True:
        score = 0
        s = env.reset()
        for i in range(200):
            prob = model.pi(torch.from_numpy(s).float())
            m = Categorical(prob)
            a = m.sample().item()
            s_prime, r, done, info = env.step(a)
            model.put_data((s, a, r / 100.0, s_prime, prob[a].item(), done))
            s = s_prime
            score += r
            env.render()
            if done:
                break
        print('score = {}'.format(score))

def main(env_id, render, num_process, lr_p, lr_v, gamma, tau, epsilon,
         batch_size, ppo_mini_batch_size, ppo_epochs, max_iter, eval_iter,
         save_iter, model_path, log_path, seed):
    base_dir = log_path + env_id + "/PPO_exp{}".format(seed)
    writer = SummaryWriter(base_dir)

    ppo = PPO(env_id=env_id,
              render=render,
              num_process=1,
              min_batch_size=batch_size,
              lr_p=lr_p,
              lr_v=lr_v,
              gamma=gamma,
              tau=tau,
              clip_epsilon=epsilon,
              ppo_epochs=ppo_epochs,
              ppo_mini_batch_size=ppo_mini_batch_size,
              seed=seed,
              model_path='trained_models')

    for i_iter in range(1, 6):
        ppo.eval(i_iter, render=True)
        torch.cuda.empty_cache()

def train():
    g_exit = GracefulExit()
    timestamp = datetime.datetime.utcnow().strftime(TIMESTAMP_FORMAT)
    logger = Logger(ENV_NAME, timestamp)
    env = gym.make(ENV_NAME)
    dim_obs = env.observation_space.shape[0] + 1
    dim_act = env.action_space.shape[0]
    scaler = VecScaler(dim_obs)
    rec_dir = os.path.join(REC_DIR, ENV_NAME, timestamp)
    env = gym.wrappers.Monitor(env, rec_dir, force=True)
    agent = PPO(dim_obs, dim_act, GAMMA, LAMBDA, CLIP_RANGE,
                LR_POLICY, LR_VALUE_F, logger)
    # Warm up the observation scaler with a few episodes before training.
    run_batch(env, agent.policy, 5, scaler)
    episode = 0
    while episode < NUM_EPISODES:
        batch_size = min(MAX_BATCH, NUM_EPISODES - episode)
        trajectories, steps, mean_return = run_batch(env, agent.policy,
                                                     batch_size, scaler)
        episode += batch_size
        logger.log({'_time': datetime.datetime.utcnow().strftime(TIMESTAMP_FORMAT),
                    '_episode': episode,
                    'steps': steps,
                    '_mean_return': mean_return})
        agent.update(trajectories)
        logger.write()
        if g_exit.exit:
            break
    agent.close()
    logger.close()

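# --- Illustrative only: a sketch of the running-statistics scaler that
# --- train() above warms up, in the spirit of Welford-style observation
# --- normalization. The method names (update, normalize) are assumptions;
# --- the real VecScaler's interface may differ.
import numpy as np

class VecScaler:
    def __init__(self, dim):
        self.mean = np.zeros(dim)
        self.var = np.ones(dim)
        self.n = 0

    def update(self, x):
        # Fold a batch of observations (shape [m, dim]) into the running stats
        # using the standard parallel mean/variance combination.
        batch_mean = x.mean(axis=0)
        batch_var = x.var(axis=0)
        m = x.shape[0]
        delta = batch_mean - self.mean
        total = self.n + m
        self.mean += delta * m / total
        self.var = (self.n * self.var + m * batch_var +
                    delta ** 2 * self.n * m / total) / total
        self.n = total

    def normalize(self, x):
        return (x - self.mean) / (np.sqrt(self.var) + 1e-8)
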
def train():
    env = gym.make(GAME).unwrapped
    all_ep_r = []
    memory = rpm(1000000)
    agent = PPO(state_space=S_DIM,
                action_space=A_DIM,
                max_episode_num=EP_MAX,
                episode_lens=EP_LEN,
                discount_factor=GAMMA,
                actor_learning_rate=A_LR,
                critic_learning_rate=C_LR,
                mini_batch_size=MINI_BATCH_SIZE,
                epochs=EPOCHS)

    # load weights
    # agent.load_weights(SAVE_INDEX)
    # run(env, agent)

    for i in range(EP_MAX):
        steps, episode_r, c_time, aloss, closs = execute_one_episode(env, agent, memory)
        print('Ep: %4d' % i,
              "|Ep_r: %i" % episode_r,
              '|aloss: %8.4f' % aloss,
              '|closs: %8.4f' % closs,
              '|steps: %4d' % steps,
              '|time: %6.4f' % c_time)
        if i == 0:
            all_ep_r.append(episode_r)
        else:
            # Exponential moving average of the episode reward.
            all_ep_r.append(all_ep_r[-1] * 0.9 + episode_r * 0.1)

    # create_path('weights/' + SAVE_INDEX)
    agent.save_weights(SAVE_INDEX)
    plt.plot(np.arange(len(all_ep_r)), all_ep_r)
    plt.xlabel('Episode')
    plt.ylabel('Moving averaged episode reward')
    create_path('weights/' + SAVE_INDEX + '/figure')
    plt.savefig('weights/' + SAVE_INDEX + '/figure/fig.png')
    plt.show()

def worker_policy(args, manager, config):
    init_logging_handler(args.log_dir, '_policy')
    agent = PPO(None, args, manager, config, 0, pre=True)
    best = float('inf')
    for e in range(args.epoch):
        agent.imitating(e)
        best = agent.imit_test(e, best)

def create_model(sess, name):
    with tf.variable_scope(name):
        input = tf.placeholder(tf.float32, [None, 12])
        initializer = tf.orthogonal_initializer(np.sqrt(2))  # Orthogonal initializer
        network = add_dense(input, 32, activation=tf.nn.tanh,
                            kernel_initializer=initializer, name="dense1")
        network = add_dense(network, 32, activation=tf.nn.tanh,
                            kernel_initializer=initializer, name="dense2")
        return PPO(sess, input, network, actiontype.Continuous, 2,
                   epochs=10, minibatch_size=32, gamma=0.99, beta2=0.00,
                   epsilon=0.2, learning_rate=lambda f: 3e-4 * (1 - f),
                   name=name)

def worker_estimator(args, manager, config, make_env):
    init_logging_handler(args.log_dir, '_estimator')
    agent = PPO(make_env, args, manager, config, args.process, pre_irl=True)
    agent.load(args.save_dir + '/best')
    best0, best1 = float('inf'), float('inf')
    for e in range(args.epoch):
        agent.train_irl(e, args.batchsz_traj)
        best0 = agent.test_irl(e, args.batchsz, best0)
        best1 = agent.imit_value(e, args.batchsz_traj, best1)

def main(config):
    parser = get_parser()
    argv = sys.argv[1:]
    args, _ = parser.parse_known_args(argv)
    init_logging_handler(config.log_dir)
    logging.info(args)
    config = update_cfg(config, args)
    logging.info("Start initializing")

    # This is the reward model only, which will be fed to RewardEstimator.
    irl_model = RewardModule(config).to(device=device)
    reward_agent = RewardEstimator(config=config, irl_model=irl_model)

    user_policy = ActorCriticDiscrete(config).to(device=device)
    user_policy = init_net(user_policy)
    user_ppo = PPO(config, user_policy)

    system_policy = ActorCriticContinuous(config).to(device=device)
    system_policy = init_net(system_policy)
    init_system_policy = ActorCriticContinuous(config).to(device=device)
    init_system_policy.load_state_dict(system_policy.state_dict())
    system_ppo = PPO(config, system_policy, init_policy=init_system_policy)

    # This is the ground truth, which will not be updated once randomly initialized.
    # reward_true = RewardTruth(config).to(device=device)
    reward_true = RewardTruthSampled(config).to(device)
    reward_true = init_net(reward_true)
    logging.info("Finished building modules: reward agent, user ppo, system ppo")

    main_agent = InteractAgent(config=config,
                               user_agent=user_ppo,
                               user_reward=reward_agent,
                               system_agent=system_ppo,
                               reward_groundtruth=reward_true)
    for e_id in range(config.master_epochs):
        main_agent.master_train(e_id)
        # for _ in range(3):
        #     main_agent.system_train()
        # raise ValueError("stop here")
    logging.info("@@@@@@@@@@ Finished @@@@@@@@@@@")

def train_ppo(df, df_dense, df_wide, df_fail, state_dim, action_dim, lr, betas,
              gamma, epochs, model_path):
    memory = Memory()
    n_latent_var = [128, 32]
    K_epochs = 4
    eps_clip = 0.2
    update_timestep = 2000

    ppo = PPO(state_dim, action_dim, n_latent_var, lr, betas, gamma,
              K_epochs, eps_clip)
    print(lr, betas)

    timestep = 0
    loss_file = open(model_path + 'loss.txt', 'a')
    for epoch in range(epochs):
        print("epoch start:" + str(epoch) + '\n')
        moving_loss = 0
        cnt = 0
        for index in range(df.shape[0]):
            timestep += 1
            row = df.iloc[index]
            state_dense = df_dense[index]
            state_wide = df_wide[index]
            fail_state = df_fail[index]
            state = np.concatenate((state_dense, state_wide, fail_state))
            action = row['action']
            reward = row['reward']
            done = row['done']
            ppo.policy_old.act(state, action, reward, done, memory)
            if timestep % update_timestep == 0:
                loss = ppo.update(memory)
                memory.clear_memory()
                timestep = 0
                moving_loss += np.mean(loss)
                cnt += 1
        loss_file.write(str(epoch) + '-th round loss: ' +
                        str(round(moving_loss / cnt, 4)) + '\n')
        loss_file.flush()
        torch.save(ppo.policy.action_layer.state_dict(),
                   model_path + 'ppo_20191009_20191021_action_layer' +
                   str(epoch) + '-th_epoch.pkl')
        # Fixed: the original saved the value layer under the action-layer
        # filename, silently overwriting the checkpoint above.
        torch.save(ppo.policy.value_layer.state_dict(),
                   model_path + 'ppo_20191009_20191021_value_layer' +
                   str(epoch) + '-th_epoch.pkl')
        gc.collect()
    loss_file.close()
    return ppo.policy.action_layer.cpu().eval()

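# --- Illustrative only: a minimal sketch of the Memory buffer the loop above
# --- assumes, modeled on the common PPO-PyTorch pattern of list attributes
# --- cleared after each update. The attribute names are assumptions, not the
# --- original code.
class Memory:
    def __init__(self):
        self.states, self.actions = [], []
        self.logprobs, self.rewards, self.is_terminals = [], [], []

    def clear_memory(self):
        # Drop everything collected since the last PPO update.
        del self.states[:]
        del self.actions[:]
        del self.logprobs[:]
        del self.rewards[:]
        del self.is_terminals[:]
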
def main():
    # parse args
    args = option.args

    # worker device
    if args.backend == 'cpu':
        args.worker_device = "/cpu:0"
    else:
        gpu_id = args.index % args.gpu_count
        os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_id)
        args.worker_device = "/gpu:0"

    # start session
    config = tf.ConfigProto(
        allow_soft_placement=True,
        log_device_placement=False,
        gpu_options=tf.GPUOptions(allow_growth=True),
    )
    sess = tf.Session(config=config)
    sess.__enter__()

    # create env
    env = gym.make(args.env)
    env = wrap_train(env)

    # create ppo
    # with tf.device(args.worker_device):
    ppo = PPO(env.observation_space, env.action_space, cnn_model_func,
              clip_param=0.2, entcoeff=0.01)

    # create worker
    if args.mode == 'train':
        worker = TrainWorker(env, ppo, args.render, args.index == 0,
                             train_data_size=256, optimize_size=64,
                             optimize_epochs=4, gamma=0.99, lambda_=0.95,
                             max_steps=1e6)
    else:
        # NOTE: no worker is created for other modes, so the call below would
        # raise NameError; only 'train' is implemented here.
        pass

    # start worker
    worker()

def rank_policies(memory: Memory, library, **ppo_params):
    agent = PPO(None, **ppo_params)
    # pylint: disable=not-callable
    returns = torch.tensor(memory.returns).float().to(DEVICE).detach()
    # returns = (returns - returns.mean()) / (returns.std() + 1e-5)
    states = torch.tensor(memory.states).float().to(DEVICE).detach()
    actions = torch.tensor(memory.actions).float().to(DEVICE).detach()
    vals = []
    for params in library:
        agent.policy.load_state_dict(params)
        logp, _, _ = agent.policy.evaluate(states, actions)
        p = torch.exp(logp)
        vals.append(torch.sum(p * returns).item())
    # Highest-scoring policies first.
    return np.argsort(vals)[::-1], np.asarray(vals)

def create_policy(policy_type='rand', board_size=8, seed=0, search_depth=1):
    if policy_type == 'rand':
        policy = simple_policies.RandomPolicy(seed=seed)
    elif policy_type == 'greedy':
        policy = simple_policies.GreedyPolicy()
    elif policy_type == 'maximin':
        policy = simple_policies.MaxiMinPolicy(search_depth)
    elif policy_type == 'human':
        policy = simple_policies.HumanPolicy(board_size)
    elif policy_type == 'dqn':
        policy = DQN('dqn', board_size)
    elif policy_type == 'ppo':
        policy = PPO('ppo', board_size)
    else:
        # The original fell through to an UnboundLocalError on unknown types.
        raise ValueError('Unknown policy_type: {}'.format(policy_type))
    return policy

def maml_initialize(starting_policy, env_fn, n, n_inner, alpha_inner, **ppo_params):
    timesteps = ppo_params.get('update_interval') * max(n_inner, 1)
    ppo_params['lr'] = alpha_inner
    library = []
    gradients = []
    env = env_fn(seed=ppo_params.get('seed'))
    for i in range(n):
        env.randomize()
        env.reset()
        agent = PPO(env, **ppo_params)
        agent.policy.load_state_dict(copy_tensor(starting_policy))
        agent.learn(timesteps, track_higher_gradients=True)
        library.append(agent.policy.state_dict())
        gradients.append(get_gradients(agent.meta_policy.parameters(),
                                       agent.meta_policy.parameters(time=0)))
    return library, gradients

def initialize_rl_alg(self, args):
    hyperparams = {
        'optim_epochs': self.args.ppo_optim_epochs,
        'minibatch_size': self.args.ppo_minibatch_size,
        'gamma': self.args.gamma,
        'value_iters': self.args.ppo_value_iters,
        'clip_epsilon': self.args.ppo_clip,
        'entropy_coeff': self.args.entropy_coeff,
    }
    self.rl_alg = PPO(policy=self.policy,
                      policy_optimizer=self.policy_optimizer,
                      valuefn=self.valuefn,
                      value_optimizer=self.value_optimizer,
                      replay_buffer=self.replay_buffer,
                      **hyperparams)

def main():
    torch.set_default_tensor_type('torch.DoubleTensor')
    batchsz = 2048
    ppo = PPO(make_env, 10)
    # load model from checkpoint
    ppo.load()
    # Comment out this line to close the evaluation thread and speed up training.
    ppo.render(2)
    for i in range(10000):
        ppo.update(batchsz)
        if i % 100 == 0 and i:
            ppo.save()

def main():
    torch.set_default_tensor_type('torch.DoubleTensor')
    args = arguements.achieve_args()
    ppo = PPO()
    _, _ = ppo.load()
    avg_rewards = []
    saved_rewards = []
    for i in range(10000):
        avg_reward, POLICY = ppo.update()
        avg_rewards.append(avg_reward)
        print('avg_rewards:', avg_rewards)
        saved_rewards.append(avg_reward)

        # Save the model every 10 iterations. (The original had a stray `pass`
        # here that left the save block outside the condition.)
        if i % 10 == 0 and i != 0:
            idx = i  # MUST CHANGE THIS WHEN RESUMING TRAINING!
            print('--- saving models ---')
            ppo.save(idx, filename=args.model_name)
            # Save rewards every 10 iterations; each file only holds 10 rewards.
            print('--- saving rewards ---')
            rewards_name = ('rewards_' + args.model_name + '_from' +
                            str(idx - 10) + 'to' + str(idx) + '.txt')
            with open(rewards_name, "wb") as fp:  # pickling
                pickle.dump(saved_rewards, fp)
            print('rewards in the last several iters:', saved_rewards)
            saved_rewards = []

        # Plot the rewards every 5 iterations.
        if i % 5 == 0:
            iters = list(range(len(avg_rewards)))
            plt.plot(iters, avg_rewards)
            plt.show()

def train_cartpole(params):
    """
    Unit test the PPO with a simple CartPole example.

    Returns:
        bool: test success (did the model learn anything)
    """
    env = gym.make('CartPole-v1')
    model = PPO(params, 4, out=2)
    score = 0.0
    print_interval = 20
    initial = 0
    # Baseline is initialized once, outside the loop; the original reset it to
    # zero every episode, which made the final comparison meaningless.
    save = 0.0
    for n_epi in range(101):
        s = env.reset()
        done = False
        while not done:
            for t in range(params['T_horizon']):
                prob = model.pi(torch.from_numpy(s).float())
                m = Categorical(prob)
                a = m.sample().item()
                s_prime, r, done, info = env.step(a)
                model.put_data(
                    (s, a, r / 100.0, s_prime, prob[a].item(), done))
                s = s_prime
                score += r
                if done:
                    break
            model.train_net()
        if n_epi == 0:
            save = score  # score of the untrained first episode
            score = 0.0
        if n_epi == 30:
            env.close()
            return score > save

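# --- Illustrative only: a params dict train_cartpole() could be called with.
# --- Only 'T_horizon' is read directly above; the remaining keys are
# --- assumptions about what this PPO constructor consumes, not confirmed
# --- names from the original source.
if __name__ == '__main__':
    params = {
        'T_horizon': 20,        # rollout length before each train_net() call
        'learning_rate': 5e-4,  # assumed optimizer setting
        'gamma': 0.98,          # assumed discount factor
    }
    assert train_cartpole(params), "PPO failed to improve on CartPole"
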
def get_algorithm(*argv, **kwargs):
    if args.algorithm == 'pg':
        return PG(*argv, **kwargs)
    if args.algorithm == 'ddpg':
        return DDPG(*argv, **kwargs)
    if args.algorithm == 'td3':
        return TD3(*argv, **kwargs)
    if args.algorithm == 'rbi':
        return RBI(*argv, **kwargs)
    if args.algorithm == 'drbi':
        return DRBI(*argv, **kwargs)
    if args.algorithm == 'ppo':
        return PPO(*argv, **kwargs)
    if args.algorithm == 'sacq':
        return SACQ(*argv, **kwargs)
    if args.algorithm == 'sspg':
        return SSPG(*argv, **kwargs)
    raise NotImplementedError

def main(algo):
    seed = 7
    path = 'model_checkpoints/ppo.ckpt'

    # Load the environment.
    # env = UnityEnv(env_file='Environments/Reacher_Linux_one/Reacher.x86_64', no_graphics=True)
    env = UnityEnv(env_file='Environments/Tennis_Linux/Tennis.x86_64',
                   no_graphics=True)

    # number of agents
    num_agents = env.num_agents
    print('Number of agents:', num_agents)

    # size of each action
    action_size = env.action_size

    # examine the state space
    state_size = env.state_size
    print('Size of each action: {}, Size of the state space {}'.format(
        action_size, state_size))

    config = Config(algo)
    if torch.cuda.is_available():
        device = torch.device("cuda:0")
        device2 = torch.device("cuda:1")  # only used by the commented dual-GPU path below
    else:
        device = torch.device('cpu')  # fallback; the original left device unset without CUDA
    agent = PPO(action_size, state_size, seed, device, config)
    # try:
    #     agent_a = PPO(action_size, state_size, seed, device, config)
    #     agent_b = PPO(action_size, state_size, seed, device2, config)
    #     print('Double GPU')
    # except:
    #     print('Single GPU')
    #     agent_a = PPO(action_size, state_size, seed, device, config)
    #     agent_b = PPO(action_size, state_size, seed, device, config)
    train_ppo(env, agent, EPISODES, path)

def main():
    env = gym.make('AntBulletEnv-v0')
    output_size = env.action_space.shape[0]
    with tf.Session() as sess:
        name = 'ant_5m'
        with tf.variable_scope(name):
            input = tf.placeholder(tf.float32, [None, env.observation_space.shape[0]])
            initializer = tf.orthogonal_initializer(np.sqrt(2))  # Orthogonal initializer
            network = add_dense(input, 64, activation=tf.nn.tanh,
                                kernel_initializer=initializer, name="dense1")
            network = add_dense(network, 64, activation=tf.nn.tanh,
                                kernel_initializer=initializer, name="dense2")
            model = PPO(sess, input, network, actiontype.Continuous, output_size,
                        epochs=10, minibatch_size=32, gamma=0.99, beta2=0.000,
                        epsilon=0.2, learning_rate=lambda f: 3e-4 * (1 - f),
                        name=name)
        train(sess, model, 'AntBulletEnv-v0', 1000000, 2048,
              num_envs=16, log_interval=5)
        run_only(sess, model, env)
        env.close()

def main(args):
    env = gym.make(args.env_name)
    device = torch.device(args.device)

    # 1. Set the necessary seeds. (The original seeded torch twice.)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    np.random.seed(args.seed)
    env.seed(args.seed)

    # 2. Create actor, critic, EnvSampler() and PPO.
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.shape[0]
    actor = PolicyNetwork(state_size, action_size, hidden_sizes=args.hidden_sizes)
    critic = ValueNetwork(state_size, hidden_sizes=args.hidden_sizes)
    env_sampler = EnvSampler(env, args.max_episode_step)
    ppo = PPO(actor, critic,
              clip=args.clip,
              gamma=args.gamma,
              tau=args.tau,
              target_kl=args.target_kl,
              device=device,
              pi_steps_per_update=args.pi_steps_per_update,
              value_steps_per_update=args.value_steps_per_update,
              pi_lr=args.pi_lr,
              v_lr=args.value_lr)

    # 3. Start training.
    def get_action(state):
        state = torch.FloatTensor(state).unsqueeze(0).to(device)
        action = actor.select_action(state)
        return action.detach().cpu().numpy()[0]

    total_step = 0
    for episode in range(1, args.episodes + 1):
        episode_reward, samples = env_sampler(get_action, args.batch_size)
        actor_loss, value_loss = ppo.update(*samples)
        yield episode * args.batch_size, episode_reward, actor_loss, value_loss

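# --- Illustrative only: because main() above yields, it is a generator and a
# --- caller must drain it to drive training. A minimal consumer; parse_args()
# --- is a hypothetical stand-in for however the args namespace is built.
if __name__ == '__main__':
    args = parse_args()  # hypothetical; must supply the fields main() reads
    for total_step, episode_reward, actor_loss, value_loss in main(args):
        print(f"step={total_step} reward={episode_reward:.1f} "
              f"pi_loss={actor_loss:.4f} v_loss={value_loss:.4f}")
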
def test():
    env = gym.make(GAME).unwrapped
    agent = PPO(state_space=S_DIM,
                action_space=A_DIM,
                max_episode_num=EP_MAX,
                episode_lens=EP_LEN,
                discount_factor=GAMMA,
                actor_learning_rate=A_LR,
                critic_learning_rate=C_LR,
                mini_batch_size=MINI_BATCH_SIZE,
                epochs=EPOCHS)
    agent.load_weights(SAVE_INDEX)

    state = env.reset()
    print(state)
    steps = 0
    episode_r = 0
    all_value = []
    while steps < 1000:
        # env.render()
        # get action
        action = agent.choose_action(state)
        # execute one action
        state_after_action, reward, done, _ = env.step(action)
        steps += 1
        episode_r += reward
        state = state_after_action
        state_value = agent.get_value(state)
        all_value.append(state_value)

    plt.plot(np.arange(len(all_value)), all_value)
    plt.xlabel('state')
    plt.ylabel('state value')
    # Save before show(), since show() clears the figure on many backends.
    # (The original also created 'figure/...' but saved under 'weights/...'.)
    create_path('weights/' + SAVE_INDEX + '/figure')
    plt.savefig('weights/' + SAVE_INDEX + '/figure/fig.png')
    plt.show()
    print("test 1000 steps, got reward: %i" % episode_r)

def main(_):
    tf.Session().__enter__()
    np.random.seed(FLAGS.seed)
    random.seed(FLAGS.seed)
    tf.set_random_seed(FLAGS.seed)

    def make_env(seed):
        def _make_env():
            env = gym.make(FLAGS.env)
            env.seed(seed)
            env.allow_early_resets = True
            env = gym_wrapper.StackObs(env)
            return env
        return _make_env

    env = None
    try:
        # NOTE: every worker receives the same random seed here; use
        # [random.randint(0, 1000) for _ in range(FLAGS.nenvs)] for distinct seeds.
        env = gym_wrapper.Workers(
            [make_env(_) for _ in [random.randint(0, 1000)] * FLAGS.nenvs])
        ppo = PPO(env,
                  nsteps=FLAGS.nsteps,
                  learning_rate=FLAGS.lr,
                  clip_range=FLAGS.cr,
                  max_steps=FLAGS.max_steps,
                  mb_size=FLAGS.mb_size,
                  opteps=FLAGS.opteps,
                  gae=FLAGS.gae,
                  gamma=FLAGS.gamma,
                  vf_coef=FLAGS.vf_coef,
                  ent_coef=FLAGS.ent_coef,
                  normalize_observations=FLAGS.normalize_obs)
        ppo.run()
        env.close()
    except KeyboardInterrupt:
        # Guard against an interrupt before env is created.
        if env is not None:
            env.close()

def main():
    net = PPONet(torch.device("cuda:0"))
    optimizer = Adam(params=net.parameters(), lr=1e-4)
    scheduler = LambdaLR(optimizer=optimizer,
                         lr_lambda=lambda e: max(0.9999 ** e, 0.1))
    agent = PPO(net=net,
                optimizer=optimizer,
                scheduler=scheduler,
                c1=0.5,
                c2=0,
                gamma=1,
                lambda_=0.99,
                epsilon=0.05,
                run_for_t=16,
                train_for_n_epochs=4,
                batch_size=16,
                verbose=True,
                device=torch.device("cuda:0"))
    run_ppo(agent, render=True)