def run(game, state, params_dir):
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    env = make_local_env(game=game, state=state, stack=True, scale_rew=True)
    # NOTE: load_path is hardcoded here; the params_dir argument is currently unused.
    load_path = 'params_3/checkpoints/00151'

    def env_fn():
        return env

    with tf.Session(config=config):
        model = ppo2.Model(policy=policies.CnnPolicy,
                           ob_space=env.observation_space,
                           ac_space=env.action_space,
                           nbatch_act=1,
                           nsteps=4500,
                           nbatch_train=4500 // 4,
                           ent_coef=0.01,
                           vf_coef=0.5,
                           max_grad_norm=0.5)
        print(env.observation_space)
        print(env.action_space)
        model.load(load_path)
        runner = ppo2.Runner(env=DummyVecEnv([env_fn]), model=model,
                             nsteps=4500, gamma=0.99, lam=0.95)
        runner.run()
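# Hypothetical entry point (an assumption, not part of the original file): the
# game/state names are the ones used elsewhere in this repo, and the params_dir
# value is illustrative only, since run() currently hardcodes its load_path.
if __name__ == '__main__':
    run(game='SonicTheHedgehog-Genesis', state='GreenHillZone.Act1',
        params_dir='params_3')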
def get_exp(self, envIdx):
    level = self.levels[envIdx]
    env = make_local_env(level[0], level[1], True, True)

    def env_fn():
        return env

    # self.model.load_ram(params)
    runner = ppo2.Runner(env=DummyVecEnv([env_fn]),
                         num_envs=1,
                         model=self.model,
                         nsteps=steps_per_ep,
                         gamma=gamma,
                         lam=lam,
                         lr=lr,
                         cliprange=cliprange,
                         noptepochs=noptepochs,
                         nbatch_train=nbatch_train)
    exp = runner.run()
    env.close()
    # tf.reset_default_graph()
    # del runner
    # gc.collect()
    pid = os.getpid()
    py = psutil.Process(pid)
    memUse = py.memory_info()[0] / 2. ** 30
    print('memory use: %.6f GB from worker %d after model' % (memUse, self.id))
    return exp
def main(): """Run DQN until the environment throws an exception.""" env = make(game='SonicTheHedgehog-Genesis', state='GreenHillZone.Act1') env = AllowBacktracking(make_local_env(env, stack=False, scale_rew=False)) env = BatchedFrameStack(BatchedGymEnv([[env]]), num_images=4, concat=False) config = tf.ConfigProto() config.gpu_options.allow_growth = True # pylint: disable=E1101 with tf.Session(config=config) as sess: dqn = DQN(*rainbow_models(sess, env.action_space.n, gym_space_vectorizer(env.observation_space), min_val=-200, max_val=200)) player = NStepPlayer(BatchedPlayer(env, dqn.online_net), 3) optimize = dqn.optimize(learning_rate=1e-4) sess.run(tf.global_variables_initializer()) dqn.train(num_steps=num_steps, # Make sure an exception arrives before we stop. player=player, replay_buffer=PrioritizedReplayBuffer(500000, 0.5, 0.4, epsilon=0.1), optimize_op=optimize, train_interval=1, target_interval=8192, batch_size=32, min_buffer_size=20000) print(tf.trainable_variables()) save_path='/home/noob/retro-noob/rainbow/params/params' utils.save_state(save_path+'_tf_saver') with tf.variable_scope('model'): params = tf.trainable_variables() ps = sess.run(params) joblib.dump(ps, save_path + '_joblib')
def main(game, state, timesteps=5000, save_interval=1, last_dir=None, params_folder=None):
    """Run PPO until the environment throws an exception."""
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101
    env = make(game=game, state=state)
    env = make_local_env(env, stack=True, scale_rew=True)
    logger.configure(params_folder, format_strs=['stdout'])

    def env_fn():
        return env

    load_path = None
    if last_dir:
        # Resume from the most recently written checkpoint of the previous run.
        list_of_params = glob.glob(last_dir + '/checkpoints/*')
        load_path = max(list_of_params, key=os.path.getctime)
        print('Restoring params from ', load_path)

    with tf.Session(config=config):
        # Take more timesteps than we need to be sure that
        # we stop due to an exception.
        ppo2.learn(policy=policies.CnnPolicy,
                   env=DummyVecEnv([env_fn]),
                   nsteps=4096,
                   nminibatches=8,
                   lam=0.95,
                   gamma=0.99,
                   noptepochs=3,
                   log_interval=1,
                   ent_coef=0.01,
                   lr=lambda _: 2e-4,
                   cliprange=lambda _: 0.1,
                   total_timesteps=timesteps,
                   save_interval=save_interval,
                   load_path=load_path)
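# Hypothetical example call (argument values are assumptions for illustration,
# not taken from the original file):
# main('SonicTheHedgehog-Genesis', 'GreenHillZone.Act1',
#      timesteps=5000, save_interval=1, last_dir=None, params_folder='params_ppo')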
def train(rank, args, shared_model, optimizer, env_conf):
    ptitle('Training Agent: {}'.format(rank))
    print("prank:", rank, "os.pid:", os.getpid())
    gpu_id = args.gpu_ids[rank % len(args.gpu_ids)]
    torch.manual_seed(args.seed + rank)
    if gpu_id >= 0:
        torch.cuda.manual_seed(args.seed + rank)
    env = AllowBacktracking(
        make_local_env(env_conf['game'], env_conf['level'], stack=False, scale_rew=False))
    print("Got a local env; obs space:", env.observation_space)
    if optimizer is None:
        if args.optimizer == 'RMSprop':
            optimizer = optim.RMSprop(shared_model.parameters(), lr=args.lr)
        if args.optimizer == 'Adam':
            optimizer = optim.Adam(
                shared_model.parameters(), lr=args.lr, amsgrad=args.amsgrad)
    env.seed(args.seed + rank)
    player = Agent(None, env, args, None)
    player.gpu_id = gpu_id
    player.model = A3Clstm(player.env.observation_space.shape[0],
                           player.env.action_space)
    player.state = player.env.reset()
    print("player.state.shape:", player.state.shape)
    player.state = torch.from_numpy(player.state).float()
    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            player.state = player.state.cuda()
            player.model = player.model.cuda()
    player.model.train()
    player.eps_len += 2
    while True:
        # Sync the local model with the shared (global) parameters.
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                player.model.load_state_dict(shared_model.state_dict())
        else:
            player.model.load_state_dict(shared_model.state_dict())
        if player.done:
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.cx = Variable(torch.zeros(1, 512).cuda())
                    player.hx = Variable(torch.zeros(1, 512).cuda())
            else:
                player.cx = Variable(torch.zeros(1, 512))
                player.hx = Variable(torch.zeros(1, 512))
        else:
            player.cx = Variable(player.cx.data)
            player.hx = Variable(player.hx.data)

        for step in range(args.num_steps):
            player.action_train()
            if player.done:
                break

        if player.done:
            # if player.info['ale.lives'] == 0 or player.max_length:
            #     player.eps_len = 0
            state = player.env.reset()
            player.eps_len += 2
            player.state = torch.from_numpy(state).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()

        # Bootstrap the return from the value estimate of the last state.
        R = torch.zeros(1, 1)
        if not player.done:
            value, _, _ = player.model(
                (Variable(player.state.unsqueeze(0)), (player.hx, player.cx)))
            R = value.data
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                R = R.cuda()
        player.values.append(Variable(R))

        policy_loss = 0
        value_loss = 0
        gae = torch.zeros(1, 1)
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                gae = gae.cuda()
        R = Variable(R)
        for i in reversed(range(len(player.rewards))):
            R = args.gamma * R + player.rewards[i]
            advantage = R - player.values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)
            # Generalized Advantage Estimation
            delta_t = player.rewards[i] + args.gamma * \
                player.values[i + 1].data - player.values[i].data
            gae = gae * args.gamma * args.tau + delta_t
            policy_loss = policy_loss - \
                player.log_probs[i] * \
                Variable(gae) - 0.01 * player.entropies[i]

        player.model.zero_grad()
        (policy_loss + 0.5 * value_loss).backward()
        torch.nn.utils.clip_grad_norm(player.model.parameters(), 100.0)
        ensure_shared_grads(player.model, shared_model, gpu=gpu_id >= 0)
        optimizer.step()
        player.clear_actions()
def test(args, shared_model, env_conf):
    ptitle('Test Agent')
    gpu_id = args.gpu_ids[-1]
    log = {}
    setup_logger('{}_log'.format(args.env),
                 r'{0}{1}_log'.format(args.log_dir, args.env))
    log['{}_log'.format(args.env)] = logging.getLogger('{}_log'.format(args.env))
    d_args = vars(args)
    for k in d_args.keys():
        log['{}_log'.format(args.env)].info('{0}: {1}'.format(k, d_args[k]))

    torch.manual_seed(args.seed)
    if gpu_id >= 0:
        torch.cuda.manual_seed(args.seed)
    print("test proc:")
    env = AllowBacktracking(make_local_env(
        env_conf['game'], env_conf['level'], stack=False, scale_rew=False))
    print("test got env:", env.observation_space)
    reward_sum = 0
    start_time = time.time()
    num_tests = 0
    reward_total_sum = 0
    player = Agent(None, env, args, None)
    player.gpu_id = gpu_id
    player.model = A3Clstm(player.env.observation_space.shape[0],
                           player.env.action_space)
    player.state = player.env.reset()
    player.eps_len += 2
    player.state = torch.from_numpy(player.state).float()
    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            player.model = player.model.cuda()
            player.state = player.state.cuda()
    flag = True
    max_score = 0
    while True:
        if flag:
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.model.load_state_dict(shared_model.state_dict())
            else:
                player.model.load_state_dict(shared_model.state_dict())
            player.model.eval()
            flag = False

        player.action_test()
        reward_sum += player.reward
        """
        if player.done and player.info['ale.lives'] > 0 and not player.max_length:
            state = player.env.reset()
            player.eps_len += 2
            player.state = torch.from_numpy(state).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()
        """
        if player.done or player.max_length:
            flag = True
            num_tests += 1
            reward_total_sum += reward_sum
            reward_mean = reward_total_sum / num_tests
            log['{}_log'.format(args.env)].info(
                "Time {0}, episode reward {1}, episode length {2}, reward mean {3:.4f}".format(
                    time.strftime("%Hh %Mm %Ss",
                                  time.gmtime(time.time() - start_time)),
                    reward_sum, player.eps_len, reward_mean))

            if args.save_max and reward_sum >= max_score:
                max_score = reward_sum
                if gpu_id >= 0:
                    with torch.cuda.device(gpu_id):
                        state_to_save = player.model.state_dict()
                        torch.save(state_to_save,
                                   '{0}{1}.dat'.format(args.save_model_dir, args.env))
                else:
                    state_to_save = player.model.state_dict()
                    torch.save(state_to_save,
                               '{0}{1}.dat'.format(args.save_model_dir, args.env))

            reward_sum = 0
            player.eps_len = 0
            state = player.env.reset()
            player.eps_len += 2
            time.sleep(10)
            player.state = torch.from_numpy(state).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()
#!/usr/bin/env python
from sonic_util import make_local_env
from argparse import ArgumentParser
import os
import torch
import json

parser = ArgumentParser(prog="test_sonic",
                        description="Test a trained model on the Sonic retro gym env")
parser.add_argument("--model-path",
                    default="./trained_models/ppo/Sonic-Genesis-mixed-Train_mean1500_max6k.pt",
                    help="Path to the pytorch agent model file",
                    metavar="MODELPATH")
parser.add_argument("--env-config",
                    default="sonic_config.json",
                    help="Path to the env config json file",
                    metavar="ENVCONFIGFILE")
args = parser.parse_args()

if os.path.exists(args.model_path):
    agent_policy, obs = torch.load(args.model_path)

env_confs = json.load(open(args.env_config, 'r'))
test_env_conf = env_confs['Test']
test_envs = [v for _, v in test_env_conf.items()]
print("test_envs:", test_envs)

# Step 1: Test the agent against 1 env
# Step 2: Test the agent against all the test envs
test_env = test_envs[0]
env = make_local_env(game=test_env['game'], state=test_env['level'])
obs = env.reset()
env.render('human')
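# Minimal rollout sketch (not in the original script): it only exercises the
# environment loop so the window above actually animates. Action selection via
# agent_policy is omitted because its interface is not shown in this excerpt;
# env.action_space.sample() is a stand-in for the policy's action.
done = False
while not done:
    action = env.action_space.sample()  # placeholder for agent_policy's chosen action
    obs, reward, done, info = env.step(action)
    env.render('human')
env.close()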
saved_state = torch.load('{0}{1}.dat'.format(args.load_model_dir, args.env),
                         map_location=lambda storage, loc: storage)

log = {}
setup_logger('{}_mon_log'.format(args.env),
             r'{0}{1}_mon_log'.format(args.log_dir, args.env))
log['{}_mon_log'.format(args.env)] = logging.getLogger('{}_mon_log'.format(args.env))
d_args = vars(args)
for k in d_args.keys():
    log['{}_mon_log'.format(args.env)].info('{0}: {1}'.format(k, d_args[k]))

env = make_local_env(env_conf["game"], env_conf["level"], stack=False, scale_rew=False)

num_tests = 0
reward_total_sum = 0
player = Agent(None, env, args, None)
player.model = A3Clstm(player.env.observation_space.shape[0],
                       player.env.action_space)
player.gpu_id = gpu_id
if gpu_id >= 0:
    with torch.cuda.device(gpu_id):
        player.model = player.model.cuda()
if args.new_gym_eval:
    player.env = gym.wrappers.Monitor(player.env,
                                      "{}_monitor".format(args.env),
                                      force=True)
                    help='number of frames to stack (default: 4)')
parser.add_argument('--model-path',
                    default='./trained_models/ppo/Sonic-GHZA1.pt',
                    help='Path to the agent Policy to be loaded (default: ./trained_models/ppo/Sonic-GHZA1.pt)')
parser.add_argument('--add-timestep', action='store_true', default=False,
                    help='add timestep to observations')
parser.add_argument('--num-episodes', type=int, default=100,
                    help="Number of episodes to test/run the agent for")
parser.add_argument('--log-dir', type=str, default='logs',
                    help='Log directory to store the tensorboard summary files')
# summary_file_path_prefix =
writer = SummaryWriter()
args = parser.parse_args()

env = make_local_env(game='SonicTheHedgehog-Genesis', state='GreenHillZone.Act1',
                     stack=False, scale_rew=False)
actor_critic, ob_rms, saved_rew = torch.load(args.model_path)
print("Loaded Policy that got a mean reward of:", saved_rew)
render_func = env.render

obs_shape = env.observation_space.shape
obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])
current_obs = torch.zeros(1, *obs_shape)
states = torch.zeros(1, actor_critic.state_size)
masks = torch.zeros(1, 1)


def update_current_obs(obs):
    shape_dim0 = env.observation_space.shape[0]
gamma = 0.99
lam = 0.95
cliprange = 0.2
vf_coef = 0.5
max_grad_norm = 0.5  # ratio of sum of norms, for clipping gradients
nbatch_act = 1  # number of envs
nminibatches = 4
lr = 2e-4  # learning rate
noptepochs = 4
nbatch_train = horizon // nminibatches  # number of training batches

config = tf.ConfigProto()
config.gpu_options.allow_growth = True

env = make_local_env(game='SonicTheHedgehog-Genesis', state='GreenHillZone.Act1',
                     stack=True, scale_rew=True)
env.close()

train_data = pd.read_csv('../sonic-train.csv')
levels = []
for index, level in train_data.iterrows():
    levels.append((level.game, level.state))

with tf.Session(config=config):
    model = ppo2.Model(policy=policies.CnnPolicy,
                       ob_space=env.observation_space,
                       ac_space=env.action_space,
                       nbatch_act=nbatch_act,
                       nsteps=steps_per_ep,
                       nbatch_train=nbatch_train,