def __init__(self, model_path, dtype, seed=451):
    self._seed = seed
    self._idx = 0
    self._dtype = dtype
    self.env = LoveLetterEnv(AgentRandom(seed), seed)
    state = self.env.reset()
    self._model = ActorCritic(
        state.shape[0], self.env.action_space).type(dtype)
    if torch.cuda.is_available():
        self._model.load_state_dict(torch.load(model_path))
    else:
        # Remap CUDA-saved tensors onto the CPU when no GPU is present.
        self._model.load_state_dict(
            torch.load(model_path, map_location={'cuda:0': 'cpu'}))
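A single call with a map_location hook covers both branches: torch.load can deserialize CUDA-saved storages onto the CPU, and load_state_dict then copies the values into the model's parameters wherever they live. A minimal sketch; the helper name load_checkpoint is ours, not the repository's:

import torch


def load_checkpoint(model, model_path):
    """Load a saved state dict into `model` on CPU-only or CUDA machines."""
    # Deserialize onto the CPU regardless of where the checkpoint was saved;
    # load_state_dict copies the tensors into the model's existing parameters.
    state_dict = torch.load(
        model_path, map_location=lambda storage, loc: storage)
    model.load_state_dict(state_dict)
    return model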
class AgentA3C(Agent):
    '''Agent which leverages Actor Critic Learning'''

    def __init__(self, model_path, dtype, seed=451):
        self._seed = seed
        self._idx = 0
        self._dtype = dtype
        self.env = LoveLetterEnv(AgentRandom(seed), seed)
        state = self.env.reset()
        self._model = ActorCritic(
            state.shape[0], self.env.action_space).type(dtype)
        self._model.load_state_dict(torch.load(model_path))

    def _move(self, game):
        '''Return the network's chosen action for the current game state'''
        assert game.active()
        self._idx += 1

        state = self.env.force(game)
        state = torch.from_numpy(state).type(self._dtype)
        cx = Variable(torch.zeros(1, 256).type(self._dtype), volatile=True)
        hx = Variable(torch.zeros(1, 256).type(self._dtype), volatile=True)

        _, logit, (hx, cx) = self._model(
            (Variable(state.unsqueeze(0), volatile=True), (hx, cx)))
        prob = F.softmax(logit)
        action_idx = prob.max(1)[1].data.cpu().numpy()[0, 0]
        player_action = self.env.action_from_index(action_idx, game)

        if player_action is None:
            # The network picked a move that is illegal in this state;
            # fall back to a seeded random choice among the valid actions.
            options = Agent.valid_actions(game, self._seed + self._idx)
            if len(options) < 1:
                raise Exception("Unable to play without actions")
            random.seed(self._seed + self._idx)
            return random.choice(options)

        return player_action
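A trained checkpoint can be sanity-checked the same way the test loop below does it, by pitting the agent against the random baseline through Arena.compare_agents_float. A minimal sketch, assuming AgentA3C, AgentRandom, and Arena are imported from the project and that models/a3c_max is a placeholder checkpoint path:

import torch

dtype = torch.cuda.FloatTensor if torch.cuda.is_available() \
    else torch.FloatTensor

# Play 800 games against the random agent; the result is the win rate.
win_rate = Arena.compare_agents_float(
    lambda seed: AgentA3C("models/a3c_max", dtype, seed),
    lambda seed: AgentRandom(seed),
    800)
print("win rate vs AgentRandom: {:.1f}%".format(win_rate * 100))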
class TFAgent(Agent):
    '''Agent which leverages a stable-baselines PPO2 policy'''

    def __init__(self, model_path, seed=451):
        self._seed = seed
        self._idx = 0
        self.env = LoveLetterEnv(AgentRandom(seed), seed)
        # The algorithms require a vectorized environment to run
        self.vec_env = DummyVecEnv([lambda: self.env])
        state = self.env.reset()
        # PPO2.load is a classmethod that returns a new model, so its result
        # must be assigned; calling load() on a freshly built instance would
        # discard the restored weights.
        self._model = PPO2.load(model_path, env=self.vec_env)

    def _move(self, game):
        '''Return the policy's chosen action for the current game state'''
        assert game.active()
        self._idx += 1

        state = self.env.force(game)
        # predict() returns (action, lstm_state); only the action is needed.
        action_idx = int(self._model.predict(state, deterministic=True)[0])
        player_action = self.env.action_from_index(action_idx, game)

        if player_action is None:
            # The policy picked a move that is illegal in this state;
            # fall back to a seeded random choice among the valid actions.
            options = Agent.valid_actions(game, self._seed + self._idx)
            if len(options) < 1:
                raise Exception("Unable to play without actions")
            random.seed(self._seed + self._idx)
            return random.choice(options)

        return player_action
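TFAgent expects a path previously written by PPO2's save method. A short sketch of the hand-off, assuming a model trained by the PPO2 script at the end of this section and using ppo2_loveletter as a placeholder path:

model.save("ppo2_loveletter")  # placeholder path for the trained PPO2 model

win_rate = Arena.compare_agents_float(
    lambda seed: TFAgent("ppo2_loveletter", seed),
    lambda seed: AgentRandom(seed),
    800)
print("win rate vs AgentRandom: {:.1f}%".format(win_rate * 100))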
    help='path/prefix for the filename to save shared model\'s parameters')
parser.add_argument(
    '--load-name',
    default=None,
    metavar='SN',
    help='path/prefix for the filename to load shared model\'s parameters')

if __name__ == '__main__':
    args = parser.parse_args()
    torch.manual_seed(args.seed)

    dtype = torch.cuda.FloatTensor if torch.cuda.is_available() \
        else torch.FloatTensor

    env = LoveLetterEnv(AgentRandom(args.seed), args.seed)
    state = env.reset()

    shared_model = ActorCritic(state.shape[0], env.action_space).type(dtype)
    if args.load_name is not None:
        shared_model.load_state_dict(torch.load(args.load_name))
    shared_model.share_memory()

    # train(1, args, shared_model, dtype)

    processes = []

    p = mp.Process(target=test,
                   args=(args.num_processes, args, shared_model, dtype))
    p.start()
    processes.append(p)

    if not args.evaluate:
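The excerpt breaks off inside the `if not args.evaluate:` branch. A sketch of the conventional A3C continuation, mirroring the `test` spawn above rather than anything taken from the original file: start one `train` worker per process, then join everything.

        for rank in range(args.num_processes):
            p = mp.Process(target=train,
                           args=(rank, args, shared_model, dtype))
            p.start()
            processes.append(p)

    for p in processes:
        p.join()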
def test(rank, args, shared_model, dtype):
    test_ctr = 0
    torch.manual_seed(args.seed + rank)

    # set up logger
    timestring = str(date.today()) + '_' + \
        time.strftime("%Hh-%Mm-%Ss", time.localtime(time.time()))
    run_name = args.save_name + '_' + timestring
    configure("logs/run_" + run_name, flush_secs=5)

    env = LoveLetterEnv(AgentRandom(args.seed + rank), args.seed + rank)
    env.seed(args.seed + rank)

    state = env.reset()
    model = ActorCritic(state.shape[0], env.action_space).type(dtype)
    model.eval()

    state = torch.from_numpy(state).type(dtype)
    reward_sum = 0
    max_reward = -99999999
    max_winrate = 0
    rewards_recent = deque([], 100)
    done = True

    start_time = time.time()

    episode_length = 0
    while True:
        episode_length += 1
        # Sync with the shared model
        if done:
            model.load_state_dict(shared_model.state_dict())
            cx = Variable(torch.zeros(1, 256).type(dtype), volatile=True)
            hx = Variable(torch.zeros(1, 256).type(dtype), volatile=True)
        else:
            cx = Variable(cx.data.type(dtype), volatile=True)
            hx = Variable(hx.data.type(dtype), volatile=True)

        value, logit, (hx, cx) = model(
            (Variable(state.unsqueeze(0), volatile=True), (hx, cx)))
        prob = F.softmax(logit)
        action = prob.max(1)[1].data.cpu().numpy()

        state, reward, done, _ = env.step(action[0, 0])
        done = done or episode_length >= args.max_episode_length
        reward_sum += reward

        if done:
            rewards_recent.append(reward_sum)
            rewards_recent_avg = sum(rewards_recent) / len(rewards_recent)
            print("{} | Episode Reward {: >4}, Length {: >2} | Avg Reward {:0.2f}".format(
                time.strftime("%Hh %Mm %Ss",
                              time.gmtime(time.time() - start_time)),
                reward_sum, episode_length, rewards_recent_avg))

            log_value('Reward', reward_sum, test_ctr)
            log_value('Reward Average', rewards_recent_avg, test_ctr)
            log_value('Episode length', episode_length, test_ctr)

            if reward_sum >= max_reward:
                path_output = args.save_name + '_max'
                torch.save(shared_model.state_dict(), path_output)
                path_now = "{}_{}".format(
                    args.save_name, datetime.datetime.now().isoformat())
                torch.save(shared_model.state_dict(), path_now)
                max_reward = reward_sum

                win_rate_v_random = Arena.compare_agents_float(
                    lambda seed: AgentA3C(path_output, dtype, seed),
                    lambda seed: AgentRandom(seed), 800)
                msg = " {} | VsRandom: {: >4}%".format(
                    datetime.datetime.now().strftime("%c"),
                    round(win_rate_v_random * 100, 2))
                print(msg)
                log_value('Win Rate vs Random', win_rate_v_random, test_ctr)

                if win_rate_v_random > max_winrate:
                    print("Found superior model at {}".format(
                        datetime.datetime.now().isoformat()))
                    torch.save(
                        shared_model.state_dict(),
                        "{}_{}_best_{}".format(
                            args.save_name,
                            datetime.datetime.now().isoformat(),
                            win_rate_v_random))
                    max_winrate = win_rate_v_random

            reward_sum = 0
            episode_length = 0
            state = env.reset()
            test_ctr += 1

            if test_ctr % 10 == 0 and not args.evaluate:
                torch.save(shared_model.state_dict(), args.save_name)

            if not args.evaluate:
                time.sleep(60)
            elif test_ctr == evaluation_episodes:
                # Ensure the environment is closed so we can complete the
                # submission
                env.close()
                # gym.upload('monitor/' + run_name, api_key=api_key)

        state = torch.from_numpy(state).type(dtype)
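test (and train below) only read a handful of attributes from args, so a small namespace is enough for a standalone smoke run; when evaluate is true, test additionally expects a module-level evaluation_episodes. Every value below is an assumption, not a repository default:

from argparse import Namespace

args = Namespace(
    seed=451,
    save_name="models/a3c",      # checkpoint path prefix (assumed)
    load_name=None,
    evaluate=False,
    num_processes=4,
    max_episode_length=100,      # assumed cap on steps per episode
    num_steps=20,                # rollout length per update (train only)
    lr=1e-4,                     # Adam learning rate (train only)
    gamma=0.99,                  # discount factor (train only)
    tau=1.00,                    # GAE parameter (train only)
    beta=0.01,                   # entropy bonus weight (train only)
)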
def train(rank, args, shared_model, dtype):
    torch.manual_seed(args.seed + rank)

    env = LoveLetterEnv(AgentRandom(args.seed + rank), args.seed + rank)
    env.seed(args.seed + rank)

    state = env.reset()
    model = ActorCritic(state.shape[0], env.action_space).type(dtype)
    optimizer = optim.Adam(shared_model.parameters(), lr=args.lr)

    model.train()

    values = []
    log_probs = []

    state = torch.from_numpy(state).type(dtype)
    done = True

    episode_length = 0
    while True:
        episode_length += 1
        # Sync with the shared model
        model.load_state_dict(shared_model.state_dict())
        if done:
            cx = Variable(torch.zeros(1, 256).type(dtype))
            hx = Variable(torch.zeros(1, 256).type(dtype))
        else:
            cx = Variable(cx.data.type(dtype))
            hx = Variable(hx.data.type(dtype))

        values = []
        log_probs = []
        rewards = []
        entropies = []

        for step in range(args.num_steps):
            value, logit, (hx, cx) = model(
                (Variable(state.unsqueeze(0)), (hx, cx)))
            prob = F.softmax(logit)
            log_prob = F.log_softmax(logit)
            entropy = -(log_prob * prob).sum(1)
            entropies.append(entropy)

            action = prob.multinomial().data
            log_prob = log_prob.gather(1, Variable(action))

            state, reward, done, _ = env.step(action.cpu().numpy()[0][0])
            done = done or episode_length >= args.max_episode_length

            if done:
                episode_length = 0
                state = env.reset()

            state = torch.from_numpy(state).type(dtype)
            values.append(value)
            log_probs.append(log_prob)
            rewards.append(reward)

            if done:
                break

        R = torch.zeros(1, 1).type(dtype)
        if not done:
            value, _, _ = model((Variable(state.unsqueeze(0)), (hx, cx)))
            R = value.data

        values.append(Variable(R))
        policy_loss = 0
        value_loss = 0
        R = Variable(R)
        gae = torch.zeros(1, 1).type(dtype)
        for i in reversed(range(len(rewards))):
            R = args.gamma * R + rewards[i]
            advantage = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = rewards[i] + args.gamma * \
                values[i + 1].data - values[i].data
            gae = gae * args.gamma * args.tau + delta_t

            policy_loss = policy_loss - \
                log_probs[i] * Variable(gae) - args.beta * entropies[i]

        optimizer.zero_grad()

        (policy_loss + 0.5 * value_loss).backward()
        torch.nn.utils.clip_grad_norm(model.parameters(), 40)

        ensure_shared_grads(model, shared_model)
        optimizer.step()
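The backward loop above interleaves two recursions: the discounted n-step return R feeds the value loss, and the GAE term feeds the policy loss. A framework-free sketch of the same arithmetic on plain floats may make the bookkeeping easier to follow; the function name and the example numbers are ours, not the repository's:

def gae_rollout(rewards, values, bootstrap, gamma=0.99, tau=1.00):
    """Return (n-step returns, GAE advantages) for one rollout.

    `values` holds V(s_t) for every step taken; `bootstrap` is V(s_T) for
    the state after the last step, or 0.0 if the episode terminated.
    """
    values = list(values) + [bootstrap]
    R, gae = bootstrap, 0.0
    returns, advantages = [], []
    for t in reversed(range(len(rewards))):
        R = gamma * R + rewards[t]                               # n-step return
        delta = rewards[t] + gamma * values[t + 1] - values[t]   # TD residual
        gae = gamma * tau * gae + delta
        returns.append(R)
        advantages.append(gae)
    return returns[::-1], advantages[::-1]


# Three steps, episode still running (so the critic's value bootstraps R):
rets, advs = gae_rollout([0.0, 0.0, 1.0], [0.2, 0.3, 0.5], bootstrap=0.4)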
class CustomPolicy(FeedForwardPolicy):
    def __init__(self, *args, **kwargs):
        super(CustomPolicy, self).__init__(
            *args, **kwargs,
            net_arch=[512, dict(pi=[256, 128], vf=[256, 128])],
            feature_extraction="mlp")


if __name__ == '__main__':
    args = parser.parse_args()

    # Bind i per worker via a default argument; a bare `lambda:` would close
    # over the loop variable and give every environment the same seed.
    if args.load_name:
        env = SubprocVecEnv([
            (lambda i=i: LoveLetterEnv(TFAgent(args.load_name, args.seed + i)))
            for i in range(args.num_processes)
        ])
    else:
        env = SubprocVecEnv([
            (lambda i=i: LoveLetterEnv(AgentRandom(args.seed + i)))
            for i in range(args.num_processes)
        ])

    model = PPO2(CustomPolicy, env, verbose=0,
                 tensorboard_log=args.log_dir,
                 learning_rate=args.lr,
                 n_steps=args.num_steps,
                 nminibatches=5)
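The script is cut off right after the model is constructed. A minimal sketch of the usual stable-baselines continuation; the timestep budget and save path are assumptions, with the saved file being what TFAgent above expects to load:

    model.learn(total_timesteps=int(1e7))   # assumed training budget
    model.save("ppo2_loveletter")           # placeholder path, see TFAgent above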