def __init__(self, c: Configs):
    self.c = c
    # total number of samples for a single update
    self.envs = self.c.n_workers * self.c.env_per_worker
    self.batch_size = self.envs * self.c.worker_steps
    assert (self.batch_size % self.c.mini_batch_size == 0)

    # #### Initialize
    # create workers
    self.workers = [Worker(27 + i, c.env_per_worker) for i in range(self.c.n_workers)]
    self.score_queue = SortedQueue(400)
    # initialize tensors for observations
    self.obs = np.zeros((self.envs, *kTensorDim))
    for worker in self.workers:
        worker.child.send(("reset", None))
    for w, worker in enumerate(self.workers):
        self.obs[self.w_range(w)] = worker.child.recv()
    self.obs = obs_to_torch(self.obs)
    # model for sampling
    self.model = Model(c.channels, c.blocks).to(device)
    # optimizer
    self.scaler = GradScaler()
    self.optimizer = optim.Adam(self.model.parameters(),
                                lr=self.c.lr, weight_decay=self.c.reg_l2)
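# `w_range` is used above to map a worker index to the slice of environments it
# owns, but its definition is not part of this fragment. A minimal sketch, assuming
# environments are assigned contiguously with `env_per_worker` environments per
# worker (hypothetical implementation, not the original one):
def w_range(self, w: int) -> slice:
    # environments [w * env_per_worker, (w + 1) * env_per_worker) belong to worker w
    return slice(w * self.c.env_per_worker, (w + 1) * self.c.env_per_worker)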
def Main(model_path):
    c = Configs()
    model = Model(c.channels, c.blocks).to(device)
    if model_path is None:
        model_path = os.path.join(os.path.dirname(sys.argv[0]), 'models/model.pth')
    if model_path[-3:] == 'pkl':
        model.load_state_dict(torch.load(model_path)[0].state_dict())
    else:
        model.load_state_dict(torch.load(model_path))
    model.eval()

    batch_size = 100
    n = 2000
    games = [tetris.Tetris(GetSeed(i)) for i in range(batch_size)]
    for i in games:
        ResetGame(i)
    started = batch_size
    results = []
    rewards = [0. for i in range(batch_size)]
    is_running = [True for i in range(batch_size)]
    while len(results) < n:
        # query the model only for games that are still running
        states = [i.GetState() for i, j in zip(games, is_running) if j]
        states = obs_to_torch(np.stack(states), device)
        pi = model(states, False)[0]
        pi = torch.argmax(pi, 1)
        j = 0
        for i in range(batch_size):
            if not is_running[i]:
                continue
            action = pi[j].item()
            j += 1
            # decode the flat action index into (rotation, row, column)
            r, x, y = action // 200, action // 10 % 20, action % 10
            rewards[i] += games[i].InputPlacement(r, x, y)[1]
            if games[i].IsOver():
                results.append((games[i].GetScore(), games[i].GetLines()))
                rewards[i] = 0.
                if started < n:
                    # start a new game in this slot until n games have been started
                    games[i] = tetris.Tetris(GetSeed(i))
                    ResetGame(games[i])
                    started += 1
                else:
                    is_running[i] = False
                if len(results) % 200 == 0:
                    print(len(results), '/', n, 'games finished')
    # fraction of games reaching each score threshold
    s = list(reversed(sorted([i[0] for i in results])))
    for i in range(len(s) - 1):
        for t in range(2000000, 700000, -50000):
            if s[i] >= t and s[i+1] < t:
                print(t, (i + 1) / n)
    # fraction of games reaching each line-count threshold
    s = list(reversed(sorted([i[1] for i in results])))
    for i in range(len(s) - 1):
        for t in range(350, 150, -10):
            if s[i] >= t and s[i+1] < t:
                print(t, (i + 1) / n)
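# Hypothetical entry point for the evaluation routine above (not part of the
# original fragment): pass a model path on the command line, or fall back to the
# default `models/model.pth` resolved inside `Main`.
if __name__ == '__main__':
    Main(sys.argv[1] if len(sys.argv) > 1 else None)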
model_path = sys.argv[1] if file_given else os.path.join(
    os.path.dirname(sys.argv[0]), 'models/model.pth')
if model_path[-3:] == 'pkl':
    model.load_state_dict(torch.load(model_path)[0].state_dict())
else:
    model.load_state_dict(torch.load(model_path))
model.eval()

envs = [Game(i + start_seed) for i in range(kEnvs)]
finished = [False for i in range(kEnvs)]
score = [0. for i in range(kEnvs)]
while not all(finished):
    obs = []
    for i in envs:
        obs.append(i.obs)
    with torch.no_grad():
        obs = obs_to_torch(np.stack(obs))
        pi = model(obs)[0]
        act = torch.argmax(pi.probs, 1).cpu().numpy()
        # act = pi.sample().cpu().numpy()
    # decode the flat action index into (row, column)
    x, y = act // kW, act % kW
    tb = []
    for i in range(kEnvs):
        if finished[i]:
            continue
        _, _, over, info = envs[i].step((x[i], y[i]))
        if over:
            score[i] = info['score']
            finished[i] = True

# print how many games ended with each final score
score = sorted(list(dict(Counter(score)).items()))
for i, j in score:
    print(i, j)
# score = [(i, j) for j, i in enumerate(score)]
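# Note: `file_given`, `start_seed`, `kEnvs` and `kW` are assumed to be defined
# earlier in the original script; a plausible setup (assumptions, not the
# original values) would be:
#   file_given = len(sys.argv) > 1   # whether a model path was given on the command line
#   start_seed = 0                   # seed offset for the evaluation games
#   kEnvs = 100                      # number of games evaluated in parallel
#   kW = 10                          # board width, used to decode the flat action index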
def __init__(self, c: Configs, name: str):
    self.name = name
    self.c = c
    # total number of samples for a single update
    self.envs = self.c.n_workers * self.c.env_per_worker
    self.batch_size = self.envs * self.c.worker_steps
    assert (self.batch_size % (self.c.n_update_per_epoch * self.c.mini_batch_size) == 0)
    self.update_batch_size = self.batch_size // self.c.n_update_per_epoch

    # #### Initialize
    self.total_games = 0
    # model for sampling
    self.model = Model(c.channels, c.blocks).to(device)
    # dynamic hyperparams
    self.cur_lr = self.c.lr()
    self.cur_reg_l2 = self.c.reg_l2()
    self.cur_step_reward = 0.
    self.cur_right_gain = 0.
    self.cur_fix_prob = 0.
    self.cur_neg_mul = 0.
    self.cur_entropy_weight = self.c.entropy_weight()
    self.cur_prob_reg_weight = self.c.prob_reg_weight()
    self.cur_target_prob_weight = self.c.target_prob_weight()
    self.cur_gamma = self.c.gamma()
    self.cur_lamda = self.c.lamda()
    # optimizer
    self.scaler = GradScaler()
    self.optimizer = optim.Adam(self.model.parameters(),
                                lr=self.cur_lr, weight_decay=self.cur_reg_l2)

    # initialize shared-memory tensors for observations, rewards and done flags
    shapes = [(self.envs, *kTensorDim),
              (self.envs, self.c.worker_steps, 3),
              (self.envs, self.c.worker_steps)]
    types = [np.dtype('float32'), np.dtype('float32'), np.dtype('bool')]
    self.shms = [
        shared_memory.SharedMemory(create=True, size=math.prod(shape) * typ.itemsize)
        for shape, typ in zip(shapes, types)
    ]
    self.obs_np, self.rewards, self.done = [
        np.ndarray(shape, dtype=typ, buffer=shm.buf)
        for shm, shape, typ in zip(self.shms, shapes, types)
    ]
    # create workers
    shm = [(shm.name, shape, typ) for shm, shape, typ in zip(self.shms, shapes, types)]
    self.workers = [
        Worker(name, shm, self.w_range(i), 27 + i) for i in range(self.c.n_workers)
    ]
    self.set_game_param(self.c.right_gain(), self.c.fix_prob(),
                        self.c.neg_mul(), self.c.step_reward())
    for i in self.workers:
        i.child.send(('reset', None))
    for i in self.workers:
        i.child.recv()
    self.obs = obs_to_torch(self.obs_np, device)
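# The `Worker` class is not shown in this fragment. As an illustration of the
# shared-memory handoff above, a child process could attach to the buffers it
# receives as (name, shape, dtype) triples roughly like this (sketch only,
# assuming the same `shared_memory` and `numpy` imports as the trainer):
def attach_shared_buffers(shm_descs):
    handles, arrays = [], []
    for shm_name, shape, typ in shm_descs:
        handle = shared_memory.SharedMemory(name=shm_name)
        handles.append(handle)  # keep the handle alive so the buffer stays mapped
        arrays.append(np.ndarray(shape, dtype=typ, buffer=handle.buf))
    return handles, arrays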
def sample(self, train=True) -> Dict[str, torch.Tensor]:
    """### Sample data with current policy"""
    actions = torch.zeros((self.envs, self.c.worker_steps), dtype=torch.int32, device=device)
    obs = torch.zeros((self.envs, self.c.worker_steps, *kTensorDim), dtype=torch.float32, device=device)
    log_pis = torch.zeros((self.envs, self.c.worker_steps), dtype=torch.float32, device=device)
    values = torch.zeros((self.envs, self.c.worker_steps, 3), dtype=torch.float32, device=device)

    # sample `worker_steps` from each worker
    tot_lines = 0
    tot_score = 0
    for t in range(self.c.worker_steps):
        with torch.no_grad():
            # `self.obs` keeps track of the last observation from each worker,
            # which is the input for the model to sample the next action
            obs[:, t] = self.obs
            # sample actions from $\pi_{\theta_{OLD}}$
            pi, v = self.model(self.obs)
            values[:, t] = v
            a = pi.sample()
            actions[:, t] = a
            log_pis[:, t] = pi.log_prob(a)
            actions_cpu = a.cpu().numpy()

        # run sampled actions on each worker;
        # workers will place results in self.obs_np, self.rewards and self.done
        for w, worker in enumerate(self.workers):
            worker.child.send(('step', (t, actions_cpu[self.w_range(w)], tracker.get_global_step())))
        for i in self.workers:
            info_arr = i.child.recv()
            # collect episode info, which is available if an episode finished
            if train:
                self.total_games += len(info_arr)
                for info in info_arr:
                    tot_lines += info['lines']
                    tot_score += info['score']
                    tracker.add('reward', info['reward'])
                    tracker.add('scorek', info['score'] * 1e-3)
                    tracker.add('lines', info['lines'])
                    tracker.add('length', info['length'])

        self.obs = obs_to_torch(self.obs_np, device)

    # log reward statistics
    reward_max = self.rewards[:, :, 0].max()
    if train:
        tracker.add('maxk', reward_max / 1e-2)
        tracker.add('mil_games', self.total_games * 1e-6)
        tracker.add('perline', tot_score * 1e-3 / tot_lines)

    # calculate advantages
    advantages = self._calc_advantages(self.done, self.rewards, values)
    samples = {
        'obs': obs,
        'actions': actions,
        'values': values,
        'log_pis': log_pis,
        'advantages': advantages
    }

    # samples are currently in [workers, time] table, flatten it
    for i in samples:
        samples[i] = samples[i].view(-1, *samples[i].shape[2:])
    return samples
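# The optimization step that consumes `samples` is outside this fragment. For
# reference, a minimal PPO-style clipped-surrogate loss over one mini-batch might
# look like the sketch below. This is a simplification that assumes scalar value
# and advantage heads and a hypothetical `clip_range`; the real trainer uses
# three value channels, GradScaler and the mini-batch sizes configured in __init__.
def _ppo_loss_sketch(self, mb, clip_range=0.2):
    pi, value = self.model(mb['obs'])
    log_pi = pi.log_prob(mb['actions'])
    ratio = torch.exp(log_pi - mb['log_pis'])           # pi_theta / pi_theta_old
    adv = mb['advantages']
    adv = (adv - adv.mean()) / (adv.std() + 1e-8)       # normalize advantages
    clipped = torch.clamp(ratio, 1. - clip_range, 1. + clip_range)
    policy_loss = -torch.min(ratio * adv, clipped * adv).mean()
    returns = mb['values'] + mb['advantages']           # GAE returns
    value_loss = 0.5 * (value - returns).pow(2).mean()
    entropy_bonus = pi.entropy().mean()
    return policy_loss + value_loss - self.cur_entropy_weight * entropy_bonus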
def GetTorch(game):
    # convert a single game state into a batched tensor of shape (1, *kTensorDim)
    return obs_to_torch(game.env.GetState(), device).unsqueeze(0)
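# `obs_to_torch` is used throughout these fragments but not defined here. A
# minimal sketch, assuming it simply moves a numpy observation onto the target
# device as float32 (the actual helper may differ, e.g. in dtype handling; some
# call sites pass `device` explicitly, others rely on the module-level default):
def obs_to_torch(obs: np.ndarray, device=device) -> torch.Tensor:
    return torch.as_tensor(obs, dtype=torch.float32, device=device)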
def sample(self) -> Dict[str, torch.Tensor]:
    """### Sample data with current policy"""
    rewards = np.zeros((self.envs, self.c.worker_steps), dtype=np.float16)
    done = np.zeros((self.envs, self.c.worker_steps), dtype=np.bool_)
    actions = torch.zeros((self.envs, self.c.worker_steps), dtype=torch.int32, device=device)
    obs = torch.zeros((self.envs, self.c.worker_steps, *kTensorDim), dtype=torch.uint8, device=device)
    log_pis = torch.zeros((self.envs, self.c.worker_steps), dtype=torch.float16, device=device)
    values = torch.zeros((self.envs, self.c.worker_steps), dtype=torch.float16, device=device)

    # sample `worker_steps` from each worker
    for t in range(self.c.worker_steps):
        with torch.no_grad():
            # `self.obs` keeps track of the last observation from each worker,
            # which is the input for the model to sample the next action
            obs[:, t] = self.obs
            # sample actions from $\pi_{\theta_{OLD}}$ for each worker;
            # this returns arrays of size `n_workers`
            pi, v = self.model(self.obs)
            values[:, t] = v
            a = pi.sample()
            actions[:, t] = a
            log_pis[:, t] = pi.log_prob(a)

        # run sampled actions on each worker
        for w, worker in enumerate(self.workers):
            worker.child.send(("step", actions[self.w_range(w), t].cpu().numpy()))

        self.obs = np.zeros((self.envs, *kTensorDim))
        for w, worker in enumerate(self.workers):
            # get results after executing the actions
            now = self.w_range(w)
            self.obs[now], rewards[now, t], done[now, t], info_arr = worker.child.recv()
            # collect episode info, which is available if an episode finished;
            # this includes the total reward and length of the episode -
            # look at `Game` to see how it works.
            # We also add a game frame to it for monitoring.
            for info in info_arr:
                if not info:
                    continue
                self.score_queue.add(info['score'])
                tracker.add('reward', info['reward'])
                tracker.add('score', info['score'])
                tracker.add('score_per01', self.score_queue.get_ratio(0.01))
                tracker.add('score_per10', self.score_queue.get_ratio(0.1))
                tracker.add('score_per50', self.score_queue.get_ratio(0.5))
                tracker.add('score_per90', self.score_queue.get_ratio(0.9))
                tracker.add('score_per99', self.score_queue.get_ratio(0.99))
                tracker.add('length', info['length'])

        self.obs = obs_to_torch(self.obs)

    # calculate advantages
    advantages = self._calc_advantages(done, rewards, values)
    samples = {
        'obs': obs,
        'actions': actions,
        'values': values,
        'log_pis': log_pis,
        'advantages': advantages
    }

    # samples are currently in [workers, time] table,
    # we should flatten it
    for i in samples:
        samples[i] = samples[i].view(-1, *samples[i].shape[2:])
    return samples
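# `_calc_advantages` is referenced above but not included in this fragment. A
# minimal GAE(lambda) sketch for the scalar-value case of this `sample` version,
# assuming `self.c.gamma` and `self.c.lamda` hold the discount factors (the
# actual implementation may differ, e.g. for multi-channel rewards):
def _calc_advantages(self, done: np.ndarray, rewards: np.ndarray,
                     values: torch.Tensor) -> torch.Tensor:
    with torch.no_grad():
        rewards = torch.tensor(rewards, dtype=torch.float32, device=device)
        not_done = torch.tensor(~done, dtype=torch.float32, device=device)
        values = values.float()
        advantages = torch.zeros_like(values)
        last_advantage = torch.zeros(self.envs, dtype=torch.float32, device=device)
        # bootstrap from the value of the observation after the last sampled step
        _, last_value = self.model(self.obs)
        last_value = last_value.float()
        for t in reversed(range(self.c.worker_steps)):
            # zero out values and advantages carried across episode boundaries
            mask = not_done[:, t]
            last_value = last_value * mask
            last_advantage = last_advantage * mask
            delta = rewards[:, t] + self.c.gamma * last_value - values[:, t]
            last_advantage = delta + self.c.gamma * self.c.lamda * last_advantage
            advantages[:, t] = last_advantage
            last_value = values[:, t]
        return advantages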