Example 1
    def __init__(self, c: Configs):
        self.c = c
        # total number of samples for a single update
        self.envs = self.c.n_workers * self.c.env_per_worker
        self.batch_size = self.envs * self.c.worker_steps
        assert (self.batch_size % self.c.mini_batch_size == 0)

        # #### Initialize
        # create workers
        self.workers = [Worker(27 + i, c.env_per_worker) for i in range(self.c.n_workers)]
        self.score_queue = SortedQueue(400)

        # initialize tensors for observations
        self.obs = np.zeros((self.envs, *kTensorDim))
        for worker in self.workers:
            worker.child.send(("reset", None))
        for w, worker in enumerate(self.workers):
            self.obs[self.w_range(w)] = worker.child.recv()
        self.obs = obs_to_torch(self.obs)

        # model for sampling
        self.model = Model(c.channels, c.blocks).to(device)

        # optimizer
        self.scaler = GradScaler()
        self.optimizer = optim.Adam(self.model.parameters(), lr=self.c.lr, weight_decay=self.c.reg_l2)
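
The constructor above leans on two helpers that are not shown in these examples: obs_to_torch and w_range. The names are from the source; the bodies below are only inferred from their call sites and are a sketch, not the original code.

# Sketch only: implementations guessed from how the helpers are called above.
import numpy as np
import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def obs_to_torch(obs: np.ndarray, device=device) -> torch.Tensor:
    # stacked numpy observations -> tensor on the sampling device
    # (float32 is a guess; one of the later examples keeps a uint8 observation buffer)
    return torch.as_tensor(obs, dtype=torch.float32, device=device)

def w_range(self, w: int) -> slice:
    # contiguous block of environment indices owned by worker `w`,
    # assuming env_per_worker environments per worker
    return slice(w * self.c.env_per_worker, (w + 1) * self.c.env_per_worker)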
Example 2
def Main(model_path):
    c = Configs()
    model = Model(c.channels, c.blocks).to(device)
    if model_path is None:
        model_path = os.path.join(os.path.dirname(sys.argv[0]), 'models/model.pth')
    if model_path[-3:] == 'pkl': model.load_state_dict(torch.load(model_path)[0].state_dict())
    else: model.load_state_dict(torch.load(model_path))
    model.eval()

    batch_size = 100
    n = 2000
    games = [tetris.Tetris(GetSeed(i)) for i in range(batch_size)]
    for i in games: ResetGame(i)
    started = batch_size
    results = []
    rewards = [0. for i in range(batch_size)]
    is_running = [True for i in range(batch_size)]
    while len(results) < n:
        states = [i.GetState() for i, j in zip(games, is_running) if j]
        states = obs_to_torch(np.stack(states), device)
        pi = model(states, False)[0]
        pi = torch.argmax(pi, 1)
        j = 0
        for i in range(batch_size):
            if not is_running[i]: continue
            action = pi[j].item()
            j += 1
            r, x, y = action // 200, action // 10 % 20, action % 10
            rewards[i] += games[i].InputPlacement(r, x, y)[1]
            if games[i].IsOver():
                results.append((games[i].GetScore(), games[i].GetLines()))
                rewards[i] = 0.
                if started < n:
                    # start a fresh game with the next unused seed
                    games[i] = tetris.Tetris(GetSeed(started))
                    ResetGame(games[i])
                    started += 1
                else:
                    is_running[i] = False
                if len(results) % 200 == 0: print(len(results), '/', n, 'games finished')
    s = list(reversed(sorted([i[0] for i in results])))
    for i in range(len(s) - 1):
        for t in range(2000000, 700000, -50000):
            if s[i] >= t and s[i+1] < t: print(t, (i + 1) / n)
    s = list(reversed(sorted([i[1] for i in results])))
    for i in range(len(s) - 1):
        for t in range(350, 150, -10):
            if s[i] >= t and s[i+1] < t: print(t, (i + 1) / n)
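
For reference, the decode r, x, y = action // 200, action // 10 % 20, action % 10 above unpacks a flat action index into (rotation, row, column) for a 20-row by 10-column board; the number of rotations is not visible in this snippet. A quick check of the arithmetic:

# action = r * 200 + x * 10 + y, with x a row in [0, 20) and y a column in [0, 10)
action = 457
r, x, y = action // 200, action // 10 % 20, action % 10
assert (r, x, y) == (2, 5, 7)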
Example 3
 model_path = sys.argv[1] if file_given else os.path.join(
     os.path.dirname(sys.argv[0]), 'models/model.pth')
 if model_path[-3:] == 'pkl':
     model.load_state_dict(torch.load(model_path)[0].state_dict())
 else:
     model.load_state_dict(torch.load(model_path))
 model.eval()
 envs = [Game(i + start_seed) for i in range(kEnvs)]
 finished = [False for i in range(kEnvs)]
 score = [0. for i in range(kEnvs)]
 while not all(finished):
     obs = []
     for i in envs:
         obs.append(i.obs)
     with torch.no_grad():
         obs = obs_to_torch(np.stack(obs))
         pi = model(obs)[0]
         act = torch.argmax(pi.probs, 1).cpu().numpy()
         #act = pi.sample().cpu().numpy()
     x, y = act // kW, act % kW
     for i in range(kEnvs):
         if finished[i]: continue
         _, _, over, info = envs[i].step((x[i], y[i]))
         if over:
             score[i] = info['score']
             finished[i] = True
 score = sorted(Counter(score).items())
 for i, j in score:
     print(i, j)
 #score = [(i, j) for j, i in enumerate(score)]
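
The final tally collapses equal results into (value, count) pairs before printing, one line per distinct final score:

from collections import Counter

scores = [3000, 1200, 3000, 500]
for value, count in sorted(Counter(scores).items()):
    print(value, count)
# 500 1
# 1200 1
# 3000 2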
Example 4
    def __init__(self, c: Configs, name: str):
        self.name = name
        self.c = c
        # total number of samples for a single update
        self.envs = self.c.n_workers * self.c.env_per_worker
        self.batch_size = self.envs * self.c.worker_steps
        assert (self.batch_size %
                (self.c.n_update_per_epoch * self.c.mini_batch_size) == 0)
        self.update_batch_size = self.batch_size // self.c.n_update_per_epoch

        # #### Initialize
        self.total_games = 0

        # model for sampling
        self.model = Model(c.channels, c.blocks).to(device)

        # dynamic hyperparams
        self.cur_lr = self.c.lr()
        self.cur_reg_l2 = self.c.reg_l2()
        self.cur_step_reward = 0.
        self.cur_right_gain = 0.
        self.cur_fix_prob = 0.
        self.cur_neg_mul = 0.
        self.cur_entropy_weight = self.c.entropy_weight()
        self.cur_prob_reg_weight = self.c.prob_reg_weight()
        self.cur_target_prob_weight = self.c.target_prob_weight()
        self.cur_gamma = self.c.gamma()
        self.cur_lamda = self.c.lamda()

        # optimizer
        self.scaler = GradScaler()
        self.optimizer = optim.Adam(self.model.parameters(),
                                    lr=self.cur_lr,
                                    weight_decay=self.cur_reg_l2)

        # initialize tensors for observations
        shapes = [(self.envs, *kTensorDim),
                  (self.envs, self.c.worker_steps, 3),
                  (self.envs, self.c.worker_steps)]
        types = [np.dtype('float32'), np.dtype('float32'), np.dtype('bool')]
        self.shms = [
            shared_memory.SharedMemory(create=True,
                                       size=math.prod(shape) * typ.itemsize)
            for shape, typ in zip(shapes, types)
        ]
        self.obs_np, self.rewards, self.done = [
            np.ndarray(shape, dtype=typ, buffer=shm.buf)
            for shm, shape, typ in zip(self.shms, shapes, types)
        ]
        # create workers
        shm = [(shm.name, shape, typ)
               for shm, shape, typ in zip(self.shms, shapes, types)]
        self.workers = [
            Worker(name, shm, self.w_range(i), 27 + i)
            for i in range(self.c.n_workers)
        ]
        self.set_game_param(self.c.right_gain(), self.c.fix_prob(),
                            self.c.neg_mul(), self.c.step_reward())
        for i in self.workers:
            i.child.send(('reset', None))
        for i in self.workers:
            i.child.recv()

        self.obs = obs_to_torch(self.obs_np, device)
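
The Worker class itself is not shown. Based on the (name, shape, dtype) tuples handed to it above, each child process presumably attaches to the same shared-memory buffers roughly like this (a sketch under that assumption, not the original Worker code):

import numpy as np
from multiprocessing import shared_memory

def attach_shared(shm_info):
    # shm_info: list of (name, shape, dtype) tuples as built in __init__ above
    shms, arrays = [], []
    for name, shape, dtype in shm_info:
        shm = shared_memory.SharedMemory(name=name)   # attach, do not create
        shms.append(shm)                              # keep references alive
        arrays.append(np.ndarray(shape, dtype=dtype, buffer=shm.buf))
    obs_np, rewards, done = arrays
    return shms, obs_np, rewards, done

Each worker would then write only into its own w_range slice of obs_np, rewards and done, which is why sample() in the next example can read self.obs_np directly after the recv() barrier.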
Example 5
    def sample(self, train=True) -> Dict[str, torch.Tensor]:
        """### Sample data with current policy"""
        actions = torch.zeros((self.envs, self.c.worker_steps),
                              dtype=torch.int32,
                              device=device)
        obs = torch.zeros((self.envs, self.c.worker_steps, *kTensorDim),
                          dtype=torch.float32,
                          device=device)
        log_pis = torch.zeros((self.envs, self.c.worker_steps),
                              dtype=torch.float32,
                              device=device)
        values = torch.zeros((self.envs, self.c.worker_steps, 3),
                             dtype=torch.float32,
                             device=device)

        # sample `worker_steps` from each worker
        tot_lines = 0
        tot_score = 0
        for t in range(self.c.worker_steps):
            with torch.no_grad():
                # `self.obs` keeps track of the last observation from each worker,
                #  which is the input for the model to sample the next action
                obs[:, t] = self.obs
                # sample actions from $\pi_{\theta_{OLD}}$
                pi, v = self.model(self.obs)
                values[:, t] = v
                a = pi.sample()
                actions[:, t] = a
                log_pis[:, t] = pi.log_prob(a)
                actions_cpu = a.cpu().numpy()

            # run sampled actions on each worker
            # workers will place results in self.obs_np,rewards,done
            for w, worker in enumerate(self.workers):
                worker.child.send(('step', (t, actions_cpu[self.w_range(w)],
                                            tracker.get_global_step())))
            for i in self.workers:
                info_arr = i.child.recv()
                # collect episode info, which is available if an episode finished
                if train:
                    self.total_games += len(info_arr)
                    for info in info_arr:
                        tot_lines += info['lines']
                        tot_score += info['score']
                        tracker.add('reward', info['reward'])
                        tracker.add('scorek', info['score'] * 1e-3)
                        tracker.add('lines', info['lines'])
                        tracker.add('length', info['length'])
            self.obs = obs_to_torch(self.obs_np, device)

        # reshape rewards & log rewards
        reward_max = self.rewards[:, :, 0].max()
        if train:
            tracker.add('maxk', reward_max / 1e-2)
            tracker.add('mil_games', self.total_games * 1e-6)
            tracker.add('perline', tot_score * 1e-3 / tot_lines)

        # calculate advantages
        advantages = self._calc_advantages(self.done, self.rewards, values)
        samples = {
            'obs': obs,
            'actions': actions,
            'values': values,
            'log_pis': log_pis,
            'advantages': advantages
        }
        # samples are currently in [workers, time] table, flatten it
        for i in samples:
            samples[i] = samples[i].view(-1, *samples[i].shape[2:])
        return samples
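
_calc_advantages is not shown in these examples. Given the (envs, worker_steps, 3) reward and value tensors and the cur_gamma / cur_lamda hyperparameters, it presumably computes something like standard GAE(gamma, lambda); the bootstrap value and the exact treatment of `done` below are assumptions, not the original code:

import torch

def _calc_advantages(self, done, rewards, values):
    with torch.no_grad():
        done = torch.as_tensor(done, device=values.device)
        rewards = torch.as_tensor(rewards, dtype=values.dtype, device=values.device)
        advantages = torch.zeros_like(values)
        last_advantage = torch.zeros_like(values[:, 0])
        # bootstrap from the value of the observation after the last sampled step
        _, last_value = self.model(self.obs)
        for t in reversed(range(self.c.worker_steps)):
            mask = (~done[:, t]).to(values.dtype).unsqueeze(-1)
            delta = rewards[:, t] + self.cur_gamma * last_value * mask - values[:, t]
            last_advantage = delta + self.cur_gamma * self.cur_lamda * mask * last_advantage
            advantages[:, t] = last_advantage
            last_value = values[:, t]
        return advantages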
Example 6
def GetTorch(game):
    return obs_to_torch(game.env.GetState(), device).unsqueeze(0)
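
A typical use of this helper is picking a greedy action for a single game state, mirroring the evaluation loops above; whether the policy head returns a distribution (as in Example 3) or raw scores (as in Example 2) depends on how the model is called:

with torch.no_grad():
    pi = model(GetTorch(game))[0]
    act = torch.argmax(pi.probs, 1).item()  # greedy placement for this one state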
Example 7
    def sample(self) -> Dict[str, torch.Tensor]:
        """### Sample data with current policy"""

        rewards = np.zeros((self.envs, self.c.worker_steps), dtype=np.float16)
        done = np.zeros((self.envs, self.c.worker_steps), dtype=bool)
        actions = torch.zeros((self.envs, self.c.worker_steps), dtype=torch.int32, device=device)
        obs = torch.zeros((self.envs, self.c.worker_steps, *kTensorDim), dtype=torch.uint8, device=device)
        log_pis = torch.zeros((self.envs, self.c.worker_steps), dtype=torch.float16, device=device)
        values = torch.zeros((self.envs, self.c.worker_steps), dtype=torch.float16, device=device)

        # sample `worker_steps` from each worker
        for t in range(self.c.worker_steps):
            with torch.no_grad():
                # `self.obs` keeps track of the last observation from each worker,
                #  which is the input for the model to sample the next action
                obs[:, t] = self.obs
                # sample actions from $\pi_{\theta_{OLD}}$ for each environment;
                #  this returns arrays of size `envs`
                pi, v = self.model(self.obs)
                values[:, t] = v
                a = pi.sample()
                actions[:, t] = a
                log_pis[:, t] = pi.log_prob(a)

            # run sampled actions on each worker
            for w, worker in enumerate(self.workers):
                worker.child.send(("step", actions[self.w_range(w),t].cpu().numpy()))

            self.obs = np.zeros((self.envs, *kTensorDim))
            for w, worker in enumerate(self.workers):
                # get results after executing the actions
                now = self.w_range(w)
                self.obs[now], rewards[now,t], done[now,t], info_arr = worker.child.recv()

                # collect episode info, which is available if an episode finished;
                #  this includes total reward and length of the episode -
                #  look at `Game` to see how it works.
                # We also add a game frame to it for monitoring.
                for info in info_arr:
                    if not info: continue
                    self.score_queue.add(info['score'])
                    tracker.add('reward', info['reward'])
                    tracker.add('score', info['score'])
                    tracker.add('score_per01', self.score_queue.get_ratio(0.01))
                    tracker.add('score_per10', self.score_queue.get_ratio(0.1))
                    tracker.add('score_per50', self.score_queue.get_ratio(0.5))
                    tracker.add('score_per90', self.score_queue.get_ratio(0.9))
                    tracker.add('score_per99', self.score_queue.get_ratio(0.99))
                    tracker.add('length', info['length'])
            self.obs = obs_to_torch(self.obs)

        # calculate advantages
        advantages = self._calc_advantages(done, rewards, values)
        samples = {
            'obs': obs,
            'actions': actions,
            'values': values,
            'log_pis': log_pis,
            'advantages': advantages
        }
        # samples are currently in [workers, time] table,
        #  we should flatten it
        for i in samples:
            samples[i] = samples[i].view(-1, *samples[i].shape[2:])
        return samples
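
SortedQueue, used in Examples 1 and 7 to track score percentiles, is not defined in these examples. From its usage (SortedQueue(400), add(score), get_ratio(p)) a minimal sketch might look like the following; whether get_ratio counts from the low or the high end of the window is a guess:

import bisect
from collections import deque

class SortedQueue:
    def __init__(self, size: int):
        self.size = size
        self.queue = deque()   # insertion order, for evicting the oldest score
        self.sorted = []       # same scores, kept sorted for quantile lookups

    def add(self, x):
        if len(self.queue) == self.size:
            old = self.queue.popleft()
            self.sorted.pop(bisect.bisect_left(self.sorted, old))
        self.queue.append(x)
        bisect.insort(self.sorted, x)

    def get_ratio(self, p: float):
        # score at quantile p of the retained window
        if not self.sorted:
            return 0.
        idx = min(int(p * len(self.sorted)), len(self.sorted) - 1)
        return self.sorted[idx]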