def replay_train(mainDQN: dqn.DQN, targetDQN: dqn.DQN,
                 train_batch: list) -> float:
    states = np.vstack([x[0] for x in train_batch])
    actions = np.array([x[1] for x in train_batch])
    rewards = np.array([x[2] for x in train_batch])
    next_states = np.vstack([x[3] for x in train_batch])
    done = np.array([x[4] for x in train_batch])

    X = states
    Q_target = rewards + DISCOUNT_RATE * np.max(targetDQN.predict(next_states),
                                                axis=1) * ~done

    y = mainDQN.predict(states)
    y[np.arange(len(X)), actions] = Q_target

    # Train our network using target and predicted Q values on each episode
    return mainDQN.update(X, y)
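
For context, a minimal sketch of how replay_train is usually driven, assuming a deque-based replay buffer and a BATCH_SIZE constant (neither is defined in the snippet above); each buffered transition is a (state, action, reward, next_state, done) tuple, which is the layout the vstack/indexing above expects:

    import random
    from collections import deque

    replay_buffer = deque(maxlen=50000)   # holds (state, action, reward, next_state, done)
    BATCH_SIZE = 64

    # ... inside the training loop, after appending the latest transition ...
    if len(replay_buffer) > BATCH_SIZE:
        minibatch = random.sample(replay_buffer, BATCH_SIZE)
        loss = replay_train(mainDQN, targetDQN, minibatch)

The target above is the usual Bellman backup: y = r + DISCOUNT_RATE * max_a' Q_target(s', a') for non-terminal transitions, while ~done zeroes the bootstrap term when the episode has ended.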
Example 2
def __init__(self, conf):
    self.env = Environment(name=conf.env,
                           width=conf.width,
                           height=conf.height,
                           history=conf.history)
    self.hist = History(self.env)
    self.mem = ReplayMemory(self.env,
                            capacity=conf.mem_capacity,
                            batch_size=conf.batch_size)
    self._capa = conf.mem_capacity
    self._ep_en = conf.ep_end
    self._ep_st = conf.ep_start
    self._learn_st = conf.learn_start
    self._tr_freq = conf.train_freq
    self._update_freq = conf.update_freq
    self.q = DQN(self.hist._history, self.env.action_size).type(dtype)
    self.target_q = DQN(self.hist._history,
                        self.env.action_size).type(dtype)
    self.optim = torch.optim.RMSprop(self.q.parameters(),
                                     lr=0.00025,
                                     alpha=0.95,
                                     eps=0.01)
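
The constructor only reads attributes off conf, so any simple namespace with the right fields will do. A rough sketch; the field names come from the code above, while the values are illustrative (loosely following common Atari DQN settings) rather than the project's actual defaults:

    from types import SimpleNamespace

    conf = SimpleNamespace(
        env='BreakoutDeterministic-v4',   # hypothetical environment name
        width=84, height=84, history=4,   # frame size and number of stacked frames
        mem_capacity=1000000, batch_size=32,
        ep_start=1.0, ep_end=0.1,         # epsilon is annealed from ep_start to ep_end
        learn_start=50000,                # steps collected before learning begins
        train_freq=4, update_freq=10000)  # gradient-step / target-sync frequencies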
Example 3
if args.random_seed:
    random.seed(args.random_seed)

# instantiate classes
if args.environment == 'ale':
    env = ALEEnvironment(args.game, args)
    logger.info("Using ALE Environment")
elif args.environment == 'gym':
    logger.handlers.pop()
    env = GymEnvironment(args.game, args)
    logger.info("Using Gym Environment")
else:
    assert False, "Unknown environment: " + args.environment

mem = ReplayMemory(args.replay_size, args)
net = DQN(env.numActions(), args)
agent = Agent(env, mem, net, args)
stats = Statistics(agent, net, mem, env, args)

if args.load_weights:
    logger.info("Loading weights from %s" % args.load_weights)
    net.load_weights(args.load_weights)

if args.play_games:
    logger.info("Playing for %d game(s)" % args.play_games)
    stats.reset()
    agent.play(args.play_games)
    stats.write(0, "play")
    if args.visualization_file:
        from visualization import visualize
        # use states recorded during gameplay. NB! Check that the buffer size can accommodate one game!
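
The launcher above is configured entirely through argparse. A rough sketch of the flags it visibly consumes; the types and defaults here are assumptions, and the real parser in the source project defines many more options that are forwarded through args:

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('game', help='ROM file or Gym environment id')
    parser.add_argument('--environment', choices=['ale', 'gym'], default='ale')
    parser.add_argument('--random_seed', type=int)
    parser.add_argument('--replay_size', type=int, default=1000000)
    parser.add_argument('--load_weights', help='path to saved network weights')
    parser.add_argument('--play_games', type=int, default=0)
    parser.add_argument('--visualization_file', help='where to write the gameplay visualization')
    args = parser.parse_args()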
Example 4
class Agent(object):
    def __init__(self, conf):
        self.env = Environment(name=conf.env,
                               width=conf.width,
                               height=conf.height,
                               history=conf.history)
        self.hist = History(self.env)
        self.mem = ReplayMemory(self.env,
                                capacity=conf.mem_capacity,
                                batch_size=conf.batch_size)
        self._capa = conf.mem_capacity
        self._ep_en = conf.ep_end
        self._ep_st = conf.ep_start
        self._learn_st = conf.learn_start
        self._tr_freq = conf.train_freq
        self._update_freq = conf.update_freq
        self.q = DQN(self.hist._history, self.env.action_size).type(dtype)
        self.target_q = DQN(self.hist._history,
                            self.env.action_size).type(dtype)
        self.optim = torch.optim.RMSprop(self.q.parameters(),
                                         lr=0.00025,
                                         alpha=0.95,
                                         eps=0.01)

    def train(self):
        screen, reward, action, terminal = self.env.new_random_game()
        for _ in range(self.env._history):
            self.hist.add(screen)
        num_game, self.update_count, ep_reward = 0, 0, 0.
        total_reward, self.total_loss, self.total_q = 0., 0., 0.
        ep_rewards, actions = [], []
        #for self.step in xrange(50000000):
        for self.step in tqdm(range(0, 50000000), ncols=70, initial=0):
            if self.step == self._learn_st:
                num_game, self.update_count, ep_reward = 0, 0, 0.
                total_reward, self.total_loss, self.total_q = 0., 0., 0.
                ep_rewards, actions = [], []

            action = self._select_action()
            screen, reward, terminal = self.env.act(action)
            self.observe(screen, reward, action, terminal)
            if terminal:
                screen, reward, action, terminal = self.env.new_random_game()
                num_game += 1
                ep_rewards.append(ep_reward)
                ep_reward = 0.
            else:
                ep_reward += reward
            actions.append(action)
            total_reward += reward
            if self.step >= self._learn_st:
                if self.step % 10000 == 10000 - 1:
                    avg_reward = total_reward / 10000.
                    avg_loss = self.total_loss / self.update_count
                    avg_q = self.total_q / self.update_count
                    print '# games: {}, reward: {}, loss: {}, q: {}'.format(
                        num_game, avg_reward, avg_loss, avg_q)
                    num_game = 0
                    total_reward = 0.
                    self.total_loss = 0.
                    self.total_q = 0.
                    self.update_count = 0
                    ep_reward = 0.
                    ep_rewards = []
                    actions = []

    def observe(self, screen, reward, action, terminal):
        reward = max(-1., min(1., reward))
        self.hist.add(screen)
        self.mem.add(screen, reward, action, terminal)
        if self.step > self._learn_st:
            if self.step % self._tr_freq == 0:
                self._q_learning()
                #print '{} q-learning'.format(self.step)
            if self.step % self._update_freq == self._update_freq - 1:
                self.target_q.load_state_dict(self.q.state_dict())
                if self.step % (self._update_freq * 10) == (self._update_freq *
                                                            10) - 1:
                    torch.save(self.target_q,
                               'models1/model_{}'.format(self.step))
                #print 'update'

    def play(self, model_path, num_ep=200):
        self.q = torch.load(model_path)
        best_reward = 0
        best_screen_hist = []
        for ep in range(num_ep):
            print '# episode: {}'.format(ep)
            screen, reward, action, terminal = self.env.new_random_game(
                force=True)
            current_reward = 0
            current_screen_hist = []
            act_hist = []
            current_screen_hist.append(self.env.screen)
            for _ in range(self.env._history):
                self.hist.add(screen)
            cnt = 0
            while not terminal:
                cnt += 1
                action = self._select_action(test_mode=True)
                act_hist.append(action)
                if cnt > 200:  # heuristic to escape loops where the agent keeps repeating one action
                    if np.array(act_hist[-100:]).mean() == act_hist[-1]:
                        action = random.randrange(self.env.action_size)

                screen, reward, terminal = self.env.act(action, is_train=False)
                self.hist.add(screen)
                current_reward += reward
                #print cnt, action, current_reward, terminal, self.env.lives
                current_screen_hist.append(self.env.screen)
            print current_reward
            print 'count: {}'.format(cnt)
            if current_reward > best_reward:
                best_reward = current_reward
                best_screen_hist = current_screen_hist
        import imageio
        print 'best reward: {}'.format(best_reward)
        imageio.mimsave('movies_play/best_{}.gif'.format(best_reward),
                        best_screen_hist,
                        'GIF',
                        duration=0.0001)

    def _q_learning(self):
        sc_t, actions, rewards, sc_t_1, terminals = self.mem.sample()
        batch_obs_t = self._to_tensor(sc_t)
        batch_obs_t_1 = self._to_tensor(sc_t_1, volatile=True)
        batch_rewards = self._to_tensor(rewards).unsqueeze(1)
        batch_actions = self._to_tensor(
            actions, data_type=torch.cuda.LongTensor).unsqueeze(1)
        batch_terminals = self._to_tensor(1. - terminals).unsqueeze(1)
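        # The update below uses the standard DQN target
        #   y = r + 0.99 * max_a' Q_target(s_{t+1}, a'),
        # where batch_terminals = 1 - done zeroes the bootstrap term for terminal transitions.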

        q_dash = self.q(batch_obs_t)
        #print 'shape_q: {}'.format(q_dash.shape)

        q_values = self.q(batch_obs_t).gather(1, batch_actions)
        next_max_q_values = self.target_q(batch_obs_t_1).max(1)[0].unsqueeze(1)
        next_q_values = batch_terminals * next_max_q_values
        target_q_values = batch_rewards + (0.99 * next_q_values)
        target_q_values.volatile = False

        cri = torch.nn.SmoothL1Loss()

        self.loss = cri(q_values, target_q_values)
        self.optim.zero_grad()
        self.loss.backward()
        self.optim.step()
        self.update_count += 1
        self.total_q += q_values.data.mean()
        self.total_loss += self.loss.data.mean()

    def _select_action(self, test_mode=False):
        # epsilon greedy policy
        if not test_mode:
            ep = self._ep_en + max(
                0., (self._ep_st - self._ep_en) *
                (self._capa - max(0., self.step - self._learn_st)) /
                self._capa)
        else:
            ep = -1.
        if random.random() < ep:
            action = random.randrange(self.env.action_size)
        else:
            inputs = self._to_tensor(self.hist.get)
            pred = self.q(inputs.unsqueeze(0))
            action = pred.data.max(1)[1][0]  # actual = pred.data.max(1)[1][0][0]
        return action

    def _to_tensor(self, ndarray, volatile=False, data_type=dtype):
        return Variable(torch.from_numpy(ndarray),
                        volatile=volatile).type(data_type)
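
A hedged usage sketch for this Agent, assuming a conf namespace like the one sketched after the __init__ snippet in Example 2; the checkpoint path follows the 'models1/model_<step>' pattern used by observe() when saving, but the step number here is made up:

    agent = Agent(conf)
    agent.train()                                   # runs the 50M-step training loop (learning starts after learn_start steps)
    agent.play('models1/model_999999', num_ep=10)   # roll out a saved model and save the best episode as a GIF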
Example 5
GAMMA = 0.999
EPS_START = 0.9
EPS_END = 0.05
EPS_DECAY = 200
TARGET_UPDATE = 10
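
# For reference: with the constants above, the epsilon-greedy threshold computed
# in select_action() below decays as
#   eps_threshold = EPS_END + (EPS_START - EPS_END) * exp(-steps_done / EPS_DECAY)
# i.e. roughly 0.90 at steps_done=0, 0.36 at steps_done=200, and 0.06 at steps_done=1000.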

# Get screen size so that we can initialize layers correctly based on shape
# returned from AI gym. Typical dimensions at this point are close to 3x40x90
# which is the result of a clamped and down-scaled render buffer in get_screen()
init_screen = get_screen()
_, _, screen_height, screen_width = init_screen.shape

# Get number of actions from gym action space
n_actions = env.action_space.n

policy_net = DQN(screen_height, screen_width, n_actions).to(device)
target_net = DQN(screen_height, screen_width, n_actions).to(device)
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()

optimizer = optim.RMSprop(policy_net.parameters())
memory = ReplayMemory(10000)

steps_done = 0


def select_action(state):
    global steps_done
    sample = random.random()
    eps_threshold = EPS_END + (EPS_START - EPS_END) * \
        math.exp(-1. * steps_done / EPS_DECAY)