Example #1
def main(_config):
    env = gym.make(_config.ENV_NAME)

    agent = DQN(env, _config)

    print("[*] --- Begin Emulator Training ---")

    for episode in range(_config.EPISODE):

        obs = env.reset()

        # === Emulator ===
        for i in range(_config.STEP):
            action = agent.pick_action(obs)
            obs_next, reward, done, _ = env.step(action)

            # the agent stores the newest experience in the replay buffer and trains off-policy on mini-batches
            agent.perceive(obs, action, reward, obs_next, done)

            if done:
                break

            obs = obs_next

        # == train ==
        agent.train(episode)

        if (episode + 1) % agent.save_every == 0:
            agent.save(step=episode)

        # == test ==
        print("\n[*] === Enter TEST module ===")
        test(env, _config.STEP, agent)

    agent.record()
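
Example #1 calls a test() helper that is not defined in the snippet. A minimal sketch of such a helper, assuming the agent exposes a greedy variant of pick_action; the explore flag and the episode count below are assumptions, not part of the original code.

def test(env, step_limit, agent, episodes=5):
    # Minimal sketch of the test() helper used in Example #1; the explore=False
    # flag is an assumed way to disable exploration and is not from the source.
    total_reward = 0.0
    for _ in range(episodes):
        obs = env.reset()
        for _ in range(step_limit):
            action = agent.pick_action(obs, explore=False)
            obs, reward, done, _ = env.step(action)
            total_reward += reward
            if done:
                break
    print("[*] average test reward: %.2f" % (total_reward / episodes))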
Example #2
def train():
    print("뇌세포 깨우는 중..")
    sess = tf.Session()

    game = Game(SCREEN_WIDTH, SCREEN_HEIGHT, show_game=False)
    brain = DQN(sess, SCREEN_WIDTH, SCREEN_HEIGHT, NUM_ACTION)
    rewards = tf.placeholder(tf.float32, [None])
    tf.summary.scalar('avg.reward/ep.', tf.reduce_mean(rewards))
    saver = tf.train.Saver()
    sess.run(tf.global_variables_initializer())
    writer = tf.summary.FileWriter('logs', sess.graph)
    summary_merged = tf.summary.merge_all()

    brain.update_target_network()
    epsilon = 1.0

    time_step = 0
    total_reward_list = []

    for episode in range(MAX_EPISODE):
        terminal = False
        total_reward = 0

        state = game.reset()
        brain.init_state(state)

        while not terminal:
            if np.random.rand() < epsilon:
                action = random.randrange(NUM_ACTION)
            else:
                action = brain.get_action()

            if episode > OBSERVE:
                epsilon -= 1 / 1000

            state, reward, terminal = game.step(action)
            total_reward += reward

            brain.remember(state, action, reward, terminal)

            if time_step > OBSERVE and time_step % TRAIN_INTERVAL == 0:
                brain.train()

            if time_step % TARGET_UPDATE_INTERVAL == 0:
                brain.update_target_network()

            time_step += 1

        print('Episode: %d, Score: %d' % (episode + 1, total_reward))

        total_reward_list.append(total_reward)

        if episode % 10 == 0:
            summary = sess.run(summary_merged,
                               feed_dict={rewards: total_reward_list})
            writer.add_summary(summary, time_step)
            total_reward_list = []

        if episode % 100 == 0:
            saver.save(sess, 'model/dqn.ckpt', global_step=time_step)
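
Examples #2 and #7 (and Example #3 with camelCase names) rely on module-level constants defined elsewhere in their repositories. A representative set follows; the values are illustrative assumptions, not the originals.

# Illustrative hyperparameters assumed by the training loops above; the exact
# values used in the original repositories may differ.
MAX_EPISODE = 10000                   # number of training episodes
SCREEN_WIDTH, SCREEN_HEIGHT = 6, 10   # game screen fed to the DQN
NUM_ACTION = 3                        # e.g. move left, stay, move right
OBSERVE = 100                         # warm-up episodes before epsilon decay and training
TRAIN_INTERVAL = 4                    # train every N environment steps
TARGET_UPDATE_INTERVAL = 1000         # sync the target network every N steps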
Example #3
def train():
    print('Waking up brain cells..')
    sess = tf.Session()

    game = Game(screenWidth, screenHeight, show_game=False)
    brain = DQN(sess, screenWidth, screenHeight, numAction)

    rewards = tf.placeholder(tf.float32, [None])
    tf.summary.scalar('avg.reward/ep.', tf.reduce_mean(rewards))

    saver = tf.train.Saver()
    sess.run(tf.global_variables_initializer())

    writer = tf.summary.FileWriter('logs', sess.graph)
    summaryMerged = tf.summary.merge_all()

    brain.updateTargetNetwork()

    timeStep = 0
    totalRewardList = []

    epsilon = 1.0

    for episode in range(maxEpisode):
        terminal = False
        totalReward = 0

        state = game.reset()
        brain.initState(state)

        while not terminal:
            if np.random.rand() < epsilon:
                action = random.randrange(numAction)

            else:
                action = brain.getAction()
            if episode > observe:
                epsilon -= 1 / 1000

            state, reward, terminal = game.step(action)
            totalReward += reward
            brain.remember(state, action, reward, terminal)

            if timeStep > observe and timeStep % trainInterval == 0:
                brain.train()
            if timeStep % targetUpdateInterval == 0:
                brain.updateTargetNetwork()

            timeStep += 1

        totalRewardList.append(totalReward)

        if episode % 10 == 0:
            summary = sess.run(summaryMerged,
                               feed_dict={rewards: totalRewardList})
            writer.add_summary(summary, timeStep)
        if episode % 100 == 99:
            print("게임횟수 : {0}, 점수 : {1:.4f}".format(episode + 1, totalReward))
            saver.save(sess, './model/dqn.ckpt', global_step=timeStep)
Example #4
def train():
    print('Waking up brain cells..')
    sess = tf.Session()

    game = Game(SCREEN_WIDTH, SCREEN_HEIGHT)
    brain = DQN(sess, SCREEN_WIDTH, SCREEN_HEIGHT, NUM_ACTION)

    rewards = tf.placeholder(tf.float32, [None])
    tf.summary.scalar('avg.reward/ep.', tf.reduce_mean(rewards))

    sess.run(tf.global_variables_initializer())

    # Initialize the target network.
    brain.update_target_network()

    time_step = 0
    epsilon = 1.0

    for episode in range(MAX_EPISODE):
        # Start a new game.
        terminal = False

        # Reset the game and get the current state.
        # The state is the screen layout, of size screen_width x screen_height.
        _, state, _, _ = game.first_step()
        brain.init_state(state)

        while not terminal:
            # Get the game's recorded transition.
            action, state, reward, terminal = game.step()

            # Store the current state in the brain.
            # The remembered states are used for training and for deciding the action to take in the next state.
            brain.remember(state, action, reward, terminal)
            if (time_step > OBSERVE) and (time_step % TRAIN_INTERVAL) == 0:
                brain.train()
            # Update the target network.
            # if (time_step % TARGET_UPDATE_INTERVAL) == 0:
            #     brain.update_target_network()
            time_step += 1
        # if episode % 50 == 0:
        print(episode)
    save_model(sess)
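
Example #4 ends with a save_model() call that is not defined in the snippet. A minimal sketch using tf.train.Saver; the checkpoint path is an assumption.

def save_model(sess, path='model/dqn.ckpt'):
    # Minimal sketch of the save_model() helper used above; the checkpoint
    # path is an assumption, not taken from the source.
    saver = tf.train.Saver()
    saver.save(sess, path)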
Example #5
def train():
    with tf.Session() as sess:
        tf.set_random_seed(GLOBAL_SEED)
        brain = DQN(sess, observation_size, action_size)
        rewards = tf.placeholder(tf.float32, [None])
        tf.summary.scalar('avg.reward/ep.', tf.reduce_mean(rewards))
        saver = tf.train.Saver()
        sess.run(tf.global_variables_initializer())
        writer = tf.summary.FileWriter('logs', sess.graph)
        summary_merged = tf.summary.merge_all()
        brain.update_target_network()
        time_step = 0
        total_reward_list = []

        for episode in range(MAX_EPISODE):
            done = False
            total_reward = 0
            epsilon = 1. / ((episode / 10) + 1)

            observation = env.reset()
            brain.init_state(observation)

            while not done:
                if np.random.rand() < epsilon:
                    action = random.randrange(action_size)
                else:
                    action = brain.get_action()

                observation, reward, done, _ = env.step(action)
                # print(observation, reward, done)
                total_reward += reward
                brain.remember(observation, action, reward, done)

                if time_step > 0:
                    if time_step % TRAIN_INTERVAL_FRAMES == 0:
                        _, loss = brain.train()
                    if time_step % TARGET_UPDATE_INTERVAL == 0:
                        brain.update_target_network()

                time_step += 1

            print('episode: %d total_reward: %d' % (episode, total_reward))

            total_reward_list.append(total_reward)

            if episode % 10 == 0:
                summary = sess.run(summary_merged,
                                   feed_dict={rewards: total_reward_list})
                writer.add_summary(summary, time_step)
                total_reward_list = []

            if episode % 100 == 0:
                saver.save(sess, 'model/dqn.ckpt', global_step=time_step)
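
Example #5 anneals epsilon with a harmonic schedule rather than the fixed per-step decrement used in Examples #2, #3 and #7. A few sample values (plain arithmetic, not from the source) show how quickly exploration decays:

# epsilon = 1. / ((episode / 10) + 1)
# episode   0 -> 1.00
# episode  10 -> 0.50
# episode  30 -> 0.25
# episode  90 -> 0.10
# episode 490 -> 0.02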
Example #6
class Agent():
  def __init__(self, args, env):
    self.action_space = env.action_space()
    self.atoms = args.atoms
    self.Vmin = args.V_min
    self.Vmax = args.V_max
    self.support = torch.linspace(args.V_min, args.V_max, args.atoms)  # Support (range) of z
    self.delta_z = (args.V_max - args.V_min) / (args.atoms - 1)
    self.batch_size = args.batch_size
    self.n = args.multi_step
    self.discount = args.discount
    self.priority_exponent = args.priority_exponent
    self.max_gradient_norm = args.max_gradient_norm

    self.policy_net = DQN(args, self.action_space)
    if args.model and os.path.isfile(args.model):
      self.policy_net.load_state_dict(torch.load(args.model))
    self.policy_net.train()

    self.target_net = DQN(args, self.action_space)
    self.update_target_net()
    self.target_net.eval()

    self.optimiser = optim.Adam(self.policy_net.parameters(), lr=args.lr, eps=args.adam_eps)
    if args.cuda:
      self.policy_net.cuda()
      self.target_net.cuda()
      self.support = self.support.cuda()

  # Resets noisy weights in all linear layers (of policy and target nets)
  def reset_noise(self):
    self.policy_net.reset_noise()
    self.target_net.reset_noise()

  # Acts based on single state (no batch)
  def act(self, state):
    return (self.policy_net(state.unsqueeze(0)).data * self.support).sum(2).max(1)[1][0]

  def learn(self, mem):
    idxs, states, actions, returns, next_states, nonterminals, weights = mem.sample(self.batch_size)
    batch_size = len(idxs)  # May return less than specified if invalid transitions sampled

    # Calculate current state probabilities
    ps = self.policy_net(states)  # Probabilities p(s_t, ·; θpolicy)
    ps_a = ps[range(batch_size), actions]  # p(s_t, a_t; θpolicy)

    # Calculate nth next state probabilities
    pns = self.policy_net(next_states).data  # Probabilities p(s_t+n, ·; θpolicy)
    dns = self.support.expand_as(pns) * pns  # Distribution d_t+n = (z, p(s_t+n, ·; θpolicy))
    argmax_indices_ns = dns.sum(2).max(1)[1]  # Perform argmax action selection using policy network: argmax_a[(z, p(s_t+n, a; θpolicy))]
    pns = self.target_net(next_states).data  # Probabilities p(s_t+n, ·; θtarget)
    pns_a = pns[range(batch_size), argmax_indices_ns]  # Double-Q probabilities p(s_t+n, argmax_a[(z, p(s_t+n, a; θpolicy))]; θtarget)
    pns_a *= nonterminals  # Set p = 0 for terminal nth next states as all possible expected returns = expected reward at final transition

    # Compute Tz (Bellman operator T applied to z)
    Tz = returns.unsqueeze(1) + nonterminals * (self.discount ** self.n) * self.support.unsqueeze(0)  # Tz = R^n + (γ^n)z (accounting for terminal states)
    Tz = Tz.clamp(min=self.Vmin, max=self.Vmax)  # Clamp between supported values
    # Compute L2 projection of Tz onto fixed support z
    b = (Tz - self.Vmin) / self.delta_z  # b = (Tz - Vmin) / Δz
    l, u = b.floor().long(), b.ceil().long()

    # Distribute probability of Tz
    m = states.data.new(batch_size, self.atoms).zero_()
    offset = torch.linspace(0, ((batch_size - 1) * self.atoms), batch_size).long().unsqueeze(1).expand(batch_size, self.atoms).type_as(actions)
    m.view(-1).index_add_(0, (l + offset).view(-1), (pns_a * (u.float() - b)).view(-1))  # m_l = m_l + p(s_t+n, a*)(u - b)
    m.view(-1).index_add_(0, (u + offset).view(-1), (pns_a * (b - l.float())).view(-1))  # m_u = m_u + p(s_t+n, a*)(b - l)

    loss = -torch.sum(Variable(m) * ps_a.log(), 1)  # Cross-entropy loss (minimises Kullback-Leibler divergence)
    self.policy_net.zero_grad()
    (weights * loss).mean().backward()  # Importance weight losses
    nn.utils.clip_grad_norm(self.policy_net.parameters(), self.max_gradient_norm)  # Clip gradients (normalising by max value of gradient L2 norm)
    self.optimiser.step()

    mem.update_priorities(idxs, loss.data.abs().pow(self.priority_exponent))  # Update priorities of sampled transitions

  def update_target_net(self):
    self.target_net.load_state_dict(self.policy_net.state_dict())

  def save(self, path):
    torch.save(self.policy_net.state_dict(), os.path.join(path, 'model.pth'))

  # Evaluates Q-value based on single state (no batch)
  def evaluate_q(self, state):
    return (self.policy_net(state.unsqueeze(0)).data * self.support).sum(2).max(1)[0][0]

  def train(self):
    self.policy_net.train()

  def eval(self):
    self.policy_net.eval()
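
The core of learn() in Example #6 is the categorical (C51) projection of the Bellman-updated distribution Tz = R^n + (gamma^n)z onto the fixed support. A self-contained sketch of just that step, written against current PyTorch; the function name and signature are hypothetical.

import torch

def project_distribution(next_probs, returns, nonterminals, support,
                         Vmin, Vmax, discount, n):
    # next_probs: (B, atoms) probabilities of the selected next-state action,
    # returns: (B,) n-step returns, nonterminals: (B, 1) 0/1 mask, support: (atoms,).
    batch_size, atoms = next_probs.shape
    delta_z = (Vmax - Vmin) / (atoms - 1)
    Tz = returns.unsqueeze(1) + nonterminals * (discount ** n) * support.unsqueeze(0)
    Tz = Tz.clamp(min=Vmin, max=Vmax)
    b = (Tz - Vmin) / delta_z                 # fractional index of Tz on the support
    l, u = b.floor().long(), b.ceil().long()
    m = next_probs.new_zeros(batch_size, atoms)
    offset = (torch.arange(batch_size) * atoms).unsqueeze(1).to(l)
    # Split each atom's mass between its two neighbouring support indices.
    m.view(-1).index_add_(0, (l + offset).view(-1), (next_probs * (u.float() - b)).view(-1))
    m.view(-1).index_add_(0, (u + offset).view(-1), (next_probs * (b - l.float())).view(-1))
    # Note: like Example #6, this does not handle the l == u edge case that
    # Examples #11 and #12 fix explicitly.
    return m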
Example #7
def train():
	print('wake up the brain...')
	sess = tf.Session()

	game = Game(SCREEN_WIDTH, SCREEN_HEIGHT, show_game=False)
	brain = DQN(sess, SCREEN_WIDTH, SCREEN_HEIGHT, NUM_ACTION)

	rewards = tf.placeholder(tf.float32, [None])
	tf.summary.scalar('avg.reward/ep.', tf.reduce_mean(rewards))

	saver = tf.train.Saver()
	sess.run(tf.global_variables_initializer())

	writer = tf.summary.FileWriter('logs', sess.graph)
	summary_merged = tf.summary.merge_all()

	brain.update_target_network()

	epsilon = 1.0
	time_step = 0
	total_reward_list = []

	for episode in range(MAX_EPISODE):
		terminal = False
		total_reward = 0

		state = game.reset()
		brain.init_state(state)

		while not terminal:
			if np.random.rand() < epsilon:
				action = random.randrange(NUM_ACTION)
			else:
				action = brain.get_action()

			if episode > OBSERVE:
				epsilon -= 1 / 1000.

			state, reward, terminal = game.step(action)
			total_reward += reward

			brain.remember(state, action, reward, terminal)

			if time_step > OBSERVE and time_step % TRAIN_INTERVAL == 0:
				brain.train()

			if time_step % TARGET_UPDATE_INTERVAL == 0:
				brain.update_target_network()

			time_step += 1

		print('episode: %d, score: %d' % (episode + 1, total_reward))

		total_reward_list.append(total_reward)

		if episode % 10 == 0:
			summary = sess.run(summary_merged, feed_dict={rewards: total_reward_list})
			writer.add_summary(summary, time_step)
			total_reward_list = []

		if episode % 100 == 0:
			saver.save(sess, 'model/dqn.ckpt', global_step=time_step)
Example #8
class DQNAgent:
    """
    Interacts with and learns from the environment.
    Vanilla DQN.
    """
    def __init__(self, state_size: int, action_size: int, seed: int):
        """
        Initialize an Agent object.

        :param state_size: dimension of each state;
        :param action_size: dimension of each action;
        :param seed: random seed.
        """

        self.state_size = state_size
        self.action_size = action_size
        random.seed(seed)

        # Q-Network
        self.network_local = DQN(state_size, action_size, seed).to(DEVICE)
        self.network_target = DQN(state_size, action_size, seed).to(DEVICE)
        self.optimizer = optim.Adam(self.network_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(BUFFER_SIZE, BATCH_SIZE, seed)

        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action: int, reward: float, next_state, done):
        """
        Save experiences in the replay memory and check if it's time to learn.

        :param state: (array_like) current state;
        :param action: action taken;
        :param reward: reward received;
        :param next_state: (array_like) next state;
        :param done: terminal state indicator; int or bool.
        """

        # Save experience in replay memory
        self.memory.push(state, action, reward, next_state, done)

        # Increment time step and compare it to the network update frequency
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # Check if there is enough samples in the memory to learn
            if len(self.memory) > BATCH_SIZE:
                # sample experiences from memory
                experiences = self.memory.sample()
                # learn from sampled experiences
                self.learn(experiences, GAMMA)

    def act(self, state, eps: float = 0.):
        """
        Returns actions for given state as per current policy.

        :param state: (array_like) current state
        :param eps: epsilon, for epsilon-greedy action selection
        """

        state = torch.from_numpy(state).float().unsqueeze(0).to(DEVICE)
        self.network_local.eval()
        with torch.no_grad():
            action_values = self.network_local(state)
        self.network_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma: float):
        """
        Update value parameters using given batch of experience tuples.

        :param experiences: (Tuple[torch.Tensor]) tuple of (s, a, r, s', done) tuples;
        :param gamma: discount factor.
        """

        states, actions, rewards, next_states, dones = experiences

        # Get max predicted Q values (for next states) from target model
        Q_targets_next = self.network_target(next_states).detach().max(
            1)[0].unsqueeze(1)
        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.network_local(states).gather(1, actions)

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.network_local, self.network_target, TAU)

    @staticmethod
    def soft_update(local_model, target_model, tau: float):
        """
        Soft update model parameters,
        θ_target = τ*θ_local + (1 - τ)*θ_target.

        :param local_model: (PyTorch model) weights will be copied from;
        :param target_model: (PyTorch model) weights will be copied to;
        :param tau: interpolation parameter.
        """

        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
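
The ReplayBuffer used by DQNAgent in Example #8 is not shown. A minimal sketch that matches the push/sample/__len__ calls above; the DEVICE constant mirrors the one referenced by the agent, and the stacking details are assumptions.

import random
from collections import deque, namedtuple

import numpy as np
import torch

DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
Experience = namedtuple('Experience', ['state', 'action', 'reward', 'next_state', 'done'])

class ReplayBuffer:
    """Minimal sketch of the fixed-size replay buffer assumed by DQNAgent."""
    def __init__(self, buffer_size, batch_size, seed):
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        random.seed(seed)

    def push(self, state, action, reward, next_state, done):
        self.memory.append(Experience(state, action, reward, next_state, done))

    def sample(self):
        batch = random.sample(self.memory, self.batch_size)

        def to_tensor(values, dtype):
            return torch.as_tensor(np.vstack(values), dtype=dtype, device=DEVICE)

        states = to_tensor([e.state for e in batch], torch.float32)
        actions = to_tensor([e.action for e in batch], torch.int64)
        rewards = to_tensor([e.reward for e in batch], torch.float32)
        next_states = to_tensor([e.next_state for e in batch], torch.float32)
        dones = to_tensor([e.done for e in batch], torch.float32)
        return states, actions, rewards, next_states, dones

    def __len__(self):
        return len(self.memory)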
Example #9
class Agent():
    def __init__(self, args, env):
        self.action_space = env.action_space()
        self.atoms = args.atoms
        self.Vmin = args.V_min
        self.Vmax = args.V_max
        self.support = np.linspace(args.V_min, args.V_max,
                                   self.atoms)  # Support (range) of z
        self.delta_z = (args.V_max - args.V_min) / (self.atoms - 1)
        self.batch_size = args.batch_size
        self.n = args.multi_steps
        self.discount = args.discount
        self.norm_clip = args.max_norm_clip

        self.sess = tf.Session()

        with tf.variable_scope("online_net"):
            self.online_net = DQN(args, self.action_space)

        self.online_net.train()

        with tf.variable_scope("target_net"):
            self.target_net = DQN(args, self.action_space)
        self.target_net.train()

        self.sess.run(tf.global_variables_initializer())

        self.saver = tf.train.Saver()
        if tf.gfile.Exists("./models/model.ckpt"):
            self.saver.restore(self.sess, "./models/model.ckpt")

        online_net_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                                 scope="online_net")
        target_net_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                                 scope="target_net")
        update_target_op = []
        for var, var_target in zip(
                sorted(online_net_func_vars, key=lambda v: v.name),
                sorted(target_net_func_vars, key=lambda v: v.name)):
            update_target_op.append(var_target.assign(var))
        self.update_target_op = tf.group(*update_target_op)

        self.update_target_net()

        self.optimizer = tf.train.AdamOptimizer(
            learning_rate=args.learning_rate, epsilon=args.adam_eps)

    def forward(self, network, inputs, log=False):
        if log:
            output = self.sess.run(network.action_log,
                                   feed_dict={network.inputs: inputs})
            return output
        else:
            output = self.sess.run(network.action,
                                   feed_dict={network.inputs: inputs})
            return output

    def reset_noise(self):
        self.online_net.reset_noise()

    def act(self, state):
        return np.argmax(
            np.sum(
                (self.forward(self.online_net, state.reshape(1, 84, 84, 4)) *
                 self.support),
                axis=-1))

    def act_e_greedy(self, state, epsilon=0.001):
        return random.randrange(
            self.action_space) if random.random() < epsilon else self.act(
                state)

    def learn(self, mem):
        idxs, states, actions, returns, next_states, nonterminals, weights = mem.sample(
            self.batch_size)

        log_ps = self.forward(self.online_net, states, log=True)
        log_ps_a = []
        for i in range(self.batch_size):
            log_ps_a.append(log_ps[i][actions[i]])

        pns = self.forward(self.online_net, next_states)
        dns = np.broadcast_to(self.support, (self.action_space, self.atoms))
        dns = np.multiply(
            np.broadcast_to(dns,
                            (self.batch_size, self.action_space, self.atoms)),
            pns)
        argmax_indices_ns = np.argmax(np.sum(dns, axis=2), axis=1)
        self.target_net.reset_noise()
        pns = self.forward(self.target_net, next_states)
        pns_a = pns[range(self.batch_size), argmax_indices_ns]

        Tz = np.expand_dims(
            returns, axis=1) + (self.discount**self.n) * np.multiply(
                nonterminals, np.expand_dims(self.support, axis=0))
        Tz = np.clip(Tz, self.Vmin, self.Vmax)
        b = (Tz - self.Vmin) / self.delta_z
        l, u = np.floor(b).astype(dtype=np.int64), np.ceil(b).astype(
            dtype=np.int64)
        l[(u > 0) * (l == u)] -= 1
        u[(l < (self.atoms - 1)) * (l == u)] += 1

        m = np.zeros([self.batch_size, self.atoms], dtype=states.dtype)
        offset = np.broadcast_to(
            np.expand_dims(np.linspace(0, ((self.batch_size - 1) * self.atoms),
                                       self.batch_size),
                           axis=1),
            (self.batch_size, self.atoms)).astype(actions.dtype)
        # np.add.at on m.flatten() would update a copy, not m; use a reshaped view.
        m_flat = m.reshape(-1)
        np.add.at(m_flat, (l + offset).flatten(),
                  (pns_a * (u.astype(np.float32) - b)).flatten())
        np.add.at(m_flat, (u + offset).flatten(),
                  (pns_a * (b - l.astype(np.float32))).flatten())

        loss = -np.sum(m * log_ps_a, 1)
        loss = weights * loss

    def update_target_net(self):
        self.sess.run(self.update_target_op)

    def save(self, path):
        self.save_path = self.saver.save(self.sess, "./models/model.ckpt")

    def evaluate_q(self, state):
        return np.sum(
            (self.forward(self.online_net, state.reshape(1, 84, 84, 4)) *
             self.support),
            axis=-1).max(axis=1)[0]

    def train(self):
        self.online_net.train()

    def eval(self):
        self.online_net.eval()
Example #10
def train(args):
    logger = setup_logger(args)
    logger.info('---- Options ----')
    for k, v in vars(args).items():
        logger.info(k + ': ' + str(v))
    logger.info('--------\n')

    if torch.cuda.is_available():
        torch.cuda.manual_seed(0)
    else:
        torch.manual_seed(0)
    if os.path.isdir(args.tensorboard_dir):
        shutil.rmtree(args.tensorboard_dir)
    os.makedirs(args.tensorboard_dir)
    if not os.path.exists(args.saved_dir):
        os.makedirs(args.saved_dir)
    writer = SummaryWriter(args.tensorboard_dir)
    env = Tetris(width=args.width,
                 height=args.height,
                 block_size=args.block_size,
                 sim_rom_mode=args.sim_rom_mode)
    state_dim = 25
    action_dim = 2
    device = torch.device(
        'cuda:{}'.format(args.gpu) if torch.cuda.is_available() else 'cpu')
    model = DQN(input_dim=state_dim).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
    criterion = nn.MSELoss()

    state = env.reset()

    replay_memory = ReplayBufferOld(
        state_dim, action_dim, device=device,
        max_size=args.replay_memory_size)  # action = [x_axis, rotate_times]
    episode = 0
    step_cnt = 0
    seed = 0
    random.seed(seed)
    while episode < args.num_episodes:
        next_steps = env.get_next_states()

        next_actions, next_states = zip(*next_steps.items())
        next_states = torch.stack(next_states)
        next_states = next_states.to(device)
        model.eval()
        with torch.no_grad():
            predictions = model(next_states)[:, 0]
        index = get_action_index(args, episode, predictions, next_steps)
        model.train()

        next_state = next_states[index, :]
        next_state = next_state.cpu().numpy()
        action = next_actions[index]

        reward, done = env.step(action, render=False)
        if step_cnt > args.max_episode_length:
            done = True

        replay_memory.add(state, action, next_state, reward, done)
        if done:
            final_score = env.score
            final_tetrominoes = env.tetrominoes
            final_cleared_lines = env.cleared_lines
            state = env.reset()
            step_cnt = 0
        else:
            state = next_state
            step_cnt += 1
            continue

        if len(replay_memory) < args.replay_memory_size / 10:
            # logger.info("Episode:%d Current Memory Size: %d" % (episode, len(replay_memory)))
            continue
        episode += 1
        batch = replay_memory.sample(args.batch_size)
        state_batch, action_batch, next_state_batch, reward_batch, done_batch = batch

        q_values = model(state_batch)
        model.eval()
        with torch.no_grad():
            next_prediction_batch = model(next_state_batch)
        model.train()

        next_prediction_batch[done_batch < 0.5] = 0.0
        y_batch = reward_batch + args.gamma * next_prediction_batch

        optimizer.zero_grad()
        loss = criterion(q_values, y_batch)
        loss.backward()
        optimizer.step()

        logger.info(
            "Episode: {}/{}, Score: {}, Tetrominoes {}, Cleared lines: {}".
            format(episode, args.num_episodes, final_score, final_tetrominoes,
                   final_cleared_lines))
        writer.add_scalar('Train/Score', final_score, episode - 1)
        writer.add_scalar('Train/Tetrominoes', final_tetrominoes, episode - 1)
        writer.add_scalar('Train/Cleared lines', final_cleared_lines,
                          episode - 1)

        if episode > 2000 and episode % args.save_interval == 0:
            torch.save(model,
                       "{}/tetris_{}.pth".format(args.saved_dir, episode))
        if episode % 100:
            random.seed(seed % 10)
            seed += 1

    torch.save(model, "{}/tetris.pth".format(args.saved_dir))
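
get_action_index() in Example #10 is not defined in the snippet; it picks one of the candidate placements evaluated by the model. A minimal epsilon-greedy sketch; the annealing parameters on args (initial_epsilon, final_epsilon, num_decay_epochs) are assumptions.

def get_action_index(args, episode, predictions, next_steps):
    # Minimal sketch: linearly anneal epsilon over the first num_decay_epochs
    # episodes, then either explore among the candidate placements or pick the
    # one with the highest predicted value. Assumes random and torch are
    # imported as in Example #10; the args attribute names are assumptions.
    epsilon = args.final_epsilon + max(args.num_decay_epochs - episode, 0) * (
        args.initial_epsilon - args.final_epsilon) / args.num_decay_epochs
    if random.random() <= epsilon:
        return random.randint(0, len(next_steps) - 1)
    return torch.argmax(predictions).item()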
Example #11
class Agent():
    def __init__(self, args, env):
        self.args = args
        self.action_space = env.action_space()
        self.atoms = args.atoms
        self.Vmin = args.V_min
        self.Vmax = args.V_max
        self.support = torch.linspace(args.V_min, args.V_max, self.atoms).to(
            device=args.device)  # Support (range) of z
        self.delta_z = (args.V_max - args.V_min) / (self.atoms - 1)
        self.batch_size = args.batch_size
        self.n = args.multi_step
        self.discount = args.discount
        self.norm_clip = args.norm_clip
        self.coeff = 0.01 if args.game in [
            'pong', 'boxing', 'private_eye', 'freeway'
        ] else 1.

        self.online_net = DQN(args, self.action_space).to(device=args.device)
        self.momentum_net = DQN(args, self.action_space).to(device=args.device)
        # self.predictor = prediction_MLP(in_dim=128, hidden_dim=128, out_dim=128)

        if args.model:  # Load pretrained model if provided
            if os.path.isfile(args.model):
                state_dict = torch.load(
                    args.model, map_location='cpu'
                )  # Always load tensors onto CPU by default, will shift to GPU if necessary
                if 'conv1.weight' in state_dict.keys():
                    for old_key, new_key in (('conv1.weight',
                                              'convs.0.weight'),
                                             ('conv1.bias', 'convs.0.bias'),
                                             ('conv2.weight',
                                              'convs.2.weight'),
                                             ('conv2.bias', 'convs.2.bias'),
                                             ('conv3.weight',
                                              'convs.4.weight'),
                                             ('conv3.bias', 'convs.4.bias')):
                        state_dict[new_key] = state_dict[
                            old_key]  # Re-map state dict for old pretrained models
                        del state_dict[
                            old_key]  # Delete old keys for strict load_state_dict
                self.online_net.load_state_dict(state_dict)
                print("Loading pretrained model: " + args.model)
            else:  # Raise error if incorrect model path provided
                raise FileNotFoundError(args.model)

        self.online_net.train()
        # self.pred.train()
        self.initialize_momentum_net()
        self.momentum_net.train()

        self.target_net = DQN(args, self.action_space).to(device=args.device)
        self.update_target_net()
        self.target_net.train()
        for param in self.target_net.parameters():
            param.requires_grad = False

        for param in self.momentum_net.parameters():
            param.requires_grad = False
        self.optimiser = optim.Adam(self.online_net.parameters(),
                                    lr=args.learning_rate,
                                    eps=args.adam_eps)

    # Resets noisy weights in all linear layers (of online net only)
    def reset_noise(self):
        self.online_net.reset_noise()

    # Acts based on single state (no batch)
    def act(self, state):
        with torch.no_grad():
            a, _, _ = self.online_net(state.unsqueeze(0))
            return (a * self.support).sum(2).argmax(1).item()

    # Acts with an ε-greedy policy (used for evaluation only)
    def act_e_greedy(
            self,
            state,
            epsilon=0.001):  # High ε can reduce evaluation scores drastically
        return np.random.randint(
            0, self.action_space
        ) if np.random.random() < epsilon else self.act(state)

    def learn(self, mem):
        # Sample transitions
        idxs, states, actions, returns, next_states, nonterminals, weights = mem.sample(
            self.batch_size)
        # print('\n\n---------------')
        # print(f'idxs: {idxs}, ')
        # print(f'states: {states.shape}, ')
        # print(f'actions: {actions.shape}, ')
        # print(f'returns: {returns.shape}, ')
        # print(f'next_states: {next_states.shape}, ')
        # print(f'nonterminals: {nonterminals.shape}, ')
        # print(f'weights: {weights.shape},')

        aug_states_1 = aug(states).to(device=self.args.device)
        aug_states_2 = aug(states).to(device=self.args.device)

        # print(f'aug_states_1: {aug_states_1.shape}')
        # print(f'aug_states_2: {aug_states_2.shape}')

        # Calculate current state probabilities (online network noise already sampled)
        log_ps, _, _ = self.online_net(
            states, log=True)  # Log probabilities log p(s_t, ·; θonline)

        _, z_1, p_1 = self.online_net(aug_states_1, log=True)
        _, z_2, p_2 = self.online_net(aug_states_2, log=True)
        # p_1, p_2 = self.pred(z_1), self.pred(z_2)

        # with torch.no_grad():
        #     p_2 = self.pred(z_2)

        simsiam_loss = 2 + D(p_1, z_2) / 2 + D(p_2, z_1) / 2
        # simsiam_loss = p_1.mean() + p_2.mean()
        # simsiam_loss = p_1.mean() * 128
        # simsiam_loss = - F.cosine_similarity(p_1, z_2.detach(), dim=-1).mean()
        # print(simsiam_loss)
        # simsiam_loss = 0

        # _, z_target = self.momentum_net(aug_states_2, log=True) #z_k
        # z_proj = torch.matmul(self.online_net.W, z_target.T)
        # logits = torch.matmul(z_anch, z_proj)
        # logits = (logits - torch.max(logits, 1)[0][:, None])
        # logits = logits * 0.1
        # labels = torch.arange(logits.shape[0]).long().to(device=self.args.device)
        # moco_loss = (nn.CrossEntropyLoss()(logits, labels)).to(device=self.args.device)

        log_ps_a = log_ps[range(self.batch_size),
                          actions]  # log p(s_t, a_t; θonline)

        # print(f'z_1: {z_1.shape}')
        # print(f'p_1: {p_1.shape}')
        # print('---------------\n\n')

        # 1/0

        with torch.no_grad():
            # Calculate nth next state probabilities
            pns, _, _ = self.online_net(
                next_states)  # Probabilities p(s_t+n, ·; θonline)
            dns = self.support.expand_as(
                pns) * pns  # Distribution d_t+n = (z, p(s_t+n, ·; θonline))
            argmax_indices_ns = dns.sum(2).argmax(
                1
            )  # Perform argmax action selection using online network: argmax_a[(z, p(s_t+n, a; θonline))]
            self.target_net.reset_noise()  # Sample new target net noise
            pns, _, _ = self.target_net(
                next_states)  # Probabilities p(s_t+n, ·; θtarget)
            pns_a = pns[range(
                self.batch_size
            ), argmax_indices_ns]  # Double-Q probabilities p(s_t+n, argmax_a[(z, p(s_t+n, a; θonline))]; θtarget)

            # Compute Tz (Bellman operator T applied to z)
            Tz = returns.unsqueeze(1) + nonterminals * (
                self.discount**self.n) * self.support.unsqueeze(
                    0)  # Tz = R^n + (γ^n)z (accounting for terminal states)
            Tz = Tz.clamp(min=self.Vmin,
                          max=self.Vmax)  # Clamp between supported values
            # Compute L2 projection of Tz onto fixed support z
            b = (Tz - self.Vmin) / self.delta_z  # b = (Tz - Vmin) / Δz
            l, u = b.floor().to(torch.int64), b.ceil().to(torch.int64)
            # Fix disappearing probability mass when l = b = u (b is int)
            l[(u > 0) * (l == u)] -= 1
            u[(l < (self.atoms - 1)) * (l == u)] += 1

            # Distribute probability of Tz
            m = states.new_zeros(self.batch_size, self.atoms)
            offset = torch.linspace(0, ((self.batch_size - 1) * self.atoms),
                                    self.batch_size).unsqueeze(1).expand(
                                        self.batch_size,
                                        self.atoms).to(actions)
            m.view(-1).index_add_(
                0, (l + offset).view(-1),
                (pns_a *
                 (u.float() - b)).view(-1))  # m_l = m_l + p(s_t+n, a*)(u - b)
            m.view(-1).index_add_(
                0, (u + offset).view(-1),
                (pns_a *
                 (b - l.float())).view(-1))  # m_u = m_u + p(s_t+n, a*)(b - l)

        loss = -torch.sum(
            m * log_ps_a,
            1)  # Cross-entropy loss (minimises DKL(m||p(s_t, a_t)))
        # loss = loss + (moco_loss * self.coeff)
        loss = loss + (simsiam_loss * self.coeff)
        self.online_net.zero_grad()
        # self.pred.zero_grad()
        curl_loss = (weights * loss).mean()
        # print(curl_loss)
        curl_loss.mean().backward(
        )  # Backpropagate importance-weighted minibatch loss
        clip_grad_norm_(self.online_net.parameters(),
                        self.norm_clip)  # Clip gradients by L2 norm
        self.optimiser.step()

        mem.update_priorities(idxs,
                              loss.detach().cpu().numpy()
                              )  # Update priorities of sampled transitions

    def learn_old(self, mem):
        # Sample transitions
        idxs, states, actions, returns, next_states, nonterminals, weights = mem.sample(
            self.batch_size)
        # print('\n\n---------------')
        # print(f'idxs: {idxs}, ')
        # print(f'states: {states.shape}, ')
        # print(f'actions: {actions.shape}, ')
        # print(f'returns: {returns.shape}, ')
        # print(f'next_states: {next_states.shape}, ')
        # print(f'nonterminals: {nonterminals.shape}, ')
        # print(f'weights: {weights.shape},')

        aug_states_1 = aug(states).to(device=self.args.device)
        aug_states_2 = aug(states).to(device=self.args.device)

        # print(f'aug_states_1: {aug_states_1.shape}')
        # print(f'aug_states_2: {aug_states_2.shape}')

        # Calculate current state probabilities (online network noise already sampled)
        log_ps, _, _ = self.online_net(
            states, log=True)  # Log probabilities log p(s_t, ·; θonline)
        _, z_anch, _ = self.online_net(aug_states_1, log=True)  #z_q
        _, z_target, _ = self.momentum_net(aug_states_2, log=True)  #z_k
        z_proj = torch.matmul(self.online_net.W, z_target.T)
        logits = torch.matmul(z_anch, z_proj)
        logits = (logits - torch.max(logits, 1)[0][:, None])
        logits = logits * 0.1
        labels = torch.arange(
            logits.shape[0]).long().to(device=self.args.device)
        moco_loss = (nn.CrossEntropyLoss()(logits,
                                           labels)).to(device=self.args.device)

        log_ps_a = log_ps[range(self.batch_size),
                          actions]  # log p(s_t, a_t; θonline)

        # print(f'z_anch: {z_anch.shape}')
        # print(f'z_target: {z_target.shape}')
        # print(f'z_proj: {z_proj.shape}')
        # print(f'logits: {logits.shape}')
        # print(logits)
        # print(f'labels: {labels.shape}')
        # print(labels)
        # print('---------------\n\n')

        # 1/0

        with torch.no_grad():
            # Calculate nth next state probabilities
            pns, _, _ = self.online_net(
                next_states)  # Probabilities p(s_t+n, ·; θonline)
            dns = self.support.expand_as(
                pns) * pns  # Distribution d_t+n = (z, p(s_t+n, ·; θonline))
            argmax_indices_ns = dns.sum(2).argmax(
                1
            )  # Perform argmax action selection using online network: argmax_a[(z, p(s_t+n, a; θonline))]
            self.target_net.reset_noise()  # Sample new target net noise
            pns, _, _ = self.target_net(
                next_states)  # Probabilities p(s_t+n, ·; θtarget)
            pns_a = pns[range(
                self.batch_size
            ), argmax_indices_ns]  # Double-Q probabilities p(s_t+n, argmax_a[(z, p(s_t+n, a; θonline))]; θtarget)

            # Compute Tz (Bellman operator T applied to z)
            Tz = returns.unsqueeze(1) + nonterminals * (
                self.discount**self.n) * self.support.unsqueeze(
                    0)  # Tz = R^n + (γ^n)z (accounting for terminal states)
            Tz = Tz.clamp(min=self.Vmin,
                          max=self.Vmax)  # Clamp between supported values
            # Compute L2 projection of Tz onto fixed support z
            b = (Tz - self.Vmin) / self.delta_z  # b = (Tz - Vmin) / Δz
            l, u = b.floor().to(torch.int64), b.ceil().to(torch.int64)
            # Fix disappearing probability mass when l = b = u (b is int)
            l[(u > 0) * (l == u)] -= 1
            u[(l < (self.atoms - 1)) * (l == u)] += 1

            # Distribute probability of Tz
            m = states.new_zeros(self.batch_size, self.atoms)
            offset = torch.linspace(0, ((self.batch_size - 1) * self.atoms),
                                    self.batch_size).unsqueeze(1).expand(
                                        self.batch_size,
                                        self.atoms).to(actions)
            m.view(-1).index_add_(
                0, (l + offset).view(-1),
                (pns_a *
                 (u.float() - b)).view(-1))  # m_l = m_l + p(s_t+n, a*)(u - b)
            m.view(-1).index_add_(
                0, (u + offset).view(-1),
                (pns_a *
                 (b - l.float())).view(-1))  # m_u = m_u + p(s_t+n, a*)(b - l)

        loss = -torch.sum(
            m * log_ps_a,
            1)  # Cross-entropy loss (minimises DKL(m||p(s_t, a_t)))
        print(moco_loss)
        loss = loss + (moco_loss * self.coeff)
        self.online_net.zero_grad()
        curl_loss = (weights * loss).mean()
        curl_loss.mean().backward(
        )  # Backpropagate importance-weighted minibatch loss
        clip_grad_norm_(self.online_net.parameters(),
                        self.norm_clip)  # Clip gradients by L2 norm
        self.optimiser.step()

        mem.update_priorities(idxs,
                              loss.detach().cpu().numpy()
                              )  # Update priorities of sampled transitions

    def update_target_net(self):
        self.target_net.load_state_dict(self.online_net.state_dict())

    def initialize_momentum_net(self):
        for param_q, param_k in zip(self.online_net.parameters(),
                                    self.momentum_net.parameters()):
            param_k.data.copy_(param_q.data)  # update
            param_k.requires_grad = False  # not update by gradient

    # Code for this function from https://github.com/facebookresearch/moco
    @torch.no_grad()
    def update_momentum_net(self, momentum=0.999):
        for param_q, param_k in zip(self.online_net.parameters(),
                                    self.momentum_net.parameters()):
            param_k.data.copy_(momentum * param_k.data +
                               (1. - momentum) * param_q.data)  # update

    # Save model parameters on current device (don't move model between devices)
    def save(self, path, name='model.pth'):
        torch.save(self.online_net.state_dict(), os.path.join(path, name))

    # Evaluates Q-value based on single state (no batch)
    def evaluate_q(self, state):
        with torch.no_grad():
            a, _, _ = self.online_net(state.unsqueeze(0))
            return (a * self.support).sum(2).max(1)[0].item()

    def train(self):
        self.online_net.train()

    def eval(self):
        self.online_net.eval()
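
Example #11's simsiam_loss calls a D() similarity function (and an aug() image augmentation) defined elsewhere in the repository. A sketch of D as the SimSiam negative cosine similarity with a stop-gradient on the target, consistent with the commented-out line in learn(); the exact implementation in the source may differ.

import torch.nn.functional as F

def D(p, z):
    # Negative cosine similarity with stop-gradient on z (SimSiam-style);
    # Example #11 averages D(p_1, z_2) and D(p_2, z_1).
    z = z.detach()
    return -F.cosine_similarity(p, z, dim=-1).mean()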
Example #12
class Agent():
    def __init__(self, args, env):
        self.action_space = env.action_space()
        self.atoms = args.atoms
        self.Vmin = args.V_min
        self.Vmax = args.V_max
        self.support = torch.linspace(args.V_min, args.V_max, self.atoms).to(
            device=args.device)  # Support (range) of z
        self.delta_z = (args.V_max - args.V_min) / (self.atoms - 1)
        self.batch_size = args.batch_size
        self.n = args.multi_step
        self.discount = args.discount
        self.norm_clip = args.norm_clip

        self.online_net = DQN(args, self.action_space).to(device=args.device)
        if args.model:  # Load pretrained model if provided
            if os.path.isfile(args.model):
                state_dict = torch.load(
                    args.model, map_location='cpu'
                )  # Always load tensors onto CPU by default, will shift to GPU if necessary
                if 'conv1.weight' in state_dict.keys():
                    for old_key, new_key in (('conv1.weight',
                                              'convs.0.weight'),
                                             ('conv1.bias', 'convs.0.bias'),
                                             ('conv2.weight',
                                              'convs.2.weight'),
                                             ('conv2.bias', 'convs.2.bias'),
                                             ('conv3.weight',
                                              'convs.4.weight'),
                                             ('conv3.bias', 'convs.4.bias')):
                        state_dict[new_key] = state_dict[
                            old_key]  # Re-map state dict for old pretrained models
                        del state_dict[
                            old_key]  # Delete old keys for strict load_state_dict
                self.online_net.load_state_dict(state_dict)
                print("Loading pretrained model: " + args.model)
            else:  # Raise error if incorrect model path provided
                raise FileNotFoundError(args.model)

        self.online_net.train()

        self.target_net = DQN(args, self.action_space).to(device=args.device)
        self.update_target_net()
        self.target_net.train()
        for param in self.target_net.parameters():
            param.requires_grad = False

        # self.optimiser = optim.Adam(self.online_net.parameters(), lr=args.learning_rate, eps=args.adam_eps)
        self.convs_optimiser = optim.Adam(self.online_net.convs.parameters(),
                                          lr=args.learning_rate,
                                          eps=args.adam_eps)
        self.linear_optimiser = optim.Adam(chain(
            self.online_net.fc_h_v.parameters(),
            self.online_net.fc_h_a.parameters(),
            self.online_net.fc_z_v.parameters(),
            self.online_net.fc_z_a.parameters()),
                                           lr=args.learning_rate,
                                           eps=args.adam_eps)

    # Resets noisy weights in all linear layers (of online net only)
    def reset_noise(self):
        self.online_net.reset_noise()

    # Acts based on single state (no batch)
    def act(self, state):

        with torch.no_grad():
            # don't count these calls since it is accounted for after "action = dqn.act(state)" in main.py
            ret = (self.online_net(state.unsqueeze(0)) *
                   self.support).sum(2).argmax(1).item()
            return ret

    # Acts with an ε-greedy policy (used for evaluation only)
    def act_e_greedy(
            self,
            state,
            epsilon=0.001):  # High ε can reduce evaluation scores drastically
        return np.random.randint(
            0, self.action_space
        ) if np.random.random() < epsilon else self.act(state)

    def learn(self, mem, freeze=False):
        # Sample transitions
        idxs, states, actions, returns, next_states, nonterminals, weights, _ = mem.sample(
            self.batch_size)

        # Calculate current state probabilities (online network noise already sampled)
        log_ps = self.online_net(
            states, log=True)  # Log probabilities log p(s_t, ·; θonline)
        log_ps_a = log_ps[range(self.batch_size),
                          actions]  # log p(s_t, a_t; θonline)

        with torch.no_grad():
            # Calculate nth next state probabilities
            pns = self.online_net(
                next_states)  # Probabilities p(s_t+n, ·; θonline)
            dns = self.support.expand_as(
                pns) * pns  # Distribution d_t+n = (z, p(s_t+n, ·; θonline))
            argmax_indices_ns = dns.sum(2).argmax(
                1
            )  # Perform argmax action selection using online network: argmax_a[(z, p(s_t+n, a; θonline))]
            self.target_net.reset_noise()  # Sample new target net noise
            pns = self.target_net(
                next_states)  # Probabilities p(s_t+n, ·; θtarget)
            pns_a = pns[range(
                self.batch_size
            ), argmax_indices_ns]  # Double-Q probabilities p(s_t+n, argmax_a[(z, p(s_t+n, a; θonline))]; θtarget)

            # Compute Tz (Bellman operator T applied to z)
            Tz = returns.unsqueeze(1) + nonterminals * (
                self.discount**self.n) * self.support.unsqueeze(
                    0)  # Tz = R^n + (γ^n)z (accounting for terminal states)
            Tz = Tz.clamp(min=self.Vmin,
                          max=self.Vmax)  # Clamp between supported values
            # Compute L2 projection of Tz onto fixed support z
            b = (Tz - self.Vmin) / self.delta_z  # b = (Tz - Vmin) / Δz
            l, u = b.floor().to(torch.int64), b.ceil().to(torch.int64)
            # Fix disappearing probability mass when l = b = u (b is int)
            l[(u > 0) * (l == u)] -= 1
            u[(l < (self.atoms - 1)) * (l == u)] += 1

            # Distribute probability of Tz
            m = states.new_zeros(self.batch_size, self.atoms)
            offset = torch.linspace(0, ((self.batch_size - 1) * self.atoms),
                                    self.batch_size).unsqueeze(1).expand(
                                        self.batch_size,
                                        self.atoms).to(actions)
            m.view(-1).index_add_(
                0, (l + offset).view(-1),
                (pns_a *
                 (u.float() - b)).view(-1))  # m_l = m_l + p(s_t+n, a*)(u - b)
            m.view(-1).index_add_(
                0, (u + offset).view(-1),
                (pns_a *
                 (b - l.float())).view(-1))  # m_u = m_u + p(s_t+n, a*)(b - l)

        loss = -torch.sum(
            m * log_ps_a,
            1)  # Cross-entropy loss (minimises DKL(m||p(s_t, a_t)))
        self.online_net.zero_grad()
        loss.mean().backward(
        )  # Backpropagate importance-weighted minibatch loss
        clip_grad_norm_(self.online_net.parameters(),
                        self.norm_clip)  # Clip gradients by L2 norm
        # self.optimiser.step()
        if not freeze:
            self.convs_optimiser.step()
        self.linear_optimiser.step()

    def learn_with_latent(self, latent_mem):
        # Sample transitions
        idxs, states, actions, returns, next_states, nonterminals, weights, ns = latent_mem.sample(
            self.batch_size)

        # Calculate current state probabilities (online network noise already sampled)
        log_ps = self.online_net.forward_with_latent(
            states, log=True)  # Log probabilities log p(s_t, ·; θonline)
        log_ps_a = log_ps[range(self.batch_size),
                          actions]  # log p(s_t, a_t; θonline)
        with torch.no_grad():
            # Calculate nth next state probabilities
            pns = self.online_net.forward_with_latent(
                next_states)  # Probabilities p(s_t+n, ·; θonline)
            dns = self.support.expand_as(
                pns) * pns  # Distribution ds_t+n = (z, p(s_t+n, ·; θonline))
            argmax_indices_ns = dns.sum(2).argmax(
                1
            )  # Perform argmax action selection using online network: argmax_a[(z, p(s_t+n, a; θonline))]
            self.target_net.reset_noise()  # Sample new target net noise
            pns = self.target_net.forward_with_latent(
                next_states)  # Probabilities p(s_t+n, ·; θtarget)
            pns_a = pns[range(
                self.batch_size
            ), argmax_indices_ns]  # Double-Q probabilities p(s_t+n, argmax_a[(z, p(s_t+n, a; θonline))]; θtarget)

            # use ns instead of self.n since n is possibly different for each sequence in the batch
            ns = torch.tensor(ns, device=latent_mem.device).unsqueeze(1)
            # Compute Tz (Bellman operator T applied to z)
            Tz = returns.unsqueeze(1) + nonterminals * (
                self.discount**ns) * self.support.unsqueeze(
                    0)  # Tz = R^n + (γ^n)z (accounting for terminal states)
            Tz = Tz.clamp(min=self.Vmin,
                          max=self.Vmax)  # Clamp between supported values
            # Compute L2 projection of Tz onto fixed support z
            b = (Tz - self.Vmin) / self.delta_z  # b = (Tz - Vmin) / Δz
            l, u = b.floor().to(torch.int64), b.ceil().to(torch.int64)
            # Fix disappearing probability mass when l = b = u (b is int)
            l[(u > 0) * (l == u)] -= 1
            u[(l < (self.atoms - 1)) * (l == u)] += 1

            # Distribute probability of Tz
            m = states.new_zeros(self.batch_size, self.atoms)
            offset = torch.linspace(0, ((self.batch_size - 1) * self.atoms),
                                    self.batch_size).unsqueeze(1).expand(
                                        self.batch_size,
                                        self.atoms).to(actions)
            m.view(-1).index_add_(
                0, (l + offset).view(-1),
                (pns_a *
                 (u.float() - b)).view(-1))  # m_l = m_l + p(s_t+n, a*)(u - b)
            m.view(-1).index_add_(
                0, (u + offset).view(-1),
                (pns_a *
                 (b - l.float())).view(-1))  # m_u = m_u + p(s_t+n, a*)(b - l)

        loss = -torch.sum(
            m * log_ps_a,
            1)  # Cross-entropy loss (minimises DKL(m||p(s_t, a_t)))
        self.online_net.zero_grad()
        loss.mean().backward(
        )  # Backpropagate importance-weighted minibatch loss
        clip_grad_norm_(self.online_net.parameters(),
                        self.norm_clip)  # Clip gradients by L2 norm
        # self.optimiser.step()
        self.linear_optimiser.step()

    def update_target_net(self):
        self.target_net.load_state_dict(self.online_net.state_dict())

    # Save model parameters on current device (don't move model between devices)
    def save(self, path, name='model.pth'):
        torch.save(self.online_net.state_dict(), os.path.join(path, name))

    # Evaluates Q-value based on single state (no batch)
    def evaluate_q(self, state):
        with torch.no_grad():
            return (self.online_net(state.unsqueeze(0)) *
                    self.support).sum(2).max(1)[0].item()

    def train(self):
        self.online_net.train()

    def eval(self):
        self.online_net.eval()
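
The split optimisers in Example #12 make it possible to freeze the convolutional encoder and keep training only the linear (noisy/dueling) heads. A hypothetical usage sketch; the surrounding loop, the replay memory mem, and the finetune_steps and target_update arguments are assumptions.

# Hypothetical fine-tuning loop around Example #12's Agent; not from the source.
agent = Agent(args, env)
for step in range(args.finetune_steps):
    agent.reset_noise()
    agent.learn(mem, freeze=True)          # convs_optimiser.step() is skipped
    if step % args.target_update == 0:
        agent.update_target_net()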
Example #13
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self,
                 state_size,
                 action_size,
                 seed,
                 gamma=0.99,
                 step_size=1,
                 dueling_dqn=False):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        if dueling_dqn:
            print("Use dueling dqn")
            self.qnetwork_local = NoisyDuelingDQN(state_size, action_size,
                                                  seed).to(device)
            self.qnetwork_target = NoisyDuelingDQN(state_size, action_size,
                                                   seed).to(device)
        else:
            print("Use non-dueling dqn")
            self.qnetwork_local = DQN(state_size, action_size, seed).to(device)
            self.qnetwork_target = DQN(state_size, action_size,
                                       seed).to(device)

        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
        self.gamma = gamma
        self.step_size = step_size

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences)

    def act(self, state):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        return np.argmax(action_values.cpu().data.numpy())

    def learn(self, experiences):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Compute and minimize loss
        # Get max predicted Q values (for next states) from target model
        Q_targets_next = self.qnetwork_target(next_states).detach().max(
            1)[0].unsqueeze(1)
        # Compute Q targets for current states
        # gamma ** step_size for the n-step DQN target
        Q_targets = rewards + (pow(self.gamma, self.step_size) *
                               Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
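The soft update above applies theta_target = tau * theta_local + (1 - tau) * theta_target
parameter by parameter, so the target network trails the online network slowly.
A self-contained sketch on two toy networks (the Linear layers and tau value
here are illustrative assumptions, not the networks used above):

import torch.nn as nn

local, target = nn.Linear(4, 2), nn.Linear(4, 2)
tau = 1e-3

def soft_update(local_model, target_model, tau):
    # Move each target parameter a small step toward the online parameter.
    for t_param, l_param in zip(target_model.parameters(), local_model.parameters()):
        t_param.data.copy_(tau * l_param.data + (1.0 - tau) * t_param.data)

soft_update(local, target, tau)   # with tau=1.0 this becomes a hard copy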
Example #14
0
class Agent(object):
    """ all improvments from Rainbow research work
    """
    def __init__(self, args, state_size, action_size):
        """
        Args:
           param1 (args): args
           param2 (int): args
           param3 (int): args
        """
        self.action_size = action_size
        self.state_size = state_size
        self.atoms = args.atoms
        self.V_min = args.V_min
        self.V_max = args.V_max
        self.device = args.device
        self.support = torch.linspace(args.V_min, args.V_max, self.atoms).to(
            device=self.device)  # Support (range) of z
        self.delta_z = (args.V_max - args.V_min) / (self.atoms - 1)
        self.batch_size = args.batch_size
        self.n = args.multi_step
        self.discount = args.discount

        self.qnetwork_local = DQN(args, self.state_size,
                                  self.action_size).to(device=args.device)
        if args.model and os.path.isfile(args.model):
            # Always load tensors onto CPU by default, will shift to GPU if necessary
            self.qnetwork_local.load_state_dict(
                torch.load(args.model, map_location='cpu'))
        self.qnetwork_local.train()

        self.target_net = DQN(args, self.state_size,
                              self.action_size).to(device=args.device)
        self.update_target_net()
        self.target_net.train()
        for param in self.target_net.parameters():
            param.requires_grad = False
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(),
                                    lr=args.lr,
                                    eps=args.adam_eps)

    def reset_noise(self):
        """ resets noisy weights in all linear layers """
        self.qnetwork_local.reset_noise()

    def act(self, state):
        """
          acts greedy(max) based on a single state
          Args:
             param1 (int) : state
        """
        with torch.no_grad():
            return (self.qnetwork_local(state.unsqueeze(0).to(self.device)) *
                    self.support).sum(2).argmax(1).item()

    def act_e_greedy(self, state, epsilon=0.001):
        """ acts with epsilon greedy policy
            epsilon exploration vs exploitation traide off
        Args:
            param1(int): state
            param2(float): epsilon
        Return : action int number between 0 and 4
        """
        return np.random.randint(
            0, self.action_size) if np.random.random() < epsilon else self.act(
                state)

    def learn(self, mem):
        """ uses samples with the given batch size to improve the Q function
        Args:
            param1 (Experince Replay Buffer) : mem
        """
        # Sample transitions
        idxs, states, actions, returns, next_states, nonterminals, weights = mem.sample(
            self.batch_size)
        # Calculate current state probabilities (online network noise already sampled)
        log_ps = self.qnetwork_local(
            states, log=True)  # Log probabilities log p(s_t, *; theta online)
        log_ps_a = log_ps[range(self.batch_size),
                          actions]  # log p(s_t, a_t; theta online)

        with torch.no_grad():
            # Calculate nth next state probabilities
            pns = self.qnetwork_local(
                next_states)  # Probabilities p(s_t+n, *; theta online)
            dns = self.support.expand_as(
                pns
            ) * pns  # Distribution d_t+n = (z, p(s_t+n, *; theta online))
            argmax_indices_ns = dns.sum(2).argmax(
                1
            )  # Perform argmax action selection using online network: argmax_a[(z, p(s_t+n, a; theta online))]
            self.target_net.reset_noise()  # Sample new target net noise
            pns = self.target_net(
                next_states)  # Probabilities p(s_t+n, *; theta target)
            pns_a = pns[range(
                self.batch_size
            ), argmax_indices_ns]  # Double-Q probabilities p(s_t+n, argmax_a[(z, p(s_t+n, a; theta online))]; theta target)

            # Compute Tz (Bellman operator T applied to z)
            Tz = returns.unsqueeze(1) + nonterminals * (
                self.discount**self.n
            ) * self.support.unsqueeze(
                0)  # Tz = R^n + (discount^n)z (accounting for terminal states)
            Tz = Tz.clamp(min=self.V_min,
                          max=self.V_max)  # Clamp between supported values
            # Compute L2 projection of Tz onto fixed support z
            b = (Tz - self.V_min) / self.delta_z  # b = (Tz - Vmin) / delta z
            l, u = b.floor().to(torch.int64), b.ceil().to(torch.int64)
            # Fix disappearing probability mass when l = b = u (b is int)
            l[(u > 0) * (l == u)] -= 1
            u[(l < (self.atoms - 1)) * (l == u)] += 1

            # Distribute probability of Tz
            m = states.new_zeros(self.batch_size, self.atoms)
            offset = torch.linspace(0, ((self.batch_size - 1) * self.atoms),
                                    self.batch_size).unsqueeze(1).expand(
                                        self.batch_size,
                                        self.atoms).to(actions)
            m.view(-1).index_add_(
                0, (l + offset).view(-1),
                (pns_a *
                 (u.float() - b)).view(-1))  # m_l = m_l + p(s_t+n, a*)(u - b)
            m.view(-1).index_add_(
                0, (u + offset).view(-1),
                (pns_a *
                 (b - l.float())).view(-1))  # m_u = m_u + p(s_t+n, a*)(b - l)

        loss = -torch.sum(
            m * log_ps_a,
            1)  # Cross-entropy loss (minimises DKL(m||p(s_t, a_t)))
        self.qnetwork_local.zero_grad()
        (weights * loss).mean().backward(
        )  # Backpropagate importance-weighted minibatch loss
        self.optimizer.step()

        mem.update_priorities(idxs,
                              loss.detach().cpu().numpy()
                              )  # Update priorities of sampled transitions
        self.soft_update()

    def soft_update(self, tau=1e-3):
        """ swaps the network weights from the online to the target

        Args:
           param1 (float): tau
        """
        for target_param, local_param in zip(self.target_net.parameters(),
                                             self.qnetwork_local.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def update_target_net(self):
        """ copy the model weights from the online to the target network """
        self.target_net.load_state_dict(self.qnetwork_local.state_dict())

    def save(self, path):
        """ save the model weights to a file
        Args:
           param1 (string): pathname
        """
        torch.save(self.qnetwork_local.state_dict(),
                   os.path.join(path, 'model.pth'))

    def evaluate_q(self, state):
        """ Evaluates Q-value based on single state
        """
        with torch.no_grad():
            return (self.qnetwork_local(state.unsqueeze(0)) *
                    self.support).sum(2).max(1)[0].item()

    def train(self):
        """
        activates the backprob. layers for the online network
        """
        self.qnetwork_local.train()

    def eval(self):
        """ invoke the eval from the online network
            deactivates the backprob
            layers like dropout will work in eval model instead
        """
        self.qnetwork_local.eval()
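The heart of learn() above is the categorical (C51) projection: the support is
shifted by the n-step Bellman operator, Tz = R^n + (discount^n) z, clamped to
[V_min, V_max], and each shifted atom's probability mass is split between the
two neighbouring support indices l and u. A standalone numeric sketch of that
projection for a single transition (all values below are illustrative
assumptions, not taken from the example):

import torch

atoms, V_min, V_max = 51, -10.0, 10.0
support = torch.linspace(V_min, V_max, atoms)
delta_z = (V_max - V_min) / (atoms - 1)

p_next = torch.softmax(torch.randn(atoms), dim=0)   # target dist. of the chosen next action
R, gamma, n, nonterminal = 1.0, 0.99, 3, 1.0

Tz = (R + nonterminal * (gamma ** n) * support).clamp(V_min, V_max)  # Bellman-shifted atoms
b = (Tz - V_min) / delta_z                       # fractional index of each shifted atom
l, u = b.floor().long(), b.ceil().long()
l[(u > 0) & (l == u)] -= 1                       # keep mass when b lands exactly on an atom
u[(l < atoms - 1) & (l == u)] += 1

m = torch.zeros(atoms)
m.index_add_(0, l, p_next * (u.float() - b))     # share of mass to the lower atom
m.index_add_(0, u, p_next * (b - l.float()))     # share of mass to the upper atom
assert torch.allclose(m.sum(), torch.tensor(1.0), atol=1e-5)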
Example #15
0
class Agent:
    def __init__(self):
        self.mode = "train"
        with open("config.yaml") as reader:
            self.config = yaml.safe_load(reader)
        print(self.config)
        self.load_config()

        self.online_net = DQN(config=self.config,
                              word_vocab=self.word_vocab,
                              char_vocab=self.char_vocab,
                              answer_type=self.answer_type)
        self.target_net = DQN(config=self.config,
                              word_vocab=self.word_vocab,
                              char_vocab=self.char_vocab,
                              answer_type=self.answer_type)
        self.online_net.train()
        self.target_net.train()
        self.update_target_net()
        for param in self.target_net.parameters():
            param.requires_grad = False

        if self.use_cuda:
            self.online_net.cuda()
            self.target_net.cuda()

        self.naozi = ObservationPool(capacity=self.naozi_capacity)
        # optimizer
        self.optimizer = torch.optim.Adam(
            self.online_net.parameters(),
            lr=self.config['training']['optimizer']['learning_rate'])
        self.clip_grad_norm = self.config['training']['optimizer'][
            'clip_grad_norm']

    def load_config(self):
        # word vocab
        with open("vocabularies/word_vocab.txt") as f:
            self.word_vocab = f.read().split("\n")
        self.word2id = {}
        for i, w in enumerate(self.word_vocab):
            self.word2id[w] = i
        # char vocab
        with open("vocabularies/char_vocab.txt") as f:
            self.char_vocab = f.read().split("\n")
        self.char2id = {}
        for i, w in enumerate(self.char_vocab):
            self.char2id[w] = i

        self.EOS_id = self.word2id["</s>"]
        self.train_data_size = self.config['general']['train_data_size']
        self.question_type = self.config['general']['question_type']
        self.random_map = self.config['general']['random_map']
        self.testset_path = self.config['general']['testset_path']
        self.naozi_capacity = self.config['general']['naozi_capacity']
        self.eval_folder = pjoin(
            self.testset_path, self.question_type,
            ("random_map" if self.random_map else "fixed_map"))
        self.eval_data_path = pjoin(self.testset_path, "data.json")

        self.batch_size = self.config['training']['batch_size']
        self.max_nb_steps_per_episode = self.config['training'][
            'max_nb_steps_per_episode']
        self.max_episode = self.config['training']['max_episode']
        self.target_net_update_frequency = self.config['training'][
            'target_net_update_frequency']
        self.learn_start_from_this_episode = self.config['training'][
            'learn_start_from_this_episode']

        self.run_eval = self.config['evaluate']['run_eval']
        self.eval_batch_size = self.config['evaluate']['batch_size']
        self.eval_max_nb_steps_per_episode = self.config['evaluate'][
            'max_nb_steps_per_episode']

        # Set the random seed manually for reproducibility.
        self.random_seed = self.config['general']['random_seed']
        np.random.seed(self.random_seed)
        torch.manual_seed(self.random_seed)
        if torch.cuda.is_available():
            if not self.config['general']['use_cuda']:
                print(
                    "WARNING: CUDA device detected but 'use_cuda: false' found in config.yaml"
                )
                self.use_cuda = False
            else:
                torch.backends.cudnn.deterministic = True
                torch.cuda.manual_seed(self.random_seed)
                self.use_cuda = True
        else:
            self.use_cuda = False

        if self.question_type == "location":
            self.answer_type = "pointing"
        elif self.question_type in ["attribute", "existence"]:
            self.answer_type = "2 way"
        else:
            raise NotImplementedError

        self.save_checkpoint = self.config['checkpoint']['save_checkpoint']
        self.experiment_tag = self.config['checkpoint']['experiment_tag']
        self.save_frequency = self.config['checkpoint']['save_frequency']
        self.load_pretrained = self.config['checkpoint']['load_pretrained']
        self.load_from_tag = self.config['checkpoint']['load_from_tag']

        self.qa_loss_lambda = self.config['training']['qa_loss_lambda']
        self.interaction_loss_lambda = self.config['training'][
            'interaction_loss_lambda']

        # replay buffer and updates
        self.discount_gamma = self.config['replay']['discount_gamma']
        self.replay_batch_size = self.config['replay']['replay_batch_size']
        self.command_generation_replay_memory = command_generation_memory.PrioritizedReplayMemory(
            self.config['replay']['replay_memory_capacity'],
            priority_fraction=self.config['replay']
            ['replay_memory_priority_fraction'],
            discount_gamma=self.discount_gamma)
        self.qa_replay_memory = qa_memory.PrioritizedReplayMemory(
            self.config['replay']['replay_memory_capacity'],
            priority_fraction=0.0)
        self.update_per_k_game_steps = self.config['replay'][
            'update_per_k_game_steps']
        self.multi_step = self.config['replay']['multi_step']

        # distributional RL
        self.use_distributional = self.config['distributional']['enable']
        self.atoms = self.config['distributional']['atoms']
        self.v_min = self.config['distributional']['v_min']
        self.v_max = self.config['distributional']['v_max']
        self.support = torch.linspace(self.v_min, self.v_max,
                                      self.atoms)  # Support (range) of z
        if self.use_cuda:
            self.support = self.support.cuda()
        self.delta_z = (self.v_max - self.v_min) / (self.atoms - 1)

        # dueling networks
        self.dueling_networks = self.config['dueling_networks']

        # double dqn
        self.double_dqn = self.config['double_dqn']

        # counting reward
        self.revisit_counting_lambda_anneal_episodes = self.config[
            'episodic_counting_bonus'][
                'revisit_counting_lambda_anneal_episodes']
        self.revisit_counting_lambda_anneal_from = self.config[
            'episodic_counting_bonus']['revisit_counting_lambda_anneal_from']
        self.revisit_counting_lambda_anneal_to = self.config[
            'episodic_counting_bonus']['revisit_counting_lambda_anneal_to']
        self.revisit_counting_lambda = self.revisit_counting_lambda_anneal_from

        # valid command bonus
        self.valid_command_bonus_lambda = self.config[
            'valid_command_bonus_lambda']

        # epsilon greedy
        self.epsilon_anneal_episodes = self.config['epsilon_greedy'][
            'epsilon_anneal_episodes']
        self.epsilon_anneal_from = self.config['epsilon_greedy'][
            'epsilon_anneal_from']
        self.epsilon_anneal_to = self.config['epsilon_greedy'][
            'epsilon_anneal_to']
        self.epsilon = self.epsilon_anneal_from
        self.noisy_net = self.config['epsilon_greedy']['noisy_net']
        if self.noisy_net:
            # disable epsilon greedy
            self.epsilon_anneal_episodes = -1
            self.epsilon = 0.0

        self.nlp = spacy.load('en', disable=['ner', 'parser', 'tagger'])
        self.single_word_verbs = set(["inventory", "look", "wait"])
        self.two_word_verbs = set(["go"])
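For reference, load_config above expects a nested config.yaml; only the key
names below are taken from the code, while every value is a placeholder
assumption, sketched here as a Python dict:

example_config = {
    'general': {'train_data_size': 1, 'question_type': 'location',
                'random_map': False, 'testset_path': 'test_set',
                'naozi_capacity': 1, 'random_seed': 42, 'use_cuda': False},
    'training': {'batch_size': 20, 'max_nb_steps_per_episode': 80,
                 'max_episode': 100000, 'target_net_update_frequency': 1000,
                 'learn_start_from_this_episode': 100,
                 'qa_loss_lambda': 1.0, 'interaction_loss_lambda': 1.0,
                 'optimizer': {'learning_rate': 0.00025, 'clip_grad_norm': 5}},
    'evaluate': {'run_eval': True, 'batch_size': 20,
                 'max_nb_steps_per_episode': 80},
    'checkpoint': {'save_checkpoint': True, 'experiment_tag': 'demo',
                   'save_frequency': 1000, 'load_pretrained': False,
                   'load_from_tag': 'demo'},
    'replay': {'discount_gamma': 0.9, 'replay_batch_size': 64,
               'replay_memory_capacity': 500000,
               'replay_memory_priority_fraction': 0.5,
               'update_per_k_game_steps': 20, 'multi_step': 1},
    'distributional': {'enable': False, 'atoms': 51, 'v_min': -10.0, 'v_max': 10.0},
    'dueling_networks': False,
    'double_dqn': True,
    'episodic_counting_bonus': {'revisit_counting_lambda_anneal_episodes': -1,
                                'revisit_counting_lambda_anneal_from': 0.1,
                                'revisit_counting_lambda_anneal_to': 0.0},
    'valid_command_bonus_lambda': 0.0,
    'epsilon_greedy': {'epsilon_anneal_episodes': 3000,
                       'epsilon_anneal_from': 1.0, 'epsilon_anneal_to': 0.1,
                       'noisy_net': False},
}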

    def train(self):
        """
        Tell the agent that it is in the training phase.
        """
        self.mode = "train"
        self.online_net.train()

    def eval(self):
        """
        Tell the agent that it is in the evaluation phase.
        """
        self.mode = "eval"
        self.online_net.eval()

    def update_target_net(self):
        self.target_net.load_state_dict(self.online_net.state_dict())

    def reset_noise(self):
        if self.noisy_net:
            # Resets noisy weights in all linear layers (of online net only)
            self.online_net.reset_noise()

    def zero_noise(self):
        if self.noisy_net:
            self.online_net.zero_noise()
            self.target_net.zero_noise()

    def load_pretrained_model(self, load_from):
        """
        Load pretrained checkpoint from file.

        Arguments:
            load_from: File name of the pretrained model checkpoint.
        """
        print("loading model from %s\n" % (load_from))
        try:
            if self.use_cuda:
                state_dict = torch.load(load_from)
            else:
                state_dict = torch.load(load_from, map_location='cpu')
            self.online_net.load_state_dict(state_dict)
        except Exception:
            print("Failed to load checkpoint...")

    def save_model_to_path(self, save_to):
        torch.save(self.online_net.state_dict(), save_to)
        print("Saved checkpoint to %s..." % (save_to))

    def init(self, obs, infos):
        """
        Prepare the agent for the upcoming games.

        Arguments:
            obs: Previous command's feedback for each game.
            infos: Additional information for each game.
        """
        # reset agent, get vocabulary masks for verbs / adjectives / nouns
        batch_size = len(obs)
        self.reset_binarized_counter(batch_size)
        self.not_finished_yet = np.ones((batch_size, ), dtype="float32")
        self.prev_actions = [["" for _ in range(batch_size)]]
        self.prev_step_is_still_interacting = np.ones(
            (batch_size, ), dtype="float32"
        )  # starts at 1.0 and switches to 0.0 once the previous action is "wait"
        self.naozi.reset(batch_size=batch_size)

    def get_agent_inputs(self, string_list):
        sentence_token_list = [item.split() for item in string_list]
        sentence_id_list = [
            _words_to_ids(tokens, self.word2id)
            for tokens in sentence_token_list
        ]
        input_sentence_char = list_of_token_list_to_char_input(
            sentence_token_list, self.char2id)
        input_sentence = pad_sequences(
            sentence_id_list, maxlen=max_len(sentence_id_list)).astype('int32')
        input_sentence = to_pt(input_sentence, self.use_cuda)
        input_sentence_char = to_pt(input_sentence_char, self.use_cuda)
        return input_sentence, input_sentence_char, sentence_id_list

    def get_game_info_at_certain_step(self, obs, infos):
        """
        Get all needed info from game engine for training.
        Arguments:
            obs: Previous command's feedback for each game.
            infos: Additional information for each game.
        """
        batch_size = len(obs)
        feedback_strings = [preproc(item, tokenizer=self.nlp) for item in obs]
        description_strings = [
            preproc(item, tokenizer=self.nlp) for item in infos["description"]
        ]
        observation_strings = [
            d + " <|> " + fb if fb != d else d + " <|> hello"
            for fb, d in zip(feedback_strings, description_strings)
        ]

        inventory_strings = [
            preproc(item, tokenizer=self.nlp) for item in infos["inventory"]
        ]
        local_word_list = [
            obs.split() + inv.split()
            for obs, inv in zip(observation_strings, inventory_strings)
        ]

        directions = ["east", "west", "north", "south"]
        if self.question_type in ["location", "existence"]:
            # the agent observes the env but does not change it
            possible_verbs = [["go", "inventory", "wait", "open", "examine"]
                              for _ in range(batch_size)]
        else:
            possible_verbs = [
                list(set(item) - set(["", "look"])) for item in infos["verbs"]
            ]

        possible_adjs, possible_nouns = [], []
        for i in range(batch_size):
            object_nouns = [
                item.split()[-1] for item in infos["object_nouns"][i]
            ]
            object_adjs = [
                w for item in infos["object_adjs"][i] for w in item.split()
            ]
            possible_nouns.append(
                list(set(object_nouns) & set(local_word_list[i]) - set([""])) +
                directions)
            possible_adjs.append(
                list(set(object_adjs) & set(local_word_list[i]) - set([""])) +
                ["</s>"])

        return observation_strings, [
            possible_verbs, possible_adjs, possible_nouns
        ]

    def get_state_strings(self, infos):
        description_strings = infos["description"]
        inventory_strings = infos["inventory"]
        observation_strings = [
            _d + _i for (_d, _i) in zip(description_strings, inventory_strings)
        ]
        return observation_strings

    def get_local_word_masks(self, possible_words):
        possible_verbs, possible_adjs, possible_nouns = possible_words
        batch_size = len(possible_verbs)

        verb_mask = np.zeros((batch_size, len(self.word_vocab)),
                             dtype="float32")
        noun_mask = np.zeros((batch_size, len(self.word_vocab)),
                             dtype="float32")
        adj_mask = np.zeros((batch_size, len(self.word_vocab)),
                            dtype="float32")
        for i in range(batch_size):
            for w in possible_verbs[i]:
                if w in self.word2id:
                    verb_mask[i][self.word2id[w]] = 1.0
            for w in possible_adjs[i]:
                if w in self.word2id:
                    adj_mask[i][self.word2id[w]] = 1.0
            for w in possible_nouns[i]:
                if w in self.word2id:
                    noun_mask[i][self.word2id[w]] = 1.0
        adj_mask[:, self.EOS_id] = 1.0

        return [verb_mask, adj_mask, noun_mask]

    def get_match_representations(self,
                                  input_observation,
                                  input_observation_char,
                                  input_quest,
                                  input_quest_char,
                                  use_model="online"):
        model = self.online_net if use_model == "online" else self.target_net
        description_representation_sequence, description_mask = model.representation_generator(
            input_observation, input_observation_char)
        quest_representation_sequence, quest_mask = model.representation_generator(
            input_quest, input_quest_char)

        match_representation_sequence = model.get_match_representations(
            description_representation_sequence, description_mask,
            quest_representation_sequence, quest_mask)
        match_representation_sequence = match_representation_sequence * description_mask.unsqueeze(
            -1)
        return match_representation_sequence

    def get_ranks(self,
                  input_observation,
                  input_observation_char,
                  input_quest,
                  input_quest_char,
                  word_masks,
                  use_model="online"):
        """
        Given input observation and question tensors, to get Q values of words.
        """
        model = self.online_net if use_model == "online" else self.target_net
        match_representation_sequence = self.get_match_representations(
            input_observation,
            input_observation_char,
            input_quest,
            input_quest_char,
            use_model=use_model)
        action_ranks = model.action_scorer(match_representation_sequence,
                                           word_masks)  # list of 3 tensors
        return action_ranks

    def choose_maxQ_command(self, action_ranks, word_mask=None):
        """
        Generate a command by maximum q values, for epsilon greedy.
        """
        if self.use_distributional:
            action_ranks = [
                (item * self.support).sum(2) for item in action_ranks
            ]  # list of batch x n_vocab
        action_indices = []
        for i in range(len(action_ranks)):
            ar = action_ranks[i]
            ar = ar - torch.min(
                ar, -1, keepdim=True
            )[0] + 1e-2  # subtract the min value so that all values are positive
            if word_mask is not None:
                assert word_mask[i].size() == ar.size(), (
                    word_mask[i].size(), ar.size())
                ar = ar * word_mask[i]
            action_indices.append(torch.argmax(ar, -1))  # batch
        return action_indices
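Why the shift by the minimum in choose_maxQ_command: the word mask is 0/1 and
is applied by multiplication, so if all allowed words had non-positive scores,
a masked-out word (whose score is forced to 0) could win the argmax.
Subtracting the minimum and adding a small constant makes every score positive
first. A small sketch with made-up tensors:

import torch

q = torch.tensor([[-3.0, -1.0, 2.0, -0.5]])    # scores for 4 words in the vocab
mask = torch.tensor([[0.0, 1.0, 0.0, 1.0]])    # only words 1 and 3 are allowed

naive = torch.argmax(q * mask, -1)                    # picks a masked-out word (0 or 2)
shifted = q - q.min(-1, keepdim=True)[0] + 1e-2       # all values now strictly positive
masked = torch.argmax(shifted * mask, -1)             # picks 3, the best allowed word
print(naive.item(), masked.item())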

    def choose_random_command(self,
                              batch_size,
                              action_space_size,
                              possible_words=None):
        """
        Generate a command randomly, for epsilon greedy.
        """
        action_indices = []
        for i in range(3):
            if possible_words is None:
                indices = np.random.choice(action_space_size, batch_size)
            else:
                indices = []
                for j in range(batch_size):
                    mask_ids = []
                    for w in possible_words[i][j]:
                        if w in self.word2id:
                            mask_ids.append(self.word2id[w])
                    indices.append(np.random.choice(mask_ids))
                indices = np.array(indices)
            action_indices.append(to_pt(indices, self.use_cuda))  # batch
        return action_indices

    def get_chosen_strings(self, chosen_indices):
        """
        Turns list of word indices into actual command strings.
        chosen_indices: Word indices chosen by model.
        """
        chosen_indices_np = [to_np(item) for item in chosen_indices]
        res_str = []
        batch_size = chosen_indices_np[0].shape[0]
        for i in range(batch_size):
            verb, adj, noun = chosen_indices_np[0][i], chosen_indices_np[1][
                i], chosen_indices_np[2][i]
            res_str.append(self.word_ids_to_commands(verb, adj, noun))
        return res_str

    def word_ids_to_commands(self, verb, adj, noun):
        """
        Turn the 3 indices into actual command strings.

        Arguments:
            verb: index of the chosen verb in the vocabulary
            adj: index of the chosen adjective in the vocabulary
            noun: index of the chosen noun in the vocabulary
        """
        # turns 3 indices into actual command strings
        if self.word_vocab[verb] in self.single_word_verbs:
            return self.word_vocab[verb]
        if self.word_vocab[verb] in self.two_word_verbs:
            return " ".join([self.word_vocab[verb], self.word_vocab[noun]])
        if adj == self.EOS_id:
            return " ".join([self.word_vocab[verb], self.word_vocab[noun]])
        else:
            return " ".join([
                self.word_vocab[verb], self.word_vocab[adj],
                self.word_vocab[noun]
            ])
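A toy walk-through of the command assembly above, under an assumed miniature
vocabulary (the words and ids below are made up for illustration):

word_vocab = ["</s>", "go", "examine", "wait", "red", "apple", "north"]
word2id = {w: i for i, w in enumerate(word_vocab)}
single_word_verbs, two_word_verbs = {"inventory", "look", "wait"}, {"go"}
EOS_id = word2id["</s>"]

def word_ids_to_commands(verb, adj, noun):
    # Same rules as the method above: single-word verbs stand alone, "go"
    # pairs with the noun, and the adjective is dropped when it is </s>.
    if word_vocab[verb] in single_word_verbs:
        return word_vocab[verb]
    if word_vocab[verb] in two_word_verbs or adj == EOS_id:
        return " ".join([word_vocab[verb], word_vocab[noun]])
    return " ".join([word_vocab[verb], word_vocab[adj], word_vocab[noun]])

print(word_ids_to_commands(word2id["wait"], 0, 0))                                  # "wait"
print(word_ids_to_commands(word2id["go"], 0, word2id["north"]))                     # "go north"
print(word_ids_to_commands(word2id["examine"], word2id["red"], word2id["apple"]))   # "examine red apple"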

    def act_random(self, obs, infos, input_observation, input_observation_char,
                   input_quest, input_quest_char, possible_words):
        with torch.no_grad():
            batch_size = len(obs)
            word_indices_random = self.choose_random_command(
                batch_size, len(self.word_vocab), possible_words)
            chosen_indices = word_indices_random
            chosen_strings = self.get_chosen_strings(chosen_indices)

            for i in range(batch_size):
                if chosen_strings[i] == "wait":
                    self.not_finished_yet[i] = 0.0

            # info for replay memory
            for i in range(batch_size):
                if self.prev_actions[-1][i] == "wait":
                    self.prev_step_is_still_interacting[i] = 0.0
            # whether the previous step was still interacting; stored because DQN needs one extra step of computation
            replay_info = [
                chosen_indices,
                to_pt(self.prev_step_is_still_interacting, self.use_cuda,
                      "float")
            ]

            # cache new info in current game step into caches
            self.prev_actions.append(chosen_strings)
            return chosen_strings, replay_info

    def act_greedy(self, obs, infos, input_observation, input_observation_char,
                   input_quest, input_quest_char, possible_words):
        """
        Acts upon the current list of observations.
        One text command must be returned for each observation.
        """
        with torch.no_grad():
            batch_size = len(obs)
            local_word_masks_np = self.get_local_word_masks(possible_words)
            local_word_masks = [
                to_pt(item, self.use_cuda, type="float")
                for item in local_word_masks_np
            ]

            # generate commands for one game step, epsilon greedy is applied, i.e.,
            # there is epsilon of chance to generate random commands
            action_ranks = self.get_ranks(
                input_observation,
                input_observation_char,
                input_quest,
                input_quest_char,
                local_word_masks,
                use_model="online")  # list of batch x vocab
            word_indices_maxq = self.choose_maxQ_command(
                action_ranks, local_word_masks)
            chosen_indices = word_indices_maxq
            chosen_strings = self.get_chosen_strings(chosen_indices)

            for i in range(batch_size):
                if chosen_strings[i] == "wait":
                    self.not_finished_yet[i] = 0.0

            # info for replay memory
            for i in range(batch_size):
                if self.prev_actions[-1][i] == "wait":
                    self.prev_step_is_still_interacting[i] = 0.0
            # whether the previous step was still interacting; stored because DQN needs one extra step of computation
            replay_info = [
                chosen_indices,
                to_pt(self.prev_step_is_still_interacting, self.use_cuda,
                      "float")
            ]

            # cache new info in current game step into caches
            self.prev_actions.append(chosen_strings)
            return chosen_strings, replay_info

    def act(self,
            obs,
            infos,
            input_observation,
            input_observation_char,
            input_quest,
            input_quest_char,
            possible_words,
            random=False):
        """
        Acts upon the current list of observations.
        One text command must be returned for each observation.
        """
        with torch.no_grad():
            if self.mode == "eval":
                return self.act_greedy(obs, infos, input_observation,
                                       input_observation_char, input_quest,
                                       input_quest_char, possible_words)
            if random:
                return self.act_random(obs, infos, input_observation,
                                       input_observation_char, input_quest,
                                       input_quest_char, possible_words)
            batch_size = len(obs)

            local_word_masks_np = self.get_local_word_masks(possible_words)
            local_word_masks = [
                to_pt(item, self.use_cuda, type="float")
                for item in local_word_masks_np
            ]

            # generate commands for one game step, epsilon greedy is applied, i.e.,
            # there is epsilon of chance to generate random commands
            action_ranks = self.get_ranks(
                input_observation,
                input_observation_char,
                input_quest,
                input_quest_char,
                local_word_masks,
                use_model="online")  # list of batch x vocab
            word_indices_maxq = self.choose_maxQ_command(
                action_ranks, local_word_masks)
            word_indices_random = self.choose_random_command(
                batch_size, len(self.word_vocab), possible_words)

            # random number for epsilon greedy
            rand_num = np.random.uniform(low=0.0,
                                         high=1.0,
                                         size=(batch_size, ))
            less_than_epsilon = (rand_num < self.epsilon).astype(
                "float32")  # batch
            greater_than_epsilon = 1.0 - less_than_epsilon
            less_than_epsilon = to_pt(less_than_epsilon,
                                      self.use_cuda,
                                      type='long')
            greater_than_epsilon = to_pt(greater_than_epsilon,
                                         self.use_cuda,
                                         type='long')
            chosen_indices = [
                less_than_epsilon * idx_random +
                greater_than_epsilon * idx_maxq
                for idx_random, idx_maxq in zip(word_indices_random,
                                                word_indices_maxq)
            ]
            chosen_strings = self.get_chosen_strings(chosen_indices)

            for i in range(batch_size):
                if chosen_strings[i] == "wait":
                    self.not_finished_yet[i] = 0.0

            # info for replay memory
            for i in range(batch_size):
                if self.prev_actions[-1][i] == "wait":
                    self.prev_step_is_still_interacting[i] = 0.0
            # whether the previous step was still interacting; stored because DQN needs one extra step of computation
            replay_info = [
                chosen_indices,
                to_pt(self.prev_step_is_still_interacting, self.use_cuda,
                      "float")
            ]

            # cache new info in current game step into caches
            self.prev_actions.append(chosen_strings)
            return chosen_strings, replay_info

    def get_dqn_loss(self):
        """
        Update neural model in agent. In this example we follow algorithm
        of updating model in dqn with replay memory.
        """
        if len(self.command_generation_replay_memory) < self.replay_batch_size:
            return None

        data = self.command_generation_replay_memory.get_batch(
            self.replay_batch_size, self.multi_step)
        if data is None:
            return None

        obs_list, quest_list, possible_words_list, chosen_indices, rewards, next_obs_list, next_possible_words_list, actual_n_list = data
        batch_size = len(actual_n_list)

        input_quest, input_quest_char, _ = self.get_agent_inputs(quest_list)
        input_observation, input_observation_char, _ = self.get_agent_inputs(
            obs_list)
        next_input_observation, next_input_observation_char, _ = self.get_agent_inputs(
            next_obs_list)

        possible_words, next_possible_words = [], []
        for i in range(3):
            possible_words.append([item[i] for item in possible_words_list])
            next_possible_words.append(
                [item[i] for item in next_possible_words_list])

        local_word_masks = [
            to_pt(item, self.use_cuda, type="float")
            for item in self.get_local_word_masks(possible_words)
        ]
        next_local_word_masks = [
            to_pt(item, self.use_cuda, type="float")
            for item in self.get_local_word_masks(next_possible_words)
        ]

        action_ranks = self.get_ranks(
            input_observation,
            input_observation_char,
            input_quest,
            input_quest_char,
            local_word_masks,
            use_model="online"
        )  # list of batch x vocab or list of batch x vocab x atoms
        # ps_a
        word_qvalues = [
            ez_gather_dim_1(w_rank, idx.unsqueeze(-1)).squeeze(1)
            for w_rank, idx in zip(action_ranks, chosen_indices)
        ]  # list of batch or list of batch x atoms
        q_value = torch.mean(torch.stack(word_qvalues, -1),
                             -1)  # batch or batch x atoms
        # log_ps_a
        log_q_value = torch.log(q_value)  # batch or batch x atoms

        with torch.no_grad():
            if self.noisy_net:
                self.target_net.reset_noise()  # Sample new target net noise
            if self.double_dqn:
                # pns Probabilities p(s_t+n, ·; θonline)
                next_action_ranks = self.get_ranks(next_input_observation,
                                                   next_input_observation_char,
                                                   input_quest,
                                                   input_quest_char,
                                                   next_local_word_masks,
                                                   use_model="online")
                # list of batch x vocab or list of batch x vocab x atoms
                # Perform argmax action selection using online network: argmax_a[(z, p(s_t+n, a; θonline))]
                next_word_indices = self.choose_maxQ_command(
                    next_action_ranks,
                    next_local_word_masks)  # list of batch x 1
                # pns # Probabilities p(s_t+n, ·; θtarget)
                next_action_ranks = self.get_ranks(
                    next_input_observation,
                    next_input_observation_char,
                    input_quest,
                    input_quest_char,
                    next_local_word_masks,
                    use_model="target"
                )  # batch x vocab or list of batch x vocab x atoms
                # pns_a # Double-Q probabilities p(s_t+n, argmax_a[(z, p(s_t+n, a; θonline))]; θtarget)
                next_word_qvalues = [
                    ez_gather_dim_1(w_rank, idx.unsqueeze(-1)).squeeze(1) for
                    w_rank, idx in zip(next_action_ranks, next_word_indices)
                ]  # list of batch or list of batch x atoms
            else:
                # pns Probabilities p(s_t+n, ·; θonline)
                next_action_ranks = self.get_ranks(next_input_observation,
                                                   next_input_observation_char,
                                                   input_quest,
                                                   input_quest_char,
                                                   next_local_word_masks,
                                                   use_model="target")
                # list of batch x vocab or list of batch x vocab x atoms
                next_word_indices = self.choose_maxQ_command(
                    next_action_ranks,
                    next_local_word_masks)  # list of batch x 1
                next_word_qvalues = [
                    ez_gather_dim_1(w_rank, idx.unsqueeze(-1)).squeeze(1) for
                    w_rank, idx in zip(next_action_ranks, next_word_indices)
                ]  # list of batch or list of batch x atoms

            next_q_value = torch.mean(torch.stack(next_word_qvalues, -1),
                                      -1)  # batch or batch x atoms
            # Compute Tz (Bellman operator T applied to z)
            discount = to_pt((np.ones_like(actual_n_list) *
                              self.discount_gamma)**actual_n_list,
                             self.use_cuda,
                             type="float")
        if not self.use_distributional:
            rewards = rewards + next_q_value * discount  # batch
            loss = F.smooth_l1_loss(q_value, rewards)
            return loss

        with torch.no_grad():
            Tz = rewards.unsqueeze(
                -1) + discount.unsqueeze(-1) * self.support.unsqueeze(
                    0)  # Tz = R^n + (γ^n)z (accounting for terminal states)
            Tz = Tz.clamp(min=self.v_min,
                          max=self.v_max)  # Clamp between supported values
            # Compute L2 projection of Tz onto fixed support z
            b = (Tz - self.v_min) / self.delta_z  # b = (Tz - Vmin) / Δz
            l, u = b.floor().to(torch.int64), b.ceil().to(torch.int64)
            # Fix disappearing probability mass when l = b = u (b is int)
            l[(u > 0) * (l == u)] -= 1
            u[(l < (self.atoms - 1)) * (l == u)] += 1

            # Distribute probability of Tz
            m = torch.zeros(batch_size, self.atoms).float()
            if self.use_cuda:
                m = m.cuda()
            offset = torch.linspace(0, ((batch_size - 1) * self.atoms),
                                    batch_size).unsqueeze(1).expand(
                                        batch_size, self.atoms).long()
            if self.use_cuda:
                offset = offset.cuda()
            m.view(-1).index_add_(
                0, (l + offset).view(-1),
                (next_q_value *
                 (u.float() - b)).view(-1))  # m_l = m_l + p(s_t+n, a*)(u - b)
            m.view(-1).index_add_(
                0, (u + offset).view(-1),
                (next_q_value *
                 (b - l.float())).view(-1))  # m_u = m_u + p(s_t+n, a*)(b - l)

        loss = -torch.sum(
            m * log_q_value,
            1)  # Cross-entropy loss (minimises DKL(m||p(s_t, a_t)))
        loss = torch.mean(loss)
        return loss
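In the non-distributional branch above the target is a plain n-step return,
reward + gamma^actual_n * Q_target(s_{t+n}, a*), where actual_n can differ per
sample because an episode may end before n steps have elapsed. A tiny numeric
sketch of that target (all numbers are illustrative):

import torch

discount_gamma = 0.9
rewards = torch.tensor([1.0, 0.0])           # already-summed n-step returns R^n
next_q_value = torch.tensor([2.0, 3.0])      # value of s_{t+n} from the target net
actual_n = torch.tensor([3.0, 1.0])          # per-sample n (episode may end early)

discount = discount_gamma ** actual_n        # gamma^n for each sample
targets = rewards + next_q_value * discount  # [1 + 0.9**3 * 2, 0 + 0.9 * 3] = [2.458, 2.7]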

    def update_interaction(self):
        # update neural model by replaying snapshots in replay memory
        interaction_loss = self.get_dqn_loss()
        if interaction_loss is None:
            return None
        loss = interaction_loss * self.interaction_loss_lambda
        # Backpropagate
        self.online_net.zero_grad()
        self.optimizer.zero_grad()
        loss.backward()
        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm_(self.online_net.parameters(),
                                       self.clip_grad_norm)
        self.optimizer.step()  # apply gradients
        return to_np(torch.mean(interaction_loss))

    def answer_question(self,
                        input_observation,
                        input_observation_char,
                        observation_id_list,
                        input_quest,
                        input_quest_char,
                        use_model="online"):
        # first pad answerer_input, and get the mask
        model = self.online_net if use_model == "online" else self.target_net
        batch_size = len(observation_id_list)
        max_length = input_observation.size(1)
        mask = compute_mask(input_observation)  # batch x obs_len

        # noun mask for location question
        if self.question_type in ["location"]:
            location_mask = []
            for i in range(batch_size):
                m = [1 for item in observation_id_list[i]]
                location_mask.append(m)
            location_mask = pad_sequences(location_mask,
                                          maxlen=max_length,
                                          dtype="float32")
            location_mask = to_pt(location_mask,
                                  enable_cuda=self.use_cuda,
                                  type='float')
            assert mask.size() == location_mask.size()
            mask = mask * location_mask

        match_representation_sequence = self.get_match_representations(
            input_observation,
            input_observation_char,
            input_quest,
            input_quest_char,
            use_model=use_model)
        pred = model.answer_question(match_representation_sequence,
                                     mask)  # batch x vocab or batch x 2

        # attention sum:
        # a word can appear multiple times in the observation, so its scores
        # are merged (summed) before doing further computations.
        # However, if the answer type is not "pointing", we instead use a
        # pre-defined mapping from 0/1 to their positions in the vocab.
        if self.answer_type == "2 way":
            observation_id_list = []
            max_length = 2
            for i in range(batch_size):
                observation_id_list.append(
                    [self.word2id["0"], self.word2id["1"]])

        observation = to_pt(
            pad_sequences(observation_id_list,
                          maxlen=max_length).astype('int32'), self.use_cuda)
        vocab_distribution = np.zeros(
            (batch_size, len(self.word_vocab)))  # batch x vocab
        vocab_distribution = to_pt(vocab_distribution,
                                   self.use_cuda,
                                   type='float')
        vocab_distribution = vocab_distribution.scatter_add_(
            1, observation, pred)  # batch x vocab
        non_zero_words = []
        for i in range(batch_size):
            non_zero_words.append(list(set(observation_id_list[i])))
        vocab_mask = torch.ne(vocab_distribution, 0).float()
        return vocab_distribution, non_zero_words, vocab_mask

    def point_maxq_position(self, vocab_distribution, mask):
        """
        Pick the answer position with the maximum Q value.

        Arguments:
            vocab_distribution: Q values for each word (mapped onto the vocab).
            mask: vocab mask.
        """
        vocab_distribution = vocab_distribution - torch.min(
            vocab_distribution, -1, keepdim=True
        )[0] + 1e-2  # subtract the min value so that all values are positive
        vocab_distribution = vocab_distribution * mask  # batch x vocab
        indices = torch.argmax(vocab_distribution, -1)  # batch
        return indices

    def answer_question_act_greedy(self, input_observation,
                                   input_observation_char, observation_id_list,
                                   input_quest, input_quest_char):

        with torch.no_grad():
            vocab_distribution, _, vocab_mask = self.answer_question(
                input_observation,
                input_observation_char,
                observation_id_list,
                input_quest,
                input_quest_char,
                use_model="online")  # batch x time
            positions_maxq = self.point_maxq_position(vocab_distribution,
                                                      vocab_mask)
            return positions_maxq  # batch

    def get_qa_loss(self):
        """
        Update neural model in agent. In this example we follow algorithm
        of updating model in dqn with replay memory.
        """
        if len(self.qa_replay_memory) < self.replay_batch_size:
            return None
        transitions = self.qa_replay_memory.sample(self.replay_batch_size)
        batch = qa_memory.qa_Transition(*zip(*transitions))

        observation_list = batch.observation_list
        quest_list = batch.quest_list
        answer_strings = batch.answer_strings
        answer_position = np.array(_words_to_ids(answer_strings, self.word2id))
        groundtruth = to_pt(answer_position, self.use_cuda)  # batch

        input_quest, input_quest_char, _ = self.get_agent_inputs(quest_list)
        input_observation, input_observation_char, observation_id_list = self.get_agent_inputs(
            observation_list)

        answer_distribution, _, _ = self.answer_question(
            input_observation,
            input_observation_char,
            observation_id_list,
            input_quest,
            input_quest_char,
            use_model="online")  # batch x vocab

        batch_loss = NegativeLogLoss(answer_distribution, groundtruth)  # batch
        return torch.mean(batch_loss)

    def update_qa(self):
        # update neural model by replaying snapshots in replay memory
        qa_loss = self.get_qa_loss()
        if qa_loss is None:
            return None
        loss = qa_loss * self.qa_loss_lambda
        # Backpropagate
        self.online_net.zero_grad()
        self.optimizer.zero_grad()
        loss.backward()
        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm_(self.online_net.parameters(),
                                       self.clip_grad_norm)
        self.optimizer.step()  # apply gradients
        return to_np(torch.mean(qa_loss))

    def finish_of_episode(self, episode_no, batch_size):
        # Update target network
        if (
                episode_no + batch_size
        ) % self.target_net_update_frequency <= episode_no % self.target_net_update_frequency:
            self.update_target_net()
        # decay lambdas
        if episode_no < self.learn_start_from_this_episode:
            return
        if episode_no < self.epsilon_anneal_episodes + self.learn_start_from_this_episode:
            self.epsilon -= (self.epsilon_anneal_from - self.epsilon_anneal_to
                             ) / float(self.epsilon_anneal_episodes)
            self.epsilon = max(self.epsilon, 0.0)
        if episode_no < self.revisit_counting_lambda_anneal_episodes + self.learn_start_from_this_episode:
            self.revisit_counting_lambda -= (
                self.revisit_counting_lambda_anneal_from -
                self.revisit_counting_lambda_anneal_to) / float(
                    self.revisit_counting_lambda_anneal_episodes)
            self.revisit_counting_lambda = max(self.revisit_counting_lambda, 0.0)

    def reset_binarized_counter(self, batch_size):
        self.binarized_counter_dict = [{} for _ in range(batch_size)]

    def get_binarized_count(self, observation_strings, update=True):
        count_rewards = []
        batch_size = len(observation_strings)
        for i in range(batch_size):
            key = observation_strings[i]
            if key not in self.binarized_counter_dict[i]:
                self.binarized_counter_dict[i][key] = 0.0
            if update:
                self.binarized_counter_dict[i][key] += 1.0
            r = self.binarized_counter_dict[i][key]
            r = float(r == 1.0)
            count_rewards.append(r)
        return count_rewards
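The episodic counting bonus above is binarized: an observation string earns a
reward of 1.0 only the first time it is seen in the episode and 0.0 on every
revisit. A minimal standalone sketch of the same rule (function and variable
names here are made up):

seen = {}  # per-game counter; reset at the start of every episode

def binarized_count_reward(obs_string, update=True):
    # 1.0 on the first visit to this exact observation string, else 0.0.
    seen.setdefault(obs_string, 0.0)
    if update:
        seen[obs_string] += 1.0
    return float(seen[obs_string] == 1.0)

print(binarized_count_reward("kitchen <|> you see an apple"))  # 1.0 (first visit)
print(binarized_count_reward("kitchen <|> you see an apple"))  # 0.0 (revisit)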
Example #16
0
            episode_reward += reward
            episode_steps += 1
            # Insert into replay buffer
            repbuf.add_sample((state, a, reward, next_state, done))
            state = next_state
            # Stats
            total_max_q += q_values.max()
            # Check if we need to train
            if step % STEPS_TO_TRAIN == 0:
                # Get a batch from replaybuffer
                batch = repbuf.get_batch(BATCH_SIZE)
                state_batch, action_batch, reward_batch, next_state_batch, done_batch = zip(*batch)
                pred_nextQ = sess.run(target_dqn.logits, feed_dict={target_dqn.input: next_state_batch})
                max_nextQ = np.max(pred_nextQ, axis=1)
                pred_values = np.array(reward_batch) + np.invert(done_batch).astype('float32') * GAMMA * max_nextQ
                cost = dqn.train(state_batch, action_batch, pred_values, sess)


    elif FLAGS.mode == 'test':
        # Testing mode
        epsilon = 0.05
        rewards = []
        for _ in trange(100):
            done = False
            obs = env.reset()
            reward = 0
            while not done:
                # Choose action
                if np.random.random() < epsilon:
                    # Choose random action
                    a = env.action_space.sample()
Example #17
0
def train():
    print('Waking up brain cells..')
    sess = tf.Session()

    game = Game(SCREEN_WIDTH, SCREEN_HEIGHT, show_game=False)
    brain = DQN(sess, SCREEN_WIDTH, SCREEN_HEIGHT, NUM_ACTION)

    rewards = tf.placeholder(tf.float32, [None])
    tf.summary.scalar('avg.reward/ep.', tf.reduce_mean(rewards))

    saver = tf.train.Saver()
    sess.run(tf.global_variables_initializer())

    writer = tf.summary.FileWriter('logs', sess.graph)
    summary_merged = tf.summary.merge_all()

    # Initialize the target network.
    brain.update_target_network()

    # Epsilon controls how often the next action is chosen by the DQN rather than at random.
    epsilon = 1.0
    # Frame counter
    time_step = 0
    total_reward_list = []

    # Start playing the game.
    for episode in range(MAX_EPISODE):
        terminal = False
        total_reward = 0

        # Reset the game and get the current state.
        # The state is the screen layout of size screen_width x screen_height.
        state = game.reset()
        brain.init_state(state)

        while not terminal:
            # If a random number falls below epsilon, pick a random action;
            # otherwise choose the action with the DQN.
            # Early in training the network has learned very little,
            # so almost every action is random at first; that fraction
            # shrinks over time until random actions are rarely used.
            if np.random.rand() < epsilon:
                action = random.randrange(NUM_ACTION)
            else:
                action = brain.get_action()

            # Start decreasing epsilon only after some time has passed,
            # because no learning has happened at the very beginning.
            if episode > OBSERVE:
                epsilon -= 1 / 1000

            # Step the game with the chosen action and receive the reward and terminal flag.
            state, reward, terminal = game.step(action)
            total_reward += reward

            # Store the current state in the Brain.
            # The remembered states are used for training and for deciding the action at the next state.
            brain.remember(state, action, reward, terminal)

            if time_step > OBSERVE and time_step % TRAIN_INTERVAL == 0:
                # Train the DQN.
                brain.train()

            if time_step % TARGET_UPDATE_INTERVAL == 0:
                # Update the target network.
                brain.update_target_network()

            time_step += 1

        print('Game count: %d  Score: %d' % (episode + 1, total_reward))

        total_reward_list.append(total_reward)

        if episode % 10 == 0:
            summary = sess.run(summary_merged, feed_dict={rewards: total_reward_list})
            writer.add_summary(summary, time_step)
            total_reward_list = []

        if episode % 100 == 0:
            saver.save(sess, 'model/dqn.ckpt', global_step=time_step)
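One caveat in the loop above: epsilon is decreased by 1/1000 every step once
episode > OBSERVE, with no lower bound, so it can eventually drift below zero.
A common variant clamps it at a small floor; a hedged sketch (the helper and
the 0.05 floor are assumptions, not part of the original code):

def decay_epsilon(epsilon, episode, observe=100, final_epsilon=0.05, step=1 / 1000):
    # Same 1/1000-per-step schedule as above, but never below final_epsilon.
    return max(final_epsilon, epsilon - step) if episode > observe else epsilon

print(decay_epsilon(1.0, episode=200))   # 0.999
print(decay_epsilon(0.05, episode=200))  # stays at the 0.05 floor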
Example #18
0
         
         state = next_state
         episode_reward += reward
         
         if done:
             state = env.reset()
             ep_no=ep_no+1
             print('Episode {} reward was {} and resulted in {} and epsilon {} '.format(ep_no,episode_reward,(reward==10),epsilon_by_frame(frame_idx)))
             all_rewards.append(episode_reward)
             episode_reward = 0
             
             mean_10ep_reward = round(np.mean(all_rewards[-11:-1]), 1)
             num_episodes = len(all_rewards)
             
         if len(replay_buffer) > learning_starts and frame_idx % train_freq == 0:
             loss = dqn.train(replay_buffer,batch_size,device=device)
             losses.append(loss.item())
             
         if frame_idx % target_network_update_freq == 0:
             dqn.update_target()
             
         if (frame_idx > batch_size and num_episodes > 10 and frame_idx % checkpoint_freq == 0):
             if saved_mean_reward is None or mean_10ep_reward > saved_mean_reward:
                 saved_mean_reward = mean_10ep_reward
         np.save('/home/sritee/Desktop/reward_data.npy',all_rewards)
 
 except:
     pdb.set_trace()
         
 finally:
     #pdb.set_trace()
Example #19
0
                # Do the soft target update
                paramlist = list()
                for i, param in enumerate(model.parameters()):
                    paramlist.append(param)

                for i, tparam in enumerate(target.parameters()):
                    tparam.data.copy_(tau * paramlist[i].data +
                                      (1 - tau) * tparam.data)

            # Handle epsilon-greedy exploration
            state = torch.from_numpy(state).float().unsqueeze(0)
            model.eval()
            with torch.no_grad():
                Qsa = model(state)

            model.train()

            # Handle exploration/exploitation
            rand = random.uniform(0, 1)
            if rand < epsilon:  # Explore
                action = random.choice(np.arange(total_actions))  #TODO: change
            else:  # Exploit
                action = np.argmax(Qsa.data.numpy())

            # Get the next state
            next_state, reward, done, info = env.step(action)
            score += reward
            mem = (state, action, reward, next_state, done)
            replaybuffer.append(mem)
            state = next_state
Example #20
0
class DQNAgent:
    def __init__(self, state_size, action_size, config=RLConfig()):
        self.seed = random.seed(config.seed)
        self.state_size = state_size
        self.action_size = action_size
        self.batch_size = config.batch_size
        self.batch_indices = torch.arange(config.batch_size).long().to(device)
        self.samples_before_learning = config.samples_before_learning
        self.learn_interval = config.learning_interval
        self.parameter_update_interval = config.parameter_update_interval
        self.per_epsilon = config.per_epsilon
        self.tau = config.tau
        self.gamma = config.gamma

        if config.useDuelingDQN:
            self.qnetwork_local = DuelingDQN(state_size, action_size,
                                             config.seed).to(device)
            self.qnetwork_target = DuelingDQN(state_size, action_size,
                                              config.seed).to(device)
        else:
            self.qnetwork_local = DQN(state_size, action_size,
                                      config.seed).to(device)
            self.qnetwork_target = DQN(state_size, action_size,
                                       config.seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(),
                                    lr=config.learning_rate)

        self.doubleDQN = config.useDoubleDQN
        self.usePER = config.usePER
        if self.usePER:
            self.memory = PrioritizedReplayBuffer(config.buffer_size,
                                                  config.per_alpha)
        else:
            self.memory = ReplayBuffer(config.buffer_size)

        self.t_step = 0

    def act(self, state, eps=0.):
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        if random.random() < eps:
            return random.choice(np.arange(self.action_size))
        else:
            return np.argmax(action_values.cpu().data.numpy())

    def step(self, state, action, reward, next_state, done, beta):
        self.memory.add(state, action, reward, next_state, done)
        self.t_step += 1
        if self.t_step % self.learn_interval == 0:
            if len(self.memory) > self.samples_before_learning:
                state = torch.from_numpy(state).float().unsqueeze(0).to(device)
                next_state = torch.from_numpy(next_state).float().unsqueeze(
                    0).to(device)
                target = self.qnetwork_local(state).data
                old_val = target[0][action]
                target_val = self.qnetwork_target(next_state).data
                if done:
                    target[0][action] = reward
                else:
                    target[0][
                        action] = reward + self.gamma * torch.max(target_val)
                if self.usePER:
                    states, actions, rewards, next_states, dones, weights, indices = self.memory.sample(
                        self.batch_size, beta)
                else:
                    indices = None
                    weights = None
                    states, actions, rewards, next_states, dones = self.memory.sample(
                        self.batch_size)

                self.learn(states, actions, rewards, next_states, dones,
                           indices, weights, self.gamma)

    def learn(self, states, actions, rewards, next_states, dones, indices,
              weights, gamma):
        states = torch.from_numpy(np.vstack(states)).float().to(device)
        actions = torch.from_numpy(np.vstack(actions)).long().to(device)
        rewards = torch.from_numpy(np.vstack(rewards)).float().to(device)
        next_states = torch.from_numpy(
            np.vstack(next_states)).float().to(device)
        dones = torch.from_numpy(np.vstack(dones.astype(
            np.uint8))).float().to(device)
        Q_targets_next = self.qnetwork_target(next_states).detach()

        if self.doubleDQN:
            # choose the best action from the local network
            next_actions = self.qnetwork_local(next_states).argmax(dim=-1)
            Q_targets_next = Q_targets_next[self.batch_indices, next_actions]
        else:
            Q_targets_next = Q_targets_next.max(1)[0]

        Q_targets = rewards + gamma * Q_targets_next.reshape(
            (self.batch_size, 1)) * (1 - dones)

        pred = self.qnetwork_local(states)
        Q_expected = pred.gather(1, actions)

        if self.usePER:
            errors = torch.abs(Q_expected -
                               Q_targets).data.numpy() + self.per_epsilon
            self.memory.update_priorities(indices, errors)

        self.optimizer.zero_grad()
        loss = F.mse_loss(Q_expected, Q_targets)
        loss.backward()
        self.optimizer.step()

        if self.t_step % self.parameter_update_interval == 0:
            self.soft_update(self.qnetwork_local, self.qnetwork_target,
                             self.tau)

    def soft_update(self, qnetwork_local, qnetwork_target, tau):
        for local_param, target_param in zip(qnetwork_local.parameters(),
                                             qnetwork_target.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
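
A minimal driver loop for the DQNAgent above, shown only as a sketch: the environment name, episode count and epsilon/beta schedules are illustrative and not taken from the original project.

import gym
import numpy as np

env = gym.make('CartPole-v1')                                   # illustrative environment
agent = DQNAgent(env.observation_space.shape[0], env.action_space.n)

eps, eps_min, eps_decay = 1.0, 0.01, 0.995                      # hypothetical epsilon schedule
beta = 0.4                                                      # PER importance-sampling exponent

for episode in range(500):
    state = env.reset()
    done, total_reward = False, 0.0
    while not done:
        action = agent.act(state, eps)
        next_state, reward, done, _ = env.step(action)
        agent.step(state, action, reward, next_state, done, beta)
        state = next_state
        total_reward += reward
    eps = max(eps_min, eps * eps_decay)                         # decay exploration after each episode
    print('episode %d reward %.1f' % (episode, total_reward))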
Example #21
0
class Agent():
    def __init__(self, args, env):
        self.action_space = env.action_space()
        self.atoms = args.atoms
        self.Vmin = args.V_min
        self.Vmax = args.V_max
        self.support = torch.linspace(args.V_min, args.V_max,
                                      args.atoms)  # Support (range) of z
        self.delta_z = (args.V_max - args.V_min) / (args.atoms - 1)
        self.batch_size = args.batch_size
        self.n = args.multi_step
        self.discount = args.discount

        self.online_net = DQN(args, self.action_space)
        if args.model and os.path.isfile(args.model):
            self.online_net.load_state_dict(
                torch.load(args.model, map_location='cpu'))
        self.online_net.train()

        self.target_net = DQN(args, self.action_space)
        self.update_target_net()
        self.target_net.train()
        for param in self.target_net.parameters():
            param.requires_grad = False

        self.optimiser = optim.Adam(self.online_net.parameters(),
                                    lr=args.lr,
                                    eps=args.adam_eps)
        if args.cuda:
            self.online_net.cuda()
            self.target_net.cuda()
            self.support = self.support.cuda()

    # Resets noisy weights in all linear layers (of online net only)
    def reset_noise(self):
        self.online_net.reset_noise()

    # Acts based on single state (no batch)
    def act(self, state):
        return (self.online_net(state.unsqueeze(0)).data *
                self.support).sum(2).max(1)[1][0]

    # Acts with an ε-greedy policy
    def act_e_greedy(self, state, epsilon=0.001):
        return random.randrange(
            self.action_space) if random.random() < epsilon else self.act(
                state)

    def learn(self, mem):
        # Sample transitions
        idxs, states, actions, returns, next_states, nonterminals, weights = mem.sample(
            self.batch_size)

        # Calculate current state probabilities
        self.online_net.reset_noise()  # Sample new noise for online network
        ps = self.online_net(states)  # Probabilities p(s_t, ·; θonline)
        ps_a = ps[range(self.batch_size), actions]  # p(s_t, a_t; θonline)

        # Calculate nth next state probabilities
        self.online_net.reset_noise()  # Sample new noise for action selection
        pns = self.online_net(
            next_states).data  # Probabilities p(s_t+n, ·; θonline)
        dns = self.support.expand_as(
            pns) * pns  # Distribution d_t+n = (z, p(s_t+n, ·; θonline))
        argmax_indices_ns = dns.sum(2).max(
            1
        )[1]  # Perform argmax action selection using online network: argmax_a[(z, p(s_t+n, a; θonline))]
        self.target_net.reset_noise()  # Sample new target net noise
        pns = self.target_net(
            next_states).data  # Probabilities p(s_t+n, ·; θtarget)
        pns_a = pns[range(
            self.batch_size
        ), argmax_indices_ns]  # Double-Q probabilities p(s_t+n, argmax_a[(z, p(s_t+n, a; θonline))]; θtarget)

        # Compute Tz (Bellman operator T applied to z)
        Tz = returns.unsqueeze(1) + nonterminals * (
            self.discount**self.n) * self.support.unsqueeze(
                0)  # Tz = R^n + (γ^n)z (accounting for terminal states)
        Tz = Tz.clamp(min=self.Vmin,
                      max=self.Vmax)  # Clamp between supported values
        # Compute L2 projection of Tz onto fixed support z
        b = (Tz - self.Vmin) / self.delta_z  # b = (Tz - Vmin) / Δz
        l, u = b.floor().long(), b.ceil().long()
        # Fix disappearing probability mass when l = b = u (b is int)
        l[(u > 0) * (l == u)] -= 1
        u[(l < (self.atoms - 1)) * (l == u)] += 1

        # Distribute probability of Tz
        m = states.data.new(self.batch_size, self.atoms).zero_()
        offset = torch.linspace(0, ((self.batch_size - 1) * self.atoms),
                                self.batch_size).unsqueeze(1).expand(
                                    self.batch_size,
                                    self.atoms).type_as(actions)
        m.view(-1).index_add_(
            0, (l + offset).view(-1),
            (pns_a *
             (u.float() - b)).view(-1))  # m_l = m_l + p(s_t+n, a*)(u - b)
        m.view(-1).index_add_(
            0, (u + offset).view(-1),
            (pns_a *
             (b - l.float())).view(-1))  # m_u = m_u + p(s_t+n, a*)(b - l)

        ps_a = ps_a.clamp(min=1e-3)  # Clamp for numerical stability in log
        loss = -torch.sum(
            Variable(m) * ps_a.log(),
            1)  # Cross-entropy loss (minimises DKL(m||p(s_t, a_t)))
        self.online_net.zero_grad()
        (weights * loss).mean().backward()  # Importance weight losses
        self.optimiser.step()

        mem.update_priorities(
            idxs, loss.data)  # Update priorities of sampled transitions

    def update_target_net(self):
        self.target_net.load_state_dict(self.online_net.state_dict())

    def save(self, path):
        torch.save(self.online_net.state_dict(),
                   os.path.join(path, 'model.pth'))

    # Evaluates Q-value based on single state (no batch)
    def evaluate_q(self, state):
        return (self.online_net(state.unsqueeze(0)).data *
                self.support).sum(2).max(1)[0][0]

    def train(self):
        self.online_net.train()

    def eval(self):
        self.online_net.eval()
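
The projection step in learn() above (clamping Tz, the floor/ceil indices l and u, and the two index_add_ calls) is easiest to see on a single toy distribution; a minimal standalone sketch, independent of the agent:

import torch

# Toy categorical projection: project Tz = r + gamma * z back onto the fixed
# support z, splitting each atom's mass between its floor and ceil neighbours,
# exactly as in learn() above (batch size 1, uniform toy distribution).
Vmin, Vmax, atoms = -10.0, 10.0, 51
support = torch.linspace(Vmin, Vmax, atoms)
delta_z = (Vmax - Vmin) / (atoms - 1)

probs = torch.full((atoms,), 1.0 / atoms)          # p(s', a*): uniform toy distribution
reward, discount = 1.0, 0.99

Tz = (reward + discount * support).clamp(Vmin, Vmax)
b = (Tz - Vmin) / delta_z
l, u = b.floor().long(), b.ceil().long()
l[(u > 0) & (l == u)] -= 1                         # fix disappearing mass when b is an integer
u[(l < atoms - 1) & (l == u)] += 1

m = torch.zeros(atoms)
m.index_add_(0, l, probs * (u.float() - b))        # m_l += p * (u - b)
m.index_add_(0, u, probs * (b - l.float()))        # m_u += p * (b - l)
assert torch.isclose(m.sum(), torch.tensor(1.0))   # projected mass still sums to 1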
Example #22
0
def train():
    print('Waking up brain cells..')
    sess = tf.Session()
    game = Game(SCREEN_WIDTH, SCREEN_HEIGHT, show_game=False)
    # Set NUM_ACTION, the number of outputs, i.e. the number of actions to choose from
    brain = DQN(sess, SCREEN_WIDTH, SCREEN_HEIGHT, NUM_ACTION)

    # Save and monitor training results
    # Store and check the score obtained in each game
    rewards = tf.placeholder(tf.float32, [None])
    tf.summary.scalar('avg.reward/ep.', tf.reduce_mean(rewards))

    # Saver for checkpoint files
    saver = tf.train.Saver()
    sess.run(tf.global_variables_initializer())

    writer = tf.summary.FileWriter('logs', sess.graph)
    summary_merged = tf.summary.merge_all()

    # Initialize the target network
    brain.update_target_network()

    # Decides when to start using the DQN to choose actions:
    # act randomly before enough time has passed, and reduce epsilon as the game progresses
    epsilon = 1.0

    # Number of frames elapsed, used to schedule training
    time_step = 0
    # List of scores for monitoring training results
    total_reward_list = []

    # Start training
    for episode in range(MAX_EPISODE):
        terminal = False  # game-over flag
        total_reward = 0  # total score obtained in one game

        state = game.reset()  # reset the game
        brain.init_state(state)  # initialize the DQN with the starting state

        # Play the game until the green square collides with another square
        while not terminal:

            # Early in training (before 100 episodes), act at random
            if np.random.rand() < epsilon:
                action = random.randrange(NUM_ACTION)
            else:
                action = brain.get_action()

            # After 100 episodes, gradually reduce the ratio of random actions
            if episode > OBSERVE:
                epsilon -= 1 / 1000

            # Get the game state, the reward, and whether the game is over
            state, reward, terminal = game.step(action)
            total_reward += reward

            # Store the current state in the network object
            # The stored experience is used to train the network
            brain.remember(state, action, reward, terminal)

            # After 100 frames, train once every 4 frames
            if time_step > OBSERVE and time_step % TRAIN_INTERVAL == 0:
                brain.train()
            # Update the target network once every 1000 frames
            if time_step % TARGET_UPDATE_INTERVAL == 0:
                brain.update_target_network()
            time_step += 1

        # When the game ends, print and store the score
        print('Episode: %d Score: %d' % (episode + 1, total_reward))
        total_reward_list.append(total_reward)

        # Every 10 episodes log the scores; every 100 episodes save the model
        if episode % 10 == 0:
            summary = sess.run(summary_merged,
                               feed_dict={rewards: total_reward_list})
            writer.add_summary(summary, time_step)
            total_reward_list = []

        if episode % 100 == 0:
            saver.save(sess, 'model/dqn.ckpt', global_step=time_step)
Example #23
0
class Agent():
    def __init__(self, args, env):
        self.action_space = env.action_space()
        self.batch_size = args.batch_size
        self.discount = args.discount
        self.max_gradient_norm = args.max_gradient_norm

        self.policy_net = DQN(args, self.action_space)
        if args.model and os.path.isfile(args.model):
            self.policy_net.load_state_dict(torch.load(args.model))
        self.policy_net.train()

        self.target_net = DQN(args, self.action_space)
        self.update_target_net()
        self.target_net.eval()

        self.optimiser = optim.Adam(self.policy_net.parameters(), lr=args.lr)

    def act(self, state, epsilon):
        if random.random() > epsilon:
            return self.policy_net(state.unsqueeze(0)).max(1)[1].data[0]
        else:
            return random.randint(0, self.action_space - 1)

    def learn(self, mem):
        transitions = mem.sample(self.batch_size)
        batch = Transition(*zip(*transitions))  # Transpose the batch

        states = Variable(torch.stack(batch.state, 0))
        actions = Variable(torch.LongTensor(batch.action).unsqueeze(1))
        rewards = Variable(torch.Tensor(batch.reward))
        non_final_mask = torch.ByteTensor(
            tuple(map(
                lambda s: s is not None,
                batch.next_state)))  # Only process non-terminal next states
        next_states = Variable(
            torch.stack(tuple(s for s in batch.next_state if s is not None),
                        0),
            volatile=True
        )  # Prevent backpropagating through expected action values

        Qs = self.policy_net(states).gather(1, actions)  # Q(s_t, a_t; θpolicy)
        next_state_argmax_indices = self.policy_net(next_states).max(
            1, keepdim=True
        )[1]  # Perform argmax action selection using policy network: argmax_a[Q(s_t+1, a; θpolicy)]
        Qns = Variable(torch.zeros(
            self.batch_size))  # Q(s_t+1, a) = 0 if s_t+1 is terminal
        Qns[non_final_mask] = self.target_net(next_states).gather(
            1, next_state_argmax_indices
        )  # Q(s_t+1, argmax_a[Q(s_t+1, a; θpolicy)]; θtarget)
        Qns.volatile = False  # Remove volatile flag to prevent propagating it through loss
        target = rewards + (
            self.discount * Qns
        )  # Double-Q target: Y = r + γ.Q(s_t+1, argmax_a[Q(s_t+1, a; θpolicy)]; θtarget)

        loss = F.smooth_l1_loss(
            Qs, target)  # Huber loss on TD-error δ: δ = Y - Q(s_t, a_t)
        # TODO: TD-error clipping?
        self.policy_net.zero_grad()
        loss.backward()
        nn.utils.clip_grad_norm(self.policy_net.parameters(),
                                self.max_gradient_norm)  # Clamp gradients
        self.optimiser.step()

    def update_target_net(self):
        self.target_net.load_state_dict(self.policy_net.state_dict())

    def save(self, path):
        torch.save(self.policy_net.state_dict(),
                   os.path.join(path, 'model.pth'))

    def evaluate_q(self, state):
        return self.policy_net(state.unsqueeze(0)).max(1)[0].data[0]

    def train(self):
        self.policy_net.train()

    def eval(self):
        self.policy_net.eval()
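
The learn() method above relies on pre-0.4 PyTorch idioms (Variable, volatile=True, ByteTensor masks). A minimal sketch of the same Double-DQN target written against current PyTorch, with tiny stand-in networks and random data purely for illustration (terminal-state masking omitted for brevity):

import torch
import torch.nn as nn

# Same Double-DQN target as learn() above, but torch.no_grad() replaces
# volatile=True and plain tensors replace Variable wrappers.
state_dim, n_actions, batch, discount = 4, 2, 8, 0.99
policy_net = nn.Linear(state_dim, n_actions)     # stand-in for the real DQN
target_net = nn.Linear(state_dim, n_actions)
next_states = torch.randn(batch, state_dim)
rewards = torch.randn(batch)

with torch.no_grad():
    next_actions = policy_net(next_states).argmax(dim=1, keepdim=True)  # argmax_a Q(s', a; policy)
    Qns = target_net(next_states).gather(1, next_actions).squeeze(1)    # Q(s', argmax_a; target)
    target = rewards + discount * Qns                                   # Y = r + gamma * Q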
Example #24
0
def train_rl(images, targets, folds, stochastic = False, test = False, base_rand = False): 
    print('start train rl')


    #print(images.shape)
    #(X_train, y_train), (X_val, y_val), (X_test, y_test) = reformatInput_rl(images, targets, fold)

    #X_train = X_train.astype("float32", casting='unsafe')
    #X_val = X_val.astype("float32", casting='unsafe')
    #X_test = X_test.astype("float32", casting='unsafe')
    
    #print('check')
    #print(X_train.shape)
    with tf.Session() as sess:
        #onfig = get_config(FLAGS) or FLAGS
       
        model = DQN(sess, SCREEN_WIDTH, SCREEN_HEIGHT, n_act)
        
        rewards = tf.placeholder(tf.float32, [None])
        tf.summary.scalar('avg.reward/ep.', tf.reduce_mean(rewards))

        saver = tf.train.Saver()
        sess.run(tf.global_variables_initializer())
        
        writer = tf.summary.FileWriter('logs', sess.graph)
        summary_merged = tf.summary.merge_all()
        
        print('total %s folds' % len(folds))
        
        #(X_train, y_train), (X_val, y_val), (X_test, y_test) = reformatInput_rl(images, targets, fold)
#        X_train = X_train.astype("float32", casting='unsafe')
#        X_val = X_val.astype("float32", casting='unsafe')
#        X_test = X_test.astype("float32", casting='unsafe')

        ###

        # init target network
        model.update_target_network()
        
        # get next action from DQN
        epsilon = 1.0
        # def frame N
        t_step = 0
        tot_reward_list = []


        MAX_EPISODE = 10000
        n_img = len(targets)
        
        n_epi = n_img
        if stochastic: n_epi = MAX_EPISODE
        

        # call pred & loss 
        n_test = 3 
        if test:  #for debugging
            pred_all, loss_all = predict_all(images[0:n_test, :], targets[0:n_test, :])
            if not stochastic: n_epi = n_test
        else: pred_all, loss_all = predict_all(images, targets)
        
        #pred_all_train, loss_all_train = predict_all(X_train, y_train)

        #print(pred_all)

        # run simulation
        pred_rl = []
        for epi in range(n_epi):
            terminal = False
            tot_reward = 0

            #init game & get current state
            
            #state parsing
            state = np.expand_dims(images[epi], 0)
            #state = np.expand_dims(X_train[epi], 0)
            model.init_state(state)

            if np.random.rand() < epsilon:
                act = random.randrange(n_act)
            else:
                act = model.get_action()

            if epi > OBSERVE: epsilon -= 1/100
            if base_rand: act = random.randrange(n_act)
            
            #stochastic define
            if stochastic:
                ii = random.randrange(n_img)
                state = np.expand_dims(images[ii], 0)  
                #state = np.expand_dims(X_train[ii], 0)
                state_i = ii

            else:
                state = np.expand_dims(images[epi], 0)
                #state = np.expand_dims(X_train[epi], 0)
                state_i = epi
            
            # get model str by act
            choosen_model = model_list[act]
            
            # reward function
            if pred_all[choosen_model][state_i] == 1:
                reward = 1
                pred_rl.append(1)
            else:
                reward = -2
                pred_rl.append(0)


            tot_reward += reward

            model.remember(state, act, reward, terminal)

            if t_step > OBSERVE and t_step % TRAIN_INTERVAL == 0:
                # DQN train
                model.train()

            if t_step % TARGET_UPDATE_INTERVAL == 0:
                # target update
                model.update_target_network()

            t_step += 1

            print('epi: %d score: %d' % ((epi+1), tot_reward))

            tot_reward_list.append(tot_reward)

            if epi % 10 == 0:
                summary = sess.run(summary_merged, feed_dict={rewards: tot_reward_list})
                writer.add_summary(summary, t_step)
                tot_reward_list = []

            if epi % 100 == 0:
                saver.save(sess, 'model/dqn.ckpt', global_step=t_step)

        return tot_reward_list, pred_rl, pred_all
Example #25
0
        winner_rewards = [1] * len(boards[winner])
        loser_rewards = [-1] * len(boards[loser])

        rewards = np.array(winner_rewards + loser_rewards)
        boards = np.concatenate([boards[winner], boards[loser]])

    else:
        #tie
        one_rewards = [0] * len(boards[1])
        two_rewards = [0] * len(boards[2])

        rewards = np.array(one_rewards + two_rewards)
        boards = np.concatenate([boards[1], boards[2]])
    
    rewards = rewards.reshape(rewards.shape[0], -1)
    model.train(boards, rewards)

    if games % EPOCH == 0:
        gamma *= 1.0
        win_rate.append(test_against_random(model))

        debug_run(model)

    if games % TEST_FRQ == 0:
        print(win_rate)
        plt.plot(win_rate)
        plt.ylabel('Winning Percentage')
        plt.xlabel('Epochs')
        plt.show()
        model.save('./dqn_no/')
        exit()
Example #26
0
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, seed, network):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.network = network

        # Q-Network
        if self.network == "duel":
            self.qnetwork_local = DuelingDQN(state_size, action_size,
                                             seed).to(device)
            self.qnetwork_target = DuelingDQN(state_size, action_size,
                                              seed).to(device)
            self.optimizer = optim.Adam(self.qnetwork_local.parameters(),
                                        lr=LR)

        else:
            self.qnetwork_local = DQN(state_size, action_size, seed).to(device)
            self.qnetwork_target = DQN(state_size, action_size,
                                       seed).to(device)
            self.optimizer = optim.Adam(self.qnetwork_local.parameters(),
                                        lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done, count):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA, count)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """

        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
            self.qnetwork_local.train()

            # Epsilon-greedy action selection
            if random.random() > eps:
                return np.argmax(action_values.cpu().data.numpy())
            else:
                return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma, count):
        """Update value parameters using given batch of experience tuples.
        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Choose the greedy actions for next_states: with the current (local)
        # network for Double/Dueling DQN, otherwise with the target network

        if self.network in ("double", "duel"):
            Q_L = self.qnetwork_local(next_states).detach()
            _, actions_prime = Q_L.max(1)
        else:
            _, actions_prime = self.qnetwork_target(next_states).detach().max(1)

        # get Q values from frozen network for next state and chosen action

        Q_targets_next = self.qnetwork_target(next_states).detach()
        Q_targets_next_s_a_prime = Q_targets_next.gather(
            1, actions_prime.unsqueeze(1))

        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next_s_a_prime * (1 - dones))

        # Get expected Q values from target model using current actions
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Compute loss
        loss = F.smooth_l1_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        #if count >= TARGET_UPDATE:
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
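
A small numeric check of the soft update above, using two throwaway linear layers to show the target parameters drifting toward the local ones at rate tau; the layer sizes and number of updates are arbitrary:

import torch.nn as nn

local, target = nn.Linear(2, 2), nn.Linear(2, 2)
tau = 0.1

for _ in range(100):
    # theta_target <- tau * theta_local + (1 - tau) * theta_target
    for t_param, l_param in zip(target.parameters(), local.parameters()):
        t_param.data.copy_(tau * l_param.data + (1.0 - tau) * t_param.data)

# After many updates the target weights are very close to the local weights
print((target.weight - local.weight).abs().max().item())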
Example #27
0
class Agent(object):
    def __init__(self, args, action_space):
        self.action_space = action_space
        self.batch_size = args.batch_size
        self.discount = args.discount

        self.online_net = DQN(args, self.action_space).to(device=args.device)
        self.online_net.train()

        self.target_net = DQN(args, self.action_space).to(device=args.device)
        self.update_target_net()
        self.target_net.train()
        for param in self.target_net.parameters():
            param.requires_grad = False

        self.optimiser = optim.Adam(self.online_net.parameters(),
                                    lr=args.lr,
                                    eps=args.adam_eps)
        self.loss_func = nn.MSELoss()

    # Acts based on single state (no batch)
    def act(self, state):
        with torch.no_grad():
            return self.online_net([state]).argmax(1).item()

    # Acts with an ε-greedy policy (used for evaluation only)
    def act_e_greedy(
            self,
            state,
            epsilon=0.05):  # High ε can reduce evaluation scores drastically
        return random.randrange(
            self.action_space) if random.random() < epsilon else self.act(
                state)

    def learn(self, mem):

        # Sample transitions
        states, actions, next_states, rewards = mem.sample(self.batch_size)

        q_eval = self.online_net(states).gather(
            1, actions.unsqueeze(1)).squeeze()
        with torch.no_grad():
            q_eval_next_a = self.online_net(next_states).argmax(1)
            q_next = self.target_net(next_states)
            q_target = rewards + self.discount * q_next.gather(
                1, q_eval_next_a.unsqueeze(1)).squeeze()

        loss = self.loss_func(q_eval, q_target)
        self.online_net.zero_grad()
        loss.backward()
        self.optimiser.step()

    def update_target_net(self):
        self.target_net.load_state_dict(self.online_net.state_dict())

    # Save model parameters on current device (don't move model between devices)
    def save(self, path):
        torch.save(self.online_net.state_dict(), path + '.pth')

    # Evaluates Q-value based on single state (no batch)
    def evaluate_q(self, state):
        with torch.no_grad():
            return (self.online_net([state])).max(1)[0].item()

    def train(self):
        self.online_net.train()

    def eval(self):
        self.online_net.eval()
Example #28
0
@dataclass  # assumes `from dataclasses import dataclass`; needed for the annotated fields and __post_init__ below
class Agent:
  state: int
  actions: int
  history: int = 4
  atoms: int = 5 #51
  Vmin: float = -10
  Vmax: float = 10
  
  lr: float = 1e-5
  batch_size: int = 32
  discount: float = 0.99
  norm_clip: float = 10.

  def __post_init__(self):
    self.support = torch.linspace(self.Vmin, self.Vmax, self.atoms)
    self.delta_z = (self.Vmax - self.Vmin) / (self.atoms - 1)

    self.online_net = DQN(self.state, self.actions, self.history, self.atoms)
    self.online_net.train()

    self.target_net = DQN(self.state, self.actions, self.history, self.atoms)
    self.update_target_net()
    self.target_net.train()
    for param in self.target_net.parameters(): param.requires_grad = False

    self.optimiser = optim.Adam(self.online_net.parameters(), lr=self.lr)

  def act(self, state):
    state = torch.FloatTensor(state).unsqueeze(0)
    with torch.no_grad():
      return (self.online_net(state) * self.support).sum(2).argmax(1).item()

  def act_e_greedy(self, state, epsilon=0.001):
    return random.randrange(self.actions) if random.random() < epsilon else self.act(state)

  def learn(self, buffer):
    state, action, reward, next_state, terminal, weights, idx = buffer.sample(self.batch_size)
    state = torch.FloatTensor(state)
    action = torch.LongTensor(action)
    reward = torch.FloatTensor(reward)
    next_state = torch.FloatTensor(next_state)
    terminal = torch.FloatTensor(terminal)
    weights = torch.FloatTensor(weights)

    log_ps = self.online_net(state, log=True)
    log_ps_a = log_ps[range(self.batch_size), action]

    with torch.no_grad():
      # Calculate nth next state probabilities
      pns = self.online_net(next_state)
      dns = self.support.expand_as(pns) * pns
      argmax_indices_ns = dns.sum(2).argmax(1)
      self.target_net.sample_noise()
      pns = self.target_net(next_state)
      pns_a = pns[range(self.batch_size), argmax_indices_ns]

      # Compute Bellman operator T applied to z
      Tz = reward.unsqueeze(1) + (1 - terminal).unsqueeze(1) * self.discount * self.support.unsqueeze(0) # -10 ... 10 + reward
      Tz.clamp_(min=self.Vmin, max=self.Vmax)
      
      # Compute L2 projection of Tz onto fixed support z
      b = (Tz - self.Vmin) / self.delta_z # 0 ... 4
      l, u = b.floor().to(torch.int64), b.ceil().to(torch.int64)
      # Fix disappearing probability mass when l = b = u (b is int)
      l[(u > 0) * (l == u)] -= 1
      u[(l < (self.atoms - 1)) * (l == u)] += 1

      # Distribute probability of Tz
      m = state.new_zeros(self.batch_size, self.atoms)
      offset = torch.linspace(0, ((self.batch_size - 1) * self.atoms), self.batch_size).unsqueeze(1).expand(self.batch_size, self.atoms).to(action)
      m.view(-1).index_add_(0, (l + offset).view(-1), (pns_a * (u.float() - b)).view(-1))  # m_l = m_l + p(s_t+n, a*)(u - b)
      m.view(-1).index_add_(0, (u + offset).view(-1), (pns_a * (b - l.float())).view(-1))  # m_u = m_u + p(s_t+n, a*)(b - l)

    loss = -torch.sum(m * log_ps_a, 1)  # Cross-entropy loss (minimises DKL(m||p(s_t, a_t)))
    loss = weights * loss

#     q_values = self.online_net(state)
#     q_value = q_values[range(self.batch_size), action]

#     next_q_values = self.target_net(next_state)
#     next_q_value = next_q_values.max(1)[0]

#     expected_q_value = reward + self.discount * next_q_value * (1 - terminal)
#     loss = weights * (q_value - expected_q_value).pow(2)

    self.optimiser.zero_grad()
    loss.mean().backward()
    nn.utils.clip_grad_norm_(self.online_net.parameters(), self.norm_clip)  # clip gradients before the update
    self.optimiser.step()

    buffer.update_priorities(idx, loss.tolist())

  def update_target_net(self):
    self.target_net.load_state_dict(self.online_net.state_dict())

  def sample_noise(self):
    self.online_net.sample_noise()

  def save(self, path):
    torch.save(self.online_net.state_dict(), path)

  # Evaluates Q-value based on single state (no batch)
  def evaluate_q(self, state):
    with torch.no_grad():
      return self.online_net(state.unsqueeze(0)).max(1)[0].item()

  def train(self):
    self.online_net.train()

  def eval(self):
    self.online_net.eval()
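
The action selection in act() above first reduces each action's value distribution to its expectation over the support and then takes the argmax; a standalone toy sketch of that readout (the numbers are arbitrary):

import torch

# Expected-Q readout used by act(): E[Z(s, a)] = sum_i p_i(s, a) * z_i,
# then pick the action with the largest expectation.
atoms, n_actions = 5, 3
support = torch.linspace(-10, 10, atoms)                         # z_1 ... z_atoms
probs = torch.softmax(torch.randn(1, n_actions, atoms), dim=2)   # p(s, ·) for a single state
q_values = (probs * support).sum(2)                              # shape (1, n_actions)
action = q_values.argmax(1).item()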
Example #29
0
                    else:
                        action = dqn.get_action()

                    if epsilon > args.observe:
                        epsilon -= 0.001

                    reward, done = game.step(action)
                    total_reward += reward

                    if time.time() - start > 1:
                        dqn.save_memory(action, reward, done,
                                        rgb2gray(game.state))
                        count += 1

                    if frame_count > args.observe and frame_count % args.train_term == 0 and count > 50:
                        dqn.train()

                    if frame_count % args.update_term == 0:
                        dqn.copy2target()

                    frame_count += 1
                PressKey(ENTER)
                time.sleep(0.1)
                ReleaseKey(ENTER)

                if episode % 10 == 0:
                    print("Iteration: {}, Score: {}".format(
                        episode, total_reward))
                    rewards.append(total_reward)
                    total_reward = 0
Example #30
0
def train():
    print('Training... waking up brain cells..')
    sess = tf.Session()

    game = Game(SCREEN_WIDTH, SCREEN_HEIGHT, show_game=False)
    brain = DQN(sess, SCREEN_WIDTH, SCREEN_HEIGHT, NUM_ACTION)

    rewards = tf.placeholder(tf.float32, [None])
    tf.summary.scalar('avg.reward/ep.', tf.reduce_mean(rewards))

    saver = tf.train.Saver()
    sess.run(tf.global_variables_initializer())

    writer = tf.summary.FileWriter('logs', sess.graph)
    summary_merged = tf.summary.merge_all()

    # Initialize the target network.
    brain.update_target_network()

    # Decides when to start using the DQN to choose the next action.
    epsilon = 1.0
    # Frame counter
    time_step = 0
    total_reward_list = []

    # Start the game.
    for episode in range(MAX_EPISODE):
        terminal = False
        total_reward = 0

        # Reset the game and get the current state.
        # The state is the screen layout of size screen_width x screen_height.
        state = game.reset()
        brain.init_state(state)

        while not terminal:
            # If a random value is smaller than epsilon, choose a random action;
            # otherwise choose the action with the DQN.
            # This is because little learning has happened early on.
            # At the start almost all actions are random; as epsilon decays,
            # random actions are used less and less.
            if np.random.rand() < epsilon:
                action = random.randrange(NUM_ACTION)
            else:
                action = brain.get_action()

            # Start reducing epsilon after a certain amount of time has passed,
            # because no learning has happened at all in the beginning.
            if episode > OBSERVE:
                epsilon -= 1 / 1000

            # Step the game with the chosen action and get back the reward and whether the game is over.
            state, reward, terminal = game.step(action)
            total_reward += reward

            # Store the current state in the Brain.
            # The stored experience is used for training and to decide the action to take in the next state.
            brain.remember(state, action, reward, terminal)

            if time_step > OBSERVE and time_step % TRAIN_INTERVAL == 0:
                # Train the DQN.
                brain.train()

            if time_step % TARGET_UPDATE_INTERVAL == 0:
                # Update the target network.
                brain.update_target_network()

            time_step += 1

        print('Episode: %d Score: %d' % (episode + 1, total_reward))

        total_reward_list.append(total_reward)

        if episode % 10 == 0:
            summary = sess.run(summary_merged,
                               feed_dict={rewards: total_reward_list})
            writer.add_summary(summary, time_step)
            total_reward_list = []

        if episode % 100 == 0:
            saver.save(sess, 'model/dqn.ckpt', global_step=time_step)
Example #31
0
class Agent():
    def __init__(self, args, env):
        self.action_space = env.action_space()
        self.atoms = args.atoms
        self.Vmin = args.V_min
        self.Vmax = args.V_max
        self.support = torch.linspace(args.V_min, args.V_max, self.atoms).to(
            device=args.device)  # Support (range) of z
        self.delta_z = (args.V_max - args.V_min) / (self.atoms - 1)
        self.batch_size = args.batch_size
        self.n = args.multi_step
        self.discount = args.discount

        self.online_net = DQN(args, self.action_space).to(device=args.device)
        if args.model and os.path.isfile(args.model):
            # Always load tensors onto CPU by default, will shift to GPU if necessary
            self.online_net.load_state_dict(
                torch.load(args.model, map_location='cpu'))
        self.online_net.train()

        self.target_net = DQN(args, self.action_space).to(device=args.device)
        self.update_target_net()
        self.target_net.train()
        for param in self.target_net.parameters():
            param.requires_grad = False

        self.optimiser = optim.Adam(self.online_net.parameters(),
                                    lr=args.lr,
                                    eps=args.adam_eps)

    # Resets noisy weights in all linear layers (of online net only)
    def reset_noise(self):
        self.online_net.reset_noise()

    # Acts based on single state (no batch)
    def act(self, state):
        with torch.no_grad():
            return (self.online_net(state.unsqueeze(0)) *
                    self.support).sum(2).argmax(1).item()

    # Acts with an ε-greedy policy (used for evaluation only)
    def act_e_greedy(
            self,
            state,
            epsilon=0.001):  # High ε can reduce evaluation scores drastically
        return random.randrange(
            self.action_space) if random.random() < epsilon else self.act(
                state)

    def learn(self, mem):
        # Sample transitions
        idxs, states, actions, returns, next_states, nonterminals, weights = mem.sample(
            self.batch_size)

        # Calculate current state probabilities (online network noise already sampled)
        log_ps = self.online_net(
            states, log=True)  # Log probabilities log p(s_t, ·; θonline)
        log_ps_a = log_ps[range(self.batch_size),
                          actions]  # log p(s_t, a_t; θonline)

        with torch.no_grad():
            # Calculate nth next state probabilities
            pns = self.online_net(
                next_states)  # Probabilities p(s_t+n, ·; θonline)
            dns = self.support.expand_as(
                pns) * pns  # Distribution d_t+n = (z, p(s_t+n, ·; θonline))
            argmax_indices_ns = dns.sum(2).argmax(
                1
            )  # Perform argmax action selection using online network: argmax_a[(z, p(s_t+n, a; θonline))]
            self.target_net.reset_noise()  # Sample new target net noise
            pns = self.target_net(
                next_states)  # Probabilities p(s_t+n, ·; θtarget)
            pns_a = pns[range(
                self.batch_size
            ), argmax_indices_ns]  # Double-Q probabilities p(s_t+n, argmax_a[(z, p(s_t+n, a; θonline))]; θtarget)

            # Compute Tz (Bellman operator T applied to z)
            Tz = returns.unsqueeze(1) + nonterminals * (
                self.discount**self.n) * self.support.unsqueeze(
                    0)  # Tz = R^n + (γ^n)z (accounting for terminal states)
            Tz = Tz.clamp(min=self.Vmin,
                          max=self.Vmax)  # Clamp between supported values
            # Compute L2 projection of Tz onto fixed support z
            b = (Tz - self.Vmin) / self.delta_z  # b = (Tz - Vmin) / Δz
            l, u = b.floor().to(torch.int64), b.ceil().to(torch.int64)
            # Fix disappearing probability mass when l = b = u (b is int)
            l[(u > 0) * (l == u)] -= 1
            u[(l < (self.atoms - 1)) * (l == u)] += 1

            # Distribute probability of Tz
            m = states.new_zeros(self.batch_size, self.atoms)
            offset = torch.linspace(0, ((self.batch_size - 1) * self.atoms),
                                    self.batch_size).unsqueeze(1).expand(
                                        self.batch_size,
                                        self.atoms).to(actions)
            m.view(-1).index_add_(
                0, (l + offset).view(-1),
                (pns_a *
                 (u.float() - b)).view(-1))  # m_l = m_l + p(s_t+n, a*)(u - b)
            m.view(-1).index_add_(
                0, (u + offset).view(-1),
                (pns_a *
                 (b - l.float())).view(-1))  # m_u = m_u + p(s_t+n, a*)(b - l)

        loss = -torch.sum(
            m * log_ps_a,
            1)  # Cross-entropy loss (minimises DKL(m||p(s_t, a_t)))
        self.online_net.zero_grad()
        (weights * loss).mean().backward(
        )  # Backpropagate importance-weighted minibatch loss
        self.optimiser.step()

        mem.update_priorities(
            idxs, loss.detach())  # Update priorities of sampled transitions

    def update_target_net(self):
        self.target_net.load_state_dict(self.online_net.state_dict())

    # Save model parameters on current device (don't move model between devices)
    def save(self, path):
        torch.save(self.online_net.state_dict(),
                   os.path.join(path, 'model.pth'))

    # Evaluates Q-value based on single state (no batch)
    def evaluate_q(self, state):
        with torch.no_grad():
            return (self.online_net(state.unsqueeze(0)) *
                    self.support).sum(2).max(1)[0].item()

    def train(self):
        self.online_net.train()

    def eval(self):
        self.online_net.eval()
Example #32
0
def train(IS_IMPORT):
    print('Loading ...')
    sess = tf.Session()

    # Decides when to start using the DQN to choose the next action.
    epsilon = 1.0
    # Frame counter
    time_step = 0
    global_step = tf.Variable(0, trainable=False, name='global_step')

    brain = DQN(sess, SCREEN_WIDTH, SCREEN_HEIGHT, NUM_ACTION, global_step)
    #brain = DQN(sess, 61, global_step)

    rewards = tf.placeholder(tf.float32, [None])
    tf.summary.scalar('avg.reward/ep.', tf.reduce_mean(rewards))
    totalScores = tf.placeholder(tf.float32, [None])
    tf.summary.scalar('avg.totalScore/ep.', tf.reduce_mean(totalScores))

    total_reward_list = []
    total_score_list = []

    saver = tf.train.Saver(tf.global_variables())

    ckpt = tf.train.get_checkpoint_state(MODEL_PATH)
    writer = tf.summary.FileWriter(LOG_PATH, sess.graph)

    if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):
        saver.restore(sess, ckpt.model_checkpoint_path)
    else:
        sess.run(tf.global_variables_initializer())

    summary_merged = tf.summary.merge_all()

    if IS_IMPORT:
        fs = FileLoad(r'F:\work\cocos\dqnTest\Resources\scenario - Copy.sce')
    else:
        server.accept()

    brain.update_target_network()
    print('global_step:', sess.run(global_step))

    # Start the game.
    for episode in range(MAX_EPISODE):
        terminal = False
        total_reward = 0
        weight = 0

        # Reset the game and get the current state.
        # The state is the screen layout of size screen_width x screen_height.
        #state = game.reset()
        if IS_IMPORT:
            id, _, _, _, state = fs.readState()
            if id == -1:
                sys.exit(1)
        else:
            id, _, _, _, state = server.readStatus()

        if id == -1:
            continue

        state = reshapeFromPacket(state)
        '''
        state.append(state[2])
        state.append(state[2])               
        '''

        brain.init_state(state)

        while not terminal:
            actionType = "Action:"

            if IS_IMPORT:
                action = fs.readAction()
                if action == -1: sys.exit(1)
                id, reward, totalScore, terminal, state = fs.readState()
                if id == -1: sys.exit(1)
            else:

                if np.random.rand() < epsilon:
                    action = random.randrange(NUM_ACTION)
                    print("Random action:", action)
                    #action = -1
                    #action = random.uniform(-1, 1)
                else:
                    action = brain.get_action()

                #action = brain.get_action()

                if episode > OBSERVE:
                    epsilon -= 1 / 1000

                server.sendX(id, action)

                if action == -1:
                    id2, action = server.readAction()
                    actionType = "Random Action:"

                    if id != id2:
                        print("Invalid Packet", id, id2)

                id, reward, totalScore, terminal, state = server.readStatus()

            reward = reward + (weight * 0.1)
            weight = weight + 1

            print(time.strftime("%H:%M:%S", time.localtime()), id, actionType,
                  action, "totalScore:", totalScore, "reward:", reward,
                  "terminal", terminal)

            if id == -1:
                break
            if terminal:
                total_score_list.append(totalScore)

            state = reshapeFromPacket(state)

            total_reward += reward

            # Store the current state in the Brain.
            # The stored experience is used for training and to decide the action to take in the next state.
            brain.remember(state, action, reward, terminal)

            if time_step > OBSERVE and time_step % TRAIN_INTERVAL == 0:
                # Train the DQN.

                brain.train()
                '''
                try:
                except:
                    print("Train Error!!")
                    time_step -= 1
                '''

            if time_step % TARGET_UPDATE_INTERVAL == 0:
                # Update the target network.
                brain.update_target_network()

            time_step += 1

        print('\t Count of Play: %d Score: %d' % (episode + 1, total_reward))

        total_reward_list.append(total_reward)

        if (episode) % 10 == 0:
            summary = sess.run(summary_merged,
                               feed_dict={
                                   rewards: total_reward_list,
                                   totalScores: total_score_list
                               })
            writer.add_summary(summary, sess.run(global_step))
            total_reward_list = []
            total_score_list = []

        if (episode + 1) % 100 == 0:
            saver.save(sess, MODEL_PATH + '/dqn.ckpt', global_step=global_step)

    # After all training is done, save the model as a tflite file
    converter = tf.lite.TFLiteConverter.from_session(sess, [brain.input_X],
                                                     [brain.Q])
    tflite_model = converter.convert()
    with open(MODEL_PATH + "/dqn.tflite", "wb") as f:
        f.write(tflite_model)
    sys.exit(1)
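
The script above ends by exporting the trained Q-network to dqn.tflite. A minimal sketch of loading that file for inference with tf.lite.Interpreter; the dummy state below only illustrates the call sequence, and its real shape depends on how brain.input_X was defined:

import numpy as np
import tensorflow as tf

# Load the converted model and run a single greedy action selection.
interpreter = tf.lite.Interpreter(model_path=MODEL_PATH + '/dqn.tflite')
interpreter.allocate_tensors()
input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()

state = np.zeros(input_details[0]['shape'], dtype=np.float32)  # placeholder state
interpreter.set_tensor(input_details[0]['index'], state)
interpreter.invoke()
q_values = interpreter.get_tensor(output_details[0]['index'])
action = int(np.argmax(q_values))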