def main(_config):
    env = gym.make(_config.ENV_NAME)
    agent = DQN(env, _config)

    print("[*] --- Begin Emulator Training ---")
    for episode in range(_config.EPISODE):
        obs = env.reset()
        # === Emulator ===
        for i in range(_config.STEP):
            action = agent.pick_action(obs)
            obs_next, reward, done, _ = env.step(action)
            # The agent stores the newest experience in its replay buffer
            # and trains off-policy on mini-batches.
            agent.perceive(obs, action, reward, done)
            if done:
                break
            obs = obs_next
        # == train ==
        agent.train(episode)
        if (episode + 1) % agent.save_every == 0:
            agent.save(step=episode)

    # == test ==
    print("\n[*] === Enter TEST module ===")
    test(env, _config.STEP, agent)
    agent.record()
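# The `test` helper and `agent.record()` are referenced above but not defined in
# this snippet. Below is a minimal sketch of what `test` might look like, assuming
# `pick_action` acts greedily outside training and a Gym-style `step` API; the
# `n_episodes` parameter is an illustrative assumption, not from the source.
def test(env, max_step, agent, n_episodes=10):
    total_reward = 0.0
    for _ in range(n_episodes):
        obs = env.reset()
        for _ in range(max_step):
            action = agent.pick_action(obs)  # assumed greedy at test time
            obs, reward, done, _ = env.step(action)
            total_reward += reward
            if done:
                break
    print("[*] Average test reward: %.2f" % (total_reward / n_episodes))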
def train(): print("뇌세포 깨우는 중..") sess = tf.Session() game = Game(SCREEN_WIDTH, SCREEN_HEIGHT, show_game=False) brain = DQN(sess, SCREEN_WIDTH, SCREEN_HEIGHT, NUM_ACTION) rewards = tf.placeholder(tf.float32, [None]) tf.summary.scalar('avg.reward/ep.', tf.reduce_mean(rewards)) saver = tf.train.Saver() sess.run(tf.global_variables_initializer()) writer = tf.summary.FileWriter('logs', sess.graph) summary_merged = tf.summary.merge_all brain.update_target_network() epsilon = 1.0 time_step = 0 total_reward_list = [] for episode in range(MAX_EPISODE): terminal = False total_reward = 0 state = game.reset() brain.init_state(state) while not terminal: if np.random.rand() < epsilon: action = random.randrange(NUM_ACTION) else: action = brain.get_action() if episode > OBSERVE: rpdilon -= 1 / 1000 state, reward, terminal = game.step(action) total_reward += reward brain.remember(state, action, reward, terminal) if time_step > OBSERVE and time_step % TRAIN_INTERVAL == 0: brain.train() if time_step % TARGET_UPDATE_INTERVAL == 0: brain.update_target_network() time_step += 1 print('게임횟수 : %d, 점수 : %d' % (episode + 1, total_reward)) total_reward_list.append(total_reward) if episode % 10 == 0: summary = sess.run(summary_merged, feed_dict={rewards: total_reward_list}) writer.add_summary(summary, time_step) total_reward_list = [] if episode % 100 == 0: saver.save(sess, 'model/dqn.ckpt', global_step=time_step)
def train():
    print('Waking up the brain cells..')
    sess = tf.Session()

    game = Game(screenWidth, screenHeight, show_game=False)
    brain = DQN(sess, screenWidth, screenHeight, numAction)

    rewards = tf.placeholder(tf.float32, [None])
    tf.summary.scalar('avg.reward/ep.', tf.reduce_mean(rewards))

    saver = tf.train.Saver()
    sess.run(tf.global_variables_initializer())

    writer = tf.summary.FileWriter('logs', sess.graph)
    summaryMerged = tf.summary.merge_all()

    brain.updateTargetNetwork()

    timeStep = 0
    totalRewardList = []
    epsilon = 1.0  # initialise once, so the annealing persists across episodes

    for episode in range(maxEpisode):
        terminal = False
        totalReward = 0

        state = game.reset()
        brain.initState(state)

        while not terminal:
            if np.random.rand() < epsilon:
                action = random.randrange(numAction)
            else:
                action = brain.getAction()

            if episode > observe:
                epsilon -= 1 / 1000

            state, reward, terminal = game.step(action)
            totalReward += reward

            brain.remember(state, action, reward, terminal)

            if timeStep > observe and timeStep % trainInterval == 0:
                brain.train()

            if timeStep % targetUpdateInterval == 0:
                brain.updateTargetNetwork()

            timeStep += 1

        totalRewardList.append(totalReward)

        if episode % 10 == 0:
            summary = sess.run(summaryMerged, feed_dict={rewards: totalRewardList})
            writer.add_summary(summary, timeStep)

        if episode % 100 == 99:
            print("Episode: {0}, score: {1:.4f}".format(episode + 1, totalReward))
            saver.save(sess, './model/dqn.ckpt', global_step=timeStep)
def train():
    print('Waking up the brain cells..')
    sess = tf.Session()

    game = Game(SCREEN_WIDTH, SCREEN_HEIGHT)
    brain = DQN(sess, SCREEN_WIDTH, SCREEN_HEIGHT, NUM_ACTION)

    rewards = tf.placeholder(tf.float32, [None])
    tf.summary.scalar('avg.reward/ep.', tf.reduce_mean(rewards))

    sess.run(tf.global_variables_initializer())

    # Initialise the target network.
    brain.update_target_network()

    time_step = 0
    epsilon = 1.0

    for episode in range(MAX_EPISODE):
        # Start a game.
        terminal = False

        # Reset the game and fetch the current state:
        # a screen_width x screen_height view of the screen.
        _, state, _, _ = game.first_step()
        brain.init_state(state)

        while not terminal:
            # Fetch the game transition.
            action, state, reward, terminal = game.step()

            # Store the current state in the brain; the stored experience is used
            # for training and for deciding the action in the next state.
            brain.remember(state, action, reward, terminal)

            if (time_step > OBSERVE) and (time_step % TRAIN_INTERVAL) == 0:
                brain.train()

            # Update the target network.
            # if (time_step % TARGET_UPDATE_INTERVAL) == 0:
            #     brain.update_target_network()

            time_step += 1

        # if episode % 50 == 0:
        print(episode)
        save_model(sess)
def train():
    with tf.Session() as sess:
        tf.set_random_seed(GLOBAL_SEED)
        brain = DQN(sess, observation_size, action_size)

        rewards = tf.placeholder(tf.float32, [None])
        tf.summary.scalar('avg.reward/ep.', tf.reduce_mean(rewards))

        saver = tf.train.Saver()
        sess.run(tf.global_variables_initializer())

        writer = tf.summary.FileWriter('logs', sess.graph)
        summary_merged = tf.summary.merge_all()

        brain.update_target_network()

        time_step = 0
        total_reward_list = []

        for episode in range(MAX_EPISODE):
            done = False
            total_reward = 0
            epsilon = 1. / ((episode / 10) + 1)

            observation = env.reset()
            brain.init_state(observation)

            while not done:
                if np.random.rand() < epsilon:
                    action = random.randrange(action_size)
                else:
                    action = brain.get_action()

                observation, reward, done, _ = env.step(action)
                # print(observation, reward, done)
                total_reward += reward

                brain.remember(observation, action, reward, done)

                if time_step > 0:
                    if time_step % TRAIN_INTERVAL_FRAMES == 0:
                        _, loss = brain.train()
                    if time_step % TARGET_UPDATE_INTERVAL == 0:
                        brain.update_target_network()

                time_step += 1

            print('episode: %d total_reward: %d' % (episode, total_reward))

            total_reward_list.append(total_reward)
            if episode % 10 == 0:
                summary = sess.run(summary_merged, feed_dict={rewards: total_reward_list})
                writer.add_summary(summary, time_step)
                total_reward_list = []

            if episode % 100 == 0:
                saver.save(sess, 'model/dqn.ckpt', global_step=time_step)
class Agent():
    def __init__(self, args, env):
        self.action_space = env.action_space()
        self.atoms = args.atoms
        self.Vmin = args.V_min
        self.Vmax = args.V_max
        self.support = torch.linspace(args.V_min, args.V_max, args.atoms)  # Support (range) of z
        self.delta_z = (args.V_max - args.V_min) / (args.atoms - 1)
        self.batch_size = args.batch_size
        self.n = args.multi_step
        self.discount = args.discount
        self.priority_exponent = args.priority_exponent
        self.max_gradient_norm = args.max_gradient_norm

        self.policy_net = DQN(args, self.action_space)
        if args.model and os.path.isfile(args.model):
            self.policy_net.load_state_dict(torch.load(args.model))
        self.policy_net.train()

        self.target_net = DQN(args, self.action_space)
        self.update_target_net()
        self.target_net.eval()

        self.optimiser = optim.Adam(self.policy_net.parameters(), lr=args.lr, eps=args.adam_eps)
        if args.cuda:
            self.policy_net.cuda()
            self.target_net.cuda()
            self.support = self.support.cuda()

    # Resets noisy weights in all linear layers (of policy and target nets)
    def reset_noise(self):
        self.policy_net.reset_noise()
        self.target_net.reset_noise()

    # Acts based on single state (no batch)
    def act(self, state):
        return (self.policy_net(state.unsqueeze(0)).data * self.support).sum(2).max(1)[1][0]

    def learn(self, mem):
        idxs, states, actions, returns, next_states, nonterminals, weights = mem.sample(self.batch_size)
        batch_size = len(idxs)  # May return less than specified if invalid transitions sampled

        # Calculate current state probabilities
        ps = self.policy_net(states)  # Probabilities p(s_t, ·; θpolicy)
        ps_a = ps[range(batch_size), actions]  # p(s_t, a_t; θpolicy)

        # Calculate nth next state probabilities
        pns = self.policy_net(next_states).data  # Probabilities p(s_t+n, ·; θpolicy)
        dns = self.support.expand_as(pns) * pns  # Distribution d_t+n = (z, p(s_t+n, ·; θpolicy))
        argmax_indices_ns = dns.sum(2).max(1)[1]  # Perform argmax action selection using policy network: argmax_a[(z, p(s_t+n, a; θpolicy))]
        pns = self.target_net(next_states).data  # Probabilities p(s_t+n, ·; θtarget)
        pns_a = pns[range(batch_size), argmax_indices_ns]  # Double-Q probabilities p(s_t+n, argmax_a[(z, p(s_t+n, a; θpolicy))]; θtarget)
        pns_a *= nonterminals  # Set p = 0 for terminal nth next states as all possible expected returns = expected reward at final transition

        # Compute Tz (Bellman operator T applied to z)
        Tz = returns.unsqueeze(1) + nonterminals * (self.discount ** self.n) * self.support.unsqueeze(0)  # Tz = R^n + (γ^n)z (accounting for terminal states)
        Tz = Tz.clamp(min=self.Vmin, max=self.Vmax)  # Clamp between supported values
        # Compute L2 projection of Tz onto fixed support z
        b = (Tz - self.Vmin) / self.delta_z  # b = (Tz - Vmin) / Δz
        l, u = b.floor().long(), b.ceil().long()

        # Distribute probability of Tz
        m = states.data.new(batch_size, self.atoms).zero_()
        offset = torch.linspace(0, ((batch_size - 1) * self.atoms), batch_size).long().unsqueeze(1).expand(batch_size, self.atoms).type_as(actions)
        m.view(-1).index_add_(0, (l + offset).view(-1), (pns_a * (u.float() - b)).view(-1))  # m_l = m_l + p(s_t+n, a*)(u - b)
        m.view(-1).index_add_(0, (u + offset).view(-1), (pns_a * (b - l.float())).view(-1))  # m_u = m_u + p(s_t+n, a*)(b - l)

        loss = -torch.sum(Variable(m) * ps_a.log(), 1)  # Cross-entropy loss (minimises Kullback-Leibler divergence)
        self.policy_net.zero_grad()
        (weights * loss).mean().backward()  # Importance-weighted losses
        nn.utils.clip_grad_norm(self.policy_net.parameters(), self.max_gradient_norm)  # Clip gradients (normalising by max value of gradient L2 norm)
        self.optimiser.step()

        mem.update_priorities(idxs, loss.data.abs().pow(self.priority_exponent))  # Update priorities of sampled transitions

    def update_target_net(self):
        self.target_net.load_state_dict(self.policy_net.state_dict())

    def save(self, path):
        torch.save(self.policy_net.state_dict(), os.path.join(path, 'model.pth'))

    # Evaluates Q-value based on single state (no batch)
    def evaluate_q(self, state):
        return (self.policy_net(state.unsqueeze(0)).data * self.support).sum(2).max(1)[0][0]

    def train(self):
        self.policy_net.train()

    def eval(self):
        self.policy_net.eval()
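# The densest part of `learn` above is the categorical projection: each atom of
# the shifted support Tz = R^n + (γ^n)z gets its probability mass split between
# the two neighbouring atoms of the fixed support. Below is a minimal NumPy
# sketch of the same projection for a single transition — a hypothetical
# standalone helper written to mirror the vectorised `index_add_` logic, not
# code from the original source.
import numpy as np

def project_distribution(return_n, next_probs, support, discount_n, nonterminal):
    """Project Tz = R^n + (γ^n)z onto the fixed support for one transition."""
    atoms = len(support)
    v_min, v_max = support[0], support[-1]
    delta_z = (v_max - v_min) / (atoms - 1)
    Tz = np.clip(return_n + nonterminal * discount_n * support, v_min, v_max)
    b = (Tz - v_min) / delta_z                       # fractional atom positions
    l, u = np.floor(b).astype(int), np.ceil(b).astype(int)
    l[(u > 0) & (l == u)] -= 1                       # keep mass when b hits an atom exactly
    u[(l < atoms - 1) & (l == u)] += 1
    m = np.zeros(atoms)
    for j in range(atoms):                           # split each atom's mass between l and u
        m[l[j]] += next_probs[j] * (u[j] - b[j])
        m[u[j]] += next_probs[j] * (b[j] - l[j])
    return m                                         # target distribution, sums to 1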
def train():
    print('wake up the brain...')
    sess = tf.Session()

    game = Game(SCREEN_WIDTH, SCREEN_HEIGHT, show_game=False)
    brain = DQN(sess, SCREEN_WIDTH, SCREEN_HEIGHT, NUM_ACTION)

    rewards = tf.placeholder(tf.float32, [None])
    tf.summary.scalar('avg.reward/ep.', tf.reduce_mean(rewards))

    saver = tf.train.Saver()
    sess.run(tf.global_variables_initializer())

    writer = tf.summary.FileWriter('logs', sess.graph)
    summary_merged = tf.summary.merge_all()

    brain.update_target_network()

    epsilon = 1.0
    time_step = 0
    total_reward_list = []

    for episode in range(MAX_EPISODE):
        terminal = False
        total_reward = 0

        state = game.reset()
        brain.init_state(state)

        while not terminal:
            if np.random.rand() < epsilon:
                action = random.randrange(NUM_ACTION)
            else:
                action = brain.get_action()

            if episode > OBSERVE:
                epsilon -= 1 / 1000.

            state, reward, terminal = game.step(action)
            total_reward += reward

            brain.remember(state, action, reward, terminal)

            if time_step > OBSERVE and time_step % TRAIN_INTERVAL == 0:
                brain.train()

            if time_step % TARGET_UPDATE_INTERVAL == 0:
                brain.update_target_network()

            time_step += 1

        print('episode: %d, score: %d' % (episode + 1, total_reward))

        total_reward_list.append(total_reward)

        if episode % 10 == 0:
            summary = sess.run(summary_merged, feed_dict={rewards: total_reward_list})
            writer.add_summary(summary, time_step)
            total_reward_list = []

        if episode % 100 == 0:
            saver.save(sess, 'model/dqn.ckpt', global_step=time_step)
class DQNAgent: """ Interacts with and learns from the environment. Vanilla DQN. """ def __init__(self, state_size: int, action_size: int, seed: int): """ Initialize an Agent object. :param state_size: dimension of each state; :param action_size: dimension of each action; :param seed: random seed. """ self.state_size = state_size self.action_size = action_size random.seed(seed) # Q-Network self.network_local = DQN(state_size, action_size, seed).to(DEVICE) self.network_target = DQN(state_size, action_size, seed).to(DEVICE) self.optimizer = optim.Adam(self.network_local.parameters(), lr=LR) # Replay memory self.memory = ReplayBuffer(BUFFER_SIZE, BATCH_SIZE, seed) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 def step(self, state, action: int, reward: float, next_state, done): """ Save experiences in the replay memory and check if it's time to learn. :param state: (array_like) current state; :param action: action taken; :param reward: reward received; :param next_state: (array_like) next state; :param done: terminal state indicator; int or bool. """ # Save experience in replay memory self.memory.push(state, action, reward, next_state, done) # Increment time step and compare it to the network update frequency self.t_step = (self.t_step + 1) % UPDATE_EVERY if self.t_step == 0: # Check if there is enough samples in the memory to learn if len(self.memory) > BATCH_SIZE: # sample experiences from memory experiences = self.memory.sample() # learn from sampled experiences self.learn(experiences, GAMMA) def act(self, state, eps: float = 0.): """ Returns actions for given state as per current policy. :param state: (array_like) current state :param eps: epsilon, for epsilon-greedy action selection """ state = torch.from_numpy(state).float().unsqueeze(0).to(DEVICE) self.network_local.eval() with torch.no_grad(): action_values = self.network_local(state) self.network_local.train() # Epsilon-greedy action selection if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) def learn(self, experiences, gamma: float): """ Update value parameters using given batch of experience tuples. :param experiences: (Tuple[torch.Tensor]) tuple of (s, a, r, s', done) tuples; :param gamma: discount factor. """ states, actions, rewards, next_states, dones = experiences # Get max predicted Q values (for next states) from target model Q_targets_next = self.network_target(next_states).detach().max( 1)[0].unsqueeze(1) # Compute Q targets for current states Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Get expected Q values from local model Q_expected = self.network_local(states).gather(1, actions) # Compute loss loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.optimizer.zero_grad() loss.backward() self.optimizer.step() # ------------------- update target network ------------------- # self.soft_update(self.network_local, self.network_target, TAU) @staticmethod def soft_update(local_model, target_model, tau: float): """ Soft update model parameters, θ_target = τ*θ_local + (1 - τ)*θ_target. :param local_model: (PyTorch model) weights will be copied from; :param target_model: (PyTorch model) weights will be copied to; :param tau: interpolation parameter. """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
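# A minimal driver loop for DQNAgent might look like the following sketch. The
# environment, episode count, and epsilon schedule are illustrative assumptions,
# not part of the original snippet.
import gym
import numpy as np

env = gym.make('CartPole-v1')            # illustrative environment
agent = DQNAgent(state_size=4, action_size=2, seed=0)

eps = 1.0
for episode in range(500):
    state = env.reset()
    done = False
    score = 0
    while not done:
        action = agent.act(state, eps)
        next_state, reward, done, _ = env.step(action)
        agent.step(state, action, reward, next_state, done)  # store, maybe learn
        state = next_state
        score += reward
    eps = max(0.01, 0.995 * eps)         # simple exponential epsilon decay
    print('episode %d score %.1f' % (episode, score))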
class Agent():
    def __init__(self, args, env):
        self.action_space = env.action_space()
        self.atoms = args.atoms
        self.Vmin = args.V_min
        self.Vmax = args.V_max
        self.support = np.linspace(args.V_min, args.V_max, self.atoms)  # Support (range) of z
        self.delta_z = (args.V_max - args.V_min) / (self.atoms - 1)
        self.batch_size = args.batch_size
        self.n = args.multi_steps
        self.discount = args.discount
        self.norm_clip = args.max_norm_clip

        self.sess = tf.Session()
        with tf.variable_scope("online_net"):
            self.online_net = DQN(args, self.action_space)
            self.online_net.train()
        with tf.variable_scope("target_net"):
            self.target_net = DQN(args, self.action_space)
            self.target_net.train()

        self.sess.run(tf.global_variables_initializer())
        self.saver = tf.train.Saver()
        if tf.gfile.Exists("./models/model.ckpt"):
            self.saver.restore(self.sess, "./models/model.ckpt")

        # Build an op that copies every online-net variable into the target net
        online_net_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="online_net")
        target_net_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="target_net")
        update_target_op = []
        for var, var_target in zip(sorted(online_net_func_vars, key=lambda v: v.name),
                                   sorted(target_net_func_vars, key=lambda v: v.name)):
            update_target_op.append(var_target.assign(var))
        self.update_target_op = tf.group(*update_target_op)
        self.update_target_net()

        self.optimizer = tf.train.AdamOptimizer(learning_rate=args.learning_rate,
                                                epsilon=args.adam_eps)

    def forward(self, network, inputs, log=False):
        if log:
            return self.sess.run(network.action_log, feed_dict={network.inputs: inputs})
        else:
            return self.sess.run(network.action, feed_dict={network.inputs: inputs})

    def reset_noise(self):
        self.online_net.reset_noise()

    def act(self, state):
        return np.argmax(np.sum(
            self.forward(self.online_net, state.reshape(1, 84, 84, 4)) * self.support, axis=-1))

    def act_e_greedy(self, state, epsilon=0.001):
        return random.randrange(self.action_space) if random.random() < epsilon else self.act(state)

    def learn(self, mem):
        idxs, states, actions, returns, next_states, nonterminals, weights = mem.sample(self.batch_size)

        log_ps = self.forward(self.online_net, states, log=True)
        log_ps_a = []
        for i in range(self.batch_size):
            log_ps_a.append(log_ps[i][actions[i]])

        # Select argmax actions for the next states with the online network (Double-Q)
        pns = self.forward(self.online_net, next_states)
        dns = np.broadcast_to(self.support, (self.action_space, self.atoms))
        dns = np.multiply(np.broadcast_to(dns, (self.batch_size, self.action_space, self.atoms)), pns)
        argmax_indices_ns = np.argmax(np.sum(dns, axis=2), axis=1)

        self.target_net.reset_noise()
        pns = self.forward(self.target_net, next_states)
        pns_a = pns[range(self.batch_size), argmax_indices_ns]

        # Compute Tz (Bellman operator T applied to z) and clamp to the support
        Tz = np.expand_dims(returns, axis=1) + (self.discount ** self.n) * np.multiply(
            nonterminals, np.expand_dims(self.support, axis=0))
        Tz = np.clip(Tz, self.Vmin, self.Vmax)

        # L2 projection of Tz onto the fixed support z
        b = (Tz - self.Vmin) / self.delta_z
        l, u = np.floor(b).astype(dtype=np.int64), np.ceil(b).astype(dtype=np.int64)
        # Fix disappearing probability mass when l = b = u (b is int)
        l[(u > 0) * (l == u)] -= 1
        u[(l < (self.atoms - 1)) * (l == u)] += 1

        # Distribute the probability of Tz; reshape(-1) is a view of m, so the
        # in-place adds land in m (flatten() would write into a copy)
        m = np.zeros([self.batch_size, self.atoms], dtype=states.dtype)
        offset = np.broadcast_to(
            np.expand_dims(np.linspace(0, ((self.batch_size - 1) * self.atoms), self.batch_size), axis=1),
            (self.batch_size, self.atoms)).astype(actions.dtype)
        np.add.at(m.reshape(-1), (l + offset).reshape(-1),
                  (pns_a * (u.astype(np.float32) - b)).reshape(-1))
        np.add.at(m.reshape(-1), (u + offset).reshape(-1),
                  (pns_a * (b - l.astype(np.float32))).reshape(-1))

        # Importance-weighted cross-entropy loss; note this snippet computes the
        # loss in NumPy and does not wire it into the TF optimizer built above
        loss = -np.sum(m * log_ps_a, 1)
        loss = weights * loss

    def update_target_net(self):
        self.sess.run(self.update_target_op)

    def save(self, path):
        self.save_path = self.saver.save(self.sess, "./models/model.ckpt")

    def evaluate_q(self, state):
        return np.sum(
            self.forward(self.online_net, state.reshape(1, 84, 84, 4)) * self.support,
            axis=-1).max(axis=1)[0]

    def train(self):
        self.online_net.train()

    def eval(self):
        self.online_net.eval()
def train(args):
    logger = setup_logger(args)
    logger.info('---- Options ----')
    for k, v in vars(args).items():
        logger.info(k + ': ' + str(v))
    logger.info('--------\n')

    if torch.cuda.is_available():
        torch.cuda.manual_seed(0)
    else:
        torch.manual_seed(0)

    if os.path.isdir(args.tensorboard_dir):
        shutil.rmtree(args.tensorboard_dir)
    os.makedirs(args.tensorboard_dir)
    if not os.path.exists(args.saved_dir):
        os.makedirs(args.saved_dir)

    writer = SummaryWriter(args.tensorboard_dir)
    env = Tetris(width=args.width, height=args.height, block_size=args.block_size,
                 sim_rom_mode=args.sim_rom_mode)
    state_dim = 25
    action_dim = 2
    device = torch.device('cuda:{}'.format(args.gpu) if torch.cuda.is_available() else 'cpu')
    model = DQN(input_dim=state_dim).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
    criterion = nn.MSELoss()

    state = env.reset()
    replay_memory = ReplayBufferOld(state_dim, action_dim, device=device,
                                    max_size=args.replay_memory_size)
    # action = [x_axis, rotate_times]
    episode = 0
    step_cnt = 0
    seed = 0
    random.seed(seed)

    while episode < args.num_episodes:
        next_steps = env.get_next_states()
        next_actions, next_states = zip(*next_steps.items())
        next_states = torch.stack(next_states)
        next_states = next_states.to(device)

        model.eval()
        with torch.no_grad():
            predictions = model(next_states)[:, 0]
        index = get_action_index(args, episode, predictions, next_steps)
        model.train()

        next_state = next_states[index, :]
        next_state = next_state.cpu().numpy()
        action = next_actions[index]
        reward, done = env.step(action, render=False)
        if step_cnt > args.max_episode_length:
            done = True
        replay_memory.add(state, action, next_state, reward, done)

        if done:
            final_score = env.score
            final_tetrominoes = env.tetrominoes
            final_cleared_lines = env.cleared_lines
            state = env.reset()
            step_cnt = 0
        else:
            state = next_state
            step_cnt += 1
            continue

        if len(replay_memory) < args.replay_memory_size / 10:
            # logger.info("Episode:%d Current Memory Size: %d" % (episode, len(replay_memory)))
            continue
        episode += 1

        batch = replay_memory.sample(args.batch_size)
        state_batch, action_batch, next_state_batch, reward_batch, done_batch = batch
        q_values = model(state_batch)
        model.eval()
        with torch.no_grad():
            next_prediction_batch = model(next_state_batch)
        model.train()

        next_prediction_batch[done_batch < 0.5] = 0.0
        y_batch = reward_batch + args.gamma * next_prediction_batch

        optimizer.zero_grad()
        loss = criterion(q_values, y_batch)
        loss.backward()
        optimizer.step()

        logger.info("Episode: {}/{}, Score: {}, Tetrominoes {}, Cleared lines: {}".format(
            episode, args.num_episodes, final_score, final_tetrominoes, final_cleared_lines))
        writer.add_scalar('Train/Score', final_score, episode - 1)
        writer.add_scalar('Train/Tetrominoes', final_tetrominoes, episode - 1)
        writer.add_scalar('Train/Cleared lines', final_cleared_lines, episode - 1)

        if episode > 2000 and episode % args.save_interval == 0:
            torch.save(model, "{}/tetris_{}.pth".format(args.saved_dir, episode))
        if episode % 100:
            random.seed(seed % 10)
            seed += 1

    torch.save(model, "{}/tetris.pth".format(args.saved_dir))
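# `get_action_index` is called above but not shown. In similar Tetris DQN
# trainers it implements linearly annealed epsilon-greedy selection over the
# enumerated placements; below is a sketch under that assumption. The
# `initial_epsilon`, `final_epsilon`, and `num_decay_epochs` args are
# hypothetical, not taken from the original source.
import random
import torch

def get_action_index(args, episode, predictions, next_steps):
    # Linearly anneal epsilon, then pick a random placement with
    # probability epsilon, otherwise the greedy one.
    epsilon = args.final_epsilon + max(args.num_decay_epochs - episode, 0) * (
        args.initial_epsilon - args.final_epsilon) / args.num_decay_epochs
    if random.random() <= epsilon:
        return random.randrange(len(next_steps))
    return torch.argmax(predictions).item()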
class Agent():
    def __init__(self, args, env):
        self.args = args
        self.action_space = env.action_space()
        self.atoms = args.atoms
        self.Vmin = args.V_min
        self.Vmax = args.V_max
        self.support = torch.linspace(args.V_min, args.V_max, self.atoms).to(device=args.device)  # Support (range) of z
        self.delta_z = (args.V_max - args.V_min) / (self.atoms - 1)
        self.batch_size = args.batch_size
        self.n = args.multi_step
        self.discount = args.discount
        self.norm_clip = args.norm_clip
        self.coeff = 0.01 if args.game in ['pong', 'boxing', 'private_eye', 'freeway'] else 1.

        self.online_net = DQN(args, self.action_space).to(device=args.device)
        self.momentum_net = DQN(args, self.action_space).to(device=args.device)
        # self.predictor = prediction_MLP(in_dim=128, hidden_dim=128, out_dim=128)
        if args.model:  # Load pretrained model if provided
            if os.path.isfile(args.model):
                state_dict = torch.load(args.model, map_location='cpu')  # Always load tensors onto CPU by default, will shift to GPU if necessary
                if 'conv1.weight' in state_dict.keys():
                    for old_key, new_key in (('conv1.weight', 'convs.0.weight'), ('conv1.bias', 'convs.0.bias'),
                                             ('conv2.weight', 'convs.2.weight'), ('conv2.bias', 'convs.2.bias'),
                                             ('conv3.weight', 'convs.4.weight'), ('conv3.bias', 'convs.4.bias')):
                        state_dict[new_key] = state_dict[old_key]  # Re-map state dict for old pretrained models
                        del state_dict[old_key]  # Delete old keys for strict load_state_dict
                self.online_net.load_state_dict(state_dict)
                print("Loading pretrained model: " + args.model)
            else:  # Raise error if incorrect model path provided
                raise FileNotFoundError(args.model)

        self.online_net.train()
        # self.pred.train()
        self.initialize_momentum_net()
        self.momentum_net.train()

        self.target_net = DQN(args, self.action_space).to(device=args.device)
        self.update_target_net()
        self.target_net.train()
        for param in self.target_net.parameters():
            param.requires_grad = False
        for param in self.momentum_net.parameters():
            param.requires_grad = False

        self.optimiser = optim.Adam(self.online_net.parameters(), lr=args.learning_rate, eps=args.adam_eps)

    # Resets noisy weights in all linear layers (of online net only)
    def reset_noise(self):
        self.online_net.reset_noise()

    # Acts based on single state (no batch)
    def act(self, state):
        with torch.no_grad():
            a, _, _ = self.online_net(state.unsqueeze(0))
            return (a * self.support).sum(2).argmax(1).item()

    # Acts with an ε-greedy policy (used for evaluation only)
    def act_e_greedy(self, state, epsilon=0.001):  # High ε can reduce evaluation scores drastically
        return np.random.randint(0, self.action_space) if np.random.random() < epsilon else self.act(state)

    def learn(self, mem):
        # Sample transitions
        idxs, states, actions, returns, next_states, nonterminals, weights = mem.sample(self.batch_size)

        aug_states_1 = aug(states).to(device=self.args.device)
        aug_states_2 = aug(states).to(device=self.args.device)

        # Calculate current state probabilities (online network noise already sampled)
        log_ps, _, _ = self.online_net(states, log=True)  # Log probabilities log p(s_t, ·; θonline)
        _, z_1, p_1 = self.online_net(aug_states_1, log=True)
        _, z_2, p_2 = self.online_net(aug_states_2, log=True)
        # p_1, p_2 = self.pred(z_1), self.pred(z_2)
        # with torch.no_grad():
        #     p_2 = self.pred(z_2)
        simsiam_loss = 2 + D(p_1, z_2) / 2 + D(p_2, z_1) / 2
        # Alternative auxiliary losses tried during development:
        # simsiam_loss = p_1.mean() + p_2.mean()
        # simsiam_loss = p_1.mean() * 128
        # simsiam_loss = - F.cosine_similarity(p_1, z_2.detach(), dim=-1).mean()
        # simsiam_loss = 0
        # _, z_target = self.momentum_net(aug_states_2, log=True)  # z_k
        # z_proj = torch.matmul(self.online_net.W, z_target.T)
        # logits = torch.matmul(z_anch, z_proj)
        # logits = (logits - torch.max(logits, 1)[0][:, None])
        # logits = logits * 0.1
        # labels = torch.arange(logits.shape[0]).long().to(device=self.args.device)
        # moco_loss = (nn.CrossEntropyLoss()(logits, labels)).to(device=self.args.device)

        log_ps_a = log_ps[range(self.batch_size), actions]  # log p(s_t, a_t; θonline)

        with torch.no_grad():
            # Calculate nth next state probabilities
            pns, _, _ = self.online_net(next_states)  # Probabilities p(s_t+n, ·; θonline)
            dns = self.support.expand_as(pns) * pns  # Distribution d_t+n = (z, p(s_t+n, ·; θonline))
            argmax_indices_ns = dns.sum(2).argmax(1)  # Perform argmax action selection using online network: argmax_a[(z, p(s_t+n, a; θonline))]
            self.target_net.reset_noise()  # Sample new target net noise
            pns, _, _ = self.target_net(next_states)  # Probabilities p(s_t+n, ·; θtarget)
            pns_a = pns[range(self.batch_size), argmax_indices_ns]  # Double-Q probabilities p(s_t+n, argmax_a[(z, p(s_t+n, a; θonline))]; θtarget)

            # Compute Tz (Bellman operator T applied to z)
            Tz = returns.unsqueeze(1) + nonterminals * (self.discount ** self.n) * self.support.unsqueeze(0)  # Tz = R^n + (γ^n)z (accounting for terminal states)
            Tz = Tz.clamp(min=self.Vmin, max=self.Vmax)  # Clamp between supported values
            # Compute L2 projection of Tz onto fixed support z
            b = (Tz - self.Vmin) / self.delta_z  # b = (Tz - Vmin) / Δz
            l, u = b.floor().to(torch.int64), b.ceil().to(torch.int64)
            # Fix disappearing probability mass when l = b = u (b is int)
            l[(u > 0) * (l == u)] -= 1
            u[(l < (self.atoms - 1)) * (l == u)] += 1

            # Distribute probability of Tz
            m = states.new_zeros(self.batch_size, self.atoms)
            offset = torch.linspace(0, ((self.batch_size - 1) * self.atoms), self.batch_size).unsqueeze(1).expand(self.batch_size, self.atoms).to(actions)
            m.view(-1).index_add_(0, (l + offset).view(-1), (pns_a * (u.float() - b)).view(-1))  # m_l = m_l + p(s_t+n, a*)(u - b)
            m.view(-1).index_add_(0, (u + offset).view(-1), (pns_a * (b - l.float())).view(-1))  # m_u = m_u + p(s_t+n, a*)(b - l)

        loss = -torch.sum(m * log_ps_a, 1)  # Cross-entropy loss (minimises DKL(m||p(s_t, a_t)))
        # loss = loss + (moco_loss * self.coeff)
        loss = loss + (simsiam_loss * self.coeff)
        self.online_net.zero_grad()
        # self.pred.zero_grad()
        curl_loss = (weights * loss).mean()
        curl_loss.mean().backward()  # Backpropagate importance-weighted minibatch loss
        clip_grad_norm_(self.online_net.parameters(), self.norm_clip)  # Clip gradients by L2 norm
        self.optimiser.step()

        mem.update_priorities(idxs, loss.detach().cpu().numpy())  # Update priorities of sampled transitions

    def learn_old(self, mem):
        # Sample transitions
        idxs, states, actions, returns, next_states, nonterminals, weights = mem.sample(self.batch_size)

        aug_states_1 = aug(states).to(device=self.args.device)
        aug_states_2 = aug(states).to(device=self.args.device)

        # Calculate current state probabilities (online network noise already sampled)
        log_ps, _, _ = self.online_net(states, log=True)  # Log probabilities log p(s_t, ·; θonline)
        _, z_anch, _ = self.online_net(aug_states_1, log=True)  # z_q
        _, z_target, _ = self.momentum_net(aug_states_2, log=True)  # z_k
        z_proj = torch.matmul(self.online_net.W, z_target.T)
        logits = torch.matmul(z_anch, z_proj)
        logits = (logits - torch.max(logits, 1)[0][:, None])
        logits = logits * 0.1
        labels = torch.arange(logits.shape[0]).long().to(device=self.args.device)
        moco_loss = (nn.CrossEntropyLoss()(logits, labels)).to(device=self.args.device)

        log_ps_a = log_ps[range(self.batch_size), actions]  # log p(s_t, a_t; θonline)

        with torch.no_grad():
            # Calculate nth next state probabilities
            pns, _, _ = self.online_net(next_states)  # Probabilities p(s_t+n, ·; θonline)
            dns = self.support.expand_as(pns) * pns  # Distribution d_t+n = (z, p(s_t+n, ·; θonline))
            argmax_indices_ns = dns.sum(2).argmax(1)  # Perform argmax action selection using online network: argmax_a[(z, p(s_t+n, a; θonline))]
            self.target_net.reset_noise()  # Sample new target net noise
            pns, _, _ = self.target_net(next_states)  # Probabilities p(s_t+n, ·; θtarget)
            pns_a = pns[range(self.batch_size), argmax_indices_ns]  # Double-Q probabilities p(s_t+n, argmax_a[(z, p(s_t+n, a; θonline))]; θtarget)

            # Compute Tz (Bellman operator T applied to z)
            Tz = returns.unsqueeze(1) + nonterminals * (self.discount ** self.n) * self.support.unsqueeze(0)  # Tz = R^n + (γ^n)z (accounting for terminal states)
            Tz = Tz.clamp(min=self.Vmin, max=self.Vmax)  # Clamp between supported values
            # Compute L2 projection of Tz onto fixed support z
            b = (Tz - self.Vmin) / self.delta_z  # b = (Tz - Vmin) / Δz
            l, u = b.floor().to(torch.int64), b.ceil().to(torch.int64)
            # Fix disappearing probability mass when l = b = u (b is int)
            l[(u > 0) * (l == u)] -= 1
            u[(l < (self.atoms - 1)) * (l == u)] += 1

            # Distribute probability of Tz
            m = states.new_zeros(self.batch_size, self.atoms)
            offset = torch.linspace(0, ((self.batch_size - 1) * self.atoms), self.batch_size).unsqueeze(1).expand(self.batch_size, self.atoms).to(actions)
            m.view(-1).index_add_(0, (l + offset).view(-1), (pns_a * (u.float() - b)).view(-1))  # m_l = m_l + p(s_t+n, a*)(u - b)
            m.view(-1).index_add_(0, (u + offset).view(-1), (pns_a * (b - l.float())).view(-1))  # m_u = m_u + p(s_t+n, a*)(b - l)

        loss = -torch.sum(m * log_ps_a, 1)  # Cross-entropy loss (minimises DKL(m||p(s_t, a_t)))
        print(moco_loss)
        loss = loss + (moco_loss * self.coeff)
        self.online_net.zero_grad()
        curl_loss = (weights * loss).mean()
        curl_loss.mean().backward()  # Backpropagate importance-weighted minibatch loss
        clip_grad_norm_(self.online_net.parameters(), self.norm_clip)  # Clip gradients by L2 norm
        self.optimiser.step()

        mem.update_priorities(idxs, loss.detach().cpu().numpy())  # Update priorities of sampled transitions

    def update_target_net(self):
        self.target_net.load_state_dict(self.online_net.state_dict())

    def initialize_momentum_net(self):
        for param_q, param_k in zip(self.online_net.parameters(), self.momentum_net.parameters()):
            param_k.data.copy_(param_q.data)  # initialise from the online net
            param_k.requires_grad = False  # not updated by gradient

    # Code for this function from https://github.com/facebookresearch/moco
    @torch.no_grad()
    def update_momentum_net(self, momentum=0.999):
        for param_q, param_k in zip(self.online_net.parameters(), self.momentum_net.parameters()):
            param_k.data.copy_(momentum * param_k.data + (1. - momentum) * param_q.data)  # EMA update

    # Save model parameters on current device (don't move model between devices)
    def save(self, path, name='model.pth'):
        torch.save(self.online_net.state_dict(), os.path.join(path, name))

    # Evaluates Q-value based on single state (no batch)
    def evaluate_q(self, state):
        with torch.no_grad():
            a, _, _ = self.online_net(state.unsqueeze(0))
            return (a * self.support).sum(2).max(1)[0].item()

    def train(self):
        self.online_net.train()

    def eval(self):
        self.online_net.eval()
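# `aug` and `D` are used in `learn` above without being defined in the snippet.
# In the SimSiam formulation this loss follows, `D` is the negative cosine
# similarity with a stop-gradient on the target branch; below is a sketch under
# that assumption (not taken from the original source). The constant 2 added to
# `simsiam_loss` above then just shifts the loss to be non-negative.
import torch.nn.functional as F

def D(p, z):
    # Negative cosine similarity; z.detach() applies the stop-gradient.
    return -F.cosine_similarity(p, z.detach(), dim=-1).mean()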
class Agent():
    def __init__(self, args, env):
        self.action_space = env.action_space()
        self.atoms = args.atoms
        self.Vmin = args.V_min
        self.Vmax = args.V_max
        self.support = torch.linspace(args.V_min, args.V_max, self.atoms).to(device=args.device)  # Support (range) of z
        self.delta_z = (args.V_max - args.V_min) / (self.atoms - 1)
        self.batch_size = args.batch_size
        self.n = args.multi_step
        self.discount = args.discount
        self.norm_clip = args.norm_clip

        self.online_net = DQN(args, self.action_space).to(device=args.device)
        if args.model:  # Load pretrained model if provided
            if os.path.isfile(args.model):
                state_dict = torch.load(args.model, map_location='cpu')  # Always load tensors onto CPU by default, will shift to GPU if necessary
                if 'conv1.weight' in state_dict.keys():
                    for old_key, new_key in (('conv1.weight', 'convs.0.weight'), ('conv1.bias', 'convs.0.bias'),
                                             ('conv2.weight', 'convs.2.weight'), ('conv2.bias', 'convs.2.bias'),
                                             ('conv3.weight', 'convs.4.weight'), ('conv3.bias', 'convs.4.bias')):
                        state_dict[new_key] = state_dict[old_key]  # Re-map state dict for old pretrained models
                        del state_dict[old_key]  # Delete old keys for strict load_state_dict
                self.online_net.load_state_dict(state_dict)
                print("Loading pretrained model: " + args.model)
            else:  # Raise error if incorrect model path provided
                raise FileNotFoundError(args.model)

        self.online_net.train()

        self.target_net = DQN(args, self.action_space).to(device=args.device)
        self.update_target_net()
        self.target_net.train()
        for param in self.target_net.parameters():
            param.requires_grad = False

        # Separate optimisers so the conv encoder can be frozen independently
        # self.optimiser = optim.Adam(self.online_net.parameters(), lr=args.learning_rate, eps=args.adam_eps)
        self.convs_optimiser = optim.Adam(self.online_net.convs.parameters(),
                                          lr=args.learning_rate, eps=args.adam_eps)
        self.linear_optimiser = optim.Adam(chain(self.online_net.fc_h_v.parameters(),
                                                 self.online_net.fc_h_a.parameters(),
                                                 self.online_net.fc_z_v.parameters(),
                                                 self.online_net.fc_z_a.parameters()),
                                           lr=args.learning_rate, eps=args.adam_eps)

    # Resets noisy weights in all linear layers (of online net only)
    def reset_noise(self):
        self.online_net.reset_noise()

    # Acts based on single state (no batch)
    def act(self, state):
        with torch.no_grad():
            # don't count these calls since it is accounted for after "action = dqn.act(state)" in main.py
            ret = (self.online_net(state.unsqueeze(0)) * self.support).sum(2).argmax(1).item()
            return ret

    # Acts with an ε-greedy policy (used for evaluation only)
    def act_e_greedy(self, state, epsilon=0.001):  # High ε can reduce evaluation scores drastically
        return np.random.randint(0, self.action_space) if np.random.random() < epsilon else self.act(state)

    def learn(self, mem, freeze=False):
        # Sample transitions
        idxs, states, actions, returns, next_states, nonterminals, weights, _ = mem.sample(self.batch_size)

        # Calculate current state probabilities (online network noise already sampled)
        log_ps = self.online_net(states, log=True)  # Log probabilities log p(s_t, ·; θonline)
        log_ps_a = log_ps[range(self.batch_size), actions]  # log p(s_t, a_t; θonline)

        with torch.no_grad():
            # Calculate nth next state probabilities
            pns = self.online_net(next_states)  # Probabilities p(s_t+n, ·; θonline)
            dns = self.support.expand_as(pns) * pns  # Distribution d_t+n = (z, p(s_t+n, ·; θonline))
            argmax_indices_ns = dns.sum(2).argmax(1)  # Perform argmax action selection using online network: argmax_a[(z, p(s_t+n, a; θonline))]
            self.target_net.reset_noise()  # Sample new target net noise
            pns = self.target_net(next_states)  # Probabilities p(s_t+n, ·; θtarget)
            pns_a = pns[range(self.batch_size), argmax_indices_ns]  # Double-Q probabilities p(s_t+n, argmax_a[(z, p(s_t+n, a; θonline))]; θtarget)

            # Compute Tz (Bellman operator T applied to z)
            Tz = returns.unsqueeze(1) + nonterminals * (self.discount ** self.n) * self.support.unsqueeze(0)  # Tz = R^n + (γ^n)z (accounting for terminal states)
            Tz = Tz.clamp(min=self.Vmin, max=self.Vmax)  # Clamp between supported values
            # Compute L2 projection of Tz onto fixed support z
            b = (Tz - self.Vmin) / self.delta_z  # b = (Tz - Vmin) / Δz
            l, u = b.floor().to(torch.int64), b.ceil().to(torch.int64)
            # Fix disappearing probability mass when l = b = u (b is int)
            l[(u > 0) * (l == u)] -= 1
            u[(l < (self.atoms - 1)) * (l == u)] += 1

            # Distribute probability of Tz
            m = states.new_zeros(self.batch_size, self.atoms)
            offset = torch.linspace(0, ((self.batch_size - 1) * self.atoms), self.batch_size).unsqueeze(1).expand(self.batch_size, self.atoms).to(actions)
            m.view(-1).index_add_(0, (l + offset).view(-1), (pns_a * (u.float() - b)).view(-1))  # m_l = m_l + p(s_t+n, a*)(u - b)
            m.view(-1).index_add_(0, (u + offset).view(-1), (pns_a * (b - l.float())).view(-1))  # m_u = m_u + p(s_t+n, a*)(b - l)

        loss = -torch.sum(m * log_ps_a, 1)  # Cross-entropy loss (minimises DKL(m||p(s_t, a_t)))
        self.online_net.zero_grad()
        loss.mean().backward()  # Backpropagate minibatch loss
        clip_grad_norm_(self.online_net.parameters(), self.norm_clip)  # Clip gradients by L2 norm
        # self.optimiser.step()
        if not freeze:
            self.convs_optimiser.step()
        self.linear_optimiser.step()

    def learn_with_latent(self, latent_mem):
        # Sample transitions
        idxs, states, actions, returns, next_states, nonterminals, weights, ns = latent_mem.sample(self.batch_size)

        # Calculate current state probabilities (online network noise already sampled)
        log_ps = self.online_net.forward_with_latent(states, log=True)  # Log probabilities log p(s_t, ·; θonline)
        log_ps_a = log_ps[range(self.batch_size), actions]  # log p(s_t, a_t; θonline)

        with torch.no_grad():
            # Calculate nth next state probabilities
            pns = self.online_net.forward_with_latent(next_states)  # Probabilities p(s_t+n, ·; θonline)
            dns = self.support.expand_as(pns) * pns  # Distribution d_t+n = (z, p(s_t+n, ·; θonline))
            argmax_indices_ns = dns.sum(2).argmax(1)  # Perform argmax action selection using online network: argmax_a[(z, p(s_t+n, a; θonline))]
            self.target_net.reset_noise()  # Sample new target net noise
            pns = self.target_net.forward_with_latent(next_states)  # Probabilities p(s_t+n, ·; θtarget)
            pns_a = pns[range(self.batch_size), argmax_indices_ns]  # Double-Q probabilities p(s_t+n, argmax_a[(z, p(s_t+n, a; θonline))]; θtarget)

            # Use ns instead of self.n since n is possibly different for each sequence in the batch
            ns = torch.tensor(ns, device=latent_mem.device).unsqueeze(1)
            # Compute Tz (Bellman operator T applied to z)
            Tz = returns.unsqueeze(1) + nonterminals * (self.discount ** ns) * self.support.unsqueeze(0)  # Tz = R^n + (γ^n)z (accounting for terminal states)
            Tz = Tz.clamp(min=self.Vmin, max=self.Vmax)  # Clamp between supported values
            # Compute L2 projection of Tz onto fixed support z
            b = (Tz - self.Vmin) / self.delta_z  # b = (Tz - Vmin) / Δz
            l, u = b.floor().to(torch.int64), b.ceil().to(torch.int64)
            # Fix disappearing probability mass when l = b = u (b is int)
            l[(u > 0) * (l == u)] -= 1
            u[(l < (self.atoms - 1)) * (l == u)] += 1

            # Distribute probability of Tz
            m = states.new_zeros(self.batch_size, self.atoms)
            offset = torch.linspace(0, ((self.batch_size - 1) * self.atoms), self.batch_size).unsqueeze(1).expand(self.batch_size, self.atoms).to(actions)
            m.view(-1).index_add_(0, (l + offset).view(-1), (pns_a * (u.float() - b)).view(-1))  # m_l = m_l + p(s_t+n, a*)(u - b)
            m.view(-1).index_add_(0, (u + offset).view(-1), (pns_a * (b - l.float())).view(-1))  # m_u = m_u + p(s_t+n, a*)(b - l)

        loss = -torch.sum(m * log_ps_a, 1)  # Cross-entropy loss (minimises DKL(m||p(s_t, a_t)))
        self.online_net.zero_grad()
        loss.mean().backward()  # Backpropagate minibatch loss
        clip_grad_norm_(self.online_net.parameters(), self.norm_clip)  # Clip gradients by L2 norm
        # self.optimiser.step()
        self.linear_optimiser.step()

    def update_target_net(self):
        self.target_net.load_state_dict(self.online_net.state_dict())

    # Save model parameters on current device (don't move model between devices)
    def save(self, path, name='model.pth'):
        torch.save(self.online_net.state_dict(), os.path.join(path, name))

    # Evaluates Q-value based on single state (no batch)
    def evaluate_q(self, state):
        with torch.no_grad():
            return (self.online_net(state.unsqueeze(0)) * self.support).sum(2).max(1)[0].item()

    def train(self):
        self.online_net.train()

    def eval(self):
        self.online_net.eval()
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, seed, gamma=0.99, step_size=1, dueling_dqn=False): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) # Q-Network if dueling_dqn: print("Use dueling dqn") self.qnetwork_local = NoisyDuelingDQN(state_size, action_size, seed).to(device) self.qnetwork_target = NoisyDuelingDQN(state_size, action_size, seed).to(device) else: print("Use non-dueling dqn") self.qnetwork_local = DQN(state_size, action_size, seed).to(device) self.qnetwork_target = DQN(state_size, action_size, seed).to(device) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 self.gamma = gamma self.step_size = step_size def step(self, state, action, reward, next_state, done): # Save experience in replay memory self.memory.add(state, action, reward, next_state, done) # Learn every UPDATE_EVERY time steps. self.t_step = (self.t_step + 1) % UPDATE_EVERY if self.t_step == 0: # If enough samples are available in memory, get random subset and learn if len(self.memory) > BATCH_SIZE: experiences = self.memory.sample() self.learn(experiences) def act(self, state): """Returns actions for given state as per current policy. Params ====== state (array_like): current state """ state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.qnetwork_local.eval() with torch.no_grad(): action_values = self.qnetwork_local(state) self.qnetwork_local.train() return np.argmax(action_values.cpu().data.numpy()) def learn(self, experiences): """Update value parameters using given batch of experience tuples. Params ====== experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # Compute and minimize loss # Get max predicted Q values (for next states) from target model Q_targets_next = self.qnetwork_target(next_states).detach().max( 1)[0].unsqueeze(1) # Compute Q targets for current states ## gamma ^ step_size for nstep dqn Q_targets = rewards + (pow(self.gamma, self.step_size) * Q_targets_next * (1 - dones)) # Get expected Q values from local model Q_expected = self.qnetwork_local(states).gather(1, actions) # Compute loss loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.optimizer.zero_grad() loss.backward() self.optimizer.step() # ------------------- update target network ------------------- # self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model (PyTorch model): weights will be copied from target_model (PyTorch model): weights will be copied to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
class Agent(object): """ all improvments from Rainbow research work """ def __init__(self, args, state_size, action_size): """ Args: param1 (args): args param2 (int): args param3 (int): args """ self.action_size = action_size self.state_size = state_size self.atoms = args.atoms self.V_min = args.V_min self.V_max = args.V_max self.device = args.device self.support = torch.linspace(args.V_min, args.V_max, self.atoms).to( device=self.device) # Support (range) of z self.delta_z = (args.V_max - args.V_min) / (self.atoms - 1) self.batch_size = args.batch_size self.n = args.multi_step self.discount = args.discount self.qnetwork_local = DQN(args, self.state_size, self.action_size).to(device=args.device) if args.model and os.path.isfile(args.model): # Always load tensors onto CPU by default, will shift to GPU if necessary self.qnetwork_local.load_state_dict( torch.load(args.model, map_location='cpu')) self.qnetwork_local.train() self.target_net = DQN(args, self.state_size, self.action_size).to(device=args.device) self.update_target_net() self.target_net.train() for param in self.target_net.parameters(): param.requires_grad = False self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=args.lr, eps=args.adam_eps) def reset_noise(self): """ resets noisy weights in all linear layers """ self.qnetwork_local.reset_noise() def act(self, state): """ acts greedy(max) based on a single state Args: param1 (int) : state """ with torch.no_grad(): return (self.qnetwork_local(state.unsqueeze(0).to(self.device)) * self.support).sum(2).argmax(1).item() def act_e_greedy(self, state, epsilon=0.001): """ acts with epsilon greedy policy epsilon exploration vs exploitation traide off Args: param1(int): state param2(float): epsilon Return : action int number between 0 and 4 """ return np.random.randint( 0, self.action_size) if np.random.random() < epsilon else self.act( state) def learn(self, mem): """ uses samples with the given batch size to improve the Q function Args: param1 (Experince Replay Buffer) : mem """ # Sample transitions idxs, states, actions, returns, next_states, nonterminals, weights = mem.sample( self.batch_size) # Calculate current state probabilities (online network noise already sampled) log_ps = self.qnetwork_local( states, log=True) # Log probabilities log p(s_t, *; theta online) log_ps_a = log_ps[range(self.batch_size), actions] # log p(s_t, a_t; theat online) with torch.no_grad(): # Calculate nth next state probabilities pns = self.qnetwork_local( next_states) # Probabilities p(s_t+n, *; theta online) dns = self.support.expand_as( pns ) * pns # Distribution d_t+n = (z, p(s_t+n, *; theat online)) argmax_indices_ns = dns.sum(2).argmax( 1 ) # Perform argmax action selection using online network: argmax_a[(z, p(s_t+n, a; theat online))] self.target_net.reset_noise() # Sample new target net noise pns = self.target_net( next_states) # Probabilities p(s_t+n, ; theata target) pns_a = pns[range( self.batch_size ), argmax_indices_ns] # Double-Q probabilities p(s_t+n, argmax_a[(z, p(s_t+n, a; theat online))]; theat target) # Compute Tz (Bellman operator T applied to z) Tz = returns.unsqueeze(1) + nonterminals * ( self.discount**self.n ) * self.support.unsqueeze( 0) # Tz = R^n + (discoit ^n)z (accounting for terminal states) Tz = Tz.clamp(min=self.V_min, max=self.V_max) # Clamp between supported values # Compute L2 projection of Tz onto fixed support z b = (Tz - self.V_min) / self.delta_z # b = (Tz - Vmin) / delta z l, u = b.floor().to(torch.int64), b.ceil().to(torch.int64) # Fix disappearing 
probability mass when l = b = u (b is int) l[(u > 0) * (l == u)] -= 1 u[(l < (self.atoms - 1)) * (l == u)] += 1 # Distribute probability of Tz m = states.new_zeros(self.batch_size, self.atoms) offset = torch.linspace(0, ((self.batch_size - 1) * self.atoms), self.batch_size).unsqueeze(1).expand( self.batch_size, self.atoms).to(actions) m.view(-1).index_add_( 0, (l + offset).view(-1), (pns_a * (u.float() - b)).view(-1)) # m_l = m_l + p(s_t+n, a*)(u - b) m.view(-1).index_add_( 0, (u + offset).view(-1), (pns_a * (b - l.float())).view(-1)) # m_u = m_u + p(s_t+n, a*)(b - l) loss = -torch.sum( m * log_ps_a, 1) # Cross-entropy loss (minimises DKL(m||p(s_t, a_t))) self.qnetwork_local.zero_grad() (weights * loss).mean().backward( ) # Backpropagate importance-weighted minibatch loss self.optimizer.step() mem.update_priorities(idxs, loss.detach().cpu().numpy() ) # Update priorities of sampled transitions self.soft_update() def soft_update(self, tau=1e-3): """ swaps the network weights from the online to the target Args: param1 (float): tau """ for target_param, local_param in zip(self.target_net.parameters(), self.qnetwork_local.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data) def update_target_net(self): """ copy the model weights from the online to the target network """ self.target_net.load_state_dict(self.qnetwork_local.state_dict()) def save(self, path): """ save the model weights to a file Args: param1 (string): pathname """ torch.save(self.qnetwork_local.state_dict(), os.path.join(path, 'model.pth')) def evaluate_q(self, state): """ Evaluates Q-value based on single state """ with torch.no_grad(): return (self.qnetwork_local(state.unsqueeze(0)) * self.support).sum(2).max(1)[0].item() def train(self): """ activates the backprob. layers for the online network """ self.qnetwork_local.train() def eval(self): """ invoke the eval from the online network deactivates the backprob layers like dropout will work in eval model instead """ self.qnetwork_local.eval()
class Agent:
    def __init__(self):
        self.mode = "train"
        with open("config.yaml") as reader:
            self.config = yaml.safe_load(reader)
        print(self.config)
        self.load_config()

        self.online_net = DQN(config=self.config, word_vocab=self.word_vocab,
                              char_vocab=self.char_vocab, answer_type=self.answer_type)
        self.target_net = DQN(config=self.config, word_vocab=self.word_vocab,
                              char_vocab=self.char_vocab, answer_type=self.answer_type)
        self.online_net.train()
        self.target_net.train()
        self.update_target_net()
        for param in self.target_net.parameters():
            param.requires_grad = False

        if self.use_cuda:
            self.online_net.cuda()
            self.target_net.cuda()

        self.naozi = ObservationPool(capacity=self.naozi_capacity)
        # optimizer
        self.optimizer = torch.optim.Adam(
            self.online_net.parameters(),
            lr=self.config['training']['optimizer']['learning_rate'])
        self.clip_grad_norm = self.config['training']['optimizer']['clip_grad_norm']

    def load_config(self):
        # word vocab
        with open("vocabularies/word_vocab.txt") as f:
            self.word_vocab = f.read().split("\n")
        self.word2id = {}
        for i, w in enumerate(self.word_vocab):
            self.word2id[w] = i
        # char vocab
        with open("vocabularies/char_vocab.txt") as f:
            self.char_vocab = f.read().split("\n")
        self.char2id = {}
        for i, w in enumerate(self.char_vocab):
            self.char2id[w] = i
        self.EOS_id = self.word2id["</s>"]

        self.train_data_size = self.config['general']['train_data_size']
        self.question_type = self.config['general']['question_type']
        self.random_map = self.config['general']['random_map']
        self.testset_path = self.config['general']['testset_path']
        self.naozi_capacity = self.config['general']['naozi_capacity']
        self.eval_folder = pjoin(self.testset_path, self.question_type,
                                 ("random_map" if self.random_map else "fixed_map"))
        self.eval_data_path = pjoin(self.testset_path, "data.json")

        self.batch_size = self.config['training']['batch_size']
        self.max_nb_steps_per_episode = self.config['training']['max_nb_steps_per_episode']
        self.max_episode = self.config['training']['max_episode']
        self.target_net_update_frequency = self.config['training']['target_net_update_frequency']
        self.learn_start_from_this_episode = self.config['training']['learn_start_from_this_episode']

        self.run_eval = self.config['evaluate']['run_eval']
        self.eval_batch_size = self.config['evaluate']['batch_size']
        self.eval_max_nb_steps_per_episode = self.config['evaluate']['max_nb_steps_per_episode']

        # Set the random seed manually for reproducibility.
        self.random_seed = self.config['general']['random_seed']
        np.random.seed(self.random_seed)
        torch.manual_seed(self.random_seed)
        if torch.cuda.is_available():
            if not self.config['general']['use_cuda']:
                print("WARNING: CUDA device detected but 'use_cuda: false' found in config.yaml")
                self.use_cuda = False
            else:
                torch.backends.cudnn.deterministic = True
                torch.cuda.manual_seed(self.random_seed)
                self.use_cuda = True
        else:
            self.use_cuda = False

        if self.question_type == "location":
            self.answer_type = "pointing"
        elif self.question_type in ["attribute", "existence"]:
            self.answer_type = "2 way"
        else:
            raise NotImplementedError

        self.save_checkpoint = self.config['checkpoint']['save_checkpoint']
        self.experiment_tag = self.config['checkpoint']['experiment_tag']
        self.save_frequency = self.config['checkpoint']['save_frequency']
        self.load_pretrained = self.config['checkpoint']['load_pretrained']
        self.load_from_tag = self.config['checkpoint']['load_from_tag']

        self.qa_loss_lambda = self.config['training']['qa_loss_lambda']
        self.interaction_loss_lambda = self.config['training']['interaction_loss_lambda']

        # replay buffer and updates
        self.discount_gamma = self.config['replay']['discount_gamma']
        self.replay_batch_size = self.config['replay']['replay_batch_size']
        self.command_generation_replay_memory = command_generation_memory.PrioritizedReplayMemory(
            self.config['replay']['replay_memory_capacity'],
            priority_fraction=self.config['replay']['replay_memory_priority_fraction'],
            discount_gamma=self.discount_gamma)
        self.qa_replay_memory = qa_memory.PrioritizedReplayMemory(
            self.config['replay']['replay_memory_capacity'], priority_fraction=0.0)
        self.update_per_k_game_steps = self.config['replay']['update_per_k_game_steps']
        self.multi_step = self.config['replay']['multi_step']

        # distributional RL
        self.use_distributional = self.config['distributional']['enable']
        self.atoms = self.config['distributional']['atoms']
        self.v_min = self.config['distributional']['v_min']
        self.v_max = self.config['distributional']['v_max']
        self.support = torch.linspace(self.v_min, self.v_max, self.atoms)  # Support (range) of z
        if self.use_cuda:
            self.support = self.support.cuda()
        self.delta_z = (self.v_max - self.v_min) / (self.atoms - 1)

        # dueling networks
        self.dueling_networks = self.config['dueling_networks']

        # double dqn
        self.double_dqn = self.config['double_dqn']

        # counting reward
        self.revisit_counting_lambda_anneal_episodes = self.config[
            'episodic_counting_bonus']['revisit_counting_lambda_anneal_episodes']
        self.revisit_counting_lambda_anneal_from = self.config[
            'episodic_counting_bonus']['revisit_counting_lambda_anneal_from']
        self.revisit_counting_lambda_anneal_to = self.config[
            'episodic_counting_bonus']['revisit_counting_lambda_anneal_to']
        self.revisit_counting_lambda = self.revisit_counting_lambda_anneal_from

        # valid command bonus
        self.valid_command_bonus_lambda = self.config['valid_command_bonus_lambda']

        # epsilon greedy
        self.epsilon_anneal_episodes = self.config['epsilon_greedy']['epsilon_anneal_episodes']
        self.epsilon_anneal_from = self.config['epsilon_greedy']['epsilon_anneal_from']
        self.epsilon_anneal_to = self.config['epsilon_greedy']['epsilon_anneal_to']
        self.epsilon = self.epsilon_anneal_from
        self.noisy_net = self.config['epsilon_greedy']['noisy_net']
        if self.noisy_net:
            # disable epsilon greedy
            self.epsilon_anneal_episodes = -1
            self.epsilon = 0.0

        self.nlp = spacy.load('en', disable=['ner', 'parser', 'tagger'])
        self.single_word_verbs = set(["inventory", "look", "wait"])
        self.two_word_verbs = set(["go"])

    def train(self):
        """Tell the agent that it's the training phase."""
        self.mode = "train"
        self.online_net.train()

    def eval(self):
        """Tell the agent that it's the evaluation phase."""
        self.mode = "eval"
        self.online_net.eval()

    def update_target_net(self):
        self.target_net.load_state_dict(self.online_net.state_dict())

    def reset_noise(self):
        if self.noisy_net:
            # Resets noisy weights in all linear layers (of online net only)
            self.online_net.reset_noise()

    def zero_noise(self):
        if self.noisy_net:
            self.online_net.zero_noise()
            self.target_net.zero_noise()

    def load_pretrained_model(self, load_from):
        """
        Load pretrained checkpoint from file.

        Arguments:
            load_from: File name of the pretrained model checkpoint.
        """
        print("loading model from %s\n" % (load_from))
        try:
            if self.use_cuda:
                state_dict = torch.load(load_from)
            else:
                state_dict = torch.load(load_from, map_location='cpu')
            self.online_net.load_state_dict(state_dict)
        except:
            print("Failed to load checkpoint...")

    def save_model_to_path(self, save_to):
        torch.save(self.online_net.state_dict(), save_to)
        print("Saved checkpoint to %s..." % (save_to))

    def init(self, obs, infos):
        """
        Prepare the agent for the upcoming games.

        Arguments:
            obs: Previous command's feedback for each game.
            infos: Additional information for each game.
        """
        # reset agent, get vocabulary masks for verbs / adjectives / nouns
        batch_size = len(obs)
        self.reset_binarized_counter(batch_size)
        self.not_finished_yet = np.ones((batch_size,), dtype="float32")
        self.prev_actions = [["" for _ in range(batch_size)]]
        # all 1s; an entry becomes 0 once that game's previous action is "wait"
        self.prev_step_is_still_interacting = np.ones((batch_size,), dtype="float32")
        self.naozi.reset(batch_size=batch_size)

    def get_agent_inputs(self, string_list):
        sentence_token_list = [item.split() for item in string_list]
        sentence_id_list = [_words_to_ids(tokens, self.word2id) for tokens in sentence_token_list]
        input_sentence_char = list_of_token_list_to_char_input(sentence_token_list, self.char2id)
        input_sentence = pad_sequences(sentence_id_list, maxlen=max_len(sentence_id_list)).astype('int32')
        input_sentence = to_pt(input_sentence, self.use_cuda)
        input_sentence_char = to_pt(input_sentence_char, self.use_cuda)
        return input_sentence, input_sentence_char, sentence_id_list

    def get_game_info_at_certain_step(self, obs, infos):
        """
        Get all needed info from the game engine for training.

        Arguments:
            obs: Previous command's feedback for each game.
            infos: Additional information for each game.
        """
        batch_size = len(obs)
        feedback_strings = [preproc(item, tokenizer=self.nlp) for item in obs]
        description_strings = [preproc(item, tokenizer=self.nlp) for item in infos["description"]]
        observation_strings = [
            d + " <|> " + fb if fb != d else d + " <|> hello"
            for fb, d in zip(feedback_strings, description_strings)
        ]

        inventory_strings = [preproc(item, tokenizer=self.nlp) for item in infos["inventory"]]
        local_word_list = [obs.split() + inv.split()
                           for obs, inv in zip(observation_strings, inventory_strings)]

        directions = ["east", "west", "north", "south"]
        if self.question_type in ["location", "existence"]:
            # the agent observes the environment but does not change it
            possible_verbs = [["go", "inventory", "wait", "open", "examine"] for _ in range(batch_size)]
        else:
            possible_verbs = [list(set(item) - set(["", "look"])) for item in infos["verbs"]]

        possible_adjs, possible_nouns = [], []
        for i in range(batch_size):
            object_nouns = [item.split()[-1] for item in infos["object_nouns"][i]]
            object_adjs = [w for item in infos["object_adjs"][i] for w in item.split()]
            possible_nouns.append(list(set(object_nouns) & set(local_word_list[i]) - set([""])) + directions)
            possible_adjs.append(list(set(object_adjs) & set(local_word_list[i]) - set([""])) + ["</s>"])

        return observation_strings, [possible_verbs, possible_adjs, possible_nouns]

    def get_state_strings(self, infos):
        description_strings = infos["description"]
        inventory_strings = infos["inventory"]
        observation_strings = [_d + _i for (_d, _i) in zip(description_strings, inventory_strings)]
        return observation_strings

    def get_local_word_masks(self, possible_words):
        possible_verbs, possible_adjs, possible_nouns = possible_words
        batch_size = len(possible_verbs)

        verb_mask = np.zeros((batch_size, len(self.word_vocab)), dtype="float32")
        noun_mask = np.zeros((batch_size, len(self.word_vocab)), dtype="float32")
        adj_mask = np.zeros((batch_size, len(self.word_vocab)), dtype="float32")
        for i in range(batch_size):
            for w in possible_verbs[i]:
                if w in self.word2id:
                    verb_mask[i][self.word2id[w]] = 1.0
            for w in possible_adjs[i]:
                if w in self.word2id:
                    adj_mask[i][self.word2id[w]] = 1.0
            for w in possible_nouns[i]:
                if w in self.word2id:
                    noun_mask[i][self.word2id[w]] = 1.0
        adj_mask[:, self.EOS_id] = 1.0

        return [verb_mask, adj_mask, noun_mask]

    def get_match_representations(self, input_observation, input_observation_char,
                                  input_quest, input_quest_char, use_model="online"):
        model = self.online_net if use_model == "online" else self.target_net
        description_representation_sequence, description_mask = model.representation_generator(
            input_observation, input_observation_char)
        quest_representation_sequence, quest_mask = model.representation_generator(
            input_quest, input_quest_char)
        match_representation_sequence = model.get_match_representations(
            description_representation_sequence, description_mask,
            quest_representation_sequence, quest_mask)
        match_representation_sequence = match_representation_sequence * description_mask.unsqueeze(-1)
        return match_representation_sequence

    def get_ranks(self, input_observation, input_observation_char, input_quest,
                  input_quest_char, word_masks, use_model="online"):
        """Given input observation and question tensors, get Q values of words."""
        model = self.online_net if use_model == "online" else self.target_net
        match_representation_sequence = self.get_match_representations(
            input_observation, input_observation_char, input_quest, input_quest_char,
            use_model=use_model)
        action_ranks = model.action_scorer(match_representation_sequence, word_masks)  # list of 3 tensors
        return action_ranks

    def choose_maxQ_command(self, action_ranks, word_mask=None):
        """Generate a command by maximum Q values, for epsilon greedy."""
        if self.use_distributional:
            action_ranks = [(item * self.support).sum(2) for item in action_ranks]  # list of batch x n_vocab
        action_indices = []
        for i in range(len(action_ranks)):
            ar = action_ranks[i]
            # subtract the min value so that all values are non-negative
            ar = ar - torch.min(ar, -1, keepdim=True)[0] + 1e-2
            if word_mask is not None:
                assert word_mask[i].size() == ar.size(), (word_mask[i].size().shape, ar.size())
                ar = ar * word_mask[i]
            action_indices.append(torch.argmax(ar, -1))  # batch
        return action_indices

    def choose_random_command(self, batch_size, action_space_size, possible_words=None):
        """Generate a command randomly, for epsilon greedy."""
        action_indices = []
        for i in range(3):
            if possible_words is None:
                indices = np.random.choice(action_space_size, batch_size)
            else:
                indices = []
                for j in range(batch_size):
                    mask_ids = []
                    for w in possible_words[i][j]:
                        if w in self.word2id:
                            mask_ids.append(self.word2id[w])
                    indices.append(np.random.choice(mask_ids))
                indices = np.array(indices)
            action_indices.append(to_pt(indices, self.use_cuda))  # batch
        return action_indices

    def get_chosen_strings(self, chosen_indices):
        """
        Turns list of word indices into actual command strings.

        chosen_indices: Word indices chosen by model.
        """
        chosen_indices_np = [to_np(item) for item in chosen_indices]
        res_str = []
        batch_size = chosen_indices_np[0].shape[0]
        for i in range(batch_size):
            verb, adj, noun = chosen_indices_np[0][i], chosen_indices_np[1][i], chosen_indices_np[2][i]
            res_str.append(self.word_ids_to_commands(verb, adj, noun))
        return res_str

    def word_ids_to_commands(self, verb, adj, noun):
        """
        Turn the 3 indices into actual command strings.
Arguments: verb: Index of the guessing verb in vocabulary adj: Index of the guessing adjective in vocabulary noun: Index of the guessing noun in vocabulary """ # turns 3 indices into actual command strings if self.word_vocab[verb] in self.single_word_verbs: return self.word_vocab[verb] if self.word_vocab[verb] in self.two_word_verbs: return " ".join([self.word_vocab[verb], self.word_vocab[noun]]) if adj == self.EOS_id: return " ".join([self.word_vocab[verb], self.word_vocab[noun]]) else: return " ".join([ self.word_vocab[verb], self.word_vocab[adj], self.word_vocab[noun] ]) def act_random(self, obs, infos, input_observation, input_observation_char, input_quest, input_quest_char, possible_words): with torch.no_grad(): batch_size = len(obs) word_indices_random = self.choose_random_command( batch_size, len(self.word_vocab), possible_words) chosen_indices = word_indices_random chosen_strings = self.get_chosen_strings(chosen_indices) for i in range(batch_size): if chosen_strings[i] == "wait": self.not_finished_yet[i] = 0.0 # info for replay memory for i in range(batch_size): if self.prev_actions[-1][i] == "wait": self.prev_step_is_still_interacting[i] = 0.0 # previous step is still interacting, this is because DQN requires one step extra computation replay_info = [ chosen_indices, to_pt(self.prev_step_is_still_interacting, self.use_cuda, "float") ] # cache new info in current game step into caches self.prev_actions.append(chosen_strings) return chosen_strings, replay_info def act_greedy(self, obs, infos, input_observation, input_observation_char, input_quest, input_quest_char, possible_words): """ Acts upon the current list of observations. One text command must be returned for each observation. """ with torch.no_grad(): batch_size = len(obs) local_word_masks_np = self.get_local_word_masks(possible_words) local_word_masks = [ to_pt(item, self.use_cuda, type="float") for item in local_word_masks_np ] # generate commands for one game step, epsilon greedy is applied, i.e., # there is epsilon of chance to generate random commands action_ranks = self.get_ranks( input_observation, input_observation_char, input_quest, input_quest_char, local_word_masks, use_model="online") # list of batch x vocab word_indices_maxq = self.choose_maxQ_command( action_ranks, local_word_masks) chosen_indices = word_indices_maxq chosen_strings = self.get_chosen_strings(chosen_indices) for i in range(batch_size): if chosen_strings[i] == "wait": self.not_finished_yet[i] = 0.0 # info for replay memory for i in range(batch_size): if self.prev_actions[-1][i] == "wait": self.prev_step_is_still_interacting[i] = 0.0 # previous step is still interacting, this is because DQN requires one step extra computation replay_info = [ chosen_indices, to_pt(self.prev_step_is_still_interacting, self.use_cuda, "float") ] # cache new info in current game step into caches self.prev_actions.append(chosen_strings) return chosen_strings, replay_info def act(self, obs, infos, input_observation, input_observation_char, input_quest, input_quest_char, possible_words, random=False): """ Acts upon the current list of observations. One text command must be returned for each observation. 
""" with torch.no_grad(): if self.mode == "eval": return self.act_greedy(obs, infos, input_observation, input_observation_char, input_quest, input_quest_char, possible_words) if random: return self.act_random(obs, infos, input_observation, input_observation_char, input_quest, input_quest_char, possible_words) batch_size = len(obs) local_word_masks_np = self.get_local_word_masks(possible_words) local_word_masks = [ to_pt(item, self.use_cuda, type="float") for item in local_word_masks_np ] # generate commands for one game step, epsilon greedy is applied, i.e., # there is epsilon of chance to generate random commands action_ranks = self.get_ranks( input_observation, input_observation_char, input_quest, input_quest_char, local_word_masks, use_model="online") # list of batch x vocab word_indices_maxq = self.choose_maxQ_command( action_ranks, local_word_masks) word_indices_random = self.choose_random_command( batch_size, len(self.word_vocab), possible_words) # random number for epsilon greedy rand_num = np.random.uniform(low=0.0, high=1.0, size=(batch_size, )) less_than_epsilon = (rand_num < self.epsilon).astype( "float32") # batch greater_than_epsilon = 1.0 - less_than_epsilon less_than_epsilon = to_pt(less_than_epsilon, self.use_cuda, type='long') greater_than_epsilon = to_pt(greater_than_epsilon, self.use_cuda, type='long') chosen_indices = [ less_than_epsilon * idx_random + greater_than_epsilon * idx_maxq for idx_random, idx_maxq in zip(word_indices_random, word_indices_maxq) ] chosen_strings = self.get_chosen_strings(chosen_indices) for i in range(batch_size): if chosen_strings[i] == "wait": self.not_finished_yet[i] = 0.0 # info for replay memory for i in range(batch_size): if self.prev_actions[-1][i] == "wait": self.prev_step_is_still_interacting[i] = 0.0 # previous step is still interacting, this is because DQN requires one step extra computation replay_info = [ chosen_indices, to_pt(self.prev_step_is_still_interacting, self.use_cuda, "float") ] # cache new info in current game step into caches self.prev_actions.append(chosen_strings) return chosen_strings, replay_info def get_dqn_loss(self): """ Update neural model in agent. In this example we follow algorithm of updating model in dqn with replay memory. 
""" if len(self.command_generation_replay_memory) < self.replay_batch_size: return None data = self.command_generation_replay_memory.get_batch( self.replay_batch_size, self.multi_step) if data is None: return None obs_list, quest_list, possible_words_list, chosen_indices, rewards, next_obs_list, next_possible_words_list, actual_n_list = data batch_size = len(actual_n_list) input_quest, input_quest_char, _ = self.get_agent_inputs(quest_list) input_observation, input_observation_char, _ = self.get_agent_inputs( obs_list) next_input_observation, next_input_observation_char, _ = self.get_agent_inputs( next_obs_list) possible_words, next_possible_words = [], [] for i in range(3): possible_words.append([item[i] for item in possible_words_list]) next_possible_words.append( [item[i] for item in next_possible_words_list]) local_word_masks = [ to_pt(item, self.use_cuda, type="float") for item in self.get_local_word_masks(possible_words) ] next_local_word_masks = [ to_pt(item, self.use_cuda, type="float") for item in self.get_local_word_masks(next_possible_words) ] action_ranks = self.get_ranks( input_observation, input_observation_char, input_quest, input_quest_char, local_word_masks, use_model="online" ) # list of batch x vocab or list of batch x vocab x atoms # ps_a word_qvalues = [ ez_gather_dim_1(w_rank, idx.unsqueeze(-1)).squeeze(1) for w_rank, idx in zip(action_ranks, chosen_indices) ] # list of batch or list of batch x atoms q_value = torch.mean(torch.stack(word_qvalues, -1), -1) # batch or batch x atoms # log_ps_a log_q_value = torch.log(q_value) # batch or batch x atoms with torch.no_grad(): if self.noisy_net: self.target_net.reset_noise() # Sample new target net noise if self.double_dqn: # pns Probabilities p(s_t+n, ·; θonline) next_action_ranks = self.get_ranks(next_input_observation, next_input_observation_char, input_quest, input_quest_char, next_local_word_masks, use_model="online") # list of batch x vocab or list of batch x vocab x atoms # Perform argmax action selection using online network: argmax_a[(z, p(s_t+n, a; θonline))] next_word_indices = self.choose_maxQ_command( next_action_ranks, next_local_word_masks) # list of batch x 1 # pns # Probabilities p(s_t+n, ·; θtarget) next_action_ranks = self.get_ranks( next_input_observation, next_input_observation_char, input_quest, input_quest_char, next_local_word_masks, use_model="target" ) # batch x vocab or list of batch x vocab x atoms # pns_a # Double-Q probabilities p(s_t+n, argmax_a[(z, p(s_t+n, a; θonline))]; θtarget) next_word_qvalues = [ ez_gather_dim_1(w_rank, idx.unsqueeze(-1)).squeeze(1) for w_rank, idx in zip(next_action_ranks, next_word_indices) ] # list of batch or list of batch x atoms else: # pns Probabilities p(s_t+n, ·; θonline) next_action_ranks = self.get_ranks(next_input_observation, next_input_observation_char, input_quest, input_quest_char, next_local_word_masks, use_model="target") # list of batch x vocab or list of batch x vocab x atoms next_word_indices = self.choose_maxQ_command( next_action_ranks, next_local_word_masks) # list of batch x 1 next_word_qvalues = [ ez_gather_dim_1(w_rank, idx.unsqueeze(-1)).squeeze(1) for w_rank, idx in zip(next_action_ranks, next_word_indices) ] # list of batch or list of batch x atoms next_q_value = torch.mean(torch.stack(next_word_qvalues, -1), -1) # batch or batch x atoms # Compute Tz (Bellman operator T applied to z) discount = to_pt((np.ones_like(actual_n_list) * self.discount_gamma)**actual_n_list, self.use_cuda, type="float") if not self.use_distributional: rewards = 
rewards + next_q_value * discount # batch loss = F.smooth_l1_loss(q_value, rewards) return loss with torch.no_grad(): Tz = rewards.unsqueeze( -1) + discount.unsqueeze(-1) * self.support.unsqueeze( 0) # Tz = R^n + (γ^n)z (accounting for terminal states) Tz = Tz.clamp(min=self.v_min, max=self.v_max) # Clamp between supported values # Compute L2 projection of Tz onto fixed support z b = (Tz - self.v_min) / self.delta_z # b = (Tz - Vmin) / Δz l, u = b.floor().to(torch.int64), b.ceil().to(torch.int64) # Fix disappearing probability mass when l = b = u (b is int) l[(u > 0) * (l == u)] -= 1 u[(l < (self.atoms - 1)) * (l == u)] += 1 # Distribute probability of Tz m = torch.zeros(batch_size, self.atoms).float() if self.use_cuda: m = m.cuda() offset = torch.linspace(0, ((batch_size - 1) * self.atoms), batch_size).unsqueeze(1).expand( batch_size, self.atoms).long() if self.use_cuda: offset = offset.cuda() m.view(-1).index_add_( 0, (l + offset).view(-1), (next_q_value * (u.float() - b)).view(-1)) # m_l = m_l + p(s_t+n, a*)(u - b) m.view(-1).index_add_( 0, (u + offset).view(-1), (next_q_value * (b - l.float())).view(-1)) # m_u = m_u + p(s_t+n, a*)(b - l) loss = -torch.sum( m * log_q_value, 1) # Cross-entropy loss (minimises DKL(m||p(s_t, a_t))) loss = torch.mean(loss) return loss def update_interaction(self): # update neural model by replaying snapshots in replay memory interaction_loss = self.get_dqn_loss() if interaction_loss is None: return None loss = interaction_loss * self.interaction_loss_lambda # Backpropagate self.online_net.zero_grad() self.optimizer.zero_grad() loss.backward() # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs. torch.nn.utils.clip_grad_norm_(self.online_net.parameters(), self.clip_grad_norm) self.optimizer.step() # apply gradients return to_np(torch.mean(interaction_loss)) def answer_question(self, input_observation, input_observation_char, observation_id_list, input_quest, input_quest_char, use_model="online"): # first pad answerer_input, and get the mask model = self.online_net if use_model == "online" else self.target_net batch_size = len(observation_id_list) max_length = input_observation.size(1) mask = compute_mask(input_observation) # batch x obs_len # noun mask for location question if self.question_type in ["location"]: location_mask = [] for i in range(batch_size): m = [1 for item in observation_id_list[i]] location_mask.append(m) location_mask = pad_sequences(location_mask, maxlen=max_length, dtype="float32") location_mask = to_pt(location_mask, enable_cuda=self.use_cuda, type='float') assert mask.size() == location_mask.size() mask = mask * location_mask match_representation_sequence = self.get_match_representations( input_observation, input_observation_char, input_quest, input_quest_char, use_model=use_model) pred = model.answer_question(match_representation_sequence, mask) # batch x vocab or batch x 2 # attention sum: # sometimes certain word appears multiple times in the observation, # thus we need to merge them together before doing further computations # ------- but # if answer type is not pointing, we just use a pre-defined mapping # that maps 0/1 to their positions in vocab if self.answer_type == "2 way": observation_id_list = [] max_length = 2 for i in range(batch_size): observation_id_list.append( [self.word2id["0"], self.word2id["1"]]) observation = to_pt( pad_sequences(observation_id_list, maxlen=max_length).astype('int32'), self.use_cuda) vocab_distribution = np.zeros( (batch_size, len(self.word_vocab))) # batch x vocab 
vocab_distribution = to_pt(vocab_distribution, self.use_cuda, type='float') vocab_distribution = vocab_distribution.scatter_add_( 1, observation, pred) # batch x vocab non_zero_words = [] for i in range(batch_size): non_zero_words.append(list(set(observation_id_list[i]))) vocab_mask = torch.ne(vocab_distribution, 0).float() return vocab_distribution, non_zero_words, vocab_mask def point_maxq_position(self, vocab_distribution, mask): """ Generate a command by maximum q values, for epsilon greedy. Arguments: point_distribution: Q values for each position (mapped to vocab). mask: vocab masks. """ vocab_distribution = vocab_distribution - torch.min( vocab_distribution, -1, keepdim=True )[0] + 1e-2 # minus the min value, so that all values are non-negative vocab_distribution = vocab_distribution * mask # batch x vocab indices = torch.argmax(vocab_distribution, -1) # batch return indices def answer_question_act_greedy(self, input_observation, input_observation_char, observation_id_list, input_quest, input_quest_char): with torch.no_grad(): vocab_distribution, _, vocab_mask = self.answer_question( input_observation, input_observation_char, observation_id_list, input_quest, input_quest_char, use_model="online") # batch x time positions_maxq = self.point_maxq_position(vocab_distribution, vocab_mask) return positions_maxq # batch def get_qa_loss(self): """ Update neural model in agent. In this example we follow algorithm of updating model in dqn with replay memory. """ if len(self.qa_replay_memory) < self.replay_batch_size: return None transitions = self.qa_replay_memory.sample(self.replay_batch_size) batch = qa_memory.qa_Transition(*zip(*transitions)) observation_list = batch.observation_list quest_list = batch.quest_list answer_strings = batch.answer_strings answer_position = np.array(_words_to_ids(answer_strings, self.word2id)) groundtruth = to_pt(answer_position, self.use_cuda) # batch input_quest, input_quest_char, _ = self.get_agent_inputs(quest_list) input_observation, input_observation_char, observation_id_list = self.get_agent_inputs( observation_list) answer_distribution, _, _ = self.answer_question( input_observation, input_observation_char, observation_id_list, input_quest, input_quest_char, use_model="online") # batch x vocab batch_loss = NegativeLogLoss(answer_distribution, groundtruth) # batch return torch.mean(batch_loss) def update_qa(self): # update neural model by replaying snapshots in replay memory qa_loss = self.get_qa_loss() if qa_loss is None: return None loss = qa_loss * self.qa_loss_lambda # Backpropagate self.online_net.zero_grad() self.optimizer.zero_grad() loss.backward() # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs. 
        torch.nn.utils.clip_grad_norm_(self.online_net.parameters(),
                                       self.clip_grad_norm)
        self.optimizer.step()  # apply gradients
        return to_np(torch.mean(qa_loss))

    def finish_of_episode(self, episode_no, batch_size):
        # Update target network
        if (episode_no + batch_size) % self.target_net_update_frequency <= \
                episode_no % self.target_net_update_frequency:
            self.update_target_net()

        # decay lambdas
        if episode_no < self.learn_start_from_this_episode:
            return
        if episode_no < self.epsilon_anneal_episodes + self.learn_start_from_this_episode:
            self.epsilon -= (self.epsilon_anneal_from - self.epsilon_anneal_to
                             ) / float(self.epsilon_anneal_episodes)
            self.epsilon = max(self.epsilon, 0.0)
        if episode_no < self.revisit_counting_lambda_anneal_episodes + self.learn_start_from_this_episode:
            self.revisit_counting_lambda -= (
                self.revisit_counting_lambda_anneal_from -
                self.revisit_counting_lambda_anneal_to) / float(
                    self.revisit_counting_lambda_anneal_episodes)
            # keep the counting bonus weight non-negative
            self.revisit_counting_lambda = max(self.revisit_counting_lambda, 0.0)

    def reset_binarized_counter(self, batch_size):
        self.binarized_counter_dict = [{} for _ in range(batch_size)]

    def get_binarized_count(self, observation_strings, update=True):
        # episodic counting bonus: reward 1.0 only the first time an
        # observation string is seen in the current episode
        count_rewards = []
        batch_size = len(observation_strings)
        for i in range(batch_size):
            key = observation_strings[i]
            if key not in self.binarized_counter_dict[i]:
                self.binarized_counter_dict[i][key] = 0.0
            if update:
                self.binarized_counter_dict[i][key] += 1.0
            r = self.binarized_counter_dict[i][key]
            r = float(r == 1.0)
            count_rewards.append(r)
        return count_rewards
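# finish_of_episode() above anneals both epsilon and the revisit-counting
# lambda linearly. A minimal standalone sketch of that schedule; the helper
# name `linear_anneal` and the numbers are illustrative, not from the
# original code.
def linear_anneal(episode_no, start, end, anneal_episodes, learn_start=0):
    # Hold the initial value until learning starts, then move linearly
    # from `start` to `end` over `anneal_episodes` episodes.
    if episode_no < learn_start:
        return start
    fraction = min(max(episode_no - learn_start, 0) / float(anneal_episodes), 1.0)
    return start + fraction * (end - start)

# e.g. epsilon annealed 1.0 -> 0.1 over 1000 episodes:
# [linear_anneal(e, 1.0, 0.1, 1000) for e in (0, 500, 1000, 2000)]
# -> [1.0, 0.55, 0.1, 0.1]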
episode_reward += reward episode_steps += 1 # Insert into replay buffer repbuf.add_sample((state, a, reward, next_state, done)) state = next_state # Stats total_max_q += q_values.max() # Check if we need to train if step % STEPS_TO_TRAIN == 0: # Get a batch from replaybuffer batch = repbuf.get_batch(BATCH_SIZE) state_batch, action_batch, reward_batch, next_state_batch, done_batch = zip(*batch) pred_nextQ = sess.run(target_dqn.logits, feed_dict={target_dqn.input: next_state_batch}) max_nextQ = np.max(pred_nextQ, axis=1) pred_values = np.array(reward_batch) + np.invert(done_batch).astype('float32') * GAMMA * max_nextQ cost = dqn.train(state_batch, action_batch, pred_values, sess) elif FLAGS.mode == 'test': # Testing mode epsilon = 0.05 rewards = [] for _ in trange(100): done = False obs = env.reset() reward = 0 while not done: # Choose action if np.random.random() < epsilon: # Choose random action a = env.action_space.sample()
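# A worked NumPy example of the target computation above: terminal
# transitions keep only the immediate reward, while non-terminal ones
# bootstrap from the target network's max Q. GAMMA = 0.99 is assumed here.
import numpy as np

GAMMA = 0.99
reward_batch = np.array([1.0, 0.0, -1.0])
done_batch = np.array([False, False, True])
max_nextQ = np.array([2.0, 3.0, 5.0])

targets = reward_batch + np.invert(done_batch).astype('float32') * GAMMA * max_nextQ
# -> [1 + 0.99*2, 0 + 0.99*3, -1 + 0] == [2.98, 2.97, -1.0]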
def train():
    print('Waking up the brain cells..')
    sess = tf.Session()

    game = Game(SCREEN_WIDTH, SCREEN_HEIGHT, show_game=False)
    brain = DQN(sess, SCREEN_WIDTH, SCREEN_HEIGHT, NUM_ACTION)

    rewards = tf.placeholder(tf.float32, [None])
    tf.summary.scalar('avg.reward/ep.', tf.reduce_mean(rewards))

    saver = tf.train.Saver()
    sess.run(tf.global_variables_initializer())

    writer = tf.summary.FileWriter('logs', sess.graph)
    summary_merged = tf.summary.merge_all()

    # Initialize the target network.
    brain.update_target_network()

    # Decides when to start letting the DQN choose the next action.
    epsilon = 1.0
    # frame counter
    time_step = 0
    total_reward_list = []

    # Start playing.
    for episode in range(MAX_EPISODE):
        terminal = False
        total_reward = 0

        # Reset the game and fetch the current state,
        # a screen of size screen_width x screen_height.
        state = game.reset()
        brain.init_state(state)

        while not terminal:
            # If a random number falls below epsilon, take a random action;
            # otherwise let the DQN choose. Early on the network has barely
            # been trained, so actions are almost all random at first; the
            # ratio shrinks until random actions are rarely taken.
            if np.random.rand() < epsilon:
                action = random.randrange(NUM_ACTION)
            else:
                action = brain.get_action()

            # Only start decaying epsilon after a while, since nothing has
            # been learned at the very beginning.
            if episode > OBSERVE:
                epsilon -= 1 / 1000

            # Advance the game with the chosen action and receive the reward
            # and whether the game has ended.
            state, reward, terminal = game.step(action)
            total_reward += reward

            # Store the current state in the Brain; the remembered states are
            # used for training and for choosing the next action.
            brain.remember(state, action, reward, terminal)

            if time_step > OBSERVE and time_step % TRAIN_INTERVAL == 0:
                # Train the DQN.
                brain.train()

            if time_step % TARGET_UPDATE_INTERVAL == 0:
                # Update the target network.
                brain.update_target_network()

            time_step += 1

        print('Games played: %d  Score: %d' % (episode + 1, total_reward))

        total_reward_list.append(total_reward)

        if episode % 10 == 0:
            summary = sess.run(summary_merged,
                               feed_dict={rewards: total_reward_list})
            writer.add_summary(summary, time_step)
            total_reward_list = []

        if episode % 100 == 0:
            saver.save(sess, 'model/dqn.ckpt', global_step=time_step)
state = next_state episode_reward += reward if done: state = env.reset() ep_no=ep_no+1 print('Episode {} reward was {} and resulted in {} and epsilon {} '.format(ep_no,episode_reward,(reward==10),epsilon_by_frame(frame_idx))) all_rewards.append(episode_reward) episode_reward = 0 mean_10ep_reward = round(np.mean(all_rewards[-11:-1]), 1) num_episodes = len(all_rewards) if len(replay_buffer) > learning_starts and frame_idx % train_freq == 0: loss = dqn.train(replay_buffer,batch_size,device=device) losses.append(loss.item()) if frame_idx % target_network_update_freq == 0: dqn.update_target() if (frame_idx > batch_size and num_episodes > 10 and frame_idx % checkpoint_freq == 0): if saved_mean_reward is None or mean_10ep_reward > saved_mean_reward: saved_mean_reward = mean_10ep_reward np.save('/home/sritee/Desktop/reward_data.npy',all_rewards) except: pdb.set_trace() finally: #pdb.set_trace()
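# `epsilon_by_frame(frame_idx)` is referenced above but not defined in this
# excerpt. A common choice (an assumption, not necessarily this repo's exact
# schedule) is an exponential decay from epsilon_start to epsilon_final:
import math

epsilon_start, epsilon_final, epsilon_decay = 1.0, 0.01, 30000

def epsilon_by_frame(frame_idx):
    # ~epsilon_start at frame 0, approaching epsilon_final as frames grow
    return epsilon_final + (epsilon_start - epsilon_final) * math.exp(-frame_idx / epsilon_decay)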
# Do the soft target update paramlist = list() for i, param in enumerate(model.parameters()): paramlist.append(param) for i, tparam in enumerate(target.parameters()): tparam.data.copy_(tau * paramlist[i].data + (1 - tau) * tparam.data) # Handle epsilon-greedy exploration state = torch.from_numpy(state).float().unsqueeze(0) model.eval() with torch.no_grad(): Qsa = model(state) model.train() # Handle exploration/exploitation rand = random.uniform(0, 1) if rand < epsilon: # Explore action = random.choice(np.arange(total_actions)) #TODO: change else: # Exploit action = np.argmax(Qsa.data.numpy()) # Get the next state next_state, reward, done, info = env.step(action) score += reward mem = (state, action, reward, next_state, done) replaybuffer.append(mem) state = next_state
class DQNAgent: def __init__(self, state_size, action_size, config=RLConfig()): self.seed = random.seed(config.seed) self.state_size = state_size self.action_size = action_size self.batch_size = config.batch_size self.batch_indices = torch.arange(config.batch_size).long().to(device) self.samples_before_learning = config.samples_before_learning self.learn_interval = config.learning_interval self.parameter_update_interval = config.parameter_update_interval self.per_epsilon = config.per_epsilon self.tau = config.tau self.gamma = config.gamma if config.useDuelingDQN: self.qnetwork_local = DuelingDQN(state_size, action_size, config.seed).to(device) self.qnetwork_target = DuelingDQN(state_size, action_size, config.seed).to(device) else: self.qnetwork_local = DQN(state_size, action_size, config.seed).to(device) self.qnetwork_target = DQN(state_size, action_size, config.seed).to(device) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=config.learning_rate) self.doubleDQN = config.useDoubleDQN self.usePER = config.usePER if self.usePER: self.memory = PrioritizedReplayBuffer(config.buffer_size, config.per_alpha) else: self.memory = ReplayBuffer(config.buffer_size) self.t_step = 0 def act(self, state, eps=0.): state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.qnetwork_local.eval() with torch.no_grad(): action_values = self.qnetwork_local(state) self.qnetwork_local.train() if random.random() < eps: return random.choice(np.arange(self.action_size)) else: return np.argmax(action_values.cpu().data.numpy()) def step(self, state, action, reward, next_state, done, beta): self.memory.add(state, action, reward, next_state, done) self.t_step += 1 if self.t_step % self.learn_interval == 0: if len(self.memory) > self.samples_before_learning: state = torch.from_numpy(state).float().unsqueeze(0).to(device) next_state = torch.from_numpy(next_state).float().unsqueeze( 0).to(device) target = self.qnetwork_local(state).data old_val = target[0][action] target_val = self.qnetwork_target(next_state).data if done: target[0][action] = reward else: target[0][ action] = reward + self.gamma * torch.max(target_val) if self.usePER: states, actions, rewards, next_states, dones, weights, indices = self.memory.sample( self.batch_size, beta) else: indices = None weights = None states, actions, rewards, next_states, dones = self.memory.sample( self.batch_size) self.learn(states, actions, rewards, next_states, dones, indices, weights, self.gamma) def learn(self, states, actions, rewards, next_states, dones, indices, weights, gamma): states = torch.from_numpy(np.vstack(states)).float().to(device) actions = torch.from_numpy(np.vstack(actions)).long().to(device) rewards = torch.from_numpy(np.vstack(rewards)).float().to(device) next_states = torch.from_numpy( np.vstack(next_states)).float().to(device) dones = torch.from_numpy(np.vstack(dones.astype( np.uint8))).float().to(device) Q_targets_next = self.qnetwork_target(next_states).detach() if self.doubleDQN: # choose the best action from the local network next_actions = self.qnetwork_local(next_states).argmax(dim=-1) Q_targets_next = Q_targets_next[self.batch_indices, next_actions] else: Q_targets_next = Q_targets_next.max(1)[0] Q_targets = rewards + gamma * Q_targets_next.reshape( (self.batch_size, 1)) * (1 - dones) pred = self.qnetwork_local(states) Q_expected = pred.gather(1, actions) if self.usePER: errors = torch.abs(Q_expected - Q_targets).data.numpy() + self.per_epsilon self.memory.update_priorities(indices, errors) self.optimizer.zero_grad() loss 
= F.mse_loss(Q_expected, Q_targets) loss.backward() self.optimizer.step() if self.t_step % self.parameter_update_interval == 0: self.soft_update(self.qnetwork_local, self.qnetwork_target, self.tau) def soft_update(self, qnetwork_local, qnetwork_target, tau): for local_param, target_param in zip(qnetwork_local.parameters(), qnetwork_target.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
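# The agent above feeds |TD error| + per_epsilon into update_priorities().
# Under standard proportional prioritization (Schaul et al., 2016) the buffer
# would turn those priorities into sampling probabilities roughly as sketched
# below; this is an assumption about the unseen PrioritizedReplayBuffer
# internals, and the constants are illustrative.
import numpy as np

def sampling_probabilities(td_errors, per_epsilon=1e-5, alpha=0.6):
    # p_i = (|delta_i| + eps)^alpha, normalised into a distribution
    priorities = (np.abs(td_errors) + per_epsilon) ** alpha
    return priorities / priorities.sum()

# sampling_probabilities(np.array([0.1, 1.0, 2.0]))
# -> transitions with larger TD errors are replayed more often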
class Agent(): def __init__(self, args, env): self.action_space = env.action_space() self.atoms = args.atoms self.Vmin = args.V_min self.Vmax = args.V_max self.support = torch.linspace(args.V_min, args.V_max, args.atoms) # Support (range) of z self.delta_z = (args.V_max - args.V_min) / (args.atoms - 1) self.batch_size = args.batch_size self.n = args.multi_step self.discount = args.discount self.online_net = DQN(args, self.action_space) if args.model and os.path.isfile(args.model): self.online_net.load_state_dict( torch.load(args.model, map_location='cpu')) self.online_net.train() self.target_net = DQN(args, self.action_space) self.update_target_net() self.target_net.train() for param in self.target_net.parameters(): param.requires_grad = False self.optimiser = optim.Adam(self.online_net.parameters(), lr=args.lr, eps=args.adam_eps) if args.cuda: self.online_net.cuda() self.target_net.cuda() self.support = self.support.cuda() # Resets noisy weights in all linear layers (of online net only) def reset_noise(self): self.online_net.reset_noise() # Acts based on single state (no batch) def act(self, state): return (self.online_net(state.unsqueeze(0)).data * self.support).sum(2).max(1)[1][0] # Acts with an ε-greedy policy def act_e_greedy(self, state, epsilon=0.001): return random.randrange( self.action_space) if random.random() < epsilon else self.act( state) def learn(self, mem): # Sample transitions idxs, states, actions, returns, next_states, nonterminals, weights = mem.sample( self.batch_size) # Calculate current state probabilities self.online_net.reset_noise() # Sample new noise for online network ps = self.online_net(states) # Probabilities p(s_t, ·; θonline) ps_a = ps[range(self.batch_size), actions] # p(s_t, a_t; θonline) # Calculate nth next state probabilities self.online_net.reset_noise() # Sample new noise for action selection pns = self.online_net( next_states).data # Probabilities p(s_t+n, ·; θonline) dns = self.support.expand_as( pns) * pns # Distribution d_t+n = (z, p(s_t+n, ·; θonline)) argmax_indices_ns = dns.sum(2).max( 1 )[1] # Perform argmax action selection using online network: argmax_a[(z, p(s_t+n, a; θonline))] self.target_net.reset_noise() # Sample new target net noise pns = self.target_net( next_states).data # Probabilities p(s_t+n, ·; θtarget) pns_a = pns[range( self.batch_size ), argmax_indices_ns] # Double-Q probabilities p(s_t+n, argmax_a[(z, p(s_t+n, a; θonline))]; θtarget) # Compute Tz (Bellman operator T applied to z) Tz = returns.unsqueeze(1) + nonterminals * ( self.discount**self.n) * self.support.unsqueeze( 0) # Tz = R^n + (γ^n)z (accounting for terminal states) Tz = Tz.clamp(min=self.Vmin, max=self.Vmax) # Clamp between supported values # Compute L2 projection of Tz onto fixed support z b = (Tz - self.Vmin) / self.delta_z # b = (Tz - Vmin) / Δz l, u = b.floor().long(), b.ceil().long() # Fix disappearing probability mass when l = b = u (b is int) l[(u > 0) * (l == u)] -= 1 u[(l < (self.atoms - 1)) * (l == u)] += 1 # Distribute probability of Tz m = states.data.new(self.batch_size, self.atoms).zero_() offset = torch.linspace(0, ((self.batch_size - 1) * self.atoms), self.batch_size).unsqueeze(1).expand( self.batch_size, self.atoms).type_as(actions) m.view(-1).index_add_( 0, (l + offset).view(-1), (pns_a * (u.float() - b)).view(-1)) # m_l = m_l + p(s_t+n, a*)(u - b) m.view(-1).index_add_( 0, (u + offset).view(-1), (pns_a * (b - l.float())).view(-1)) # m_u = m_u + p(s_t+n, a*)(b - l) ps_a = ps_a.clamp(min=1e-3) # Clamp for numerical stability in log loss = 
-torch.sum( Variable(m) * ps_a.log(), 1) # Cross-entropy loss (minimises DKL(m||p(s_t, a_t))) self.online_net.zero_grad() (weights * loss).mean().backward() # Importance weight losses self.optimiser.step() mem.update_priorities( idxs, loss.data) # Update priorities of sampled transitions def update_target_net(self): self.target_net.load_state_dict(self.online_net.state_dict()) def save(self, path): torch.save(self.online_net.state_dict(), os.path.join(path, 'model.pth')) # Evaluates Q-value based on single state (no batch) def evaluate_q(self, state): return (self.online_net(state.unsqueeze(0)).data * self.support).sum(2).max(1)[0][0] def train(self): self.online_net.train() def eval(self): self.online_net.eval()
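# A self-contained check of the categorical (C51) projection used in learn()
# above: projecting Tz back onto the fixed support must preserve total
# probability mass. Shapes and values here are illustrative stand-ins.
import torch

atoms, Vmin, Vmax, batch = 51, -10.0, 10.0, 4
support = torch.linspace(Vmin, Vmax, atoms)
delta_z = (Vmax - Vmin) / (atoms - 1)

pns_a = torch.softmax(torch.randn(batch, atoms), dim=1)  # stand-in for p(s_t+n, a*)
returns = torch.tensor([1.0, -2.0, 0.5, 3.0])
nonterminals = torch.ones(batch, 1)
discount, n = 0.99, 3

Tz = (returns.unsqueeze(1) + nonterminals * (discount ** n) * support.unsqueeze(0)).clamp(Vmin, Vmax)
b = (Tz - Vmin) / delta_z
l, u = b.floor().long(), b.ceil().long()
l[(u > 0) * (l == u)] -= 1          # fix disappearing mass when l = b = u
u[(l < (atoms - 1)) * (l == u)] += 1

m = torch.zeros(batch, atoms)
offset = torch.linspace(0, (batch - 1) * atoms, batch).long().unsqueeze(1).expand(batch, atoms)
m.view(-1).index_add_(0, (l + offset).view(-1), (pns_a * (u.float() - b)).view(-1))
m.view(-1).index_add_(0, (u + offset).view(-1), (pns_a * (b - l.float())).view(-1))

assert torch.allclose(m.sum(1), torch.ones(batch), atol=1e-5)  # mass preserved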
def train():
    print('Waking up the brain cells..')
    sess = tf.Session()

    game = Game(SCREEN_WIDTH, SCREEN_HEIGHT, show_game=False)
    # NUM_ACTION sets the number of outputs, i.e. how many actions to choose from
    brain = DQN(sess, SCREEN_WIDTH, SCREEN_HEIGHT, NUM_ACTION)

    # Saving and inspecting training results:
    # the score earned in each game is logged for review
    rewards = tf.placeholder(tf.float32, [None])
    tf.summary.scalar('avg.reward/ep.', tf.reduce_mean(rewards))

    # checkpoint saver
    saver = tf.train.Saver()
    sess.run(tf.global_variables_initializer())

    writer = tf.summary.FileWriter('logs', sess.graph)
    summary_merged = tf.summary.merge_all()

    # initialize the target network
    brain.update_target_network()

    # Decides when to start using the DQN to pick actions: until a set time
    # has passed, actions are chosen at random, and epsilon is decayed as
    # play goes on.
    epsilon = 1.0
    # frame counter used to pace training
    time_step = 0
    # list of per-game scores, kept to monitor training
    total_reward_list = []

    # start training
    for episode in range(MAX_EPISODE):
        terminal = False  # game-over flag
        total_reward = 0  # total score earned in this game

        state = game.reset()     # reset the game
        brain.init_state(state)  # hand the initial state to the DQN

        # play until the green square collides with another square
        while not terminal:
            # early in training (before OBSERVE, 100 episodes) act at random
            if np.random.rand() < epsilon:
                action = random.randrange(NUM_ACTION)
            else:
                action = brain.get_action()

            # past OBSERVE episodes, gradually shrink the share of random actions
            if episode > OBSERVE:
                epsilon -= 1 / 1000

            # receive the game state, the reward, and whether the game ended
            state, reward, terminal = game.step(action)
            total_reward += reward

            # store the current state in the network object;
            # the remembered transitions are used to train it
            brain.remember(state, action, reward, terminal)

            # after 100 frames, train once every TRAIN_INTERVAL (4) frames
            if time_step > OBSERVE and time_step % TRAIN_INTERVAL == 0:
                brain.train()

            # refresh the target network every TARGET_UPDATE_INTERVAL (1000) frames
            if time_step % TARGET_UPDATE_INTERVAL == 0:
                brain.update_target_network()

            time_step += 1

        # at game over, print and record the score
        print('Games played: %d  Score: %d' % (episode + 1, total_reward))
        total_reward_list.append(total_reward)

        # every 10 episodes log the scores; every 100 save the model
        if episode % 10 == 0:
            summary = sess.run(summary_merged,
                               feed_dict={rewards: total_reward_list})
            writer.add_summary(summary, time_step)
            total_reward_list = []

        if episode % 100 == 0:
            saver.save(sess, 'model/dqn.ckpt', global_step=time_step)
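# `brain.update_target_network()` is defined inside the DQN class, which is
# not shown in this excerpt. A typical TF1 implementation (an assumption,
# including the 'main'/'target' scope names, not the repo's exact code)
# copies each main-network variable into its target-network counterpart:
import tensorflow as tf

def build_update_target_op(main_scope='main', target_scope='target'):
    main_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=main_scope)
    target_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=target_scope)
    # pair variables by creation order within each scope
    return tf.group(*[t.assign(m) for m, t in zip(main_vars, target_vars)])

# sess.run(build_update_target_op())  # hard copy: theta_target <- theta_main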
class Agent(): def __init__(self, args, env): self.action_space = env.action_space() self.batch_size = args.batch_size self.discount = args.discount self.max_gradient_norm = args.max_gradient_norm self.policy_net = DQN(args, self.action_space) if args.model and os.path.isfile(args.model): self.policy_net.load_state_dict(torch.load(args.model)) self.policy_net.train() self.target_net = DQN(args, self.action_space) self.update_target_net() self.target_net.eval() self.optimiser = optim.Adam(self.policy_net.parameters(), lr=args.lr) def act(self, state, epsilon): if random.random() > epsilon: return self.policy_net(state.unsqueeze(0)).max(1)[1].data[0] else: return random.randint(0, self.action_space - 1) def learn(self, mem): transitions = mem.sample(self.batch_size) batch = Transition(*zip(*transitions)) # Transpose the batch states = Variable(torch.stack(batch.state, 0)) actions = Variable(torch.LongTensor(batch.action).unsqueeze(1)) rewards = Variable(torch.Tensor(batch.reward)) non_final_mask = torch.ByteTensor( tuple(map( lambda s: s is not None, batch.next_state))) # Only process non-terminal next states next_states = Variable( torch.stack(tuple(s for s in batch.next_state if s is not None), 0), volatile=True ) # Prevent backpropagating through expected action values Qs = self.policy_net(states).gather(1, actions) # Q(s_t, a_t; θpolicy) next_state_argmax_indices = self.policy_net(next_states).max( 1, keepdim=True )[1] # Perform argmax action selection using policy network: argmax_a[Q(s_t+1, a; θpolicy)] Qns = Variable(torch.zeros( self.batch_size)) # Q(s_t+1, a) = 0 if s_t+1 is terminal Qns[non_final_mask] = self.target_net(next_states).gather( 1, next_state_argmax_indices ) # Q(s_t+1, argmax_a[Q(s_t+1, a; θpolicy)]; θtarget) Qns.volatile = False # Remove volatile flag to prevent propagating it through loss target = rewards + ( self.discount * Qns ) # Double-Q target: Y = r + γ.Q(s_t+1, argmax_a[Q(s_t+1, a; θpolicy)]; θtarget) loss = F.smooth_l1_loss( Qs, target) # Huber loss on TD-error δ: δ = Y - Q(s_t, a_t) # TODO: TD-error clipping? self.policy_net.zero_grad() loss.backward() nn.utils.clip_grad_norm(self.policy_net.parameters(), self.max_gradient_norm) # Clamp gradients self.optimiser.step() def update_target_net(self): self.target_net.load_state_dict(self.policy_net.state_dict()) def save(self, path): torch.save(self.policy_net.state_dict(), os.path.join(path, 'model.pth')) def evaluate_q(self, state): return self.policy_net(state.unsqueeze(0)).max(1)[0].data[0] def train(self): self.policy_net.train() def eval(self): self.policy_net.eval()
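# A small NumPy illustration of why learn() above uses the Double-Q target:
# the action is argmax'd with the policy network but evaluated with the
# target network, which damps the max-operator's overestimation bias. The
# values are toy numbers.
import numpy as np

q_policy_next = np.array([1.0, 5.0, 2.0])  # Q(s_t+1, .; theta_policy)
q_target_next = np.array([1.2, 3.0, 6.0])  # Q(s_t+1, .; theta_target)
r, gamma = 0.0, 0.99

vanilla_target = r + gamma * q_target_next.max()                   # 5.94
double_target = r + gamma * q_target_next[q_policy_next.argmax()]  # 2.97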
def train_rl(images, targets, folds, stochastic=False, test=False, base_rand=False):
    print('start train rl')
    #print(images.shape)
    #(X_train, y_train), (X_val, y_val), (X_test, y_test) = reformatInput_rl(images, targets, fold)
    #X_train = X_train.astype("float32", casting='unsafe')
    #X_val = X_val.astype("float32", casting='unsafe')
    #X_test = X_test.astype("float32", casting='unsafe')
    #print('check')
    #print(X_train.shape)
    with tf.Session() as sess:
        #config = get_config(FLAGS) or FLAGS
        model = DQN(sess, SCREEN_WIDTH, SCREEN_HEIGHT, n_act)

        rewards = tf.placeholder(tf.float32, [None])
        tf.summary.scalar('avg.reward/ep.', tf.reduce_mean(rewards))

        saver = tf.train.Saver()
        sess.run(tf.global_variables_initializer())

        writer = tf.summary.FileWriter('logs', sess.graph)
        summary_merged = tf.summary.merge_all()

        print('total %s folds' % len(folds))

        # init target network
        model.update_target_network()

        # get next action from DQN
        epsilon = 1.0
        # frame counter
        t_step = 0
        tot_reward_list = []

        MAX_EPISODE = 10000
        n_img = len(targets)
        n_epi = n_img
        if stochastic:
            n_epi = MAX_EPISODE

        # call pred & loss
        n_test = 3
        if test:  # for debugging
            pred_all, loss_all = predict_all(images[0:n_test, :], targets[0:n_test, :])
            if not stochastic:
                n_epi = n_test
        else:
            pred_all, loss_all = predict_all(images, targets)
            #pred_all_train, loss_all_train = predict_all(X_train, y_train)
        #print(pred_all)

        # run simulation
        pred_rl = []
        for epi in range(n_epi):
            terminal = False
            tot_reward = 0

            # init game & get current state
            # state parsing
            state = np.expand_dims(images[epi], 0)
            #state = np.expand_dims(X_train[epi], 0)
            model.init_state(state)

            if np.random.rand() < epsilon:
                act = random.randrange(n_act)
            else:
                act = model.get_action()
            if epi > OBSERVE:
                epsilon -= 1 / 100
            if base_rand:
                act = random.randrange(n_act)

            # stochastic define
            if stochastic:
                ii = random.randrange(n_img)
                state = np.expand_dims(images[ii], 0)
                #state = np.expand_dims(X_train[ii], 0)
                state_i = ii
            else:
                state = np.expand_dims(images[epi], 0)
                #state = np.expand_dims(X_train[epi], 0)
                state_i = epi

            # get model str by act
            chosen_model = model_list[act]

            # reward function
            if pred_all[chosen_model][state_i] == 1:
                reward = 1
                pred_rl.append(1)
            else:
                reward = -2
                pred_rl.append(0)
            tot_reward += reward

            model.remember(state, act, reward, terminal)

            if t_step > OBSERVE and t_step % TRAIN_INTERVAL == 0:
                # DQN train
                model.train()
            if t_step % TARGET_UPDATE_INTERVAL == 0:
                # target update
                model.update_target_network()

            t_step += 1

            print('epi: %d score: %d' % ((epi + 1), tot_reward))
            tot_reward_list.append(tot_reward)

            if epi % 10 == 0:
                summary = sess.run(summary_merged,
                                   feed_dict={rewards: tot_reward_list})
                writer.add_summary(summary, t_step)
                tot_reward_list = []
            if epi % 100 == 0:
                saver.save(sess, 'model/dqn.ckpt', global_step=t_step)

    return tot_reward_list, pred_rl, pred_all
winner_rewards = [1] * len(boards[winner]) loser_rewards = [-1] * len(boards[loser]) rewards = np.array(winner_rewards + loser_rewards) boards = np.concatenate([boards[winner], boards[loser]]) else: #tie one_rewards = [0] * len(boards[1]) two_rewards = [0] * len(boards[2]) rewards = np.array(one_rewards + two_rewards) boards = np.concatenate([boards[1], boards[2]]) rewards = rewards.reshape(rewards.shape[0], -1) model.train(boards, rewards) if games % EPOCH == 0: gamma *= 1.0 win_rate.append(test_against_random(model)) debug_run(model) if games % TEST_FRQ == 0: print(win_rate) plt.plot(win_rate) plt.ylabel('Winning Percentage') plt.xlabel('Epochs') plt.show() model.save('./dqn_no/') exit()
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed, network):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            network (string): "duel", "double", or anything else for vanilla DQN
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.network = network

        # Q-Network
        if self.network == "duel":
            self.qnetwork_local = DuelingDQN(state_size, action_size, seed).to(device)
            self.qnetwork_target = DuelingDQN(state_size, action_size, seed).to(device)
        else:
            self.qnetwork_local = DQN(state_size, action_size, seed).to(device)
            self.qnetwork_target = DQN(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done, count):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA, count)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma, count):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Q values for best actions in next_state
        if self.network in ("double", "duel"):
            # choose the best action from the local network
            Q_L = self.qnetwork_local(next_states).detach()
            _, actions_prime = Q_L.max(1)
            # get Q values from frozen network for next state and chosen action
            Q_targets_next = self.qnetwork_target(next_states).detach()
            Q_targets_next_s_a_prime = Q_targets_next.gather(
                1, actions_prime.unsqueeze(1))
        else:
            # vanilla DQN: take the max directly from the target network
            Q_targets_next_s_a_prime = self.qnetwork_target(
                next_states).detach().max(1)[0].unsqueeze(1)

        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next_s_a_prime * (1 - dones))

        # Get expected Q values from local model using current actions
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Compute loss
        loss = F.smooth_l1_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        #if count >= TARGET_UPDATE:
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model (PyTorch model): weights will be copied from target_model (PyTorch model): weights will be copied to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
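# A quick check of the formula in the docstring above,
# theta_target = tau*theta_local + (1 - tau)*theta_target, on toy layers.
import torch
import torch.nn as nn

local, target = nn.Linear(2, 2), nn.Linear(2, 2)
tau = 0.1
with torch.no_grad():
    for t_p, l_p in zip(target.parameters(), local.parameters()):
        t_p.copy_(tau * l_p + (1.0 - tau) * t_p)
# each target parameter has moved 10% of the way toward its local counterpart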
class Agent(object): def __init__(self, args, action_space): self.action_space = action_space self.batch_size = args.batch_size self.discount = args.discount self.online_net = DQN(args, self.action_space).to(device=args.device) self.online_net.train() self.target_net = DQN(args, self.action_space).to(device=args.device) self.update_target_net() self.target_net.train() for param in self.target_net.parameters(): param.requires_grad = False self.optimiser = optim.Adam(self.online_net.parameters(), lr=args.lr, eps=args.adam_eps) self.loss_func = nn.MSELoss() # Acts based on single state (no batch) def act(self, state): with torch.no_grad(): return self.online_net([state]).argmax(1).item() # Acts with an ε-greedy policy (used for evaluation only) def act_e_greedy( self, state, epsilon=0.05): # High ε can reduce evaluation scores drastically return random.randrange( self.action_space) if random.random() < epsilon else self.act( state) def learn(self, mem): # Sample transitions states, actions, next_states, rewards = mem.sample(self.batch_size) q_eval = self.online_net(states).gather( 1, actions.unsqueeze(1)).squeeze() with torch.no_grad(): q_eval_next_a = self.online_net(next_states).argmax(1) q_next = self.target_net(next_states) q_target = rewards + self.discount * q_next.gather( 1, q_eval_next_a.unsqueeze(1)).squeeze() loss = self.loss_func(q_eval, q_target) self.online_net.zero_grad() loss.backward() self.optimiser.step() def update_target_net(self): self.target_net.load_state_dict(self.online_net.state_dict()) # Save model parameters on current device (don't move model between devices) def save(self, path): torch.save(self.online_net.state_dict(), path + '.pth') # Evaluates Q-value based on single state (no batch) def evaluate_q(self, state): with torch.no_grad(): return (self.online_net([state])).max(1)[0].item() def train(self): self.online_net.train() def eval(self): self.online_net.eval()
# the field declarations and __post_init__ below require the dataclass machinery
from dataclasses import dataclass


@dataclass
class Agent:
    state: int
    actions: int
    history: int = 4
    atoms: int = 5  # 51
    Vmin: float = -10
    Vmax: float = 10
    lr: float = 1e-5
    batch_size: int = 32
    discount: float = 0.99
    norm_clip: float = 10.

    def __post_init__(self):
        self.support = torch.linspace(self.Vmin, self.Vmax, self.atoms)
        self.delta_z = (self.Vmax - self.Vmin) / (self.atoms - 1)
        self.online_net = DQN(self.state, self.actions, self.history, self.atoms)
        self.online_net.train()
        self.target_net = DQN(self.state, self.actions, self.history, self.atoms)
        self.update_target_net()
        self.target_net.train()
        for param in self.target_net.parameters():
            param.requires_grad = False
        self.optimiser = optim.Adam(self.online_net.parameters(), lr=self.lr)

    def act(self, state):
        state = torch.FloatTensor(state).unsqueeze(0)
        with torch.no_grad():
            return (self.online_net(state) * self.support).sum(2).argmax(1).item()

    def act_e_greedy(self, state, epsilon=0.001):
        return random.randrange(self.actions) if random.random() < epsilon else self.act(state)

    def learn(self, buffer):
        state, action, reward, next_state, terminal, weights, idx = buffer.sample(self.batch_size)
        state = torch.FloatTensor(state)
        action = torch.LongTensor(action)
        reward = torch.FloatTensor(reward)
        next_state = torch.FloatTensor(next_state)
        terminal = torch.FloatTensor(terminal)
        weights = torch.FloatTensor(weights)

        log_ps = self.online_net(state, log=True)
        log_ps_a = log_ps[range(self.batch_size), action]

        with torch.no_grad():
            # Calculate nth next state probabilities
            pns = self.online_net(next_state)
            dns = self.support.expand_as(pns) * pns
            argmax_indices_ns = dns.sum(2).argmax(1)
            self.target_net.sample_noise()
            pns = self.target_net(next_state)
            pns_a = pns[range(self.batch_size), argmax_indices_ns]

            # Compute Bellman operator T applied to z
            Tz = reward.unsqueeze(1) + (1 - terminal).unsqueeze(1) * self.discount * self.support.unsqueeze(0)  # -10 ... 10 + reward
            Tz.clamp_(min=self.Vmin, max=self.Vmax)
            # Compute L2 projection of Tz onto fixed support z
            b = (Tz - self.Vmin) / self.delta_z  # 0 ... 4
            l, u = b.floor().to(torch.int64), b.ceil().to(torch.int64)
            # Fix disappearing probability mass when l = b = u (b is int)
            l[(u > 0) * (l == u)] -= 1
            u[(l < (self.atoms - 1)) * (l == u)] += 1

            # Distribute probability of Tz
            m = state.new_zeros(self.batch_size, self.atoms)
            offset = torch.linspace(0, ((self.batch_size - 1) * self.atoms), self.batch_size).unsqueeze(1).expand(self.batch_size, self.atoms).to(action)
            m.view(-1).index_add_(0, (l + offset).view(-1), (pns_a * (u.float() - b)).view(-1))  # m_l = m_l + p(s_t+n, a*)(u - b)
            m.view(-1).index_add_(0, (u + offset).view(-1), (pns_a * (b - l.float())).view(-1))  # m_u = m_u + p(s_t+n, a*)(b - l)

        loss = -torch.sum(m * log_ps_a, 1)  # Cross-entropy loss (minimises DKL(m||p(s_t, a_t)))
        loss = weights * loss

        # q_values = self.online_net(state)
        # q_value = q_values[range(self.batch_size), action]
        # next_q_values = self.target_net(next_state)
        # next_q_value = next_q_values.max(1)[0]
        # expected_q_value = reward + self.discount * next_q_value * (1 - terminal)
        # loss = weights * (q_value - expected_q_value).pow(2)

        self.optimiser.zero_grad()
        loss.mean().backward()
        # clip gradients before the optimiser step; clipping afterwards has no effect
        nn.utils.clip_grad_norm_(self.online_net.parameters(), self.norm_clip)
        self.optimiser.step()

        buffer.update_priorities(idx, loss.tolist())

    def update_target_net(self):
        self.target_net.load_state_dict(self.online_net.state_dict())

    def sample_noise(self):
        self.online_net.sample_noise()

    def save(self, path):
        torch.save(self.online_net.state_dict(), path)

    # Evaluates Q-value based on single state (no batch)
    def evaluate_q(self, state):
        with torch.no_grad():
            # collapse the distribution to expected Q values, as in act()
            return (self.online_net(state.unsqueeze(0)) * self.support).sum(2).max(1)[0].item()

    def train(self):
        self.online_net.train()

    def eval(self):
        self.online_net.eval()
else: action = dqn.get_action() if epsilon > args.observe: epsilon -= 0.001 reward, done = game.step(action) total_reward += reward if start - time.time() > 1: dqn.save_memory(action, reward, done, rgb2gray(game.state)) count += 1 if frame_count > args.observe and frame_count % args.train_term == 0 and count > 50: dqn.train() if frame_count % args.update_term == 0: dqn.copy2target() frame_count += 1 PressKey(ENTER) time.sleep(0.1) ReleaseKey(ENTER) if episode % 10 == 0: print("Iteration: {}, Score: {}".format( episode, total_reward)) rewards.append(total_reward) total_reward = 0
import os
import random

import torch
from torch import optim

# `DQN` here is the distributional (C51) network with NoisyNet layers;
# it is assumed to be defined or imported elsewhere in this codebase.


class Agent():
    def __init__(self, args, env):
        self.action_space = env.action_space()
        self.atoms = args.atoms
        self.Vmin = args.V_min
        self.Vmax = args.V_max
        self.support = torch.linspace(args.V_min, args.V_max, self.atoms).to(device=args.device)  # Support (range) of z
        self.delta_z = (args.V_max - args.V_min) / (self.atoms - 1)
        self.batch_size = args.batch_size
        self.n = args.multi_step
        self.discount = args.discount

        self.online_net = DQN(args, self.action_space).to(device=args.device)
        if args.model and os.path.isfile(args.model):
            # Always load tensors onto CPU by default, will shift to GPU if necessary
            self.online_net.load_state_dict(torch.load(args.model, map_location='cpu'))
        self.online_net.train()

        self.target_net = DQN(args, self.action_space).to(device=args.device)
        self.update_target_net()
        self.target_net.train()
        for param in self.target_net.parameters():
            param.requires_grad = False

        self.optimiser = optim.Adam(self.online_net.parameters(), lr=args.lr, eps=args.adam_eps)

    # Resets noisy weights in all linear layers (of online net only)
    def reset_noise(self):
        self.online_net.reset_noise()

    # Acts based on single state (no batch)
    def act(self, state):
        with torch.no_grad():
            return (self.online_net(state.unsqueeze(0)) * self.support).sum(2).argmax(1).item()

    # Acts with an ε-greedy policy (used for evaluation only)
    def act_e_greedy(self, state, epsilon=0.001):  # High ε can reduce evaluation scores drastically
        return random.randrange(self.action_space) if random.random() < epsilon else self.act(state)

    def learn(self, mem):
        # Sample transitions
        idxs, states, actions, returns, next_states, nonterminals, weights = mem.sample(self.batch_size)

        # Calculate current state probabilities (online network noise already sampled)
        log_ps = self.online_net(states, log=True)  # Log probabilities log p(s_t, ·; θonline)
        log_ps_a = log_ps[range(self.batch_size), actions]  # log p(s_t, a_t; θonline)

        with torch.no_grad():
            # Calculate nth next state probabilities
            pns = self.online_net(next_states)  # Probabilities p(s_t+n, ·; θonline)
            dns = self.support.expand_as(pns) * pns  # Distribution d_t+n = (z, p(s_t+n, ·; θonline))
            argmax_indices_ns = dns.sum(2).argmax(1)  # Perform argmax action selection using online network: argmax_a[(z, p(s_t+n, a; θonline))]
            self.target_net.reset_noise()  # Sample new target net noise
            pns = self.target_net(next_states)  # Probabilities p(s_t+n, ·; θtarget)
            pns_a = pns[range(self.batch_size), argmax_indices_ns]  # Double-Q probabilities p(s_t+n, argmax_a[(z, p(s_t+n, a; θonline))]; θtarget)

            # Compute Tz (Bellman operator T applied to z)
            Tz = returns.unsqueeze(1) + nonterminals * (self.discount ** self.n) * self.support.unsqueeze(0)  # Tz = R^n + (γ^n)z (accounting for terminal states)
            Tz = Tz.clamp(min=self.Vmin, max=self.Vmax)  # Clamp between supported values
            # Compute L2 projection of Tz onto fixed support z
            b = (Tz - self.Vmin) / self.delta_z  # b = (Tz - Vmin) / Δz
            l, u = b.floor().to(torch.int64), b.ceil().to(torch.int64)
            # Fix disappearing probability mass when l = b = u (b is int)
            l[(u > 0) & (l == u)] -= 1
            u[(l < (self.atoms - 1)) & (l == u)] += 1

            # Distribute probability of Tz
            m = states.new_zeros(self.batch_size, self.atoms)
            offset = torch.linspace(0, ((self.batch_size - 1) * self.atoms), self.batch_size).unsqueeze(1).expand(self.batch_size, self.atoms).to(actions)
            m.view(-1).index_add_(0, (l + offset).view(-1), (pns_a * (u.float() - b)).view(-1))  # m_l = m_l + p(s_t+n, a*)(u - b)
            m.view(-1).index_add_(0, (u + offset).view(-1), (pns_a * (b - l.float())).view(-1))  # m_u = m_u + p(s_t+n, a*)(b - l)

        loss = -torch.sum(m * log_ps_a, 1)  # Cross-entropy loss (minimises DKL(m||p(s_t, a_t)))
        self.online_net.zero_grad()
        (weights * loss).mean().backward()  # Backpropagate importance-weighted minibatch loss
        self.optimiser.step()

        mem.update_priorities(idxs, loss.detach())  # Update priorities of sampled transitions

    def update_target_net(self):
        self.target_net.load_state_dict(self.online_net.state_dict())

    # Save model parameters on current device (don't move model between devices)
    def save(self, path):
        torch.save(self.online_net.state_dict(), os.path.join(path, 'model.pth'))

    # Evaluates Q-value based on single state (no batch)
    def evaluate_q(self, state):
        with torch.no_grad():
            return (self.online_net(state.unsqueeze(0)) * self.support).sum(2).max(1)[0].item()

    def train(self):
        self.online_net.train()

    def eval(self):
        self.online_net.eval()
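# A minimal, self-contained sketch of the categorical (C51) projection that
# Agent.learn() above performs in batch: the shifted support Tz = R^n + γ^n z
# is mapped back onto the fixed atoms by splitting each probability between
# the two nearest atoms l and u. Everything here (atom count, toy
# distribution, reward) is illustrative, not part of the class above.
import torch

atoms, Vmin, Vmax = 5, -2.0, 2.0
support = torch.linspace(Vmin, Vmax, atoms)    # z: [-2, -1, 0, 1, 2]
delta_z = (Vmax - Vmin) / (atoms - 1)          # Δz = 1

p = torch.tensor([0.1, 0.2, 0.4, 0.2, 0.1])    # toy p(s_t+n, a*)
r, gamma_n, nonterminal = 0.5, 0.9, 1.0        # toy return and discount

Tz = (r + nonterminal * gamma_n * support).clamp(Vmin, Vmax)
b = (Tz - Vmin) / delta_z                      # fractional atom index
l, u = b.floor().long(), b.ceil().long()
l[(u > 0) & (l == u)] -= 1                     # keep mass when b lands exactly on an atom
u[(l < atoms - 1) & (l == u)] += 1

m = torch.zeros(atoms)
m.index_add_(0, l, p * (u.float() - b))        # m_l += p * (u - b)
m.index_add_(0, u, p * (b - l.float()))        # m_u += p * (b - l)
print(m, m.sum())                              # projected distribution, sums to 1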
def train(IS_IMPORT):
    print('Loading ...')
    sess = tf.Session()

    # epsilon decides when actions start coming from the DQN instead of at random.
    epsilon = 1.0
    # Frame counter.
    time_step = 0

    global_step = tf.Variable(0, trainable=False, name='global_step')
    brain = DQN(sess, SCREEN_WIDTH, SCREEN_HEIGHT, NUM_ACTION, global_step)

    rewards = tf.placeholder(tf.float32, [None])
    tf.summary.scalar('avg.reward/ep.', tf.reduce_mean(rewards))
    totalScores = tf.placeholder(tf.float32, [None])
    tf.summary.scalar('avg.totalScore/ep.', tf.reduce_mean(totalScores))

    total_reward_list = []
    total_score_list = []

    saver = tf.train.Saver(tf.global_variables())
    ckpt = tf.train.get_checkpoint_state(MODEL_PATH)
    writer = tf.summary.FileWriter(LOG_PATH, sess.graph)

    # Restore the latest checkpoint if one exists; otherwise initialise fresh.
    if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):
        saver.restore(sess, ckpt.model_checkpoint_path)
    else:
        sess.run(tf.global_variables_initializer())

    summary_merged = tf.summary.merge_all()

    if IS_IMPORT:
        # Replay a recorded scenario from file instead of a live connection.
        fs = FileLoad(r'F:\work\cocos\dqnTest\Resources\scenario - Copy.sce')
    else:
        server.accept()

    brain.update_target_network()
    print('global_step:', sess.run(global_step))

    # Start playing.
    for episode in range(MAX_EPISODE):
        terminal = False
        total_reward = 0
        weight = 0

        # Reset the game and fetch the current state,
        # a screen of SCREEN_WIDTH x SCREEN_HEIGHT pixels.
        if IS_IMPORT:
            id, _, _, _, state = fs.readState()
            if id == -1:
                sys.exit(1)
        else:
            id, _, _, _, state = server.readStatus()
            if id == -1:
                continue
        state = reshapeFromPacket(state)
        brain.init_state(state)

        while not terminal:
            actionType = "Action:"
            if IS_IMPORT:
                action = fs.readAction()
                if action == -1:
                    sys.exit(1)
                id, reward, totalScore, terminal, state = fs.readState()
                if id == -1:
                    sys.exit(1)
            else:
                if np.random.rand() < epsilon:
                    action = random.randrange(NUM_ACTION)
                    print("Random action:", action)
                else:
                    action = brain.get_action()

                if episode > OBSERVE:
                    epsilon -= 1 / 1000

                server.sendX(id, action)
                if action == -1:
                    id2, action = server.readAction()
                    actionType = "Random Action:"
                    if id != id2:
                        print("Invalid Packet", id, id2)

                id, reward, totalScore, terminal, state = server.readStatus()

            # Add a small per-step bonus that grows the longer the episode survives.
            reward = reward + (weight * 0.1)
            weight = weight + 1
            print(time.strftime("%H:%M:%S", time.localtime()), id, actionType, action,
                  "totalScore:", totalScore, "reward:", reward, "terminal", terminal)
            if id == -1:
                break
            if terminal:
                total_score_list.append(totalScore)
            state = reshapeFromPacket(state)
            total_reward += reward

            # Store the current transition in the brain; the remembered
            # transitions are used for training and for choosing the next action.
            brain.remember(state, action, reward, terminal)

            if time_step > OBSERVE and time_step % TRAIN_INTERVAL == 0:
                # Run a DQN training step.
                brain.train()

            if time_step % TARGET_UPDATE_INTERVAL == 0:
                # Update the target network.
                brain.update_target_network()

            time_step += 1

        print('\t Count of Play: %d Score: %d' % (episode + 1, total_reward))

        total_reward_list.append(total_reward)

        if episode % 10 == 0:
            summary = sess.run(summary_merged,
                               feed_dict={rewards: total_reward_list,
                                          totalScores: total_score_list})
            writer.add_summary(summary, sess.run(global_step))
            total_reward_list = []
            total_score_list = []

        if (episode + 1) % 100 == 0:
            saver.save(sess, MODEL_PATH + '/dqn.ckpt', global_step=global_step)

    # After training completes, export the trained graph as a TensorFlow Lite file.
    converter = tf.lite.TFLiteConverter.from_session(sess, [brain.input_X], [brain.Q])
    tflite_model = converter.convert()
    with open(MODEL_PATH + "/dqn.tflite", "wb") as f:
        f.write(tflite_model)

    sys.exit(1)
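# A hedged sketch (not part of the original code) of how the exported
# dqn.tflite file could be loaded back for inference with the standard
# tf.lite.Interpreter API. The input shape and dtype come from the converted
# graph (brain.input_X / brain.Q above); the zero-filled state is a dummy
# placeholder for illustration only.
import numpy as np
import tensorflow as tf

interpreter = tf.lite.Interpreter(model_path=MODEL_PATH + '/dqn.tflite')
interpreter.allocate_tensors()
input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()

# Dummy state shaped like the network input.
state = np.zeros(input_details[0]['shape'], dtype=input_details[0]['dtype'])
interpreter.set_tensor(input_details[0]['index'], state)
interpreter.invoke()
q_values = interpreter.get_tensor(output_details[0]['index'])
action = int(np.argmax(q_values))  # greedy action from the exported Q-values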