Esempio n. 1
0
def main():
    policy_net = DQN(U_num, num_actions).to(device)  #初始化Q网络
    policy_net.apply(init_weights)
    if pretrained:
        ckp = torch.load('/data2/jiangjigang/ckp/dqn.pth')
        policy_net.load_state_dict(
            {k.replace('module.', ''): v
             for k, v in ckp.items()})
    target_net = DQN(U_num, num_actions).to(device)  #初始化target_Q网络
    target_net.load_state_dict(policy_net.state_dict())  #用Q网络的参数初始化target_Q网络
    target_net.eval()
    optimizer_policy = torch.optim.Adam(policy_net.parameters(),
                                        lr=learning_rate)  #定义优化器Adam,可以更换
    buffer = ReplayBuffer(
        buffer_size
    )  #定义一个经验池  PS:经验池储存经验数据,后随机从经验池中抽取经验数据来训练更新网络参数 在Buffer.py中
    criterion = torch.nn.MSELoss(reduction='sum')

    # training
    for i_episode in range(num_episodes):

        state0 = [user_loc, user_dis, node_loc, use_buff]  #获得一个初始化状态
        error = 0.0
        all_reward = 0
        for t in count():
            # 选择动作
            action = e_greedy_select_action(state0, policy_net)
            a = np.array([action.data.cpu().numpy()])
            #print("action selected by e_greedy is {}".format(action))
            # 利用状态转移函数,得到当前状态下采取当前行为得到的下一个状态,和下一个状态的终止情况
            state1, done, flag = transition_function(state0, action)
            # 利用奖励函数,获得当前的奖励值
            reward, cost_migration = reward_function(state0, action, state1,
                                                     flag)
            all_reward = all_reward + reward
            # 将经验数据存储至buffer中
            buffer.add(state0, a, reward, state1, done)

            # exit an episode after MAX_T steps
            if t > MAX_T:
                break

            #当episode>10时进行网络参数更新,目的是为了让经验池中有较多的数据,使得训练较为稳定。
            if i_episode > 1:

                # 从buffer中取出一批训练样本,训练数据batch由BATCH_SIZE参数决定
                batch = buffer.getBatch(BATCH_SIZE)

                policy_net, target_net, bellman_error = optimize_model(
                    batch, policy_net, target_net, optimizer_policy, criterion)
                error = error + bellman_error.data.cpu().numpy()
            # 进入下一状态
            state0 = state1
        ave_error = error / (t * 1.00)
        ave_reward = all_reward / (t * 1.00)
        print(ave_error, ave_reward)
    torch.save(policy_net.state_dict(), '/data2/jiangjigang/ckp/dqn.pth')
class Agent_DQN(Agent):
    def __init__(self, env, args):
        """
        Initialize everything you need here.
        For example:
            paramters for neural network
            initialize Q net and target Q net
            parameters for repaly buffer
            parameters for q-learning; decaying epsilon-greedy
            ...
        """

        super(Agent_DQN, self).__init__(env)
        ###########################
        # YOUR IMPLEMENTATION HERE #
        # Declare variables
        self.exp_id = uuid.uuid4().__str__().replace('-', '_')
        self.args = args
        self.env = env
        self.eps_threshold = None
        self.nA = env.action_space.n
        self.action_list = np.arange(self.nA)
        self.reward_list = deque(
            maxlen=args.window)  # np.zeros(args.window, np.float32)
        self.max_q_list = deque(
            maxlen=args.window)  # np.zeros(args.window, np.float32)
        self.loss_list = deque(
            maxlen=args.window)  # np.zeros(args.window, np.float32)
        self.probability_list = np.zeros(env.action_space.n, np.float32)
        self.cur_eps = self.args.eps
        self.t = 0
        self.ep_len = 0
        self.mode = None
        if self.args.use_pri_buffer:
            self.replay_buffer = NaivePrioritizedBuffer(
                capacity=self.args.capacity, args=self.args)
        else:
            self.replay_buffer = ReplayBuffer(capacity=self.args.capacity,
                                              args=self.args)
        self.position = 0

        self.args.save_dir += f'/{self.exp_id}/'
        os.system(f"mkdir -p {self.args.save_dir}")
        self.meta = MetaData(fp=open(
            os.path.join(self.args.save_dir, 'result.csv'), 'w'),
                             args=self.args)
        self.eps_delta = (self.args.eps -
                          self.args.eps_min) / self.args.eps_decay_window
        self.beta_by_frame = lambda frame_idx: min(
            1.0, args.pri_beta_start + frame_idx *
            (1.0 - args.pri_beta_start) / args.pri_beta_decay)

        # Create Policy and Target Networks
        if self.args.use_dueling:
            print("Using dueling dqn . . .")
            self.policy_net = DuelingDQN(env, self.args).to(self.args.device)
            self.target_net = DuelingDQN(env, self.args).to(self.args.device)
        elif self.args.use_crnn:
            print("Using dueling crnn . . .")
            self.policy_net = CrnnDQN(env).to(self.args.device)
            self.target_net = CrnnDQN(env).to(self.args.device)
        else:
            self.policy_net = DQN(env, self.args).to(self.args.device)
            self.target_net = DQN(env, self.args).to(self.args.device)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.optimizer = optim.Adam(self.policy_net.parameters(),
                                    lr=self.args.lr,
                                    eps=self.args.optimizer_eps)
        if self.args.lr_scheduler:
            print("Enabling LR Decay . . .")
            self.scheduler = optim.lr_scheduler.ExponentialLR(
                optimizer=self.optimizer, gamma=self.args.lr_decay)
        self.cur_lr = self.optimizer.param_groups[0]['lr']

        # Compute Huber loss
        self.loss = F.smooth_l1_loss

        # todo: Support for Multiprocessing. Bug in pytorch - https://github.com/pytorch/examples/issues/370
        self.policy_net.share_memory()
        self.target_net.share_memory()

        # Set defaults for networks
        self.policy_net.train()
        self.target_net.eval()
        self.target_net.load_state_dict(self.policy_net.state_dict())

        if args.test_dqn:
            # you can load your model here
            ###########################
            # YOUR IMPLEMENTATION HERE #
            print('loading trained model')
            self.load_model()

        if args.use_pri_buffer:
            print('Using priority buffer . . .')
        if args.use_double_dqn:
            print('Using double dqn . . .')

        if args.use_bnorm:
            print("Using batch normalization . . .")

        print("Arguments: \n", json.dumps(vars(self.args), indent=2), '\n')

    def init_game_setting(self):
        pass

    def make_action(self, observation, test=True):
        """
        Return predicted action of your agent
        Input:
            observation: np.array
                stack 4 last preprocessed frames, shape: (84, 84, 4)
        Return:
            action: int
                the predicted action from trained model
        """
        ###########################
        # YOUR IMPLEMENTATION HERE #
        with torch.no_grad():
            if self.args.test_dqn:
                q, argq = self.policy_net(
                    Variable(
                        self.channel_first(observation))).data.cpu().max(1)
                return self.action_list[argq]
            # Fill up probability list equal for all actions
            self.probability_list.fill(self.cur_eps / self.nA)
            # Fetch q from the model prediction
            q, argq = self.policy_net(Variable(
                self.channel_first(observation))).data.cpu().max(1)
            # Increase the probability for the selected best action
            self.probability_list[argq[0].item()] += 1 - self.cur_eps
            # Use random choice to decide between a random action / best action
            action = torch.tensor(
                [np.random.choice(self.action_list, p=self.probability_list)])

        ###########################
        return action.item(), q.item()

    def optimize_model(self):
        """
        Function to perform optimization on DL Network
        :return: Loss
        """
        # Return if initial buffer is not filled.
        if len(self.replay_buffer.memory) < self.args.mem_init_size:
            return 0
        if self.args.use_pri_buffer:
            batch_state, batch_action, batch_next_state, batch_reward, batch_done, indices, weights = self.replay_buffer.sample(
                self.args.batch_size, beta=self.beta_by_frame(self.t))
        else:
            batch_state, batch_action, batch_next_state, batch_reward, batch_done = self.replay_buffer.sample(
                self.args.batch_size)
        batch_state = Variable(
            self.channel_first(
                torch.tensor(np.array(batch_state), dtype=torch.float32)))
        batch_action = Variable(
            torch.tensor(np.array(batch_action), dtype=torch.long))
        batch_next_state = Variable(
            self.channel_first(
                torch.tensor(np.array(batch_next_state), dtype=torch.float32)))
        batch_reward = Variable(
            torch.tensor(np.array(batch_reward), dtype=torch.float32))
        batch_done = Variable(
            torch.tensor(np.array(batch_done), dtype=torch.float32))
        policy_max_q = self.policy_net(batch_state).gather(
            1, batch_action.unsqueeze(1)).squeeze(1)
        if self.args.use_double_dqn:
            policy_ns_max_q = self.policy_net(batch_next_state)
            next_q_value = self.target_net(batch_next_state).gather(
                1,
                torch.max(policy_ns_max_q, 1)[1].unsqueeze(1)).squeeze(1)
            target_max_q = next_q_value * self.args.gamma * (1 - batch_done)
        else:
            target_max_q = self.target_net(batch_next_state).detach().max(
                1)[0].squeeze(0) * self.args.gamma * (1 - batch_done)
        # Compute Huber loss
        if self.args.use_pri_buffer:
            loss = (policy_max_q -
                    (batch_reward + target_max_q.detach())).pow(2) * Variable(
                        torch.tensor(weights, dtype=torch.float32))
            prios = loss + 1e-5
            loss = loss.mean()
        else:
            loss = self.loss(policy_max_q, batch_reward + target_max_q)

        # Optimize the model
        self.optimizer.zero_grad()
        loss.backward()

        # Clip gradients between -1 and 1
        for param in self.policy_net.parameters():
            param.grad.data.clamp_(-1, 1)

        if self.args.use_pri_buffer:
            self.replay_buffer.update_priorities(indices,
                                                 prios.data.cpu().numpy())

        self.optimizer.step()
        return loss.cpu().detach().numpy()

    def train(self):
        """
        Implement your training algorithm here
        """

        ###########################
        # YOUR IMPLEMENTATION HERE #
        def train_fn():
            self.t = 1
            self.mode = "Random"
            train_start = time.time()
            if not self.args.load_dir == '':
                self.load_model()
            for i_episode in range(1, self.args.max_episodes + 1):
                # Initialize the environment and state
                start_time = time.time()
                state = self.env.reset()
                self.reward_list.append(0)
                self.loss_list.append(0)
                self.max_q_list.append(0)
                self.ep_len = 0
                done = False

                # Save Model
                self.save_model(i_episode)
                # Collect garbage
                self.collect_garbage(i_episode)

                # Run the game
                while not done:
                    # Update the target network, copying all weights and biases in DQN
                    if self.t % self.args.target_update == 0:
                        print("Updating target network . . .")
                        self.target_net.load_state_dict(
                            self.policy_net.state_dict())
                    # Select and perform an action
                    self.cur_eps = max(self.args.eps_min,
                                       self.cur_eps - self.eps_delta)
                    if self.cur_eps == self.args.eps_min:
                        self.mode = 'Exploit'
                    else:
                        self.mode = "Explore"
                    action, q = self.make_action(state)
                    next_state, reward, done, _ = self.env.step(action)
                    self.reward_list[-1] += reward
                    self.max_q_list[-1] = max(self.max_q_list[-1], q)
                    # Store the transition in memory
                    self.replay_buffer.push(state, action, next_state, reward,
                                            done)
                    self.meta.update_step(self.t, self.cur_eps,
                                          self.reward_list[-1],
                                          self.max_q_list[-1],
                                          self.loss_list[-1], self.cur_lr)

                    # Increment step and Episode Length
                    self.t += 1
                    self.ep_len += 1

                    # Move to the next state
                    state = next_state

                    # Perform one step of the optimization (on the target network)
                    if self.ep_len % self.args.learn_freq == 0:
                        loss = self.optimize_model()
                        self.loss_list[-1] += loss
                self.loss_list[-1] /= self.ep_len

                # Decay Step:
                if self.args.lr_scheduler:
                    self.cur_lr = self.scheduler.get_lr()[0]
                    if i_episode % self.args.lr_decay_step == 0 and self.cur_lr > self.args.lr_min:
                        self.scheduler.step(i_episode)

                # Update meta
                self.meta.update_episode(
                    i_episode, self.t,
                    time.time() - start_time,
                    time.time() - train_start, self.ep_len,
                    len(self.replay_buffer.memory),
                    self.cur_eps, self.reward_list[-1],
                    np.mean(self.reward_list), self.max_q_list[-1],
                    np.mean(self.max_q_list), self.loss_list[-1],
                    np.mean(self.loss_list), self.mode, self.cur_lr)

        import multiprocessing as mp
        processes = []
        for rank in range(4):
            p = mp.Process(target=train_fn)
            p.start()
            processes.append(p)
        for p in processes:
            p.join()
Esempio n. 3
0
class Agent_DQN():
    def __init__(self, env, test=False):
        self.cuda = torch.device('cuda')
        print("Using device: " + torch.cuda.get_device_name(self.cuda),
              flush=True)

        self.env = env
        self.state_shape = env.observation_space.shape
        self.n_actions = env.action_space.n

        self.memory = deque(maxlen=100000)
        self.batch_size = 32
        self.mem_threshold = 50000

        self.gamma = 0.99

        self.learning_rate = 1e-4

        self.epsilon = 1.0
        self.epsilon_min = 0.05
        self.epsilon_period = 10000
        self.epsilon_decay = (self.epsilon -
                              self.epsilon_min) / self.epsilon_period

        self.update_rate = 4

        self.start_epoch = 1
        self.epochs = 10
        self.epoch = 10000

        self.model = DQN(self.state_shape, self.n_actions).to(self.cuda)
        print("DQN parameters: {}".format(count_parameters(self.model)))

        self.target = DQN(self.state_shape, self.n_actions).to(self.cuda)
        self.target.eval()
        self.target_update = 10000

        self.optimizer = optim.Adam(self.model.parameters(),
                                    lr=self.learning_rate)
        if test:
            self.model.load_state_dict(torch.load('model.pt'))

    def init_game_setting(self):
        pass

    def make_action(self, observation, test=False):
        epsilon = 0.01 if test else self.epsilon
        # turn action into tensor
        observation = torch.tensor(observation,
                                   device=self.cuda,
                                   dtype=torch.float)
        # turn off learning
        self.model.eval()
        # epsilon greedy policy
        if random.random() > epsilon:
            # no need to calculate gradient
            with torch.no_grad():
                # choose highest value action
                b = self.model(observation)
                b = b.cpu().data.numpy()
                action = np.random.choice(
                    np.flatnonzero(np.isclose(b, b.max())))
        else:
            # random action
            action = random.choice(np.arange(self.n_actions))
        # turn learning back on
        self.model.train()
        return action

    def replay_buffer(self):
        # Return tuple of sars transitions
        states, actions, rewards, next_states, dones = zip(
            *random.sample(self.memory, self.batch_size))
        states = torch.tensor(np.vstack(states),
                              device=self.cuda,
                              dtype=torch.float)
        actions = torch.tensor(np.array(actions),
                               device=self.cuda,
                               dtype=torch.long)
        rewards = torch.tensor(np.array(rewards, dtype=np.float32),
                               device=self.cuda,
                               dtype=torch.float)
        next_states = torch.tensor(np.vstack(next_states),
                                   device=self.cuda,
                                   dtype=torch.float)
        dones = torch.tensor(np.array(dones, dtype=np.float32),
                             device=self.cuda,
                             dtype=torch.float)
        return states, actions, rewards, next_states, dones

    def experience_replay(self, n=0):
        # clamp gradient
        clamp = False
        # Reset gradient (because it accumulates by default)
        self.optimizer.zero_grad()
        # sample experience memory
        states, actions, rewards, next_states, dones = self.replay_buffer()
        # get Q(s,a) for sample
        Q = self.model(states).gather(1, actions.unsqueeze(-1)).squeeze(-1)
        # get max_a' Q(s',a')
        Q_prime = self.target(next_states).detach().max(1)[0]
        # calculate y = r + gamma * max_a' Q(s',a') for non-terminal states
        Y = rewards + (self.gamma * Q_prime) * (1 - dones)
        # Huber loss of Q and Y
        loss = F.smooth_l1_loss(Q, Y)
        # Compute dloss/dx
        loss.backward()
        # Clamp gradient
        if clamp:
            for param in self.model.parameters():
                param.grad.data.clamp_(-1, 1)
        # Change the weights
        self.optimizer.step()

    def train(self):
        step = 0
        learn_step = 0
        print("Begin Training:", flush=True)
        learn_curve = []
        last30 = deque(maxlen=30)
        for epoch in range(self.start_epoch, self.epochs + 1):
            durations = []
            rewards = []
            flag = []
            # progress bar
            epoch_bar = tqdm(range(self.epoch), total=self.epoch, ncols=200)
            for episode in epoch_bar:
                # reset state
                state = self.env.reset()
                # decay epsilon
                if self.epsilon > self.epsilon_min:
                    self.epsilon -= self.epsilon_decay
                # run one episode
                done = False
                ep_duration = 0
                ep_reward = 0
                while not done:
                    step += 1
                    ep_duration += 1
                    # get epsilon-greedy action
                    action = self.make_action(state)
                    # do action
                    next_state, reward, done, info = self.env.step(action)
                    ep_reward += reward
                    # add transition to replay memory
                    self.memory.append(
                        Transition(state, action, reward, next_state, done))
                    state = next_state
                    # learn from experience, if available
                    if step % self.update_rate == 0 and len(
                            self.memory) > self.mem_threshold:
                        self.experience_replay(learn_step)
                        learn_step += 1
                    # update target network
                    if step % self.target_update == 1:
                        self.target.load_state_dict(self.model.state_dict())

                durations.append(ep_duration)
                rewards.append(ep_reward)
                last30.append(ep_reward)
                learn_curve.append(np.mean(last30))
                flag.append(info['flag_get'])
                epoch_bar.set_description(
                    "epoch {}/{}, avg duration = {:.2f}, avg reward = {:.2f}, last30 = {:2f}"
                    .format(epoch, self.epochs, np.mean(durations),
                            np.mean(rewards), learn_curve[-1]))
            # save model every epoch
            plt.clf()
            plt.plot(learn_curve)
            plt.title(f"DQN Epoch {epoch} with {save_prefix} Reward")
            plt.xlabel('Episodes')
            plt.ylabel('Moving Average Reward')
            if not os.path.exists(f"{save_prefix}_DQN"):
                os.mkdir(f"{save_prefix}_DQN")
            torch.save(self.model.state_dict(),
                       f'{save_prefix}_DQN/DQN_model_ep{epoch}.pt')
            pickle.dump(
                rewards,
                open(f"{save_prefix}_DQN/DQN_reward_ep{epoch}.pkl", 'wb'))
            pickle.dump(flag,
                        open(f"{save_prefix}_DQN/flag_ep{epoch}.pkl", 'wb'))
            plt.savefig(f"{save_prefix}_DQN/epoch{epoch}.png")
            learn_curve = []
Esempio n. 4
0
    0, 101,
    U_num).tolist()  #边缘节点位置 1-100号 (so we suppose that node_num == U_num???)

user_loc = np.random.randint(0, 101, U_num).tolist()  #用户位置 1-100号
user_dis = random_displacement(user_loc)  #用户未来位移 上下左右 -10,10,-1,1
use_buff = np.random.randint(3, 8, U_num).tolist()  #资源所需
state0 = [user_loc, user_dis, node_loc, use_buff]

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

#主程序部分

policy_net = DQN(U_num, num_actions).to(device)  #初始化Q网络
target_net = DQN(U_num, num_actions).to(device)  #初始化target_Q网络
target_net.load_state_dict(policy_net.state_dict())  #用Q网络的参数初始化target_Q网络
target_net.eval()
optimizer_policy = torch.optim.Adam(policy_net.parameters(),
                                    lr=learning_rate)  #定义优化器Adam,可以更换
buffer = ReplayBuffer(
    buffer_size)  #定义一个经验池  PS:经验池储存经验数据,后随机从经验池中抽取经验数据来训练更新网络参数 在Buffer.py中
criterion = torch.nn.MSELoss(reduction='sum')

# training
for i_episode in range(num_episodes):

    #state0 #获得一个初始化状态

    for t in count():
        # 选择动作
        action = e_greedy_select_action(state0)
        print("action selected by e_greedy is {}".format(action))
Esempio n. 5
0
class Agent():
    """
    RL Agent that interacts with a given environment, learns and adapts succesfull behaviour.
    """


    def __init__(self,state_size, action_size
                 ,batch_size,learn_step_size,buffer_size
                 ,gamma , learning_rate, tau
                 ,seed):
        """
        Intialize the agent and its learning parameter set.

        Parameters
        =========
        state_size (int): Size of the state space
        action_size (int): Size of the action space

        batch_size (int): Size of the batch size used in each learning step
        learn_step_size (int): Number of steps until agent ist trained again
        buffer_size (int): Size of replay memory buffer

        gamma (float): Discount rate that scales future discounts
        learning_rate (float): Learning rate of neural network
        tau (float): Update strenght between local and target network

        seed (float): Random set for initialization
        """

        # ----- Parameter init -----
        # State and action size from environment
        self.state_size = state_size
        self.action_size = action_size

        # Replay buffer and learning properties
        self.batch_size      = batch_size
        self.learn_step_size = learn_step_size
        self.gamma = gamma
        self.tau  = tau

        # General
        self.seed = random.seed(seed)


        # ----- Network and memory init -----
        # Init identical NN as local and target networks and set optimizer
        self.qnetwork_local  = DQN(state_size, action_size, seed).to(device)
        self.qnetwork_target = DQN(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=learning_rate)

        # Initialize replay memory and time step (for updating every learn_step_size steps)
        self.memory = ReplayBuffer(action_size, buffer_size, batch_size, seed)
        self.t_step = 0


    def step(self, state, action, reward, next_state, done):
        """
        Append information of past step in memory and trigger learning.

        Parameters
        ==========
        state (array_like): State before action
        action (array_like): Action that was taken
        reward (float): Reward for action
        next_state (array_like): State after action
        done (bool): Indicator if env was solved after action
        """

        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every learn_step_size time steps.
        self.t_step = (self.t_step + 1) % self.learn_step_size
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if self.memory.get_memory_size()  > self.batch_size:
                self.learn()


    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Parameters
        ==========
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        # Transform state to PyTorch tensor
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        # Get action scores for state from network
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self):
        """
        Get sample of experience tuples and value parameters target network.
        """

        # Get tuples from experience buffer
        experiences = self.memory.get_sample()
        states, actions, rewards, next_states, dones = experiences

        #  -----DQN -----
        #Optional: to be replaced with Double DQN (see below)
        #Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)

        # ----- Double DQN -----
        # Detach to not update weights during learning
        # Select maximum value
        # Unsqueeze to reduce the tensor dimension to one
        expected_next_actions = self.qnetwork_local(next_states).detach().max(1)[1].unsqueeze(1)
        # Get Q values for next actions from target Q-network
        Q_targets_next = self.qnetwork_target(next_states).detach().gather(1, expected_next_actions)

        # Compute Q targets for current states
        Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones))
        # Get expected Q values from local model
        # Gather values alon an axis specified by dim
        Q_expected = self.qnetwork_local(states).gather(1, actions)


        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ----- Update target network -----
        #Soft update model parameters.
        #θ_target = τ*θ_local + (1 - τ)*θ_target
        for target_param, local_param in zip(self.qnetwork_target.parameters(), self.qnetwork_local.parameters()):
            target_param.data.copy_(self.tau*local_param.data + (1.0-self.tau)*target_param.data)
Esempio n. 6
0
class Agent_DQN(Agent):
    def __init__(self, env, args):
        """
        Initialize everything you need here.
        For example: 
            paramters for neural network  
            initialize Q net and target Q net
            parameters for repaly buffer
            parameters for q-learning; decaying epsilon-greedy
            ...
        """

        super(Agent_DQN, self).__init__(env)
        ###########################
        # YOUR IMPLEMENTATION HERE #
        self.env = env
        self.args = args
        self.gamma = self.args.gamma
        self.batch_size = self.args.batch_size
        self.memory_cap = self.args.memory_cap
        self.n_episode = self.args.n_episode
        self.lr = self.args.learning_rate

        self.epsilon = self.args.epsilon
        self.epsilon_decay_window = self.args.epsilon_decay_window
        self.epsilon_min = self.args.epsilon_min
        self.epsilon_decay = (self.epsilon -
                              self.epsilon_min) / self.epsilon_decay_window

        self.n_step = self.args.n_step
        self.f_update = self.args.f_update
        self.load_model = self.args.load_model
        self.action_size = self.args.action_size
        #         self.algorithm = self.args.algorithm

        self.use_cuda = torch.cuda.is_available()
        self.device = torch.device("cuda" if self.use_cuda else "cpu")
        print('using device ', torch.cuda.get_device_name(0))
        self.FloatTensor = torch.cuda.FloatTensor if self.use_cuda else torch.FloatTensor
        self.LongTensor = torch.cuda.LongTensor if self.use_cuda else torch.LongTensor
        self.ByteTensor = torch.cuda.ByteTensor if self.use_cuda else torch.ByteTensor
        self.Tensor = self.FloatTensor

        # Create the policy net and the target net
        self.policy_net = DQN()
        self.policy_net.to(self.device)
        #         if self.algorithm == 'DDQN':
        #             self.policy_net_2 = DQN()
        #             self.policy_net_2.to(self.device)
        self.target_net = DQN()
        self.target_net.to(self.device)
        self.policy_net.train()
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()
        self.optimizer = optim.Adam(params=self.policy_net.parameters(),
                                    lr=self.lr)
        # buffer
        self.memory = []

        ##
        self.mean_window = 100
        self.print_frequency = 100
        self.out_dir = "DQN_Module_b1_1/"

        if args.test_dqn:
            #you can load your model here
            print('loading trained model')
            ###########################
            # YOUR IMPLEMENTATION HERE #
            self.policy_net.load_state_dict(
                torch.load('model.pth', map_location=self.device))
            self.target_net.load_state_dict(self.policy_net.state_dict())
            if self.algorithm == 'DDQN':
                self.policy_net_2.load_state_dict(
                    torch.load('model.pth', map_location=self.device))
            self.print_test = True

    def init_game_setting(self):
        """
        Testing function will call this function at the begining of new game
        Put anything you want to initialize if necessary.
        If no parameters need to be initialized, you can leave it as blank.
        """
        ###########################
        # YOUR IMPLEMENTATION HERE #

        ###########################
        pass

    def make_action(self, observation, test=False):
        """
        Return predicted action of your agent
        Input:
            observation: np.array
                stack 4 last preprocessed frames, shape: (84, 84, 4)
        Return:
            action: int
                the predicted action from trained model
        """
        ###########################
        # YOUR IMPLEMENTATION HERE #
        if test:
            self.epsilon = self.epsilon_min * 0.5
            observation = observation / 255.
        else:
            self.epsilon = max(self.epsilon - self.epsilon_decay,
                               self.epsilon_min)
        if random.random() > self.epsilon:
            observation = self.Tensor(observation.reshape(
                (1, 84, 84, 4))).transpose(1, 3).transpose(2, 3)
            state_action_value = self.policy_net(
                observation).data.cpu().numpy()
            action = np.argmax(state_action_value)
        else:
            action = random.randint(0, self.action_size - 1)
        ###########################
        return action

    def push(self, state, action, reward, next_state, done):
        """ You can add additional arguments as you need. 
        Push new data to buffer and remove the old one if the buffer is full.
        
        Hints:
        -----
            you can consider deque(maxlen = 10000) list
        """
        ###########################
        # YOUR IMPLEMENTATION HERE #
        if len(self.memory) >= self.memory_cap:
            self.memory.pop(0)
        self.memory.append((state, action, reward, next_state, done))
        ###########################

    def replay_buffer(self):
        """ You can add additional arguments as you need.
        Select batch from buffer.
        """
        ###########################
        # YOUR IMPLEMENTATION HERE #
        self.mini_batch = random.sample(self.memory, self.batch_size)
        ###########################
        return

    def train(self):
        """
        Implement your training algorithm here
        """
        ###########################
        # YOUR IMPLEMENTATION HERE #
        self.steps_done = 0
        self.steps = []
        self.rewards = []
        self.mean_rewards = []
        self.time = []
        self.best_reward = 0
        self.last_saved_reward = 0
        self.start_time = time.time()
        print('train')
        # continue training from where it stopped
        if self.load_model:
            self.policy_net.load_state_dict(
                torch.load(self.out_dir + 'model.pth',
                           map_location=self.device))
            self.target_net.load_state_dict(self.policy_net.state_dict())
            self.epsilon = self.epsilon_min
            print('Loaded')
        for episode in range(self.n_episode):
            # Initialize the environment and state
            state = self.env.reset() / 255.
            #             self.last_life = 5
            total_reward = 0
            self.step = 0
            done = False

            while (not done) and self.step < 10000:
                # move to next state
                self.step += 1
                self.steps_done += 1
                action = self.make_action(state)
                next_state, reward, done, life = self.env.step(action)
                # lives matter
                #                 self.now_life = life['ale.lives']
                #                 dead = self.now_life < self.last_life
                #                 self.last_life = self.now_life
                next_state = next_state / 255.
                # Store the transition in memory
                self.push(state, action, reward, next_state, done)
                state = next_state
                total_reward += reward

                if done:
                    self.rewards.append(total_reward)
                    self.mean_reward = np.mean(
                        self.rewards[-self.mean_window:])
                    self.mean_rewards.append(self.mean_reward)
                    self.time.append(time.time() - self.start_time)
                    self.steps.append(self.step)

                    # print the process to terminal
                    progress = "episode: " + str(
                        episode) + ",\t epsilon: " + str(
                            self.epsilon
                        ) + ",\t Current mean reward: " + "{:.2f}".format(
                            self.mean_reward)
                    progress += ',\t Best mean reward: ' + "{:.2f}".format(
                        self.best_reward) + ",\t time: " + time.strftime(
                            '%H:%M:%S', time.gmtime(self.time[-1]))
                    print(progress)

                    if episode % self.print_frequency == 0:
                        self.print_and_plot()
                    # save the best model
                    if self.mean_reward > self.best_reward and len(
                            self.memory) >= 5000:
                        print('~~~~~~~~~~<Model updated with best reward = ',
                              self.mean_reward, '>~~~~~~~~~~')
                        checkpoint_path = self.out_dir + 'model.pth'
                        torch.save(self.policy_net.state_dict(),
                                   checkpoint_path)
                        self.last_saved_reward = self.mean_reward
                        self.best_reward = self.mean_reward

                if len(self.memory) >= 5000 and self.steps_done % 4 == 0:
                    #                     if self.algorithm == 'DQN':
                    self.optimize_DQN()
                if self.steps_done % self.f_update == 0:
                    self.target_net.load_state_dict(
                        self.policy_net.state_dict())
#                     print('-------<target net updated at step,',self.steps_done,'>-------')

###########################

    def optimize_DQN(self):
        # sample
        self.replay_buffer()
        state, action, reward, next_state, done = zip(*self.mini_batch)

        # transfer 1*84*84*4 to 1*4*84*84, which is 0,3,1,2
        state = self.Tensor(np.float32(state)).permute(0, 3, 1,
                                                       2).to(self.device)
        action = self.LongTensor(action).to(self.device)
        reward = self.Tensor(reward).to(self.device)
        next_state = self.Tensor(np.float32(next_state)).permute(
            0, 3, 1, 2).to(self.device)

        done = self.Tensor(done).to(self.device)

        # Compute Q(s_t, a)
        state_action_values = self.policy_net(state).gather(
            1, action.unsqueeze(1)).squeeze(1)
        # Compute next Q, including the mask
        next_state_values = self.target_net(next_state).detach().max(1)[0]
        # Compute the expected Q value. stop update if done
        expected_state_action_values = reward + (next_state_values *
                                                 self.gamma) * (1 - done)
        # Compute Huber loss
        self.loss = F.smooth_l1_loss(state_action_values,
                                     expected_state_action_values.data)
        # Optimize the model
        self.optimizer.zero_grad()
        self.loss.backward()
        self.optimizer.step()
        return

    def print_and_plot(self):
        fig1 = plt.figure(1)
        plt.clf()
        plt.title('Training...')
        plt.xlabel('Episode')
        plt.ylabel('Steps')
        plt.plot(self.steps)
        fig1.savefig(self.out_dir + 'steps.png')

        fig2 = plt.figure(2)
        plt.clf()
        plt.title('Training...')
        plt.xlabel('Episode')
        plt.ylabel('Reward')
        plt.plot(self.mean_rewards)
        fig2.savefig(self.out_dir + 'rewards.png')

        fig2 = plt.figure(3)
        plt.clf()
        plt.title('Training...')
        plt.xlabel('Episode')
        plt.ylabel('Time')
        plt.plot(self.time)
        fig2.savefig(self.out_dir + 'time.png')
Esempio n. 7
0
class Agent_DQN(Agent):
    def __init__(self, env, args):
        """
        Initialize everything you need here.
        For example: 
            paramters for neural network  
            initialize Q net and target Q net
            parameters for repaly buffer
            parameters for q-learning; decaying epsilon-greedy
            ...
        """
        super(Agent_DQN, self).__init__(env)
        ###########################
        # YOUR IMPLEMENTATION HERE #
        # import arguments
        self.args = args
        self.env = env
        self.batch_size = self.args.batch_size
        self.gamma = self.args.gamma
        self.lr = self.args.learning_rate
        self.memory_cap = self.args.memory_cap
        self.n_episode = self.args.n_episode
        self.n_step = self.args.n_step
        self.update_f = self.args.update_f
        self.explore_step = self.args.explore_step
        self.action_size = self.args.action_size
        self.algorithm = self.args.algorithm
        self.save_path = "dqn/"
        print('using algorithm ', self.algorithm)

        # whether continue training
        self.load_model = self.args.load_model

        # unify tensor tpye according to device names
        self.use_cuda = torch.cuda.is_available()
        self.device = torch.device("cuda" if self.use_cuda else "cpu")
        print('using device ', torch.cuda.get_device_name(0))
        self.FloatTensor = torch.cuda.FloatTensor if self.use_cuda else torch.FloatTensor
        self.LongTensor = torch.cuda.LongTensor if self.use_cuda else torch.LongTensor
        self.ByteTensor = torch.cuda.ByteTensor if self.use_cuda else torch.ByteTensor
        self.Tensor = self.FloatTensor  # default type

        # epsilon decay
        self.epsilon = 1.0
        self.epsilon_min = 0.025
        self.epsilon_decay = (self.epsilon -
                              self.epsilon_min) / self.explore_step

        # Create the policy net and the target net
        self.policy_net = DQN()
        self.policy_net.to(self.device)
        if self.algorithm == 'DDQN':
            self.policy_net_2 = DQN()
            self.policy_net_2.to(self.device)
        self.target_net = DQN()
        self.target_net.to(self.device)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()

        # replay buffer
        self.memory = []

        # optimizer
        self.optimizer = optim.Adam(params=self.policy_net.parameters(),
                                    lr=self.lr)
        if self.algorithm == 'DDQN':
            self.optimizer_2 = optim.Adam(
                params=self.policy_net_2.parameters(), lr=self.lr)

        # other
        self.f_skip = 4  # frame skip
        self.n_avg_reward = 100
        self.f_print = 100
        self.print_test = False

        if args.test_dqn:
            #you can load your model here
            print('loading trained model')
            ###########################
            # YOUR IMPLEMENTATION HERE #
            self.policy_net.load_state_dict(
                torch.load('model.pth', map_location=self.device))
            self.target_net.load_state_dict(self.policy_net.state_dict())
            if self.algorithm == 'DDQN':
                self.policy_net_2.load_state_dict(
                    torch.load('model.pth', map_location=self.device))
            self.print_test = True

    def init_game_setting(self):
        """
        Testing function will call this function at the begining of new game
        Put anything you want to initialize if necessary.
        If no parameters need to be initialized, you can leave it as blank.
        """
        ###########################
        # YOUR IMPLEMENTATION HERE #
        state = self.env.reset() / 255.
        self.last_life = 5
        self.step = 0
        done = False
        total_reward = 0
        ###########################
        return state, done, total_reward

    def make_action(self, observation, test=False):
        """
        Return predicted action of your agent
        Input:
            observation: np.array
                stack 4 last preprocessed frames, shape: (84, 84, 4)
        Return:
            action: int
                the predicted action from trained model
        """
        ###########################
        # YOUR IMPLEMENTATION HERE #
        if test:
            self.epsilon = self.epsilon_min
            observation = observation / 255.
        else:
            self.epsilon = max(self.epsilon - self.epsilon_decay,
                               self.epsilon_min)
        if random.random() > self.epsilon:
            observation = self.Tensor(observation.reshape(
                (1, 84, 84, 4))).transpose(1, 3).transpose(2, 3)
            state_action_value = self.policy_net(
                observation).data.cpu().numpy()
            action = np.argmax(state_action_value)
        else:
            action = random.randint(0, self.action_size - 1)
        ###########################
        return action

    def push(self, state, action, reward, next_state, dead, done):
        """ You can add additional arguments as you need. 
        Push new data to buffer and remove the old one if the buffer is full.
        
        Hints:
        -----
            you can consider deque(maxlen = 10000) list
        """
        ###########################
        # YOUR IMPLEMENTATION HERE #
        if len(self.memory) >= self.memory_cap:
            self.memory.pop(0)
        self.memory.append((state, action, reward, next_state, dead, done))
        ###########################

    def replay_buffer(self):
        """ You can add additional arguments as you need.
        Select batch from buffer.
        """
        ###########################
        # YOUR IMPLEMENTATION HERE #
        self.mini_batch = random.sample(self.memory, self.batch_size)
        ###########################
        return

    def train(self):
        """
        Implement your training algorithm here
        """
        ###########################
        # YOUR IMPLEMENTATION HERE #
        # initialize
        self.steps_done = 0
        self.steps = []
        self.rewards = []
        self.mean_rewards = []
        self.best_reward = 0
        self.last_saved_reward = 0

        start = time.time()
        logfile = open('dqn.log', 'w+')
        # continue training
        if self.load_model:
            self.policy_net.load_state_dict(
                torch.load(self.save_path + 'model.pth',
                           map_location=self.device))
            self.target_net.load_state_dict(self.policy_net.state_dict())
            self.epsilon = self.epsilon_min

        for episode in range(self.n_episode):
            state, done, total_reward = self.init_game_setting()
            while (not done) and self.step < 10000:
                # move to next state
                self.step += 1
                self.steps_done += 1
                action = self.make_action(state)
                next_state, reward, done, life = self.env.step(action)
                # lives matter
                now_life = life['ale.lives']
                dead = (now_life < self.last_life)
                self.last_life = now_life
                next_state = next_state / 255.
                # Store the transition in memory
                self.push(state, action, reward, next_state, dead, done)
                state = next_state
                total_reward += reward

                if len(self.memory
                       ) >= self.n_step and self.steps_done % self.f_skip == 0:
                    if self.algorithm == 'DQN':
                        self.optimize_DQN()
                    elif self.algorithm == 'DDQN':
                        self.optimize_DDQN()
                if self.steps_done % self.update_f == 0:
                    self.target_net.load_state_dict(
                        self.policy_net.state_dict())

            self.rewards.append(total_reward)
            self.mean_reward = np.mean(self.rewards[-self.n_avg_reward:])
            self.mean_rewards.append(self.mean_reward)
            self.steps.append(self.step)
            # print progress in terminal
            progress = "Episode: " + str(
                episode) + ",\tCurrent mean reward: " + "{:.2f}".format(
                    self.mean_reward
                ) + ',\tBest mean reward: ' + "{:.2f}".format(self.best_reward)
            progress += ",\tCurerent Reward: " + str(
                total_reward) + ",\tTime: " + time.strftime(
                    '%H:%M:%S', time.gmtime(time.time() - start))
            print(progress)
            print(episode,
                  self.mean_reward,
                  self.best_reward,
                  total_reward,
                  time.time() - start,
                  file=logfile)
            logfile.flush()
            if (episode + 1) % self.f_print == 0:
                self.plots()
            # save the best model
            if self.mean_reward > self.best_reward and self.steps_done > self.n_step:
                checkpoint_path = self.save_path + 'model.pth'
                torch.save(self.policy_net.state_dict(), checkpoint_path)
                self.last_saved_reward = self.mean_reward
                self.best_reward = max(self.mean_reward, self.best_reward)
        ###########################

    def optimize_DQN(self):
        # sample
        self.replay_buffer()
        state, action, reward, next_state, dead, done = zip(*self.mini_batch)

        state = self.Tensor(np.float32(state)).permute(0, 3, 1,
                                                       2).to(self.device)
        action = self.LongTensor(action).to(self.device)
        reward = self.Tensor(reward).to(self.device)
        next_state = self.Tensor(np.float32(next_state)).permute(
            0, 3, 1, 2).to(self.device)
        dead = self.Tensor(dead).to(self.device)
        done = self.Tensor(done).to(self.device)

        # Compute Q(s_t, a)
        state_action_values = self.policy_net(state).gather(
            1, action.unsqueeze(1)).squeeze(1)
        # Compute next Q, including the mask
        next_state_values = self.target_net(next_state).detach().max(1)[0]
        # Compute the expected Q value. stop update if done
        expected_state_action_values = reward + (next_state_values *
                                                 self.gamma) * (1 - done)
        # Compute Huber loss
        self.loss = F.smooth_l1_loss(state_action_values,
                                     expected_state_action_values.data)
        # Optimize the model
        self.optimizer.zero_grad()
        self.loss.backward()
        self.optimizer.step()
        return

    def optimize_DDQN(self):
        # sample
        self.replay_buffer()
        state, action, reward, next_state, dead, done = zip(*self.mini_batch)

        # transfer 1*84*84*4 to 1*4*84*84, which is 0,3,1,2
        state = self.Tensor(np.float32(state)).permute(0, 3, 1,
                                                       2).to(self.device)
        action = self.LongTensor(action).to(self.device)
        reward = self.Tensor(reward).to(self.device)
        next_state = self.Tensor(np.float32(next_state)).permute(
            0, 3, 1, 2).to(self.device)
        dead = self.Tensor(dead).to(self.device)
        done = self.Tensor(done).to(self.device)

        # Compute Q(s_t, a)
        state_action_values = self.policy_net(state).gather(
            1, action.unsqueeze(1)).squeeze(1)
        state_action_values_2 = self.policy_net_2(state).gather(
            1, action.unsqueeze(1)).squeeze(1)
        # Compute next Q, including the mask
        next_state_values = self.target_net(next_state).detach().max(1)[0]
        next_state_values_2 = self.target_net(next_state).detach().max(1)[0]
        next_state_values = torch.min(next_state_values, next_state_values_2)
        # Compute the expected Q value. stop update if done
        expected_state_action_values = reward + (next_state_values *
                                                 self.gamma) * (1 - done)
        # Compute Huber loss
        self.loss = F.smooth_l1_loss(state_action_values,
                                     expected_state_action_values.data)
        self.loss_2 = F.smooth_l1_loss(state_action_values_2,
                                       expected_state_action_values.data)
        # Optimize the model
        self.optimizer.zero_grad()
        self.loss.backward()
        self.optimizer.step()
        self.optimizer_2.zero_grad()
        self.loss_2.backward()
        self.optimizer_2.step()
        return

    def plots(self):
        fig1 = plt.figure(1)
        plt.clf()
        plt.title('Training_Steps_per_Episode')
        plt.xlabel('Episode')
        plt.ylabel('Steps')
        plt.plot(self.steps)
        fig1.savefig(self.save_path + 'steps.png')

        fig2 = plt.figure(2)
        plt.clf()
        plt.title('Training...')
        plt.xlabel('Episode')
        plt.ylabel('Reward')
        plt.plot(self.rewards)

        if len(self.rewards) >= self.n_avg_reward:
            plt.plot(self.mean_rewards)
        fig2.savefig(self.save_path + 'rewards.png')

        rewards = np.array(self.rewards)
        np.save(self.save_path + 'rewards.npy', rewards)
Esempio n. 8
0
class Agent_DQN(Agent):
    def __init__(self, env, args):
        """
        Initialize every things you need here.
        For example: building your model
        """

        super(Agent_DQN, self).__init__(env)

        if args.test_dqn:
            # you can load your model here
            print('loading trained model')

        ##################
        # YOUR CODE HERE #
        ##################
        self.env = env
        self.batch_size = BATCH_SIZE
        self.gamma = GAMMA
        self.eps_start = EPS_START
        self.eps_decay = EPS_DECAY
        self.TARGET_UPDATE = TARGET_UPDATE

        self.policy_net = DQN(self.env.action_space.n)
        self.target_net = DQN(self.env.action_space.n)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()
        self.policy_net.to(device)
        self.target_net.to(device)
        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=1.5e-4)
        self.memory = ReplayMemory(10000)

        if args.test_dqn:
            # you can load your model here
            print('loading trained model')
            self.policy_net.load_state_dict(
                torch.load(os.path.join('save_dir/'
                                        'model-best.pth'),
                           map_location=torch.device('cpu')))
            self.policy_net.eval()

    def init_game_setting(self):
        """

        Testing function will call this function at the begining of new game
        Put anything you want to initialize if necessary

        """
        ##################
        # YOUR CODE HERE #
        ##################
        pass

    def train(self):
        """
        Implement your training algorithm here
        """
        ##################
        # YOUR CODE HERE #
        ##################
        logfile = open('simple_dqn.log', 'w+')
        step = 0
        num_episodes = 1400000
        for i_episode in range(num_episodes):
            # Initialize the environment and state
            observation = self.env.reset()
            observation = observation.transpose((2, 0, 1))
            observation = observation[np.newaxis, :]
            state = observation
            sum_reward = 0
            for t in count():
                # Select and perform an action
                action = self.make_action(state, test=False)
                next_state, reward, done, _ = self.env.step(action.item())
                reward = np.clip(reward, -1., 1.)
                next_state = next_state.transpose((2, 0, 1))
                next_state = next_state[np.newaxis, :]
                sum_reward += reward
                reward = Tensor([reward])
                step += 1

                # Store the transition in memory
                self.memory.push(torch.from_numpy(state), action,
                                 torch.from_numpy(next_state), reward)

                # Observe new state
                if not done:
                    state = next_state
                else:
                    state = None

                if step >= 5000 and step % 5000 == 0:
                    self.optimize_model()
                    self.target_net.load_state_dict(
                        self.policy_net.state_dict())
                    # Perform one step of the optimization (on the target network)

                if done:
                    print(
                        'resetting env. episode %d \'s step=%d reward total was %d.'
                        % (i_episode + 1, step, sum_reward))
                    print(
                        'resetting env. episode %d \'s step=%d reward total was %d.'
                        % (i_episode + 1, step, sum_reward),
                        file=logfile)
                    logfile.flush()
                    break

            # Update the target network
            # if i_episode % TARGET_UPDATE == 0:
            #     print("Update the target net.")
            #     # print(self.policy_net.state_dict())
            #     self.target_net.load_state_dict(self.policy_net.state_dict())
            if i_episode % 50 == 0:
                checkpoint_path = os.path.join('save_dir', 'model-best.pth')
                torch.save(self.policy_net.state_dict(), checkpoint_path)
                print("model saved to {}".format(checkpoint_path))

    def make_action(self, observation, test=True):
        """
        Return predicted action of your agent

        Input:
            observation: np.array
                stack 4 last preprocessed frames, shape: (84, 84, 4)

        Return:
            action: int
                the predicted action from trained model
        """
        ##################
        # YOUR CODE HERE #
        ##################
        global steps_done
        if test:
            observation = observation.transpose((2, 0, 1))
            observation = observation[np.newaxis, :]
            # self.policy_net.eval()
            return self.policy_net(
                Variable(torch.from_numpy(observation),
                         volatile=True).type(FloatTensor)).data.max(1)[1].view(
                             1, 1).item()
        else:
            self.policy_net.eval()
            sample = random.random()
            eps_threshold = EPS_END + (EPS_START - EPS_END) * \
                math.exp(-1. * steps_done / EPS_DECAY)
            steps_done += 1
            if sample > eps_threshold:
                return self.policy_net(
                    Variable(
                        torch.from_numpy(observation),
                        volatile=True).type(FloatTensor)).data.max(1)[1].view(
                            1, 1)
            else:
                return LongTensor([[random.randrange(self.env.action_space.n)]
                                   ])

    def optimize_model(self):
        if len(self.memory) < BATCH_SIZE:
            return
        transitions = self.memory.sample(BATCH_SIZE)
        # Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for
        # detailed explanation). This converts batch-array of Transitions
        # to Transition of batch-arrays.
        batch = Transition(*zip(*transitions))

        # Compute a mask of non-final states and concatenate the batch elements
        # (a final state would've been the one after which simulation ended)
        non_final_mask = torch.tensor(tuple(
            map(lambda s: s is not None, batch.next_state)),
                                      device=device,
                                      dtype=torch.bool)
        non_final_next_states = torch.cat(
            [s for s in batch.next_state if s is not None]).to(device)
        state_batch = torch.cat(batch.state).to(device)
        action_batch = torch.cat(batch.action).to(device)
        reward_batch = torch.cat(batch.reward).to(device)

        # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
        # columns of actions taken. These are the actions which would've been taken
        # for each batch state according to policy_net
        state_action_values = self.policy_net(state_batch.float()).gather(
            1, action_batch)

        # Compute V(s_{t+1}) for all next states.
        # Expected values of actions for non_final_next_states are computed based
        # on the "older" target_net; selecting their best reward with max(1)[0].
        # This is merged based on the mask, such that we'll have either the expected
        # state value or 0 in case the state was final.
        next_state_values = torch.zeros(BATCH_SIZE, device=device)
        next_state_values[non_final_mask] = self.target_net(
            non_final_next_states.float()).max(1)[0].detach()
        # Compute the expected Q values
        expected_state_action_values = (next_state_values *
                                        GAMMA) + reward_batch

        # Compute Huber loss
        loss = F.smooth_l1_loss(state_action_values,
                                expected_state_action_values.unsqueeze(1))

        # Optimize the model
        self.optimizer.zero_grad()
        loss.backward()
        for param in self.policy_net.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer.step()
Esempio n. 9
0
class DQNAgent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, seed, alpha, gamma, tau):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.alpha = alpha
        self.gamma = gamma
        self.tau = tau

        # Q Learning Network
        self.qnetwork_local = DQN(state_size, action_size, seed).to(device)
        self.qnetwork_target = DQN(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(),
                                    lr=self.alpha)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.fill_replay_buffer(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if self.memory.__len__() > BATCH_SIZE:
                experiences = self.memory.get_sample_replay_buffer()
                self.learn_DDQN(experiences, self.gamma, self.alpha, self.tau)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn_DDQN(self, experiences, gamma, alpha, tau):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences
        # Get index of maximum value for next state from Q_expected
        Q_argmax = self.qnetwork_local(next_states).detach()
        _, a_prime = Q_argmax.max(1)
        #print (self.qnetwork_local(states).detach())
        # Get max predicted Q values (for next states) from target model
        Q_targets_next = self.qnetwork_target(next_states).detach().gather(
            1, a_prime.unsqueeze(1))
        #print (Q_targets_next.shape)
        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        #print (Q_targets.shape)
        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)
        #print (Q_expected.shape)
        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, self.tau)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
Esempio n. 10
0
class Agent_DQN(Agent):
    def __init__(self, env, args):
        """
        Initialize everything you need here.
        For example: 
            paramters for neural network  
            initialize Q net and target Q net
            parameters for repaly buffer
            parameters for q-learning; decaying epsilon-greedy
            ...
        """
        super(Agent_DQN, self).__init__(env)
        ###########################
        # YOUR IMPLEMENTATION HERE #
        #Gym parameters
        self.num_actions = env.action_space.n

        # parameters for repaly buffer
        self.buffer_max_len = 20000
        self.buffer = deque(maxlen=self.buffer_max_len)
        self.episode_reward_list = []
        self.moving_reward_avg = []

        # paramters for neural network
        self.batch_size = 32
        self.gamma = 0.999
        self.eps_threshold = 0
        self.eps_start = 1
        self.eps_end = 0.025
        self.max_expisode_decay = 10000
        self.device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")

        #Training
        self.steps_done = 0
        self.num_episode = 20000
        self.target_update = 5000
        self.learning_rate = 1.5e-4

        # Neural Network
        self.policy_net = DQN().to(self.device)
        self.target_net = DQN().to(self.device)
        self.optimizer = optim.Adam(self.policy_net.parameters(),
                                    lr=self.learning_rate)

        if args.test_dqn:
            #you can load your model here
            print('loading trained model')
            self.policy_net = torch.load('policy_net.hb5')
            self.policy_net.eval()
            ###########################
            # YOUR IMPLEMENTATION HERE #

    def init_game_setting(self):
        """
        Testing function will call this function at the begining of new game
        Put anything you want to initialize if necessary.
        If no parameters need to be initialized, you can leave it as blank.
        """
        ###########################
        # YOUR IMPLEMENTATION HERE #

        ###########################
        pass

    def make_action(self, observation, test=True):
        """
        Return predicted action of your agent
        Input:
            observation: np.array
                stack 4 last preprocessed frames, shape: (84, 84, 4)
        Return:
            action: int
                the predicted action from trained model
        """
        ###########################
        # YOUR IMPLEMENTATION HERE #
        with torch.no_grad():
            sample = random.random()

            ## Check if this is the best way to decline
            observation = torch.tensor(observation,
                                       dtype=torch.float,
                                       device=self.device).permute(
                                           2, 0, 1).unsqueeze(0)

            if test:
                print("testing")
                return self.policy_net(observation).max(1)[1].item()

            if sample > self.eps_threshold:
                #print("Above threshold")
                return self.policy_net(observation).max(1)[1].item()
            else:
                #print("Below Threshold")
                return self.env.action_space.sample()
        ###########################

    def push(self, state, reward, action, next_state, done):
        """ You can add additional arguments as you need. 
        Push new data to buffer and remove the old one if the buffer is full.
        
        Hints:
        -----
            you can consider deque(maxlen = 10000) list
        """
        ###########################
        # YOUR IMPLEMENTATION HERE #
        self.buffer.append((state, reward, action, next_state, done))
        ###########################

    def replay_buffer(self, batch_size):
        """ You can add additional arguments as you need.
        Select batch from buffer.
        """
        ###########################
        # YOUR IMPLEMENTATION HERE #
        batch = random.sample(self.buffer, batch_size)
        states = []
        rewards = []
        actions = []
        next_states = []
        dones = []
        for sample in batch:
            state, reward, action, next_state, done = sample
            states.append(state)
            rewards.append(reward)
            actions.append(action)
            next_states.append(next_state)
            dones.append(done)
        ###########################
        return states, rewards, actions, next_states, dones

    def update(self):
        if self.steps_done < 5000:
            return
        states, rewards, actions, next_states, dones = self.replay_buffer(
            self.batch_size)
        loss = self.compute_loss(states, rewards, actions, next_states, dones)
        self.optimizer.zero_grad()
        loss.backward()
        for param in self.policy_net.parameters():
            param.grad.data.clamp(-1, 1)
        self.optimizer.step()

    def compute_loss(self, states, rewards, actions, next_states, dones):
        non_final_mask = [not done for done in dones]

        states = torch.tensor(states,
                              dtype=torch.float).permute(0, 3, 1,
                                                         2).to(self.device)
        rewards = torch.tensor(rewards, dtype=torch.float).to(self.device)
        actions = torch.tensor(actions, dtype=torch.long).to(self.device)
        next_states = torch.tensor(next_states, dtype=torch.float).permute(
            0, 3, 1, 2).to(self.device)
        dones = torch.tensor(dones, dtype=torch.long).to(self.device)

        Q_current = self.policy_net.forward(states).gather(
            1, actions.unsqueeze(1))
        Q_current = Q_current.squeeze(1)
        ## Should do this with no grad

        next_state_values = torch.zeros(self.batch_size, device=self.device)
        next_state_values[non_final_mask] = self.target_net(
            next_states[non_final_mask]).max(1)[0].detach()
        expected_state_action_values = (next_state_values *
                                        self.gamma) + rewards

        loss = F.smooth_l1_loss(Q_current, expected_state_action_values)

        del states, rewards, actions, next_states, dones, Q_current, next_state_values, expected_state_action_values

        return loss

    def train(self):
        """
        Implement your training algorithm here
        """
        ###########################
        # YOUR IMPLEMENTATION HERE #

        for episode in range(self.num_episode):
            #Check this please
            observation = self.env.reset() / 255

            self.eps_threshold = max(
                1 +
                (((self.eps_end - self.eps_start) / self.max_expisode_decay) *
                 episode), self.eps_end)
            episode_steps = 0
            done = False
            episode_reward = 0
            ## Not sure if this is the right way to do this?
            while not done:
                action = self.make_action(observation, test=False)
                new_observation, reward, done, _ = self.env.step(action)

                new_observation = new_observation / 255
                episode_reward += reward
                self.steps_done += 1
                episode_steps += 1

                self.push(observation, reward, action, new_observation, done)

                ## Updating the network
                self.update()

                observation = new_observation

                if self.steps_done % self.target_update == 0:
                    self.target_net.load_state_dict(
                        self.policy_net.state_dict())
            self.episode_reward_list.append(episode_reward)

            if episode % 100 == 0:
                print('episode: {} reward: {} episode length: {}'.format(
                    episode, episode_reward, episode_steps))
                torch.save(self.policy_net.state_dict(), 'test_model.pt')
        ###########################
        print("Done")
Esempio n. 11
0
class Agent_DQN(Agent):
    def __init__(self, env, args):
        """
        Initialize everything you need here.
        For example:
            paramters for neural network
            initialize Q net and target Q net
            parameters for repaly buffer
            parameters for q-learning; decaying epsilon-greedy
            ...
        """

        super(Agent_DQN, self).__init__(env)
        ###########################
        # YOUR IMPLEMENTATION HERE #
        self.env = env
        self.batch_size = BATCH_SIZE
        self.gamma = 0.999
        self.eps_start = EPS_START
        self.eps_decay = EPS_DECAY
        self.TARGET_UPDATE = TARGET_UPDATE

        self.policy_net = DQN(self.env.action_space.n)
        self.target_net = DQN(self.env.action_space.n)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()
        if use_cuda:
            self.policy_net.cuda()
            self.target_net.cuda()

        self.optimizer = optim.RMSprop(self.policy_net.parameters(), lr=1e-5)
        self.memory = deque(maxlen=10000)

        if args.test_dqn:
            # you can load your model here
            print('loading trained model')
            ###########################
            # YOUR IMPLEMENTATION HERE #

    def init_game_setting(self):
        """
        Testing function will call this function at the begining of new game
        Put anything you want to initialize if necessary.
        If no parameters need to be initialized, you can leave it as blank.
        """
        ###########################
        # YOUR IMPLEMENTATION HERE #

        ###########################
        pass

    def make_action(self, observation, test=True):
        """
        Return predicted action of your agent
        Input:
            observation: np.array
                stack 4 last preprocessed frames, shape: (84, 84, 4)
        Return:
            action: int
                the predicted action from trained model
        """
        ###########################
        # YOUR IMPLEMENTATION HERE #
        global steps_done
        self.policy_net.eval()
        sample = random.random()
        eps_threshold = EPS_END + (EPS_START - EPS_END) * \
            math.exp(-1. * steps_done / EPS_DECAY)
        steps_done += 1
        if sample > eps_threshold:
            return self.policy_net(
                Variable(torch.from_numpy(observation),
                         volatile=True).type(FloatTensor)).data.max(1)[1].view(
                             1, 1)
        else:
            return LongTensor([[random.randrange(self.env.action_space.n)]])
        ###########################
        return action

    def push(self, s, a, r, s_, done):
        """ You can add additional arguments as you need.
        Push new data to buffer and remove the old one if the buffer is full.

        Hints:
        -----
            you can consider deque(maxlen = 10000) list
        """
        ###########################
        # YOUR IMPLEMENTATION HERE #
        self.memory.append((s, a, r, s_, done))
        if len(self.memory) > self.maxlen:
            self.replay_memory_store.popleft()
        self.memory_counter += 1

        ###########################

    def replay_buffer(self):
        """ You can add additional arguments as you need.
        Select batch from buffer.
        """
        ###########################
        # YOUR IMPLEMENTATION HERE #
        #print("memory", len(self.memory), self.BATCH_SIZE)
        minibatch = random.sample(self.memory, self.BATCH_SIZE)
        minibatch = np.array(minibatch).transpose(0, 3, 1, 2)
        minibatch = torch.tensor(minibatch / 255.0)
        ###########################
        return minibatch

    def optimize_model(self):
        if len(self.memory) < BATCH_SIZE:
            return
        transitions = self.memory.sample(BATCH_SIZE)
        # Transpose the batch (see http://stackoverflow.com/a/19343/3343043 for
        # detailed explanation).
        batch = Transition(*zip(*transitions))

        # Compute a mask of non-final states and concatenate the batch elements
        non_final_mask = ByteTensor(
            tuple(map(lambda s: s is not None, batch.next_state)))
        non_final_next_states = Variable(torch.cat(
            [s for s in batch.next_state if s is not None]),
                                         volatile=True).cuda()
        state_batch = Variable(torch.cat(batch.state)).cuda()
        action_batch = Variable(torch.cat(batch.action)).cuda()
        reward_batch = Variable(torch.cat(batch.reward)).cuda()

        # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
        # columns of actions taken
        self.policy_net.train()
        state_action_values = self.policy_net(state_batch).gather(
            1, action_batch)

        # Compute V(s_{t+1}) for all next states.
        next_state_values = Variable(
            torch.zeros(BATCH_SIZE).type(Tensor)).cuda()
        next_state_values[non_final_mask] = self.target_net(
            non_final_next_states).max(1)[0]
        # Compute the expected Q values
        expected_state_action_values = (next_state_values *
                                        GAMMA) + reward_batch
        # Undo volatility (which was used to prevent unnecessary gradients)
        expected_state_action_values = Variable(
            expected_state_action_values.data).cuda()

        # Compute Huber loss
        loss = F.smooth_l1_loss(state_action_values,
                                expected_state_action_values)

        # Optimize the model
        self.optimizer.zero_grad()
        loss.backward()
        for param in self.policy_net.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer.step()

    def train(self):
        """
        Implement your training algorithm here
        """
        ###########################
        # YOUR IMPLEMENTATION HERE #
        num_episodes = 1400000
        for i_episode in range(num_episodes):
            # Initialize the environment and state
            observation = self.env.reset()
            observation = observation.transpose((2, 0, 1))
            observation = observation[np.newaxis, :]
            state = observation

            for t in count():
                # Select and perform an action
                action = self.make_action(state)
                next_state, reward, done, _ = self.env.step(action[0, 0])
                next_state = next_state.transpose((2, 0, 1))
                next_state = next_state[np.newaxis, :]
                reward = Tensor([reward])

                # Store the transition in memory
                self.memory.push(torch.from_numpy(state), action,
                                 torch.from_numpy(next_state), reward)

                # Observe new state
                if not done:
                    state = next_state
                else:
                    state = None

                # Perform one step of the optimization (on the target network)
                self.optimize_model()
                if done:
                    print(
                        'resetting env. episode %d \'s reward total was %d.' %
                        (i_episode + 1, t + 1))
                    break
            # Update the target network
            if i_episode % TARGET_UPDATE == 0:
                self.target_net.load_state_dict(self.policy_net.state_dict())
            if i_episode % 50 == 0:
                checkpoint_path = os.path.join('save_dir', 'model-best.pth')
                torch.save(self.policy_net.state_dict(), checkpoint_path)
                print("model saved to {}".format(checkpoint_path))
Esempio n. 12
0
class Agent_DQN(Agent):
    def __init__(self, env, args):
        """
        Initialize everything you need here.
        For example: 
            paramters for neural network  
            initialize Q net and target Q net
            parameters for repaly buffer
            parameters for q-learning; decaying epsilon-greedy
            ...
        """

        super(Agent_DQN, self).__init__(env)
        ###########################
        # YOUR IMPLEMENTATION HERE #
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.env = env
        self.buffer = ReplayBuffer()
        self.num_action = self.env.get_action_space().n
        self.cur_step = 0
        self.greedyPolicy = EpsilonGreedyStrategy(1, 0.025, 0.01)
        self.policy_net = DQN().to(self.device)
        self.target_net = DQN().to(self.device)
        self.num_episode = args.num_episode
        self.learning_rate = args.learning_rate
        self.sample_batch_size = args.sample_batch_size
        self.gamma = args.gamma
        self.e = 1
        if args.test_dqn:
            # you can load your model here
            print('loading trained model')
            ###########################
            # YOUR IMPLEMENTATION HERE #

    def init_game_setting(self):
        """
        Testing function will call this function at the begining of new game
        Put anything you want to initialize if necessary.
        If no parameters need to be initialized, you can leave it as blank.
        """
        ###########################
        # YOUR IMPLEMENTATION HERE #
        ###########################
        pass

    def make_action(self, observation, test=True):
        """
        Return predicted action of your agent
        Input:
            observation: np.array
                stack 4 last preprocessed frames, shape: (84, 84, 4)
        Return:
            action: int
                the predicted action from trained model
        """
        ###########################
        # YOUR IMPLEMENTATION HERE #

        if test:
            with torch.no_grad():
                action = self.policy_net(observation).argmax(dim=1).item()
        else:
            if self.e > random.random():
                action = random.randrange(self.num_action)
            else:
                observation = self.transform(observation)
                with torch.no_grad():
                    action = self.policy_net(observation).argmax(dim=1).item()
            self.e -= self.greedyPolicy.get_exploration_rate()
        ###########################
        return action

    def push(self,experience):
        """ You can add additional arguments as you need. 
        Push new data to buffer and remove the old one if the buffer is full.
        
        Hints:
        -----
            you can consider deque(maxlen = 10000) list
        """
        ###########################
        # YOUR IMPLEMENTATION HERE #
        self.buffer.append(experience)
        ###########################

    def replay_buffer(self, batch_size=32):
        """ You can add additional arguments as you need.
        Select batch from buffer.
        """
        ###########################
        # YOUR IMPLEMENTATION HERE #
        experience = self.buffer.sample(batch_size)
        ###########################
        return  experience

    def transform(self, state):
        state = np.asarray(state) / 255.
        state = torch.tensor(state)
        state = state.unsqueeze(0)
        state = state.permute(0, 3, 1, 2)
        state = state.to(device=self.device, dtype=torch.float)
        return state

    def extract_tensors(self, experiences):
        batch = Experience(*zip(*experiences))
        t1 = batch.state
        t2 = batch.action
        t3 = batch.next_state
        t4 = batch.reward
        t5 = batch.termination
        return t1, t2, t3, t4, t5

    def get_current_q(self, states, actions):
        states = np.asarray(states) / 255.
        a = np.count_nonzero(states)
        states = torch.tensor(states, device=self.device, dtype=torch.float)
        states = states.permute(0, 3, 1, 2)
        actions = torch.tensor(np.asarray(actions), device=self.device, dtype=torch.long).unsqueeze(-1)
        QS = self.policy_net(states).gather(1,  actions)#.requires_grad_(True)
        QS = QS.permute(1, 0)
        return QS[0]

    def get_next_q(self, next_states, terminations):
        next_states = np.asarray(next_states) / 255.
        next_states = torch.tensor(next_states,device=self.device, dtype=torch.float)
        next_states = next_states.permute(0, 3, 1, 2)
        QS = self.target_net(next_states).max(1)[0].detach()#.requires_grad_(True)
        QS = QS * torch.tensor(terminations, device=self.device, dtype=torch.float, requires_grad= True)

        return QS

    def train(self):
        """
        Implement your training algorithm here
        """
        ###########################
        # YOUR IMPLEMENTATION HERE #
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()
        optimizor = optim.Adam(params=self.policy_net.parameters(), lr = self.learning_rate,
                               betas = (0.5, 0.999))
        max_reward = -1
        rewards_sum = 0
        reward_collection = []
        episode_collection = []
        print(self.device)

        for episode in range(self.num_episode):
            done = False
            state = self.env.reset()
            while not done:
                action = self.make_action(state, False)
                next_state, reward, done, info = self.env.step(action)
                self.push(
                    Experience(state,
                               action,
                               next_state,
                               reward,
                               (not done)
                               )
                          )
                rewards_sum += reward
                state = next_state
                if self.buffer.can_sample():
                    experiences = self.buffer.sample(self.sample_batch_size)

                    states, actions, next_states, rewards, terminations = self.extract_tensors(experiences)

                    current_q = self.get_current_q(states, actions)

                    next_q = self.get_next_q(next_states, terminations)

                    target_q = self.gamma * next_q +  torch.tensor(rewards, device=self.device, dtype=torch.float)

                    loss = F.smooth_l1_loss(current_q, target_q)

                    optimizor.zero_grad()
                    loss.backward()
                    for param in self.policy_net.parameters():
                        param.grad.data.clamp_(-1, 1)
                    optimizor.step()
            if episode % 3000 == 0:
                    self.target_net.load_state_dict(self.policy_net.state_dict())
            if episode % 30 == 0:
                print("episode: ", episode, "\t", "average reward :", rewards_sum/30)
                reward_collection.append(rewards_sum/30)
                episode_collection.append(episode)

                if rewards_sum > max_reward:
                    torch.save(self.policy_net.state_dict(), "model/policy_net_max_reward.pth")
                rewards_sum = 0
            if episode%1000 == 0:
                torch.save(self.policy_net.state_dict(), "model/policy_net.pth")
        torch.save(self.policy_net.state_dict(), "model/policy_net.pth")
        x = episode_collection
        y = reward_collection
        plt.plot(x,y)
        plt.show()
        plt.savefig('episode-reward.png')
class Agent_DQN_Trainer(object):
    def __init__(self, env, args):

        # Training Parameters
        self.args = args
        self.env = env
        self.batch_size = args.batch_size
        self.lr = args.lr
        self.gamma = args.gamma_reward_decay
        self.n_actions = env.action_space.n
        self.output_logs = args.output_logs
        self.step = 8e6
        self.curr_step = 0
        self.ckpt_path = args.save_dir
        self.epsilon = args.eps_start
        self.eps_end = args.eps_end
        self.target_update = args.update_target
        self.observe_steps = args.observe_steps
        self.explore_steps = args.explore_steps
        self.saver_steps = args.saver_steps
        self.resume = args.resume
        self.writer = TensorboardSummary(self.args.log_dir).create_summary()
        # Model Settings

        self.cuda = torch.device(
            'cuda' if torch.cuda.is_available() else 'cpu')
        self.policy_net = DQN(4, self.n_actions)
        self.target_net = DQN(4, self.n_actions)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        if self.cuda:
            self.policy_net.to(self.cuda)
            self.target_net.to(self.cuda)

        self.target_net.eval()
        train_params = self.policy_net.parameters()
        self.optimizer = optim.RMSprop(train_params,
                                       self.lr,
                                       momentum=0.95,
                                       eps=0.01)
        self.memory = ReplayMemory(args.replay_memory_size)

        if args.resume:
            if not os.path.isfile(args.resume):
                raise RuntimeError("=> no checkpoint found at '{}'".format(
                    args.resume))
            checkpoint = torch.load(args.resume)

            self.epsilon = checkpoint['epsilon']
            self.curr_step = checkpoint['step']

            self.policy_net.load_state_dict(checkpoint['policy_state_dict'])
            self.target_net.load_state_dict(checkpoint['target_state_dict'])
            self.optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['episode']))

    def epsilon_greedy_policy(self, observation, nA, test=False):

        observation = to_float(observation).to(self.cuda)
        # print("size of observation->"+str(sys.getsizeof(observation.storage())))
        sample = random.random()

        if test:
            return self.policy_net(observation).max(1)[1].view(1, 1).item()

        if sample <= self.epsilon:
            action = torch.tensor([[random.randrange(self.n_actions)]],
                                  device=self.cuda,
                                  dtype=torch.long)
        else:
            with torch.no_grad():
                action = self.policy_net(observation).max(1)[1].view(1, 1)

        return action

    def optimize_model(self):

        transitions = self.memory.sample(self.batch_size)
        batch = Transition(*zip(*transitions))

        non_final_mask = torch.tensor(tuple(
            map(lambda s: s is not None, batch.next_state)),
                                      device=self.cuda,
                                      dtype=torch.bool)
        non_final_next_states = torch.cat(
            [to_float(s) for s in batch.next_state if s is not None])
        state_batch = torch.cat(
            [to_float(s).to(self.cuda) for s in batch.state])
        action_batch = torch.cat(batch.action)
        reward_batch = torch.cat(batch.reward)

        state_action_values = self.policy_net(state_batch).gather(
            1, action_batch)
        next_state_values = torch.zeros(self.batch_size, device=self.cuda)
        next_state_values[non_final_mask] = self.target_net(
            non_final_next_states).max(1)[0].detach()

        expected_state_action_values = (next_state_values *
                                        self.gamma) + reward_batch
        loss = F.smooth_l1_loss(
            state_action_values.float(),
            expected_state_action_values.unsqueeze(1).float())
        self.optimizer.zero_grad()
        loss.backward()

        for param in self.policy_net.parameters():
            param.grad.data.clamp_(-1, 1)

        self.optimizer.step()
        return loss.item()

    def train(self):

        current_loss = 0
        train_rewards = []
        train_episode_len = 0.0
        file_loss = open(self.output_logs, "a")
        file_loss.write("episode,step,epsilon,reward,loss,length\n")
        print("Training Started")
        episode = 0
        loss = 0.0

        while self.curr_step < self.step:
            state = to_tensor(self.env.reset())

            # * State is in torch.uint8 format , convert before passing to model*#
            done = False
            episode_reward = 0.0
            train_loss = 0
            s = 0  # length of episode
            while not done:
                # self.env.env.render()

                action = self.epsilon_greedy_policy(state, self.n_actions)

                new_state, reward, done, _ = self.env.step(
                    action.item())  # new_state torch.uint8 format
                new_state, reward = to_tensor(new_state).to(
                    self.cuda), torch.tensor([reward], device=self.cuda)
                episode_reward += reward
                self.memory.push(state, action, new_state, reward)

                if (self.curr_step > self.observe_steps) and (
                        self.curr_step % self.args.update_current) == 0:
                    loss = self.optimize_model()
                    train_loss += loss

                print(
                    'Step: %i,  Episode: %i,  Action: %i,  Reward: %.0f,  Epsilon: %.5f, Loss: %.5f'
                    % (self.curr_step, episode, action.item(), reward.item(),
                       self.epsilon, loss),
                    end='\r')

                if self.curr_step > self.observe_steps and self.curr_step % self.target_update == 0:
                    self.target_net.load_state_dict(
                        self.policy_net.state_dict())
                    # TO CHECK APPROXIMATELY HOW MUCH GPU MEMORY OUR REPLAY MEMORY IS CONSUMING
                    print(torch.cuda.get_device_name(0))
                    print('Memory Usage:')
                    print('Allocated:',
                          round(torch.cuda.memory_allocated(0) / 1024**3, 1),
                          'GB')
                    print('Cached:   ',
                          round(torch.cuda.memory_cached(0) / 1024**3, 1),
                          'GB')

                if self.epsilon > self.args.eps_end and self.curr_step > self.observe_steps:
                    interval = self.args.eps_start - self.args.eps_end
                    self.epsilon -= interval / float(self.args.explore_steps)

                self.curr_step += 1
                state = new_state
                s += 1

                if self.curr_step % self.args.saver_steps == 0 and episode != 0 and self.curr_step != 0:
                    k = {
                        'step': self.curr_step,
                        'epsilon': self.epsilon,
                        'episode': episode,
                        'policy_state_dict': self.policy_net.state_dict(),
                        'target_state_dict': self.target_net.state_dict(),
                        'optimizer': self.optimizer.state_dict()
                    }
                    filename = os.path.join(self.ckpt_path, 'ckpt.pth.tar')
                    torch.save(k, filename)

            episode += 1
            train_rewards.append(episode_reward.item())
            train_episode_len += s

            if episode % self.args.num_eval == 0 and episode != 0:
                current_loss = train_loss
                avg_reward_train = np.mean(train_rewards)
                train_rewards = []
                avg_episode_len_train = train_episode_len / float(
                    self.args.num_eval)
                train_episode_len = 0.0
                file_loss.write(
                    str(episode) + "," + str(self.curr_step) + "," +
                    "{:.4f}".format(self.epsilon) + "," +
                    "{:.2f}".format(avg_reward_train) + "," +
                    "{:.4f}".format(current_loss) + "," +
                    "{:.2f}".format(avg_episode_len_train) + "\n")
                file_loss.flush()
                self.writer.add_scalar('train_loss/episode(avg100)',
                                       current_loss, episode)
                self.writer.add_scalar('episode_reward/episode(avg100)',
                                       avg_reward_train, episode)
                self.writer.add_scalar('length of episode/episode(avg100)',
                                       avg_episode_len_train, episode)

            self.writer.add_scalar('train_loss/episode', train_loss, episode)
            self.writer.add_scalar('episode_reward/episode', episode_reward,
                                   episode)
            self.writer.add_scalar('epsilon/num_steps', self.epsilon,
                                   self.curr_step)
            self.writer.add_scalar('length of episode/episode', s, episode)

        print("GAME OVER")
Esempio n. 14
0
class Agent_DQN(Agent):
    def __init__(self, env, args):
        """
        Initialize everything you need here.
        For example: 
            paramters for neural network  
            initialize Q net and target Q net
            parameters for repaly buffer
            parameters for q-learning; decaying epsilon-greedy
            ...
        """

        super(Agent_DQN, self).__init__(env)
        self.action = env.get_action_space()

        ###########################
        # YOUR IMPLEMENTATION HERE #
        self.device = torch.device(
            'cuda' if torch.cuda.is_available() else 'cpu')
        print('Using device:', self.device)
        self.model = DQN().to(self.device)
        self.model_target = DQN().to(self.device)
        self.episode = 100000
        self.max_steps_per_episode = 14000
        self.update_target_network = 10000
        self.epsilon = 1.0
        self.min_epsilon = 0.1
        self.step_epsilon = (self.epsilon - self.min_epsilon) / (1E6)
        self.env = env
        self.history = []
        self.buffer_size = min(args.history_size // 5, 2000)
        self.history_size = args.history_size
        self.learning_rate = 1e-4
        self.name = args.name
        self.batch_size = 32
        self.gamma = 0.99
        self.priority = []
        self.w = 144
        self.h = 256
        self.mode = args.mode
        self.delay = args.delay
        self.epoch = args.continue_epoch
        if args.test_dqn or self.epoch > 0:
            #you can load your model here
            print('loading trained model')
            ###########################
            self.model.load_state_dict(
                torch.load(self.name + '.pth', map_location=self.device))
            self.model_target.load_state_dict(
                torch.load(self.name + '.pth', map_location=self.device))
            # YOUR IMPLEMENTATION HERE #

    def init_game_setting(self):
        """
        Testing function will call this function at the begining of new game
        Put anything you want to initialize if necessary.
        If no parameters need to be initialized, you can leave it as blank.
        """
        ###########################
        # YOUR IMPLEMENTATION HERE #

        ###########################
        pass

    def make_action(self, observation, test=True):
        """
        Return predicted action of your agent
        Input:
            observation: np.array
                stack 4 last preprocessed frames, shape: (84, 84, 4)
        Return:
            action: int
                the predicted action from trained model
        """
        ###########################
        # YOUR IMPLEMENTATION HERE #
        self.model.eval()
        with torch.no_grad():
            if test == False:
                if np.random.random() < self.epsilon or len(
                        self.history) < self.buffer_size:
                    action = int(np.random.choice([0, 1], 1)[0])
                else:
                    obs = torch.from_numpy(observation).to(self.device).float()
                    action_prob = self.model(obs.view(1, 12, self.h, self.w))
                    action = torch.argmax(action_prob).detach().item()
                return action

            else:
                observation = np.swapaxes(observation, 0, 2) / 255.
                obs = torch.from_numpy(observation).to(self.device).float()
                action_prob = self.model(obs.view(1, 12, self.h, self.w))
                action = torch.argmax(action_prob).detach().item()

                return self.action[action]
        ###########################

    def push(self, state, action, reward, done, state_next, smooth=None):
        """ You can add additional arguments as you need. 
        Push new data to buffer and remove the old one if the buffer is full.
        
        Hints:
        -----
            you can consider deque(maxlen = 10000) list
        """
        ###########################
        # YOUR IMPLEMENTATION HERE #
        self.history.append(
            np.array([state, action, reward, done, state_next, smooth]))

        if len(self.history) > self.history_size:
            self.history.pop(0)

        ###########################

    def replay_buffer(self, refresh=False):
        """ You can add additional arguments as you need.
        Select batch from buffer.
        """
        ###########################
        # YOUR IMPLEMENTATION HERE #
        if 'prioritized' in self.mode.split('_'):
            if refresh:
                self.priority = np.zeros(len(self.history))
                for i in range(len(self.history)):
                    max_reward, _ = torch.max(self.model_target(
                        torch.from_numpy(self.history[i][4]).to(
                            self.device).float().view(1, 12, self.h, self.w)),
                                              axis=1)
                    max_reward = max_reward.detach().item()
                    Q = self.model(
                        torch.from_numpy(
                            self.history[i][0]).to(self.device).float().view(
                                1, 12, self.h,
                                self.w))[0,
                                         self.history[i][1]].detach().item()
                    self.priority[i] = abs(
                        (self.history[i][2] + self.gamma * max_reward - Q))
                self.priority = self.priority / sum(self.priority)
                return 0
            priority = np.zeros(len(self.history))
            priority[:len(self.priority)] = self.priority
            if sum(priority) == 0:
                indices = np.random.choice(range(len(self.history)),
                                           size=self.batch_size)
            else:
                indices = np.random.choice(range(len(self.history)),
                                           size=self.batch_size,
                                           p=priority)

            ###########################
            return indices
        else:
            return np.random.choice(range(len(self.history)),
                                    size=self.batch_size)

    def train(self):
        """
        Implement your training algorithm here
        """
        ###########################
        # YOUR IMPLEMENTATION HERE #

        episode_reward_history = []
        best_reward = -10
        optimizer = torch.optim.Adam(self.model.parameters(),
                                     lr=self.learning_rate)
        # optimizer = torch.optim.RMSprop(self.model.parameters(), lr=self.learning_rate,momentum=0.5)
        loss_fn = torch.nn.SmoothL1Loss()
        frame_count = 0
        if self.epoch > 0:
            f = open(self.name + '.txt', "a")
        else:
            f = open(self.name + '.txt', "w")
        done = False
        for ep in range(self.epoch, self.episode):
            state = self.env.reset()
            state = np.swapaxes(state, 0, 2) / 255.
            episode_reward = 0
            pre_action = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
            smooth = 0
            for timestep in range(0, self.max_steps_per_episode):
                frame_count += 1
                action = self.make_action(state, test=False)
                if done:
                    action = 1

                # Decay
                self.epsilon -= self.step_epsilon
                self.epsilon = max(self.epsilon, self.min_epsilon)

                # next frame
                state_next, reward, done, _ = self.env.step(
                    self.action[action])
                state_next = np.swapaxes(state_next, 0, 2) / 255.
                episode_reward += reward
                # print(reward)
                #normalize reward
                # reward = np.sign(reward)
                # Save actions and states in replay buffer

                state = state_next
                if 'smooth1' in self.mode.split('_'):
                    pre_action.pop(0)
                    pre_action.append(action)
                    smooth = float(np.mean(pre_action) - 0.5)

                self.push(state, action, reward, done, state_next, smooth)

                if frame_count % 8 == 0 and len(
                        self.history) >= self.buffer_size:
                    if frame_count % self.history_size // 10 == 0 and 'prioritized' in self.mode.split(
                            '_'):
                        #update priority vector
                        self.replay_buffer(refresh=True)
                    indice = self.replay_buffer()
                    self.model.train()
                    # data_batch = torch.from_numpy(np.array(self.history)[indice]).to(self.device).float()
                    state_sample = torch.from_numpy(
                        np.array([self.history[i][0]
                                  for i in indice])).to(self.device).float()
                    action_sample = torch.from_numpy(
                        np.array([self.history[i][1]
                                  for i in indice])).to(self.device).float()
                    rewards_sample = torch.from_numpy(
                        np.array([self.history[i][2]
                                  for i in indice])).to(self.device).float()
                    done_sample = torch.from_numpy(
                        np.array([self.history[i][3]
                                  for i in indice])).to(self.device).float()
                    next_state_sample = torch.from_numpy(
                        np.array([self.history[i][4]
                                  for i in indice])).to(self.device).float()
                    smooth_sample = torch.from_numpy(
                        np.array([self.history[i][5]
                                  for i in indice])).to(self.device).float()
                    future_rewards = self.model_target(next_state_sample)

                    max_reward, _ = torch.max(future_rewards, axis=1)
                    updated_q_values = rewards_sample + self.gamma * max_reward
                    updated_q_values = updated_q_values * (
                        1 - done_sample) - done_sample
                    mask = F.one_hot(action_sample.long(),
                                     2).to(self.device).float()

                    q_values = self.model(state_sample)
                    q_action = torch.sum(q_values * mask, axis=1)
                    loss = loss_fn(q_action, updated_q_values)

                    if 'smooth1' in self.mode.split('_') and self.delay < ep:
                        penalty = torch.abs((ep - self.delay) / self.episode *
                                            torch.sum(smooth_sample))
                        loss += penalty

                    optimizer.zero_grad()
                    loss.backward()
                    torch.nn.utils.clip_grad_norm(self.model.parameters(), 1.0)
                    optimizer.step()

                if frame_count % self.update_target_network == 0:
                    self.model_target.load_state_dict(self.model.state_dict())

                if done:
                    break
            episode_reward_history.append(episode_reward)
            if len(episode_reward_history) > 30:
                del episode_reward_history[:1]
            running_reward = np.mean(episode_reward_history)
            #             if ep%500==0:
            #                 print("Episode:\t{},\t Avereged reward: {:.2f}\n".format(ep,running_reward))
            f.write("Episode:\t{},\t Avereged reward: {:.2f}\n".format(
                ep, running_reward))
            if running_reward > best_reward:
                best_reward = running_reward
                torch.save(self.model.state_dict(), self.name + '.pth')
        f.close()