Code Example #1
    def __init__(self,
                 meta_controller_experience_memory=None,
                 lr=0.00025,
                 alpha=0.95,
                 eps=0.01,
                 batch_size=32,
                 gamma=0.99,
                 num_options=12):
        # experience replay memory
        self.meta_controller_experience_memory = meta_controller_experience_memory
        self.lr = lr  # learning rate
        self.alpha = alpha  # optimizer parameter
        self.eps = eps  # optimizer parameter
        self.gamma = gamma  # discount factor
        # BUILD MODEL
        USE_CUDA = torch.cuda.is_available()
        if torch.cuda.is_available() and torch.cuda.device_count() > 1:
            self.device = torch.device("cuda:1")
        elif torch.cuda.device_count() == 1:
            self.device = torch.device("cuda:0")
        else:
            self.device = torch.device("cpu")

        dfloat_cpu = torch.FloatTensor
        dfloat_gpu = torch.cuda.FloatTensor

        dlong_cpu = torch.LongTensor
        dlong_gpu = torch.cuda.LongTensor

        duint_cpu = torch.ByteTensor
        duint_gpu = torch.cuda.ByteTensor

        dtype = torch.cuda.FloatTensor if torch.cuda.is_available(
        ) else torch.FloatTensor
        dlongtype = torch.cuda.LongTensor if torch.cuda.is_available(
        ) else torch.LongTensor
        duinttype = torch.cuda.ByteTensor if torch.cuda.is_available(
        ) else torch.ByteTensor

        self.dtype = dtype
        self.dlongtype = dlongtype
        self.duinttype = duinttype

        Q = DQN(in_channels=4, num_actions=num_options).type(dtype)
        Q_t = DQN(in_channels=4, num_actions=num_options).type(dtype)
        Q_t.load_state_dict(Q.state_dict())
        Q_t.eval()
        for param in Q_t.parameters():
            param.requires_grad = False

        Q = Q.to(self.device)
        Q_t = Q_t.to(self.device)

        self.batch_size = batch_size
        self.Q = Q
        self.Q_t = Q_t
        # optimizer
        optimizer = optim.RMSprop(Q.parameters(), lr=lr, alpha=alpha, eps=eps)
        self.optimizer = optimizer
        print('init: Meta Controller --> OK')
Code Example #2
class Agent():
    def __init__(self, action_size):
        self.action_size = action_size

        # These are hyper parameters for the DQN
        self.discount_factor = 0.99
        self.epsilon = 1.0
        self.epsilon_min = 0.01
        self.explore_step = 500000
        self.epsilon_decay = (self.epsilon -
                              self.epsilon_min) / self.explore_step
        self.train_start = 100000
        self.update_target = 1000

        # Generate the memory
        self.memory = ReplayMemory()

        # Create the policy net and the target net
        self.policy_net = DQN(action_size)
        self.policy_net.to(device)
        self.target_net = DQN(action_size)
        self.target_net.to(device)

        self.optimizer = optim.Adam(params=self.policy_net.parameters(),
                                    lr=learning_rate)
        self.scheduler = optim.lr_scheduler.StepLR(
            self.optimizer,
            step_size=scheduler_step_size,
            gamma=scheduler_gamma)

        # Initialize a target network and set it to the policy net's weights
        ### CODE ###
        self.update_target_net()

    def load_policy_net(self, path):
        self.policy_net = torch.load(path)

    # after some time interval, update the target net to be the same as the policy net
    def update_target_net(self):
        ### CODE ###
        self.target_net.load_state_dict(self.policy_net.state_dict())

    """Get action using policy net using epsilon-greedy policy"""

    def get_action(self, state):
        if np.random.rand() <= self.epsilon:
            ### CODE #### (copy over from agent.py!)
            return torch.tensor([[random.randrange(self.action_size)]],
                                device=device,
                                dtype=torch.long)
        else:
            ### CODE #### (copy over from agent.py!)
            with torch.no_grad():
                state = torch.FloatTensor(state).unsqueeze(0).to(device)
                return self.policy_net(state).max(1)[1].view(1, 1)

    # pick samples randomly from replay memory (with batch_size)
    def train_policy_net(self, frame):
        if self.epsilon > self.epsilon_min:
            self.epsilon -= self.epsilon_decay

        mini_batch = self.memory.sample_mini_batch(frame)
        mini_batch = np.array(mini_batch).transpose()

        history = np.stack(mini_batch[0], axis=0)
        states = np.float32(history[:, :4, :, :]) / 255.
        states = torch.from_numpy(states).cuda()
        actions = list(mini_batch[1])
        actions = torch.LongTensor(actions).cuda()
        rewards = list(mini_batch[2])
        rewards = torch.FloatTensor(rewards).cuda()
        next_states = np.float32(history[:, 1:, :, :]) / 255.
        next_states = torch.tensor(next_states).cuda()
        dones = mini_batch[3]  # checks if the game is over
        # True where the episode continues (non-terminal transitions)
        mask = torch.tensor(list(map(int, dones == False)),
                            dtype=torch.bool,
                            device=device)

        # Your agent.py code here with double DQN modifications
        ### CODE ###
        # Compute Q(s_t, a), the Q-value of the current state
        ### CODE ####
        state_action_values = self.policy_net(states).gather(
            1, actions.view(batch_size, -1))
        # Compute Q function of next state
        ### CODE ####
        next_state_values = torch.zeros(batch_size, device=device)
        non_final_next_states = next_states[mask]
        # Bootstrap only from non-terminal next states, using the target net
        next_state_values[mask] = self.target_net(
            non_final_next_states).max(1)[0].detach()
        # Compute the expected Q values
        expected_state_action_values = (next_state_values *
                                        self.discount_factor) + rewards
        # Compute the Huber Loss
        ### CODE ####
        loss = F.smooth_l1_loss(state_action_values.view(-1),
                                expected_state_action_values)

        # Optimize the model, .step() both the optimizer and the scheduler!
        ### CODE ####
        self.optimizer.zero_grad()
        loss.backward()
        for param in self.policy_net.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer.step()
        self.scheduler.step()
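
Note on Code Example #2: the template comment asks for "double DQN modifications", but the block as written still takes a plain max over the target net. The fragment below is a minimal, self-contained sketch (synthetic tensors and throwaway linear layers standing in for DQN; none of it is taken from the quoted projects) of how the double-DQN target differs from the standard one: the policy net selects the greedy next action and the target net evaluates it.

import torch
import torch.nn as nn

torch.manual_seed(0)
batch_size, state_dim, num_actions, gamma = 8, 16, 4, 0.99
policy_net = nn.Linear(state_dim, num_actions)   # stand-in for DQN
target_net = nn.Linear(state_dim, num_actions)
target_net.load_state_dict(policy_net.state_dict())

next_states = torch.randn(batch_size, state_dim)
rewards = torch.randn(batch_size)
mask = torch.rand(batch_size) > 0.1              # True where the episode continues

next_state_values = torch.zeros(batch_size)
with torch.no_grad():
    # Standard DQN target: the target net both selects and evaluates the action.
    # next_state_values[mask] = target_net(next_states[mask]).max(1)[0]

    # Double DQN target: the policy net selects, the target net evaluates.
    best_actions = policy_net(next_states[mask]).argmax(1, keepdim=True)
    next_state_values[mask] = target_net(
        next_states[mask]).gather(1, best_actions).squeeze(1)

expected_q = rewards + gamma * next_state_values  # target for the Huber loss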
Code Example #3
class Agent():
    def __init__(self, action_size):
        self.action_size = action_size
        
        # These are hyper parameters for the DQN
        self.discount_factor = 0.99
        self.epsilon = 1.0
        self.epsilon_min = 0.01
        self.explore_step = 1000000
        self.epsilon_decay = (self.epsilon - self.epsilon_min) / self.explore_step
        self.train_start = 100000
        self.update_target = 1000

        # Generate the memory
        self.memory = ReplayMemory()

        # Create the policy net and the target net
        self.policy_net = DQN(action_size)
        self.policy_net.to(device)
        
        self.optimizer = optim.Adam(params=self.policy_net.parameters(), lr=learning_rate)
        self.scheduler = optim.lr_scheduler.StepLR(self.optimizer, step_size=scheduler_step_size, gamma=scheduler_gamma)

        # Initialize a target network and set it to the policy net's weights
        ### CODE ###
        self.target_net = DQN(action_size).to(device)
        self.update_target_net()
        self.target_net.eval()

    def load_policy_net(self, path):
        self.policy_net = torch.load(path)           

    # after some time interval, update the target net to be the same as the policy net
    def update_target_net(self):
        ### CODE ###
        self.target_net.load_state_dict(self.policy_net.state_dict())


    """Get action using policy net using epsilon-greedy policy"""
    def get_action(self, state):
        if np.random.rand() <= self.epsilon:
            ### CODE #### (copy over from agent.py!)
            a = torch.tensor([[random.randrange(self.action_size)]], device=device, dtype=torch.long)

        else:
            ### CODE #### (copy over from agent.py!)
            with torch.no_grad():
                state = torch.from_numpy(state).reshape(1,4,84,84).to(device)
                a = self.policy_net(state).max(1)[1].view(1, 1)
        return a

    # pick samples randomly from replay memory (with batch_size)
    def train_policy_net(self, frame):
        if self.epsilon > self.epsilon_min:
            self.epsilon -= self.epsilon_decay

        mini_batch = self.memory.sample_mini_batch(frame)
        mini_batch = np.array(mini_batch).transpose()

        history = np.stack(mini_batch[0], axis=0)
        states = np.float32(history[:, :4, :, :]) / 255.
        states = torch.from_numpy(states).cuda()
        actions = list(mini_batch[1])
        actions = torch.LongTensor(actions).cuda()
        rewards = list(mini_batch[2])
        rewards = torch.FloatTensor(rewards).cuda()
        next_states = np.float32(history[:, 1:, :, :]) / 255.
        dones = mini_batch[3] # checks if the game is over
        # True where the episode continues (non-terminal transitions)
        mask = torch.tensor(list(map(int, dones == False)), dtype=torch.bool, device=device)
        
        # Your agent.py code here with double DQN modifications
        ### CODE ###
        curr_Q = self.policy_net(states).gather(1, actions.unsqueeze(1)).squeeze(1)

        next_state_values = torch.zeros(states.size(0), device=device)
        next_states = torch.from_numpy(next_states).to(device)
        next_state_values[mask] = self.target_net(next_states[mask]).max(1)[0].detach()
        #next_state_values[mask] = self.target_net(next_states[mask]).detach().gather(1, self.policy_net(next_states[mask]).argmax(1).unsqueeze(1)).squeeze(1)

        target_Q = next_state_values * self.discount_factor + rewards
        loss = F.smooth_l1_loss(curr_Q, target_Q)

        self.optimizer.zero_grad()
        loss.backward()
        #torch.nn.utils.clip_grad_norm_(self.policy_net.parameters(), 10)
        for param in self.policy_net.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer.step()
        self.scheduler.step()

        
Code Example #4
    def __init__(self,
                 experience_memory=None,
                 lr=0.00025,
                 alpha=0.95,
                 eps=0.01,
                 batch_size=32,
                 gamma=0.99,
                 load_pretrained=False,
                 saved_model_path='./models/a.model'):

        self.experience_memory = experience_memory  # experience replay memory
        self.lr = lr  # learning rate
        self.alpha = alpha  # optimizer parameter
        self.eps = eps  # optimizer parameter
        self.gamma = gamma  # discount factor
        # BUILD MODEL
        USE_CUDA = torch.cuda.is_available()
        if torch.cuda.is_available():
            self.device = torch.device("cuda:0")
        else:
            self.device = torch.device("cpu")

        dfloat_cpu = torch.FloatTensor
        dfloat_gpu = torch.cuda.FloatTensor

        dlong_cpu = torch.LongTensor
        dlong_gpu = torch.cuda.LongTensor

        duint_cpu = torch.ByteTensor
        duint_gpu = torch.cuda.ByteTensor

        dtype = torch.cuda.FloatTensor if torch.cuda.is_available(
        ) else torch.FloatTensor
        dlongtype = torch.cuda.LongTensor if torch.cuda.is_available(
        ) else torch.LongTensor
        duinttype = torch.cuda.ByteTensor if torch.cuda.is_available(
        ) else torch.ByteTensor

        self.dtype = dtype
        self.dlongtype = dlongtype
        self.duinttype = duinttype

        Q = DQN(in_channels=5, num_actions=18).type(dtype)
        if load_pretrained:
            Q.load_state_dict(torch.load(saved_model_path))
        Q_t = DQN(in_channels=5, num_actions=18).type(dtype)
        Q_t.load_state_dict(Q.state_dict())
        Q_t.eval()
        for param in Q_t.parameters():
            param.requires_grad = False

        Q = Q.to(self.device)
        Q_t = Q_t.to(self.device)

        # if torch.cuda.device_count() > 0:
        # 	Q = nn.DataParallel(Q).to(self.device)
        # 	Q_t = nn.DataParallel(Q_t).to(self.device)
        # 	batch_size = batch_size * torch.cuda.device_count()
        # else:
        # 	batch_size = batch_size

        self.batch_size = batch_size
        self.Q = Q
        self.Q_t = Q_t
        # optimizer
        optimizer = optim.RMSprop(Q.parameters(), lr=lr, alpha=alpha, eps=eps)
        self.optimizer = optimizer
        print('init: Controller --> OK')
Code Example #5
File: agent.py  Project: XwLu/BreakoutsV4
class Agent(object):
    def __init__(self, args, obs):
        self.net = DQN(args.n_obs, args.n_action)
        self.target_net = DQN(args.n_obs, args.n_action)
        if os.path.isfile('./weights/ckpt.pth'):
            self.net.load_state_dict(torch.load('./weights/ckpt.pth'))
            self.target_net.load_state_dict(torch.load('./weights/ckpt.pth'))
        self.device = torch.device(
            'cuda' if torch.cuda.is_available() else 'cpu')
        self.state_preproc = StatePreproc(self.device)
        self.n_action = args.n_action
        self.gamma = args.gamma
        self.max_grad_norm = args.max_grad_norm
        self.num_procs = args.num_procs
        self.memory = ReplayBuffer(args)
        self.optimizer = torch.optim.Adam(self.net.parameters(),
                                          lr=args.lr,
                                          betas=(0.9, 0.99))
        self.criterion = torch.nn.MSELoss()
        # log
        self.log_episode_rewards = torch.zeros(self.num_procs,
                                               device=self.device,
                                               dtype=torch.float)
        self.episode_rewards = deque([0] * 100, maxlen=100)
        self.episode = 1
        self.init(obs)
        # eval
        self.test_episode = args.test_episode

    def init(self, obs):
        self.net.to(self.device)
        self.target_net.to(self.device)
        self.obs_tensor = self.state_preproc(
            obs)  # size: [num_proc, 4, height, width]

    def act(self, obs, epsilon):
        if random.random() > epsilon:
            with torch.no_grad():
                q_vals = self.net(obs)
                action = q_vals.argmax(dim=1)
        else:
            action = torch.tensor(np.random.randint(0,
                                                    self.n_action,
                                                    size=obs.shape[0]),
                                  device=self.device,
                                  dtype=torch.int64)
        return action

    def collect_experiences(self, env, num_frames, epsilon):
        for i in range(num_frames):
            actions = self.act(self.obs_tensor, epsilon)  # size: [num_proc]
            next_obs, rewards, dones, _ = env.step(actions.cpu().numpy())
            next_obs_tensor = self.state_preproc(next_obs)
            rewards_tensor = torch.tensor(
                rewards, device=self.device,
                dtype=torch.float)  # size: [num_proc]
            dones_tensor = 1 - torch.tensor(
                dones, device=self.device,
                dtype=torch.float)  # size: [num_proc]

            self.memory.add(self.obs_tensor, actions, rewards_tensor,
                            next_obs_tensor, dones_tensor)

            self.obs_tensor = next_obs_tensor

            # for log
            self.log_episode_rewards += rewards_tensor
            for env_id, done in enumerate(dones):
                if done:
                    self.episode_rewards.append(
                        self.log_episode_rewards[env_id].item())
                    self.log_episode_rewards[env_id] = 0
                    self.episode += 1

        log = {
            'episode': self.episode,
            'average_reward': np.mean(self.episode_rewards)
        }
        return log

    def improve_policy(self, update_times):
        for _ in range(update_times):
            states, acts, rewards, next_states, dones = self.memory.sample()
            with torch.no_grad():
                q_vals = self.target_net(
                    next_states
                )  # next_states size: [batch_size * num_proc, h, w, channel]
                # `dones` holds (1 - done), so terminal transitions do not bootstrap
                target_max_q = rewards + self.gamma * torch.max(q_vals, 1)[0] * dones
            curr_q_vals = self.net(states)
            curr_max_q = curr_q_vals.gather(1, acts.unsqueeze(1)).squeeze(1)
            # actions = torch.zeros([acts.shape[0], self.n_action], device=self.device, dtype=torch.float)
            # for i, act in enumerate(acts):
            #     actions[i][act.item()] = 1.0
            # curr_max_q = curr_q_vals * actions
            loss = self.criterion(curr_max_q, target_max_q)
            self.optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(self.net.parameters(),
                                           self.max_grad_norm)
            self.optimizer.step()
        info = {'value': curr_max_q.mean().item(), 'loss': loss.item()}
        return info

    def update_target_net(self):
        for target_param, param in zip(self.target_net.parameters(),
                                       self.net.parameters()):
            target_param.data.copy_(param.data)

    def save_weights(self):
        torch.save(self.net.state_dict(), './weights/ckpt.pth')

    def evaluate(self, env):
        self.net.eval()
        episode_return_list = []
        for i in range(self.test_episode):
            seed = np.random.randint(0, 0xFFFFFF)
            env.seed(seed)
            obs = env.reset()
            done = False
            episode_return = 0
            while not done:
                obs_tensor = self.state_preproc([obs])
                action = self.act(obs_tensor, 0.0)
                obs, reward, done, _ = env.step(action.cpu().numpy())
                episode_return += reward
            episode_return_list.append(episode_return)

        info = {'average_return': np.mean(episode_return_list)}
        self.net.train()
        return info

    def display(self, env):
        self.net.eval()
        seed = np.random.randint(0, 0xFFFFFF)
        env.seed(seed)
        obs = env.reset()
        need_key = True
        episode = 0
        episode_return = 0
        print('`Enter`: next step\n`E`: Run until end-of-episode\n`Q`: Quit')
        while True:
            if need_key:
                key = input('Press key:')
                if key == 'q':  # quit
                    break
                if key == 'e':  # Run until end-of-episode
                    need_key = False
            env.render()
            obs_tensor = self.state_preproc([obs])
            action = self.act(obs_tensor, 0.0).squeeze(0)
            obs, reward, done, _ = env.step(action.cpu().numpy())
            episode_return += reward
            if not need_key:
                time.sleep(0.1)
            if done:
                episode += 1
                obs = env.reset()
                print('episode: {}, episode_return: {}'.format(
                    episode, episode_return))
                episode_return = 0
                need_key = True
        self.net.train()
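
A note on improve_policy in Code Example #5: the sampled dones tensor holds (1 - done) and gates the bootstrap term. The tiny self-contained example below (made-up numbers, not from the project) shows why: for a terminal transition the target is just the reward, and without the mask the update would bootstrap from the value of the already-reset state.

import torch

gamma = 0.99
rewards = torch.tensor([1.0, 0.0])
next_q_max = torch.tensor([5.0, 5.0])  # max_a Q_target(s', a)
not_done = torch.tensor([1.0, 0.0])    # second transition ended the episode

target = rewards + gamma * next_q_max * not_done
print(target)  # tensor([5.9500, 0.0000])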
Code Example #6
class Agent():
    def __init__(self, action_size):
        self.action_size = action_size

        # These are hyper parameters for the DQN
        self.discount_factor = 0.99
        self.epsilon = 1.0
        self.epsilon_min = 0.01
        self.explore_step = 500000
        self.epsilon_decay = (self.epsilon - self.epsilon_min) / self.explore_step
        self.train_start = 100000
        self.update_target = 1000

        # Generate the memory
        self.memory = ReplayMemory()

        # Create the policy net and the target net
        self.policy_net = DQN(action_size)
        self.policy_net.to(device)

        self.optimizer = optim.Adam(params=self.policy_net.parameters(), lr=learning_rate)
        self.scheduler = optim.lr_scheduler.StepLR(self.optimizer, step_size=scheduler_step_size, gamma=scheduler_gamma)

        # Initialize a target network and set it to the policy net's weights
        ### CODE ###
        self.target_net = DQN(action_size)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.to(device)


    def load_policy_net(self, path):
        self.policy_net = torch.load(path)

    # after some time interval, update the target net to be the same as the policy net
    def update_target_net(self):
        ### CODE ###
        self.target_net.load_state_dict(self.policy_net.state_dict())


    """Get action using policy net using epsilon-greedy policy"""
    def get_action(self, state):
        if np.random.rand() <= self.epsilon:
            ### CODE ####
            # Choose a random action
            a = torch.tensor([[random.randrange(self.action_size)]], device=device, dtype=torch.long)

        else:
            ### CODE ####
            # Choose the best action
            with torch.no_grad():
                state = torch.from_numpy(state).unsqueeze(0).to(device)
                a = self.policy_net(state).max(1)[1].view(1, 1)

        return a

    # pick samples randomly from replay memory (with batch_size)
    def train_policy_net(self, frame):
        if self.epsilon > self.epsilon_min:
            self.epsilon -= self.epsilon_decay

        mini_batch = self.memory.sample_mini_batch(frame)
        mini_batch = np.array(mini_batch).transpose()

        history = np.stack(mini_batch[0], axis=0)
        states = np.float32(history[:, :4, :, :]) / 255.
        states = torch.from_numpy(states).cuda()
        actions = list(mini_batch[1])
        actions = torch.LongTensor(actions).cuda()
        rewards = list(mini_batch[2])
        rewards = torch.FloatTensor(rewards).cuda()
        next_states = np.float32(history[:, 1:, :, :]) / 255.
        dones = mini_batch[3] # checks if the game is over
        # True where the episode continues (non-terminal transitions)
        mask = torch.tensor(list(map(int, dones == False)), dtype=torch.bool, device=device)

        # Compute Q(s_t, a), the Q-value of the current state
        state_action_values = self.policy_net(states).gather(1, actions.unsqueeze(1))

        # Compute Q function of next state
        next_states = torch.from_numpy(next_states).to(device)
        non_final_next_states = next_states[mask]

        # Find maximum Q-value of action at next state from target net
        next_states_value = torch.zeros(states.size(0), device=device)
        net_outputs = self.target_net(non_final_next_states)
        next_states_value[mask] = net_outputs.max(1)[0].detach()

        # Compute the Huber Loss
        expected_state_action_values = rewards + self.discount_factor * next_states_value
        loss = F.smooth_l1_loss(state_action_values, expected_state_action_values.unsqueeze(1))

        # Optimize the model, .step() both the optimizer and the scheduler!
        self.optimizer.zero_grad()
        loss.backward()
        for param in self.policy_net.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer.step()
        self.scheduler.step()
Code Example #7
    def __init__(self,
                 experience_memory=None,
                 num_actions=4,
                 lr=0.00025,
                 alpha=0.95,
                 eps=0.01,
                 batch_size=32,
                 gamma=0.99,
                 load_pretrained=False,
                 saved_model_path='./models/a.model',
                 optim_method='RMSprop',
                 use_multiple_gpu=True):

        self.experience_memory = experience_memory  # experience replay memory
        self.lr = lr  # learning rate
        self.alpha = alpha  # optimizer parameter
        self.eps = eps  # optimizer parameter
        self.gamma = gamma  # discount factor
        self.num_actions = num_actions
        self.use_multiple_gpu = use_multiple_gpu
        self.loss_list = []
        self.L = 0.0
        # BUILD MODEL
        if torch.cuda.is_available():
            self.device = torch.device("cuda:0")
        else:
            self.device = torch.device("cpu")

        dfloat_cpu = torch.FloatTensor
        dfloat_gpu = torch.cuda.FloatTensor

        dlong_cpu = torch.LongTensor
        dlong_gpu = torch.cuda.LongTensor

        duint_cpu = torch.ByteTensor
        duint_gpu = torch.cuda.ByteTensor

        dtype = torch.cuda.FloatTensor if torch.cuda.is_available(
        ) else torch.FloatTensor
        dlongtype = torch.cuda.LongTensor if torch.cuda.is_available(
        ) else torch.LongTensor
        duinttype = torch.cuda.ByteTensor if torch.cuda.is_available(
        ) else torch.ByteTensor

        self.dtype = dtype
        self.dlongtype = dlongtype
        self.duinttype = duinttype

        Q = DQN(in_channels=4, num_actions=num_actions).type(dtype)
        if load_pretrained:
            Q.load_state_dict(torch.load(saved_model_path))
        Q_t = DQN(in_channels=4, num_actions=num_actions).type(dtype)
        Q_t.load_state_dict(Q.state_dict())
        Q_t.eval()
        for param in Q_t.parameters():
            param.requires_grad = False

        Q = Q.to(self.device)
        Q_t = Q_t.to(self.device)

        if torch.cuda.device_count() > 1 and self.use_multiple_gpu:
            Q = nn.DataParallel(Q).to(self.device)
            Q_t = nn.DataParallel(Q_t).to(self.device)

        self.batch_size = batch_size
        self.Q = Q
        self.Q_t = Q_t
        # optimizer
        if optim_method == 'SGD':
            optimizer = torch.optim.SGD(Q.parameters(), lr=self.lr)
        elif optim_method == 'RMSprop':
            optimizer = optim.RMSprop(Q.parameters(),
                                      lr=lr,
                                      alpha=alpha,
                                      eps=eps)
        else:
            optimizer = optim.RMSprop(Q.parameters(),
                                      lr=lr,
                                      alpha=alpha,
                                      eps=eps)
        self.optimizer = optimizer
        print('init: Controller --> OK')
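
Code Examples #1, #4, and #7 all build a frozen target network Q_t next to Q but do not show how it is refreshed during training. The fragment below is a minimal sketch of the usual hard update, with throwaway linear layers standing in for DQN and an assumed update interval; it is not taken from any of the quoted projects.

import torch.nn as nn

Q = nn.Linear(8, 4)    # stand-in for the online DQN
Q_t = nn.Linear(8, 4)  # stand-in for the frozen target DQN
Q_t.load_state_dict(Q.state_dict())
Q_t.eval()
for param in Q_t.parameters():
    param.requires_grad = False

target_update_freq = 1000  # assumed interval, not from the snippets
for step in range(1, 5001):
    # ... one optimization step on Q would go here ...
    if step % target_update_freq == 0:
        # Hard update: copy the online weights into the frozen target net.
        Q_t.load_state_dict(Q.state_dict())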
Code Example #8
class Agent():
    def __init__(self, action_size):
        self.action_size = action_size

        # These are hyper parameters for the DQN
        self.discount_factor = 0.99
        self.epsilon = 1.0
        self.epsilon_min = 0.01
        self.explore_step = 500000
        self.epsilon_decay = (self.epsilon - self.epsilon_min) / self.explore_step
        self.train_start = 100000
        self.update_target = 1000

        # Generate the memory
        self.memory = ReplayMemory()

        # Create the policy net
        self.policy_net = DQN(action_size)
        self.policy_net.to(device)

        self.optimizer = optim.Adam(params=self.policy_net.parameters(), lr=learning_rate)
        self.scheduler = optim.lr_scheduler.StepLR(self.optimizer, step_size=scheduler_step_size, gamma=scheduler_gamma)

    def load_policy_net(self, path):
        self.policy_net = torch.load(path)

    """Get action using policy net using epsilon-greedy policy"""
    def get_action(self, state):
        if np.random.rand() <= self.epsilon:
            ### CODE #### 
            # Choose a random action
            a = torch.tensor([[random.randrange(self.action_size)]], device=device, dtype=torch.long)

        else:
            ### CODE ####
            # Choose the best action
            with torch.no_grad():
                state = torch.from_numpy(state).reshape(1,4,84,84).to(device)
                a = self.policy_net(state).max(1)[1].view(1, 1)

        return a

    # pick samples randomly from replay memory (with batch_size)
    def train_policy_net(self, frame):
        if self.epsilon > self.epsilon_min:
            self.epsilon -= self.epsilon_decay

        mini_batch = self.memory.sample_mini_batch(frame)
        mini_batch = np.array(mini_batch).transpose()

        history = np.stack(mini_batch[0], axis=0)
        states = np.float32(history[:, :4, :, :]) / 255.
        states = torch.from_numpy(states).cuda()
        actions = list(mini_batch[1])
        actions = torch.LongTensor(actions).cuda()
        rewards = list(mini_batch[2])
        rewards = torch.FloatTensor(rewards).cuda()
        next_states = np.float32(history[:, 1:, :, :]) / 255.
        dones = mini_batch[3] # checks if the game is over
        # True where the episode continues (non-terminal transitions)
        mask = torch.tensor(list(map(int, dones == False)), dtype=torch.bool, device=device)


        # Compute Q(s_t, a), the Q-value of the current state
        ### CODE ####
        curr_Q = self.policy_net(states).gather(1, actions.unsqueeze(1)).squeeze(1)

        # Compute Q function of next state
        ### CODE ####
        next_state_values = torch.zeros(states.size(0), device=device)
        next_states = torch.from_numpy(next_states).to(device)
        # This Agent has no separate target net, so the policy net provides the bootstrap values
        next_state_values[mask] = self.policy_net(next_states[mask]).max(1)[0].detach()

        # Find maximum Q-value of action at next state from target net
        ### CODE ####
        target_Q = next_state_values * self.discount_factor + rewards

        # Compute the Huber Loss
        ### CODE ####
        loss = F.smooth_l1_loss(curr_Q, target_Q)

        # Optimize the model, .step() both the optimizer and the scheduler!
        ### CODE ####
        self.optimizer.zero_grad()
        loss.backward()
        #torch.nn.utils.clip_grad_norm_(self.policy_net.parameters(), 10)
        for param in self.policy_net.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer.step()
        self.scheduler.step()