class Agent_DQN():
    def __init__(self, env, args):
        # Parameters for q-learning

        super(Agent_DQN, self).__init__()

        self.env = env
        state = env.reset()
        state = state.transpose(2, 0, 1)

        self.policy_net = DQN(state.shape,
                              self.env.action_space.n)  # Behavior Q
        self.target_net = DQN(state.shape, self.env.action_space.n)  # Target Q
        self.target_net.load_state_dict(self.policy_net.state_dict())
        #Initial Q

        if USE_CUDA:
            print("Using CUDA . . .     ")
            self.policy_net = self.policy_net.cuda()
            self.target_net = self.target_net.cuda()

        print('hyperparameters and network initialized')

        if args.test_dqn or LOAD == True:
            print('loading trained model')
            checkpoint = torch.load('trainData')
            self.policy_net.load_state_dict(checkpoint['model_state_dict'])

        self.target_net.load_state_dict(self.policy_net.state_dict())

    def init_game_setting(self):
        print('loading trained model')
        checkpoint = torch.load('trainData')
        self.policy_net.load_state_dict(checkpoint['model_state_dict'])

    def push(self, state, action, reward, next_state, done):
        state = np.expand_dims(state, 0)
        next_state = np.expand_dims(next_state, 0)

        memory.append((state, action, reward, next_state, done))

    def replay_buffer(self):
        state, action, reward, next_state, done = zip(
            *random.sample(memory, batch_size))
        return np.concatenate(state), action, reward, np.concatenate(
            next_state), done

    def __len__(self):
        # the replay data lives in the module-level `memory` deque
        return len(memory)

    def make_action(self, observation, test=True):

        observation = observation.transpose(2, 0, 1)

        if np.random.random() > EPSILON or test:
            observation = Variable(torch.FloatTensor(
                np.float32(observation)).unsqueeze(0),
                                   volatile=True)
            if USE_CUDA:
                # keep the input on the same device as the network
                observation = observation.cuda()
            q_value = self.policy_net.forward(observation)
            action = int(q_value.max(1)[1].item())
        else:
            # sample over the actual action space instead of a hard-coded 4
            action = random.randrange(self.env.action_space.n)
        return action

    def optimize_model(self):

        # replay_buffer returns (state, action, reward, next_state, done),
        # so unpack in that order
        states, actions, rewards, next_states, dones = self.replay_buffer()

        states_v = Variable(torch.FloatTensor(np.float32(states)))
        next_states_v = Variable(torch.FloatTensor(np.float32(next_states)),
                                 volatile=True)
        actions_v = Variable(torch.LongTensor(actions))
        rewards_v = Variable(torch.FloatTensor(rewards))
        done = Variable(torch.FloatTensor(dones))

        if USE_CUDA:
            # the networks may live on the GPU, so the batch has to follow
            states_v, next_states_v = states_v.cuda(), next_states_v.cuda()
            actions_v, rewards_v, done = (actions_v.cuda(), rewards_v.cuda(),
                                          done.cuda())

        state_action_values = self.policy_net(states_v).gather(
            1, actions_v.unsqueeze(1)).squeeze(1)
        next_state_values = self.target_net(next_states_v).max(1)[0]
        expected_q_value = rewards_v + next_state_values * GAMMA * (1 - done)

        loss = (state_action_values -
                Variable(expected_q_value.data)).pow(2).mean()
        return loss

    def train(self):
        # EPSILON is read by make_action, so the decayed value must be
        # written back to the module-level name
        global EPSILON
        optimizer = optim.Adam(self.policy_net.parameters(), lr=ALPHA)

        # Fill the memory with experiences
        print('Gathering experiences ...')
        meanScore = 0
        AvgRewards = []
        AllScores = []
        step = 1
        iEpisode = 0

        while meanScore < 50:

            state = self.env.reset()
            done = False
            EpisodeScore = 0
            tBegin = time.time()

            while not done:

                # test=False so the epsilon-greedy branch is actually used
                action = self.make_action(state, test=False)
                nextState, reward, done, _ = self.env.step(action)
                self.push(state.transpose(2, 0, 1), action, reward,
                          nextState.transpose(2, 0, 1), done)

                state = nextState

                if len(memory) > StartLearning:
                    loss = self.optimize_model()
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()
                else:
                    # still filling the replay memory; don't count episodes yet
                    iEpisode = 0
                    continue

                # Update exploration factor
                EPSILON = EPS_END + (EPS_START - EPS_END) * math.exp(
                    -1. * step / EPS_DECAY)
                storeEpsilon.append(EPSILON)
                step += 1

                EpisodeScore += reward

                if step % TARGET_UPDATE == 0:
                    print('Updating Target Network . . .')
                    self.target_net.load_state_dict(
                        self.policy_net.state_dict())

            iEpisode += 1
            AllScores.append(EpisodeScore)
            meanScore = np.mean(AllScores[-100:])
            AvgRewards.append(meanScore)

            if len(memory) > StartLearning:
                print('Episode: ', iEpisode, ' score:', EpisodeScore,
                      ' Avg Score:', meanScore, ' epsilon: ', EPSILON, ' t: ',
                      time.time() - tBegin, ' loss:', loss.item())
            else:
                print('Gathering Data . . .')

            if iEpisode % 500 == 0:
                torch.save(
                    {
                        'epoch': iEpisode,
                        'model_state_dict': self.policy_net.state_dict(),
                        'optimizer_state_dict': optimizer.state_dict(),
                        'loss': loss,
                        'AvgRewards': AvgRewards
                    }, 'trainData')

                if os.path.exists("Rewards.csv"):
                    os.remove("Rewards.csv")
                with open('Rewards.csv', mode='w') as dataFile:
                    rewardwriter = csv.writer(dataFile,
                                              delimiter=',',
                                              quotechar='"',
                                              quoting=csv.QUOTE_MINIMAL)
                    rewardwriter.writerow(AvgRewards)

        print('======== Complete ========')
        torch.save(
            {
                'epoch': iEpisode,
                'model_state_dict': self.policy_net.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'loss': loss,
                'AvgRewards': AvgRewards
            }, 'trainData')

        with open('Rewards.csv', mode='w') as dataFile:
            rewardwriter = csv.writer(dataFile,
                                      delimiter=',',
                                      quotechar='"',
                                      quoting=csv.QUOTE_MINIMAL)
            rewardwriter.writerow(AvgRewards)
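# The class above relies on several module-level names that are defined
# elsewhere in the original script (the DQN network class, the shared replay
# deque, and the hyperparameters). A minimal sketch of that assumed setup is
# shown below; the values are illustrative only, and DQN itself (the
# convolutional Q-network) is still assumed to be defined elsewhere.
import csv
import math
import os
import random
import time
from collections import deque

import numpy as np
import torch
import torch.optim as optim
from torch.autograd import Variable

USE_CUDA = torch.cuda.is_available()
LOAD = False                      # resume from the 'trainData' checkpoint if True
memory = deque(maxlen=100000)     # shared replay memory used by push/replay_buffer
batch_size = 32
GAMMA = 0.99
ALPHA = 1e-4                      # Adam learning rate
EPS_START, EPS_END, EPS_DECAY = 1.0, 0.02, 30000
EPSILON = EPS_START               # decayed inside train()
TARGET_UPDATE = 1000              # steps between target-network syncs
StartLearning = 10000             # replay size required before optimization starts
storeEpsilon = []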
Example #2
class Agent_DQN(Agent):
    def __init__(self, env, args):
        """
        Initialize every things you need here.
        For example: building your model
        """

        super(Agent_DQN, self).__init__(env)
        self.env = env
        self.args = args
        self.episode = 0
        self.n_actions = self.env.action_space.n
        self.epsilon_start = 1.0
        self.epsilon_final = 0.025
        self.epsilon_decay = 3000
        self.epsilon_by_frame = lambda frame_idx: self.epsilon_final + (
            self.epsilon_start - self.epsilon_final) * math.exp(
                -1. * frame_idx / self.epsilon_decay)
        self.epsilon = 0
        # networks start on the CPU and are moved to the GPU below if available
        self.eval_net = DQN()
        self.target_net = DQN()
        self.target_net.load_state_dict(self.eval_net.state_dict())

        self.criterion = nn.MSELoss()
        #self._model = Net(self.env.observation_space.shape, self.env.action_space.n)
        self._use_cuda = torch.cuda.is_available()
        self.optim = torch.optim.Adam(self.eval_net.parameters(),
                                      lr=self.args.learning_rate)

        if self._use_cuda:
            self.eval_net = self.eval_net.cuda()
            self.target_net = self.target_net.cuda()
            self.criterion = self.criterion.cuda()

#       self.replaybuffer = ReplayBuffer(args.buffer_size)
        # cap the replay memory at the configured buffer size so the
        # "buffer full" check in train() can actually trigger
        self.buffer = deque(maxlen=args.buffer_size)
        if args.test_dqn:
            #you can load your model here
            print('loading trained model')
            self.eval_net.load_state_dict(torch.load(args.model_dqn))
            self.target_net.load_state_dict(self.eval_net.state_dict())
            if self._use_cuda:
                self.eval_net = self.eval_net.cuda()
                self.target_net = self.target_net.cuda()

        ##################
        # YOUR CODE HERE #
        ##################

    def init_game_setting(self):
        """
        Testing function will call this function at the begining of new game
        Put anything you want to initialize if necessary
        """
        ##################
        # YOUR CODE HERE #
        ##################
        pass

    def push(self, state, action, reward, next_state, done):
        state = np.expand_dims(state, 0)
        next_state = np.expand_dims(next_state, 0)
        self.buffer.append((state, action, reward, next_state, done))

    def replay_buffer(self, batch_size):
        state, action, reward, next_state, done = zip(
            *random.sample(self.buffer, batch_size))
        return np.concatenate(state), action, reward, np.concatenate(
            next_state), done

    def train(self):
        """
        Implement your training algorithm here
        """
        ##################
        # YOUR CODE HERE #
        ##################

        print('begin train...')

        #        if self.args.log_file is not None:
        #        fp_log = open(self.args.log_file, 'w', buffering=1)
        fout = open('dqn_score.log', 'w')
        if not os.path.exists('model'):
            os.makedirs('model')

        losses = []
        all_rewards = []
        episode_reward = 0
        best_mean_reward = 0
        state = self.env.reset()
        for i_step in range(self.args.max_steps):
            self.epsilon = self.epsilon_by_frame(i_step)
            action = self.make_action(state)
            next_state, reward, done, _ = self.env.step(action)

            self.push(state, action, reward, next_state, done)
            state = next_state
            episode_reward += reward

            if done:
                state = self.env.reset()
                all_rewards.append(episode_reward)
                self.episode += 1
                print('{},{}'.format(self.episode, episode_reward))
                fout.write('Episode{},episode_reward{}\n'.format(
                    self.episode, episode_reward))
                episode_reward = 0

            if len(self.buffer) >= self.args.buffer_size:
                if i_step % self.args.eval_net_update_step == 0:
                    loss = self.optimize_model()
                    losses.append(loss)

                if i_step % self.args.target_net_update_step == 0:
                    self.target_net.load_state_dict(self.eval_net.state_dict())

            if i_step % self.args.save_freq == 0:
                mean_reward = (sum(all_rewards[-100:]) /
                               max(len(all_rewards[-100:]), 1))
                if best_mean_reward < mean_reward:
                    print('save best model with mean reward = %f' %
                          mean_reward)
                    best_mean_reward = mean_reward
                    torch.save(self.eval_net.state_dict(), self.args.model_dqn)

    def make_action(self, observation, test=True):
        """
        Return predicted action of your agent
        Input:
            observation: np.array
                stack 4 last preprocessed frames, shape: (84, 84, 4)
        Return:
            action: int
                the predicted action from trained model
        """
        ##################
        # YOUR CODE HERE #
        ##################
        observation = torch.FloatTensor(
            observation.reshape((1, 84, 84, 4))).transpose(1,
                                                           3).transpose(2, 3)
        if self._use_cuda:
            observation = observation.cuda()
        #        print(type(observation))
        Q_value = self.eval_net.forward(observation).data.cpu().numpy()
        if random.random() > self.epsilon:
            action = np.argmax(Q_value)
        else:
            action = self.env.get_random_action()
        return action

    def optimize_model(self):

        state, action, reward, next_state, done = self.replay_buffer(
            self.args.batch_size)

        state = torch.FloatTensor(np.float32(state)).permute(0, 3, 1, 2)
        next_state = torch.FloatTensor(np.float32(next_state)).permute(
            0, 3, 1, 2)
        action = torch.LongTensor(action)
        reward = torch.FloatTensor(reward)
        # floats so that (1 - done) in the target below stays a float tensor
        done = torch.FloatTensor(done)

        if self._use_cuda:
            state = state.cuda()
            next_state = next_state.cuda()
            action = action.cuda()
            reward = reward.cuda()
            done = done.cuda()

        q_values = self.eval_net(state)

        q_value = q_values.gather(1, action.unsqueeze(1)).squeeze(1)

        next_q_values = self.target_net(next_state).detach()
        next_q_value = next_q_values.max(1)[0]

        expected_q_value = reward + self.args.gamma * next_q_value * (1 - done)

        loss = self.criterion(q_value, expected_q_value.data)

        self.optim.zero_grad()
        loss.backward()
        self.optim.step()

        return loss
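# Example #2 above reads every hyperparameter from an `args` object. The
# attribute names below are the ones the code actually accesses; the default
# values are illustrative guesses rather than the original settings.
import argparse


def build_dqn_args():
    parser = argparse.ArgumentParser(description='DQN agent arguments (example #2)')
    parser.add_argument('--test_dqn', action='store_true')
    parser.add_argument('--model_dqn', default='model/dqn.pt')
    parser.add_argument('--learning_rate', type=float, default=1.5e-4)
    parser.add_argument('--batch_size', type=int, default=32)
    parser.add_argument('--gamma', type=float, default=0.99)
    parser.add_argument('--buffer_size', type=int, default=10000)
    parser.add_argument('--max_steps', type=int, default=5000000)
    parser.add_argument('--eval_net_update_step', type=int, default=4)
    parser.add_argument('--target_net_update_step', type=int, default=1000)
    parser.add_argument('--save_freq', type=int, default=10000)
    return parser.parse_args()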
class Agent_DQN(Agent):
    def __init__(self, env, args):
        """
        Initialize everything you need here.
        For example:
            parameters for the neural network
            initialize Q net and target Q net
            parameters for the replay buffer
            parameters for q-learning; decaying epsilon-greedy
        """
        super(Agent_DQN, self).__init__(env)
        ###########################
        # initializations for replay memory
        self.env = env
        self.buffer = collections.deque(
            maxlen=REPLAY_SIZE)  # initializing a replay memory buffer

        #initializations of agent
        self._reset()
        self.last_action = 0
        self.net = DQN((4, 84, 84), self.env.action_space.n).to(DEVICE)
        self.target_net = DQN((4, 84, 84), self.env.action_space.n).to(DEVICE)
        # hard-coded to True so training also resumes from the MODEL checkpoint;
        # set it to False to start from randomly initialized weights
        LOAD_MODEL = True

        if args.test_dqn:
            #you can load your model here
            print('preparing to load trained model')
            ###########################
            LOAD_MODEL = True

        if LOAD_MODEL:
            self.net.load_state_dict(
                torch.load(MODEL, map_location=lambda storage, loc: storage))
            print('loaded trained model')
            self.target_net.load_state_dict(self.net.state_dict())

    def init_game_setting(self):
        """
        Testing function will call this function at the begining of new game
        Put anything you want to initialize if necessary.
        If no parameters need to be initialized, you can leave it as blank.
        """
        ###########################

        ###########################
        pass

    def push(self, experience):
        """ You can add additional arguments as you need. 
        Push new data to buffer and remove the old one if the buffer is full.
        """
        ###########################
        self.buffer.append(experience)
        ###########################

    def replay_buffer(self, batch_size):
        """ You can add additional arguments as you need.
        Select batch from buffer.

        sample a batch of 32 from the experience collected
        """
        ###########################
        indices = np.random.choice(len(self.buffer), batch_size, replace=False)
        states, actions, rewards, dones, next_states = zip(
            *[self.buffer[idx] for idx in indices])
        ###########################
        # The 'states' below are already in the transposed form because they are sampled from experience
        return np.array(states, dtype=np.float32), np.array(actions), np.array(
            rewards, dtype=np.float32), np.array(
                dones, dtype=bool), np.array(next_states)

    def _reset(self):
        self.state = self.env.reset()
        self.total_reward = 0.0

    def make_action(self, observation, test=True):
        """
        this is exclusively for testing our actions
        select action
        """
        # wrapping the observation in a list adds a batch dimension of 1
        state_a_test = np.array([observation.transpose(2, 0, 1)], copy=False)
        state_v_test = torch.tensor(state_a_test).to(DEVICE)
        # feeding the observation to the network
        Q_values_v_test = self.net.forward(state_v_test)
        # picking the action with the highest Q value
        _, action_v_test = torch.max(Q_values_v_test, dim=1)
        # converting the tensor to an int
        action_test = int(action_v_test.item())
        ###########################
        return action_test

    def make_action_train(self, net, epsilon=0.0, device=DEVICE):
        """
        select action using epsilon greedy method for training purposes
        """

        if np.random.random() < epsilon:
            action = random.randrange(self.env.action_space.n)

        else:
            # wrapping the state in a list adds a batch dimension of 1
            # and makes it a tensor that can be fed to the net
            state_a = np.array([self.state.transpose(2, 0, 1)], copy=False)
            state_v = Variable(torch.FloatTensor(state_a).to(device))

            Q_values_v = net.forward(state_v)

            # picking the best action
            _, action_v = torch.max(Q_values_v, dim=1)
            # converting the tensor to an int
            action = int(action_v.item())

        ###########################
        return action

    def take_a_step(self, net, epsilon=0.0, device=DEVICE):
        """
        execute action and take a step in the environment
        add the state,action,rewards to the experience replay
        return the total_reward
        """
        done_reward = None

        action_for_exp = self.make_action_train(net, epsilon, device)

        new_state, reward, is_done, _ = self.env.step(action_for_exp)

        # total_reward accumulates the reward of the current episode
        self.total_reward += reward

        #remember that the state that comes in from taking a step in our environment
        # will be in the form of width X height X depth

        # But whatever state goes into experience will be in the form of depth X height X width
        # i.e the experience buffer will have state in the transposed format
        # because this is the format that pytorch input should look like
        exp = Experience(self.state.transpose(2, 0, 1), action_for_exp, reward,
                         is_done, new_state.transpose(2, 0, 1))

        #adding experiences in our replay memory
        self.push(exp)
        self.state = new_state

        if is_done:
            done_reward = self.total_reward
            self._reset()
        return done_reward

    def loss_function(self, batch, net, target_net, optimizer, device=DEVICE):

        states, actions, rewards, dones, next_states = batch

        states_v = Variable(torch.FloatTensor(states).to(device))
        next_states_v = Variable(torch.FloatTensor(next_states).to(device))
        actions_v = Variable(torch.LongTensor(actions).to(device))
        rewards_v = Variable(torch.FloatTensor(rewards).to(device))
        done = Variable(torch.FloatTensor(dones).to(device))

        #Q_vals
        # Q(s, a) from the behavior network for the actions actually taken
        state_action_values = net(states_v).gather(
            1,
            actions_v.long().unsqueeze(-1)).squeeze(-1)

        # max_a' Q_target(s', a'), detached so it is treated as a fixed target
        next_state_values = target_net(next_states_v).max(1)[0].detach()

        expected_state_action_values = rewards_v + next_state_values * GAMMA * (
            1 - done)

        loss = (state_action_values -
                expected_state_action_values).pow(2).mean()

        # we don't want to accumulate gradients,
        # so they must be zeroed at every iteration

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        return loss

    def train(self):
        """
        Implement your training algorithm here
        """
        ###########################
        device = torch.device(DEVICE)

        #defining the optimizer for your neural network
        optimizer = optim.RMSprop(self.net.parameters(), lr=LEARNING_RATE)

        #empty list of total rewards
        total_rewards = []
        best_mean_reward = None
        # initializations for time and speed calculation
        frame_idx = 0
        timestep_frame = 0
        timestep = time.time()

        while True:

            frame_idx += 1
            self.epsilon = EPSILON_END + (EPSILON_START -
                                          EPSILON_END) * math.exp(
                                              -1. * frame_idx / EPSILON_DECAY)

            reward = self.take_a_step(self.net, self.epsilon, device=device)

            if reward is not None:
                #appending rewards in an empty list of total_rewards
                total_rewards.append(reward)

                # frames-per-second bookkeeping
                speed = (frame_idx - timestep_frame) / (time.time() - timestep)
                timestep_frame = frame_idx
                timestep = time.time()

                # mean of the most recent 100 episode rewards
                mean_reward = np.mean(total_rewards[-100:])

                print(
                    "{} frames: done {} games, mean reward {}, epsilon {}, speed {} frames/s"
                    .format(frame_idx, len(total_rewards),
                            round(mean_reward, 3), round(self.epsilon, 2),
                            round(speed, 2)))

                if best_mean_reward is None or best_mean_reward < mean_reward:

                    if best_mean_reward is not None:
                        print("New best mean reward {} -> {}, model saved".
                              format(round(best_mean_reward, 3),
                                     round(mean_reward, 3)))
                    # remember the new best and checkpoint the network
                    best_mean_reward = mean_reward
                    torch.save(self.net.state_dict(),
                               'breakoutNoFrameSkip-4v1' + '.dat')

            if frame_idx % SAVE_INTERVAL == 0:
                torch.save(self.net.state_dict(),
                           'breakoutNoFrameSkip-4v1' + '.dat')

            #checking the replay memory
            if len(self.buffer) < LEARNING_STARTS:
                continue

            #check if we need to update our target function
            if frame_idx % TARGET_UPDATE_INTERVAL == 0:
                self.target_net.load_state_dict(self.net.state_dict())

            # sampling a batch from buffer
            batch = self.replay_buffer(BATCH_SIZE)
            #calculate and backpropogate
            loss_t = self.loss_function(batch, self.net, self.target_net,
                                        optimizer, device)

            # log the loss roughly every 100 episodes
            if len(total_rewards) % 100 == 0:
                print("loss at episode " + str(len(total_rewards)) + " is " +
                      str(float(loss_t.item())))

            with open('rewards_collection-100mean.csv', mode='w') as dataFile:
                writer = csv.writer(dataFile,
                                    delimiter=',',
                                    quotechar='"',
                                    quoting=csv.QUOTE_MINIMAL)
                writer.writerow(total_rewards)

        self.env.close()
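# Example #3 above assumes an Experience record and a set of module-level
# constants that are defined elsewhere in the original script. A minimal
# sketch follows; the field order of Experience matches how take_a_step
# builds it and how replay_buffer unpacks it, while the numeric values are
# illustrative only.
import collections

import torch

Experience = collections.namedtuple(
    'Experience', ['state', 'action', 'reward', 'done', 'new_state'])

DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
MODEL = 'breakoutNoFrameSkip-4v1.dat'   # checkpoint loaded when LOAD_MODEL is True
REPLAY_SIZE = 100000                    # replay memory capacity
LEARNING_STARTS = 10000                 # transitions collected before learning
BATCH_SIZE = 32
GAMMA = 0.99
LEARNING_RATE = 2.5e-4
EPSILON_START, EPSILON_END, EPSILON_DECAY = 1.0, 0.02, 100000
TARGET_UPDATE_INTERVAL = 1000           # frames between target-network syncs
SAVE_INTERVAL = 10000                   # frames between checkpoints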
Example #4
class Agent_DQN(Agent):
    def __init__(self, env, args):
        """
        Initialize everything you need here.
        For example:
            parameters for the neural network
            initialize Q net and target Q net
            parameters for the replay buffer
            parameters for q-learning; decaying epsilon-greedy
            ...
        """
        super(Agent_DQN, self).__init__(env)
        ###########################
        # YOUR IMPLEMENTATION HERE #
        #Gym parameters
        self.num_actions = env.action_space.n

        # parameters for repaly buffer
        self.buffer_max_len = 20000
        self.buffer = deque(maxlen=self.buffer_max_len)
        self.episode_reward_list = []
        self.moving_reward_avg = []

        # paramters for neural network
        self.batch_size = 32
        self.gamma = 0.999
        self.eps_threshold = 0
        self.eps_start = 1
        self.eps_end = 0.025
        self.max_episode_decay = 10000
        self.device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")

        #Training
        self.steps_done = 0
        self.num_episode = 20000
        self.target_update = 5000
        self.learning_rate = 1.5e-4

        # Neural Network
        self.policy_net = DQN().to(self.device)
        self.target_net = DQN().to(self.device)
        self.optimizer = optim.Adam(self.policy_net.parameters(),
                                    lr=self.learning_rate)

        if args.test_dqn:
            #you can load your model here
            print('loading trained model')
            self.policy_net = torch.load('policy_net.hb5')
            self.policy_net.eval()
            ###########################
            # YOUR IMPLEMENTATION HERE #

    def init_game_setting(self):
        """
        Testing function will call this function at the beginning of a new game
        Put anything you want to initialize if necessary.
        If no parameters need to be initialized, you can leave it as blank.
        """
        ###########################
        # YOUR IMPLEMENTATION HERE #

        ###########################
        pass

    def make_action(self, observation, test=True):
        """
        Return predicted action of your agent
        Input:
            observation: np.array
                stack 4 last preprocessed frames, shape: (84, 84, 4)
        Return:
            action: int
                the predicted action from trained model
        """
        ###########################
        # YOUR IMPLEMENTATION HERE #
        with torch.no_grad():
            sample = random.random()

            ## Check if this is the best way to decline
            observation = torch.tensor(observation,
                                       dtype=torch.float,
                                       device=self.device).permute(
                                           2, 0, 1).unsqueeze(0)

            if test:
                print("testing")
                return self.policy_net(observation).max(1)[1].item()

            if sample > self.eps_threshold:
                #print("Above threshold")
                return self.policy_net(observation).max(1)[1].item()
            else:
                #print("Below Threshold")
                return self.env.action_space.sample()
        ###########################

    def push(self, state, reward, action, next_state, done):
        """ You can add additional arguments as you need. 
        Push new data to buffer and remove the old one if the buffer is full.
        
        Hints:
        -----
            you can consider deque(maxlen = 10000) list
        """
        ###########################
        # YOUR IMPLEMENTATION HERE #
        self.buffer.append((state, reward, action, next_state, done))
        ###########################

    def replay_buffer(self, batch_size):
        """ You can add additional arguments as you need.
        Select batch from buffer.
        """
        ###########################
        # YOUR IMPLEMENTATION HERE #
        batch = random.sample(self.buffer, batch_size)
        states = []
        rewards = []
        actions = []
        next_states = []
        dones = []
        for sample in batch:
            state, reward, action, next_state, done = sample
            states.append(state)
            rewards.append(reward)
            actions.append(action)
            next_states.append(next_state)
            dones.append(done)
        ###########################
        return states, rewards, actions, next_states, dones

    def update(self):
        if self.steps_done < 5000:
            return
        states, rewards, actions, next_states, dones = self.replay_buffer(
            self.batch_size)
        loss = self.compute_loss(states, rewards, actions, next_states, dones)
        self.optimizer.zero_grad()
        loss.backward()
        for param in self.policy_net.parameters():
            # clamp_ is in-place; plain clamp returns a new tensor and would
            # leave the gradients untouched
            param.grad.data.clamp_(-1, 1)
        self.optimizer.step()

    def compute_loss(self, states, rewards, actions, next_states, dones):
        non_final_mask = torch.tensor([not done for done in dones],
                                      dtype=torch.bool, device=self.device)

        states = torch.tensor(states,
                              dtype=torch.float).permute(0, 3, 1,
                                                         2).to(self.device)
        rewards = torch.tensor(rewards, dtype=torch.float).to(self.device)
        actions = torch.tensor(actions, dtype=torch.long).to(self.device)
        next_states = torch.tensor(next_states, dtype=torch.float).permute(
            0, 3, 1, 2).to(self.device)
        dones = torch.tensor(dones, dtype=torch.long).to(self.device)

        Q_current = self.policy_net.forward(states).gather(
            1, actions.unsqueeze(1))
        Q_current = Q_current.squeeze(1)
        ## Should do this with no grad

        next_state_values = torch.zeros(self.batch_size, device=self.device)
        next_state_values[non_final_mask] = self.target_net(
            next_states[non_final_mask]).max(1)[0].detach()
        expected_state_action_values = (next_state_values *
                                        self.gamma) + rewards

        loss = F.smooth_l1_loss(Q_current, expected_state_action_values)

        del states, rewards, actions, next_states, dones, Q_current, next_state_values, expected_state_action_values

        return loss

    def train(self):
        """
        Implement your training algorithm here
        """
        ###########################
        # YOUR IMPLEMENTATION HERE #

        for episode in range(self.num_episode):
            # scale raw pixel observations to [0, 1]
            observation = self.env.reset() / 255

            # linear decay from eps_start to eps_end over max_episode_decay episodes
            self.eps_threshold = max(
                self.eps_start +
                (((self.eps_end - self.eps_start) / self.max_episode_decay) *
                 episode), self.eps_end)
            episode_steps = 0
            done = False
            episode_reward = 0
            ## Not sure if this is the right way to do this?
            while not done:
                action = self.make_action(observation, test=False)
                new_observation, reward, done, _ = self.env.step(action)

                new_observation = new_observation / 255
                episode_reward += reward
                self.steps_done += 1
                episode_steps += 1

                self.push(observation, reward, action, new_observation, done)

                ## Updating the network
                self.update()

                observation = new_observation

                if self.steps_done % self.target_update == 0:
                    self.target_net.load_state_dict(
                        self.policy_net.state_dict())
            self.episode_reward_list.append(episode_reward)

            if episode % 100 == 0:
                print('episode: {} reward: {} episode length: {}'.format(
                    episode, episode_reward, episode_steps))
                torch.save(self.policy_net.state_dict(), 'test_model.pt')
        ###########################
        print("Done")