Example #1
class DoubleAgent(object):
    def __init__(self,
                 game1,
                 game2,
                 mem_size = 1000000,
                 state_buffer_size = 4,
                 batch_size = 64,
                 learning_rate = 1e-5,
                 pretrained_model = None,
                 pretrained_subnet1 = False,
                 pretrained_subnet2 = False,
                 frameskip = 4,
                 frozen = False
                 ):
        """
        Inputs:
        - game 1: string to select the game 1
        - game 2: string to select the game 2
        - mem_size: int length of the replay memory
        - state_buffer_size: int number of recent frames used as input for neural network
        - batch_size: int
        - learning_rate: float
        - pretrained_model: str path to the model
        - pretrained_subnet1: str path to the model of the subnet
        - pretrained_subnet2: str path to the model of the subnet
        - frozen: boolean freeze pretrained subnets
        """

        # Namestring
        self.game1 = game1
        self.game2 = game2

        # Environment
        self.env1 = Environment(game_name[game1], dimensions[game1], frameskip=frameskip)
        self.env2 = Environment(game_name[game2], dimensions[game2], frameskip=frameskip)
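        # game_name and dimensions are assumed to be module-level lookup tables
        # (game string -> gym environment id / crop box) imported alongside Environment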


        # Neural net
        self.pretrained_subnet1 = pretrained_subnet1
        self.pretrained_subnet2 = pretrained_subnet2
        self.net = TwinDQN(channels_in = state_buffer_size,
                             num_actions = self.env2.get_number_of_actions(),
                             pretrained_subnet1 = pretrained_subnet1,
                             pretrained_subnet2 = pretrained_subnet2,
                             frozen = frozen)
        self.target_net = TwinDQN(channels_in = state_buffer_size,
                                    num_actions = self.env2.get_number_of_actions(),
                                    pretrained_subnet1 = pretrained_subnet1,
                                    pretrained_subnet2 = pretrained_subnet2,
                                    frozen = frozen)

        # Cuda
        self.use_cuda = torch.cuda.is_available()
        if self.use_cuda:
            self.net.cuda()
            self.target_net.cuda()

        # Pretrained
        if pretrained_model:
            self.net.load(pretrained_model)
            self.target_net.load(pretrained_model)
            self.pretrained_model = True
        else:
            self.pretrained_model = False

        # Optimizer
        self.learning_rate = learning_rate
        self.optimizer = optim.Adam(filter(lambda p: p.requires_grad, self.net.parameters()),
                                    lr=learning_rate)
        #self.optimizer = optim.RMSprop(filter(lambda p: p.requires_grad, self.net.parameters()),
        #                               lr=learning_rate,alpha=0.95, eps=0.01)

        self.batch_size = batch_size
        self.optimize_each_k = 1
        self.update_target_net_each_k_steps = 10000
        self.noops_count = 0

        # Replay Memory (Long term memory)
        self.replay = ReplayMemory(capacity=mem_size, num_history_frames=state_buffer_size)
        self.mem_size = mem_size

        # Fill replay memory before training
        if not self.pretrained_model:
            self.start_train_after = 50000
        else:
            self.start_train_after = mem_size//2

        # Buffer for the most recent states (Short term memory)
        self.num_stored_frames = state_buffer_size

        # Steps
        self.steps = 0

        # Save net
        self.save_net_each_k_episodes = 500


    def select_action(self, observation, mode='train'):
        """
        Select an random action from action space or an proposed action
        from neural network depending on epsilon

        Inputs:
        - observation: np.array with the observation

        Returns:
        - action: int
        """
        # Hyperparameters
        EPSILON_START = 1
        EPSILON_END = 0.1
        EPSILON_DECAY = 1000000
        EPSILON_PLAY = 0.01
        MAXNOOPS = 30

        # Decrease of epsilon value
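        # Epsilon is annealed linearly from EPSILON_START to EPSILON_END over EPSILON_DECAY steps;
        # EPSILON_PLAY is the small residual exploration rate used when evaluating a pretrained model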
        if not self.pretrained_model:
            #epsilon = EPSILON_END + (EPSILON_START - EPSILON_END) * \
            #                        np.exp(-1. * (self.steps-self.batch_size) / EPSILON_DECAY)
            epsilon = max(EPSILON_END,
                          EPSILON_START - self.steps * (EPSILON_START - EPSILON_END) / EPSILON_DECAY)
        elif mode=='play':
            epsilon = EPSILON_PLAY
        else:
            epsilon = EPSILON_END

        if epsilon < random():
            # Action according to neural net
            # Wrap tensor into variable
            state_variable = Variable(observation, volatile=True)

            # Evaluate network and return action with maximum of activation
            action = self.net(state_variable).data.max(1)[1].view(1,1)

            # Prevent getting stuck: after MAXNOOPS consecutive actions other than 1 (fire/shot), force action 1
            if action[0,0]!=1:
                self.noops_count += 1
                if self.noops_count == MAXNOOPS:
                    action[0,0] = 1
                    self.noops_count = 0
            else:
                self.noops_count = 0
        else:
            # Random action
            action = self.env2.sample_action()
            action = LongTensor([[action]])

        return action


    def map_action(self, action):
        """
        Maps action from game with more actions
        to game with less actions

        Inputs:
        - action: int
        Returns:
        - action: int
        """
        # Map SpaceInvaders on Breakout
        if self.game1=='Breakout' and self.game2=='SpaceInvaders':
            if action>3: # shoot+right/left --> right/left
                return action-2

        # Map Assault on SpaceInvaders
        if self.game1=='SpaceInvaders' and self.game2=='Assault':
            if action!=0: # all actions except 2nd idle
                return action-1

        # Map Phoenix on SpaceInvaders
        if self.game1=='SpaceInvaders' and self.game2=='Phoenix':
            if action==4: # shield --> idle
                return 0
            if action==7: # shield+shot --> shot
                return 1
            if action>4: # shoot+right/left --> shoot+right/left
                return action-1

        # Map Phoenix on Assault
        if self.game1=='Assault' and self.game2=='Phoenix':
            if action==4: # shield --> idle
                return 0
            if action==7: # shield+shot --> shot
                return 2
            if 1<= action and action<=3: # shot/right/left --> shot/right/left
                return action+1

        # No mapping necessary
        return action


    def optimize(self, net_updates):
        """
        Optimizer function

        Inputs:
        - net_updates: int

        Returns:
        - loss: float
        - q_value: float
        - exp_q_value: float
        """
        # Hyperparameter
        GAMMA = 0.99

        #   not enough memory yet
        if len(self.replay) < self.start_train_after:
            return

        # Sample a transition
        batch = self.replay.sampleTransition(self.batch_size)

        # Mask to indicate which states are not final (=done=game over)
        non_final_mask = ByteTensor(list(map(lambda ns: ns is not None, batch.next_state)))

        # Wrap tensors in variables
        state_batch = Variable(torch.cat(batch.state))
        action_batch = Variable(torch.cat(batch.action))
        reward_batch = Variable(torch.cat(batch.reward))
        non_final_next_states = Variable(torch.cat([ns for ns in batch.next_state if ns is not None]),
                                              volatile=True) # volatile==true prevents calculation of the derivative

        next_state_values = Variable(torch.zeros(self.batch_size).type(FloatTensor), volatile=False)

        if self.use_cuda:
            state_batch = state_batch.cuda()
            action_batch = action_batch.cuda()
            reward_batch = reward_batch.cuda()
            non_final_mask = non_final_mask.cuda()
            non_final_next_states = non_final_next_states.cuda()
            next_state_values = next_state_values.cuda()

        # Compute Q(s_t, a) - the self.net computes Q(s_t), then we select the
        # columns of actions taken
        state_action_values = self.net(state_batch).gather(1, action_batch)

        # Compute V(s_{t+1}) for all next states.
        next_max_values = self.target_net(non_final_next_states).detach().max(1)[0]
        next_state_values[non_final_mask] = next_max_values

        # Compute the expected Q values
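        # Bellman target: y = r + GAMMA * max_a' Q_target(s', a'), with 0 for terminal next states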
        expected_state_action_values = (next_state_values * GAMMA) + reward_batch

        # Compute Huber loss
        loss = F.smooth_l1_loss(state_action_values, expected_state_action_values)

        self.optimizer.zero_grad()


        loss.backward()
        for param in self.net.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer.step()

        if net_updates%self.update_target_net_each_k_steps==0:
            self.target_net.load_state_dict(self.net.state_dict())
            print('target_net update!')

        return loss.data.cpu().numpy()[0]


    def play(self, n):
        """
        Play a game with the current net and render it

        Inputs:
        - n: games to play
        """
        for i in range(n):
            done = False # games end indicator variable

            # Score counter
            total_reward_game1 = 0
            total_reward_game2 = 0
            total_reward = 0

            # Reset game
            screen1 = self.env1.reset()
            screen2 = self.env2.reset()

            # list of k last frames
            last_k_frames = []
            for j in range(self.num_stored_frames):
                last_k_frames.append(None)
                last_k_frames[j] = torch.cat((gray2pytorch(screen1),gray2pytorch(screen2)),dim=1)

            # frame is saved as ByteTensor -> convert to gray value between 0 and 1
            state = torch.cat(last_k_frames,1).type(FloatTensor)/255.0

            while not done:
                action = self.select_action(state, mode='play')[0,0]
                action1 = self.map_action(action)
                action2 = action

                # perform selected action on game
                screen1, reward1, _, done1, _ = self.env1.step(action1, mode='play')
                screen2, reward2, _, done2, _ = self.env2.step(action2, mode='play')

                # Logging
                total_reward_game1 += int(reward1)
                total_reward_game2 += int(reward2)
                total_reward += int(reward1) + int(reward2)

                # save latest frame, discard oldest
                for j in range(self.num_stored_frames-1):
                    last_k_frames[j] = last_k_frames[j+1]
                last_k_frames[self.num_stored_frames-1] = torch.cat((gray2pytorch(screen1),gray2pytorch(screen2)),dim=1)

                # convert frames to range 0 to 1 again
                state = torch.cat(last_k_frames,1).type(FloatTensor)/255.0

                # Merged game over indicator
                done = done1 or done2
            print('Final scores Game ({}/{}): {}: {}    '.format(i+1, n, self.game1, total_reward_game1) +
                  '{}: {}    '.format(self.game2, total_reward_game2) +
                  'total: {}'.format(total_reward))
        self.env1.game.close()
        self.env2.game.close()


    def play_stats(self, n_games, mode='random'):
        """
        Play N games randomly or evaluate a net and log results for statistics

        Input:
        - n_games: int Number of games to play
        - mode: str 'random' or 'evaluation'
        """
        # Subdirectory for logging
        sub_dir = mode + '_' + self.game1 + '+' + self.game2 + '/'
        if not os.path.exists(sub_dir):
            os.makedirs(sub_dir)

        # Store history total
        reward_history = []
        reward_clamped_history = []
        # Store history game 1
        reward_history_game1 = []
        reward_clamped_history_game1 = []
        # Store history game 2
        reward_history_game2 = []
        reward_clamped_history_game2 = []

        # Number of actions to sample from
        n_actions = self.env2.get_number_of_actions()

        for i_episode in range(1, n_games+1):
            # Reset game
            screen1 = self.env1.reset()
            screen2 = self.env2.reset()

            # Store screen
            if mode=='evaluation':
                # list of k last frames
                last_k_frames = []
                for j in range(self.num_stored_frames):
                    last_k_frames.append(None)
                    last_k_frames[j] = torch.cat((gray2pytorch(screen1),gray2pytorch(screen2)),dim=1)
                # frame is saved as ByteTensor -> convert to gray value between 0 and 1
                state = torch.cat(last_k_frames,1).type(FloatTensor)/255.0

            # games end indicator variable
            done = False

            # reset score with initial lives, because every lost life adds -1
            total_reward_game1 = 0
            total_reward_clamped_game1 = self.env1.get_lives()
            total_reward_game2 = 0
            total_reward_clamped_game2 = self.env2.get_lives()
            # total scores for both games
            total_reward = total_reward_game1 + total_reward_game2
            total_reward_clamped = total_reward_clamped_game1 + total_reward_clamped_game2

            while not done:
                if mode=='random':
                    action = randrange(n_actions)
                elif mode=='evaluation':
                    action = self.select_action(state, mode='play')[0,0]
                action1 = self.map_action(action)
                action2 = action

                screen1, reward1, reward1_clamped, done1, _ = self.env1.step(action1)
                screen2, reward2, reward2_clamped, done2, _ = self.env2.step(action2)

                # Logging
                total_reward_game1 += int(reward1)
                total_reward_game2 += int(reward2)
                total_reward += int(reward1) + int(reward2)
                total_reward_clamped_game1 += reward1_clamped
                total_reward_clamped_game2 += reward2_clamped
                total_reward_clamped += reward1_clamped + reward2_clamped

                if mode=='evaluation':
                    # save latest frame, discard oldest
                    for j in range(self.num_stored_frames-1):
                        last_k_frames[j] = last_k_frames[j+1]
                    last_k_frames[self.num_stored_frames-1] = torch.cat((gray2pytorch(screen1),gray2pytorch(screen2)),dim=1)

                    # convert frames to range 0 to 1 again
                    state = torch.cat(last_k_frames,1).type(FloatTensor)/255.0

                # Merged game over indicator
                done = done1 or done2

            # Print current result
            print('Episode: {:6}/{:6} |   '.format(i_episode, n_games) +
                  'score total: ({:6.1f}/{:7.1f}) |   '.format(total_reward_clamped,total_reward) +
                  'score game1: ({:6.1f}/{:7.1f}) |   '.format(total_reward_clamped_game1,total_reward_game1) +
                  'score game2: ({:6.1f}/{:7.1f})'.format(total_reward_clamped_game2,total_reward_game2))

            # Save rewards
            reward_history_game1.append(total_reward_game1)
            reward_history_game2.append(total_reward_game2)
            reward_history.append(total_reward)
            reward_clamped_history_game1.append(total_reward_clamped_game1)
            reward_clamped_history_game2.append(total_reward_clamped_game2)
            reward_clamped_history.append(total_reward_clamped)

        avg_reward_total = np.sum(reward_history) / len(reward_history)
        avg_reward_total_clamped = np.sum(reward_clamped_history) / len(reward_clamped_history)
        avg_reward_game1 = np.sum(reward_history_game1) / len(reward_history_game1)
        avg_reward_game1_clamped = np.sum(reward_clamped_history_game1) / len(reward_clamped_history_game1)
        avg_reward_game2 = np.sum(reward_history_game2) / len(reward_history_game2)
        avg_reward_game2_clamped = np.sum(reward_clamped_history_game2) / len(reward_clamped_history_game2)

        # Print final result
        print('\n\n===========================================\n' +
              'avg score after {:6} episodes:\n'.format(n_games) +
              'avg total: ({:6.1f}/{:7.1f})\n'.format(avg_reward_total_clamped,avg_reward_total) +
              'avg game1: ({:6.1f}/{:7.1f})\n'.format(avg_reward_game1_clamped,avg_reward_game1) +
              'avg game2: ({:6.1f}/{:7.1f})\n'.format(avg_reward_game2_clamped,avg_reward_game2))

        # Log results to files
        with open(sub_dir + mode + '.txt', 'w') as fp:
            fp.write('avg score after {:6} episodes:\n'.format(n_games) +
                     'avg total: ({:6.1f}/{:7.1f})\n'.format(avg_reward_total_clamped,avg_reward_total) +
                     'avg game1: ({:6.1f}/{:7.1f})\n'.format(avg_reward_game1_clamped,avg_reward_game1) +
                     'avg game2: ({:6.1f}/{:7.1f})\n'.format(avg_reward_game2_clamped,avg_reward_game2))

        # Dump reward
        with open(sub_dir + mode + '_reward_game1.pickle', 'wb') as fp:
            pickle.dump(reward_history_game1, fp)
        with open(sub_dir + mode + '_reward_game2.pickle', 'wb') as fp:
            pickle.dump(reward_history_game2, fp)
        with open(sub_dir + mode + '_reward_total.pickle', 'wb') as fp:
            pickle.dump(reward_history, fp)

        with open(sub_dir + mode + '_reward_clamped_game1.pickle', 'wb') as fp:
            pickle.dump(reward_clamped_history_game1, fp)
        with open(sub_dir + mode + '_reward_clamped_game2.pickle', 'wb') as fp:
            pickle.dump(reward_clamped_history_game2, fp)
        with open(sub_dir + mode + '_reward_clamped_total.pickle', 'wb') as fp:
            pickle.dump(reward_clamped_history, fp)


    def train(self):
        """
        Train the agent
        """
        num_episodes = 100000
        net_updates = 0

        # Logging
        sub_dir = self.game1 + '+' + self.game2 + '_' + datetime.now().strftime('%Y%m%d_%H%M%S') + '/'
        os.makedirs(sub_dir)
        logfile = sub_dir + 'train.txt'
        reward_file = sub_dir + 'reward.pickle'
        reward_file_game1 = sub_dir + 'reward_game1.pickle'
        reward_file_game2 = sub_dir + 'reward_game2.pickle'
        reward_clamped_file = sub_dir + 'reward_clamped.pickle'
        reward_clamped_file_game1 = sub_dir + 'reward_clamped_game1.pickle'
        reward_clamped_file_game2 = sub_dir + 'reward_clamped_game2.pickle'
        log_avg_episodes = 50

        # Total scores
        best_score = 0
        best_score_clamped = 0
        avg_score = 0
        avg_score_clamped = 0
        reward_history = []
        reward_clamped_history = []
        # Scores game 1
        avg_score_game1 = 0
        avg_score_clamped_game1 = 0
        reward_history_game1 = []
        reward_clamped_history_game1 = []
        # Scores game 2
        avg_score_game2 = 0
        avg_score_clamped_game2 = 0
        reward_history_game2 = []
        reward_clamped_history_game2 = []

        # Initialize logfile with header
        with open(logfile, 'w') as fp:
            fp.write(datetime.now().strftime('%Y-%m-%d %H:%M:%S') + '\n' +
                     'Trained game (first):               {}\n'.format(self.game1) +
                     'Trained game (second):              {}\n'.format(self.game2) +
                     'Learning rate:                      {:.2E}\n'.format(self.learning_rate) +
                     'Batch size:                         {:d}\n'.format(self.batch_size) +
                     'Memory size(replay):                {:d}\n'.format(self.mem_size) +
                     'Pretrained:                         {}\n'.format(self.pretrained_model) +
                     'Pretrained subnet 1:                {}\n'.format(self.pretrained_subnet1) +
                     'Pretrained subnet 2:                {}\n'.format(self.pretrained_subnet2) +
                     'Started training after k frames:    {:d}\n'.format(self.start_train_after) +
                     'Optimized after k frames:           {:d}\n'.format(self.optimize_each_k) +
                     'Target net update after k frames:   {:d}\n\n'.format(self.update_target_net_each_k_steps) +
                     '--------+-----------+----------------------+------------' +
                     '----------+----------------------+--------------------\n' +
                     'Episode | Steps     | ' +
                     '{:3} games avg total  | '.format(log_avg_episodes) +
                     '{:3} games avg game1  | '.format(log_avg_episodes) +
                     '{:3} games avg game2  | '.format(log_avg_episodes) +
                     'best score total \n' +
                     '--------+-----------+----------------------+------------' +
                     '----------+----------------------+--------------------\n')

        print('Started training...\nLogging to {}\n'.format(sub_dir) +
              'Episode | Steps     |   score total        |   score game 1       |   ' +
              'score game 2       | best score total')

        for i_episode in range(1,num_episodes):
            # reset game at the start of each episode
            screen1 = self.env1.reset()
            screen2 = self.env2.reset()

            # list of k last frames
            last_k_frames = []
            for j in range(self.num_stored_frames):
                last_k_frames.append(None)
                last_k_frames[j] = torch.cat((gray2pytorch(screen1),
                                              gray2pytorch(screen2)), dim=1)

            if i_episode == 1:
                self.replay.pushFrame(last_k_frames[0].cpu())

            # frame is saved as ByteTensor -> convert to gray value between 0 and 1
            state = torch.cat(last_k_frames,1).type(FloatTensor)/255.0

            # games end indicator variable
            done1 = False
            done2 = False

            # reset score with initial lives, because every lost life adds -1
            total_reward_game1 = 0
            total_reward_clamped_game1 = self.env1.get_lives()
            total_reward_game2 = 0
            total_reward_clamped_game2 = self.env2.get_lives()
            # total scores for both games
            total_reward = total_reward_game1 + total_reward_game2
            total_reward_clamped = total_reward_clamped_game1 + total_reward_clamped_game2

            # Loop over one game
            while not done1 and not done2:
                self.steps +=1

                action = self.select_action(state)
                action1 = self.map_action(action[0,0])
                action2 = action[0,0]

                # perform selected action on game
                screen1, reward1, reward1_clamped, done1, _ = self.env1.step(action1)
                screen2, reward2, reward2_clamped, done2, _ = self.env2.step(action2)

                # Logging
                total_reward_game1 += int(reward1)
                total_reward_game2 += int(reward2)
                total_reward += int(reward1) + int(reward2)
                total_reward_clamped_game1 += reward1_clamped
                total_reward_clamped_game2 += reward2_clamped
                total_reward_clamped += reward1_clamped + reward2_clamped

                # Bake reward into tensor
                reward = torch.FloatTensor([reward1_clamped+reward2_clamped])

                #   save latest frame, discard oldest
                for j in range(self.num_stored_frames-1):
                    last_k_frames[j] = last_k_frames[j+1]
                last_k_frames[self.num_stored_frames-1] = torch.cat((gray2pytorch(screen1),
                                                                     gray2pytorch(screen2)), dim=1)

                # convert frames to range 0 to 1 again
                if not done1 and not done2:
                    next_state = torch.cat(last_k_frames,1).type(FloatTensor)/255.0
                else:
                    next_state = None

                # Store transition
                self.replay.pushFrame(last_k_frames[self.num_stored_frames - 1].cpu())
                self.replay.pushTransition((self.replay.getCurrentIndex()-1)%self.replay.capacity,
                                            action, reward, done1 or done2)

                #	only optimize each kth step
                if self.steps%self.optimize_each_k == 0:
                    self.optimize(net_updates)
                    net_updates += 1

                # set current state to next state to select next action
                if next_state is not None:
                    state = next_state

                if self.use_cuda:
                    state = state.cuda()

                # plays episode until there are no more lives left ( == done)
                if done1 or done2:
                    break

            # Save rewards
            reward_history_game1.append(total_reward_game1)
            reward_history_game2.append(total_reward_game2)
            reward_history.append(total_reward)
            reward_clamped_history_game1.append(total_reward_clamped_game1)
            reward_clamped_history_game2.append(total_reward_clamped_game2)
            reward_clamped_history.append(total_reward_clamped)

            # Sum up for averages
            avg_score_clamped_game1 += total_reward_clamped_game1
            avg_score_clamped_game2 += total_reward_clamped_game2
            avg_score_clamped += total_reward_clamped
            avg_score_game1 += total_reward_game1
            avg_score_game2 += total_reward_game2
            avg_score += total_reward

            if total_reward_clamped > best_score_clamped:
                best_score_clamped = total_reward_clamped
            if total_reward > best_score:
                best_score = total_reward

            print('{:7} | '.format(i_episode) +
                  '{:9} |     '.format(self.steps) +
                  '({:6.1f}/{:7.1f}) |     '.format(total_reward_clamped,total_reward) +
                  '({:6.1f}/{:7.1f}) |     '.format(total_reward_clamped_game1,total_reward_game1) +
                  '({:6.1f}/{:7.1f}) |  '.format(total_reward_clamped_game2,total_reward_game2) +
                  '({:6.1f}/{:8.1f})'.format(best_score_clamped, best_score))

            if i_episode % log_avg_episodes == 0 and i_episode!=0:
                avg_score_clamped_game1 /= log_avg_episodes
                avg_score_clamped_game2 /= log_avg_episodes
                avg_score_clamped /= log_avg_episodes
                avg_score_game1 /= log_avg_episodes
                avg_score_game2 /= log_avg_episodes
                avg_score /= log_avg_episodes

                print('--------+-----------+----------------------+------------' +
                     '----------+----------------------+--------------------\n' +
                      'Episode | Steps     | ' +
                      '{:3} games avg total  | '.format(log_avg_episodes) +
                      '{:3} games avg game1  | '.format(log_avg_episodes) +
                      '{:3} games avg game2  | '.format(log_avg_episodes) +
                      'best score total \n' +
                      '{:7} | '.format(i_episode) +
                      '{:9} |     '.format(self.steps) +
                      '({:6.1f}/{:7.1f}) |     '.format(avg_score_clamped,avg_score) +
                      '({:6.1f}/{:7.1f}) |     '.format(avg_score_clamped_game1,avg_score_game1) +
                      '({:6.1f}/{:7.1f}) |  '.format(avg_score_clamped_game2,avg_score_game2) +
                      '({:6.1f}/{:8.1f})\n'.format(best_score_clamped, best_score) +
                      '\nLogging to file...\n\n'
                      '--------+-----------+----------------------+------------' +
                      '----------+----------------------+--------------------\n' +
                      'Episode | Steps     |   score total        |   score game 1       |   ' +
                      'score game 2       | best score total')
                # Logfile
                with open(logfile, 'a') as fp:
                    fp.write('{:7} | '.format(i_episode) +
                             '{:9} |     '.format(self.steps) +
                             '({:6.1f}/{:7.1f}) |     '.format(avg_score_clamped,avg_score) +
                             '({:6.1f}/{:7.1f}) |     '.format(avg_score_clamped_game1,avg_score_game1) +
                             '({:6.1f}/{:7.1f}) |  '.format(avg_score_clamped_game2,avg_score_game2) +
                             '({:6.1f}/{:8.1f})\n'.format(best_score_clamped, best_score))
                # Dump reward
                with open(reward_file_game1, 'wb') as fp:
                    pickle.dump(reward_history_game1, fp)
                with open(reward_file_game2, 'wb') as fp:
                    pickle.dump(reward_history_game2, fp)
                with open(reward_file, 'wb') as fp:
                    pickle.dump(reward_history, fp)

                with open(reward_clamped_file_game1, 'wb') as fp:
                    pickle.dump(reward_clamped_history_game1, fp)
                with open(reward_clamped_file_game2, 'wb') as fp:
                    pickle.dump(reward_clamped_history_game2, fp)
                with open(reward_clamped_file, 'wb') as fp:
                    pickle.dump(reward_clamped_history, fp)

                avg_score_clamped_game1 = 0
                avg_score_clamped_game2 = 0
                avg_score_clamped = 0
                avg_score_game1 = 0
                avg_score_game2 = 0
                avg_score = 0

            if i_episode % self.save_net_each_k_episodes == 0:
                with open(logfile, 'a') as fp:
                    fp.write('Saved model at episode ' + str(i_episode) + '...\n')
                self.target_net.save(sub_dir + str(i_episode) + '_episodes.model')

        print('Training done!')
        self.target_net.save(sub_dir + 'final.model')
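
A minimal usage sketch for the class above (assuming the surrounding module imports Environment, TwinDQN, ReplayMemory and the game_name/dimensions lookup tables; the exact game keys may differ):

if __name__ == '__main__':
    # Train a shared policy on two games, then evaluate and render it
    agent = DoubleAgent(game1='Breakout', game2='SpaceInvaders',
                        batch_size=32, learning_rate=1e-5)
    agent.train()                             # logs and checkpoints go to a timestamped subdirectory
    agent.play_stats(100, mode='evaluation')  # average scores over 100 evaluation games
    agent.play(3)                             # render a few games with the trained net
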
Example #2
class SingleAgent(object):
    def __init__(self,
                 game,
                 mem_size = 1000000,
                 state_buffer_size = 4,
                 batch_size = 64,
                 learning_rate = 1e-5,
                 pretrained_model = None,
                 frameskip = 4
                 ):
        """
        Inputs:
        - game: string to select the game
        - mem_size: int length of the replay memory
        - state_buffer_size: int number of recent frames used as input for neural network
        - batch_size: int
        - learning_rate: float
        - pretrained_model: str path to the model
        - record: boolean to enable record option
        """

        # Namestring
        self.game = game

        # Environment
        self.env = Environment(game_name[game], dimensions[game], frameskip=frameskip)

        # Cuda
        self.use_cuda = torch.cuda.is_available()

        # Neural network
        self.net = DQN(channels_in = state_buffer_size,
                       num_actions = self.env.get_number_of_actions())

        self.target_net = DQN(channels_in = state_buffer_size,
                       num_actions = self.env.get_number_of_actions())
        if self.use_cuda:
            self.net.cuda()
            self.target_net.cuda()

        if pretrained_model:
            self.net.load(pretrained_model)
            self.target_net.load(pretrained_model)
            self.pretrained_model = True
        else:
            self.pretrained_model = False

        # Optimizer
        self.learning_rate = learning_rate
        self.optimizer = optim.Adam(self.net.parameters(), lr=learning_rate)
        #self.optimizer = optim.RMSprop(self.net.parameters(), lr=learning_rate,alpha=0.95, eps=0.01)

        self.batch_size = batch_size
        self.optimize_each_k = 1
        self.update_target_net_each_k_steps = 10000
        self.noops_count = 0

        # Replay Memory (Long term memory)
        self.replay = ReplayMemory(capacity=mem_size, num_history_frames=state_buffer_size)
        self.mem_size = mem_size

        # Fill replay memory before training
        if not self.pretrained_model:
            self.start_train_after = 50000
        else:
            self.start_train_after = mem_size//2

        # Buffer for the most recent states (Short term memory)
        self.num_stored_frames = state_buffer_size

        # Steps
        self.steps = 0

        # Save net
        self.save_net_each_k_episodes = 500


    def select_action(self, observation, mode='train'):
        """
        Select an random action from action space or an proposed action
        from neural network depending on epsilon

        Inputs:
        - observation: np.array with the observation

        Returns:
        action: int
        """
        # Hyperparameters
        EPSILON_START = 1
        EPSILON_END = 0.1
        EPSILON_DECAY = 1000000
        EPSILON_PLAY = 0.01
        MAXNOOPS = 30

        # Decrease of epsilon value
        if not self.pretrained_model:
            #epsilon = EPSILON_END + (EPSILON_START - EPSILON_END) * \
            #                        np.exp(-1. * (self.steps-self.batch_size) / EPSILON_DECAY)
            epsilon = max(EPSILON_END,
                          EPSILON_START - self.steps * (EPSILON_START - EPSILON_END) / EPSILON_DECAY)
        elif mode=='play':
            epsilon = EPSILON_PLAY
        else:
            epsilon = EPSILON_END

        if epsilon < random():
            # Action according to neural net
            # Wrap tensor into variable
            state_variable = Variable(observation, volatile=True)

            # Evaluate network and return action with maximum of activation
            action = self.net(state_variable).data.max(1)[1].view(1,1)

            # Prevent getting stuck: after MAXNOOPS consecutive actions other than 1 (fire/shot), force action 1
            if action[0,0]!=1:
                self.noops_count += 1
                if self.noops_count == MAXNOOPS:
                    action[0,0] = 1
                    self.noops_count = 0
            else:
                self.noops_count = 0
        else:
            # Random action
            action = self.env.sample_action()
            action = LongTensor([[action]])

        return action


    def optimize(self, net_updates):
        """
        Optimizer function

        Inputs:
        - net_updates: int

        Returns:
        - loss: float
        - q_value: float
        - exp_q_value: float
        """
        # Hyperparameter
        GAMMA = 0.99

        #   not enough memory yet
        if len(self.replay) < self.start_train_after:
            return

        # Sample a transition
        batch = self.replay.sampleTransition(self.batch_size)

        # Mask to indicate which states are not final (=done=game over)
        non_final_mask = ByteTensor(list(map(lambda ns: ns is not None, batch.next_state)))

        # Wrap tensors in variables
        state_batch = Variable(torch.cat(batch.state))
        action_batch = Variable(torch.cat(batch.action))
        reward_batch = Variable(torch.cat(batch.reward))
        non_final_next_states = Variable(torch.cat([ns for ns in batch.next_state if ns is not None]),
                                              volatile=True) # volatile==true prevents calculation of the derivative

        next_state_values = Variable(torch.zeros(self.batch_size).type(FloatTensor), volatile=False)

        if self.use_cuda:
            state_batch = state_batch.cuda()
            action_batch = action_batch.cuda()
            reward_batch = reward_batch.cuda()
            non_final_mask = non_final_mask.cuda()
            non_final_next_states = non_final_next_states.cuda()
            next_state_values = next_state_values.cuda()

        # Compute Q(s_t, a) - the self.net computes Q(s_t), then we select the
        # columns of actions taken
        state_action_values = self.net(state_batch).gather(1, action_batch)

        # Compute V(s_{t+1}) for all next states.
        next_max_values = self.target_net(non_final_next_states).detach().max(1)[0]
        next_state_values[non_final_mask] = next_max_values

        # Compute the expected Q values
        expected_state_action_values = (next_state_values * GAMMA) + reward_batch

        # Compute Huber loss
        loss = F.smooth_l1_loss(state_action_values, expected_state_action_values)

        self.optimizer.zero_grad()

        loss.backward()
        for param in self.net.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer.step()

        if net_updates%self.update_target_net_each_k_steps==0:
            self.target_net.load_state_dict(self.net.state_dict())
            print('target_net update!')

        return loss.data.cpu().numpy()[0]


    def play(self, n):
        """
        Play a game with the current net and render it

        Inputs:
        - n: games to play
        """
        for i in range(n):
            done = False # games end indicator variable
            score = 0
            # Reset game
            screen = self.env.reset()

            # list of k last frames
            last_k_frames = []
            for j in range(self.num_stored_frames):
                last_k_frames.append(None)
                last_k_frames[j] = gray2pytorch(screen)

            # frame is saved as ByteTensor -> convert to gray value between 0 and 1
            state = torch.cat(last_k_frames,1).type(FloatTensor)/255.0

            while not done:
                action = self.select_action(state, mode='play')[0,0]

                screen, reward, _, done, _ = self.env.step(action, mode='play')
                score += reward

                #   save latest frame, discard oldest
                for j in range(self.num_stored_frames-1):
                    last_k_frames[j] = last_k_frames[j+1]
                last_k_frames[self.num_stored_frames-1] = gray2pytorch(screen)

                # convert frames to range 0 to 1 again
                state = torch.cat(last_k_frames,1).type(FloatTensor)/255.0
                self.state = state
            print('Game ({}/{}) - Final score {}: {}'.format(i+1, n, self.game, score))
        self.env.game.close()


    def play_stats(self, n_games, mode='random'):
        """
        Play N games randomly or evaluate a net and log results for statistics

        Input:
        - n_games: int Number of games to play
        - mode: str 'random' or 'evaluation'
        """
        # Subdirectory for logging
        sub_dir = mode + '_' + self.game + '/'
        if not os.path.exists(sub_dir):
            os.makedirs(sub_dir)

        # Store history
        reward_history = []
        reward_clamped_history = []

        # Number of actions to sample from
        n_actions = self.env.get_number_of_actions()

        for i_episode in range(1, n_games+1):
            # Reset game
            screen = self.env.reset()

            # Store screen
            if mode=='evaluation':
                # list of k last frames
                last_k_frames = []
                for j in range(self.num_stored_frames):
                    last_k_frames.append(None)
                    last_k_frames[j] = gray2pytorch(screen)
                # frame is saved as ByteTensor -> convert to gray value between 0 and 1
                state = torch.cat(last_k_frames,1).type(FloatTensor)/255.0

            # games end indicator variable
            done = False

            # reset score with initial lives, because every lost life adds -1
            total_reward = 0
            total_reward_clamped = self.env.get_lives()

            while not done:
                if mode=='random':
                    action = randrange(n_actions)
                elif mode=='evaluation':
                    action = self.select_action(state, mode='play')[0,0]

                screen, reward, reward_clamped, done, _ = self.env.step(action)
                total_reward += int(reward)
                total_reward_clamped += int(reward_clamped)

                if mode=='evaluation':
                    #   save latest frame, discard oldest
                    for j in range(self.num_stored_frames-1):
                        last_k_frames[j] = last_k_frames[j+1]
                    last_k_frames[self.num_stored_frames-1] = gray2pytorch(screen)

                    # convert frames to range 0 to 1 again
                    state = torch.cat(last_k_frames,1).type(FloatTensor)/255.0



            # Print current result
            print('Episode: {:6}/{:6} |  '.format(i_episode, n_games),
                  'score: ({:4}/{:4})'.format(total_reward_clamped,total_reward))

            # Save rewards
            reward_history.append(total_reward)
            reward_clamped_history.append(total_reward_clamped)

        avg_reward = np.sum(reward_history)/len(reward_history)
        avg_reward_clamped = np.sum(reward_clamped_history)/len(reward_clamped_history)

        # Print final result
        print('\n\n=============================================\n' +
              'avg score after {:6} episodes: ({:.2f}/{:.2f})\n'.format(n_games, avg_reward_clamped, avg_reward))

        # Log results to files
        with open(sub_dir + mode + '.txt', 'w') as fp:
            fp.write('avg score after {:6} episodes: ({:.2f}/{:.2f})\n'.format(n_games, avg_reward_clamped, avg_reward))
        with open(sub_dir + mode + '_reward.pickle', 'wb') as fp:
            pickle.dump(reward_history, fp)
        with open(sub_dir + mode + '_reward_clamped.pickle', 'wb') as fp:
            pickle.dump(reward_clamped_history, fp)


    def train(self):
        """
        Train the agent
        """
        num_episodes = 100000
        net_updates = 0

        # Logging
        sub_dir = self.game + '_' + datetime.now().strftime('%Y%m%d_%H%M%S') + '/'
        os.makedirs(sub_dir)
        logfile = sub_dir + self.game + '_train.txt'
        loss_file = sub_dir + 'loss.pickle'
        reward_file = sub_dir + 'reward.pickle'
        reward_clamped_file = sub_dir + 'reward_clamped.pickle'
        log_avg_episodes = 50

        best_score = 0
        best_score_clamped = 0
        avg_score = 0
        avg_score_clamped = 0
        loss_history = []
        reward_history = []
        reward_clamped_history = []

        # Initialize logfile with header
        with open(logfile, 'w') as fp:
            fp.write(datetime.now().strftime('%Y-%m-%d %H:%M:%S') + '\n' +
                     'Trained game:                       ' + str(self.game) + '\n' +
                     'Learning rate:                      ' + str(self.learning_rate) + '\n' +
                     'Batch size:                         ' + str(self.batch_size) + '\n' +
                     'Memory size(replay):                ' + str(self.mem_size) + '\n' +
                     'Pretrained:                         ' + str(self.pretrained_model) + '\n' +
                     'Started training after k frames:    ' + str(self.start_train_after) + '\n' +
                     'Optimized after k frames:           ' + str(self.optimize_each_k) + '\n' +
                     'Target net update after k frames:   ' + str(self.update_target_net_each_k_steps) + '\n\n' +
                     '------------------------------------------------------' +
                     '--------------------------------------------------\n')

        print('Started training...\nLogging to', sub_dir)

        for i_episode in range(1,num_episodes):
            # reset game at the start of each episode
            screen = self.env.reset()

            # list of k last frames
            last_k_frames = []
            for j in range(self.num_stored_frames):
                last_k_frames.append(None)
                last_k_frames[j] = gray2pytorch(screen)

            if i_episode == 1:
                self.replay.pushFrame(last_k_frames[0].cpu())

            # frame is saved as ByteTensor -> convert to gray value between 0 and 1
            state = torch.cat(last_k_frames,1).type(FloatTensor)/255.0

            done = False # games end indicator variable
            # reset score with initial lives, because every lost life adds -1
            total_reward = 0
            total_reward_clamped = self.env.get_lives()

            # Loop over one game
            while not done:
                self.steps +=1

                action = self.select_action(state)

                # perform selected action on game
                screen, reward, reward_clamped, done, _ = self.env.step(action[0,0])
                total_reward += int(reward)
                total_reward_clamped += int(reward_clamped)

                # Wrap into tensor
                reward = torch.Tensor([reward_clamped])

                #   save latest frame, discard oldest
                for j in range(self.num_stored_frames-1):
                    last_k_frames[j] = last_k_frames[j+1]
                last_k_frames[self.num_stored_frames-1] = gray2pytorch(screen)

                # convert frames to range 0 to 1 again
                if not done:
                    next_state = torch.cat(last_k_frames,1).type(FloatTensor)/255.0
                else:
                    next_state = None

                # Store transition
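                # The replay memory appears to store frames and transition records separately:
                # pushFrame keeps only the newest frame, while pushTransition logs
                # (frame index, action, reward, done) so stacked states can be rebuilt on sampling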
                self.replay.pushFrame(last_k_frames[self.num_stored_frames - 1].cpu())
                self.replay.pushTransition((self.replay.getCurrentIndex()-1)%self.replay.capacity, action, reward, done)

                #	only optimize each kth step
                if self.steps%self.optimize_each_k == 0:
                    loss = self.optimize(net_updates)

                    # Logging
                    loss_history.append(loss)
                    #q_history.append(q_value)
                    #exp_q_history.append(exp_q_value)

                    net_updates += 1

                # set current state to next state to select next action
                if next_state is not None:
                    state = next_state

                if self.use_cuda:
                    state = state.cuda()

                # plays episode until there are no more lives left ( == done)
                if done:
                    break

            # Save rewards
            reward_history.append(total_reward)
            reward_clamped_history.append(total_reward_clamped)

            print('Episode: {:6} |  '.format(i_episode),
                  'steps {:8} |  '.format(self.steps),
                  'loss: {:.2E} |  '.format(loss if loss else 0),
                  'score: ({:4}/{:4}) |  '.format(total_reward_clamped,total_reward),
                  'best score: ({:4}/{:4}) |  '.format(best_score_clamped,best_score),
                  'replay size: {:7}'.format(len(self.replay)))

            avg_score_clamped += total_reward_clamped
            avg_score += total_reward
            if total_reward_clamped > best_score_clamped:
                best_score_clamped = total_reward_clamped
            if total_reward > best_score:
                best_score = total_reward

            if i_episode % log_avg_episodes == 0 and i_episode!=0:
                avg_score_clamped /= log_avg_episodes
                avg_score /= log_avg_episodes

                print('----------------------------------------------------------------'
                      '-----------------------------------------------------------------',
                      '\nLogging to file: \nEpisode: {:6}   '.format(i_episode),
                      'steps: {:8}   '.format(self.steps),
                      'avg on last {:4} games ({:6.1f}/{:6.1f})   '.format(log_avg_episodes, avg_score_clamped,avg_score),
                      'best score: ({:4}/{:4})'.format(best_score_clamped, best_score),
                      '\n---------------------------------------------------------------'
                      '------------------------------------------------------------------')
                # Logfile
                with open(logfile, 'a') as fp:
                    fp.write('Episode: {:6} |  '.format(i_episode) +
                             'steps: {:8} |  '.format(self.steps) +
                             'avg on last {:4} games ({:6.1f}/{:6.1f}) |  '.format(log_avg_episodes, avg_score_clamped,avg_score) +
                             'best score: ({:4}/{:4})\n'.format(best_score_clamped, best_score))
                # Dump loss & reward
                with open(loss_file, 'wb') as fp:
                    pickle.dump(loss_history, fp)
                with open(reward_file, 'wb') as fp:
                    pickle.dump(reward_history, fp)
                with open(reward_clamped_file, 'wb') as fp:
                    pickle.dump(reward_clamped_history, fp)

                avg_score_clamped = 0
                avg_score = 0

            if i_episode % self.save_net_each_k_episodes == 0:
                with open(logfile, 'a') as fp:
                    fp.write('Saved model at episode ' + str(i_episode) + '...\n')
                self.target_net.save(sub_dir + self.game + '-' + str(i_episode) + '_episodes.model')

        print('Training done!')
        self.target_net.save(sub_dir + self.game + '.model')
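
A similar hedged sketch for the single-game agent (same assumption that Environment, DQN, ReplayMemory and the game_name/dimensions tables come from the surrounding module):

if __name__ == '__main__':
    agent = SingleAgent(game='Breakout', batch_size=32)
    agent.train()                         # writes <game>_<timestamp>/ with logs, pickles and models
    agent.play_stats(50, mode='random')   # random-play baseline for comparison
    agent.play(1)
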
Example #3
class Agent(object):
    def __init__(
            self,
            game,
            mem_size=512 * 512,  #1024*512,
            state_buffer_size=4,
            batch_size=64,
            learning_rate=1e-5,
            pretrained_model=None,
            frameskip=4,  #1
            record=False):
        """
        Inputs:
        - game: string to select the game
        - mem_size: int length of the replay memory
        - state_buffer_size: int number of recent frames used as input for neural network
        - batch_size: int
        - learning_rate: float
        - pretrained_model: str path to the model
        - record: boolean to enable record option
        """

        # Namestring
        self.game = game

        # dimensions: tuple (h1,h2,w1,w2) with dimensions of the game (to crop borders)
        #if self.game == 'Breakout-v0':
        #    dimensions = (32, 195, 8, 152)
        #elif self.game == 'SpaceInvaders-v0':
        #    dimensions = (21, 195, 20, 141)
        #elif self.game == 'Assault-v0':
        #    dimensions = (50, 240, 5, 155)
        #elif self.game == 'Phoenix-v0':
        #    dimensions = (23, 183, 0, 160)
        #elif self.game == 'Skiing-v0':
        #    dimensions = (55, 202, 8, 152)
        #elif self.game == 'Enduro-v0':
        #    dimensions = (50, 154, 8, 160)
        #elif self.game == 'BeamRider-v0':
        #    dimensions = (32, 180, 9, 159)

        if self.game == 'BreakoutAndSpace':
            dimensions_break = (32, 195, 8, 152)
            dimensions_space = (21, 195, 20, 141)
        else:
            raise ValueError(
                'This version only supports playing Breakout and SpaceInvaders at the same time.'
            )

        # Environment
        self.env_break = Environment('BreakoutNoFrameskip-v4',
                                     dimensions_break,
                                     frameskip=frameskip)
        self.env_space = Environment('SpaceInvaders-v0',
                                     dimensions_space,
                                     frameskip=frameskip)

        # Cuda
        self.use_cuda = torch.cuda.is_available()

        # Neural network
        self.net = DQN(channels_in=state_buffer_size,
                       num_actions=self.env_space.get_number_of_actions())

        self.target_net = DQN(
            channels_in=state_buffer_size,
            num_actions=self.env_space.get_number_of_actions())

        if self.use_cuda:
            self.net.cuda()
            self.target_net.cuda()

        if pretrained_model:
            self.net.load(pretrained_model)
            self.target_net.load(pretrained_model)
            self.pretrained_model = True
        else:
            self.pretrained_model = False

        # Optimizer
        self.learning_rate = learning_rate
        self.optimizer = optim.Adam(self.net.parameters(), lr=learning_rate)
        #self.optimizer = optim.RMSprop(self.net.parameters(), lr = 0.00025,alpha=0.95, eps=0.01)

        self.batch_size = batch_size
        self.optimize_each_k = 4
        self.update_target_net_each_k_steps = 10000
        self.noops_count = 0

        # Replay Memory (Long term memory)
        self.replay = ReplayMemory(capacity=mem_size,
                                   num_history_frames=state_buffer_size)
        self.mem_size = mem_size

        # Fill replay memory before training
        if not self.pretrained_model:
            self.start_train_after = 25000
        else:
            self.start_train_after = mem_size // 2

        # Buffer for the most recent states (Short term memory)
        self.num_stored_frames = state_buffer_size

        # Steps
        self.steps = 0

        # Save net
        self.save_net_each_k_episodes = 500

    def select_action(self, observation, mode='train'):
        """
        Select an random action from action space or an proposed action
        from neural network depending on epsilon

        Inputs:
        - observation: np.array with the observation

        Returns:
        action: int
        """
        # Hyperparameters
        EPSILON_START = 1
        EPSILON_END = 0.1
        EPSILON_DECAY = 1000000
        MAXNOOPS = 30

        # Decrease of epsilon value
        if not self.pretrained_model:
            #epsilon = EPSILON_END + (EPSILON_START - EPSILON_END) * \
            #                        np.exp(-1. * (self.steps-self.batch_size) / EPSILON_DECAY)
            epsilon = max(EPSILON_END,
                          EPSILON_START - self.steps *
                          (EPSILON_START - EPSILON_END) / EPSILON_DECAY)
        else:
            epsilon = EPSILON_END

        # Exploit (network action) with probability 1 - epsilon, or always in play mode
        if epsilon < random() or mode == 'play':
            # Action according to neural net
            # Wrap tensor into variable
            state_variable = Variable(observation, volatile=True)

            # Evaluate network and return action with maximum of activation
            action = self.net(state_variable).data.max(1)[1].view(1, 1)

            # Prevent noops
            if action[0, 0] == 0:
                self.noops_count += 1
                if self.noops_count == MAXNOOPS:
                    action[0, 0] = 1
                    self.noops_count = 0
            else:
                self.noops_count = 0
        else:

            # Random action
            action = self.env_space.sample_action()
            action = LongTensor([[action]])

        return action

    def optimize(self, net_updates):
        """
        Optimizer function

        Inputs:
        - net_updates: int

        Returns:
        - loss: float
        - q_value: float
        - exp_q_value: float
        """
        # Hyperparameter
        GAMMA = 0.99

        #   not enough memory yet
        if len(self.replay) < self.start_train_after:
            return

        # Sample a transition

        batch = self.replay.sampleTransition(self.batch_size)
        # Mask to indicate which states are not final (=done=game over)
        non_final_mask = ByteTensor(
            list(map(lambda ns: ns is not None, batch.next_state)))

        # Wrap tensors in variables
        state_batch = Variable(torch.cat(batch.state))
        action_batch = Variable(torch.cat(batch.action))
        reward_batch = Variable(torch.cat(batch.reward))
        non_final_next_states = Variable(
            torch.cat([ns for ns in batch.next_state if ns is not None]),
            volatile=True
        )  # volatile==true prevents calculation of the derivative

        next_state_values = Variable(torch.zeros(
            self.batch_size).type(FloatTensor),
                                     volatile=False)

        if self.use_cuda:
            state_batch = state_batch.cuda()
            action_batch = action_batch.cuda()
            reward_batch = reward_batch.cuda()
            non_final_mask = non_final_mask.cuda()
            non_final_next_states = non_final_next_states.cuda()
            next_state_values = next_state_values.cuda()

        # Compute Q(s_t, a) - the self.net computes Q(s_t), then we select the
        # columns of actions taken
        state_action_values = self.net(state_batch).gather(1, action_batch)

        # Compute V(s_{t+1}) for all next states.
        next_max_values = self.target_net(non_final_next_states).detach().max(
            1)[0]
        next_state_values[non_final_mask] = next_max_values

        # Compute the expected Q values
        expected_state_action_values = (next_state_values *
                                        GAMMA) + reward_batch
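        # Target: Q_target(s,a) = r + GAMMA * max_a' Q_target_net(s',a');
        # for terminal transitions next_state_values is 0, so the target is just r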

        # Compute Huber loss
        loss = F.smooth_l1_loss(state_action_values,
                                expected_state_action_values)
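        # smooth_l1_loss (Huber loss) is quadratic for errors within [-1, 1] and
        # linear beyond, so outlier transitions do not dominate the gradient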

        self.optimizer.zero_grad()

        loss.backward()
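        # Element-wise gradient clipping to [-1, 1] keeps single large TD errors
        # from producing unstable updates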
        for param in self.net.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer.step()

        # Periodically copy the online net into the target net; a fixed target
        # between syncs stabilizes the bootstrapped Q-targets
        if net_updates % self.update_target_net_each_k_steps == 0:
            self.target_net.load_state_dict(self.net.state_dict())
            print('target_net update!')

        return loss.data.cpu().numpy()[0]

    def play(self):
        """
        Play a game with the current net and render it
        """
        done = False  # game-over indicator
        score = 0
        # Reset game
        screen_break = self.env_break.reset()
        screen_space = self.env_space.reset()

        # list of k last frames
        last_k_frames = []
        for j in range(self.num_stored_frames):
            last_k_frames.append(None)
            last_k_frames[j] = torch.cat(
                (gray2pytorch(screen_break), gray2pytorch(screen_space)),
                dim=2)

        # frame is saved as ByteTensor -> convert to gray value between 0 and 1
        state = torch.cat(last_k_frames, 1).type(FloatTensor) / 255.0
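        # Combined state: num_stored_frames grayscale frames per game stacked
        # along the channel dimension (dim=1), both game screens concatenated
        # along dim=2, and scaled from [0, 255] to [0, 1]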

        while not done:
            action = self.select_action(state, mode='play')

            # Render game
            self.env_break.game.render(mode='human')
            self.env_space.game.render(mode='human')

            # Map Space Invaders actions to Breakout actions
            # (shoot-right to right, shoot-left to left)
            if action[0, 0] == 4:
                action_break = 2
            elif action[0, 0] == 5:
                action_break = 3
            else:
                action_break = action[0, 0]

            screen_break, _, reward_break, done_break, info_break = self.env_break.step(
                action_break, mode='play')
            screen_space, _, reward_space, done_space, info_space = self.env_space.step(
                action[0, 0], mode='play')
            score += reward_break
            score += reward_space
            done = done_break or done_space

            #   save latest frame, discard oldest
            for j in range(self.num_stored_frames - 1):
                last_k_frames[j] = last_k_frames[j + 1]
            last_k_frames[self.num_stored_frames - 1] = torch.cat(
                (gray2pytorch(screen_break), gray2pytorch(screen_space)),
                dim=2)

            # convert frames to range 0 to 1 again
            state = torch.cat(last_k_frames, 1).type(FloatTensor) / 255.0

        print('Final score:', score)
        self.env_break.game.close()
        self.env_space.game.close()

    def train(self):
        """
        Train the agent
        """
        num_episodes = 100000
        net_updates = 0

        # Logging
        sub_dir = self.game + '_' + datetime.now().strftime(
            '%Y%m%d_%H%M%S') + '/'
        os.makedirs(sub_dir)
        logfile = sub_dir + self.game + '_train.log'
        loss_file = sub_dir + 'loss.pickle'
        reward_file = sub_dir + 'reward.pickle'
        reward_clamped_file = sub_dir + 'reward_clamped.pickle'
        log_avg_episodes = 50

        best_score = 0
        best_score_clamped = 0
        avg_score = 0
        avg_score_clamped = 0
        loss_history = []
        reward_history = []
        reward_clamped_history = []

        # Initialize logfile with header
        with open(logfile, 'w') as fp:
            fp.write(
                datetime.now().strftime('%Y%m%d_%H%M%S') + '\n' +
                'Trained game:    ' + str(self.game) + '\n' +
                'Learning rate:    ' + str(self.learning_rate) + '\n' +
                'Batch size:    ' + str(self.batch_size) + '\n' +
                'Pretrained:    ' + str(self.pretrained_model) + '\n' +
                'Started training after k frames:    ' +
                str(self.start_train_after) + '\n' +
                'Optimized after k frames:    ' + str(self.optimize_each_k) +
                '\n' + 'Target net update after k frames:    ' +
                str(self.update_target_net_each_k_steps) + '\n\n' +
                '--------------------------------------------------------------------------------\n'
            )

        print('Started training...\nLogging to', sub_dir)

        for i_episode in range(1, num_episodes):

            # reset game at the start of each episode
            screen_break = self.env_break.reset()
            screen_space = self.env_space.reset()

            # list of k last frames
            last_k_frames_break = []
            last_k_frames_space = []
            for j in range(self.num_stored_frames):
                last_k_frames_break.append(None)
                last_k_frames_space.append(None)
                last_k_frames_break[j] = gray2pytorch(screen_break)
                last_k_frames_space[j] = gray2pytorch(screen_space)

            if i_episode == 1:
                frames_both = torch.cat((last_k_frames_break[0].cpu(),
                                         last_k_frames_space[0].cpu()), 2)
                #self.replay.pushFrame(last_k_frames_break[0].cpu())
                #self.replay.pushFrame(last_k_frames_space[0].cpu())
                self.replay.pushFrame(frames_both)

            # frame is saved as ByteTensor -> convert to gray value between 0 and 1
            state_break = torch.cat(last_k_frames_break,
                                    1).type(FloatTensor) / 255.0
            state_space = torch.cat(last_k_frames_space,
                                    1).type(FloatTensor) / 255.0
            state = torch.cat((state_break, state_space), 2)
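            # Same combined layout as in play(): both frame stacks concatenated
            # along dim=2, values scaled to [0, 1]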

            done = False  # game-over indicator
            # Initialize the score with the number of starting lives,
            # because every lost life adds -1 to the reward
            total_reward = self.env_break.get_lives()
            total_reward += self.env_space.get_lives()
            total_reward_clamped = self.env_break.get_lives()
            total_reward_clamped += self.env_space.get_lives()

            # Loop over one game
            while not done:
                self.steps += 1
                action = self.select_action(state)
                # Perform the selected action in both games; Space Invaders
                # actions are mapped to Breakout actions below
                # (shoot-left to left, shoot-right to right)

                screen_space, _, reward_space, done_space, info_space = self.env_space.step(
                    action[0, 0])

                action_break = action[0, 0]
                if action_break > 3:  #shoot+right/left --> right/left
                    action_break = action_break - 2
                screen_break, _, reward_break, done_break, info_break = self.env_break.step(
                    action_break)

                total_reward += int(reward_break)
                total_reward += int(reward_space)
                done = done_break or done_space

                #   clamp rewards
                reward_break = torch.Tensor([np.clip(reward_break, -1, 1)])
                reward_space = torch.Tensor([np.clip(reward_space, -1, 1)])
                reward = reward_break + reward_space
                total_reward_clamped += int(reward_break[0])
                total_reward_clamped += int(reward_space[0])
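                # Per-game rewards are clipped to [-1, 1] (standard DQN reward
                # clipping), so the combined training reward lies in [-2, 2]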

                #   save latest frame, discard oldest
                for j in range(self.num_stored_frames - 1):
                    last_k_frames_break[j] = last_k_frames_break[j + 1]
                    last_k_frames_space[j] = last_k_frames_space[j + 1]
                last_k_frames_break[self.num_stored_frames -
                                    1] = gray2pytorch(screen_break)
                last_k_frames_space[self.num_stored_frames -
                                    1] = gray2pytorch(screen_space)

                # convert frames to range 0 to 1 again
                if not done:
                    next_state_break = torch.cat(last_k_frames_break,
                                                 1).type(FloatTensor) / 255.0
                    next_state_space = torch.cat(last_k_frames_space,
                                                 1).type(FloatTensor) / 255.0
                    next_state = torch.cat(
                        (next_state_break, next_state_space), 2)
                else:
                    next_state = None
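                # At episode end there is no successor state; the done flag stored
                # with the transition lets optimize() mask these entries out via
                # non_final_mask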

                # Store the transition: only the newest combined frame is pushed;
                # the full stacked states are not stored explicitly
                frame_break = last_k_frames_break[self.num_stored_frames -
                                                  1].cpu()
                frame_space = last_k_frames_space[self.num_stored_frames -
                                                  1].cpu()
                frame_both = torch.cat((frame_break, frame_space), 2)
                self.replay.pushFrame(frame_both)
                self.replay.pushTransition(
                    (self.replay.getCurrentIndex() - 1) % self.replay.capacity,
                    action, reward, done)
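                # Assumption: ReplayMemory rebuilds the stacked states from the
                # stored frame history, so the transition only needs the index of
                # the frame just pushed (hence getCurrentIndex() - 1)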

                # Only optimize every k-th step
                if self.steps % self.optimize_each_k == 0:
                    loss = self.optimize(net_updates)

                    # Logging
                    loss_history.append(loss)
                    #q_history.append(q_value)
                    #exp_q_history.append(exp_q_value)

                    net_updates += 1

                # set current state to next state to select next action
                if next_state is not None:
                    state = next_state

                if self.use_cuda:
                    state = state.cuda()

                # plays episode until there are no more lives left ( == done)
                if done:
                    break

            # Save rewards
            reward_history.append(total_reward)
            reward_clamped_history.append(total_reward_clamped)

            print(
                'Episode: {:6} |  '.format(i_episode),
                'steps {:8} |  '.format(self.steps),
                'loss: {:.2E} |  '.format(loss if loss else 0),
                'score: ({:4}/{:4}) |  '.format(total_reward_clamped,
                                                total_reward),
                'best score: ({:4}/{:4}) |  '.format(best_score_clamped,
                                                     best_score),
                'replay size: {:7}'.format(len(self.replay)))

            avg_score_clamped += total_reward_clamped
            avg_score += total_reward
            if total_reward_clamped > best_score_clamped:
                best_score_clamped = total_reward_clamped
            if total_reward > best_score:
                best_score = total_reward

            if i_episode % log_avg_episodes == 0 and i_episode != 0:
                avg_score_clamped /= log_avg_episodes
                avg_score /= log_avg_episodes

                print(
                    '----------------------------------------------------------------'
                    '-----------------------------------------------------------------',
                    '\nLogging to file: \nEpisode: {:6}   '.format(i_episode),
                    'steps: {:8}   '.format(self.steps),
                    'avg on last {:4} games ({:6.1f}/{:6.1f})   '.format(
                        log_avg_episodes, avg_score_clamped, avg_score),
                    'best score: ({:4}/{:4})'.format(best_score_clamped,
                                                     best_score),
                    '\n---------------------------------------------------------------'
                    '------------------------------------------------------------------'
                )
                # Logfile
                with open(logfile, 'a') as fp:
                    fp.write(
                        'Episode: {:6} |  '.format(i_episode) +
                        'steps: {:8} |  '.format(self.steps) +
                        'avg on last {:4} games ({:6.1f}/{:6.1f}) |  '.format(
                            log_avg_episodes, avg_score_clamped, avg_score) +
                        'best score: ({:4}/{:4})\n'.format(
                            best_score_clamped, best_score))
                # Dump loss & reward
                with open(loss_file, 'wb') as fp:
                    pickle.dump(loss_history, fp)
                with open(reward_file, 'wb') as fp:
                    pickle.dump(reward_history, fp)
                with open(reward_clamped_file, 'wb') as fp:
                    pickle.dump(reward_clamped_history, fp)

                avg_score_clamped = 0
                avg_score = 0

            if i_episode % self.save_net_each_k_episodes == 0:
                with open(logfile, 'a') as fp:
                    fp.write('Saved model at episode ' + str(i_episode) +
                             '...\n')
                self.target_net.save(sub_dir + self.game + '-' +
                                     str(i_episode) + '_episodes.model')

        print('Training done!')
        self.target_net.save(sub_dir + self.game + '.model')