class Agent_DQN(Agent):
    def __init__(self, env, args):
        """
        Initialize everything you need here.
        For example:
            parameters for neural network
            initialize Q net and target Q net
            parameters for replay buffer
            parameters for q-learning; decaying epsilon-greedy
            ...
        """

        super(Agent_DQN, self).__init__(env)
        ###########################
        # YOUR IMPLEMENTATION HERE #
        # Declare variables
        self.exp_id = uuid.uuid4().__str__().replace('-', '_')
        self.args = args
        self.env = env
        self.eps_threshold = None
        self.nA = env.action_space.n
        self.action_list = np.arange(self.nA)
        self.reward_list = deque(
            maxlen=args.window)  # np.zeros(args.window, np.float32)
        self.max_q_list = deque(
            maxlen=args.window)  # np.zeros(args.window, np.float32)
        self.loss_list = deque(
            maxlen=args.window)  # np.zeros(args.window, np.float32)
        self.probability_list = np.zeros(env.action_space.n, np.float32)
        self.cur_eps = self.args.eps
        self.t = 0
        self.ep_len = 0
        self.mode = None
        if self.args.use_pri_buffer:
            self.replay_buffer = NaivePrioritizedBuffer(
                capacity=self.args.capacity, args=self.args)
        else:
            self.replay_buffer = ReplayBuffer(capacity=self.args.capacity,
                                              args=self.args)
        self.position = 0

        self.args.save_dir += f'/{self.exp_id}/'
        os.system(f"mkdir -p {self.args.save_dir}")
        self.meta = MetaData(fp=open(
            os.path.join(self.args.save_dir, 'result.csv'), 'w'),
                             args=self.args)
        self.eps_delta = (self.args.eps -
                          self.args.eps_min) / self.args.eps_decay_window
        self.beta_by_frame = lambda frame_idx: min(
            1.0, args.pri_beta_start + frame_idx *
            (1.0 - args.pri_beta_start) / args.pri_beta_decay)

        # Create Policy and Target Networks
        if self.args.use_dueling:
            print("Using dueling dqn . . .")
            self.policy_net = DuelingDQN(env, self.args).to(self.args.device)
            self.target_net = DuelingDQN(env, self.args).to(self.args.device)
        elif self.args.use_crnn:
            print("Using dueling crnn . . .")
            self.policy_net = CrnnDQN(env).to(self.args.device)
            self.target_net = CrnnDQN(env).to(self.args.device)
        else:
            self.policy_net = DQN(env, self.args).to(self.args.device)
            self.target_net = DQN(env, self.args).to(self.args.device)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.optimizer = optim.Adam(self.policy_net.parameters(),
                                    lr=self.args.lr,
                                    eps=self.args.optimizer_eps)
        if self.args.lr_scheduler:
            print("Enabling LR Decay . . .")
            self.scheduler = optim.lr_scheduler.ExponentialLR(
                optimizer=self.optimizer, gamma=self.args.lr_decay)
        self.cur_lr = self.optimizer.param_groups[0]['lr']

        # Compute Huber loss
        self.loss = F.smooth_l1_loss

        # todo: Support for Multiprocessing. Bug in pytorch - https://github.com/pytorch/examples/issues/370
        self.policy_net.share_memory()
        self.target_net.share_memory()

        # Set defaults for networks
        self.policy_net.train()
        self.target_net.eval()
        self.target_net.load_state_dict(self.policy_net.state_dict())

        if args.test_dqn:
            # you can load your model here
            ###########################
            # YOUR IMPLEMENTATION HERE #
            print('loading trained model')
            self.load_model()

        if args.use_pri_buffer:
            print('Using priority buffer . . .')
        if args.use_double_dqn:
            print('Using double dqn . . .')

        if args.use_bnorm:
            print("Using batch normalization . . .")

        print("Arguments: \n", json.dumps(vars(self.args), indent=2), '\n')

    def init_game_setting(self):
        pass

    def make_action(self, observation, test=True):
        """
        Return predicted action of your agent
        Input:
            observation: np.array
                stack of the last 4 preprocessed frames, shape: (84, 84, 4)
        Return:
            action: int
                the predicted action from trained model
        """
        ###########################
        # YOUR IMPLEMENTATION HERE #
        with torch.no_grad():
            if self.args.test_dqn:
                q, argq = self.policy_net(
                    Variable(
                        self.channel_first(observation))).data.cpu().max(1)
                return self.action_list[argq]
            # Fill up probability list equal for all actions
            self.probability_list.fill(self.cur_eps / self.nA)
            # Fetch q from the model prediction
            q, argq = self.policy_net(Variable(
                self.channel_first(observation))).data.cpu().max(1)
            # Increase the probability for the selected best action
            self.probability_list[argq[0].item()] += 1 - self.cur_eps
            # Use random choice to decide between a random action / best action
            action = torch.tensor(
                [np.random.choice(self.action_list, p=self.probability_list)])

        ###########################
        return action.item(), q.item()

    def optimize_model(self):
        """
        Function to perform optimization on DL Network
        :return: Loss
        """
        # Return if initial buffer is not filled.
        if len(self.replay_buffer.memory) < self.args.mem_init_size:
            return 0
        if self.args.use_pri_buffer:
            batch_state, batch_action, batch_next_state, batch_reward, batch_done, indices, weights = self.replay_buffer.sample(
                self.args.batch_size, beta=self.beta_by_frame(self.t))
        else:
            batch_state, batch_action, batch_next_state, batch_reward, batch_done = self.replay_buffer.sample(
                self.args.batch_size)
        batch_state = Variable(
            self.channel_first(
                torch.tensor(np.array(batch_state), dtype=torch.float32)))
        batch_action = Variable(
            torch.tensor(np.array(batch_action), dtype=torch.long))
        batch_next_state = Variable(
            self.channel_first(
                torch.tensor(np.array(batch_next_state), dtype=torch.float32)))
        batch_reward = Variable(
            torch.tensor(np.array(batch_reward), dtype=torch.float32))
        batch_done = Variable(
            torch.tensor(np.array(batch_done), dtype=torch.float32))
        policy_max_q = self.policy_net(batch_state).gather(
            1, batch_action.unsqueeze(1)).squeeze(1)
        if self.args.use_double_dqn:
            policy_ns_max_q = self.policy_net(batch_next_state)
            next_q_value = self.target_net(batch_next_state).gather(
                1,
                torch.max(policy_ns_max_q, 1)[1].unsqueeze(1)).squeeze(1)
            target_max_q = next_q_value * self.args.gamma * (1 - batch_done)
        else:
            target_max_q = self.target_net(batch_next_state).detach().max(
                1)[0] * self.args.gamma * (1 - batch_done)
        # Compute Huber loss
        if self.args.use_pri_buffer:
            loss = (policy_max_q -
                    (batch_reward + target_max_q.detach())).pow(2) * Variable(
                        torch.tensor(weights, dtype=torch.float32))
            prios = loss + 1e-5
            loss = loss.mean()
        else:
            loss = self.loss(policy_max_q, batch_reward + target_max_q)

        # Optimize the model
        self.optimizer.zero_grad()
        loss.backward()

        # Clip gradients between -1 and 1
        for param in self.policy_net.parameters():
            param.grad.data.clamp_(-1, 1)

        if self.args.use_pri_buffer:
            self.replay_buffer.update_priorities(indices,
                                                 prios.data.cpu().numpy())

        self.optimizer.step()
        return loss.cpu().detach().numpy()

    def train(self):
        """
        Implement your training algorithm here
        """

        ###########################
        # YOUR IMPLEMENTATION HERE #
        def train_fn():
            self.t = 1
            self.mode = "Random"
            train_start = time.time()
            if not self.args.load_dir == '':
                self.load_model()
            for i_episode in range(1, self.args.max_episodes + 1):
                # Initialize the environment and state
                start_time = time.time()
                state = self.env.reset()
                self.reward_list.append(0)
                self.loss_list.append(0)
                self.max_q_list.append(0)
                self.ep_len = 0
                done = False

                # Save Model
                self.save_model(i_episode)
                # Collect garbage
                self.collect_garbage(i_episode)

                # Run the game
                while not done:
                    # Update the target network, copying all weights and biases in DQN
                    if self.t % self.args.target_update == 0:
                        print("Updating target network . . .")
                        self.target_net.load_state_dict(
                            self.policy_net.state_dict())
                    # Select and perform an action
                    self.cur_eps = max(self.args.eps_min,
                                       self.cur_eps - self.eps_delta)
                    if self.cur_eps == self.args.eps_min:
                        self.mode = 'Exploit'
                    else:
                        self.mode = "Explore"
                    action, q = self.make_action(state)
                    next_state, reward, done, _ = self.env.step(action)
                    self.reward_list[-1] += reward
                    self.max_q_list[-1] = max(self.max_q_list[-1], q)
                    # Store the transition in memory
                    self.replay_buffer.push(state, action, next_state, reward,
                                            done)
                    self.meta.update_step(self.t, self.cur_eps,
                                          self.reward_list[-1],
                                          self.max_q_list[-1],
                                          self.loss_list[-1], self.cur_lr)

                    # Increment step and Episode Length
                    self.t += 1
                    self.ep_len += 1

                    # Move to the next state
                    state = next_state

                    # Perform one step of the optimization (on the policy network)
                    if self.ep_len % self.args.learn_freq == 0:
                        loss = self.optimize_model()
                        self.loss_list[-1] += loss
                self.loss_list[-1] /= self.ep_len

                # Decay Step:
                if self.args.lr_scheduler:
                    self.cur_lr = self.scheduler.get_lr()[0]
                    if i_episode % self.args.lr_decay_step == 0 and self.cur_lr > self.args.lr_min:
                        self.scheduler.step(i_episode)

                # Update meta
                self.meta.update_episode(
                    i_episode, self.t,
                    time.time() - start_time,
                    time.time() - train_start, self.ep_len,
                    len(self.replay_buffer.memory),
                    self.cur_eps, self.reward_list[-1],
                    np.mean(self.reward_list), self.max_q_list[-1],
                    np.mean(self.max_q_list), self.loss_list[-1],
                    np.mean(self.loss_list), self.mode, self.cur_lr)

        import multiprocessing as mp
        processes = []
        for rank in range(4):
            p = mp.Process(target=train_fn)
            p.start()
            processes.append(p)
        for p in processes:
            p.join()
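
# The two annealing schedules used above, pulled out as a standalone sketch so the
# arithmetic is easy to inspect. The hyperparameter values (eps=1.0, eps_min=0.025,
# eps_decay_window=100000, pri_beta_start=0.4, pri_beta_decay=100000) are illustrative
# assumptions, not values taken from the original args.
def linear_epsilon(step, eps=1.0, eps_min=0.025, eps_decay_window=100_000):
    """Epsilon decays linearly from eps to eps_min over eps_decay_window steps."""
    delta = (eps - eps_min) / eps_decay_window
    return max(eps_min, eps - delta * step)

def annealed_beta(frame_idx, beta_start=0.4, beta_decay=100_000):
    """Importance-sampling exponent for prioritized replay, annealed from beta_start to 1."""
    return min(1.0, beta_start + frame_idx * (1.0 - beta_start) / beta_decay)

# e.g. linear_epsilon(0) == 1.0, linear_epsilon(100_000) == 0.025, annealed_beta(50_000) == 0.7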
Example #2
class Agent():
    """
    RL Agent that interacts with a given environment, learns, and adapts successful behaviour.
    """


    def __init__(self, state_size, action_size,
                 batch_size, learn_step_size, buffer_size,
                 gamma, learning_rate, tau,
                 seed):
        """
        Initialize the agent and its learning parameter set.

        Parameters
        =========
        state_size (int): Size of the state space
        action_size (int): Size of the action space

        batch_size (int): Size of the batch size used in each learning step
        learn_step_size (int): Number of steps until the agent is trained again
        buffer_size (int): Size of replay memory buffer

        gamma (float): Discount factor applied to future rewards
        learning_rate (float): Learning rate of neural network
        tau (float): Update strength between local and target network

        seed (int): Random seed for initialization
        """

        # ----- Parameter init -----
        # State and action size from environment
        self.state_size = state_size
        self.action_size = action_size

        # Replay buffer and learning properties
        self.batch_size      = batch_size
        self.learn_step_size = learn_step_size
        self.gamma = gamma
        self.tau  = tau

        # General
        self.seed = random.seed(seed)


        # ----- Network and memory init -----
        # Init identical NN as local and target networks and set optimizer
        self.qnetwork_local  = DQN(state_size, action_size, seed).to(device)
        self.qnetwork_target = DQN(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=learning_rate)

        # Initialize replay memory and time step (for updating every learn_step_size steps)
        self.memory = ReplayBuffer(action_size, buffer_size, batch_size, seed)
        self.t_step = 0


    def step(self, state, action, reward, next_state, done):
        """
        Append information of past step in memory and trigger learning.

        Parameters
        ==========
        state (array_like): State before action
        action (array_like): Action that was taken
        reward (float): Reward for action
        next_state (array_like): State after action
        done (bool): Indicator if env was solved after action
        """

        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every learn_step_size time steps.
        self.t_step = (self.t_step + 1) % self.learn_step_size
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if self.memory.get_memory_size()  > self.batch_size:
                self.learn()


    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Parameters
        ==========
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        # Transform state to PyTorch tensor
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        # Get action scores for state from network
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self):
        """
        Sample a batch of experience tuples and update the value parameters of the local and target networks.
        """

        # Get tuples from experience buffer
        experiences = self.memory.get_sample()
        states, actions, rewards, next_states, dones = experiences

        #  -----DQN -----
        #Optional: to be replaced with Double DQN (see below)
        #Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)

        # ----- Double DQN -----
        # Detach so the action selection does not contribute gradients
        # Pick the argmax action under the local (online) network
        # Unsqueeze to add a column dimension so the indices can be used with gather
        expected_next_actions = self.qnetwork_local(next_states).detach().max(1)[1].unsqueeze(1)
        # Get Q values for next actions from target Q-network
        Q_targets_next = self.qnetwork_target(next_states).detach().gather(1, expected_next_actions)

        # Compute Q targets for current states
        Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones))
        # Get expected Q values from local model
        # Gather values along the axis specified by dim
        Q_expected = self.qnetwork_local(states).gather(1, actions)


        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ----- Update target network -----
        #Soft update model parameters.
        #θ_target = τ*θ_local + (1 - τ)*θ_target
        for target_param, local_param in zip(self.qnetwork_target.parameters(), self.qnetwork_local.parameters()):
            target_param.data.copy_(self.tau*local_param.data + (1.0-self.tau)*target_param.data)
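
# A minimal driver loop showing how this Agent's act()/step() interface is typically
# used. The environment name and the hyperparameter values below are assumptions made
# for the sake of a runnable sketch; they are not taken from the example above.
import gym

env = gym.make('LunarLander-v2')  # any Gym env with a discrete action space works
agent = Agent(state_size=env.observation_space.shape[0],
              action_size=env.action_space.n,
              batch_size=64, learn_step_size=4, buffer_size=100_000,
              gamma=0.99, learning_rate=5e-4, tau=1e-3, seed=0)
eps = 1.0
for episode in range(500):
    state = env.reset()
    done = False
    while not done:
        action = agent.act(state, eps)                        # epsilon-greedy action
        next_state, reward, done, _ = env.step(action)
        agent.step(state, action, reward, next_state, done)   # store transition, maybe learn
        state = next_state
    eps = max(0.01, eps * 0.995)                              # decay exploration over episodes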
Example #3
class Agent_DQN():
    def __init__(self, env, test=False):
        self.cuda = torch.device('cuda')
        print("Using device: " + torch.cuda.get_device_name(self.cuda),
              flush=True)

        self.env = env
        self.state_shape = env.observation_space.shape
        self.n_actions = env.action_space.n

        self.memory = deque(maxlen=100000)
        self.batch_size = 32
        self.mem_threshold = 50000

        self.gamma = 0.99

        self.learning_rate = 1e-4

        self.epsilon = 1.0
        self.epsilon_min = 0.05
        self.epsilon_period = 10000
        self.epsilon_decay = (self.epsilon -
                              self.epsilon_min) / self.epsilon_period

        self.update_rate = 4

        self.start_epoch = 1
        self.epochs = 10
        self.epoch = 10000  # episodes per epoch

        self.model = DQN(self.state_shape, self.n_actions).to(self.cuda)
        print("DQN parameters: {}".format(count_parameters(self.model)))

        self.target = DQN(self.state_shape, self.n_actions).to(self.cuda)
        self.target.eval()
        self.target_update = 10000

        self.optimizer = optim.Adam(self.model.parameters(),
                                    lr=self.learning_rate)
        if test:
            self.model.load_state_dict(torch.load('model.pt'))

    def init_game_setting(self):
        pass

    def make_action(self, observation, test=False):
        epsilon = 0.01 if test else self.epsilon
        # turn action into tensor
        observation = torch.tensor(observation,
                                   device=self.cuda,
                                   dtype=torch.float)
        # turn off learning
        self.model.eval()
        # epsilon greedy policy
        if random.random() > epsilon:
            # no need to calculate gradient
            with torch.no_grad():
                # choose highest value action
                b = self.model(observation)
                b = b.cpu().data.numpy()
                action = np.random.choice(
                    np.flatnonzero(np.isclose(b, b.max())))
        else:
            # random action
            action = random.choice(np.arange(self.n_actions))
        # turn learning back on
        self.model.train()
        return action

    def replay_buffer(self):
        # Return tuple of sars transitions
        states, actions, rewards, next_states, dones = zip(
            *random.sample(self.memory, self.batch_size))
        states = torch.tensor(np.vstack(states),
                              device=self.cuda,
                              dtype=torch.float)
        actions = torch.tensor(np.array(actions),
                               device=self.cuda,
                               dtype=torch.long)
        rewards = torch.tensor(np.array(rewards, dtype=np.float32),
                               device=self.cuda,
                               dtype=torch.float)
        next_states = torch.tensor(np.vstack(next_states),
                                   device=self.cuda,
                                   dtype=torch.float)
        dones = torch.tensor(np.array(dones, dtype=np.float32),
                             device=self.cuda,
                             dtype=torch.float)
        return states, actions, rewards, next_states, dones

    def experience_replay(self, n=0):
        # clamp gradient
        clamp = False
        # Reset gradient (because it accumulates by default)
        self.optimizer.zero_grad()
        # sample experience memory
        states, actions, rewards, next_states, dones = self.replay_buffer()
        # get Q(s,a) for sample
        Q = self.model(states).gather(1, actions.unsqueeze(-1)).squeeze(-1)
        # get max_a' Q(s',a')
        Q_prime = self.target(next_states).detach().max(1)[0]
        # calculate y = r + gamma * max_a' Q(s',a') for non-terminal states
        Y = rewards + (self.gamma * Q_prime) * (1 - dones)
        # Huber loss of Q and Y
        loss = F.smooth_l1_loss(Q, Y)
        # Compute dloss/dx
        loss.backward()
        # Clamp gradient
        if clamp:
            for param in self.model.parameters():
                param.grad.data.clamp_(-1, 1)
        # Change the weights
        self.optimizer.step()

    def train(self):
        step = 0
        learn_step = 0
        print("Begin Training:", flush=True)
        learn_curve = []
        last30 = deque(maxlen=30)
        for epoch in range(self.start_epoch, self.epochs + 1):
            durations = []
            rewards = []
            flag = []
            # progress bar
            epoch_bar = tqdm(range(self.epoch), total=self.epoch, ncols=200)
            for episode in epoch_bar:
                # reset state
                state = self.env.reset()
                # decay epsilon
                if self.epsilon > self.epsilon_min:
                    self.epsilon -= self.epsilon_decay
                # run one episode
                done = False
                ep_duration = 0
                ep_reward = 0
                while not done:
                    step += 1
                    ep_duration += 1
                    # get epsilon-greedy action
                    action = self.make_action(state)
                    # do action
                    next_state, reward, done, info = self.env.step(action)
                    ep_reward += reward
                    # add transition to replay memory
                    self.memory.append(
                        Transition(state, action, reward, next_state, done))
                    state = next_state
                    # learn from experience, if available
                    if step % self.update_rate == 0 and len(
                            self.memory) > self.mem_threshold:
                        self.experience_replay(learn_step)
                        learn_step += 1
                    # update target network
                    if step % self.target_update == 1:
                        self.target.load_state_dict(self.model.state_dict())

                durations.append(ep_duration)
                rewards.append(ep_reward)
                last30.append(ep_reward)
                learn_curve.append(np.mean(last30))
                flag.append(info['flag_get'])
                epoch_bar.set_description(
                    "epoch {}/{}, avg duration = {:.2f}, avg reward = {:.2f}, last30 = {:2f}"
                    .format(epoch, self.epochs, np.mean(durations),
                            np.mean(rewards), learn_curve[-1]))
            # save model every epoch
            plt.clf()
            plt.plot(learn_curve)
            plt.title(f"DQN Epoch {epoch} with {save_prefix} Reward")
            plt.xlabel('Episodes')
            plt.ylabel('Moving Average Reward')
            if not os.path.exists(f"{save_prefix}_DQN"):
                os.mkdir(f"{save_prefix}_DQN")
            torch.save(self.model.state_dict(),
                       f'{save_prefix}_DQN/DQN_model_ep{epoch}.pt')
            pickle.dump(
                rewards,
                open(f"{save_prefix}_DQN/DQN_reward_ep{epoch}.pkl", 'wb'))
            pickle.dump(flag,
                        open(f"{save_prefix}_DQN/flag_ep{epoch}.pkl", 'wb'))
            plt.savefig(f"{save_prefix}_DQN/epoch{epoch}.png")
            learn_curve = []
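
# Module-level pieces that the class above relies on but does not show. Everything
# here is a plausible reconstruction rather than the original code: the Transition
# container matches the field order used in self.memory.append(...), while
# count_parameters and save_prefix are assumptions.
from collections import namedtuple

Transition = namedtuple('Transition',
                        ('state', 'action', 'reward', 'next_state', 'done'))

def count_parameters(model):
    # number of trainable parameters, as printed in __init__
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

save_prefix = 'mario'  # assumed tag used for the output directory and file names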
Example #4
class Agent_DQN(Agent):
    def __init__(self, env, args):
        """
        Initialize everything you need here.
        For example: 
            parameters for neural network
            initialize Q net and target Q net
            parameters for replay buffer
            parameters for q-learning; decaying epsilon-greedy
            ...
        """

        super(Agent_DQN, self).__init__(env)
        ###########################
        # YOUR IMPLEMENTATION HERE #
        self.env = env
        self.args = args
        self.gamma = self.args.gamma
        self.batch_size = self.args.batch_size
        self.memory_cap = self.args.memory_cap
        self.n_episode = self.args.n_episode
        self.lr = self.args.learning_rate

        self.epsilon = self.args.epsilon
        self.epsilon_decay_window = self.args.epsilon_decay_window
        self.epsilon_min = self.args.epsilon_min
        self.epsilon_decay = (self.epsilon -
                              self.epsilon_min) / self.epsilon_decay_window

        self.n_step = self.args.n_step
        self.f_update = self.args.f_update
        self.load_model = self.args.load_model
        self.action_size = self.args.action_size
        #         self.algorithm = self.args.algorithm

        self.use_cuda = torch.cuda.is_available()
        self.device = torch.device("cuda" if self.use_cuda else "cpu")
        print('using device ', torch.cuda.get_device_name(0))
        self.FloatTensor = torch.cuda.FloatTensor if self.use_cuda else torch.FloatTensor
        self.LongTensor = torch.cuda.LongTensor if self.use_cuda else torch.LongTensor
        self.ByteTensor = torch.cuda.ByteTensor if self.use_cuda else torch.ByteTensor
        self.Tensor = self.FloatTensor

        # Create the policy net and the target net
        self.policy_net = DQN()
        self.policy_net.to(self.device)
        #         if self.algorithm == 'DDQN':
        #             self.policy_net_2 = DQN()
        #             self.policy_net_2.to(self.device)
        self.target_net = DQN()
        self.target_net.to(self.device)
        self.policy_net.train()
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()
        self.optimizer = optim.Adam(params=self.policy_net.parameters(),
                                    lr=self.lr)
        # buffer
        self.memory = []

        ##
        self.mean_window = 100
        self.print_frequency = 100
        self.out_dir = "DQN_Module_b1_1/"

        if args.test_dqn:
            #you can load your model here
            print('loading trained model')
            ###########################
            # YOUR IMPLEMENTATION HERE #
            self.policy_net.load_state_dict(
                torch.load('model.pth', map_location=self.device))
            self.target_net.load_state_dict(self.policy_net.state_dict())
            # self.algorithm is only defined when the (commented-out) DDQN variant is enabled
            if getattr(self, 'algorithm', None) == 'DDQN':
                self.policy_net_2.load_state_dict(
                    torch.load('model.pth', map_location=self.device))
            self.print_test = True

    def init_game_setting(self):
        """
        Testing function will call this function at the beginning of a new game
        Put anything you want to initialize if necessary.
        If no parameters need to be initialized, you can leave it as blank.
        """
        ###########################
        # YOUR IMPLEMENTATION HERE #

        ###########################
        pass

    def make_action(self, observation, test=False):
        """
        Return predicted action of your agent
        Input:
            observation: np.array
                stack of the last 4 preprocessed frames, shape: (84, 84, 4)
        Return:
            action: int
                the predicted action from trained model
        """
        ###########################
        # YOUR IMPLEMENTATION HERE #
        if test:
            self.epsilon = self.epsilon_min * 0.5
            observation = observation / 255.
        else:
            self.epsilon = max(self.epsilon - self.epsilon_decay,
                               self.epsilon_min)
        if random.random() > self.epsilon:
            observation = self.Tensor(observation.reshape(
                (1, 84, 84, 4))).transpose(1, 3).transpose(2, 3)
            state_action_value = self.policy_net(
                observation).data.cpu().numpy()
            action = np.argmax(state_action_value)
        else:
            action = random.randint(0, self.action_size - 1)
        ###########################
        return action

    def push(self, state, action, reward, next_state, done):
        """ You can add additional arguments as you need. 
        Push new data to buffer and remove the old one if the buffer is full.
        
        Hints:
        -----
            you can consider using a deque(maxlen=10000) as the buffer
        """
        ###########################
        # YOUR IMPLEMENTATION HERE #
        if len(self.memory) >= self.memory_cap:
            self.memory.pop(0)
        self.memory.append((state, action, reward, next_state, done))
        ###########################

    def replay_buffer(self):
        """ You can add additional arguments as you need.
        Select batch from buffer.
        """
        ###########################
        # YOUR IMPLEMENTATION HERE #
        self.mini_batch = random.sample(self.memory, self.batch_size)
        ###########################
        return

    def train(self):
        """
        Implement your training algorithm here
        """
        ###########################
        # YOUR IMPLEMENTATION HERE #
        self.steps_done = 0
        self.steps = []
        self.rewards = []
        self.mean_rewards = []
        self.time = []
        self.best_reward = 0
        self.last_saved_reward = 0
        self.start_time = time.time()
        print('train')
        # continue training from where it stopped
        if self.load_model:
            self.policy_net.load_state_dict(
                torch.load(self.out_dir + 'model.pth',
                           map_location=self.device))
            self.target_net.load_state_dict(self.policy_net.state_dict())
            self.epsilon = self.epsilon_min
            print('Loaded')
        for episode in range(self.n_episode):
            # Initialize the environment and state
            state = self.env.reset() / 255.
            #             self.last_life = 5
            total_reward = 0
            self.step = 0
            done = False

            while (not done) and self.step < 10000:
                # move to next state
                self.step += 1
                self.steps_done += 1
                action = self.make_action(state)
                next_state, reward, done, life = self.env.step(action)
                # lives matter
                #                 self.now_life = life['ale.lives']
                #                 dead = self.now_life < self.last_life
                #                 self.last_life = self.now_life
                next_state = next_state / 255.
                # Store the transition in memory
                self.push(state, action, reward, next_state, done)
                state = next_state
                total_reward += reward

                if done:
                    self.rewards.append(total_reward)
                    self.mean_reward = np.mean(
                        self.rewards[-self.mean_window:])
                    self.mean_rewards.append(self.mean_reward)
                    self.time.append(time.time() - self.start_time)
                    self.steps.append(self.step)

                    # print the process to terminal
                    progress = "episode: " + str(
                        episode) + ",\t epsilon: " + str(
                            self.epsilon
                        ) + ",\t Current mean reward: " + "{:.2f}".format(
                            self.mean_reward)
                    progress += ',\t Best mean reward: ' + "{:.2f}".format(
                        self.best_reward) + ",\t time: " + time.strftime(
                            '%H:%M:%S', time.gmtime(self.time[-1]))
                    print(progress)

                    if episode % self.print_frequency == 0:
                        self.print_and_plot()
                    # save the best model
                    if self.mean_reward > self.best_reward and len(
                            self.memory) >= 5000:
                        print('~~~~~~~~~~<Model updated with best reward = ',
                              self.mean_reward, '>~~~~~~~~~~')
                        checkpoint_path = self.out_dir + 'model.pth'
                        torch.save(self.policy_net.state_dict(),
                                   checkpoint_path)
                        self.last_saved_reward = self.mean_reward
                        self.best_reward = self.mean_reward

                if len(self.memory) >= 5000 and self.steps_done % 4 == 0:
                    #                     if self.algorithm == 'DQN':
                    self.optimize_DQN()
                if self.steps_done % self.f_update == 0:
                    self.target_net.load_state_dict(
                        self.policy_net.state_dict())
#                     print('-------<target net updated at step,',self.steps_done,'>-------')

###########################

    def optimize_DQN(self):
        # sample
        self.replay_buffer()
        state, action, reward, next_state, done = zip(*self.mini_batch)

        # convert N*84*84*4 (NHWC) to N*4*84*84 (NCHW), i.e. permute(0, 3, 1, 2)
        state = self.Tensor(np.float32(state)).permute(0, 3, 1,
                                                       2).to(self.device)
        action = self.LongTensor(action).to(self.device)
        reward = self.Tensor(reward).to(self.device)
        next_state = self.Tensor(np.float32(next_state)).permute(
            0, 3, 1, 2).to(self.device)

        done = self.Tensor(done).to(self.device)

        # Compute Q(s_t, a)
        state_action_values = self.policy_net(state).gather(
            1, action.unsqueeze(1)).squeeze(1)
        # Compute next Q, including the mask
        next_state_values = self.target_net(next_state).detach().max(1)[0]
        # Compute the expected Q value. stop update if done
        expected_state_action_values = reward + (next_state_values *
                                                 self.gamma) * (1 - done)
        # Compute Huber loss
        self.loss = F.smooth_l1_loss(state_action_values,
                                     expected_state_action_values.data)
        # Optimize the model
        self.optimizer.zero_grad()
        self.loss.backward()
        self.optimizer.step()
        return

    def print_and_plot(self):
        fig1 = plt.figure(1)
        plt.clf()
        plt.title('Training...')
        plt.xlabel('Episode')
        plt.ylabel('Steps')
        plt.plot(self.steps)
        fig1.savefig(self.out_dir + 'steps.png')

        fig2 = plt.figure(2)
        plt.clf()
        plt.title('Training...')
        plt.xlabel('Episode')
        plt.ylabel('Reward')
        plt.plot(self.mean_rewards)
        fig2.savefig(self.out_dir + 'rewards.png')

        fig2 = plt.figure(3)
        plt.clf()
        plt.title('Training...')
        plt.xlabel('Episode')
        plt.ylabel('Time')
        plt.plot(self.time)
        fig2.savefig(self.out_dir + 'time.png')
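
# The constructor above reads every hyperparameter from an argparse-style `args`
# object. The sketch below lists plausible defaults for those fields so the class can
# be instantiated; the concrete values are assumptions, not the original settings.
from types import SimpleNamespace

args = SimpleNamespace(
    gamma=0.99, batch_size=32, memory_cap=100_000, n_episode=50_000,
    learning_rate=1.5e-4, epsilon=1.0, epsilon_decay_window=1_000_000,
    epsilon_min=0.025, n_step=4, f_update=5_000, load_model=False,
    action_size=4, test_dqn=False)
# agent = Agent_DQN(env, args)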
Example #5
class DQNAgent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, seed, alpha, gamma, tau):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            alpha (float): learning rate
            gamma (float): discount factor
            tau (float): interpolation parameter for the soft target update
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.alpha = alpha
        self.gamma = gamma
        self.tau = tau

        # Q Learning Network
        self.qnetwork_local = DQN(state_size, action_size, seed).to(device)
        self.qnetwork_target = DQN(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(),
                                    lr=self.alpha)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.fill_replay_buffer(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if self.memory.__len__() > BATCH_SIZE:
                experiences = self.memory.get_sample_replay_buffer()
                self.learn_DDQN(experiences, self.gamma, self.alpha, self.tau)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn_DDQN(self, experiences, gamma, alpha, tau):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences
        # Get the greedy next-state action indices from the local (online) network
        Q_argmax = self.qnetwork_local(next_states).detach()
        _, a_prime = Q_argmax.max(1)
        #print (self.qnetwork_local(states).detach())
        # Get max predicted Q values (for next states) from target model
        Q_targets_next = self.qnetwork_target(next_states).detach().gather(
            1, a_prime.unsqueeze(1))
        #print (Q_targets_next.shape)
        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        #print (Q_targets.shape)
        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)
        #print (Q_expected.shape)
        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, self.tau)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
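
# The hyperparameter constants and the device object referenced above (BUFFER_SIZE,
# BATCH_SIZE, UPDATE_EVERY, device) live at module level in the original project. The
# values below are common defaults and should be read as assumptions.
import torch

BUFFER_SIZE = int(1e5)   # replay buffer size
BATCH_SIZE = 64          # minibatch size
UPDATE_EVERY = 4         # how many environment steps between learning updates
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")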
Example #6
                            reward = 10
                    # Blue side wins: negative reward
                    elif GAME_STATE == BLUE_WIN:
                        if GAME_OVER_LX == OUT_OF_MAP:
                            reward = -10
                        elif GAME_OVER_LX == ATTACKED:
                            reward = -1
                    # Nobody wins: no reward
                    else:
                        reward = 0

                    # todo: add a penalty (negative reward) if the agent fires while the target is too far away
                    pass

                    agent.remember(s, player1_action, next_s, reward)
                    agent.train()
                    score += reward
                    # If the game is over
                    if GAME_STATE:
                        score_list.append(score)
                        print('episode:', episode+1, 'score:', score, 'max:', max(score_list))
                        break
                FPS_COUNT = 0
                s = player1.get_obs(player2, bullet_list)
                player1_action = agent.act(s)
                # player1 executes the action
                listen_model_action(player1_action, player1, player2)

            # If the game is over, this logic does not need to run either; just wait for the FPS_COUNT at which player 1 makes its next decision
            if not GAME_STATE:
                # Player 2's behaviour is defined in the player-2 class and decided automatically
Example #7
class Agent_DQN(Agent):
    def __init__(self, env, args):
        """
        Initialize everything you need here.
        For example:
            parameters for neural network
            initialize Q net and target Q net
            parameters for replay buffer
            parameters for q-learning; decaying epsilon-greedy
            ...
        """

        super(Agent_DQN, self).__init__(env)
        ###########################
        # YOUR IMPLEMENTATION HERE #
        self.env = env
        self.batch_size = BATCH_SIZE
        self.gamma = 0.999
        self.eps_start = EPS_START
        self.eps_decay = EPS_DECAY
        self.TARGET_UPDATE = TARGET_UPDATE

        self.policy_net = DQN(self.env.action_space.n)
        self.target_net = DQN(self.env.action_space.n)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()
        if use_cuda:
            self.policy_net.cuda()
            self.target_net.cuda()

        self.optimizer = optim.RMSprop(self.policy_net.parameters(), lr=1e-5)
        self.memory = deque(maxlen=10000)

        if args.test_dqn:
            # you can load your model here
            print('loading trained model')
            ###########################
            # YOUR IMPLEMENTATION HERE #

    def init_game_setting(self):
        """
        Testing function will call this function at the beginning of a new game
        Put anything you want to initialize if necessary.
        If no parameters need to be initialized, you can leave it as blank.
        """
        ###########################
        # YOUR IMPLEMENTATION HERE #

        ###########################
        pass

    def make_action(self, observation, test=True):
        """
        Return predicted action of your agent
        Input:
            observation: np.array
                stack of the last 4 preprocessed frames, shape: (84, 84, 4)
        Return:
            action: int
                the predicted action from trained model
        """
        ###########################
        # YOUR IMPLEMENTATION HERE #
        global steps_done
        self.policy_net.eval()
        sample = random.random()
        eps_threshold = EPS_END + (EPS_START - EPS_END) * \
            math.exp(-1. * steps_done / EPS_DECAY)
        steps_done += 1
        if sample > eps_threshold:
            return self.policy_net(
                Variable(torch.from_numpy(observation),
                         volatile=True).type(FloatTensor)).data.max(1)[1].view(
                             1, 1)
        else:
            return LongTensor([[random.randrange(self.env.action_space.n)]])
        ###########################
        return action

    def push(self, s, a, r, s_, done):
        """ You can add additional arguments as you need.
        Push new data to buffer and remove the old one if the buffer is full.

        Hints:
        -----
            you can consider using a deque(maxlen=10000) as the buffer
        """
        ###########################
        # YOUR IMPLEMENTATION HERE #
        self.memory.append((s, a, r, s_, done))
        # self.memory is a deque(maxlen=10000), so the oldest entry is evicted automatically

        ###########################

    def replay_buffer(self):
        """ You can add additional arguments as you need.
        Select batch from buffer.
        """
        ###########################
        # YOUR IMPLEMENTATION HERE #
        #print("memory", len(self.memory), self.BATCH_SIZE)
        minibatch = random.sample(self.memory, self.BATCH_SIZE)
        minibatch = np.array(minibatch).transpose(0, 3, 1, 2)
        minibatch = torch.tensor(minibatch / 255.0)
        ###########################
        return minibatch

    def optimize_model(self):
        if len(self.memory) < BATCH_SIZE:
            return
        transitions = random.sample(self.memory, BATCH_SIZE)
        # Transpose the batch (see http://stackoverflow.com/a/19343/3343043 for
        # detailed explanation).
        batch = Transition(*zip(*transitions))

        # Compute a mask of non-final states and concatenate the batch elements
        non_final_mask = ByteTensor(
            tuple(map(lambda s: s is not None, batch.next_state)))
        non_final_next_states = Variable(torch.cat(
            [s for s in batch.next_state if s is not None]),
                                         volatile=True).cuda()
        state_batch = Variable(torch.cat(batch.state)).cuda()
        action_batch = Variable(torch.cat(batch.action)).cuda()
        reward_batch = Variable(torch.cat(batch.reward)).cuda()

        # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
        # columns of actions taken
        self.policy_net.train()
        state_action_values = self.policy_net(state_batch).gather(
            1, action_batch)

        # Compute V(s_{t+1}) for all next states.
        next_state_values = Variable(
            torch.zeros(BATCH_SIZE).type(Tensor)).cuda()
        next_state_values[non_final_mask] = self.target_net(
            non_final_next_states).max(1)[0]
        # Compute the expected Q values
        expected_state_action_values = (next_state_values *
                                        GAMMA) + reward_batch
        # Undo volatility (which was used to prevent unnecessary gradients)
        expected_state_action_values = Variable(
            expected_state_action_values.data).cuda()

        # Compute Huber loss
        loss = F.smooth_l1_loss(state_action_values,
                                expected_state_action_values)

        # Optimize the model
        self.optimizer.zero_grad()
        loss.backward()
        for param in self.policy_net.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer.step()

    def train(self):
        """
        Implement your training algorithm here
        """
        ###########################
        # YOUR IMPLEMENTATION HERE #
        num_episodes = 1400000
        for i_episode in range(num_episodes):
            # Initialize the environment and state
            observation = self.env.reset()
            observation = observation.transpose((2, 0, 1))
            observation = observation[np.newaxis, :]
            state = observation

            for t in count():
                # Select and perform an action
                action = self.make_action(state)
                next_state, reward, done, _ = self.env.step(action[0, 0])
                next_state = next_state.transpose((2, 0, 1))
                next_state = next_state[np.newaxis, :]
                reward = Tensor([reward])

                # Store the transition in memory
                self.memory.append(
                    Transition(torch.from_numpy(state), action,
                               torch.from_numpy(next_state), reward))

                # Observe new state
                if not done:
                    state = next_state
                else:
                    state = None

                # Perform one step of the optimization (on the policy network)
                self.optimize_model()
                if done:
                    print(
                        'resetting env. episode %d finished after %d timesteps.'
                        % (i_episode + 1, t + 1))
                    break
            # Update the target network
            if i_episode % TARGET_UPDATE == 0:
                self.target_net.load_state_dict(self.policy_net.state_dict())
            if i_episode % 50 == 0:
                checkpoint_path = os.path.join('save_dir', 'model-best.pth')
                torch.save(self.policy_net.state_dict(), checkpoint_path)
                print("model saved to {}".format(checkpoint_path))
Example #8
class Agent_DQN(Agent):
    def __init__(self, env, args):
        """
        Initialize everything you need here.
        For example: 
            parameters for neural network
            initialize Q net and target Q net
            parameters for replay buffer
            parameters for q-learning; decaying epsilon-greedy
            ...
        """

        super(Agent_DQN, self).__init__(env)
        self.action = env.get_action_space()

        ###########################
        # YOUR IMPLEMENTATION HERE #
        self.device = torch.device(
            'cuda' if torch.cuda.is_available() else 'cpu')
        print('Using device:', self.device)
        self.model = DQN().to(self.device)
        self.model_target = DQN().to(self.device)
        self.episode = 100000
        self.max_steps_per_episode = 14000
        self.update_target_network = 10000
        self.epsilon = 1.0
        self.min_epsilon = 0.1
        self.step_epsilon = (self.epsilon - self.min_epsilon) / (1E6)
        self.env = env
        self.history = []
        self.buffer_size = min(args.history_size // 5, 2000)
        self.history_size = args.history_size
        self.learning_rate = 1e-4
        self.name = args.name
        self.batch_size = 32
        self.gamma = 0.99
        self.priority = []
        self.w = 144
        self.h = 256
        self.mode = args.mode
        self.delay = args.delay
        self.epoch = args.continue_epoch
        if args.test_dqn or self.epoch > 0:
            #you can load your model here
            print('loading trained model')
            ###########################
            self.model.load_state_dict(
                torch.load(self.name + '.pth', map_location=self.device))
            self.model_target.load_state_dict(
                torch.load(self.name + '.pth', map_location=self.device))
            # YOUR IMPLEMENTATION HERE #

    def init_game_setting(self):
        """
        Testing function will call this function at the beginning of a new game
        Put anything you want to initialize if necessary.
        If no parameters need to be initialized, you can leave it as blank.
        """
        ###########################
        # YOUR IMPLEMENTATION HERE #

        ###########################
        pass

    def make_action(self, observation, test=True):
        """
        Return predicted action of your agent
        Input:
            observation: np.array
                stack of the last 4 preprocessed frames, shape: (84, 84, 4)
        Return:
            action: int
                the predicted action from trained model
        """
        ###########################
        # YOUR IMPLEMENTATION HERE #
        self.model.eval()
        with torch.no_grad():
            if test == False:
                if np.random.random() < self.epsilon or len(
                        self.history) < self.buffer_size:
                    action = int(np.random.choice([0, 1], 1)[0])
                else:
                    obs = torch.from_numpy(observation).to(self.device).float()
                    action_prob = self.model(obs.view(1, 12, self.h, self.w))
                    action = torch.argmax(action_prob).detach().item()
                return action

            else:
                observation = np.swapaxes(observation, 0, 2) / 255.
                obs = torch.from_numpy(observation).to(self.device).float()
                action_prob = self.model(obs.view(1, 12, self.h, self.w))
                action = torch.argmax(action_prob).detach().item()

                return self.action[action]
        ###########################

    def push(self, state, action, reward, done, state_next, smooth=None):
        """ You can add additional arguments as you need. 
        Push new data to buffer and remove the old one if the buffer is full.
        
        Hints:
        -----
            you can consider using a deque(maxlen=10000) as the buffer
        """
        ###########################
        # YOUR IMPLEMENTATION HERE #
        self.history.append(
            np.array([state, action, reward, done, state_next, smooth]))

        if len(self.history) > self.history_size:
            self.history.pop(0)

        ###########################

    def replay_buffer(self, refresh=False):
        """ You can add additional arguments as you need.
        Select batch from buffer.
        """
        ###########################
        # YOUR IMPLEMENTATION HERE #
        if 'prioritized' in self.mode.split('_'):
            if refresh:
                self.priority = np.zeros(len(self.history))
                for i in range(len(self.history)):
                    max_reward, _ = torch.max(self.model_target(
                        torch.from_numpy(self.history[i][4]).to(
                            self.device).float().view(1, 12, self.h, self.w)),
                                              axis=1)
                    max_reward = max_reward.detach().item()
                    Q = self.model(
                        torch.from_numpy(
                            self.history[i][0]).to(self.device).float().view(
                                1, 12, self.h,
                                self.w))[0,
                                         self.history[i][1]].detach().item()
                    self.priority[i] = abs(
                        (self.history[i][2] + self.gamma * max_reward - Q))
                self.priority = self.priority / sum(self.priority)
                return 0
            priority = np.zeros(len(self.history))
            priority[:len(self.priority)] = self.priority
            if sum(priority) == 0:
                indices = np.random.choice(range(len(self.history)),
                                           size=self.batch_size)
            else:
                indices = np.random.choice(range(len(self.history)),
                                           size=self.batch_size,
                                           p=priority)

            ###########################
            return indices
        else:
            return np.random.choice(range(len(self.history)),
                                    size=self.batch_size)

    def train(self):
        """
        Implement your training algorithm here
        """
        ###########################
        # YOUR IMPLEMENTATION HERE #

        episode_reward_history = []
        best_reward = -10
        optimizer = torch.optim.Adam(self.model.parameters(),
                                     lr=self.learning_rate)
        # optimizer = torch.optim.RMSprop(self.model.parameters(), lr=self.learning_rate,momentum=0.5)
        loss_fn = torch.nn.SmoothL1Loss()
        frame_count = 0
        if self.epoch > 0:
            f = open(self.name + '.txt', "a")
        else:
            f = open(self.name + '.txt', "w")
        done = False
        for ep in range(self.epoch, self.episode):
            state = self.env.reset()
            state = np.swapaxes(state, 0, 2) / 255.
            episode_reward = 0
            pre_action = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
            smooth = 0
            for timestep in range(0, self.max_steps_per_episode):
                frame_count += 1
                action = self.make_action(state, test=False)
                if done:
                    action = 1

                # Decay
                self.epsilon -= self.step_epsilon
                self.epsilon = max(self.epsilon, self.min_epsilon)

                # next frame
                state_next, reward, done, _ = self.env.step(
                    self.action[action])
                state_next = np.swapaxes(state_next, 0, 2) / 255.
                episode_reward += reward
                # print(reward)
                #normalize reward
                # reward = np.sign(reward)
                # Save actions and states in replay buffer

                state = state_next
                if 'smooth1' in self.mode.split('_'):
                    pre_action.pop(0)
                    pre_action.append(action)
                    smooth = float(np.mean(pre_action) - 0.5)

                self.push(state, action, reward, done, state_next, smooth)

                if frame_count % 8 == 0 and len(
                        self.history) >= self.buffer_size:
                    if frame_count % (self.history_size // 10) == 0 and 'prioritized' in self.mode.split(
                            '_'):
                        #update priority vector
                        self.replay_buffer(refresh=True)
                    indice = self.replay_buffer()
                    self.model.train()
                    # data_batch = torch.from_numpy(np.array(self.history)[indice]).to(self.device).float()
                    state_sample = torch.from_numpy(
                        np.array([self.history[i][0]
                                  for i in indice])).to(self.device).float()
                    action_sample = torch.from_numpy(
                        np.array([self.history[i][1]
                                  for i in indice])).to(self.device).float()
                    rewards_sample = torch.from_numpy(
                        np.array([self.history[i][2]
                                  for i in indice])).to(self.device).float()
                    done_sample = torch.from_numpy(
                        np.array([self.history[i][3]
                                  for i in indice])).to(self.device).float()
                    next_state_sample = torch.from_numpy(
                        np.array([self.history[i][4]
                                  for i in indice])).to(self.device).float()
                    smooth_sample = torch.from_numpy(
                        np.array([self.history[i][5]
                                  for i in indice])).to(self.device).float()
                    future_rewards = self.model_target(next_state_sample)

                    max_reward, _ = torch.max(future_rewards, axis=1)
                    updated_q_values = rewards_sample + self.gamma * max_reward
                    updated_q_values = updated_q_values * (
                        1 - done_sample) - done_sample
                    mask = F.one_hot(action_sample.long(),
                                     2).to(self.device).float()

                    q_values = self.model(state_sample)
                    q_action = torch.sum(q_values * mask, axis=1)
                    loss = loss_fn(q_action, updated_q_values)

                    if 'smooth1' in self.mode.split('_') and self.delay < ep:
                        penalty = torch.abs((ep - self.delay) / self.episode *
                                            torch.sum(smooth_sample))
                        loss += penalty

                    optimizer.zero_grad()
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
                    optimizer.step()

                if frame_count % self.update_target_network == 0:
                    self.model_target.load_state_dict(self.model.state_dict())

                if done:
                    break
            episode_reward_history.append(episode_reward)
            if len(episode_reward_history) > 30:
                del episode_reward_history[:1]
            running_reward = np.mean(episode_reward_history)
            #             if ep%500==0:
            #                 print("Episode:\t{},\t Avereged reward: {:.2f}\n".format(ep,running_reward))
            f.write("Episode:\t{},\t Avereged reward: {:.2f}\n".format(
                ep, running_reward))
            if running_reward > best_reward:
                best_reward = running_reward
                torch.save(self.model.state_dict(), self.name + '.pth')
        f.close()
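
# The 'prioritized' branch of replay_buffer() above weights each stored transition by
# the magnitude of its TD error before normalizing into a sampling distribution. A
# minimal numpy sketch of that computation, assuming q_online and q_target are
# callables returning a per-action value array for a single state:
import numpy as np

def td_error_priorities(history, q_online, q_target, gamma=0.99):
    # history entries: (state, action, reward, done, next_state, smooth), as pushed above
    prios = np.zeros(len(history))
    for i, (s, a, r, done, s_next, _) in enumerate(history):
        target = r + gamma * q_target(s_next).max()  # bootstrapped one-step return
        prios[i] = abs(target - q_online(s)[a])      # |TD error|
    return prios / prios.sum()                       # normalized sampling probabilities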