Example #1
    def __init__(self,
                 memory_size,
                 batch_size,
                 learn_start_time,
                 learn_fre,
                 lr,
                 replay_iters,
                 eps_T,
                 eps_t_init,
                 gamma,
                 update_period,
                 board,
                 device,
                 model_path,
                 r_memory_Fname,
                 o_model_name,
                 model_load=False):
        self.step_now = 0  # record the step
        self.reward_num = 0
        self.reward_accumulated = 0  # delay reward
        self.final_tem = 10  # just for now
        self.step_last_update = 0  # record the last update time
        self.update_period = update_period  # for the off policy
        self.update_cont = 0
        self.learn_start_time = learn_start_time
        self.gamma = gamma
        self.batch_size = batch_size
        self.memory_size = memory_size
        self.alpha = 0.6
        self.beta = 0.4
        self.replay_bata_iters = replay_iters
        self.replay_eps = 1e-6
        self.loss_back = 0
        self.q_value_p = 0
        self.memory_min_num = 400  # the minimum number of transitions before learning
        self.step_last_learn = 0  # record the last learn step
        self.learn_fre = learn_fre  # step frequency to learn
        self.e_greedy = 1  # record the e_greedy
        self.eps_T = eps_T  # par for the eps schedule (e.g. 800,000 steps)
        self.eps_t_init = eps_t_init  # par for updating the eps

        self.device = device
        self.model_path = model_path
        self.mode_enjoy = model_load
        if model_load == False:
            self.policy_net = DQN(board[0], board[1], action_num).to(device)
            self.target_net = DQN(board[0], board[1], action_num).to(device)
            #self.target_net.eval()
            self.optimizer = optim.SGD(self.policy_net.parameters(), lr=lr)
            self.loss_fn = nn.functional.mse_loss  # use the MSE loss
            self.memory = Memory(memory_size)
        else:
            self.load(o_model_name)
        #self.optimizer = optim.RMSprop(self.policy_net.parameters(), lr=lr)
        self.obs_new = None
        self.obs_old = None
        self.action = None
        self.action_old = None
        self.dqn_direct_flag = False  # show if the dqn action is done
        self.model_save_flag = False
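A hypothetical construction call for this constructor (every value below, the 10x10 board shape, and the file names are illustrative assumptions; the class name RL_AGENT_ONE is taken from Example #3, which shares this signature, and DQN, Memory and action_num must already be defined as the code above expects):

agent = RL_AGENT_ONE(
    memory_size=50_000, batch_size=32, learn_start_time=1_000, learn_fre=4,
    lr=1e-3, replay_iters=100_000, eps_T=800_000, eps_t_init=0,
    gamma=0.99, update_period=1_000, board=(10, 10), device='cpu',
    model_path='./models/', r_memory_Fname='r_memory.pkl',
    o_model_name='old_model.pt', model_load=False)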
Example #2
    def __init__(self, state_size, action_size, is_double_Q, is_prioritized, seed = 456):
        """
        This is for initialization.

        (input)
        - state_size (int): size of a state
        - action_size (int): dim of the action space
        - is_double_Q (bool): double Q-learning (True) or normal Q-learning
        - is_prioritized (bool): prioritized replay buffer (True) or normal replay buffer
        - seed (int): random seed

        """

        self.seed = torch.manual_seed(seed)
        self.state_size = state_size
        self.action_size = action_size

        # Q-networks, local and target
        self.qnetwork_local = DQN(self.state_size, self.action_size, seed).to(device)
        self.qnetwork_target = DQN(self.state_size, self.action_size, seed).to(device)
        self.is_double_Q = is_double_Q

        # optimizer for learning process
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr = LR)

        # replay buffer for learning process
        self.is_prioritized = is_prioritized
        self.buffer = ReplayBuffer(BUFFER_SIZE, BATCH_SIZE, self.is_prioritized, seed)
        self.beta = BETA0_PRIORITIZED # power of the weights for the prioritized replay buffer
        self.learning_count = 0 # count how many times the learning process is done (used for the update of beta)

        # the number of time steps (modulo UPDATE_EVERY)
        self.t_step = 0
        # loss
        self.loss = 0.0
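A minimal training-loop sketch for this Agent, assuming the full class from Example #4 (which shares this constructor) and the module-level names it expects (DQN, ReplayBuffer, LR, BUFFER_SIZE, BATCH_SIZE, UPDATE_EVERY, GAMMA, TAU, BETA0_PRIORITIZED, EPS_PRIORITIZED, ALPHA_PRIORITIZED, device); the toy environment is purely illustrative:

import numpy as np

class ToyEnv:
    """Random stand-in for a real environment (illustration only)."""
    def reset(self):
        return np.random.rand(8).astype(np.float32)
    def step(self, action):
        next_state = np.random.rand(8).astype(np.float32)
        reward = 1.0 if action == 0 else 0.0   # dummy reward
        done = np.random.rand() < 0.05         # episode ends at random
        return next_state, reward, done

env = ToyEnv()
agent = Agent(state_size=8, action_size=4, is_double_Q=True, is_prioritized=True)
state = env.reset()
for t in range(500):
    action = agent.act(state, eps=0.1)
    next_state, reward, done = env.step(action)
    agent.step(state, action, reward, next_state, done)
    state = env.reset() if done else next_state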
Example #3
class RL_AGENT_ONE():
    """
    RL agent class
    """
    def __init__(self, memory_size, batch_size, learn_start_time, learn_fre, lr, replay_iters, eps_T, eps_t_init,
        gamma, update_period, board, device, model_path, r_memory_Fname, o_model_name, model_load=False ):
        self.step_now = 0 # record the step
        self.reward_num = 0
        self.reward_accumulated = 0 # delay reward
        self.final_tem = 10 # just for now
        self.step_last_update = 0 # record the last update time 
        self.update_period = update_period # for the off policy
        self.learn_start_time = learn_start_time 
        self.gamma = gamma
        self.batch_size = batch_size
        self.memory_size = memory_size
        self.alpha = 0.6
        self.beta = 0.4
        self.replay_bata_iters = replay_iters 
        self.replay_eps = 1e-6
        self.loss_back = 0
        self.q_value_p = 0
        self.memory_min_num = 1000 # the minimum number of transitions before learning
        self.step_last_learn = 0 # record the last learn step
        self.learn_fre = learn_fre # step frequency to learn
        self.e_greedy = 1 # record the e_greedy
        self.eps_T = eps_T # par for the eps schedule (e.g. 800,000 steps)
        self.eps_t_init = eps_t_init # par for updating the eps
         
        self.device = device
        self.model_path = model_path
        self.mode_enjoy = model_load
        if model_load == False: 
            self.policy_net = DQN(board[0], board[1], action_num).to(device)
            self.target_net = DQN(board[0], board[1], action_num).to(device)
            #self.target_net.eval()
            self.optimizer = optim.SGD(self.policy_net.parameters(), lr=lr)
            self.loss_fn = nn.functional.mse_loss # use the MSE loss
            self.memory = Memory(memory_size)
            self.beta_schedule = LinearSchedule(self.replay_bata_iters, self.beta, 1.0)
        else:
            self.load(o_model_name) 
        #self.optimizer = optim.RMSprop(self.policy_net.parameters(), lr=lr) 
        self.obs_new = None
        self.obs_old = None
        self.action = None
        self.action_old = None
        self.dqn_direct_flag = False # show if the dqn action is done
        self.model_save_flag = False
    
    def reset(self):
        """ 
        reset the flag, state, reward for a new half or game
        """
        self.obs_new = None
        self.obs_old = None
        self.action = None
        self.dqn_direct_flag = False

    def load(self, old_model):
        """
        load the trained model
        par:
        |old_model:str, the name of the old model
        """
        model_path_t = self.model_path + 't' + old_model
        self.target_net = torch.load(model_path_t, map_location=self.device)
        self.target_net.eval()
        #print('target net par', self.target_net.state_dict())

    def save(self):
        """
        save the trained model
        """
        #if self.model_path in os.listdir()
        t = time.strftime('%m%d%H%M%S')
        self.model_path_p = self.model_path + 'p' + t + '.pt'
        self.model_path_t = self.model_path + 't' + t + '.pt'
        #print('target net par is', self.policy_net.state_dict())
        torch.save(self.policy_net, self.model_path_p)
        torch.save(self.target_net, self.model_path_t)

    def learn(self, env, step_now, obs_old, action, obs_new, reward, done):
        """
        This func is used to train the agent
        par:
        |step_now: int, the global step of training
        |env: class-Environment (currently unused)
        |transition: action, obs_new, reward
        |obs_old/new: observation instances
        |done: bool, whether the game is over
        """
        """ check if we should update the policy net """
        if step_now - self.step_last_update == self.update_period:
            #print('update the p t network')
            self.step_last_update = step_now
            self.target_net.load_state_dict(self.policy_net.state_dict())
                
        """ init the obs_new for init learn """
        state_new = self.feature_combine(obs_new) # get the feature state
        state_old = self.feature_combine(obs_old) # get the feature state
        transition_now = (state_old, action, \
            reward, state_new)

        """ augument reward data to the memory """
        self.memory.episode_add(state_old, action, reward, state_new, done)

        """ select the batch memory to update the network """
        step_diff = step_now - self.step_last_learn
        if step_now > self.learn_start_time and \
                step_diff >= self.learn_fre and \
                    self.memory.__len__() > self.memory_min_num:
            self.step_last_learn = step_now # update the self.last learn
            batch_data = self.memory.sample(self.batch_size)
            s_o_set = []
            actions = []
            rewards = []
            s_n_set = []
            dones = []
            for bd in batch_data:
                s_o_set.append(bd.state)
                actions.append(bd.action)
                rewards.append(bd.reward)
                s_n_set.append(bd.next_state)
                dones.append(bd.done)
            loss_list = []
            batch_idx_list = []
            reward_not_zero_cnt = 0
            actions = torch.tensor(actions, device=self.device)

            """ cnt how many times learn for non reward """
            with torch.no_grad():
                target_values = [self.gamma*self.target_net(s_n).max(0)[0] \
                    for idx, s_n in enumerate(s_n_set)]
                target_values = [t_*(1 - d_) + r_ \
                    for t_, d_, r_ in zip(target_values, dones, rewards)] 
            policy_values = [self.policy_net(s).gather(0, a) \
                    for s, a in zip(s_o_set, actions)]
            loss = [self.loss_fn(t_v, p_v)+ self.replay_eps \
                    for p_v, t_v in zip(policy_values, target_values)]
            loss_back = sum(loss) / self.batch_size
            self.loss_back = loss_back

            """ update the par """
            self.optimizer.zero_grad()
            loss_back.backward()
            self.optimizer.step()

        """ check if we should save the model """
        if self.model_save_flag == True:
            self.save()

    def check_train(self, env):
        """ check if the reward is backward"""
        pass
            
    def select_egreedy(self, q_value, step_now):
        """
        select the action by e-greedy policy
        arg:
        |q_value: Q-values used for the greedy choice
        """
        self.e_greedy = np.exp((self.eps_t_init - step_now) / self.eps_T)
        if self.e_greedy < 0.3:
            self.e_greedy = 0.3

        """ if we are in enjoying mode """
        if self.mode_enjoy == True:
            print('q_value is', q_value)
            self.e_greedy = 0.3

        """ select the action by e-greedy """
        if np.random.random() > self.e_greedy:
            action = action_list[q_value.max(0)[1]]
        else:
            action = action_list[np.random.randint(action_num)]
        return action

    def feature_combine(self, obs):
        """ 
        This func extracts features from the obs.layers and
        combines them into a new feature layer
        Used feature layers:    
        """
        """ combine all the layers """
        feature_c = obs.copy()
        feature_c = feature_c.astype(np.float32)
        feature_c = torch.tensor(feature_c, dtype=torch.float32, device=self.device)
        return feature_c

    def data_augment(self, transition):
        """
        use this func to flip the feature, to augment the experience
        and mitigate the sparse-reward problem
        par:
        |transition: tuple, with (feature_o, action, reward, feature_n)
        """
        flip_ver_dim = 2
        feature_old = transition[0]
        action = transition[1]
        feature_new = transition[3]
        reward = transition[2]

        """ vertical flip """
        feature_o_aug = feature_old.flip([flip_ver_dim])
        feature_n_aug = feature_new.flip([flip_ver_dim])

        """ vertical :action flip """
        if action == 0:  action = 1
        elif action == 1: action = 0

        return feature_o_aug, action, reward, feature_n_aug

    def act(self, map, step_now):
        """ this func is interact with the competition func """
        dqn_action = -1 # reset
        state_old = self.feature_combine(map) # get the feature
        with torch.no_grad():
            q_values = self.policy_net(state_old)
        action = self.select_egreedy( \
            q_values, step_now)# features to model

        return action

    def act_enjoy(self, map):
        """ this func is interact with the competition func """
        dqn_action = -1 # reset
        step_now = self.eps_T
        state_old = self.feature_combine(map) # get the feature
        q_values = self.target_net(state_old)
        action = self.select_egreedy( \
            q_values, step_now)# features to model

        return action
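A standalone sketch of the exploration schedule used in select_egreedy above: epsilon decays exponentially with the global step and is floored at 0.3 (the eps_t_init and eps_T values here are assumptions, not taken from the example):

import numpy as np

eps_t_init, eps_T = 0, 800_000   # assumed values for illustration
for step in (0, 200_000, 800_000, 2_000_000):
    e_greedy = max(np.exp((eps_t_init - step) / eps_T), 0.3)
    print(step, round(float(e_greedy), 3))
# 0 1.0, 200000 0.779, 800000 0.368, 2000000 0.3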
Example #4
class Agent():
    """
    This module is used for interacting with and learning from the environment.

    """

    def __init__(self, state_size, action_size, is_double_Q, is_prioritized, seed = 456):
        """
        This is for initialization.

        (input)
        - state_size (int): size of a state
        - action_size (int): dim of the action space
        - is_double_Q (bool): double Q-learning (True) or normal Q-learning
        - is_prioritized (bool): prioritized replay buffer (True) or normal replay buffer
        - seed (int): random seed

        """

        self.seed = torch.manual_seed(seed)
        self.state_size = state_size
        self.action_size = action_size

        # Q-networks, local and target
        self.qnetwork_local = DQN(self.state_size, self.action_size, seed).to(device)
        self.qnetwork_target = DQN(self.state_size, self.action_size, seed).to(device)
        self.is_double_Q = is_double_Q

        # optimizer for learning process
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr = LR)

        # replay buffer for learning process
        self.is_prioritized = is_prioritized
        self.buffer = ReplayBuffer(BUFFER_SIZE, BATCH_SIZE, self.is_prioritized, seed)
        self.beta = BETA0_PRIORITIZED # power of the weights for the prioritized replay buffer
        self.learning_count = 0 # count how many times the learning process is done (used for the update of beta)

        # the number of time steps (modulo UPDATE_EVERY)
        self.t_step = 0
        # loss
        self.loss = 0.0

    def reset(self):
        """
        This method is used for resetting time_step
        
        """
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        """
        This method saves an experience to the replay buffer.
        Then, after a fixed number of iterations, the Q-network learns from
        the experiences stored in the replay buffer if it contains more
        experiences than the batch size.

        (input)
        - state (float, dim = state_size): state vector
        - action (int, dim = action_size): action vector
        - reward (float, dim 1): reward
        - next_state(float, dim = state_size): state vector for the next state
        - done (bool): if the episode is done or not

        """

        # create tensors storing states (same for actions etc.)
        states = torch.from_numpy(np.vstack([state])).float().to(device)
        actions = torch.from_numpy(np.vstack([action])).long().to(device)
        rewards = torch.from_numpy(np.vstack([reward])).float().to(device)
        next_states = torch.from_numpy(np.vstack([next_state])).float().to(device)
        dones = torch.from_numpy(np.vstack([done]).astype(np.uint8)).float().to(device)

        # compute the TD error
        self.qnetwork_local.eval()
        if(self.is_double_Q != True): # for normal Q-learning
            # get the maximum Q-values of the target network for next_states
            qsa_target_next_max = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)

        else: # for double Q-learning
            # get actions which maximize the Q-values of the local network for next_states.
            self.qnetwork_local.eval()
            with torch.no_grad():
                max_actions = self.qnetwork_local(next_states).detach().max(1)[1].unsqueeze(1)
            self.qnetwork_local.train()
            # get the Q-values of target network for next_state, max_actions
            qsa_target_next_max = self.qnetwork_target(next_states).gather(1, max_actions)

        delta = rewards + GAMMA * qsa_target_next_max * (1-dones) - self.qnetwork_local(states).gather(1, actions)
        delta = delta.data.cpu().numpy()[0][0]
        self.qnetwork_local.train()

        # save experience in the replay buffer
        self.buffer.add(state, action, reward, next_state, done, delta)

        # update the weights of Q-networks every UPDATE_EVERY time steps
        # (and if the replay buffer contains more experiences than the batch size)
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if(self.t_step == 0):
            if(len(self.buffer) > BATCH_SIZE):
                self.learning_count += 1
                # in case of prioritized buffer, update beta
                if self.is_prioritized:
                    self.beta = 1.0 + (BETA0_PRIORITIZED - 1.0)/self.learning_count

                # learn and update the weights of Q-networks
                self.learn(self.beta)

    def act(self, state, eps):
        """
        This method takes a state as an input, uses the policy defined by
        the deep Q-network and then selects the next action based on
        the epsilon-greedy algorithm.

        (input)
        - state (float, dim = state_size): state vector
        - eps (float): epsilon for the epsilon-greedy algorithm

        (output)
        - index for the next action (int)

        """

        # convert state to a tensor
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)

        ## retrieve the action value
        self.qnetwork_local.eval() # evaluation mode
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train() # return to training mode

        # choose an action based on epsilon-greedy algorithm
        if(random.random() > eps):
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, beta = 1.0):
        """
        This method updates the weights of the Q-network
        by learning from the experiences stored in the replay buffer.

        (input)
        - beta (float): beta exponent for the prioritized replay

        """

        # sampling from the replay buffer
        states, actions, rewards, next_states, dones, deltas = self.buffer.sample(beta)

        if(self.is_double_Q != True): # for normal Q-learning
            # get the maximum Q-values of the target network for next_states
            qsa_target_next_max = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)

        else: # for double Q-learning
            # get actions which maximize the Q-value of local network for next_state
            self.qnetwork_local.eval()
            with torch.no_grad():
                max_actions = self.qnetwork_local(next_states).detach().max(1)[1].unsqueeze(1)
            self.qnetwork_local.train()
            # get the Q-values of target network for next_state, max_actions
            qsa_target_next_max = self.qnetwork_target(next_states).gather(1, max_actions)

        # compute target/expected Q-values
        qsa_target = rewards + GAMMA * qsa_target_next_max * (1- dones)
        qsa_expect = self.qnetwork_local(states).gather(1, actions)

        # in case of the prioritized replay buffer, multiply the gradient
        # (and thus temporarily the target and expected Q-value) with weights
        if(self.is_prioritized):

            # multiply the weights with the target/expected Q-values
            # Note that since self.buffer.weights is the square-root of the
            # weights considered in the prioritized replay buffer paper,
            # the mean square error evaluated for qsa_target and qsa_expect below
            # gives an appropriate gradient for the update of the network weights
            # (i.e. multiplied by self.buffer.weights compared to the standard replay buffer case)
            qsa_target = qsa_target * self.buffer.weights
            qsa_expect = qsa_expect * self.buffer.weights

            # update the deltas and priorities in the sampled experiences in the replay buffer
            deltas = qsa_target - qsa_expect
            deltas = deltas.data.cpu().numpy().squeeze(1)
            for i, j in enumerate(self.buffer.id_experiences):
                self.buffer.memory[j] = self.buffer.memory[j]._replace(delta=deltas[i])
                self.buffer.priority[j] = np.power(np.abs(deltas[i]) + EPS_PRIORITIZED, ALPHA_PRIORITIZED)

        # compute the mean square error as a loss and do back-propagation
        self.loss = F.mse_loss(qsa_expect, qsa_target)
        self.optimizer.zero_grad()
        self.loss.backward()
        self.optimizer.step()

        # soft-update of the parameters in the target network
        self.soft_update(TAU)

    def soft_update(self, tau):
        """
        This method carries out the soft-update of the parameters
        in the target network.

        (input)
        tau (float): parameter for the soft-update

        """
        for l_param, t_param in zip(self.qnetwork_local.parameters(),self.qnetwork_target.parameters()):
            t_param.data.copy_(tau * l_param.data + (1.0 - tau) * t_param.data)
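A self-contained toy illustration of the double-DQN target computed in step() and learn() above: the action for each next state is chosen by the local network and evaluated by the target network (the tensors and GAMMA below are made-up values for the sketch):

import torch

GAMMA = 0.99                                             # assumed discount
q_local_next = torch.tensor([[1.0, 2.0], [0.5, 0.2]])    # local-net Q(s', .)
q_target_next = torch.tensor([[0.9, 1.5], [0.4, 0.3]])   # target-net Q(s', .)
rewards = torch.tensor([[1.0], [0.0]])
dones = torch.tensor([[0.0], [1.0]])

max_actions = q_local_next.max(1)[1].unsqueeze(1)        # argmax from the local net
qsa_next = q_target_next.gather(1, max_actions)          # evaluated by the target net
td_target = rewards + GAMMA * qsa_next * (1 - dones)
print(td_target)                                         # tensor([[2.4850], [0.0000]])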
class RL_AGENT_ONE():
    """
    RL agent class
    """
    def __init__(self,
                 memory_size,
                 batch_size,
                 learn_start_time,
                 learn_fre,
                 lr,
                 replay_iters,
                 eps_T,
                 eps_t_init,
                 gamma,
                 update_period,
                 board,
                 device,
                 model_path,
                 r_memory_Fname,
                 o_model_name,
                 model_load=False):
        self.step_now = 0  # record the step
        self.reward_num = 0
        self.reward_accumulated = 0  # delay reward
        self.final_tem = 10  # just for now
        self.step_last_update = 0  # record the last update time
        self.update_period = update_period  # for the off policy
        self.learn_start_time = learn_start_time
        self.gamma = gamma
        self.batch_size = batch_size
        self.memory_size = memory_size
        self.alpha = 0.6
        self.beta = 0.4
        self.replay_bata_iters = replay_iters
        self.replay_eps = 1e-6
        self.memory_min_num = 1000  # the minimum number of transitions before learning
        self.step_last_learn = 0  # record the last learn step
        self.learn_fre = learn_fre  # step frequency to learn
        self.e_greedy = 1  # record the e_greedy
        self.eps_T = eps_T  # par for the eps schedule (e.g. 800,000 steps)
        self.eps_t_init = eps_t_init  # par for updating the eps

        self.device = device
        self.model_path = model_path
        self.mode_enjoy = model_load
        if model_load == False:
            self.policy_net = DQN(board[0], board[1], action_num).to(device)
            self.target_net = DQN(board[0], board[1], action_num).to(device)
            self.optimizer = optim.Adagrad(self.policy_net.parameters(), lr=lr)
            self.loss_fn = nn.functional.mse_loss  # use the MSE loss
            self.memory = PrioritizedReplayBuffer(memory_size, self.alpha)
            self.beta_schedule = LinearSchedule(self.replay_bata_iters,
                                                self.beta, 1.0)
        else:
            self.load(o_model_name)
        #self.optimizer = optim.RMSprop(self.policy_net.parameters(), lr=lr)
        self.obs_new = None
        self.obs_old = None
        self.action = None
        self.action_old = None
        self.dqn_direct_flag = False  # show if the dqn action is done
        self.model_save_flag = False

    def reset(self):
        """ 
        reset the flag, state, reward for a new half or game
        """
        self.obs_new = None
        self.obs_old = None
        self.action = None
        self.dqn_direct_flag = False

    def load(self, old_model):
        """
        load the trained model
        par:
        |old_model:str, the name of the old model
        """
        model_path_t = self.model_path + 't' + old_model
        self.target_net = torch.load(model_path_t, map_location=self.device)
        self.target_net.eval()
        print('target net par', self.target_net.state_dict())

    def save(self):
        """
        save the trained model
        """
        #if self.model_path in os.listdir()
        t = time.strftime('%m%d%H%M%S')
        self.model_path_p = self.model_path + 'p' + t + '.pt'
        self.model_path_t = self.model_path + 't' + t + '.pt'
        print('target net par is', self.policy_net.state_dict())
        torch.save(self.policy_net, self.model_path_p)
        torch.save(self.target_net, self.model_path_t)

    def learn(self, env, step_now, obs_old, action, obs_new, reward, done):
        """
        This func is used to train the agent
        par:
        |step_now: int, the global step of training
        |env: class-Environment (currently unused)
        |transition: action, obs_new, reward
        |obs_old/new: observation instances
        |done: bool, whether the game is over
        """
        """ check if we should update the policy net """
        if step_now - self.step_last_update == self.update_period:
            #print('update the p t network')
            self.step_last_update = step_now
            self.target_net.load_state_dict(self.policy_net.state_dict())
        """ init the obs_new for init learn """
        state_new = self.feature_combine(obs_new)  # get the feature state
        state_old = self.feature_combine(obs_old)  # get the feature state
        transition_now = (state_old, action, \
            reward, state_new)
        """ augument reward data to the memory """
        if reward > 0:
            #print('get one reward in learn', self.reward_accumulated)
            self.memory.add(*self.data_augment(transition_now), done)
        self.memory.add(state_old, action, \
            reward, state_new, done)
        """ select the batch memory to update the network """
        step_diff = step_now - self.step_last_learn
        if step_now > self.learn_start_time and \
                step_diff >= self.learn_fre and \
                    self.memory.__len__() > self.memory_min_num:
            self.step_last_learn = step_now  # update the self.last learn
            #print('start sample', step_now)
            batch_data = self.memory.sample(self.batch_size, \
                beta=self.beta_schedule.value(step_now))
            loss_list = []
            batch_idx_list = []
            reward_not_zero_cnt = 0
            for t_ in zip(*batch_data):
                s, action, reward, s_new, done, weight, idx = t_
                #print('the transition is', t_)
                """ cnt how many times learn for non reward """
                q_values = self.policy_net(s_new)
                action_new = self.select_egreedy(
                    q_values.cpu().detach().numpy(), self.eps_T * 2)
                #reward = torch.tensor(reward).to(self.device)
                #target_value =reward if done \
                #else self.gamma*self.target_net(s_new)[action_new] + reward
                target_value = self.gamma*self.target_net(s_new).gather( \
                        0, torch.tensor(action_new, device=self.device))
                target_value = target_value + reward if not done \
                    else  target_value - (target_value - reward)
                loss = self.loss_fn(self.policy_net(s).gather(0, \
                        torch.tensor(action, device=self.device)), target_value)
                loss_list.append(loss.item() + self.replay_eps)
                batch_idx_list.append(idx)
                """ update the par """
                self.optimizer.zero_grad()
                loss.backward()
                #print('self.policy_net linear grad is', self.policy_net.fc_3.bias.grad)
                #print('self.policy_net linear grad is', self.policy_net.fc_3.weight.grad)
                #input()
                for param in self.policy_net.parameters():
                    param.grad.data.clamp_(-1, 1)
                self.optimizer.step()
                #input()
            self.memory.update_priorities(batch_idx_list, loss_list)
        """ check if we should save the model """
        if self.model_save_flag == True:
            self.save()

    def select_egreedy(self, q_value, step_now):
        """
        select the action by e-greedy policy
        arg:
        |q_value: Q-values used for the greedy choice
        """
        self.e_greedy = np.exp((self.eps_t_init - step_now) / self.eps_T)
        """ if we are in enjoying mode """
        if self.mode_enjoy == True:
            print('q_value is', q_value)
            self.e_greedy = 0
        """ select the action by e-greedy """
        if np.random.random() > self.e_greedy:
            action = action_list[ \
                    np.where(q_value==np.max(q_value))[0][0] ]
        else:
            action = action_list[np.random.randint(action_num)]
        return action

    def feature_combine(self, obs):
        """ 
        This func extracts features from the obs.layers and
        combines them into a new feature layer
        Used feature layers:    
        """
        """ combine all the layers """
        feature_c = obs.copy()
        feature_c = feature_c.astype(np.float32)
        feature_c = torch.tensor(feature_c,
                                 dtype=torch.float32,
                                 device=self.device)
        size = feature_c.shape
        feature_c = feature_c.resize_(1, 1, size[0], size[1])
        return feature_c

    def data_augment(self, transition):
        """
        use this func to flip the feature, to augment the experience
        and mitigate the sparse-reward problem
        par:
        |transition: tuple, with (feature_o, action, reward, feature_n)
        """
        flip_ver_dim = 2
        feature_old = transition[0]
        action = transition[1]
        feature_new = transition[3]
        reward = transition[2]
        """ vertical flip """
        feature_o_aug = feature_old.flip([flip_ver_dim])
        feature_n_aug = feature_new.flip([flip_ver_dim])
        """ vertical :action flip """
        if action == 0: action = 1
        elif action == 1: action = 0

        return feature_o_aug, action, reward, feature_n_aug

    def act(self, map, step_now):
        """ this func is interact with the competition func """
        dqn_action = -1  # reset
        state_old = self.feature_combine(map)  # get the feature
        q_values = self.policy_net(state_old)
        action = self.select_egreedy( \
            q_values.cpu().detach().numpy(), step_now)# features to model

        return action

    def act_enjoy(self, map):
        """ this func is interact with the competition func """
        dqn_action = -1  # reset
        step_now = self.eps_T
        state_old = self.feature_combine(map)  # get the feature
        q_values = self.target_net(state_old)
        action = self.select_egreedy( \
            q_values.cpu().detach().numpy(), step_now)# features to model

        return action
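A small sketch of the vertical-flip augmentation performed by data_augment above, on a toy 1x1x3x3 board tensor shaped like the output of feature_combine (the choice of actions 0 and 1 as the vertically paired actions mirrors the swap done in the method; the concrete values are assumptions):

import torch

feature = torch.arange(9, dtype=torch.float32).reshape(1, 1, 3, 3)  # toy board
flipped = feature.flip([2])        # mirror the board vertically (dim 2)
action = 0                         # assumed to be one of the vertically paired actions
action_flipped = 1 if action == 0 else 0 if action == 1 else action
print(feature[0, 0])
print(flipped[0, 0])
print(action_flipped)              # 1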