Code example #1
    def build(self):
        """
        Build model by adding all necessary variables
        """
        with self.current_graph.as_default():
            # add placeholders
            self.add_placeholders_op()

            # compute Q values of state
            s = self.process_state(self.s)
            self.q, self.hs_out = self.get_q_values_op(s, self.past_a, self.agate, self.slen, self.hs, scope="q", reuse=False)

            # compute Q values of next state
            sp = self.process_state(self.sp)
            self.target_q, self.hsp_out = self.get_q_values_op(sp, self.a, self.agatep, self.splen, self.hsp, scope="target_q", reuse=False)

            self.dnc = DNC(self.config, self.config.dnc_h_size)
            self.out_dnc, self.hs_out_dnc = self.dnc(self.s_dnc, self.hs_dnc)
            self.agate_dnc = self.dnc.get_agate(self.s_dnc, self.hs_dnc)
            self.path_dnc, self.target_ldm_dnc = self.dnc.value_iteration(self.hs_dnc.access_state, self.src_inputs_dnc, self.tgt_inputs_dnc, self.max_len_dnc)

            # add update operator for target network
            self.add_update_target_op("q", "target_q")

            # add square loss
            self.add_loss_op(self.q, self.target_q)

            # add optimizer for the main networks
            self.add_optimizer_op("q")
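
A minimal construction sketch, assuming env and config objects from the surrounding project (the DRQN_planner class itself is listed in code example #5 below):

    graph = tf.Graph()
    planner = DRQN_planner(env, config, graph)   # __init__ calls build() on the supplied graph
    planner.initialize()                         # creates the session, summaries, and the target-network sync op
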
Code example #2
    def add_placeholders_op(self):
        """
        Adds placeholders to the graph

        These placeholders are used as inputs by the rest of the model building and will be fed
        data during training.
        """
        # a single observation has shape (4*3, 3, num_classes+2): four 3x3 orientation views stacked along the first axis
        state_shape = list([4*3, 3, len(self.env.state.xmap.item_class_id)+2]) # four orientations
        self.s = tf.placeholder(tf.bool, shape=(None, None, state_shape[0], state_shape[1], state_shape[2]))
        self.hs = tf.nn.rnn_cell.LSTMStateTuple(tf.placeholder(tf.float32, shape=(None, self.config.h_size)),tf.placeholder(tf.float32, shape=(None, self.config.h_size)))
        self.slen = tf.placeholder(tf.int32, shape=(None))
        self.sp = tf.placeholder(tf.bool, shape=(None, None, state_shape[0], state_shape[1], state_shape[2]))
        self.hsp = tf.nn.rnn_cell.LSTMStateTuple(tf.placeholder(tf.float32, shape=(None, self.config.h_size)),tf.placeholder(tf.float32, shape=(None, self.config.h_size)))
        self.splen = tf.placeholder(tf.int32, shape=(None))

        self.agate = tf.placeholder(tf.float32, shape=(None)) # (nb*state_history,)
        self.agatep = tf.placeholder(tf.float32, shape=(None)) # (nb*state_history,)

        self.a = tf.placeholder(tf.int32, shape=(None)) # (nb*state_history,)
        self.past_a = tf.placeholder(tf.int32, shape=(None)) # (nb*state_history,)
        self.r = tf.placeholder(tf.float32, shape=(None)) # (nb*state_history,)
        self.done_mask = tf.placeholder(tf.bool, shape=(None)) # (nb*state_history,)
        self.seq_mask = tf.placeholder(tf.bool, shape=(None)) # (nb*state_history,)
        self.lr = tf.placeholder(tf.float32, shape=(None))

        self.s_dnc = tf.placeholder(tf.float32, shape=(None, 4*self.config.word_size+self.config.mask_size+2))
        self.hs_dnc = DNC.state_placeholder(self.config)
        self.src_inputs_dnc = tf.placeholder(tf.float32, shape=(None, 4*4, 5, 5, self.config.ndigits*self.config.nway))
        self.tgt_inputs_dnc = tf.placeholder(tf.float32, shape=(None, 4*4, 5, 5, self.config.ndigits*self.config.nway))
        self.max_len_dnc = tf.placeholder(tf.int32, shape=(None))
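
For reference, a feed sketch against these placeholders, assuming batch size 1 and sequence length 1; num_classes stands for len(env.state.xmap.item_class_id), and planner/config come from the construction sketch above:

    state_shape = (4*3, 3, num_classes + 2)
    fd = {
        planner.s: np.zeros((1, 1) + state_shape, dtype=bool),
        planner.hs: (np.zeros((1, config.h_size)), np.zeros((1, config.h_size))),
        planner.slen: np.ones(1, dtype='int32'),
        planner.past_a: [0],    # previous action
        planner.agate: [0.0],   # gate value produced by the DNC
    }
    q_vals, h_out = planner.sess.run([planner.q, planner.hs_out], feed_dict=fd)
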
Code example #3
    def evaluate(self, model_a, env=None, num_episodes=None):
        """
        Evaluation with the same procedure as training
        """
        # log our activity only if default call
        if num_episodes is None:
            self.logger.info("Evaluating...")

        # arguments defaults
        if num_episodes is None:
            num_episodes = self.config.num_episodes_test

        if env is None:
            env = self.env
        env.state.is_render_image = self.config.render_test
        model_a.env.state.is_render_image = model_a.config.render_test
        rewards = []
        orientation_map = [np.array([0, 1]), np.array([-1, 0]), np.array([0, -1]), np.array([1, 0])]
        #### visualize landmark in a fixed scene ####
        if self.config.deploy_only and self.config.vis_heat_map:
            width, height = env.state.xmap.dim['width'], env.state.xmap.dim['height']
            goal_heatmap = np.zeros([height, width])
            goal_heatmap_norm = np.zeros([height, width])
        #############################################
        npath = self.config.npath # paths to generate in each environment
        nquery = self.config.nquery # query to generate in each environment
        max_plan_len = self.config.max_plan_len
        ndigits = self.config.ndigits
        nway = self.config.nway
        num_classes = len(self.env.state.xmap.item_class_id)
        for ii in range(num_episodes):
            total_reward = 0
            env.reset()
            model_a.env.reset()
            model_a.env.state.copy_state(model_a.env.agent, env.state)
            dnc_state = DNC.zero_state(self.config, batch_size=1)
            h_state = (np.zeros([1,self.config.h_size]),np.zeros([1,self.config.h_size]))
            slen = np.ones(1).astype('int32')
            action = 0

            # sample paths
            for i in range(npath):
                state_seq, path_loc, path_ori = env.teacher.gen_sample_seq(env.state)
                state_seq_encoding = DRQN_planner.encode_state(state_seq, ndigits, nway)
                goal_state_seq = np.reshape(state_seq, [state_seq.shape[0], 4, 3, 3, num_classes+2]).astype('bool')
                #### missing could be everything ####
                goal_state_seq = np.tile(goal_state_seq[:,:,:,:,[num_classes]], [1,1,1,1,num_classes+2])+goal_state_seq
                #### treat missing observation as correct observation ####
                goal_state_seq[:,:,:,:,num_classes] = True
                #### transpose
                goal_state_seq = np.transpose(goal_state_seq, [0,2,3,4,1])
                path_len = state_seq.shape[0]
                mask_seq = np.logical_not(state_seq[:,:3,:,num_classes])
                flag_seq = np.zeros([path_len])
                flag_seq[-1] = 1
                model_a.env.state.teleport(model_a.env.agent, path_loc[0], orientation_map[path_ori[0]])

                for j in range(path_len):
                    # get agate from dnc
                    cur_dnc_in = np.concatenate([state_seq_encoding[j].reshape(-1),mask_seq[j].reshape(-1), np.array([0, flag_seq[j]])], axis=0)
                    agate_dnc_val = self.sess.run(self.agate_dnc, feed_dict={self.s_dnc: cur_dnc_in[None], self.hs_dnc: dnc_state})
                    agate_dnc_val = agate_dnc_val[0,0]
                    # get q value and sample action
                    action, q_values, h_state = self.get_action(state_seq[j][None][None], h_state, slen, [action], [agate_dnc_val])
                    # take action and update dnc
                    cur_dnc_in[-2] = action
                    dnc_state = self.sess.run(self.hs_out_dnc, feed_dict={self.s_dnc: cur_dnc_in[None], self.hs_dnc: dnc_state})                    
                    #### visualize landmark in a fixed scene ####
                    if self.config.deploy_only and self.config.vis_heat_map:
                        # assumption: only record heat-map hits when the current path location is inside the map
                        cur_loc = np.asarray(path_loc[j])
                        if (cur_loc >= 0).sum() == 2 and (cur_loc < [width, height]).sum() == 2:
                            goal_heatmap_norm[path_loc[j][1], path_loc[j][0]] += 1
                            if action==1:
                                goal_heatmap[path_loc[j][1], path_loc[j][0]] += 1
                    
            reward_list = list()
            for k in range(nquery):
                reward_list.append(0)
                src_inputs, tgt_inputs, src_loc, tgt_loc, goal_obs_onehot_state = self.env.teacher.gen_sample_query(self.env.state)
                src_inputs = DRQN_planner.encode_state(src_inputs, ndigits, nway)
                tgt_inputs = DRQN_planner.encode_state(tgt_inputs, ndigits, nway)
                path_dnc_val, target_ldm_dnc_val = self.sess.run([self.path_dnc, self.target_ldm_dnc], feed_dict={self.hs_dnc: dnc_state, self.src_inputs_dnc: src_inputs[None], 
                    self.tgt_inputs_dnc: tgt_inputs[None], self.max_len_dnc: max_plan_len})
                path_dnc_val = DRQN_planner.decode_state(np.reshape(path_dnc_val[0], [max_plan_len, 3, 3, -1]), ndigits, nway, num_classes+2)
                target_ldm_dnc_val = DRQN_planner.decode_state(np.reshape(target_ldm_dnc_val[0], [3, 3, -1]), ndigits, nway, num_classes+2)
                path_dnc_val_inner = np.argmax(path_dnc_val, axis=3)
                target_ldm_dnc_val_inner = np.argmax(target_ldm_dnc_val, axis=2)
                cur_len = max_plan_len
                for l in range(max_plan_len):
                    if (path_dnc_val_inner[l]==target_ldm_dnc_val_inner).all():
                        cur_len = l+1
                        break
                path_dnc_val = path_dnc_val[:cur_len]
                path_dnc_val = np.concatenate([path_dnc_val, goal_obs_onehot_state[None]], 0)
                #### modify goal state ####
                #### missing could be everything ####
                path_dnc_val = np.tile(path_dnc_val[:,:,:,[num_classes]], [1,1,1,num_classes+2])+path_dnc_val
                #### treat missing observation as correct observation ####
                path_dnc_val[:,:,:,num_classes] = True
                model_a.env.state.teleport(model_a.env.agent, src_loc, np.array([0,1]))
                h_state_a = (np.zeros([1,model_a.config.h_size]),np.zeros([1,model_a.config.h_size]))
                for l in range(path_dnc_val.shape[0]):
                    cur_goal_state = path_dnc_val[l]
                    cur_goal_state = np.expand_dims(cur_goal_state, 3)
                    cur_goal_state = np.concatenate([np.rot90(cur_goal_state, 0), np.rot90(cur_goal_state, 1),
                        np.rot90(cur_goal_state, 2), np.rot90(cur_goal_state, 3)], 3)
                    model_a.env.teacher.set_goal(cur_goal_state, tgt_loc)
                    reward_list[-1] += model_a.navi_goal(h_state_a, cur_goal_state)
                if model_a.env.teacher.goal_finish:
                    reward_list[-1] += 10
            total_reward = sum(reward_list)/len(reward_list)
                    
            # updates to perform at the end of an episode
            rewards.append(total_reward)

        avg_reward = np.mean(rewards)
        sigma_reward = np.sqrt(np.var(rewards) / len(rewards))
        #### visualize landmark in a fixed scene ####
        if self.config.deploy_only and self.config.vis_heat_map:
            #goal_heatmap /= (goal_heatmap_norm+1e-6)
            plt.imshow(goal_heatmap, cmap='hot', interpolation='nearest')
            plt.show()
            plt.pause(100)
        #############################################

        if num_episodes > 1:
            msg = "Average reward: {:04.2f} +/- {:04.2f}".format(avg_reward, sigma_reward)
            self.logger.info(msg)

        return avg_reward
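
A typical invocation sketch, assuming model_a is the separately built navigation agent whose navi_goal() executes the proposed sub-goals (my_env below is a hypothetical alternative environment):

    avg_r = planner.evaluate(model_a)                                # config.num_episodes_test episodes on planner.env
    avg_r = planner.evaluate(model_a, env=my_env, num_episodes=5)    # smaller run on a custom environment
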
Code example #4
    def train(self, model_a, exp_schedule, lr_schedule):
        """
        Performs training of Q

        Args:
            model_a: navigation agent whose navi_goal() executes the proposed sub-goals
            exp_schedule: Exploration instance s.t.
                exp_schedule.get_action(best_action) returns an action
            lr_schedule: Schedule for learning rate
        """
        # initialize replay buffer and variables
        replay_buffer = ReplayBuffer(self.config.buffer_size, self.config.state_history)
        rewards = deque(maxlen=self.config.num_episodes_test)
        max_q_values = deque(maxlen=1000)
        q_values = deque(maxlen=1000)
        self.init_averages()

        t = last_eval = last_record = 0 # time control of nb of steps
        scores_eval = [] # list of scores computed at iteration time
        #scores_eval += [self.evaluate()]

        prog = Progbar(target=self.config.nsteps_train)

        self.env.state.is_render_image = self.config.render_train
        model_a.env.state.is_render_image = model_a.config.render_train
        orientation_map = [np.array([0, 1]), np.array([-1, 0]), np.array([0, -1]), np.array([1, 0])]

        npath = self.config.npath # paths to generate in each environment
        nquery = self.config.nquery # query to generate in each environment
        max_plan_len = self.config.max_plan_len
        ndigits = self.config.ndigits
        nway = self.config.nway

        num_classes = len(self.env.state.xmap.item_class_id)

        # three steps:
        #   1. sample paths from the teacher environment and pass to dnc
        #   2. get immediate reward from whether agent could reach the subgoal
        #   3. sample query paths and ask agent to follow the plan, get the final big reward
        #   -- train one step after each teacher's move
        # interact with environment
        while t < self.config.nsteps_train:
            total_reward = 0
            self.env.reset()
            model_a.env.reset()
            model_a.env.state.copy_state(model_a.env.agent, self.env.state)
            dnc_state = DNC.zero_state(self.config, batch_size=1)
            h_state = (np.zeros([1,self.config.h_size]),np.zeros([1,self.config.h_size]))
            slen = np.ones(1).astype('int32')
            action = 0

            # sample paths
            for i in range(npath):
                state_seq, path_loc, path_ori = self.env.teacher.gen_sample_seq(self.env.state)
                state_seq_encoding = DRQN_planner.encode_state(state_seq, ndigits, nway)
                goal_state_seq = np.reshape(state_seq, [state_seq.shape[0], 4, 3, 3, num_classes+2]).astype('bool')
                #### missing could be everything ####
                goal_state_seq = np.tile(goal_state_seq[:,:,:,:,[num_classes]], [1,1,1,1,num_classes+2])+goal_state_seq
                #### treat missing observation as correct observation ####
                goal_state_seq[:,:,:,:,num_classes] = True
                #### transpose
                goal_state_seq = np.transpose(goal_state_seq, [0,2,3,4,1])
                path_len = state_seq.shape[0]
                mask_seq = np.logical_not(state_seq[:,:3,:,num_classes])
                flag_seq = np.zeros([path_len])
                flag_seq[-1] = 1
                model_a.env.state.teleport(model_a.env.agent, path_loc[0], orientation_map[path_ori[0]])

                for j in range(path_len):
                    # get agate from dnc
                    cur_dnc_in = np.concatenate([state_seq_encoding[j].reshape(-1),mask_seq[j].reshape(-1), np.array([0, flag_seq[j]])], axis=0)
                    agate_dnc_val = self.sess.run(self.agate_dnc, feed_dict={self.s_dnc: cur_dnc_in[None], self.hs_dnc: dnc_state})
                    agate_dnc_val = agate_dnc_val[0,0]
                    # get q value and sample action
                    idx = replay_buffer.store_frame(state_seq[j])
                    q_input = replay_buffer.encode_recent_observation()
                    best_action, q_vals, h_state = self.get_best_action([q_input], h_state, slen, [action], [agate_dnc_val])
                    action = exp_schedule.get_action(best_action)
                    # store q values (use a separate name so the q_values deque is not clobbered)
                    max_q_values.append(max(q_vals))
                    q_values += list(q_vals)
                    # take action and update dnc
                    cur_dnc_in[-2] = action
                    dnc_state = self.sess.run(self.hs_out_dnc, feed_dict={self.s_dnc: cur_dnc_in[None], self.hs_dnc: dnc_state})
                    # acquire reward
                    reward = 0
                    done = False
                    if action==1:
                        h_state_a = (np.zeros([1,model_a.config.h_size]),np.zeros([1,model_a.config.h_size]))
                        model_a.env.teacher.set_goal(goal_state_seq[j], path_loc[j])
                        reward_a = model_a.navi_goal(h_state_a, goal_state_seq[j])
                        if not model_a.env.teacher.goal_finish:
                            reward += -0.05
                        reward += -0.05
                        model_a.env.state.teleport(model_a.env.agent, path_loc[j], orientation_map[path_ori[j]])
                    # acquire final reward
                    if i==npath-1 and j==path_len-1:
                        done = True
                        reward_list = list()
                        for k in range(nquery):
                            reward_list.append(0)
                            src_inputs, tgt_inputs, src_loc, tgt_loc, goal_obs_onehot_state = self.env.teacher.gen_sample_query(self.env.state)
                            src_inputs = DRQN_planner.encode_state(src_inputs, ndigits, nway)
                            tgt_inputs = DRQN_planner.encode_state(tgt_inputs, ndigits, nway)
                            path_dnc_val, target_ldm_dnc_val = self.sess.run([self.path_dnc, self.target_ldm_dnc], feed_dict={self.hs_dnc: dnc_state, self.src_inputs_dnc: src_inputs[None], 
                                self.tgt_inputs_dnc: tgt_inputs[None], self.max_len_dnc: max_plan_len})
                            path_dnc_val = DRQN_planner.decode_state(np.reshape(path_dnc_val[0], [max_plan_len, 3, 3, -1]), ndigits, nway, num_classes+2)
                            target_ldm_dnc_val = DRQN_planner.decode_state(np.reshape(target_ldm_dnc_val[0], [3, 3, -1]), ndigits, nway, num_classes+2)
                            path_dnc_val_inner = np.argmax(path_dnc_val, axis=3)
                            target_ldm_dnc_val_inner = np.argmax(target_ldm_dnc_val, axis=2)
                            cur_len = max_plan_len
                            for l in range(max_plan_len):
                                if (path_dnc_val_inner[l]==target_ldm_dnc_val_inner).all():
                                    cur_len = l+1
                                    break
                            path_dnc_val = path_dnc_val[:cur_len]
                            path_dnc_val = np.concatenate([path_dnc_val, goal_obs_onehot_state[None]], 0)
                            #### modify goal state ####
                            #### missing could be everything ####
                            path_dnc_val = np.tile(path_dnc_val[:,:,:,[num_classes]], [1,1,1,num_classes+2])+path_dnc_val
                            #### treat missing observation as correct observation ####
                            path_dnc_val[:,:,:,num_classes] = True
                            model_a.env.state.teleport(model_a.env.agent, src_loc, np.array([0,1]))
                            h_state_a = (np.zeros([1,model_a.config.h_size]),np.zeros([1,model_a.config.h_size]))
                            for l in range(path_dnc_val.shape[0]):
                                cur_goal_state = path_dnc_val[l]
                                cur_goal_state = np.expand_dims(cur_goal_state, 3)
                                cur_goal_state = np.concatenate([np.rot90(cur_goal_state, 0), np.rot90(cur_goal_state, 1),
                                    np.rot90(cur_goal_state, 2), np.rot90(cur_goal_state, 3)], 3)
                                model_a.env.teacher.set_goal(cur_goal_state, tgt_loc)
                                reward_list[-1] += model_a.navi_goal(h_state_a, cur_goal_state)
                            if model_a.env.teacher.goal_finish:
                                reward_list[-1] += 10
                        reward += sum(reward_list)/len(reward_list)
                    # store everything into replay buffer
                    replay_buffer.store_effect(idx, action, agate_dnc_val, reward, done)

                    t += 1
                    last_eval += 1
                    last_record += 1

                    # perform a training step
                    loss_eval, grad_eval = self.train_step(t, replay_buffer, lr_schedule.epsilon)

                    # logging stuff
                    if ((t > self.config.learning_start) and (t % self.config.log_freq == 0) and
                       (t % self.config.learning_freq == 0)):
                        self.update_averages(rewards, max_q_values, q_values, scores_eval)
                        exp_schedule.update(t)
                        lr_schedule.update(t)
                        if len(rewards) > 0:
                            prog.update(t + 1, exact=[("Loss", loss_eval), ("Avg R", self.avg_reward), 
                                            ("Max R", np.max(rewards)), ("eps", exp_schedule.epsilon), 
                                            ("Grads", grad_eval), ("Max Q", self.max_q), 
                                            ("lr", lr_schedule.epsilon)])

                    elif (t < self.config.learning_start) and (t % self.config.log_freq == 0):
                        sys.stdout.write("\rPopulating the memory {}/{}...".format(t, 
                                                            self.config.learning_start))
                        sys.stdout.flush()

                # count reward
                total_reward += reward
                if done or t >= self.config.nsteps_train:
                    break

            # updates to perform at the end of an episode
            rewards.append(total_reward)          

            if (t > self.config.learning_start) and (last_eval > self.config.eval_freq):
                # evaluate our policy
                last_eval = 0
                print("")
                self.logger.info("Global step: %d"%(t))
                scores_eval += [self.evaluate(model_a)]

        # last words
        self.logger.info("- Training done.")
        self.save(t)
        scores_eval += [self.evaluate(model_a)]
        export_plot(scores_eval, "Scores", self.config.plot_output)
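
An end-to-end training sketch. The schedule objects only need the get_action()/update()/epsilon interface used above; the constructor names and config fields below are illustrative assumptions, not the project's actual classes:

    exp_schedule = LinearExploration(env, config.eps_begin, config.eps_end, config.eps_nsteps)   # hypothetical
    lr_schedule  = LinearSchedule(config.lr_begin, config.lr_end, config.lr_nsteps)              # hypothetical
    planner.run(model_a, exp_schedule, lr_schedule)   # initialize(), then train(model_a, exp_schedule, lr_schedule)
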
Code example #5
class DRQN_planner(object):
    """
    Implement DRQN with Tensorflow
    """
    def __init__(self, env, config, current_graph, logger=None):
        """
        Initialize Q Network and env

        Args:
            env: environment providing the teacher and map state
            config: class with hyperparameters
            current_graph: tf.Graph the model is built into
            logger: logger instance from logging module
        """
        # directory for training outputs
        if not os.path.exists(config.output_path):
            os.makedirs(config.output_path)
            
        # store hyper params
        self.config = config
        self.logger = logger
        if logger is None:
            self.logger = get_logger(config.log_path)
        self.env = env
        self.current_graph = current_graph

        # build model
        self.build()

    def process_state(self, state):
        """
        Processing of state

        State placeholders are tf.bool for fast transfer to GPU;
        they are cast to float32 and rescaled for the rest of the tf graph.

        Args:
            state: node of tf graph holding the boolean observation tensor
                   of shape = (batch_size, seq_len, img_w, img_h, nchannels)
        """
        state = tf.cast(state, tf.float32)
        state /= self.config.high

        return state


    def get_best_action(self, state, h_state, slen, past_a, agate):
        """
        Return best action

        Args:
            state: observation
            h_state: hidden state for rnn
        Returns:
            action: (int)
            action_values: (np array) q values for all actions
            h_state_out: updated rnn hidden state
        """
        action_values, h_state_out = self.sess.run([self.q, self.hs_out], feed_dict={self.s: state, self.hs: h_state, self.slen: slen, self.past_a: past_a, self.agate: agate})
        return np.argmax(action_values[0]), action_values[0], h_state_out

    def get_action(self, state, h_state, slen, past_a, agate):
        """
        Returns action with some epsilon strategy

        Args:
            state: observation from gym
            h_state: hidden state for rnn
        """
        best_action, q_values, h_state_out = self.get_best_action(state, h_state, slen, past_a, agate)
        if np.random.random() < self.config.soft_epsilon:
            return np.random.randint(0, 2), q_values, h_state_out
        else:
            return best_action, q_values, h_state_out
            
    @property
    def policy(self):
        """
        model.policy(state, h_state) = action
        """
        return lambda state, h_state, slen, past_a, agate: self.get_action(state, h_state, slen, past_a, agate)

    def init_averages(self):
        """
        Defines extra attributes for tensorboard
        """
        self.avg_reward = 0
        self.max_reward = 0
        self.std_reward = 0

        self.avg_q = 0
        self.max_q = 0
        self.std_q = 0
        
        self.eval_reward = 0

    def update_averages(self, rewards, max_q_values, q_values, scores_eval):
        """
        Update the averages

        Args:
            rewards: deque
            max_q_values: deque
            q_values: deque
            scores_eval: list
        """
        self.avg_reward = np.mean(rewards)
        self.max_reward = np.max(rewards)
        self.std_reward = np.sqrt(np.var(rewards) / len(rewards))

        self.max_q      = np.mean(max_q_values)
        self.avg_q      = np.mean(q_values)
        self.std_q      = np.sqrt(np.var(q_values) / len(q_values))

        if len(scores_eval) > 0:
            self.eval_reward = scores_eval[-1]

    def add_placeholders_op(self):
        """
        Adds placeholders to the graph

        These placeholders are used as inputs by the rest of the model building and will be fed
        data during training.
        """
        # a single observation has shape (4*3, 3, num_classes+2): four 3x3 orientation views stacked along the first axis
        state_shape = list([4*3, 3, len(self.env.state.xmap.item_class_id)+2]) # four orientations
        self.s = tf.placeholder(tf.bool, shape=(None, None, state_shape[0], state_shape[1], state_shape[2]))
        self.hs = tf.nn.rnn_cell.LSTMStateTuple(tf.placeholder(tf.float32, shape=(None, self.config.h_size)),tf.placeholder(tf.float32, shape=(None, self.config.h_size)))
        self.slen = tf.placeholder(tf.int32, shape=(None))
        self.sp = tf.placeholder(tf.bool, shape=(None, None, state_shape[0], state_shape[1], state_shape[2]))
        self.hsp = tf.nn.rnn_cell.LSTMStateTuple(tf.placeholder(tf.float32, shape=(None, self.config.h_size)),tf.placeholder(tf.float32, shape=(None, self.config.h_size)))
        self.splen = tf.placeholder(tf.int32, shape=(None))

        self.agate = tf.placeholder(tf.float32, shape=(None)) # (nb*state_history,)
        self.agatep = tf.placeholder(tf.float32, shape=(None)) # (nb*state_history,)

        self.a = tf.placeholder(tf.int32, shape=(None)) # (nb*state_history,)
        self.past_a = tf.placeholder(tf.int32, shape=(None)) # (nb*state_history,)
        self.r = tf.placeholder(tf.float32, shape=(None)) # (nb*state_history,)
        self.done_mask = tf.placeholder(tf.bool, shape=(None)) # (nb*state_history,)
        self.seq_mask = tf.placeholder(tf.bool, shape=(None)) # (nb*state_history,)
        self.lr = tf.placeholder(tf.float32, shape=(None))

        self.s_dnc = tf.placeholder(tf.float32, shape=(None, 4*self.config.word_size+self.config.mask_size+2))
        self.hs_dnc = DNC.state_placeholder(self.config)
        self.src_inputs_dnc = tf.placeholder(tf.float32, shape=(None, 4*4, 5, 5, self.config.ndigits*self.config.nway))
        self.tgt_inputs_dnc = tf.placeholder(tf.float32, shape=(None, 4*4, 5, 5, self.config.ndigits*self.config.nway))
        self.max_len_dnc = tf.placeholder(tf.int32, shape=(None))

    def clip_if_enabled(self, x):
        if self.config.clip_val > 0:
            return tf.clip_by_value(x, -self.config.clip_val, self.config.clip_val)
        else:
            return x

    def get_q_values_op(self, state, past_a, agate, seq_len, h_state, scope, reuse=False):
        """
        Returns Q values for all actions

        Args:
            state: (tf tensor) 
                shape = (batch_size, seq_len, img_w, img_h, nchannel)
            agate: (tf tensor)
                shape = (batch_size*seq_len,)
            past_a: (tf tensor)
                shape = (batch_size*seq_len,)
            seq_len: (tf tensor)
                shape = (batch_size,)
            h_state: (tf tensor) 
                shape = (batch_size, h_size)
            scope: (string) scope name, that specifies if target network or not
            reuse: (bool) reuse of variables in the scope

        Returns:
            out: (tf tensor) of shape = (batch_size * seq_len, num_actions)
            h_state_out: (tf tensor) of shape = (batch_size, h_size)
        """

        num_actions = 2
        h_size = self.config.h_size
        max_seq_len = tf.shape(state)[1]
        state_shape = list([4*3, 3, len(self.env.state.xmap.item_class_id)+2])
        past_a = tf.reshape(tf.one_hot(past_a, num_actions), shape=(-1, max_seq_len, 1, num_actions))
        past_a = tf.tile(past_a, multiples=[1,1,4,1])

        #out = tf.reshape(state, shape=(-1, max_seq_len, 4, np.int32(state_shape[0]*state_shape[1]*state_shape[2]/4)))

        out = tf.reduce_sum(state[:,:,:,:,1:17], axis=4, keep_dims=True)
        out = tf.concat([out, tf.expand_dims(state[:,:,:,:,0],4), tf.expand_dims(state[:,:,:,:,17],4), state[:,:,:,:,22:24]], axis=4)
        out = tf.reshape(out, shape=(-1, max_seq_len, 4, 3*3*5))

        agate = tf.reshape(agate, shape=(-1, max_seq_len, 1, 1))
        agate = tf.tile(agate, multiples=[1,1,4,1])
        with tf.variable_scope(scope, reuse=reuse):
            #### recurrent
            out = tf.concat([out, past_a, agate], axis=3)
            out = layers.fully_connected(layers.fully_connected(out, 200), 100)
            out = tf.reduce_max(out, axis=2)
            lstm_cell = tf.nn.rnn_cell.LSTMCell(num_units=h_size)
            out, h_state_out = tf.nn.dynamic_rnn(inputs=out, cell=lstm_cell, sequence_length=seq_len, dtype=tf.float32, initial_state=h_state)
            out = tf.reshape(out, shape=[-1,h_size])
            out = self.clip_if_enabled(out)
            h_state_out = tf.nn.rnn_cell.LSTMStateTuple(self.clip_if_enabled(h_state_out[0]),self.clip_if_enabled(h_state_out[1]))

            streamA, streamV = tf.split(out, 2, axis=1)
            advantage = layers.fully_connected(streamA, num_actions, activation_fn = None, weights_initializer=layers.xavier_initializer(), biases_initializer=tf.zeros_initializer())
            value = layers.fully_connected(streamV, 1, activation_fn = None, weights_initializer=layers.xavier_initializer(), biases_initializer=tf.zeros_initializer())
            out = value+tf.subtract(advantage,tf.reduce_mean(advantage, axis=1, keep_dims=True))
        return out, h_state_out
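
    # Dueling aggregation, worked with illustrative numbers (not taken from the project):
    # with value = [[1.0]] and advantage = [[0.5, -0.5]], the mean advantage is 0.0, so
    # out = value + (advantage - mean) = [[1.5, 0.5]]; subtracting the mean keeps the
    # value and advantage streams identifiable.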


    def add_update_target_op(self, q_scope, target_q_scope):
        """
        update_target_op will be called periodically 
        to copy Q network weights to target Q network
    
        Args:
            q_scope: (string) name of the scope of variables for q
            target_q_scope: (string) name of the scope of variables
                        for the target network
        """
        params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, q_scope)
        tgt_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, target_q_scope)
        self.update_target_op = tf.group(*[tf.assign(tgt, src) for tgt, src in zip(tgt_params, params)])


    def add_loss_op(self, q, target_q):
        """
        Sets the loss of a batch, self.loss is a scalar

        Args:
            q: (tf tensor) shape = (batch_size, num_actions)
            target_q: (tf tensor) shape = (batch_size, num_actions)
        """
        num_actions = 2  # self.env.agent.num_actions
        q_sample = self.r+tf.reduce_max(target_q, axis=1)*(1-tf.cast(self.done_mask,tf.float32))*self.config.gamma
        q_pred = q*tf.one_hot(self.a, num_actions)
        q_pred = tf.reduce_sum(q_pred, axis=1)
        self.loss = tf.reduce_sum(tf.square(q_pred-q_sample)*tf.cast(self.seq_mask,tf.float32), axis=0)/tf.reduce_sum(tf.cast(self.seq_mask,tf.float32), axis=0)
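
    # Worked example with illustrative numbers (not from the project): with gamma = 0.9,
    # r = [1, 0], max_a' target_q = [2, 5] and done_mask = [False, True], the targets are
    # q_sample = [1 + 0.9*2, 0 + 0] = [2.8, 0.0]; q_pred selects Q(s, a) for the taken
    # action via the one-hot mask, and seq_mask = [True, False] means only the first
    # (non-padding) step contributes to the averaged squared error.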


    def add_optimizer_op(self, scope):
        """
        Set self.train_op and self.grad_norm
        """
        with tf.variable_scope(scope):
            opt = tf.train.AdamOptimizer(learning_rate=self.lr)
            params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope)
            grads_and_vars = opt.compute_gradients(self.loss, params)
            if self.config.grad_clip:
                grads_and_vars = [(tf.clip_by_norm(grad, self.config.grad_clip_val), var) for grad, var in grads_and_vars]
            self.train_op = opt.apply_gradients(grads_and_vars)
            grads = [grad for grad, var in grads_and_vars]
            self.grad_norm = tf.global_norm(grads)  

    def add_summary(self):
        """
        Tensorboard stuff
        """
        # extra placeholders to log stuff from python
        self.avg_reward_placeholder = tf.placeholder(tf.float32, shape=(), name="avg_reward")
        self.max_reward_placeholder = tf.placeholder(tf.float32, shape=(), name="max_reward")
        self.std_reward_placeholder = tf.placeholder(tf.float32, shape=(), name="std_reward")

        self.avg_q_placeholder  = tf.placeholder(tf.float32, shape=(), name="avg_q")
        self.max_q_placeholder  = tf.placeholder(tf.float32, shape=(), name="max_q")
        self.std_q_placeholder  = tf.placeholder(tf.float32, shape=(), name="std_q")

        self.eval_reward_placeholder = tf.placeholder(tf.float32, shape=(), name="eval_reward")

        # add placeholders from the graph
        tf.summary.scalar("loss", self.loss)
        tf.summary.scalar("grads_norm", self.grad_norm)

        # extra summaries from python -> placeholders
        tf.summary.scalar("Avg_Reward", self.avg_reward_placeholder)
        tf.summary.scalar("Max_Reward", self.max_reward_placeholder)
        tf.summary.scalar("Std_Reward", self.std_reward_placeholder)

        tf.summary.scalar("Avg_Q", self.avg_q_placeholder)
        tf.summary.scalar("Max_Q", self.max_q_placeholder)
        tf.summary.scalar("Std_Q", self.std_q_placeholder)

        tf.summary.scalar("Eval_Reward", self.eval_reward_placeholder)
            
        # logging
        self.merged = tf.summary.merge_all()
        self.file_writer = tf.summary.FileWriter(self.config.output_path, 
                                                self.sess.graph)

    def save(self, t):
        """
        Saves session
        """
        with self.current_graph.as_default():
            if not os.path.exists(os.path.dirname(self.config.model_output)):
                os.makedirs(os.path.dirname(self.config.model_output))

            self.saver.save(self.sess, self.config.model_output, global_step = t)


    def restore(self, t):
        """
        Restore session
        """
        with self.current_graph.as_default():
            #self.saver = tf.train.import_meta_graph(self.config.model_output+'-'+str(t)+'.meta')
            self.saver.restore(self.sess, self.config.model_output+'-'+str(t))

    def build(self):
        """
        Build model by adding all necessary variables
        """
        with self.current_graph.as_default():
            # add placeholders
            self.add_placeholders_op()

            # compute Q values of state
            s = self.process_state(self.s)
            self.q, self.hs_out = self.get_q_values_op(s, self.past_a, self.agate, self.slen, self.hs, scope="q", reuse=False)

            # compute Q values of next state
            sp = self.process_state(self.sp)
            self.target_q, self.hsp_out = self.get_q_values_op(sp, self.a, self.agatep, self.splen, self.hsp, scope="target_q", reuse=False)

            self.dnc = DNC(self.config, self.config.dnc_h_size)
            self.out_dnc, self.hs_out_dnc = self.dnc(self.s_dnc, self.hs_dnc)
            self.agate_dnc = self.dnc.get_agate(self.s_dnc, self.hs_dnc)
            self.path_dnc, self.target_ldm_dnc = self.dnc.value_iteration(self.hs_dnc.access_state, self.src_inputs_dnc, self.tgt_inputs_dnc, self.max_len_dnc)

            # add update operator for target network
            self.add_update_target_op("q", "target_q")

            # add square loss
            self.add_loss_op(self.q, self.target_q)

            # add optimizer for the main networks
            self.add_optimizer_op("q")

    def initialize(self):
        """
        Assumes the graph has been constructed
        Creates a tf Session and run initializer of variables
        """
        # create tf session
        with self.current_graph.as_default():
            self.sess = tf.Session()
            # tensorboard stuff
            self.add_summary()

            # initialize all variables
            init = tf.global_variables_initializer()
            self.sess.run(init)

            # synchronise q and target_q networks
            self.sess.run(self.update_target_op)

            # for saving network weights
            self.saver = tf.train.Saver(max_to_keep=50)
            if self.config.restore_param:
                self.restore(self.config.restore_t)

    def update_target_params(self):
        """
        Update parameters of Q' with parameters of Q
        """
        self.sess.run(self.update_target_op)

    def convert_state_to_goal_state(self, state):
        #### goal state is used for filtering and raw_goal_state is used to predict Q
        #### crop to get 3x3 goal ####
        side_radius = self.config.visible_radius_unit_side
        raw_goal_state = copy.deepcopy(state[:3,side_radius-1:side_radius+2,:])
        goal_state = copy.deepcopy(raw_goal_state)
        #### missing could be everything ####
        num_classes = len(self.env.state.xmap.item_class_id)
        for i in range(3):
            for j in range(3):
                if goal_state[i,j,num_classes] == True:
                    goal_state[i,j,:] = True
        #### treat missing observation as correct observation ####
        goal_state[:,:,num_classes] = True
        #### rotate ####
        goal_state = np.expand_dims(goal_state, 3)
        goal_state = np.concatenate([np.rot90(goal_state, 0), np.rot90(goal_state, 1),
            np.rot90(goal_state, 2), np.rot90(goal_state, 3)], 3)
        #### rotate raw goal state ####
        raw_goal_state = np.concatenate([np.rot90(raw_goal_state, 0), np.rot90(raw_goal_state, 1),
            np.rot90(raw_goal_state, 2), np.rot90(raw_goal_state, 3)], 0)
        return raw_goal_state, goal_state

    def train_step(self, t, replay_buffer, lr):
        """
        Perform training step

        Args:
            t: (int) nth step
            replay_buffer: buffer for sampling
            lr: (float) learning rate
        """
        loss_eval, grad_eval = 0, 0

        # perform training step
        if (t > self.config.learning_start and t % self.config.learning_freq == 0):
            loss_eval, grad_eval = self.update_step(t, replay_buffer, lr)

        # occasionally update target network with q network
        if t % self.config.target_update_freq == 0:
            self.update_target_params()
            
        # occasionally save the weights
        if (t % self.config.saving_freq == 0):
            self.save(t)

        return loss_eval, grad_eval

    def update_step(self, t, replay_buffer, lr):
        """
        Performs an update of parameters by sampling from replay_buffer

        Args:
            t: number of iteration (episode and move)
            replay_buffer: ReplayBuffer instance .sample() gives batches
            lr: (float) learning rate
        Returns:
            loss: (Q - Q_target)^2
        """

        s_batch, slen_batch, a_batch, past_a_batch, agate_batch, agatep_batch, r_batch, done_mask_batch, seq_mask_batch, sp_batch, splen_batch = replay_buffer.sample_batch(
            self.config.batch_size)

        fd = {
            # inputs
            self.s: s_batch,
            self.slen: slen_batch,
            self.hs: (np.zeros([self.config.batch_size, self.config.h_size]),np.zeros([self.config.batch_size, self.config.h_size])),
            self.a: a_batch,
            self.past_a: past_a_batch,
            self.agate: agate_batch,
            self.agatep: agatep_batch,
            self.r: r_batch,
            self.sp: sp_batch, 
            self.splen: splen_batch,
            self.hsp: (np.zeros([self.config.batch_size, self.config.h_size]),np.zeros([self.config.batch_size, self.config.h_size])),
            self.done_mask: done_mask_batch,
            self.seq_mask: seq_mask_batch,
            self.lr: lr, 
            # extra info
            self.avg_reward_placeholder: self.avg_reward, 
            self.max_reward_placeholder: self.max_reward, 
            self.std_reward_placeholder: self.std_reward, 
            self.avg_q_placeholder: self.avg_q, 
            self.max_q_placeholder: self.max_q, 
            self.std_q_placeholder: self.std_q, 
            self.eval_reward_placeholder: self.eval_reward, 
        }

        loss_eval, grad_norm_eval, summary, _ = self.sess.run([self.loss, self.grad_norm, 
                                                 self.merged, self.train_op], feed_dict=fd)

        # tensorboard stuff
        self.file_writer.add_summary(summary, t)
        
        return loss_eval, grad_norm_eval

    def train(self, model_a, exp_schedule, lr_schedule):
        """
        Performs training of Q

        Args:
            model_a: navigation agent whose navi_goal() executes the proposed sub-goals
            exp_schedule: Exploration instance s.t.
                exp_schedule.get_action(best_action) returns an action
            lr_schedule: Schedule for learning rate
        """
        # initialize replay buffer and variables
        replay_buffer = ReplayBuffer(self.config.buffer_size, self.config.state_history)
        rewards = deque(maxlen=self.config.num_episodes_test)
        max_q_values = deque(maxlen=1000)
        q_values = deque(maxlen=1000)
        self.init_averages()

        t = last_eval = last_record = 0 # time control of nb of steps
        scores_eval = [] # list of scores computed at iteration time
        #scores_eval += [self.evaluate()]

        prog = Progbar(target=self.config.nsteps_train)

        self.env.state.is_render_image = self.config.render_train
        model_a.env.state.is_render_image = model_a.config.render_train
        orientation_map = [np.array([0, 1]), np.array([-1, 0]), np.array([0, -1]), np.array([1, 0])]

        npath = self.config.npath # paths to generate in each environment
        nquery = self.config.nquery # query to generate in each environment
        max_plan_len = self.config.max_plan_len
        ndigits = self.config.ndigits
        nway = self.config.nway

        num_classes = len(self.env.state.xmap.item_class_id)

        # three steps:
        #   1. sample paths from the teacher environment and pass to dnc
        #   2. get immediate reward from whether agent could reach the subgoal
        #   3. sample query paths and ask agent to follow the plan, get the final big reward
        #   -- train one step after each teacher's move
        # interact with environment
        while t < self.config.nsteps_train:
            total_reward = 0
            self.env.reset()
            model_a.env.reset()
            model_a.env.state.copy_state(model_a.env.agent, self.env.state)
            dnc_state = DNC.zero_state(self.config, batch_size=1)
            h_state = (np.zeros([1,self.config.h_size]),np.zeros([1,self.config.h_size]))
            slen = np.ones(1).astype('int32')
            action = 0

            # sample paths
            for i in range(npath):
                state_seq, path_loc, path_ori = self.env.teacher.gen_sample_seq(self.env.state)
                state_seq_encoding = DRQN_planner.encode_state(state_seq, ndigits, nway)
                goal_state_seq = np.reshape(state_seq, [state_seq.shape[0], 4, 3, 3, num_classes+2]).astype('bool')
                #### missing could be everything ####
                goal_state_seq = np.tile(goal_state_seq[:,:,:,:,[num_classes]], [1,1,1,1,num_classes+2])+goal_state_seq
                #### treat missing observation as correct observation ####
                goal_state_seq[:,:,:,:,num_classes] = True
                #### transpose
                goal_state_seq = np.transpose(goal_state_seq, [0,2,3,4,1])
                path_len = state_seq.shape[0]
                mask_seq = np.logical_not(state_seq[:,:3,:,num_classes])
                flag_seq = np.zeros([path_len])
                flag_seq[-1] = 1
                model_a.env.state.teleport(model_a.env.agent, path_loc[0], orientation_map[path_ori[0]])

                for j in range(path_len):
                    # get agate from dnc
                    cur_dnc_in = np.concatenate([state_seq_encoding[j].reshape(-1),mask_seq[j].reshape(-1), np.array([0, flag_seq[j]])], axis=0)
                    agate_dnc_val = self.sess.run(self.agate_dnc, feed_dict={self.s_dnc: cur_dnc_in[None], self.hs_dnc: dnc_state})
                    agate_dnc_val = agate_dnc_val[0,0]
                    # get q value and sample action
                    idx = replay_buffer.store_frame(state_seq[j])
                    q_input = replay_buffer.encode_recent_observation()
                    best_action, q_vals, h_state = self.get_best_action([q_input], h_state, slen, [action], [agate_dnc_val])
                    action = exp_schedule.get_action(best_action)
                    # store q values (use a separate name so the q_values deque is not clobbered)
                    max_q_values.append(max(q_vals))
                    q_values += list(q_vals)
                    # take action and update dnc
                    cur_dnc_in[-2] = action
                    dnc_state = self.sess.run(self.hs_out_dnc, feed_dict={self.s_dnc: cur_dnc_in[None], self.hs_dnc: dnc_state})
                    # acquire reward
                    reward = 0
                    done = False
                    if action==1:
                        h_state_a = (np.zeros([1,model_a.config.h_size]),np.zeros([1,model_a.config.h_size]))
                        model_a.env.teacher.set_goal(goal_state_seq[j], path_loc[j])
                        reward_a = model_a.navi_goal(h_state_a, goal_state_seq[j])
                        if not model_a.env.teacher.goal_finish:
                            reward += -0.05
                        reward += -0.05
                        model_a.env.state.teleport(model_a.env.agent, path_loc[j], orientation_map[path_ori[j]])
                    # acquire final reward
                    if i==npath-1 and j==path_len-1:
                        done = True
                        reward_list = list()
                        for k in range(nquery):
                            reward_list.append(0)
                            src_inputs, tgt_inputs, src_loc, tgt_loc, goal_obs_onehot_state = self.env.teacher.gen_sample_query(self.env.state)
                            src_inputs = DRQN_planner.encode_state(src_inputs, ndigits, nway)
                            tgt_inputs = DRQN_planner.encode_state(tgt_inputs, ndigits, nway)
                            path_dnc_val, target_ldm_dnc_val = self.sess.run([self.path_dnc, self.target_ldm_dnc], feed_dict={self.hs_dnc: dnc_state, self.src_inputs_dnc: src_inputs[None], 
                                self.tgt_inputs_dnc: tgt_inputs[None], self.max_len_dnc: max_plan_len})
                            path_dnc_val = DRQN_planner.decode_state(np.reshape(path_dnc_val[0], [max_plan_len, 3, 3, -1]), ndigits, nway, num_classes+2)
                            target_ldm_dnc_val = DRQN_planner.decode_state(np.reshape(target_ldm_dnc_val[0], [3, 3, -1]), ndigits, nway, num_classes+2)
                            path_dnc_val_inner = np.argmax(path_dnc_val, axis=3)
                            target_ldm_dnc_val_inner = np.argmax(target_ldm_dnc_val, axis=2)
                            cur_len = max_plan_len
                            for l in range(max_plan_len):
                                if (path_dnc_val_inner[l]==target_ldm_dnc_val_inner).all():
                                    cur_len = l+1
                                    break
                            path_dnc_val = path_dnc_val[:cur_len]
                            path_dnc_val = np.concatenate([path_dnc_val, goal_obs_onehot_state[None]], 0)
                            #### modify goal state ####
                            #### missing could be everything ####
                            path_dnc_val = np.tile(path_dnc_val[:,:,:,[num_classes]], [1,1,1,num_classes+2])+path_dnc_val
                            #### treat missing observation as correct observation ####
                            path_dnc_val[:,:,:,num_classes] = True
                            model_a.env.state.teleport(model_a.env.agent, src_loc, np.array([0,1]))
                            h_state_a = (np.zeros([1,model_a.config.h_size]),np.zeros([1,model_a.config.h_size]))
                            for l in range(path_dnc_val.shape[0]):
                                cur_goal_state = path_dnc_val[l]
                                cur_goal_state = np.expand_dims(cur_goal_state, 3)
                                cur_goal_state = np.concatenate([np.rot90(cur_goal_state, 0), np.rot90(cur_goal_state, 1),
                                    np.rot90(cur_goal_state, 2), np.rot90(cur_goal_state, 3)], 3)
                                model_a.env.teacher.set_goal(cur_goal_state, tgt_loc)
                                reward_list[-1] += model_a.navi_goal(h_state_a, cur_goal_state)
                            if model_a.env.teacher.goal_finish:
                                reward_list[-1] += 10
                        reward += sum(reward_list)/len(reward_list)
                    # store everything into replay buffer
                    replay_buffer.store_effect(idx, action, agate_dnc_val, reward, done)

                    t += 1
                    last_eval += 1
                    last_record += 1

                    # perform a training step
                    loss_eval, grad_eval = self.train_step(t, replay_buffer, lr_schedule.epsilon)

                    # logging stuff
                    if ((t > self.config.learning_start) and (t % self.config.log_freq == 0) and
                       (t % self.config.learning_freq == 0)):
                        self.update_averages(rewards, max_q_values, q_values, scores_eval)
                        exp_schedule.update(t)
                        lr_schedule.update(t)
                        if len(rewards) > 0:
                            prog.update(t + 1, exact=[("Loss", loss_eval), ("Avg R", self.avg_reward), 
                                            ("Max R", np.max(rewards)), ("eps", exp_schedule.epsilon), 
                                            ("Grads", grad_eval), ("Max Q", self.max_q), 
                                            ("lr", lr_schedule.epsilon)])

                    elif (t < self.config.learning_start) and (t % self.config.log_freq == 0):
                        sys.stdout.write("\rPopulating the memory {}/{}...".format(t, 
                                                            self.config.learning_start))
                        sys.stdout.flush()

                # count reward
                total_reward += reward
                if done or t >= self.config.nsteps_train:
                    break

            # updates to perform at the end of an episode
            rewards.append(total_reward)          

            if (t > self.config.learning_start) and (last_eval > self.config.eval_freq):
                # evaluate our policy
                last_eval = 0
                print("")
                self.logger.info("Global step: %d"%(t))
                scores_eval += [self.evaluate(model_a)]

        # last words
        self.logger.info("- Training done.")
        self.save(t)
        scores_eval += [self.evaluate(model_a)]
        export_plot(scores_eval, "Scores", self.config.plot_output)

    def evaluate(self, model_a, env=None, num_episodes=None):
        """
        Evaluation with the same procedure as training
        """
        # log our activity only if default call
        if num_episodes is None:
            self.logger.info("Evaluating...")

        # arguments defaults
        if num_episodes is None:
            num_episodes = self.config.num_episodes_test

        if env is None:
            env = self.env
        env.state.is_render_image = self.config.render_test
        model_a.env.state.is_render_image = model_a.config.render_test
        rewards = []
        orientation_map = [np.array([0, 1]), np.array([-1, 0]), np.array([0, -1]), np.array([1, 0])]
        #### visualize landmark in a fixed scene ####
        if self.config.deploy_only and self.config.vis_heat_map:
            width, height = env.state.xmap.dim['width'], env.state.xmap.dim['height']
            goal_heatmap = np.zeros([height, width])
            goal_heatmap_norm = np.zeros([height, width])
        #############################################
        npath = self.config.npath # paths to generate in each environment
        nquery = self.config.nquery # query to generate in each environment
        max_plan_len = self.config.max_plan_len
        ndigits = self.config.ndigits
        nway = self.config.nway
        num_classes = len(self.env.state.xmap.item_class_id)
        for ii in range(num_episodes):
            total_reward = 0
            env.reset()
            model_a.env.reset()
            model_a.env.state.copy_state(model_a.env.agent, env.state)
            dnc_state = DNC.zero_state(self.config, batch_size=1)
            h_state = (np.zeros([1,self.config.h_size]),np.zeros([1,self.config.h_size]))
            slen = np.ones(1).astype('int32')
            action = 0

            # sample paths
            for i in range(npath):
                state_seq, path_loc, path_ori = env.teacher.gen_sample_seq(env.state)
                state_seq_encoding = DRQN_planner.encode_state(state_seq, ndigits, nway)
                goal_state_seq = np.reshape(state_seq, [state_seq.shape[0], 4, 3, 3, num_classes+2]).astype('bool')
                #### missing could be everything ####
                goal_state_seq = np.tile(goal_state_seq[:,:,:,:,[num_classes]], [1,1,1,1,num_classes+2])+goal_state_seq
                #### treat missing observation as correct observation ####
                goal_state_seq[:,:,:,:,num_classes] = True
                #### transpose
                goal_state_seq = np.transpose(goal_state_seq, [0,2,3,4,1])
                path_len = state_seq.shape[0]
                mask_seq = np.logical_not(state_seq[:,:3,:,num_classes])
                flag_seq = np.zeros([path_len])
                flag_seq[-1] = 1
                model_a.env.state.teleport(model_a.env.agent, path_loc[0], orientation_map[path_ori[0]])

                for j in range(path_len):
                    # get agate from dnc
                    cur_dnc_in = np.concatenate([state_seq_encoding[j].reshape(-1),mask_seq[j].reshape(-1), np.array([0, flag_seq[j]])], axis=0)
                    agate_dnc_val = self.sess.run(self.agate_dnc, feed_dict={self.s_dnc: cur_dnc_in[None], self.hs_dnc: dnc_state})
                    agate_dnc_val = agate_dnc_val[0,0]
                    # get q value and sample action
                    action, q_values, h_state = self.get_action(state_seq[j][None][None], h_state, slen, [action], [agate_dnc_val])
                    # take action and update dnc
                    cur_dnc_in[-2] = action
                    dnc_state = self.sess.run(self.hs_out_dnc, feed_dict={self.s_dnc: cur_dnc_in[None], self.hs_dnc: dnc_state})                    
                    #### visualize landmark in a fixed scene ####
                    if self.config.deploy_only and self.config.vis_heat_map:
                        # assumption: only record heat-map hits when the current path location is inside the map
                        cur_loc = np.asarray(path_loc[j])
                        if (cur_loc >= 0).sum() == 2 and (cur_loc < [width, height]).sum() == 2:
                            goal_heatmap_norm[path_loc[j][1], path_loc[j][0]] += 1
                            if action==1:
                                goal_heatmap[path_loc[j][1], path_loc[j][0]] += 1
                    
            reward_list = list()
            for k in range(nquery):
                reward_list.append(0)
                src_inputs, tgt_inputs, src_loc, tgt_loc, goal_obs_onehot_state = self.env.teacher.gen_sample_query(self.env.state)
                src_inputs = DRQN_planner.encode_state(src_inputs, ndigits, nway)
                tgt_inputs = DRQN_planner.encode_state(tgt_inputs, ndigits, nway)
                path_dnc_val, target_ldm_dnc_val = self.sess.run([self.path_dnc, self.target_ldm_dnc], feed_dict={self.hs_dnc: dnc_state, self.src_inputs_dnc: src_inputs[None], 
                    self.tgt_inputs_dnc: tgt_inputs[None], self.max_len_dnc: max_plan_len})
                path_dnc_val = DRQN_planner.decode_state(np.reshape(path_dnc_val[0], [max_plan_len, 3, 3, -1]), ndigits, nway, num_classes+2)
                target_ldm_dnc_val = DRQN_planner.decode_state(np.reshape(target_ldm_dnc_val[0], [3, 3, -1]), ndigits, nway, num_classes+2)
                path_dnc_val_inner = np.argmax(path_dnc_val, axis=3)
                target_ldm_dnc_val_inner = np.argmax(target_ldm_dnc_val, axis=2)
                cur_len = max_plan_len
                for l in range(max_plan_len):
                    if (path_dnc_val_inner[l]==target_ldm_dnc_val_inner).all():
                        cur_len = l+1
                        break
                path_dnc_val = path_dnc_val[:cur_len]
                path_dnc_val = np.concatenate([path_dnc_val, goal_obs_onehot_state[None]], 0)
                #### modify goal state ####
                #### missing could be everything ####
                path_dnc_val = np.tile(path_dnc_val[:,:,:,[num_classes]], [1,1,1,num_classes+2])+path_dnc_val
                #### treat missing observation as correct observation ####
                path_dnc_val[:,:,:,num_classes] = True
                model_a.env.state.teleport(model_a.env.agent, src_loc, np.array([0,1]))
                h_state_a = (np.zeros([1,model_a.config.h_size]),np.zeros([1,model_a.config.h_size]))
                for l in range(path_dnc_val.shape[0]):
                    cur_goal_state = path_dnc_val[l]
                    cur_goal_state = np.expand_dims(cur_goal_state, 3)
                    cur_goal_state = np.concatenate([np.rot90(cur_goal_state, 0), np.rot90(cur_goal_state, 1),
                        np.rot90(cur_goal_state, 2), np.rot90(cur_goal_state, 3)], 3)
                    model_a.env.teacher.set_goal(cur_goal_state, tgt_loc)
                    reward_list[-1] += model_a.navi_goal(h_state_a, cur_goal_state)
                if model_a.env.teacher.goal_finish:
                    reward_list[-1] += 10
            total_reward = sum(reward_list)/len(reward_list)
                    
            # updates to perform at the end of an episode
            rewards.append(total_reward)

        avg_reward = np.mean(rewards)
        sigma_reward = np.sqrt(np.var(rewards) / len(rewards))
        #### visualize landmark in a fixed scene ####
        if self.config.deploy_only and self.config.vis_heat_map:
            #goal_heatmap /= (goal_heatmap_norm+1e-6)
            plt.imshow(goal_heatmap, cmap='hot', interpolation='nearest')
            plt.show()
            plt.pause(100)
        #############################################

        if num_episodes > 1:
            msg = "Average reward: {:04.2f} +/- {:04.2f}".format(avg_reward, sigma_reward)
            self.logger.info(msg)

        return avg_reward

    @staticmethod
    def encode_state(state, ndigits, nway):
        # state of shape (~, one_hot_len)
        # encoding of shape (~, ndigits*nway)
        state_shape = list(np.shape(state))
        state_idx = np.argmax(np.reshape(state, [-1, state_shape[-1]]),1)
        encoding = np.zeros([len(state_idx), nway*ndigits])
        for i in range(ndigits):
            encoding[np.arange(encoding.shape[0]), state_idx%nway+i*nway] = 1
            state_idx = np.floor(state_idx/nway).astype('int32')
        state_shape[-1] = ndigits*nway
        encoding = encoding.reshape(state_shape)
        return encoding

    @staticmethod
    def decode_state(encoding, ndigits, nway, num_classes):
        state_shape = list(np.shape(encoding))
        state_idx = np.argmax(np.reshape(encoding, [-1, ndigits, nway]), 2)
        for i in range(ndigits-1):
            state_idx[:,i+1:] *= nway
        state_idx = np.sum(state_idx, axis=1)
        state_idx[state_idx>=num_classes] = num_classes-1
        state = np.zeros([len(state_idx), num_classes])
        state[np.arange(len(state_idx)),state_idx] = 1
        state_shape[-1] = num_classes
        state = state.reshape(state_shape)
        return state
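
    # Round-trip sketch for the base-nway digit encoding, assuming ndigits=2, nway=4 and
    # 16 one-hot classes (so every class index 0..15 fits into two base-4 digit blocks):
    #   state = np.eye(16)[[3, 7]]                                    # two one-hot rows, shape (2, 16)
    #   enc   = DRQN_planner.encode_state(state, ndigits=2, nway=4)   # shape (2, 8), one hot per digit block
    #   dec   = DRQN_planner.decode_state(enc, ndigits=2, nway=4, num_classes=16)
    #   assert np.array_equal(dec, state)                             # lossless while nway**ndigits >= num_classes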

    def run(self, model_a, exp_schedule, lr_schedule):
        """
        Apply the full training procedure for the Q network

        Args:
            model_a: navigation agent whose navi_goal() executes the proposed sub-goals
            exp_schedule: exploration strategy for epsilon
            lr_schedule: schedule for learning rate
        """
        # initialize
        self.initialize()
        # model
        self.train(model_a, exp_schedule, lr_schedule)

    def deploy(self, model_a):
        """
        Initialize (and optionally restore) the network, then run evaluation only

        Args:
            model_a: navigation agent whose navi_goal() executes the proposed sub-goals
        """
        # initialize
        self.initialize()
        # model
        self.evaluate(model_a)