def __init__(
            self,
            env,
            q_func,
            optimizer_spec,
            session,
            exploration=LinearSchedule(1000000, 0.1),
            stopping_criterion=None,
            replay_buffer_size=1000000,
            batch_size=32,
            gamma=0.99,
            learning_starts=50000,
            learning_freq=4,
            frame_history_len=4,
            target_update_freq=10000,
            grad_norm_clipping=10,
            rew_file=None,
            double_q=True,
            lander=False,
            explore='e-greedy',
            ex2=False,
            min_replay_size=10000,
            # window of recent transitions used as EX2 positives / retrain period
            ex2_len=1000,
            coef=0.01,
            seed=250,
            evaluation=False,
            directory='./models/model1'):
        """Run Deep Q-learning algorithm.

    You can specify your own convnet using q_func.

    All schedules are w.r.t. total number of steps taken in the environment.

    Parameters
    ----------
    env: gym.Env
        gym environment to train on.
    q_func: function
        Model to use for computing the q function. It should accept the
        following named arguments:
            img_in: tf.Tensor
                tensorflow tensor representing the input image
            num_actions: int
                number of actions
            scope: str
                scope in which all the model related variables
                should be created
            reuse: bool
                whether previously created variables should be reused.
    optimizer_spec: OptimizerSpec
        Specifying the constructor and kwargs, as well as learning rate schedule
        for the optimizer
    session: tf.Session
        tensorflow session to use.
    exploration: rl_algs.deepq.utils.schedules.Schedule
        schedule for probability of choosing a random action.
    stopping_criterion: (env, t) -> bool
        should return true when it's ok for the RL algorithm to stop.
        takes in env and the number of steps executed so far.
    replay_buffer_size: int
        How many memories to store in the replay buffer.
    batch_size: int
        How many transitions to sample each time experience is replayed.
    gamma: float
        Discount Factor
    learning_starts: int
        After how many environment steps to start replaying experiences
    learning_freq: int
        How many steps of environment to take between every experience replay
    frame_history_len: int
        How many past frames to include as input to the model.
    target_update_freq: int
        How many experience replay rounds (not steps!) to perform between
        each update to the target Q network
    grad_norm_clipping: float or None
        If not None gradients' norms are clipped to this value.
    double_q: bool
        If True, then use double Q-learning to compute target values. Otherwise, use vanilla DQN.
        https://papers.nips.cc/paper/3964-double-q-learning.pdf
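    lander: bool
        If True, observations are continuous (e.g. LunarLander) and are fed
        as float32 without the uint8 image pipeline.
    explore: str
        Exploration strategy: 'greedy', 'e-greedy', 'soft_q' (Boltzmann), or
        'bayesian' (dropout-based).
    ex2: bool
        If True, fit an EX2 exemplar model and add its novelty bonus to the
        stored rewards.
    min_replay_size: int
        Minimum number of transitions in the buffer before the EX2 model is
        trained.
    ex2_len: int
        Number of most recent transitions treated as EX2 positives; also the
        period (in steps) between EX2 retraining rounds.
    coef: float
        Weight of the EX2 novelty bonus added to the environment reward.
    seed: int
        Random seed passed to the exemplar model.
    evaluation: bool
        If True, restore pretrained models instead of building the training
        graph.
    directory: str
        Model directory (not used in the constructor shown here).

    Example
    -------
    A minimal usage sketch (assumes `atari_model` and `optimizer_spec` are
    defined elsewhere in this repo):

        learner = QLearner(env, atari_model, optimizer_spec, session)
        while not learner.stopping_criterion_met():
            learner.step_env()
            learner.update_model()
            learner.log_progress()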
    """
        assert type(env.observation_space) == gym.spaces.Box
        assert type(env.action_space) == gym.spaces.Discrete

        self.target_update_freq = target_update_freq
        self.optimizer_spec = optimizer_spec
        self.batch_size = batch_size
        self.learning_freq = learning_freq
        self.learning_starts = learning_starts
        self.stopping_criterion = stopping_criterion
        self.env = env
        self.session = session

        # Two separate graphs/sessions so two pretrained models can be
        # restored independently in evaluation mode.
        graph_1 = tf.Graph()
        graph_2 = tf.Graph()
        # Settings for Atari Ram
        tf_config = tf.ConfigProto(inter_op_parallelism_threads=1,
                                   intra_op_parallelism_threads=1)
        self.session1 = tf.Session(config=tf_config, graph=graph_1)
        self.session2 = tf.Session(config=tf_config, graph=graph_2)

        # print("AVAILABLE GPUS: ", get_available_gpus())

        self.exploration = exploration
        self.rew_file = str(
            uuid.uuid4()) + '.pkl' if rew_file is None else rew_file
        self.double_q = double_q
        self.explore = explore
        # EX2 bonus coefficient (values swept: 1e-3, 1e-4, 1e-5)
        self.coef = coef
        self.first_train = True
        self.first_train_itrs = int(5e3)
        self.train_itrs = int(1e3)
        self.ex2 = ex2
        self.min_replay_size = min_replay_size
        self.ex2_len = ex2_len
        self.count = 0
        self.seed = seed
        self.eval = evaluation
        print('eval?', self.eval)
        print('exploration strategy', explore)
        print('using ex2', ex2)
        print('using coef', coef)

        ###############
        # BUILD MODEL #
        ###############
        if len(self.env.observation_space.shape) == 1:
            # Low-dimensional observations (e.g. Atari RAM); this is the
            # case used for debugging.
            input_shape = self.env.observation_space.shape
        else:
            img_h, img_w, img_c = self.env.observation_space.shape
            input_shape = (img_h, img_w, frame_history_len * img_c)
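            # e.g. for 84x84 grayscale Atari frames with frame_history_len=4,
            # input_shape becomes (84, 84, 4).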
        self.num_actions = self.env.action_space.n
        if self.eval:
            # Model 1
            with graph_1.as_default():
                saver1 = tf.train.import_meta_graph(
                    './models/Jamesbond_soft_q_ex2_e4.meta')
                saver1.restore(self.session1,
                               './models/Jamesbond_soft_q_ex2_e4')
                self.obs_t_ph1 = tf.get_collection('obs_t_ph')[0]
                self.Temp1 = tf.get_collection('Temp')[0]
                self.keep_per1 = tf.get_collection('keep_per')[0]
                self.q_dist1 = tf.get_collection('q_dist')[0]
                self.q_t1 = tf.get_collection('q_t')[0]
                # Ex2
                if self.ex2:
                    self.ex2_in1_1 = tf.get_collection('ex2_in1')[0]
                    self.ex2_in2_1 = tf.get_collection('ex2_in2')[0]
                    self.ex2_dis_output1 = tf.get_collection(
                        'ex2_dis_output')[0]
                    self.ex2_prob1 = tf.get_collection('ex2_prob')[0]
            # Model 2
            with graph_2.as_default():
                saver2 = tf.train.import_meta_graph(
                    './models/Alien_soft_q_ex2_e4.meta')
                saver2.restore(self.session2, './models/Alien_soft_q_ex2_e4')
                self.obs_t_ph2 = tf.get_collection('obs_t_ph')[0]
                self.Temp2 = tf.get_collection('Temp')[0]
                self.keep_per2 = tf.get_collection('keep_per')[0]
                self.q_dist2 = tf.get_collection('q_dist')[0]
                self.q_t2 = tf.get_collection('q_t')[0]
                # Ex2
                if self.ex2:
                    self.ex2_in1_2 = tf.get_collection('ex2_in1')[0]
                    self.ex2_in2_2 = tf.get_collection('ex2_in2')[0]
                    self.ex2_dis_output2 = tf.get_collection(
                        'ex2_dis_output')[0]
                    self.ex2_prob2 = tf.get_collection('ex2_prob')[0]
            self.model_initialized = True

            print('restored and initialized the model')
        else:
            # set up placeholders
            # placeholder for current observation (or state)
            self.obs_t_ph = tf.placeholder(tf.float32 if lander else tf.uint8,
                                           [None] + list(input_shape))
            # placeholder for current action
            self.act_t_ph = tf.placeholder(tf.int32, [None])
            # placeholder for current reward
            self.rew_t_ph = tf.placeholder(tf.float32, [None])
            # placeholder for next observation (or state)
            self.obs_tp1_ph = tf.placeholder(
                tf.float32 if lander else tf.uint8, [None] + list(input_shape))
            # placeholder for end of episode mask
            # this value is 1 if the next state corresponds to the end of an episode,
            # in which case there is no Q-value at the next state; at the end of an
            # episode, only the current state reward contributes to the target, not the
            # next state Q-value (i.e. target is just rew_t_ph, not rew_t_ph + gamma * q_tp1)
            self.done_mask_ph = tf.placeholder(tf.float32, [None])

            # Observations are stored and fed as uint8, then cast to float on
            # the GPU: transferring uint8 moves 4x less data host-to-device
            # than float32.
            if lander:
                obs_t_float = self.obs_t_ph
                obs_tp1_float = self.obs_tp1_ph
            else:
                obs_t_float = tf.cast(self.obs_t_ph, tf.float32) / 255.0
                obs_tp1_float = tf.cast(self.obs_tp1_ph, tf.float32) / 255.0

            # Here, you should fill in your own code to compute the Bellman error. This requires
            # evaluating the current and next Q-values and constructing the corresponding error.
            # TensorFlow will differentiate this error for you, you just need to pass it to the
            # optimizer. See assignment text for details.

            # Your code should produce one scalar-valued tensor: total_error
            # This will be passed to the optimizer in the provided code below.

            # Your code should also produce two collections of variables:
            # q_func_vars
            # target_q_func_vars
            # These should hold all of the variables of the Q-function network and target network,
            # respectively. A convenient way to get these is to make use of TF's "scope" feature.
            # For example, you can create your Q-function network with the scope "q_func" like this:
            # <something> = q_func(obs_t_float, num_actions, scope="q_func", reuse=False)
            # And then you can obtain the variables like this:
            # q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='q_func')
            # Older versions of TensorFlow may require using "VARIABLES" instead of "GLOBAL_VARIABLES"

            # Tip: use huber_loss (from dqn_utils) instead of squared error when defining self.total_error
            ######

            # YOUR CODE HERE
            # For bayesian exploration: Add dropout value to the network
            # Get Q-function and target network
            self.keep_per = tf.placeholder(shape=None, dtype=tf.float32)
            if self.explore == 'bayesian':
                print('Bayesian variables defined!')
                dropout = True
            else:
                dropout = False

            # EX2
            if self.ex2:
                print('Use Exemplar Model')
                self.exemplar = Exemplar(input_dim=input_shape[0],
                                         seed=self.seed,
                                         eval=self.eval)

            q_t = q_func(obs_t_float,
                         self.num_actions,
                         scope='q_func',
                         reuse=False,
                         dropout=dropout,
                         keep_prob=self.keep_per)
            q_tp1 = q_func(obs_tp1_float,
                           self.num_actions,
                           scope='target_q_func_vars',
                           reuse=False,
                           dropout=dropout,
                           keep_prob=self.keep_per)

            # For Boltzmann exploration: the action distribution is a softmax
            # over Q-values at temperature Temp.
            if self.explore == 'soft_q':
                print('Boltzmann variables defined!')
                self.Temp = tf.placeholder(shape=None, dtype=tf.float32)
                # Equivalent formulation at Temp = 1:
                #   q_dist = exp(q_t - logsumexp(q_t, axis=1))
                self.q_dist = tf.nn.softmax(q_t / self.Temp)

            # Greedy action: argmax over Q-values.
            self.q_t_action = tf.argmax(q_t, axis=1)

            # Double Q-learning target
            if self.double_q:
                print('using double q learning')
                # NOTE: reuse=True is essential so the online network's
                # variables are shared rather than recreated. No stop_gradient
                # is needed: this copy only feeds an argmax (no gradient), and
                # the target network's variables are excluded from the
                # optimizer's var_list.
                q_tp1_target = q_func(obs_tp1_float,
                                      self.num_actions,
                                      scope='q_func',
                                      reuse=True)
                q_tp1_target_action = tf.argmax(q_tp1_target, axis=1)
                q_tp1_max = tf.reduce_sum(
                    q_tp1 * tf.one_hot(indices=q_tp1_target_action,
                                       depth=self.num_actions,
                                       on_value=1.0,
                                       off_value=0.0),
                    axis=1)
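                # i.e. y = r + gamma * Q_target(s', argmax_a Q_online(s', a)):
                # the online net (scope 'q_func') selects the action and the
                # target net (scope 'target_q_func_vars') evaluates it, which
                # reduces the overestimation bias of vanilla DQN.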
            else:
                # Soft maximum
                if self.explore == 'soft_q':
                    print('using soft q learning')
                    q_tp1_max = tf.reduce_logsumexp(q_tp1, 1)
                else:
                    q_tp1_max = tf.reduce_max(q_tp1, 1)

            # Bellman target: y = r + gamma * (1 - done) * q_tp1_max
            q_tp1 = gamma * (1.0 - self.done_mask_ph) * q_tp1_max
            target = self.rew_t_ph + q_tp1
            # Q_phi(s_i, a_i) for the actions actually taken.
            # NOTE: select with a one-hot mask and reduce_sum rather than
            # reduce_max, since Q-values can be negative.
            q_t_target = tf.reduce_sum(q_t * tf.one_hot(indices=self.act_t_ph,
                                                        depth=self.num_actions,
                                                        on_value=1.0,
                                                        off_value=0.0),
                                       axis=1)

            # Calculate loss
            self.total_error = target - q_t_target
            self.total_error = tf.reduce_mean(huber_loss(self.total_error))
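            # huber_loss comes from dqn_utils; a sketch of the assumed
            # standard definition (delta = 1):
            #   0.5 * x**2       if |x| <= 1
            #   |x| - 0.5        otherwise
            # so the loss is quadratic near zero but linear for large errors,
            # keeping outlier targets from dominating the gradient.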

            # Produce collections of variables to update separately
            q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                            scope='q_func')
            target_q_func_vars = tf.get_collection(
                tf.GraphKeys.GLOBAL_VARIABLES, scope='target_q_func_vars')
            # Debug helper: set to True to print tensor shapes and exit.
            if False:
                print(q_t.get_shape())
                print(q_tp1.get_shape())
                print(self.q_t_action.get_shape())
                print(self.done_mask_ph.get_shape())
                print(q_tp1_max.get_shape())
                print(q_t_target.get_shape())
                print(self.total_error.get_shape())
                exit()
            ######

            # construct optimization op (with gradient clipping)
            self.learning_rate = tf.placeholder(tf.float32, (),
                                                name="learning_rate")
            optimizer = self.optimizer_spec.constructor(
                learning_rate=self.learning_rate, **self.optimizer_spec.kwargs)
            self.train_fn = minimize_and_clip(optimizer,
                                              self.total_error,
                                              var_list=q_func_vars,
                                              clip_val=grad_norm_clipping)

            # update_target_fn will be called periodically to copy Q network to target Q network
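            # Pairing by sorted variable name assumes both scopes create the
            # same variables in the same structure, so names align 1:1.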
            update_target_fn = []
            for var, var_target in zip(
                    sorted(q_func_vars, key=lambda v: v.name),
                    sorted(target_q_func_vars, key=lambda v: v.name)):
                update_target_fn.append(var_target.assign(var))
            self.update_target_fn = tf.group(*update_target_fn)

        # construct the replay buffer
        self.replay_buffer = ReplayBuffer(replay_buffer_size,
                                          frame_history_len,
                                          lander=lander)
        self.replay_buffer_idx = None

        ###############
        # RUN ENV     #
        ###############
        if not self.eval:
            self.model_initialized = False
        self.num_param_updates = 0
        self.mean_episode_reward = -float('nan')
        self.best_mean_episode_reward = -float('inf')
        # last_obs initialized here
        self.last_obs = self.env.reset()
        self.log_every_n_steps = 10000
        self.timesteps = []
        self.mean_episode_rewards = []
        self.best_mean_episode_rewards = []

        self.start_time = None
        self.t = 0

        # Save graph endpoints to collections so evaluation mode can restore
        # them by name.
        if not self.eval:
            self.saver = tf.train.Saver()
            tf.add_to_collection('obs_t_ph', self.obs_t_ph)
            tf.add_to_collection('keep_per', self.keep_per)
            tf.add_to_collection('q_t', q_t)
            if self.explore == 'soft_q':
                # Temp and q_dist are only defined for Boltzmann exploration.
                tf.add_to_collection('Temp', self.Temp)
                tf.add_to_collection('q_dist', self.q_dist)
            if self.ex2:
                in1, in2, dis_output, prob = self.exemplar.model.predict_tensor(
                )
                tf.add_to_collection('ex2_in1', in1)
                tf.add_to_collection('ex2_in2', in2)
                tf.add_to_collection('ex2_dis_output', dis_output)
                tf.add_to_collection('ex2_prob', prob)

        if self.ex2 and not self.eval:
            self.exemplar.model.init_tf_sess(self.session)
            self.model_initialized = True
        """
class QLearner(object):
    def __init__(
            self,
            env,
            q_func,
            optimizer_spec,
            session,
            exploration=LinearSchedule(1000000, 0.1),
            stopping_criterion=None,
            replay_buffer_size=1000000,
            batch_size=32,
            gamma=0.99,
            learning_starts=50000,
            learning_freq=4,
            frame_history_len=4,
            target_update_freq=10000,
            grad_norm_clipping=10,
            rew_file=None,
            double_q=True,
            lander=False,
            explore='e-greedy',
            ex2=False,
            min_replay_size=10000,
            # not sure
            ex2_len=1000,
            coef=0.01,
            seed=250,
            evaluation=False,
            directory='./models/model1'):
        """Run Deep Q-learning algorithm.

    You can specify your own convnet using q_func.

    All schedules are w.r.t. total number of steps taken in the environment.

    Parameters
    ----------
    env: gym.Env
        gym environment to train on.
    q_func: function
        Model to use for computing the q function. It should accept the
        following named arguments:
            img_in: tf.Tensor
                tensorflow tensor representing the input image
            num_actions: int
                number of actions
            scope: str
                scope in which all the model related variables
                should be created
            reuse: bool
                whether previously created variables should be reused.
    optimizer_spec: OptimizerSpec
        Specifying the constructor and kwargs, as well as learning rate schedule
        for the optimizer
    session: tf.Session
        tensorflow session to use.
    exploration: rl_algs.deepq.utils.schedules.Schedule
        schedule for probability of chosing random action.
    stopping_criterion: (env, t) -> bool
        should return true when it's ok for the RL algorithm to stop.
        takes in env and the number of steps executed so far.
    replay_buffer_size: int
        How many memories to store in the replay buffer.
    batch_size: int
        How many transitions to sample each time experience is replayed.
    gamma: float
        Discount Factor
    learning_starts: int
        After how many environment steps to start replaying experiences
    learning_freq: int
        How many steps of environment to take between every experience replay
    frame_history_len: int
        How many past frames to include as input to the model.
    target_update_freq: int
        How many experience replay rounds (not steps!) to perform between
        each update to the target Q network
    grad_norm_clipping: float or None
        If not None gradients' norms are clipped to this value.
    double_q: bool
        If True, then use double Q-learning to compute target values. Otherwise, use vanilla DQN.
        https://papers.nips.cc/paper/3964-double-q-learning.pdf
    """
        assert type(env.observation_space) == gym.spaces.Box
        assert type(env.action_space) == gym.spaces.Discrete

        self.target_update_freq = target_update_freq
        self.optimizer_spec = optimizer_spec
        self.batch_size = batch_size
        self.learning_freq = learning_freq
        self.learning_starts = learning_starts
        self.stopping_criterion = stopping_criterion
        self.env = env

        # Double (need to modify)
        graph_1 = tf.Graph()
        graph_2 = tf.Graph()
        # Settings for Atari Ram
        tf_config = tf.ConfigProto(inter_op_parallelism_threads=1,
                                   intra_op_parallelism_threads=1)
        self.session1 = tf.Session(config=tf_config, graph=graph_1)
        self.session2 = tf.Session(config=tf_config, graph=graph_2)

        # print("AVAILABLE GPUS: ", get_available_gpus())

        self.exploration = exploration
        self.rew_file = str(
            uuid.uuid4()) + '.pkl' if rew_file is None else rew_file
        self.double_q = double_q
        self.explore = explore
        # EX2
        # [1e-3, 1e-4, 1e-5]
        self.coef = coef
        self.first_train = True
        self.first_train_itrs = int(5e3)
        self.train_itrs = int(1e3)
        self.ex2 = ex2
        self.min_replay_size = min_replay_size
        self.ex2_len = ex2_len
        self.count = 0
        self.seed = seed
        self.eval = evaluation
        print('eval?', self.eval)
        print('exploration strategy', explore)
        print('using ex2', ex2)
        print('using coef', coef)

        ###############
        # BUILD MODEL #
        ###############
        if len(self.env.observation_space.shape) == 1:
            # This means we are running on low-dimensional observations (e.g. RAM)
            # IT is what I am debugging on!
            input_shape = self.env.observation_space.shape
        else:
            img_h, img_w, img_c = self.env.observation_space.shape
            input_shape = (img_h, img_w, frame_history_len * img_c)
        self.num_actions = self.env.action_space.n
        if self.eval:
            # Model 1
            with graph_1.as_default():
                saver1 = tf.train.import_meta_graph(
                    './models/Jamesbond_soft_q_ex2_e4.meta')
                saver1.restore(self.session1,
                               './models/Jamesbond_soft_q_ex2_e4')
                self.obs_t_ph1 = tf.get_collection('obs_t_ph')[0]
                self.Temp1 = tf.get_collection('Temp')[0]
                self.keep_per1 = tf.get_collection('keep_per')[0]
                self.q_dist1 = tf.get_collection('q_dist')[0]
                self.q_t1 = tf.get_collection('q_t')[0]
                # Ex2
                if self.ex2:
                    self.ex2_in1_1 = tf.get_collection('ex2_in1')[0]
                    self.ex2_in2_1 = tf.get_collection('ex2_in2')[0]
                    self.ex2_dis_output1 = tf.get_collection(
                        'ex2_dis_output')[0]
                    self.ex2_prob1 = tf.get_collection('ex2_prob')[0]
            # Model 2
            with graph_2.as_default():
                saver2 = tf.train.import_meta_graph(
                    './models/Alien_soft_q_ex2_e4.meta')
                saver2.restore(self.session2, './models/Alien_soft_q_ex2_e4')
                self.obs_t_ph2 = tf.get_collection('obs_t_ph')[0]
                self.Temp2 = tf.get_collection('Temp')[0]
                self.keep_per2 = tf.get_collection('keep_per')[0]
                self.q_dist2 = tf.get_collection('q_dist')[0]
                self.q_t2 = tf.get_collection('q_t')[0]
                # Ex2
                if self.ex2:
                    self.ex2_in1_2 = tf.get_collection('ex2_in1')[0]
                    self.ex2_in2_2 = tf.get_collection('ex2_in2')[0]
                    self.ex2_dis_output2 = tf.get_collection(
                        'ex2_dis_output')[0]
                    self.ex2_prob2 = tf.get_collection('ex2_prob')[0]
            self.model_initialized = True
            # print('obs is here',self.obs_t_ph)
            # print(self.Temp)
            # print(self.keep_per)
            # print(self.q_dist)

            print('restored and initialized the model')
        else:
            # set up placeholders
            # placeholder for current observation (or state)
            self.obs_t_ph = tf.placeholder(tf.float32 if lander else tf.uint8,
                                           [None] + list(input_shape))
            # placeholder for current action
            self.act_t_ph = tf.placeholder(tf.int32, [None])
            # placeholder for current reward
            self.rew_t_ph = tf.placeholder(tf.float32, [None])
            # placeholder for next observation (or state)
            self.obs_tp1_ph = tf.placeholder(
                tf.float32 if lander else tf.uint8, [None] + list(input_shape))
            # placeholder for end of episode mask
            # this value is 1 if the next state corresponds to the end of an episode,
            # in which case there is no Q-value at the next state; at the end of an
            # episode, only the current state reward contributes to the target, not the
            # next state Q-value (i.e. target is just rew_t_ph, not rew_t_ph + gamma * q_tp1)
            self.done_mask_ph = tf.placeholder(tf.float32, [None])

            # casting to float on GPU ensures lower data transfer times.
            # TO-DO: WHY?
            if lander:
                obs_t_float = self.obs_t_ph
                obs_tp1_float = self.obs_tp1_ph
            else:
                obs_t_float = tf.cast(self.obs_t_ph, tf.float32) / 255.0
                obs_tp1_float = tf.cast(self.obs_tp1_ph, tf.float32) / 255.0

            # Here, you should fill in your own code to compute the Bellman error. This requires
            # evaluating the current and next Q-values and constructing the corresponding error.
            # TensorFlow will differentiate this error for you, you just need to pass it to the
            # optimizer. See assignment text for details.

            # Your code should produce one scalar-valued tensor: total_error
            # This will be passed to the optimizer in the provided code below.

            # Your code should also produce two collections of variables:
            # q_func_vars
            # target_q_func_vars
            # These should hold all of the variables of the Q-function network and target network,
            # respectively. A convenient way to get these is to make use of TF's "scope" feature.
            # For example, you can create your Q-function network with the scope "q_func" like this:
            # <something> = q_func(obs_t_float, num_actions, scope="q_func", reuse=False)
            # And then you can obtain the variables like this:
            # q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='q_func')
            # Older versions of TensorFlow may require using "VARIABLES" instead of "GLOBAL_VARIABLES"

            # Tip: use huber_loss (from dqn_utils) instead of squared error when defining self.total_error
            ######

            # YOUR CODE HERE
            # For bayesian exploration: Add dropout value to the network
            # Get Q-function and target network
            self.keep_per = tf.placeholder(shape=None, dtype=tf.float32)
            if self.explore == 'bayesian':
                print('Bayesian variables defined!')
                dropout = True
            else:
                dropout = False

            # EX2
            if self.ex2:
                print('Use Exemplar Model')
                self.exemplar = Exemplar(input_dim=input_shape[0],
                                         seed=self.seed,
                                         eval=self.eval)

            q_t = q_func(obs_t_float,
                         self.num_actions,
                         scope='q_func',
                         reuse=False,
                         dropout=dropout,
                         keep_prob=self.keep_per)
            q_tp1 = q_func(obs_tp1_float,
                           self.num_actions,
                           scope='target_q_func_vars',
                           reuse=False,
                           dropout=dropout,
                           keep_prob=self.keep_per)

            # For boltzmann exploration
            if self.explore == 'soft_q':
                print('Boltzman variables defined!')
                self.Temp = tf.placeholder(shape=None, dtype=tf.float32)
                # print(q_t)

                #value = tf.reduce_mean(q_t, 1)
                # print(value)

                # print(q_t - value)
                # print(self.q_dist)
                # exit()
                # self.q_dist = tf.nn.softmax(q_t/self.Temp)

                # # Old version
                # value = tf.log( tf.reduce_sum(tf.exp(q_t),1) )
                # self.q_dist = tf.exp(q_t - value)

                # New version
                self.q_dist = tf.nn.softmax(q_t / self.Temp)

            # Max operation
            self.q_t_action = tf.argmax(q_t, axis=1)
            # value = tf.reduce_mean(q_t)
            # self.q_t_action = tf.nn.softmax(q_t - value)

            # Specify double Q function difference
            if self.double_q:
                print('using double q learning')
                # TO-DO: VERY VERY IMPORTANT TO REUSE VAIRABLES
                # TO-DO: DO WE NEED TO SET GRADIENT NOT UPDATE
                q_tp1_target = q_func(obs_tp1_float,
                                      self.num_actions,
                                      scope='q_func',
                                      reuse=True)
                q_tp1_target_action = tf.argmax(q_tp1_target, axis=1)
                q_tp1_max = tf.reduce_sum(
                    q_tp1 * tf.one_hot(indices=q_tp1_target_action,
                                       depth=self.num_actions,
                                       on_value=1.0,
                                       off_value=0.0),
                    axis=1)
            else:
                # Soft maximum
                if self.explore == 'soft_q':
                    print('using soft q learning')
                    # q_tp1_max = tf.log( tf.reduce_sum(tf.exp(q_tp1),1) )
                    q_tp1_max = tf.reduce_logsumexp(q_tp1, 1)
                    # print(q_tp1_max)
                    # exit()
                else:
                    q_tp1_max = tf.reduce_max(q_tp1, 1)

            # Get target value
            q_tp1 = gamma * (1.0 - self.done_mask_ph) * q_tp1_max
            target = self.rew_t_ph + q_tp1
            # Get Q_fai(si,ai)
            # TO-DO: VERY VERY IMPORTANT! use reduce_sum instead of reduce_max since exist negative value
            q_t_target = tf.reduce_sum(q_t * tf.one_hot(indices=self.act_t_ph,
                                                        depth=self.num_actions,
                                                        on_value=1.0,
                                                        off_value=0.0),
                                       axis=1)

            # Calculate loss
            self.total_error = target - q_t_target
            self.total_error = tf.reduce_mean(huber_loss(self.total_error))

            # Produce collections of variables to update separately
            q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                            scope='q_func')
            target_q_func_vars = tf.get_collection(
                tf.GraphKeys.GLOBAL_VARIABLES, scope='target_q_func_vars')
            if 0:
                print(q_t.get_shape())
                print(q_tp1.get_shape())
                print(self.q_t_action.get_shape())
                print(self.done_mask_ph.get_shape())
                print(q_tp1_max.get_shape())
                print(q_tp1.get_shape())
                print(q_t_target.get_shape())
                print(self.total_error.get_shape())
                exit()
            ######

            # construct optimization op (with gradient clipping)
            self.learning_rate = tf.placeholder(tf.float32, (),
                                                name="learning_rate")
            optimizer = self.optimizer_spec.constructor(
                learning_rate=self.learning_rate, **self.optimizer_spec.kwargs)
            self.train_fn = minimize_and_clip(optimizer,
                                              self.total_error,
                                              var_list=q_func_vars,
                                              clip_val=grad_norm_clipping)

            # update_target_fn will be called periodically to copy Q network to target Q network
            update_target_fn = []
            for var, var_target in zip(
                    sorted(q_func_vars, key=lambda v: v.name),
                    sorted(target_q_func_vars, key=lambda v: v.name)):
                update_target_fn.append(var_target.assign(var))
            self.update_target_fn = tf.group(*update_target_fn)

        # construct the replay buffer
        self.replay_buffer = ReplayBuffer(replay_buffer_size,
                                          frame_history_len,
                                          lander=lander)
        self.replay_buffer_idx = None

        ###############
        # RUN ENV     #
        ###############
        if not self.eval:
            self.model_initialized = False
        self.num_param_updates = 0
        self.mean_episode_reward = -float('nan')
        self.best_mean_episode_reward = -float('inf')
        # last_obs intialized here
        self.last_obs = self.env.reset()
        self.log_every_n_steps = 10000
        self.timesteps = []
        self.mean_episode_rewards = []
        self.best_mean_episode_rewards = []

        self.start_time = None
        self.t = 0

        # EX2
        if not eval:
            self.saver = tf.train.Saver()
            tf.add_to_collection('obs_t_ph', self.obs_t_ph)
            tf.add_to_collection('Temp', self.Temp)
            tf.add_to_collection('keep_per', self.keep_per)
            tf.add_to_collection('q_dist', self.q_dist)
            tf.add_to_collection('q_t', q_t)
            if self.ex2:
                in1, in2, dis_output, prob = self.exemplar.model.predict_tensor(
                )
                tf.add_to_collection('ex2_in1', in1)
                tf.add_to_collection('ex2_in2', in2)
                tf.add_to_collection('ex2_dis_output', dis_output)
                tf.add_to_collection('ex2_prob', prob)

        if self.ex2 and not self.eval:
            self.exemplar.model.init_tf_sess(self.session)
            self.model_initialized = True
        """
    # eval
    if self.eval:
        print("Initialize Evaluation Mode")
        self.saver.restore(self.session, "./bstmodel/model.ckpt")
        self.model_initialized = True
        print("Initialized models")
    """

    def stopping_criterion_met(self):
        return self.stopping_criterion is not None and self.stopping_criterion(
            self.env, self.t)

    def step_env(self):
        ### 2. Step the env and store the transition
        # At this point, "self.last_obs" contains the latest observation that was
        # recorded from the simulator. Here, your code needs to store this
        # observation and its outcome (reward, next observation, etc.) into
        # the replay buffer while stepping the simulator forward one step.

        # At the end of this block of code, the simulator should have been
        # advanced one step, and the replay buffer should contain one more
        # transition.
        # Specifically, self.last_obs must point to the new latest observation.

        # Useful functions you'll need to call:
        # obs, reward, done, info = env.step(action)
        # this steps the environment forward one step
        # obs = env.reset()
        # this resets the environment if you reached an episode boundary.
        # Don't forget to call env.reset() to get a new observation if done
        # is true!!

        # Here "context" means the stack of recent frames (not just {s, a, r});
        # see encode_recent_observation in the replay buffer.

        # Note that you cannot use "self.last_obs" directly as input
        # into your network, since it needs to be processed to include context
        # from previous frames. You should check out the replay buffer
        # implementation in dqn_utils.py to see what functionality the replay
        # buffer exposes. The replay buffer has a function called
        # encode_recent_observation that will take the latest observation
        # that you pushed into the buffer and compute the corresponding
        # input that should be given to a Q network by appending some
        # previous frames.

        # Don't forget to include epsilon greedy exploration!

        # And remember that the first time you enter this loop, the model
        # may not yet have been initialized (but of course, the first step
        # might as well be random, since you haven't trained your net...)

        # EX2: count steps since the exemplar model was last retrained.
        self.count += 1
        # Store observation
        ret = self.replay_buffer.store_frame(self.last_obs)
        self.e_current_idx = ret

        # For exploration, the value will gradually decrease
        if self.explore == 'greedy':
            # print("using greedy exploration!")
            if (not self.model_initialized):
                action = np.random.randint(0, self.num_actions)
            else:
                recent_obs = self.replay_buffer.encode_recent_observation()
                action = self.session.run(self.q_t_action,
                                          feed_dict={
                                              self.obs_t_ph: [recent_obs],
                                              self.keep_per: 1.0
                                          })
                action = action[0]
        if self.explore == 'e-greedy':
            # print("using e-greedy exploration!")
            # Random.random return [0,1)
            if (not self.model_initialized) or (
                    random.random() < self.exploration.value(self.t)):
                action = np.random.randint(0, self.num_actions)
            else:
                # The encoded context includes at least two frames so that
                # velocity information is captured.
                # recent_obs shape: RAM (128,), lander (9,), Atari (84,84,4)
                # The action returned below has shape (1,).
                # Encode recent observation
                recent_obs = self.replay_buffer.encode_recent_observation()
                action = self.session.run(self.q_t_action,
                                          feed_dict={
                                              self.obs_t_ph: [recent_obs],
                                              self.keep_per: 1.0
                                          })
                action = action[0]
        if self.explore == 'soft_q':
            # print("using boltzmann exploration!")
            if (not self.model_initialized):
                action = np.random.randint(0, self.num_actions)
            else:
                recent_obs = self.replay_buffer.encode_recent_observation()
                q_t1 = self.session1.run(self.q_t1,
                                         feed_dict={
                                             self.obs_t_ph1: [recent_obs],
                                             self.Temp1: 1.0,
                                             self.keep_per1: 1.0
                                         })
                q_t2 = self.session2.run(self.q_t2,
                                         feed_dict={
                                             self.obs_t_ph2: [recent_obs],
                                             self.Temp2: 1.0,
                                             self.keep_per2: 1.0
                                         })
                ex1_out, ex_prob1 = self.session1.run(
                    [self.ex2_dis_output1, self.ex2_prob1],
                    feed_dict={
                        self.ex2_in1_1: [recent_obs],
                        self.ex2_in2_1: [recent_obs]
                    })
                ex2_out, ex_prob2 = self.session2.run(
                    [self.ex2_dis_output2, self.ex2_prob2],
                    feed_dict={
                        self.ex2_in1_2: [recent_obs],
                        self.ex2_in2_2: [recent_obs]
                    })
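                # np_softmax is a repo helper; assumed to be a numerically
                # stable softmax, roughly:
                #   e = np.exp(x - np.max(x)); e / np.sum(e)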
                # print( "q_t1 shape", q_t1.shape)
                #print([ex_prob1, ex_prob2])
                prob = np.clip([ex_prob1, ex_prob2], 0, 50)

                alphas = np_softmax(prob)
                # alphas = np.array([0.5, 0.5])
                # print("alphas shape:",alphas.shape)
                # alphas = np.array([1.0, 0.0])
                alphas = alphas[np.newaxis, :]
                # print("alpha:",alphas)
                q_t = np.concatenate((q_t1, q_t2))
                # print("q_t shape:", q_t.shape)
                q_t = np.dot(alphas, q_t)
                # print("q_t final shape", q_t.shape)
                q_dist = np_softmax(q_t[0])
                #print("q_t final shape", q_dist.shape)
                action = np.random.choice(self.num_actions, p=q_dist)

        if self.explore == 'bayesian':
            # print("using bayesian exploration!")
            if (not self.model_initialized):
                action = np.random.randint(0, self.num_actions)
            else:
                recent_obs = self.replay_buffer.encode_recent_observation()
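                # Dropout-based exploration: acting with dropout active
                # approximates sampling a Q-function from an approximate
                # posterior (Thompson-sampling style); the keep probability
                # anneals toward 1.0 as the exploration schedule decays.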
                keep_per = (1.0 - self.exploration.value(self.t)) + 0.1
                # Cap the keep probability at 1.0.
                keep_per = min(keep_per, 1.0)
                action = self.session.run(self.q_t_action,
                                          feed_dict={
                                              self.obs_t_ph: [recent_obs],
                                              self.keep_per: keep_per
                                          })
                action = action[0]

        # Step the environment forward one step; `action` must be an int.
        obs, reward, done, info = self.env.step(action)
        # Point to the newest observation
        if done:
            obs = self.env.reset()
        self.last_obs = obs
        # Store others
        self.replay_buffer.store_effect(ret, action, reward, done)

        # Update EX2 model and rewards
        if not self.eval:
            if self.ex2 and (self.replay_buffer.num_in_buffer >
                             self.min_replay_size) and (self.count >=
                                                        self.ex2_len):
                self.count = 0
                # fit ex2 model
                if self.first_train:
                    train_itrs = self.first_train_itrs
                    self.first_train = False
                else:
                    train_itrs = self.train_itrs
                for _ in range(train_itrs):
                    positive = self.replay_buffer.sample_positive(
                        self.ex2_len, 128)
                    negative = self.replay_buffer.sample_negative(
                        self.ex2_len, 128)
                    self.exemplar.fit(positive, negative)
                # update rewards
                paths = self.replay_buffer.get_all_positive(self.ex2_len)
                bonus_reward = self.exemplar.predict(paths)
                self.replay_buffer.update_reward(self.ex2_len, bonus_reward,
                                                 self.coef)
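                # Assumed semantics: Exemplar.predict returns a novelty score
                # per state (less familiar -> larger bonus), and update_reward
                # adds coef * bonus to the stored rewards of the most recent
                # ex2_len transitions.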

        # In evaluation mode the step counter advances here; during training
        # it advances in update_model.
        if self.eval:
            self.t += 1

    def update_model(self):
        ### 3. Perform experience replay and train the network.
        # Note: this is the most time-consuming part of the training loop.
        # note that this is only done if the replay buffer contains enough samples
        # for us to learn something useful -- until then, the model will not be
        # initialized and random actions should be taken
        if (self.t > self.learning_starts and \
            self.t % self.learning_freq == 0 and \
            self.replay_buffer.can_sample(self.batch_size)):
            # Here, you should perform training. Training consists of four steps:
            # 3.a: use the replay buffer to sample a batch of transitions (see the
            # replay buffer code for function definition, each batch that you sample
            # should consist of current observations, current actions, rewards,
            # next observations, and done indicator).
            # e.g. batch_size = 32; RAM observations each have shape (128,)
            obs_t_batch, act_batch, rew_batch, obs_tp1_batch, done_mask = self.replay_buffer.sample(
                self.batch_size)

            # 3.b: initialize the model if it has not been initialized yet; to do
            # that, call
            #    initialize_interdependent_variables(self.session, tf.global_variables(), {
            #        self.obs_t_ph: obs_t_batch,
            #        self.obs_tp1_ph: obs_tp1_batch,
            #    })
            # where obs_t_batch and obs_tp1_batch are the batches of observations at
            # the current and next time step. The boolean variable model_initialized
            # indicates whether or not the model has been initialized.
            # Remember that you have to update the target network too (see 3.d)!

            # Initialization happens only once, on the first training step.
            if not self.model_initialized:
                print("initializing model")
                if self.ex2:
                    # initialized in Siamese model
                    print("Ex2 no need to initialize")
                    pass
                else:
                    print("interdependent init")
                    initialize_interdependent_variables(
                        self.session, tf.global_variables(), {
                            self.obs_t_ph: obs_t_batch,
                            self.obs_tp1_ph: obs_tp1_batch,
                        })
                print("set model_initialized True")
                self.model_initialized = True

            # 3.c: train the model. To do this, you'll need to use the self.train_fn and
            # self.total_error ops that were created earlier: self.total_error is what you
            # created to compute the total Bellman error in a batch, and self.train_fn
            # will actually perform a gradient step and update the network parameters
            # to reduce total_error. When calling self.session.run on these you'll need to
            # populate the following placeholders:
            # self.obs_t_ph
            # self.act_t_ph
            # self.rew_t_ph
            # self.obs_tp1_ph
            # self.done_mask_ph
            # (this is needed for computing self.total_error)
            # self.learning_rate -- you can get this from self.optimizer_spec.lr_schedule.value(t)
            # (this is needed by the optimizer to choose the learning rate)
            _, error = self.session.run(
                [self.train_fn, self.total_error],
                feed_dict={
                    self.obs_t_ph: obs_t_batch,
                    self.act_t_ph: act_batch,
                    self.rew_t_ph: rew_batch,
                    self.obs_tp1_ph: obs_tp1_batch,
                    self.done_mask_ph: done_mask,
                    self.learning_rate:
                    self.optimizer_spec.lr_schedule.value(self.t),
                    self.keep_per: 1.0
                })
            # 3.d: periodically update the target network by calling
            # self.session.run(self.update_target_fn)
            # you should update every target_update_freq steps, and you may find the
            # variable self.num_param_updates useful for this (it was initialized to 0)
            #####
            # YOUR CODE HERE
            self.num_param_updates += 1
            if (self.num_param_updates % self.target_update_freq == 0):
                print("actually update")
                self.session.run(self.update_target_fn)
            # exit()

        self.t += 1

    def log_progress(self):
        episode_rewards = get_wrapper_by_name(self.env,
                                              "Monitor").get_episode_rewards()

        if len(episode_rewards) > 0:
            self.mean_episode_reward = np.mean(episode_rewards[-100:])

        if len(episode_rewards) > 50:
            if self.mean_episode_reward > self.best_mean_episode_reward:
                # store the best mean reward
                self.best_mean_episode_reward = self.mean_episode_reward
                if self.model_initialized and not self.eval:
                    # store the best model
                    save_path = self.saver.save(self.session, "./models/model")
                    print("Model saved in path: %s" % save_path)

        if self.t % self.log_every_n_steps == 0 and self.model_initialized:
            print("Timestep %d" % (self.t, ))
            print("mean reward (100 episodes) %f" % self.mean_episode_reward)
            print("best mean reward %f" % self.best_mean_episode_reward)
            print("episodes %d" % len(episode_rewards))
            print("exploration %f" % self.exploration.value(self.t))
            print("learning_rate %f" %
                  self.optimizer_spec.lr_schedule.value(self.t))
            if self.start_time is not None:
                print("running time %f" %
                      ((time.time() - self.start_time) / 60.))

            self.start_time = time.time()

            sys.stdout.flush()

            # Store variables
            self.timesteps.append(self.t)
            self.mean_episode_rewards.append(self.mean_episode_reward)
            self.best_mean_episode_rewards.append(
                self.best_mean_episode_reward)

            # Note: the results pickle is rewritten from scratch on every log
            # step; if fewer writes are desired, it could be written once at
            # the end instead.
            with open(self.rew_file, 'wb') as f:
                store_result = {
                    'timestep': np.array(self.timesteps),
                    'reward': np.array(episode_rewards),
                    'mean_reward': np.array(self.mean_episode_rewards),
                    'best_reward': np.array(self.best_mean_episode_rewards)
                }
                pickle.dump(store_result, f, pickle.HIGHEST_PROTOCOL)