Code example #1
		def create_normalizer():
			if len(self.args.obs_dims)==1:
				with tf.variable_scope('normalizer'):
					self.obs_normalizer = Normalizer(self.args.obs_dims, self.sess)
				self.obs_ph = self.obs_normalizer.normalize(self.raw_obs_ph)
				self.obs_next_ph = self.obs_normalizer.normalize(self.raw_obs_next_ph)
			else:
				self.obs_normalizer = None
				self.obs_ph = self.raw_obs_ph
				self.obs_next_ph = self.raw_obs_next_ph
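
Every class in this collection relies on a project-level `Normalizer(obs_dims, sess)` helper whose definition is not included here. Below is a minimal sketch of a running mean/std observation normalizer exposing the same `normalize()` and `update()` interface the examples call; it is an illustration written against TensorFlow 1.x, not the project's actual implementation.

import numpy as np
import tensorflow as tf

# Minimal running mean/std normalizer with the interface used by the examples:
# Normalizer(obs_dims, sess), .normalize(tensor), .update(batch). Illustrative only.
class Normalizer:
	def __init__(self, obs_dims, sess, eps=1e-2):
		self.sess = sess
		zeros = tf.zeros_initializer()
		# Accumulators: sum, sum of squares and sample count (count starts at 1 to avoid division by zero).
		self.sum = tf.get_variable('obs_sum', obs_dims, tf.float32, zeros, trainable=False)
		self.sumsq = tf.get_variable('obs_sumsq', obs_dims, tf.float32, zeros, trainable=False)
		self.count = tf.get_variable('obs_count', [], tf.float32, tf.ones_initializer(), trainable=False)
		self.sum_ph = tf.placeholder(tf.float32, obs_dims)
		self.sumsq_ph = tf.placeholder(tf.float32, obs_dims)
		self.count_ph = tf.placeholder(tf.float32, [])
		self.update_op = tf.group(
			tf.assign_add(self.sum, self.sum_ph),
			tf.assign_add(self.sumsq, self.sumsq_ph),
			tf.assign_add(self.count, self.count_ph))
		self.mean = self.sum / self.count
		self.std = tf.sqrt(tf.maximum(self.sumsq / self.count - tf.square(self.mean), eps))

	def normalize(self, obs_tensor):
		# Builds a graph node; called once while constructing the model.
		return (obs_tensor - self.mean) / self.std

	def update(self, obs_batch):
		# Feeds batch statistics into the accumulators (see normalizer_update below).
		obs_batch = np.asarray(obs_batch, dtype=np.float32)
		self.sess.run(self.update_op, {
			self.sum_ph: obs_batch.sum(axis=0),
			self.sumsq_ph: np.square(obs_batch).sum(axis=0),
			self.count_ph: float(obs_batch.shape[0])})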
Code example #2
class DDPG:
	def __init__(self, args):
		self.args = args
		self.create_model()

		self.train_info_pi = {
			'Pi_q_loss': self.pi_q_loss,
			'Pi_l2_loss': self.pi_l2_loss
		}
		self.train_info_q = {
			'Q_loss': self.q_loss
		}
		self.train_info = {**self.train_info_pi, **self.train_info_q}

		self.step_info = {
			'Q_average': self.q_pi
		}

	def create_model(self):
		def create_session():
			config = tf.ConfigProto()
			config.gpu_options.allow_growth = True
			self.sess = tf.Session(config=config)

		def create_inputs():
			self.raw_obs_ph = tf.placeholder(tf.float32, [None]+self.args.obs_dims)
			self.raw_obs_next_ph = tf.placeholder(tf.float32, [None]+self.args.obs_dims)
			self.acts_ph = tf.placeholder(tf.float32, [None]+self.args.acts_dims)
			self.rews_ph = tf.placeholder(tf.float32, [None, 1])

		def create_normalizer():
			with tf.variable_scope('normalizer'):
				self.obs_normalizer = Normalizer(self.args.obs_dims, self.sess)
			self.obs_ph = self.obs_normalizer.normalize(self.raw_obs_ph)
			self.obs_next_ph = self.obs_normalizer.normalize(self.raw_obs_next_ph)

		def create_network():
			def mlp_policy(obs_ph):
				with tf.variable_scope('net', initializer=tf.contrib.layers.xavier_initializer()):
					pi_dense1 = tf.layers.dense(obs_ph, 256, activation=tf.nn.relu, name='pi_dense1')
					pi_dense2 = tf.layers.dense(pi_dense1, 256, activation=tf.nn.relu, name='pi_dense2')
					pi_dense3 = tf.layers.dense(pi_dense2, 256, activation=tf.nn.relu, name='pi_dense3')
					pi = tf.layers.dense(pi_dense3, self.args.acts_dims[0], activation=tf.nn.tanh, name='pi')
				return pi

			def mlp_value(obs_ph, acts_ph):
				state_ph = tf.concat([obs_ph, acts_ph], axis=1)
				with tf.variable_scope('net', initializer=tf.contrib.layers.xavier_initializer()):
					q_dense1 = tf.layers.dense(state_ph, 256, activation=tf.nn.relu, name='q_dense1')
					q_dense2 = tf.layers.dense(q_dense1, 256, activation=tf.nn.relu, name='q_dense2')
					q_dense3 = tf.layers.dense(q_dense2, 256, activation=tf.nn.relu, name='q_dense3')
					q = tf.layers.dense(q_dense3, 1, name='q')
				return q

			with tf.variable_scope('main'):
				with tf.variable_scope('policy'):
					self.pi = mlp_policy(self.obs_ph)
				with tf.variable_scope('value'):
					self.q = mlp_value(self.obs_ph, self.acts_ph)
				with tf.variable_scope('value', reuse=True):
					self.q_pi = mlp_value(self.obs_ph, self.pi)

			with tf.variable_scope('target'):
				with tf.variable_scope('policy'):
					self.pi_t = mlp_policy(self.obs_next_ph)
				with tf.variable_scope('value'):
					self.q_t = mlp_value(self.obs_next_ph, self.pi_t)

		def create_operators():
			self.pi_q_loss = -tf.reduce_mean(self.q_pi)
			self.pi_l2_loss = self.args.act_l2*tf.reduce_mean(tf.square(self.pi))
			self.pi_optimizer = tf.train.AdamOptimizer(self.args.pi_lr)
			self.pi_train_op = self.pi_optimizer.minimize(self.pi_q_loss+self.pi_l2_loss, var_list=get_vars('main/policy'))

			if self.args.clip_return:
				return_value = tf.clip_by_value(self.q_t, self.args.clip_return_l, self.args.clip_return_r)
			else:
				return_value = self.q_t
			target = tf.stop_gradient(self.rews_ph+self.args.gamma*return_value)
			self.q_loss = tf.reduce_mean(tf.square(self.q-target))
			self.q_optimizer = tf.train.AdamOptimizer(self.args.q_lr)
			self.q_train_op = self.q_optimizer.minimize(self.q_loss, var_list=get_vars('main/value'))

			self.target_update_op = tf.group([
				v_t.assign(self.args.polyak*v_t + (1.0-self.args.polyak)*v)
				for v, v_t in zip(get_vars('main'), get_vars('target'))
			])

			self.saver = tf.train.Saver()
			self.init_op = tf.global_variables_initializer()
			self.target_init_op = tf.group([
				v_t.assign(v)
				for v, v_t in zip(get_vars('main'), get_vars('target'))
			])

		self.graph = tf.Graph()
		with self.graph.as_default():
			create_session()
			create_inputs()
			create_normalizer()
			create_network()
			create_operators()
		self.init_network()

	def init_network(self):
		self.sess.run(self.init_op)
		self.sess.run(self.target_init_op)

	def step(self, obs, explore=False, test_info=False):
		if (not test_info) and (self.args.buffer.steps_counter<self.args.warmup):
			return np.random.uniform(-1, 1, size=self.args.acts_dims)
		if self.args.goal_based: obs = goal_based_process(obs)

		# eps-greedy exploration
		if explore and np.random.uniform()<=self.args.eps_act:
			return np.random.uniform(-1, 1, size=self.args.acts_dims)

		feed_dict = {
			self.raw_obs_ph: [obs]
		}
		action, info = self.sess.run([self.pi, self.step_info], feed_dict)
		action = action[0]

		# uncorrelated Gaussian exploration
		if explore: action += np.random.normal(0, self.args.std_act, size=self.args.acts_dims)
		action = np.clip(action, -1, 1)

		if test_info: return action, info
		return action

	def step_batch(self, obs):
		actions = self.sess.run(self.pi, {self.raw_obs_ph:obs})
		return actions

	def feed_dict(self, batch):
		return {
			self.raw_obs_ph: batch['obs'],
			self.raw_obs_next_ph: batch['obs_next'],
			self.acts_ph: batch['acts'],
			self.rews_ph: batch['rews']
		}

	def train(self, batch):
		feed_dict = self.feed_dict(batch)
		info, _, _ = self.sess.run([self.train_info, self.pi_train_op, self.q_train_op], feed_dict)
		return info

	def train_pi(self, batch):
		feed_dict = self.feed_dict(batch)
		info, _ = self.sess.run([self.train_info_pi, self.pi_train_op], feed_dict)
		return info

	def train_q(self, batch):
		feed_dict = self.feed_dict(batch)
		info, _ = self.sess.run([self.train_info_q, self.q_train_op], feed_dict)
		return info

	def normalizer_update(self, batch):
		self.obs_normalizer.update(np.concatenate([batch['obs'], batch['obs_next']], axis=0))

	def target_update(self):
		self.sess.run(self.target_update_op)
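
For reference, the soft target synchronization performed by `target_update_op` above follows the Polyak rule v_t ← polyak·v_t + (1 − polyak)·v. The short self-contained numpy sketch below (with an illustrative `polyak` value, not the one read from `args`) shows how repeated soft updates move the target weights toward the main-network weights:

import numpy as np

polyak = 0.95                      # illustrative; the class reads args.polyak
v = np.array([1.0, -2.0, 0.5])     # "main" network parameters
v_t = np.zeros_like(v)             # "target" parameters, started from zeros here purely to make the averaging visible

for _ in range(100):
	v_t = polyak * v_t + (1.0 - polyak) * v    # same arithmetic as target_update_op

print(v_t)   # approaches v as the number of soft updates grows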
Code example #3
File: ddq6.py  Project: MouseHu/gem
class DDQ6(BaseLearner):
    def __init__(self, args, flags={}):
        super(DDQ6, self).__init__()
        self.args = args
        self.gpu = args.gpu
        self.flags = flags
        self.acts_num = args.acts_dims[0]
        self.inner_q_type = args.inner_q_type

        self.num_q = self.args.num_q
        self.tau = self.args.tau
        self.alpha = self.args.alpha
        self.beta = self.args.beta

        self.q_funcs = []
        self.q_pi_funcs = []
        self.target_q_funcs = []
        self.target_q_pi_funcs = []

        self.meta_q_funcs = []
        self.meta_q_pi_funcs = []
        self.target_meta_q_funcs = []
        self.target_meta_q_pi_funcs = []

        self.target_qs = None
        self.qs = None
        self.meta_q_funcs_stack = None
        self.meta_q_pi = None

        self.meta_target_check_range = None
        self.create_model()

        self.train_info = {
            'Q_loss': self.q_loss,
            'Meta_Q_loss': self.meta_q_loss,
            # 'Q_target_0': self.q_step_target[:, 0],
            # 'Q_target_1': self.q_step_target[:, 1],
            'difference': self.buffer_target_diffence,
            'target_range': self.target_check_range,
            'meta_target_range': self.meta_target_check_range,
            'regression_target': self.qvalues_ph,
            'true_return': self.true_rews_ph,
        }
        self.step_info = {
            'Q_average': self.meta_q_pi,
            'sub_Q_average': self.q_pi,
        }

        self.args.buffer.update_func(self)

    def create_model(self):
        # def create_session():
        #     config = tf.ConfigProto()
        #     config.gpu_options.allow_growth = True
        #     self.sess = tf.Session(config=config)

        def create_inputs():
            self.raw_obs_ph = tf.placeholder(tf.float32,
                                             [None] + self.args.obs_dims)
            self.em_raw_obs_ph = tf.placeholder(tf.float32,
                                                [None] + self.args.obs_dims)
            self.acts_ph = tf.placeholder(tf.float32,
                                          [None] + self.args.acts_dims + [1])
            self.rews_ph = tf.placeholder(tf.float32, [None, 1])
            self.true_rews_ph = tf.placeholder(tf.float32, [None, 1])
            self.done_ph = tf.placeholder(tf.float32, [None, 1])
            self.qvalues_ph = tf.placeholder(tf.float32, [None, 2])

        def create_normalizer():
            if len(self.args.obs_dims) == 1:
                with tf.variable_scope('normalizer'):
                    self.obs_normalizer = Normalizer(self.args.obs_dims,
                                                     self.sess)
                self.obs_ph = self.obs_normalizer.normalize(self.raw_obs_ph)
                self.em_obs_ph = self.obs_normalizer.normalize(
                    self.em_raw_obs_ph)
            else:
                self.obs_normalizer = None
                self.obs_ph = self.raw_obs_ph
                self.em_obs_ph = self.em_raw_obs_ph

        def create_network():

            value_net = self.mlp_value if len(
                self.args.obs_dims) == 1 else self.conv_value

            with tf.variable_scope('main'):
                with tf.variable_scope('sub'):
                    for i in range(self.num_q):
                        with tf.variable_scope('value_{}'.format(i)):
                            q = value_net(self.obs_ph)
                            q_pi = tf.reduce_max(q, axis=1, keepdims=True)
                            self.q_funcs.append(q)
                            self.q_pi_funcs.append(q_pi)
                with tf.variable_scope('meta'):
                    for i in range(2):
                        with tf.variable_scope('meta_value_{}'.format(i)):
                            q = value_net(self.obs_ph)
                            q_pi = tf.reduce_max(q, axis=1, keepdims=True)
                            self.meta_q_funcs.append(q)
                            self.meta_q_pi_funcs.append(q_pi)

                self.qs = tf.stack(self.q_funcs, axis=-1)
                self.q_funcs_stack = tf.reshape(
                    self.qs, [-1] + self.args.acts_dims + [2, self.num_q // 2])

                self.q_pi = tf.reduce_mean(tf.reduce_max(tf.reduce_mean(
                    self.q_funcs_stack, axis=-1),
                                                         axis=1),
                                           axis=-1)

                self.meta_q_funcs_stack = tf.stack(self.meta_q_funcs, axis=-1)
                self.q_step = tf.reduce_mean(self.meta_q_funcs_stack, axis=-1)
                self.meta_q_pi = tf.reduce_mean(tf.reduce_max(self.q_step,
                                                              axis=1),
                                                axis=-1)

            with tf.variable_scope('target'):
                with tf.variable_scope('sub'):
                    for i in range(self.num_q):
                        with tf.variable_scope('value_{}'.format(i)):
                            tar_q = value_net(self.em_obs_ph)
                            tar_q_pi = tf.reduce_max(tar_q,
                                                     axis=1,
                                                     keepdims=True)
                            self.target_q_funcs.append(tar_q)
                            self.target_q_pi_funcs.append(tar_q_pi)
                with tf.variable_scope('meta'):
                    for i in range(2):
                        with tf.variable_scope('meta_value_{}'.format(i)):
                            tar_q = value_net(self.em_obs_ph)
                            tar_q_pi = tf.reduce_max(tar_q,
                                                     axis=1,
                                                     keepdims=True)
                            self.target_meta_q_funcs.append(tar_q)
                            self.target_meta_q_pi_funcs.append(tar_q_pi)

                self.target_qs = tf.stack(self.target_q_funcs, axis=-1)
                self.target_q_funcs_stack = tf.reshape(
                    self.target_qs,
                    [-1] + self.args.acts_dims + [2, self.num_q // 2])
                if self.inner_q_type == "min":
                    self.em_q = tf.reduce_max(tf.reduce_min(
                        self.target_q_funcs_stack, axis=-1),
                                              axis=1)
                else:
                    self.em_q = tf.reduce_max(tf.reduce_mean(
                        self.target_q_funcs_stack, axis=-1),
                                              axis=1)
                # self.q_target_mean = tf.reduce_mean(self.em_q, axis=1, keepdims=True)
                # self.q_step_target = tf.reduce_mean(self.target_q_funcs_stack, axis=-1)

        def create_operators():

            self.target_q_max = tf.reduce_max(self.target_qs, axis=1)
            self.target = tf.stop_gradient(self.rews_ph +
                                           (1.0 - self.done_ph) *
                                           self.args.gamma * self.target_q_max)

            self.q_acts = tf.reduce_sum(self.qs * self.acts_ph,
                                        axis=1,
                                        keepdims=True)
            q_acts = tf.reshape(self.q_acts, shape=(-1, self.num_q))

            self.meta_q_acts = tf.reduce_sum(self.meta_q_funcs_stack *
                                             self.acts_ph,
                                             axis=1,
                                             keepdims=True)
            meta_q_acts = tf.reshape(self.meta_q_acts, shape=(-1, 2))
            self.meta_target_check_range = tf.reduce_max(
                tf.abs(self.qvalues_ph))
            self.target_check_range = tf.reduce_max(tf.abs(self.target))

            duplicate_qvalues = tf.stack(
                [self.qvalues_ph for _ in range(self.num_q // 2)], axis=-1)
            duplicate_qvalues = tf.reshape(duplicate_qvalues, (-1, self.num_q))
            self.buffer_target_diffence = tf.reduce_mean(
                (duplicate_qvalues - self.target)**2)
            # self.q_loss = tf.reduce_mean(tf.abs(q_acts - self.qvalues_ph))
            # self.q_loss = tf.reduce_mean(tf.nn.leaky_relu(q_acts - self.qvalues_ph, alpha=self.alpha)**2)
            self.q_loss = tf.losses.huber_loss(q_acts,
                                               self.target,
                                               reduction=Reduction.SUM)
            # self.q_loss = tf.losses.huber_loss(q_acts, self.target)

            sym_q_loss = huber_loss(meta_q_acts, self.qvalues_ph)
            overestimate = (meta_q_acts - self.qvalues_ph) > 0
            self.meta_q_loss = tf.reduce_sum(
                tf.where(overestimate, sym_q_loss, self.alpha * sym_q_loss))
            if self.args.optimizer == 'adam':
                self.q_optimizer = tf.train.AdamOptimizer(
                    self.args.q_lr, epsilon=self.args.Adam_eps)
                self.meta_q_optimizer = tf.train.AdamOptimizer(
                    self.args.q_lr, epsilon=self.args.Adam_eps)
            elif self.args.optimizer == 'rmsprop':
                self.q_optimizer = tf.train.RMSPropOptimizer(
                    self.args.q_lr,
                    decay=self.args.RMSProp_decay,
                    epsilon=self.args.RMSProp_eps)
                self.meta_q_optimizer = tf.train.RMSPropOptimizer(
                    self.args.q_lr,
                    decay=self.args.RMSProp_decay,
                    epsilon=self.args.RMSProp_eps)
            self.q_train_op = self.q_optimizer.minimize(
                self.q_loss, var_list=get_vars('main/sub'))
            self.meta_q_train_op = self.meta_q_optimizer.minimize(
                10 * self.meta_q_loss, var_list=get_vars('main/meta'))

            self.target_update_op = tf.group([
                v_t.assign(self.tau * v + (1 - self.tau) * v_t)
                for v, v_t in zip(get_vars('main'), get_vars('target'))
            ])

            self.saver = tf.train.Saver()
            self.init_op = tf.global_variables_initializer()
            self.target_init_op = tf.group([
                v_t.assign(v)
                for v, v_t in zip(get_vars('main'), get_vars('target'))
            ])

        self.graph = tf.Graph()

        with self.graph.as_default():
            # with tf.device("/gpu:{}".format(self.gpu)):
            self.create_session()
            create_inputs()
            create_normalizer()
            create_network()
            create_operators()
        self.init_network()

    def init_network(self):
        self.sess.run(self.init_op)
        self.sess.run(self.target_init_op)

    def step(self, obs, explore=False, test_info=False):
        if (not test_info) and (self.args.buffer.steps_counter <
                                self.args.warmup):
            return np.random.randint(self.acts_num)

        # eps-greedy exploration
        if explore and np.random.uniform() <= self.args.eps_act:
            return np.random.randint(self.acts_num)

        feed_dict = {
            self.raw_obs_ph: [obs / 255.0],
        }
        q_values, info = self.sess.run([self.q_step, self.step_info],
                                       feed_dict)
        # q_value = np.mean(q_values, axis=-1)
        action = np.argmax(q_values[0])

        if test_info: return action, info
        return action

    def step_with_q(self, obs, explore=False, target=True):

        feed_dict = {
            self.raw_obs_ph: [obs / 255.0],
            self.em_raw_obs_ph: [obs / 255.0],
        }
        q_values, q_values_target = self.sess.run([self.q_step, self.em_q],
                                                  feed_dict)
        # q_value = np.mean(q_values, axis=-1)
        action = np.argmax(q_values[0])

        q = q_values_target if target else q_values
        if self.args.buffer.steps_counter < self.args.warmup:
            return np.random.randint(self.acts_num), q

        # eps-greedy exploration
        if explore and np.random.uniform() <= self.args.eps_act:
            return np.random.randint(self.acts_num), q

        return action, q

    def feed_dict(self, batch):
        def one_hot(idx):
            idx = np.array(idx).reshape(-1)
            batch_size = idx.shape[0]
            res = np.zeros((batch_size, self.acts_num), dtype=np.float32)
            res[np.arange(batch_size), idx] = 1.0
            return res

        batch_obs = batch['obs0']
        batch_obs_next = batch['obs1']
        batch_actions = batch['actions']
        batch_rewards = batch['rewards']
        batch_next_obs = batch['obs1']  # unused duplicate of batch_obs_next
        batch_dones = batch['terminals1']
        batch_returns = batch['return']
        batch_true_returns = batch['true_return']
        # if self.num_q == 4:
        #     batch_returns = np.repeat(batch_returns, 2, axis=1)
        feed_dict = {
            self.raw_obs_ph: batch_obs,
            self.em_raw_obs_ph: batch_obs_next,
            self.acts_ph: one_hot(batch_actions)[..., np.newaxis],
            self.rews_ph: batch_rewards,
            self.done_ph: batch_dones,
            self.qvalues_ph: batch_returns,
            self.true_rews_ph: batch_true_returns,
        }

        return feed_dict

    def train(self, batch):
        feed_dict = self.feed_dict(batch)
        info, _, _ = self.sess.run(
            [self.train_info, self.q_train_op, self.meta_q_train_op],
            feed_dict)
        return info

    def test_q(self, batch):
        feed_dict = self.feed_dict(batch)
        q_loss, meta_q_loss = self.sess.run([self.q_loss, self.meta_q_loss],
                                            feed_dict)
        return q_loss, meta_q_loss

    def normalizer_update(self, batch):
        if self.obs_normalizer is not None:
            self.obs_normalizer.update(
                np.concatenate([batch['obs'], batch['obs_next']], axis=0))

    def target_update(self):
        self.sess.run(self.target_update_op)

    def save_model(self, save_path):
        with self.graph.as_default():
            saver = tf.train.Saver()
            saver.save(self.sess, save_path)

    def load_model(self, load_path):
        with self.graph.as_default():
            saver = tf.train.Saver()
            saver.restore(self.sess, load_path)
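
The `meta_q_loss` built in `create_operators` above weights over- and under-estimation of the regression target differently: the error keeps full Huber weight when `meta_q_acts` exceeds `qvalues_ph` and is scaled by `alpha` otherwise. The numpy sketch below reproduces that asymmetry with a standard Huber form and an illustrative `alpha`; the project's own `huber_loss` helper may differ in detail.

import numpy as np

def huber(err, delta=1.0):
    # Standard Huber: quadratic near zero, linear beyond delta.
    abs_err = np.abs(err)
    return np.where(abs_err <= delta, 0.5 * err ** 2, delta * (abs_err - 0.5 * delta))

alpha = 0.5                                  # illustrative; the class reads args.alpha
meta_q = np.array([1.5, 0.2, -0.3])          # predicted meta Q-values for the taken actions
target = np.array([1.0, 0.5, -0.3])          # regression targets (qvalues_ph)

err = meta_q - target
sym_loss = huber(err)
# Overestimation keeps full weight, underestimation is scaled by alpha,
# mirroring tf.where(overestimate, sym_q_loss, alpha * sym_q_loss).
loss = np.sum(np.where(err > 0, sym_loss, alpha * sym_loss))
print(loss)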
Code example #4
class LowRankDQN:
	def __init__(self, args, flags={}):
		self.args = args
		self.flags = flags
		self.acts_num = args.acts_dims[0]
		self.create_model()

		self.train_info = {
			'Q_loss': self.q_loss,
			'Q_L1_loss': self.q_l1_loss,
			'Q_sparsity': self.q_sparse_abs_avg,
			'target_range': self.target_check_range
		}
		self.step_info = {
			'Q_average': self.q_pi
		}

		if self.args.learn[-2:]=='lb':
			self.train_info = {
				**self.train_info,
				**{
					'Q_target': self.target,
					'Q_LB': self.q_lb_ph,
					'LB_ratio': self.lb_ratio
				}
			}

	def create_model(self):
		def create_session():
			config = tf.ConfigProto()
			config.gpu_options.allow_growth = True
			self.sess = tf.Session(config=config)

		def create_inputs():
			self.raw_obs_ph = tf.placeholder(tf.float32, [None]+self.args.obs_dims)
			self.raw_obs_next_ph = tf.placeholder(tf.float32, [None]+self.args.obs_dims)
			self.acts_ph = tf.placeholder(tf.float32, [None]+self.args.acts_dims)
			self.rews_ph = tf.placeholder(tf.float32, [None, 1])
			self.done_ph = tf.placeholder(tf.float32, [None, 1])

			if self.args.learn[-2:]=='lb':
				self.q_lb_ph = tf.placeholder(tf.float32, [None, 1])

		def create_normalizer():
			if len(self.args.obs_dims)==1:
				with tf.variable_scope('normalizer'):
					self.obs_normalizer = Normalizer(self.args.obs_dims, self.sess)
				self.obs_ph = self.obs_normalizer.normalize(self.raw_obs_ph)
				self.obs_next_ph = self.obs_normalizer.normalize(self.raw_obs_next_ph)
			else:
				self.obs_normalizer = None
				self.obs_ph = self.raw_obs_ph
				self.obs_next_ph = self.raw_obs_next_ph

		def create_network():
			def mlp_value(obs_ph):
				with tf.variable_scope('net', initializer=tf.contrib.layers.xavier_initializer()):
					q_dense1 = tf.layers.dense(obs_ph, 256, activation=tf.nn.relu, name='q_dense1')
					q_dense2 = tf.layers.dense(q_dense1, 256, activation=tf.nn.relu, name='q_dense2')
					q = tf.layers.dense(q_dense2, self.acts_num, name='q')
				return q

			def conv_value(obs_ph):
				with tf.variable_scope('net', initializer=tf.contrib.layers.xavier_initializer()):
					q_conv1 = tf.layers.conv2d(obs_ph, 32, 8, 4, 'same', activation=tf.nn.relu, name='q_conv1')
					q_conv2 = tf.layers.conv2d(q_conv1, 64, 4, 2, 'same', activation=tf.nn.relu, name='q_conv2')
					q_conv3 = tf.layers.conv2d(q_conv2, 64, 3, 1, 'same', activation=tf.nn.relu, name='q_conv3')
					q_conv3_flat = tf.layers.flatten(q_conv3)

					q_dense_act = tf.layers.dense(q_conv3_flat, 512, activation=tf.nn.relu, name='q_dense_act')

					q_lowrank_hidden = tf.layers.dense(q_dense_act, self.args.rank, activation=tf.nn.relu, name='q_lowrank_hidden')
					q_lowrank_act = tf.layers.dense(q_lowrank_hidden, self.acts_num, name='q_lowrank_act')
					q_sparse_act = tf.layers.dense(q_dense_act, self.acts_num, name='q_sparse_act')

				return q_lowrank_act+q_sparse_act, q_sparse_act

			value_net = mlp_value if len(self.args.obs_dims)==1 else conv_value

			with tf.variable_scope('main'):
				with tf.variable_scope('value'):
					self.q, self.q_sparse = value_net(self.obs_ph)
					self.q_sparse_avg = tf.reduce_mean(self.q_sparse)
					self.q_sparse_abs_avg = tf.reduce_mean(tf.abs(self.q_sparse))
					self.q_pi = tf.reduce_max(self.q, axis=1, keepdims=True)
				if self.args.double:
					with tf.variable_scope('value', reuse=True):
						self.q_next, _ = value_net(self.obs_next_ph)
						self.pi_next = tf.one_hot(tf.argmax(self.q_next, axis=1), self.acts_num, dtype=tf.float32)

			with tf.variable_scope('target'):
				with tf.variable_scope('value'):
					if self.args.double:
						self.q_t = tf.reduce_sum(value_net(self.obs_next_ph)[0]*self.pi_next, axis=1, keepdims=True)
					else:
						self.q_t = tf.reduce_max(value_net(self.obs_next_ph)[0], axis=1, keepdims=True)

		def create_operators():
			self.target = tf.stop_gradient(self.rews_ph+(1.0-self.done_ph)*(self.args.gamma**self.args.nstep)*self.q_t)
			target = self.target
			if self.args.learn[-2:]=='lb':
				self.lb_ratio = tf.less(target, self.q_lb_ph)
				target = tf.maximum(target, self.q_lb_ph)
			self.target_check_range = tf.reduce_max(tf.abs(target))
			self.q_acts = tf.reduce_sum(self.q*self.acts_ph, axis=1, keepdims=True)
			self.q_loss = tf.losses.huber_loss(target, self.q_acts) + self.args.beta*tf.losses.huber_loss(0, self.q_sparse_avg)
			self.q_l1_loss = tf.reduce_mean(tf.abs(target-self.q_acts))
			if self.args.optimizer=='adam':
				self.q_optimizer = tf.train.AdamOptimizer(self.args.q_lr, epsilon=self.args.Adam_eps)
			elif self.args.optimizer=='rmsprop':
				self.q_optimizer = tf.train.RMSPropOptimizer(self.args.q_lr, decay=self.args.RMSProp_decay, epsilon=self.args.RMSProp_eps)
			self.q_train_op = self.q_optimizer.minimize(self.q_loss, var_list=get_vars('main/value'))

			self.target_update_op = tf.group([
				v_t.assign(v)
				for v, v_t in zip(get_vars('main'), get_vars('target'))
			])

			self.saver = tf.train.Saver()
			self.init_op = tf.global_variables_initializer()
			self.target_init_op = tf.group([
				v_t.assign(v)
				for v, v_t in zip(get_vars('main'), get_vars('target'))
			])

		self.graph = tf.Graph()
		with self.graph.as_default():
			create_session()
			create_inputs()
			create_normalizer()
			create_network()
			create_operators()
		self.init_network()

	def init_network(self):
		self.sess.run(self.init_op)
		self.sess.run(self.target_init_op)

	def step(self, obs, explore=False, test_info=False):
		if (not test_info) and (self.args.buffer.steps_counter<self.args.warmup):
			return np.random.randint(self.acts_num)

		# eps-greedy exploration
		if explore and np.random.uniform()<=self.args.eps_act:
			return np.random.randint(self.acts_num)

		feed_dict = {
			self.raw_obs_ph: [obs/255.0]
		}
		q_value, info = self.sess.run([self.q, self.step_info], feed_dict)
		action = np.argmax(q_value[0])

		if test_info: return action, info
		return action

	def feed_dict(self, batch):
		def one_hot(idx):
			idx = np.array(idx)
			batch_size = idx.shape[0]
			res = np.zeros((batch_size, self.acts_num), dtype=np.float32)
			res[np.arange(batch_size),idx] = 1.0
			return res

		feed_dict = {
			self.raw_obs_ph: np.array(batch['obs']),
			self.raw_obs_next_ph: np.array(batch['obs_next']),
			self.acts_ph: one_hot(batch['acts']),
			self.rews_ph: np.clip(np.array(batch['rews']), -self.args.rews_scale, self.args.rews_scale),
			self.done_ph: batch['done']
		}

		if self.args.learn[-2:]=='lb':
			feed_dict[self.q_lb_ph] = batch['rets']

		return feed_dict

	def train(self, batch):
		feed_dict = self.feed_dict(batch)
		info, _ = self.sess.run([self.train_info, self.q_train_op], feed_dict)
		return info

	def test_q(self, batch):
		feed_dict = self.feed_dict(batch)
		q_loss = self.sess.run(self.q_loss, feed_dict)
		return q_loss

	def normalizer_update(self, batch):
		if self.obs_normalizer is not None:
			self.obs_normalizer.update(np.concatenate([batch['obs'], batch['obs_next']], axis=0))

	def target_update(self):
		self.sess.run(self.target_update_op)

	def save_model(self, save_path):
		with self.graph.as_default():
			saver = tf.train.Saver()
			saver.save(self.sess, save_path)

	def load_model(self, load_path):
		with self.graph.as_default():
			saver = tf.train.Saver()
			saver.restore(self.sess, load_path)
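
The Bellman target assembled in `create_operators` above is an n-step bootstrap, r + (1 − done)·γ^nstep·Q_t, optionally clamped from below by the empirical return when `args.learn` ends in 'lb'. A small self-contained numpy sketch of that computation, with purely illustrative values:

import numpy as np

gamma, nstep = 0.99, 3                  # illustrative hyperparameters (args.gamma, args.nstep)
rews = np.array([[1.0], [0.0]])         # n-step rewards from the buffer
done = np.array([[0.0], [1.0]])         # termination flags
q_t  = np.array([[2.0], [5.0]])         # target-network values at obs_next
q_lb = np.array([[3.0], [0.0]])         # return-based lower bound (batch['rets'])

target = rews + (1.0 - done) * (gamma ** nstep) * q_t
target_lb = np.maximum(target, q_lb)    # applied only in the 'lb' variant
print(target)
print(target_lb)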