class NeuralDualDice(base_algo.BaseAlgo):
    """Approximates the density ratio using neural networks."""

    def __init__(self,
                 parameters,
                 target_policy_config,
                 solve_for_state_action_ratio=True,
                 average_next_nu=True,
                 average_samples=1,
                 function_exponent=1.5):
        """Initializes the solver.

        Args:
          parameters: An object holding the common neural network parameters.
          target_policy_config: A dict describing the target policy network,
            with keys obs_dim, act_dim, hidden_layers_p, target_temp, seed,
            activation_p, and model_weights.
          solve_for_state_action_ratio: Whether to solve for the state-action
            density ratio. Defaults to True, which is recommended, since
            solving for the state density ratio requires importance weights
            that can introduce training instability.
          average_next_nu: Whether to take an empirical expectation over next
            nu. This can improve training stability.
          average_samples: Number of empirical samples to average over for the
            next-nu computation (only relevant in continuous environments).
          function_exponent: The form of the function f(x). We use a
            polynomial f(x) = |x|^p / p, where p is function_exponent.

        Raises:
          ValueError: If function_exponent is less than or equal to 1.
          NotImplementedError: If actions are continuous.
        """
        self._parameters = parameters
        self._solve_for_state_action_ratio = solve_for_state_action_ratio
        self._average_next_nu = average_next_nu
        self._average_samples = average_samples

        if not self._parameters.discrete_actions:
            raise NotImplementedError(
                'Continuous actions are not fully supported.')
        if function_exponent <= 1:
            raise ValueError('Exponent for f must be greater than 1.')

        # The conjugate of f(x) = |x|^p / p is f*(y) = |y|^q / q,
        # where q = p / (p - 1).
        conjugate_exponent = function_exponent / (function_exponent - 1)
        self._f = lambda x: tf.abs(x) ** function_exponent / function_exponent
        self._fstar = (
            lambda x: tf.abs(x) ** conjugate_exponent / conjugate_exponent)

        # Build and initialize the graph.
        self._build_graph()
        self._session = tf.compat.v1.Session()
        self._session.__enter__()

        obs_dim = target_policy_config['obs_dim']
        act_dim = target_policy_config['act_dim']
        hidden_layers_p = target_policy_config['hidden_layers_p']
        target_temp = target_policy_config['target_temp']
        seed = target_policy_config['seed']
        activation_p = target_policy_config['activation_p']
        model_weights = target_policy_config['model_weights']

        with self._session.as_default():
            self.target_policy = Q_Model_Tf(obs_dim, act_dim,
                                            hidden_layers=hidden_layers_p,
                                            temperature=target_temp,
                                            seed=seed,
                                            activation=activation_p)
            self._session.run(tf.compat.v1.global_variables_initializer())
            self.target_policy.load_weight(model_weights)

    def _build_graph(self):
        self._create_placeholders()

        # Convert discrete actions to one-hot vectors.
        action = tf.one_hot(self._action, self._parameters.action_dim)
        next_action = tf.one_hot(self._next_action,
                                 self._parameters.action_dim)
        initial_action = tf.one_hot(self._initial_action,
                                    self._parameters.action_dim)

        nu, next_nu, initial_nu, zeta = self._compute_values(
            action, next_action, initial_action)

        # The density ratio is given by the approximated zeta values.
        self._density_ratio = zeta

        delta_nu = nu - next_nu * self._parameters.gamma
        unweighted_zeta_loss = (delta_nu * zeta - self._fstar(zeta) -
                                (1 - self._parameters.gamma) * initial_nu)
        self._zeta_loss = -(
            tf.reduce_sum(input_tensor=self._weights * unweighted_zeta_loss) /
            tf.reduce_sum(input_tensor=self._weights))

        if self._parameters.deterministic_env and self._average_next_nu:
            # In a deterministic environment, skip the Fenchel-conjugate
            # trick and optimize the primal objective directly.
            unweighted_nu_loss = (self._f(delta_nu) -
                                  (1 - self._parameters.gamma) * initial_nu)
            self._nu_loss = (
                tf.reduce_sum(input_tensor=self._weights * unweighted_nu_loss)
                / tf.reduce_sum(input_tensor=self._weights))
        else:
            self._nu_loss = -self._zeta_loss

        self._train_nu_op = tf.compat.v1.train.AdamOptimizer(
            self._parameters.nu_learning_rate).minimize(
                self._nu_loss,
                var_list=tf.compat.v1.trainable_variables('nu'))
        self._train_zeta_op = tf.compat.v1.train.AdamOptimizer(
            self._parameters.zeta_learning_rate).minimize(
                self._zeta_loss,
                var_list=tf.compat.v1.trainable_variables('zeta'))
        self._train_op = tf.group(self._train_nu_op, self._train_zeta_op)

        # Debug quantity (should be close to 1).
        self._debug = (
            tf.reduce_sum(input_tensor=self._weights * self._density_ratio) /
            tf.reduce_sum(input_tensor=self._weights))

    def _create_placeholders(self):
        self._state = tf.compat.v1.placeholder(
            tf.float32, [None, self._parameters.state_dim], 'state')
        self._next_state = tf.compat.v1.placeholder(
            tf.float32, [None, self._parameters.state_dim], 'next_state')
        self._initial_state = tf.compat.v1.placeholder(
            tf.float32, [None, self._parameters.state_dim], 'initial_state')

        self._action = tf.compat.v1.placeholder(tf.int32, [None], 'action')
        self._next_action = tf.compat.v1.placeholder(
            tf.int32, [None], 'next_action')
        self._initial_action = tf.compat.v1.placeholder(
            tf.int32, [None], 'initial_action')

        # Target-policy sampling probabilities for the next state.
        self._target_policy_next_probs = tf.compat.v1.placeholder(
            tf.float32, [None, self._parameters.action_dim])

        self._weights = tf.compat.v1.placeholder(
            tf.float32, [None], 'weights')

    def _compute_values(self, action, next_action, initial_action):
        self.act_w = 1.0
        nu = self._nu_network(self._state, action)
        initial_nu = self._nu_network(self._initial_state, initial_action)

        if self._average_next_nu:
            # Average next nu over all actions, weighted by the target-policy
            # probabilities.
            all_next_actions = [
                tf.one_hot(act * tf.ones_like(self._next_action),
                           self._parameters.action_dim)
                for act in range(self._parameters.action_dim)]
            all_next_nu = [self._nu_network(self._next_state, next_action_i)
                           for next_action_i in all_next_actions]
            next_nu = sum(
                self._target_policy_next_probs[:, act_index] *
                all_next_nu[act_index]
                for act_index in range(self._parameters.action_dim))
        else:
            next_nu = self._nu_network(self._next_state, next_action)

        zeta = self._zeta_network(self._state, action)
        return nu, next_nu, initial_nu, zeta

    def _nu_network(self, state, action):
        with tf.compat.v1.variable_scope('nu', reuse=tf.compat.v1.AUTO_REUSE):
            inputs = tf.concat([state, action * self.act_w], -1)
            outputs = self._network(inputs)
        return outputs

    def _zeta_network(self, state, action):
        with tf.compat.v1.variable_scope('zeta',
                                         reuse=tf.compat.v1.AUTO_REUSE):
            inputs = tf.concat([state, action * self.act_w], -1)
            outputs = self._network(inputs)
        return outputs

    def _network(self, inputs):
        # Note: this network is built from raw variables rather than
        # tf.compat.v1.layers.dense, unlike MQL/MWL below.
        with tf.compat.v1.variable_scope('network',
                                         reuse=tf.compat.v1.AUTO_REUSE):
            prev_dim = int(inputs.shape[-1])
            prev_outputs = inputs

            # Hidden layers.
            for layer in range(self._parameters.hidden_layers):
                with tf.compat.v1.variable_scope(
                        'layer%d' % layer, reuse=tf.compat.v1.AUTO_REUSE):
                    weight = tf.compat.v1.get_variable(
                        'weight', [prev_dim, self._parameters.hidden_dim],
                        initializer=tf.compat.v1.glorot_uniform_initializer())
                    bias = tf.compat.v1.get_variable(
                        'bias',
                        initializer=tf.zeros([self._parameters.hidden_dim]))
                    pre_activation = tf.matmul(prev_outputs, weight) + bias
                    post_activation = self._parameters.activation(
                        pre_activation)
                prev_dim = self._parameters.hidden_dim
                prev_outputs = post_activation

            # Final layer.
            weight = tf.compat.v1.get_variable(
                'weight_final', [prev_dim, 1],
                initializer=tf.compat.v1.glorot_uniform_initializer())
            bias = tf.compat.v1.get_variable(
                'bias_final', [1],
                initializer=tf.compat.v1.zeros_initializer())
            output = tf.matmul(prev_outputs, weight) + bias
            return output[Ellipsis, 0]

    def _sample_data(self, dataset, sample_num):
        data_size = dataset['obs'].shape[0]
        init_size = dataset['init_obs'].shape[0]
        index = np.random.choice(data_size, sample_num)
        init_index = np.random.choice(init_size, sample_num)
        return {
            'obs': dataset['obs'][index],
            'acts': dataset['acts'][index],
            'next_obs': dataset['next_obs'][index],
            'next_acts': dataset['next_acts'][index],
            'init_obs': dataset['init_obs'][init_index],
            'time_step': dataset['time_step'][index],
            'target_prob_next_obs': dataset['target_prob_next_obs'][index],
        }

    def solve(self, dataset, norm, logger):
        """Solves for density ratios and estimates the target policy value.

        Args:
          dataset: The transition data store to use.
          norm: Normalization parameters used when sampling actions from the
            target policy.
          logger: Optional logger for intermediate estimates; may be None.

        Returns:
          Estimated average per-step reward of the target policy.

        Raises:
          ValueError: If NaNs are encountered in the policy ratio computation.
        """
        value_estimates = []
        for step in range(self._parameters.num_steps):
            batch = self._sample_data(dataset, self._parameters.batch_size)
            feed_dict = {
                self._state: batch['obs'],
                self._action: batch['acts'],
                self._next_state: batch['next_obs'],
                self._initial_state: batch['init_obs'],
                self._weights: self._parameters.gamma ** batch['time_step'],
            }
            # On-policy next action and initial action.
            feed_dict[self._next_action] = self.target_policy.sample_action(
                batch['next_obs'], norm)
            feed_dict[self._initial_action] = \
                self.target_policy.sample_action(batch['init_obs'], norm)
            if self._average_next_nu:
                # Target-policy probabilities for the next state are read
                # from the dataset rather than recomputed.
                feed_dict[self._target_policy_next_probs] = \
                    batch['target_prob_next_obs']
            self._session.run(self._train_op, feed_dict=feed_dict)

            if step % self._parameters.log_every == 0:
                debug = self._session.run(self._debug, feed_dict=feed_dict)
                value_estimate = self.estimate_average_reward(dataset)
                value_estimates.append(value_estimate)
                tf.compat.v1.logging.info('At step %d' % step)
                tf.compat.v1.logging.info('Debug: %s' % debug)
                tf.compat.v1.logging.info(
                    'Estimated value: %s' % value_estimate)
                tf.compat.v1.logging.info(
                    'Estimated smoothed value: %s' %
                    np.mean(value_estimates[-self._parameters.smooth_over:]))
                if logger is not None:
                    logger.add(
                        step, value_estimate,
                        np.mean(
                            value_estimates[-self._parameters.smooth_over:]))
                    logger.dump()

        # Return an estimate smoothed over the last few iterates.
        return np.mean(value_estimates[-self._parameters.smooth_over:])

    def _state_action_density_ratio(self, state, action):
        batched = len(np.shape(state)) > 1
        if not batched:
            state = np.expand_dims(state, 0)
            action = np.expand_dims(action, 0)
        density_ratio = self._session.run(
            self._density_ratio,
            feed_dict={
                self._state: state,
                self._action: action,
            })
        if not batched:
            return density_ratio[0]
        return density_ratio

    def estimate_average_reward(self, dataset):
        """Estimates the value (average per-step reward) of the policy.

        The estimate is based on the solved values of zeta, so solve() should
        be called before this function.

        Returns:
          Estimated average per-step reward of the target policy.
        """
        return estimate_value_from_state_action_ratios(
            dataset, self._parameters.gamma,
            self._state_action_density_ratio)

    def close(self):
        self._session.close()
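
# The solver above relies on the Fenchel-conjugate pair f(x) = |x|^p / p and
# f*(y) = |y|^q / q with q = p / (p - 1). As a minimal numerical sanity check
# (illustrative only; `_check_fenchel_conjugate` is not part of the algorithm),
# the closed-form conjugate should match a brute-force maximization of
# x * y - f(x) over a grid of x values:
def _check_fenchel_conjugate(p=1.5, y=0.7):
    import numpy as np
    q = p / (p - 1)
    xs = np.linspace(-10.0, 10.0, 200001)
    brute_force = np.max(xs * y - np.abs(xs) ** p / p)  # max_x (x*y - f(x))
    closed_form = np.abs(y) ** q / q                    # f*(y)
    assert abs(brute_force - closed_form) < 1e-3
    return brute_force, closed_form
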
class MQL(Basic_Alg):
    def __init__(self, obs_dim, act_dim, *,
                 seed, norm, model_weights, target_temp, med_dist,
                 hidden_layers_p, activation_p,
                 hidden_layers=[32, 32],
                 scope='mql', lr=5e-3, reg_factor=0, gamma=0.999):
        super().__init__(scope)

        self.obs_dim = obs_dim
        self.act_dim = act_dim
        self.gamma = gamma
        self.lr = lr
        self.reg_factor = reg_factor
        self.hidden_layers = hidden_layers
        self.seed = seed
        self.med_dist = med_dist
        self.norm = norm

        # Debug quantities.
        self.debug_q = {}
        self.trainable_vars = []

        self._session = tf.compat.v1.Session()
        self._session.__enter__()
        with self._session.as_default():
            self.q_net = Q_Model_Tf(obs_dim, act_dim,
                                    hidden_layers=hidden_layers_p,
                                    temperature=target_temp,
                                    seed=self.seed,
                                    activation=activation_p)

            self.build_graph()
            self.build_estimation_graph()
            self.create_loss_func()

            self._session.run(
                [tf.compat.v1.variables_initializer(self.trainable_vars)])
            self.q_net.load_weight(model_weights)

    def build_graph(self):
        # First sample.
        self.rew_ph = tf.compat.v1.placeholder(
            dtype=tf.float32, shape=[None, 1])
        self.obs_ph = tf.compat.v1.placeholder(
            dtype=tf.float32, shape=[None, self.obs_dim])
        self.act_ph = tf.compat.v1.placeholder(
            dtype=tf.int32, shape=[None, 1])
        self.obs_act = tf.concat(
            [self.obs_ph, tf.cast(self.act_ph, tf.float32)], axis=1)

        self.q = self.create_value_func(self.obs_ph, self.act_ph,
                                        func_type='q', reuse=False)

        self.next_obs_ph = tf.compat.v1.placeholder(
            dtype=tf.float32, shape=[None, self.obs_dim])
        self.v_next = self.create_value_func(self.next_obs_ph, None,
                                             func_type='v', reuse=True)

        # Second sample.
        self.rew_ph_2 = tf.compat.v1.placeholder(
            dtype=tf.float32, shape=[None, 1])
        self.obs_ph_2 = tf.compat.v1.placeholder(
            dtype=tf.float32, shape=[None, self.obs_dim])
        self.act_ph_2 = tf.compat.v1.placeholder(
            dtype=tf.int32, shape=[None, 1])
        self.obs_act_2 = tf.concat(
            [self.obs_ph_2, tf.cast(self.act_ph_2, tf.float32)], axis=1)

        self.q_2 = self.create_value_func(self.obs_ph_2, self.act_ph_2,
                                          func_type='q', reuse=True)

        self.next_obs_ph_2 = tf.compat.v1.placeholder(
            dtype=tf.float32, shape=[None, self.obs_dim])
        self.v_next_2 = self.create_value_func(self.next_obs_ph_2, None,
                                               func_type='v', reuse=True)

    def create_value_func(self, obs_tf, act_tf, *, func_type, reuse=False,
                          normalize=True):
        if func_type == 'v':
            # Recover the original observation to get the correct action
            # probabilities from the target policy.
            if self.norm['type'] is not None:
                org_obs = obs_tf * self.norm['scale'] + self.norm['shift']
            else:
                org_obs = obs_tf
            prob_mask = self.q_net.build_prob(org_obs, split=True)

            with tf.compat.v1.variable_scope(self.scope, reuse=reuse):
                x = tf.concat(
                    [obs_tf, tf.zeros([tf.shape(input=obs_tf)[0], 1])],
                    axis=1)
                for h in self.hidden_layers:
                    x = tf.compat.v1.layers.dense(x, h, activation=tf.nn.relu)
                q0 = tf.compat.v1.layers.dense(
                    x, 1, activation=None,
                    kernel_regularizer=tf.keras.regularizers.l2(0.5),
                    bias_regularizer=tf.keras.regularizers.l2(0.5))

            with tf.compat.v1.variable_scope(self.scope, reuse=True):
                x = tf.concat(
                    [obs_tf, tf.ones([tf.shape(input=obs_tf)[0], 1])],
                    axis=1)
                for h in self.hidden_layers:
                    x = tf.compat.v1.layers.dense(x, h, activation=tf.nn.relu)
                q1 = tf.compat.v1.layers.dense(
                    x, 1, activation=None,
                    kernel_regularizer=tf.keras.regularizers.l2(0.5),
                    bias_regularizer=tf.keras.regularizers.l2(0.5))

            # V(s) = sum_a pi(a|s) Q(s, a) over the two discrete actions.
            value = q0 * prob_mask[0] + q1 * prob_mask[1]
        else:
            with tf.compat.v1.variable_scope(self.scope, reuse=reuse):
                x = tf.concat(
                    [obs_tf, tf.cast(act_tf, tf.float32)], axis=1)
                for h in self.hidden_layers:
                    x = tf.compat.v1.layers.dense(x, h, activation=tf.nn.relu)
                q = tf.compat.v1.layers.dense(
                    x, 1, activation=None,
                    kernel_regularizer=tf.keras.regularizers.l2(0.5),
                    bias_regularizer=tf.keras.regularizers.l2(0.5))
            if not reuse:
                self.trainable_vars += tf.compat.v1.get_collection(
                    tf.compat.v1.GraphKeys.TRAINABLE_VARIABLES,
                    scope=self.scope)
            value = q

        return value

    def build_estimation_graph(self):
        self.init_obs_ph = tf.compat.v1.placeholder(
            dtype=tf.float32, shape=[None, self.obs_dim])
        self.value_estimation = tf.reduce_mean(
            input_tensor=self.create_value_func(
                self.init_obs_ph, None, func_type='v', reuse=True))

    def create_loss_func(self):
        # Bellman errors on the two independent samples.
        error = self.rew_ph + self.gamma * self.v_next - self.q
        error_2 = self.rew_ph_2 + self.gamma * self.v_next_2 - self.q_2

        # RBF kernel between (s, a) pairs with bandwidth med_dist.
        diff = tf.expand_dims(self.obs_act, 1) - tf.expand_dims(
            self.obs_act_2, 0)
        K = tf.exp(-tf.reduce_sum(input_tensor=tf.square(diff), axis=-1)
                   / 2.0 / self.med_dist ** 2)

        all_vars = tf.compat.v1.get_collection(
            tf.compat.v1.GraphKeys.TRAINABLE_VARIABLES, scope=self.scope)

        # Quadratic form: error^T K error_2.
        self.loss = tf.matmul(tf.matmul(tf.transpose(a=error), K), error_2)

        self.reg_loss = self.reg_factor * tf.reduce_sum(
            input_tensor=tf.compat.v1.get_collection(
                tf.compat.v1.GraphKeys.REGULARIZATION_LOSSES, self.scope))
        self.debug_q.update({'reg loss': self.reg_loss})
        self.loss += self.reg_loss
        self.loss = tf.squeeze(self.loss)

        sample_num = tf.cast(
            tf.shape(input=K)[0] * tf.shape(input=K)[1], tf.float32)
        self.loss /= sample_num

        self.opt = tf.compat.v1.train.AdamOptimizer(self.lr)
        self.train_op = self.opt.minimize(self.loss, var_list=all_vars)
        self.trainable_vars += self.opt.variables()

    def train(self, data):
        debug, loss, _ = self._session.run(
            [self.debug_q, self.loss, self.train_op],
            feed_dict={
                self.obs_ph: data['obs_1'],
                self.obs_ph_2: data['obs_2'],
                self.next_obs_ph: data['next_obs_1'],
                self.next_obs_ph_2: data['next_obs_2'],
                self.act_ph: data['act_1'],
                self.act_ph_2: data['act_2'],
                self.rew_ph: data['rew_1'],
                self.rew_ph_2: data['rew_2'],
                self.q_net.tau_ph: self.q_net.temperature,
            })
        return debug, loss

    def evaluation(self, init_obs):
        value = self._session.run(
            self.value_estimation,
            feed_dict={
                self.init_obs_ph: init_obs,
                self.q_net.tau_ph: self.q_net.temperature,
            })
        return value

    def close(self):
        self._session.close()
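
# For reference, a NumPy analogue of the quadratic kernel loss built in
# MQL.create_loss_func: an RBF kernel over (s, a) features couples the Bellman
# errors of two independent samples. This sketch is illustrative (the argument
# names are hypothetical), and `_median_heuristic` shows the usual
# median-of-pairwise-distances rule that a bandwidth like `med_dist` is
# commonly derived from, assuming this codebase follows that convention.
def _mql_kernel_loss_numpy(obs_act_1, obs_act_2, err_1, err_2, med_dist):
    import numpy as np
    # Pairwise squared distances between the (s, a) features of both samples.
    diff = obs_act_1[:, None, :] - obs_act_2[None, :, :]
    K = np.exp(-np.sum(diff ** 2, axis=-1) / 2.0 / med_dist ** 2)
    # Quadratic form err_1^T K err_2, averaged over all sample pairs.
    return (err_1.T @ K @ err_2).item() / K.size


def _median_heuristic(obs_act):
    import numpy as np
    # Median distance over distinct pairs of feature vectors.
    i, j = np.triu_indices(obs_act.shape[0], k=1)
    dists = np.sqrt(np.sum((obs_act[i] - obs_act[j]) ** 2, axis=-1))
    return float(np.median(dists))
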
class MWL(Basic_Alg):
    def __init__(self, obs_dim, act_dim, *,
                 seed, norm, model_weights, target_temp, med_dist,
                 hidden_layers_q, activation_q,
                 hidden_layers_w=[32, 32],
                 scope='mwl', lr=5e-3, reg_factor=0, gamma=0.999):
        super().__init__(scope)

        self.obs_dim = obs_dim
        self.act_dim = act_dim
        self.gamma = gamma
        self.lr = lr
        self.reg_factor = reg_factor
        self.hidden_layers_w = hidden_layers_w
        self.hidden_layers_q = hidden_layers_q
        self.med_dist = med_dist
        self.seed = seed
        self.norm = norm
        assert self.norm['type'] is not None, \
            'data should already be processed before calling the algorithm'

        self._session = tf.compat.v1.Session()
        self._session.__enter__()
        with self._session.as_default():
            self.q_net = Q_Model_Tf(obs_dim, act_dim,
                                    hidden_layers=hidden_layers_q,
                                    temperature=target_temp,
                                    seed=self.seed,
                                    activation=activation_q)

            # Debug quantities.
            self.debug_w = {}
            self.trainable_vars = []

            self.build_graph()
            self.build_estimation_graph()
            self.create_loss_func()

            self._session.run(
                [tf.compat.v1.variables_initializer(self.trainable_vars)])
            self.q_net.load_weight(model_weights)

    def build_graph(self):
        self.rew_ph = tf.compat.v1.placeholder(
            dtype=tf.float32, shape=[None, 1])
        self.done_ph = tf.compat.v1.placeholder(
            dtype=tf.bool, shape=[None, 1])

        # Initial part, first sample.
        self.init_obs_ph = tf.compat.v1.placeholder(
            dtype=tf.float32, shape=[None, self.obs_dim])
        self.init_act_b = tf.compat.v1.placeholder(
            dtype=tf.int32, shape=[None, 1])
        self.init_act_e = self.build_action(self.init_obs_ph)
        self.init_obs_act_b = tf.concat(
            [self.init_obs_ph, tf.cast(self.init_act_b, tf.float32)], axis=1)
        self.init_obs_act_e = tf.concat(
            [self.init_obs_ph, tf.cast(self.init_act_e, tf.float32)], axis=1)

        # Initial part, second sample.
        self.init_obs_ph_2 = tf.compat.v1.placeholder(
            dtype=tf.float32, shape=[None, self.obs_dim])
        self.init_act_b_2 = tf.compat.v1.placeholder(
            dtype=tf.int32, shape=[None, 1])
        self.init_act_e_2 = self.build_action(self.init_obs_ph_2)
        self.init_obs_act_b_2 = tf.concat(
            [self.init_obs_ph_2, tf.cast(self.init_act_b_2, tf.float32)],
            axis=1)
        self.init_obs_act_e_2 = tf.concat(
            [self.init_obs_ph_2, tf.cast(self.init_act_e_2, tf.float32)],
            axis=1)

        # Current part, first sample.
        self.obs_ph = tf.compat.v1.placeholder(
            dtype=tf.float32, shape=[None, self.obs_dim])
        self.act_ph = tf.compat.v1.placeholder(
            dtype=tf.int32, shape=[None, 1])
        self.obs_act = tf.concat(
            [self.obs_ph, tf.cast(self.act_ph, tf.float32)], axis=1)
        self.factor = tf.compat.v1.placeholder(
            dtype=tf.float32, shape=[None, 1])

        # Current part, second sample.
        self.obs_ph_2 = tf.compat.v1.placeholder(
            dtype=tf.float32, shape=[None, self.obs_dim])
        self.act_ph_2 = tf.compat.v1.placeholder(
            dtype=tf.int32, shape=[None, 1])
        self.obs_act_2 = tf.concat(
            [self.obs_ph_2, tf.cast(self.act_ph_2, tf.float32)], axis=1)
        self.factor_2 = tf.compat.v1.placeholder(
            dtype=tf.float32, shape=[None, 1])

        # Next part, first sample.
        self.next_obs_ph = tf.compat.v1.placeholder(
            dtype=tf.float32, shape=[None, self.obs_dim])
        self.next_act_b = tf.compat.v1.placeholder(
            dtype=tf.int32, shape=[None, 1])
        self.next_obs_act_b = tf.concat(
            [self.next_obs_ph, tf.cast(self.next_act_b, tf.float32)], axis=1)

        # Enumerate both discrete actions for the target policy.
        self.next_act_e = [
            tf.zeros([tf.shape(input=self.next_obs_ph)[0], 1],
                     dtype=tf.int32),
            tf.ones([tf.shape(input=self.next_obs_ph)[0], 1],
                    dtype=tf.int32),
        ]
        self.next_obs_act_e = [
            tf.concat([self.next_obs_ph,
                       tf.cast(self.next_act_e[0], tf.float32)], axis=1),
            tf.concat([self.next_obs_ph,
                       tf.cast(self.next_act_e[1], tf.float32)], axis=1),
        ]
        self.prob_next = self.build_prob(self.next_obs_ph)

        # Next part, second sample.
        self.next_obs_ph_2 = tf.compat.v1.placeholder(
            dtype=tf.float32, shape=[None, self.obs_dim])
        self.next_act_b_2 = tf.compat.v1.placeholder(
            dtype=tf.int32, shape=[None, 1])
        self.next_obs_act_b_2 = tf.concat(
            [self.next_obs_ph_2, tf.cast(self.next_act_b_2, tf.float32)],
            axis=1)

        self.next_act_e_2 = [
            tf.zeros([tf.shape(input=self.next_obs_ph_2)[0], 1],
                     dtype=tf.int32),
            tf.ones([tf.shape(input=self.next_obs_ph_2)[0], 1],
                    dtype=tf.int32),
        ]
        self.next_obs_act_e_2 = [
            tf.concat([self.next_obs_ph_2,
                       tf.cast(self.next_act_e_2[0], tf.float32)], axis=1),
            tf.concat([self.next_obs_ph_2,
                       tf.cast(self.next_act_e_2[1], tf.float32)], axis=1),
        ]
        self.prob_next_2 = self.build_prob(self.next_obs_ph_2)

        # Density ratios, first sample.
        self.w = self.create_density_ratio(self.obs_ph, self.act_ph,
                                           factor=self.factor, reuse=False)
        self.w_next = self.create_density_ratio(self.next_obs_ph,
                                                self.next_act_b,
                                                factor=self.factor,
                                                reuse=True)
        # Density ratios, second sample.
        self.w_2 = self.create_density_ratio(self.obs_ph_2, self.act_ph_2,
                                             factor=self.factor_2,
                                             reuse=True)
        self.w_next_2 = self.create_density_ratio(self.next_obs_ph_2,
                                                  self.next_act_b_2,
                                                  factor=self.factor_2,
                                                  reuse=True)

        self.w_init = self.create_density_ratio(self.init_obs_ph,
                                                self.init_act_b,
                                                reuse=True, normalize=True)
        self.w_init_2 = self.create_density_ratio(self.init_obs_ph_2,
                                                  self.init_act_b_2,
                                                  reuse=True, normalize=True)

    def build_action(self, obs_ph):
        # Recover the original observation to get the correct action
        # probabilities from the target policy.
        if self.norm['type'] is not None:
            org_obs = obs_ph * self.norm['scale'] + self.norm['shift']
        else:
            org_obs = obs_ph
        act = self.q_net.build_random_policy(org_obs, reuse=True)
        return tf.stop_gradient(act)

    def build_prob(self, obs_ph):
        # Recover the original observation to get the correct action
        # probabilities from the target policy.
        if self.norm['type'] is not None:
            org_obs = obs_ph * self.norm['scale'] + self.norm['shift']
        else:
            org_obs = obs_ph
        return self.q_net.build_prob(org_obs, reuse=True)

    def create_density_ratio(self, obs_tf, act_tf, *, factor=None,
                             reuse=False, normalize=True):
        with tf.compat.v1.variable_scope(self.scope, reuse=reuse):
            x = tf.concat([obs_tf, tf.cast(act_tf, tf.float32)], axis=1)
            for h in self.hidden_layers_w:
                x = tf.compat.v1.layers.dense(x, h, activation=tf.nn.relu)
            w = tf.compat.v1.layers.dense(
                x, 1, activation=None,
                kernel_regularizer=tf.keras.regularizers.l2(0.5))

            if not reuse:
                self.trainable_vars += tf.compat.v1.get_collection(
                    tf.compat.v1.GraphKeys.TRAINABLE_VARIABLES,
                    scope=self.scope)

            # Softplus keeps the estimated ratio positive.
            w = tf.math.log(1 + tf.exp(w))
            if factor is not None:
                w = w * factor
            if normalize:
                w = w / tf.reduce_mean(input_tensor=w)
            return w

    def build_estimation_graph(self):
        rew = self.rew_ph
        w = self.create_density_ratio(self.obs_ph, self.act_ph,
                                      factor=self.factor,
                                      reuse=True, normalize=True)
        assert self.gamma < 1.0
        self.value_estimation = \
            tf.reduce_mean(input_tensor=w * rew) / (1 - self.gamma)

    def create_loss_func(self):
        """Creates the loss function, dropping terms that do not depend on
        w(s, a)."""
        coeff = (
            [self.gamma ** 2] * 4 +                 # Term 1
            [self.gamma ** 2] * 1 +                 # Term 2
            [(1 - self.gamma) ** 2] * 1 +           # Term 3
            [(1 - self.gamma) ** 2] * 1 +           # Term 4
            [-self.gamma ** 2] * 4 +                # Term 5
            [-self.gamma * (1 - self.gamma)] * 4 +  # Term 6
            [self.gamma * (1 - self.gamma)] * 4 +   # Term 7
            [self.gamma * (1 - self.gamma)] * 2 +   # Term 8
            [-self.gamma * (1 - self.gamma)] * 2 +  # Term 9
            [-(1 - self.gamma) ** 2] * 2            # Term 10
        )

        Kernel = [
            # Term 1
            (self.next_obs_act_e[0], self.next_obs_act_e_2[0]),
            (self.next_obs_act_e[0], self.next_obs_act_e_2[1]),
            (self.next_obs_act_e[1], self.next_obs_act_e_2[0]),
            (self.next_obs_act_e[1], self.next_obs_act_e_2[1]),
            # Term 2
            (self.next_obs_act_b, self.next_obs_act_b_2),
            # Term 3
            (self.init_obs_act_b, self.init_obs_act_b_2),
            # Term 4
            (self.init_obs_act_e, self.init_obs_act_e_2),
            # Term 5
            (self.next_obs_act_e[0], self.next_obs_act_b_2),
            (self.next_obs_act_e[1], self.next_obs_act_b_2),
            (self.next_obs_act_b, self.next_obs_act_e_2[0]),
            (self.next_obs_act_b, self.next_obs_act_e_2[1]),
            # Term 6
            (self.next_obs_act_e[0], self.init_obs_act_b_2),
            (self.next_obs_act_e[1], self.init_obs_act_b_2),
            (self.init_obs_act_b, self.next_obs_act_e_2[0]),
            (self.init_obs_act_b, self.next_obs_act_e_2[1]),
            # Term 7
            (self.next_obs_act_e[0], self.init_obs_act_e_2),
            (self.next_obs_act_e[1], self.init_obs_act_e_2),
            (self.init_obs_act_e, self.next_obs_act_e_2[0]),
            (self.init_obs_act_e, self.next_obs_act_e_2[1]),
            # Term 8
            (self.next_obs_act_b, self.init_obs_act_b_2),
            (self.init_obs_act_b, self.next_obs_act_b_2),
            # Term 9
            (self.next_obs_act_b, self.init_obs_act_e_2),
            (self.init_obs_act_e, self.next_obs_act_b_2),
            # Term 10
            (self.init_obs_act_b, self.init_obs_act_e_2),
            (self.init_obs_act_e, self.init_obs_act_b_2),
        ]

        prob_mask = (
            [
                # Term 1
                self.prob_next[0] * tf.reshape(self.prob_next_2[0], [1, -1]),
                self.prob_next[0] * tf.reshape(self.prob_next_2[1], [1, -1]),
                self.prob_next[1] * tf.reshape(self.prob_next_2[0], [1, -1]),
                self.prob_next[1] * tf.reshape(self.prob_next_2[1], [1, -1]),
            ] +
            # Terms 2, 3, 4
            [None] * 3 +
            [
                # Term 5
                self.prob_next[0] *
                tf.ones([1, tf.shape(input=self.prob_next_2[0])[0]]),
                self.prob_next[1] *
                tf.ones([1, tf.shape(input=self.prob_next_2[0])[0]]),
                tf.ones(tf.shape(input=self.prob_next[0])) *
                tf.reshape(self.prob_next_2[0], [1, -1]),
                tf.ones(tf.shape(input=self.prob_next[0])) *
                tf.reshape(self.prob_next_2[1], [1, -1]),
            ] +
            [
                # Terms 6 and 7 (same mask pattern, applied twice)
                self.prob_next[0] *
                tf.ones([1, tf.shape(input=self.w_init_2)[0]]),
                self.prob_next[1] *
                tf.ones([1, tf.shape(input=self.w_init_2)[0]]),
                tf.ones(tf.shape(input=self.w_init)) *
                tf.reshape(self.prob_next_2[0], [1, -1]),
                tf.ones(tf.shape(input=self.w_init)) *
                tf.reshape(self.prob_next_2[1], [1, -1]),
            ] * 2 +
            # Terms 8, 9, 10
            [None] * 6
        )

        w_ones = tf.ones(tf.shape(input=self.w_init))
        w_2_ones = tf.ones(tf.shape(input=self.w_init_2))

        weights = [
            # Term 1
            (self.w, self.w_2),
            (self.w, self.w_2),
            (self.w, self.w_2),
            (self.w, self.w_2),
            # Term 2
            (self.w_next, self.w_next_2),
            # Term 3
            (self.w_init, self.w_init_2),
            # Term 4
            None,
            # Term 5
            (self.w, self.w_next_2),
            (self.w, self.w_next_2),
            (self.w_next, self.w_2),
            (self.w_next, self.w_2),
            # Term 6
            (self.w, self.w_init_2),
            (self.w, self.w_init_2),
            (self.w_init, self.w_2),
            (self.w_init, self.w_2),
            # Term 7
            (self.w, w_2_ones),
            (self.w, w_2_ones),
            (w_ones, self.w_2),
            (w_ones, self.w_2),
            # Term 8
            (self.w_next, self.w_init_2),
            (self.w_init, self.w_next_2),
            # Term 9
            (self.w_next, w_2_ones),
            (w_ones, self.w_next_2),
            # Term 10
            (self.w_init, w_2_ones),
            (w_ones, self.w_init_2),
        ]

        self.reg_loss = self.reg_factor * tf.reduce_sum(
            input_tensor=tf.compat.v1.get_collection(
                tf.compat.v1.GraphKeys.REGULARIZATION_LOSSES, self.scope))
        self.debug_w.update({'reg loss': self.reg_loss})
        self.loss = self.reg_loss

        for index in range(len(Kernel)):
            if weights[index] is None:
                continue
            k1, k2 = Kernel[index]
            c = coeff[index]
            w1, w2 = weights[index]

            # RBF kernel between the two feature sets, with a per-term
            # median-distance bandwidth.
            diff = tf.expand_dims(k1, 1) - tf.expand_dims(k2, 0)
            K = tf.exp(-tf.reduce_sum(input_tensor=tf.square(diff), axis=-1)
                       / 2.0 / self.med_dist[index] ** 2)
            if prob_mask[index] is not None:
                K = K * prob_mask[index]

            sample_num = tf.cast(
                tf.shape(input=K)[0] * tf.shape(input=K)[1], tf.float32)
            loss = c * tf.matmul(
                tf.matmul(tf.transpose(a=w1), K), w2) / sample_num
            self.loss += tf.squeeze(loss)

        self.opt = tf.compat.v1.train.AdamOptimizer(self.lr)
        self.train_op = self.opt.minimize(self.loss)
        self.trainable_vars += self.opt.variables()

    def train(self, data):
        debug, loss, _ = self._session.run(
            [self.debug_w, self.loss, self.train_op],
            feed_dict={
                self.obs_ph: data['obs_1'],
                self.act_ph: data['acts_1'],
                self.next_obs_ph: data['next_obs_1'],
                self.next_act_b: data['next_acts_1'],
                self.init_obs_ph: data['init_obs_1'],
                self.init_act_b: data['init_acts_1'],
                self.factor: data['factor_1'],
                self.obs_ph_2: data['obs_2'],
                self.act_ph_2: data['acts_2'],
                self.next_obs_ph_2: data['next_obs_2'],
                self.next_act_b_2: data['next_acts_2'],
                self.init_obs_ph_2: data['init_obs_2'],
                self.init_act_b_2: data['init_acts_2'],
                self.factor_2: data['factor_2'],
                self.q_net.tau_ph: self.q_net.temperature,
            })
        return debug, loss

    def evaluation(self, obs, acts, factor, rew):
        value = self._session.run(
            self.value_estimation,
            feed_dict={
                self.obs_ph: obs,
                self.act_ph: acts,
                self.rew_ph: rew,
                self.factor: factor,
                self.q_net.tau_ph: self.q_net.temperature,
            })
        return value

    def get_w(self, obs, acts, factor, rew):
        w = self._session.run(
            self.w,
            feed_dict={
                self.obs_ph: obs,
                self.act_ph: acts,
                self.rew_ph: rew,
                self.factor: factor,
                self.q_net.tau_ph: self.q_net.temperature,
            })
        return w

    def close(self):
        self._session.close()
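
# A NumPy analogue (illustrative only) of the output transformation at the end
# of MWL.create_density_ratio: a softplus keeps the estimated ratio w(s, a)
# strictly positive, and dividing by the batch mean normalizes it so the
# empirical mean of w is 1, matching the fact that a true density ratio has
# expectation 1 under the behavior distribution.
def _positive_normalized_ratio(raw_w):
    import numpy as np
    w = np.log1p(np.exp(raw_w))  # softplus(x) = log(1 + e^x) > 0
    return w / np.mean(w)        # normalize so the batch mean is 1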