def loss_fn(self):
    adv = tf.placeholder(tf.float32, [None], name="advantages")
    returns = tf.placeholder(tf.float32, [None], name="returns")
    logli_old = tf.placeholder(tf.float32, [None], name="logli_old")
    value_old = tf.placeholder(tf.float32, [None], name="value_old")

    ratio = tf.exp(self.policy.logli - logli_old)
    clipped_ratio = tf.clip_by_value(ratio, 1 - self.clip_ratio,
                                     1 + self.clip_ratio)

    value_err = (self.value - returns)**2
    if self.clip_value > 0.0:
        clipped_value = tf.clip_by_value(self.value,
                                         value_old - self.clip_value,
                                         value_old + self.clip_value)
        clipped_value_err = (clipped_value - returns)**2
        value_err = tf.maximum(value_err, clipped_value_err)

    policy_loss = -tf.reduce_mean(tf.minimum(adv * ratio, adv * clipped_ratio))
    value_loss = tf.reduce_mean(value_err) * self.value_coef
    entropy_loss = tf.reduce_mean(self.policy.entropy) * self.entropy_coef
    # we want to reduce policy and value errors, and maximize entropy
    # but since optimizer is minimizing the signs are opposite
    full_loss = policy_loss + value_loss - entropy_loss

    return full_loss, [policy_loss, value_loss, entropy_loss], [adv, returns, logli_old, value_old]
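# Illustrative usage sketch (not part of the original): one way the placeholders
# returned above could be wired into a training step. The names `model`, `sess`,
# the `batch_*` arrays, and the Adam learning rate are assumptions for the
# example, and the policy/value network's own input placeholders would also
# need to be fed.
full_loss, loss_terms, loss_inputs = model.loss_fn()
train_op = tf.train.AdamOptimizer(1e-4).minimize(full_loss)
adv_ph, returns_ph, logli_old_ph, value_old_ph = loss_inputs
sess.run(train_op, feed_dict={
    adv_ph: batch_advantages,       # advantage estimates, shape [batch]
    returns_ph: batch_returns,      # discounted returns, shape [batch]
    logli_old_ph: batch_logli_old,  # log-probs under the behaviour policy
    value_old_ph: batch_value_old,  # value estimates at collection time
})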
def _ph_op(self):
    with tf.name_scope("init_ph"):
        x, y, y_feature = self._input_shapes

        # x driving series
        self.x = tf.placeholder(dtype=tf.float32, shape=(None,) + x, name='x')
        # future values of driving series
        self.y = tf.placeholder(dtype=tf.float32, shape=(None,) + y, name='y')
        # future values of the ancillary series
        self.y_features = tf.placeholder(dtype=tf.float32,
                                         shape=(None,) + y_feature,
                                         name='y_features')

        self.mu = tf.placeholder_with_default(0., shape=(), name='mu')
        self.std = tf.placeholder_with_default(1., shape=(), name='std')
        self.keep_prob = tf.placeholder_with_default(1., shape=(), name='keep_prob')
        self.is_training = tf.placeholder_with_default(True, shape=(), name='is_training')
        self.gen_len = tf.placeholder_with_default(1, shape=(), name='gen_len')
        self.flag = tf.placeholder(shape=(), dtype=tf.bool)
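# Illustrative usage sketch (not part of the original): tf.placeholder_with_default
# returns its default unless the feed_dict overrides it, so the placeholders above
# let one graph serve both training and inference. `model`, `sess`, `train_op`, and
# the *_batch arrays are assumed names.
sess.run(train_op,
         feed_dict={model.x: x_batch,
                    model.y: y_batch,
                    model.y_features: y_feat_batch,
                    model.keep_prob: 0.8,    # override the default of 1.0
                    model.flag: True})       # flag has no default and must always be fed
# At inference time keep_prob can simply be omitted (it falls back to 1.0),
# while is_training would be fed False explicitly.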
def loss_fn(self):
    adv = tf.placeholder(tf.float32, [None], name="advantages")
    returns = tf.placeholder(tf.float32, [None], name="returns")

    policy_loss = -tf.reduce_mean(self.policy.logli * adv)
    value_loss = tf.reduce_mean((self.value - returns)**2) * self.value_coef
    entropy_loss = tf.reduce_mean(self.policy.entropy) * self.entropy_coef
    # we want to reduce policy and value errors, and maximize entropy
    # but since optimizer is minimizing the signs are opposite
    full_loss = policy_loss + value_loss - entropy_loss

    # debug dump of the loss tensors; mode "x+" fails if the file already exists
    try:
        with open("loss_fn.txt", "x+") as f:
            f.write("out\n")
            f.write("full_loss: {0} type: {1}\n".format(type(full_loss), full_loss.dtype))
            f.write("policy_loss: {0} type: {1}\n".format(type(policy_loss), policy_loss.dtype))
            f.write("value_loss: {0} type: {1}\n".format(type(value_loss), value_loss.dtype))
            f.write("entropy_loss: {0} type: {1}\n".format(type(entropy_loss), entropy_loss.dtype))
            f.write("adv: {0} type: {1}\n".format(type(adv), adv.dtype))
            f.write("returns: {0} type: {1}\n".format(type(returns), returns.dtype))
    except FileExistsError:
        print("")

    return full_loss, [policy_loss, value_loss, entropy_loss], [adv, returns]
def __init__(self,
             sess,
             observation_space,
             action_space,
             optimizer_name='',
             select_slate_fn=None,
             compute_target_fn=None,
             stack_size=1,
             eval_mode=False,
             **kwargs):
    """Initializes SlateDecompQAgent.

    Args:
      sess: a Tensorflow session.
      observation_space: A gym.spaces object that specifies the format of
        observations.
      action_space: A gym.spaces object that specifies the format of actions.
      optimizer_name: The name of the optimizer.
      select_slate_fn: A function that selects the slate.
      compute_target_fn: A function that computes the target q value.
      stack_size: The stack size for the replay buffer.
      eval_mode: A bool for whether the agent is in training or evaluation
        mode.
      **kwargs: Keyword arguments to the DQNAgent.
    """
    self._response_adapter = dqn_agent.ResponseAdapter(
        observation_space.spaces['response'])
    response_names = self._response_adapter.response_names
    expected_response_names = ['click', 'watch_time']
    if not all(key in response_names for key in expected_response_names):
        raise ValueError(
            "Couldn't find all fields needed for the decomposition: %r" %
            expected_response_names)
    self._click_response_index = response_names.index('click')
    self._reward_response_index = response_names.index('watch_time')
    self._quality_response_index = response_names.index('quality')
    self._cluster_id_response_index = response_names.index('cluster_id')

    self._env_action_space = action_space
    self._num_candidates = int(action_space.nvec[0])
    abstract_agent.AbstractEpisodicRecommenderAgent.__init__(self, action_space)

    # The doc score is a [num_candidates] vector.
    self._doc_affinity_scores_ph = tf.placeholder(
        tf.float32, (self._num_candidates,), name='doc_affinity_scores_ph')
    self._prob_no_click_ph = tf.placeholder(
        tf.float32, (), name='prob_no_click_ph')

    self._select_slate_fn = select_slate_fn
    self._compute_target_fn = compute_target_fn

    dqn_agent.DQNAgentRecSim.__init__(
        self,
        sess,
        observation_space,
        num_actions=0,  # Unused.
        stack_size=1,
        optimizer_name=optimizer_name,
        eval_mode=eval_mode,
        **kwargs)
def build_graph(self):
    """Builds the neural network graph."""
    # define graph
    self.g = tf.Graph()
    with self.g.as_default():
        # create and store a new session for the graph
        self.sess = tf.Session()

        # define placeholders
        self.x = tf.placeholder(shape=[None, self.dim_input], dtype=tf.float32)
        self.y = tf.placeholder(shape=[None, self.num_classes], dtype=tf.float32)

        # define simple model
        with tf.variable_scope('last_layer'):
            self.z = tf.layers.dense(inputs=self.x, units=self.num_classes)

        self.loss = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits_v2(labels=self.y,
                                                       logits=self.z))
        self.output_probs = tf.nn.softmax(self.z)

        # Variables of the last layer
        self.ll_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
        self.ll_vars_concat = tf.concat(
            [self.ll_vars[0], tf.expand_dims(self.ll_vars[1], axis=0)], 0)

        # Summary
        _variable_summaries(self.ll_vars_concat)

        # saving the weights of last layer when running bootstrap algorithm
        self.saver = tf.train.Saver(var_list=self.ll_vars)

        self.gd_opt = tf.train.GradientDescentOptimizer(self.step_size)

        # SGD optimizer for the last layer
        grads_vars_sgd = self.gd_opt.compute_gradients(self.loss)
        self.train_op = self.gd_opt.apply_gradients(grads_vars_sgd)

        for g, v in grads_vars_sgd:
            if g is not None:
                s = list(v.name)
                s[v.name.rindex(':')] = '_'
                tf.summary.histogram(''.join(s) + '/grad_hist_boot_sgd', g)

        # Merge all the summaries and write them out
        self.all_summaries = tf.summary.merge_all()
        location = os.path.join(self.working_dir, 'logs')
        self.writer = tf.summary.FileWriter(location, graph=self.g)

        saver_network = tf.train.Saver(var_list=self.ll_vars)
        print('Loading the network...')
        # Restores from checkpoint
        saver_network.restore(self.sess, self.model_dir)
        print('Graph successfully loaded.')
def loss_fn(self):
    adv = tf.placeholder(tf.float32, [None], name="advantages")
    returns = tf.placeholder(tf.float32, [None], name="returns")

    policy_loss = -tf.reduce_mean(self.policy.logli * adv)
    value_loss = tf.reduce_mean((self.value - returns)**2) * self.value_coef
    entropy_loss = tf.reduce_mean(self.policy.entropy) * self.entropy_coef
    # we want to reduce policy and value errors, and maximize entropy
    # but since optimizer is minimizing the signs are opposite
    full_loss = policy_loss + value_loss - entropy_loss

    return full_loss, [policy_loss, value_loss, entropy_loss], [adv, returns]
def loss_fn(self):
    adv = tf.placeholder(tf.float32, [None], name="advantages")
    returns = tf.placeholder(tf.float32, [None], name="returns")
    logli_old = tf.placeholder(tf.float32, [None], name="logli_old")

    ratio = tf.exp(self.policy.logli - logli_old)
    clipped_ratio = tf.clip_by_value(ratio, 1 - self.clip_ratio,
                                     1 + self.clip_ratio)

    policy_loss = -tf.reduce_mean(tf.minimum(adv * ratio, adv * clipped_ratio))
    # TODO clip value loss
    value_loss = tf.reduce_mean((self.value - returns)**2) * self.value_coef
    entropy_loss = tf.reduce_mean(self.policy.entropy) * self.entropy_coef
    # we want to reduce policy and value errors, and maximize entropy
    # but since optimizer is minimizing the signs are opposite
    full_loss = policy_loss + value_loss - entropy_loss

    return full_loss, [policy_loss, value_loss, entropy_loss], [adv, returns, logli_old]
def loss_fn(self): """ Sample trajectories and fit a cost function C. Form grad estimate with C and take a TRPO step for next policy. """ adv = tf.placeholder(tf.float32, [None], name="advantages") returns = tf.placeholder(tf.float32, [None], name="returns") policy_loss = -tf.reduce_mean(self.policy.logli * adv) # value_loss = tf.reduce_mean((self.value - returns)**2) * self.value_coef value_loss = tf.reduce_mean(self.value - returns) entropy_loss = tf.reduce_mean(self.policy.entropy) * self.entropy_coef # we want to reduce policy and value errors, and maximize entropy # but since optimizer is minimizing the signs are opposite full_loss = policy_loss + value_loss - entropy_loss return value_loss
def loss_fn(self, policy=None, value=None):
    adv = tf.placeholder(tf.float32, [None], name="advantages")
    returns = tf.placeholder(tf.float32, [None], name="returns")

    if not self.subenvs:
        policy_loss = -tf.reduce_mean(self.policy.logli * adv)
        value_loss = tf.reduce_mean((self.value - returns)**2) * self.value_coef
        entropy_loss = tf.reduce_mean(self.policy.entropy) * self.entropy_coef
    else:
        assert policy is not None and value is not None, \
            "Missing variables representing <policy> and <value>"
        policy_loss = -tf.reduce_mean(policy.logli * adv)
        value_loss = tf.reduce_mean((value - returns)**2) * self.value_coef
        entropy_loss = tf.reduce_mean(policy.entropy) * self.entropy_coef

    # we want to reduce policy and value errors, and maximize entropy
    # but since optimizer is minimizing the signs are opposite
    full_loss = policy_loss + value_loss - entropy_loss

    return full_loss, [policy_loss, value_loss, entropy_loss], [adv, returns]
def _build_eval_metric(self):
    """Build a network to evaluate the metric between all prototypical states.

    For each pair of states (s, t) we return max(d(s, t), d(t, s)), since the
    approximant cannot in general guarantee symmetry.

    Returns:
      An op computing the euclidean distance between the representations of
      all pairs of states in self.eval_states_ph.
    """
    self.eval_states_ph = tf.placeholder(tf.float64, (self.num_states, 2),
                                         name='eval_states_ph')
    distances = tf.maximum(
        self.online_network(self._concat_states(self.eval_states_ph)),
        self.online_network(
            self._concat_states(self.eval_states_ph, transpose=True)))
    return distances
def __init__(self,
             num_actions,
             observation_size,
             stack_size,
             use_staging=True,
             replay_capacity=1000000,
             batch_size=32,
             update_horizon=1,
             gamma=1.0,
             wrapped_memory=None):
    """Initializes a graph wrapper for the python replay memory.

    Args:
      num_actions: int, number of possible actions.
      observation_size: int, size of an input frame.
      stack_size: int, number of frames to use in state stack.
      use_staging: bool, when True it would use a staging area to prefetch the
        next sampling batch.
      replay_capacity: int, number of transitions to keep in memory.
      batch_size: int.
      update_horizon: int, length of update ('n' in n-step update).
      gamma: int, the discount factor.
      wrapped_memory: The 'inner' memory data structure. Defaults to None,
        which creates the standard DQN replay memory.

    Raises:
      ValueError: If update_horizon is not positive.
      ValueError: If discount factor is not in [0, 1].
    """
    if replay_capacity < update_horizon + 1:
        raise ValueError('Update horizon (%i) should be significantly smaller '
                         'than replay capacity (%i).' %
                         (update_horizon, replay_capacity))
    if not update_horizon >= 1:
        raise ValueError('Update horizon must be positive.')
    if not 0.0 <= gamma <= 1.0:
        raise ValueError('Discount factor (gamma) must be in [0, 1].')

    # Allow subclasses to create self.memory.
    if wrapped_memory is not None:
        self.memory = wrapped_memory
    else:
        self.memory = OutOfGraphReplayMemory(num_actions, observation_size,
                                             stack_size, replay_capacity,
                                             batch_size, update_horizon, gamma)

    with tf.name_scope('replay'):
        with tf.name_scope('add_placeholders'):
            self.add_obs_ph = tf.placeholder(tf.uint8, [observation_size],
                                             name='add_obs_ph')
            self.add_action_ph = tf.placeholder(tf.int32, [],
                                                name='add_action_ph')
            self.add_reward_ph = tf.placeholder(tf.float32, [],
                                                name='add_reward_ph')
            self.add_terminal_ph = tf.placeholder(tf.uint8, [],
                                                  name='add_terminal_ph')
            self.add_legal_actions_ph = tf.placeholder(
                tf.float32, [num_actions], name='add_legal_actions_ph')

        add_transition_ph = [
            self.add_obs_ph, self.add_action_ph, self.add_reward_ph,
            self.add_terminal_ph, self.add_legal_actions_ph
        ]

        with tf.device('/cpu:*'):
            self.add_transition_op = tf.py_func(self.memory.add,
                                                add_transition_ph, [],
                                                name='replay_add_py_func')

            self.transition = tf.py_func(
                self.memory.sample_transition_batch, [], [
                    tf.uint8, tf.int32, tf.float32, tf.uint8, tf.uint8,
                    tf.int32, tf.float32
                ],
                name='replay_sample_py_func')

            if use_staging:
                # To hide the py_func latency use a staging area to pre-fetch
                # the next batch of transitions.
                (states, actions, rewards, next_states, terminals, indices,
                 next_legal_actions) = self.transition
                # StagingArea requires all the shapes to be defined.
                states.set_shape([batch_size, observation_size, stack_size])
                actions.set_shape([batch_size])
                rewards.set_shape([batch_size])
                next_states.set_shape(
                    [batch_size, observation_size, stack_size])
                terminals.set_shape([batch_size])
                indices.set_shape([batch_size])
                next_legal_actions.set_shape([batch_size, num_actions])

                # Create the staging area in CPU.
                prefetch_area = tf.contrib.staging.StagingArea([
                    tf.uint8, tf.int32, tf.float32, tf.uint8, tf.uint8,
                    tf.int32, tf.float32
                ])

                self.prefetch_batch = prefetch_area.put(
                    (states, actions, rewards, next_states, terminals, indices,
                     next_legal_actions))
            else:
                self.prefetch_batch = tf.no_op()

        if use_staging:
            # Get the sample_transition_batch in GPU. This would do the copy
            # from CPU to GPU.
            self.transition = prefetch_area.get()

        (self.states, self.actions, self.rewards, self.next_states,
         self.terminals, self.indices, self.next_legal_actions) = self.transition

        # Since these are py_func tensors, no information about their shape is
        # present. Setting the shape only for the necessary tensors
        self.states.set_shape([None, observation_size, stack_size])
        self.next_states.set_shape([None, observation_size, stack_size])
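# Illustrative usage sketch (not part of the original): a common way to drive the
# staging area above is to stage one batch up front and then re-stage on every
# training step, so that get() never blocks. `replay`, `sess`, `train_op`, and
# `num_training_steps` are assumed names.
sess.run(replay.prefetch_batch)                   # stage the first batch
for _ in range(num_training_steps):
    sess.run([train_op, replay.prefetch_batch])   # train and prefetch the next batch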
def build_graph(self):
    """Builds the neural network graph."""
    # define graph
    self.g = tf.Graph()
    with self.g.as_default():
        # create and store a new session for the graph
        self.sess = tf.Session()

        # define placeholders
        self.x = tf.placeholder(shape=[None, self.dim_input], dtype=tf.float32)
        self.y = tf.placeholder(shape=[None, self.num_classes], dtype=tf.float32)

        # linear layer(WX + b)
        with tf.variable_scope('last_layer/dense') as scope:
            weights = tf.get_variable('kernel',
                                      [self.dim_input, self.num_classes],
                                      dtype=tf.float32)
            biases = tf.get_variable('bias', [self.num_classes],
                                     dtype=tf.float32)
            wb = tf.concat([weights, tf.expand_dims(biases, axis=0)], 0)
            wb_renorm = tf.matmul(self.sigma_half_inv, wb)
            weights_renorm = wb_renorm[:self.dim_input, :]
            biases_renorm = wb_renorm[-1, :]
            self.z = tf.add(tf.matmul(self.x, weights_renorm), biases_renorm,
                            name=scope.name)

        # Gaussian prior
        # prior = tf.nn.l2_loss(weights) + tf.nn.l2_loss(biases)

        # Non normalized loss, because of the preconditioning
        self.loss = self.n * tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits_v2(labels=self.y,
                                                       logits=self.z))

        # Bayesian loss
        self.bayesian_loss = self.loss  # + prior

        self.output_probs = tf.nn.softmax(self.z)

        # Variables of the last layer
        self.ll_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
        self.ll_vars_concat = tf.concat(
            [self.ll_vars[0], tf.expand_dims(self.ll_vars[1], axis=0)], 0)

        # Summary
        _variable_summaries(self.ll_vars_concat)

        # saving the weights of last layer when running SGLD/SGD/MCMC algorithm
        self.saver = tf.train.Saver(var_list=self.ll_vars,
                                    max_to_keep=self.num_samples)

        self.gd_opt = tf.train.GradientDescentOptimizer(self.step_size)

        # SGLD optimizer for the last layer
        if self.sampler in ['sgld', 'lmc']:
            grads_vars = self.gd_opt.compute_gradients(self.bayesian_loss)
            grads_vars_sgld = []
            for g, v in grads_vars:
                if g is not None:
                    s = list(v.name)
                    s[v.name.rindex(':')] = '_'
                    # Adding Gaussian noise to the gradient
                    gaussian_noise = (np.sqrt(2. / self.step_size) *
                                      tf.random_normal(tf.shape(g)))
                    g_sgld = g + gaussian_noise
                    tf.summary.histogram(''.join(s) + '/grad_hist_mcmc', g)
                    tf.summary.histogram(
                        ''.join(s) + '/gaussian_noise_hist_mcmc', gaussian_noise)
                    tf.summary.histogram(''.join(s) + '/grad_total_hist_mcmc',
                                         g_sgld)
                    grads_vars_sgld.append((g_sgld, v))

            self.train_op = self.gd_opt.apply_gradients(grads_vars_sgld)

        # SGD optimizer for the last layer
        if self.sampler == 'sgd':
            grads_vars_sgd = self.gd_opt.compute_gradients(self.loss)
            self.train_op = self.gd_opt.apply_gradients(grads_vars_sgd)
            for g, v in grads_vars_sgd:
                if g is not None:
                    s = list(v.name)
                    s[v.name.rindex(':')] = '_'
                    tf.summary.histogram(''.join(s) + '/grad_hist_sgd', g)

        # Merge all the summaries and write them out
        self.all_summaries = tf.summary.merge_all()
        location = os.path.join(self.working_dir, 'logs')
        self.writer = tf.summary.FileWriter(location, graph=self.g)

        saver_network = tf.train.Saver(var_list=self.ll_vars)
        print('loading the network ...')
        # Restores from checkpoint
        saver_network.restore(self.sess, self.model_dir)
        print('Graph successfully loaded.')
def broken(sess):
    index = tf.placeholder(tf.int32, name='index')
    slice_op = tf.range(10)[index]
    # index 11 is out of bounds for a length-10 tensor, so this run call fails
    # with tf.errors.InvalidArgumentError.
    sess.run(slice_op, feed_dict={index: 11})
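# Illustrative sketch (not part of the original): one way to guard against the
# failure that broken() demonstrates is to clamp the fed index into the valid
# range before slicing. The name `guarded` is hypothetical.
def guarded(sess):
    index = tf.placeholder(tf.int32, name='guarded_index')
    values = tf.range(10)
    safe_index = tf.clip_by_value(index, 0, tf.size(values) - 1)
    slice_op = values[safe_index]
    return sess.run(slice_op, feed_dict={index: 11})  # returns 9 instead of raising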
def __init__(self,
             num_actions=None,
             observation_size=None,
             num_players=None,
             gamma=0.99,
             update_horizon=1,
             min_replay_history=500,
             update_period=4,
             stack_size=1,
             target_update_period=500,
             epsilon_fn=linearly_decaying_epsilon,
             epsilon_train=0.02,
             epsilon_eval=0.001,
             epsilon_decay_period=1000,
             graph_template=dqn_template,
             tf_device='/cpu:*',
             use_staging=True,
             optimizer=tf.train.RMSPropOptimizer(learning_rate=.0025,
                                                 decay=0.95,
                                                 momentum=0.0,
                                                 epsilon=1e-6,
                                                 centered=True)):
    """Initializes the agent and constructs its graph.

    Args:
      num_actions: int, number of actions the agent can take at any state.
      observation_size: int, size of observation vector.
      num_players: int, number of players playing this game.
      gamma: float, discount factor as commonly used in the RL literature.
      update_horizon: int, horizon at which updates are performed, the 'n' in
        n-step update.
      min_replay_history: int, number of stored transitions before training.
      update_period: int, period between DQN updates.
      stack_size: int, number of observations to use as state.
      target_update_period: Update period for the target network.
      epsilon_fn: Function expecting 4 parameters: (decay_period, step,
        warmup_steps, epsilon), and which returns the epsilon value used for
        exploration during training.
      epsilon_train: float, final epsilon for training.
      epsilon_eval: float, epsilon during evaluation.
      epsilon_decay_period: int, number of steps for epsilon to decay.
      graph_template: function for building the neural network graph.
      tf_device: str, Tensorflow device on which to run computations.
      use_staging: bool, when True use a staging area to prefetch the next
        sampling batch.
      optimizer: Optimizer instance used for learning.
    """
    self.partial_reload = False

    tf.logging.info('Creating %s agent with the following parameters:',
                    self.__class__.__name__)
    tf.logging.info('\t gamma: %f', gamma)
    tf.logging.info('\t update_horizon: %f', update_horizon)
    tf.logging.info('\t min_replay_history: %d', min_replay_history)
    tf.logging.info('\t update_period: %d', update_period)
    tf.logging.info('\t target_update_period: %d', target_update_period)
    tf.logging.info('\t epsilon_train: %f', epsilon_train)
    tf.logging.info('\t epsilon_eval: %f', epsilon_eval)
    tf.logging.info('\t epsilon_decay_period: %d', epsilon_decay_period)
    tf.logging.info('\t tf_device: %s', tf_device)
    tf.logging.info('\t use_staging: %s', use_staging)
    tf.logging.info('\t optimizer: %s', optimizer)

    # Global variables.
    self.num_actions = num_actions
    self.observation_size = observation_size
    self.num_players = num_players
    self.gamma = gamma
    self.update_horizon = update_horizon
    self.cumulative_gamma = math.pow(gamma, update_horizon)
    self.min_replay_history = min_replay_history
    self.target_update_period = target_update_period
    self.epsilon_fn = epsilon_fn
    self.epsilon_train = epsilon_train
    self.epsilon_eval = epsilon_eval
    self.epsilon_decay_period = epsilon_decay_period
    self.update_period = update_period
    self.eval_mode = False
    self.training_steps = 0
    self.batch_staged = False
    self.optimizer = optimizer

    with tf.device(tf_device):
        # Calling online_convnet will generate a new graph as defined in
        # graph_template using whatever input is passed, but will always share
        # the same weights.
        online_convnet = tf.make_template('Online', graph_template)
        target_convnet = tf.make_template('Target', graph_template)

        # The state of the agent. The last axis is the number of past
        # observations that make up the state.
        states_shape = (1, observation_size, stack_size)
        self.state = np.zeros(states_shape)
        self.state_ph = tf.placeholder(tf.uint8, states_shape, name='state_ph')
        self.legal_actions_ph = tf.placeholder(tf.float32, [self.num_actions],
                                               name='legal_actions_ph')
        self._q = online_convnet(state=self.state_ph,
                                 num_actions=self.num_actions)
        self._replay = self._build_replay_memory(use_staging)
        self._replay_qs = online_convnet(self._replay.states, self.num_actions)
        self._replay_next_qt = target_convnet(self._replay.next_states,
                                              self.num_actions)
        self._train_op = self._build_train_op()
        self._sync_qt_ops = self._build_sync_op()

        self._q_argmax = tf.argmax(self._q + self.legal_actions_ph, axis=1)[0]

    # Set up a session and initialize variables.
    self._sess = tf.Session(
        '', config=tf.ConfigProto(allow_soft_placement=True))
    self._init_op = tf.global_variables_initializer()
    self._sess.run(self._init_op)

    self._saver = tf.train.Saver(max_to_keep=3)

    # This keeps tracks of the observed transitions during play, for each
    # player.
    self.transitions = [[] for _ in range(num_players)]
def sample_distance_pairs(self, num_samples_per_cell=2, verbose=False):
    """Sample a set of points from each cell and compute all pairwise distances.

    This method also writes the resulting distances to disk.

    Args:
      num_samples_per_cell: int, number of samples to draw per cell.
      verbose: bool, whether to print verbose messages.
    """
    paired_states_ph = tf.placeholder(tf.float64, (1, 4),
                                      name='paired_states_ph')
    online_network = tf.make_template('Online', self._network_template)
    distance = online_network(paired_states_ph)
    saver = tf.train.Saver()
    if not self.add_noise:
        num_samples_per_cell = 1
    with tf.Session() as sess:
        saver.restore(sess, os.path.join(self.base_dir, 'tf_ckpt-239900'))
        total_samples = None
        for s_idx in range(self.num_states):
            s = self.inverse_index_states[s_idx]
            s = s.astype(np.float32)
            s += 0.5  # Place in center of cell.
            s = np.tile([s], (num_samples_per_cell, 1))
            if self.add_noise:
                sampled_noise = np.clip(
                    np.random.normal(0, 0.1, size=(num_samples_per_cell, 2)),
                    -0.3, 0.3)
                s += sampled_noise
            if total_samples is None:
                total_samples = s
            else:
                total_samples = np.concatenate([total_samples, s])
        num_total_samples = len(total_samples)
        distances = np.zeros((num_total_samples, num_total_samples))
        if verbose:
            tf.logging.info('Will compute distances for %d samples',
                            num_total_samples)
        for i in range(num_total_samples):
            s1 = total_samples[i]
            if verbose:
                tf.logging.info('Will compute distances from sample %d', i)
            for j in range(num_total_samples):
                s2 = total_samples[j]
                paired_states_1 = np.reshape(np.append(s1, s2), (1, 4))
                paired_states_2 = np.reshape(np.append(s2, s1), (1, 4))
                distance_np_1 = sess.run(
                    distance, feed_dict={paired_states_ph: paired_states_1})
                distance_np_2 = sess.run(
                    distance, feed_dict={paired_states_ph: paired_states_2})
                max_dist = max(distance_np_1, distance_np_2)
                distances[i, j] = max_dist
                distances[j, i] = max_dist
    sampled_distances = {
        'samples_per_cell': num_samples_per_cell,
        'samples': total_samples,
        'distances': distances,
    }
    file_path = os.path.join(self.base_dir, 'sampled_distances.pkl')
    with tf.gfile.GFile(file_path, 'w') as f:
        pickle.dump(sampled_distances, f)
def build_graph(self):
    """Builds the neural network graph."""
    # define graph
    self.g = tf.Graph()
    with self.g.as_default():
        # create and store a new session for the graph
        self.sess = tf.Session()

        # define placeholders
        self.x = tf.placeholder(shape=[None, self.dim_input], dtype=tf.float32)
        self.y = tf.placeholder(shape=[None, self.num_classes], dtype=tf.float32)

        # define simple model
        with tf.variable_scope('last_layer'):
            self.z = tf.layers.dense(inputs=self.x, units=self.num_classes)

        self.loss = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits_v2(labels=self.y,
                                                       logits=self.z))
        self.output_probs = tf.nn.softmax(self.z)

        # Variables of the last layer
        self.ll_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
        self.ll_vars_concat = tf.concat(
            [self.ll_vars[0], tf.expand_dims(self.ll_vars[1], axis=0)], 0)

        # Summary
        _variable_summaries(self.ll_vars_concat)

        # add regularization that acts as a unit Gaussian prior on the last layer
        regularizer = tf.contrib.layers.l2_regularizer(1.0)

        # regularization
        prior = tf.contrib.layers.apply_regularization(regularizer, self.ll_vars)
        self.bayesian_loss = self.n * self.loss + prior

        # saving the weights of last layer when running SGLD/SGD/MCMC algorithm
        self.saver = tf.train.Saver(var_list=self.ll_vars,
                                    max_to_keep=self.num_samples)

        # SGLD optimizer for the last layer
        if self.sampler in ['sgld', 'lmc']:
            step = self.step_size / self.n
            gd_opt = tf.train.GradientDescentOptimizer(step)
            grads_vars = gd_opt.compute_gradients(self.bayesian_loss)
            grads_vars_sgld = []
            for g, v in grads_vars:
                if g is not None:
                    s = list(v.name)
                    s[v.name.rindex(':')] = '_'
                    # Adding Gaussian noise to the gradient
                    gaussian_noise = (np.sqrt(2. / step) *
                                      tf.random_normal(tf.shape(g)))
                    g_sgld = g + gaussian_noise
                    tf.summary.histogram(''.join(s) + '/grad_hist_mcmc',
                                         g / self.n)
                    tf.summary.histogram(
                        ''.join(s) + '/gaussian_noise_hist_mcmc',
                        gaussian_noise / self.n)
                    tf.summary.histogram(''.join(s) + '/grad_total_hist_mcmc',
                                         g_sgld / self.n)
                    grads_vars_sgld.append((g_sgld, v))

            self.train_op = gd_opt.apply_gradients(grads_vars_sgld)

        # SGD optimizer for the last layer
        if self.sampler == 'sgd':
            gd_opt = tf.train.GradientDescentOptimizer(self.step_size)
            grads_vars_sgd = gd_opt.compute_gradients(self.loss)
            self.train_op = gd_opt.apply_gradients(grads_vars_sgd)
            for g, v in grads_vars_sgd:
                if g is not None:
                    s = list(v.name)
                    s[v.name.rindex(':')] = '_'
                    tf.summary.histogram(''.join(s) + '/grad_hist_sgd', g)

        # Merge all the summaries and write them out
        self.all_summaries = tf.summary.merge_all()
        location = os.path.join(self.working_dir, 'logs')
        self.writer = tf.summary.FileWriter(location, graph=self.g)

        saver_network = tf.train.Saver(var_list=self.ll_vars)
        print('loading the network ...')
        # Restores from checkpoint
        # self.sess.run(tf.global_variables_initializer())
        saver_network.restore(self.sess, self.model_dir)
        print('Graph successfully loaded.')
def _build_train_op(self, optimizer):
    """Build the TensorFlow graph used to learn the bisimulation metric.

    Args:
      optimizer: a tf.train optimizer.

    Returns:
      A TensorFlow op to minimize the bisimulation loss.
    """
    self.online_network = tf.make_template('Online', self._network_template)
    self.target_network = tf.make_template('Target', self._network_template)
    self.s1_ph = tf.placeholder(tf.float64, (self.batch_size, 2), name='s1_ph')
    self.s2_ph = tf.placeholder(tf.float64, (self.batch_size, 2), name='s2_ph')
    self.s1_online_distances = self.online_network(
        self._concat_states(self.s1_ph))
    self.s1_target_distances = self.target_network(
        self._concat_states(self.s1_ph))
    self.s2_target_distances = self.target_network(
        self._concat_states(self.s2_ph))
    self.action_ph = tf.placeholder(tf.int32, (self.batch_size,))
    self.rewards_ph = tf.placeholder(tf.float64, (self.batch_size,))
    # We use an expanding horizon for computing the distances.
    self.bisim_horizon_ph = tf.placeholder(tf.float64, ())
    # bisimulation_target_1 = rew_diff + gamma * next_distance.
    bisimulation_target_1 = tf.stop_gradient(self._build_bisimulation_target())
    # bisimulation_target_2 = curr_distance.
    bisimulation_target_2 = tf.stop_gradient(self.s1_target_distances)
    # We slowly taper in the maximum according to the bisim horizon.
    bisimulation_target = tf.maximum(
        bisimulation_target_1, bisimulation_target_2 * self.bisim_horizon_ph)
    # We zero-out diagonal entries, since those are estimating the distance
    # between a state and itself, which we know to be 0.
    diagonal_mask = 1.0 - tf.diag(tf.ones(self.batch_size, dtype=tf.float64))
    diagonal_mask = tf.reshape(diagonal_mask, (self.batch_size**2, 1))
    bisimulation_target *= diagonal_mask
    bisimulation_estimate = self.s1_online_distances
    # We start with a mask that includes everything.
    loss_mask = tf.ones(tf.shape(bisimulation_estimate))
    # We have to enforce that states being compared are done only using the
    # same action.
    indicators = self.action_ph
    indicators = tf.cast(indicators, tf.float64)
    # indicators will initially have shape [batch_size], we first tile it:
    square_ids = tf.tile([indicators], [self.batch_size, 1])
    # We subtract square_ids from its transpose:
    square_ids = square_ids - tf.transpose(square_ids)
    # At this point all zero-entries are the ones with equal IDs.
    # Now we would like to convert the zeros in this matrix to 1s, and make
    # everything else a 0. We can do this with the following operation:
    loss_mask = 1 - tf.abs(tf.sign(square_ids))
    # Now reshape to match the shapes of the estimate and target.
    loss_mask = tf.reshape(loss_mask, (self.batch_size**2, 1))
    larger_targets = bisimulation_target - bisimulation_estimate
    larger_targets_count = tf.reduce_sum(
        tf.cast(larger_targets > 0., tf.float64))
    tf.summary.scalar('Learning/LargerTargets', larger_targets_count)
    tf.summary.scalar('Learning/NumUpdates', tf.count_nonzero(loss_mask))
    tf.summary.scalar('Learning/BisimHorizon', self.bisim_horizon_ph)
    bisimulation_loss = tf.losses.mean_squared_error(
        bisimulation_target, bisimulation_estimate, weights=loss_mask)
    tf.summary.scalar('Learning/loss', bisimulation_loss)
    # Plot average distance between sampled representations.
    average_distance = tf.reduce_mean(bisimulation_estimate)
    tf.summary.scalar('Approx/AverageDistance', average_distance)
    return optimizer.minimize(bisimulation_loss)
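# Illustrative sketch (not part of the original): the same-action mask trick used
# above, shown on a toy batch of three actions. The final mask[i, j] is 1 exactly
# when action i equals action j. The `toy_*` names are hypothetical.
toy_actions = tf.constant([0., 1., 1.], dtype=tf.float64)
toy_ids = tf.tile([toy_actions], [3, 1])     # each row is a copy of the action vector
toy_ids = toy_ids - tf.transpose(toy_ids)    # zero exactly where the two actions match
toy_mask = 1 - tf.abs(tf.sign(toy_ids))
# Evaluating toy_mask yields:
# [[1. 0. 0.]
#  [0. 1. 1.]
#  [0. 1. 1.]]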