def build_model_training_(self):
    # Train only the encoder's variables against the model loss.
    encoder_variables = tf.get_collection(
        tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.ENCODER_NAMESPACE
    )
    encoder_optimizer = agent_utils.get_optimizer(self.optimizer, self.learning_rate)

    self.encoder_train_step = encoder_optimizer.minimize(
        self.loss_t, global_step=self.global_step, var_list=encoder_variables
    )
def build_prior_training_(self):
    # Train only the mixture (prior) parameters against the prior loss.
    self.mixture_variables = [self.mixtures_mu_v, self.mixtures_logvar_v, self.t_v]
    mixture_optimizer = agent_utils.get_optimizer(self.optimizer, self.model_learning_rate)

    self.prior_train_step = mixture_optimizer.minimize(
        self.prior_loss_t, global_step=self.global_step, var_list=self.mixture_variables
    )
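# Usage sketch for the two training ops above, assuming a TF1 session and a
# `feed_dict` built elsewhere in this class. The alternating schedule is an
# assumption for illustration, not the class's prescribed training loop:
#
#     for _ in range(num_training_steps):
#         sess.run(self.encoder_train_step, feed_dict=feed_dict)
#         sess.run(self.prior_train_step, feed_dict=feed_dict)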
def learn_model(self, state_probs, next_state_probs, actions, rewards, dones, rlr=0.01,
                tlr=0.5, num_steps=500, ad_hoc_loss=False, batch_size=1000,
                opt=constants.OPT_ADAM):

    dataset = ArrayDataset({
        constants.STATE_PROBABILITIES: state_probs,
        constants.NEXT_STATE_PROBABILITIES: next_state_probs,
        constants.ACTIONS: actions,
        constants.REWARDS: rewards,
        constants.DONES: dones
    })

    with tf.variable_scope("learn_model"):

        # R[a, s]: expected reward for taking action a in state s.
        R = tf.get_variable(
            "reward_matrix", shape=(len(self.actions), len(self.states)), dtype=tf.float32,
            initializer=tf.random_uniform_initializer(minval=0, maxval=1, dtype=tf.float32)
        )
        # T[a, s, s']: transition logits; softmax over axis 2 gives P(s' | s, a).
        T = tf.get_variable(
            "transition_matrix",
            shape=(len(self.actions), len(self.states), len(self.states)), dtype=tf.float32,
            initializer=tf.random_uniform_initializer(minval=-1, maxval=1, dtype=tf.float32)
        )

        T_softmax = tf.nn.softmax(T, axis=2)
        T_logsoftmax = tf.nn.log_softmax(T, axis=2)

        state_probs_pl = tf.placeholder(dtype=tf.float32, name="state_probs_pl")
        next_state_probs_pl = tf.placeholder(dtype=tf.float32, name="next_state_probs_pl")
        actions_pl = tf.placeholder(dtype=tf.int32, name="actions_pl")
        rewards_pl = tf.placeholder(dtype=tf.float32, name="rewards_pl")
        dones_pl = tf.placeholder(dtype=tf.bool, name="dones_pl")

        # Select the reward and transition rows for the action taken in each sample.
        R_gather = tf.gather(R, actions_pl)
        T_softmax_gather = tf.gather(T_softmax, actions_pl)
        T_logsoftmax_gather = tf.gather(T_logsoftmax, actions_pl)

        if ad_hoc_loss:
            # Squared-error losses on the expected reward and on the predicted
            # next-state distribution under the current state beliefs.
            reward_loss = 0.5 * tf.reduce_mean(
                tf.square(rewards_pl - tf.reduce_sum(R_gather * state_probs_pl, axis=1)),
                axis=0
            )
            transition_loss = 0.5 * tf.reduce_mean(tf.reduce_sum(
                tf.square(
                    next_state_probs_pl - tf.matmul(
                        tf.transpose(T_softmax_gather, perm=[0, 2, 1]),
                        state_probs_pl[:, :, tf.newaxis]
                    )[:, :, 0]
                ), axis=1
            ), axis=0)
        else:
            # Expected squared reward error under the state beliefs, and the
            # negative expected transition log-likelihood, with terminal
            # transitions masked out.
            reward_loss = 0.5 * tf.reduce_mean(
                tf.reduce_sum(
                    tf.square(rewards_pl[:, tf.newaxis] - R_gather) * state_probs_pl, axis=1
                ), axis=0
            )
            transition_loss = - tf.reduce_mean(tf.reduce_sum(
                state_probs_pl[:, :, tf.newaxis] * next_state_probs_pl[:, tf.newaxis, :] *
                T_logsoftmax_gather, axis=[1, 2]
            ) * (1 - tf.cast(dones_pl, tf.float32)), axis=0)

        # Separate optimizers (and learning rates) for the reward and transition models.
        R_step = utils.get_optimizer(opt, rlr).minimize(reward_loss)
        T_step = utils.get_optimizer(opt, tlr).minimize(transition_loss)

    losses = []
    epoch_size = max(len(state_probs) // batch_size, 1)

    with tf.Session() as sess:
        sess.run(tf.variables_initializer(
            tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="learn_model")
        ))

        for i in range(num_steps):
            epoch_step = i % epoch_size

            # Reshuffle at the start of each pass over the dataset.
            if epoch_step == 0:
                dataset.shuffle()

            b = np.index_exp[epoch_step * batch_size:(epoch_step + 1) * batch_size]

            _, _, tmp_reward_loss, tmp_transition_loss = sess.run(
                [R_step, T_step, reward_loss, transition_loss],
                feed_dict={
                    state_probs_pl: dataset[constants.STATE_PROBABILITIES][b],
                    next_state_probs_pl: dataset[constants.NEXT_STATE_PROBABILITIES][b],
                    actions_pl: dataset[constants.ACTIONS][b],
                    rewards_pl: dataset[constants.REWARDS][b],
                    dones_pl: dataset[constants.DONES][b]
                }
            )

            losses.append([tmp_reward_loss, tmp_transition_loss])

        # Store the learned matrices as (state, action) and (state, state', action).
        self.rewards = np.transpose(sess.run(R), axes=[1, 0])
        self.transitions = np.transpose(sess.run(T_softmax), axes=[1, 2, 0])

    losses = np.stack(losses, axis=0)

    return losses
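# Sanity-check sketch (not part of the original class): verifies in NumPy that
# the batched matmul used in the ad-hoc transition loss computes the predicted
# next-state distribution sum_i b(s_i) * P(s_j | s_i, a), i.e. the einsum
# 'bij,bi->bj'. All names below are local to this check.
def _check_transition_prediction():
    import numpy as np

    rng = np.random.RandomState(0)
    batch, n_states = 4, 3

    # Random batched transition matrices with rows normalized to sum to one,
    # mirroring the softmax over axis 2 above.
    T = rng.rand(batch, n_states, n_states)
    T /= T.sum(axis=2, keepdims=True)

    # Random state beliefs, one distribution per sample.
    s = rng.rand(batch, n_states)
    s /= s.sum(axis=1, keepdims=True)

    pred_matmul = np.matmul(np.transpose(T, (0, 2, 1)), s[:, :, None])[:, :, 0]
    pred_einsum = np.einsum('bij,bi->bj', T, s)

    assert np.allclose(pred_matmul, pred_einsum)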