def clipped_masked_error(args):
    """Return the masked Huber loss summed over the last axis.

    Args:
        args: A three-item sequence unpacked, in this order, as
            ``(y_pred, y_true, mask)``.

    Returns:
        The result of ``K.sum(..., axis=-1)`` applied to the Huber loss
        after it has been multiplied element-wise by ``mask``.

    Note:
        ``huber_loss``, ``K`` and ``self`` (for ``self.max_grad``) are taken
        from the enclosing scope; they are not parameters of this function.
    """
    y_pred, y_true, mask = args
    weighted_loss = huber_loss(
        y_true=y_true,
        y_pred=y_pred,
        max_grad=self.max_grad,
    )
    # Weight every loss entry by the mask before reducing.
    weighted_loss *= mask
    return K.sum(weighted_loss, axis=-1)
# NOTE(review): the line below is a whitespace-collapsed fragment of a training
# script. Because it begins with '#', Python reads the whole line as a comment,
# and its last statement is cut off after the trailing backslash. Restore the
# original line breaks (and the missing tail) before relying on it.
#
# What the embedded statements do, in order:
#   * output_Q adds output_Q1 and output_Q2 (used for action selection, per the
#     original comment).
#   * index1argmax / index2argmax take the argmax over axis 1 of
#     next_x @ weight1 + bias1 and next_x @ weight2 + bias2 respectively.
#   * y_true1 gathers from the *other* pair's values (weight2 / bias2) at
#     index1, multiplies by gamma * (1 - terminal) and adds r; y_true2 mirrors
#     this with weight1 / bias1 and index2. y_pred1 / y_pred2 gather
#     output_Q1 / output_Q2 at action.
#   * loss1 / loss2 are the mean of objectives.huber_loss for each pair, and
#     each is minimized by its own tf.train.AdamOptimizer(alpha).
#   * After the variables are initialized and Q4Saver(..., 'P0', sess) is
#     called, the outer loop runs while update_counter < num_iteration and
#     resets env at the start of every pass.
# NOTE(review): index1 and index2 are not assigned anywhere in this fragment
# (only index1argmax / index2argmax are) -- presumably placeholders defined
# elsewhere; confirm.
#output_Q is the sum of 2 Q and created for action selection: output_Q = tf.add(output_Q1, output_Q2) #The following are just for the convenience of calculating the loss #Since I have 2 losses in this case, I need to make 2 pairs of the target and #prediction: index1argmax = tf.argmax(tf.matmul(next_x,weight1) + bias1, axis = 1) y_true1 = tf.gather_nd(tf.matmul(next_x, weight2) + bias2, index1 ) * gamma * (1-terminal) + r y_pred1 = tf.gather_nd(output_Q1, action) index2argmax = tf.argmax(tf.matmul(next_x,weight2) + bias2, axis = 1) y_true2 = tf.gather_nd(tf.matmul(next_x, weight1) + bias1, index2 ) * gamma * (1-terminal) + r y_pred2 = tf.gather_nd(output_Q2, action) #create the loss function and set up the training step: #samples and form a batch to calculate the 2 kinds of loss #along with the 2 training steps loss1 = tf.reduce_mean(objectives.huber_loss(y_true1, y_pred1)) loss2 = tf.reduce_mean(objectives.huber_loss(y_true2, y_pred2)) train_step1 = tf.train.AdamOptimizer(alpha).minimize(loss1) train_step2 = tf.train.AdamOptimizer(alpha).minimize(loss2) sess.run(tf.global_variables_initializer()) #==================DOING THE TRAINING LOOP========================= #NOTE: if current number of samples is less than the batch size I will create #the batch first by performing random actions to get enough samples. update_counter = 0 rewardOneEpisode = 0 Q4Saver(weight1,bias1,weight2,bias2, 'P0', sess) while update_counter < num_iteration: env.reset() for j in range(batch_size, Max_TimeStep): stateBatch, actionBatch, rewardBatch, nextStateBatch, terminalBatch = \
# Linear Q-function approximator. The online parameters are trainable
# variables; the target parameters are placeholders, so their values have to
# be fed on every run that evaluates target_Q.
# NOTE(review): r, gamma, terminal, action, alpha, sess, env and the helper
# functions used below are defined outside this fragment.
x = tf.placeholder(
    tf.float32, shape=[None, num_frame_skip * 84 * 84], name='x')
next_x = tf.placeholder(
    tf.float32, shape=[None, num_frame_skip * 84 * 84], name='next_x')

online_weight = tf.Variable(
    tf.truncated_normal([num_frame_skip * 84 * 84, output_num], stddev=0.1),
    name='online_weight')
target_weight = tf.placeholder(
    dtype=tf.float32,
    shape=[num_frame_skip * 84 * 84, output_num],
    name='target_weight')
online_bias = tf.Variable(
    tf.zeros([output_num]), dtype=tf.float32, name='online_bias')
target_bias = tf.placeholder(
    dtype=tf.float32, shape=[output_num], name='target_bias')

output_Q = tf.matmul(x, online_weight) + online_bias
target_Q = tf.matmul(next_x, target_weight) + target_bias

# Bootstrapped target: reduce over the action axis only, so that every
# transition in the batch gets its own max over actions of target_Q. Without
# `axis` the max would be taken over the whole batch and all samples would
# share a single value.
# NOTE(review): assumes r and terminal are rank-1 [batch] tensors so that
# y_true lines up with y_pred -- confirm against their definitions.
y_true = r + gamma * tf.reduce_max(target_Q, axis=1) * (1 - terminal)
y_pred = tf.gather_nd(output_Q, action)

# Create the loss function and set up the training step; the loss is the mean
# Huber loss over a sampled batch.
loss = tf.reduce_mean(objectives.huber_loss(y_true, y_pred))
train_step = tf.train.AdamOptimizer(alpha).minimize(loss)
sess.run(tf.global_variables_initializer())

# Training loop (batch-update version).
# NOTE(review): the visible fragment appears to stop partway through the inner
# loop; update_counter is not advanced and target_w / target_b are not consumed
# in the part shown here.
update_counter = 0
Q2Saver(online_weight, online_bias, 'P0', sess)
while update_counter < num_iteration:
    env.reset()
    rewardOneEpisode = 0  # reward obtained in the current episode
    for j in range(batch_size, Max_TimeStep):
        # Snapshot the current online parameters as plain arrays.
        target_w = online_weight.eval(session=sess)
        target_b = online_bias.eval(session=sess)
        (stateBatch, actionBatch, rewardBatch, nextStateBatch,
         terminalBatch, recentReward, break_flag) = getTrainingBatch(
             env, batch_size, output_Q, LinearPolicy, sample_queue, sess)
import numpy as np
import objectives
import operator
import scipy.special
import tensorflow as tf

# Sanity check for objectives.huber_loss and objectives.mean_huber_loss: both
# are evaluated on a fixed 4-element example and compared with
# scipy.special.huber(1, y_true - y_pred) and the mean of that result.
y_true_ph = tf.placeholder(tf.float32, shape=(4,))
y_pred_ph = tf.placeholder(tf.float32, shape=(4,))
huber_loss_tensor = objectives.huber_loss(y_true_ph, y_pred_ph)
mean_huber_loss_tensor = objectives.mean_huber_loss(y_true_ph, y_pred_ph)

sess = tf.Session()
with sess.as_default():
    y_true = [1, 2, 3, 4]
    y_pred = [1, 2.5, 4, 33]

    # Reference values: element-wise residuals passed to SciPy's Huber loss
    # with delta = 1.
    expected = scipy.special.huber(1, list(map(operator.sub, y_true, y_pred)))
    expected_mean = np.mean(expected)

    sess.run(tf.global_variables_initializer())
    feed_dict = {y_true_ph: y_true, y_pred_ph: y_pred}

    # The graph computes in float32 (the placeholders' dtype) while the SciPy
    # reference is float64, so compare with a tolerance instead of `==`.
    output = sess.run(huber_loss_tensor, feed_dict=feed_dict)
    print(np.isclose(output, expected))

    output = sess.run(mean_huber_loss_tensor, feed_dict=feed_dict)
    print(np.isclose(output, expected_mean))