def _train_body(self, state, next_state, action, reward, not_done, discount):
    """Perform one gradient step of Q-learning on a batch of transitions.

    The loss is the mean Huber loss (delta=2) of the TD error between the
    Q-value of the taken action and the one-step bootstrapped target
    ``reward + not_done * discount * max_a Q(next_state, a)``.  The target is
    computed with the same ``self.q_func`` and is excluded from the gradient.

    Args:
        state: Batch of current observations fed to ``self.q_func``.
        next_state: Batch of successor observations.
        action: Index of the action taken for every row of ``state``.
        reward: Reward term of the target
            (presumably shaped (batch, 1) -- confirm against the caller).
        not_done: Multiplier applied to the bootstrap term
            (presumably 0 for terminal transitions -- confirm).
        discount: Discount factor applied to the bootstrap term.

    Returns:
        Tuple ``(td_error, q_func_loss)``.
    """
    with tf.device("/cpu:0"):
        with tf.GradientTape() as tape:
            # Build (row, action) index pairs so gather_nd picks Q(s, a) per row.
            action_column = tf.cast(
                tf.expand_dims(action, axis=1), dtype=tf.int32)
            row_column = tf.expand_dims(tf.range(state.shape[0]), axis=1)
            gather_indices = tf.concat(
                values=[row_column, action_column], axis=1)

            q_all_actions = self.q_func(state)
            current_Q = tf.expand_dims(
                tf.gather_nd(q_all_actions, gather_indices), axis=1)

            # One-step target; stop_gradient keeps it constant for the update.
            next_q_all_actions = self.q_func(next_state)
            bootstrap_scale = not_done * discount
            best_next_q = tf.reduce_max(
                next_q_all_actions, keepdims=True, axis=1)
            target_Q = tf.stop_gradient(
                reward + (bootstrap_scale * best_next_q))

            td_error = current_Q - target_Q
            q_func_loss = tf.reduce_mean(huber_loss(td_error, delta=2.))

        trainable = self.q_func.trainable_variables
        q_func_grad = tape.gradient(q_func_loss, trainable)
        self.q_func_optimizer.apply_gradients(
            zip(q_func_grad, self.q_func.trainable_variables))

        return td_error, q_func_loss
def _setup_loss_graph(self, s_output_tbi, s_target_tbi, s_step_size): """ Connect a loss function to the graph See data.py for explanation of the slicing part """ s_sliced_output_tbi = s_output_tbi[-s_step_size :] s_sliced_target_tbi = s_target_tbi[-s_step_size :] if self._options['loss_type'] == 'l2': return l2_loss(s_sliced_output_tbi, s_sliced_target_tbi) if self._options['loss_type'] == 'l1': return l1_loss(s_sliced_output_tbi, s_sliced_target_tbi) if self._options['loss_type'] == 'huber': delta = self._options['huber_delta'] return huber_loss(s_sliced_output_tbi, s_sliced_target_tbi, delta) assert False, 'Invalid loss_type option' return tt.alloc(np.float32(0.))
def __init__(self, observation_type):
    """Build the Q-network weights and compile its Theano functions.

    Creates three fully connected layers (8 -> 128 -> 128 -> 4), a Huber
    loss between ``self.targets`` and the network output, and two compiled
    functions:

    * ``self.get_Q(observation)`` returns the network output.
    * ``self.train_Q(observation, targets, lr)`` applies one RMSprop
      update and returns the loss.

    Args:
        observation_type: Symbolic variable used as the network input.
    """
    layer_dims = (
        ("fc1", 8, 128),
        ("fc2", 128, 128),
        ("fc3", 128, 4),
    )
    self.weights = {
        layer: fc_init_he(fan_in, fan_out)
        for layer, fan_in, fan_out in layer_dims
    }

    self.observation = observation_type
    # Targets are assumed to be (batch size x actions) in practically all uses.
    self.targets = T.matrix()
    self.lr = T.scalar()

    q_out = self.forward(self.observation)

    # Huber loss between the target values and the predicted Q-values.
    q_loss = huber_loss(self.targets, q_out)
    rms_updates = RMSprop(cost=q_loss, params=self.get_weights(), lr=self.lr)

    self.get_Q = theano.function(
        inputs=[self.observation],
        outputs=q_out,
    )
    self.train_Q = theano.function(
        inputs=[self.observation, self.targets, self.lr],
        outputs=q_loss,
        updates=rms_updates,
    )
def __init__(self,batch_size=10,im_size=64,channels=3,dtype=tf.float32,analytics=True):
    """Build the two-domain GAN graph, its losses and the output folders.

    Two generators (``gen_AB``, ``gen_BA``) translate between domains A and
    B; each is applied a second time (with shared parameters) to
    reconstruct the original input.  One discriminator per domain scores
    real inputs and translated fakes.

    Args:
        batch_size: Stored on the instance as ``self.batch_size``.
        im_size: Height/width of the square input images; forwarded to
            every generator and discriminator.
        channels: Number of image channels of the placeholders.
        dtype: dtype of the input placeholders.
        analytics: When true, ``self.init_analytics()`` is called after the
            losses and parameter lists are built.
    """
    self.analytics = analytics
    self.batch_size = batch_size

    image_shape = [None, im_size, im_size, channels]
    self.x_a = tf.placeholder(dtype, image_shape, name='xa')
    self.x_b = tf.placeholder(dtype, image_shape, name='xb')

    # Generator networks
    self.g_ab = utils.generator(self.x_a, name="gen_AB", im_size=im_size)
    self.g_ba = utils.generator(self.x_b, name="gen_BA", im_size=im_size)

    # Secondary generator networks, reusing params of the previous two
    self.g_aba = utils.generator(self.g_ab, name="gen_BA", im_size=im_size, reuse=True)
    self.g_bab = utils.generator(self.g_ba, name="gen_AB", im_size=im_size, reuse=True)

    # Discriminator for input a
    self.disc_a_real = utils.discriminator(self.x_a, name="disc_a", im_size=im_size)
    self.disc_a_fake = utils.discriminator(self.g_ba, name="disc_a", im_size=im_size, reuse=True)

    # Discriminator for input b.  im_size is forwarded here as well so that a
    # non-default image size is honoured for both domains (it used to be
    # passed to disc_a only).
    self.disc_b_real = utils.discriminator(self.x_b, name="disc_b", im_size=im_size)
    self.disc_b_fake = utils.discriminator(self.g_ab, name="disc_b", im_size=im_size, reuse=True)

    def mean_sigmoid_ce(logits, label_like):
        # Mean sigmoid cross-entropy of `logits` against constant labels
        # produced by `label_like` (tf.ones_like or tf.zeros_like).
        return tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(
            logits=logits, labels=label_like(logits)))

    # Reconstruction loss for generators
    self.l_const_a = tf.reduce_mean(utils.huber_loss(self.g_aba, self.x_a))
    self.l_const_b = tf.reduce_mean(utils.huber_loss(self.g_bab, self.x_b))

    # Generation loss for generators: fakes should be scored as real (ones)
    self.l_gan_a = mean_sigmoid_ce(self.disc_a_fake, tf.ones_like)
    self.l_gan_b = mean_sigmoid_ce(self.disc_b_fake, tf.ones_like)

    # Real example loss for discriminators
    self.l_disc_a_real = mean_sigmoid_ce(self.disc_a_real, tf.ones_like)
    self.l_disc_b_real = mean_sigmoid_ce(self.disc_b_real, tf.ones_like)

    # Fake example loss for discriminators: fakes should be scored as zeros
    self.l_disc_a_fake = mean_sigmoid_ce(self.disc_a_fake, tf.zeros_like)
    self.l_disc_b_fake = mean_sigmoid_ce(self.disc_b_fake, tf.zeros_like)

    # Combined loss for individual discriminators
    self.l_disc_a = self.l_disc_a_real + self.l_disc_a_fake
    self.l_disc_b = self.l_disc_b_real + self.l_disc_b_fake

    # Total discriminator loss
    self.l_disc = self.l_disc_a + self.l_disc_b

    # Combined loss for individual generators
    self.l_ga = self.l_gan_a + self.l_const_b
    self.l_gb = self.l_gan_b + self.l_const_a

    # Total GAN loss
    self.l_g = self.l_ga + self.l_gb

    # Parameter lists, split by the scope name embedded in the variable name
    # (a variable whose name contains both substrings lands in both lists).
    trainable = tf.trainable_variables()
    self.disc_params = [v for v in trainable if 'disc' in v.name]
    self.gen_params = [v for v in trainable if 'gen' in v.name]

    if self.analytics:
        self.init_analytics()

    self.gen_a_dir = 'generator a->b'
    self.gen_b_dir = 'generator b->a'
    self.rec_a_dir = 'reconstruct a'
    self.rec_b_dir = 'reconstruct b'
    self.model_directory = "models"

    # Output folders are created on demand; model_directory is only recorded
    # here and is not created.
    for out_dir in (self.gen_a_dir, self.gen_b_dir, self.rec_b_dir, self.rec_a_dir):
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)

    self.sess = tf.Session()
    self.saver = tf.train.Saver()
u1 = tf.compat.v1.get_variable('weights_1_2', initializer=tf.constant(0.0))
b1 = tf.compat.v1.get_variable('bias_1', initializer=tf.constant(0.0))
w2 = tf.compat.v1.get_variable('weights_2', initializer=tf.constant(0.0))
b2 = tf.compat.v1.get_variable('bias_2', initializer=tf.constant(0.0))

# Step 4: build the models that predict Y,
# using the w, X and b defined earlier:
#   Y_predicted   - linear model
#   Y_predicted_1 - quadratic model
#   Y_predicted_2 - linear model trained with the Huber loss
Y_predicted = w * X + b
Y_predicted_1 = w1 * X * X + u1 * X + b1
Y_predicted_2 = w2 * X + b2

# Step 5: squared error for the first two models, Huber loss for the third
loss = tf.square(Y - Y_predicted, name='loss')
loss_1 = tf.square(Y - Y_predicted_1, name='loss_1')
loss_2 = utils.huber_loss(Y, Y_predicted_2)

# Step 6: minimise each loss with gradient descent, learning rate 0.001
optimizer = tf.compat.v1.train.GradientDescentOptimizer(learning_rate=0.001).minimize(loss)
optimizer_1 = tf.compat.v1.train.GradientDescentOptimizer(learning_rate=0.001).minimize(loss_1)
optimizer_2 = tf.compat.v1.train.GradientDescentOptimizer(learning_rate=0.001).minimize(loss_2)

start = time.time()

# Write the graph for TensorBoard.  The compat.v1 FileWriter is used, like
# every other TensorFlow call in this script, because tf.summary.FileWriter
# is not available under the TensorFlow 2 API.
writer = tf.compat.v1.summary.FileWriter('data/graphs/linear_reg', tf.compat.v1.get_default_graph())

with tf.compat.v1.Session() as sess:
    # Step 7: initialise all variables
    sess.run(tf.compat.v1.global_variables_initializer())
def dqn_worker(env, name, optimizer_spec, session, exploration, replay_buffer_size, batch_size, gamma, learn_start, learn_freq, history_frames_num, target_update_freq, grad_norm_clipping, stop_criterion=None):
    """Train a DQN agent that plays a browser game through a Chrome webdriver.

    Builds an online network (scope ``q_net``) and a target network (scope
    ``target_q_net``), opens ``http://localhost:3000`` in Chrome, and then
    loops: act epsilon-greedily, store the transition in a replay buffer,
    periodically train on a sampled batch and periodically copy the online
    weights into the target network.

    Args:
        env: Game wrapper; must provide ``observation_size``,
            ``action_space``, ``click_play_btn_with_name``, ``get_radius``,
            ``process_screenshot``, ``get_screenshot``, ``step`` and
            ``reset``.
        name: Player name passed to the ``env`` helpers.
        optimizer_spec: Object with ``constructor``, ``kwargs`` and
            ``lr_schedule`` (whose ``value(t)`` gives the learning rate).
        session: TensorFlow session used to run the graph.
        exploration: Schedule whose ``value(t)`` is the probability of
            taking a random action at step ``t``.
        replay_buffer_size: Capacity passed to ``Replay_buffer``.
        batch_size: Number of transitions sampled per training step.
        gamma: Discount factor.
        learn_start: Training only happens for ``t > learn_start``.
        learn_freq: Training happens when ``t % learn_freq == 0``.
        history_frames_num: Number of stacked frames per observation.
        target_update_freq: The target network is synchronised every this
            many training steps.
        grad_norm_clipping: Clip value passed to ``utils.minimize_and_clip``.
        stop_criterion: Optional callable ``(env, t) -> bool``; the loop
            ends when it returns true.  Without it the loop never ends.
    """
    #################
    # build network #
    #################
    def q_net(net_input, act_num, scope, reuse=False):
        # Three conv layers followed by two dense layers; the last layer has
        # one linear output per action.
        with tf.variable_scope(scope, reuse=reuse):
            out = net_input
            with tf.variable_scope('convnet'):
                out = layers.convolution2d(out, num_outputs=32, kernel_size=8, stride=4, activation_fn=tf.nn.relu)
                out = layers.convolution2d(out, num_outputs=64, kernel_size=4, stride=2, activation_fn=tf.nn.relu)
                out = layers.convolution2d(out, num_outputs=64, kernel_size=3, stride=1, activation_fn=tf.nn.relu)
            out = layers.flatten(out)
            with tf.variable_scope('action_value'):
                out = layers.fully_connected(out, num_outputs=512, activation_fn=tf.nn.relu)
                out = layers.fully_connected(out, num_outputs=act_num, activation_fn=None)
            return out

    ###################
    # build functions #
    ###################
    print('yeah')
    img_h, img_w, img_c = env.observation_size
    input_shape = (img_h, img_w, history_frames_num * img_c)
    act_num = len(env.action_space)

    # c : current
    # n : next
    ### set up placeholders ###
    obs_c_ph = tf.placeholder(tf.uint8, [None] + list(input_shape))
    act_c_ph = tf.placeholder(tf.int32, [None])
    rew_c_ph = tf.placeholder(tf.float32, [None])
    obs_n_ph = tf.placeholder(tf.uint8, [None] + list(input_shape))
    done_ph = tf.placeholder(tf.float32, [None])  # if next state is the end, 0. or 1.

    ### transform the observation pixels value to float between 0.~1. ###
    # The uint8 pixels are divided by 255 so the network really receives
    # values in [0, 1]; previously the cast alone left them in [0, 255].
    obs_c_float = tf.cast(obs_c_ph, tf.float32) / 255.0
    obs_n_float = tf.cast(obs_n_ph, tf.float32) / 255.0

    ### compute the TD error ###
    q_c_values = q_net(obs_c_float, act_num, scope='q_net', reuse=False)
    q_c_selected = tf.reduce_sum(q_c_values * tf.one_hot(act_c_ph, act_num), 1)
    q_n_values = q_net(obs_n_float, act_num, scope='target_q_net')
    q_n_max = tf.reduce_max(q_n_values, 1)
    q_c_selected_target = rew_c_ph + gamma * (1.0 - done_ph) * q_n_max
    td_error = q_c_selected - tf.stop_gradient(q_c_selected_target)
    errors = utils.huber_loss(td_error)
    mean_error = tf.reduce_mean(errors)

    ### collection parameters of q_net and target_q_net ###
    q_net_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='q_net')
    target_q_net_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='target_q_net')

    ### optimization function ###
    learn_rate = tf.placeholder(tf.float32, (), name='learn_rate')
    optimizer = optimizer_spec.constructor(learning_rate=learn_rate, **optimizer_spec.kwargs)
    train_fn = utils.minimize_and_clip(optimizer, mean_error, var_list=q_net_params, clip_val=grad_norm_clipping)

    ### update target q network function ###
    # Variables are paired by sorted name so each online variable is copied
    # into its counterpart in the target scope.
    update_target_fn = [
        target_param.assign(param)
        for param, target_param in zip(
            sorted(q_net_params, key=lambda p: p.name),
            sorted(target_q_net_params, key=lambda p: p.name))
    ]

    ### set up replay buffer ###
    replay_buffer = Replay_buffer(replay_buffer_size, history_frames_num)

    #######################
    # interation with env #
    #######################
    ### initialization ###
    ### webdriver setting
    options = webdriver.ChromeOptions()
    options.add_argument("--window-size=600,800")
    options.add_argument("--allow-file-access-from-files")
    options.add_argument("--disable-infobars")
    driver = webdriver.Chrome(chrome_options=options)

    # The driver is local to this function, so it is shut down here no matter
    # how the loop ends; otherwise the Chrome process would be leaked.
    try:
        driver.get('http://localhost:3000')
        driver.implicitly_wait(2)
        env.click_play_btn_with_name(name, driver)
        driver.implicitly_wait(2)
        time.sleep(1)

        model_initialized = False
        train_num = -1
        mean_episode_reward = -float('nan')
        best_mean_episode_reward = -float('inf')
        episode_reward_list = []
        radius = env.get_radius(name)
        last_obs = env.process_screenshot(driver)
        LOG_EVERY_N_STEPS = 100

        init_op = tf.global_variables_initializer()
        session.run(init_op)

        ### the counter of time steps ###
        for t in itertools.count():
            if stop_criterion is not None and stop_criterion(env, t):
                break

            idx = replay_buffer.store_frame(last_obs)
            obs_c = replay_buffer.stack_recent_obs()
            explore_prob = random.random()

            ### epsilon greedy exploration policy ###
            if explore_prob < exploration.value(t):
                action = random.randrange(act_num)
            else:
                action_values = session.run(q_c_values, feed_dict={obs_c_ph: obs_c[None]})
                action = np.argmax(action_values)

            ### step to next state ###
            obs, reward, done, info = env.step(driver, name, action)
            replay_buffer.store_transition(idx, action, reward, done)
            if done:
                env.reset(driver)
                # The episode reward is the change of the value returned by
                # env.get_radius since the previous episode boundary.
                radius_ = env.get_radius(name)
                episode_reward = radius_ - radius
                episode_reward_list.append(episode_reward)
                radius = radius_

            ### train the networks ###
            if (t > learn_start and t % learn_freq == 0 and
                    replay_buffer.can_sample(batch_size)):
                # 1.sample transitions #
                obs_batch, act_batch, rew_batch, next_obs_batch, done_batch = \
                    replay_buffer.sample(batch_size)

                # 2.initialize the model #
                if not model_initialized:
                    utils.initialize_interdependent_variables(
                        session, tf.global_variables(), {
                            obs_c_ph: obs_batch,
                            obs_n_ph: next_obs_batch,
                        })
                    model_initialized = True

                # 3.train the model #
                session.run(train_fn, feed_dict={
                    obs_c_ph: obs_batch,
                    act_c_ph: act_batch,
                    rew_c_ph: rew_batch,
                    obs_n_ph: next_obs_batch,
                    done_ph: done_batch,
                    learn_rate: optimizer_spec.lr_schedule.value(t)
                })
                train_num += 1

                # 4.update target network #
                if train_num % target_update_freq == 0:
                    session.run(update_target_fn)

            #######
            # log #
            #######
            if episode_reward_list:
                mean_episode_reward = np.mean(episode_reward_list[-100:])
            if len(episode_reward_list) > 100:
                best_mean_episode_reward = max(best_mean_episode_reward, mean_episode_reward)
            if t % LOG_EVERY_N_STEPS == 0 and model_initialized:
                print('########## log ##########')
                print('Timestep %d' % (t, ))
                print('mean reward (100 episodes) %f' % mean_episode_reward)
                print('best mean reward %f' % best_mean_episode_reward)
                print('episodes %d' % len(episode_reward_list))
                print('exploration %f' % exploration.value(t))
                print('learning_rate %f' % optimizer_spec.lr_schedule.value(t))
                # show all information on terminal before next interation begin
                sys.stdout.flush()

            # A screenshot with mean pixel value below 50 triggers a reset
            # (presumably a dark screen means the game ended -- confirm).
            frame = env.get_screenshot(driver)
            if np.mean(frame) < 50:
                env.reset(driver)
            last_obs = env.process_screenshot(driver)
    finally:
        driver.quit()
def loss_function(self, y_true, y_pred):
    """Return the squared prediction error plus a weighted Huber term.

    The Huber term is ``utils.huber_loss`` applied to
    ``self.huber_loss_init(self.optical_flow.outputs)`` with ``self.delta``
    and scaled by ``self.huber_weight``; it does not depend on ``y_true``
    or ``y_pred``.
    """
    flow_term = self.huber_loss_init(self.optical_flow.outputs)
    flow_penalty = utils.huber_loss(flow_term, self.delta)

    squared_error = K.square(y_pred - y_true)
    weighted_penalty = self.huber_weight * flow_penalty
    return squared_error + weighted_penalty
# Create Dataset and iterator dataset = tf.data.Dataset.from_tensor_slices((data[:,0], data[:,1])) iterator = dataset.make_initializable_iterator() X, Y = iterator.get_next() # Create weight and bias, initialized to 0 w = tf.get_variable('weights', initializer=tf.constant(0.0)) b = tf.get_variable('bias', initializer=tf.constant(0.0)) # Build model to predict Y Y_predicted = w * X + b # Use either square (type = 'SQUARE') or Huber (type = 'HUBER') loss function loss = tf.square(Y - Y_predicted, name='loss') if LOSS_TYPE == 'SQUARE' else utils.huber_loss(Y, Y_predicted) # Use gradient descent to minimize loss optimizer = tf.train.GradientDescentOptimizer(learning_rate=LEARNING_RATE).minimize(loss) start = time.time() with tf.Session() as sess: # Initialize variables sess.run(tf.global_variables_initializer()) # Create writer for TensorBoard writer = tf.summary.FileWriter('./graphs/linear_reg', sess.graph) # Train the model for 100 epochs for i in range(N_EPOCHS):
# Parameters, all initialised to 0:
#   bl            - bias of the linear model (its weight `wl` is defined
#                   before this point)
#   wq, uq, bq    - quadratic model
#   wlh, blh      - linear model trained with the Huber loss
bl = tf.get_variable('bias', initializer=tf.constant(0.0))
wq = tf.get_variable('weights_1', initializer=tf.constant(0.0))
uq = tf.get_variable('weights_2', initializer=tf.constant(0.0))
bq = tf.get_variable('biasq', initializer=tf.constant(0.0))
wlh = tf.get_variable('weightsh', initializer=tf.constant(0.0))
blh = tf.get_variable('biash', initializer=tf.constant(0.0))

# Predictions of the quadratic, linear and Huber-trained linear models
Y_q = wq * X * X + uq * X + bq
Y_l = wl * X + bl
Y_lh = wlh * X + blh

# Squared error for the first two models, Huber loss for the third
loss_q = tf.square(Y - Y_q, name='loss_q')
loss_l = tf.square(Y - Y_l, name='loss_l')
loss_lh = utils.huber_loss(Y, Y_lh)

# One gradient-descent training op per model, all with learning rate 0.001
optimizer_q = tf.train.GradientDescentOptimizer(
    learning_rate=0.001).minimize(loss_q)
optimizer_l = tf.train.GradientDescentOptimizer(
    learning_rate=0.001).minimize(loss_l)
optimizer_lh = tf.train.GradientDescentOptimizer(
    learning_rate=0.001).minimize(loss_lh)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # 100 epochs; every sample updates the three models one after another
    for i in range(100):
        for x, y in data:
            sess.run(optimizer_q, feed_dict={X: x, Y: y})
            sess.run(optimizer_l, feed_dict={X: x, Y: y})
            sess.run(optimizer_lh, feed_dict={X: x, Y: y})
g_ba = utils.generator(x_b,BATCH_SIZE,name="gen_BA") #Secondary generator networks, reusing params of previous two g_aba = utils.generator(g_ab,BATCH_SIZE,name="gen_BA",reuse=True) g_bab = utils.generator(g_ba,BATCH_SIZE,name="gen_AB",reuse=True) #Discriminator for input a disc_a_real = utils.discriminator(x_a,name="disc_a") disc_a_fake = utils.discriminator(g_ba,name="disc_a",reuse=True) #Discriminator for input b disc_b_real = utils.discriminator(x_b,name="disc_b") disc_b_fake = utils.discriminator(g_ab,name="disc_b",reuse=True) #Reconstruction loss for generators l_const_a = tf.reduce_mean(utils.huber_loss(g_aba,x_a)) l_const_b = tf.reduce_mean(utils.huber_loss(g_bab,x_b)) #Generation loss for generators l_gan_a = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=disc_a_fake,labels=tf.ones_like(disc_a_fake))) l_gan_b = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=disc_b_fake,labels=tf.ones_like(disc_b_fake))) #Real example loss for discriminators l_disc_a_real = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=disc_a_real,labels=tf.ones_like(disc_a_real))) l_disc_b_real = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=disc_b_real,labels=tf.ones_like(disc_b_real))) #Fake example loss for discriminators l_disc_a_fake = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=disc_a_fake,labels=tf.zeros_like(disc_a_fake))) l_disc_b_fake = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=disc_b_fake,labels=tf.zeros_like(disc_b_fake))) #Combined loss for individual discriminators
# Remember both X and Y are scalars with type float X = tf.placeholder(tf.float32, name='X') Y = tf.placeholder(tf.float32, name='Y') # Step 3: 建立 weight 和 bias, 並設定初始值 0.0 # Make sure to use tf.get_variable w = tf.get_variable('weight', initializer=tf.constant(0.0)) b = tf.get_variable('bias', initializer=tf.constant(0.0)) # Step 4: 建立 model 來預測 Y # e.g. how would you derive at Y_predicted given X, w, and b Y_predicted = w * X + b # Step 5: 利用 square error 來當作 loss function # loss = tf.square(Y - Y_predicted, name='loss') loss = utils.huber_loss(Y, Y_predicted) # 自定義的loss function # Step 6: 利用 gradient descent 搭配 learning rate 0.001 來降低 loss optimizer = tf.train.GradientDescentOptimizer( learning_rate=0.001).minimize(loss) start = time.time() # 建立 filewriter 將 model's graph 寫入 TensorBoard writer = tf.summary.FileWriter('./graphs/linear_reg', tf.get_default_graph()) with tf.Session() as sess: # Step 7: 初始化所需要的變數 w 和 b sess.run(tf.global_variables_initializer()) # Step 8: 訓練 model, 100 epochs
# Step 2: create placeholders for input X (number of fire) and label Y (number of theft) x = tf.placeholder(dtype=tf.float32, shape=(), name='x') y = tf.placeholder(dtype=tf.float32, shape=(), name='y') # Step 3: create weight and bias, initialized to 0 w = tf.Variable(initial_value=0.0, name='w') b = tf.Variable(initial_value=0.0, name='b') # Step 4: predict Y (number of theft) from the number of fire # name your variable Y_predicted y_predicted = x * w + b # Step 5: use the square error as the loss function #loss = tf.nn.l2_loss(y_predicted - y, name='loss') loss = utils.huber_loss(y, y_predicted) # Step 6: using gradient descent with learning rate of 0.01 to minimize loss optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.01).minimize(loss) # Phase 2: Train our model with tf.Session() as sess: # Step 7: initialize the necessary variables, in this case, w and b sess.run(tf.global_variables_initializer()) writer = tf.summary.FileWriter('/tmp/linear-reg', sess.graph) # Step 8: train the model for i in range(100): # run 100 epochs total_loss = 0 for train_x, train_y in data: # Session runs optimizer to minimize loss and fetch the value of loss. Name the received value as l
def loss_func(x, y):
    """Return the Huber loss (delta 10.0) between ``y`` and ``prediction(x)``."""
    predicted = prediction(x)
    return utils.huber_loss(y, predicted, 10.0)
def learn_by_dqn(env, q_net, optimizer_spec, session, exploration, replay_buffer_size, batch_size, gamma, learn_start, learn_freq, history_frames_num, target_update_freq, grad_norm_clipping, stop_criterion=None):
    """Train a DQN agent on ``env`` until ``stop_criterion`` fires.

    An online network (scope ``q_net``) and a target network (scope
    ``target_q_net``) are built with the supplied ``q_net`` factory.  Each
    step the agent acts epsilon-greedily and stores the transition in a
    replay buffer; every ``learn_freq`` steps after ``learn_start`` a batch
    is sampled and the mean Huber loss of the TD error is minimised; every
    ``target_update_freq`` training steps the online weights are copied to
    the target network.

    Args:
        env: Environment exposing ``observation_space.shape``,
            ``action_space.n``, ``action_space.sample()``, ``reset()`` and
            ``step(action)``; a wrapper named "Monitor" supplies the episode
            rewards used for logging.
        q_net: Callable ``(input, act_num, scope, reuse=False)`` returning
            one Q-value per action.
        optimizer_spec: Object with ``constructor``, ``kwargs`` and
            ``lr_schedule`` (``value(t)`` gives the learning rate).
        session: TensorFlow session used to run the graph.
        exploration: Schedule whose ``value(t)`` is the random-action
            probability at step ``t``.
        replay_buffer_size: Capacity of the replay buffer.
        batch_size: Transitions sampled per training step.
        gamma: Discount factor.
        learn_start: Training only happens for ``t > learn_start``.
        learn_freq: Training happens when ``t % learn_freq == 0``.
        history_frames_num: Number of frames stacked into one observation.
        target_update_freq: Target sync period, in training steps.
        grad_norm_clipping: Clip value for ``utils.minimize_and_clip``.
        stop_criterion: Optional callable ``(env, t) -> bool`` ending the
            loop; without it the loop runs forever.
    """
    # ------------------------------------------------------------------
    # Graph construction ("c" = current step, "n" = next step)
    # ------------------------------------------------------------------
    frame_h, frame_w, frame_c = env.observation_space.shape
    stacked_shape = (frame_h, frame_w, history_frames_num * frame_c)
    n_actions = env.action_space.n

    obs_c_ph = tf.placeholder(tf.uint8, [None] + list(stacked_shape))
    act_c_ph = tf.placeholder(tf.int32, [None])
    rew_c_ph = tf.placeholder(tf.float32, [None])
    obs_n_ph = tf.placeholder(tf.uint8, [None] + list(stacked_shape))
    # 1. when the next state is terminal, 0. otherwise
    done_ph = tf.placeholder(tf.float32, [None])

    # Scale uint8 pixels to floats in [0, 1].
    obs_c_float = tf.cast(obs_c_ph, tf.float32) / 255.0
    obs_n_float = tf.cast(obs_n_ph, tf.float32) / 255.0

    # TD error: Q(s, a) of the taken action versus the bootstrapped target
    # from the target network (excluded from the gradient).
    q_c_values = q_net(obs_c_float, n_actions, scope='q_net', reuse=False)
    taken_action_mask = tf.one_hot(act_c_ph, n_actions)
    q_c_selected = tf.reduce_sum(q_c_values * taken_action_mask, 1)
    q_n_values = q_net(obs_n_float, n_actions, scope='target_q_net')
    q_n_max = tf.reduce_max(q_n_values, 1)
    bellman_target = rew_c_ph + gamma * (1.0 - done_ph) * q_n_max
    td_error = q_c_selected - tf.stop_gradient(bellman_target)
    mean_error = tf.reduce_mean(utils.huber_loss(td_error))

    # Variables of both networks, looked up by scope.
    online_vars = tf.get_collection(
        tf.GraphKeys.GLOBAL_VARIABLES, scope='q_net')
    target_vars = tf.get_collection(
        tf.GraphKeys.GLOBAL_VARIABLES, scope='target_q_net')

    # Training op with a fed learning rate and clipped gradients.
    learn_rate = tf.placeholder(tf.float32, (), name='learn_rate')
    optimizer = optimizer_spec.constructor(
        learning_rate=learn_rate, **optimizer_spec.kwargs)
    train_fn = utils.minimize_and_clip(
        optimizer, mean_error, var_list=online_vars,
        clip_val=grad_norm_clipping)

    # Target sync ops; variables are paired by sorted name.
    by_name = lambda p: p.name
    update_target_fn = [
        dst.assign(src)
        for src, dst in zip(sorted(online_vars, key=by_name),
                            sorted(target_vars, key=by_name))
    ]

    replay_buffer = utils.Replay_buffer(replay_buffer_size, history_frames_num)

    # ------------------------------------------------------------------
    # Interaction with the environment
    # ------------------------------------------------------------------
    model_initialized = False
    train_num = -1
    mean_episode_reward = -float('nan')
    best_mean_episode_reward = -float('inf')
    last_obs = env.reset()
    LOG_EVERY_N_STEPS = 10000

    session.run(tf.global_variables_initializer())

    for t in itertools.count():
        if stop_criterion is not None and stop_criterion(env, t):
            break

        frame_idx = replay_buffer.store_frame(last_obs)
        stacked_obs = replay_buffer.stack_recent_obs()

        # Epsilon-greedy action selection.
        roll = random.random()
        if roll < exploration.value(t):
            action = env.action_space.sample()
        else:
            q_now = session.run(
                q_c_values, feed_dict={obs_c_ph: stacked_obs[None]})
            action = np.argmax(q_now)

        # Advance the environment and record the transition.
        obs, reward, done, _ = env.step(action)
        replay_buffer.store_transition(frame_idx, action, reward, done)
        last_obs = env.reset() if done else obs

        # Train on a sampled batch when the schedule allows it.
        should_train = (t > learn_start and t % learn_freq == 0
                        and replay_buffer.can_sample(batch_size))
        if should_train:
            (obs_batch, act_batch, rew_batch,
             next_obs_batch, done_batch) = replay_buffer.sample(batch_size)

            # One-off initialisation that needs real observations to feed.
            if not model_initialized:
                init_feed = {
                    obs_c_ph: obs_batch,
                    obs_n_ph: next_obs_batch,
                }
                utils.initialize_interdependent_variables(
                    session, tf.global_variables(), init_feed)
                model_initialized = True

            train_feed = {
                obs_c_ph: obs_batch,
                act_c_ph: act_batch,
                rew_c_ph: rew_batch,
                obs_n_ph: next_obs_batch,
                done_ph: done_batch,
                learn_rate: optimizer_spec.lr_schedule.value(t),
            }
            session.run(train_fn, feed_dict=train_feed)
            train_num += 1

            if train_num % target_update_freq == 0:
                session.run(update_target_fn)

        # Progress statistics over the most recent 100 episodes.
        monitor = utils.get_wrapper_by_name(env, "Monitor")
        episode_rewards = monitor.get_episode_rewards()
        if len(episode_rewards) > 0:
            mean_episode_reward = np.mean(episode_rewards[-100:])
        if len(episode_rewards) > 100:
            best_mean_episode_reward = max(
                best_mean_episode_reward, mean_episode_reward)

        if model_initialized and t % LOG_EVERY_N_STEPS == 0:
            print('##########log##########')
            print('Timestep %d' % (t, ))
            print('mean reward (100 episodes) %f' % mean_episode_reward)
            print('best mean reward %f' % best_mean_episode_reward)
            print('episodes %d' % len(episode_rewards))
            print('exploration %f' % exploration.value(t))
            print('learning_rate %f' % optimizer_spec.lr_schedule.value(t))
            print('#######################')
            # Flush so the report is visible before the next iteration starts.
            sys.stdout.flush()
# Step 2: create Dataset and iterator dataset = tf.contrib.data.Dataset.from_tensor_slices((data[:, 0], data[:, 1])) iterator = dataset.make_initializable_iterator() X, Y = iterator.get_next() # Step 3: create weight and bias, initialized to 0 w = tf.get_variable('weights', initializer=tf.constant(0.0)) b = tf.get_variable('bias', initializer=tf.constant(0.0)) # Step 4: build model to predict Y Y_predicted = X * w + b # Step 5: use the square error as the loss function # loss = tf.square(Y - Y_predicted, name='loss') loss = utils.huber_loss(Y, Y_predicted) # Step 6: using gradient descent with learning rate of 0.001 to minimize loss optimizer = tf.train.GradientDescentOptimizer( learning_rate=0.001).minimize(loss) start = time.time() with tf.Session() as sess: # Step 7: initialize the necessary variables, in this case, w and b sess.run(tf.global_variables_initializer()) writer = tf.summary.FileWriter('./graphs/linear_reg', sess.graph) # Step 8: train the model for 100 epochs for i in range(100): sess.run(iterator.initializer) # initialize the iterator total_loss = 0
w, b = tf.get_variable(initializer=tf.constant(0.0), name="w"), tf.get_variable(initializer=tf.constant(0.0), name="b") ############################# ########## TO DO ############ ############################# # Step 4: build model to predict Y # e.g. how would you derive at Y_predicted given X, w, and b Y_predicted = w * X + b ############################# ########## TO DO ############ ############################# # Step 5: use the square error as the loss function loss = utils.huber_loss(Y, Y_predicted, delta=14.0) ############################# ########## TO DO ############ ############################# # Step 6: using gradient descent with learning rate of 0.001 to minimize loss optimizer = tf.train.GradientDescentOptimizer( learning_rate=0.001).minimize(loss) start = time.time() # Create a filewriter to write the model's graph to TensorBoard ############################# ########## TO DO ############ ############################# writer = tf.summary.FileWriter('./graphs/linear_reg', tf.get_default_graph())
n_samples = sheet.nrows - 1 # Step 2: create placeholders for input X (number of fire) and label Y (number of theft) X = tf.placeholder(tf.float32, name='X') Y = tf.placeholder(tf.float32, name='Y') # Step 3: create weight and bias, initialized to 0 w = tf.Variable(0.0, name='weights') b = tf.Variable(0.0, name='bias') # Step 4: build model to predict Y Y_predicted = X * w + b # Step 5: use the square error as the loss function #loss = tf.square(Y - Y_predicted, name='loss') loss = utils.huber_loss(Y, Y_predicted) # Step 6: using gradient descent with learning rate of 0.01 to minimize loss optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.001).minimize(loss) with tf.Session() as sess: # Step 7: initialize the necessary variables, in this case, w and b sess.run(tf.global_variables_initializer()) writer = tf.summary.FileWriter('./graphs/linear_reg', sess.graph) # Step 8: train the model for i in range(100): # train the model 100 epochs total_loss = 0 for x, y in data: # Session runs train_op and fetch values of loss
dataset = tf.data.Dataset.from_tensor_slices((data[:,0], data[:,1])) #iterator = dataset.make_one_shot_iterator() iterator = dataset.make_initializable_iterator() X, Y = iterator.get_next() w = tf.get_variable('weights', initializer = tf.constant(0.0)) b = tf.get_variable('bias', initializer = tf.constant(0.0)) wh = tf.get_variable('weights_h', initializer = tf.constant(0.0)) bh = tf.get_variable('bias_h', initializer = tf.constant(0.0)) Y_predict = w *X + b Y_predicth = wh * X + bh loss = tf.square(Y - Y_predict, name = 'loss') loss_h = utils.huber_loss(Y, Y_predicth) optimizer = tf.train.GradientDescentOptimizer(learning_rate = 0.001).minimize(loss) optimizerh = tf.train.GradientDescentOptimizer(learning_rate = 0.001).minimize(loss_h) with tf.Session() as sess: sess.run(tf.global_variables_initializer()) for i in range(100): sess.run(iterator.initializer) try: while True: sess.run([optimizer, optimizerh])