def get_optimizer(learning_rate, hparams):
  """Get the tf.train.Optimizer for this optimizer string.

  Args:
    learning_rate: The learning_rate tensor.
    hparams: tf.contrib.training.HParams object with the optimizer and
        momentum values.

  Returns:
    optimizer: The tf.train.Optimizer based on the optimizer string.
  """
  return {
      "rmsprop": tf.train.RMSPropOptimizer(
          learning_rate, decay=0.95, momentum=hparams.momentum, epsilon=1e-4),
      "adam": tf.train.AdamOptimizer(
          learning_rate, beta1=0.9, beta2=0.999, epsilon=1e-8),
      "adagrad": tf.train.AdagradOptimizer(
          learning_rate, initial_accumulator_value=1.0),
      "mom": tf.train.MomentumOptimizer(
          learning_rate, momentum=hparams.momentum),
      "sgd": tf.train.GradientDescentOptimizer(learning_rate)
  }.get(hparams.optimizer)
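# A minimal usage sketch (an assumption, not from the original): build an
# HParams object carrying the optimizer name and momentum, then look up the
# matching tf.train.Optimizer. The hyperparameter values are illustrative.
hparams = tf.contrib.training.HParams(optimizer="rmsprop", momentum=0.9)
learning_rate = tf.constant(0.001)
optimizer = get_optimizer(learning_rate, hparams)  # -> tf.train.RMSPropOptimizer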
def build_optimize_graph(self, loss):
  items_to_train = float(self.items_per_epoch * self.num_epochs)  # currently unused
  global_step = tf.Variable(0, name="global_step", trainable=False)
  self.global_step = global_step
  learning_rate = 0.001 * self.learning_rate
  optimizer = tf.train.GradientDescentOptimizer(learning_rate)
  train = optimizer.minimize(
      loss, global_step=self.global_step, gate_gradients=optimizer.GATE_NONE)
  return train
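# A minimal sketch of driving build_optimize_graph (hedged: `model` and `loss`
# are stand-ins for an instance carrying items_per_epoch, num_epochs and
# learning_rate, and a scalar loss tensor built elsewhere).
train_op = model.build_optimize_graph(loss)
with tf.Session() as sess:
  sess.run(tf.global_variables_initializer())
  _, step = sess.run([train_op, model.global_step])  # one SGD step; global_step -> 1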
def word2vec(batch_gen):
    """ Build the graph for word2vec model and train it """
    # Step 1: define the placeholders for input and output
    # center_words have to be int to work on embedding lookup
    # TO DO
    with tf.name_scope('data'):
        center_words = tf.placeholder(tf.int32, [BATCH_SIZE], name='center_words')
        target_words = tf.placeholder(tf.int32, [BATCH_SIZE, 1], name='target_words')

    # Step 2: define weights. In word2vec, it's actually the weights that we care about
    # vocab size x embed size
    # initialized to random uniform -1 to 1
    # TO DO
    with tf.name_scope('embedding_matrix'):
        embed_matrix = tf.Variable(
            tf.random_uniform([VOCAB_SIZE, EMBED_SIZE], -1.0, 1.0),
            name='embed_matrix')

    # Step 3: define the inference
    # get the embed of input words using tf.nn.embedding_lookup
    # TO DO
    with tf.name_scope('loss'):
        embed = tf.nn.embedding_lookup(embed_matrix, center_words, name='embed')

        # Step 4: construct variables for NCE loss
        # tf.nn.nce_loss(weights, biases, labels, inputs, num_sampled, num_classes, ...)
        # nce_weight (vocab size x embed size), initialized to truncated_normal
        # with stddev=1.0 / (EMBED_SIZE ** 0.5); bias: vocab size, initialized to 0
        # TO DO
        nce_weights = tf.Variable(
            tf.truncated_normal([VOCAB_SIZE, EMBED_SIZE],
                                stddev=1.0 / (EMBED_SIZE ** 0.5)),
            name='nce_weights')
        nce_biases = tf.Variable(tf.zeros([VOCAB_SIZE]), name='nce_biases')

        # define loss function to be NCE loss function
        # need to get the mean across the batch
        # note: you should use embedding of center words for inputs, not center words themselves
        # TO DO
        nce_loss = tf.nn.nce_loss(weights=nce_weights,
                                  biases=nce_biases,
                                  labels=target_words,
                                  inputs=embed,
                                  num_sampled=NUM_SAMPLED,
                                  num_classes=VOCAB_SIZE,
                                  name='loss')
        loss = tf.reduce_mean(nce_loss)

    # Step 5: define optimizer
    # TO DO
    optimizer = tf.train.GradientDescentOptimizer(LEARNING_RATE).minimize(loss)

    with tf.Session() as sess:
        # TO DO: initialize variables
        sess.run(tf.global_variables_initializer())

        total_loss = 0.0  # we use this to calculate the average loss in the last SKIP_STEP steps
        writer = tf.summary.FileWriter('./graphs/no_frills/', sess.graph)
        for index in range(NUM_TRAIN_STEPS):
            centers, targets = next(batch_gen)
            # TO DO: create feed_dict, run optimizer, fetch loss_batch
            _, loss_batch = sess.run([optimizer, loss],
                                     feed_dict={center_words: centers,
                                                target_words: targets})
            total_loss += loss_batch
            if (index + 1) % SKIP_STEP == 0:
                print('Average loss at step {}: {:5.1f}'.format(
                    index, total_loss / SKIP_STEP))
                total_loss = 0.0
        writer.close()
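# A minimal sketch of the batch generator word2vec consumes -- an assumption,
# since batch_gen is built elsewhere in the tutorial. It must yield
# (centers, targets) pairs shaped [BATCH_SIZE] and [BATCH_SIZE, 1].
import numpy as np

def dummy_batch_gen():
    while True:
        centers = np.random.randint(0, VOCAB_SIZE, size=BATCH_SIZE)
        targets = np.random.randint(0, VOCAB_SIZE, size=(BATCH_SIZE, 1))
        yield centers, targets

word2vec(dummy_batch_gen())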
def kfac_optimizer(model_creator):
  stats_batch_size = 10000
  main_batch_size = 10000

  stats_model, loss, labels = model_creator(stats_batch_size)  # replace labels_node with synthetic labels
  main_model, _, _ = model_creator(main_batch_size)
  opt = tf.train.GradientDescentOptimizer(0.2)
  grads_and_vars = opt.compute_gradients(loss)
  trainable_vars = tf.trainable_variables()

  # create SVD and preconditioning variables for matmul vars
  for var in trainable_vars:
    if var not in matmul_registry:
      continue
    dW[var] = u.extract_grad(grads_and_vars, var)  # TF-computed gradient
    A[var] = get_activations(var)
    B[var] = get_backprops(var)
    B2[var] = get_backprops2(var)  # get backprops with synthetic labels
    # todo: sort out dsize division
    cov_A[var] = init_var(A[var] @ t(A[var]) / dsize, "cov_A_%s" % (var.name,))
    cov_B2[var] = init_var(B2[var] @ t(B2[var]) / dsize, "cov_B2_%s" % (var.name,))
    vars_svd_A[var] = SvdWrapper(cov_A[var], "svd_A_%s" % (var.name,))
    vars_svd_B2[var] = SvdWrapper(cov_B2[var], "svd_B2_%s" % (var.name,))
    whitened_A = u.pseudo_inverse2(vars_svd_A[var]) @ A[var]
    whitened_B2 = u.pseudo_inverse2(vars_svd_B2[var]) @ B[var]
    whitened_A_stable = u.pseudo_inverse_sqrt2(vars_svd_A[var]) @ A[var]
    whitened_B2_stable = u.pseudo_inverse_sqrt2(vars_svd_B2[var]) @ B[var]
    pre_dW[var] = (whitened_B2 @ t(whitened_A)) / dsize
    pre_dW_stable[var] = (whitened_B2_stable @ t(whitened_A_stable)) / dsize
    dW[var] = (B[var] @ t(A[var])) / dsize  # manual gradient, overrides the TF-computed one above

  # create update params ops
  # new_grads_and_vars = []
  # for grad, var in grads_and_vars:
  #   if var in kfac_registry:
  #     pre_A, pre_B = kfac_registry[var]
  #     new_grad_live = pre_B @ grad @ t(pre_A)
  #     new_grads_and_vars.append((new_grad_live, var))
  #     print("Preconditioning %s" % (var.name))
  #   else:
  #     new_grads_and_vars.append((grad, var))
  # train_op = opt.apply_gradients(new_grads_and_vars)

  # Each variable has an associated gradient, pre_gradient, variable save op
  def update_grad():
    ops = [grad_update_ops[var] for var in trainable_vars]
    sess.run(ops)

  def update_pre_grad():
    ops = [pre_grad_update_ops[var] for var in trainable_vars]
    sess.run(ops)

  def update_pre_grad2():
    ops = [pre_grad2_update_ops[var] for var in trainable_vars]
    sess.run(ops)

  def save_params():
    ops = [var_save_ops[var] for var in trainable_vars]
    sess.run(ops)

  for step in range(num_steps):
    update_covariances()
    if step % whitened_every_n_steps == 0:
      update_svds()

    update_grad()
    update_pre_grad()   # perf todo: update only one of these
    update_pre_grad2()  # stable alternative

    lr0, loss0 = sess.run([lr, loss])
    save_params()

    # when grad norm < 1, Fisher is unstable, switch to sqrt(Fisher)
    # TODO: switch to per-matrix normalization
    stabilized_mode = grad_norm.eval() < 1
    if stabilized_mode:
      update_params2()
    else:
      update_params()

    loss1 = loss.eval()
    advance_batch()

    # line search stuff
    target_slope = (-pre_grad_dot_grad.eval() if stabilized_mode
                    else -pre_grad_stable_dot_grad.eval())
    target_delta = lr0 * target_slope
    actual_delta = loss1 - loss0
    actual_slope = actual_delta / lr0
    slope_ratio = actual_slope / target_slope  # between 0 and 1.01

    losses.append(loss0)
    step_lengths.append(lr0)
    ratios.append(slope_ratio)

    if step % report_frequency == 0:
      print("Step %d loss %.2f, target decrease %.3f, actual decrease %.3f "
            "ratio %.2f grad norm: %.2f pregrad norm: %.2f" %
            (step, loss0, target_delta, actual_delta, slope_ratio,
             grad_norm.eval(), pre_grad_norm.eval()))
    u.record_time()
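# A minimal, self-contained sketch of the gradient-preconditioning pattern the
# commented-out block above describes (hedged: a kfac_registry mapping each
# variable to its (pre_A, pre_B) factors is assumed to exist; this is not the
# code path the function actually runs).
def precondition_and_apply(opt, loss, kfac_registry):
  new_grads_and_vars = []
  for grad, var in opt.compute_gradients(loss):
    if var in kfac_registry:
      pre_A, pre_B = kfac_registry[var]
      # sandwich the raw gradient between the two factors: pre_B @ grad @ pre_A^T
      grad = tf.matmul(pre_B, tf.matmul(grad, pre_A, transpose_b=True))
    new_grads_and_vars.append((grad, var))
  return opt.apply_gradients(new_grads_and_vars)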
# X, x_data, y_data and nb_classes are assumed to be defined earlier in the
# script; X must be a [None, 16] float placeholder to match W below.
X = tf.placeholder(tf.float32, [None, 16])
Y = tf.placeholder(tf.int32, [None, 1])
Y_one_hot = tf.one_hot(Y, nb_classes)                # shape [None, 1, nb_classes]
Y_one_hot = tf.reshape(Y_one_hot, [-1, nb_classes])  # -1 means everything

W = tf.Variable(tf.random_normal([16, nb_classes]), name='weight')
b = tf.Variable(tf.random_normal([nb_classes]), name='bias')

logits = tf.matmul(X, W) + b
hypothesis = tf.nn.softmax(logits)

# Cross_Entropy
cost_i = tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=Y_one_hot)
cost = tf.reduce_mean(cost_i)
optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.1).minimize(cost)

prediction = tf.argmax(hypothesis, 1)  # Find the maximum value from the array
correct_prediction = tf.equal(prediction, tf.argmax(Y_one_hot, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for step in range(2001):
        sess.run(optimizer, feed_dict={X: x_data, Y: y_data})
        if step % 100 == 0:
            loss, acc = sess.run([cost, accuracy],
                                 feed_dict={X: x_data, Y: y_data})
            print("Step: {:5}\tLoss: {:.3f}\tAcc: {:.2%}".format(step, loss, acc))
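    # A possible follow-up (an assumption, not part of the original fragment):
    # still inside the session, compare per-sample predictions against the
    # true labels once training finishes. Assumes y_data is a numpy array.
    pred = sess.run(prediction, feed_dict={X: x_data})
    for p, y in zip(pred, y_data.flatten()):
        print("[{}] Prediction: {} True Y: {}".format(p == int(y), p, int(y)))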
import tensorflow as tf
import numpy as np

data = np.array([[1, 1], [2, 2], [3, 3], [4, 4]], dtype=np.float32)

# defining variables and constants
x = tf.placeholder(tf.float32)
y = tf.placeholder(tf.float32)
m = tf.Variable(0.0)
b = tf.Variable(0.0)

# prediction, loss and others
prediction = m * x + b
loss = tf.reduce_sum(tf.square(y - prediction))
optimizer = tf.train.GradientDescentOptimizer(0.01).minimize(loss)  # learning rate chosen arbitrarily

session = tf.Session()
# initialize variables
session.run(tf.global_variables_initializer())

# iterate over all data points to minimize loss
# find the new values of slope and intercept
for i in range(100):  # iterate
    session.run(optimizer, feed_dict={x: data[:, 0], y: data[:, 1]})
print(session.run([m, b]))

# do for actual dataset, house prices
# push to github in a tf repo
import tensorflow as tf

# model: parameters, input and output
W = tf.Variable([0.3], dtype=tf.float32)
b = tf.Variable([-0.3], dtype=tf.float32)
x = tf.placeholder(tf.float32)
y = tf.placeholder(tf.float32)
linear_model = W * x + b

# loss & optimizer
loss = tf.reduce_sum(tf.square(linear_model - y))
optimizer = tf.train.GradientDescentOptimizer(0.01)
train = optimizer.minimize(loss)

x_train = [1, 2, 3, 4]
y_train = [0, -1, -2, -3]

# train the linear regression model
sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)
for i in range(1000):
    sess.run(train, feed_dict={x: x_train, y: y_train})

# Evaluate training accuracy
curr_W, curr_b, curr_loss = sess.run([W, b, loss],
                                     feed_dict={x: x_train, y: y_train})
print(curr_W, curr_b, curr_loss)