def _ModelFn(features, labels, mode):
    """Estimator model function: logits, loss, accuracy and a per-mode spec.

    Reads ``is_training``, ``self``, ``use_trt``, ``batch_size`` and
    ``model_dir`` from the enclosing scope.

    Args:
        features: Input tensor, fed to the freshly built graph or mapped onto
            ``INPUT_NODE_NAME`` of the imported GraphDef.
        labels: Sparse class labels used for the loss and the accuracy metric.
        mode: A ``ModeKeys`` value.

    Returns:
        An ``EstimatorSpec`` for ``ModeKeys.TRAIN`` or ``ModeKeys.EVAL``;
        ``None`` for any other mode.
    """
    if not is_training:
        # Inference: splice the features into a previously produced GraphDef
        # and pick up its output tensor.
        imported = importer.import_graph_def(
            self._GetGraphDef(use_trt, batch_size, model_dir),
            input_map={INPUT_NODE_NAME: features},
            return_elements=[OUTPUT_NODE_NAME + ':0'],
            name='')
        logits_out = imported[0]
    else:
        logits_out = self._BuildGraph(features)

    loss = losses.sparse_softmax_cross_entropy(
        labels=labels, logits=logits_out)
    summary.scalar('loss', loss)

    classes_out = math_ops.argmax(logits_out, axis=1, name='classes_out')
    accuracy = metrics.accuracy(
        labels=labels, predictions=classes_out, name='acc_op')
    # accuracy is a (value, update_op) pair; the summary tracks element 1.
    summary.scalar('accuracy', accuracy[1])

    if mode == ModeKeys.TRAIN:
        train_op = AdamOptimizer(learning_rate=1e-2).minimize(
            loss, global_step=get_global_step())
        return EstimatorSpec(mode, loss=loss, train_op=train_op)
    if mode == ModeKeys.EVAL:
        return EstimatorSpec(
            mode, loss=loss, eval_metric_ops={'accuracy': accuracy})
    return None
def add_optimizer(loss):
    """Return an Adam update op for ``loss`` and log each gradient as a histogram.

    A fresh non-trainable step counter is created and handed to
    ``apply_gradients`` as the global step.

    Args:
        loss: Scalar tensor to minimise.

    Returns:
        The op produced by ``AdamOptimizer.apply_gradients``.
    """
    step_counter = tf.Variable(0, trainable=False)
    adam = AdamOptimizer()
    gradient_pairs = adam.compute_gradients(loss)
    for gradient, variable in gradient_pairs:
        # Variables the loss does not depend on come back with no gradient.
        if gradient is None:
            continue
        tf.histogram_summary(variable.op.name + '/gradients', gradient)
    return adam.apply_gradients(gradient_pairs, step_counter)
def test_optimizer_garbage_collection(self):
    """A tracked TFOptimizer and its graph are collectable once dereferenced."""
    graph = ops.Graph()
    with graph.as_default():
        optimizer = keras.optimizers.TFOptimizer(AdamOptimizer(0.01))
        keras.backend.track_tf_optimizer(optimizer)
        optimizer_ref = weakref.ref(optimizer)
    graph_ref = weakref.ref(graph)

    # Drop the only strong references held by this test, then force a
    # collection pass.
    del graph, optimizer
    gc.collect()

    # Both weak references must be dead now (graph checked first).
    for dead_ref in (graph_ref, optimizer_ref):
        self.assertIsNone(dead_ref())
def __init__(self, inputs, network, check_point="dqn.ckpt"):
    """Wire the training ops, summaries and session around an existing network.

    Args:
        inputs: Stored on ``self.inputs``; not otherwise used here.
        network: Output tensor regressed onto ``self.targets`` with a
            mean-squared-error loss.
        check_point: Path restored into the session when it exists on disk.
    """
    # NOTE(review): the saver is created before the optimizer, so it only
    # covers variables that already exist at this point - confirm intended.
    self.saver = tf.train.Saver()
    self.summary_writer = tf.summary.FileWriter("/tmp/dqn")
    self.inputs = inputs
    self.network = network

    target_width = self.output_shape[1]
    self.targets = tf.placeholder(tf.float32, shape=(None, target_width))

    # One float placeholder per reported quantity; "actions" is logged as a
    # histogram, everything else as a scalar.
    histogram_tag = "actions"
    scalar_tags = ["loss", "exploration_rate", "fruits_eaten",
                   "timesteps_survived"]
    self.summary_placeholders = {}
    for tag in [histogram_tag] + scalar_tags:
        self.summary_placeholders[tag] = tf.placeholder(dtype=tf.float32)
    tf.summary.histogram(histogram_tag,
                         self.summary_placeholders[histogram_tag])
    for tag in scalar_tags:
        tf.summary.scalar(tag, self.summary_placeholders[tag])
    self.summary_ops = tf.summary.merge_all()

    self.loss = tf.losses.mean_squared_error(self.network, self.targets)
    self.train_step = AdamOptimizer().minimize(loss=self.loss)

    self.sess = tf.Session()
    self.summary_writer.add_graph(tf.get_default_graph())
    with self.sess.as_default():
        tf.global_variables_initializer().run()
        # NOTE(review): checks for a file literally named `check_point`;
        # verify this matches how the checkpoint is written.
        if os.path.exists(check_point):
            self.saver.restore(self.sess, check_point)
def get(self, name=None, lr_decay=None, global_step=None):
    """Build the TensorFlow optimizer selected by ``self.opt_name``.

    A non-trainable variable ``opt/lr`` holds the configured learning rate.
    With ``lr_decay`` given, ``self.lr_op`` becomes an assign op that writes
    the decayed rate into that variable; otherwise it is the variable itself.
    Either way ``self.lr_op`` is what the optimizer receives as its
    ``learning_rate``.

    Args:
        name: Optional optimizer name; omitted from the constructor call when
            ``None`` so the optimizer's own default applies.
        lr_decay: Optional callable invoked as
            ``lr_decay(learning_rate=..., global_step=..., name='lr_decay')``.
        global_step: Passed through to ``lr_decay``.

    Returns:
        An ``AdamOptimizer``, ``AdadeltaOptimizer``, ``RMSPropOptimizer`` or
        ``MomentumOptimizer`` built from a copy of ``self.params``.

    Raises:
        NotImplementedError: If ``self.opt_name`` is not a supported name.
        KeyError: If ``self.params`` has no ``'learning_rate'`` entry.
    """
    supported = ("Adam", "Adadelta", "RMSprop", "Momentum")
    if self.opt_name not in supported:
        # Reject unknown names before touching the graph. This used to be
        # `raise NotImplemented()`, which fails with TypeError because
        # NotImplemented is a constant, not an exception class.
        raise NotImplementedError(
            "Unsupported optimizer %r; expected one of: %s"
            % (self.opt_name, ", ".join(supported)))

    # Work on a copy so the stored configuration is never mutated.
    params = {} if self.params is None else self.params.copy()
    with tf.variable_scope('opt'):
        lr_tensor = tf.get_variable(
            'lr',
            dtype=tf.float32,
            initializer=tf.constant(params['learning_rate']),
            trainable=False)
        if lr_decay is not None:
            params['learning_rate'] = lr_decay(
                learning_rate=params['learning_rate'],
                global_step=global_step,
                name='lr_decay')
        self.lr_op = lr_tensor if lr_decay is None else lr_tensor.assign(
            params['learning_rate'])
        params['learning_rate'] = self.lr_op

    optimizer_cls = {
        "Adam": AdamOptimizer,
        "Adadelta": AdadeltaOptimizer,
        "RMSprop": RMSPropOptimizer,
        "Momentum": MomentumOptimizer,
    }[self.opt_name]
    if name is None:
        return optimizer_cls(**params)
    return optimizer_cls(name=name, **params)
def __init__(self, options, data_train, session=None):
    """Build the pairwise-similarity training graph and attach a session.

    Args:
        options: Stored on ``self.options`` and passed to
            ``PairwiseSimilarity``.
        data_train: Training data from which ``DBQAStatistics`` is derived.
        session: Existing session to reuse. When ``None`` a new one is
            created via ``self.create_session()`` and all global variables
            are initialised in it.
    """
    self.statistics = DBQAStatistics.from_data(data_train)
    self.options = options
    self.optimizer = AdamOptimizer()
    self.global_step = tf.train.get_or_create_global_step()
    # NOTE(review): the variable list is captured here, before the network
    # is built - confirm which variables are meant to be checkpointed.
    self.saver = tf.train.Saver(tf.global_variables(), max_to_keep=1)

    def int_matrix():
        # int32 feed with both dimensions left open.
        return tf.placeholder(tf.int32, (None, None))

    self.question_2d_pl = int_matrix()
    self.question_bigram_2d_pl = int_matrix()
    self.answer_2d_pl = int_matrix()
    self.answer_bigram_2d_pl = int_matrix()
    self.wrong_answer_2d_pl = int_matrix()
    self.wrong_answer_bigram_2d_pl = int_matrix()

    self.network = PairwiseSimilarity(options, self.statistics)

    pair_inputs = (
        self.question_2d_pl,
        self.question_bigram_2d_pl,
        self.answer_2d_pl,
        self.answer_bigram_2d_pl,
    )
    wrong_inputs = (
        self.wrong_answer_2d_pl,
        self.wrong_answer_bigram_2d_pl,
    )
    self.loss, self.accuracy = self.network.get_loss(
        *(pair_inputs + wrong_inputs))
    self.similarity = self.network.get_similarity(*pair_inputs)

    self.optimize_op = self.optimizer.minimize(
        self.loss, global_step=self.global_step)

    if session is not None:
        self.session = session
    else:
        self.session = self.create_session()
        self.session.run(tf.global_variables_initializer())

    self.random = Random(42)
def test_optimizer_garbage_collection(self):
    """In graph mode, a tracked v1 optimizer wrapper and its graph get collected."""
    if context.executing_eagerly():
        self.skipTest('v1 optimizer does not run in eager mode')

    graph = ops.Graph()
    with graph.as_default():
        optimizer = optimizer_v1.TFOptimizer(AdamOptimizer(0.01))
        keras.backend.track_tf_optimizer(optimizer)
        optimizer_ref = weakref.ref(optimizer)
    graph_ref = weakref.ref(graph)

    # Release the strong references owned by the test and collect.
    del graph, optimizer
    gc.collect()

    # Neither object may survive the collection.
    self.assertIsNone(graph_ref())
    self.assertIsNone(optimizer_ref())
def _add_optimizer(self):
    """Create the Adam optimizer and a norm-clipped training op.

    Sets ``self.optimizer``, ``self.final_train_loss`` (equal to
    ``self.main_train_loss``) and ``self.minimize_operation``.
    """
    self.optimizer = AdamOptimizer()
    self.final_train_loss = self.main_train_loss

    with tf.variable_scope('l2_regularization'):
        # Variables are matched by full name while scanning all trainable
        # variables; no direct lookup by absolute path is used.
        l2_regularized_names = {
            'encoder/bidirectional_rnn/fw/gru_cell/gates/weights:0'
            # If used, add additional complete variable names
        }
        l2_terms = []
        for variable in tf.trainable_variables():
            if variable.name in l2_regularized_names:
                l2_terms.append(tf.nn.l2_loss(variable))
        # NOTE(review): this penalty is built but never added to
        # final_train_loss - confirm whether that is intentional.
        l2_loss = 0.001 * tf.add_n(l2_terms)

    gradients = self.optimizer.compute_gradients(self.final_train_loss)

    with tf.variable_scope('gradient_clipping'):
        clipped = []
        for gradient, variable in gradients:
            # Only plain tensors are clipped; anything else (IndexedSlices,
            # None) is passed through untouched to avoid a warning.
            if isinstance(gradient, tf.Tensor):
                gradient = tf.clip_by_norm(gradient, 10)
            clipped.append((gradient, variable))
        gradients = clipped

    self.minimize_operation = self.optimizer.apply_gradients(
        gradients, global_step=self.global_step)
def test_mixed_precision_loss_scale_optimizer(self):
    """A dynamically loss-scaled Adam optimizer can drive one Keras fit epoch."""
    if context.executing_eagerly():
        self.skipTest('v1 optimizer does not run in eager mode')

    loss_scaled_adam = MixedPrecisionLossScaleOptimizer(
        AdamOptimizer(), 'dynamic')

    net = keras.models.Sequential()
    constrained_dense = keras.layers.Dense(
        2,
        input_shape=(3,),
        kernel_constraint=keras.constraints.MaxNorm(1))
    net.add(constrained_dense)
    net.compile(
        loss='mean_squared_error',
        optimizer=loss_scaled_adam,
        run_eagerly=testing_utils.should_run_eagerly())

    # A single batch of five random samples is enough to exercise the path.
    features = np.random.random((5, 3))
    targets = np.random.random((5, 2))
    net.fit(features, targets, epochs=1, batch_size=5, verbose=0)
def test_tfoptimizer(self):
    """TFOptimizer trains a model but rejects weights/config introspection."""
    wrapped = keras.optimizers.TFOptimizer(AdamOptimizer(0.01))

    net = keras.models.Sequential()
    net.add(keras.layers.Dense(
        2,
        input_shape=(3,),
        kernel_constraint=keras.constraints.MaxNorm(1)))

    # Compiling and fitting with the wrapper is supported.
    net.compile(loss='mean_squared_error', optimizer=wrapped)
    keras.backend.track_tf_optimizer(wrapped)
    features = np.random.random((5, 3))
    targets = np.random.random((5, 2))
    net.fit(features, targets, epochs=1, batch_size=5, verbose=0)

    # None of these are supported by the wrapper.
    unsupported_calls = (
        lambda: wrapped.weights,
        lambda: wrapped.get_config(),
        lambda: wrapped.from_config(None),
    )
    for call in unsupported_calls:
        with self.assertRaises(NotImplementedError):
            call()
def get_conv_classifier():
    """Return an ``SKCompat``-wrapped Estimator around ``get_conv_model``.

    The estimator is configured for 5 classes over a single 3-dimensional
    real-valued feature column, with ReLU activations and an Adam optimizer,
    and keeps its state under ``saved_model``.
    """
    n_classes = 5
    feature_columns = [layers.real_valued_column("", dimension=3)]

    # Commented-out alternatives in the source: Adagrad (lr 1.0),
    # Adadelta (lr 1.0), gradient descent (lr 0.05),
    # RMSProp (lr 0.1, momentum 0.1) and Ftrl (lr 0.1).
    # The source carries a "~ 62.55%" note next to these settings.
    learning_rate = 0.01
    optimizer = AdamOptimizer(learning_rate, epsilon=0.1)

    head = head_lib._multi_class_head(  # pylint: disable=protected-access
        n_classes, enable_centered_bias=False)
    model_params = {
        'head': head,
        'feature_columns': feature_columns,
        'activation_fn': tf.nn.relu,
        'learning_rate': learning_rate,
        'optimizer': optimizer,
    }
    estimator = Estimator(
        model_fn=get_conv_model,
        params=model_params,
        model_dir='saved_model')
    return SKCompat(estimator)
def test_tf_optimizer_iterations(self):
    """The wrapper's iteration counter advances once per fitted batch."""
    unsupported_mode = (
        testing_utils.should_run_tf_function() or context.executing_eagerly())
    if unsupported_mode:
        self.skipTest(
            'v1 optimizer does not run in experimental_run_tf_function mode or '
            'eager mode')

    with self.cached_session():
        wrapped = keras.optimizers.TFOptimizer(AdamOptimizer(0.01))
        net = keras.models.Sequential()
        net.add(keras.layers.Dense(
            2,
            input_shape=(3,),
            kernel_constraint=keras.constraints.MaxNorm(1)))
        net.compile(
            loss='mean_squared_error',
            optimizer=wrapped,
            run_eagerly=testing_utils.should_run_eagerly(),
            experimental_run_tf_function=testing_utils.should_run_tf_function())
        keras.backend.track_tf_optimizer(wrapped)

        steps_before = keras.backend.get_value(net.optimizer.iterations)
        self.assertEqual(steps_before, 0)

        # 55 samples in batches of 5 -> 11 optimizer steps in one epoch.
        features = np.random.random((55, 3))
        targets = np.random.random((55, 2))
        net.fit(features, targets, epochs=1, batch_size=5, verbose=0)

        steps_after = keras.backend.get_value(net.optimizer.iterations)
        self.assertEqual(steps_after, 11)
user_num, item_num, cum_table, batch_size=batch_size, max_len=max_len, n_workers=3) model, emb = build_model(max_len=max_len, input_dim=item_num + 1, embedding_dim=50, feed_forward_units=50, head_num=1, block_num=2, dropout_rate=0.2) optimizer = AdamOptimizer(0.001) tbcb = TensorBoard(log_dir='/logs', histogram_freq=1, write_graph=True, write_grads=True, write_images=True, embeddings_freq=1) loss_history = [] cos_loss_history = [] T = 0.0 t0 = time.time() tbcb.set_model(model) tbcb.on_train_begin()
def __init__(self, word_vector_size):
    """Build the relevance CNN, its attention losses, the optimizer and a session.

    Reads the module-level names ``use_attention`` and ``att_max_value``.

    Args:
        word_vector_size: Length of each word vector fed via ``self.vectors``.
    """
    tf.reset_default_graph()
    self.vector_size = word_vector_size

    # Feeds.
    self.vectors = tf.placeholder(
        tf.float32, shape=(None, None, word_vector_size))
    self.user_terms = tf.placeholder(tf.float32, shape=(None, None))
    self.ut2 = tf.placeholder(tf.float32, shape=(None, None))
    self.group_by = tf.placeholder(tf.float32, shape=(None, None, None))
    self.padding = tf.placeholder(tf.float32, shape=(None, None))
    self.output = tf.placeholder(tf.float32, shape=(None, 1))
    self.dropout_rate = tf.placeholder(tf.float32)

    xavier = tf.contrib.layers.xavier_initializer()

    def conv_params(width):
        # 50 filters spanning `width` consecutive words, plus a bias row.
        kernel = tf.Variable(
            xavier((1, width, word_vector_size, 50)), name="weight")
        offset = tf.Variable(tf.zeros((1, 50)), name="bias")
        return kernel, offset

    # NOTE(review): the original comment reads "50 tri-gram, 50 4-gram and
    # 50 5-gram", but the widths used here are 2, 3 and 5.
    self.f3, self.b3 = conv_params(2)
    self.f4, self.b4 = conv_params(3)
    self.f5, self.b5 = conv_params(5)

    with tf.name_scope("relevance"):
        hidden = 150
        self.relevance_weight = tf.Variable(0.01 * xavier((hidden, 2)))
        self.relevance_bias = tf.Variable(0.0 * xavier((1, 2)))
        self.relevance_attention_weight = tf.Variable(
            0.01 * xavier((100, 2)))
        self.relevance_attention_bias = tf.Variable(0.0 * xavier((1, 2)))

    # Forward pass on the raw input, then on the input masked by ut2.
    rel, _, pre_max_sum = self.forward(self.vectors)
    self.relevance = rel[:, 1]
    term_mask = tf.expand_dims(self.ut2, 2)  # NWC
    rel_masked, _, _ = self.forward(self.vectors * term_mask)
    self.rel_masked = rel_masked
    self.pre_max = pre_max_sum
    self.get_attention()

    eps = 10 ** -5
    att_reg = 0.0

    # Cross-entropy of the unmasked prediction.
    pos_term = self.output * tf.log(rel[:, 1] + eps, name="log2rel")
    neg_term = (1 - self.output) * tf.log(rel[:, 0] + eps, name="log3rel")
    prediction_error = -tf.reduce_sum(pos_term + neg_term)

    def group_attention(per_word):
        # Original note: "N, num_unique, text_length ; N,text_length".
        stacked = tf.matmul(self.group_by, tf.expand_dims(per_word, -1))
        return tf.squeeze(stacked, squeeze_dims=-1)

    pos_attention = group_attention(self.pos_attention)
    neg_attention = group_attention(self.neg_attention)
    self.pos_att_grouped = pos_attention
    self.neg_att_grouped = neg_attention

    pos_heads = tf.reduce_sum(
        tf.multiply(pos_attention, self.user_terms), axis=1)
    neg_heads = tf.reduce_sum(
        tf.multiply(neg_attention, self.user_terms), axis=1)
    self.pos_heads = pos_heads

    attention_error = 0.0
    occlusion_error = 0.0
    if use_attention:
        attention_error += tf.reduce_sum(
            self.output * (pos_heads - 0.5) ** 2)
        att_reg = tf.reduce_sum(
            self.output * tf.nn.relu(self.pos_attention - att_max_value)
            + (1 - self.output)
            * tf.nn.relu(self.neg_attention - att_max_value))
        masked_pos = self.output * tf.log(
            rel_masked[:, 1] + eps, name="log2rel2")
        masked_neg = (1 - self.output) * tf.log(
            rel_masked[:, 0] + eps, name="log3rel2")
        occlusion_error = -tf.reduce_sum(masked_pos + masked_neg)
    self.att = attention_error

    # Auxiliary terms only count when some user term is set: the gate is
    # the sign of the summed user_terms.
    self.error = (
        prediction_error
        + tf.sign(tf.reduce_sum(self.user_terms)) * attention_error
        + tf.sign(tf.reduce_sum(self.user_terms)) * occlusion_error
        + tf.sign(tf.reduce_sum(self.user_terms)) * att_reg)

    # Fetching self.a raises when any of these tensors holds NaN/Inf.
    self.a = (
        tf.check_numerics(attention_error, message="att")
        + tf.check_numerics(pos_heads, message="pos-heads")
        + tf.check_numerics(neg_heads, message="neg-heads"))

    self.opt = AdamOptimizer()
    self.optimizer = self.opt.minimize(self.error)
    self.uncertainty = 1

    self.sess = tf.Session()
    self.sess.run(tf.global_variables_initializer())
    self.n_trained = 0
    self.training = False
class CNN_prior:
    """CNN relevance classifier with gradient-based attention over words.

    The graph is built in ``__init__``. ``train`` fits it one document at a
    time and ``run`` writes predictions back onto the documents. Reads the
    module-level names ``use_attention``, ``att_max_value``,
    ``softmax_padding`` and ``ndmatmul``.
    """

    def get_attention(self):
        """Define ``pos_attention`` / ``neg_attention`` from gradient * input.

        For each class column of ``self.pre_max``, its gradient w.r.t. the
        input vectors is multiplied by the vectors, summed over axis 2 and
        normalised with ``softmax_padding`` along axis 1.
        """
        pos_saliency = tf.reduce_sum(
            tf.gradients(self.pre_max[:, 1], self.vectors)[0] * self.vectors,
            axis=2)
        self.pos_attention = softmax_padding(
            pos_saliency, self.padding, axis=1)
        neg_saliency = tf.reduce_sum(
            tf.gradients(self.pre_max[:, 0], self.vectors)[0] * self.vectors,
            axis=2)
        self.neg_attention = softmax_padding(
            neg_saliency, self.padding, axis=1)

    def forward(self, v):
        """Run the three convolution banks and the relevance layer on ``v``.

        Args:
            v: Word-vector tensor; a height axis is inserted at position 1
                before the convolutions.

        Returns:
            ``(rel, pre_max_true_drop, pre_max_sum)``: class probabilities
            from the max-pooled features with dropout, their pre-softmax
            logits, and logits computed from sum-pooled features without
            dropout.
        """
        vectors2d = tf.expand_dims(v, 1)  # None x 1 x 200 x 300 ... NHWC

        activations = []
        for kernel, offset in ((self.f3, self.b3),
                               (self.f4, self.b4),
                               (self.f5, self.b5)):
            conv = tf.nn.conv2d(
                input=vectors2d,
                filter=kernel,
                strides=[1, 1, 1, 1],
                padding="VALID")  # None x 1 x words x 50
            activations.append(tf.nn.leaky_relu(conv + offset))
        # Only the first two activations are kept on the instance.
        self.a1, self.a2 = activations[0], activations[1]

        def pooled(reducer):
            # Reduce over axis 2, then flatten to rows of 50 features.
            return tf.concat(
                [tf.reshape(tf.squeeze(reducer(act, 2)), [-1, 50])
                 for act in activations],
                axis=1)

        concat = pooled(tf.reduce_max)
        # The `dropout_rate` feed is consumed as a keep probability.
        concat_drop = tf.nn.dropout(concat, keep_prob=self.dropout_rate)
        pre_max_true_drop = (
            tf.matmul(concat_drop, self.relevance_weight)
            + self.relevance_bias)
        rel = tf.nn.softmax(pre_max_true_drop, axis=1)

        concat_sums = pooled(tf.reduce_sum)
        pre_max_sum = (
            tf.matmul(concat_sums, self.relevance_weight)
            + self.relevance_bias)
        return rel, pre_max_true_drop, pre_max_sum

    def groupby(self, att):
        """Return ``ndmatmul(self.group_by, att)``."""
        return ndmatmul(self.group_by, att)

    def __init__(self, word_vector_size):
        """Build the graph, the loss, the optimizer and an initialised session.

        Args:
            word_vector_size: Length of each word vector fed via
                ``self.vectors``.
        """
        tf.reset_default_graph()
        self.vector_size = word_vector_size
        self.vectors = tf.placeholder(
            tf.float32, shape=(None, None, word_vector_size))
        self.user_terms = tf.placeholder(tf.float32, shape=(None, None))
        self.ut2 = tf.placeholder(tf.float32, shape=(None, None))
        self.group_by = tf.placeholder(tf.float32, shape=(None, None, None))
        self.padding = tf.placeholder(tf.float32, shape=(None, None))
        self.output = tf.placeholder(tf.float32, shape=(None, 1))
        self.dropout_rate = tf.placeholder(tf.float32)

        xavier = tf.contrib.layers.xavier_initializer()

        # Three banks of 50 filters with widths 2, 3 and 5.
        # NOTE(review): the original comment says "tri-gram, 4-gram and
        # 5-gram", which does not match these widths.
        filter_tri = tf.Variable(
            xavier((1, 2, word_vector_size, 50)), name="weight")
        bias_tri = tf.Variable(tf.zeros((1, 50)), name="bias")
        self.f3 = filter_tri
        self.b3 = bias_tri
        filter_4 = tf.Variable(
            xavier((1, 3, word_vector_size, 50)), name="weight")
        bias_4 = tf.Variable(tf.zeros((1, 50)), name="bias")
        self.f4 = filter_4
        self.b4 = bias_4
        filter_5 = tf.Variable(
            xavier((1, 5, word_vector_size, 50)), name="weight")
        bias_5 = tf.Variable(tf.zeros((1, 50)), name="bias")
        self.f5 = filter_5
        self.b5 = bias_5

        with tf.name_scope("relevance"):
            hidden = 150
            self.relevance_weight = tf.Variable(0.01 * xavier((hidden, 2)))
            self.relevance_bias = tf.Variable(0.0 * xavier((1, 2)))
            # Not read by any op in this class, but kept because they are
            # part of the saved variable set.
            self.relevance_attention_weight = tf.Variable(
                0.01 * xavier((100, 2)))
            self.relevance_attention_bias = tf.Variable(
                0.0 * xavier((1, 2)))

        rel, _, pre_max_sum = self.forward(self.vectors)
        self.relevance = rel[:, 1]
        ut = tf.expand_dims(self.ut2, 2)  # NWC
        # Second pass on the input masked by ut2.
        rel_masked, _, _ = self.forward(self.vectors * ut)
        self.rel_masked = rel_masked
        self.pre_max = pre_max_sum
        self.get_attention()

        att_reg = 0.0
        prediction_error = -tf.reduce_sum(
            (self.output * tf.log(rel[:, 1] + 10 ** -5, name="log2rel")
             + (1 - self.output)
             * tf.log(rel[:, 0] + 10 ** -5, name="log3rel")))

        # Original note: "N, num_unique, text_length ; N,text_length".
        pos_attention = tf.squeeze(
            tf.matmul(self.group_by, tf.expand_dims(self.pos_attention, -1)),
            squeeze_dims=-1)
        neg_attention = tf.squeeze(
            tf.matmul(self.group_by, tf.expand_dims(self.neg_attention, -1)),
            squeeze_dims=-1)
        self.pos_att_grouped = pos_attention
        self.neg_att_grouped = neg_attention

        pos_heads = tf.reduce_sum(
            tf.multiply(pos_attention, self.user_terms), axis=1)
        neg_heads = tf.reduce_sum(
            tf.multiply(neg_attention, self.user_terms), axis=1)
        self.pos_heads = pos_heads

        attention_error = 0.0
        occlusion_error = 0.0
        if use_attention:
            attention_error += tf.reduce_sum(
                self.output * (pos_heads - 0.5) ** 2)
            att_reg = tf.reduce_sum(
                self.output * tf.nn.relu(self.pos_attention - att_max_value)
                + (1 - self.output)
                * tf.nn.relu(self.neg_attention - att_max_value))
            occlusion_error = -tf.reduce_sum(
                (self.output
                 * tf.log(rel_masked[:, 1] + 10 ** -5, name="log2rel2")
                 + (1 - self.output)
                 * tf.log(rel_masked[:, 0] + 10 ** -5, name="log3rel2")))
        self.att = attention_error

        # Auxiliary terms are gated by the sign of the summed user_terms.
        self.error = (
            prediction_error
            + tf.sign(tf.reduce_sum(self.user_terms)) * attention_error
            + tf.sign(tf.reduce_sum(self.user_terms)) * occlusion_error
            + tf.sign(tf.reduce_sum(self.user_terms)) * att_reg)
        # Fetching self.a raises when any of these tensors holds NaN/Inf.
        self.a = (
            tf.check_numerics(attention_error, message="att")
            + tf.check_numerics(pos_heads, message="pos-heads")
            + tf.check_numerics(neg_heads, message="neg-heads"))

        self.opt = AdamOptimizer()
        self.optimizer = self.opt.minimize(self.error)
        self.uncertainty = 1
        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())
        self.n_trained = 0
        self.training = False

    def get_feed_dict(self, doc):
        """Return a single-document feed dict.

        Only vectors, output, user_terms and padding are fed.
        """
        return {
            self.vectors: np.array(doc.vectors, dtype=np.float32).reshape(
                [1, -1, self.vector_size]),
            self.output: [[doc.class_ * 1]],
            self.user_terms: np.array(
                doc.user_terms, dtype=np.float32).reshape([1, -1]),
            self.padding: np.array(
                [1 for _ in doc.words]).reshape([1, -1]),
        }

    def blow_up(self, mat, num_rows, num_cols):
        """Zero-pad ``mat`` to ``num_rows`` rows of ``num_cols`` entries.

        Rows already longer than ``num_cols`` are left as they are, and
        ``mat`` is not modified.

        Args:
            mat: List of row lists.
            num_rows: Minimum number of rows in the result.
            num_cols: Minimum length of every row in the result.

        Returns:
            A new list of rows.
        """
        blowed_mat = [row + [0] * (num_cols - len(row)) for row in mat]
        # Add one zero row per missing row. Previously all missing rows were
        # appended as a single flat list, which made the matrix ragged
        # whenever more than one row was missing.
        missing_rows = num_rows - len(blowed_mat)
        blowed_mat.extend([0] * num_cols for _ in range(missing_rows))
        return blowed_mat

    def get_feed_dict_multiple(self, docs):
        """Return a padded batch feed dict for ``docs``.

        Word-level inputs are padded to the longest document (at least 7
        words) and term-level inputs to the longest ``user_terms`` list.
        The dropout feed is 0.7 while training and 1 otherwise.
        """
        dp = 0.7 if self.training else 1
        maximum = max([len(doc.vectors) for doc in docs])
        maximum = max([maximum, 7])
        max_terms = max([len(doc.user_terms) for doc in docs])

        def pad(seq, length, filler):
            # Truncate to `length`, then right-pad with `filler`.
            head = seq[:length]
            return head + [filler] * (length - len(head))

        zero_vector = [0] * self.vector_size
        return {
            self.vectors: np.array(
                [pad(doc.vectors, maximum, zero_vector) for doc in docs]
            ).reshape([-1, maximum, self.vector_size]),
            self.group_by: np.array(
                [self.blow_up(doc.gb, max_terms, maximum) for doc in docs]),
            self.ut2: np.array(
                [pad(doc.ut2, maximum, 0) for doc in docs]
            ).reshape([-1, maximum]),
            self.output: [[doc.class_ * 1] for doc in docs],
            self.user_terms: np.array(
                [pad(doc.user_terms, max_terms, 0) for doc in docs]
            ).reshape([-1, max_terms]),
            self.padding: np.array(
                [pad([1] * len(doc.vectors[:maximum]), maximum, 0)
                 for doc in docs]
            ).reshape([-1, maximum]),
            self.dropout_rate: dp,
        }

    def load(self, filename):
        """Restore all saveable variables from ``filename`` into the session."""
        saver = tf.train.Saver()
        saver.restore(self.sess, filename)

    def train(self, docs, train_full=False):
        """Re-initialise the weights and fit on ``docs`` one document at a time.

        ``docs`` is shuffled in place. Training stops early once the mean
        error of each of the last ten epochs is below 0.05. If the mean
        error exceeds 4 after epoch 10, training restarts from scratch via
        a recursive call.

        Args:
            docs: Documents accepted by ``get_feed_dict_multiple``.
            train_full: When true, run 10 epochs instead of 200 and save a
                checkpoint ``./<epoch>.pkl`` after every epoch.
        """
        import random

        self.training = True
        self.sess.run(tf.global_variables_initializer())
        sess = self.sess
        print("====23")
        epochs = 10 if train_full else 200
        self.n_trained = len(docs)
        random.shuffle(docs)
        # One saver for the whole run. Creating it inside the epoch loop
        # added a new set of save ops to the graph every epoch.
        saver = tf.train.Saver() if train_full else None
        last_10 = [100] * 10
        for epoch in range(epochs):
            total_error = 0
            for start in range(len(docs)):
                fd = self.get_feed_dict_multiple(docs[start:start + 1])
                try:
                    # Diagnostic probe only: flags NaN/Inf and carries on.
                    sess.run(self.a, feed_dict=fd)
                except Exception:
                    print("check")
                _, error = sess.run(
                    [self.optimizer, self.error], feed_dict=fd)
                total_error += error
            total_error = total_error / len(docs)
            if train_full:
                saver.save(sess, "./{}.pkl".format(epoch))
            if epoch > 10 and total_error > 4:
                # Diverged: start over from fresh weights.
                # NOTE(review): the restart does not forward train_full.
                self.train(docs)
                return
            last_10.pop(0)
            last_10.append(total_error)
            if max(last_10) < 0.05:
                print("breaking")
                break
            print(total_error)
        self.training = False

    def run(self, docs):
        """Predict each document and store the results on it.

        Sets ``doc.pred_class`` (0 when relevance < 0.5, else 1) and
        ``doc.parameters``, a dict with keys ``rel``, ``pos_att``,
        ``neg_att`` and ``pos_heads``. A document whose evaluation raises
        is reported with ``"here"`` and left untouched.
        """
        sess = self.sess
        fetches = [self.relevance, self.pos_att_grouped,
                   self.neg_att_grouped, self.pos_heads]
        for start in range(len(docs)):
            doc_s = docs[start:start + 1]
            fd = self.get_feed_dict_multiple(doc_s)
            try:
                l1 = sess.run(fetches, feed_dict=fd)
            except Exception:
                print("here")
                # Skip this document. Falling through would reuse the
                # previous document's results, or hit an undefined name on
                # the first iteration.
                continue
            for ind, doc in enumerate(doc_s):
                d = {
                    "rel": l1[0][ind],
                    "pos_att": l1[1][ind],
                    "neg_att": l1[2][ind],
                    "pos_heads": l1[3][ind],
                }
                doc.pred_class = 0 if d["rel"] < 0.5 else 1
                doc.parameters = d
def __init__(self, word_vector_size):
    """Build the attribution-regularised CNN, its optimizer and a session.

    Reads the module-level names ``use_attribution`` and ``att_max_value``.

    Args:
        word_vector_size: Length of each word vector fed via ``self.vectors``.
    """
    tf.reset_default_graph()
    self.vector_size = word_vector_size

    # Feeds.
    self.vectors = tf.placeholder(
        tf.float32, shape=(None, None, word_vector_size))
    self.user_terms = tf.placeholder(tf.float32, shape=(None, None))
    self.padding = tf.placeholder(tf.float32, shape=(None, None))
    self.output = tf.placeholder(tf.float32, shape=(None, 1))
    self.dropout_rate = tf.placeholder(tf.float32)

    xavier = tf.contrib.layers.xavier_initializer()

    def conv_params(width):
        # 50 filters spanning `width` consecutive words, plus a bias row.
        kernel = tf.Variable(
            xavier((1, width, word_vector_size, 50)), name="weight")
        offset = tf.Variable(tf.zeros((1, 50)), name="bias")
        return kernel, offset

    # 50 tri-gram, 50 4-gram and 50 5-gram filters.
    self.f3, self.b3 = conv_params(3)
    self.f4, self.b4 = conv_params(4)
    self.f5, self.b5 = conv_params(5)

    with tf.name_scope("relevance"):
        hidden = 150
        self.relevance_weight = tf.Variable(0.01 * xavier((hidden, 2)))
        self.relevance_bias = tf.Variable(0.0 * xavier((1, 2)))
        self.relevance_attention_weight = tf.Variable(
            0.01 * xavier((100, 2)))
        self.relevance_attention_bias = tf.Variable(0.0 * xavier((1, 2)))

    # Forward pass on the raw input, then on the input masked by the
    # user-term indicator.
    rel, _, pre_max_sum = self.forward(self.vectors)
    self.relevance = rel[:, 1]
    term_mask = tf.expand_dims(self.user_terms, 2)  # NWC
    rel_masked, _, _ = self.forward(self.vectors * term_mask)
    self.rel_masked = rel_masked
    self.pre_max_sum = pre_max_sum
    self.get_attribution()

    eps = 10**-5

    # Cross-entropy of the unmasked prediction.
    pos_term = self.output * tf.log(rel[:, 1] + eps, name="log2rel")
    neg_term = (1 - self.output) * tf.log(rel[:, 0] + eps, name="log3rel")
    prediction_error = -tf.reduce_sum(pos_term + neg_term)

    # Attribution mass that falls on the user-marked terms, per class.
    pos_heads = tf.reduce_sum(
        tf.multiply(self.pos_attribution, self.user_terms), axis=1)
    neg_heads = tf.reduce_sum(
        tf.multiply(self.neg_attribution, self.user_terms), axis=1)

    misattribution_error = 0.0
    corrective_error = 0.0
    att_reg = 0.0
    if use_attribution:
        misattribution_error += tf.reduce_sum(
            self.output * (pos_heads - 0.9)**2
            + (1 - self.output) * (neg_heads - 0.9)**2)
        att_reg = tf.reduce_sum(
            self.output
            * tf.nn.relu(self.pos_attribution - att_max_value)
            + (1 - self.output)
            * tf.nn.relu(self.neg_attribution - att_max_value))
        masked_pos = self.output * tf.log(
            rel_masked[:, 1] + eps, name="log2rel2")
        masked_neg = (1 - self.output) * tf.log(
            rel_masked[:, 0] + eps, name="log3rel2")
        corrective_error = -tf.reduce_sum(masked_pos + masked_neg)

    # Auxiliary terms are gated by the sign of the summed user_terms.
    has_user_terms = tf.sign(tf.reduce_sum(self.user_terms))
    auxiliary = misattribution_error + corrective_error + att_reg
    self.error = prediction_error + has_user_terms * auxiliary

    self.opt = AdamOptimizer()
    self.optimizer = self.opt.minimize(self.error)

    self.sess = tf.Session()
    self.sess.run(tf.global_variables_initializer())
    self.training = False
class CNN_prior:
    """CNN relevance classifier regularised by gradient-based attributions.

    Reads the module-level names ``use_attribution``, ``att_max_value`` and
    ``softmax_padding``.
    """

    def get_attribution(self):
        """Define ``pos_attribution`` / ``neg_attribution`` from gradient * input."""

        def saliency(column):
            # Gradient of one logit column w.r.t. the input, times the
            # input, summed over axis 2.
            grad = tf.gradients(self.pre_max_sum[:, column], self.vectors)[0]
            return tf.reduce_sum(grad * self.vectors, axis=2)

        self.pos_attribution = saliency(1)
        self.pos_attribution = softmax_padding(
            self.pos_attribution, self.padding, axis=1)
        self.neg_attribution = saliency(0)
        self.neg_attribution = softmax_padding(
            self.neg_attribution, self.padding, axis=1)

    def forward(self, v):
        """Run the three convolution banks and the relevance layer on ``v``.

        Returns:
            ``(rel, pre_max_true_drop, pre_max_sum)``: class probabilities
            from the max-pooled features with dropout, their pre-softmax
            logits, and logits computed from sum-pooled features without
            dropout.
        """
        vectors2d = tf.expand_dims(v, 1)  # None x 1 x 200 x 300 ... NHWC

        banks = ((self.f3, self.b3), (self.f4, self.b4), (self.f5, self.b5))
        activations = []
        for kernel, offset in banks:
            conv = tf.nn.conv2d(input=vectors2d,
                                filter=kernel,
                                strides=[1, 1, 1, 1],
                                padding="VALID")  # None x 1 x words x 50
            activations.append(tf.nn.leaky_relu(conv + offset))
        # Only the first two activations are kept on the instance.
        self.a1 = activations[0]
        self.a2 = activations[1]

        def pooled(reducer):
            # Reduce over axis 2, then flatten to rows of 50 features.
            rows = [
                tf.reshape(tf.squeeze(reducer(act, 2)), [-1, 50])
                for act in activations
            ]
            return tf.concat(rows, axis=1)

        concat = pooled(tf.reduce_max)
        # The `dropout_rate` feed is consumed as a keep probability.
        concat_drop = tf.nn.dropout(concat, keep_prob=self.dropout_rate)
        pre_max_true_drop = tf.matmul(
            concat_drop, self.relevance_weight) + self.relevance_bias
        rel = tf.nn.softmax(pre_max_true_drop, axis=1)

        concat_sums = pooled(tf.reduce_sum)
        pre_max_sum = tf.matmul(
            concat_sums, self.relevance_weight) + self.relevance_bias
        return rel, pre_max_true_drop, pre_max_sum

    def __init__(self, word_vector_size):
        """Build the graph, the loss, the optimizer and an initialised session.

        Args:
            word_vector_size: Length of each word vector fed via
                ``self.vectors``.
        """
        tf.reset_default_graph()
        self.vector_size = word_vector_size

        # Feeds.
        self.vectors = tf.placeholder(
            tf.float32, shape=(None, None, word_vector_size))
        self.user_terms = tf.placeholder(tf.float32, shape=(None, None))
        self.padding = tf.placeholder(tf.float32, shape=(None, None))
        self.output = tf.placeholder(tf.float32, shape=(None, 1))
        self.dropout_rate = tf.placeholder(tf.float32)

        xavier = tf.contrib.layers.xavier_initializer()

        def conv_params(width):
            # 50 filters spanning `width` consecutive words, plus a bias row.
            kernel = tf.Variable(
                xavier((1, width, word_vector_size, 50)), name="weight")
            offset = tf.Variable(tf.zeros((1, 50)), name="bias")
            return kernel, offset

        # 50 tri-gram, 50 4-gram and 50 5-gram filters.
        self.f3, self.b3 = conv_params(3)
        self.f4, self.b4 = conv_params(4)
        self.f5, self.b5 = conv_params(5)

        with tf.name_scope("relevance"):
            hidden = 150
            self.relevance_weight = tf.Variable(0.01 * xavier((hidden, 2)))
            self.relevance_bias = tf.Variable(0.0 * xavier((1, 2)))
            self.relevance_attention_weight = tf.Variable(
                0.01 * xavier((100, 2)))
            self.relevance_attention_bias = tf.Variable(
                0.0 * xavier((1, 2)))

        # Forward pass on the raw input, then on the input masked by the
        # user-term indicator.
        rel, _, pre_max_sum = self.forward(self.vectors)
        self.relevance = rel[:, 1]
        term_mask = tf.expand_dims(self.user_terms, 2)  # NWC
        rel_masked, _, _ = self.forward(self.vectors * term_mask)
        self.rel_masked = rel_masked
        self.pre_max_sum = pre_max_sum
        self.get_attribution()

        eps = 10**-5

        # Cross-entropy of the unmasked prediction.
        pos_term = self.output * tf.log(rel[:, 1] + eps, name="log2rel")
        neg_term = (1 - self.output) * tf.log(
            rel[:, 0] + eps, name="log3rel")
        prediction_error = -tf.reduce_sum(pos_term + neg_term)

        pos_heads = tf.reduce_sum(
            tf.multiply(self.pos_attribution, self.user_terms), axis=1)
        neg_heads = tf.reduce_sum(
            tf.multiply(self.neg_attribution, self.user_terms), axis=1)

        misattribution_error = 0.0
        corrective_error = 0.0
        att_reg = 0.0
        if use_attribution:
            misattribution_error += tf.reduce_sum(
                self.output * (pos_heads - 0.9)**2
                + (1 - self.output) * (neg_heads - 0.9)**2)
            att_reg = tf.reduce_sum(
                self.output
                * tf.nn.relu(self.pos_attribution - att_max_value)
                + (1 - self.output)
                * tf.nn.relu(self.neg_attribution - att_max_value))
            masked_pos = self.output * tf.log(
                rel_masked[:, 1] + eps, name="log2rel2")
            masked_neg = (1 - self.output) * tf.log(
                rel_masked[:, 0] + eps, name="log3rel2")
            corrective_error = -tf.reduce_sum(masked_pos + masked_neg)

        # Auxiliary terms are gated by the sign of the summed user_terms.
        has_user_terms = tf.sign(tf.reduce_sum(self.user_terms))
        auxiliary = misattribution_error + corrective_error + att_reg
        self.error = prediction_error + has_user_terms * auxiliary

        self.opt = AdamOptimizer()
        self.optimizer = self.opt.minimize(self.error)

        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())
        self.training = False

    def get_feed_dict_multiple(self, docs):
        """Return a batch feed dict padded to the longest document (min. 5 words).

        The dropout feed is 0.7 while training and 1 otherwise.
        """
        width = max([len(doc.vectors) for doc in docs] + [5])
        keep = 0.7 if self.training else 1

        def pad(seq, filler):
            # Truncate to `width`, then right-pad with `filler`.
            head = seq[:width]
            return head + [filler] * (width - len(head))

        zero_vector = [0] * (self.vector_size)
        vectors = np.array([pad(doc.vectors, zero_vector) for doc in docs])
        terms = np.array([pad(doc.user_terms, 0) for doc in docs])
        mask = np.array(
            [pad([1] * len(doc.vectors[:width]), 0) for doc in docs])

        return {
            self.vectors: vectors.reshape([-1, width, self.vector_size]),
            self.output: [[doc.class_ * 1] for doc in docs],
            self.user_terms: terms.reshape([-1, width]),
            self.padding: mask.reshape([-1, width]),
            self.dropout_rate: keep,
        }

    def train(self, docs):
        """Re-initialise the weights and fit on ``docs`` one document at a time.

        ``docs`` is shuffled in place. Training runs for at most 200 epochs
        and stops early once the mean error of each of the last ten epochs
        is below 0.05. If the mean error exceeds 4 after epoch 10, training
        restarts from scratch via a recursive call.
        """
        self.training = True
        # Re-initialize the machine during every training round.
        self.sess.run(tf.global_variables_initializer())
        session = self.sess
        print("====")
        max_epochs = 200
        random.shuffle(docs)
        recent = [100] * 10
        for epoch in range(max_epochs):
            total_error = 0
            # Stochastic gradient descent with mini-batch size 1.
            for start in range(len(docs)):
                feed = self.get_feed_dict_multiple(docs[start:start + 1])
                _, error = session.run(
                    [self.optimizer, self.error], feed_dict=feed)
                total_error += error
            total_error = total_error / len(docs)
            if epoch > 10 and total_error > 4:
                # Diverged: start over from fresh weights.
                self.train(docs)
                return
            # Slide the ten-epoch window forward by one.
            recent = recent[1:] + [total_error]
            if max(recent) < 0.05:
                print("breaking")
                break
            print(total_error)
        self.training = False

    def run(self, docs):
        """Predict each document, store the results on it and report accuracy.

        ``docs`` is shuffled in place. Each document receives ``pred_class``
        (0 when relevance < 0.5, else 1) and ``parameters``, a dict with keys
        ``rel``, ``pos_att`` and ``neg_att``. The running accuracy in percent
        is printed after every 1000 documents.
        """
        random.shuffle(docs)
        session = self.sess
        fetches = [
            self.relevance, self.pos_attribution, self.neg_attribution
        ]
        num_correct = 0
        num_seen = 0
        for start in range(len(docs)):
            batch = docs[start:start + 1]
            feed = self.get_feed_dict_multiple(batch)
            relevance, pos_att, neg_att = session.run(fetches, feed_dict=feed)
            for ind, doc in enumerate(batch):
                params = {
                    "rel": relevance[ind],
                    "pos_att": pos_att[ind],
                    "neg_att": neg_att[ind]
                }
                doc.pred_class = 0 if params["rel"] < 0.5 else 1
                doc.parameters = params
                num_correct += 1 * (doc.pred_class == doc.class_)
                num_seen += 1
                if num_seen % 1000 == 0:
                    print(num_correct / num_seen * 100)
# Build Model: embedding -> dropout -> (optional flatten/reshape) -> LSTM(s)
# -> dropout -> single linear output.
model = Sequential()
model.add(
    Embedding(len(vocab), args.embedding_size, input_length=max_answer_len))
model.add(Dropout(args.dropout))
if args.flatten:
    # Collapse the sequence into one timestep holding every embedding value.
    model.add(Flatten())
    model.add(Reshape((1, args.embedding_size * max_answer_len)))

# The first LSTM only emits full sequences when a second LSTM follows it.
_stack_second_lstm = bool(args.lstm_dim_2)
model.add(LSTM(args.lstm_dim_1, return_sequences=_stack_second_lstm))
if _stack_second_lstm:
    model.add(LSTM(args.lstm_dim_2, return_sequences=False))

model.add(Dropout(args.dropout))
model.add(Dense(1, activation="linear"))

optimizer = AdamOptimizer()
model.compile(
    optimizer=optimizer,
    loss='mean_squared_error',
    metrics=['acc'],
)

# Train the model
model.fit(train_x, train_y, epochs=args.epochs, verbose=0)

# Validate: column 0 of test_data is the target, the rest are the inputs.
test_x = test_data.iloc[:, 1:]
test_y = test_data.iloc[:, 0]
score = model.evaluate(test_x, test_y, verbose=0)
print(f"Validation_loss:{score[0]};Validation_accuracy:{score[1]};")

## --- End of your code --- ##

# Save the trained model
def __init__(self, args):
    """Build the encoder/decoder graph, its loss, and the training op.

    Reads from ``args``: batch_size, sequence_length, vocab_source_size,
    vocab_target_size, hidden_size, num_layers, dropout, learning_rate.

    Exposes ``inputs``/``targets`` placeholders and the ``logits``,
    ``batch_loss``, ``total_loss``, ``target_cross_entropy_losses``,
    ``train_op`` and ``summaries`` tensors/ops as attributes.
    """
    self.inputs = tf.placeholder(
        tf.int32, shape=[args.batch_size, args.sequence_length])
    self.targets = tf.placeholder(
        tf.int32, shape=[args.batch_size, args.sequence_length])

    with tf.name_scope("embedding"):
        embedding_size = int(sqrt(args.vocab_source_size) + 1)
        embedding = tf.get_variable(
            'embedding',
            shape=[args.vocab_source_size,
                   embedding_size],  # embed them in a small space
            initializer=tf.contrib.layers.xavier_initializer())
        # tensor of shape [batch_size, sequence_length, embedding_size]
        embedded = tf.nn.embedding_lookup(embedding, self.inputs)
        # reshape it to a list of timesteps
        embedded_inputs_by_timestamp = [
            tf.reshape(i, (args.batch_size, embedding_size))
            for i in tf.split(1, args.sequence_length, embedded)
        ]
        assert len(embedded_inputs_by_timestamp) == args.sequence_length
        for timestep in embedded_inputs_by_timestamp:
            assert timestep.get_shape() == (args.batch_size, embedding_size)

    with tf.variable_scope("bidi_rnn") as bidi_scope:
        cell = LSTM_factory(args.hidden_size,
                            args.num_layers,
                            dropout=args.dropout)
        outputs, fwd_state, bwd_state = tf.nn.bidirectional_rnn(
            cell_fw=cell,
            cell_bw=cell,
            inputs=embedded_inputs_by_timestamp,
            dtype=tf.float32)

    with tf.variable_scope("decoder_rnn"):
        decoder_cell = LSTM_factory(args.hidden_size,
                                    args.num_layers * 2,
                                    dropout=args.dropout)
        decoder_cell = AttentionCellWrapper(cell=decoder_cell,
                                            attn_length=args.hidden_size,
                                            state_is_tuple=True)
        final_outputs, state = tf.nn.rnn(cell=decoder_cell,
                                         inputs=outputs,
                                         dtype=tf.float32)

    with tf.variable_scope("logits") as logits_scope:
        # Reshaping to apply the same weights over the timesteps
        outputs = tf.pack(final_outputs)
        outputs = tf.transpose(outputs, [1, 0, 2])
        logits = tf.contrib.layers.fully_connected(
            inputs=outputs,
            num_outputs=args.vocab_target_size,
            activation_fn=None,
            weights_initializer=tf.contrib.layers.xavier_initializer(),
            scope=logits_scope)
        self.logits = logits

    with tf.variable_scope("loss"):
        assert logits.get_shape()[:-1] == self.targets.get_shape(
        ), 'l = {0} t = {1}'.format(logits.get_shape(),
                                    self.targets.get_shape())
        losses = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits, self.targets)
        batch_loss = tf.reduce_sum(losses, name="batch_loss")
        tf.contrib.losses.add_loss(batch_loss)
        total_loss = tf.contrib.losses.get_total_loss()

        # Add summaries.
        tf.scalar_summary("batch_loss", batch_loss)
        tf.scalar_summary("total_loss", total_loss)

        self.total_loss = total_loss
        self.batch_loss = batch_loss
        self.target_cross_entropy_losses = losses  # Used in evaluation.

    with tf.name_scope("optimization"):
        opt = AdamOptimizer(learning_rate=args.learning_rate)
        gvs = opt.compute_gradients(self.batch_loss)
        # compute_gradients yields (None, var) for variables the loss does
        # not depend on; clipping None would raise, so those are skipped.
        capped_gvs = [(tf.clip_by_value(grad, -1., 1.), var)
                      for grad, var in gvs if grad is not None]
        train_op = opt.apply_gradients(capped_gvs)

        for var in tf.trainable_variables():
            tf.histogram_summary(var.op.name, var)
        for grad, var in gvs:
            if grad is not None:
                tf.histogram_summary(var.op.name + '/gradients', grad)

    with tf.name_scope("tensors"):
        self.train_op = train_op
        self.summaries = tf.merge_all_summaries()
class QuerySumModel:
    '''
    The QuerySum model itself.

    NOTE(review): indentation was lost in this file; block nesting below was
    reconstructed from the code's data flow — confirm against the original.
    '''

    def __init__(self,
                 mode,
                 word_dict,
                 word_embedding_dim,
                 vocabulary,
                 initial_vocabulary_embeddings,
                 target_vocabulary_size,
                 cell='gru'):
        '''
        Create placeholders, bookkeeping variables and the model graph.

        Args:
            self: QuerySumModel.
            mode: str, one of 'train', 'validate', or 'decode'.
            word_dict: dict, map from words to their embeddings.
            word_embedding_dim: int, the dimension of a single embedding.
            vocabulary: Vocabulary.
            initial_vocabulary_embeddings: np.ndarray.
            target_vocabulary_size: int; capped at len(vocabulary.words).
            cell: 'gru' or 'lstm', the type of RNN unit to use.

        Raises:
            ValueError: if cell is neither 'gru' nor 'lstm'.
        '''
        self.word_dict = word_dict
        self.word_embedding_dim = word_embedding_dim
        self.summary_vocabulary = vocabulary
        self.target_vocabulary_size = min(len(vocabulary.words),
                                          target_vocabulary_size)
        self.embeddings = tf.Variable(initial_vocabulary_embeddings,
                                      name='embeddings')

        self.documents_placeholder = tf.placeholder(tf.int32,
                                                    shape=[None, None])
        self.document_lengths_placeholder = tf.placeholder(tf.int32,
                                                           shape=[None])
        self.queries_placeholder = tf.placeholder(tf.int32,
                                                  shape=[None, None])
        self.query_lengths_placeholder = tf.placeholder(tf.int32,
                                                        shape=[None])
        self.references_placeholder = tf.placeholder(tf.int32,
                                                     shape=[None, None])
        # Created once; a second, identical creation used to orphan this one.
        self.reference_lengths_placeholder = tf.placeholder(tf.int32,
                                                            shape=[None])
        self.pointer_reference_placeholder = tf.placeholder(
            tf.int32, shape=[None, None])
        self.pointer_switch_placeholder = tf.placeholder(tf.int32,
                                                         shape=[None, None])

        self.epoch = tf.Variable(0, name='epoch', trainable=False)
        self.global_step = tf.Variable(0,
                                       name='global_step',
                                       trainable=False)
        self.best_validation_loss = tf.Variable(np.inf,
                                                name='best_validation_loss',
                                                trainable=False)
        self.new_best_validation = tf.placeholder(tf.float32, shape=[])
        self.best_validation_assign = self.best_validation_loss.assign(
            self.new_best_validation)
        self.increment_epoch_op = tf.assign(self.epoch, self.epoch + 1)

        self.batch_size = tf.shape(self.documents_placeholder)[0]
        self.dropout_enabled = False

        self.encoder_cell_state_size = 256
        self.encoder_output_size = 2 * self.encoder_cell_state_size
        self.decoder_cell_state_size = self.encoder_output_size
        self.decoder_vocab_hidden_size = 256
        self.attention_hidden_output_size = 256
        # Size is that of decoder state + encoder hidden state + query reader state
        self.attention_hidden_input_size = (self.decoder_cell_state_size +
                                            self.encoder_output_size +
                                            self.encoder_cell_state_size)

        # Placeholders read only by the 'decode' code paths.
        self.beam_width_placeholder = tf.placeholder(tf.int32, shape=[])
        self.decode_last_output_placeholder = tf.placeholder(tf.int32,
                                                             shape=[None])
        self.initial_decoder_state_placeholder = tf.placeholder(
            tf.float32, shape=[None, self.decoder_cell_state_size])
        self.pre_computed_encoder_states_placeholder = tf.placeholder(
            tf.float32, shape=[None, None, self.encoder_output_size])
        self.pre_computed_query_state_placeholder = tf.placeholder(
            tf.float32, shape=[None, self.encoder_cell_state_size])
        self.query_attention_partial_score_placeholder = tf.placeholder(
            tf.float32, shape=[None, self.attention_hidden_output_size])
        self.encoder_state_attention_partial_scores_placeholder = tf.placeholder(
            tf.float32,
            shape=[None, None, self.attention_hidden_output_size])

        self.mode = mode
        if cell == 'gru':
            self.cell = GRUCell
        elif cell == 'lstm':
            self.cell = lambda *args, **kwargs: LSTMCell(
                *args, **kwargs, state_is_tuple=False)
        else:
            raise ValueError('{} is not a valid RNN cell'.format(cell))

        self.output_keep_prob = 0.8  # DropoutWrapper keep probability
        self._build_graph(mode=mode)

    def _build_graph(self, mode):
        '''
        A simple wrapper for the other graph-building methods.

        Args:
            self: QuerySumModel.
            mode: str; the optimizer is only added when it is 'train'.
        '''
        self._add_encoders()
        self._add_decoder(mode)
        if mode == 'train':
            self._add_optimizer()

    def _add_encoders(self):
        '''
        Build the model's encoder and add it to the graph.

        Sets self.query_last, self.encoder_outputs and
        self.final_encoder_state.

        Args:
            self: QuerySumModel.
        '''
        with tf.variable_scope('query_encoder'):
            query_encoder_cell = self.cell(self.encoder_cell_state_size)
            if self.dropout_enabled and self.mode != 'decode':
                query_encoder_cell = DropoutWrapper(
                    cell=query_encoder_cell,
                    output_keep_prob=self.output_keep_prob)

            query_embeddings = tf.nn.embedding_lookup(
                self.embeddings, self.queries_placeholder)
            query_encoder_outputs, _ = rnn.dynamic_rnn(
                query_encoder_cell,
                query_embeddings,
                sequence_length=self.query_lengths_placeholder,
                swap_memory=True,
                dtype=tf.float32)
            # because the query is so short, we can store almost all the
            # information inside it using a single context vector. thus, we
            # extract the final query encoder output and save it.
            self.query_last = query_encoder_outputs[:, -1, :]

        with tf.variable_scope('encoder'):
            fw_cell = self.cell(self.encoder_cell_state_size)
            bw_cell = self.cell(self.encoder_cell_state_size)
            if self.dropout_enabled and self.mode != 'decode':
                fw_cell = DropoutWrapper(
                    cell=fw_cell, output_keep_prob=self.output_keep_prob)
                bw_cell = DropoutWrapper(
                    cell=bw_cell, output_keep_prob=self.output_keep_prob)

            embeddings = tf.nn.embedding_lookup(self.embeddings,
                                                self.documents_placeholder)
            (encoder_outputs_fw,
             encoder_outputs_bw), _ = rnn.bidirectional_dynamic_rnn(
                 fw_cell,
                 bw_cell,
                 embeddings,
                 sequence_length=self.document_lengths_placeholder,
                 swap_memory=True,
                 dtype=tf.float32)
            # Unlike the query, the document can be very complex, making it
            # difficult to encode all of its information into a single context
            # vector. Instead, we use attention, so we need to track all the
            # cell outputs. In addition, we need to save the final encoder
            # state so we can initialize the decoder's state to it.
            self.encoder_outputs = tf.concat(
                [encoder_outputs_fw, encoder_outputs_bw], 2)
            self.final_encoder_state = self.encoder_outputs[:, -1, :]

    def _add_decoder(self, mode):
        '''
        Create the decoder variables and the decoding ops for ``mode``.

        In 'decode' mode a one-step decoder and top-k extraction are built;
        otherwise a full decoder is unrolled and its loss computed. In
        'train' mode an additional teacher-forced decoder
        (training_wheels=True) provides self.main_train_loss.

        Args:
            self: QuerySumModel.
            mode: str.
        '''
        with tf.variable_scope('decoder') as scope:
            decoder_cell = self.cell(self.decoder_cell_state_size)
            if self.dropout_enabled and self.mode != 'decode':
                decoder_cell = DropoutWrapper(
                    cell=decoder_cell,
                    output_keep_prob=self.output_keep_prob)

            # W^{(1)}_{gen}
            self.vocabulary_project_w_1 = tf.get_variable(
                name='vocabulary_project_w_1',
                shape=[
                    decoder_cell.output_size + self.encoder_output_size,
                    self.decoder_vocab_hidden_size
                ])
            self.vocabulary_project_w_2 = tf.get_variable(
                name='vocabulary_project_w_2',
                shape=[
                    self.decoder_vocab_hidden_size,
                    self.target_vocabulary_size
                ])
            self.vocabulary_project_b_1 = tf.get_variable(
                name='vocabulary_project_b_1',
                initializer=tf.zeros_initializer(),
                shape=[self.decoder_vocab_hidden_size])
            self.vocabulary_project_b_2 = tf.get_variable(
                name='vocabulary_project_b_2',
                initializer=tf.zeros_initializer(),
                shape=[self.target_vocabulary_size])

            self.pointer_probability_project_w = tf.get_variable(
                name='pointer_probability_project_w',
                shape=[
                    self.encoder_output_size + self.decoder_cell_state_size +
                    self.word_embedding_dim, 1
                ])
            self.pointer_probability_project_b = tf.get_variable(
                name='pointer_probability_project_b',
                initializer=tf.zeros_initializer(),
                shape=[1])

            self.attention_w = tf.get_variable(
                name='attention_w',
                shape=[
                    self.decoder_cell_state_size,
                    self.attention_hidden_output_size
                ],
                dtype=tf.float32)
            self.attention_w_e = tf.get_variable(
                name='attention_w_e',
                shape=[
                    self.word_embedding_dim,
                    self.attention_hidden_output_size
                ],
                dtype=tf.float32)
            self.attention_w_q = tf.get_variable(
                name='attention_w_q',
                shape=[
                    self.encoder_cell_state_size,
                    self.attention_hidden_output_size
                ],
                dtype=tf.float32)
            self.attention_w_d = tf.get_variable(
                name='attention_w_d',
                shape=[
                    self.encoder_output_size,
                    self.attention_hidden_output_size
                ],
                dtype=tf.float32)
            self.attention_v = tf.get_variable(
                name='attention_v',
                shape=[self.attention_hidden_output_size],
                dtype=tf.float32)
            self.attention_b = tf.get_variable(
                name='attention_b',
                initializer=tf.zeros_initializer(),
                shape=[self.attention_hidden_output_size],
                dtype=tf.float32)

            self._precompute_partial_attention_scores()

            if mode == 'decode':
                embedding = tf.nn.embedding_lookup(
                    self.embeddings, self.decode_last_output_placeholder)
                (decoder_outputs, self.one_step_decoder_state,
                 context_vectors, attention_logits,
                 pointer_probabilities) = self._rnn_one_step_attention_decoder(
                     decoder_cell, embedding,
                     self.initial_decoder_state_placeholder)
            else:
                if mode == 'train':
                    train_decoder_outputs, train_context_vectors, train_attention_logits, train_pointer_probabilities = \
                        self._rnn_attention_decoder(decoder_cell, training_wheels=True)
                    # The second decoder below shares the variables just created.
                    scope.reuse_variables()
                    self.train_attention_argmax = tf.cast(
                        tf.argmax(train_attention_logits, 1), dtype=tf.int32)
                    self.train_pointer_enabled = tf.cast(
                        tf.round(train_pointer_probabilities), tf.int32)

                decoder_outputs, context_vectors, attention_logits, pointer_probabilities = \
                    self._rnn_attention_decoder(decoder_cell, training_wheels=False)

            self.attention_argmax = tf.cast(tf.argmax(attention_logits, 1),
                                            dtype=tf.int32)
            self.attention_softmax = tf.nn.softmax(attention_logits)
            self.pointer_enabled = tf.cast(tf.round(pointer_probabilities),
                                           tf.int32)

            if mode == 'decode':
                self.top_k_vocabulary_argmax, self.top_k_probabilities = self._extract_top_k_argmax(
                    self.beam_width_placeholder, decoder_outputs,
                    context_vectors)
            else:
                if mode == 'train':
                    self.train_vocabulary_argmax, self.main_train_loss = self._compute_argmax_and_loss(
                        train_decoder_outputs, train_context_vectors,
                        train_attention_logits, train_pointer_probabilities)
                self.vocabulary_argmax, self.main_loss = self._compute_argmax_and_loss(
                    decoder_outputs, context_vectors, attention_logits,
                    pointer_probabilities)

    def _rnn_attention_decoder(self, decoder_cell, training_wheels):
        '''
        Unroll the attention decoder with tf.nn.raw_rnn.

        Args:
            self: QuerySumModel,
            decoder_cell: RNNCell or GRUCell, the RNN cell used by the decoder.
            training_wheels: bool; when True the reference tokens are fed as
                the previous output, otherwise the model's own (stubbed)
                argmax is used.

        Returns:
            (decoder_outputs, context_vectors, attention_logits,
            pointer_probabilities), each transposed so that its first two
            axes are swapped relative to raw_rnn's output order. The last
            entry written to each loop-state array is dropped.
        '''
        loop_fn = self._custom_rnn_loop_fn(decoder_cell.output_size,
                                           training_wheels=training_wheels)
        decoder_outputs, _, (context_vectors_array, attention_logits_array, pointer_probability_array) = \
            tf.nn.raw_rnn(decoder_cell, loop_fn, swap_memory=True)

        decoder_outputs = decoder_outputs.stack()
        decoder_outputs = tf.transpose(decoder_outputs, [1, 0, 2])

        attention_logits = attention_logits_array.gather(
            tf.range(0, attention_logits_array.size() - 1))
        attention_logits = tf.transpose(attention_logits, [1, 0, 2])

        context_vectors = context_vectors_array.gather(
            tf.range(0, context_vectors_array.size() - 1))
        context_vectors = tf.transpose(context_vectors, [1, 0, 2])

        pointer_probabilities = pointer_probability_array.gather(
            tf.range(0, pointer_probability_array.size() - 1))
        pointer_probabilities = tf.transpose(pointer_probabilities, [1, 0])

        return decoder_outputs, context_vectors, attention_logits, pointer_probabilities

    def _custom_rnn_loop_fn(self, cell_size, training_wheels):
        '''
        Return the loop_fn passed to tf.nn.raw_rnn by the full decoder.

        Args:
            self: QuerySumModel.
            cell_size: int, the decoder cell's output size.
            training_wheels: bool, see _rnn_attention_decoder.
        '''

        def loop_fn(time, cell_output, cell_state, loop_state):
            if cell_output is None:  # time == 0
                # One slot more than the reference length, because this
                # function writes once per call including the initial one.
                array_size = tf.shape(self.references_placeholder)[1] + 1
                context_vectors_array = tf.TensorArray(tf.float32,
                                                       size=array_size)
                attention_logits_array = tf.TensorArray(tf.float32,
                                                        size=array_size)
                pointer_probability_array = tf.TensorArray(tf.float32,
                                                           size=array_size)
                next_cell_state = self.final_encoder_state
                go_id = self.summary_vocabulary.word_to_id('<GO>')
                last_output_embedding = tf.nn.embedding_lookup(
                    self.embeddings, tf.tile([go_id], [self.batch_size]))
            else:
                context_vectors_array, attention_logits_array, pointer_probability_array = loop_state
                next_cell_state = cell_state

                if training_wheels:
                    voc_indices = self.references_placeholder[:, time - 1]
                    pointer_indices = self.pointer_reference_placeholder[:, time - 1]
                    pointer_switch = tf.cast(
                        self.pointer_switch_placeholder[:, time - 1], tf.bool)

                    batch_range = tf.range(self.batch_size)
                    pointer_indexer = tf.stack([batch_range, pointer_indices],
                                               axis=1)
                    attention_vocabulary_indices = tf.gather_nd(
                        self.documents_placeholder, pointer_indexer)

                    # Where the pointer switch is on, feed the document word
                    # that was pointed at instead of the vocabulary word.
                    mixed_indices = tf.where(pointer_switch,
                                             attention_vocabulary_indices,
                                             voc_indices)
                    last_output_embedding = tf.nn.embedding_lookup(
                        self.embeddings, mixed_indices)
                else:
                    last_output_embedding = self._extract_argmax_and_embed(
                        cell_output, cell_size,
                        tf.shape(self.documents_placeholder)[0])

            context_vector, attention_logits = self._attention(
                next_cell_state, last_output_embedding)
            pointer_probabilities = self._pointer_probabilities(
                context_vector, next_cell_state, last_output_embedding)

            context_vectors_array = context_vectors_array.write(
                time, context_vector)
            attention_logits_array = attention_logits_array.write(
                time, attention_logits)
            pointer_probability_array = pointer_probability_array.write(
                time, pointer_probabilities)

            next_input = tf.concat(
                [last_output_embedding, context_vector, self.query_last],
                axis=1)
            elements_finished = (time >= self.reference_lengths_placeholder)
            emit_output = cell_output
            next_loop_state = (context_vectors_array, attention_logits_array,
                               pointer_probability_array)
            return elements_finished, next_input, next_cell_state, emit_output, next_loop_state

        return loop_fn

    def _precompute_partial_attention_scores(self):
        '''
        Project the encoder outputs and the query state for attention.

        Sets self.encoder_state_attention_partial_scores (encoder outputs
        times attention_w_d, first two axes swapped) and
        self.query_attention_partial_score (query_last times attention_w_q),
        so _score does not redo these products for every decoder step.
        '''
        encoder_outputs_flat = tf.reshape(
            self.encoder_outputs, shape=[-1, self.encoder_output_size])
        self.encoder_state_attention_partial_scores = tf.matmul(
            encoder_outputs_flat, self.attention_w_d)
        self.encoder_state_attention_partial_scores = tf.reshape(
            self.encoder_state_attention_partial_scores,
            shape=[self.batch_size, -1, self.attention_hidden_output_size])
        self.encoder_state_attention_partial_scores = tf.transpose(
            self.encoder_state_attention_partial_scores, [1, 0, 2])

        self.query_attention_partial_score = tf.matmul(self.query_last,
                                                       self.attention_w_q)

    def _score(self, prev_decoder_state, prev_embedding):
        '''
        Compute masked attention scores over the input document.

        In 'decode' mode the partial scores are read from placeholders,
        otherwise from the tensors built by
        _precompute_partial_attention_scores.
        '''
        # Returns scores in a tensor of shape [batch_size, input_sequence_length]
        if self.mode == 'decode':
            query_part = self.query_attention_partial_score_placeholder
            encoder_part = self.encoder_state_attention_partial_scores_placeholder
        else:
            query_part = self.query_attention_partial_score
            encoder_part = self.encoder_state_attention_partial_scores

        embedding_part = tf.matmul(prev_embedding, self.attention_w_e)

        # XXX: this is where the shape mismatch is
        output = tf.matmul(
            prev_decoder_state, self.attention_w
        ) + embedding_part + query_part + encoder_part + self.attention_b
        output = tf.tanh(output)
        output = tf.reduce_sum(self.attention_v * output, axis=2)
        output = tf.transpose(output, [1, 0])

        # Handle input document padding by giving a large penalty, eliminating it from the weighted average
        padding_penalty = -1e20 * tf.to_float(
            1 - tf.sign(self.documents_placeholder))
        masked = output + padding_penalty
        return masked

    def _attention(self, prev_decoder_state, prev_embedding):
        '''
        Return (context vector, attention scores) for one decoder step.
        '''
        with tf.variable_scope('attention') as scope:
            # e = score of shape [batch_size, output_seq_length, input_seq_length], e_{ij} = score(s_{i-1}, h_j)
            # e_i = score of shape [batch_size, input_seq_length], e_ij = score(prev_decoder_state, h_j)
            e_i = self._score(prev_decoder_state, prev_embedding)
            # alpha_i = softmax(e_i) of shape [batch_size, input_seq_length]
            alpha_i = tf.nn.softmax(e_i)

            resized_alpha_i = tf.reshape(
                tf.tile(alpha_i, [1, self.encoder_output_size]),
                [self.batch_size, -1, self.encoder_output_size])

            if self.mode == 'decode':
                c_i = tf.reduce_sum(tf.multiply(
                    resized_alpha_i,
                    self.pre_computed_encoder_states_placeholder),
                                    axis=1)
            else:
                c_i = tf.reduce_sum(tf.multiply(resized_alpha_i,
                                                self.encoder_outputs),
                                    axis=1)
            return c_i, e_i

    def _pointer_probabilities(self, attention, cell_state,
                               last_output_embedding):
        '''
        Return the per-example sigmoid pointer probability, shape [batch_size].
        '''
        combined_input = tf.concat(
            [attention, cell_state, last_output_embedding], axis=1)
        result = tf.sigmoid(
            tf.matmul(combined_input, self.pointer_probability_project_w) +
            self.pointer_probability_project_b)
        # Remove extra dimension of size 1
        result = tf.reshape(result, shape=[self.batch_size])
        return result

    def _compute_argmax_and_loss(self, decoder_outputs, context_vectors,
                                 attention_logits, pointer_probabilities):
        '''
        Project decoder outputs onto the vocabulary and compute the loss.

        Returns:
            (vocabulary_argmax, mean_loss): the argmax over the vocabulary
            logits, and the mean over examples of the length-normalised sum
            of pointer-switch, vocabulary and pointer losses, with positions
            whose reference id is 0 masked out.
        '''
        # Projection onto vocabulary is based on
        # http://www.wildml.com/2016/08/rnns-in-tensorflow-a-practical-guide-and-undocumented-features/
        vocabulary_project_input = tf.concat(
            [decoder_outputs, context_vectors], axis=2)
        # Flatten output over batch dimension
        vocabulary_project_input_flat = tf.reshape(
            vocabulary_project_input,
            [-1, self.decoder_cell_state_size + self.encoder_output_size])
        vocabulary_hidden_flat = tf.matmul(
            vocabulary_project_input_flat,
            self.vocabulary_project_w_1) + self.vocabulary_project_b_1
        logits_flat = tf.matmul(
            vocabulary_hidden_flat,
            self.vocabulary_project_w_2) + self.vocabulary_project_b_2

        max_decoder_length = tf.shape(decoder_outputs)[1]
        # Reshape back to [batch_size, max_decoder_length, vocabulary_size]
        logits = tf.reshape(
            logits_flat,
            [-1, max_decoder_length, self.target_vocabulary_size])
        vocabulary_argmax = tf.argmax(logits, 2)

        references_placeholder_flat = tf.reshape(self.references_placeholder,
                                                 [-1, 1])
        # Calculate the losses
        losses_flat = tf.nn.sampled_softmax_loss(
            weights=tf.transpose(self.vocabulary_project_w_2),
            biases=self.vocabulary_project_b_2,
            labels=references_placeholder_flat,
            inputs=vocabulary_hidden_flat,
            num_sampled=512,
            num_classes=self.target_vocabulary_size)
        vocabulary_loss = tf.reshape(losses_flat, [-1, max_decoder_length])

        # Previous loss function for full softmax
        # vocabulary_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits,
        #                                                                  labels=self.references_placeholder)
        pointer_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=attention_logits,
            labels=self.pointer_reference_placeholder)

        float_pointer_switch_reference = tf.to_float(
            self.pointer_switch_placeholder)
        # Binary cross-entropy on the pointer switch; 1e-9 keeps log finite.
        pointer_probability_loss = (
            float_pointer_switch_reference *
            -tf.log(pointer_probabilities + 1e-9) +
            (1. - float_pointer_switch_reference) *
            -tf.log(1. - pointer_probabilities + 1e-9))

        # Mask out padding from loss computation
        length_mask = tf.sign(tf.to_float(self.references_placeholder))
        masked_losses = length_mask * (
            pointer_probability_loss +
            (1. - float_pointer_switch_reference) * vocabulary_loss +
            float_pointer_switch_reference * pointer_loss)

        float_lengths = tf.to_float(self.reference_lengths_placeholder)
        # Calculate mean loss
        mean_loss_by_example = tf.reduce_sum(masked_losses,
                                             axis=1) / float_lengths
        mean_loss = tf.reduce_mean(mean_loss_by_example)

        return vocabulary_argmax, mean_loss

    def _extract_argmax_and_embed(self, cell_output, cell_size, batch_size):
        '''
        Stub: embed the argmax of all-zero logits (see TODO below).
        '''
        # Flatten output over batch dimension
        rnn_outputs_flat = tf.reshape(cell_output, [-1, cell_size])
        # Running without training wheels is currently not supported
        # TODO: Fix or remove
        logits_flat = tf.zeros([batch_size, self.target_vocabulary_size])
        # logits_flat = tf.matmul(rnn_outputs_flat, self.vocabulary_project_w) + self.vocabulary_project_b

        # Reshape back to [batch_size, vocabulary_size]
        logits = tf.reshape(logits_flat, [-1, self.target_vocabulary_size])
        vocabulary_argmax = tf.argmax(logits, 1)
        return tf.nn.embedding_lookup(self.embeddings, vocabulary_argmax)

    def _add_optimizer(self):
        '''
        Create the Adam optimizer and the clipped-gradient training op.

        Sets self.optimizer, self.final_train_loss and
        self.minimize_operation (which also increments self.global_step).
        '''
        self.optimizer = AdamOptimizer()
        self.final_train_loss = self.main_train_loss

        with tf.variable_scope('l2_regularization'):
            # Find variables to regularize by iterating over all variables and checking if in set. Haven't found way to
            # directly get variables by absolute path.
            l2_regularized_names = {
                'encoder/bidirectional_rnn/fw/gru_cell/gates/weights:0'
                # If used, add additional complete variables names
            }
            l2_regularized = [
                variable for variable in tf.trainable_variables()
                if variable.name in l2_regularized_names
            ]
            # tf.add_n rejects an empty list, which is what we get whenever
            # no variable carries the name above (e.g. with cell='lstm').
            # NOTE(review): l2_loss is never added to final_train_loss —
            # confirm whether that is intentional.
            if l2_regularized:
                l2_loss = 0.001 * tf.add_n(
                    [tf.nn.l2_loss(variable) for variable in l2_regularized])
            else:
                l2_loss = 0.0

        gradients = self.optimizer.compute_gradients(self.final_train_loss)

        with tf.variable_scope('gradient_clipping'):

            def clip_gradient(gradient, variable):
                # Only clip normal tensors, IndexedSlices gives warning otherwise
                if isinstance(gradient, tf.Tensor):
                    gradient = tf.clip_by_norm(gradient, 10)
                return gradient, variable

            gradients = [
                clip_gradient(gradient, variable)
                for gradient, variable in gradients
            ]

        self.minimize_operation = self.optimizer.apply_gradients(
            gradients, global_step=self.global_step)

    def _rnn_one_step_attention_decoder(self, decoder_cell,
                                        initial_input_word_embedding,
                                        initial_cell_state):
        '''
        Run the decoder for a single step (used in 'decode' mode).

        Returns:
            (decoder_outputs, final_state, context_vector, attention_logits,
            pointer_probabilities).
        '''
        loop_fn = self._custom_one_step_rnn_loop_fn(
            initial_input_word_embedding, initial_cell_state)
        decoder_outputs, final_state, (context_vector, attention_logits,
                                       pointer_probabilities) = tf.nn.raw_rnn(
                                           decoder_cell, loop_fn)
        decoder_outputs = decoder_outputs.stack()
        decoder_outputs = tf.transpose(decoder_outputs, [1, 0, 2])
        return decoder_outputs, final_state, context_vector, attention_logits, pointer_probabilities

    def _custom_one_step_rnn_loop_fn(self, initial_input_word_embedding,
                                     initial_cell_state):
        '''
        Return a raw_rnn loop_fn that reports finished after the first step.
        '''

        def loop_fn(time, cell_output, cell_state, loop_state):
            if cell_output is None:  # time == 0
                next_cell_state = initial_cell_state
                context_vector, attention_logits = self._attention(
                    next_cell_state, initial_input_word_embedding)
                pointer_probabilities = self._pointer_probabilities(
                    context_vector, next_cell_state,
                    initial_input_word_embedding)
                next_input = tf.concat([
                    initial_input_word_embedding, context_vector,
                    self.pre_computed_query_state_placeholder
                ],
                                       axis=1)
                next_loop_state = (context_vector, attention_logits,
                                   pointer_probabilities)
            else:
                next_cell_state = cell_state
                # Placeholder input only; the loop is finished at this point.
                next_input = tf.zeros(shape=[
                    self.batch_size, self.word_embedding_dim +
                    self.encoder_output_size + self.encoder_cell_state_size
                ])
                next_loop_state = loop_state

            elements_finished = cell_output is not None
            emit_output = cell_output
            return elements_finished, next_input, next_cell_state, emit_output, next_loop_state

        return loop_fn

    def _extract_top_k_argmax(self, k, cell_output, context_vectors):
        '''
        Return (indices, probabilities) of the k most probable vocabulary
        entries under a softmax over the projected decoder output.
        '''
        cell_output_flat = tf.reshape(cell_output,
                                      [-1, self.decoder_cell_state_size])
        vocabulary_project_input = tf.concat(
            [cell_output_flat, context_vectors], axis=1)

        vocabulary_hidden = tf.matmul(
            vocabulary_project_input,
            self.vocabulary_project_w_1) + self.vocabulary_project_b_1
        logits = tf.matmul(
            vocabulary_hidden,
            self.vocabulary_project_w_2) + self.vocabulary_project_b_2

        top_k_probabilities, vocabulary_argmax = tf.nn.top_k(
            tf.nn.softmax(logits), k)
        return vocabulary_argmax, top_k_probabilities
def __init__(self, **optimizer_kwargs):
    """Configure the optimizer and the optional post-optimizer it wraps.

    Required keyword arguments: ``model``, ``individual_learning_rate``,
    ``learning_rate`` and ``rescale_learning_rate``.

    Optional keyword argument ``post_optimizer`` selects the wrapped
    optimizer: ``None`` (default; the parent class is used), "Momentum",
    "RMSProp", "Adam", "Nadam", "Nesterov" or "NesterovConst".

    Raises:
        KeyError: if a required keyword argument is missing.
        ValueError: if ``post_optimizer`` is not one of the values above.
    """
    self._model = optimizer_kwargs["model"]
    self._individual_learning_rate = optimizer_kwargs[
        "individual_learning_rate"]
    self._learning_rate = optimizer_kwargs["learning_rate"]
    self._rescale_learning_rate = optimizer_kwargs["rescale_learning_rate"]
    self._d_p = None
    self._n_reg = None

    post_optimizer = optimizer_kwargs.get("post_optimizer")
    if post_optimizer is None:
        # No post-optimizer requested: delegate to the parent class.
        self._post_optimizer = super()
    elif post_optimizer == "Momentum":
        self._post_optimizer = MomentumOptimizer(
            learning_rate=self._learning_rate,
            momentum=0.95,
            use_locking=False,
            name="MomentumOptimizer")
    elif post_optimizer == "RMSProp":
        self._post_optimizer = RMSPropOptimizer(
            learning_rate=self._learning_rate,
            decay=0.9,
            epsilon=1e-5,
            use_locking=False,
            name="RMSPropOptimizer")
    elif post_optimizer == "Adam":
        self._post_optimizer = AdamOptimizer(
            learning_rate=self._learning_rate,
            beta1=0.9,
            beta2=0.999,
            epsilon=1e-8,
            use_locking=False,
            name="AdamOptimizer")
    elif post_optimizer == "Nadam":
        self._post_optimizer = NadamOptimizer(
            learning_rate=self._learning_rate,
            beta1=0.9,
            beta2=0.999,
            epsilon=1e-8,
            use_locking=False,
            name="NadamOptimizer")
    elif post_optimizer == "Nesterov":
        self._post_optimizer = MomentumOptimizer(
            learning_rate=self._learning_rate,
            momentum=0.95,
            use_locking=False,
            use_nesterov=True,
            name="NesterovMomentumOptimizer")
    elif post_optimizer == "NesterovConst":
        self._post_optimizer = NesterovConst(
            model=self._model,
            learning_rate=self._learning_rate,
            use_locking=False,
            name="NesterovConstOptimizer")
    else:
        # ValueError subclasses Exception, so existing handlers still match.
        raise ValueError(
            "There is no such post optimizer defined. Must be: None, "
            "Momentum, RMSProp, Adam, Nadam, Nesterov, NesterovConst")

    super().__init__(self._learning_rate)
from tensorflow.python.keras.optimizers import SGD from tensorflow.python.training.adam import AdamOptimizer from audio.adapter import get_audio_adapter from dataset import DatasetBuilder from model import model_fn from model.KerasUnet import getUnetModel from utils.configuration import load_configuration import tensorflow as tf import csv audio_path = '../musdb_dataset/' config_path = "../config/musdb_config.json" INIT_LR = 1e-3 opt = AdamOptimizer(INIT_LR) opt = SGD(lr=INIT_LR, momentum=0.9) _instruments = ['vocals_spectrogram'] model_dict = {} model_trainable_variables = {} val_loss_results = [] val_metrics_results = [] export_dir = '../spleeter_saved_model_dir/' metrics_csv = './csv_metrics/metrics_loss.csv' def get_training_dataset(audio_params, audio_adapter, audio_path): """ Builds training dataset.
dtype=tf.int32), batch_sz, name='accuracy') tf.summary.scalar('accuracy', accuracy) from tflearn.objectives import categorical_crossentropy loss = categorical_crossentropy(softmax_class_op, selected_gesture) tf.summary.scalar('classification_loss', loss) with tf.variable_scope('optimize'): lr_op = tf.Variable(5e-4, False, dtype=tf.float32) decay_lr_op = tf.assign(lr_op, lr_op * (1 - 1e-4)) tf.summary.scalar('learning_rate', lr_op) with tf.control_dependencies([decay_lr_op]): train_step = AdamOptimizer(learning_rate=lr_op).minimize(loss) display_q = queue.Queue(10) def display(): while True: softmax_class, display_states = display_q.get() print("Prediction: ", np.max(softmax_class, axis=1)) for states in np.transpose(display_states, axes=[1, 0, 2]): env.step(states) env.render() sleep(.2 / (display_q.qsize() + 1)) env.reset()
d.load_embeddings(args.emb_type, args.word2vec_file, args.glove_file, args.fasttext_file, args.custom_file, logger) d.batch = d.batch_generator(args.mb) m = bayesian_emb_model(d, d.K, sess, dir_name) sigmas_list = list() # TRAINING n_iters, n_batches = get_n_iters(args.n_epochs, args.mb, len(d.word_target)) logger.debug('init training number of iters '+str(n_iters)+' and batches '+str(n_batches)) #kl_scaling_weights = get_kl_weights(n_batches) learning_rates = get_learning_rates(args.clr_type, n_iters, args.clr_cycles, args.base_lr, args.max_lr, args.lr) m.inference.initialize(n_samples=1, n_iter=n_iters, logdir=m.logdir, scale={m.y_pos: n_batches, m.y_neg: n_batches / args.ns}, kl_scaling={m.y_pos: n_batches, m.y_neg: n_batches / args.ns}, optimizer=AdamOptimizer(learning_rate=m.learning_rate_placeholder) ) early_stopping = EarlyStopping(patience=args.patience) init = tf.global_variables_initializer() sess.run(init) logger.debug('....starting training') iteration = 0 for epoch in range(args.n_epochs): for batch in range(n_batches): info_dict = m.inference.update(feed_dict=d.feed(m.target_placeholder, m.context_placeholder, m.labels_placeholder, m.ones_placeholder, m.zeros_placeholder, m.learning_rate_placeholder, args.mb,
def create_optimizer(step: Tensorflow2ModelStep, context: ExecutionContext):
    """Return an Adam optimizer configured from the step's hyperparameters.

    Args:
        step: model step whose ``hyperparams['learning_rate']`` is used.
        context: execution context; not used by this function.
    """
    learning_rate = step.hyperparams['learning_rate']
    return AdamOptimizer(learning_rate=learning_rate)
n_iters, n_batches = get_n_iters()
logger.debug('init training number of iters ' + str(n_iters) +
             ' and batches ' + str(n_batches))
m.inference.initialize(n_samples=1,
                       n_iter=n_iters,
                       logdir=m.logdir,
                       scale={
                           m.y_pos: n_batches,
                           m.y_neg: n_batches / args.ns
                       },
                       kl_scaling={
                           m.y_pos: n_batches,
                           m.y_neg: n_batches / args.ns
                       },
                       optimizer=AdamOptimizer(learning_rate=0.001))
init = tf.global_variables_initializer()
sess.run(init)
logger.debug('....starting training')
# NOTE(review): indentation was lost in this file; everything from the
# checkpoint save to the early-stop test is assumed to run only on every
# 10000th iteration — confirm.
for i in range(m.inference.n_iter):
    info_dict = m.inference.update(feed_dict=d.feed(
        args.mb, m.target_placeholder, m.context_placeholder,
        m.labels_placeholder, m.ones_placeholder, m.zeros_placeholder,
        True))
    m.inference.print_progress(info_dict)
    if i % 10000 == 0:
        m.saver.save(sess, os.path.join(m.logdir, "model.ckpt"), i)
        sigmas = m.sigU.eval()[:, 0]
        sigmas_list.append(sigmas)
        # Close the file deterministically instead of leaking the handle.
        with open(dir_name + "/sigmas.dat", "wb+") as sigmas_file:
            pickle.dump(sigmas_list, sigmas_file)
        if is_goog_embedding(sigmas):
            break
) n_features = 1001 n_classes = 101 batch_size = 32 val_batch_size = 256 tree = SoftDecisionTree(max_depth=6, n_features=n_features, n_classes=n_classes, max_leafs=None) tree.build_tree() # optimizer optimizer = AdamOptimizer(learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-08).minimize(tree.loss) # Saving the model # saver = tf.train.Saver() # Initialize the variables (i.e. assign their default value) init = global_variables_initializer() EPOCHS = 1000 TOTAL_BATCH = 16 display_step = 100 with tf.compat.v1.Session() as sess: sess.run(init) t0 = time.time()
def __init__(self, word_vector_size):
    """Build the TF graph, the loss and the Adam train op, then open a session.

    Args:
        word_vector_size: last dimension of the ``self.vectors`` placeholder
            and of the convolution filters created here.

    Reads the names ``num_classes``, ``use_attribution`` and
    ``att_max_value`` from an enclosing scope (not defined in this method).

    NOTE(review): block nesting was reconstructed from a whitespace-flattened
    source; the extent of the ``tf.name_scope("relevance")`` block and of the
    ``if use_attribution:`` body should be confirmed against the original.
    """
    # Start from an empty default graph so repeated construction does not
    # accumulate ops.
    tf.reset_default_graph()
    self.vector_size = word_vector_size

    # Input placeholders.
    self.vectors = tf.placeholder(tf.float32,
                                  shape=(None, None, word_vector_size))
    self.user_terms = tf.placeholder(tf.float32, shape=(None, None))
    self.padding = tf.placeholder(tf.float32, shape=(None, None))
    self.output = tf.placeholder(tf.float32, shape=(None, 1))
    self.dropout_rate = tf.placeholder(tf.float32)

    xavier = tf.contrib.layers.xavier_initializer()

    # 50 tri-gram, 50 4-gram and 50 5-gram
    # Each filter has shape (1, width, word_vector_size, 50) with a (1, 50)
    # zero-initialised bias.
    filter_tri = tf.Variable(xavier((1, 3, word_vector_size, 50)),
                             name="weight")
    bias_tri = tf.Variable(tf.zeros((1, 50)), name="bias")
    self.f3 = filter_tri
    self.b3 = bias_tri

    filter_4 = tf.Variable(xavier((1, 4, word_vector_size, 50)),
                           name="weight")
    bias_4 = tf.Variable(tf.zeros((1, 50)), name="bias")
    self.f4 = filter_4
    self.b4 = bias_4

    filter_5 = tf.Variable(xavier((1, 5, word_vector_size, 50)),
                           name="weight")
    bias_5 = tf.Variable(tf.zeros((1, 50)), name="bias")
    self.f5 = filter_5
    self.b5 = bias_5

    with tf.name_scope("relevance"):
        # 150 = 3 filter banks x 50 channels (presumably the concatenated
        # feature size consumed by self.forward -- TODO confirm).
        hidden = 150
        self.relevance_weight = tf.Variable(0.01 * xavier(
            (hidden, num_classes)))
        # Multiplying by 0.0 yields an all-zero initial bias of shape
        # (1, num_classes).
        self.relevance_bias = tf.Variable(0.0 * xavier(
            (1, num_classes)))

    # Forward pass on the raw inputs; self.forward returns a 3-tuple.
    rel, pre_max_true_dropped, pre_max_sum = self.forward(
        self.vectors)
    self.relevance = rel

    # Second forward pass on inputs multiplied element-wise by user_terms
    # (expanded with a trailing axis so it broadcasts over the vector dim).
    ut = tf.expand_dims(self.user_terms, 2)  # NWC
    rel_masked, pre_max_true_masked_dropped, _ = self.forward(
        self.vectors * ut)
    self.rel_masked = rel_masked
    self.pre_max_sum = pre_max_sum

    # Presumably populates self.attributions, which is read below -- verify.
    self.get_attribution()

    # Cross-entropy style loss on the unmasked prediction; 10**-5 keeps the
    # log argument away from zero.
    prediction_error = -tf.reduce_sum(
        tf.one_hot(tf.cast(self.output, tf.int32), num_classes) *
        tf.log(rel + 10**-5, name="log2rel"))

    # One "head" per attribution: attribution weighted by user_terms and
    # summed over axis 1.
    heads = []
    for att in self.attributions:
        heads.append(
            tf.reduce_sum(tf.multiply(att, self.user_terms), axis=1))
    heads_all = tf.stack(heads)
    self.h = heads_all
    self.a = tf.stack(self.attributions)
    # pos_heads =
    # neg_heads = tf.reduce_sum(tf.multiply(self.neg_attribution, self.user_terms), axis=1)

    # Attribution-related loss terms default to zero and are only filled in
    # when use_attribution is truthy.
    misattribution_error = 0.0
    corrective_error = 0.0
    att_reg = 0.0
    if use_attribution:
        # Squared distance from 0.9 of the head selected by the label of the
        # first example (self.output[0][0]).
        misattribution_error += (
            self.h[tf.cast(self.output[0][0], tf.int32)][0] - 0.9)**2
        att_reg = 0
        for att in self.attributions:
            # Penalise only the part of each attribution above att_max_value.
            att_reg += tf.reduce_sum(
                tf.nn.relu(att - att_max_value))
        # Same loss form as prediction_error, on the masked forward pass.
        corrective_error = -tf.reduce_sum(
            tf.one_hot(tf.cast(self.output, tf.int32), num_classes) *
            tf.log(rel_masked + 10**-5, name="log2rel"))

    # The attribution terms are multiplied by sign(sum(user_terms)), so they
    # vanish when user_terms sums to zero.
    self.error = (
        prediction_error + tf.sign(tf.reduce_sum(self.user_terms)) *
        (misattribution_error + corrective_error + att_reg))

    self.opt = AdamOptimizer()
    # NOTE: self.optimizer holds the result of minimize(), not the optimizer
    # object (that is self.opt).
    self.optimizer = self.opt.minimize(self.error)

    self.sess = tf.Session()
    self.sess.run(tf.global_variables_initializer())
    self.training = False
def fit(self, dataset):
    """Rebuild the CNN from the current weights, train it, and log metrics.

    Args:
        dataset: object exposing ``train.labels``,
            ``train.next_batch(batch_size=...)``, ``test.images`` and
            ``test.labels``.

    Returns:
        ``self``.

    Side effects visible here: updates ``self.w``, ``self.grad``, ``self.t``
    and appends ``[avg_cost, test_acc, str(self.t)]`` to ``self.history``.

    NOTE(review): loop nesting was reconstructed from a whitespace-flattened
    source; the test-accuracy evaluation and history append are assumed to
    run once per epoch -- confirm against the original file.
    """
    self.w = self.w_hat
    if self.train_type == 'Center':
        self.torque = self.iteration
        # NOTE(review): self.w is subscripted below (self.w[:2], self.w[4],
        # ...); an int 0 would not support that -- confirm intent.
        self.w = 0

    x = tf.placeholder(tf.float32, [None, 784])
    # dynamically reshape the input
    x_shaped = tf.reshape(x, [-1, 28, 28, 1])
    # now declare the output data placeholder - 10 digits
    y = tf.placeholder(tf.float32, [None, 10])

    # create some convolutional layers
    # self.w[:2] / self.w[2:4] are handed to the two conv layers.
    layer1 = create_new_conv_layer(x_shaped,
                                   self.w[:2],
                                   self.layer1_size[0],
                                   self.layer1_size[1],
                                   self.layer1_size[2],
                                   self.layer1_size[3],
                                   name='layer1')
    layer2 = create_new_conv_layer(layer1,
                                   self.w[2:4],
                                   self.layer2_size[0],
                                   self.layer2_size[1],
                                   self.layer2_size[2],
                                   self.layer2_size[3],
                                   name='layer2')

    # Flattened feature count: (side length)^2 * layer2_size[1].
    flattened_parameter_size = self.flattend_size(
    )**2 * self.layer2_size[1]
    flattened = tf.reshape(layer2, [-1, flattened_parameter_size])

    # setup some weights and bias values for this layer, then activate with ReLU
    wd1 = tf.Variable(self.w[4].reshape(flattened_parameter_size,
                                        self.flatten1_size),
                      name='wd1')
    bd1 = tf.Variable(self.w[5], name='bd1')
    dense_layer1 = tf.matmul(flattened, wd1) + bd1
    dense_layer1 = tf.nn.relu(dense_layer1)

    # another layer with softmax activations
    wd2 = tf.Variable(self.w[6].reshape(self.flatten1_size,
                                        self.flatten2_size),
                      name='wd2')
    bd2 = tf.Variable(self.w[7], name='bd2')
    dense_layer2 = tf.matmul(dense_layer1, wd2) + bd2
    y_ = tf.nn.softmax(dense_layer2)

    # loss is cross_entropy loss (computed from the pre-softmax logits)
    cross_entropy = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(logits=dense_layer2,
                                                labels=y))

    # metrics
    correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

    # Optimizer initialization
    # Two train ops over the same loss: `optimizer` is used for all but the
    # last epoch, `optimizer_gradient` for the last one, where its run result
    # is also read (g[1]).
    optimizer_gradient = AdamOptimizer_Bing(
        learning_rate=self.learning_rate).minimize(cross_entropy)
    optimizer = AdamOptimizer(
        learning_rate=self.learning_rate).minimize(cross_entropy)

    # setup the initialisation operator
    init_op = tf.global_variables_initializer()

    # Per-batch gradients collected during the final epoch.
    grad = []
    with tf.Session() as sess:
        # initialise the variables
        sess.run(init_op)
        total_batch = int(len(dataset.train.labels) / self.batch_size)
        count = 0
        for epoch in range(self.torque):
            avg_cost = 0
            self.t += 1
            count += 1
            if count < self.torque:
                # Regular epoch: plain Adam updates.
                for i in range(total_batch):
                    batch_x, batch_y = dataset.train.next_batch(
                        batch_size=self.batch_size)
                    _, c = sess.run([optimizer, cross_entropy],
                                    feed_dict={
                                        x: batch_x,
                                        y: batch_y
                                    })
                    avg_cost += c / total_batch
            elif count == self.torque:
                '''
                #self.grad saved for belta computation.
                #It denotes in time t(update time), the gradient of local loss of local parameters
                '''
                for i in range(total_batch):
                    batch_x, batch_y = dataset.train.next_batch(
                        batch_size=self.batch_size)
                    g, c = sess.run([optimizer_gradient, cross_entropy],
                                    feed_dict={
                                        x: batch_x,
                                        y: batch_y
                                    })
                    # g[1] is grad_var list
                    gradient_temp = batch_gradient_collector(g[1])
                    grad.append(gradient_temp)
                    avg_cost += c / total_batch
                # Parameters are taken from the last batch's g[1].
                self.w = batch_parameter_collector(g[1])
                # Sum up gradients from each batch
                self.grad = np.array(grad).sum(axis=0)
            test_acc = sess.run(accuracy,
                                feed_dict={
                                    x: dataset.test.images,
                                    y: dataset.test.labels
                                })
            self.history.append([avg_cost, test_acc, str(self.t)])
    return self
class DBQA(DependencyParserBase):
    """Question/answer ranker trained on (question, correct, wrong) triples.

    The constructor builds int32 placeholders for the question, the correct
    answer and a wrong answer (each with a bigram counterpart), connects them
    to ``PairwiseSimilarity`` for the loss/accuracy and similarity tensors,
    and creates an Adam train op that advances the global step.
    """

    available_data_formats = {
        "word-based": NLPCC16DBQA,
        "character-based": NLPCC16DBQACharacterBased
    }
    default_data_format_name = "word-based"

    @classmethod
    def add_parser_arguments(cls, arg_parser):
        """Register the DBQA command-line options on ``arg_parser``.

        Options of the parent class are registered first; DBQA's own options
        go into an argument group named after the class.
        """
        super(DBQA, cls).add_parser_arguments(arg_parser)
        group = arg_parser.add_argument_group(DBQA.__name__)
        group.add_argument("--external-embedding")
        group.add_argument("--batch-size", type=int, default=4096)
        group.add_argument("--embed-size", type=int, default=100)
        group.add_argument("--lstm-size", type=int, default=256)
        group.add_argument("--n-recur", type=int, default=2)
        group.add_argument("--use-bigram", type=int, default=1)
        # Keep probabilities are fractions: with `type=int` a value such as
        # 0.8 was rejected by argparse, so only "no dropout" was expressible.
        group.add_argument("--input-keep-prob", type=float, default=1.0)
        group.add_argument("--recurrent-keep-prob", type=float, default=1.0)
        group.add_argument("--seed", type=int, default=42)
        group.add_argument("--steps", type=int, default=50000)
        group.add_argument("--merger-type",
                           choices=["rnn", "cnn"],
                           default="rnn")

    def __init__(self, options, data_train, session=None):
        """Build the graph and attach (or create) a session.

        Args:
            options: parsed options; stored and forwarded to
                ``PairwiseSimilarity``.
            data_train: training data used to compute ``DBQAStatistics``.
            session: existing session to reuse. When ``None`` a new session
                is created and all global variables are initialised in it;
                a supplied session is used as-is (no initialisation here).
        """
        self.statistics = DBQAStatistics.from_data(data_train)
        self.options = options
        self.optimizer = AdamOptimizer()
        self.global_step = tf.train.get_or_create_global_step()

        # Inputs: question / correct answer / wrong answer, each as a
        # variable-sized 2-D int32 tensor plus its bigram counterpart.
        self.question_2d_pl = tf.placeholder(tf.int32, (None, None))
        self.question_bigram_2d_pl = tf.placeholder(tf.int32, (None, None))
        self.answer_2d_pl = tf.placeholder(tf.int32, (None, None))
        self.answer_bigram_2d_pl = tf.placeholder(tf.int32, (None, None))
        self.wrong_answer_2d_pl = tf.placeholder(tf.int32, (None, None))
        self.wrong_answer_bigram_2d_pl = tf.placeholder(
            tf.int32, (None, None))

        self.network = PairwiseSimilarity(options, self.statistics)
        self.loss, self.accuracy = self.network.get_loss(
            self.question_2d_pl,
            self.question_bigram_2d_pl,
            self.answer_2d_pl,
            self.answer_bigram_2d_pl,
            self.wrong_answer_2d_pl,
            self.wrong_answer_bigram_2d_pl,
        )
        self.similarity = self.network.get_similarity(
            self.question_2d_pl, self.question_bigram_2d_pl,
            self.answer_2d_pl, self.answer_bigram_2d_pl)
        self.optimize_op = self.optimizer.minimize(
            self.loss, global_step=self.global_step)

        # Create the saver only once the whole graph exists.
        # `tf.global_variables()` is a snapshot taken at call time; building
        # the saver right after the global step (as before) made it track
        # that single variable and none of the network or optimizer state.
        self.saver = tf.train.Saver(tf.global_variables(), max_to_keep=1)

        if session is None:
            self.session = self.create_session()
            self.session.run(tf.global_variables_initializer())
        else:
            self.session = session
        # Fixed-seed RNG used for batch generation and data shuffling.
        self.random = Random(42)

    def create_session(self):
        """Return a new ``tf.Session`` with a default ``ConfigProto``."""
        config_proto = tf.ConfigProto()
        # config_proto.gpu_options.per_process_gpu_memory_fraction = self.options.per_process_gpu_memory_fraction
        return tf.Session(config=config_proto)

    def train(self, data_train):
        """Run one pass over the batches generated from ``data_train``.

        Each batch performs one optimiser step and logs the global step,
        loss and accuracy fetched in that same run call.
        """
        for questions_np, questions_bigram_np, \
            corrects_np, corrects_bigram_np, \
            wrongs_np, wrongs_bigram_np in generate_train_batches(
                data_train, self.options.batch_size, self.random
        ):
            step, loss, accuracy, _ = self.session.run(
                [self.global_step, self.loss, self.accuracy,
                 self.optimize_op],
                {
                    self.question_2d_pl: questions_np,
                    self.question_bigram_2d_pl: questions_bigram_np,
                    self.answer_2d_pl: corrects_np,
                    self.answer_bigram_2d_pl: corrects_bigram_np,
                    self.wrong_answer_2d_pl: wrongs_np,
                    self.wrong_answer_bigram_2d_pl: wrongs_bigram_np
                })
            logger.info("Train: Step {}, loss {}, accuracy {}".format(
                step, loss, accuracy))

    @classmethod
    def repeat_train_and_validate(cls, data_train, data_devs, data_test,
                                  options):
        """Alternate training passes and dev-set scoring.

        Seeds TensorFlow with ``options.seed``, builds a parser, fills ids on
        every training and dev question, then loops: read the global step,
        stop once it exceeds ``options.steps``, otherwise shuffle, train one
        pass and write one score per line for each dev set under
        ``options.output``. ``data_test`` is not used here.
        """
        tf.set_random_seed(options.seed)
        parser = cls(options, data_train)
        for question in data_train:
            question.fill_ids(parser.statistics)
        for file_name, data_dev in data_devs.items():
            for question in data_dev:
                question.fill_ids(parser.statistics)

        while True:
            # `step` is sampled before this round of training and is also
            # the value embedded in the output file names below.
            step = parser.session.run(parser.global_step)
            if step > options.steps:
                break
            parser.random.shuffle(data_train)
            parser.train(data_train)
            for file_name, data_dev in data_devs.items():
                try:
                    prefix, suffix = os.path.basename(file_name).rsplit(
                        ".", 1)
                except ValueError:
                    # No "." in the base name: treat it all as the prefix.
                    prefix = os.path.basename(file_name)
                    suffix = ""
                dev_output = os.path.join(
                    options.output,
                    '{}_step_{}.{}'.format(prefix, step, suffix))
                # Materialise all scores before opening the output file.
                scores = list(parser.predict(data_dev))
                with open(dev_output, "w") as f_output:
                    for score in scores:
                        f_output.write("{}\n".format(score))

    @classmethod
    def load(cls, prefix, new_options=None):
        """Not implemented; does nothing and returns ``None``."""
        pass

    def predict(self, data_dev):
        """Yield similarity values for ``data_dev``, batch by batch."""
        for questions_np, questions_bigram_np,\
            answer_np, answer_bigram_np in generate_predict_batches(
                data_dev, self.options.batch_size
        ):
            similarities = self.session.run(
                self.similarity, {
                    self.question_2d_pl: questions_np,
                    self.question_bigram_2d_pl: questions_bigram_np,
                    self.answer_2d_pl: answer_np,
                    self.answer_bigram_2d_pl: answer_bigram_np
                })
            for similarity in similarities:
                yield similarity

    def save(self, prefix):
        """Not implemented; does nothing and returns ``None``."""
        pass
def training_embedding(reverse_dictionary, with_dp=False):
    """Train skip-gram word embeddings, optionally with noisy clipped gradients.

    Builds the graph (embedding lookup, NCE or sampled-softmax loss, gradient
    clipping, optional Gaussian noise, similarity probe), restores or saves a
    shared initial checkpoint, then trains until early stopping triggers or
    ``num_steps`` is reached, saving state at the configured saving points.

    Args:
        reverse_dictionary: mapping from word id to word, used only to print
            nearest neighbours of the validation words.
        with_dp: not read by this function; noise injection is controlled by
            ``FLAGS.with_dp``.

    Returns:
        None. Results are written through ``saving_state`` and printed/logged.

    Reads the following names from outside this function: ``FLAGS``,
    ``vocabulary_size``, ``num_steps``, ``saving_pointers``,
    ``max_early_stopping``, ``target_eps``, ``data_index``,
    ``generate_batch``, ``saving_state``, ``accountant`` and ``utils``.

    NOTE(review): block nesting was reconstructed from a whitespace-flattened
    source; confirm the extent of the ``tf.device`` scopes and the placement
    of the trailing logging/print statements against the original file.
    """
    batch_size = 128
    embedding_size = 300  # Dimension of the embedding vector.
    skip_window = 1  # How many words to consider left and right.
    num_skips = 2  # How many times to reuse an input to generate a label.

    # We pick a random validation set to sample nearest neighbors. here we limit the
    # validation samples to the words that have a low numeric ID, which by
    # construction are also the most frequent.
    valid_size = 16  # Random set of words to evaluate similarity on.
    valid_window = 100  # Only pick dev samples in the head of the distribution.
    valid_examples = np.array(random.sample(range(valid_window), valid_size))
    num_sampled = 64  # Number of negative examples to sample.
    learning_rate = 1

    # DP parameters
    clip_bound = 0.01  # 'the clip bound of the gradients'
    # num_steps = 160000 # 'number of steps T = E * N / L = E / q'
    sigma = 5  # 'sigma'
    delta = 1e-5  # 'delta'

    sess = tf.InteractiveSession()
    graph = tf.Graph()
    # NOTE(review): these two lists are never appended to, so the final
    # "Final avg_loss" / "loss" prints always show empty lists; the running
    # history actually lives in `average_loss_arr`.
    avg_loss_arr = []
    loss_arr = []

    # with graph.as_default(), tf.device('/cpu:0'):
    # Input data.
    with tf.device('/gpu:0'):
        train_dataset = tf.placeholder(tf.int32, shape=[batch_size])
        train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
        valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

        # Variables.
        embeddings = tf.Variable(
            tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))

        # Model.
        # Look up embeddings for inputs.
        embed = tf.nn.embedding_lookup(embeddings, train_dataset)

    if FLAGS.with_nce_loss:
        nce_weights = tf.Variable(
            tf.truncated_normal([vocabulary_size, embedding_size],
                                stddev=1.0 / math.sqrt(embedding_size)))
        nce_biases = tf.Variable(tf.zeros([vocabulary_size]))
        cross_entropy = tf.reduce_mean(
            tf.nn.nce_loss(weights=nce_weights,
                           biases=nce_biases,
                           labels=train_labels,
                           inputs=embed,
                           num_sampled=num_sampled,
                           num_classes=vocabulary_size))
    else:
        with tf.device('/gpu:0'):
            softmax_weights = tf.Variable(
                tf.truncated_normal([vocabulary_size, embedding_size],
                                    stddev=1.0 / math.sqrt(embedding_size)))
            softmax_biases = tf.Variable(tf.zeros([vocabulary_size]))
            # Compute the softmax loss, using a sample of the negative labels each time.
            # Read more: https://stackoverflow.com/questions/37671974/tensorflow-negative-sampling
            # When we want to compute the softmax probability for your true label,
            # we compute: logits[true_label] / sum(logits[negative_sampled_labels]
            # Other candidate sampling: https://www.tensorflow.org/extras/candidate_sampling.pdf
            cross_entropy = tf.reduce_mean(
                tf.nn.sampled_softmax_loss(weights=softmax_weights,
                                           biases=softmax_biases,
                                           inputs=embed,
                                           labels=train_labels,
                                           num_sampled=num_sampled,
                                           num_classes=vocabulary_size))

    priv_accountant = accountant.GaussianMomentsAccountant(vocabulary_size)
    privacy_accum_op = priv_accountant.accumulate_privacy_spending(
        [None, None], sigma, batch_size)

    # Optimizer.
    # Note: The optimizer will optimize the softmax_weights AND the embeddings.
    # This is because the embeddings are defined as a variable quantity and the
    # optimizer's `minimize` method will by default modify all variable quantities
    # that contribute to the tensor it is passed.
    # See docs on `tf.train.Optimizer.minimize()` for more details.
    # optimizer = tf.train.AdagradOptimizer(learning_rate).minimize(loss)
    # optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(cross_entropy)
    # Plain gradient descent is the default; FLAGS.optimizer can override it.
    optimizer = GradientDescentOptimizer(learning_rate)
    if FLAGS.optimizer == "adam":
        # cannot use adam so far. Tested and the model couldn't converge.
        optimizer = AdamOptimizer(learning_rate)
        print("##INFO: Using adam optimizer")
    if FLAGS.optimizer == "adagrad":
        # cannot use adam so far. Tested and the model couldn't converge.
        optimizer = AdagradOptimizer(learning_rate)
        print("##INFO: Using adagrad optimizer")

    log_dir = os.path.join(FLAGS.trained_models, "logs")

    # compute gradient
    # The weight/bias gradient names are shared by both loss variants so the
    # clipping and noise code below does not need to branch.
    if FLAGS.with_nce_loss:
        gw_Embeddings = tf.gradients(cross_entropy,
                                     embeddings)[0]  # gradient of embeddings
        gw_softmax_weights = tf.gradients(
            cross_entropy, nce_weights)[0]  # gradient of nce_weights
        gb_softmax_biases = tf.gradients(
            cross_entropy, nce_biases)[0]  # gradient of nce_biases
    else:
        with tf.device('/gpu:0'):
            gw_Embeddings = tf.gradients(
                cross_entropy, embeddings)[0]  # gradient of embeddings
            gw_softmax_weights = tf.gradients(
                cross_entropy,
                softmax_weights)[0]  # gradient of softmax_weights
            gb_softmax_biases = tf.gradients(
                cross_entropy,
                softmax_biases)[0]  # gradient of softmax_biases

    # clip gradient
    if FLAGS.clip_by_norm:
        # faster but takes more epochs to train
        with tf.device('/gpu:0'):
            gw_Embeddings = tf.clip_by_norm(gw_Embeddings, clip_bound)
            gw_softmax_weights = tf.clip_by_norm(gw_softmax_weights,
                                                 clip_bound)
            gb_softmax_biases = tf.clip_by_norm(gb_softmax_biases,
                                                clip_bound)
    else:
        # dp-sgd: slow and require more memory but converge faster, take less epochs.
        gw_Embeddings = utils.BatchClipByL2norm(gw_Embeddings, clip_bound)
        gw_softmax_weights = utils.BatchClipByL2norm(gw_softmax_weights,
                                                     clip_bound)
        gb_softmax_biases = utils.BatchClipByL2norm(gb_softmax_biases,
                                                    clip_bound)

    sensitivity = clip_bound  # adjacency matrix with one more tuple

    # Add noise
    if FLAGS.with_dp:
        gw_Embeddings += tf.random_normal(shape=tf.shape(gw_Embeddings),
                                          mean=0.0,
                                          stddev=sigma * (sensitivity**2),
                                          dtype=tf.float32)
        gw_softmax_weights += tf.random_normal(
            shape=tf.shape(gw_softmax_weights),
            mean=0.0,
            stddev=sigma * (sensitivity**2),
            dtype=tf.float32)
        gb_softmax_biases += tf.random_normal(
            shape=tf.shape(gb_softmax_biases),
            mean=0.0,
            stddev=sigma * (sensitivity**2),
            dtype=tf.float32)

    if FLAGS.with_nce_loss:
        train_step = optimizer.apply_gradients([
            (gw_Embeddings, embeddings), (gw_softmax_weights, nce_weights),
            (gb_softmax_biases, nce_biases)
        ])
    else:
        train_step = optimizer.apply_gradients([
            (gw_Embeddings, embeddings),
            (gw_softmax_weights, softmax_weights),
            (gb_softmax_biases, softmax_biases)
        ])

    # Compute the similarity between minibatch examples and all embeddings.
    # We use the cosine distance:
    with tf.device('/gpu:0'):
        norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1,
                                     keep_dims=True))
        normalized_embeddings = embeddings / norm
        valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings,
                                                  valid_dataset)
        similarity = tf.matmul(valid_embeddings,
                               tf.transpose(normalized_embeddings))

    # Mean absolute embedding value, reported every 200 steps. The op is
    # built once here: creating it inside the loop (as before) added a new
    # node to the graph on every report.
    em_val = tf.reduce_mean(tf.abs(embeddings))

    min_loss = 10**4
    per_dec_count = 0
    print('Initialized')
    average_loss = 0
    running = True
    step = 0
    average_loss_arr = []
    saving_pointer_idx = 0
    # Defined up front so the final report cannot hit an unbound name when
    # no saving branch has run yet.
    spent_eps_deltas = None

    # put it here because Adam has its own variables.
    sess.run(tf.global_variables_initializer())

    # saver must be used after global_variables_initializer
    saver = tf.train.Saver()

    # Save the variables to disk.
    save_path = os.path.join(FLAGS.trained_models, "initialized_model.ckpt")
    # Sonvx: we need to make sure initialized variables are all the same for different tests.
    print("Checking on path: ", save_path)
    if not os.path.isfile(save_path + ".index"):
        saved_info = saver.save(sess, save_path)
        print("Global initialized model saved in file: %s" % saved_info)
    else:
        saver.restore(sess, save_path)
        print("Restored the global initialized model.")
        if FLAGS.DEBUG:
            input(
                "Double check whether or not the initialized model got restored then <Press enter>"
            )
    print('###INFO: Initialized in run(graph)')

    if FLAGS.RESTORE_LAST_CHECK_POINT:
        checkpoint_path = os.path.join(log_dir, "model.ckpt")
        if os.path.isfile(checkpoint_path + ".index"):
            saver.restore(sess, checkpoint_path)
            print("Restored the latest checkpoint at %s." % (checkpoint_path))

    while running:
        # for step in range(num_steps):
        batch_data, batch_labels = generate_batch(batch_size, num_skips,
                                                  skip_window)
        print("Global data_index = ", data_index)
        # feed_dict = {train_dataset: batch_data, train_labels: batch_labels}
        # old: sess.run([optimizer, cross_entropy], feed_dict=feed_dict)
        # template: train_step.run(feed_dict={x: batch[0], y_: batch[1], keep_prob: 0.5});
        train_step.run(feed_dict={
            train_dataset: batch_data,
            train_labels: batch_labels
        })
        # The loss is evaluated in a separate run, after the update.
        loss = cross_entropy.eval(feed_dict={
            train_dataset: batch_data,
            train_labels: batch_labels
        })
        # loss_arr.append(l)
        # average_loss += l
        # current_avg_loss = average_loss/step
        # avg_loss_arr.append(current_avg_loss)
        sess.run([privacy_accum_op])
        # print(step, spent_eps_deltas)

        average_loss += loss
        # Avoid dividing by zero on the very first step.
        if step == 0:
            step_dev = 0.1 * 5
        else:
            step_dev = step
        current_avg_loss = np.mean(average_loss) / step_dev
        average_loss_arr.append(current_avg_loss)

        if step % 200 == 0:
            # if step > 0:
            # average_loss = average_loss / 2000
            # The average loss is an estimate of the loss over the last 2000 batches.
            print('Average loss at step %d: %f' % (step, current_avg_loss))
            # TODO: turns this back on if not sure how average_loss influences training process
            print("Embedding: ")
            print(sess.run(em_val))
            # average_loss = 0

        # note that this is expensive (~20% slowdown if computed every 500 steps)
        check_step = (FLAGS.NUM_STEPS * 0.2)
        if step % check_step == 0:
            # gw_emb = tf.reduce_mean(tf.abs(gw_Embeddings))
            # print("Embedding gradients: ")
            # print(sess.run(gw_emb))
            sim = similarity.eval()
            for i in range(valid_size):
                valid_word = reverse_dictionary[valid_examples[i]]
                top_k = 8  # number of nearest neighbors
                # Index 0 of the sorted order is skipped (presumably the word
                # itself, whose cosine similarity is highest).
                nearest = (-sim[i, :]).argsort()[1:top_k + 1]
                log = 'Nearest to %s:' % valid_word
                for k in range(top_k):
                    close_word = reverse_dictionary[nearest[k]]
                    log = '%s %s,' % (log, close_word)
                print(log)

        current_saving_dir = os.path.join(
            FLAGS.trained_models,
            "_%sepoch" % (saving_pointers[saving_pointer_idx]))

        # EARLY STOPPING
        if min_loss >= current_avg_loss:
            min_loss = current_avg_loss
            per_dec_count = 0
            if FLAGS.save_best_model_alltime:
                best_of_saving_point_dir = os.path.join(
                    current_saving_dir, "_best_one")
                if not os.path.exists(best_of_saving_point_dir):
                    os.makedirs(best_of_saving_point_dir)
                temp_embeddings = normalized_embeddings.eval()
                spent_eps_deltas = priv_accountant.get_privacy_spent(
                    sess, target_eps=target_eps)
                saving_state(best_of_saving_point_dir, spent_eps_deltas,
                             temp_embeddings, saver, sess)
            msg = ("Got best model so far at step %s , avg loss = %s" %
                   (step, current_avg_loss))
            logging.info(msg)
            print(msg)
        else:
            per_dec_count += 1

        step += 1
        if per_dec_count == max_early_stopping or step == num_steps:
            running = False

        if (step + 1) in saving_pointers:
            spent_eps_deltas = priv_accountant.get_privacy_spent(
                sess, target_eps=target_eps)
            folder_path = os.path.join(FLAGS.trained_models,
                                       "_%sepoch" % (step + 1))
            temp_embeddings = normalized_embeddings.eval()
            saving_state(folder_path, spent_eps_deltas, temp_embeddings,
                         saver, sess)
            # Make sure we don't increase saving_pointer_idx larger than what the total number of pointers we set.
            if saving_pointer_idx < len(saving_pointers) - 1:
                saving_pointer_idx += 1
            msg = "##INFO: STEP %s: avg_loss history: avg_loss_arr = %s" % (
                step, average_loss_arr)
            logging.info(msg)

        if step % (num_steps - 1) == 0:
            print("Final privacy spent: ", step, spent_eps_deltas)

    print("Stopped at %s, \nFinal avg_loss = %s" % (step, avg_loss_arr))
    print("loss = %s" % (loss_arr))
    # final_embeddings = normalized_embeddings.eval()
    sess.close()