def __init__(self, config: configure_pretraining.PretrainingConfig,
             features, is_training):
    """Build the pretraining graph: masking, generator, discriminator, metrics.

    Args:
        config: pretraining configuration. Read here for the debug flag, the
            masking probability, the generator/discriminator options and the
            two loss weights.
        features: input features, converted with
            `pretrain_data.features_to_inputs` before masking.
        is_training: forwarded unchanged to every `_build_transformer` call.

    Side effects:
        Sets `self._config`, `self._bert_config`, `self.mlm_output`,
        `self.total_loss` and `self.eval_metrics`, the latter being a
        `(metric_fn, tensors)` pair whose tensors are passed positionally to
        `metric_fn`.
    """
    # --- Model configuration ---
    self._config = config
    self._bert_config = training_utils.get_bert_config(config)
    if config.debug:
        # Debug mode overrides the architecture sizes with fixed values.
        debug_overrides = (
            ("num_hidden_layers", 3),
            ("hidden_size", 144),
            ("intermediate_size", 144 * 4),
            ("num_attention_heads", 4),
        )
        for attr_name, attr_value in debug_overrides:
            setattr(self._bert_config, attr_name, attr_value)

    # --- Input masking ---
    unmasked_inputs = pretrain_data.features_to_inputs(features)
    masked_inputs = pretrain_helpers.mask(
        config, unmasked_inputs, config.mask_prob)

    # --- Generator ---
    # Fall back to the hidden size when no embedding size is configured.
    if config.embedding_size is None:
        embedding_size = self._bert_config.hidden_size
    else:
        embedding_size = config.embedding_size

    if config.uniform_generator:
        # No transformer is built; the MLM head receives None instead.
        generator = None
    elif config.electra_objective and config.untied_generator:
        if config.untied_generator_embeddings:
            generator_embedding_size = None
        else:
            generator_embedding_size = embedding_size
        generator = self._build_transformer(
            masked_inputs,
            is_training,
            bert_config=get_generator_config(config, self._bert_config),
            embedding_size=generator_embedding_size,
            untied_embeddings=config.untied_generator_embeddings,
            name="generator",
        )
    else:
        generator = self._build_transformer(
            masked_inputs, is_training, embedding_size=embedding_size)
    mlm_output = self._get_masked_lm_output(masked_inputs, generator)

    fake_data = self._get_fake_data(masked_inputs, mlm_output.logits)
    self.mlm_output = mlm_output
    self.total_loss = config.gen_weight * mlm_output.loss

    # --- Discriminator ---
    disc_output = None
    if config.electra_objective:
        discriminator = self._build_transformer(
            fake_data.inputs,
            is_training,
            reuse=not config.untied_generator,
            embedding_size=embedding_size,
        )
        disc_output = self._get_discriminator_output(
            fake_data.inputs, discriminator, fake_data.is_fake_tokens)
        self.total_loss += config.disc_weight * disc_output.loss

    # --- Evaluation inputs ---
    # Insertion order matters: metric_fn re-associates the positional
    # tensors with these keys.
    eval_fn_inputs = {
        "input_ids": masked_inputs.input_ids,
        "masked_lm_preds": mlm_output.preds,
        "mlm_loss": mlm_output.per_example_loss,
        "masked_lm_ids": masked_inputs.masked_lm_ids,
        "masked_lm_weights": masked_inputs.masked_lm_weights,
        "input_mask": masked_inputs.input_mask,
    }
    if config.electra_objective:
        eval_fn_inputs["disc_loss"] = disc_output.per_example_loss
        eval_fn_inputs["disc_labels"] = disc_output.labels
        eval_fn_inputs["disc_probs"] = disc_output.probs
        eval_fn_inputs["disc_preds"] = disc_output.preds
        eval_fn_inputs["sampled_tokids"] = tf.argmax(
            fake_data.sampled_tokens, -1, output_type=tf.int32)
    eval_fn_keys = list(eval_fn_inputs)
    eval_fn_values = list(eval_fn_inputs.values())

    def metric_fn(*args):
        """Build the evaluation metrics from the positional eval tensors."""
        d = dict(zip(eval_fn_keys, args))

        def flat(key):
            # Flatten the named tensor to rank 1.
            return tf.reshape(d[key], [-1])

        metrics = {
            "masked_lm_accuracy": tf.metrics.accuracy(
                labels=flat("masked_lm_ids"),
                predictions=flat("masked_lm_preds"),
                weights=flat("masked_lm_weights"),
            ),
            "masked_lm_loss": tf.metrics.mean(
                values=flat("mlm_loss"),
                weights=flat("masked_lm_weights"),
            ),
        }
        if not config.electra_objective:
            return metrics

        metrics["sampled_masked_lm_accuracy"] = tf.metrics.accuracy(
            labels=flat("masked_lm_ids"),
            predictions=flat("sampled_tokids"),
            weights=flat("masked_lm_weights"),
        )
        if config.disc_weight > 0:
            metrics["disc_loss"] = tf.metrics.mean(d["disc_loss"])
            metrics["disc_auc"] = tf.metrics.auc(
                d["disc_labels"] * d["input_mask"],
                d["disc_probs"] * tf.cast(d["input_mask"], tf.float32),
            )
            metrics["disc_accuracy"] = tf.metrics.accuracy(
                labels=d["disc_labels"],
                predictions=d["disc_preds"],
                weights=d["input_mask"],
            )
            # Accuracy restricted to positions predicted positive.
            metrics["disc_precision"] = tf.metrics.accuracy(
                labels=d["disc_labels"],
                predictions=d["disc_preds"],
                weights=d["disc_preds"] * d["input_mask"],
            )
            # Accuracy restricted to positions labelled positive.
            metrics["disc_recall"] = tf.metrics.accuracy(
                labels=d["disc_labels"],
                predictions=d["disc_preds"],
                weights=d["disc_labels"] * d["input_mask"],
            )
        return metrics

    self.eval_metrics = (metric_fn, eval_fn_values)
def _calculate_eval_metrics_fn(loss, label_ids, logits, input_mask,
                               aggregation_function_id, logits_aggregation,
                               classification_class_index, logits_cls):
    """Calculates metrics for both cells and aggregation functions.

    Args:
        loss: loss values, averaged with `tf.metrics.mean`.
        label_ids: rank-2 token labels (rank is asserted).
        logits: rank-2 token logits (rank is asserted); a token is predicted
            positive when its logit is >= 0.
        input_mask: token mask; used as the weight of every token-level
            metric and to ignore masked-out tokens in the sequence accuracy.
        aggregation_function_id: labels for the aggregation prediction; only
            used when `logits_aggregation` is not None.
        logits_aggregation: aggregation logits, or None to skip the
            aggregation and joint metrics.
        classification_class_index: labels for the classification prediction;
            only used when `logits_cls` is not None.
        logits_cls: classification logits, or None to skip that metric.

    Returns:
        Dict mapping metric name to the value returned by the corresponding
        `tf.metrics.*` call.
    """
    logits.shape.assert_has_rank(2)
    label_ids.shape.assert_has_rank(2)

    # <int32>[batch size, seq_length]
    predictions = tf.where(
        logits >= 0,
        tf.ones_like(logits, dtype=tf.int32),
        tf.zeros_like(logits, dtype=tf.int32))
    input_mask_float = tf.cast(input_mask, tf.float32)

    loss = tf.metrics.mean(values=loss)
    accuracy = tf.metrics.accuracy(
        labels=label_ids, predictions=predictions, weights=input_mask_float)

    # <bool>[batch size, seq_length]
    # Masked-out positions count as correct so they cannot fail a sequence.
    token_correct = tf.logical_or(
        tf.equal(label_ids, predictions),
        tf.logical_not(tf.cast(input_mask, tf.bool)))

    # <bool>[batch size]
    per_sequence_accuracy = tf.reduce_all(token_correct, axis=1)
    sequence_accuracy = tf.metrics.mean(values=per_sequence_accuracy)

    probs = tf.sigmoid(logits)
    precision = tf.metrics.precision(
        labels=label_ids, predictions=predictions, weights=input_mask_float)
    recall = tf.metrics.recall(
        labels=label_ids, predictions=predictions, weights=input_mask_float)
    # Weight by the input mask like every other token-level metric, so that
    # masked-out positions do not contribute to the AUC.
    auc = tf.metrics.auc(
        labels=label_ids, predictions=probs, weights=input_mask_float)
    mean_label = tf.metrics.mean(
        values=tf.cast(label_ids, tf.float32), weights=input_mask_float)

    metrics = {
        "eval_loss": loss,
        "eval_accuracy": accuracy,
        "eval_sequence_accuracy": sequence_accuracy,
        "eval_precision": precision,
        "eval_recall": recall,
        "eval_auc": auc,
        "eval_mean_label": mean_label,
    }

    if logits_cls is not None:
        # <int32>[batch size]
        predictions_cls = tf.argmax(logits_cls, axis=-1, output_type=tf.int32)
        accuracy_cls = tf.metrics.accuracy(
            labels=classification_class_index, predictions=predictions_cls)
        metrics["eval_classification_accuracy"] = accuracy_cls

    if logits_aggregation is not None:
        # <int32>[batch size]
        predictions_agg = tf.argmax(
            logits_aggregation, axis=-1, output_type=tf.int32)
        accuracy_agg = tf.metrics.accuracy(
            labels=aggregation_function_id, predictions=predictions_agg)
        # <bool>[batch size]
        per_sequence_agg_accuracy = tf.equal(
            aggregation_function_id, predictions_agg)
        # Whether cells and aggregation function predictions are both correct.
        per_sequence_joint_accuracy = tf.logical_and(
            per_sequence_agg_accuracy, per_sequence_accuracy)
        joint_accuracy = tf.metrics.mean(values=per_sequence_joint_accuracy)
        metrics["eval_aggregation_accuracy"] = accuracy_agg
        metrics["eval_joint_accuracy"] = joint_accuracy

    return metrics
def model():
    """Train a three-layer dense classifier and print its accuracy.

    Loads the train/test/validation arrays from ``t_x.npy``, ``t_y.npy``,
    ``test_x.npy``, ``test_y.npy``, ``v_x.npy`` and ``v_y.npy`` in the current
    directory, then trains in rounds of 1000 passes over the training data
    until the validation accuracy is no longer below 0.975. Afterwards it
    prints the test accuracy and the softmax output for every test sample.

    Mini-batches come from the module-level ``next_batch`` helper.
    """
    print(' model')
    batch_size = 100
    features = 32 * 32
    categories = 4
    hidden_layer_nodes_1 = 100
    hidden_layer_nodes_2 = 50
    passes_per_round = 1000
    target_validation_accuracy = 0.975

    x = tf.placeholder(tf.float32, [None, features])
    y_ = tf.placeholder(tf.float32, [None, categories])

    W1 = tf.Variable(
        tf.truncated_normal([features, hidden_layer_nodes_1], stddev=0.1))
    b1 = tf.Variable(tf.constant(0.1, shape=[hidden_layer_nodes_1]))
    z1 = tf.nn.relu(tf.matmul(x, W1) + b1)

    W2 = tf.Variable(
        tf.truncated_normal(
            [hidden_layer_nodes_1, hidden_layer_nodes_2], stddev=0.1))
    b2 = tf.Variable(tf.constant(0.1, shape=[hidden_layer_nodes_2]))
    z2 = tf.nn.relu(tf.matmul(z1, W2) + b2)

    W3 = tf.Variable(
        tf.truncated_normal([hidden_layer_nodes_2, categories], stddev=0.1))
    b3 = tf.Variable(tf.constant(0.1, shape=[categories]))
    z3 = tf.matmul(z2, W3) + b3
    # Reuse the logits instead of building the same matmul a second time.
    y = tf.nn.softmax(z3)

    # Keyword arguments: the labels/logits order is easy to get wrong and
    # some TF 1.x releases reject positional use of this op.
    loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits_v2(labels=y_, logits=z3))
    update = tf.train.AdamOptimizer(0.0001).minimize(loss)

    # Build the accuracy ops once; recreating them on every round would add
    # new nodes to the graph each time.
    correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

    data_x = np.load(r't_x.npy').astype(int)
    print("datax: ", data_x)
    data_y = np.load(r't_y.npy').astype(int)
    print("datay: ", data_y)
    data_x_test = np.load(r'test_x.npy').astype(int)
    data_y_test = np.load(r'test_y.npy').astype(int)
    data_x_validation = np.load(r'v_x.npy').astype(int)
    data_y_validation = np.load(r'v_y.npy').astype(int)

    train_feed = {x: data_x, y_: data_y}
    validation_feed = {x: data_x_validation, y_: data_y_validation}
    test_feed = {x: data_x_test, y_: data_y_test}
    batches_per_pass = len(data_x) // batch_size

    # The context manager releases the session's resources on exit.
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        while True:
            for i in range(passes_per_round):
                for _ in range(batches_per_pass):
                    batch_xs, batch_ys = next_batch(batch_size, data_x, data_y)
                    sess.run(update, feed_dict={x: batch_xs, y_: batch_ys})
                if i % 100 == 0:
                    print("Iteration:", i, ", Loss: ",
                          sess.run(loss, feed_dict=train_feed))

            print("Accuracy train:",
                  sess.run(accuracy, feed_dict=train_feed))
            validation_accuracy = sess.run(accuracy, feed_dict=validation_feed)
            print("Accuracy validation:", validation_accuracy)
            # Keep training only while the accuracy is below the target.
            if not validation_accuracy < target_validation_accuracy:
                break

        print("The model is ready!")

        # Test model
        print("Accuracy test:", sess.run(accuracy, feed_dict=test_feed))

        # One forward pass per sample, reused for all three printed values.
        for i, sample in enumerate(data_x_test):
            probabilities = sess.run(y, feed_dict={x: [sample]})
            print('Prediction for: "', sample, '": ', probabilities,
                  ', Max value: ', max(probabilities[0]),
                  ', Sum: ', sum(probabilities[0]),
                  ', real class: ', data_y_test[i])
def train():
    """Build the training graph, then alternate training and evaluation epochs.

    Uses the module-level settings GPU_INDEX, BATCH_SIZE, NUM_POINT, OPTIMIZER,
    MOMENTUM, MAX_EPOCH and LOG_DIR, and the MODEL module for placeholders,
    network and loss. Summaries go to ``LOG_DIR/train`` and ``LOG_DIR/test``;
    a checkpoint is written to ``LOG_DIR/model.ckpt`` every 10th epoch.

    Raises:
        ValueError: if OPTIMIZER is neither 'momentum' nor 'adam'.
    """
    with tf.Graph().as_default():
        with tf.device('/gpu:' + str(GPU_INDEX)):
            pointclouds_pl, labels_pl = MODEL.placeholder_inputs(
                BATCH_SIZE, NUM_POINT)
            is_training_pl = tf.placeholder(tf.bool, shape=())
            print(is_training_pl)

            # Note the global_step=batch parameter to minimize.
            # That tells the optimizer to helpfully increment the 'batch'
            # parameter for you every time it trains.
            batch = tf.Variable(0)
            bn_decay = get_bn_decay(batch)
            tf.summary.scalar('bn_decay', bn_decay)

            # Get model and loss
            pred, end_points = MODEL.get_model(
                pointclouds_pl, is_training_pl, bn_decay=bn_decay)
            loss = MODEL.get_loss(pred, labels_pl, end_points)
            tf.summary.scalar('loss', loss)

            correct = tf.equal(tf.argmax(pred, 1), tf.to_int64(labels_pl))
            accuracy = (tf.reduce_sum(tf.cast(correct, tf.float32))
                        / float(BATCH_SIZE))
            tf.summary.scalar('accuracy', accuracy)

            # Get training operator
            learning_rate = get_learning_rate(batch)
            tf.summary.scalar('learning_rate', learning_rate)
            if OPTIMIZER == 'momentum':
                optimizer = tf.train.MomentumOptimizer(
                    learning_rate, momentum=MOMENTUM)
            elif OPTIMIZER == 'adam':
                optimizer = tf.train.AdamOptimizer(learning_rate)
            else:
                # Fail with a clear message instead of a NameError on the
                # unbound `optimizer` below.
                raise ValueError(
                    "Unknown OPTIMIZER %r; expected 'momentum' or 'adam'."
                    % (OPTIMIZER,))
            train_op = optimizer.minimize(loss, global_step=batch)

            # Add ops to save and restore all the variables.
            saver = tf.train.Saver()

        # Create a session
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        config.allow_soft_placement = True
        config.log_device_placement = False
        sess = tf.Session(config=config)
        writers = []
        try:
            # Add summary writers
            merged = tf.summary.merge_all()
            train_writer = tf.summary.FileWriter(
                os.path.join(LOG_DIR, 'train'), sess.graph)
            writers.append(train_writer)
            test_writer = tf.summary.FileWriter(os.path.join(LOG_DIR, 'test'))
            writers.append(test_writer)

            # Init variables
            init = tf.global_variables_initializer()
            # To fix the bug introduced in TF 0.12.1 as in
            # http://stackoverflow.com/questions/41543774/invalidargumenterror-for-tensor-bool-tensorflow-0-12-1
            # the boolean placeholder is fed during initialisation.
            sess.run(init, {is_training_pl: True})

            ops = {
                'pointclouds_pl': pointclouds_pl,
                'labels_pl': labels_pl,
                'is_training_pl': is_training_pl,
                'pred': pred,
                'loss': loss,
                'train_op': train_op,
                'merged': merged,
                'step': batch,
            }

            for epoch in range(MAX_EPOCH):
                log_string('**** EPOCH %03d ****' % (epoch))
                sys.stdout.flush()

                train_one_epoch(sess, ops, train_writer)
                eval_one_epoch(sess, ops, test_writer)

                # Save the variables to disk.
                if epoch % 10 == 0:
                    save_path = saver.save(
                        sess, os.path.join(LOG_DIR, "model.ckpt"))
                    log_string("Model saved in file: %s" % save_path)
        finally:
            # Flush pending summaries and release the session even when
            # training is interrupted by an exception.
            for writer in writers:
                writer.close()
            sess.close()
def model_eval(sess, x, y, predictions, X_test=None, Y_test=None,
               feed=None, args=None):
    """
    Compute the accuracy of a TF model on some data
    :param sess: TF session to use
    :param x: input placeholder
    :param y: output placeholder (for labels)
    :param predictions: model output predictions
    :param X_test: numpy array with the test inputs
    :param Y_test: numpy array with the test outputs (labels)
    :param feed: An optional dictionary that is appended to the feeding
             dictionary before the session runs. Can be used to feed
             the learning phase of a Keras model for instance.
    :param args: dict or argparse `Namespace` object.
                 Should contain `batch_size`
    :return: a float with the accuracy value
    :raises ValueError: if `X_test` or `Y_test` is None, or if `X_test`
                        contains no examples
    """
    args = _ArgsWrapper(args or {})

    assert args.batch_size, "Batch size was not given in args dict"
    if X_test is None or Y_test is None:
        raise ValueError("X_test argument and Y_test argument "
                         "must be supplied.")
    num_examples = len(X_test)
    if num_examples == 0:
        # Without this guard the batch loop never runs, and the final
        # consistency check fails on an unbound variable before the
        # division by zero would.
        raise ValueError("X_test must contain at least one example.")

    # Define accuracy symbolically. The op is cached per (y, predictions)
    # pair so repeated calls do not add new nodes to the graph.
    key = (y, predictions)
    if key in _model_eval_cache:
        correct_preds = _model_eval_cache[key]
    else:
        correct_preds = tf.equal(tf.argmax(y, axis=-1),
                                 tf.argmax(predictions, axis=-1))
        _model_eval_cache[key] = correct_preds

    # Init result var
    accuracy = 0.0

    with sess.as_default():
        # Compute number of batches
        nb_batches = int(math.ceil(float(num_examples) / args.batch_size))
        assert nb_batches * args.batch_size >= num_examples

        # Fixed-size buffers: every run feeds a full batch, and only the
        # first `cur_batch_size` rows of the result are counted.
        X_cur = np.zeros((args.batch_size,) + X_test.shape[1:],
                         dtype=X_test.dtype)
        Y_cur = np.zeros((args.batch_size,) + Y_test.shape[1:],
                         dtype=Y_test.dtype)
        for batch in range(nb_batches):
            if batch % 100 == 0 and batch > 0:
                _logger.debug("Batch " + str(batch))

            # Must not use the `batch_indices` function here, because it
            # repeats some examples.
            # It's acceptable to repeat during training, but not eval.
            start = batch * args.batch_size
            end = min(num_examples, start + args.batch_size)

            # The last batch may be smaller than all others. This should not
            # affect the accuracy disproportionately.
            cur_batch_size = end - start
            X_cur[:cur_batch_size] = X_test[start:end]
            Y_cur[:cur_batch_size] = Y_test[start:end]
            feed_dict = {x: X_cur, y: Y_cur}
            if feed is not None:
                feed_dict.update(feed)
            cur_corr_preds = correct_preds.eval(feed_dict=feed_dict)

            accuracy += cur_corr_preds[:cur_batch_size].sum()

        assert end >= num_examples

        # Divide by number of examples to get final value
        accuracy /= num_examples

    return accuracy
# --------------------------- end of network ---------------------------
def regularizer(a):
    """Return the square root of twice `tf.nn.l2_loss(a)`, times 0.5 * 0.0001."""
    doubled_l2_loss = tf.nn.l2_loss(a) * 2
    return (doubled_l2_loss ** 0.5) * 0.5 * 0.0001


# Alternative kept from the original source:
# regularizer = tf.contrib.layers.l2_regularizer(0.0001)
logits = inference(x, False, regularizer)

# (Small trick) multiply logits by 1 and store the result as logits_eval
# under an explicit name, so the output tensor can be fetched by tensor name
# when the model is loaded later.
b = tf.constant(value=1, dtype=tf.float32)
logits_eval = tf.multiply(logits, b, name='logits_eval')

loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
    logits=logits, labels=y_)
train_op = tf.train.AdamOptimizer(learning_rate=0.001).minimize(loss)
correct_prediction = tf.equal(
    tf.cast(tf.argmax(logits, 1), tf.int32), y_)
acc = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))


# Helper that yields the data batch by batch.
def minibatches(inputs=None, targets=None, batch_size=None, shuffle=False):
    """Yield `(inputs, targets)` slices of exactly `batch_size` items each.

    A trailing remainder shorter than `batch_size` is not yielded. With
    `shuffle` set, one random permutation of the indices is drawn and the
    batches are taken from it; otherwise the batches are contiguous slices.
    """
    assert len(inputs) == len(targets)
    total = len(inputs)
    if shuffle:
        order = np.arange(total)
        np.random.shuffle(order)
    for begin in range(0, total - batch_size + 1, batch_size):
        stop = begin + batch_size
        selection = order[begin:stop] if shuffle else slice(begin, stop)
        yield inputs[selection], targets[selection]
def metric_fn(labels, logits):
    """Return a one-entry metrics dict holding the classification accuracy.

    The predicted class is the argmax of `logits` along axis 1.
    """
    predicted_classes = tf.argmax(logits, axis=1)
    return {
        "accuracy": tf.metrics.accuracy(
            labels=labels, predictions=predicted_classes),
    }
def build():
    """Builds the Tensorflow graph.

    Reads `mode`, `hparams`, `encoder_decoder`, `input_size`, `num_classes`,
    `no_event_label` and `sequence_example_file_paths` from the enclosing
    scope. What gets built depends on `mode`:

    * 'train': losses, accuracies, a train op (added to the 'train_op'
      collection) and a scalar summary per metric.
    * 'eval': streaming metrics whose update ops go to the 'eval_ops'
      collection, plus a scalar summary per metric.
    * 'generate': input and temperature placeholders, the softmax output and
      the flattened RNN states, all exposed through graph collections.

    Returns nothing; every result is published via graph collections.
    """
    inputs, labels, lengths = None, None, None

    if mode in ('train', 'eval'):
        # A scalar no-event label means one label per step; otherwise each
        # step carries len(no_event_label) labels.
        if isinstance(no_event_label, numbers.Number):
            label_shape = []
        else:
            label_shape = [len(no_event_label)]
        inputs, labels, lengths = magenta.common.get_padded_batch(
            sequence_example_file_paths, hparams.batch_size, input_size,
            label_shape=label_shape, shuffle=mode == 'train')

    elif mode == 'generate':
        inputs = tf.placeholder(tf.float32,
                                [hparams.batch_size, None, input_size])

    # For this encoder type the trailing input axis is squeezed away and the
    # values are one-hot expanded to `input_depth`.
    if isinstance(encoder_decoder,
                  magenta.music.OneHotIndexEventSequenceEncoderDecoder):
        expanded_inputs = tf.one_hot(
            tf.cast(tf.squeeze(inputs, axis=-1), tf.int64),
            encoder_decoder.input_depth)
    else:
        expanded_inputs = inputs

    # Dropout is disabled (keep probability 1.0) when generating.
    dropout_keep_prob = 1.0 if mode == 'generate' else hparams.dropout_keep_prob

    cell = make_rnn_cell(hparams.rnn_layer_sizes,
                         dropout_keep_prob=dropout_keep_prob,
                         attn_length=hparams.attn_length,
                         residual_connections=hparams.residual_connections)

    initial_state = cell.zero_state(hparams.batch_size, tf.float32)

    outputs, final_state = tf.nn.dynamic_rnn(cell,
                                             expanded_inputs,
                                             sequence_length=lengths,
                                             initial_state=initial_state,
                                             swap_memory=True)

    outputs_flat = magenta.common.flatten_maybe_padded_sequences(
        outputs, lengths)
    # With several label groups, a single linear layer produces the logits of
    # all groups side by side.
    if isinstance(num_classes, numbers.Number):
        num_logits = num_classes
    else:
        num_logits = sum(num_classes)
    logits_flat = tf_slim.layers.linear(outputs_flat, num_logits)

    if mode in ('train', 'eval'):
        labels_flat = magenta.common.flatten_maybe_padded_sequences(
            labels, lengths)

        if isinstance(num_classes, numbers.Number):
            softmax_cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels=labels_flat, logits=logits_flat)
            predictions_flat = tf.argmax(logits_flat, axis=1)
        else:
            # logits_offsets[i]:logits_offsets[i + 1] is the column range of
            # the i-th label group inside logits_flat.
            logits_offsets = np.cumsum([0] + num_classes)
            softmax_cross_entropy = []
            predictions = []
            for i in range(len(num_classes)):
                softmax_cross_entropy.append(
                    tf.nn.sparse_softmax_cross_entropy_with_logits(
                        labels=labels_flat[:, i],
                        logits=logits_flat[:,
                                           logits_offsets[i]: logits_offsets[i + 1]]))
                predictions.append(
                    tf.argmax(
                        logits_flat[:, logits_offsets[i]:logits_offsets[i + 1]],
                        axis=1))
            predictions_flat = tf.stack(predictions, 1)

        correct_predictions = tf.to_float(
            tf.equal(labels_flat, predictions_flat))
        event_positions = tf.to_float(
            tf.not_equal(labels_flat, no_event_label))
        no_event_positions = tf.to_float(
            tf.equal(labels_flat, no_event_label))

        # Compute the total number of time steps across all sequences in the
        # batch. For some models this will be different from the number of RNN
        # steps.
        def batch_labels_to_num_steps(batch_labels, lengths):
            # Runs as Python through tf.py_func; each sequence is truncated
            # to its own length before being counted.
            num_steps = 0
            for labels, length in zip(batch_labels, lengths):
                num_steps += encoder_decoder.labels_to_num_steps(
                    labels[:length])
            return np.float32(num_steps)
        num_steps = tf.py_func(batch_labels_to_num_steps,
                               [labels, lengths], tf.float32)

        if mode == 'train':
            loss = tf.reduce_mean(softmax_cross_entropy)
            perplexity = tf.exp(loss)
            accuracy = tf.reduce_mean(correct_predictions)
            # Accuracy restricted to event / no-event label positions.
            event_accuracy = (
                tf.reduce_sum(correct_predictions * event_positions) /
                tf.reduce_sum(event_positions))
            no_event_accuracy = (
                tf.reduce_sum(correct_predictions * no_event_positions) /
                tf.reduce_sum(no_event_positions))

            loss_per_step = tf.reduce_sum(
                softmax_cross_entropy) / num_steps
            perplexity_per_step = tf.exp(loss_per_step)

            optimizer = tf.train.AdamOptimizer(
                learning_rate=hparams.learning_rate)

            train_op = tf_slim.learning.create_train_op(
                loss, optimizer, clip_gradient_norm=hparams.clip_norm)
            tf.add_to_collection('train_op', train_op)

            vars_to_summarize = {
                'loss': loss,
                'metrics/perplexity': perplexity,
                'metrics/accuracy': accuracy,
                'metrics/event_accuracy': event_accuracy,
                'metrics/no_event_accuracy': no_event_accuracy,
                'metrics/loss_per_step': loss_per_step,
                'metrics/perplexity_per_step': perplexity_per_step,
            }
        elif mode == 'eval':
            # Recall of "correct" over event (or no-event) positions is the
            # accuracy restricted to those positions.
            vars_to_summarize, update_ops = tf_slim.metrics.aggregate_metric_map(
                {
                    'loss': tf.metrics.mean(softmax_cross_entropy),
                    'metrics/accuracy': tf.metrics.accuracy(labels_flat, predictions_flat),
                    'metrics/per_class_accuracy': tf.metrics.mean_per_class_accuracy(
                        labels_flat, predictions_flat, num_classes),
                    'metrics/event_accuracy': tf.metrics.recall(event_positions, correct_predictions),
                    'metrics/no_event_accuracy': tf.metrics.recall(no_event_positions, correct_predictions),
                    'metrics/loss_per_step': tf.metrics.mean(tf.reduce_sum(softmax_cross_entropy) / num_steps, weights=num_steps),
                })
            for updates_op in update_ops.values():
                tf.add_to_collection('eval_ops', updates_op)

            # Perplexity is just exp(loss) and doesn't need its own update op.
            vars_to_summarize['metrics/perplexity'] = tf.exp(
                vars_to_summarize['loss'])
            vars_to_summarize['metrics/perplexity_per_step'] = tf.exp(
                vars_to_summarize['metrics/loss_per_step'])

        # Each value gets a scalar summary and its own collection, keyed by
        # the metric name.
        for var_name, var_value in vars_to_summarize.items():
            tf.summary.scalar(var_name, var_value)
            tf.add_to_collection(var_name, var_value)

    elif mode == 'generate':
        temperature = tf.placeholder(tf.float32, [])
        if isinstance(num_classes, numbers.Number):
            # Logits are divided by the temperature before the softmax.
            softmax_flat = tf.nn.softmax(
                tf.div(logits_flat, tf.fill([num_classes], temperature)))
            softmax = tf.reshape(softmax_flat, [hparams.batch_size, -1, num_classes])
        else:
            # One softmax per label group, taken over that group's logit
            # columns.
            logits_offsets = np.cumsum([0] + num_classes)
            softmax = []
            for i in range(len(num_classes)):
                sm = tf.nn.softmax(
                    tf.div(
                        logits_flat[:, logits_offsets[i]:logits_offsets[i + 1]],
                        tf.fill([num_classes[i]], temperature)))
                sm = tf.reshape(sm, [hparams.batch_size, -1, num_classes[i]])
                softmax.append(sm)

        tf.add_to_collection('inputs', inputs)
        tf.add_to_collection('temperature', temperature)
        tf.add_to_collection('softmax', softmax)
        # Flatten state tuples for metagraph compatibility.
        for state in tf.nest.flatten(initial_state):
            tf.add_to_collection('initial_state', state)
        for state in tf.nest.flatten(final_state):
            tf.add_to_collection('final_state', state)
def main(unused_argv):
    """Train TabNet on the covertype CSV files with periodic logging.

    Reads ``data/train.csv``, ``data/val.csv`` and ``data/test.csv``, builds
    the training graph plus validation and test accuracy ops that reuse the
    trained variables, and runs `max_steps` optimisation steps. Summaries are
    written under ``./tflog/`` and checkpoints under ``./checkpoints/``.

    Args:
        unused_argv: ignored.
    """
    # Load training and eval data.
    train_file = "data/train.csv"
    val_file = "data/val.csv"
    test_file = "data/test.csv"

    # Define the TabNet model
    tabnet_forest_covertype = tabnet_model.TabNet(
        columns=data_helper_covertype.get_columns(),
        num_features=data_helper_covertype.num_features,
        feature_dim=128,
        output_dim=64,
        num_decision_steps=6,
        relaxation_factor=1.5,
        batch_momentum=0.7,
        virtual_batch_size=512,
        num_classes=data_helper_covertype.num_classes)

    column_names = sorted(data_helper_covertype.feature_columns)
    print(
        "Ordered column names, corresponding to the indexing in Tensorboard visualization"
    )
    for fi, column_name in enumerate(column_names):
        print(str(fi) + " : " + column_name)

    # Training parameters
    max_steps = 1000000
    display_step = 5000
    val_step = 10000
    save_step = 40000
    init_learning_rate = 0.02
    decay_every = 500
    decay_rate = 0.95
    batch_size = 16384
    sparsity_loss_weight = 0.0001
    gradient_thresh = 2000.0

    # Input sampling
    train_batch = data_helper_covertype.input_fn(
        train_file, num_epochs=100000, shuffle=True, batch_size=batch_size)
    # Validation and test use one batch covering the configured sample count.
    val_batch = data_helper_covertype.input_fn(
        val_file,
        num_epochs=10000,
        shuffle=False,
        batch_size=data_helper_covertype.n_val_samples)
    test_batch = data_helper_covertype.input_fn(
        test_file,
        num_epochs=10000,
        shuffle=False,
        batch_size=data_helper_covertype.n_test_samples)

    train_iter = train_batch.make_initializable_iterator()
    val_iter = val_batch.make_initializable_iterator()
    test_iter = test_batch.make_initializable_iterator()

    feature_train_batch, label_train_batch = train_iter.get_next()
    feature_val_batch, label_val_batch = val_iter.get_next()
    feature_test_batch, label_test_batch = test_iter.get_next()

    # Define the model and losses
    encoded_train_batch, total_entropy = tabnet_forest_covertype.encoder(
        feature_train_batch, reuse=False, is_training=True)

    logits_orig_batch, _ = tabnet_forest_covertype.classify(
        encoded_train_batch, reuse=False)

    softmax_orig_key_op = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=logits_orig_batch, labels=label_train_batch))

    train_loss_op = softmax_orig_key_op + sparsity_loss_weight * total_entropy
    tf.summary.scalar("Total loss", train_loss_op)

    # Optimization step
    global_step = tf.train.get_or_create_global_step()
    learning_rate = tf.train.exponential_decay(
        init_learning_rate,
        global_step=global_step,
        decay_steps=decay_every,
        decay_rate=decay_rate)
    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
        gvs = optimizer.compute_gradients(train_loss_op)
        # Variables the loss does not depend on come back with a None
        # gradient; clipping None would raise, so those pairs are skipped.
        capped_gvs = [
            (tf.clip_by_value(grad, -gradient_thresh, gradient_thresh), var)
            for grad, var in gvs
            if grad is not None
        ]
        train_op = optimizer.apply_gradients(
            capped_gvs, global_step=global_step)

    # Model evaluation

    # Validation performance
    encoded_val_batch, _ = tabnet_forest_covertype.encoder(
        feature_val_batch, reuse=True, is_training=False)

    _, prediction_val = tabnet_forest_covertype.classify(
        encoded_val_batch, reuse=True)

    predicted_labels = tf.cast(tf.argmax(prediction_val, 1), dtype=tf.int32)
    val_eq_op = tf.equal(predicted_labels, label_val_batch)
    val_acc_op = tf.reduce_mean(tf.cast(val_eq_op, dtype=tf.float32))
    tf.summary.scalar("Val accuracy", val_acc_op)

    # Test performance
    encoded_test_batch, _ = tabnet_forest_covertype.encoder(
        feature_test_batch, reuse=True, is_training=False)

    _, prediction_test = tabnet_forest_covertype.classify(
        encoded_test_batch, reuse=True)

    predicted_labels = tf.cast(tf.argmax(prediction_test, 1), dtype=tf.int32)
    test_eq_op = tf.equal(predicted_labels, label_test_batch)
    test_acc_op = tf.reduce_mean(tf.cast(test_eq_op, dtype=tf.float32))
    tf.summary.scalar("Test accuracy", test_acc_op)

    # Training setup
    model_name = "tabnet_forest_covertype_model"
    # tf.initialize_all_variables() is deprecated in favour of this call.
    init = tf.global_variables_initializer()
    init_local = tf.local_variables_initializer()
    init_table = tf.tables_initializer(name="Initialize_all_tables")
    saver = tf.train.Saver()
    summaries = tf.summary.merge_all()

    with tf.Session() as sess:
        summary_writer = tf.summary.FileWriter(
            "./tflog/" + model_name, sess.graph)
        try:
            sess.run(init)
            sess.run(init_local)
            sess.run(init_table)
            sess.run(train_iter.initializer)
            sess.run(val_iter.initializer)
            sess.run(test_iter.initializer)

            for step in range(1, max_steps + 1):
                if step % display_step == 0:
                    _, train_loss, merged_summary = sess.run(
                        [train_op, train_loss_op, summaries])
                    summary_writer.add_summary(merged_summary, step)
                    print("Step " + str(step) + " , Training Loss = " +
                          "{:.4f}".format(train_loss))
                else:
                    _ = sess.run(train_op)

                if step % val_step == 0:
                    # The test accuracy is fetched as well, but only the
                    # validation accuracy is printed.
                    merged_summary, val_acc, _ = sess.run(
                        [summaries, val_acc_op, test_acc_op])
                    print("Step " + str(step) + " , Val Accuracy = " +
                          "{:.4f}".format(val_acc))
                    summary_writer.add_summary(merged_summary, step)

                if step % save_step == 0:
                    saver.save(sess, "./checkpoints/" + model_name + ".ckpt")
        finally:
            # Flush buffered events so the last summaries are not lost.
            summary_writer.close()
def build_model_fn(features, labels, mode, params):
  """Builds the `TPUEstimatorSpec` for an MnasNet or MixNet classifier.

  Args:
    features: `Tensor` of batched images, or a `dict` carrying that tensor
      under the `'feature'` key.
    labels: `Tensor` of labels for the data samples.
    mode: one of `tf.estimator.ModeKeys.{TRAIN,EVAL,PREDICT}`.
    params: `dict` of parameters passed to the model from the TPUEstimator.
      `params['batch_size']` is always provided and should be used as the
      effective batch size.

  Returns:
    A `TPUEstimatorSpec` for the model.

  Raises:
    ValueError: if `params['model_name']` starts with neither `'mnasnet'`
      nor `'mixnet'`.
    ImportError: if quantized training is requested and
      `tensorflow.contrib.quantize` cannot be imported.
  """
  is_training = mode == tf.estimator.ModeKeys.TRAIN
  is_predict = mode == tf.estimator.ModeKeys.PREDICT
  is_eval = mode == tf.estimator.ModeKeys.EVAL

  # Keras-derived models need the learning phase set explicitly.
  tf.keras.backend.set_learning_phase(is_training)

  if isinstance(features, dict):
    features = features['feature']

  if is_predict:
    # A named identity node gives TFLite export a stable input to attach to.
    features = tf.identity(features, 'float_image_input')

  # NCHW is normally the faster layout on GPU; NHWC is needed on CPU because
  # pooling is only supported there. On TPU, XLA picks the layout itself.
  channels_first = params['data_format'] == 'channels_first'
  if channels_first:
    assert not params['transpose_input']  # channels_first only for GPU
    features = tf.transpose(features, [0, 3, 1, 2])
  # Per-channel statistics must broadcast along the channel axis.
  stats_shape = [3, 1, 1] if channels_first else [1, 1, 3]

  if params['transpose_input'] and not is_predict:
    features = tf.transpose(features, [3, 0, 1, 2])  # HWCN to NHWC

  # Normalize the image to zero mean and unit variance.
  mean_rgb = tf.constant(
      imagenet_input.MEAN_RGB, shape=stats_shape, dtype=features.dtype)
  features = features - mean_rgb
  stddev_rgb = tf.constant(
      imagenet_input.STDDEV_RGB, shape=stats_shape, dtype=features.dtype)
  features = features / stddev_rgb

  has_moving_average_decay = params['moving_average_decay'] > 0

  tf.logging.info('Using open-source implementation for MnasNet definition.')

  # (key in `params`, key in `override_params`); only truthy values are
  # forwarded so that unset options keep the model's own defaults.
  optional_overrides = (
      ('batch_norm_momentum', 'batch_norm_momentum'),
      ('batch_norm_epsilon', 'batch_norm_epsilon'),
      ('dropout_rate', 'dropout_rate'),
      ('data_format', 'data_format'),
      ('num_label_classes', 'num_classes'),
      ('depth_multiplier', 'depth_multiplier'),
      ('depth_divisor', 'depth_divisor'),
      ('min_depth', 'min_depth'),
  )
  override_params = {}
  for param_key, override_key in optional_overrides:
    if params[param_key]:
      override_params[override_key] = params[param_key]
  override_params['use_keras'] = params['use_keras']

  def _build_model(model_name):
    """Dispatches to the MnasNet or MixNet builder by model-name prefix."""
    if model_name.startswith('mnasnet'):
      builder = mnasnet_models.build_mnasnet_model
    elif model_name.startswith('mixnet'):
      builder = mixnet_builder.build_model
    else:
      raise ValueError('Unknown model name {}'.format(model_name))
    return builder(
        features,
        model_name=model_name,
        training=is_training,
        override_params=override_params)

  if params['precision'] == 'bfloat16':
    with tf.tpu.bfloat16_scope():
      logits, _ = _build_model(params['model_name'])
    # The loss and metrics below are computed in float32.
    logits = tf.cast(logits, tf.float32)
  else:  # params['precision'] == 'float32'
    logits, _ = _build_model(params['model_name'])

  if params['quantized_training']:
    try:
      from tensorflow.contrib import quantize  # pylint: disable=g-import-not-at-top
    except ImportError as e:
      logging.exception('Quantized training is not supported in TensorFlow 2.x')
      raise e

    if is_training:
      tf.logging.info('Adding fake quantization ops for training.')
      quantize.create_training_graph(
          quant_delay=int(params['steps_per_epoch'] *
                          FLAGS.quantization_delay_epochs))
    else:
      tf.logging.info('Adding fake quantization ops for evaluation.')
      quantize.create_eval_graph()

  if is_predict:
    scaffold_fn = None
    if FLAGS.export_moving_average:
      # A model trained with moving-average decay has to be exported from the
      # moving-average variables for its metrics to match evaluation.
      restore_checkpoint = tf.train.latest_checkpoint(FLAGS.model_dir)
      variables_to_restore = get_pretrained_variables_to_restore(
          restore_checkpoint, load_moving_average=True)

      tf.logging.info('Restoring from the latest checkpoint: %s',
                      restore_checkpoint)
      tf.logging.info(str(variables_to_restore))

      def restore_scaffold():
        """Returns a Scaffold whose saver restores the selected variables."""
        return tf.train.Scaffold(saver=tf.train.Saver(variables_to_restore))

      scaffold_fn = restore_scaffold

    predictions = {
        'classes': tf.argmax(logits, axis=1),
        'probabilities': tf.nn.softmax(logits, name='softmax_tensor'),
    }
    return tf.estimator.tpu.TPUEstimatorSpec(
        mode=mode,
        predictions=predictions,
        export_outputs={
            'classify': tf.estimator.export.PredictOutput(predictions)
        },
        scaffold_fn=scaffold_fn)

  # If necessary, in the model_fn, use params['batch_size'] instead the batch
  # size flags (--train_batch_size or --eval_batch_size).
  batch_size = params['batch_size']  # pylint: disable=unused-variable

  # Loss = label-smoothed softmax cross entropy + L2 weight decay.
  one_hot_labels = tf.one_hot(labels, params['num_label_classes'])
  cross_entropy = tf.losses.softmax_cross_entropy(
      logits=logits,
      onehot_labels=one_hot_labels,
      label_smoothing=params['label_smoothing'])

  # Batch-normalization variables are excluded from weight decay.
  l2_terms = [
      tf.nn.l2_loss(v)
      for v in tf.trainable_variables()
      if 'batch_normalization' not in v.name
  ]
  loss = cross_entropy + params['weight_decay'] * tf.add_n(l2_terms)

  global_step = tf.train.get_global_step()
  if has_moving_average_decay:
    ema = tf.train.ExponentialMovingAverage(
        decay=params['moving_average_decay'], num_updates=global_step)
    ema_vars = mnas_utils.get_ema_vars()

  host_call = None
  train_op = None
  if is_training:
    # Derive the current epoch and the learning rate from global_step.
    current_epoch = (
        tf.cast(global_step, tf.float32) / params['steps_per_epoch'])

    scaled_lr = (
        params['base_learning_rate'] * (params['train_batch_size'] / 256.0))
    learning_rate = mnas_utils.build_learning_rate(
        scaled_lr, global_step, params['steps_per_epoch'])
    optimizer = mnas_utils.build_optimizer(learning_rate)
    if params['use_tpu']:
      # CrossShardOptimizer handles synchronization between TPU cores, so
      # training looks like regular synchronous training to the user.
      optimizer = tf.tpu.CrossShardOptimizer(optimizer)

    # Batch normalization needs UPDATE_OPS to run with the train operation.
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
      train_op = optimizer.minimize(loss, global_step)

    if has_moving_average_decay:
      # Update the moving averages only after the optimizer step has run.
      with tf.control_dependencies([train_op]):
        train_op = ema.apply(ema_vars)

    if not params['skip_host_call']:

      def host_call_fn(gs, loss, lr, ce):
        """Writes training scalars as summaries from the host.

        This function is executed on the CPU and should not directly
        reference any Tensors in the rest of the `model_fn`; everything it
        needs is passed in through `host_call`. See
        https://www.tensorflow.org/api_docs/python/tf/estimator/tpu/TPUEstimatorSpec
        for more information.

        Arguments match, in order, the `Tensor` list given as the second
        element of the `host_call` tuple.

        Args:
          gs: `Tensor` with shape `[batch]` for the global_step.
          loss: `Tensor` with shape `[batch]` for the training loss.
          lr: `Tensor` with shape `[batch]` for the learning_rate.
          ce: `Tensor` with shape `[batch]` for the current_epoch.

        Returns:
          List of summary ops to run on the CPU host.
        """
        gs = gs[0]
        # Host call fns are executed params['iterations_per_loop'] times
        # after one TPU loop is finished; a max_queue of the same size makes
        # the summary writer flush to storage only once per loop.
        writer = tf2.summary.create_file_writer(
            FLAGS.model_dir, max_queue=params['iterations_per_loop'])
        with writer.as_default():
          with tf2.summary.record_if(True):
            tf2.summary.scalar('loss', loss[0], step=gs)
            tf2.summary.scalar('learning_rate', lr[0], step=gs)
            tf2.summary.scalar('current_epoch', ce[0], step=gs)

            return tf.summary.all_v2_summary_ops()

      # host_call expects [batch_size, ...] Tensors, so each scalar is
      # reshaped to carry a batch dimension. These Tensors are implicitly
      # concatenated to [params['batch_size']].
      gs_t = tf.reshape(global_step, [1])
      loss_t = tf.reshape(loss, [1])
      lr_t = tf.reshape(learning_rate, [1])
      ce_t = tf.reshape(current_epoch, [1])

      host_call = (host_call_fn, [gs_t, loss_t, lr_t, ce_t])

  eval_metrics = None
  if is_eval:

    def metric_fn(labels, logits):
      """Computes top-1 and top-5 accuracy for evaluation.

      This function is executed on the CPU and should not directly reference
      any Tensors in the rest of the `model_fn`; everything it needs is
      passed in through `eval_metrics`. See
      https://www.tensorflow.org/api_docs/python/tf/estimator/tpu/TPUEstimatorSpec
      for more information.

      Args:
        labels: `Tensor` with shape `[batch]`.
        logits: `Tensor` with shape `[batch, num_classes]`.

      Returns:
        A dict of the metrics to return from evaluation.
      """
      predictions = tf.argmax(logits, axis=1)
      top_1_accuracy = tf.metrics.accuracy(labels, predictions)
      in_top_5 = tf.cast(tf.nn.in_top_k(logits, labels, 5), tf.float32)
      top_5_accuracy = tf.metrics.mean(in_top_5)

      return {
          'top_1_accuracy': top_1_accuracy,
          'top_5_accuracy': top_5_accuracy,
      }

    eval_metrics = (metric_fn, [labels, logits])

  num_params = np.sum([np.prod(v.shape) for v in tf.trainable_variables()])
  tf.logging.info('number of trainable parameters: {}'.format(num_params))

  # Prepares scaffold_fn if needed.
  scaffold_fn = None
  if is_training and FLAGS.init_checkpoint:
    variables_to_restore = get_pretrained_variables_to_restore(
        FLAGS.init_checkpoint, has_moving_average_decay)
    tf.logging.info('Initializing from pretrained checkpoint: %s',
                    FLAGS.init_checkpoint)
    if FLAGS.use_tpu:

      def init_scaffold():
        """Initializes from the pretrained checkpoint inside the scaffold."""
        tf.train.init_from_checkpoint(FLAGS.init_checkpoint,
                                      variables_to_restore)
        return tf.train.Scaffold()

      scaffold_fn = init_scaffold
    else:
      tf.train.init_from_checkpoint(FLAGS.init_checkpoint,
                                    variables_to_restore)

  restore_vars_dict = None
  if has_moving_average_decay and not is_training:
    # Evaluate with the moving-average copies of the variables.
    restore_vars_dict = ema.variables_to_restore(ema_vars)

    def eval_scaffold():
      """Returns a Scaffold whose saver loads the moving averages."""
      return tf.train.Scaffold(saver=tf.train.Saver(restore_vars_dict))

    scaffold_fn = eval_scaffold

  return tf.estimator.tpu.TPUEstimatorSpec(
      mode=mode,
      loss=loss,
      train_op=train_op,
      host_call=host_call,
      eval_metrics=eval_metrics,
      scaffold_fn=scaffold_fn)
def bert_model_fn(features, labels, is_training):  # pylint: disable=unused-argument
  """The `model_fn` for LowLevelRunner.

  Builds the BERT pretraining graph (masked LM + next-sentence heads) and
  returns either the training op or the evaluation tensors.

  Args:
    features: dict of input tensors. The keys read here are `input_ids`,
      `input_mask`, `segment_ids`, `masked_lm_positions`, `masked_lm_ids`,
      `masked_lm_weights` and `next_sentence_labels`.
    labels: unused.
    is_training: truthy to build the training graph, falsy to build the
      evaluation outputs.

  Returns:
    `(train_op, None)` when `is_training` is truthy. Otherwise
    `(None, outputs)`, where `outputs` maps `masked_lm_weighted_correct` and
    `masked_lm_weighted_count` to tensors reshaped to `[-1]`.
  """
  tf.logging.info("*** Features ***")
  for name in sorted(features.keys()):
    # Arguments are passed separately so formatting is deferred to the logger.
    tf.logging.info(" name = %s, shape = %s", name, features[name].shape)

  input_ids = features["input_ids"]
  input_mask = features["input_mask"]
  segment_ids = features["segment_ids"]
  masked_lm_positions = features["masked_lm_positions"]
  masked_lm_ids = features["masked_lm_ids"]
  masked_lm_weights = features["masked_lm_weights"]
  next_sentence_labels = features["next_sentence_labels"]

  bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)
  use_one_hot_embeddings = False
  learning_rate = FLAGS.learning_rate
  num_train_steps = FLAGS.num_train_steps
  num_warmup_steps = FLAGS.num_warmup_steps
  start_warmup_step = FLAGS.start_warmup_step
  use_tpu = FLAGS.use_tpu
  optimizer = FLAGS.optimizer
  poly_power = FLAGS.poly_power
  lamb_weight_decay_rate = FLAGS.lamb_weight_decay_rate
  lamb_beta_1 = FLAGS.lamb_beta_1
  lamb_beta_2 = FLAGS.lamb_beta_2
  log_epsilon = FLAGS.log_epsilon

  # Each hyperparameter is reported through both the TF logger and stdout.
  reported_hparams = (
      ("learning rate", learning_rate),
      ("lamb_weight_decay_rate", lamb_weight_decay_rate),
      ("beta 1", lamb_beta_1),
      ("beta 2", lamb_beta_2),
      ("log_epsilon", log_epsilon),
      ("num_warmup_steps", num_warmup_steps),
      ("num_train_steps", num_train_steps),
  )
  for label, value in reported_hparams:
    tf.logging.info("Using %s: %s", label, value)
    print("Using %s:" % label, value)

  tf.get_variable_scope().set_custom_getter(
      modeling.bfloat16_var_getter if FLAGS.use_bfloat16_activation else None)

  if FLAGS.use_bfloat16_activation:
    tf.logging.info("Using bfloat16 for activations.")

  model = modeling.BertModel(
      config=bert_config,
      is_training=is_training,
      input_ids=input_ids,
      input_mask=input_mask,
      token_type_ids=segment_ids,
      use_one_hot_embeddings=use_one_hot_embeddings,
      use_bfloat16_activation=FLAGS.use_bfloat16_activation,
      num_partitions=FLAGS.num_partitions)

  # The per-example masked-LM loss is not consumed by either return path.
  (masked_lm_loss, _, masked_lm_log_probs) = get_masked_lm_output(
      bert_config, tf.cast(model.get_sequence_output(), tf.float32),
      model.get_embedding_table(), masked_lm_positions, masked_lm_ids,
      masked_lm_weights, FLAGS.num_partitions)

  (
      next_sentence_loss,
      _,  # next_sentence_example_loss,
      _  # next_sentence_log_probs
  ) = get_next_sentence_output(bert_config,
                               tf.cast(model.get_pooled_output(), tf.float32),
                               next_sentence_labels)

  total_loss = masked_lm_loss + next_sentence_loss

  if not is_training:
    # Evaluation: emit the weighted count of correct masked-LM predictions
    # and the total weight, so the caller can form the accuracy ratio.
    masked_lm_log_probs = tf.reshape(masked_lm_log_probs,
                                     [-1, masked_lm_log_probs.shape[-1]])
    masked_lm_predictions = tf.argmax(
        masked_lm_log_probs, axis=-1, output_type=tf.int32)
    masked_lm_ids = tf.reshape(masked_lm_ids, [-1])
    masked_lm_weights = tf.reshape(masked_lm_weights, [-1])
    masked_lm_weighted_correct = tf.multiply(
        tf.cast(tf.equal(masked_lm_ids, masked_lm_predictions), tf.float32),
        masked_lm_weights)
    masked_lm_weighted_correct = tf.reduce_sum(masked_lm_weighted_correct)
    masked_lm_weighted_count = tf.reduce_sum(masked_lm_weights)
    return None, {
        "masked_lm_weighted_correct":
            tf.reshape(masked_lm_weighted_correct, [-1]),
        "masked_lm_weighted_count":
            tf.reshape(masked_lm_weighted_count, [-1]),
    }

  train_op = optimization.create_optimizer(
      total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu,
      optimizer, poly_power, start_warmup_step, lamb_weight_decay_rate,
      lamb_beta_1, lamb_beta_2, log_epsilon, FLAGS.use_bfloat16_all_reduce)

  return train_op, None
def __init__(self, config, w2i_target, useTeacherForcing=True,
             useAttention=True, useBeamSearch=1):
    """Build the GRU encoder-decoder graph.

    Sets `self.out` to the predicted token ids. When beam search is not
    used (`useBeamSearch <= 1`) it also sets `self.loss` and
    `self.train_op`.

    Args:
        config: object whose attributes `source_vocab_size`,
            `target_vocab_size`, `embedding_dim`, `hidden_dim`,
            `batch_size` and `learning_rate` are read here.
        w2i_target: mapping from target tokens to ids; the "_GO" and
            "_EOS" entries are looked up.
        useTeacherForcing: if truthy, decode with a `TrainingHelper` fed
            the shifted targets; otherwise use a `GreedyEmbeddingHelper`.
        useAttention: if truthy, wrap the decoder cell in Bahdanau
            attention over the encoder outputs.
        useBeamSearch: beam width; values above 1 select
            `BeamSearchDecoder`.
    """
    self.build_inputs(config)

    seq2seq = tf.contrib.seq2seq
    beam_search = useBeamSearch > 1
    go_id = w2i_target["_GO"]

    with tf.variable_scope("encoder"):
        encoder_embedding = tf.Variable(
            tf.random_uniform(
                [config.source_vocab_size, config.embedding_dim]),
            dtype=tf.float32,
            name='encoder_embedding')
        encoder_inputs_embedded = tf.nn.embedding_lookup(
            encoder_embedding, self.seq_inputs)

        rnn_outputs, rnn_states = tf.nn.bidirectional_dynamic_rnn(
            cell_fw=tf.nn.rnn_cell.GRUCell(config.hidden_dim),
            cell_bw=tf.nn.rnn_cell.GRUCell(config.hidden_dim),
            inputs=encoder_inputs_embedded,
            sequence_length=self.seq_inputs_length,
            dtype=tf.float32,
            time_major=False)
        forward_outputs, backward_outputs = rnn_outputs
        forward_state, backward_state = rnn_states

        # Both directions are summed (not concatenated), so the decoder
        # keeps the same hidden size as each encoder direction.
        encoder_state = tf.add(forward_state, backward_state)
        encoder_outputs = tf.add(forward_outputs, backward_outputs)

    with tf.variable_scope("decoder"):
        decoder_embedding = tf.Variable(
            tf.random_uniform(
                [config.target_vocab_size, config.embedding_dim]),
            dtype=tf.float32,
            name='decoder_embedding')

        tokens_go = tf.ones(
            [config.batch_size], dtype=tf.int32, name='tokens_GO') * go_id

        if useTeacherForcing:
            # Decoder input is the target shifted right behind a GO token.
            decoder_inputs = tf.concat(
                [tf.reshape(tokens_go, [-1, 1]), self.seq_targets[:, :-1]],
                1)
            helper = seq2seq.TrainingHelper(
                tf.nn.embedding_lookup(decoder_embedding, decoder_inputs),
                self.seq_targets_length)
        else:
            helper = seq2seq.GreedyEmbeddingHelper(
                decoder_embedding, tokens_go, w2i_target["_EOS"])

        with tf.variable_scope("gru_cell"):
            decoder_cell = tf.nn.rnn_cell.GRUCell(config.hidden_dim)

            if useAttention:
                # With beam search, the memory and its lengths are tiled
                # once per beam.
                if beam_search:
                    memory = seq2seq.tile_batch(
                        encoder_outputs, multiplier=useBeamSearch)
                    memory_lengths = seq2seq.tile_batch(
                        self.seq_inputs_length, multiplier=useBeamSearch)
                else:
                    memory = encoder_outputs
                    memory_lengths = self.seq_inputs_length

                # seq2seq.LuongAttention accepts the same arguments and
                # can be swapped in here.
                attention_mechanism = seq2seq.BahdanauAttention(
                    num_units=config.hidden_dim,
                    memory=memory,
                    memory_sequence_length=memory_lengths)
                decoder_cell = seq2seq.AttentionWrapper(
                    decoder_cell, attention_mechanism)

                if beam_search:
                    initial_cell_state = seq2seq.tile_batch(
                        encoder_state, multiplier=useBeamSearch)
                    decode_batch_size = config.batch_size * useBeamSearch
                else:
                    initial_cell_state = encoder_state
                    decode_batch_size = config.batch_size

                # Start from the wrapper's zero state, with its cell state
                # replaced by the encoder's final state.
                zero_state = decoder_cell.zero_state(
                    batch_size=decode_batch_size, dtype=tf.float32)
                decoder_initial_state = zero_state.clone(
                    cell_state=initial_cell_state)
            elif beam_search:
                decoder_initial_state = seq2seq.tile_batch(
                    encoder_state, multiplier=useBeamSearch)
            else:
                decoder_initial_state = encoder_state

            if beam_search:
                decoder = seq2seq.BeamSearchDecoder(
                    decoder_cell,
                    decoder_embedding,
                    tokens_go,
                    w2i_target["_EOS"],
                    decoder_initial_state,
                    beam_width=useBeamSearch,
                    output_layer=tf.layers.Dense(config.target_vocab_size))
            else:
                decoder = seq2seq.BasicDecoder(
                    decoder_cell,
                    helper,
                    decoder_initial_state,
                    output_layer=tf.layers.Dense(config.target_vocab_size))

        decode_result = seq2seq.dynamic_decode(
            decoder,
            maximum_iterations=tf.reduce_max(self.seq_targets_length))
        decoder_outputs, decoder_state, final_sequence_lengths = decode_result

    if beam_search:
        # Index 0 along the last (beam) axis of the predicted ids.
        self.out = decoder_outputs.predicted_ids[:, :, 0]
    else:
        decoder_logits = decoder_outputs.rnn_output
        self.out = tf.argmax(decoder_logits, 2)

        # Positions beyond each target's length get zero loss weight.
        sequence_mask = tf.sequence_mask(
            self.seq_targets_length, dtype=tf.float32)
        self.loss = seq2seq.sequence_loss(
            logits=decoder_logits,
            targets=self.seq_targets,
            weights=sequence_mask)

        optimizer = tf.train.AdamOptimizer(
            learning_rate=config.learning_rate)
        self.train_op = optimizer.minimize(self.loss)
# Builds the loss, optimizer step and accuracy ops, then either restores the
# session from a saved checkpoint or runs the variable initializer.
with tf.name_scope('Loss'):
    # Predictions are clipped away from zero so tf.log never receives 0.
    cross_entropy = tf.reduce_mean(
        -tf.reduce_sum(
            ys * tf.log(tf.clip_by_value(prediction, 1e-15, 1.0)),
            axis=[1]))
    # tf.add_to_collection('losses', cross_entropy)  # add the cross entropy to the 'losses' collection
    # loss = tf.add_n(tf.get_collection('losses'))  # sum every entry in 'losses'
    tf.summary.scalar('loss', cross_entropy)

with tf.name_scope('Train'):
    # train = tf.train.AdamOptimizer(learnRate).minimize(cross_entropy)
    train = tf.train.MomentumOptimizer(
        learnRate, momentum=0.9).minimize(cross_entropy)

with tf.name_scope('Accuracy'):
    correct = tf.equal(tf.argmax(ys, 1), tf.argmax(prediction, 1))
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
    tf.summary.scalar('accuracy', accuracy)

saver = tf.train.Saver()
# tf.initialize_all_variables() is deprecated in favor of this initializer.
init = tf.global_variables_initializer()
merge = tf.summary.merge_all()
trainWriter = tf.summary.FileWriter(trainLogPath, sess.graph)
testWriter = tf.summary.FileWriter(testLogPath)

# NOTE(review): existence is tested on saverDistPath but the restore reads
# saverPath; confirm that both refer to the same checkpoint.
if Path(saverDistPath).exists():
    saver.restore(sess, saverPath)
else:
    sess.run(init)
# sess.run(init)
def _accuracy(self):
    """Attach the masked accuracy, predicted-class and label-class tensors.

    Sets `self.accuracy` from `masked_accuracy` over the model outputs, the
    `labels` placeholder and the `labels_mask` placeholder, and sets
    `self.pred` / `self.labels` to the argmax along axis 1 of the outputs
    and of the `labels` placeholder respectively.
    """
    outputs = self.outputs
    targets = self.placeholders['labels']
    mask = self.placeholders['labels_mask']

    self.accuracy = masked_accuracy(outputs, targets, mask)
    self.pred = tf.argmax(self.outputs, 1)
    self.labels = tf.argmax(self.placeholders['labels'], 1)
def _model_fn(features, labels, mode, params=None):
  """Builds the indicator model for training or for mesh generation.

  Args:
    features: dict of input tensors. `point`, `transform` and `joint` are
      always read; `label`, `vert` and `weight` are read in TRAIN mode.
    labels: replaced by the reshaped `features['label']` in TRAIN mode;
      otherwise passed to `model_utils.nasa_indicator` as `noise`.
    mode: a `tf.estimator.ModeKeys` value.
    params: when equal to `'gen_mesh'`, the function returns raw tensors
      instead of an `EstimatorSpec`.

  Returns:
    For `params == 'gen_mesh'`: a tuple `(latent_holder, latent_output,
    occupancy)` where `occupancy` concatenates `parts` with the expanded
    `predictions` along axis 1. Otherwise a `tf.estimator.EstimatorSpec`
    carrying the loss and the train op.

  NOTE(review): the losses are only built when `mode` is TRAIN, so the
  non-`gen_mesh` path appears to be usable only for training - confirm.
  """
  is_training = mode == tf.estimator.ModeKeys.TRAIN
  gen_mesh = params == 'gen_mesh'

  point_shape = features['point'].shape
  batch_size = point_shape[0]
  n_sample_frames = point_shape[1]
  # Batch and frame axes are folded into a single leading axis.
  accum_size = batch_size * n_sample_frames

  if gen_mesh:
    latent_output = tf.constant([0, 0, 0], dtype=tf.float32)
    latent_holder = tf.placeholder(tf.float32, latent_output.shape)

  # Decode the transformed shapes and compute the losses.
  with tf.variable_scope('shape/decode', reuse=tf.AUTO_REUSE):
    transform = tf.reshape(features['transform'],
                           [accum_size, n_parts, transform_dims])
    joint = tf.reshape(features['joint'], [accum_size, n_parts, n_dims])
    points = features['point']
    n_points = tf.shape(points)[2]
    points = tf.reshape(points, [accum_size, n_points, n_dims])

    if is_training:
      labels = tf.reshape(features['label'], [accum_size, n_points, 1])
      predictions, parts = model_utils.nasa_indicator(
          points, transform, joint, hparams, need_transformation=True)
      indicator_loss = model_utils.compute_l2_indicator_loss(
          labels, predictions)

      minimal_loss = tf.reduce_mean(
          tf.square(parts[..., :sample_bbox, :]))

      part_points = tf.reshape(features['vert'], [accum_size, -1, n_dims])
      part_weight = tf.reshape(features['weight'],
                               [accum_size, -1, n_parts])
      if sample_vert > 0:  # If 0, use all vertices.
        n_vert = part_points.shape[1]
        # Draw `sample_vert` random vertex indices per folded batch entry.
        sample_indices = tf.random.uniform(
            [accum_size, sample_vert],
            minval=0,
            maxval=n_vert,
            dtype=tf.int32)
        part_points = tf.gather(
            part_points, sample_indices, axis=1, batch_dims=1)
        part_weight = tf.gather(
            part_weight, sample_indices, axis=1, batch_dims=1)

      unused_var, pred_parts = model_utils.nasa_indicator(
          part_points, transform, joint, hparams, need_transformation=True)

      # One-hot (scaled by level_set) of the arg-max weight per vertex.
      hard_assignment = tf.argmax(part_weight, axis=-1)
      part_label = tf.one_hot(
          hard_assignment, depth=n_parts, axis=-1,
          dtype=tf.float32) * level_set
      part_label = tf.expand_dims(
          tf.transpose(part_label, [0, 2, 1]), axis=-1)
      label_loss = model_utils.compute_l2_indicator_loss(
          part_label, pred_parts)
    else:
      n_points = tf.shape(features['point'])[2]
      points = tf.reshape(features['point'],
                          [accum_size, n_points, n_dims])
      predictions, parts = model_utils.nasa_indicator(
          points,
          transform,
          joint,
          hparams,
          need_transformation=True,
          noise=labels)

  if gen_mesh:
    occupancy = tf.concat(
        [parts, tf.expand_dims(predictions, axis=1)], axis=1)
    return latent_holder, latent_output, occupancy

  tf.summary.scalar('indicator', indicator_loss)

  # `loss` is bound before the weighted terms are added, so the value
  # reported in the spec is the plain indicator loss, while the optimizer
  # below minimizes the augmented sum.
  loss = indicator_loss
  if label_w > 0:
    tf.summary.scalar('label', label_loss)
    indicator_loss += label_loss * label_w
  if minimal_w > 0:
    tf.summary.scalar('minimal', minimal_loss)
    indicator_loss += minimal_loss * minimal_w

  global_step = tf.train.get_or_create_global_step()
  optimizer = tf.train.AdamOptimizer(lr)
  update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
  with tf.control_dependencies(update_ops):
    train_op = optimizer.minimize(
        indicator_loss, global_step=global_step, name='optimizer_shape')

  return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)
def resnet_model_fn(features, labels, mode, params):
  """The model_fn for ResNet to be used with TPUEstimator.

  Args:
    features: `Tensor` of batched images, or a `dict` carrying that tensor
      under the `'feature'` key. If transpose_input is enabled, it is
      transposed to device layout and reshaped to 1D tensor.
    labels: `Tensor` of labels for the data samples
    mode: one of `tf.estimator.ModeKeys.{TRAIN,EVAL,PREDICT}`
    params: `dict` of parameters passed to the model from the TPUEstimator,
      `params['batch_size']` is always provided and should be used as the
      effective batch size.

  Returns:
    A `TPUEstimatorSpec` for the model (a plain `EstimatorSpec` in PREDICT
    mode).

  Raises:
    ValueError: if `params['dropblock_groups']` names a group outside 1..4,
      or if `params['precision']` is neither `'bfloat16'` nor `'float32'`.
  """
  if isinstance(features, dict):
    features = features['feature']

  # In most cases, the default data format NCHW instead of NHWC should be
  # used for a significant performance boost on GPU/TPU. NHWC should be used
  # only if the network needs to be run on CPU since the pooling operations
  # are only supported on NHWC.
  if params['data_format'] == 'channels_first':
    assert not params['transpose_input']    # channels_first only for GPU
    features = tf.transpose(features, [0, 3, 1, 2])
    # After the transpose the channel axis precedes H and W, so the
    # per-channel statistics must be shaped [3, 1, 1] to broadcast.
    stats_shape = [3, 1, 1]
  else:
    stats_shape = [1, 1, 3]

  if params['transpose_input'] and mode != tf.estimator.ModeKeys.PREDICT:
    # The flattened input holds H * W * 3 * N values; recover the (square)
    # image side as an integer, since tf.reshape needs integer dimensions.
    pixels_per_image = (
        tf.cast(tf.shape(features)[0], tf.float64) /
        tf.cast(3 * tf.shape(labels)[0], tf.float64))
    image_size = tf.cast(tf.round(tf.sqrt(pixels_per_image)), tf.int32)
    features = tf.reshape(features, [image_size, image_size, 3, -1])
    features = tf.transpose(features, [3, 0, 1, 2])  # HWCN to NHWC

  # Normalize the image to zero mean and unit variance.
  features -= tf.constant(MEAN_RGB, shape=stats_shape, dtype=features.dtype)
  features /= tf.constant(STDDEV_RGB, shape=stats_shape, dtype=features.dtype)

  # DropBlock keep_prob for the 4 block groups of ResNet architecture.
  # None means applying no DropBlock at the corresponding block group.
  dropblock_keep_probs = [None] * 4
  if params['dropblock_groups']:
    # Scheduled keep_prob for DropBlock: goes linearly from 1 down to
    # params['dropblock_keep_prob'] over params['train_steps'] steps.
    train_steps = tf.cast(params['train_steps'], tf.float32)
    current_step = tf.cast(tf.train.get_global_step(), tf.float32)
    current_ratio = current_step / train_steps
    dropblock_keep_prob = (1 - current_ratio * (
        1 - params['dropblock_keep_prob']))

    # Computes DropBlock keep_prob for different block groups of ResNet.
    dropblock_groups = [int(x) for x in params['dropblock_groups'].split(',')]
    for block_group in dropblock_groups:
      if block_group < 1 or block_group > 4:
        raise ValueError(
            'dropblock_groups should be a comma separated list of integers '
            'between 1 and 4 (dropblcok_groups: {}).'
            .format(params['dropblock_groups']))
      # The drop rate shrinks by a factor of 4 per group below group 4.
      dropblock_keep_probs[block_group - 1] = 1 - (
          (1 - dropblock_keep_prob) / 4.0**(4 - block_group))

  # This nested function allows us to avoid duplicating the logic which
  # builds the network, for different values of --precision.
  def build_network():
    """Instantiates the ResNet and applies it to `features`."""
    network = resnet_model.resnet(
        resnet_depth=params['resnet_depth'],
        num_classes=params['num_label_classes'],
        dropblock_size=params['dropblock_size'],
        dropblock_keep_probs=dropblock_keep_probs,
        pre_activation=params['pre_activation'],
        norm_act_layer=params['norm_act_layer'],
        data_format=params['data_format'])
    return network(
        inputs=features, is_training=(mode == tf.estimator.ModeKeys.TRAIN))

  if params['precision'] == 'bfloat16':
    with tf.tpu.bfloat16_scope():
      logits = build_network()
    logits = tf.cast(logits, tf.float32)
  elif params['precision'] == 'float32':
    logits = build_network()
  else:
    # Fail here with a clear message rather than with an unbound `logits`
    # further down.
    raise ValueError(
        'Unsupported precision {!r}; expected bfloat16 or float32.'.format(
            params['precision']))

  if mode == tf.estimator.ModeKeys.PREDICT:
    predictions = {
        'classes': tf.argmax(logits, axis=1),
        'probabilities': tf.nn.softmax(logits, name='softmax_tensor')
    }
    return tf.estimator.EstimatorSpec(
        mode=mode,
        predictions=predictions,
        export_outputs={
            'classify': tf.estimator.export.PredictOutput(predictions)
        })

  # If necessary, in the model_fn, use params['batch_size'] instead the batch
  # size flags (--train_batch_size or --eval_batch_size).
  batch_size = params['batch_size']   # pylint: disable=unused-variable

  # Calculate loss, which includes softmax cross entropy and L2 regularization.
  one_hot_labels = tf.one_hot(labels, params['num_label_classes'])
  cross_entropy = tf.losses.softmax_cross_entropy(
      logits=logits,
      onehot_labels=one_hot_labels,
      label_smoothing=params['label_smoothing'])

  # Add weight decay to the loss for non-batch-normalization variables.
  # With LARS enabled no L2 term is added to the loss here.
  if params['enable_lars']:
    loss = cross_entropy
  else:
    loss = cross_entropy + params['weight_decay'] * tf.add_n([
        tf.nn.l2_loss(v)
        for v in tf.trainable_variables()
        if 'batch_normalization' not in v.name and 'evonorm' not in v.name
    ])

  host_call = None
  if mode == tf.estimator.ModeKeys.TRAIN:
    # Compute the current epoch and associated learning rate from global_step.
    global_step = tf.train.get_global_step()
    steps_per_epoch = params['num_train_images'] / params['train_batch_size']
    current_epoch = (tf.cast(global_step, tf.float32) / steps_per_epoch)
    # LARS is a large batch optimizer. LARS enables higher accuracy at batch 16K
    # and larger batch sizes.
    if params['enable_lars']:
      # Placeholder value: only used for the learning_rate summary below.
      learning_rate = 0.0
      optimizer = lars_util.init_lars_optimizer(current_epoch, params)
    else:
      learning_rate = learning_rate_schedule(params, current_epoch)
      optimizer = tf.train.MomentumOptimizer(
          learning_rate=learning_rate,
          momentum=params['momentum'],
          use_nesterov=True)
    if params['use_tpu']:
      # When using TPU, wrap the optimizer with CrossShardOptimizer which
      # handles synchronization details between different TPU cores. To the
      # user, this should look like regular synchronous training.
      optimizer = tf.tpu.CrossShardOptimizer(optimizer)

    # Batch normalization requires UPDATE_OPS to be added as a dependency to
    # the train operation.
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
      train_op = optimizer.minimize(loss, global_step)

    if not params['skip_host_call']:
      def host_call_fn(gs, loss, lr, ce):
        """Training host call. Creates scalar summaries for training metrics.

        This function is executed on the CPU and should not directly reference
        any Tensors in the rest of the `model_fn`. To pass Tensors from the
        model to the `metric_fn`, provide as part of the `host_call`. See
        https://www.tensorflow.org/api_docs/python/tf/estimator/tpu/TPUEstimatorSpec
        for more information.

        Arguments should match the list of `Tensor` objects passed as the second
        element in the tuple passed to `host_call`.

        Args:
          gs: `Tensor with shape `[batch]` for the global_step
          loss: `Tensor` with shape `[batch]` for the training loss.
          lr: `Tensor` with shape `[batch]` for the learning_rate.
          ce: `Tensor` with shape `[batch]` for the current_epoch.

        Returns:
          List of summary ops to run on the CPU host.
        """
        gs = gs[0]
        # Host call fns are executed params['iterations_per_loop'] times after
        # one TPU loop is finished, setting max_queue value to the same as
        # number of iterations will make the summary writer only flush the data
        # to storage once per loop.
        with tf2.summary.create_file_writer(
            FLAGS.model_dir,
            max_queue=params['iterations_per_loop']).as_default():
          with tf2.summary.record_if(True):
            tf2.summary.scalar('loss', loss[0], step=gs)
            tf2.summary.scalar('learning_rate', lr[0], step=gs)
            tf2.summary.scalar('current_epoch', ce[0], step=gs)

            return tf.summary.all_v2_summary_ops()

      # To log the loss, current learning rate, and epoch for Tensorboard, the
      # summary op needs to be run on the host CPU via host_call. host_call
      # expects [batch_size, ...] Tensors, thus reshape to introduce a batch
      # dimension. These Tensors are implicitly concatenated to
      # [params['batch_size']].
      gs_t = tf.reshape(global_step, [1])
      loss_t = tf.reshape(loss, [1])
      lr_t = tf.reshape(learning_rate, [1])
      ce_t = tf.reshape(current_epoch, [1])

      host_call = (host_call_fn, [gs_t, loss_t, lr_t, ce_t])

  else:
    train_op = None

  eval_metrics = None
  if mode == tf.estimator.ModeKeys.EVAL:
    def metric_fn(labels, logits):
      """Evaluation metric function. Evaluates accuracy.

      This function is executed on the CPU and should not directly reference
      any Tensors in the rest of the `model_fn`. To pass Tensors from the model
      to the `metric_fn`, provide as part of the `eval_metrics`. See
      https://www.tensorflow.org/api_docs/python/tf/estimator/tpu/TPUEstimatorSpec
      for more information.

      Arguments should match the list of `Tensor` objects passed as the second
      element in the tuple passed to `eval_metrics`.

      Args:
        labels: `Tensor` with shape `[batch]`.
        logits: `Tensor` with shape `[batch, num_classes]`.

      Returns:
        A dict of the metrics to return from evaluation.
      """
      predictions = tf.argmax(logits, axis=1)
      top_1_accuracy = tf.metrics.accuracy(labels, predictions)
      in_top_5 = tf.cast(tf.nn.in_top_k(logits, labels, 5), tf.float32)
      top_5_accuracy = tf.metrics.mean(in_top_5)

      return {
          'top_1_accuracy': top_1_accuracy,
          'top_5_accuracy': top_5_accuracy,
      }

    eval_metrics = (metric_fn, [labels, logits])

  return tf.estimator.tpu.TPUEstimatorSpec(
      mode=mode,
      loss=loss,
      train_op=train_op,
      host_call=host_call,
      eval_metrics=eval_metrics)
def infer(self, features=None, decode_length=50, beam_size=1, top_beams=1,
          alpha=0.0, use_tpu=False):
    """Produce predictions from the model.

    When `self._hparams.do_mask` is falsy, decoding is delegated to the parent
    class and the "outputs" entry of its result is returned. Otherwise the
    model is called once on an initial targets tensor and the result is
    optionally refined for `how_many_more_steps` extra passes.

    Args:
      features: dict of input features; an empty dict is used when falsy.
        The dict is modified in place ("targets" is written, and "inputs" is
        temporarily expanded to rank 4 and restored before returning).
      decode_length: forwarded to the parent `infer` only.
      beam_size: forwarded to the parent `infer` only.
      top_beams: forwarded to the parent `infer` only.
      alpha: forwarded to the parent `infer` only.
      use_tpu: forwarded to the parent `infer` only.

    Returns:
      The parent's "outputs" when masking is disabled. Otherwise the model
      logits when the reference tensor is `tf.float32`, or the argmax of the
      logits over the last axis for any other dtype.
    """
    if not self._hparams.do_mask:
        infer_out = super(TransformerAE, self).infer(
            features, decode_length, beam_size, top_beams, alpha,
            use_tpu=use_tpu)
        return infer_out["outputs"]

    if not features:
        features = {}
    inputs_old = None
    if "inputs" in features and len(features["inputs"].shape) < 4:
        inputs_old = features["inputs"]
        features["inputs"] = tf.expand_dims(features["inputs"], 2)

    # inputs might not be present in features (e.g.: language modeling),
    # in which case we fall back to 'infer_targets' for calculating initial
    # input shape, type, etc.
    # This lookup is done before branching because the dtype check further
    # down needs it on BOTH paths; previously it was only bound in the `else`
    # branch, so supplying "partial_targets" raised UnboundLocalError.
    inputs_or_targets = features.get("inputs", features.get("infer_targets"))

    # Create an initial targets tensor.
    if "partial_targets" in features:
        initial_output = tf.convert_to_tensor(features["partial_targets"])
    else:
        reference_shape = common_layers.shape_list(inputs_or_targets)
        batch_size = reference_shape[0]
        length = reference_shape[1]
        hidden_dim = reference_shape[-1]
        target_length = tf.to_int32(2.0 * tf.to_float(length))
        initial_output = tf.zeros(
            (batch_size, target_length, 1, hidden_dim),
            dtype=inputs_or_targets.dtype)

    # With only "partial_targets" available there is no inputs/infer_targets
    # tensor to consult, so fall back to the dtype of the initial targets.
    dtype_reference = (
        initial_output if inputs_or_targets is None else inputs_or_targets)
    # When target_modality is real, the last axis does not represent
    # classes, so it must not be argmax'ed.
    targets_are_real = dtype_reference.dtype == tf.float32

    features["targets"] = initial_output
    logits, _ = self(features)  # pylint: disable=not-callable
    samples = logits if targets_are_real else tf.argmax(logits, axis=-1)

    # More steps.
    self.predict_mask = 0.0  # Use the provided targets this time.
    how_many_more_steps = 0  # Set to 1 or more for Gibbs-like sampling.
    for _ in range(how_many_more_steps):
        with tf.variable_scope(tf.get_variable_scope(), reuse=True):
            features["targets"] = samples
            logits, _ = self(features)  # pylint: disable=not-callable
            samples = (
                logits if targets_are_real else tf.argmax(logits, axis=-1))

    self.predict_mask = 1.0
    if inputs_old is not None:  # Restore to not confuse Estimator.
        features["inputs"] = inputs_old
    return samples
print ('fc1',fc1.get_shape()) #fc1 = tf.nn.relu(fc1) #fc1 = tf.nn.dropout(fc1, dropout) out = tf.add(tf.matmul(fc1, weights['out']), biases['out']) return out pred = conv_lstm_net1(x, pixel_coordinate, weights, biases, keep_prob) pred1 = tf.nn.softmax(conv_lstm_net1(x, pixel_coordinate, weights, biases, keep_prob)) cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=pred, labels=y))#reduce是求均值 optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost) correct_pred = tf.equal(tf.argmax(pred, 1), tf.argmax(y, 1))#equal返回值为true/false #argmax求最大元素索引值 accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))#cast将true和false转化为1和0 init = tf.global_variables_initializer() for i in range (test_num): with tf.Session() as sess: sess.run(init) out_num = int(training_iters / display_step) - 1 Testing_Accuarcy = np.zeros([1,2]) step = 0 accuracy_test = 0 rec_loss = np.zeros([out_num,1])
def __init__(self, config: configure_pretraining.PretrainingConfig,
             features, is_training):
    """Build the pre-training graph: masking, generator, discriminator, metrics.

    Sets `self.mlm_output`, `self.total_loss` and `self.eval_metrics` (a
    `(metric_fn, tensors)` pair).

    Args:
      config: pre-training configuration.
      features: input features, converted with
        `pretrain_data.features_to_inputs`.
      is_training: forwarded to the transformer builders.
    """
    # Set up model config
    self._config = config
    bert_config = training_utils.get_bert_config(config)
    self._bert_config = bert_config
    if config.debug:
        bert_config.num_hidden_layers = 3
        bert_config.hidden_size = 144
        bert_config.intermediate_size = 144 * 4
        bert_config.num_attention_heads = 4

    # Mask the input
    unmasked_inputs = pretrain_data.features_to_inputs(features)
    masked_inputs = pretrain_helpers.mask(
        config, unmasked_inputs, config.mask_prob)

    # Generator
    if config.embedding_size is None:
        embedding_size = bert_config.hidden_size
    else:
        embedding_size = config.embedding_size

    cloze_output = None
    if config.uniform_generator:
        # simple generator sampling fakes uniformly at random
        mlm_output = self._get_masked_lm_output(masked_inputs, None)
    elif ((config.electra_objective or config.electric_objective)
          and config.untied_generator):
        generator_config = get_generator_config(config, bert_config)
        if config.two_tower_generator:
            # two-tower cloze model generator used for electric
            generator = TwoTowerClozeTransformer(
                config, generator_config, unmasked_inputs, is_training,
                embedding_size)
            cloze_output = self._get_cloze_outputs(unmasked_inputs, generator)
            masked_position_logits = pretrain_helpers.gather_positions(
                cloze_output.logits, masked_inputs.masked_lm_positions)
            mlm_output = get_softmax_output(
                masked_position_logits,
                masked_inputs.masked_lm_ids,
                masked_inputs.masked_lm_weights,
                bert_config.vocab_size)
        else:
            # small masked language model generator
            if config.untied_generator_embeddings:
                generator_embedding_size = None
            else:
                generator_embedding_size = embedding_size
            generator = build_transformer(
                config, masked_inputs, is_training, generator_config,
                embedding_size=generator_embedding_size,
                untied_embeddings=config.untied_generator_embeddings,
                scope="generator")
            mlm_output = self._get_masked_lm_output(masked_inputs, generator)
    else:
        # full-sized masked language model generator if using BERT objective
        # or if the generator and discriminator have tied weights
        generator = build_transformer(
            config, masked_inputs, is_training, bert_config,
            embedding_size=embedding_size)
        mlm_output = self._get_masked_lm_output(masked_inputs, generator)

    fake_data = self._get_fake_data(masked_inputs, mlm_output.logits)
    self.mlm_output = mlm_output
    if config.two_tower_generator:
        generator_loss = cloze_output.loss
    else:
        generator_loss = mlm_output.loss
    self.total_loss = config.gen_weight * generator_loss

    # Discriminator
    has_discriminator = config.electra_objective or config.electric_objective
    disc_output = None
    if has_discriminator:
        discriminator = build_transformer(
            config, fake_data.inputs, is_training, bert_config,
            reuse=not config.untied_generator,
            embedding_size=embedding_size)
        disc_output = self._get_discriminator_output(
            fake_data.inputs, discriminator, fake_data.is_fake_tokens,
            cloze_output)
        self.total_loss += config.disc_weight * disc_output.loss

    # Evaluation
    eval_fn_inputs = {
        "input_ids": masked_inputs.input_ids,
        "masked_lm_preds": mlm_output.preds,
        "mlm_loss": mlm_output.per_example_loss,
        "masked_lm_ids": masked_inputs.masked_lm_ids,
        "masked_lm_weights": masked_inputs.masked_lm_weights,
        "input_mask": masked_inputs.input_mask,
    }
    if has_discriminator:
        eval_fn_inputs["disc_loss"] = disc_output.per_example_loss
        eval_fn_inputs["disc_labels"] = disc_output.labels
        eval_fn_inputs["disc_probs"] = disc_output.probs
        eval_fn_inputs["disc_preds"] = disc_output.preds
        eval_fn_inputs["sampled_tokids"] = tf.argmax(
            fake_data.sampled_tokens, -1, output_type=tf.int32)
    # Keys and values are kept in matching order so metric_fn can re-pair
    # its positional arguments with their names.
    eval_fn_keys = list(eval_fn_inputs)
    eval_fn_values = list(eval_fn_inputs.values())

    def metric_fn(*args):
        """Computes the loss and accuracy of the model."""
        d = dict(zip(eval_fn_keys, args))

        def flat(key):
            # Collapse the named tensor to rank 1.
            return tf.reshape(d[key], [-1])

        metrics = {}
        metrics["masked_lm_accuracy"] = tf.metrics.accuracy(
            labels=flat("masked_lm_ids"),
            predictions=flat("masked_lm_preds"),
            weights=flat("masked_lm_weights"))
        metrics["masked_lm_loss"] = tf.metrics.mean(
            values=flat("mlm_loss"),
            weights=flat("masked_lm_weights"))
        if not (config.electra_objective or config.electric_objective):
            return metrics

        metrics["sampled_masked_lm_accuracy"] = tf.metrics.accuracy(
            labels=flat("masked_lm_ids"),
            predictions=flat("sampled_tokids"),
            weights=flat("masked_lm_weights"))
        if config.disc_weight > 0:
            metrics["disc_loss"] = tf.metrics.mean(d["disc_loss"])
            metrics["disc_auc"] = tf.metrics.auc(
                d["disc_labels"] * d["input_mask"],
                d["disc_probs"] * tf.cast(d["input_mask"], tf.float32))
            metrics["disc_accuracy"] = tf.metrics.accuracy(
                labels=d["disc_labels"],
                predictions=d["disc_preds"],
                weights=d["input_mask"])
            # Accuracy restricted to positions predicted positive.
            metrics["disc_precision"] = tf.metrics.accuracy(
                labels=d["disc_labels"],
                predictions=d["disc_preds"],
                weights=d["disc_preds"] * d["input_mask"])
            # Accuracy restricted to positions labelled positive.
            metrics["disc_recall"] = tf.metrics.accuracy(
                labels=d["disc_labels"],
                predictions=d["disc_preds"],
                weights=d["disc_labels"] * d["input_mask"])
        return metrics

    self.eval_metrics = (metric_fn, eval_fn_values)
Z2r = tf.reshape(Z2, [Z2_shape[0], np.prod(Z2_shape[1:])]) Z3 = tf.nn.relu(tf.matmul(Z2r, w3) + b3) Yish = tf.matmul(Z3, W4) + b4 cost = tf.reduce_sum( tf.nn.sparse_softmax_cross_entropy_with_logits( logits=logits, labels=T ) ) train_op = tf.train.RMSPropOptimizer(0.0001, decay=0.99, momentum=0.9).minimize(cost) # we'll use this to calculate the error rate predict_op = tf.argmax(logits, 1) t0 = datetime.now() LL = [] init = tf.global_variables_initializer() with tf.Session() as session: session.run(init) for i in xrange(max_iter): for j in xrange(n_batches): Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),] Ybatch = Ytrain[j*batch_sz:(j*batch_sz + batch_sz),] if len(Xbatch) == batch_sz: session.run(train_op, feed_dict={X: Xbatch, T: Ybatch}) if j % print_period == 0:
def train(self, data_dir, save_model_path):
    """Train the CNN until a held-out batch exceeds 0.99 accuracy.

    Data is loaded with `self.init_data(data_dir)` and split 80/20 with a
    fixed `random_state`. Each step trains on a random batch with keep
    probability 0.75. Every 10 steps a random held-out batch is scored with
    keep probability 1.0. A checkpoint is written every 500 steps, and once
    more when training stops (accuracy above 0.99 after step 500).

    Args:
      data_dir: passed to `self.init_data`.
      save_model_path: checkpoint path prefix passed to `saver.save`.
    """
    print('ready load train dataset')
    X, y = self.init_data(data_dir)
    print('success load' + str(len(y)) + 'datas')
    train_x, test_x, train_y, test_y = train_test_split(
        X, y, test_size=0.2, random_state=0)

    out_put = self.cnn_construct()
    predicts = tf.nn.softmax(out_put)
    predicts = tf.argmax(predicts, axis=1)
    actual_y = tf.argmax(self.y_place, axis=1)
    accuracy = tf.reduce_mean(
        tf.cast(tf.equal(predicts, actual_y), dtype=tf.float32))
    cost = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(
            logits=out_put, labels=self.y_place))
    opt = tf.train.AdamOptimizer(learning_rate=0.001)
    train_step = opt.minimize(cost)

    # np.random.choice(..., replace=False) raises ValueError when asked for
    # more samples than exist. The held-out split is only 20% of the data, so
    # clamp the batch sizes instead of crashing on small datasets.
    train_batch_size = min(self.batch_size, len(train_x))
    test_batch_size = min(self.batch_size, len(test_x))

    with tf.Session() as sess:
        init = tf.global_variables_initializer()
        sess.run(init)
        step = 0
        saver = tf.train.Saver()
        while True:
            train_index = np.random.choice(
                len(train_x), train_batch_size, replace=False)
            _, loss = sess.run(
                [train_step, cost],
                feed_dict={
                    self.x_place: train_x[train_index],
                    self.y_place: train_y[train_index],
                    self.keep_place: 0.75,
                })
            step += 1
            if step % 10 != 0:
                continue

            test_index = np.random.choice(
                len(test_x), test_batch_size, replace=False)
            acc = sess.run(
                accuracy,
                feed_dict={
                    self.x_place: test_x[test_index],
                    self.y_place: test_y[test_index],
                    self.keep_place: 1.0,
                })
            print(step, loss)
            if step % 50 == 0:
                print('accuracy:' + str(acc))

            converged = acc > 0.99 and step > 500
            # Save once per step even when the periodic save and the final
            # save coincide.
            if step % 500 == 0 or converged:
                saver.save(sess, save_model_path, global_step=step)
            if converged:
                break
def k_clusters(K, v):
    """Fit a K-component model to 'data2D.npy' with Adam and plot the result.

    The loss is the negative sum over points of
    `hlp.reduce_logsumexp(log_GaussPDF(X, MU, sigma) + log_pi)`. Training runs
    for 500 full-batch Adam steps. Afterwards the points are plotted via
    `plot_sc`, followed by the training-loss curve.

    Args:
      K: number of components.
      v: when truthy, one third of the (shuffled) points is held out and its
        loss is evaluated after every training step.

    Returns:
      List of per-iteration held-out losses (empty when `v` is falsy).
    """
    # Loading data
    data = np.load('data2D.npy')
    num_pts, dim = np.shape(data)
    is_valid = v

    # Hold out a validation set.
    if is_valid:
        valid_batch = int(num_pts / 3.0)
        np.random.seed(45689)
        rnd_idx = np.arange(num_pts)
        np.random.shuffle(rnd_idx)
        val_data = data[rnd_idx[:valid_batch]]
        data = data[rnd_idx[valid_batch:]]

    np.random.seed(420)
    iterx = 500
    loss_arr = []
    loss_arr_valid = []
    arr = []

    initpi = tf.Variable(tf.random_normal([K, 1], stddev=0.05))
    lpi = tf.squeeze(hlp.logsoftmax(initpi))
    X = tf.placeholder("float", [None, dim], "X")
    init_mean = tf.random_normal([K, dim], stddev=0.05)
    MU = tf.Variable(init_mean)
    init_sigma = tf.random_normal([K, 1], stddev=0.05)
    sigma = tf.exp(tf.Variable(init_sigma))

    pdf = log_GaussPDF(X, MU, sigma)
    log_likelihood = hlp.reduce_logsumexp(pdf + lpi, 1, keep_dims=True)
    loss = -tf.reduce_sum(log_likelihood)
    adam_opt = tf.train.AdamOptimizer(
        learning_rate=0.1, beta1=0.9, beta2=0.99, epsilon=1e-5).minimize(loss)

    lpost = log_posterior(pdf, lpi)
    smax = tf.nn.softmax(lpost)
    assignments = tf.argmax(smax, 1)

    with tf.Session() as s:
        s.run(tf.global_variables_initializer())
        s.run(tf.local_variables_initializer())
        for _ in range(iterx):
            lTrain, _, arr = s.run(
                [loss, adam_opt, assignments], feed_dict={X: data})
            loss_arr.append(lTrain)
            if is_valid:
                # Evaluate only. The optimizer op must NOT be fetched here,
                # otherwise the model is also trained on the held-out points
                # and the validation loss is no longer a held-out estimate.
                lVal = s.run(loss, feed_dict={X: val_data})
                loss_arr_valid.append(lVal)

    # NOTE(review): num_pts is the point count before the validation split;
    # confirm plot_sc expects that rather than len(data).
    plot_sc(data, num_pts, arr, K)

    fig = plt.figure(1)
    plt.title('K Means Clusters K = %i' % K)
    plt.legend(loc="best")
    plt.ylabel('Y')
    plt.xlabel('X')
    if is_valid:
        lval = np.format_float_positional(np.float32(lVal))
        fig.text(.1, .0005, f'Final Validation Loss: {lval}', ha='left')
    plt.grid()
    plt.show()

    plt.figure(1)
    plt.plot(range(len(loss_arr)), loss_arr, c="g", label="training Loss")
    plt.legend(loc="best")
    plt.title('K Means')
    plt.ylabel('Loss')
    plt.xlabel('Iterations')
    plt.show()
    return loss_arr_valid
def train_crack_captcha_cnn():
    """Train the captcha CNN until a test batch exceeds 0.9 accuracy.

    Builds the loss, optimizer and accuracy ops on top of `model`, then
    trains on batches of 64 from `get_next_batch`. Every 10 steps a batch of
    100 is evaluated with dropout disabled and its summaries are written.
    When that accuracy exceeds 0.9 the model is saved as
    "crack_capcha.model" and training stops.
    """
    x = tf.reshape(X, shape=[-1, IMAGE_HEIGHT, IMAGE_WIDTH, 1])
    py_x = model(x, p_keep_conv, p_keep_hidden)

    with tf.name_scope('cost'):
        # For multi-label, multi-class problems the final layer should use a
        # sigmoid activation.
        cost = tf.reduce_mean(
            tf.nn.sigmoid_cross_entropy_with_logits(logits=py_x, labels=Y))
        # Build the training op with the Adam optimizer.
        train_op = tf.train.AdamOptimizer(0.001, 0.9).minimize(cost)
        tf.summary.scalar('cost', cost)  # Record scalar data.

    with tf.name_scope('accuracy'):
        # Compare the arg-max over the character-set axis per captcha
        # position, for predictions and labels alike.
        predict = tf.reshape(py_x, [-1, MAX_CAPTCHA, len(CHAR_SET)])
        max_idx_p = tf.argmax(predict, 2)
        max_idx_l = tf.argmax(
            tf.reshape(Y, [-1, MAX_CAPTCHA, len(CHAR_SET)]), 2)
        correct_pred = tf.equal(max_idx_p, max_idx_l)
        accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
        tf.summary.scalar('accuracy', accuracy)  # Record scalar data.

    saver = tf.train.Saver()
    with tf.Session() as sess:
        writer = tf.summary.FileWriter(
            "./人工智能实验课/logs/captcha_logs", sess.graph)
        # Close the writer on every exit path so buffered summaries are
        # flushed to disk and the event file handle is released.
        try:
            merged = tf.summary.merge_all()
            sess.run(tf.global_variables_initializer())
            step = 0
            while True:
                batch_x, batch_y = get_next_batch(64)
                _, loss = sess.run(
                    [train_op, cost],
                    feed_dict={
                        X: batch_x,
                        Y: batch_y,
                        p_keep_conv: 0.75,
                        p_keep_hidden: 0.75,
                    })
                print(step, loss)
                # Compute the accuracy once every 10 steps.
                if step % 10 == 0:
                    batch_x_test, batch_y_test = get_next_batch(100)
                    summary, acc = sess.run(
                        [merged, accuracy],
                        feed_dict={
                            X: batch_x_test,
                            Y: batch_y_test,
                            p_keep_conv: 1.0,
                            p_keep_hidden: 1.0,
                        })
                    writer.add_summary(summary, step)
                    print('准确率:', step, acc)
                    # Once accuracy exceeds 90%, save the model and finish
                    # training.
                    if acc > 0.9:
                        saver.save(
                            sess, "crack_capcha.model", global_step=step)
                        break
                step += 1
        finally:
            writer.close()
def argmax(x, axis=None):
    """Return the indices of the largest values of `x` along `axis`.

    Thin wrapper around `tf.argmax`. The `axis` keyword is used instead of
    the deprecated `dimension` alias, which newer TensorFlow releases no
    longer accept.

    Args:
      x: input tensor.
      axis: axis to reduce over; `None` is passed through to `tf.argmax`.

    Returns:
      The tensor produced by `tf.argmax`.
    """
    return tf.argmax(x, axis=axis)
def _single_column_cell_selection_loss(token_logits, column_logits, label_ids,
                                       cell_index, col_index, cell_mask):
    """Computes the cell-selection loss when selection is limited to one column.

    The loss is a hierarchical log-likelihood: the model first predicts a
    column, then selects cells inside that column (conditioned on it). Cells
    outside the chosen column are never selected.

    Args:
      token_logits: <float>[batch_size, seq_length] Logits per token.
      column_logits: <float>[batch_size, max_num_cols] Logits per column.
      label_ids: <int32>[batch_size, seq_length] Labels per token.
      cell_index: segmented_tensor.IndexMap [batch_size, seq_length] Index
        that groups tokens into cells.
      col_index: segmented_tensor.IndexMap [batch_size, seq_length] Index
        that groups tokens into columns.
      cell_mask: <float>[batch_size, max_num_rows * max_num_cols] Input mask
        per cell, 1 for cells that exist in the example and 0 for padding.

    Returns:
      selection_loss_per_example: <float>[batch_size] Loss for each example.
      logits: <float>[batch_size, seq_length] New logits which may only select
        cells in a single column. Logits outside the most likely column
        according to `column_logits` are set to a very low value (so that the
        probabilities are 0).
    """
    # Gold column: the one holding the largest number of selected cells.
    labels_per_column, _ = segmented_tensor.reduce_sum(
        tf.cast(label_ids, tf.float32), col_index)
    gold_column = tf.argmax(labels_per_column, axis=-1, output_type=tf.int32)

    # If no cell is selected at all, the model should predict the special
    # column id 0, which means "select nothing".
    nothing_selected = tf.equal(
        tf.reduce_max(labels_per_column, axis=-1), 0)
    gold_column = tf.where(
        nothing_selected, tf.zeros_like(gold_column), gold_column)

    column_distribution = tfp.distributions.Categorical(logits=column_logits)
    column_loss_per_example = -column_distribution.log_prob(gold_column)

    # Go from per-token to per-cell labels and logits.
    logits_per_cell, _ = segmented_tensor.reduce_mean(token_logits, cell_index)
    labels_per_cell, labels_index = segmented_tensor.reduce_max(
        tf.cast(label_ids, tf.int32), cell_index)

    # 1.0 for cells that sit in the gold column.
    column_id_for_cells = cell_index.project_inner(labels_index).indices
    in_gold_column = tf.equal(
        column_id_for_cells, tf.expand_dims(gold_column, axis=1))
    gold_column_mask = tf.cast(in_gold_column, tf.float32)

    # Cell log-likelihood, counted only inside the gold column.
    cell_distribution = tfp.distributions.Bernoulli(logits=logits_per_cell)
    cell_log_prob = cell_distribution.log_prob(labels_per_cell)
    summed_cell_loss = -tf.reduce_sum(
        cell_log_prob * gold_column_mask * cell_mask, axis=1)

    # Normalize by the number of (real) cells in the column.
    cells_in_column = tf.reduce_sum(gold_column_mask * cell_mask, axis=1)
    cell_loss = summed_cell_loss / (cells_in_column + _EPSILON_ZERO_DIVISION)

    # The cell term is dropped for examples with nothing selected.
    selection_loss_per_example = column_loss_per_example + tf.where(
        nothing_selected,
        tf.zeros_like(column_loss_per_example),
        cell_loss)

    # Zero out the probabilities outside the column chosen by the *model*.
    # This keeps backwards compatibility with models that select cells from
    # multiple columns.
    predicted_column = tf.argmax(
        column_logits, axis=-1, output_type=tf.int32)
    in_predicted_column = tf.equal(
        column_id_for_cells, tf.expand_dims(predicted_column, axis=-1))
    predicted_column_mask = tf.cast(in_predicted_column, tf.float32)

    # Never select cells with the special column id 0.
    predicted_column_mask = tf.where(
        tf.equal(column_id_for_cells, 0),
        tf.zeros_like(predicted_column_mask),
        predicted_column_mask)

    masked_logits_per_cell = logits_per_cell + _CLOSE_ENOUGH_TO_LOG_ZERO * (
        1.0 - cell_mask * predicted_column_mask)
    logits = segmented_tensor.gather(masked_logits_per_cell, cell_index)
    return selection_loss_per_example, logits
def map_fn(inputs):
    """Locate the largest entry of `inputs`.

    Returns:
      A pair `(indices, score)`: `indices` is what
      `tensor_utils.unravel_index_2d` yields for the flat arg-max position
      and `inputs.shape`; `score` is the value at that position.
    """
    flat_values = tf.reshape(inputs, [-1])
    best_position = tf.argmax(flat_values, output_type=tf.int32)
    best_indices = tensor_utils.unravel_index_2d(best_position, inputs.shape)
    best_score = flat_values[best_position]
    return best_indices, best_score
def model_fn(features, labels, mode, params):
    """The `model_fn` for TPUEstimator.

    Builds the table model and its classification outputs, optionally
    initializes variables from `config.init_checkpoint`, and returns the
    `TPUEstimatorSpec` matching `mode` (train, eval, or predict).
    """
    del labels  # Unused.

    tf.logging.info("*** Features ***")
    for feature_name in sorted(features):
        tf.logging.info(" name = %s, shape = %s", feature_name,
                        features[feature_name].shape)

    label_ids = features["label_ids"]
    input_mask = features["input_mask"]
    row_ids = features["row_ids"]
    column_ids = features["column_ids"]
    # Table cells only, without question tokens and table headers.
    table_mask = tf.where(
        row_ids > 0, tf.ones_like(row_ids), tf.zeros_like(row_ids))

    do_model_aggregation = config.num_aggregation_labels > 0
    if do_model_aggregation:
        aggregation_function_id = tf.squeeze(
            features["aggregation_function_id"], axis=[1])
    else:
        aggregation_function_id = None

    do_model_classification = config.num_classification_labels > 0
    if do_model_classification:
        classification_class_index = tf.squeeze(
            features["classification_class_index"], axis=[1])
    else:
        classification_class_index = None

    is_training = (mode == tf.estimator.ModeKeys.TRAIN)
    model = table_bert.create_model(
        features=features,
        mode=mode,
        bert_config=config.bert_config,
        disabled_features=config.disabled_features,
        disable_position_embeddings=config.disable_position_embeddings)

    answer = None
    numeric_values = None
    numeric_values_scale = None
    if config.use_answer_as_supervision:
        answer = tf.squeeze(features["answer"], axis=[1])
        numeric_values = features["numeric_values"]
        numeric_values_scale = features["numeric_values_scale"]

    classification_outputs = _get_classification_outputs(
        config=config,
        output_layer=model.get_sequence_output(),
        output_layer_aggregation=model.get_pooled_output(),
        label_ids=label_ids,
        input_mask=input_mask,
        table_mask=table_mask,
        aggregation_function_id=aggregation_function_id,
        answer=answer,
        numeric_values=numeric_values,
        numeric_values_scale=numeric_values_scale,
        is_training=is_training,
        row_ids=row_ids,
        column_ids=column_ids,
        classification_class_index=classification_class_index)
    (total_loss, logits, logits_aggregation, probabilities,
     logits_cls) = classification_outputs

    tvars = tf.trainable_variables()
    initialized_variable_names = {}
    scaffold_fn = None
    init_checkpoint = config.init_checkpoint
    if init_checkpoint:
        (assignment_map, initialized_variable_names
        ) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint)
        if not config.use_tpu:
            tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
        else:
            # On TPU the checkpoint restore is deferred into the scaffold.
            def tpu_scaffold():
                tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
                return tf.train.Scaffold()

            scaffold_fn = tpu_scaffold

    tf.logging.info("**** Trainable Variables ****")
    for var in tvars:
        from_checkpoint = var.name in initialized_variable_names
        init_string = ", *INIT_FROM_CKPT*" if from_checkpoint else ""
        tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape,
                        init_string)

    if mode == tf.estimator.ModeKeys.TRAIN:
        train_op = optimization.create_optimizer(
            total_loss,
            config.learning_rate,
            config.num_train_steps,
            config.num_warmup_steps,
            config.use_tpu,
            gradient_accumulation_steps=params.get(
                "gradient_accumulation_steps", 1),
            grad_clipping=config.grad_clipping)
        return tf.estimator.tpu.TPUEstimatorSpec(
            mode=mode,
            loss=total_loss,
            train_op=train_op,
            scaffold_fn=scaffold_fn)

    if mode == tf.estimator.ModeKeys.EVAL:
        metric_inputs = [
            total_loss, label_ids, logits, input_mask,
            aggregation_function_id, logits_aggregation,
            classification_class_index, logits_cls
        ]
        return tf.estimator.tpu.TPUEstimatorSpec(
            mode=mode,
            loss=total_loss,
            eval_metrics=(_calculate_eval_metrics_fn, metric_inputs),
            scaffold_fn=scaffold_fn)

    # Any other mode: prediction.
    predictions = {
        "probabilities": probabilities,
        "column_ids": features["column_ids"],
        "row_ids": features["row_ids"],
        "segment_ids": features["segment_ids"],
        "question_id_ints": features["question_id_ints"],
    }
    # TODO Remove once the data has been updated.
    if "question_id" in features:
        # Only available when predicting on GPU.
        predictions["question_id"] = features["question_id"]
    if do_model_aggregation:
        predictions["gold_aggr"] = features["aggregation_function_id"]
        predictions["pred_aggr"] = tf.argmax(
            logits_aggregation, axis=-1, output_type=tf.int32)
    if do_model_classification:
        predictions["gold_cls"] = features["classification_class_index"]
        predictions["pred_cls"] = tf.argmax(
            logits_cls, axis=-1, output_type=tf.int32)
        if config.num_classification_labels == 2:
            # Binary case: expose a single score, the logit margin.
            predictions["logits_cls"] = logits_cls[:, 1] - logits_cls[:, 0]
        else:
            predictions["logits_cls"] = logits_cls
    return tf.estimator.tpu.TPUEstimatorSpec(
        mode=mode, predictions=predictions, scaffold_fn=scaffold_fn)
def model_definition(vector_dimension,
                     label_count,
                     slot_vectors,
                     value_vectors,
                     use_delex_features=False,
                     use_softmax=True,
                     value_specific_decoder=False,
                     learn_belief_state_update=True):
    """Build the TensorFlow graph for one slot's belief tracker.

    The graph scores every candidate value of a slot against a CNN
    representation of the user utterance, adds contributions from the
    system-request and system-confirm networks, and (in softmax mode)
    combines the result with the previous belief state.

    Trainable values created here, in creation order:
      1) w_candidates / b_candidates: transform the candidate value vectors.
      2) w_joint_hidden_layer / b_joint_hidden_layer: collapse the
         utterance-candidate interaction to an intermediate vector.
      3) w_joint_presoftmax / b_joint_presoftmax: collapse that intermediate
         vector to a single score per value.
      4) sysreq_w_hidden_layer / sysreq_b_hidden_layer, plus one
         sysreq_w_softmax / sysreq_b_softmax pair per value.
         TODO: the per-value pairs are not size independent.
      5) confirm_w1_hidden_layer, confirm_b1_hidden_layer,
         confirm_w1_softmax, confirm_b1_softmax, shared across values.
         TODO: original author notes this "currently does not work".
      6) The belief-state update parameters (see the branches below).
    Whatever define_CNN_model creates for the utterance encoder comes first.

    Args:
      vector_dimension: width of the word vectors; used for the last
        dimension of the utterance/system-act placeholders and for the
        weight matrices.
      label_count: number of candidate values scored for this slot.
      slot_vectors: slot word vectors, wrapped in a constant. Only row 0 is
        read.
      value_vectors: value word vectors, wrapped in a constant. Rows
        0..label_count-1 are read. The original author's note asks for size
        [label_count + 2, 300] for both slot_vectors and value_vectors.
      use_delex_features: if true, add the utterance_representations_delex
        placeholder to the pre-softmax scores.
      use_softmax: if true, append a NONE column (label_size is
        label_count + 1), apply the belief-state update and a softmax;
        otherwise apply an element-wise sigmoid to the turn-level scores.
      value_specific_decoder: in softmax mode with a learned update, selects
        full [label_size, label_size] update matrices instead of the tied
        two-parameter ones. The extra decoder network itself is disabled.
      learn_belief_state_update: in softmax mode, learn the update matrices;
        otherwise interpolate with a single trainable coefficient.

    Returns:
      A 22-tuple, in this order: keep_prob, utterance_representations_full,
      utterance_representations_delex, system_act_slots,
      system_act_confirm_slots, system_act_confirm_values, y_, y_past_state,
      accuracy, f_score, precision, recall, num_true_positives,
      num_positives, classified_positives, y, predictions, true_predictions,
      correct_prediction, true_positives, train_step, update_coefficient.
    """
    # Single-argument print calls produce the same text on Python 2 and 3.
    print("=========================== Model declaration ===========================")

    if use_softmax:
        label_size = label_count + 1  # 1 is for NONE, dontcare is added to the ontology.
    else:
        label_size = label_count

    # These are actual NN hyperparameters that we might want to tune at some point.
    hidden_units_1 = 100
    longest_utterance_length = 40
    num_filters = 300
    # NOTE(review): the CNN output is multiplied element-wise with vectors of
    # width vector_dimension below, so this presumably requires
    # vector_dimension == num_filters -- confirm against define_CNN_model.

    print("Hidden layer size: %s Label Size: %s Use Softmax: %s Use Delex Features: %s"
          % (hidden_units_1, label_size, use_softmax, use_delex_features))

    # Full feature vector, which we want to convolve over.
    utterance_representations_full = tf.placeholder(
        tf.float32, [None, longest_utterance_length, vector_dimension])
    utterance_representations_delex = tf.placeholder(tf.float32,
                                                     [None, label_size])

    # Just slots, for requestables.
    system_act_slots = tf.placeholder(tf.float32,
                                      shape=(None, vector_dimension))
    system_act_confirm_slots = tf.placeholder(tf.float32,
                                              shape=(None, vector_dimension))
    system_act_confirm_values = tf.placeholder(tf.float32,
                                               shape=(None, vector_dimension))

    # Initial (distributional) vectors.
    W_slots = tf.constant(slot_vectors, name="W_init")
    W_values = tf.constant(value_vectors, name="W_init")

    # Target labels and the previous turn's belief state.
    y_ = tf.placeholder(tf.float32, [None, label_size])
    y_past_state = tf.placeholder(tf.float32, [None, label_size])

    # Dropout placeholder, 0.5 for training, 1.0 for validation/testing.
    keep_prob = tf.placeholder("float")

    # Unused below; kept so the set of graph ops (and their auto-generated
    # names) is the same as before.
    ones = tf.constant(1.0, dtype="float")
    zeros = tf.constant(0.0, dtype="float")

    # ======================= UTTERANCE / CANDIDATE INTERACTION =======================
    h_utterance_representation = define_CNN_model(
        utterance_representations_full, num_filters, vector_dimension,
        longest_utterance_length)

    w_candidates = tf.Variable(
        tf.random_normal([vector_dimension, vector_dimension]))
    b_candidates = tf.Variable(tf.zeros([vector_dimension]))

    # One transformed row per row of value_vectors.
    candidates_transform = tf.nn.sigmoid(
        tf.matmul(W_values, w_candidates) + b_candidates)

    # Interaction of the utterance with each value: element-wise product of
    # the utterance representation with that value's transformed vector.
    list_of_value_contributions = [
        tf.multiply(h_utterance_representation,
                    candidates_transform[value_idx, :])
        for value_idx in range(0, label_count)
    ]

    # Stack to [label_count, batch, dim], move batch first, then flatten to
    # [batch * label_count, dim] so one matmul serves every value.
    h_utterance_representation_candidate_interaction = tf.reshape(
        tf.transpose(tf.stack(list_of_value_contributions), [1, 0, 2]),
        [-1, vector_dimension])

    # The same transform now runs across each value's vector.
    w_joint_hidden_layer = tf.Variable(
        tf.random_normal([vector_dimension, hidden_units_1]))
    b_joint_hidden_layer = tf.Variable(tf.zeros([hidden_units_1]))

    hidden_layer_joint = tf.nn.sigmoid(
        tf.reshape(
            tf.matmul(h_utterance_representation_candidate_interaction,
                      w_joint_hidden_layer) + b_joint_hidden_layer,
            [-1, label_count, hidden_units_1]))
    hidden_layer_joint_with_dropout = tf.nn.dropout(hidden_layer_joint,
                                                    keep_prob)

    # Collapse each value's hidden vector to a single score:
    # [batch, label_count, hidden_units_1] -> [batch, label_count].
    w_joint_presoftmax = tf.Variable(tf.random_normal([hidden_units_1, 1]))
    b_joint_presoftmax = tf.Variable(tf.zeros([1]))

    y_presoftmax = tf.reshape(
        tf.matmul(
            tf.reshape(hidden_layer_joint_with_dropout, [-1, hidden_units_1]),
            w_joint_presoftmax) + b_joint_presoftmax, [-1, label_count])

    sysreq_contributions = []  # one contribution per value
    confirm_contributions = []  # one contribution per value

    # =================== NETWORK FOR SYSTEM REQUESTS ==========================
    # Is the current slot offered? Only multiply with slots for the requests.
    system_act_candidate_interaction = tf.multiply(W_slots[0, :],
                                                   system_act_slots)
    dot_product_sysreq = tf.reduce_mean(system_act_candidate_interaction, 1)

    # Gate the utterance representation by the per-example slot match.
    decision = tf.multiply(tf.expand_dims(dot_product_sysreq, 1),
                           h_utterance_representation)

    sysreq_w_hidden_layer = tf.Variable(
        tf.random_normal([vector_dimension, hidden_units_1]))
    sysreq_b_hidden_layer = tf.Variable(tf.zeros([hidden_units_1]))

    # Allow each value to learn to map different utterances to yes. Mainly
    # dontcare. The hidden layer weights are shared; the output layer is
    # created once per value.
    for value_idx in range(0, label_count):
        sysreq_hidden_layer_1 = tf.nn.sigmoid(
            tf.matmul(decision, sysreq_w_hidden_layer) +
            sysreq_b_hidden_layer)
        sysreq_hidden_layer_1_with_dropout = tf.nn.dropout(
            sysreq_hidden_layer_1, keep_prob)

        sysreq_w_softmax = tf.Variable(tf.random_normal([hidden_units_1, 1]))
        sysreq_b_softmax = tf.Variable(tf.zeros([1]))

        sysreq_contribution = tf.matmul(sysreq_hidden_layer_1_with_dropout,
                                        sysreq_w_softmax) + sysreq_b_softmax
        sysreq_contributions.append(sysreq_contribution)

    sysreq = tf.concat(sysreq_contributions, 1)

    # =================== NETWORK FOR CONFIRMATIONS ==========================
    # Here, we do want to tie across all values, as it will get a different
    # signal depending on whether both things match.
    confirm_w1_hidden_layer = tf.Variable(
        tf.random_normal([vector_dimension, hidden_units_1]))
    confirm_b1_hidden_layer = tf.Variable(tf.zeros([hidden_units_1]))

    confirm_w1_softmax = tf.Variable(tf.random_normal([hidden_units_1, 1]))
    confirm_b1_softmax = tf.Variable(tf.zeros([1]))

    for value_idx in range(0, label_count):
        # Product of the slot-match and value-match scores.
        dot_product = tf.multiply(
            tf.reduce_mean(
                tf.multiply(W_slots[0, :], system_act_confirm_slots), 1),
            tf.reduce_mean(
                tf.multiply(W_values[value_idx, :],
                            system_act_confirm_values), 1))

        # Turn the score into a hard 0/1 gate: 1.0 only where it equals 1.
        full_ones = tf.ones(tf.shape(dot_product))
        dot_product = tf.cast(tf.equal(dot_product, full_ones), "float32")

        decision = tf.multiply(tf.expand_dims(dot_product, 1),
                               h_utterance_representation)

        confirm_hidden_layer_1 = tf.nn.sigmoid(
            tf.matmul(decision, confirm_w1_hidden_layer) +
            confirm_b1_hidden_layer)
        confirm_hidden_layer_1_with_dropout = tf.nn.dropout(
            confirm_hidden_layer_1, keep_prob)

        confirm_contribution = tf.matmul(
            confirm_hidden_layer_1_with_dropout,
            confirm_w1_softmax) + confirm_b1_softmax
        confirm_contributions.append(confirm_contribution)

    sysconf = tf.concat(confirm_contributions, 1)

    if use_softmax:
        # Append a zero column for NONE so every score tensor is
        # [batch, label_size].
        append_zeros_none = tf.zeros([tf.shape(y_presoftmax)[0], 1])
        y_presoftmax = tf.concat([y_presoftmax, append_zeros_none], 1)

        append_zeros = tf.zeros([tf.shape(y_presoftmax)[0], 1])
        sysreq = tf.concat([sysreq, append_zeros], 1)
        sysconf = tf.concat([sysconf, append_zeros], 1)

    # Shapes agree in both modes, so the three contributions are summed
    # whether or not the NONE column was appended.
    y_presoftmax = y_presoftmax + sysconf + sysreq

    if use_delex_features:
        y_presoftmax = y_presoftmax + utterance_representations_delex

    # Value-specific decoder.
    # NOTE: deliberately disabled by the `and False`; this branch never runs,
    # so none of the variables below are created.
    if value_specific_decoder and False:
        h_utterance_representation_for_full_softmax = define_CNN_model(
            utterance_representations_full, num_filters, vector_dimension,
            longest_utterance_length)

        h_utterance_dropout = tf.nn.dropout(
            h_utterance_representation_for_full_softmax, keep_prob)

        ss_w_hidden_layer = tf.Variable(
            tf.random_normal([vector_dimension, hidden_units_1]))
        ss_b_hidden_layer = tf.Variable(tf.zeros([hidden_units_1]))

        ss_hidden_layer_1 = tf.nn.relu(
            tf.matmul(h_utterance_dropout, ss_w_hidden_layer) +
            ss_b_hidden_layer)
        ss_hidden_layer_1_with_dropout = tf.nn.dropout(ss_hidden_layer_1,
                                                       keep_prob)

        ss_w_softmax = tf.Variable(
            tf.random_normal([hidden_units_1, label_size]))
        ss_b_softmax = tf.Variable(tf.zeros([label_size]))

        ss_contribution = tf.matmul(ss_hidden_layer_1_with_dropout,
                                    ss_w_softmax) + ss_b_softmax

        y_presoftmax += ss_contribution

    # ============================ BELIEF STATE UPDATE ============================
    # As we are returning this always, it can't be null. In the learned-update
    # branches the constant is only returned, not used in the computation.
    update_coefficient = tf.constant(0.49)

    if use_softmax:
        if learn_belief_state_update:
            if value_specific_decoder:
                # Value-specific update: two full, unconstrained matrices.
                update_coefficient = tf.constant(0.8)

                ss_W_memory = tf.Variable(
                    tf.random_normal([label_size, label_size]))
                ss_W_current = tf.Variable(
                    tf.random_normal([label_size, label_size]))

                y_combine = tf.matmul(y_past_state, ss_W_memory) + tf.matmul(
                    y_presoftmax, ss_W_current)
            else:
                # Tied update: each matrix has one learned scalar on the
                # diagonal (a_*) and one learned scalar everywhere else (b_*).
                update_coefficient = tf.constant(0.7)

                a_memory = tf.Variable(tf.random_normal([1, 1]))
                diag_memory = a_memory * tf.diag(tf.ones(label_size))

                b_memory = tf.Variable(tf.random_normal([1, 1]))
                non_diag_memory = tf.matrix_set_diag(
                    b_memory * tf.ones([label_size, label_size]),
                    tf.zeros(label_size))

                W_memory = diag_memory + non_diag_memory

                a_current = tf.Variable(tf.random_normal([1, 1]))
                diag_current = a_current * tf.diag(tf.ones(label_size))

                b_current = tf.Variable(tf.random_normal([1, 1]))
                non_diag_current = tf.matrix_set_diag(
                    b_current * tf.ones([label_size, label_size]),
                    tf.zeros(label_size))

                W_current = diag_current + non_diag_current

                y_combine = tf.matmul(y_past_state, W_memory) + tf.matmul(
                    y_presoftmax, W_current)

            y = tf.nn.softmax(y_combine)
        else:
            # This code runs the baseline experiments reported in Footnote 2
            # in the paper. The trainable coefficient scales the contribution
            # of the current turn.
            update_coefficient = tf.Variable(0.5)

            y_combine = update_coefficient * y_presoftmax + (
                1 - update_coefficient) * y_past_state

            y = tf.nn.softmax(y_combine)
    else:
        # For requestables, we just have turn-level binary decisions.
        y = tf.nn.sigmoid(y_presoftmax)

    # ================================== LOSS ==================================
    if use_softmax:
        cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
            logits=y_combine, labels=y_)
    else:
        # Sum of squared errors, despite the variable name.
        cross_entropy = tf.reduce_sum(tf.square(y - y_))

    # =============================== EVALUATION ===============================
    if use_softmax:
        # argmax yields the index of the winning column for each example.
        predictions = tf.cast(tf.argmax(y, 1), "float32")
        true_predictions = tf.cast(tf.argmax(y_, 1), "float32")
        correct_prediction = tf.cast(tf.equal(predictions, true_predictions),
                                     "float")

        accuracy = tf.reduce_mean(correct_prediction)

        # NOTE(review): in this branch the tensors hold column indices rather
        # than 0/1 indicators, so the sums and products below are over
        # indices -- confirm that callers only rely on accuracy here.
        num_positives = tf.reduce_sum(true_predictions)
        classified_positives = tf.reduce_sum(predictions)
        true_positives = tf.multiply(predictions, true_predictions)
        num_true_positives = tf.reduce_sum(true_positives)

        recall = num_true_positives / num_positives
        precision = num_true_positives / classified_positives
        f_score = (2 * recall * precision) / (recall + precision)
    else:
        # Rounding gives 0/1 indicators: ones where positive.
        predictions = tf.cast(tf.round(y), "float32")
        true_predictions = tf.cast(tf.round(y_), "float32")
        correct_prediction = tf.cast(tf.equal(predictions, true_predictions),
                                     "float")

        num_positives = tf.reduce_sum(true_predictions)
        classified_positives = tf.reduce_sum(predictions)
        # One only where both the prediction and the label are positive.
        true_positives = tf.multiply(predictions, true_predictions)
        num_true_positives = tf.reduce_sum(true_positives)

        recall = num_true_positives / num_positives
        precision = num_true_positives / classified_positives
        f_score = (2 * recall * precision) / (recall + precision)

        accuracy = tf.reduce_mean(correct_prediction)

    optimizer = tf.train.AdamOptimizer(0.001)
    train_step = optimizer.minimize(cross_entropy)

    return keep_prob, utterance_representations_full, utterance_representations_delex, \
        system_act_slots, system_act_confirm_slots, system_act_confirm_values, \
        y_, y_past_state, accuracy, f_score, precision, \
        recall, num_true_positives, num_positives, classified_positives, y, \
        predictions, true_predictions, correct_prediction, true_positives, train_step, update_coefficient
# Train with plain gradient descent on the cost defined earlier in the file.
training_step = tf.train.GradientDescentOptimizer(learning_rate).minimize(
    cost_function)
print(cost_function.shape)

sess = tf.Session()
sess.run(init)

# Per-epoch histories of the test MSE and the training accuracy.
mse_history = []
accuracy_history = []
print("trainy: ", train_y.shape)
print("trainx: ", train_x.shape)

# Build the evaluation ops once, before the loop. Creating them inside the
# loop adds new nodes to the graph on every epoch, so the graph (and the time
# and memory spent on it) grows with the number of epochs. The op is kept
# under its own name so `accuracy` can hold the fetched number below.
correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
accuracy_op = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

for epoch in range(training_epochs):
    # One full-batch optimisation step, then the cost on the same batch.
    sess.run(training_step, feed_dict={x: train_x, y_: train_y})
    cost = sess.run(cost_function, feed_dict={x: train_x, y_: train_y})
    cost_history = np.append(cost_history, cost)

    # Mean squared error of the predictions on the test set, computed with
    # NumPy on the fetched array instead of through a new graph op.
    pred_y = sess.run(y, feed_dict={x: test_x})
    mse_ = np.mean(np.square(pred_y - test_y))
    mse_history.append(mse_)

    # Accuracy on the training set.
    accuracy = sess.run(accuracy_op, feed_dict={x: train_x, y_: train_y})
    accuracy_history.append(accuracy)

    print('epoch: ', epoch, ' - ', 'cost: ', cost, " - MSE: ", mse_,
          " - Train Accuracy: ", accuracy)

save_path = saver.save(sess, model_path)
print("Model saved in file: %s" % save_path)
# Test accuracy can be fetched with:
#   sess.run(accuracy_op, feed_dict={x: test_x, y_: test_y})
def _get_masked_lm_output(self, inputs: pretrain_data.Inputs, model):
    """Compute masked-LM logits, probabilities, loss and predictions.

    With `uniform_generator` set in the config, the logits are all zeros,
    broadcast to the shape of `inputs.masked_lm_ids` plus a vocabulary axis,
    and `model` is not used. Otherwise the hidden states at the masked
    positions go through a dense layer and layer norm, and are projected
    onto the model's embedding table with an added output bias.

    Args:
      inputs: provides `masked_lm_ids`, `masked_lm_positions` and
        `masked_lm_weights`.
      model: provides `get_sequence_output()` and `get_embedding_table()`;
        only read when the generator is not uniform.

    Returns:
      An `MLMOutput` namedtuple with fields `logits`, `probs`, `loss`,
      `per_example_loss` and `preds`. `loss` is the weighted sum of the
      per-position negative log-likelihoods divided by the sum of the
      weights plus 1e-6.
    """
    weights = inputs.masked_lm_weights
    vocab_size = self._bert_config.vocab_size

    with tf.variable_scope("generator_predictions"):
        if not self._config.uniform_generator:
            # Hidden states at the masked positions only.
            masked_hidden = pretrain_helpers.gather_positions(
                model.get_sequence_output(), inputs.masked_lm_positions)

            # Project to the embedding width so the embedding table can be
            # reused as the output matrix.
            embedding_width = modeling.get_shape_list(
                model.get_embedding_table())[-1]
            act_fn = modeling.get_activation(self._bert_config.hidden_act)
            kernel_init = modeling.create_initializer(
                self._bert_config.initializer_range)
            projected = tf.layers.dense(
                masked_hidden,
                units=embedding_width,
                activation=act_fn,
                kernel_initializer=kernel_init,
            )
            projected = modeling.layer_norm(projected)

            bias = tf.get_variable(
                "output_bias",
                shape=[vocab_size],
                initializer=tf.zeros_initializer(),
            )
            logits = tf.nn.bias_add(
                tf.matmul(projected,
                          model.get_embedding_table(),
                          transpose_b=True),
                bias,
            )
        else:
            # Zero logits for every masked position.
            flat_zeros = tf.zeros(vocab_size)
            tiled_shape = (modeling.get_shape_list(inputs.masked_lm_ids) +
                           [vocab_size])
            tiled = tf.zeros(tiled_shape)
            tiled += tf.reshape(flat_zeros, [1, 1, vocab_size])
            logits = tiled

        one_hot_targets = tf.one_hot(
            inputs.masked_lm_ids,
            depth=vocab_size,
            dtype=tf.float32,
        )

        probs = tf.nn.softmax(logits)
        log_probs = tf.nn.log_softmax(logits)

        # Negative log-likelihood of the target token at each position.
        per_position_nll = -tf.reduce_sum(log_probs * one_hot_targets,
                                          axis=-1)

        weighted_total = tf.reduce_sum(inputs.masked_lm_weights *
                                       per_position_nll)
        # The epsilon keeps the division finite when all weights are zero.
        weight_total = tf.reduce_sum(weights) + 1e-6
        loss = weighted_total / weight_total

        preds = tf.argmax(log_probs, axis=-1, output_type=tf.int32)

        MLMOutput = collections.namedtuple(
            "MLMOutput",
            ["logits", "probs", "loss", "per_example_loss", "preds"])
        return MLMOutput(
            logits=logits,
            probs=probs,
            per_example_loss=per_position_nll,
            loss=loss,
            preds=preds,
        )