def build_graph(reader,
                model,
                input_data_pattern,
                label_loss_fn=losses.CrossEntropyLoss(),
                batch_size=1000,
                distill_reader=None,
                transformer_class=feature_transform.DefaultTransformer):
    """Builds the forward-only graph and registers its tensors in collections.

    The model is always created with `is_training=False`. Optional
    placeholders for the dropout keep probability and the noise level are
    created only when the corresponding flags are enabled.

    Args:
      reader: Data reader forwarded to `get_input_data_tensors`; its
        `num_classes` attribute is used as the vocabulary size.
      model: Object exposing `create_model(...)` that returns a dict with at
        least a "predictions" entry.
      input_data_pattern: Forwarded to `get_input_data_tensors`.
      label_loss_fn: Not used by this function; kept for interface
        compatibility.
      batch_size: Batch size forwarded to `get_input_data_tensors`.
      distill_reader: Not used by this function; kept for interface
        compatibility.
      transformer_class: Zero-argument callable returning an object whose
        `transform(model_input_raw, num_frames=...)` yields the pair
        `(model_input, num_frames)`.
    """
    video_id, model_input_raw, labels_batch, num_frames = (
        get_input_data_tensors(reader,
                               input_data_pattern,
                               batch_size=batch_size))

    feature_transformer = transformer_class()
    model_input, num_frames = feature_transformer.transform(
        model_input_raw, num_frames=num_frames)

    with tf.name_scope("model"):
        if FLAGS.noise_level > 0:
            noise_level_tensor = tf.placeholder_with_default(
                0.0, shape=[], name="noise_level")
        else:
            noise_level_tensor = None

        if FLAGS.dropout:
            keep_prob_tensor = tf.placeholder_with_default(
                1.0, shape=[], name="keep_prob")
            result = model.create_model(model_input,
                                        num_frames=num_frames,
                                        vocab_size=reader.num_classes,
                                        labels=labels_batch,
                                        dropout=FLAGS.dropout,
                                        keep_prob=keep_prob_tensor,
                                        noise_level=noise_level_tensor,
                                        is_training=False)
        else:
            result = model.create_model(model_input,
                                        num_frames=num_frames,
                                        vocab_size=reader.num_classes,
                                        labels=labels_batch,
                                        noise_level=noise_level_tensor,
                                        is_training=False)

        # Single pre-formatted argument: prints the same text as the old
        # Python-2 statement `print "result", result` on both Python 2 and 3.
        print("result %s" % (result,))
        predictions = result["predictions"]

    tf.add_to_collection("predictions", predictions)
    tf.add_to_collection("video_id_batch", video_id)
    tf.add_to_collection("input_batch_raw", model_input_raw)
    tf.add_to_collection("input_batch", model_input)
    tf.add_to_collection("num_frames", num_frames)
    tf.add_to_collection("labels", tf.cast(labels_batch, tf.float32))
    # The optional placeholders only exist when their flag is enabled.
    if FLAGS.dropout:
        tf.add_to_collection("keep_prob", keep_prob_tensor)
    if FLAGS.noise_level > 0:
        tf.add_to_collection("noise_level", noise_level_tensor)
def create_model(self,
                 model_input,
                 vocab_size,
                 num_frames,
                 labels,
                 scope='default',
                 is_training=True,
                 **unused_params):
    """Creates a three-stage LSTM model with a skip connection.

    The third LSTM consumes the sum of the first and second LSTM outputs.
    The pooled representation is classified by the video-level model named
    by `FLAGS.video_level_classifier_model`.

    Args:
      model_input: Frame-level features fed to `tf.nn.dynamic_rnn`
        (presumably 'batch_size' x 'max_frames' x 'num_features' -- confirm
        against the caller).
      vocab_size: The number of classes, forwarded to the classifier.
      num_frames: Per-example sequence lengths passed as `sequence_length`.
      labels: Ground-truth labels, or None to skip the loss.
      scope: Name of the enclosing variable scope.
      is_training: Forwarded to the video-level classifier.
      **unused_params: Forwarded to the video-level classifier.

    Returns:
      The classifier's result dict, extended with 'features' (the pooled
      LSTM representation) and, when `labels` is given, 'loss'.
    """
    # The original body used `lstm_size` without defining it; the sibling
    # LSTM models read it from FLAGS.lstm_cells.
    lstm_size = FLAGS.lstm_cells

    # `reuse` must be passed by keyword: the second positional parameter of
    # tf.variable_scope is `default_name`, not `reuse`.
    with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
        with tf.variable_scope('lstm1', reuse=tf.AUTO_REUSE):
            lstm1 = tf.contrib.rnn.MultiRNNCell(
                [tf.contrib.rnn.BasicLSTMCell(lstm_size, forget_bias=1.0)])
            outputs1, _ = tf.nn.dynamic_rnn(lstm1,
                                            model_input,
                                            sequence_length=num_frames,
                                            dtype=tf.float32,
                                            swap_memory=True)

        with tf.variable_scope('lstm2', reuse=tf.AUTO_REUSE):
            lstm2 = tf.contrib.rnn.MultiRNNCell(
                [tf.contrib.rnn.BasicLSTMCell(lstm_size, forget_bias=1.0)])
            outputs2, _ = tf.nn.dynamic_rnn(lstm2,
                                            outputs1,
                                            sequence_length=num_frames,
                                            dtype=tf.float32,
                                            swap_memory=True)

        with tf.variable_scope('lstm3', reuse=tf.AUTO_REUSE):
            lstm3 = tf.contrib.rnn.MultiRNNCell(
                [tf.contrib.rnn.BasicLSTMCell(lstm_size, forget_bias=1.0)])
            # Skip connection: stage 3 sees the sum of stages 1 and 2.
            outputs, state = tf.nn.dynamic_rnn(lstm3,
                                               outputs2 + outputs1,
                                               sequence_length=num_frames,
                                               dtype=tf.float32,
                                               swap_memory=True)

        if FLAGS.lstm_pooling_method == 'last':
            inp = state[-1].h
        else:
            inp = utils.FramePooling(outputs, FLAGS.lstm_pooling_method)

        aggregated_model = getattr(video_level_models,
                                   FLAGS.video_level_classifier_model)
        results = aggregated_model().create_model(model_input=inp,
                                                  vocab_size=vocab_size,
                                                  is_training=is_training,
                                                  **unused_params)
        results['features'] = inp
        if labels is not None:
            results['loss'] = losses.CrossEntropyLoss().calculate_loss(
                results['predictions'], labels)
        return results
def create_model(self,
                 model_input,
                 vocab_size,
                 num_frames,
                 labels,
                 scope='default',
                 is_training=True,
                 **unused_params):
    """Creates a model which uses a stack of LSTMs to represent the video.

    Args:
      model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
        input features.
      vocab_size: The number of classes in the dataset.
      num_frames: A vector of length 'batch' which indicates the number of
        frames for each video (before padding).
      labels: Ground-truth labels, or None to skip the loss.
      scope: Name of the enclosing variable scope.
      is_training: Forwarded to the video-level classifier.
      **unused_params: Forwarded to the video-level classifier.

    Returns:
      A dictionary with a tensor containing the probability predictions of
      the model in the 'predictions' key. The dimensions of the tensor are
      'batch_size' x 'num_classes'. The dictionary also carries 'features'
      (the pooled LSTM representation) and, when `labels` is given, 'loss'.
    """
    lstm_size = FLAGS.lstm_cells
    number_of_layers = FLAGS.lstm_layers

    # `reuse` must be passed by keyword: the second positional parameter of
    # tf.variable_scope is `default_name`, not `reuse`.
    with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
        stacked_lstm = tf.contrib.rnn.MultiRNNCell([
            tf.contrib.rnn.BasicLSTMCell(lstm_size, forget_bias=1.0)
            for _ in range(number_of_layers)
        ])
        outputs, state = tf.nn.dynamic_rnn(stacked_lstm,
                                           model_input,
                                           sequence_length=num_frames,
                                           dtype=tf.float32,
                                           swap_memory=True)

        aggregated_model = getattr(video_level_models,
                                   FLAGS.video_level_classifier_model)

        if FLAGS.lstm_pooling_method == 'last':
            # Hidden state of the top layer.
            inp = state[-1].h
        else:
            inp = utils.FramePooling(outputs, FLAGS.lstm_pooling_method)

        results = aggregated_model().create_model(model_input=inp,
                                                  vocab_size=vocab_size,
                                                  is_training=is_training,
                                                  **unused_params)
        results['features'] = inp
        if labels is not None:
            results['loss'] = losses.CrossEntropyLoss().calculate_loss(
                results['predictions'], labels)
        return results
def create_model(self,
                 model_input,
                 vocab_size,
                 num_frames,
                 labels,
                 scope='default',
                 is_training=True,
                 **unused_params):
    """Creates a bidirectional stacked-LSTM model.

    The forward and backward representations are concatenated along axis 1
    and classified by the video-level model named by
    `FLAGS.video_level_classifier_model`.

    Args:
      model_input: Frame-level features fed to
        `tf.nn.bidirectional_dynamic_rnn` (presumably 'batch_size' x
        'max_frames' x 'num_features' -- confirm against the caller).
      vocab_size: The number of classes, forwarded to the classifier.
      num_frames: Per-example sequence lengths passed as `sequence_length`.
      labels: Ground-truth labels, or None to skip the loss.
      scope: Name of the enclosing variable scope.
      is_training: Forwarded to the video-level classifier.
      **unused_params: Forwarded to the video-level classifier.

    Returns:
      The classifier's result dict, extended with 'features' (the
      concatenated representation) and, when `labels` is given, 'loss'.
    """
    lstm_size = FLAGS.lstm_cells
    number_of_layers = FLAGS.lstm_layers

    # `reuse` must be passed by keyword: the second positional parameter of
    # tf.variable_scope is `default_name`, not `reuse`.
    with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
        stacked_lstm_fw = tf.contrib.rnn.MultiRNNCell([
            tf.contrib.rnn.BasicLSTMCell(lstm_size, forget_bias=1.0)
            for _ in range(number_of_layers)
        ])
        stacked_lstm_bw = tf.contrib.rnn.MultiRNNCell([
            tf.contrib.rnn.BasicLSTMCell(lstm_size, forget_bias=1.0)
            for _ in range(number_of_layers)
        ])
        outputs, state = tf.nn.bidirectional_dynamic_rnn(
            stacked_lstm_fw,
            stacked_lstm_bw,
            model_input,
            sequence_length=num_frames,
            dtype=tf.float32,
            swap_memory=True)

        # Index 0 is the forward direction, index 1 the backward one.
        if FLAGS.lstm_pooling_method == 'last':
            per_direction = [state[i][-1].h for i in range(2)]
        else:
            per_direction = [
                utils.FramePooling(outputs[0], FLAGS.lstm_pooling_method),
                utils.FramePooling(outputs[1], FLAGS.lstm_pooling_method)
            ]
        output = tf.concat(per_direction, 1)

        aggregated_model = getattr(video_level_models,
                                   FLAGS.video_level_classifier_model)
        results = aggregated_model().create_model(model_input=output,
                                                  vocab_size=vocab_size,
                                                  is_training=is_training,
                                                  **unused_params)
        results['features'] = output
        if labels is not None:
            results['loss'] = losses.CrossEntropyLoss().calculate_loss(
                results['predictions'], labels)
        return results
def create_model(self,
                 model_input,
                 vocab_size,
                 labels,
                 scope='default',
                 is_training=True,
                 **unused_params):
    """Creates a residual convolutional model topped by a MoeModel.

    The input is projected to `FLAGS.residualcnn_x` units, run through four
    convolution / residual-module stages, pooled with
    `FLAGS.residualcnn_pooling`, and added back to the projection before
    classification.

    Args:
      model_input: Input features for the initial fully connected layer
        (presumably 'batch_size' x 'num_features' -- confirm against the
        caller).
      vocab_size: The number of classes, forwarded to `MoeModel`.
      labels: Ground-truth labels, or None to skip the loss.
      scope: Name of the enclosing variable scope.
      is_training: Forwarded to every batch-norm layer.
      **unused_params: Ignored.

    Returns:
      The MoeModel result dict, extended with 'features' and, when `labels`
      is given, 'loss'.
    """
    width = FLAGS.residualcnn_x

    # `reuse` must be passed by keyword: the second positional parameter of
    # tf.variable_scope is `default_name`, not `reuse`.
    with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
        fc = slim.fully_connected(
            model_input,
            width,
            weights_regularizer=tf.contrib.layers.l2_regularizer(0.01))

        # Append two singleton axes so the projection can be convolved.
        reshaped_input = tf.expand_dims(fc, -1)
        reshaped_input = tf.expand_dims(reshaped_input, -1)

        conv1 = slim.convolution(reshaped_input, 64, [49, 1])
        conv1_norm = slim.batch_norm(conv1, is_training=is_training)
        module1 = self.residual_module([128, 192, 64], conv1_norm, 'module1')
        module1_norm = slim.batch_norm(module1, is_training=is_training)

        conv2 = slim.convolution(module1_norm, 128, 1)
        conv2_norm = slim.batch_norm(conv2, is_training=is_training)
        module2 = self.residual_module([256, 512, 128], conv2_norm, 'module2')
        module2_norm = slim.batch_norm(module2, is_training=is_training)

        conv3 = slim.convolution(module2_norm, 256, 1)
        conv3_norm = slim.batch_norm(conv3, is_training=is_training)
        module3 = self.residual_module([512, 256], conv3_norm, 'module3')
        module3_norm = slim.batch_norm(module3, is_training=is_training)

        conv4 = slim.convolution(module3_norm, width, 1)
        conv4_norm = slim.batch_norm(conv4, is_training=is_training)
        module4 = self.residual_module([512, width], conv4_norm, 'module4')

        features = tf.squeeze(module4, [2])
        # Residual connection back to the initial projection.
        features = model_utils.FramePooling(
            features, FLAGS.residualcnn_pooling) + fc

        results = MoeModel().create_model(features, vocab_size)
        results['features'] = features
        if labels is not None:
            results['loss'] = losses.CrossEntropyLoss().calculate_loss(
                results['predictions'], labels)
        return results
def create_model(self,
                 model_input,
                 vocab_size,
                 num_frames,
                 labels,
                 scope='default',
                 **unused_params):
    """Creates a two-stage LSTM where the second stage sees every other frame.

    The first LSTM runs over all frames; its outputs are sub-sampled with
    stride 2 over the first 300 time steps and fed to a second LSTM whose
    sequence length is half of `num_frames`.

    Args:
      model_input: Frame-level features fed to `tf.nn.dynamic_rnn`
        (presumably 'batch_size' x 'max_frames' x 'num_features' -- confirm
        against the caller).
      vocab_size: The number of classes, forwarded to the classifier.
      num_frames: Per-example sequence lengths (integer tensor).
      labels: Ground-truth labels, or None to skip the loss.
      scope: Name of the enclosing variable scope.
      **unused_params: Forwarded to the video-level classifier.

    Returns:
      The classifier's result dict, extended with 'features' and, when
      `labels` is given, 'loss'.
    """
    lstm_size = FLAGS.lstm_cells

    # `reuse` must be passed by keyword: the second positional parameter of
    # tf.variable_scope is `default_name`, not `reuse`.
    with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
        cells = tf.contrib.rnn.MultiRNNCell(
            [tf.contrib.rnn.BasicLSTMCell(lstm_size, forget_bias=1.0)])
        outputs1, _ = tf.nn.dynamic_rnn(cells,
                                        model_input,
                                        sequence_length=num_frames,
                                        dtype=tf.float32,
                                        swap_memory=True,
                                        scope='first')

        cells1 = tf.contrib.rnn.MultiRNNCell(
            [tf.contrib.rnn.BasicLSTMCell(lstm_size, forget_bias=1.0)])
        # Floor division keeps the sequence length an integer tensor; plain
        # `/` is true division on Python 3 and would produce floats.
        # NOTE(review): the 300-step window is hard-coded, and an odd
        # num_frames yields one more sub-sampled frame than num_frames // 2
        # covers -- confirm both are intended.
        outputs2, state2 = tf.nn.dynamic_rnn(cells1,
                                             outputs1[:, 0:300:2, :],
                                             sequence_length=num_frames // 2,
                                             dtype=tf.float32,
                                             swap_memory=True,
                                             scope='second')

        aggregated_model = getattr(video_level_models,
                                   FLAGS.video_level_classifier_model)

        if FLAGS.lstm_pooling_method == 'last':
            output = state2[-1].h
        else:
            output = utils.FramePooling(outputs2, FLAGS.lstm_pooling_method)

        results = aggregated_model().create_model(model_input=output,
                                                  vocab_size=vocab_size,
                                                  **unused_params)
        results['features'] = output
        if labels is not None:
            results['loss'] = losses.CrossEntropyLoss().calculate_loss(
                results['predictions'], labels)
        return results
def create_model(self,
                 model_input,
                 vocab_size,
                 num_frames,
                 labels,
                 scope='default',
                 is_training=True,
                 **unused_params):
    """Creates a two-branch model over the RGB and audio feature slices.

    The last axis of `model_input` is split into the first 1024 channels
    (RGB branch) and the following 128 channels (audio branch). Each branch
    is built by the frame-level model class named by
    `FLAGS.rgb_frame_level_model` / `FLAGS.audio_frame_level_model`, looked
    up in this module's globals. The branch features are summed and
    classified by the video-level model named by
    `FLAGS.video_level_classifier_model`.

    Args:
      model_input: Rank-3 tensor whose last axis holds at least 1152
        channels.
      vocab_size: The number of classes.
      num_frames: Per-example sequence lengths, forwarded to both branches.
      labels: Ground-truth labels, or None to skip the loss.
      scope: Name of the enclosing variable scope.
      is_training: Forwarded to both branches and to the final classifier.
      **unused_params: Forwarded to both branches and to the final
        classifier.

    Returns:
      A dict with 'predictions' from the final classifier and, when `labels`
      is given, 'loss': the sum of both branch losses plus six times the
      final classifier's cross-entropy loss.
    """
    results = {}

    # `reuse` must be passed by keyword: the second positional parameter of
    # tf.variable_scope is `default_name`, not `reuse`.
    with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
        rgb_input = tf.slice(model_input, [0, 0, 0], [-1, -1, 1024])
        audio_input = tf.slice(model_input, [0, 0, 1024], [-1, -1, 128])

        rgb_model = globals()[FLAGS.rgb_frame_level_model]
        audio_model = globals()[FLAGS.audio_frame_level_model]

        rgb_results = rgb_model().create_model(model_input=rgb_input,
                                               vocab_size=vocab_size,
                                               num_frames=num_frames,
                                               labels=labels,
                                               scope='rgb',
                                               is_training=is_training,
                                               **unused_params)
        # The original was missing the comma before **unused_params, which
        # turned the kwargs expansion into `is_training ** unused_params`.
        audio_results = audio_model().create_model(model_input=audio_input,
                                                   vocab_size=vocab_size,
                                                   num_frames=num_frames,
                                                   labels=labels,
                                                   scope='audio',
                                                   is_training=is_training,
                                                   **unused_params)

        if labels is not None:
            results['loss'] = rgb_results['loss'] + audio_results['loss']

        aggregated_model = getattr(video_level_models,
                                   FLAGS.video_level_classifier_model)
        features = rgb_results['features'] + audio_results['features']
        output = aggregated_model().create_model(model_input=features,
                                                 vocab_size=vocab_size,
                                                 is_training=is_training,
                                                 **unused_params)

        if labels is not None:
            # The fused prediction is weighted six times as heavily as each
            # branch loss.
            results['loss'] += 6 * losses.CrossEntropyLoss().calculate_loss(
                output['predictions'], labels)
        results['predictions'] = output['predictions']
        return results
def create_model(self,
                 model_input,
                 vocab_size,
                 num_frames,
                 labels,
                 scope='default',
                 **unused_params):
    """Creates a model which uses a logistic classifier over the average of the
    frame-level features.

    This class is intended to be an example for implementors of frame level
    models. If you want to train a model over averaged features it is more
    efficient to average them beforehand rather than on the fly.

    Args:
      model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
        input features.
      vocab_size: The number of classes in the dataset.
      num_frames: A vector of length 'batch' which indicates the number of
        frames for each video (before padding).
      labels: Ground-truth labels, or None to skip the loss.
      scope: Name of the variable scope holding the classifier weights.
      **unused_params: Ignored.

    Returns:
      A dictionary with a tensor containing the probability predictions of
      the model in the 'predictions' key. The dimensions of the tensor are
      'batch_size' x 'num_classes'. The dictionary also carries 'features'
      (the average-pooled input) and, when `labels` is given, 'loss'.
    """
    num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
    feature_size = model_input.get_shape().as_list()[2]

    # Broadcast the frame count across the feature axis so the summed
    # features can be divided element-wise.
    denominators = tf.reshape(tf.tile(num_frames, [1, feature_size]),
                              [-1, feature_size])
    avg_pooled = tf.reduce_sum(model_input, axis=[1]) / denominators

    # `reuse` must be passed by keyword: the second positional parameter of
    # tf.variable_scope is `default_name`, not `reuse`.
    with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
        output = slim.fully_connected(
            avg_pooled,
            vocab_size,
            activation_fn=tf.nn.sigmoid,
            weights_regularizer=slim.l2_regularizer(1e-8))

        results = {"predictions": output, "features": avg_pooled}
        # Match the sibling models: only build the loss when labels exist.
        if labels is not None:
            results["loss"] = losses.CrossEntropyLoss().calculate_loss(
                output, labels)
        return results
def build_model(self, model, reader):
    """Build the training graph for `model` and return a checkpoint saver.

    Every hyper-parameter is read from FLAGS; the loss is a
    `losses.CrossEntropyLoss` instance and the optimizer class is
    `tf.train.AdamOptimizer`.

    Args:
      model: The model object handed to `build_graph`.
      reader: The data reader handed to `build_graph`.

    Returns:
      A `tf.train.Saver` created with `max_to_keep=0` and
      `keep_checkpoint_every_n_hours=0.25`.
    """
    loss_fn = losses.CrossEntropyLoss()

    graph_kwargs = dict(
        reader=reader,
        model=model,
        optimizer_class=tf.train.AdamOptimizer,
        clip_gradient_norm=FLAGS.clip_gradient_norm,
        train_data_pattern=FLAGS.train_data_pattern,
        label_loss_fn=loss_fn,
        base_learning_rate=FLAGS.base_learning_rate,
        learning_rate_decay=FLAGS.learning_rate_decay,
        learning_rate_decay_examples=FLAGS.learning_rate_decay_examples,
        regularization_penalty=FLAGS.regularization_penalty,
        num_readers=FLAGS.num_readers,
        batch_size=FLAGS.batch_size,
        num_epochs=FLAGS.num_epochs,
    )
    build_graph(**graph_kwargs)

    saver = tf.train.Saver(max_to_keep=0, keep_checkpoint_every_n_hours=0.25)
    return saver
def build_graph(model,
                reader,
                train_data_pattern,
                test_data_pattern,
                label_loss_fn=losses.CrossEntropyLoss(),
                base_learning_rate=0.01,
                learning_rate_decay_steps=10000,
                learning_rate_decay=0.95,
                optimizer_class=tf.train.AdamOptimizer,
                clip_gradient_norm=1.0,
                regularization_penalty=1,
                num_readers=1,
                num_epochs=None):
    """
    Creates the Tensorflow graph with a training and a test tower that share
    variables, and registers the resulting tensors in graph collections.

    :param model: The core model (e.g. logistic or neural net). It should
        inherit from BaseModel. Its create_model result must contain
        "predictions" and "stk_embedding".
    :param reader: The data file reader. It should inherit from BaseReader.
    :param train_data_pattern: path to the train data files.
    :param test_data_pattern: path to the test data files.
    :param label_loss_fn: loss to apply to the model. It should inherit from
        BaseLoss.
    :param base_learning_rate: learning rate to initialize the optimizer with.
    :param learning_rate_decay_steps: decay_steps argument of
        tf.train.exponential_decay (staircase mode).
    :param learning_rate_decay: decay_rate argument of
        tf.train.exponential_decay.
    :param optimizer_class: Which optimization algorithm to use.
    :param clip_gradient_norm: Magnitude of the gradient to clip to.
    :param regularization_penalty: How much weight to give the regularization
        loss compared to the label loss.
    :param num_readers: forwarded to get_input_train_tensors.
    :param num_epochs: forwarded to get_input_train_tensors.
    :return: None. Results are exposed through tf collections.
    """
    global_step = tf.Variable(0, trainable=False, name="global_step")
    learning_rate = tf.train.exponential_decay(base_learning_rate,
                                               global_step,
                                               learning_rate_decay_steps,
                                               learning_rate_decay,
                                               staircase=True)
    optimizer = optimizer_class(learning_rate)

    _, train_stk_input, train_stk_label = get_input_train_tensors(
        reader,
        train_data_pattern,
        batch_size=FLAGS.train_batch_size,
        num_readers=num_readers,
        num_epochs=num_epochs)
    _, test_stk_input, test_stk_label = get_input_test_tensors(
        reader,
        test_data_pattern,
        batch_size=FLAGS.test_batch_size,
        num_readers=1)

    # Normalize along the last axis of each input.
    train_stk_feature_dim = len(train_stk_input.get_shape()) - 1
    test_stk_feature_dim = len(test_stk_input.get_shape()) - 1
    assert train_stk_feature_dim == test_stk_feature_dim

    train_stk_model_input = tf.nn.l2_normalize(train_stk_input,
                                               train_stk_feature_dim)
    test_stk_model_input = tf.nn.l2_normalize(test_stk_input,
                                              test_stk_feature_dim)

    with tf.variable_scope("model", reuse=tf.AUTO_REUSE):
        # Both towers are built in the same scope so they share variables.
        train_result = model.create_model(train_stk_model_input)
        test_result = model.create_model(test_stk_model_input)

        train_predictions = train_result["predictions"]
        test_predictions = test_result["predictions"]
        stk_embedding = test_result["stk_embedding"]

        if "loss" in train_result:
            train_loss = train_result["loss"]
        else:
            train_loss = label_loss_fn.calculate_loss(train_predictions,
                                                      train_stk_label)

        train_aux_loss = tf.constant(0.0)
        if "aux_predictions" in train_result:
            for pred in train_result["aux_predictions"]:
                # Auxiliary predictions come from the training tower, so
                # they are scored against the training labels (the original
                # used the test labels here).
                train_aux_loss += label_loss_fn.calculate_loss(
                    pred, train_stk_label)

        if "regularization_loss" in train_result:
            train_reg_loss = train_result["regularization_loss"]
        else:
            train_reg_loss = tf.constant(0.0)

        train_reg_losses = tf.losses.get_regularization_losses()
        if train_reg_losses:
            train_reg_loss += tf.add_n(train_reg_losses)

        if "loss" in test_result:
            test_loss = test_result["loss"]
        else:
            test_loss = label_loss_fn.calculate_loss(test_predictions,
                                                     test_stk_label)

        # Adds update_ops as a dependency to the train_op.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        if "update_ops" in train_result:
            update_ops += train_result["update_ops"]
        if update_ops:
            with tf.control_dependencies(update_ops):
                barrier = tf.no_op(name="gradient_barrier")
                with tf.control_dependencies([barrier]):
                    train_loss = tf.identity(train_loss)
                    train_aux_loss = tf.identity(train_aux_loss)

        # Incorporate the L2 weight penalties etc.
        train_final_loss = (regularization_penalty * train_reg_loss +
                            train_loss + train_aux_loss)
        train_op = slim.learning.create_train_op(
            train_final_loss,
            optimizer,
            global_step=global_step,
            clip_gradient_norm=clip_gradient_norm)

        tf.add_to_collection("global_step", global_step)
        tf.add_to_collection("train_loss", train_loss)
        tf.add_to_collection("test_top_loss", test_loss)
        tf.add_to_collection("train_predictions", train_predictions)
        tf.add_to_collection("test_predictions", test_predictions)
        tf.add_to_collection("train_stk_input", train_stk_input)
        tf.add_to_collection("train_stk_model_input", train_stk_model_input)
        tf.add_to_collection("test_stk_input", test_stk_input)
        tf.add_to_collection("test_stk_model_input", test_stk_model_input)
        tf.add_to_collection("train_stk_label",
                             tf.cast(train_stk_label, tf.float32))
        tf.add_to_collection("test_stk_label",
                             tf.cast(test_stk_label, tf.float32))
        tf.add_to_collection("stk_embedding", stk_embedding)
        tf.add_to_collection("train_op", train_op)
def build_model(self):
    """Builds the model.

    Instantiates the caption model class named by `FLAGS.model`, runs its
    `create_model`, and wires up either the inference outputs or the
    training/eval losses depending on `self.mode`.

    Inputs:
      self.input_seqs
      self.image_model_output
      self.initializer
      self.mode
      self.global_step
      self.target_seqs (training and eval only)
      self.input_mask (training and eval only)
      self.target_lengths (training and eval only)

    Outputs:
      self.total_loss (training and eval only)
      self.target_cross_entropy_losses (training and eval only)
      self.target_cross_entropy_loss_weights (training and eval only)
      self.attributes_loss, self.discriminative_loss, self.word_loss
        (training and eval only, when the model emits the matching outputs)
      self.predicted_ids, self.scores, self.predicted_ids_lengths,
        self.top_n_attributes (inference only, when the model emits the
        matching outputs)
    """
    caption_model_fn = find_class_by_name(FLAGS.model, [im2txt_models])
    caption_model = caption_model_fn()

    # model
    outputs = caption_model.create_model(
        input_seqs=self.input_seqs,
        image_model_output=self.image_model_output,
        initializer=self.initializer,
        mode=self.mode,
        target_seqs=self.target_seqs,
        global_step=self.global_step,
        input_mask=self.input_mask,
        target_lengths=self.target_lengths)

    # loss
    if self.mode == "inference":
        if "logits" in outputs:
            tf.nn.softmax(outputs["logits"], name="softmax")
        elif "bs_results" in outputs:
            self.predicted_ids = outputs["bs_results"].predicted_ids
            self.scores = outputs[
                "bs_results"].beam_search_decoder_output.scores
        if "bs_results_lengths" in outputs:
            self.predicted_ids_lengths = outputs["bs_results_lengths"]
        if "top_n_attributes" in outputs:
            self.top_n_attributes = outputs["top_n_attributes"]
    else:
        if "mle_caption_logits" in outputs:
            logits = tf.reshape(outputs["mle_caption_logits"],
                                [-1, FLAGS.vocab_size])
            targets = tf.reshape(self.target_seqs, [-1])
            weights = tf.to_float(tf.reshape(self.input_mask, [-1]))

            # Compute losses.
            mle_loss_fn = losses.SparseSoftmaxCrossEntropyLoss()
            mle_loss = mle_loss_fn.calculate_loss(logits, targets, weights)

            # Logging losses.
            tf.summary.scalar("losses/mle_loss", mle_loss)
            tf.losses.add_loss(mle_loss)

        # caption loss
        if FLAGS.rl_training:
            # rl loss
            # load greedy caption and sample caption to calculate reward
            target_caption_words = self.target_seqs
            target_caption_lengths = self.target_lengths
            greedy_caption_words = outputs["greedy_caption_words"]
            greedy_caption_lengths = outputs["greedy_caption_lengths"]
            sample_caption_logits = outputs["sample_caption_logits"]
            sample_caption_words = outputs["sample_caption_words"]
            sample_caption_lengths = outputs["sample_caption_lengths"]

            # Give the targets an extra axis at position 1 when missing.
            if get_rank(target_caption_words) == 2:
                target_caption_words = tf.expand_dims(
                    target_caption_words, 1)
            if get_rank(target_caption_lengths) == 1:
                target_caption_lengths = tf.expand_dims(
                    target_caption_lengths, 1)

            # Pad or truncate tensors whose length axis is statically
            # unknown.
            if get_shape_as_list(target_caption_words)[-1] is None:
                target_caption_words, target_caption_lengths = \
                    pad_or_truncate(target_caption_words,
                                    target_caption_lengths,
                                    axis=-1,
                                    max_length=FLAGS.max_ref_length)
            if get_shape_as_list(greedy_caption_words)[-1] is None:
                greedy_caption_words, greedy_caption_lengths = \
                    pad_or_truncate(greedy_caption_words,
                                    greedy_caption_lengths,
                                    axis=-1,
                                    max_length=FLAGS.max_caption_length)
            if get_shape_as_list(sample_caption_logits)[1] is None:
                sample_caption_logits, _ = \
                    pad_or_truncate(sample_caption_logits,
                                    sample_caption_lengths,
                                    axis=1,
                                    max_length=FLAGS.max_caption_length)
            if get_shape_as_list(sample_caption_words)[-1] is None:
                sample_caption_words, sample_caption_lengths = \
                    pad_or_truncate(sample_caption_words,
                                    sample_caption_lengths,
                                    axis=-1,
                                    max_length=FLAGS.max_caption_length)

            if FLAGS.rl_beam_search_approximation:
                target_caption_words = tf.contrib.seq2seq.tile_batch(
                    target_caption_words, multiplier=FLAGS.beam_width)
                target_caption_lengths = tf.contrib.seq2seq.tile_batch(
                    target_caption_lengths, multiplier=FLAGS.beam_width)

            rl_loss_cls = find_class_by_name(FLAGS.rl_training_loss,
                                             [losses])
            rl_loss_fn = rl_loss_cls()
            rl_loss = rl_loss_fn.calculate_loss(
                target_caption_words=target_caption_words,
                target_caption_lengths=target_caption_lengths,
                greedy_caption_words=greedy_caption_words,
                greedy_caption_lengths=greedy_caption_lengths,
                sample_caption_words=sample_caption_words,
                sample_caption_lengths=sample_caption_lengths,
                sample_caption_logits=sample_caption_logits)
            tf.losses.add_loss(rl_loss)
        else:
            if "logits" in outputs:
                # prepare logits, targets and weight
                logits = outputs["logits"]
                logits = tf.reshape(
                    logits, [FLAGS.batch_size, -1, FLAGS.vocab_size])
                logits, _ = pad_or_truncate(
                    logits, None, axis=1, max_length=FLAGS.max_ref_length)
                logits = tf.reshape(logits, [-1, FLAGS.vocab_size])

                targets = tf.reshape(self.target_seqs, [-1])
                weights = tf.to_float(tf.reshape(self.input_mask, [-1]))

                # Compute losses.
                loss_fn = losses.SparseSoftmaxCrossEntropyLoss()
                batch_loss = loss_fn.calculate_loss(logits, targets, weights)

                # Logging losses.
                tf.summary.scalar("losses/batch_loss", batch_loss)
                tf.losses.add_loss(batch_loss)

                # Used in evaluation.
                self.target_cross_entropy_losses = batch_loss
                self.target_cross_entropy_loss_weights = weights

        # multi-label-loss
        if "attributes_logits" in outputs and "attributes_mask" in outputs:
            attributes_logits = outputs["attributes_logits"]
            # The original read the local `attributes_mask` here before it
            # was assigned (UnboundLocalError).
            # NOTE(review): targets are now derived from the plain
            # "attributes_mask" output; confirm that this, rather than the
            # IDF-weighted mask, is what get_attributes_target expects.
            attributes_targets = get_attributes_target(
                self.target_seqs, outputs["attributes_mask"])

            if FLAGS.use_idf_weighted_attribute_loss:
                attributes_mask = outputs["idf_weighted_mask"]
            else:
                attributes_mask = outputs["attributes_mask"]

            attributes_loss_fn = losses.CrossEntropyLoss()
            attributes_loss = attributes_loss_fn.calculate_loss(
                attributes_logits, attributes_targets, attributes_mask)
            tf.losses.add_loss(attributes_loss)
            tf.summary.scalar("losses/attributes_loss", attributes_loss)
            self.attributes_loss = attributes_loss

        # discriminative loss
        # should be multi-label margin loss, but the loss below is a little
        # different
        if "discriminative_logits" in outputs:
            word_labels = caption_to_multi_labels(self.target_seqs)
            discriminative_loss = tf.losses.hinge_loss(
                labels=word_labels,
                logits=outputs["discriminative_logits"],
                weights=FLAGS.discriminative_loss_weights)
            tf.summary.scalar("losses/discriminative_loss",
                              discriminative_loss)
            self.discriminative_loss = discriminative_loss

        # word weighted cross entropy loss
        if "word_predictions" in outputs:
            word_loss_fn = losses.CrossEntropyLoss()
            word_loss = word_loss_fn.calculate_loss(
                outputs["word_predictions"],
                caption_to_multi_labels(self.target_seqs))
            tf.summary.scalar("losses/word_loss", word_loss)
            tf.losses.add_loss(word_loss)
            self.word_loss = word_loss

        total_loss = tf.losses.get_total_loss()

        # Add summaries.
        tf.summary.scalar("losses/total_loss", total_loss)
        for var in tf.trainable_variables():
            tf.summary.histogram("parameters/" + var.op.name, var)

        self.total_loss = total_loss
def create_model(self,
                 model_input,
                 vocab_size,
                 labels,
                 scope='default',
                 is_training=True,
                 **unused_params):
    """Creates an inception-style convolutional model with two auxiliary heads.

    Three MoeModel classifiers are attached: one to the final inception
    module (5b) and one each to the intermediate modules 4a and 4d. Only the
    final head's predictions are returned; the auxiliary heads contribute to
    the loss.

    Args:
      model_input: Input features; two singleton axes are appended before
        the first convolution (presumably 'batch_size' x 'num_features' --
        confirm against the caller).
      vocab_size: The number of classes, forwarded to each MoeModel.
      labels: Ground-truth labels, or None to skip the loss.
      scope: Name of the enclosing variable scope.
      is_training: Not used by this body.
      **unused_params: Ignored.

    Returns:
      The final MoeModel result dict, extended with 'features' (the pooled
      5b output) and, when `labels` is given, 'loss': the sum of the
      cross-entropy losses of all three heads.
    """
    # `reuse` must be passed by keyword: the second positional parameter of
    # tf.variable_scope is `default_name`, not `reuse`.
    with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
        reshaped_input = tf.expand_dims(model_input, -1)
        reshaped_input = tf.expand_dims(reshaped_input, -1)

        # Stem.
        conv1 = slim.convolution(reshaped_input, 64, [49, 1], stride=(4, 1))
        max_pool1 = slim.max_pool2d(conv1, (9, 1), (2, 1), padding='SAME')
        norm1 = tf.nn.local_response_normalization(max_pool1)
        conv2 = slim.convolution(norm1, 64, 1, 1)
        conv3 = slim.convolution(conv2, 192, (9, 1), 1)
        norm2 = tf.nn.local_response_normalization(conv3)
        max_pool2 = slim.max_pool2d(norm2, (9, 1), (2, 1), padding='SAME')

        # Inception stage 3.
        inception3a = self.inception_module(
            max_pool2, [64, 96, 128, 16, 32, 32], '3a')
        inception3b = self.inception_module(
            inception3a, [128, 128, 192, 32, 96, 64], '3b')
        max_pool3 = slim.max_pool2d(inception3b, (9, 1), (2, 1),
                                    padding='SAME')

        # Inception stage 4.
        inception4a = self.inception_module(
            max_pool3, [192, 96, 208, 16, 48, 64], '4a')
        inception4b = self.inception_module(
            inception4a, [160, 112, 224, 24, 64, 64], '4b')
        inception4c = self.inception_module(
            inception4b, [128, 128, 256, 24, 64, 64], '4c')
        inception4d = self.inception_module(
            inception4c, [112, 144, 288, 32, 64, 64], '4d')
        inception4e = self.inception_module(
            inception4d, [256, 160, 320, 32, 128, 128], '4e')
        max_pool4 = slim.max_pool2d(inception4e, (9, 1), (2, 1),
                                    padding='SAME')

        # Inception stage 5.
        inception5a = self.inception_module(
            max_pool4, [256, 160, 320, 32, 128, 128], '5a')
        inception5b = self.inception_module(
            inception5a, [384, 192, 384, 48, 128, 128], '5b')

        inter1 = tf.squeeze(inception4a, axis=[2])
        inter2 = tf.squeeze(inception4d, axis=[2])
        output = tf.squeeze(inception5b, axis=[2])

        inter1 = model_utils.FramePooling(inter1, FLAGS.googlenet_pooling)
        inter2 = model_utils.FramePooling(inter2, FLAGS.googlenet_pooling)
        output = model_utils.FramePooling(output, FLAGS.googlenet_pooling)

        # NOTE(review): the third argument is positional; confirm it binds
        # to MoeModel.create_model's scope parameter.
        inter_results1 = MoeModel().create_model(inter1, vocab_size, 'inter1')
        inter_results2 = MoeModel().create_model(inter2, vocab_size, 'inter2')
        results = MoeModel().create_model(output, vocab_size, 'final')
        results['features'] = output

        if labels is not None:
            results['loss'] = losses.CrossEntropyLoss().calculate_loss(
                results['predictions'], labels)
            results['loss'] += losses.CrossEntropyLoss().calculate_loss(
                inter_results1['predictions'], labels)
            results['loss'] += losses.CrossEntropyLoss().calculate_loss(
                inter_results2['predictions'], labels)
        return results
def build_graph(reader,
                model,
                train_data_pattern,
                label_loss_fn=losses.CrossEntropyLoss(),
                batch_size=1000,
                base_learning_rate=0.01,
                learning_rate_decay_examples=1000000,
                learning_rate_decay=0.95,
                optimizer_class=tf.train.AdamOptimizer,
                clip_gradient_norm=1.0,
                regularization_penalty=1,
                num_readers=1,
                num_epochs=None):
    """Assemble the training graph and expose its tensors via collections.

    Args:
      reader: Data reader handed to `get_input_data_tensors`; its
        `num_classes` attribute is used as the vocabulary size.
      model: Object whose `create_model` returns a dict with at least a
        "predictions" entry.
      train_data_pattern: Forwarded to `get_input_data_tensors`.
      label_loss_fn: Loss used when the model result has no "loss" entry.
      batch_size: Examples per step; also scales the decay schedule.
      base_learning_rate: Initial learning rate.
      learning_rate_decay_examples: Number of examples per decay step.
      learning_rate_decay: Multiplicative decay factor.
      optimizer_class: Optimizer constructor taking the learning rate.
      clip_gradient_norm: Forwarded to `slim.learning.create_train_op`.
      regularization_penalty: Weight of the regularization loss.
      num_readers: Forwarded to `get_input_data_tensors`.
      num_epochs: Forwarded to `get_input_data_tensors`.
    """
    global_step = tf.Variable(0, trainable=False, name="global_step")

    # The schedule is expressed in examples seen, not in steps.
    examples_seen = global_step * batch_size
    learning_rate = tf.train.exponential_decay(base_learning_rate,
                                               examples_seen,
                                               learning_rate_decay_examples,
                                               learning_rate_decay,
                                               staircase=True)
    tf.summary.scalar('learning_rate', learning_rate)
    optimizer = optimizer_class(learning_rate)

    input_tensors = get_input_data_tensors(reader,
                                           train_data_pattern,
                                           batch_size=batch_size,
                                           num_readers=num_readers,
                                           num_epochs=num_epochs)
    _, model_input_raw, labels_batch, num_frames = input_tensors
    tf.summary.histogram("model/input_raw", model_input_raw)

    # L2-normalize along the last axis of the raw input.
    last_axis = len(model_input_raw.get_shape()) - 1
    model_input = tf.nn.l2_normalize(model_input_raw, last_axis)

    with tf.name_scope("model"):
        result = model.create_model(model_input,
                                    num_frames=num_frames,
                                    vocab_size=reader.num_classes,
                                    labels=labels_batch)

        for model_var in slim.get_model_variables():
            tf.summary.histogram(model_var.op.name, model_var)

        predictions = result["predictions"]

        # Prefer a loss supplied by the model itself.
        if "loss" in result:
            label_loss = result["loss"]
        else:
            label_loss = label_loss_fn.calculate_loss(predictions,
                                                      labels_batch)
        tf.summary.scalar("label_loss", label_loss)

        if "regularization_loss" in result:
            reg_loss = result["regularization_loss"]
        else:
            reg_loss = tf.constant(0.0)

        collected_reg_losses = tf.losses.get_regularization_losses()
        if collected_reg_losses:
            reg_loss += tf.add_n(collected_reg_losses)

        if regularization_penalty != 0:
            tf.summary.scalar("reg_loss", reg_loss)

        # Make update ops (e.g. batch-norm moving averages) run before the
        # loss is consumed by the train op.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        if "update_ops" in result:
            update_ops.extend(result["update_ops"])
        if update_ops:
            with tf.control_dependencies(update_ops):
                barrier = tf.no_op(name="gradient_barrier")
                with tf.control_dependencies([barrier]):
                    label_loss = tf.identity(label_loss)

        # Total objective: weighted regularization plus the label loss.
        weighted_reg_loss = regularization_penalty * reg_loss
        final_loss = weighted_reg_loss + label_loss
        train_op = slim.learning.create_train_op(
            final_loss,
            optimizer,
            global_step=global_step,
            clip_gradient_norm=clip_gradient_norm)

        exported = [
            ("global_step", global_step),
            ("loss", label_loss),
            ("predictions", predictions),
            ("input_batch_raw", model_input_raw),
            ("input_batch", model_input),
            ("num_frames", num_frames),
            ("labels", tf.cast(labels_batch, tf.float32)),
            ("train_op", train_op),
        ]
        for collection_name, tensor in exported:
            tf.add_to_collection(collection_name, tensor)
def create_model(self,
                 model_input,
                 vocab_size,
                 labels=None,
                 scope='default',
                 l2_penalty=1e-8,
                 is_training=True,
                 **unused_params):
    """Build a convolutional autoencoder whose code feeds a MoeModel.

    Args:
      model_input: Input features; two singleton axes are appended before
        the first convolution (presumably 'batch_size' x 'num_features' --
        confirm against the caller).
      vocab_size: The number of classes, forwarded to `MoeModel`.
      labels: Optional ground-truth labels; when given, the classification
        loss is added to the reconstruction loss.
      scope: Not used by this body.
      l2_penalty: Not used by this body.
      is_training: Forwarded to every batch-normalization layer.
      **unused_params: Ignored.

    Returns:
      A dict with 'predictions' (from the MoeModel on the encoded tensor)
      and 'loss' (500 x reconstruction MSE, plus cross-entropy when `labels`
      is given).
    """

    def relu_conv(inputs, filters, kernel_size, **conv_kwargs):
        # ReLU convolution with a fresh Xavier initializer per layer.
        return tf.layers.conv2d(
            inputs,
            filters,
            kernel_size,
            activation=tf.nn.relu,
            kernel_initializer=tf.contrib.layers.xavier_initializer(),
            **conv_kwargs)

    def upsample(inputs):
        # Transposed convolution that doubles the first spatial axis.
        return tf.layers.conv2d_transpose(
            inputs,
            256, (8, 1),
            strides=(2, 1),
            padding='same',
            activation=tf.nn.relu,
            kernel_initializer=tf.contrib.layers.xavier_initializer())

    # Encoder: three conv -> batch-norm -> max-pool stages.
    net = tf.expand_dims(tf.expand_dims(model_input, -1), -1)
    for num_filters in (128, 256, 256):
        net = relu_conv(net, num_filters, (32, 1), padding='same')
        net = tf.layers.batch_normalization(net, training=is_training)
        net = tf.layers.max_pooling2d(net, (8, 1), 2, padding='same')
    encoded = tf.reduce_max(net, axis=[2, 3])

    # Decoder: three conv -> batch-norm -> upsample stages; stages two and
    # three are preceded by a 1x1 bottleneck convolution.
    net = tf.expand_dims(tf.expand_dims(encoded, -1), -1)
    for stage in range(3):
        if stage > 0:
            net = relu_conv(net, 64, 1)
        net = relu_conv(net, 64, (4, 1), padding='same')
        net = tf.layers.batch_normalization(net, training=is_training)
        net = upsample(net)
    decoded = tf.reduce_max(net, axis=[2, 3])

    reconstruction_loss = 500 * tf.losses.mean_squared_error(
        model_input, decoded)
    results = {'loss': reconstruction_loss}

    classifier_output = MoeModel().create_model(encoded, vocab_size)
    results['predictions'] = classifier_output['predictions']

    if labels is None:
        return results
    results['loss'] += losses.CrossEntropyLoss().calculate_loss(
        results['predictions'], labels)
    return results
def create_model(self,
                 model_input,
                 vocab_size,
                 labels,
                 scope='default',
                 l2_penalty=1e-8,
                 is_training=True,
                 **unused_params):
    """Build a fully connected autoencoder with two MoeModel heads.

    One head reads the 384-unit code, the other the first decoder layer
    (before dropout); their predictions are averaged.

    Args:
      model_input: Input features; `model_input.shape[1]` sets the width of
        the reconstruction layer.
      vocab_size: The number of classes, forwarded to each `MoeModel`.
      labels: Ground-truth labels, or None to skip the prediction loss.
      scope: Name of the enclosing variable scope (opened with AUTO_REUSE).
      l2_penalty: Strength of the L2 regularizer on every dense layer.
      is_training: Forwarded to every dropout layer.
      **unused_params: Ignored.

    Returns:
      A dict with 'predictions' (mean of both heads) and 'loss' (500 x
      reconstruction MSE, plus cross-entropy when `labels` is given).
    """

    def dense(inputs, units):
        # Fully connected layer with a fresh L2 regularizer.
        return slim.fully_connected(
            inputs,
            units,
            weights_regularizer=slim.l2_regularizer(l2_penalty))

    def drop(inputs):
        return slim.dropout(inputs, is_training=is_training)

    with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
        # Encoder.
        hidden = drop(dense(model_input, 768))
        hidden = drop(dense(hidden, 512))
        encoded = dense(hidden, 384)

        # Decoder.
        decoder_first = dense(encoded, 640)
        hidden = dense(drop(decoder_first), 768)
        decoded = dense(hidden, int(model_input.shape[1]))

        encoder_loss = 500 * tf.losses.mean_squared_error(
            model_input, decoded)
        tf.summary.scalar("encoder_loss", encoder_loss)
        results = {'loss': encoder_loss}

        code_head = MoeModel().create_model(encoded,
                                            vocab_size,
                                            scope="final_layer")
        decoder_head = MoeModel().create_model(decoder_first,
                                               vocab_size,
                                               scope="intermediate_layer")
        summed_predictions = (code_head['predictions'] +
                              decoder_head['predictions'])
        results['predictions'] = summed_predictions / 2

        if labels is None:
            return results

        prediction_loss = losses.CrossEntropyLoss().calculate_loss(
            results['predictions'], labels)
        tf.summary.scalar("prediction_loss", prediction_loss)
        results['loss'] += prediction_loss
        return results
def build_graph(reader,
                model,
                train_data_pattern,
                label_loss_fn=losses.CrossEntropyLoss(),
                batch_size=1000,
                base_learning_rate=0.01,
                learning_rate_decay_examples=1000000,
                learning_rate_decay=0.95,
                optimizer_class=tf.train.AdamOptimizer,
                clip_gradient_norm=1.0,
                regularization_penalty=1,
                num_readers=1,
                num_epochs=None):
    """Assembles the training graph and registers its endpoints.

    This is meant to run once per training model: after the graph exists the
    model is restored from a meta graph file instead of being rebuilt.

    Args:
      reader: The data file reader. It should inherit from BaseReader.
      model: The core model (e.g. logistic or neural net). It should inherit
        from BaseModel.
      train_data_pattern: Glob path to the training data files.
      label_loss_fn: Loss applied when the model does not return its own
        'loss'. It should inherit from BaseLoss.
      batch_size: How many examples to process at a time.
      base_learning_rate: Initial learning rate of the optimizer.
      learning_rate_decay_examples: Number of examples between two learning
        rate decay steps.
      learning_rate_decay: Multiplicative factor applied at each decay step.
      optimizer_class: Which optimization algorithm to use.
      clip_gradient_norm: Magnitude of the gradient to clip to.
      regularization_penalty: Weight of the regularization loss relative to
        the label loss.
      num_readers: How many threads to use for I/O operations.
      num_epochs: How many passes to make over the data. 'None' means an
        unlimited number of passes.

    Returns:
      None. The tensors and ops are published through graph collections:
      "global_step", "loss", "predictions", "input_batch_raw", "input_batch",
      "num_frames", "labels" and "train_op".
    """
    step = tf.Variable(0, trainable=False, name="global_step")

    # The decay schedule is expressed in examples seen, not in steps.
    examples_seen = step * batch_size
    decayed_rate = tf.train.exponential_decay(base_learning_rate,
                                              examples_seen,
                                              learning_rate_decay_examples,
                                              learning_rate_decay,
                                              staircase=True)
    tf.summary.scalar('learning_rate', decayed_rate)
    optimizer = optimizer_class(decayed_rate)

    input_tensors = get_input_data_tensors(reader,
                                           train_data_pattern,
                                           batch_size=batch_size,
                                           num_readers=num_readers,
                                           num_epochs=num_epochs)
    unused_video_id, model_input_raw, labels_batch, num_frames = input_tensors
    tf.summary.histogram("model/input_raw", model_input_raw)

    # L2-normalize along the last (feature) axis.
    last_axis = len(model_input_raw.get_shape()) - 1
    model_input = tf.nn.l2_normalize(model_input_raw, last_axis)

    with tf.name_scope("model"):
        result = model.create_model(model_input,
                                    num_frames=num_frames,
                                    vocab_size=reader.num_classes,
                                    labels=labels_batch)

        for model_var in slim.get_model_variables():
            tf.summary.histogram(model_var.op.name, model_var)

        predictions = result["predictions"]

        # A loss supplied by the model takes precedence over label_loss_fn.
        label_loss = (result["loss"] if "loss" in result else
                      label_loss_fn.calculate_loss(predictions, labels_batch))
        tf.summary.scalar("label_loss", label_loss)

        reg_loss = (result["regularization_loss"]
                    if "regularization_loss" in result else tf.constant(0.0))
        collected_reg_losses = tf.losses.get_regularization_losses()
        if collected_reg_losses:
            reg_loss = reg_loss + tf.add_n(collected_reg_losses)

        if regularization_penalty != 0:
            tf.summary.scalar("reg_loss", reg_loss)

        # Make the loss depend on the update ops (e.g. batch-normalization
        # moving averages) so that they run with every training step.
        pending_updates = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        if "update_ops" in result:
            pending_updates.extend(result["update_ops"])
        if pending_updates:
            with tf.control_dependencies(pending_updates):
                barrier = tf.no_op(name="gradient_barrier")
                with tf.control_dependencies([barrier]):
                    label_loss = tf.identity(label_loss)

        # Incorporate the L2 weight penalties etc.
        total_loss = regularization_penalty * reg_loss + label_loss
        train_op = slim.learning.create_train_op(
            total_loss,
            optimizer,
            global_step=step,
            clip_gradient_norm=clip_gradient_norm)

        exported = [
            ("global_step", step),
            ("loss", label_loss),
            ("predictions", predictions),
            ("input_batch_raw", model_input_raw),
            ("input_batch", model_input),
            ("num_frames", num_frames),
            ("labels", tf.cast(labels_batch, tf.float32)),
            ("train_op", train_op),
        ]
        for collection_name, value in exported:
            tf.add_to_collection(collection_name, value)
# Training configuration.

# Glob(s) of the training shards. An alternative location that was used
# before: ['/Users/super/yt8m_videofeature/train*.tfrecord']
data_pattern = ['I:\\yt8m_video\\train*.tfrecord']
num_epochs = 5
reader_batch_size = 1024
num_readers = 1
num_classes = 4716
mini_batch_size = 1024

# Loss and regularization.
label_loss_fn = losses.CrossEntropyLoss()
regularization_penalty = 1  # was assigned twice with the same value

# Optimizer and learning-rate schedule.
base_learning_rate = 0.01
learning_rate_decay_examples = 4000000
learning_rate_decay = 0.95
optimizer_class = tf.train.AdamOptimizer
clip_gradient_norm = 1.0
def build_graph(reader,
                model,
                train_data_pattern,
                label_loss_fn=losses.CrossEntropyLoss(),
                batch_size=1000,
                base_learning_rate=0.01,
                learning_rate_decay_examples=1000000,
                learning_rate_decay=0.95,
                optimizer_class=tf.train.AdamOptimizer,
                transformer_class=feature_transform.DefaultTransformer,
                augmenter_class=data_augmentation.DefaultAugmenter,
                clip_gradient_norm=1.0,
                regularization_penalty=1,
                num_readers=1,
                num_epochs=None):
    """Creates the Tensorflow graph.

    This will only be called once in the life of
    a training model, because after the graph is created the model will be
    restored from a meta graph file rather than being recreated.

    Besides the arguments, the graph is shaped by these flags:
    `distillation_features`, `distillation_type`, `distillation_percent`,
    `distillation_as_input`, `distillation_as_boosting`, `noise_level`,
    `dropout`, `reweight` and `multitask`.

    Args:
      reader: The data file reader. It should inherit from BaseReader.
      model: The core model (e.g. logistic or neural net). It should inherit
        from BaseModel.
      train_data_pattern: glob path to the training data files.
      label_loss_fn: What kind of loss to apply to the model. It should inherit
        from BaseLoss.
      batch_size: How many examples to process at a time.
      base_learning_rate: What learning rate to initialize the optimizer with.
      learning_rate_decay_examples: Number of examples between two learning
        rate decay steps.
      learning_rate_decay: Multiplicative factor applied at each decay step.
      optimizer_class: Which optimization algorithm to use.
      transformer_class: Class whose instance's `transform` maps the raw input
        and frame counts to the model input.
      augmenter_class: Class whose instance's `augment` is applied to the
        training batch before the feature transform.
      clip_gradient_norm: Magnitude of the gradient to clip to.
      regularization_penalty: How much weight to give the regularization loss
        compared to the label loss.
      num_readers: How many threads to use for I/O operations.
      num_epochs: How many passes to make over the data. 'None' means an
        unlimited number of passes.

    Raises:
      ValueError: If `FLAGS.distillation_as_input` is set without
        `FLAGS.distillation_features`; no distillation labels are read in
        that configuration.
    """
    # Without distillation_features the reader yields no distillation labels;
    # previously this surfaced later as an unbound-local error.
    if FLAGS.distillation_as_input and not FLAGS.distillation_features:
        raise ValueError(
            "distillation_as_input requires distillation_features: no "
            "distillation labels are read otherwise.")

    global_step = tf.Variable(0, trainable=False, name="global_step")

    learning_rate = tf.train.exponential_decay(base_learning_rate,
                                               global_step * batch_size,
                                               learning_rate_decay_examples,
                                               learning_rate_decay,
                                               staircase=True)
    tf.summary.scalar('learning_rate', learning_rate)

    optimizer = optimizer_class(learning_rate)

    input_tensors = get_input_data_tensors(reader,
                                           train_data_pattern,
                                           batch_size=batch_size,
                                           num_readers=num_readers,
                                           num_epochs=num_epochs)
    if FLAGS.distillation_features:
        (video_id, model_input_raw, labels_batch, num_frames,
         distill_labels_batch) = input_tensors
        if FLAGS.distillation_type == 2:
            p = FLAGS.distillation_percent
            print("distillation_percent = %s reforming labels" % (p,))
            # Add the distillation labels, rescaled per example so that their
            # sum is p times the number of true labels, then clip to [0, 1].
            float_labels = tf.cast(labels_batch, dtype=tf.float32)
            sum_float_labels = tf.reduce_sum(
                float_labels, axis=1, keep_dims=True)
            sum_distill_labels = tf.reduce_sum(
                distill_labels_batch, axis=1, keep_dims=True) + 1e-6
            distill_labels_batch = float_labels + distill_labels_batch * (
                sum_float_labels / sum_distill_labels * p)
            distill_labels_batch = tf.clip_by_value(distill_labels_batch,
                                                    clip_value_min=0.0,
                                                    clip_value_max=1.0)
    else:
        video_id, model_input_raw, labels_batch, num_frames = input_tensors

    # data augmentation, will not persist in inference
    data_augmenter = augmenter_class()
    model_input_raw, labels_batch, num_frames = data_augmenter.augment(
        model_input_raw, num_frames=num_frames, labels_batch=labels_batch)
    tf.summary.histogram("model/input_raw", model_input_raw)

    feature_transformer = transformer_class()
    model_input, num_frames = feature_transformer.transform(
        model_input_raw, num_frames=num_frames)
    tf.summary.histogram("model/input", model_input)

    with tf.name_scope("model"):
        if FLAGS.noise_level > 0:
            noise_level_tensor = tf.placeholder_with_default(
                0.0, shape=[], name="noise_level")
        else:
            noise_level_tensor = None

        if FLAGS.distillation_as_input:
            distillation_predictions = distill_labels_batch
        else:
            distillation_predictions = None

        if FLAGS.dropout:
            keep_prob_tensor = tf.placeholder_with_default(
                1.0, shape=[], name="keep_prob")
            result = model.create_model(
                model_input,
                num_frames=num_frames,
                vocab_size=reader.num_classes,
                labels=labels_batch,
                dropout=FLAGS.dropout,
                keep_prob=keep_prob_tensor,
                distillation_predictions=distillation_predictions,
                noise_level=noise_level_tensor)
        else:
            result = model.create_model(
                model_input,
                num_frames=num_frames,
                vocab_size=reader.num_classes,
                labels=labels_batch,
                distillation_predictions=distillation_predictions,
                noise_level=noise_level_tensor)

        for variable in slim.get_model_variables():
            tf.summary.histogram(variable.op.name, variable)

        print("result %s" % (result,))
        predictions = result["predictions"]
        if "loss" in result:
            label_loss = result["loss"]
        else:
            video_weights_batch = None
            if FLAGS.reweight:
                video_weights_batch = get_video_weights(video_id)

            if FLAGS.distillation_as_boosting:
                video_weights_batch = get_weights_by_predictions(
                    labels_batch, distillation_predictions)

            # Multitask losses take the support predictions as an extra
            # positional argument; everything else is shared.
            if FLAGS.multitask:
                support_predictions = result["support_predictions"]
                tf.summary.histogram("model/support_predictions",
                                     support_predictions)
                print("support_predictions %s" % (support_predictions,))
                loss_inputs = (predictions, support_predictions)
            else:
                loss_inputs = (predictions,)

            def loss_against(target_labels):
                """Evaluates label_loss_fn against the given target labels."""
                args = loss_inputs + (target_labels,)
                return label_loss_fn.calculate_loss(
                    *args, weights=video_weights_batch)

            if FLAGS.distillation_features and FLAGS.distillation_type == 1:
                # Blend the true-label loss and the distillation loss by p.
                p = FLAGS.distillation_percent
                print("distillation_percent = %s" % (p,))
                if p <= 0:
                    label_loss = loss_against(labels_batch)
                elif p >= 1:
                    label_loss = loss_against(distill_labels_batch)
                else:
                    label_loss = (loss_against(labels_batch) * (1.0 - p) +
                                  loss_against(distill_labels_batch) * p)
            elif FLAGS.distillation_features and FLAGS.distillation_type == 2:
                print("using pure distillation loss")
                label_loss = loss_against(distill_labels_batch)
            else:
                print("using original loss")
                label_loss = loss_against(labels_batch)

        tf.summary.histogram("model/predictions", predictions)
        tf.summary.scalar("label_loss", label_loss)

        if "regularization_loss" in result:
            reg_loss = result["regularization_loss"]
        else:
            reg_loss = tf.constant(0.0)

        reg_losses = tf.losses.get_regularization_losses()
        if reg_losses:
            reg_loss += tf.add_n(reg_losses)

        if regularization_penalty != 0:
            tf.summary.scalar("reg_loss", reg_loss)

        # Adds update_ops (e.g., moving average updates in batch
        # normalization) as a dependency to the train_op.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        if "update_ops" in result:
            update_ops += result["update_ops"]
        if update_ops:
            with tf.control_dependencies(update_ops):
                barrier = tf.no_op(name="gradient_barrier")
                with tf.control_dependencies([barrier]):
                    label_loss = tf.identity(label_loss)

        # Incorporate the L2 weight penalties etc.
        final_loss = regularization_penalty * reg_loss + label_loss
        gradients = optimizer.compute_gradients(
            final_loss, colocate_gradients_with_ops=False)
        if clip_gradient_norm > 0:
            with tf.name_scope('clip_grads'):
                gradients = utils.clip_gradient_norms(gradients,
                                                      clip_gradient_norm)
        train_op = optimizer.apply_gradients(gradients,
                                             global_step=global_step)

        tf.add_to_collection("global_step", global_step)
        tf.add_to_collection("loss", label_loss)
        tf.add_to_collection("predictions", predictions)
        tf.add_to_collection("input_batch_raw", model_input_raw)
        tf.add_to_collection("input_batch", model_input)
        tf.add_to_collection("num_frames", num_frames)
        tf.add_to_collection("labels", tf.cast(labels_batch, tf.float32))
        tf.add_to_collection("train_op", train_op)
        if FLAGS.dropout:
            tf.add_to_collection("keep_prob", keep_prob_tensor)
        if FLAGS.noise_level > 0:
            tf.add_to_collection("noise_level", noise_level_tensor)
def _load_whitelisted_class_mask(csv_path, num_classes):
    """Returns a float32 vector holding 1.0 at every class id in `csv_path`.

    The first column of each CSV row is parsed as a class id. Rows whose first
    column is not an integer (such as a header line) and empty rows are
    skipped.
    """
    import csv
    import numpy as np

    mask = np.zeros((num_classes,), dtype=np.float32)
    with open(csv_path, newline="") as handle:
        for row in csv.reader(handle):
            if not row:
                continue
            try:
                cls_id = int(row[0])
            except ValueError:
                # Simply skip the non-integer line.
                continue
            mask[cls_id] = 1.
    return mask


def build_graph(reader,
                model,
                train_data_pattern,
                label_loss_fn=losses.CrossEntropyLoss(),
                batch_size=1000,
                base_learning_rate=0.01,
                learning_rate_decay_examples=1000000,
                learning_rate_decay=0.95,
                optimizer_class=tf.compat.v1.train.AdamOptimizer,
                clip_gradient_norm=1.0,
                regularization_penalty=1,
                num_readers=1,
                num_epochs=None):
    """Creates the Tensorflow graph.

    This will only be called once in the life of
    a training model, because after the graph is created the model will be
    restored from a meta graph file rather than being recreated.

    One model replica ("tower") is built per GPU, limited to
    `FLAGS.num_gpu` devices, or a single CPU tower when no GPU is found. The
    input batch is split evenly across the towers and their gradients are
    merged with `utils.combine_gradients`.

    Classes listed in 'segment_label_ids.csv' (read from the working
    directory) get a label weight of 5, all other classes a weight of 1. The
    weights are passed to `label_loss_fn.calculate_loss` as `label_weights`.

    Args:
      reader: The data file reader. It should inherit from BaseReader.
      model: The core model (e.g. logistic or neural net). It should inherit
        from BaseModel.
      train_data_pattern: glob path to the training data files.
      label_loss_fn: What kind of loss to apply to the model. It should inherit
        from BaseLoss.
      batch_size: How many examples to process at a time (per tower).
      base_learning_rate: What learning rate to initialize the optimizer with.
      learning_rate_decay_examples: Number of examples between two learning
        rate decay steps.
      learning_rate_decay: Multiplicative factor applied at each decay step.
      optimizer_class: Which optimization algorithm to use.
      clip_gradient_norm: Magnitude of the gradient to clip to.
      regularization_penalty: How much weight to give the regularization loss
        compared to the label loss.
      num_readers: How many threads to use for I/O operations.
      num_epochs: How many passes to make over the data. 'None' means an
        unlimited number of passes.
    """
    import numpy as np

    num_label_classes = 3862
    whitelist_csv = 'segment_label_ids.csv'

    global_step = tf.Variable(0, trainable=False, name="global_step")

    local_device_protos = device_lib.list_local_devices()
    gpus = [x.name for x in local_device_protos if x.device_type == "GPU"]
    gpus = gpus[:FLAGS.num_gpu]
    num_gpus = len(gpus)

    if num_gpus > 0:
        logging.info("Using the following GPUs to train: " + str(gpus))
        num_towers = num_gpus
        device_string = "/gpu:%d"
    else:
        logging.info("No GPUs found. Training on CPU.")
        num_towers = 1
        device_string = "/cpu:%d"

    learning_rate = tf.train.exponential_decay(
        base_learning_rate,
        global_step * batch_size * num_towers,
        learning_rate_decay_examples,
        learning_rate_decay,
        staircase=True)
    tf.summary.scalar("learning_rate", learning_rate)

    optimizer = optimizer_class(learning_rate)
    input_data_dict = get_input_data_tensors(
        reader,
        train_data_pattern,
        batch_size=batch_size * num_towers,
        num_readers=num_readers,
        num_epochs=num_epochs)
    print('input_data_dict', input_data_dict)
    model_input_raw = input_data_dict["video_matrix"]
    labels_batch = input_data_dict["labels"]
    num_frames = input_data_dict["num_frames"]
    print("model_input_shape, ", model_input_raw.shape)
    print("labels_batch, ", labels_batch)

    # Iterating a pandas DataFrame yields its column labels, not its rows, so
    # the previous pandas-based loop never marked any class. Read the rows
    # with the csv module instead.
    whitelisted_cls_mask = _load_whitelisted_class_mask(
        whitelist_csv, num_label_classes)
    # Whitelisted classes weigh 5, every other class 1.
    whitelisted_cls_mask = whitelisted_cls_mask * 4 + np.ones(
        (num_label_classes,), dtype=np.float32)

    tf.summary.histogram("model/input_raw", model_input_raw)

    feature_dim = len(model_input_raw.get_shape()) - 1

    model_input = tf.nn.l2_normalize(model_input_raw, feature_dim)

    tower_inputs = tf.split(model_input, num_towers)
    tower_labels = tf.split(labels_batch, num_towers)
    tower_num_frames = tf.split(num_frames, num_towers)
    tower_gradients = []
    tower_predictions = []
    tower_label_losses = []
    tower_reg_losses = []

    for i in range(num_towers):
        # For some reason these 'with' statements can't be combined onto the
        # same line. They have to be nested.
        with tf.device(device_string % i):
            # Variables are created by the first tower and reused afterwards.
            with (tf.variable_scope(("tower"),
                                    reuse=True if i > 0 else None)):
                with (slim.arg_scope(
                    [slim.model_variable, slim.variable],
                        device="/cpu:0" if num_gpus != 1 else "/gpu:0")):
                    result = model.create_model(
                        tower_inputs[i],
                        num_frames=tower_num_frames[i],
                        vocab_size=reader.num_classes,
                        labels=tower_labels[i])
                    for variable in slim.get_model_variables():
                        tf.summary.histogram(variable.op.name, variable)

                    predictions = result["predictions"]
                    tower_predictions.append(predictions)

                    if "loss" in result:
                        label_loss = result["loss"]
                    else:
                        label_loss = label_loss_fn.calculate_loss(
                            predictions,
                            tower_labels[i],
                            label_weights=whitelisted_cls_mask)

                    if "aux_predictions" in result:
                        for pred in result["aux_predictions"]:
                            label_loss += label_loss_fn.calculate_loss(
                                pred,
                                tower_labels[i],
                                label_weights=whitelisted_cls_mask)

                    if "regularization_loss" in result:
                        reg_loss = result["regularization_loss"]
                    else:
                        reg_loss = tf.constant(0.0)

                    reg_losses = tf.losses.get_regularization_losses()
                    if reg_losses:
                        reg_loss += tf.add_n(reg_losses)

                    tower_reg_losses.append(reg_loss)

                    # Adds update_ops (e.g., moving average updates in batch
                    # normalization) as a dependency to the train_op.
                    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
                    if "update_ops" in result:
                        update_ops += result["update_ops"]
                    if update_ops:
                        with tf.control_dependencies(update_ops):
                            barrier = tf.no_op(name="gradient_barrier")
                            with tf.control_dependencies([barrier]):
                                label_loss = tf.identity(label_loss)

                    tower_label_losses.append(label_loss)

                    # Incorporate the L2 weight penalties etc.
                    final_loss = (regularization_penalty * reg_loss +
                                  label_loss)
                    gradients = optimizer.compute_gradients(
                        final_loss, colocate_gradients_with_ops=False)
                    tower_gradients.append(gradients)

    label_loss = tf.reduce_mean(tf.stack(tower_label_losses))
    tf.summary.scalar("label_loss", label_loss)
    if regularization_penalty != 0:
        reg_loss = tf.reduce_mean(tf.stack(tower_reg_losses))
        tf.summary.scalar("reg_loss", reg_loss)
    merged_gradients = utils.combine_gradients(tower_gradients)

    if clip_gradient_norm > 0:
        with tf.name_scope("clip_grads"):
            merged_gradients = utils.clip_gradient_norms(
                merged_gradients, clip_gradient_norm)

    train_op = optimizer.apply_gradients(merged_gradients,
                                         global_step=global_step)

    tf.add_to_collection("global_step", global_step)
    tf.add_to_collection("loss", label_loss)
    tf.add_to_collection("predictions", tf.concat(tower_predictions, 0))
    tf.add_to_collection("input_batch_raw", model_input_raw)
    tf.add_to_collection("input_batch", model_input)
    tf.add_to_collection("num_frames", num_frames)
    tf.add_to_collection("labels", tf.cast(labels_batch, tf.float32))
    tf.add_to_collection("train_op", train_op)
def build_graph(reader,
                model,
                train_data_pattern,
                train_data_pattern2,
                train_data_pattern3,
                eval_data_pattern,
                label_loss_fn=losses.CrossEntropyLoss(),
                batch_size=1000,
                base_learning_rate=0.01,
                learning_rate_decay_examples=1000000,
                learning_rate_decay=0.95,
                optimizer_class=tf.train.AdamOptimizer,
                clip_gradient_norm=1.0,
                regularization_penalty=1,
                num_readers=1,
                num_epochs=None,
                l2_penalty=1e-8,
                gpu_only=1):
    """Creates the Tensorflow graph.

    This will only be called once in the life of
    a training model, because after the graph is created the model will be
    restored from a meta graph file rather than being recreated.

    In addition to the training towers, an evaluation copy of the model is
    built per tower on held-out files so that out-of-sample performance can be
    monitored during training. With `FLAGS.fold == -1` the files matching
    `eval_data_pattern` are held out; otherwise every fifth training file,
    starting at index `FLAGS.fold`, is held out.

    Args:
      reader: The data file reader. It should inherit from BaseReader.
      model: The core model (e.g. logistic or neural net). It should inherit
        from BaseModel.
      train_data_pattern: glob path to the training data files.
      train_data_pattern2: second glob path to training data files.
      train_data_pattern3: third glob path to training data files.
      eval_data_pattern: glob path to the evaluation data files.
      label_loss_fn: What kind of loss to apply to the model. It should inherit
        from BaseLoss.
      batch_size: How many examples to process at a time (per tower).
      base_learning_rate: What learning rate to initialize the optimizer with.
      learning_rate_decay_examples: Number of examples between two learning
        rate decay steps.
      learning_rate_decay: Multiplicative factor applied at each decay step.
      optimizer_class: Which optimization algorithm to use.
      clip_gradient_norm: Magnitude of the gradient to clip to.
      regularization_penalty: How much weight to give the regularization loss
        compared to the label loss.
      num_readers: How many threads to use for I/O operations.
      num_epochs: How many passes to make over the data. 'None' means an
        unlimited number of passes. The evaluation reader gets twice as many
        epochs (or unlimited when this is None).
      l2_penalty: Forwarded to `model.create_model`.
      gpu_only: Not used by this function.

    Raises:
      IOError: If no training files match, or if the training or validation
        split ends up empty.
    """
    # data files
    files1 = gfile.Glob(train_data_pattern)
    files2 = gfile.Glob(train_data_pattern2)
    files3 = gfile.Glob(train_data_pattern3)
    files = files1 + files2 + files3
    if not files:
        # The message used to reference an undefined name, which raised
        # NameError instead of the intended IOError.
        searched_patterns = ", ".join(
            str(pattern) for pattern in (train_data_pattern,
                                         train_data_pattern2,
                                         train_data_pattern3))
        raise IOError("Unable to find training files. data_pattern='" +
                      searched_patterns + "'.")
    logging.info("Total number of training files: %s + %s + %s = %s.",
                 str(len(files1)), str(len(files2)), str(len(files3)),
                 str(len(files)))

    files4 = gfile.Glob(eval_data_pattern)
    logging.info("Total number of eval files: %s.", str(len(files4)))

    if FLAGS.fold == -1:
        validate_files = files4
        train_files = files
    else:
        validate_files = files[FLAGS.fold::5]
        held_out = set(validate_files)
        train_files = [x for x in files if x not in held_out]

    # Fail with a clear message instead of an IndexError in the log calls.
    if not train_files:
        raise IOError("No training files left after holding out fold %s." %
                      FLAGS.fold)
    if not validate_files:
        raise IOError(
            "No validation files available (eval_data_pattern='%s', "
            "fold=%s)." % (eval_data_pattern, FLAGS.fold))

    logging.info("train files: {}, first is: {}.".format(
        len(train_files), train_files[0].split('/')[-1]))
    logging.info("eval files: {}, first is: {}.".format(
        len(validate_files), validate_files[0].split('/')[-1]))

    # label weights for loss function. ugly hard coded for now.
    # NOTE: the resulting constant is currently not passed to the loss.
    wgts_np = np.ones(FLAGS.truncated_num_classes)
    over_weight_labels = False
    if over_weight_labels:
        labels_to_overwgt = [
            38, 47, 49, 55, 72, 76, 86, 89, 93, 94, 95, 98, 99, 101, 102, 110,
            111, 113, 114, 115, 120, 121
        ]
        wgts_np[labels_to_overwgt] = 2.0
    wgts_4_lossfn = tf.constant(wgts_np, dtype=tf.float32)

    global_step = tf.Variable(0, trainable=False, name="global_step")
    restart_learning_rate = tf.Variable(base_learning_rate,
                                        trainable=False,
                                        name="restart_learning_rate")

    local_device_protos = device_lib.list_local_devices()
    gpus = [x.name for x in local_device_protos if x.device_type == 'GPU']
    num_gpus = len(gpus)

    if num_gpus > 0:
        logging.info("Using the following GPUs to train: " + str(gpus))
        num_towers = num_gpus
        device_string = '/gpu:%d'
    else:
        logging.info("No GPUs found. Training on CPU.")
        num_towers = 1
        device_string = '/cpu:%d'

    learning_rate = tf.train.exponential_decay(
        restart_learning_rate,
        global_step * batch_size * num_towers,
        learning_rate_decay_examples,
        learning_rate_decay,
        staircase=True)
    tf.summary.scalar('learning_rate', learning_rate)

    optimizer = optimizer_class(learning_rate)
    unused_video_id, model_input_raw, labels_batch, num_frames = (
        get_input_data_tensors(reader,
                               train_files,
                               batch_size=batch_size * num_towers,
                               num_readers=num_readers,
                               num_epochs=num_epochs))
    tf.summary.histogram("model/input_raw", model_input_raw)

    # model params
    # probabilities for keeping a neuron in a layer, assuming max 10 layers,
    # below default value
    with tf.variable_scope("tower", reuse=True):
        layers_keep_probs = tf.Variable(
            [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
            trainable=False,
            name="layers_keep_probs")

    model_input = model_input_raw
    if FLAGS.apply_global_normalization:
        g_mean, g_std = model_utils.load_global_moments()
        g_inv_std = 1.0 / g_std
        global_mean = tf.constant(g_mean, dtype=tf.float32)
        # expand global mean to match new dimension and fill rest with zeros
        new_dim = tf.cast(model_input_raw.shape[1], tf.int32)
        zero_padding = tf.zeros(new_dim - tf.shape(global_mean), tf.float32)
        global_mean_padded = tf.concat([global_mean, zero_padding], 0)
        # expand global inv std to match new dimension and fill rest with ones
        global_inv_std = tf.constant(g_inv_std, dtype=tf.float32)
        one_padding = tf.ones(new_dim - tf.shape(global_inv_std), tf.float32)
        global_inv_std_padded = tf.concat([global_inv_std, one_padding], 0)
        # apply normalizations (can do both) if requested
        # global normalization: subtract the mean, scale by 1 / std
        model_input = tf.multiply(
            tf.subtract(model_input, global_mean_padded),
            global_inv_std_padded)

    # regular L2 normalization
    if FLAGS.apply_batch_l2_normalization:
        feature_dim = len(model_input.get_shape()) - 1
        model_input = tf.nn.l2_normalize(model_input, feature_dim)

    tower_inputs = tf.split(model_input, num_towers)
    tower_labels = tf.split(labels_batch, num_towers)
    tower_num_frames = tf.split(num_frames, num_towers)
    tower_gradients = []
    tower_predictions = []
    tower_label_losses = []
    tower_reg_losses = []

    # eval graph - to monitor performance out of sample during training
    # `2 * None` is a TypeError, so keep "unlimited" as None.
    eval_num_epochs = None if num_epochs is None else 2 * num_epochs
    e_video_id, e_input_raw, e_labels_batch, e_num_frames = (
        get_input_data_tensors(reader,
                               validate_files,
                               batch_size=batch_size * num_towers,
                               num_readers=num_readers,
                               num_epochs=eval_num_epochs))
    e_input = e_input_raw
    if FLAGS.apply_global_normalization:
        e_input = tf.multiply(tf.subtract(e_input, global_mean_padded),
                              global_inv_std_padded)
    if FLAGS.apply_batch_l2_normalization:
        feature_dim = len(e_input.get_shape()) - 1
        e_input = tf.nn.l2_normalize(e_input, feature_dim)

    e_tower_inputs = tf.split(e_input, num_towers)
    e_tower_labels = tf.split(e_labels_batch, num_towers)
    e_tower_num_frames = tf.split(e_num_frames, num_towers)
    e_tower_predictions = []
    e_tower_layers_keep_probs = tf.Variable(
        [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
        trainable=False,
        name="layers_keep_probs")
    logging.info(e_tower_inputs)
    # end eval

    for i in range(num_towers):
        # For some reason these 'with' statements can't be combined onto the
        # same line. They have to be nested.
        logging.info('For tower: ' + str(i))
        with tf.device(device_string % i):
            with (tf.variable_scope(("tower"),
                                    reuse=True if i > 0 else None)):
                with (slim.arg_scope(
                    [slim.model_variable, slim.variable],
                        device="/cpu:0" if num_gpus != 1 else "/gpu:0")):
                    logging.info(layers_keep_probs)
                    result = model.create_model(
                        tower_inputs[i],
                        num_frames=tower_num_frames[i],
                        vocab_size=reader.num_classes,
                        labels=tower_labels[i],
                        layers_keep_probs=layers_keep_probs,
                        l2_penalty=l2_penalty,
                        is_training=True)
                    for variable in slim.get_model_variables():
                        logging.info(variable)
                        tf.summary.histogram(variable.op.name, variable)

                    # create shadow moving average model variables
                    if FLAGS.use_ema:
                        model_vars = list(slim.get_model_variables())
                        ema = tf.train.ExponentialMovingAverage(
                            decay=1.0 - 1.0 / FLAGS.ema_halflife)
                        ema_op = ema.apply(model_vars)
                        logging.info("model_vars:")
                        logging.info(" || ".join(
                            [str(x) for x in model_vars]))
                        ema_vars = [ema.average(x) for x in model_vars]
                        ema_vars_pair_dict = {
                            ema.average_name(x): x.op.name
                            for x in model_vars
                        }
                        logging.info("ema_vars_pair_dict:")
                        for x, y in ema_vars_pair_dict.items():
                            logging.info(x + ': ' + y)
                        for v in ema_vars:
                            tf.summary.histogram(v.op.name, v)
                        tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, ema_op)
                        tf.add_to_collection("ema_op", ema_op)

                    predictions = result["predictions"]
                    tower_predictions.append(predictions)

                    if "loss" in result:
                        label_loss = result["loss"]
                    else:
                        label_loss = label_loss_fn.calculate_loss(
                            predictions, tower_labels[i], FLAGS.loss_epsilon)

                    if "regularization_loss" in result:
                        reg_loss = result["regularization_loss"]
                    else:
                        reg_loss = tf.constant(0.0)

                    reg_losses = tf.losses.get_regularization_losses()
                    if reg_losses:
                        reg_loss += tf.add_n(reg_losses)

                    tower_reg_losses.append(reg_loss)

                    # Adds update_ops (e.g., moving average updates in batch
                    # normalization) as a dependency to the train_op.
                    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
                    if "update_ops" in result:
                        update_ops += result["update_ops"]
                    if update_ops:
                        with tf.control_dependencies(update_ops):
                            barrier = tf.no_op(name="gradient_barrier")
                            with tf.control_dependencies([barrier]):
                                label_loss = tf.identity(label_loss)

                    tower_label_losses.append(label_loss)

                    # Incorporate the L2 weight penalties etc.
                    final_loss = (regularization_penalty * reg_loss +
                                  label_loss)
                    gradients = optimizer.compute_gradients(
                        final_loss, colocate_gradients_with_ops=False)
                    tower_gradients.append(gradients)

                    # eval ops
                    logging.info("eval ops")
                    e_result = model.create_model(
                        e_tower_inputs[i],
                        num_frames=e_tower_num_frames[i],
                        vocab_size=reader.num_classes,
                        labels=e_tower_labels[i],
                        layers_keep_probs=e_tower_layers_keep_probs,
                        l2_penalty=l2_penalty,
                        is_training=False)

                    e_predictions = e_result["predictions"]
                    e_tower_predictions.append(e_predictions)
                    # end eval ops

    label_loss = tf.reduce_mean(tf.stack(tower_label_losses))
    tf.summary.scalar("label_loss", label_loss)
    if regularization_penalty != 0:
        reg_loss = tf.reduce_mean(tf.stack(tower_reg_losses))
        tf.summary.scalar("reg_loss", reg_loss)
    merged_gradients = utils.combine_gradients(tower_gradients)

    if clip_gradient_norm > 0:
        with tf.name_scope('clip_grads'):
            merged_gradients = utils.clip_gradient_norms(
                merged_gradients, clip_gradient_norm)

    train_op = optimizer.apply_gradients(merged_gradients,
                                         global_step=global_step)

    tf.add_to_collection("global_step", global_step)
    tf.add_to_collection("restart_learning_rate", restart_learning_rate)
    tf.add_to_collection("layers_keep_probs", layers_keep_probs)
    tf.add_to_collection("loss", label_loss)
    tf.add_to_collection("predictions", tf.concat(tower_predictions, 0))
    tf.add_to_collection("input_batch_raw", model_input_raw)
    tf.add_to_collection("input_batch", model_input)
    tf.add_to_collection("num_frames", num_frames)
    tf.add_to_collection("labels", tf.cast(labels_batch, tf.float32))
    tf.add_to_collection("train_op", train_op)

    # add eval graph
    e_label_loss = label_loss_fn.calculate_loss(
        tf.concat(e_tower_predictions, 0), e_labels_batch,
        FLAGS.loss_epsilon)
    tf.summary.scalar("e_label_loss", e_label_loss)

    tf.add_to_collection("e_predictions", tf.concat(e_tower_predictions, 0))
    tf.add_to_collection("e_labels", tf.cast(e_labels_batch, tf.float32))
    tf.add_to_collection("e_loss", e_label_loss)
def build_graph(reader,
                model,
                train_data_pattern,
                label_loss_fn=losses.CrossEntropyLoss(),
                batch_size=1000,
                base_learning_rate=0.01,
                learning_rate_decay_examples=1000000,
                learning_rate_decay=0.95,
                optimizer_class=tf.train.AdamOptimizer,
                clip_gradient_norm=1.0,
                regularization_penalty=1,
                num_readers=1,
                num_epochs=None):
    """Creates the Tensorflow training graph for a gated ensemble of towers.

    One copy of `model` is built per tower under its own variable scope
    (`tower_<i>`, no weight sharing). A gating network under the variable
    scope `ensemble` maps the input averaged over axis 1 to softmax mixing
    weights, and the final prediction is the sigmoid of the weighted sum of
    the tower logits. A KL term between the temperature-softened final
    distribution and each tower's distribution is added to the
    regularization loss, scaled by `FLAGS.final_lambda`.

    This will only be called once in the life of a training model, because
    after the graph is created the model will be restored from a meta graph
    file rather than being recreated.

    Besides the arguments, this reads `FLAGS.num_gpu`, `FLAGS.segment_labels`,
    `FLAGS.final_temperature` and `FLAGS.final_lambda`, and loads
    "yt8m_pca/eigenvals.npy" from the working directory.

    Args:
      reader: The data file reader. It should inherit from BaseReader.
      model: The core model (e.g. logistic or neural net). It should inherit
        from BaseModel. Its result dict must contain "predictions" and
        "logits".
      train_data_pattern: glob path to the training data files.
      label_loss_fn: What kind of loss to apply to the model. It should
        inherit from BaseLoss.
      batch_size: How many examples to process at a time.
      base_learning_rate: What learning rate to initialize the optimizer with.
      learning_rate_decay_examples: Decay period handed to
        `tf.train.exponential_decay`, counted in `global_step * batch_size`.
      learning_rate_decay: Multiplicative decay factor per decay period.
      optimizer_class: Which optimization algorithm to use.
      clip_gradient_norm: Magnitude of the gradient to clip to; values <= 0
        disable clipping.
      regularization_penalty: How much weight to give the regularization loss
        compared to the label loss.
      num_readers: How many threads to use for I/O operations.
      num_epochs: How many passes to make over the data. 'None' means an
        unlimited number of passes.

    Returns:
      None. The tensors and ops of interest are published through the graph
      collections "global_step", "loss", "predictions", "input_batch_raw",
      "input_batch", "num_frames", "labels" and "train_op".
    """
    global_step = tf.Variable(0, trainable=False, name="global_step")

    local_device_protos = device_lib.list_local_devices()
    gpus = [x.name for x in local_device_protos if x.device_type == "GPU"]
    print(gpus)
    gpus = gpus[:FLAGS.num_gpu]
    num_gpus = len(gpus)

    if num_gpus > 0:
        logging.info("Using the following GPUs to train: " + str(gpus))
        num_towers = num_gpus
        device_string = "/gpu:%d"
    else:
        logging.info("No GPUs found. Training on CPU.")
        num_towers = 1
        device_string = "/cpu:%d"

    learning_rate = tf.train.exponential_decay(base_learning_rate,
                                               global_step * batch_size,
                                               learning_rate_decay_examples,
                                               learning_rate_decay,
                                               staircase=True)
    tf.summary.scalar("learning_rate", learning_rate)

    # Clipping is applied to the gradients further down, so the optimizer is
    # constructed the same way whether or not clipping is enabled.
    optimizer = optimizer_class(learning_rate)

    input_data_dict = get_input_data_tensors(reader,
                                             train_data_pattern,
                                             batch_size=batch_size,
                                             num_readers=num_readers,
                                             num_epochs=num_epochs)
    model_input_raw = input_data_dict["video_matrix"]
    labels_batch = input_data_dict["labels"]
    num_frames = input_data_dict["num_frames"]
    print("model_input_shape, ", model_input_raw.shape)
    tf.summary.histogram("model/input_raw", model_input_raw)

    # Shift the first 1024 feature dims by 4/512 and scale them by the square
    # root of the stored eigenvalues (+1e-4); the trailing 128 dims are left
    # untouched (offset 0, scale 1).
    # NOTE(review): presumably 1024 visual + 128 audio features whose PCA
    # whitening is being undone -- confirm against the feature pipeline.
    offset = np.array([4. / 512] * 1024 + [0] * 128)
    offset = tf.constant(offset, dtype=tf.float32)
    eigen_val = tf.constant(
        np.sqrt(np.load("yt8m_pca/eigenvals.npy")[:1024, 0]),
        dtype=tf.float32)
    model_input = tf.multiply(
        model_input_raw - offset,
        tf.pad(eigen_val + 1e-4, [[0, 128]], constant_values=1.))

    if FLAGS.segment_labels:
        label_weights = input_data_dict["label_weights"]
    else:
        label_weights = None

    tower_logits = []
    tower_predictions = []
    tower_label_losses = []
    tower_reg_losses = []
    print("flag1!!!!", device_string)
    for i in range(num_towers):
        # For some reason these 'with' statements can't be combined onto the
        # same line. They have to be nested.
        with tf.device(device_string % i):
            with tf.variable_scope("tower_%d" % i, reuse=False):
                # Every tower sees the full batch; towers differ only in
                # their (unshared) variables.
                result = model.create_model(model_input,
                                            num_frames=num_frames,
                                            vocab_size=reader.num_classes,
                                            labels=labels_batch,
                                            is_training=True)
                for variable in slim.get_model_variables():
                    tf.summary.histogram(variable.op.name, variable)

                predictions = result["predictions"]
                tower_predictions.append(predictions)
                logits = result["logits"]
                tower_logits.append(logits)

                if "loss" in result.keys():
                    label_loss = result["loss"]
                else:
                    label_loss = label_loss_fn.calculate_loss(
                        predictions, labels_batch,
                        label_weights=label_weights)
                    if "aux_predictions" in result.keys():
                        for pred in result["aux_predictions"]:
                            label_loss += label_loss_fn.calculate_loss(
                                pred, labels_batch,
                                label_weights=label_weights)

                if "regularization_loss" in result.keys():
                    reg_loss = result["regularization_loss"]
                else:
                    reg_loss = tf.constant(0.0)

                # NOTE(review): this collection is graph-wide, so tower i
                # also picks up the regularizers of towers 0..i-1 -- confirm
                # that the resulting weighting is intended.
                reg_losses = tf.losses.get_regularization_losses()
                if reg_losses:
                    reg_loss += tf.add_n(reg_losses)

                tower_reg_losses.append(reg_loss)

                # Adds update_ops (e.g., moving average updates in batch
                # normalization) as a dependency to the train_op.
                update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
                if "update_ops" in result.keys():
                    update_ops += result["update_ops"]
                if update_ops:
                    with tf.control_dependencies(update_ops):
                        barrier = tf.no_op(name="gradient_barrier")
                        with tf.control_dependencies([barrier]):
                            label_loss = tf.identity(label_loss)

                tower_label_losses.append(label_loss)

    # Place the ensemble on device 0 of whichever device family was selected
    # above, so the CPU fallback does not request a non-existent "/gpu:0".
    with tf.device(device_string % 0):
        with tf.variable_scope("ensemble"):
            ftr_mean = tf.reduce_mean(model_input, axis=1)
            print("ftr mean shape: ", ftr_mean.get_shape().as_list())
            ftr_mean = slim.batch_norm(ftr_mean,
                                       center=True,
                                       scale=True,
                                       fused=False,
                                       is_training=True,
                                       scope="mix_weights_bn")
            mix_weights = slim.fully_connected(
                ftr_mean,
                num_towers,
                activation_fn=None,
                weights_initializer=slim.variance_scaling_initializer(),
                scope="mix_weights")
            mix_weights = tf.nn.softmax(mix_weights, axis=-1)
            tf.summary.histogram("mix_weights", mix_weights)

        # Stack tower logits on a new axis 1 and blend them with the
        # per-example mixing weights.
        logits = tf.stack(tower_logits, axis=1)
        final_logit = tf.reduce_sum(
            tf.multiply(logits, tf.expand_dims(mix_weights, axis=-1)),
            axis=1, keepdims=False)
        final_predictions = tf.nn.sigmoid(final_logit)

        print("flag2!!!", FLAGS.final_temperature, FLAGS.final_lambda)
        # KL(final || tower) between temperature-softened class
        # distributions, summed over classes and towers, averaged over the
        # batch.
        rank_pred = tf.expand_dims(
            tf.nn.softmax(tf.div(final_logit, FLAGS.final_temperature),
                          axis=-1),
            axis=1)
        aux_rank_preds = tf.nn.softmax(
            tf.div(logits, FLAGS.final_temperature), axis=-1)
        epsilon = 1e-8
        kl_loss = tf.reduce_sum(
            rank_pred * (tf.log(rank_pred + epsilon) -
                         tf.log(aux_rank_preds + epsilon)),
            axis=-1)
        regularization_loss = FLAGS.final_lambda * tf.reduce_mean(
            tf.reduce_sum(kl_loss, axis=-1), axis=-1)

        final_label_loss = label_loss_fn.calculate_loss(
            final_predictions, labels_batch, label_weights=label_weights)

    label_loss = tf.reduce_sum(
        tf.stack(tower_label_losses)) + final_label_loss
    tf.summary.scalar("label_loss", label_loss)

    reg_loss = tf.reduce_sum(
        tf.stack(tower_reg_losses)) + regularization_loss
    tf.summary.scalar("reg_loss", reg_loss)

    final_loss = label_loss + regularization_penalty * reg_loss

    # The ensemble batch norm registers its moving-average updates only after
    # the per-tower barriers were built, so tie the complete UPDATE_OPS
    # collection to the loss that drives the train op.
    all_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    if all_update_ops:
        with tf.control_dependencies(all_update_ops):
            final_loss = tf.identity(final_loss)

    gradients = optimizer.compute_gradients(
        final_loss, colocate_gradients_with_ops=True)
    if clip_gradient_norm > 0:
        gradients = utils.clip_gradient_norms(gradients, clip_gradient_norm)
    final_train_op = optimizer.apply_gradients(gradients,
                                               global_step=global_step)

    tf.add_to_collection("global_step", global_step)
    tf.add_to_collection("loss", label_loss)
    tf.add_to_collection("predictions", final_predictions)
    tf.add_to_collection("input_batch_raw", model_input_raw)
    tf.add_to_collection("input_batch", model_input)
    tf.add_to_collection("num_frames", num_frames)
    tf.add_to_collection("labels", tf.cast(labels_batch, tf.float32))
    tf.add_to_collection("train_op", final_train_op)
def build_graph(reader,
                input_data_pattern,
                model,
                distill_readers=None,
                distill_data_patterns=None,
                label_loss_fn=losses.CrossEntropyLoss(),
                batch_size=1000,
                transformer_class=feature_transform.DefaultTransformer):
    """Creates the inference graph, optionally fed with distillation labels.

    When `distill_readers` is given, one label batch is read per
    (reader, pattern) pair, the batches are stacked on a new last axis and
    combined with a softmax over the variable "distill_weight"; the result is
    passed to the model as `distillation_predictions`.

    Besides the arguments, this reads `FLAGS.noise_level` and `FLAGS.dropout`.

    Args:
      reader: The data file reader for the main input.
      input_data_pattern: glob path to the input data files.
      model: The model whose `create_model` builds the network; its result
        dict must contain "predictions".
      distill_readers: Optional sequence of readers providing distillation
        labels.
      distill_data_patterns: Data patterns matching `distill_readers` one to
        one. Required when `distill_readers` is given.
      label_loss_fn: Unused here; kept for interface compatibility.
      batch_size: How many examples to process at a time.
      transformer_class: Class whose instance transforms the raw input and
        frame counts before they reach the model.

    Returns:
      None. Tensors are published through the graph collections
      "predictions", "video_id_batch", "input_batch_raw", "input_batch",
      "num_frames", "labels" and, when enabled, "keep_prob" / "noise_level".

    Raises:
      ValueError: If `distill_readers` is given without the same number of
        `distill_data_patterns`.
    """
    video_id, model_input_raw, labels_batch, num_frames = (
        get_input_data_tensors(reader,
                               input_data_pattern,
                               batch_size=batch_size))

    distillation_predictions = None
    if distill_readers is not None:
        if (distill_data_patterns is None or
                len(distill_data_patterns) != len(distill_readers)):
            raise ValueError(
                "distill_data_patterns must provide one pattern per entry "
                "of distill_readers.")

        all_distill_labels = []
        for dreader, dpattern in zip(distill_readers, distill_data_patterns):
            # Only the second tensor returned by the distillation reader is
            # used; the others are discarded.
            _, distill_labels_batch, _, _ = get_input_data_tensors(
                dreader, dpattern, batch_size=batch_size)
            all_distill_labels.append(distill_labels_batch)

        all_distill_labels = tf.stack(all_distill_labels, axis=2)
        distill_weight_var = tf.get_variable("distill_weight",
                                             [len(distill_readers)])
        distill_weight = tf.nn.softmax(distill_weight_var)
        # Weighted sum over the stacked reader axis.
        distillation_predictions = tf.einsum("ijk,k->ij",
                                             all_distill_labels,
                                             distill_weight)

    feature_transformer = transformer_class()
    model_input, num_frames = feature_transformer.transform(
        model_input_raw, num_frames=num_frames)

    with tf.name_scope("model"):
        if FLAGS.noise_level > 0:
            noise_level_tensor = tf.placeholder_with_default(
                0.0, shape=[], name="noise_level")
        else:
            noise_level_tensor = None

        model_kwargs = dict(
            num_frames=num_frames,
            vocab_size=reader.num_classes,
            labels=labels_batch,
            noise_level=noise_level_tensor,
            distillation_predictions=distillation_predictions,
            is_training=False)
        if FLAGS.dropout:
            # Dropout arguments are only passed to models when the flag is set.
            keep_prob_tensor = tf.placeholder_with_default(
                1.0, shape=[], name="keep_prob")
            model_kwargs.update(dropout=FLAGS.dropout,
                                keep_prob=keep_prob_tensor)

        result = model.create_model(model_input, **model_kwargs)

    # Single-argument form prints the same text on Python 2 and Python 3.
    print("result %s" % (result,))

    predictions = result["predictions"]
    tf.add_to_collection("predictions", predictions)
    tf.add_to_collection("video_id_batch", video_id)
    tf.add_to_collection("input_batch_raw", model_input_raw)
    tf.add_to_collection("input_batch", model_input)
    tf.add_to_collection("num_frames", num_frames)
    tf.add_to_collection("labels", tf.cast(labels_batch, tf.float32))
    if FLAGS.dropout:
        tf.add_to_collection("keep_prob", keep_prob_tensor)
    if FLAGS.noise_level > 0:
        tf.add_to_collection("noise_level", noise_level_tensor)
def build_graph(reader,
                model,
                train_data_pattern,
                label_loss_fn=losses.CrossEntropyLoss(),
                batch_size=1000,
                base_learning_rate=0.01,
                learning_rate_decay_examples=1000000,
                learning_rate_decay=0.95,
                optimizer_class=tf.train.AdamOptimizer,
                clip_gradient_norm=1.0,
                regularization_penalty=1,
                num_readers=1,
                num_epochs=None):
    """Creates the Tensorflow graph.

    This will only be called once in the life of a training model, because
    after the graph is created the model will be restored from a meta graph
    file rather than being recreated.

    The label loss is chosen from the keys present in the model's result dict
    and from the distillation flags; optional frame-level, min/max and
    model-supplied losses are added on top. Besides the arguments, this reads
    `FLAGS.distillation_features`, `FLAGS.distillation_type`,
    `FLAGS.distillation_percent`, `FLAGS.norm`, `FLAGS.moe_num_extend`,
    `FLAGS.stride_size` and `FLAGS.gradient`.

    Args:
      reader: The data file reader. It should inherit from BaseReader.
      model: The core model (e.g. logistic or neural net). It should inherit
        from BaseModel.
      train_data_pattern: glob path to the training data files.
      label_loss_fn: What kind of loss to apply to the model. It should
        inherit from BaseLoss.
      batch_size: How many examples to process at a time.
      base_learning_rate: What learning rate to initialize the optimizer with.
      learning_rate_decay_examples: Decay period for the learning rate,
        counted in `global_step * batch_size`.
      learning_rate_decay: Multiplicative decay factor per decay period.
      optimizer_class: Which optimization algorithm to use.
      clip_gradient_norm: Magnitude of the gradient to clip to.
      regularization_penalty: How much weight to give the regularization loss
        compared to the label loss.
      num_readers: How many threads to use for I/O operations.
      num_epochs: How many passes to make over the data. 'None' means an
        unlimited number of passes.
    """
    global_step = tf.Variable(0, trainable=False, name="global_step")

    examples_seen = global_step * batch_size
    learning_rate = tf.train.exponential_decay(base_learning_rate,
                                               examples_seen,
                                               learning_rate_decay_examples,
                                               learning_rate_decay,
                                               staircase=True)
    tf.summary.scalar('learning_rate', learning_rate)

    # The distillation type is only consulted when distillation is enabled.
    distilling = FLAGS.distillation_features
    distill_mode = FLAGS.distillation_type if distilling else None

    batch_tensors = get_input_data_tensors(reader,
                                           train_data_pattern,
                                           batch_size=batch_size,
                                           num_readers=num_readers,
                                           num_epochs=num_epochs)
    if distilling:
        # With distillation features the reader yields a fifth tensor.
        (unused_video_id, model_input_raw, labels_batch, num_frames,
         distill_labels_batch) = batch_tensors
        if distill_mode == 2:
            percent = FLAGS.distillation_percent
            print("distillation_percent =", percent, "reforming labels")
            # Add the distilled labels, rescaled relative to the true label
            # mass, onto the true labels and clip the result into [0, 1].
            truth = tf.cast(labels_batch, dtype=tf.float32)
            truth_mass = tf.reduce_sum(truth, axis=1, keep_dims=True)
            distill_mass = tf.reduce_sum(
                distill_labels_batch, axis=1, keep_dims=True) + 1e-6
            rescale = truth_mass / distill_mass * percent
            distill_labels_batch = tf.clip_by_value(
                truth + distill_labels_batch * rescale,
                clip_value_min=0.0,
                clip_value_max=1.0)
    else:
        unused_video_id, model_input_raw, labels_batch, num_frames = (
            batch_tensors)

    tf.summary.histogram("model/input_raw", model_input_raw)

    feature_dim = len(model_input_raw.get_shape()) - 1
    model_input = (tf.nn.l2_normalize(model_input_raw, feature_dim)
                   if FLAGS.norm else model_input_raw)

    with tf.name_scope("model"):
        create_kwargs = {
            "num_frames": num_frames,
            "vocab_size": reader.num_classes,
            "labels": labels_batch,
        }
        if distill_mode == 0:
            create_kwargs["distill_labels"] = distill_labels_batch
        result = model.create_model(model_input, **create_kwargs)

        for model_var in slim.get_model_variables():
            tf.summary.histogram(model_var.op.name, model_var)

        predictions = result["predictions"]
        predictions_negative = (result["predictions_negative"]
                                if "predictions_negative" in result
                                else 1 - predictions)
        predictions_positive = (result["predictions_positive"]
                                if "predictions_positive" in result
                                else predictions)

        # Rank-3 predictions are flattened to rank 2 together with the labels.
        if predictions.get_shape().ndims == 3:
            predictions = tf.reshape(
                predictions, [-1, predictions.get_shape().as_list()[2]])
            labels_batch = tf.reshape(
                labels_batch, [-1, labels_batch.get_shape().as_list()[2]])

        bottle_neck = (result["bottleneck"]
                       if "bottleneck" in result else tf.constant(0.0))
        predictions_class = (result["predictions_class"]
                             if "predictions_class" in result
                             else predictions)
        predictions_encoder = (result["predictions_encoder"]
                               if "predictions_encoder" in result
                               else predictions)
        predictions_experts = (result["predictions_experts"]
                               if "predictions_experts" in result
                               else predictions)
        predictions_postprocess = (result["predictions_postprocess"]
                                   if "predictions_postprocess" in result
                                   else predictions)
        append_loss = (result["loss"]
                       if "loss" in result else tf.constant(0.0))

        # First matching rule wins; the order of the checks is significant.
        if "predictions_encoder" in result:
            label_loss, float_encoders = label_loss_fn.calculate_loss_mix2(
                predictions, predictions_class, predictions_encoder,
                labels_batch)
            tf.summary.histogram("model/float_encoders", float_encoders)
        elif distill_mode == 1:
            label_loss = label_loss_fn.calculate_loss_distill_boost(
                predictions, distill_labels_batch, labels_batch)
        elif distill_mode == 2:
            label_loss = label_loss_fn.calculate_loss_distill(
                predictions, distill_labels_batch, labels_batch)
        elif distill_mode == 3:
            label_loss = label_loss_fn.calculate_loss_distill_relabel(
                predictions, distill_labels_batch, labels_batch)
        elif "predictions_class" in result:
            label_loss = label_loss_fn.calculate_loss_mix(
                predictions, predictions_class, labels_batch)
        elif "predictions_experts" in result:
            label_loss = label_loss_fn.calculate_loss_max(
                predictions, predictions_experts, labels_batch)
        elif "predictions_postprocess" in result:
            label_loss = label_loss_fn.calculate_loss_postprocess(
                predictions_postprocess, labels_batch)
        elif "predictions_negative" in result:
            label_loss = label_loss_fn.calculate_loss_negative(
                predictions_positive, predictions_negative, labels_batch)
        else:
            label_loss = label_loss_fn.calculate_loss(predictions,
                                                      labels_batch)

        num_classes = reader.num_classes
        if "prediction_frames" in result:
            predictions_frames = result["prediction_frames"]
            # Repeat every example's labels FLAGS.moe_num_extend times and
            # flatten so they line up with the frame-level predictions.
            tiled_labels = tf.tile(
                tf.reshape(labels_batch, [-1, 1, num_classes]),
                [1, FLAGS.moe_num_extend, 1])
            labels_frames = tf.cast(
                tf.reshape(tiled_labels, [-1, num_classes]), tf.float32)
            frame_loss = label_loss_fn.calculate_loss(predictions_frames,
                                                      labels_frames)

            if "prediction_prepare_frames" in result:
                prediction_prepare_frames = result["prediction_prepare_frames"]
                prediction_prepare_video = result["prediction_prepare_video"]
                max_frames = model_input.get_shape().as_list()[1]

                # 1.0 where the summed absolute input over axis 2 is
                # positive, 0.0 elsewhere; sampled every FLAGS.stride_size
                # steps along axis 1.
                frames_sum = tf.reduce_sum(tf.abs(model_input), axis=2)
                frames_true = tf.ones(tf.shape(frames_sum))
                frames_false = tf.zeros(tf.shape(frames_sum))
                frames_bool = tf.where(tf.greater(frames_sum, frames_false),
                                       frames_true, frames_false)
                frames_bool = tf.reshape(
                    frames_bool[:, 0:max_frames:FLAGS.stride_size], [-1, 1])

                tiled_prepare_labels = tf.tile(
                    tf.reshape(labels_batch, [-1, 1, num_classes]),
                    [1, max_frames // FLAGS.stride_size, 1])
                labels_prepare_frames = tf.cast(
                    tf.reshape(tiled_prepare_labels, [-1, num_classes]),
                    tf.float32) * frames_bool
                prediction_prepare_frames = (prediction_prepare_frames *
                                             frames_bool)

                # The label loss is replaced by the two down-weighted
                # "prepare" losses.
                prepare_frames_term = 0.1 * label_loss_fn.calculate_loss(
                    prediction_prepare_frames, labels_prepare_frames)
                prepare_video_term = 0.1 * label_loss_fn.calculate_loss(
                    prediction_prepare_video, labels_batch)
                label_loss = prepare_frames_term + prepare_video_term
            else:
                label_loss = label_loss * 0.0
        elif "prediction_minmax" in result:
            predictions_minmax = result["prediction_minmax"]
            predictions_min = tf.reduce_min(predictions_minmax, axis=1)
            predictions_max = tf.reduce_max(predictions_minmax, axis=1)
            epsilon = 10e-6
            float_labels = tf.cast(labels_batch, tf.float32)
            # Cross entropy that scores positives on the minimum prediction
            # and negatives on the maximum prediction.
            positive_term = float_labels * tf.log(predictions_min + epsilon)
            negative_term = (1 - float_labels) * tf.log(
                1 - predictions_max + epsilon)
            cross_entropy_loss = positive_term + negative_term
            frame_loss = tf.reduce_mean(
                tf.reduce_sum(tf.negative(cross_entropy_loss), 1))
            label_loss = label_loss * 0.0
        else:
            frame_loss = tf.constant(0.0)

        tf.summary.scalar("label_loss", label_loss)

        reg_loss = (result["regularization_loss"]
                    if "regularization_loss" in result
                    else tf.constant(0.0))
        collected_reg_losses = tf.losses.get_regularization_losses()
        if collected_reg_losses:
            reg_loss += tf.add_n(collected_reg_losses)
        if regularization_penalty != 0:
            tf.summary.scalar("reg_loss", reg_loss)

        # Adds update_ops (e.g., moving average updates in batch
        # normalization) as a dependency to the train_op.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        if "update_ops" in result:
            update_ops += result["update_ops"]
        if update_ops:
            with tf.control_dependencies(update_ops):
                barrier = tf.no_op(name="gradient_barrier")
                with tf.control_dependencies([barrier]):
                    label_loss = tf.identity(label_loss)

        # Incorporate the L2 weight penalties etc.
        final_loss = (regularization_penalty * reg_loss + label_loss +
                      frame_loss + append_loss)

        if FLAGS.gradient == "my":
            # Custom gradient path: always Adam, fed by `mygradients`.
            opt = tf.train.AdamOptimizer(learning_rate=learning_rate)
            variables_to_train = tf_variables.trainable_variables()
            top_grads, top_vars = mygradients(final_loss,
                                              variables_to_train,
                                              global_step=global_step,
                                              name="mygradients_net")
            grads_and_vars = list(zip(top_grads, top_vars))
            train_op = opt.apply_gradients(grads_and_vars,
                                           global_step=global_step)
        else:
            optimizer = optimizer_class(learning_rate)
            train_op = slim.learning.create_train_op(
                final_loss,
                optimizer,
                global_step=global_step,
                clip_gradient_norm=clip_gradient_norm)

        tf.add_to_collection("global_step", global_step)
        tf.add_to_collection("loss", final_loss)
        tf.add_to_collection("reg_loss", reg_loss)
        tf.add_to_collection("bottleneck", bottle_neck)
        tf.add_to_collection("predictions", predictions)
        tf.add_to_collection("input_batch_raw", model_input_raw)
        tf.add_to_collection("input_batch", model_input)
        tf.add_to_collection("num_frames", num_frames)
        tf.add_to_collection("labels", tf.cast(labels_batch, tf.float32))
        tf.add_to_collection("train_op", train_op)
def _run_phase(phase, iterator, model, loss_function, logger, device,
               optimizer=None):
    """Run one pass over `iterator` and return (loss, accuracy) from `logger`.

    The pass is a training pass when `optimizer` is given and an evaluation
    pass otherwise. Gradient tracking is enabled only for training and the
    previous global setting is restored afterwards.
    """
    training = optimizer is not None
    # NOTE(review): assumes `model` is a torch.nn.Module, so train(mode)
    # switches mode-dependent layers (dropout, batch norm) -- confirm.
    model.train(training)

    with torch.set_grad_enabled(training):
        for batch_data in tqdm.tqdm(iterator, desc=phase):
            batch_image = torch.from_numpy(batch_data['image']).to(device)
            batch_target = torch.from_numpy(batch_data['target']).to(device)

            if training:
                optimizer.zero_grad()

            batch_output = model(batch_image)
            batch_loss = loss_function(batch_output, batch_target)

            if training:
                batch_loss.sum().backward()
                optimizer.step()

            # Targets and outputs are reduced to class indices via argmax
            # over the last axis before being logged.
            loss_values = batch_loss.detach().cpu().numpy()
            batch_label = np.argmax(batch_target.detach().cpu().numpy(),
                                    axis=-1).flatten()
            batch_pred = np.argmax(batch_output.detach().cpu().numpy(),
                                   axis=-1).flatten()

            logger.add_batch_loss(loss_values, phase=phase)
            logger.add_batch_pred(batch_pred, phase=phase)
            logger.add_batch_label(batch_label, phase=phase)

    return logger.get_loss(phase), logger.get_accuracy(phase)


def main():
    """Train and validate the CIFAR-10 classifier for 100 epochs.

    Each epoch runs one training pass and one validation pass, prints the
    loss and accuracy reported by the logger for each, advances the cosine
    learning-rate schedule and then steps the logger.
    """
    batch_size = 256
    num_classes = 10

    # load dataset
    dataset = loaders.Cifar10Loader('./datasets/CIFAR-10').load()
    train_dataset, valid_dataset = dataset

    # processor
    train_processor = processors.Cifar10ClassificationProcessor(
        batch_size, num_classes=num_classes, enable_augmentation=True,
        image_size=(32, 32))
    valid_processor = processors.Cifar10ClassificationProcessor(
        batch_size, num_classes=num_classes, enable_augmentation=False,
        image_size=(32, 32))

    # iterator
    train_iterator = iterators.MultiprocessIterator(
        train_dataset, train_processor, num_workers=1)
    valid_iterator = iterators.MultiprocessIterator(
        valid_dataset, valid_processor, num_workers=1)

    # device
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    # model
    model = models.RiriverceCifar10Net9(
        input_channels=3, num_classes=num_classes).to(device)

    # loss
    loss_function = losses.CrossEntropyLoss().to(device)

    # optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer,
                                                           T_max=20)

    # logger
    logger = loggers.SimpleLogger()

    # learning
    for epoch in range(100):
        print("-" * 64)
        print(f"[epoch {epoch:>4d}]")

        loss, accuracy = _run_phase('train', train_iterator, model,
                                    loss_function, logger, device,
                                    optimizer=optimizer)
        print(f"loss : {loss}")
        print(f"accuracy : {accuracy}")

        loss, accuracy = _run_phase('valid', valid_iterator, model,
                                    loss_function, logger, device)
        print(f"loss : {loss:.4f}")
        print(f"accuracy : {accuracy:.4f}")

        # Without this call the schedule never takes effect and the learning
        # rate stays at its initial value for the whole run.
        scheduler.step()
        logger.step()
def evaluate():
    """Builds the evaluation graph and runs the evaluation loop.

    Reads "model_flags.json" from `FLAGS.train_dir` to choose the reader,
    creates `FLAGS.ensemble_num` (at least one) NetVLADModelLF models, builds
    the graph and then calls `evaluation_loop` repeatedly, stopping after the
    first round when `FLAGS.run_once` is set.

    Raises:
      IOError: If the flags file is missing from `FLAGS.train_dir`, or if
        `FLAGS.eval_data_pattern` is empty.
    """
    tf.set_random_seed(0)  # for reproducibility

    # Read the json of flags expected in the training directory.
    model_flags_path = os.path.join(FLAGS.train_dir, "model_flags.json")
    if not file_io.file_exists(model_flags_path):
        raise IOError(("Cannot find file %s. Did you run train.py on the same "
                       "--train_dir?") % model_flags_path)
    # The context manager closes the handle once the flags are parsed.
    with file_io.FileIO(model_flags_path, "r") as flags_file:
        flags_dict = json.loads(flags_file.read())

    with tf.Graph().as_default():
        # convert feature_names and feature_sizes to lists of values
        feature_names, feature_sizes = utils.GetListOfFeatureNamesAndSizes(
            flags_dict["feature_names"], flags_dict["feature_sizes"])

        if flags_dict["frame_features"]:
            reader = readers.YT8MFrameFeatureReader(
                feature_names=feature_names, feature_sizes=feature_sizes)
        else:
            reader = readers.YT8MAggregatedFeatureReader(
                feature_names=feature_names, feature_sizes=feature_sizes)

        # One model is always created; an ensemble adds further copies.
        model_lst = [frame_level_models.NetVLADModelLF()
                     for _ in range(max(1, FLAGS.ensemble_num))]
        label_loss_fn = losses.CrossEntropyLoss()

        # Compare by value: `is ""` tests object identity and only works by
        # accident of string interning.
        if not FLAGS.eval_data_pattern:
            raise IOError("'eval_data_pattern' was not specified. " +
                          "Nothing to evaluate.")

        video_id_batch = build_graph(
            reader=reader,
            model=model_lst,
            eval_data_pattern=FLAGS.eval_data_pattern,
            label_loss_fn=label_loss_fn,
            num_readers=FLAGS.num_readers,
            batch_size=FLAGS.batch_size)
        logging.info("built evaluation graph")

        # Unless an output model name is forced, take the video ids from the
        # graph collection instead of build_graph's return value.
        if not FLAGS.force_output_model_name:
            video_id_batch = tf.get_collection("video_id_batch")[0]
        prediction_batch = tf.get_collection("predictions")[0]
        label_batch = tf.get_collection("labels")[0]
        loss = tf.get_collection("loss")[0]
        summary_op = tf.get_collection("summary_op")[0]

        saver = tf.train.Saver(tf.global_variables())
        summary_writer = tf.summary.FileWriter(
            FLAGS.train_dir, graph=tf.get_default_graph())

        evl_metrics = eval_util.EvaluationMetrics(reader.num_classes,
                                                  FLAGS.top_k)

        last_global_step_val = -1
        while True:
            last_global_step_val = evaluation_loop(
                video_id_batch, prediction_batch, label_batch, loss,
                summary_op, saver, summary_writer, evl_metrics,
                last_global_step_val)
            if FLAGS.run_once:
                break
def build_graph(reader,
                model,
                train_data_pattern,
                label_loss_fn=losses.CrossEntropyLoss(),
                batch_size=1000,
                base_learning_rate=0.01,
                learning_rate_decay_examples=1000000,
                learning_rate_decay=0.95,
                optimizer_class=tf.train.AdamOptimizer,
                clip_gradient_norm=1.0,
                regularization_penalty=1,
                num_readers=1,
                num_epochs=None):
    """Creates the multi-tower training graph with shared model variables.

    A batch of `batch_size * num_towers` examples is read, L2-normalized
    along its last axis and split evenly across the towers (one per local
    GPU, or a single CPU tower). Every tower builds the model in the shared
    variable scope "tower", computes its own gradients, and the combined
    gradients are optionally clipped and applied once.

    Args:
      reader: The data file reader.
      model: The model whose `create_model` builds each tower; its result
        dict must contain "predictions".
      train_data_pattern: glob path to the training data files.
      label_loss_fn: Loss used when the model result carries no "loss".
      batch_size: How many examples each tower processes at a time.
      base_learning_rate: What learning rate to initialize the optimizer with.
      learning_rate_decay_examples: Decay period for the learning rate,
        counted in `global_step * batch_size * num_towers`.
      learning_rate_decay: Multiplicative decay factor per decay period.
      optimizer_class: Which optimization algorithm to use.
      clip_gradient_norm: Magnitude of the gradient to clip to; values <= 0
        disable clipping.
      regularization_penalty: How much weight to give the regularization loss
        compared to the label loss.
      num_readers: How many threads to use for I/O operations.
      num_epochs: How many passes to make over the data. 'None' means an
        unlimited number of passes.

    Returns:
      None. Results are published through the graph collections
      "global_step", "loss", "predictions", "input_batch_raw", "input_batch",
      "num_frames", "labels" and "train_op".
    """
    global_step = tf.Variable(0, trainable=False, name="global_step")

    gpus = []
    for device_proto in device_lib.list_local_devices():
        if device_proto.device_type == 'GPU':
            gpus.append(device_proto.name)
    num_gpus = len(gpus)

    if num_gpus > 0:
        logging.info("Using the following GPUs to train: " + str(gpus))
        num_towers, device_string = num_gpus, '/gpu:%d'
    else:
        logging.info("No GPUs found. Training on CPU.")
        num_towers, device_string = 1, '/cpu:%d'

    examples_per_step = batch_size * num_towers
    learning_rate = tf.train.exponential_decay(
        base_learning_rate,
        global_step * batch_size * num_towers,
        learning_rate_decay_examples,
        learning_rate_decay,
        staircase=True)
    tf.summary.scalar('learning_rate', learning_rate)

    optimizer = optimizer_class(learning_rate)
    unused_video_id, model_input_raw, labels_batch, num_frames = (
        get_input_data_tensors(reader,
                               train_data_pattern,
                               batch_size=examples_per_step,
                               num_readers=num_readers,
                               num_epochs=num_epochs))
    tf.summary.histogram("model/input_raw", model_input_raw)

    # Normalize along the last axis of the input.
    last_axis = len(model_input_raw.get_shape()) - 1
    model_input = tf.nn.l2_normalize(model_input_raw, last_axis)

    tower_inputs = tf.split(model_input, num_towers)
    tower_labels = tf.split(labels_batch, num_towers)
    tower_num_frames = tf.split(num_frames, num_towers)

    tower_gradients = []
    tower_predictions = []
    tower_label_losses = []
    tower_reg_losses = []

    # Model variables are pinned to the single GPU when exactly one GPU is
    # detected, otherwise to the CPU.
    variable_device = "/gpu:0" if num_gpus == 1 else "/cpu:0"

    tower_slices = zip(tower_inputs, tower_labels, tower_num_frames)
    for i, (inputs_i, labels_i, num_frames_i) in enumerate(tower_slices):
        # For some reason these 'with' statements can't be combined onto the
        # same line. They have to be nested.
        with tf.device(device_string % i):
            # Tower 0 creates the variables; later towers reuse them.
            with tf.variable_scope("tower", reuse=(i > 0) or None):
                with slim.arg_scope([slim.model_variable, slim.variable],
                                    device=variable_device):
                    result = model.create_model(
                        inputs_i,
                        num_frames=num_frames_i,
                        vocab_size=reader.num_classes,
                        labels=labels_i)
                    for model_var in slim.get_model_variables():
                        tf.summary.histogram(model_var.op.name, model_var)

                    predictions = result["predictions"]
                    tower_predictions.append(predictions)

                    if "loss" in result:
                        label_loss = result["loss"]
                    else:
                        label_loss = label_loss_fn.calculate_loss(
                            predictions, labels_i)

                    if "regularization_loss" in result:
                        reg_loss = result["regularization_loss"]
                    else:
                        reg_loss = tf.constant(0.0)

                    collected_reg_losses = (
                        tf.losses.get_regularization_losses())
                    if collected_reg_losses:
                        reg_loss += tf.add_n(collected_reg_losses)

                    tower_reg_losses.append(reg_loss)

                    # Adds update_ops (e.g., moving average updates in batch
                    # normalization) as a dependency to the train_op.
                    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
                    if "update_ops" in result:
                        update_ops.extend(result["update_ops"])
                    if update_ops:
                        with tf.control_dependencies(update_ops):
                            barrier = tf.no_op(name="gradient_barrier")
                            with tf.control_dependencies([barrier]):
                                label_loss = tf.identity(label_loss)

                    tower_label_losses.append(label_loss)

                    # Incorporate the L2 weight penalties etc.
                    final_loss = (regularization_penalty * reg_loss +
                                  label_loss)
                    tower_gradients.append(
                        optimizer.compute_gradients(
                            final_loss, colocate_gradients_with_ops=False))

    label_loss = tf.reduce_mean(tf.stack(tower_label_losses))
    tf.summary.scalar("label_loss", label_loss)
    if regularization_penalty != 0:
        reg_loss = tf.reduce_mean(tf.stack(tower_reg_losses))
        tf.summary.scalar("reg_loss", reg_loss)

    merged_gradients = utils.combine_gradients(tower_gradients)

    if clip_gradient_norm > 0:
        with tf.name_scope('clip_grads'):
            merged_gradients = utils.clip_gradient_norms(
                merged_gradients, clip_gradient_norm)

    train_op = optimizer.apply_gradients(merged_gradients,
                                         global_step=global_step)

    tf.add_to_collection("global_step", global_step)
    tf.add_to_collection("loss", label_loss)
    tf.add_to_collection("predictions", tf.concat(tower_predictions, 0))
    tf.add_to_collection("input_batch_raw", model_input_raw)
    tf.add_to_collection("input_batch", model_input)
    tf.add_to_collection("num_frames", num_frames)
    tf.add_to_collection("labels", tf.cast(labels_batch, tf.float32))
    tf.add_to_collection("train_op", train_op)
def build_graph(all_readers,
                all_train_data_patterns,
                input_reader,
                input_data_pattern,
                model,
                label_loss_fn=losses.CrossEntropyLoss(),
                batch_size=256,
                base_learning_rate=0.01,
                learning_rate_decay_examples=1000000,
                learning_rate_decay=0.95,
                optimizer_class=tf.train.AdamOptimizer,
                clip_gradient_norm=1.0,
                regularization_penalty=1,
                num_epochs=None):
  """Creates the Tensorflow graph.

  This will only be called once in the life of
  a training model, because after the graph is created the model will be
  restored from a meta graph file rather than being recreated.

  One input batch is read per (reader, pattern) pair; the batches are stacked
  along a new axis 2 and fed to the model together. Labels are taken from the
  first pair only.

  Args:
    all_readers: The data file readers. Every element in it should inherit
                 from BaseReader. The `num_classes` of the last reader is
                 used as the vocabulary size.
    all_train_data_patterns: glob paths to the training data files, paired
                             positionally with `all_readers`.
    input_reader: Reader for the optional extra input stream; only used when
                  `input_data_pattern` is not None.
    input_data_pattern: glob path for the extra input stream, or None. When
                        given, its features are passed to the model as
                        `original_input` and its video ids are compared with
                        each reader's ids in the "model/id_match" summary.
    model: The core model (e.g. logistic or neural net). It should inherit
           from BaseModel.
    label_loss_fn: What kind of loss to apply to the model. It should inherit
                from BaseLoss.
    batch_size: How many examples to process at a time.
    base_learning_rate: What learning rate to initialize the optimizer with.
    learning_rate_decay_examples: Passed to `tf.train.exponential_decay` as
                                  the decay step count (in examples).
    learning_rate_decay: Passed to `tf.train.exponential_decay` as the decay
                         rate.
    optimizer_class: Which optimization algorithm to use.
    clip_gradient_norm: Magnitude of the gradient to clip to.
    regularization_penalty: How much weight to give the regularization loss
                            compared to the label loss.
    num_epochs: How many passes to make over the data. 'None' means an
                unlimited number of passes.

  Raises:
    ValueError: If there is no (reader, pattern) pair to read from.
  """
  # Materialize the pairs once so that emptiness can be checked up front and
  # the reader used for `vocab_size` below is well defined.
  reader_pattern_pairs = list(zip(all_readers, all_train_data_patterns))
  if not reader_pattern_pairs:
    raise ValueError(
        "build_graph needs at least one (reader, data pattern) pair.")

  global_step = tf.Variable(0, trainable=False, name="global_step")

  learning_rate = tf.train.exponential_decay(
      base_learning_rate,
      global_step * batch_size,
      learning_rate_decay_examples,
      learning_rate_decay,
      staircase=True)
  tf.summary.scalar('learning_rate', learning_rate)

  original_input = None
  if input_data_pattern is not None:
    original_video_id, original_input, unused_labels_batch, unused_num_frames = (
        get_input_data_tensors(
            input_reader,
            input_data_pattern,
            batch_size=batch_size,
            num_epochs=num_epochs))

  optimizer = optimizer_class(learning_rate)

  model_input_raw_tensors = []
  labels_batch_tensor = None
  for reader, data_pattern in reader_pattern_pairs:
    video_id, model_input_raw, labels_batch, unused_num_frames = (
        get_input_data_tensors(
            reader,
            data_pattern,
            batch_size=batch_size,
            num_epochs=num_epochs))
    # Only the first reader's labels are kept.
    if labels_batch_tensor is None:
      labels_batch_tensor = labels_batch
    model_input_raw_tensors.append(tf.expand_dims(model_input_raw, axis=2))

    if original_input is not None:
      # Fraction of examples whose id agrees with the extra input stream.
      id_match = tf.ones_like(original_video_id, dtype=tf.float32)
      id_match = id_match * tf.cast(
          tf.equal(original_video_id, video_id), dtype=tf.float32)
      tf.summary.scalar("model/id_match", tf.reduce_mean(id_match))

  model_input = tf.concat(model_input_raw_tensors, axis=2)
  labels_batch = labels_batch_tensor
  tf.summary.histogram("model/input", model_input)

  with tf.name_scope("model"):
    if FLAGS.noise_level > 0:
      noise_level_tensor = tf.placeholder_with_default(
          0.0, shape=[], name="noise_level")
    else:
      noise_level_tensor = None

    # `reader` is the last reader of the loop above.
    model_kwargs = dict(
        labels=labels_batch,
        vocab_size=reader.num_classes,
        original_input=original_input,
        noise_level=noise_level_tensor)
    if FLAGS.dropout:
      keep_prob_tensor = tf.placeholder_with_default(
          1.0, shape=[], name="keep_prob")
      # The dropout arguments are only passed when dropout is enabled.
      model_kwargs.update(dropout=FLAGS.dropout, keep_prob=keep_prob_tensor)
    result = model.create_model(model_input, **model_kwargs)

    for variable in slim.get_model_variables():
      tf.summary.histogram(variable.op.name, variable)

    predictions = result["predictions"]
    if "loss" in result:
      label_loss = result["loss"]
    else:
      # `video_id` comes from the last reader of the loop above.
      if FLAGS.reweight:
        video_weights_batch = get_video_weights(video_id)
      else:
        video_weights_batch = None

      # The prints below use a single pre-formatted argument so that they
      # emit the same text under both Python 2 and Python 3.
      if FLAGS.multitask:
        print("using multitask loss")
        support_predictions = result["support_predictions"]
        tf.summary.histogram("model/support_predictions",
                             support_predictions)
        print("support_predictions %s" % (support_predictions,))
        label_loss = label_loss_fn.calculate_loss(
            predictions, support_predictions, labels_batch,
            weights=video_weights_batch)
      else:
        print("using original loss")
        label_loss = label_loss_fn.calculate_loss(
            predictions, labels_batch, weights=video_weights_batch)

    tf.summary.histogram("model/predictions", predictions)
    tf.summary.scalar("label_loss", label_loss)

    if "regularization_loss" in result:
      reg_loss = result["regularization_loss"]
    else:
      reg_loss = tf.constant(0.0)

    reg_losses = tf.losses.get_regularization_losses()
    if reg_losses:
      reg_loss += tf.add_n(reg_losses)

    if regularization_penalty != 0:
      tf.summary.scalar("reg_loss", reg_loss)

    # Adds update_ops (e.g., moving average updates in batch normalization) as
    # a dependency to the train_op.
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    if "update_ops" in result:
      update_ops += result["update_ops"]
    if update_ops:
      with tf.control_dependencies(update_ops):
        barrier = tf.no_op(name="gradient_barrier")
        with tf.control_dependencies([barrier]):
          label_loss = tf.identity(label_loss)

    # Incorporate the L2 weight penalties etc.
    final_loss = regularization_penalty * reg_loss + label_loss

    if FLAGS.training:
      gradients = optimizer.compute_gradients(
          final_loss, colocate_gradients_with_ops=False)
      if clip_gradient_norm > 0:
        with tf.name_scope('clip_grads'):
          gradients = utils.clip_gradient_norms(gradients, clip_gradient_norm)
      train_op = optimizer.apply_gradients(gradients, global_step=global_step)
    else:
      # Nothing is optimized outside training; expose a no-op as "train_op".
      train_op = tf.no_op()

    tf.add_to_collection("global_step", global_step)
    tf.add_to_collection("loss", label_loss)
    tf.add_to_collection("predictions", predictions)
    # Both input collections receive the concatenated model input.
    tf.add_to_collection("input_batch_raw", model_input)
    tf.add_to_collection("input_batch", model_input)
    tf.add_to_collection("labels", tf.cast(labels_batch, tf.float32))
    tf.add_to_collection("train_op", train_op)
    if FLAGS.dropout:
      tf.add_to_collection("keep_prob", keep_prob_tensor)
    if FLAGS.noise_level > 0:
      tf.add_to_collection("noise_level", noise_level_tensor)
def build_graph(refiner_model,
                discriminator_model,
                train_data_reader,
                train_data_pattern,
                true_label_reader,
                true_label_pattern,
                refiner_loss_fn=losses.CrossEntropyLoss(),
                similarity_loss_fn=losses.MeanSquareErrorLoss(),
                discriminator_loss_fn=losses.CrossEntropyLoss(),
                batch_size=4,
                base_learning_rate=0.01,
                learning_rate_decay_examples=4000,
                learning_rate_decay=0.99,
                optimizer_class=tf.train.AdamOptimizer,
                clip_gradient_norm=1.0,
                prediction_threshold=0.5,
                regularization_penalty=1,
                num_readers=2,
                num_epochs=None):
  """Creates the Tensorflow graph.

  This will only be called once in the life of
  a training model, because after the graph is created the model will be
  restored from a meta graph file rather than being recreated.

  Three objectives are handed to `optimizing`: the refiner loss ("refiner"),
  the discriminator loss ("discriminator"), and the refiner loss minus the
  discriminator loss ("refiner2").

  Args:
    refiner_model: Model whose `create_model` result supplies the refined
                   "predictions" for the input images.
    discriminator_model: Model whose `create_model` result classifies
                         320x320 patches cut from the refiner predictions
                         (label 0) and from the true masks (label 1).
    train_data_reader: Reader yielding (image_id, image_data, image_mask).
    train_data_pattern: glob path to the training data files.
    true_label_reader: Reader whose third output is used as the true mask.
    true_label_pattern: glob path to the true mask files.
    refiner_loss_fn: Loss between the predictions and `image_mask`.
    similarity_loss_fn: Loss between the predictions and the input image
                        binarized at > 127.
    discriminator_loss_fn: Loss between the discriminator predictions and
                           the patch labels.
    batch_size: How many examples to process at a time.
    base_learning_rate: What learning rate to initialize the optimizer with.
    learning_rate_decay_examples: Passed to `tf.train.exponential_decay` as
                                  the decay step count (in examples).
    learning_rate_decay: Passed to `tf.train.exponential_decay` as the decay
                         rate.
    optimizer_class: Which optimization algorithm to use.
    clip_gradient_norm: Magnitude of the gradient to clip to.
    prediction_threshold: Predictions above this value count as positive.
    regularization_penalty: How much weight to give the regularization loss
                            compared to the label loss.
    num_readers: How many threads to use for I/O operations.
    num_epochs: How many passes to make over the data. 'None' means an
                unlimited number of passes.
  """
  global_step = tf.Variable(0, trainable=False, name="global_step")

  # With gradient accumulation one optimizer step covers several batches.
  if FLAGS.accumulate_gradients:
    actual_batch_size = batch_size * FLAGS.apply_every_n_batches
  else:
    actual_batch_size = batch_size

  learning_rate = tf.train.exponential_decay(
      base_learning_rate,
      global_step * actual_batch_size,
      learning_rate_decay_examples,
      learning_rate_decay,
      staircase=True)
  tf.summary.scalar('learning_rate', learning_rate)

  optimizer = optimizer_class(learning_rate)

  image_id, image_data, image_mask = (
      get_input_data_tensors(
          train_data_reader,
          train_data_pattern,
          batch_size=batch_size,
          num_readers=num_readers,
          num_epochs=num_epochs))

  _, _, true_mask = (
      get_input_data_tensors(
          true_label_reader,
          true_label_pattern,
          batch_size=batch_size,
          num_readers=num_readers,
          num_epochs=num_epochs))

  model_input = image_data
  tf.summary.histogram("model/input", model_input)

  with tf.name_scope("refiner_model"):
    result = refiner_model.create_model(
        model_input,
        scope="refiner_model",
        l2_penalty=FLAGS.l2_penalty)
    # Single pre-formatted argument: same output on Python 2 and Python 3.
    print("result %s" % (result,))

    for variable in slim.get_model_variables():
      tf.summary.histogram(variable.op.name, variable)

    predictions = result["predictions"]
    if "loss" in result:
      label_loss = result["loss"]
    else:
      refiner_loss = refiner_loss_fn.calculate_loss(predictions, image_mask)
      image_input = tf.cast(tf.squeeze(image_data, axis=3) > 127, tf.int32)
      similarity_loss = similarity_loss_fn.calculate_loss(predictions,
                                                          image_input)
      tf.summary.scalar("refiner_loss", refiner_loss)
      tf.summary.scalar("similarity_loss", similarity_loss)
      label_loss = refiner_loss + similarity_loss
      tf.summary.scalar("label_loss", label_loss)
      tf.add_to_collection("refiner_loss", refiner_loss)
      tf.add_to_collection("similarity_loss", similarity_loss)

    tf.summary.histogram("model/predictions", predictions)
    # NOTE(review): "label_loss" is summarized a second time here when the
    # branch above computed it; kept so existing summary tags do not change.
    tf.summary.scalar("label_loss", label_loss)

    if "regularization_loss" in result:
      reg_loss = result["regularization_loss"]
    else:
      reg_loss = tf.constant(0.0)

    reg_losses = tf.losses.get_regularization_losses(scope="refiner_model")
    if reg_losses:
      reg_loss += tf.add_n(reg_losses)

    if regularization_penalty != 0:
      tf.summary.scalar("reg_loss", reg_loss)

    # Adds update_ops (e.g., moving average updates in batch normalization) as
    # a dependency to the train_op.
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS,
                                   scope="refiner_model")
    if "update_ops" in result:
      update_ops += result["update_ops"]
    # Fall back to the plain loss so that a model without update ops does not
    # leave `bar_label_loss` unbound.
    bar_label_loss = label_loss
    if update_ops:
      with tf.control_dependencies(update_ops):
        barrier = tf.no_op(name="gradient_barrier")
        with tf.control_dependencies([barrier]):
          bar_label_loss = tf.identity(label_loss)

    # Incorporate the L2 weight penalties etc.
    final_loss = regularization_penalty * reg_loss + bar_label_loss

    optimizing(optimizer, final_loss, clip_gradient_norm, global_step,
               prefix="refiner", scope="refiner_model")

    labels = tf.cast(image_mask, tf.int32)
    float_labels = tf.cast(image_mask, tf.float32)
    bool_predictions = tf.greater(predictions, prediction_threshold)

    true_pos = tf.cast(
        tf.reduce_sum(
            tf.cast(labels > 0, tf.int32) *
            tf.cast(predictions > prediction_threshold, tf.int32)),
        tf.float32)
    false_pos = tf.cast(
        tf.reduce_sum(
            tf.cast(labels <= 0, tf.int32) *
            tf.cast(predictions > prediction_threshold, tf.int32)),
        tf.float32)
    false_neg = tf.cast(
        tf.reduce_sum(
            tf.cast(labels > 0, tf.int32) *
            tf.cast(predictions <= prediction_threshold, tf.int32)),
        tf.float32)
    # NOTE: 2*TP / (2*TP + FP + FN) is the Dice (F1) score rather than
    # intersection-over-union; the name is kept because consumers look up the
    # "mean_iou" collection. The 1e-7 terms guard against division by zero.
    mean_iou = (2.0 * true_pos + 1e-7) / (
        2.0 * true_pos + false_pos + false_neg + 1e-7)
    print(mean_iou)

    num_examples = tf.shape(labels)[0]

    tf.add_to_collection("global_step", global_step)
    tf.add_to_collection("loss", label_loss)
    tf.add_to_collection("id_batch", image_id)
    tf.add_to_collection("predictions", predictions)
    tf.add_to_collection("model_input", model_input)
    tf.add_to_collection("num_examples", num_examples)
    tf.add_to_collection("labels", labels)
    tf.add_to_collection("float_labels", float_labels)
    tf.add_to_collection("bool_predictions", bool_predictions)
    tf.add_to_collection("mean_iou", mean_iou)

  def split_into_small_patches(masks, label_value):
    """Cuts masks into 320x320 patches (stride 160) with constant labels.

    Args:
      masks: Mask batch without a channel axis; a trailing axis is added and
             axis 2 is zero-padded by one element on each side.
      label_value: 0 gives all-zero labels, anything else all-one labels.

    Returns:
      A (patches, labels) pair: patches reshaped to [-1, 320, 320, 1] and
      labels of shape [number of patches, 1].
    """
    masks = tf.expand_dims(masks, axis=3)
    masks = tf.pad(masks, paddings=[[0, 0], [0, 0], [1, 1], [0, 0]])
    PATCH_SIZE = [1, 320, 320, 1]
    HALF_PATCH_SIZE = [1, 160, 160, 1]
    patches = tf.extract_image_patches(masks, PATCH_SIZE, HALF_PATCH_SIZE,
                                       [1, 1, 1, 1], "VALID")
    patches = tf.reshape(patches, [-1, 320, 320, 1])
    if label_value == 0:
      labels = tf.zeros([tf.shape(patches)[0], 1])
    else:
      labels = tf.ones([tf.shape(patches)[0], 1])
    return patches, labels

  with tf.name_scope("discriminator_model"):
    # Refiner output is the "fake" class (0), true masks the "real" class (1).
    p_patches, p_labels = split_into_small_patches(predictions, 0)
    t_patches, t_labels = split_into_small_patches(
        tf.cast(true_mask, tf.float32), 1)
    disc_batch = tf.concat([p_patches, t_patches], axis=0)
    disc_labels = tf.concat([p_labels, t_labels], axis=0)
    print("disc_batch %s" % (disc_batch,))
    print("disc_labels %s" % (disc_labels,))

    disc_result = discriminator_model.create_model(
        disc_batch,
        scope="discriminator_model",
        l2_penalty=FLAGS.l2_penalty)
    print("disc_result %s" % (disc_result,))

    for variable in slim.get_model_variables():
      tf.summary.histogram(variable.op.name, variable)

    disc_predictions = disc_result["predictions"]
    if "loss" in disc_result:
      disc_label_loss = disc_result["loss"]
    else:
      disc_label_loss = discriminator_loss_fn.calculate_loss(
          disc_predictions, disc_labels) * 20000
    tf.summary.scalar("discriminator_loss", disc_label_loss)
    tf.summary.histogram("model/disc_predictions", disc_predictions)

    if "regularization_loss" in disc_result:
      # Read the discriminator's own regularization loss (the refiner's
      # `result` was consulted here before, which was a copy-paste slip).
      disc_reg_loss = disc_result["regularization_loss"]
    else:
      disc_reg_loss = tf.constant(0.0)

    disc_reg_losses = tf.losses.get_regularization_losses(
        scope="discriminator_model")
    if disc_reg_losses:
      disc_reg_loss += tf.add_n(disc_reg_losses)

    if regularization_penalty != 0:
      tf.summary.scalar("disc_reg_loss", disc_reg_loss)

    # Adds update_ops (e.g., moving average updates in batch normalization) as
    # a dependency to the train_op.
    disc_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS,
                                        scope="discriminator_model")
    if "update_ops" in disc_result:
      disc_update_ops += disc_result["update_ops"]
    # Same fallback as for the refiner: keep the name bound without update ops.
    bar_disc_label_loss = disc_label_loss
    if disc_update_ops:
      with tf.control_dependencies(disc_update_ops):
        disc_barrier = tf.no_op(name="disc_gradient_barrier")
        with tf.control_dependencies([disc_barrier]):
          bar_disc_label_loss = tf.identity(disc_label_loss)

    # Incorporate the L2 weight penalties etc.
    disc_final_loss = (regularization_penalty * disc_reg_loss +
                       bar_disc_label_loss)
    tf.add_to_collection("discriminator_loss", disc_label_loss)

    optimizing(optimizer, disc_final_loss, clip_gradient_norm, global_step,
               prefix="discriminator", scope="discriminator_model")

    # refiner 2: the refiner objective with the discriminator loss subtracted.
    refiner2_label_loss = label_loss - disc_label_loss

    # Adds update_ops (e.g., moving average updates in batch normalization) as
    # a dependency to the train_op.
    refiner2_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS,
                                            scope="refiner_model")
    if "update_ops" in result:
      refiner2_update_ops += result["update_ops"]
    bar_refiner2_label_loss = refiner2_label_loss
    if refiner2_update_ops:
      with tf.control_dependencies(refiner2_update_ops):
        refiner2_barrier = tf.no_op(name="gradient_barrier")
        with tf.control_dependencies([refiner2_barrier]):
          bar_refiner2_label_loss = tf.identity(refiner2_label_loss)

    refiner2_final_loss = (regularization_penalty * reg_loss +
                           bar_refiner2_label_loss)

    optimizing(optimizer, refiner2_final_loss, clip_gradient_norm,
               global_step, prefix="refiner2", scope="refiner_model")

    tf.add_to_collection("refiner2_loss", refiner2_label_loss)
def build_graph(reader,
                model,
                train_data_pattern,
                label_loss_fn=losses.CrossEntropyLoss(),
                batch_size=1000,
                base_learning_rate=0.01,
                learning_rate_decay_examples=1000000,
                learning_rate_decay=0.95,
                optimizer_class=tf.train.AdamOptimizer,
                clip_gradient_norm=1.0,
                regularization_penalty=1,
                num_readers=1,
                num_epochs=None):
  """Builds the multi-tower training graph.

  The graph is built a single time per training run; afterwards the model is
  restored from a meta graph file instead of being rebuilt.

  One tower is created per local GPU (at most `FLAGS.num_gpu` of them), or a
  single CPU tower when no GPU is listed. A batch of
  `batch_size * num_towers` examples is read and split evenly over the towers;
  the per-tower gradients are merged with `utils.combine_gradients`.

  Args:
    reader: The data file reader. It should inherit from BaseReader.
    model: The core model (e.g. logistic or neural net). It should inherit
           from BaseModel.
    train_data_pattern: glob path to the training data files.
    label_loss_fn: What kind of loss to apply to the model. It should inherit
                from BaseLoss.
    batch_size: How many examples each tower processes at a time.
    base_learning_rate: What learning rate to initialize the optimizer with.
    learning_rate_decay_examples: Decay step count (in examples) handed to
                                  `tf.train.exponential_decay`.
    learning_rate_decay: Decay rate handed to `tf.train.exponential_decay`.
    optimizer_class: Which optimization algorithm to use.
    clip_gradient_norm: Magnitude of the gradient to clip to.
    regularization_penalty: How much weight to give the regularization loss
                            compared to the label loss.
    num_readers: How many threads to use for I/O operations.
    num_epochs: How many passes to make over the data. 'None' means an
                unlimited number of passes.
  """
  global_step = tf.Variable(0, trainable=False, name="global_step")

  gpus = [
      device.name
      for device in device_lib.list_local_devices()
      if device.device_type == 'GPU'
  ][:FLAGS.num_gpu]
  num_gpus = len(gpus)

  if not gpus:
    logging.info("No GPUs found. Training on CPU.")
    num_towers, device_string = 1, '/cpu:%d'
  else:
    logging.info("Using the following GPUs to train: " + str(gpus))
    num_towers, device_string = num_gpus, '/gpu:%d'

  examples_seen = global_step * batch_size * num_towers
  learning_rate = tf.train.exponential_decay(
      base_learning_rate,
      examples_seen,
      learning_rate_decay_examples,
      learning_rate_decay,
      staircase=True)
  tf.summary.scalar('learning_rate', learning_rate)

  optimizer = optimizer_class(learning_rate)

  unused_video_id, model_input_raw, labels_batch, num_frames = (
      get_input_data_tensors(
          reader,
          train_data_pattern,
          batch_size=batch_size * num_towers,
          num_readers=num_readers,
          num_epochs=num_epochs))
  tf.summary.histogram("model/input_raw", model_input_raw)

  # Normalize along the last axis of the raw input.
  feature_dim = len(model_input_raw.get_shape()) - 1
  model_input = tf.nn.l2_normalize(model_input_raw, feature_dim)

  tower_inputs, tower_labels, tower_num_frames = [
      tf.split(batch, num_towers)
      for batch in (model_input, labels_batch, num_frames)
  ]
  tower_gradients, tower_predictions = [], []
  tower_label_losses, tower_reg_losses = [], []

  # With exactly one GPU the variables live on it; otherwise on the CPU.
  variable_device = "/gpu:0" if num_gpus == 1 else "/cpu:0"

  for tower_index in range(num_towers):
    # For some reason these 'with' statements can't be combined onto the same
    # line. They have to be nested.
    with tf.device(device_string % tower_index):
      # Every tower after the first reuses the first tower's variables.
      with tf.variable_scope("tower", reuse=(tower_index > 0) or None):
        with slim.arg_scope([slim.model_variable, slim.variable],
                            device=variable_device):
          tower_result = model.create_model(
              tower_inputs[tower_index],
              num_frames=tower_num_frames[tower_index],
              vocab_size=reader.num_classes,
              labels=tower_labels[tower_index])
          for model_variable in slim.get_model_variables():
            tf.summary.histogram(model_variable.op.name, model_variable)

          predictions = tower_result["predictions"]
          tower_predictions.append(predictions)

          # Prefer a loss supplied by the model over the generic label loss.
          if "loss" not in tower_result:
            label_loss = label_loss_fn.calculate_loss(
                predictions, tower_labels[tower_index])
          else:
            label_loss = tower_result["loss"]

          if "regularization_loss" not in tower_result:
            reg_loss = tf.constant(0.0)
          else:
            reg_loss = tower_result["regularization_loss"]

          collected_reg_losses = tf.losses.get_regularization_losses()
          if collected_reg_losses:
            reg_loss += tf.add_n(collected_reg_losses)
          tower_reg_losses.append(reg_loss)

          # Make update ops (e.g. batch-norm moving averages) run before the
          # loss is evaluated, and hence before the train_op.
          update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
          if "update_ops" in tower_result:
            update_ops.extend(tower_result["update_ops"])
          if update_ops:
            with tf.control_dependencies(update_ops):
              barrier = tf.no_op(name="gradient_barrier")
              with tf.control_dependencies([barrier]):
                label_loss = tf.identity(label_loss)
          tower_label_losses.append(label_loss)

          # Add the weighted regularization term (L2 penalties etc.).
          final_loss = regularization_penalty * reg_loss + label_loss
          tower_gradients.append(
              optimizer.compute_gradients(
                  final_loss, colocate_gradients_with_ops=False))

  label_loss = tf.reduce_mean(tf.stack(tower_label_losses))
  tf.summary.scalar("label_loss", label_loss)
  if regularization_penalty != 0:
    reg_loss = tf.reduce_mean(tf.stack(tower_reg_losses))
    tf.summary.scalar("reg_loss", reg_loss)

  merged_gradients = utils.combine_gradients(tower_gradients)
  if clip_gradient_norm > 0:
    with tf.name_scope('clip_grads'):
      merged_gradients = utils.clip_gradient_norms(merged_gradients,
                                                   clip_gradient_norm)

  train_op = optimizer.apply_gradients(merged_gradients,
                                       global_step=global_step)

  # Publish the tensors and ops that are later looked up by collection name.
  exported = [
      ("global_step", global_step),
      ("loss", label_loss),
      ("predictions", tf.concat(tower_predictions, 0)),
      ("input_batch_raw", model_input_raw),
      ("input_batch", model_input),
      ("num_frames", num_frames),
      ("labels", tf.cast(labels_batch, tf.float32)),
      ("train_op", train_op),
  ]
  for collection_name, value in exported:
    tf.add_to_collection(collection_name, value)
def build_graph(reader,
                generator_model,
                discriminator_model,
                train_data_pattern,
                label_loss_fn=losses.CrossEntropyLoss(),
                batch_size=1000,
                base_learning_rate=0.01,
                learning_rate_decay_examples=1000000,
                learning_rate_decay=0.95,
                optimizer_class=tf.train.AdamOptimizer,
                clip_gradient_norm=1.0,
                regularization_penalty=1,
                num_readers=1,
                num_epochs=None):
  """Builds the multi-tower generator/discriminator training graph.

  The graph is built a single time per training run; afterwards the model is
  restored from a meta graph file instead of being rebuilt.

  One tower is created per GPU returned by `get_gpus()`, or a single CPU
  tower when there is none. Each tower runs the generator on its share of the
  noise placeholder and the discriminator on both the generated and the real
  images. Discriminator and generator gradients are merged across towers and
  applied by two separate train ops.

  Args:
    reader: The data file reader. It should inherit from BaseReader.
    generator_model: The core model for generator. It should inherit from
                     BaseModel.
    discriminator_model: The core model for discriminator. It should inherit
                         from BaseModel.
    train_data_pattern: glob path to the training data files.
    label_loss_fn: What kind of loss to apply to the model. It should inherit
                from BaseLoss.
    batch_size: How many examples each tower processes at a time.
    base_learning_rate: What learning rate to initialize the optimizer with.
    learning_rate_decay_examples: Decay step count (in examples) handed to
                                  `tf.train.exponential_decay`.
    learning_rate_decay: Decay rate handed to `tf.train.exponential_decay`.
    optimizer_class: Which optimization algorithm to use.
    clip_gradient_norm: Magnitude of the gradient to clip to.
    regularization_penalty: Accepted for interface compatibility; not read
                            by this function.
    num_readers: How many threads to use for I/O operations.
    num_epochs: How many passes to make over the data. 'None' means an
                unlimited number of passes.
  """
  global_step = tf.Variable(0, trainable=False, name="global_step")

  gpus = get_gpus()
  num_gpus = len(gpus)

  if not num_gpus > 0:
    logging.info("No GPUs found. Training on CPU.")
    num_towers, device_string = 1, '/cpu:%d'
  else:
    logging.info("Using the following GPUs to train: " + str(gpus))
    num_towers, device_string = num_gpus, '/gpu:%d'

  examples_seen = global_step * batch_size * num_towers
  learning_rate = tf.train.exponential_decay(
      base_learning_rate,
      examples_seen,
      learning_rate_decay_examples,
      learning_rate_decay,
      staircase=True)
  tf.summary.scalar('learning_rate', learning_rate)

  optimizer = optimizer_class(learning_rate)

  model_input_raw, _ = get_input_data_tensors(
      reader,
      train_data_pattern,
      batch_size=batch_size * num_towers,
      num_readers=num_readers,
      num_epochs=num_epochs)
  tf.summary.histogram("model/input_raw", model_input_raw)
  model_input = model_input_raw

  # Noise for the generator is fed from outside through this placeholder.
  noise_input = tf.placeholder(
      tf.float32, shape=[None, random_noise_generator.get_dim()])
  image_width, image_height = reader.get_image_size()

  tower_inputs = tf.split(model_input, num_towers)
  tower_noise_input = tf.split(noise_input, num_towers)
  tower_D_gradients, tower_G_gradients = [], []
  tower_generated_images = []
  tower_predictions_for_fake, tower_predictions_for_real = [], []
  tower_D_losses, tower_G_losses = [], []

  # With exactly one GPU the variables live on it; otherwise on the CPU.
  variable_device = "/gpu:0" if num_gpus == 1 else "/cpu:0"

  for tower_index in range(num_towers):
    # For some reason these 'with' statements can't be combined onto the same
    # line. They have to be nested.
    with tf.device(device_string % tower_index):
      # Every tower after the first reuses the first tower's variables.
      with tf.variable_scope("tower", reuse=(tower_index > 0) or None):
        with slim.arg_scope([slim.model_variable, slim.variable],
                            device=variable_device):
          generator_model.create_model(image_width * image_height)
          discriminator_model.create_model(image_width * image_height)

          generated_images = generator_model.run_model(
              tower_noise_input[tower_index])["output"]
          tf.summary.image(
              'generated_images',
              tf.reshape(generated_images,
                         [-1, image_height, image_width, 1]),
              10)
          tower_generated_images.append(generated_images)

          fake_result = discriminator_model.run_model(generated_images)
          real_result = discriminator_model.run_model(
              tower_inputs[tower_index])
          for model_variable in slim.get_model_variables():
            tf.summary.histogram(model_variable.op.name, model_variable)

          tower_predictions_for_fake.append(fake_result["predictions"])
          tower_predictions_for_real.append(real_result["predictions"])

          fake_logits = fake_result["logits"]
          real_logits = real_result["logits"]

          # Discriminator target: 0 for generated images, 1 for real ones.
          fake_loss = label_loss_fn.calculate_loss(
              fake_logits, tf.zeros_like(fake_logits))
          real_loss = label_loss_fn.calculate_loss(
              real_logits, tf.ones_like(real_logits))
          tower_D_loss = fake_loss + real_loss
          tower_D_losses.append(tower_D_loss)

          # Generator target: generated images scored as 1.
          tower_G_loss = label_loss_fn.calculate_loss(
              fake_logits, tf.ones_like(fake_logits))
          tower_G_losses.append(tower_G_loss)

          # Each loss only updates the variables of its own network.
          tower_D_gradients.append(
              optimizer.compute_gradients(
                  tower_D_loss,
                  var_list=discriminator_model.get_variables()))
          tower_G_gradients.append(
              optimizer.compute_gradients(
                  tower_G_loss,
                  var_list=generator_model.get_variables()))

  D_loss = tf.reduce_mean(tf.stack(tower_D_losses))
  G_loss = tf.reduce_mean(tf.stack(tower_G_losses))
  tf.summary.scalar("D_loss", D_loss)
  tf.summary.scalar("G_loss", G_loss)

  merged_D_gradients = utils.combine_gradients(tower_D_gradients)
  merged_G_gradients = utils.combine_gradients(tower_G_gradients)

  if clip_gradient_norm > 0:
    with tf.name_scope('clip_grads'):
      merged_D_gradients = utils.clip_gradient_norms(merged_D_gradients,
                                                     clip_gradient_norm)
      merged_G_gradients = utils.clip_gradient_norms(merged_G_gradients,
                                                     clip_gradient_norm)

  # Only the generator op carries global_step, so one D step plus one G step
  # advances it by exactly 1.
  D_train_op = optimizer.apply_gradients(merged_D_gradients)
  G_train_op = optimizer.apply_gradients(merged_G_gradients,
                                         global_step=global_step)

  # Publish the tensors and ops that are later looked up by collection name.
  exported = [
      ("global_step", global_step),
      ("D_loss", D_loss),
      ("G_loss", G_loss),
      ("p_for_fake", tf.concat(tower_predictions_for_fake, 0)),
      ("p_for_data", tf.concat(tower_predictions_for_real, 0)),
      ("input_batch_raw", model_input_raw),
      ("input_batch", model_input),
      ("generated_images", tf.concat(tower_generated_images, 0)),
      ("D_train_op", D_train_op),
      ("G_train_op", G_train_op),
      ("noise_input_placeholder", noise_input),
  ]
  for collection_name, value in exported:
    tf.add_to_collection(collection_name, value)