) valid_triplets_loader = DataLoader(weak_valid_dl_triplet, batch_size=batch_size, shuffle=False, num_workers=cfg.num_workers, drop_last=True, collate_fn=collate_fn) test_triplets_loader = DataLoader(test_triplets, batch_size=batch_size, shuffle=False, num_workers=cfg.num_workers, drop_last=True, collate_fn=collate_fn) # ######### # # Model and optimizer # ######## if resume_training is None: model_triplet, state = get_model(state, f_args) optimizer, state = get_optimizer(model_triplet, state) LOG.info(model_triplet) pytorch_total_params = sum(p.numel() for p in model_triplet.parameters() if p.requires_grad) LOG.info("number of parameters in the model: {}".format(pytorch_total_params)) model_triplet.train() # scheduler = ReduceLROnPlateau(optimizer, 'min', factor=0.1, patience=5, verbose=True) LOG.info(optimizer) model_triplet = to_cuda_if_available(model_triplet) # ########## # # Callbacks # ########## if cfg.save_best: save_best_call = SaveBest(val_comp="sup") if cfg.early_stopping is not None:
params_name = { "early_stopping": cfg.early_stopping, "conv_dropout": cfg.conv_dropout, "frames": cfg.frames_in_sec, } params_name.update(args.__dict__) base_model_name = get_model_name(params_name) # Model state = { "scaler": scaler.state_dict(), "many_hot_encoder": many_hot_encoder.state_dict(), "args": vars(args), } model, state = get_model(state, args) optimizer, state = get_optimizer(model, state) model = to_cuda_if_available(model) LOG.info(model) # ########## # # Callbacks # ########## if cfg.save_best: save_best_call = SaveBest(val_comp="sup") if cfg.early_stopping is not None: early_stopping_call = EarlyStopping(patience=cfg.early_stopping, val_comp="sup") # lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=20, gamma=0.5) # x, y = next(iter(train_loader)) x, y = train_set[0]
def _init_graph(self):
    """Build the TensorFlow 1.x graph for the RaFM model.

    Constructs, inside a private ``tf.Graph``: the input placeholders, the
    model weights, the multi-embedding bilinear model, the loss (square or
    log loss, with separate update rules for free vs. dependent variables),
    the optimizer, and a ``tf.Session`` that is initialized (and optionally
    restored from a checkpoint) before this method returns.
    """
    self.graph = tf.Graph()
    with self.graph.as_default():  # , tf.device('/cpu:0'):
        # Set graph level random seed (numpy seeded too, for any host-side
        # randomness used while building the graph).
        tf.set_random_seed(self.random_seed)
        np.random.seed(self.random_seed)
        # Input data. Three mutually exclusive input formats: integer field
        # ids (lookup), a sparse float tensor, or a dense float tensor.
        if self.is_lookup:
            self.train_features = tf.placeholder(
                tf.int32, shape=[None, self.num_field])  # None * num_features
        elif self.is_sparse:
            self.train_features = tf.sparse_placeholder(
                tf.float32,
                shape=[None, self.num_features])  # None * num_features
        else:
            self.train_features = tf.placeholder(
                tf.float32,
                shape=[None, self.num_features])  # None * num_features
        self.train_labels = tf.placeholder(
            tf.float32, shape=[None, 1])  # None * 1

        # Variables.
        self.weights = self._initialize_weights()
        self.weights_feature = tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, 'feature')

        # Model.
        ###################################################################
        # bilinear embedding
        self.sample_embedding = []  # the embeddings of each FM
        self.sample_flag = [
        ]  # flags of whether a specific column exists in each FM.
        self.sample_embedding.append(
            tf.nn.embedding_lookup(self.weights['feature_bilinear_0'],
                                   self.train_features)
        )  # the first FM contains all nonzero columns
        self.sample_flag.append(tf.ones([self.num_field], dtype=tf.float32))
        # Broadcast flag of FM 0 to shape (1, num_field, 1).
        self.sample_flag[0] = self.sample_flag[0][tf.newaxis, :, tf.newaxis]

        # setting flags and weights for each FM
        for k in range(self.feature_table.shape[1]):  # the k+1-th FM
            # Row 0 of cur_weight_table is all-zeros so that features absent
            # from this FM (id 0 in feature_table) contribute nothing.
            tmp_zero = tf.zeros([1, self.embedding_dim[k + 1]],
                                dtype=tf.float32)
            cur_weight_table = tf.concat(
                (tmp_zero, self.weights['feature_bilinear_%d' % (k + 1)]),
                axis=0)
            cur_ids = tf.nn.embedding_lookup(self.feature_table[:, k],
                                             self.train_features)
            self.sample_embedding.append(
                tf.nn.embedding_lookup(cur_weight_table, cur_ids))
            self.sample_flag.append(
                tf.nn.embedding_lookup(self.feature_flag[:, k],
                                       self.train_features))
            self.sample_flag[-1] = self.sample_flag[-1][:, :, tf.newaxis]

        self.bilinear = []  # the bilinear parts of each FM
        # core of RaFM: multiple embeddings
        base = tf.zeros_like(self.train_labels, dtype=np.float32)
        for k in range(self.feature_table.shape[1]):
            # free_part: columns that exist only in FM k; dependent_part:
            # columns shared with FM k+1.
            free_part = self.sample_embedding[k] * (
                self.sample_flag[k] - self.sample_flag[k + 1])
            dependent_part = self.sample_embedding[k] * self.sample_flag[
                k + 1]
            # Note the stop_gradient here! Gradients flow only through the
            # dependent part for this output.
            low_output = common.get_bilinear_embedding_from_feature(
                tf.stop_gradient(free_part) + dependent_part)
            low_output = tf.reduce_sum(low_output, axis=1, keep_dims=True)
            self.bilinear.append(
                tf.add_n([tf.stop_gradient(base), low_output]))
            low_interaction = common.get_bilinear_embedding_from_feature(
                dependent_part + free_part)
            low_interaction = tf.reduce_sum(low_interaction,
                                            axis=1,
                                            keep_dims=True)
            correction = common.get_bilinear_embedding_from_feature(
                dependent_part)
            correction = tf.reduce_sum(correction, axis=1, keep_dims=True)
            # Accumulate interactions so FM k+1 sees the lower-order terms
            # without double-counting the dependent-only interaction.
            base = tf.add_n([base, -correction, low_interaction])
        final_high_interaction = common.get_bilinear_embedding_from_feature(
            self.sample_embedding[-1])
        final_high_interaction = tf.reduce_sum(final_high_interaction,
                                               axis=1,
                                               keep_dims=True)
        self.bilinear.append(tf.add_n([base, final_high_interaction]))

        # linear embedding
        self.weights_linear_reshape = self.weights['feature_linear']
        self.linear = common.get_linear_embedding(
            self.train_features, self.weights_linear_reshape, self.is_sparse,
            True)
        self.linear = self.linear[:, tf.newaxis]

        # bias (broadcast scalar bias to batch shape)
        self.weights_bias_reshape = self.weights['bias']
        self.bias = tf.ones_like(
            self.train_labels, dtype=np.float32) * self.weights_bias_reshape

        # out[k]: \mathcal{B}_{1, k+1} in our paper
        self.out = []
        for k in range(self.feature_table.shape[1] + 1):
            self.out.append(
                tf.add_n([self.bilinear[k], self.linear, self.bias]))

        # The loss function, which uses different update rules for free
        # variables and dependent variables
        self.loss = 0
        if self.loss_type == 'square_loss':
            # free variables
            self.loss += tf.nn.l2_loss(
                tf.subtract(self.train_labels, self.out[-1]))
            # loss of dependent variables. We use stop_gradient to mimic the
            # update rule of dependent parts.
            for k in range(self.feature_table.shape[1]):
                self.loss += self.dependent_lr_coef[k] * tf.nn.l2_loss(
                    tf.subtract(self.out[k],
                                tf.stop_gradient(self.out[k + 1])))
        elif self.loss_type == 'log_loss':
            for k in range(len(self.out)):
                self.out[k] = tf.sigmoid(self.out[k])
            # free variables
            self.loss += tf.losses.log_loss(self.train_labels,
                                            self.out[-1],
                                            weights=1.0,
                                            epsilon=1e-07,
                                            scope=None)
            # loss of dependent variables. We use stop_gradient to mimic the
            # update rule of dependent parts
            for k in range(self.feature_table.shape[1]):
                loss = self.dependent_lr_coef[k] * tf.losses.log_loss(
                    tf.stop_gradient(self.out[k + 1]),
                    self.out[k],
                    weights=1.0,
                    epsilon=1e-07,
                    scope=None)
                self.loss += loss

        self.reg_loss = 0
        # L2 regularization of each embedding
        for k in range(self.feature_table.shape[1] + 1):
            if self.lambda_bilinear[k] > 0:
                self.reg_loss += tf.contrib.layers.l2_regularizer(
                    self.lambda_bilinear[k])(
                        self.weights['feature_bilinear_%d' % k])
        self.loss += self.reg_loss

        self.optimizer = common.get_optimizer(self.optimizer_type,
                                              self.learning_rate, self.loss,
                                              None)

        # init
        self.saver = tf.train.Saver()
        init = tf.global_variables_initializer()
        self.sess = tf.Session()
        self.sess.run(init)
        if self.is_continuous == 1:
            # Resume from a previous checkpoint.
            self.saver.restore(self.sess, self.save_file + self.suffix)

        # number of params (product of each variable's static shape)
        total_parameters = 0
        for variable in self.weights.values():
            shape = variable.get_shape()
            variable_parameters = 1
            for dim in shape:
                variable_parameters *= dim.value
            total_parameters += variable_parameters
        if self.verbose > 0:
            print("#params: %d" % total_parameters)
def main():
    """Train and evaluate a BiLSTM word/sentence/article tagger (Python 2).

    Parses command-line options, loads annotated documents, builds index /
    mask / label arrays per article, trains the BiLSTM for the requested
    number of epochs, and reports per-epoch dev/test F1 scores.
    """
    usage = "%prog project documents.json"
    parser = OptionParser(usage=usage)
    parser.add_option('-a', dest='alpha', default=0.00001,
                      help='Regularization strength: default=%default')
    parser.add_option('-d', dest='hidden_dim', default=50,
                      help='Hidden node dimension: default=%default')
    parser.add_option('-e', dest='epochs', default=10,
                      help='Number of epochs: default=%default')
    parser.add_option('-i', dest='iter_display', default=5000,
                      help='Number of iterations between output: default=%default')
    parser.add_option('-o', dest='optimization', default='sgd',
                      help='Optimization method [sgd|sgdm|adagrad]: default=%default')
    parser.add_option('-l', dest='learning_rate', default=0.1,
                      help='Initial learning rate: default=%default')
    parser.add_option('--decay', dest='decay', default=1.00,
                      help='Learning rate decay: default=%default')
    parser.add_option('--momentum', dest='momentum', default=0.5,
                      help='Momentum parameter (sgdm only): default=%default')
    parser.add_option('--word2vec_file', dest='word2vec_file', default='',
                      help='Location of word2vec file: default=do not load')
    parser.add_option('--glove_file', dest='glove_file', default='',
                      help='Location of glove file: default=do not load')
    parser.add_option('--save_vectors', action="store_true", dest="save_vectors", default=False,
                      help='Save loaded vectors for faster loading next time: default=%default')
    parser.add_option('-s', dest='seed', default=42,
                      help='Random seed: default=%default')
    parser.add_option('--no_eval', action="store_true", dest="no_eval", default=False,
                      help='Skip the evaluation between epochs: default=%default')
    parser.add_option('--test_fold', dest='test_fold', default=0,
                      help='Test fold: default=%default')
    parser.add_option('--dev_fold', dest='dev_fold', default=0,
                      help='Dev fold: default=%default')
    parser.add_option('--n_labels', dest='n_labels', default=14,
                      help='Number of labels to use (max 15): default=%default')
    parser.add_option('--w_word', dest='w_word', default=1.0,
                      help='Weight on word prediction: default=%default')
    parser.add_option('--w_sentence', dest='w_sentence', default=1.0,
                      help='Weight on word prediction: default=%default')
    parser.add_option('--w_article', dest='w_article', default=1.0,
                      help='Weight on word prediction: default=%default')

    (options, args) = parser.parse_args()

    # Positional args: project name and input JSON of documents.
    project_name = args[0]
    input_filename = args[1]
    dirs.make_base_dir(project_name)
    sents_dir = dirs.data_raw_sentences_dir

    # Unpack options (optparse returns strings, so convert explicitly).
    seed = int(options.seed)
    n_epochs = int(options.epochs)
    alpha = float(options.alpha)
    lr = float(options.learning_rate)
    iter_display = int(options.iter_display)
    opti_method = options.optimization
    lr_decay = float(options.decay)
    momentum = float(options.momentum)
    no_eval = options.no_eval
    word2vec_file = options.word2vec_file
    glove_file = options.glove_file
    save_vectors = options.save_vectors
    test_fold = int(options.test_fold)
    dev_fold = int(options.dev_fold)
    n_labels = int(options.n_labels)
    w_word = float(options.w_word)
    w_sentence = float(options.w_sentence)
    w_article = float(options.w_article)

    if seed > 0:
        np.random.seed(seed)
        random.seed(seed)

    dh = int(options.hidden_dim)
    dx = 300  # word-vector dimensionality (matches glove/word2vec loaders)

    np.__config__.show()

    article_sent_words, article_word_labels, vocab, n_labels, n_unique_articles, annotation_counts = load_data(input_filename, n_labels)
    train_keys, dev_keys, test_keys = ds.get_all_splits(test_fold=test_fold, dev_subfold=dev_fold)

    # Sort the vocabulary for a deterministic word -> index mapping
    # (Python 2: dict.keys() returns a list).
    vocab = vocab.keys()
    vocab.sort()
    vocab_size = len(vocab)
    vocab_index = dict(zip(vocab, range(vocab_size)))
    print "Vocab size =", vocab_size

    n_articles = len(article_sent_words)
    keys = article_sent_words.keys()
    keys.sort()
    print keys[:10]
    print "Loaded %d annotations for %d articles using %d labels" % (n_articles, n_unique_articles, n_labels)
    print list(train_keys)[:10]

    # Keys look like '<article>__<annotation>'; keep those whose article id
    # falls in each split.
    train_keys = [k for k in keys if k.split('__')[0] in train_keys]
    dev_keys = [k for k in keys if k.split('__')[0] in dev_keys]
    test_keys = [k for k in keys if k.split('__')[0] in test_keys]

    #dev_indices = np.random.choice(n_articles, n_dev, replace=False).tolist()
    #train_indices = list(set(range(n_articles)) - set(dev_indices))
    #train_keys = [keys[i] for i in train_indices]
    #dev_keys = [keys[i] for i in dev_indices]

    # Load pretrained embeddings (glove takes priority over word2vec),
    # falling back to vectors cached alongside the input file.
    if glove_file != '':
        initial_embeddings = vector_utils.load_glove_vectors(glove_file, vocab, dx)
    elif word2vec_file != '':
        initial_embeddings = vector_utils.load_word2vec_vectors(word2vec_file, vocab, dx)
    else:
        initial_embeddings, vocab, vocab_index = vector_utils.load_from_file(input_filename)
        vocab_size = len(vocab)
    if save_vectors:
        vector_utils.save_vectors(input_filename, initial_embeddings, vocab)

    # index words into vocabulary and make mask and label arrays
    # idxs: (max_len, n_sents) word ids; mask: 1 where a real word exists;
    # labels: (max_len, n_sents, n_labels) per-word binary labels.
    idxs_dict = {}
    mask_dict = {}
    label_dict = {}
    for key, sent_words in article_sent_words.items():
        n_sents = len(sent_words)
        max_len = max([len(s) for s in sent_words])
        word_idxs = np.zeros([max_len, n_sents], dtype=np.int32)
        mask = np.zeros([max_len, n_sents], dtype=np.int32)
        labels = np.zeros([max_len, n_sents, n_labels], dtype=np.int32)
        for s_i, s in enumerate(sent_words):
            n_words = len(s)
            word_idxs[:n_words, s_i] = [vocab_index[w] for w in s]
            mask[:n_words, s_i] = 1
            labels[:n_words, s_i, :] = article_word_labels[key][s_i][:, :]
        idxs_dict[key] = word_idxs
        mask_dict[key] = mask
        label_dict[key] = labels

    # (size, key) pairs so the first epoch can visit short articles first.
    article_lengths = [(idxs_dict[k].size, k) for k in train_keys]
    article_lengths.sort()

    # create the LSTM
    theano_seed = np.random.randint(2 ** 30)
    print "Number of distributions =", 2
    print "Building RNN"
    optimizer, opti_params = get_optimizer(opti_method, momentum)
    bilstm = BiLSTM(vocab_size, dh, dx, n_labels, optimizer, opti_params,
                    initial_embeddings=initial_embeddings, alpha=alpha,
                    update=opti_method, seed=theano_seed, momentum=momentum,
                    word_weight=w_word, sent_weight=w_sentence,
                    article_weight=w_article)

    # create RNN
    best_dev_f1 = np.zeros(n_labels)
    corr_test_f1 = np.zeros(n_labels)
    print "Training"
    for epoch in range(n_epochs):
        sum_log_loss = 0
        sum_loss = 0
        mistakes = 0
        # sort by keys on the first pass, then shuffle
        if epoch == 0:
            keys = [key for length, key in article_lengths]
        else:
            keys = train_keys
            random.shuffle(keys)
        print "epoch\titems\tloss\tl+reg\terrs"
        # consider each sentence in turn
        for k_i, k in enumerate(keys):
            idxs = idxs_dict[k]
            mask = mask_dict[k]
            word_labels = label_dict[k]
            p_word_labels, p_sent_labels, p_article_labels, log_loss, loss = bilstm.train(idxs, mask, word_labels, lr, 1)
            sum_log_loss += log_loss
            sum_loss += loss
            # Threshold probabilities at 0.5 to get binary predictions.
            y_pred_words = np.array(p_word_labels > 0.5, dtype=int)  # (n_words, n_sents, n_labels)
            y_pred_sents = np.array(p_sent_labels > 0.5, dtype=int)
            y_pred_article = np.array(p_article_labels > 0.5, dtype=int)
            # Gold sentence/article labels are the max over word labels.
            sent_labels = np.max(word_labels, axis=0)
            article_labels = np.max(sent_labels, axis=0)
            mistakes += np.sum(np.abs(article_labels - y_pred_article))/float(n_labels)
            to_print = False  # flip to True for verbose per-example dumps
            if k_i == 0 and to_print:
                print "\tTraining example:", k
                print article_labels
                print np.array(y_pred_article, dtype=int)
                max_len, n_sents = mask.shape
                for s_i in range(n_sents):
                    if np.max(y_pred_words[:, s_i, :]) == 1:
                        n_words = np.argmin(mask[:, s_i]) - 1
                        sentence = [vocab[c] for c in idxs[:n_words, s_i]]
                        print "Full:", k_i, ' '.join(sentence)
                        for code in range(n_labels):
                            if y_pred_sents[s_i, code] == 1:
                                # Show labelled words; blank out the rest with
                                # same-width whitespace so columns line up.
                                highlight = [w if word_labels[w_i, s_i, code] else ' ' * len(w) for w_i, w in enumerate(sentence)]
                                print '-------------------------------------'
                                print "True:", k_i, code, ' '.join(highlight)
                                highlight = [w if y_pred_words[w_i, s_i, code] else ' ' * len(w) for w_i, w in enumerate(sentence)]
                                #highlight = [vocab[c][1:2] if (p_y_given_x[c_i, code] > 0.5 or vocab[c][1:2] == '\n') else ' ' for c_i, c in enumerate(idxs)]
                                print '-------------------------------------'
                                print "Pred:", k_i, code, ' '.join(highlight)
                        print ""
            if k_i % iter_display == 0 and k_i > 0:
                d = float(k_i+1)
                print '%d\t%d\t%.4f\t%.4f\t%.4f' % \
                    (epoch, k_i, sum_log_loss/d, sum_loss/d, mistakes/d)
        if not no_eval:
            print "\nDev evaluation"
            valid_z_o_loss, valid_log_loss, valid_f1, valid_per_class_f1 = evaluate(idxs_dict, mask_dict, label_dict, dev_keys, bilstm, vocab, annotation_counts)
            print "\nTest evaluation"
            test_z_o_loss, test_log_loss, test_f1, test_per_class_f1 = evaluate(idxs_dict, mask_dict, label_dict, test_keys, bilstm, vocab, annotation_counts)
            print ('epoch=%d\tdev_log_loss=%.3f\tdev_0/1=%.3f\tdev_f1=%.3f\ttest_log_loss=%.3f\ttest_0/1=%.3f\ttest_f1=%.3f\t') % (epoch, valid_log_loss, valid_z_o_loss, valid_f1, test_log_loss, test_z_o_loss, test_f1)
            # Track the best dev F1 per label and the test F1 at that point.
            for k in range(n_labels):
                if valid_per_class_f1[k] > best_dev_f1[k]:
                    best_dev_f1[k] = valid_per_class_f1[k]
                    corr_test_f1[k] = test_per_class_f1[k]
            print "Best valid f1s:", best_dev_f1
            print "Corr. test f1s:", corr_test_f1
        # decay learning rate
        lr *= lr_decay
def __init__(self, flags_obj, time_callback):
    """Build the ResNet50 training runnable.

    Sets up the distribution strategy, input pipeline, model, optimizer
    (with optional LARS / loss scaling / graph-rewrite fp16), metrics,
    checkpointing, profiling bounds, and gradient-accumulation buffers.

    Args:
      flags_obj: parsed flags object holding the run configuration.
      time_callback: a Keras-style callback used for timing batches.
    """
    standard_runnable.StandardRunnableWithWarmup.__init__(
        self, flags_obj.use_tf_while_loop, flags_obj.use_tf_function)
    self.strategy = tf.distribute.get_strategy()
    self.flags_obj = flags_obj
    self.dtype = flags_core.get_tf_dtype(flags_obj)
    self.time_callback = time_callback

    # Input pipeline related
    batch_size = flags_obj.batch_size
    if batch_size % self.strategy.num_replicas_in_sync != 0:
        raise ValueError(
            'Batch size must be divisible by number of replicas : {}'.format(
                self.strategy.num_replicas_in_sync))

    steps_per_epoch, train_epochs = common.get_num_train_iterations(flags_obj)
    if train_epochs > 1:
        train_epochs = flags_obj.train_epochs

    # As auto rebatching is not supported in
    # `experimental_distribute_datasets_from_function()` API, which is
    # required when cloning dataset to multiple workers in eager mode,
    # we use per-replica batch size.
    self.batch_size = int(batch_size / self.strategy.num_replicas_in_sync)

    self.synthetic_input_fn = common.get_synth_input_fn(
        height=imagenet_preprocessing.DEFAULT_IMAGE_SIZE,
        width=imagenet_preprocessing.DEFAULT_IMAGE_SIZE,
        num_channels=imagenet_preprocessing.NUM_CHANNELS,
        num_classes=self.flags_obj.num_classes,
        dtype=self.dtype,
        drop_remainder=True)
    if self.flags_obj.use_synthetic_data:
        self.input_fn = self.synthetic_input_fn
    else:
        self.input_fn = imagenet_preprocessing.input_fn

    resnet_model.change_keras_layer(flags_obj.use_tf_keras_layers)
    self.model = resnet_model.resnet50(
        num_classes=self.flags_obj.num_classes,
        batch_size=flags_obj.batch_size,
        use_l2_regularizer=not flags_obj.single_l2_loss_op)

    self.use_lars_optimizer = False
    self.num_accumulation_steps = self.flags_obj.num_accumulation_steps
    if self.flags_obj.optimizer == 'LARS':
        self.use_lars_optimizer = True
    self.optimizer, _ = common.get_optimizer(
        flags_obj=flags_obj,
        steps_per_epoch=steps_per_epoch,
        train_steps=steps_per_epoch * train_epochs)
    # Make sure iterations variable is created inside scope.
    self.global_step = self.optimizer.iterations

    # Optionally wrap the optimizer for mixed precision: explicit loss
    # scaling for fp16, or the graph-rewrite path (requires tf.function).
    if self.dtype == tf.float16:
        loss_scale = flags_core.get_loss_scale(flags_obj, default_for_fp16=128)
        self.optimizer = (
            tf.keras.mixed_precision.experimental.LossScaleOptimizer(
                self.optimizer, loss_scale))
    elif flags_obj.fp16_implementation == 'graph_rewrite':
        # `dtype` is still float32 in this case. We built the graph in float32
        # and let the graph rewrite change parts of it float16.
        if not flags_obj.use_tf_function:
            raise ValueError('--fp16_implementation=graph_rewrite requires '
                             '--use_tf_function to be true')
        loss_scale = flags_core.get_loss_scale(flags_obj, default_for_fp16=128)
        self.optimizer = (
            tf.train.experimental.enable_mixed_precision_graph_rewrite(
                self.optimizer, loss_scale))

    # Label smoothing implies one-hot labels, which selects the
    # categorical (vs. sparse-categorical) accuracy metrics below.
    self.one_hot = False
    self.label_smoothing = flags_obj.label_smoothing
    if self.label_smoothing and self.label_smoothing > 0:
        self.one_hot = True

    if flags_obj.report_accuracy_metrics:
        self.train_loss = tf.keras.metrics.Mean('train_loss', dtype=tf.float32)
        if self.one_hot:
            self.train_accuracy = tf.keras.metrics.CategoricalAccuracy(
                'train_accuracy', dtype=tf.float32)
        else:
            self.train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
                'train_accuracy', dtype=tf.float32)
        self.test_loss = tf.keras.metrics.Mean('test_loss', dtype=tf.float32)
    else:
        self.train_loss = None
        self.train_accuracy = None
        self.test_loss = None

    if self.one_hot:
        self.test_accuracy = tf.keras.metrics.CategoricalAccuracy(
            'test_accuracy', dtype=tf.float32)
    else:
        self.test_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
            'test_accuracy', dtype=tf.float32)
    # self.test_corrects = tf.keras.metrics.Sum(
    #     'test_corrects', dtype=tf.float32)
    self.num_eval_steps = common.get_num_eval_steps(flags_obj)

    self.checkpoint = tf.train.Checkpoint(
        model=self.model, optimizer=self.optimizer)

    # Handling epochs.
    self.epoch_steps = steps_per_epoch
    self.epoch_helper = utils.EpochHelper(steps_per_epoch, self.global_step)
    self.steps_per_loop = flags_obj.steps_per_loop

    # profile_steps is a 'start,end' string; a negative start disables
    # tracing.
    profile_steps = flags_obj.profile_steps
    if profile_steps:
        profile_steps = [int(i) for i in profile_steps.split(',')]
        self.trace_start_step = profile_steps[0] if profile_steps[0] >= 0 else None
        self.trace_end_step = profile_steps[1]
    else:
        self.trace_start_step = None
        self.trace_end_step = None

    self.epochs_between_evals = flags_obj.epochs_between_evals

    self.training_vars = self.model.trainable_variables
    # One ON_READ/SUM accumulator slot per trainable variable when gradient
    # accumulation is enabled.
    self.accum_grads = []
    self.accum_grads_dtype = tf.float32
    if self.num_accumulation_steps > 1:
        for var in self.training_vars:
            self.accum_grads.append(self.optimizer.add_weight(
                name=var.name + '_accum',
                shape=var.shape,
                dtype=self.accum_grads_dtype,
                initializer='zeros',
                trainable=False,
                synchronization=tf.VariableSynchronization.ON_READ,
                aggregation=tf.VariableAggregation.SUM))
def run(flags_obj):
    """Run ResNet ImageNet training and eval loop using custom training loops.

    Args:
      flags_obj: An object containing parsed flag values.

    Raises:
      ValueError: If fp16 is passed as it is not currently supported.

    Returns:
      Dictionary of training and eval stats.
    """
    print('@@@@enable_eager = {}'.format(flags_obj.enable_eager))
    keras_utils.set_session_config(
        enable_eager=flags_obj.enable_eager,
        enable_xla=flags_obj.enable_xla)

    # Select a Keras mixed-precision policy to match the requested dtype.
    dtype = flags_core.get_tf_dtype(flags_obj)
    if dtype == tf.float16:
        policy = tf.compat.v2.keras.mixed_precision.experimental.Policy(
            'mixed_float16')
        tf.compat.v2.keras.mixed_precision.experimental.set_policy(policy)
    elif dtype == tf.bfloat16:
        policy = tf.compat.v2.keras.mixed_precision.experimental.Policy(
            'mixed_bfloat16')
        tf.compat.v2.keras.mixed_precision.experimental.set_policy(policy)

    # This only affects GPU.
    common.set_cudnn_batchnorm_mode()

    # TODO(anj-s): Set data_format without using Keras.
    data_format = flags_obj.data_format
    if data_format is None:
        data_format = ('channels_first'
                       if tf.test.is_built_with_cuda() else 'channels_last')
    tf.keras.backend.set_image_data_format(data_format)

    strategy = distribution_utils.get_distribution_strategy(
        distribution_strategy=flags_obj.distribution_strategy,
        num_gpus=flags_obj.num_gpus,
        num_workers=distribution_utils.configure_cluster(),
        all_reduce_alg=flags_obj.all_reduce_alg,
        num_packs=flags_obj.num_packs,
        tpu_address=flags_obj.tpu)

    train_ds, test_ds = get_input_dataset(flags_obj, strategy)
    per_epoch_steps, train_epochs, eval_steps = get_num_train_iterations(
        flags_obj)
    steps_per_loop = min(flags_obj.steps_per_loop, per_epoch_steps)

    logging.info("Training %d epochs, each epoch has %d steps, "
                 "total steps: %d; Eval %d steps", train_epochs,
                 per_epoch_steps, train_epochs * per_epoch_steps, eval_steps)

    time_callback = keras_utils.TimeHistory(flags_obj.batch_size,
                                            flags_obj.log_steps)

    # Everything below (model, optimizer, metrics, training loop) is created
    # inside the strategy scope so variables are mirrored across replicas.
    with distribution_utils.get_strategy_scope(strategy):
        resnet_model.change_keras_layer(flags_obj.use_tf_keras_layers)
        use_l2_regularizer = not flags_obj.single_l2_loss_op
        if flags_obj.use_resnet_d:
            resnetd = network_tweaks.ResnetD(
                image_data_format=tf.keras.backend.image_data_format(),
                use_l2_regularizer=use_l2_regularizer)
        else:
            resnetd = None
        model = resnet_model.resnet50(
            num_classes=imagenet_preprocessing.NUM_CLASSES,
            batch_size=flags_obj.batch_size,
            zero_gamma=flags_obj.zero_gamma,
            last_pool_channel_type=flags_obj.last_pool_channel_type,
            use_l2_regularizer=use_l2_regularizer,
            resnetd=resnetd)

        # Learning-rate schedule: piecewise-constant or cosine, both with
        # warmup.
        if flags_obj.learning_rate_decay_type == 'piecewise':
            lr_schedule = common.PiecewiseConstantDecayWithWarmup(
                batch_size=flags_obj.batch_size,
                epoch_size=imagenet_preprocessing.NUM_IMAGES['train'],
                warmup_epochs=common.LR_SCHEDULE[0][1],
                boundaries=list(p[1] for p in common.LR_SCHEDULE[1:]),
                multipliers=list(p[0] for p in common.LR_SCHEDULE),
                compute_lr_on_cpu=True)
        elif flags_obj.learning_rate_decay_type == 'cosine':
            lr_schedule = common.CosineDecayWithWarmup(
                base_lr=flags_obj.base_learning_rate,
                batch_size=flags_obj.batch_size,
                epoch_size=imagenet_preprocessing.NUM_IMAGES['train'],
                warmup_epochs=common.LR_SCHEDULE[0][1],
                train_epochs=flags_obj.train_epochs,
                compute_lr_on_cpu=True)
        else:
            raise NotImplementedError
        optimizer = common.get_optimizer(lr_schedule)

        if dtype == tf.float16:
            loss_scale = flags_core.get_loss_scale(flags_obj,
                                                   default_for_fp16=128)
            optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(
                optimizer, loss_scale)
        elif flags_obj.fp16_implementation == 'graph_rewrite':
            # `dtype` is still float32 in this case. We built the graph in float32 and
            # let the graph rewrite change parts of it float16.
            if not flags_obj.use_tf_function:
                raise ValueError('--fp16_implementation=graph_rewrite requires '
                                 '--use_tf_function to be true')
            loss_scale = flags_core.get_loss_scale(flags_obj,
                                                   default_for_fp16=128)
            optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite(
                optimizer, loss_scale)

        # Resume from the latest checkpoint, if any; current_step restarts
        # from the optimizer's recorded iteration count.
        current_step = 0
        checkpoint = tf.train.Checkpoint(model=model, optimizer=optimizer)
        latest_checkpoint = tf.train.latest_checkpoint(flags_obj.model_dir)
        if latest_checkpoint:
            checkpoint.restore(latest_checkpoint)
            logging.info("Load checkpoint %s", latest_checkpoint)
            current_step = optimizer.iterations.numpy()

        train_loss = tf.keras.metrics.Mean('train_loss', dtype=tf.float32)
        test_loss = tf.keras.metrics.Mean('test_loss', dtype=tf.float32)
        categorical_cross_entopy_and_acc = losses.CategoricalCrossEntropyAndAcc(
            batch_size=flags_obj.batch_size,
            num_classes=imagenet_preprocessing.NUM_CLASSES,
            label_smoothing=flags_obj.label_smoothing)

        trainable_variables = model.trainable_variables

        def step_fn(inputs):
            """Per-Replica StepFn."""
            images, labels = inputs
            with tf.GradientTape() as tape:
                logits = model(images, training=True)
                loss = categorical_cross_entopy_and_acc.loss_and_update_acc(
                    labels, logits, training=True)
                #loss = tf.reduce_sum(prediction_loss) * (1.0/ flags_obj.batch_size)
                num_replicas = tf.distribute.get_strategy().num_replicas_in_sync
                # L2 weight decay: either one explicit op over non-BN vars,
                # or the per-layer regularizer losses collected by the model.
                if flags_obj.single_l2_loss_op:
                    l2_loss = resnet_model.L2_WEIGHT_DECAY * 2 * tf.add_n([
                        tf.nn.l2_loss(v)
                        for v in trainable_variables
                        if 'bn' not in v.name
                    ])
                    loss += (l2_loss / num_replicas)
                else:
                    loss += (tf.reduce_sum(model.losses) / num_replicas)
                # Scale the loss
                if flags_obj.dtype == "fp16":
                    loss = optimizer.get_scaled_loss(loss)
            grads = tape.gradient(loss, trainable_variables)
            # Unscale the grads
            if flags_obj.dtype == "fp16":
                grads = optimizer.get_unscaled_gradients(grads)
            optimizer.apply_gradients(zip(grads, trainable_variables))
            train_loss.update_state(loss)

        @tf.function
        def train_steps(iterator, steps):
            """Performs distributed training steps in a loop."""
            for _ in tf.range(steps):
                strategy.experimental_run_v2(step_fn, args=(next(iterator),))

        def train_single_step(iterator):
            # Runs one step, with or without a distribution strategy.
            if strategy:
                strategy.experimental_run_v2(step_fn, args=(next(iterator),))
            else:
                return step_fn(next(iterator))

        def test_step(iterator):
            """Evaluation StepFn."""
            def step_fn(inputs):
                images, labels = inputs
                logits = model(images, training=False)
                loss = categorical_cross_entopy_and_acc.loss_and_update_acc(
                    labels, logits, training=False)
                #loss = tf.reduce_sum(loss) * (1.0/ flags_obj.batch_size)
                test_loss.update_state(loss)
            if strategy:
                strategy.experimental_run_v2(step_fn, args=(next(iterator),))
            else:
                step_fn(next(iterator))

        if flags_obj.use_tf_function:
            train_single_step = tf.function(train_single_step)
            test_step = tf.function(test_step)

        if flags_obj.enable_tensorboard:
            summary_writer = tf.summary.create_file_writer(flags_obj.model_dir)
        else:
            summary_writer = None

        train_iter = iter(train_ds)
        time_callback.on_train_begin()
        # Resume mid-run at the epoch implied by current_step.
        for epoch in range(current_step // per_epoch_steps, train_epochs):
            train_loss.reset_states()
            categorical_cross_entopy_and_acc.training_accuracy.reset_states()

            steps_in_current_epoch = 0
            while steps_in_current_epoch < per_epoch_steps:
                time_callback.on_batch_begin(
                    steps_in_current_epoch + epoch * per_epoch_steps)
                steps = _steps_to_run(steps_in_current_epoch, per_epoch_steps,
                                      steps_per_loop)
                if steps == 1:
                    train_single_step(train_iter)
                else:
                    # Converts steps to a Tensor to avoid tf.function
                    # retracing.
                    train_steps(train_iter,
                                tf.convert_to_tensor(steps, dtype=tf.int32))
                time_callback.on_batch_end(
                    steps_in_current_epoch + epoch * per_epoch_steps)
                steps_in_current_epoch += steps

            #temp_loss = array_ops.identity(categorical_cross_entopy_and_acc.training_loss).numpy()
            #temp_loss = categorical_cross_entopy_and_acc.training_loss.numpy()
            logging.info(
                'Training loss: %s, accuracy: %s, cross_entropy: %s at epoch %d',
                train_loss.result().numpy(),
                categorical_cross_entopy_and_acc.training_accuracy.result().numpy(),
                0., epoch + 1)

            if (not flags_obj.skip_eval and
                    (epoch + 1) % flags_obj.epochs_between_evals == 0):
                test_loss.reset_states()
                categorical_cross_entopy_and_acc.test_accuracy.reset_states()
                test_iter = iter(test_ds)
                for _ in range(eval_steps):
                    test_step(test_iter)
                logging.info(
                    'Test loss: %s, accuracy: %s%% at epoch: %d',
                    test_loss.result().numpy(),
                    categorical_cross_entopy_and_acc.test_accuracy.result().numpy(),
                    epoch + 1)

            if flags_obj.enable_checkpoint_and_export:
                checkpoint_name = checkpoint.save(
                    os.path.join(flags_obj.model_dir,
                                 'model.ckpt-{}'.format(epoch + 1)))
                logging.info('Saved checkpoint to %s', checkpoint_name)

            if summary_writer:
                current_steps = steps_in_current_epoch + (epoch * per_epoch_steps)
                with summary_writer.as_default():
                    #tf.summary.scalar('train_cross_entropy', categorical_cross_entopy_and_acc.training_loss.numpy(), current_steps)
                    tf.summary.scalar('train_loss', train_loss.result(),
                                      current_steps)
                    tf.summary.scalar(
                        'train_accuracy',
                        categorical_cross_entopy_and_acc.training_accuracy.result(),
                        current_steps)
                    # lr_schedule may return either a tensor or a callable
                    # producing one; unwrap before logging.
                    lr_for_monitor = lr_schedule(current_steps)
                    if callable(lr_for_monitor):
                        lr_for_monitor = lr_for_monitor()
                    tf.summary.scalar('learning_rate', lr_for_monitor,
                                      current_steps)
                    tf.summary.scalar('eval_loss', test_loss.result(),
                                      current_steps)
                    tf.summary.scalar(
                        'eval_accuracy',
                        categorical_cross_entopy_and_acc.test_accuracy.result(),
                        current_steps)

        time_callback.on_train_end()
        if summary_writer:
            summary_writer.close()

        eval_result = None
        train_result = None
        if not flags_obj.skip_eval:
            eval_result = [
                test_loss.result().numpy(),
                categorical_cross_entopy_and_acc.test_accuracy.result().numpy()]
            train_result = [
                train_loss.result().numpy(),
                categorical_cross_entopy_and_acc.training_accuracy.result().numpy()]

        stats = build_stats(train_result, eval_result, time_callback)
        return stats