def train(dataset, args):
    """Train a ``models.GNNStack`` on the nodes selected by each batch's train mask.

    The mean loss is printed after every epoch; every tenth epoch the value
    returned by ``test(loader, model)`` is printed too.

    Args:
        dataset: Dataset exposing ``num_node_features`` and ``num_classes``.
        args: Settings object. ``batch_size``, ``hidden_dim`` and ``epochs``
            are read here; the object is also forwarded to the model and
            optimizer builders.
    """
    # The train/validation/test split is done with masks, so a single loader
    # plays both the training and the test role.
    loader = DataLoader(dataset, batch_size=args.batch_size, shuffle=True)
    test_loader = loader

    model = models.GNNStack(dataset.num_node_features, args.hidden_dim,
                            dataset.num_classes, args)
    scheduler, opt = utils.build_optimizer(args, model.parameters())

    for epoch in range(args.epochs):
        running_loss = 0
        model.train()
        for batch in loader:
            opt.zero_grad()
            mask = batch.train_mask
            # Only the masked (training) nodes contribute to the loss.
            loss = model.loss(model(batch)[mask], batch.y[mask])
            loss.backward()
            opt.step()
            running_loss += loss.item() * batch.num_graphs
        running_loss /= len(loader.dataset)
        print(running_loss)

        if epoch % 10 != 0:
            continue
        test_acc = test(loader, model)
        print(test_acc, ' test')
def train(args=None, param_path=None, **kw):
    """Wire up trainer, data loader, model and optimizer, then train and analyze.

    Args:
        args: Options object forwarded to ``get_param_dicts``. Its
            ``param_path`` attribute, when present and not ``None``, is the
            fallback for ``param_path``.
        param_path: Parameter directory. Resolved to ``args.param_path`` or
            ``'./params/'`` when omitted; this function does not use it
            beyond that resolution.
        **kw: Accepted for call compatibility and ignored. (``args`` is a
            named parameter, so it can never arrive through ``kw``.)
    """
    if param_path is None:
        # ``args`` may be None or lack ``param_path``; do not dereference it
        # blindly just to pick a default directory.
        param_path = getattr(args, 'param_path', None)
        if param_path is None:
            param_path = './params/'

    model_dict, optimizer_dict, trainer_dict, data_loader_dict = get_param_dicts(
        args)

    trainer = build_trainer(trainer_dict)
    data_loader = build_data_loader(data_loader_dict)
    trainer.bind_data_loader(data_loader)

    # model can be RSLP, RMLP, RCNN ...
    model = build_model(model_dict)

    # optimizer can be BP, TP or CHL optimizer.
    optimizer = build_optimizer(optimizer_dict)
    optimizer.bind_model(model)
    optimizer.bind_trainer(trainer)

    trainer.bind_model(model)
    trainer.bind_optimizer(optimizer)
    trainer.train()

    # the model needs some data from data_loader to get response properties.
    model.analyze(data_loader=data_loader)
def train(dataset, task, args):
    """Train a ``models.GNNStack`` and track the best validation accuracy.

    For ``task == 'graph'`` the dataset is split 80/20 into a training loader
    and a held-out loader; for ``task == 'node'`` one loader is shared and
    the split comes from the masks on each batch.

    Every tenth epoch the model is evaluated on the held-out loader, once
    with ``is_validation=True`` and once without. The test accuracy observed
    at the best validation accuracy is kept and reported.

    Args:
        dataset: Dataset exposing ``num_node_features`` and ``num_classes``.
        task: ``'graph'`` or ``'node'``.
        args: Settings object (``batch_size``, ``hidden_dim``, ``epochs`` are
            read here; it is also forwarded to the builders).

    Raises:
        RuntimeError: If ``task`` is neither ``'graph'`` nor ``'node'``.
    """
    if task == 'graph':
        # graph classification: separate dataloader for test set
        data_size = len(dataset)
        split = int(data_size * 0.8)
        loader = DataLoader(dataset[:split],
                            batch_size=args.batch_size,
                            shuffle=True)
        test_loader = DataLoader(dataset[split:],
                                 batch_size=args.batch_size,
                                 shuffle=True)
    elif task == 'node':
        # use mask to split train/validation/test
        test_loader = loader = DataLoader(dataset,
                                          batch_size=args.batch_size,
                                          shuffle=True)
    else:
        raise RuntimeError('Unknown task')

    # build model
    model = models.GNNStack(dataset.num_node_features,
                            args.hidden_dim,
                            dataset.num_classes,
                            args,
                            task=task)
    print(model)
    scheduler, opt = utils.build_optimizer(args, model.parameters())

    # train
    best_val_acc = 0
    test_acc = 0
    # Defined up front so the final report works even if no evaluation ran
    # (e.g. args.epochs == 0).
    val_acc = 0
    for epoch in range(args.epochs):
        total_loss = 0
        model.train()
        for batch in loader:
            opt.zero_grad()
            pred = model(batch)
            label = batch.y
            if task == 'node':
                pred = pred[batch.train_mask]
                label = label[batch.train_mask]
            loss = model.loss(pred, label)
            loss.backward()
            opt.step()
            total_loss += loss.item() * batch.num_graphs
        total_loss /= len(loader.dataset)
        print("Loss in Epoch {0}: {1}".format(epoch, total_loss))

        if epoch % 10 == 0:
            # Evaluate on the held-out loader. For the node task this is the
            # same object as ``loader``; for the graph task it is the 20%
            # split, which the training loader must not stand in for.
            val_acc = test(test_loader, model, is_validation=True)
            tmp_test_acc = test(test_loader, model)
            if val_acc > best_val_acc:
                best_val_acc = val_acc
                test_acc = tmp_test_acc
            print("Current Best Val Acc {0}, with Test Acc {1}".format(
                best_val_acc, test_acc))

    print('Final Val Acc {0}, Test Acc {1}'.format(val_acc, test_acc))
def __init__(self, net_type, net_argv, init_path, init_argv, dim_argv,
             batch_size, opt_argv):
    """Build the TensorFlow graph, variables, loss and optimizer op.

    Nothing graph-related is created unless ``net_type == "nn"``.

    Args:
        net_type: Network kind; only ``"nn"`` is handled here.
        net_argv: Triple ``(depth, h_dims, act_func)``. ``depth - 1`` weight
            and bias variables are created, sized from consecutive entries
            of ``h_dims``.
        init_path: When falsy, variable initialisation specs are generated
            from ``init_argv``; it is always passed on to ``init_var_map``.
        init_argv: Per-variable init specs, consumed two per layer (W then
            b); element 0 and the remaining elements are passed separately.
        dim_argv: Sequence whose first element is the input width.
        batch_size: Row count of the batched placeholders.
        opt_argv: Optimizer spec passed to ``build_optimizer``; its last
            element chooses the loss reduction (``"sum"`` or ``"mean"``).
    """
    if net_type == "nn":
        self.graph = tf.Graph()
        nb_dim = dim_argv[0]
        depth, h_dims, act_func = net_argv
        with self.graph.as_default():
            var_init = []
            if not init_path:
                # No saved parameters: describe every W/b so that
                # init_var_map can create them from init_argv.
                _j = 0
                for _i in range(depth - 1):
                    var_init.extend([
                        ("W{}".format(_i), [h_dims[_i], h_dims[_i + 1]],
                         init_argv[_j][0], init_argv[_j][1:]),
                        ("b{}".format(_i), [1, h_dims[_i + 1]],
                         init_argv[_j + 1][0], init_argv[_j + 1][1:])
                    ])
                    _j += 2
            var_map = init_var_map(init_path, var_init)

            self.W = [0] * (depth - 1)
            self.b = [0] * (depth - 1)
            for _i in range(depth - 1):
                self.W[_i] = tf.Variable(var_map["W{}".format(_i)])
                self.b[_i] = tf.Variable(var_map["b{}".format(_i)])

            # One single-row placeholder and one batched pair (inputs and
            # value labels).
            self.x_vec = tf.placeholder(tf.float32, shape=[1, nb_dim])
            self.batch_x_vecs = tf.placeholder(tf.float32,
                                               shape=[batch_size, nb_dim])
            self.batch_value_labels = tf.placeholder(tf.float32,
                                                     shape=[batch_size, 1])

            # Both predictions share the same W/b lists.
            self.value_prediction = self.forward(net_type, depth, act_func,
                                                 self.x_vec,
                                                 [self.W, self.b])
            self.batch_value_predictions = self.forward(
                net_type, depth, act_func, self.batch_x_vecs,
                [self.W, self.b])

            square_loss_value = tf.square(self.batch_value_labels -
                                          self.batch_value_predictions)
            # NOTE(review): any other reduction name leaves self.loss_value
            # unset, so the build_optimizer call below would fail on the
            # attribute lookup.
            if opt_argv[-1] == "sum":
                self.loss_value = tf.reduce_sum(square_loss_value)
            elif opt_argv[-1] == "mean":
                self.loss_value = tf.reduce_mean(square_loss_value)
            self.opt_value = build_optimizer(opt_argv, self.loss_value)

            #self.init = tf.initialize_all_variables()
            self.init = tf.global_variables_initializer()

    # NOTE(review): the original indentation of this assignment was lost; it
    # is placed at method level here. Confirm whether it belongs inside the
    # ``net_type == "nn"`` branch.
    #self.log = "net_type={}\tnet_argv={}\tinit_path={}\tinit_argv={}\tdim_argv={}\tbatch_size={}\topt_argv={}" \
    #    .format(net_type, net_argv, init_path, init_argv, dim_argv, batch_size, opt_argv)
    self.log = "net_type={}\tnet_argv={}\tinit_path={}\tinit_argv={}\tdim_argv={}\tbatch_size={}\topt_argv={}" \
        .format(net_type, net_argv, init_path, init_argv, dim_argv, batch_size, opt_argv)
def train(dataset, task, args):
    """Train a GNNStack or APPNP model and plot its accuracy over time.

    The model is ``models.APPNP`` when ``args.model_type == 'APPNP'`` and
    ``models.GNNStack`` otherwise. The mean training loss is printed every
    epoch. ``test(loader, model)`` is recorded every 100 epochs and once
    more after training; the records are handed to ``plot_accuracy``.

    Args:
        dataset: Dataset exposing ``num_node_features``, ``num_classes`` and
            (for APPNP) ``data.edge_index``.
        task: Forwarded to the model constructor as ``task``.
        args: Settings object (``batch_size``, ``model_type``,
            ``hidden_dim``, ``dropout``, ``epochs`` are read here).
    """
    # Masks provide the train/validation/test split, so one loader is shared.
    loader = DataLoader(dataset, batch_size=args.batch_size, shuffle=True)
    test_loader = loader

    if args.model_type == 'APPNP':
        alpha = 0.1  # Change here if you need to change alpha
        niter = 10  # Change here if you need to change niterations of Pagerank
        appnp_prop = models.PPRPowerIteration(dataset.data.edge_index, alpha,
                                              niter, args.dropout)
        model = models.APPNP(dataset.num_node_features,
                             args.hidden_dim,
                             dataset.num_classes,
                             appnp_prop,
                             args,
                             task=task)
    else:
        model = models.GNNStack(dataset.num_node_features,
                                args.hidden_dim,
                                dataset.num_classes,
                                args,
                                task=task)

    scheduler, opt = utils.build_optimizer(args, model.parameters())
    history = []  # rows of [epoch, test accuracy]

    for epoch in range(args.epochs):
        running_loss = 0
        model.train()
        for batch in loader:
            opt.zero_grad()
            mask = batch.train_mask
            loss = model.loss(model(batch)[mask], batch.y[mask])
            loss.backward()
            opt.step()
            running_loss += loss.item() * batch.num_graphs
        running_loss /= len(loader.dataset)
        print('Epoch: ', epoch, 'Training loss: ', running_loss)

        if epoch % 100 != 0:
            continue
        test_acc = test(loader, model)
        print('Test acc: ', test_acc)
        history.append([epoch, test_acc])

    # One last measurement after the final epoch.
    test_acc = test(loader, model)
    history.append([args.epochs, test_acc])
    plot_accuracy(np.array(history), args)
    print('Final test acc: ', test_acc)
def train(dataset, task, args):
    """Train a ``models.GNNStack`` and print its loss and accuracy history.

    For ``task == 'graph'`` the dataset is split 80/20 into a training loader
    and a held-out loader; for ``task == 'node'`` one loader is shared and
    the split comes from the masks on each batch. The mean loss is printed
    every epoch and the held-out accuracy every tenth epoch; both histories
    are printed once training ends.

    Args:
        dataset: Dataset exposing ``num_node_features`` and ``num_classes``.
        task: ``'graph'`` or ``'node'``.
        args: Settings object (``batch_size``, ``hidden_dim``, ``epochs`` are
            read here; it is also forwarded to the builders).

    Raises:
        RuntimeError: If ``task`` is neither ``'graph'`` nor ``'node'``.
    """
    if task == 'graph':
        # graph classification: separate dataloader for test set
        data_size = len(dataset)
        split = int(data_size * 0.8)
        loader = DataLoader(dataset[:split],
                            batch_size=args.batch_size,
                            shuffle=True)
        test_loader = DataLoader(dataset[split:],
                                 batch_size=args.batch_size,
                                 shuffle=True)
    elif task == 'node':
        # use mask to split train/validation/test
        test_loader = loader = DataLoader(dataset,
                                          batch_size=args.batch_size,
                                          shuffle=True)
    else:
        raise RuntimeError('Unknown task')

    # build model
    model = models.GNNStack(dataset.num_node_features,
                            args.hidden_dim,
                            dataset.num_classes,
                            args,
                            task=task)
    scheduler, opt = utils.build_optimizer(args, model.parameters())

    loss_t = []
    acc = []
    # train
    for epoch in range(args.epochs):
        total_loss = 0
        model.train()
        for batch in loader:
            opt.zero_grad()
            pred = model(batch)
            label = batch.y
            if task == 'node':
                pred = pred[batch.train_mask]
                label = label[batch.train_mask]
            loss = model.loss(pred, label)
            loss.backward()
            opt.step()
            total_loss += loss.item() * batch.num_graphs
        total_loss /= len(loader.dataset)
        loss_t.append(total_loss)
        print(total_loss)

        if epoch % 10 == 0:
            # Use the held-out loader: for the graph task ``loader`` holds
            # the training split, so evaluating on it would report training
            # accuracy. For the node task both names are the same loader.
            test_acc = test(test_loader, model)
            acc.append(test_acc)
            print(test_acc, ' test')

    print(loss_t)
    print(acc)
def main():
    """Main workflow: configure, load data, build model and optimizer, train.

    Requires CUDA (asserted). When ``args.resume`` is set and the checkpoint
    file exists, early-stopping statistics and arguments are restored from
    it; otherwise training starts fresh.
    """
    args = utils.build_args(argparse.ArgumentParser())
    utils.init_logger(args.model_file)

    assert torch.cuda.is_available()
    torch.cuda.set_device(args.gpuid)

    utils.init_random(args.seed)
    utils.set_params(args)
    logger.info("Config:\n%s", pformat(vars(args)))

    fields = utils.build_fields()
    logger.info("Fields: %s", fields.keys())

    def _load_split(path, count_msg):
        # Log the path, read the corpus, then log how many sentences it has.
        logger.info("Load %s", path)
        split = LMDataset(fields, path, args.sent_length_trunc)
        logger.info(count_msg, len(split))
        return split

    train_set = _load_split(args.train_file, "Training sentences: %d")
    valid_set = _load_split(args.valid_file, "Validation sentences: %d")

    # The vocabulary comes from the training split only.
    fields["sent"].build_vocab(train_set)

    train_iter = utils.build_dataset_iter(train_set, args)
    val_iter = utils.build_dataset_iter(valid_set, args, train=False)

    resuming = args.resume and os.path.isfile(args.checkpoint_file)
    if not resuming:
        ckpt = None
        es_stats = ESStatistics(args)
    else:
        logger.info("Resume training")
        logger.info("Load checkpoint %s", args.checkpoint_file)
        # map_location keeps every tensor on the storage it is loaded to
        # first instead of the device it was saved from.
        ckpt = torch.load(args.checkpoint_file,
                          map_location=lambda storage, loc: storage)
        es_stats = ckpt["es_stats"]
        args = utils.set_args(args, ckpt)

    model = utils.build_model(fields, args, ckpt)
    logger.info("Model:\n%s", model)

    optimizer = utils.build_optimizer(model, args, ckpt)

    try_train_val(fields, model, optimizer, train_iter, val_iter, es_stats,
                  args)
def train(dataset, task, args):
    """Train a ``models.GNNStack`` and log loss/accuracy to a text file.

    A file named ``<task>_<args.model_type>.txt`` is (re)created in the
    current directory. Every tenth epoch a line ``epoch loss accuracy`` is
    appended to it and a summary is printed.

    For ``task == 'graph'`` the dataset is split 80/20 into a training loader
    and a held-out loader; for ``task == 'node'`` one loader is shared and
    the split comes from the masks on each batch.

    Args:
        dataset: Dataset exposing ``num_node_features`` and ``num_classes``
            (and ``data.edge_index`` / ``data.y`` for the node task).
        task: ``'graph'`` or ``'node'``.
        args: Settings object (``model_type``, ``batch_size``,
            ``hidden_dim``, ``epochs`` are read here).

    Raises:
        RuntimeError: If ``task`` is neither ``'graph'`` nor ``'node'``.
    """
    # The context manager closes the log even when setup or training raises
    # (the original leaked the handle, e.g. on the 'Unknown task' error).
    with open(task + "_" + args.model_type + '.txt', 'w') as f1:
        if task == 'graph':
            # graph classification: separate dataloader for test set
            data_size = len(dataset)
            print("==> There are", data_size, "graphs in the dataset.")
            split = int(data_size * 0.8)
            loader = DataLoader(dataset[:split],
                                batch_size=args.batch_size,
                                shuffle=True)
            test_loader = DataLoader(dataset[split:],
                                     batch_size=args.batch_size,
                                     shuffle=True)
        elif task == 'node':
            print("==> There are", dataset.data.edge_index.shape[1],
                  "edges, and", dataset.data.y.shape[0],
                  "nodes in the dataset.")
            # use mask to split train/validation/test
            test_loader = loader = DataLoader(dataset,
                                              batch_size=args.batch_size,
                                              shuffle=True)
        else:
            raise RuntimeError('Unknown task')

        # build model
        model = models.GNNStack(dataset.num_node_features,
                                args.hidden_dim,
                                dataset.num_classes,
                                args,
                                task=task)
        scheduler, opt = utils.build_optimizer(args, model.parameters())

        # train
        for epoch in range(args.epochs):
            total_loss = 0
            model.train()
            for batch in loader:
                opt.zero_grad()
                pred = model(batch)
                label = batch.y
                if task == 'node':
                    pred = pred[batch.train_mask]
                    label = label[batch.train_mask]
                loss = model.loss(pred, label)
                loss.backward()
                opt.step()
                total_loss += loss.item() * batch.num_graphs
            total_loss /= len(loader.dataset)

            if epoch % 10 == 0:
                # Evaluate on the held-out loader; for the graph task
                # ``loader`` contains only the training split.
                test_acc = test(test_loader, model)
                print("Epoch {}. Loss: {:.4f}. Test accuracy: {:.4f}".format(
                    epoch, total_loss, test_acc))
                f1.write("{} {:.4f} {:.4f}\n".format(epoch, total_loss,
                                                     test_acc))
def main():
    """Parse arguments, build the training pipeline and run all epochs.

    The model is placed on the CPU. LR milestones from ``args.lr_milestones``
    are multiplied by the number of batches per epoch before being given to
    ``MultiStepLR``. The elapsed training time is printed at the end.
    """
    # parse args
    args = parse_args()

    # build data_loader
    file_path = args.file_path
    data_loader = build_train_loader(file_path)

    device = torch.device("cpu")
    model = build_model().to(device)
    optimizer = build_optimizer(model, lr=args.lr)

    lr_milestones = [len(data_loader) * m for m in args.lr_milestones]
    lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer, milestones=lr_milestones, gamma=args.lr_gamma)

    def save_model_checkpoint():
        """Write a per-epoch and a rolling checkpoint to ``args.output_dir``.

        ``epoch`` is read from the enclosing loop at call time, so this must
        only be invoked while the training loop below is running.
        """
        if args.output_dir:
            checkpoint = {
                'model': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'lr_scheduler': lr_scheduler.state_dict(),
                'epoch': epoch,
                'args': args
            }
            torch.save(
                checkpoint,
                os.path.join(args.output_dir, 'model_{}.pth'.format(epoch)))
            torch.save(checkpoint,
                       os.path.join(args.output_dir, 'checkpoint.pth'))

    print("Start training")
    start_time = time.time()
    # The leftover ``ipdb.set_trace()`` breakpoint was removed: it halted
    # every run at this point and required the optional ``ipdb`` package.
    for epoch in range(args.epochs):
        train_one_epoch(model,
                        optimizer,
                        lr_scheduler,
                        data_loader,
                        epoch,
                        args.print_freq,
                        checkpoint_fn=save_model_checkpoint)

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
def __init__(self, config, device, resume=False):
    """Set up the model, loggers and optimizer from ``config``.

    Args:
        config: Mapping with a ``'model'`` entry (passed to
            ``utils.build_model``) and a ``'strategy'`` entry (which holds
            ``'save_path'`` and is passed to ``utils.build_optimizer``).
        device: Device the model is moved to.
        resume: When true, ``self.load_model()`` is called before the
            optimizer is built.
    """
    strategy_cfg = config['strategy']

    self.config = config
    self.cfg_stg = strategy_cfg
    self.device = device

    self.model = utils.build_model(config['model'])
    self.model.to(device)

    save_dir = strategy_cfg['save_path']
    self.logger = utils.create_logger(save_dir)
    self.tb_logger = SummaryWriter(join(save_dir, 'events'))

    # Default start epoch; the optimizer below is built with whatever value
    # this attribute holds after the optional load_model() call.
    self.start_epoch = 1
    if resume:
        self.load_model()

    self.optimizer = utils.build_optimizer(strategy_cfg, self.model,
                                           self.start_epoch)
def sim(options):
    """Load a ``SenSim`` model and run ``SenSimEval.eval`` on the MT dev data.

    Args:
        options: Settings object. Read here: ``model_path``,
            ``tokenizer_path``, ``learning_rate``, ``warmup``, ``mask_prob``,
            ``clip``, ``fp16``, ``mt_dev_path``, ``total_capacity``,
            ``batch``, ``beam_width`` and ``output``.
    """
    mt_model, text_processor = SenSim.load(options.model_path,
                                           tok_dir=options.tokenizer_path)
    print("Model initialization done!")

    # The keyword is spelled exactly as the call site has always spelled it.
    optimizer = build_optimizer(mt_model,
                                options.learning_rate,
                                warump_steps=options.warmup)
    evaluator = SenSimEval(model=mt_model,
                           mask_prob=options.mask_prob,
                           optimizer=optimizer,
                           clip=options.clip,
                           fp16=options.fp16)

    use_pinned = torch.cuda.is_available()

    dev_set = dataset.MTDataset(
        batch_pickle_dir=options.mt_dev_path,
        max_batch_capacity=options.total_capacity,
        max_batch=int(options.batch / (options.beam_width * 2)),
        pad_idx=mt_model.text_processor.pad_token_id(),
        keep_pad_idx=False)
    # batch_size=1: each dataset item is yielded on its own.
    dev_iter = data_utils.DataLoader(dev_set,
                                     batch_size=1,
                                     shuffle=False,
                                     pin_memory=use_pinned)

    evaluator.eval(mt_dev_iter=dev_iter, saving_path=options.output)
def train_model(model, loader, args):
    """Run ``args.epochs`` epochs of masked training and return the model.

    Args:
        model: Model providing ``train()``, ``loss(pred, label)`` and
            ``parameters()``; it is called on each batch.
        loader: Iterable of batches carrying ``y``, ``train_mask`` and
            ``num_graphs``; ``loader.dataset`` must support ``len``.
        args: Settings object forwarded to ``utils.build_optimizer``;
            ``epochs`` is read here.

    Returns:
        The same ``model`` object, trained in place.
    """
    scheduler, opt = utils.build_optimizer(args, model.parameters())

    for epoch in range(args.epochs):
        running_loss = 0
        model.train()
        for batch in loader:
            opt.zero_grad()
            mask = batch.train_mask
            # Loss is restricted to the nodes selected by the train mask.
            loss = model.loss(model(batch)[mask], batch.y[mask])
            loss.backward()
            opt.step()
            running_loss += loss.item() * batch.num_graphs
        # Mean loss for the epoch (kept for the optional debug print).
        running_loss /= len(loader.dataset)
        # print(running_loss)

    return model
def final_model_fn(features, labels, mode, params):
    """The model_fn for ConvNet to be used with TPUEstimator.

    Builds the network via ``models.build_model``, the softmax cross-entropy
    loss with L2 weight decay, and (in TRAIN mode) the training op with an
    optional exponential moving average of the weights.

    Args:
      features: `Tensor` of batched images, or a dict holding it under the
          key ``'feature'``.
      labels: `Tensor` of labels for the data samples
      mode: one of `tf.estimator.ModeKeys.{TRAIN,EVAL,PREDICT}`
      params: `dict` of parameters passed to the model from the TPUEstimator,
          `params['batch_size']` is always provided and should be used as the
          effective batch size. ``params['steps_per_epoch']`` is read in
          TRAIN mode.

    Returns:
      A `TPUEstimatorSpec` for the model in TRAIN/EVAL mode; in PREDICT mode
      a plain ``tf.estimator.EstimatorSpec`` is returned instead.

    Raises:
      ValueError: If ``FLAGS.data_format`` is ``'channels_first'`` while
          ``FLAGS.transpose_input`` is false.
    """
    if isinstance(features, dict):
        features = features['feature']

    # In most cases, the default data format NCHW instead of NHWC should be
    # used for a significant performance boost on GPU/TPU. NHWC should be used
    # only if the network needs to be run on CPU since the pooling operations
    # are only supported on NHWC.
    if FLAGS.data_format == 'channels_first':
        # NOTE(review): this raises when transpose_input is False, yet the
        # inline comment says channels_first is GPU-only. Confirm the
        # condition is not inverted.
        if not FLAGS.transpose_input:  # channels_first only for GPU
            raise ValueError('The option transpose_input is set to False')
        features = tf.transpose(features, [0, 3, 1, 2])

    if FLAGS.transpose_input and mode != tf.estimator.ModeKeys.PREDICT:
        features = tf.transpose(features, [3, 0, 1, 2])  # HWCN to NHWC

    # Normalize the image to zero mean and unit variance.
    # NOTE(review): the statistics are shaped [1, 1, 3] (channels last) even
    # on the channels_first path above. Confirm the broadcast is intended.
    features -= tf.constant(MEAN_RGB, shape=[1, 1, 3], dtype=features.dtype)
    features /= tf.constant(STDDEV_RGB, shape=[1, 1, 3], dtype=features.dtype)

    is_training = (mode == tf.estimator.ModeKeys.TRAIN)
    has_moving_average_decay = (FLAGS.moving_average_decay > 0)

    # This is essential, if using a keras-derived model.
    K.set_learning_phase(is_training)
    tf.logging.info('Using open-source implementation for MnasNet definition.')

    # Override params when necessary
    override_params = utils.get_override_params_dict(FLAGS)
    logits, _ = models.build_model(features,
                                   model_name=FLAGS.model_name,
                                   training=is_training,
                                   override_params=override_params)

    # Prediction needs neither labels nor loss, so return early.
    if mode == tf.estimator.ModeKeys.PREDICT:
        predictions = {
            'classes': tf.argmax(logits, axis=1),
            'probabilities': tf.nn.softmax(logits, name='softmax_tensor')
        }
        return tf.estimator.EstimatorSpec(
            mode=mode,
            predictions=predictions,
            export_outputs={
                'classify': tf.estimator.export.PredictOutput(predictions)
            })

    # If necessary, in the model_fn, use params['batch_size'] instead the batch
    # size flags (--train_batch_size or --eval_batch_size).
    batch_size = params['batch_size']  # pylint: disable=unused-variable

    # Calculate loss, which includes softmax cross entropy and L2 regularization.
    one_hot_labels = tf.one_hot(labels, FLAGS.num_label_classes)
    cross_entropy = tf.losses.softmax_cross_entropy(
        logits=logits,
        onehot_labels=one_hot_labels,
        label_smoothing=FLAGS.label_smoothing)

    # Add weight decay to the loss for non-batch-normalization variables.
    loss = cross_entropy + FLAGS.weight_decay * tf.add_n([
        tf.nn.l2_loss(v)
        for v in tf.trainable_variables()
        if 'batch_normalization' not in v.name
    ])

    global_step = tf.train.get_global_step()
    if has_moving_average_decay:
        ema = tf.train.ExponentialMovingAverage(
            decay=FLAGS.moving_average_decay, num_updates=global_step)
        ema_vars = tf.trainable_variables() + tf.get_collection('moving_vars')
        for v in tf.global_variables():
            # We maintain mva for batch norm moving mean and variance as well.
            if 'moving_mean' in v.name or 'moving_variance' in v.name:
                ema_vars.append(v)
        # De-duplicate: a variable can be collected by more than one rule.
        ema_vars = list(set(ema_vars))

    host_call = None
    restore_vars_dict = None
    if is_training:
        # Compute the current epoch and associated learning rate from global_step.
        current_epoch = (tf.cast(global_step, tf.float32) /
                         params['steps_per_epoch'])

        # Linear scaling of the base rate relative to a batch size of 256.
        scaled_lr = FLAGS.base_learning_rate * (FLAGS.train_batch_size / 256.0)
        learning_rate = utils.build_learning_rate(scaled_lr, global_step,
                                                  params['steps_per_epoch'])
        optimizer = utils.build_optimizer(learning_rate)
        if FLAGS.use_tpu:
            # When using TPU, wrap the optimizer with CrossShardOptimizer which
            # handles synchronization details between different TPU cores. To the
            # user, this should look like regular synchronous training.
            optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer)

        # Batch normalization requires UPDATE_OPS to be added as a dependency to
        # the train operation.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            train_op = optimizer.minimize(loss, global_step)

        if has_moving_average_decay:
            # The EMA update runs only after the optimizer step has run.
            with tf.control_dependencies([train_op]):
                train_op = ema.apply(ema_vars)

        if not FLAGS.skip_host_call:
            # To log the loss, current learning rate, and epoch for Tensorboard, the
            # summary op needs to be run on the host CPU via host_call. host_call
            # expects [batch_size, ...] Tensors, thus reshape to introduce a batch
            # dimension. These Tensors are implicitly concatenated to
            # [params['batch_size']].
            gs_t = tf.reshape(global_step, [1])
            loss_t = tf.reshape(loss, [1])
            lr_t = tf.reshape(learning_rate, [1])
            ce_t = tf.reshape(current_epoch, [1])

            host_call = (train_host_call_fn, [gs_t, loss_t, lr_t, ce_t])
    else:
        train_op = None
        # NOTE(review): the original indentation was lost; this block is
        # placed inside the non-training branch. Confirm against the
        # upstream file.
        if has_moving_average_decay:
            # Load moving average variables for eval.
            restore_vars_dict = ema.variables_to_restore(ema_vars)

    eval_metrics = None
    if mode == tf.estimator.ModeKeys.EVAL:
        eval_metrics = (eval_metric_fn, [labels, logits])

    num_params = np.sum([np.prod(v.shape) for v in tf.trainable_variables()])
    tf.logging.info('number of trainable parameters: {}'.format(num_params))

    def _scaffold_fn():
        # The Saver uses restore_vars_dict, which stays None when training.
        saver = tf.train.Saver(restore_vars_dict)
        return tf.train.Scaffold(saver=saver)

    return tf.contrib.tpu.TPUEstimatorSpec(
        mode=mode,
        loss=loss,
        train_op=train_op,
        host_call=host_call,
        eval_metrics=eval_metrics,
        scaffold_fn=_scaffold_fn if has_moving_average_decay else None)
def train(options):
    """Train a (Reformer) language model until ``options.step`` is exceeded.

    Creates ``options.model_path`` if needed, builds or loads the model,
    configures dropout, builds the data loaders and the optimizer, then
    calls ``LMTrainer.train_epoch`` repeatedly while ``step <= options.step``.

    Args:
        options: Settings object. Read here: ``model_path``,
            ``tokenizer_path``, ``reformer``, ``pretrained_path``,
            ``model_size``, ``dropout``, ``train_path``, ``dev_path``,
            ``cache_size``, ``continue_train``, ``learning_rate``,
            ``warmup``, ``mask_prob``, ``clip``, ``batch`` and ``step``.
    """
    if not os.path.exists(options.model_path):
        os.makedirs(options.model_path)

    text_processor = TextProcessor(options.tokenizer_path)

    lm_class = ReformerLM if options.reformer else LM
    if options.pretrained_path is None:
        lm = lm_class(text_processor=text_processor, size=options.model_size)
    else:
        lm = lm_class.load(options.pretrained_path)

    if options.reformer:
        lm.config.hidden_dropout_prob = options.dropout
        lm.config.local_attention_probs_dropout_prob = options.dropout
        lm.config.lsh_attention_probs_dropout_prob = options.dropout
    else:
        LMTrainer.config_dropout(lm, options.dropout)

    train_data = dataset.TextDataset(save_cache_dir=options.train_path,
                                     max_cache_size=options.cache_size)
    dev_data = dataset.TextDataset(save_cache_dir=options.dev_path,
                                   max_cache_size=options.cache_size,
                                   load_all=True)

    if options.continue_train:
        # SECURITY NOTE: unpickling executes arbitrary code from the file;
        # only resume from checkpoints you produced yourself.
        with open(os.path.join(options.pretrained_path, "optim"), "rb") as fp:
            optimizer = pickle.load(fp)
    else:
        optimizer = build_optimizer(lm, options.learning_rate, options.warmup)

    trainer = LMTrainer(model=lm,
                        mask_prob=options.mask_prob,
                        optimizer=optimizer,
                        clip=options.clip)

    collator = dataset.TextCollator(pad_idx=text_processor.pad_token_id())
    train_sampler, dev_sampler = None, None

    pin_memory = torch.cuda.is_available()
    loader = data_utils.DataLoader(train_data,
                                   batch_size=options.batch,
                                   shuffle=False,
                                   pin_memory=pin_memory,
                                   collate_fn=collator,
                                   sampler=train_sampler)
    dev_loader = data_utils.DataLoader(dev_data,
                                       batch_size=options.batch,
                                       shuffle=False,
                                       pin_memory=pin_memory,
                                       collate_fn=collator,
                                       sampler=dev_sampler)

    step, train_epoch = 0, 1
    while step <= options.step:
        print("train epoch", train_epoch)
        step = trainer.train_epoch(data_iter=loader,
                                   dev_data_iter=dev_loader,
                                   saving_path=options.model_path,
                                   step=step)
        # The counter was never advanced, so every pass printed "epoch 1".
        train_epoch += 1
def model_fn(features, labels, mode, params):
    """The model_fn to be used with TPUEstimator.

    Builds an EfficientNet via ``efficientnet_builder``, the softmax
    cross-entropy loss with L2 weight decay, and (in TRAIN mode) an Adam
    training op with an optional exponential moving average (EMA) of the
    weights. In EVAL mode with EMA enabled, the checkpoint is restored
    through the EMA shadow variables.

    Args:
      features: `Tensor` of batched images, or a dict holding it under the
          key ``'feature'``.
      labels: `Tensor` of labels for the data samples
      mode: one of `tf.estimator.ModeKeys.{TRAIN,EVAL,PREDICT}`
      params: `dict` of parameters passed to the model from the estimator;
          ``params['steps_per_epoch']`` is read in TRAIN mode.

    Returns:
      A ``tf.estimator.EstimatorSpec`` for the model.

    Raises:
      ValueError: If ``FLAGS.model_name`` does not start with
          ``'efficientnet'``.
    """
    if isinstance(features, dict):
        features = features['feature']

    stats_shape = [1, 1, 3]

    is_training = (mode == tf.estimator.ModeKeys.TRAIN)
    has_moving_average_decay = (FLAGS.moving_average_decay > 0)

    # This is essential, if using a keras-derived model.
    tf.keras.backend.set_learning_phase(is_training)
    tf.logging.info('Using open-source implementation.')

    # Only flags that were actually set override the model defaults.
    override_params = {}
    if FLAGS.batch_norm_momentum is not None:
        override_params['batch_norm_momentum'] = FLAGS.batch_norm_momentum
    if FLAGS.batch_norm_epsilon is not None:
        override_params['batch_norm_epsilon'] = FLAGS.batch_norm_epsilon
    if FLAGS.dropout_rate is not None:
        override_params['dropout_rate'] = FLAGS.dropout_rate
    if FLAGS.drop_connect_rate is not None:
        override_params['drop_connect_rate'] = FLAGS.drop_connect_rate
    if FLAGS.num_label_classes:
        override_params['num_classes'] = FLAGS.num_label_classes
    if FLAGS.depth_coefficient:
        override_params['depth_coefficient'] = FLAGS.depth_coefficient
    if FLAGS.width_coefficient:
        override_params['width_coefficient'] = FLAGS.width_coefficient

    def normalize_features(features, mean_rgb, stddev_rgb):
        """Normalize the image given the means and stddevs."""
        features -= tf.constant(mean_rgb,
                                shape=stats_shape,
                                dtype=features.dtype)
        features /= tf.constant(stddev_rgb,
                                shape=stats_shape,
                                dtype=features.dtype)
        return features

    def build_model():
        """Build model using the model_name given through the command line."""
        model_builder = None
        if FLAGS.model_name.startswith('efficientnet'):
            model_builder = efficientnet_builder
        else:
            raise ValueError('Model must be either efficientnet-b*')

        normalized_features = normalize_features(features,
                                                 model_builder.MEAN_RGB,
                                                 model_builder.STDDEV_RGB)
        logits, _ = model_builder.build_model(normalized_features,
                                              model_name=FLAGS.model_name,
                                              training=is_training,
                                              override_params=override_params,
                                              model_dir=FLAGS.model_dir)
        return logits

    logits = build_model()

    # Prediction needs neither labels nor loss, so return early.
    if mode == tf.estimator.ModeKeys.PREDICT:
        predictions = {
            'classes': tf.argmax(logits, axis=1),
            'probabilities': tf.nn.softmax(logits, name='softmax_tensor')
        }
        return tf.estimator.EstimatorSpec(
            mode=mode,
            predictions=predictions,
            export_outputs={
                'classify': tf.estimator.export.PredictOutput(predictions)
            })

    # Calculate loss, which includes softmax cross entropy and L2 regularization.
    one_hot_labels = tf.one_hot(labels, FLAGS.num_label_classes)
    cross_entropy = tf.losses.softmax_cross_entropy(
        logits=logits,
        onehot_labels=one_hot_labels,
        label_smoothing=FLAGS.label_smoothing)

    # Add weight decay to the loss for non-batch-normalization variables.
    loss = cross_entropy + FLAGS.weight_decay * tf.add_n([
        tf.nn.l2_loss(v)
        for v in tf.trainable_variables()
        if 'batch_normalization' not in v.name
    ])

    global_step = tf.train.get_global_step()
    if has_moving_average_decay:
        ema = tf.train.ExponentialMovingAverage(
            decay=FLAGS.moving_average_decay, num_updates=global_step)
        ema_vars = utils.get_ema_vars()

    train_op = None
    restore_vars_dict = None
    training_hooks = []
    if is_training:
        # Compute the current epoch and associated learning rate from global_step.
        current_epoch = (tf.cast(global_step, tf.float32) /
                         params['steps_per_epoch'])

        # Linear scaling of the base rate relative to a batch size of 256.
        scaled_lr = FLAGS.base_learning_rate * (FLAGS.train_batch_size / 256.0)
        learning_rate = utils.build_learning_rate(scaled_lr, global_step,
                                                  params['steps_per_epoch'])
        optimizer = utils.build_optimizer(learning_rate, optimizer_name='adam')

        # Batch normalization requires UPDATE_OPS to be added as a dependency to
        # the train operation.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            train_op = optimizer.minimize(loss, global_step)

        if has_moving_average_decay:
            # The EMA update runs only after the optimizer step has run.
            with tf.control_dependencies([train_op]):
                train_op = ema.apply(ema_vars)

        # Log loss, running accuracy and step on every training iteration.
        predictions = tf.argmax(logits, axis=1)
        top1_accuracy = tf.metrics.accuracy(labels, predictions)
        logging_hook = tf.train.LoggingTensorHook(
            {
                "loss": loss,
                # Index 1 of the (value, update_op) pair from tf.metrics.
                "accuracy": top1_accuracy[1],
                "step": global_step
            },
            every_n_iter=1)
        training_hooks.append(logging_hook)

    eval_metrics = None
    if mode == tf.estimator.ModeKeys.EVAL:
        predictions = tf.argmax(logits, axis=1)
        top1_accuracy = tf.metrics.accuracy(labels, predictions)
        eval_metrics = {'val_accuracy': top1_accuracy}

    num_params = np.sum([np.prod(v.shape) for v in tf.trainable_variables()])
    tf.logging.info('number of trainable parameters: {}'.format(num_params))

    scaffold = None
    if has_moving_average_decay and not is_training:
        # Only apply scaffold for eval jobs.
        # Map each variable to its EMA shadow name so evaluation restores the
        # averaged weights. Previously the mapping was left as None, which
        # made the Saver restore the raw weights and ignore the EMA.
        restore_vars_dict = ema.variables_to_restore(ema_vars)
        saver = tf.train.Saver(restore_vars_dict)
        scaffold = tf.train.Scaffold(saver=saver)

    return tf.estimator.EstimatorSpec(mode=mode,
                                      loss=loss,
                                      train_op=train_op,
                                      training_hooks=training_hooks,
                                      eval_metric_ops=eval_metrics,
                                      scaffold=scaffold)
def train(args, model_id, tb):
    """Train one ensemble member with step LR decay and early stopping.

    The model is chosen by ``args.model_type`` and resumed from its
    checkpoint file when that file exists. During training the function
    periodically prints the loss, saves the weights plus two ``config.json``
    copies, validates, and logs scalars to ``tb``. It returns early once the
    validation loss has risen on more than ``args.patient`` consecutive
    validations.

    Args:
        args: Settings object (seed, data paths, model type, checkpoint
            layout, LR schedule and logging intervals are read from it).
        model_id: Index of this model; used in the checkpoint file name and
            in the scalar tags.
        tb: Writer exposing ``add_scalar(tag, value, step)``.

    Raises:
        NotImplementedError: If ``args.model_type`` is not one of
            ``'lstm'``, ``'conv'``, ``'char'`` or ``'base'``.
    """
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)

    train_data = MedicalEasyEnsembleDataloader(args.train_data, args.class_id,
                                               args.batch_size, True,
                                               args.num_workers)
    val_data = MedicalEasyEnsembleDataloader(args.val_data, args.class_id,
                                             args.batch_size, False,
                                             args.num_workers)

    embedding = None
    if os.path.exists(args.w2v_file):
        embedding = utils.load_embedding(args.w2v_file,
                                         vocab_size=args.vocab_size,
                                         embedding_size=args.embedding_size)

    # Model type -> class name inside ``models``; looked up lazily so only
    # the requested class is touched.
    class_names = {
        'lstm': 'LSTMModel',
        'conv': 'ConvModel',
        'char': 'CharCNNModel',
        'base': 'BaseModel',
    }
    if args.model_type not in class_names:
        raise NotImplementedError
    model = getattr(models, class_names[args.model_type])(args, embedding)

    # Checkpoint layout: <checkpoint_path>/<class_id>/<type>_<suffix>/model_<id>.pth
    class_dir = os.path.join(args.checkpoint_path, str(args.class_id))
    variant_dir = os.path.join(
        class_dir, "%s_%s" % (args.model_type, args.type_suffix))
    weights_path = os.path.join(variant_dir, "model_%d.pth" % model_id)

    if os.path.isfile(weights_path):
        print("Load %d class %s type %dth model from previous step" %
              (args.class_id, args.model_type, model_id))
        model.load_state_dict(torch.load(weights_path))

    iteration = 0
    model = model.cuda(args.device)
    model.train()
    optimizer = utils.build_optimizer(args, model)
    loss_func = MultiBceLoss()

    cur_worse = 1000  # validation loss of the previous check
    bad_times = 0  # consecutive validations on which the loss went up

    for epoch in range(args.epochs):
        if epoch >= args.start_epoch:
            # Step decay: scale the LR by decay_rate every decay_every epochs.
            n_decays = (epoch - args.start_epoch) // args.decay_every
            utils.set_lr(optimizer, args.lr * args.decay_rate**n_decays)
        # if epoch != 0 and epoch % args.sample_every == 0:
        #     train_data.re_sample()

        for data in train_data:
            on_device = [
                item.cuda(args.device)
                if isinstance(item, torch.Tensor) else item for item in data
            ]
            report_ids, sentence_ids, sentence_lengths, output_vec = on_device

            optimizer.zero_grad()
            loss = loss_func(model(sentence_ids, sentence_lengths),
                             output_vec)
            loss.backward()
            train_loss = loss.item()
            optimizer.step()
            iteration += 1

            if iteration % args.print_every == 0:
                print("iter %d epoch %d loss: %.3f" %
                      (iteration, epoch, train_loss))

            if iteration % args.save_every == 0:
                torch.save(model.state_dict(), weights_path)
                # The run configuration is stored at both directory levels.
                for config_dir in (class_dir, variant_dir):
                    with open(os.path.join(config_dir, "config.json"),
                              'w',
                              encoding='utf-8') as config_f:
                        json.dump(vars(args), config_f, indent=2)

            if iteration % args.val_every == 0:
                val_loss = eval_model(model, loss_func, val_data, epoch)
                tb.add_scalar("model_%d val_loss" % model_id, val_loss,
                              iteration)
                got_worse = val_loss > cur_worse
                if got_worse:
                    print("Bad Time Appear")
                bad_times = bad_times + 1 if got_worse else 0
                cur_worse = val_loss
                if bad_times > args.patient:
                    print('Early Stop !!!!')
                    return

            if iteration % args.loss_log_every == 0:
                tb.add_scalar("model_%d train_loss" % model_id, loss.item(),
                              iteration)

    print("The train finished")
def train(dataset, task, args):
    """Train a ``models.GNNStack``, checkpoint the best model and plot accuracy.

    For ``task == 'graph'`` the dataset is shuffled and split 80/20 into
    training and test loaders. For ``task == 'node'`` one loader is shared
    and the split comes from the masks; accuracy during training is measured
    with ``is_validation=True``.

    Every tenth epoch the accuracy is recorded, and the weights are saved to
    ``<model_type><timestamp>.pt`` whenever it improves. After training the
    accuracy curve is saved to ``<model_type><timestamp>.png``. For the node
    task the best checkpoint is then reloaded and evaluated with
    ``is_validation=False``.

    Args:
        dataset: Dataset exposing ``num_node_features`` and ``num_classes``.
        task: ``'graph'`` or ``'node'``.
        args: Settings object (``batch_size``, ``hidden_dim``, ``epochs``,
            ``model_type`` are read here).

    Raises:
        RuntimeError: If ``task`` is neither ``'graph'`` nor ``'node'``.
    """
    global device
    if task == 'graph':
        # graph classification: separate dataloader for test set
        # shuffle dataset before splitting
        data_size = len(dataset)
        idxs = np.arange(data_size).astype(int)
        np.random.shuffle(idxs)
        idxs = list(idxs)
        dataset = dataset[idxs]
        split = int(data_size * 0.8)
        loader = DataLoader(dataset[:split],
                            batch_size=args.batch_size,
                            shuffle=True)
        test_loader = DataLoader(dataset[split:],
                                 batch_size=args.batch_size,
                                 shuffle=True)
    elif task == 'node':
        # use mask to split train/validation/test
        test_loader = loader = DataLoader(dataset,
                                          batch_size=args.batch_size,
                                          shuffle=True)
    else:
        raise RuntimeError('Unknown task')

    # build model
    model = models.GNNStack(dataset.num_node_features,
                            args.hidden_dim,
                            dataset.num_classes,
                            args,
                            task=task)
    model = model.to(device)
    print(model)
    scheduler, opt = utils.build_optimizer(args, model.parameters())

    # train
    test_accs = []
    best_acc = 0
    timestr = time.strftime("%Y%m%d-%H%M%S")
    checkpoint_path = str(args.model_type) + timestr + '.pt'
    saved_best = False  # True once a checkpoint has actually been written
    for epoch in range(args.epochs):
        total_loss = 0
        model.train()
        for batch in loader:
            batch = batch.to(device)
            opt.zero_grad()
            pred = model(batch)
            label = batch.y
            if task == 'node':
                pred = pred[batch.train_mask]
                label = label[batch.train_mask]
            loss = model.loss(pred, label)
            loss.backward()
            opt.step()
            total_loss += loss.item() * batch.num_graphs
        total_loss /= len(loader.dataset)
        print(total_loss)

        if epoch % 10 == 0:
            if task == 'graph':
                test_acc = test(test_loader, model)
            else:
                test_acc = test(loader, model, is_validation=True)
            test_accs.append(test_acc)
            print(test_acc, ' test')
            # save best model
            if test_acc > best_acc:
                best_acc = test_acc
                torch.save(model.state_dict(), checkpoint_path)
                saved_best = True

    # plot accuracies
    # One point per evaluated epoch (0, 10, 20, ...). Derived from
    # ``args.epochs`` rather than the loop variable, which is unbound when
    # no epoch ran.
    x = range(0, args.epochs, 10)
    plt.plot(x, test_accs)
    plt.savefig(str(args.model_type) + timestr + '.png')
    print(f'best achieved accuracy: {best_acc}')

    if model.task == 'node':
        best_model = models.GNNStack(dataset.num_node_features,
                                     args.hidden_dim,
                                     dataset.num_classes,
                                     args,
                                     task=task)
        if saved_best:
            best_model.load_state_dict(torch.load(checkpoint_path))
        else:
            # No evaluation ever beat the initial best of 0, so no checkpoint
            # file exists; fall back to the weights currently in memory.
            best_model.load_state_dict(model.state_dict())
        best_model = best_model.to(device)
        test_acc = test(loader, best_model, is_validation=False)
        print(f'test accuracy: {test_acc}')
def train(model, A, X, L, args, normalize_adjacency=False):
    """Train ``model`` on one graph with a random node split, then checkpoint.

    Nodes are shuffled and the first ``args.train_ratio`` fraction is used
    for training. The whole graph is fed as a single batch each epoch, and
    the loss is computed on the training nodes only. After training the
    model is run once more in eval mode; its predictions are stored with the
    inputs via ``utils.save_checkpoint``.

    Args:
        model: Module called as ``model(x, adj)`` returning
            ``(ypred, adj_att)`` and providing ``loss(pred, labels)``.
        A: Adjacency matrix; ``A.shape[0]`` is taken as the node count.
        X: Node feature array.
        L: Node label array.
        args: Settings object (``train_ratio``, ``weight_decay``,
            ``num_epochs``, ``clip`` are read here).
        normalize_adjacency: When true, ``normalize_A(A)`` is used in place
            of ``A``.
    """
    num_nodes = A.shape[0]
    num_train = int(num_nodes * args.train_ratio)
    idx = list(range(num_nodes))

    np.random.shuffle(idx)
    train_idx = idx[:num_train]
    test_idx = idx[num_train:]

    A_ = normalize_A(A) if normalize_adjacency else A

    # add batch dim
    A_ = np.expand_dims(A_, axis=0)
    X_ = np.expand_dims(X, axis=0)
    L_ = np.expand_dims(L, axis=0)

    labels_train = torch.tensor(L_[:, train_idx], dtype=torch.long)
    adj = torch.tensor(A_, dtype=torch.float)
    x = torch.tensor(X_, requires_grad=True, dtype=torch.float)
    scheduler, optimizer = utils.build_optimizer(
        args, model.parameters(), weight_decay=args.weight_decay)
    model.train()
    ypred = None
    # Defined up front so the confusion-matrix report below is skipped,
    # rather than crashing, when no epoch runs.
    result_train = result_test = None
    for epoch in range(args.num_epochs):
        begin_time = time.time()
        model.zero_grad()

        ypred, adj_att = model(x, adj)
        ypred_train = ypred[:, train_idx, :]
        loss = model.loss(ypred_train, labels_train)
        loss.backward()
        # In-place variant; ``clip_grad_norm`` (no underscore) is deprecated.
        nn.utils.clip_grad_norm_(model.parameters(), args.clip)

        optimizer.step()
        elapsed = time.time() - begin_time

        result_train, result_test = evaluate_node(ypred.cpu(), L_, train_idx,
                                                  test_idx)
        if epoch % 10 == 0:
            print(
                "epoch: ",
                epoch,
                "; loss: ",
                loss.item(),
                "; train_acc: ",
                result_train["acc"],
                "; test_acc: ",
                result_test["acc"],
                "; train_prec: ",
                result_train["prec"],
                "; test_prec: ",
                result_test["prec"],
                "; epoch time: ",
                "{0:0.2f}".format(elapsed),
            )

        if scheduler is not None:
            scheduler.step()

    if result_train is not None:
        print(result_train["conf_mat"])
        print(result_test["conf_mat"])

    # Final forward pass in eval mode; its predictions are what gets saved.
    model.eval()
    ypred, _ = model(x, adj)

    save_data = {
        "adj": A_,
        "feat": X_,
        "label": L_,
        "pred": ypred.cpu().detach().numpy(),
        "train_idx": train_idx,
    }

    utils.save_checkpoint(model,
                          optimizer,
                          args,
                          num_epochs=-1,
                          save_data=save_data)
def model_fn(features, mode, params):
    '''The model_fn to be used with TPUEstimator.

    Builds the student network (and, when `FLAGS.teacher_model_name` is set,
    a teacher network whose logits are wrapped in `tf.stop_gradient`), then
    assembles the labeled / unlabeled losses, the train op and the evaluation
    metrics for the requested `mode`.

    Args:
      features: mapping indexed by name. `features['image']` is always read;
        `features['label']` is read unless `mode` is PREDICT;
        `features['unl_probs']` is read when `FLAGS.unlabel_ratio` is set
        (for the unlabeled batch size while training, and for the soft
        targets when no teacher model is configured);
        `features['teacher_image']` is read when `FLAGS.teacher_model_name`
        is set.
      mode: one of `tf.estimator.ModeKeys.{TRAIN,EVAL,PREDICT}`
      params: `dict` of parameters passed to the model from the TPUEstimator,
        `params['batch_size']` is always provided and should be used as the
        effective batch size. `params['use_bfloat16']` and (when training)
        `params['steps_per_epoch']` are read as well.

    Returns:
      A `TPUEstimatorSpec` for the model
    '''

    def preprocess_image(image):
        # In most cases, the default data format NCHW instead of NHWC should be
        # used for a significant performance boost on GPU. NHWC should be used
        # only if the network needs to be run on CPU since the pooling operations
        # are only supported on NHWC. TPU uses XLA compiler to figure out best layout.
        if FLAGS.data_format == 'channels_first':
            assert not FLAGS.transpose_input  # channels_first only for GPU
            image = tf.transpose(image, [0, 3, 1, 2])

        if FLAGS.transpose_input and mode == tf.estimator.ModeKeys.TRAIN:
            image = tf.transpose(image, [3, 0, 1, 2])  # HWCN to NHWC
        return image

    def normalize_image(image):
        # Normalize the image to zero mean and unit variance.
        if FLAGS.data_format == 'channels_first':
            stats_shape = [3, 1, 1]
        else:
            stats_shape = [1, 1, 3]
        mean, std = task_info.get_mean_std(FLAGS.task_name)
        image -= tf.constant(mean, shape=stats_shape, dtype=image.dtype)
        image /= tf.constant(std, shape=stats_shape, dtype=image.dtype)
        return image

    image = features['image']
    image = preprocess_image(image)

    image_shape = image.get_shape().as_list()
    tf.logging.info('image shape: {}'.format(image_shape))
    is_training = (mode == tf.estimator.ModeKeys.TRAIN)
    if mode != tf.estimator.ModeKeys.PREDICT:
        labels = features['label']
    else:
        labels = None

    # If necessary, in the model_fn, use params['batch_size'] instead the batch
    # size flags (--train_batch_size or --eval_batch_size).
    # NOTE(review): despite the pylint pragma, batch_size IS used below (in the
    # assert and in real_unl_bsz).
    batch_size = params['batch_size']  # pylint: disable=unused-variable

    # The leading lab_bsz rows of the batch are treated as labeled and the
    # trailing unl_bsz rows as unlabeled (see the [:lab_bsz] / [lab_bsz:]
    # slices further down).
    if FLAGS.unlabel_ratio and is_training:
        unl_bsz = features['unl_probs'].shape[0]
    else:
        unl_bsz = 0

    lab_bsz = image.shape[0] - unl_bsz
    assert lab_bsz == batch_size

    # Scalars collected here are handed to utils.construct_scalar_host_call
    # when training.
    metric_dict = {}
    global_step = tf.train.get_global_step()

    has_moving_average_decay = (FLAGS.moving_average_decay > 0)
    # This is essential, if using a keras-derived model.
    tf.keras.backend.set_learning_phase(is_training)
    tf.logging.info('Using open-source implementation.')
    # Only flags that are set are forwarded to the model builder.
    override_params = {}
    if FLAGS.dropout_rate is not None:
        override_params['dropout_rate'] = FLAGS.dropout_rate
    if FLAGS.stochastic_depth_rate is not None:
        override_params['stochastic_depth_rate'] = FLAGS.stochastic_depth_rate
    if FLAGS.data_format:
        override_params['data_format'] = FLAGS.data_format
    if FLAGS.num_label_classes:
        override_params['num_classes'] = FLAGS.num_label_classes
    if FLAGS.depth_coefficient:
        override_params['depth_coefficient'] = FLAGS.depth_coefficient
    if FLAGS.width_coefficient:
        override_params['width_coefficient'] = FLAGS.width_coefficient

    def build_model(scope=None,
                    reuse=tf.AUTO_REUSE,
                    model_name=None,
                    model_is_training=None,
                    input_image=None,
                    use_adv_bn=False,
                    is_teacher=False):
        # Arguments left at None fall back to the enclosing model_fn's values
        # (FLAGS.model_name, is_training, image).
        model_name = model_name or FLAGS.model_name
        if model_is_training is None:
            model_is_training = is_training
        if input_image is None:
            input_image = image
        input_image = normalize_image(input_image)

        scope_model_name = model_name

        if scope:
            scope = scope + '/'
        else:
            scope = ''
        with tf.variable_scope(scope + scope_model_name, reuse=reuse):
            if model_name.startswith('efficientnet'):
                logits, _ = efficientnet_builder.build_model(
                    input_image,
                    model_name=model_name,
                    training=model_is_training,
                    override_params=override_params,
                    model_dir=FLAGS.model_dir,
                    use_adv_bn=use_adv_bn,
                    is_teacher=is_teacher)
            else:
                assert False, 'model {} not implemented'.format(model_name)
        return logits

    if params['use_bfloat16']:
        with tf.tpu.bfloat16_scope():
            logits = tf.cast(build_model(), tf.float32)
    else:
        logits = build_model()

    if FLAGS.teacher_model_name:
        teacher_image = preprocess_image(features['teacher_image'])
        if params['use_bfloat16']:
            with tf.tpu.bfloat16_scope():
                teacher_logits = tf.cast(
                    build_model(scope='teacher_model',
                                model_name=FLAGS.teacher_model_name,
                                model_is_training=False,
                                input_image=teacher_image,
                                is_teacher=True), tf.float32)
        else:
            teacher_logits = build_model(scope='teacher_model',
                                         model_name=FLAGS.teacher_model_name,
                                         model_is_training=False,
                                         input_image=teacher_image,
                                         is_teacher=True)
        teacher_logits = tf.stop_gradient(teacher_logits)
        # A temperature of -1 selects hard (argmax) teacher labels; any other
        # value selects temperature-scaled soft targets.
        if FLAGS.teacher_softmax_temp != -1:
            teacher_prob = tf.nn.softmax(teacher_logits /
                                         FLAGS.teacher_softmax_temp)
        else:
            teacher_prob = None
            # NOTE(review): labels is None in PREDICT mode, so labels.dtype
            # would fail if this branch is reached while predicting -- confirm
            # that a hard-label teacher is never configured for PREDICT.
            teacher_one_hot_pred = tf.argmax(teacher_logits,
                                             axis=1,
                                             output_type=labels.dtype)

    if mode == tf.estimator.ModeKeys.PREDICT:
        if has_moving_average_decay:
            ema = tf.train.ExponentialMovingAverage(
                decay=FLAGS.moving_average_decay)
            ema_vars = utils.get_all_variable()
            restore_vars_dict = ema.variables_to_restore(ema_vars)
            tf.logging.info(
                'restored variables:\n%s',
                json.dumps(sorted(restore_vars_dict.keys()), indent=4))

        predictions = {
            'classes': tf.argmax(logits, axis=1),
            'probabilities': tf.nn.softmax(logits, name='softmax_tensor')
        }
        # restore_vars_dict is only bound when has_moving_average_decay is
        # true; the conditional expression below only evaluates the partial
        # in that case.
        return tf.estimator.tpu.TPUEstimatorSpec(
            mode=mode,
            predictions=predictions,
            scaffold_fn=functools.partial(_scaffold_fn,
                                          restore_vars_dict=restore_vars_dict)
            if has_moving_average_decay else None)

    if has_moving_average_decay:
        ema_step = global_step
        ema = tf.train.ExponentialMovingAverage(
            decay=FLAGS.moving_average_decay, num_updates=ema_step)
        ema_vars = utils.get_all_variable()

    # Metrics on the labeled part of the batch.
    lab_labels = labels[:lab_bsz]
    lab_logits = logits[:lab_bsz]
    lab_pred = tf.argmax(lab_logits, axis=-1, output_type=labels.dtype)
    lab_prob = tf.nn.softmax(lab_logits)
    lab_acc = tf.to_float(tf.equal(lab_pred, lab_labels))
    metric_dict['lab/acc'] = tf.reduce_mean(lab_acc)
    metric_dict['lab/pred_prob'] = tf.reduce_mean(
        tf.reduce_max(lab_prob, axis=-1))
    # NOTE(review): this value is recomputed identically under
    # "compute lab_loss" below before it is used.
    one_hot_labels = tf.one_hot(lab_labels, FLAGS.num_label_classes)

    # Metrics on the unlabeled part of the batch.
    if FLAGS.unlabel_ratio:
        unl_labels = labels[lab_bsz:]
        unl_logits = logits[lab_bsz:]
        unl_pred = tf.argmax(unl_logits, axis=-1, output_type=labels.dtype)
        unl_prob = tf.nn.softmax(unl_logits)
        unl_acc = tf.to_float(tf.equal(unl_pred, unl_labels))
        metric_dict['unl/acc_to_dump'] = tf.reduce_mean(unl_acc)
        metric_dict['unl/pred_prob'] = tf.reduce_mean(
            tf.reduce_max(unl_prob, axis=-1))

    # compute lab_loss
    one_hot_labels = tf.one_hot(lab_labels, FLAGS.num_label_classes)
    lab_loss = tf.losses.softmax_cross_entropy(
        logits=lab_logits,
        onehot_labels=one_hot_labels,
        label_smoothing=FLAGS.label_smoothing,
        reduction=tf.losses.Reduction.NONE)
    if FLAGS.label_data_sample_prob != 1:
        # mask out part of the labeled data
        random_mask = tf.floor(
            FLAGS.label_data_sample_prob +
            tf.random_uniform(tf.shape(lab_loss), dtype=lab_loss.dtype))
        lab_loss = tf.reduce_mean(lab_loss * random_mask)
    else:
        lab_loss = tf.reduce_mean(lab_loss)
    metric_dict['lab/loss'] = lab_loss

    if FLAGS.unlabel_ratio:
        if FLAGS.teacher_softmax_temp == -1:  # Hard labels
            # Get one-hot labels
            if FLAGS.teacher_model_name:
                ext_teacher_pred = teacher_one_hot_pred[lab_bsz:]
                one_hot_labels = tf.one_hot(ext_teacher_pred,
                                            FLAGS.num_label_classes)
            else:
                one_hot_labels = tf.one_hot(unl_labels,
                                            FLAGS.num_label_classes)
            # Compute cross entropy
            unl_loss = tf.losses.softmax_cross_entropy(
                logits=unl_logits,
                onehot_labels=one_hot_labels,
                label_smoothing=FLAGS.label_smoothing)
        else:  # Soft labels
            # Get teacher prob
            if FLAGS.teacher_model_name:
                unl_teacher_prob = teacher_prob[lab_bsz:]
            else:
                # Re-sharpen the stored probabilities with the temperature
                # and renormalise them over the last axis.
                scaled_prob = tf.pow(features['unl_probs'],
                                     1 / FLAGS.teacher_softmax_temp)
                unl_teacher_prob = scaled_prob / tf.reduce_sum(
                    scaled_prob, axis=-1, keepdims=True)
            metric_dict['unl/target_prob'] = tf.reduce_mean(
                tf.reduce_max(unl_teacher_prob, axis=-1))
            unl_loss = cross_entropy(unl_teacher_prob,
                                     unl_logits,
                                     return_mean=True)

        metric_dict['ext/loss'] = unl_loss
    else:
        unl_loss = 0

    # Combine both mean losses weighted by their batch sizes, then normalise
    # by the labeled batch size.
    real_lab_bsz = tf.to_float(lab_bsz) * FLAGS.label_data_sample_prob
    real_unl_bsz = batch_size * FLAGS.label_data_sample_prob * FLAGS.unlabel_ratio
    data_loss = lab_loss * real_lab_bsz + unl_loss * real_unl_bsz
    data_loss = data_loss / real_lab_bsz

    # Add weight decay to the loss for non-batch-normalization variables.
    loss = data_loss + FLAGS.weight_decay * tf.add_n([
        tf.nn.l2_loss(v)
        for v in tf.trainable_variables()
        if 'batch_normalization' not in v.name
    ])
    metric_dict['train/data_loss'] = data_loss
    metric_dict['train/loss'] = loss
    host_call = None
    restore_vars_dict = None

    if is_training:
        # Compute the current epoch and associated learning rate from global_step.
        current_epoch = (tf.cast(global_step, tf.float32) /
                         params['steps_per_epoch'])
        real_train_batch_size = FLAGS.train_batch_size
        real_train_batch_size *= FLAGS.label_data_sample_prob
        scaled_lr = FLAGS.base_learning_rate * (real_train_batch_size / 256.0)
        if FLAGS.final_base_lr:
            # total number of training epochs
            total_epochs = FLAGS.train_steps * FLAGS.train_batch_size * 1. / FLAGS.num_train_images - 5
            decay_times = math.log(FLAGS.final_base_lr / FLAGS.base_learning_rate) / math.log(0.97)
            decay_epochs = total_epochs / decay_times
            tf.logging.info(
                'setting decay_epochs to {:.2f}'.format(decay_epochs) +
                '\n' * 3)
        else:
            decay_epochs = 2.4 * FLAGS.train_ratio
        learning_rate = utils.build_learning_rate(
            scaled_lr,
            global_step,
            params['steps_per_epoch'],
            decay_epochs=decay_epochs,
            start_from_step=FLAGS.train_steps - FLAGS.train_last_step_num,
            warmup_epochs=5,
        )
        metric_dict['train/lr'] = learning_rate
        metric_dict['train/epoch'] = current_epoch
        optimizer = utils.build_optimizer(learning_rate)
        if FLAGS.use_tpu:
            # When using TPU, wrap the optimizer with CrossShardOptimizer which
            # handles synchronization details between different TPU cores. To the
            # user, this should look like regular synchronous training.
            optimizer = tf.tpu.CrossShardOptimizer(optimizer)

        # Batch normalization requires UPDATE_OPS to be added as a dependency to
        # the train operation.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)

        # Only variables outside the 'teacher_model' scope are optimised.
        tvars = tf.trainable_variables()
        g_vars = []
        tvars = sorted(tvars, key=lambda var: var.name)
        for var in tvars:
            if 'teacher_model' not in var.name:
                g_vars += [var]
        with tf.control_dependencies(update_ops):
            train_op = optimizer.minimize(loss, global_step, var_list=g_vars)

        if has_moving_average_decay:
            # The EMA update is chained after the optimiser step.
            with tf.control_dependencies([train_op]):
                train_op = ema.apply(ema_vars)

        if not FLAGS.skip_host_call:
            host_call = utils.construct_scalar_host_call(metric_dict)
        scaffold_fn = None
        if FLAGS.teacher_model_name or FLAGS.init_model:
            # scaffold_fn is still None at this point when it is passed in.
            scaffold_fn = utils.init_from_ckpt(scaffold_fn)
    else:
        train_op = None
        if has_moving_average_decay:
            # Load moving average variables for eval.
            restore_vars_dict = ema.variables_to_restore(ema_vars)

    eval_metrics = None
    if mode == tf.estimator.ModeKeys.EVAL:
        scaffold_fn = functools.partial(_scaffold_fn,
                                        restore_vars_dict=restore_vars_dict
                                        ) if has_moving_average_decay else None

        def metric_fn(labels, logits):
            '''Evaluation metric function. Evaluates accuracy.

            This function is executed on the CPU and should not directly reference
            any Tensors in the rest of the `model_fn`. To pass Tensors from the model
            to the `metric_fn`, provide as part of the `eval_metrics`. See
            https://www.tensorflow.org/api_docs/python/tf/contrib/tpu/TPUEstimatorSpec
            for more information.

            Arguments should match the list of `Tensor` objects passed as the second
            element in the tuple passed to `eval_metrics`.

            Args:
              labels: `Tensor` with shape `[batch]`.
              logits: `Tensor` with shape `[batch, num_classes]`.

            Returns:
              A dict of the metrics to return from evaluation.
            '''
            predictions = tf.argmax(logits, axis=1)
            top_1_accuracy = tf.metrics.accuracy(labels, predictions)
            in_top_5 = tf.cast(tf.nn.in_top_k(logits, labels, 5), tf.float32)
            top_5_accuracy = tf.metrics.mean(in_top_5)

            result_dict = {
                'top_1_accuracy': top_1_accuracy,
                'top_5_accuracy': top_5_accuracy,
            }

            return result_dict

        eval_metrics = (metric_fn, [labels, logits])

    num_params = np.sum([np.prod(v.shape) for v in tf.trainable_variables()])
    tf.logging.info('number of trainable parameters: {}'.format(num_params))

    return tf.estimator.tpu.TPUEstimatorSpec(mode=mode,
                                             loss=loss,
                                             train_op=train_op,
                                             host_call=host_call,
                                             eval_metrics=eval_metrics,
                                             scaffold_fn=scaffold_fn)
def mnasnet_model_fn(features, labels, mode, params):
    """The model_fn for MnasNet to be used with TPUEstimator.

    Normalizes the input images, builds the MnasNet model named by
    `params['model_name']`, and returns the spec for the requested `mode`:
    predictions for PREDICT, otherwise the cross-entropy plus weight-decay
    loss together with the train op (TRAIN) or the top-1 / top-5 accuracy
    metrics (EVAL).

    Args:
      features: `Tensor` of batched images, or a dict holding that tensor
        under the key `'feature'`.
      labels: `Tensor` of labels for the data samples
      mode: one of `tf.estimator.ModeKeys.{TRAIN,EVAL,PREDICT}`
      params: `dict` of parameters passed to the model from the TPUEstimator,
        `params['batch_size']` is always provided and should be used as the
        effective batch size.

    Returns:
      A `TPUEstimatorSpec` for the model
    """
    is_training = (mode == tf.estimator.ModeKeys.TRAIN)
    # This is essential, if using a keras-derived model.
    K.set_learning_phase(is_training)

    if isinstance(features, dict):
        features = features['feature']

    if mode == tf.estimator.ModeKeys.PREDICT:
        # Adds an identity node to help TFLite export.
        features = tf.identity(features, 'float_image_input')

    # In most cases, the default data format NCHW instead of NHWC should be
    # used for a significant performance boost on GPU. NHWC should be used
    # only if the network needs to be run on CPU since the pooling operations
    # are only supported on NHWC. TPU uses XLA compiler to figure out best layout.
    if params['data_format'] == 'channels_first':
        assert not params['transpose_input']  # channels_first only for GPU
        features = tf.transpose(features, [0, 3, 1, 2])
        stats_shape = [3, 1, 1]
    else:
        stats_shape = [1, 1, 3]

    if params['transpose_input'] and mode != tf.estimator.ModeKeys.PREDICT:
        features = tf.transpose(features, [3, 0, 1, 2])  # HWCN to NHWC

    # Normalize the image to zero mean and unit variance.
    features -= tf.constant(imagenet_input.MEAN_RGB,
                            shape=stats_shape,
                            dtype=features.dtype)
    features /= tf.constant(imagenet_input.STDDEV_RGB,
                            shape=stats_shape,
                            dtype=features.dtype)

    has_moving_average_decay = (params['moving_average_decay'] > 0)

    tf.logging.info('Using open-source implementation for MnasNet definition.')
    # Only truthy params are forwarded as overrides; 'use_keras' is always
    # forwarded.
    override_params = {}
    if params['batch_norm_momentum']:
        override_params['batch_norm_momentum'] = params['batch_norm_momentum']
    if params['batch_norm_epsilon']:
        override_params['batch_norm_epsilon'] = params['batch_norm_epsilon']
    if params['dropout_rate']:
        override_params['dropout_rate'] = params['dropout_rate']
    if params['data_format']:
        override_params['data_format'] = params['data_format']
    if params['num_label_classes']:
        override_params['num_classes'] = params['num_label_classes']
    if params['depth_multiplier']:
        override_params['depth_multiplier'] = params['depth_multiplier']
    if params['depth_divisor']:
        override_params['depth_divisor'] = params['depth_divisor']
    if params['min_depth']:
        override_params['min_depth'] = params['min_depth']
    override_params['use_keras'] = params['use_keras']

    if params['precision'] == 'bfloat16':
        with tf.contrib.tpu.bfloat16_scope():
            logits, _ = mnasnet_models.build_mnasnet_model(
                features,
                model_name=params['model_name'],
                training=is_training,
                override_params=override_params)
        # The logits are cast back to float32 before the loss / predictions.
        logits = tf.cast(logits, tf.float32)
    else:  # params['precision'] == 'float32'
        logits, _ = mnasnet_models.build_mnasnet_model(
            features,
            model_name=params['model_name'],
            training=is_training,
            override_params=override_params)

    if params['quantized_training']:
        if is_training:
            tf.logging.info('Adding fake quantization ops for training.')
            tf.contrib.quantize.create_training_graph(
                quant_delay=int(params['steps_per_epoch'] *
                                FLAGS.quantization_delay_epochs))
        else:
            tf.logging.info('Adding fake quantization ops for evaluation.')
            tf.contrib.quantize.create_eval_graph()

    if mode == tf.estimator.ModeKeys.PREDICT:
        scaffold_fn = None
        if FLAGS.export_moving_average:
            # If the model is trained with moving average decay, to match evaluation
            # metrics, we need to export the model using moving average variables.
            restore_checkpoint = tf.train.latest_checkpoint(FLAGS.model_dir)
            variables_to_restore = get_pretrained_variables_to_restore(
                restore_checkpoint, load_moving_average=True)
            tf.logging.info('Restoring from the latest checkpoint: %s',
                            restore_checkpoint)
            tf.logging.info(str(variables_to_restore))

            def restore_scaffold():
                saver = tf.train.Saver(variables_to_restore)
                return tf.train.Scaffold(saver=saver)

            scaffold_fn = restore_scaffold

        predictions = {
            'classes': tf.argmax(logits, axis=1),
            'probabilities': tf.nn.softmax(logits, name='softmax_tensor')
        }
        return tf.contrib.tpu.TPUEstimatorSpec(
            mode=mode,
            predictions=predictions,
            export_outputs={
                'classify': tf.estimator.export.PredictOutput(predictions)
            },
            scaffold_fn=scaffold_fn)

    # If necessary, in the model_fn, use params['batch_size'] instead the batch
    # size flags (--train_batch_size or --eval_batch_size).
    batch_size = params['batch_size']  # pylint: disable=unused-variable

    # Calculate loss, which includes softmax cross entropy and L2 regularization.
    one_hot_labels = tf.one_hot(labels, params['num_label_classes'])
    cross_entropy = tf.losses.softmax_cross_entropy(
        logits=logits,
        onehot_labels=one_hot_labels,
        label_smoothing=params['label_smoothing'])

    # Add weight decay to the loss for non-batch-normalization variables.
    loss = cross_entropy + params['weight_decay'] * tf.add_n([
        tf.nn.l2_loss(v)
        for v in tf.trainable_variables()
        if 'batch_normalization' not in v.name
    ])

    global_step = tf.train.get_global_step()
    if has_moving_average_decay:
        ema = tf.train.ExponentialMovingAverage(
            decay=params['moving_average_decay'], num_updates=global_step)
        ema_vars = utils.get_ema_vars()

    host_call = None
    if is_training:
        # Compute the current epoch and associated learning rate from global_step.
        current_epoch = (tf.cast(global_step, tf.float32) /
                         params['steps_per_epoch'])

        scaled_lr = params['base_learning_rate'] * (params['train_batch_size'] / 256.0)  # pylint: disable=line-too-long
        learning_rate = utils.build_learning_rate(scaled_lr, global_step,
                                                  params['steps_per_epoch'])
        optimizer = utils.build_optimizer(learning_rate)
        if params['use_tpu']:
            # When using TPU, wrap the optimizer with CrossShardOptimizer which
            # handles synchronization details between different TPU cores. To the
            # user, this should look like regular synchronous training.
            optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer)

        # Batch normalization requires UPDATE_OPS to be added as a dependency to
        # the train operation.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            train_op = optimizer.minimize(loss, global_step)

        if has_moving_average_decay:
            # The EMA update is chained after the optimiser step.
            with tf.control_dependencies([train_op]):
                train_op = ema.apply(ema_vars)

        if not params['skip_host_call']:

            def host_call_fn(gs, loss, lr, ce):
                """Training host call. Creates scalar summaries for training metrics.

                This function is executed on the CPU and should not directly reference
                any Tensors in the rest of the `model_fn`. To pass Tensors from the
                model to the `metric_fn`, provide as part of the `host_call`. See
                https://www.tensorflow.org/api_docs/python/tf/contrib/tpu/TPUEstimatorSpec
                for more information.

                Arguments should match the list of `Tensor` objects passed as the second
                element in the tuple passed to `host_call`.

                Args:
                  gs: `Tensor with shape `[batch]` for the global_step
                  loss: `Tensor` with shape `[batch]` for the training loss.
                  lr: `Tensor` with shape `[batch]` for the learning_rate.
                  ce: `Tensor` with shape `[batch]` for the current_epoch.

                Returns:
                  List of summary ops to run on the CPU host.
                """
                gs = gs[0]
                # Host call fns are executed params['iterations_per_loop'] times after
                # one TPU loop is finished, setting max_queue value to the same as
                # number of iterations will make the summary writer only flush the
                # data to storage once per loop.
                with tf.contrib.summary.create_file_writer(
                        FLAGS.model_dir,
                        max_queue=params['iterations_per_loop']).as_default():
                    with tf.contrib.summary.always_record_summaries():
                        tf.contrib.summary.scalar('loss', loss[0], step=gs)
                        tf.contrib.summary.scalar('learning_rate',
                                                  lr[0],
                                                  step=gs)
                        tf.contrib.summary.scalar('current_epoch',
                                                  ce[0],
                                                  step=gs)

                        return tf.contrib.summary.all_summary_ops()

            # To log the loss, current learning rate, and epoch for Tensorboard, the
            # summary op needs to be run on the host CPU via host_call. host_call
            # expects [batch_size, ...] Tensors, thus reshape to introduce a batch
            # dimension. These Tensors are implicitly concatenated to
            # [params['batch_size']].
            gs_t = tf.reshape(global_step, [1])
            loss_t = tf.reshape(loss, [1])
            lr_t = tf.reshape(learning_rate, [1])
            ce_t = tf.reshape(current_epoch, [1])

            host_call = (host_call_fn, [gs_t, loss_t, lr_t, ce_t])

    else:
        train_op = None

    eval_metrics = None
    if mode == tf.estimator.ModeKeys.EVAL:

        def metric_fn(labels, logits):
            """Evaluation metric function. Evaluates accuracy.

            This function is executed on the CPU and should not directly reference
            any Tensors in the rest of the `model_fn`. To pass Tensors from the model
            to the `metric_fn`, provide as part of the `eval_metrics`. See
            https://www.tensorflow.org/api_docs/python/tf/contrib/tpu/TPUEstimatorSpec
            for more information.

            Arguments should match the list of `Tensor` objects passed as the second
            element in the tuple passed to `eval_metrics`.

            Args:
              labels: `Tensor` with shape `[batch]`.
              logits: `Tensor` with shape `[batch, num_classes]`.

            Returns:
              A dict of the metrics to return from evaluation.
            """
            predictions = tf.argmax(logits, axis=1)
            top_1_accuracy = tf.metrics.accuracy(labels, predictions)
            in_top_5 = tf.cast(tf.nn.in_top_k(logits, labels, 5), tf.float32)
            top_5_accuracy = tf.metrics.mean(in_top_5)

            return {
                'top_1_accuracy': top_1_accuracy,
                'top_5_accuracy': top_5_accuracy,
            }

        eval_metrics = (metric_fn, [labels, logits])

    num_params = np.sum([np.prod(v.shape) for v in tf.trainable_variables()])
    tf.logging.info('number of trainable parameters: {}'.format(num_params))

    # Prepares scaffold_fn if needed.
    scaffold_fn = None
    if is_training and FLAGS.init_checkpoint:
        variables_to_restore = get_pretrained_variables_to_restore(
            FLAGS.init_checkpoint, has_moving_average_decay)
        tf.logging.info('Initializing from pretrained checkpoint: %s',
                        FLAGS.init_checkpoint)
        if FLAGS.use_tpu:
            # On TPU the checkpoint initialisation is deferred into the
            # scaffold function instead of being run inline.

            def init_scaffold():
                tf.train.init_from_checkpoint(FLAGS.init_checkpoint,
                                              variables_to_restore)
                return tf.train.Scaffold()

            scaffold_fn = init_scaffold
        else:
            tf.train.init_from_checkpoint(FLAGS.init_checkpoint,
                                          variables_to_restore)

    restore_vars_dict = None
    if not is_training and has_moving_average_decay:
        # Load moving average variables for eval.
        restore_vars_dict = ema.variables_to_restore(ema_vars)

        def eval_scaffold():
            saver = tf.train.Saver(restore_vars_dict)
            return tf.train.Scaffold(saver=saver)

        scaffold_fn = eval_scaffold

    return tf.contrib.tpu.TPUEstimatorSpec(mode=mode,
                                           loss=loss,
                                           train_op=train_op,
                                           host_call=host_call,
                                           eval_metrics=eval_metrics,
                                           scaffold_fn=scaffold_fn)
dropout=0.2, tok2id=tok2id) if CUDA: model = model.cuda() model_parameters = filter(lambda p: p.requires_grad, model.parameters()) params = sum([np.prod(p.size()) for p in model_parameters]) print('NUM PARAMS: ', params) # # # # # # # # ## # # # ## # # OPTIMIZER, LOSS # # # # # # # # ## # # # ## # # num_train_steps = (num_train_examples * 40) if ARGS.pretrain_data: num_train_steps += (num_pretrain_examples * ARGS.pretrain_epochs) optimizer = utils.build_optimizer(model, num_train_steps) loss_fn, cross_entropy_loss = utils.build_loss_fn(vocab_size=len(tok2id)) writer = SummaryWriter(ARGS.working_dir) # # # # # # # # # # # PRETRAINING (optional) # # # # # # # # # # # # # # # # if ARGS.pretrain_data: print('PRETRAINING...') for epoch in range(ARGS.pretrain_epochs): model.train() losses = utils.train_for_epoch( model, pretrain_dataloader, tok2id, optimizer,
def _binarize(values, positive_label):
    """Return a list with 1 where an entry equals ``positive_label``, else 0."""
    return [1 if v == positive_label else 0 for v in values]


def train(args, features, weights, edges, num_features):
    """Train and evaluate the GNN with 5-fold stratified cross-validation.

    args - args from command line
    features - a filepath with each line being a space-delimited string of
        [node ID, [features], label name]
    weights - array of len(classes) indicating weight of each class when
        computing loss. Higher weight should be assigned to less common
        classes. 0 means to ignore a class.
    edges - a filepath of the (directed) edges file (each line being "n1 n2"
        representing n1 -> n2)
    num_features - number of computed features.

    For every fold up to 1000 mini-batches are run; every 50 batches the fold's
    held-out nodes are scored and training stops early once the F1 (and, for
    non-"hate" datasets, recall) thresholds are reached.  Accuracy, F1 and AUC
    of each fold are collected and their mean +- std printed at the end.
    """
    # For reproducibility
    torch.manual_seed(1)
    np.random.seed(1)
    random.seed(1)

    # Load the data
    x, y, feat_data, labels, adj_list = load_dataset(args, features, edges,
                                                     num_features)
    print("Loaded dataset")

    # Define embeddings for each node to be used in aggregation (FEATURES DON'T CHANGE)
    features = nn.Embedding(NUM_NODES, num_features)
    features.weight = nn.Parameter(torch.FloatTensor(feat_data),
                                   requires_grad=False)

    # build model
    # NOTE(review): the model is built once and shared by all folds (only the
    # optimizer is rebuilt per fold), so later folds start from weights that
    # were already trained on earlier folds -- confirm this is intended.
    model = models.createGNN(args, features, adj_list, num_features, weights)

    # Label 2 is "hate" for the hate dataset, otherwise label 1 is
    # "suspended".  The same index also selects the score column below.
    is_hate = args.dataset == "hate"
    positive_label = 2 if is_hate else 1

    # Train loop
    print("Starting training")
    f1_test = []
    accuracy_test = []
    auc_test = []
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=123)
    for train_index, test_index in skf.split(x, y):
        # Named *_nodes so the locals do not shadow this function or the
        # module-level ``test``.
        train_nodes, test_nodes = x[train_index], x[test_index]
        # TODO should this be outside the loop?
        _, opt = utils.build_optimizer(args, model.parameters())

        for batch in range(1000):
            model.train()
            batch_nodes = train_nodes[:args.batch_size]
            # Prepare train set for next batch.
            train_nodes = np.roll(train_nodes, args.batch_size)
            opt.zero_grad()
            loss = model.loss(
                batch_nodes,
                torch.LongTensor(labels[np.array(batch_nodes)]))
            loss.backward()
            opt.step()

            if batch % 50 == 0:
                model.eval()
                val_output = model(test_nodes)
                labels_pred_validation = val_output.data.numpy().argmax(axis=1)
                labels_true_validation = labels[test_nodes].flatten()
                y_true = _binarize(labels_true_validation, positive_label)
                y_pred = _binarize(labels_pred_validation, positive_label)
                fscore = f1_score(y_true,
                                  y_pred,
                                  labels=None,
                                  pos_label=1,
                                  average='binary',
                                  sample_weight=None)
                recall = recall_score(y_true,
                                      y_pred,
                                      labels=None,
                                      pos_label=1,
                                      average='binary',
                                      sample_weight=None)
                print(confusion_matrix(y_true, y_pred))
                print('F1: {}, Recall: {}'.format(fscore, recall))

                # If we ever reach really good scores...
                if is_hate:
                    if fscore > 0.70:
                        break
                elif fscore > 0.60 and recall > 0.8:
                    break

        # TODO decompose test code?
        # For each split, evaluate AUC, accuracy, and F1 on test split.
        model.eval()
        val_output = model(test_nodes)
        scores = val_output.data.numpy()
        # Prediction score is the difference between the positive-class score
        # (hateful / suspended) and the score in column 0.
        labels_pred_score = (scores[:, positive_label].flatten() -
                             scores[:, 0].flatten())

        labels_true_test = labels[test_nodes].flatten()
        y_true = _binarize(labels_true_test, positive_label)
        fpr, tpr, _ = roc_curve(y_true, labels_pred_score)

        # TODO why is it different inside the training loop?
        # Prediction is the larger of the two hateful/non-hateful or suspended/active scores.
        labels_pred_test = labels_pred_score > 0
        y_pred = [1 if v else 0 for v in labels_pred_test]

        auc_test.append(auc(fpr, tpr))
        accuracy_test.append(accuracy_score(y_true, y_pred))
        f1_test.append(f1_score(y_true, y_pred))

    # Print out final accuracy, F1, AUC results.
    accuracy_test = np.array(accuracy_test)
    f1_test = np.array(f1_test)
    auc_test = np.array(auc_test)

    print("Accuracy   %0.4f +-  %0.4f" %
          (accuracy_test.mean(), accuracy_test.std()))
    print("F1    %0.4f +-  %0.4f" % (f1_test.mean(), f1_test.std()))
    print("AUC    %0.4f +-  %0.4f" % (auc_test.mean(), auc_test.std()))
def train(dataset, task, args):
    """Train a ``GNNStack`` and plot its per-epoch test accuracy.

    For ``task == 'graph'`` the dataset is shuffled and split 80/20 into a
    training and a held-out loader; for ``task == 'node'`` one loader over the
    whole dataset is used and ``batch.train_mask`` selects the training nodes.
    After every epoch the training loss, training accuracy and the accuracy
    returned by ``test`` on the test loader are printed, and the accuracy curve
    is finally saved to ``<dataset.name>_<args.model_type>.png``.

    Args:
        dataset: dataset exposing ``shuffle()``, slicing, ``name``,
            ``num_node_features`` and ``num_classes``.
        task: ``'graph'`` or ``'node'``.
        args: namespace providing ``batch_size``, ``hidden_dim``, ``epochs``
            and ``model_type`` (plus whatever ``models.GNNStack`` and
            ``utils.build_optimizer`` read from it).

    Raises:
        RuntimeError: if ``task`` is neither ``'graph'`` nor ``'node'``.
    """
    test_epoch, test_acc_per_epoch = [], []
    if task == 'graph':
        # graph classification: separate dataloader for test set
        data_size = len(dataset)
        # shuffle() returns a shuffled copy rather than shuffling in place,
        # so the result has to be kept for the split to be random.
        dataset = dataset.shuffle()
        split = int(data_size * 0.8)
        loader = DataLoader(dataset[:split],
                            batch_size=args.batch_size,
                            shuffle=True)
        test_loader = DataLoader(dataset[split:],
                                 batch_size=args.batch_size,
                                 shuffle=True)
    elif task == 'node':
        # use mask to split train/validation/test
        test_loader = loader = DataLoader(dataset,
                                          batch_size=args.batch_size,
                                          shuffle=True)
    else:
        raise RuntimeError('Unknown task')

    # build model
    model = models.GNNStack(dataset.num_node_features,
                            args.hidden_dim,
                            dataset.num_classes,
                            args,
                            task=task)
    scheduler, opt = utils.build_optimizer(args, model.parameters())

    # train
    for epoch in range(args.epochs):
        total_loss = 0
        total_correct = 0
        total_preds = 0
        model.train()
        for batch in loader:
            opt.zero_grad()
            pred = model(batch)
            label = batch.y
            if task == 'node':
                pred = pred[batch.train_mask]
                label = label[batch.train_mask]
            loss = model.loss(pred, label)
            loss.backward()
            opt.step()
            total_loss += loss.item() * batch.num_graphs
            total_correct += pred.max(dim=1)[1].eq(label).float().sum().item()
            # Count the predictions actually scored: graphs for the graph
            # task, train-masked nodes for the node task.
            total_preds += label.numel()
        total_loss /= len(loader.dataset)
        total_acc = total_correct / max(total_preds, 1)

        # Score on the test loader (the held-out split for the graph task, the
        # same masked loader for the node task) instead of the training data.
        test_acc = test(test_loader, model)
        print(
            f'epoch {epoch}: train loss - {total_loss:.4f}, train acc - {total_acc:.2%}, test acc - {test_acc:.2%}'
        )

        test_epoch.append(epoch)
        test_acc_per_epoch.append(test_acc)

    f, ax = plt.subplots(1, 1)
    ax.plot(np.array(test_epoch), np.array(test_acc_per_epoch))
    ax.set_title(f'{dataset.name} - {args.model_type}')
    ax.set_xlabel('epochs')
    ax.set_ylabel('accuracy')
    f.savefig(f'{dataset.name}_{args.model_type}.png',
              bbox_inches='tight',
              dpi=400)
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(f)
def train(options):
    """Train a ``Caption2Image`` model guided by a pretrained captioner.

    Loads the optional lexical dictionary, makes sure ``options.model_path``
    exists, builds the text processor, the pretrained image-captioning model,
    the ``Caption2Image`` model with its optimizer and trainer, creates the
    train/dev image loaders and then runs ``trainer.train_epoch`` repeatedly
    until ``options.step`` steps have been reached.

    Args:
        options: parsed options object; the attributes read are visible in the
            body below (paths, model sizes, optimisation and decoding
            settings, ``step``).
    """
    lex_dict = (get_lex_dict(options.dict_path)
                if options.dict_path is not None else None)

    if not os.path.exists(options.model_path):
        os.makedirs(options.model_path)

    text_processor = TextProcessor(options.tokenizer_path)
    # The rest of the pipeline relies on the padding token having id 0.
    assert text_processor.pad_token_id() == 0

    captioner = Seq2Seq.load(ImageCaptioning,
                             options.pretrained_path,
                             tok_dir=options.tokenizer_path)
    caption2image = Caption2Image(
        text_processor=text_processor,
        enc_layer=options.encoder_layer,
        embed_dim=options.embed_dim,
        intermediate_dim=options.intermediate_layer_dim)

    print("Model initialization done!")

    # The collator is assumed to return a list with one entry per GPU
    # (at least one entry when no GPU is available).
    collate = dataset.ImageTextCollator()
    device_batches = max(1, torch.cuda.device_count())

    optim = build_optimizer(caption2image,
                            options.learning_rate,
                            warump_steps=options.warmup)
    trainer_kwargs = dict(
        model=caption2image,
        caption_model=captioner,
        mask_prob=options.mask_prob,
        optimizer=optim,
        clip=options.clip,
        beam_width=options.beam_width,
        max_len_a=options.max_len_a,
        max_len_b=options.max_len_b,
        len_penalty_ratio=options.len_penalty_ratio,
        fp16=options.fp16,
        mm_mode=options.mm_mode,
    )
    trainer = Caption2ImageTrainer(**trainer_kwargs)

    use_pinned_memory = torch.cuda.is_available()

    def make_loader(path, **extra):
        # Both loaders share every argument except the data path and the
        # optional keyword overrides.
        return ImageMTTrainer.get_img_loader(
            collate,
            dataset.ImageCaptionDatasetwNegSamples,
            path,
            caption2image,
            device_batches,
            options,
            use_pinned_memory,
            lex_dict=lex_dict,
            **extra)

    train_loader = make_loader(options.train_path)
    dev_loader = make_loader(options.dev_path, shuffle=False, denom=2)

    step = 0
    epoch_no = 1
    while options.step > 0 and step < options.step:
        print("train epoch", epoch_no)
        step = trainer.train_epoch(img_data_iter=train_loader,
                                   img_dev_data_iter=dev_loader,
                                   max_step=options.step,
                                   lex_dict=lex_dict,
                                   saving_path=options.model_path,
                                   step=step)
        epoch_no += 1
def model_fn(features, labels, mode, params):
    """The model_fn to be used with TPUEstimator.

    Args:
      features: A dict of `Tensor` of batched images and other features, or
        the image `Tensor` itself.
      labels: a Tensor or a dict of Tensor representing the batched labels.
      mode: one of `tf.estimator.ModeKeys.{TRAIN,EVAL,PREDICT}`
      params: `dict` of parameters passed to the model from the TPUEstimator,
        `params['batch_size']` is always provided and should be used as the
        effective batch size. This function also reads `params['config']`,
        `params['image_size']` and `params['steps_per_epoch']`.

    Returns:
      A `TPUEstimatorSpec` for the model
    """
    logging.info('params=%s', params)
    images = features['image'] if isinstance(features, dict) else features
    labels = labels['label'] if isinstance(labels, dict) else labels
    config = params['config']
    image_size = params['image_size']
    utils.scalar('model/resolution', image_size)

    if config.model.data_format == 'channels_first':
        images = tf.transpose(images, [0, 3, 1, 2])

    is_training = (mode == tf.estimator.ModeKeys.TRAIN)
    has_moving_average_decay = (config.train.ema_decay > 0)

    if FLAGS.use_tpu and not config.model.bn_type:
        config.model.bn_type = 'tpu_bn'

    # This is essential, if using a keras-derived model.
    tf.keras.backend.set_learning_phase(is_training)

    def build_model(in_images):
        """Build model using the model_name given through the command line."""
        config.model.num_classes = config.data.num_classes
        model = effnetv2_model.EffNetV2Model(config.model.model_name,
                                             config.model)
        logits = model(in_images, training=is_training)[0]
        return logits

    # Snapshot the counters before building so that only the backbone's own
    # params/flops are reported below.
    pre_num_params, pre_num_flops = utils.num_params_flops(
        readable_format=True)

    if config.runtime.mixed_precision:
        precision = 'mixed_bfloat16' if FLAGS.use_tpu else 'mixed_float16'
        logits = utils.build_model_with_precision(precision, build_model,
                                                  images, is_training)
        logits = tf.cast(logits, tf.float32)
    else:
        logits = build_model(images)

    num_params, num_flops = utils.num_params_flops(readable_format=True)
    num_params = num_params - pre_num_params
    num_flops = (num_flops - pre_num_flops) / params['batch_size']
    logging.info('backbone params/flops = %.4f M / %.4f B', num_params,
                 num_flops)
    utils.scalar('model/params', num_params)
    utils.scalar('model/flops', num_flops)

    # Calculate loss, which includes softmax cross entropy and L2 regularization.
    if config.train.loss_type == 'sigmoid':
        cross_entropy = tf.losses.sigmoid_cross_entropy(
            multi_class_labels=tf.cast(labels, dtype=logits.dtype),
            logits=logits,
            label_smoothing=config.train.label_smoothing)
    elif config.train.loss_type == 'custom':
        xent = tf.nn.sigmoid_cross_entropy_with_logits(
            labels=tf.cast(labels, dtype=logits.dtype), logits=logits)
        cross_entropy = tf.reduce_mean(tf.reduce_sum(xent, axis=-1))
    else:
        if config.data.multiclass:
            logging.info('use multi-class loss: %s', config.data.multiclass)
            # Normalize each row so the labels sum to one.
            labels /= tf.reshape(tf.reduce_sum(labels, axis=1), (-1, 1))
        cross_entropy = tf.losses.softmax_cross_entropy(
            onehot_labels=labels,
            logits=logits,
            label_smoothing=config.train.label_smoothing)

    train_steps = max(config.train.min_steps,
                      config.train.epochs * params['steps_per_epoch'])
    global_step = tf.train.get_global_step()
    # Weight decay grows linearly with training progress when
    # weight_decay_inc is non-zero.
    weight_decay_inc = config.train.weight_decay_inc * (
        tf.cast(global_step, tf.float32) / tf.cast(train_steps, tf.float32))
    weight_decay = (1 + weight_decay_inc) * config.train.weight_decay
    utils.scalar('train/weight_decay', weight_decay)
    # Add weight decay to the loss for non-batch-normalization variables.
    matcher = re.compile(config.train.weight_decay_exclude)
    l2loss = weight_decay * tf.add_n([
        tf.nn.l2_loss(v)
        for v in tf.trainable_variables()
        if not matcher.match(v.name)
    ])
    loss = cross_entropy + l2loss
    utils.scalar('loss/l2reg', l2loss)
    utils.scalar('loss/xent', cross_entropy)

    if has_moving_average_decay:
        ema = tf.train.ExponentialMovingAverage(decay=config.train.ema_decay,
                                                num_updates=global_step)
        ema_vars = utils.get_ema_vars()

    host_call = None
    restore_vars_dict = None
    if is_training:
        # Compute the current epoch and associated learning rate from global_step.
        current_epoch = (tf.cast(global_step, tf.float32) /
                         params['steps_per_epoch'])
        utils.scalar('train/epoch', current_epoch)

        scaled_lr = config.train.lr_base * (config.train.batch_size / 256.0)
        scaled_lr_min = config.train.lr_min * (config.train.batch_size /
                                               256.0)
        learning_rate = utils.WarmupLearningRateSchedule(
            scaled_lr,
            steps_per_epoch=params['steps_per_epoch'],
            decay_epochs=config.train.lr_decay_epoch,
            warmup_epochs=config.train.lr_warmup_epoch,
            decay_factor=config.train.lr_decay_factor,
            lr_decay_type=config.train.lr_sched,
            total_steps=train_steps,
            minimal_lr=scaled_lr_min)(global_step)
        utils.scalar('train/lr', learning_rate)
        optimizer = utils.build_optimizer(
            learning_rate, optimizer_name=config.train.optimizer)
        if FLAGS.use_tpu:
            # When using TPU, wrap the optimizer with CrossShardOptimizer which
            # handles synchronization details between different TPU cores. To the
            # user, this should look like regular synchronous training.
            optimizer = tf.tpu.CrossShardOptimizer(optimizer)

        # filter trainable variables if needed.
        var_list = tf.trainable_variables()
        if config.train.varsexp:
            vars2 = [
                v for v in var_list if re.match(config.train.varsexp, v.name)
            ]
            if len(vars2) == len(var_list):
                # The filter removed nothing; report the pattern that was
                # actually applied.
                logging.warning('%s has no match.', config.train.varsexp)
            logging.info('Filter variables: orig=%d, final=%d, delta=%d',
                         len(var_list), len(vars2),
                         len(var_list) - len(vars2))
            var_list = vars2

        # Batch norm requires update_ops to be added as a train_op dependency.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        if config.train.gclip:
            logging.info('clip gradients norm by %f', config.train.gclip)
            grads_and_vars = optimizer.compute_gradients(loss, var_list)
            with tf.name_scope('gclip'):
                grads = [gv[0] for gv in grads_and_vars]
                tvars = [gv[1] for gv in grads_and_vars]
                utils.scalar('train/gnorm', tf.linalg.global_norm(grads))
                # Skip variables without a gradient, mirroring the None
                # handling of the clipping step below.
                utils.scalar(
                    'train/gnormmax',
                    tf.math.reduce_max(
                        [tf.norm(g) for g in grads if g is not None]))
                # First clip each variable's norm, then clip global norm.
                clip_norm = abs(config.train.gclip)
                clipped_grads = [
                    tf.clip_by_norm(g, clip_norm) if g is not None else None
                    for g in grads
                ]
                clipped_grads, _ = tf.clip_by_global_norm(
                    clipped_grads, clip_norm)
                grads_and_vars = list(zip(clipped_grads, tvars))

            with tf.control_dependencies(update_ops):
                train_op = optimizer.apply_gradients(grads_and_vars,
                                                     global_step)
        else:
            with tf.control_dependencies(update_ops):
                train_op = optimizer.minimize(loss,
                                              global_step,
                                              var_list=var_list)

        if has_moving_average_decay:
            # Update the moving averages only after the optimizer step.
            with tf.control_dependencies([train_op]):
                train_op = ema.apply(ema_vars)

        if not config.runtime.skip_host_call:
            host_call = utils.get_tpu_host_call(
                global_step, FLAGS.model_dir,
                config.runtime.iterations_per_loop)
    else:
        train_op = None
        if has_moving_average_decay:
            # Load moving average variables for eval.
            restore_vars_dict = ema.variables_to_restore(ema_vars)

    eval_metrics = None
    if mode == tf.estimator.ModeKeys.EVAL:

        def metric_fn(labels, logits):
            """Evaluation metric function.

            Evaluates accuracy.

            This function is executed on the CPU and should not directly reference
            any Tensors in the rest of the `model_fn`. To pass Tensors from the
            model to the `metric_fn`, provide as part of the `eval_metrics`. See
            https://www.tensorflow.org/api_docs/python/tf/estimator/tpu/TPUEstimatorSpec
            for more information.

            Arguments should match the list of `Tensor` objects passed as the
            second element in the tuple passed to `eval_metrics`.

            Args:
              labels: `Tensor` with shape `[batch, num_classes]`.
              logits: `Tensor` with shape `[batch, num_classes]`.

            Returns:
              A dict of the metrics to return from evaluation.
            """
            metrics = {}
            if config.data.multiclass:
                metrics['eval/global_ap'] = tf.metrics.auc(
                    labels,
                    tf.nn.sigmoid(logits),
                    curve='PR',
                    num_thresholds=200,
                    summation_method='careful_interpolation',
                    name='global_ap')

                # Convert labels to set: be careful, tf.metrics.xx_at_k are horrible.
                labels = tf.cast(labels, dtype=tf.int64)
                label_to_repeat = tf.expand_dims(tf.argmax(labels, axis=-1),
                                                 axis=-1)
                all_labels_set = tf.range(0, labels.shape[-1], dtype=tf.int64)
                all_labels_set = tf.expand_dims(all_labels_set, axis=0)
                labels_set = labels * all_labels_set + (
                    1 - labels) * label_to_repeat

                metrics['eval/precision@1'] = tf.metrics.precision_at_k(
                    labels_set, logits, k=1)
                metrics['eval/recall@1'] = tf.metrics.recall_at_k(labels_set,
                                                                  logits,
                                                                  k=1)
                metrics['eval/precision@5'] = tf.metrics.precision_at_k(
                    labels_set, logits, k=5)
                metrics['eval/recall@5'] = tf.metrics.recall_at_k(labels_set,
                                                                  logits,
                                                                  k=5)

            # always add accuracy.
            labels = tf.argmax(labels, axis=1)
            predictions = tf.argmax(logits, axis=1)
            metrics['eval/acc_top1'] = tf.metrics.accuracy(labels, predictions)
            in_top_5 = tf.cast(tf.nn.in_top_k(logits, labels, 5), tf.float32)
            metrics['eval/acc_top5'] = tf.metrics.mean(in_top_5)
            metrics['model/resolution'] = tf.metrics.mean(image_size)
            metrics['model/flops'] = tf.metrics.mean(num_flops)
            metrics['model/params'] = tf.metrics.mean(num_params)
            return metrics

        eval_metrics = (metric_fn, [labels, logits])

    if has_moving_average_decay and not is_training:

        def scaffold_fn():  # read ema for eval jobs.
            saver = tf.train.Saver(restore_vars_dict)
            return tf.train.Scaffold(saver=saver)
    elif config.train.ft_init_ckpt and is_training:

        def scaffold_fn():
            logging.info('restore variables from %s',
                         config.train.ft_init_ckpt)
            var_map = utils.get_ckpt_var_map(
                ckpt_path=config.train.ft_init_ckpt,
                skip_mismatch=True,
                init_ema=config.train.ft_init_ema)
            tf.train.init_from_checkpoint(config.train.ft_init_ckpt, var_map)
            return tf.train.Scaffold()
    else:
        scaffold_fn = None

    return tf.estimator.tpu.TPUEstimatorSpec(mode=mode,
                                             loss=loss,
                                             train_op=train_op,
                                             host_call=host_call,
                                             eval_metrics=eval_metrics,
                                             scaffold_fn=scaffold_fn)
def ensmable_train(args,
                   logger,
                   fold_i,
                   train_dataloader,
                   val_dataloader,
                   test_dataloader,
                   fold_path,
                   scaler=None,
                   epoch_steps=None):
    """Train an ensemble of models for one fold and score the averaged test predictions.

    For each of ``args.ensemble_size`` models: build the model (or load it
    from ``args.checkpoint_paths``), train it for ``args.epochs`` epochs,
    checkpoint it whenever the mean validation score improves, reload the best
    checkpoint and evaluate it on the test data. The per-model test
    predictions are then summed, divided by ``args.ensemble_size`` and scored
    with every metric of the matching dataset type.

    NOTE(review): the nesting of a few statements was reconstructed from
    whitespace-collapsed source; the spots that could not be determined from
    syntax alone are marked below and should be confirmed.

    Args:
        args: Run configuration; many attributes are read, and
            ``args.train_data_size`` is overwritten on every epoch.
        logger: Object with ``debug``/``info`` methods, or ``None`` to fall
            back to ``print``.
        fold_i: Fold index, forwarded to the training/plotting helpers and
            used as the step of the final ensemble scalar.
        train_dataloader: Training data passed to ``train_batch``.
        val_dataloader: Validation data passed to ``evaluate_batch``.
        test_dataloader: Test data passed to ``evaluate_batch``.
        fold_path: Directory under which ``model_<idx>`` directories are
            created.
        scaler: Forwarded to checkpointing and evaluation.
        epoch_steps: Forwarded to ``build_lr_scheduler``.

    Returns:
        Tuple ``(ensemble_scores, all_metricsScore)``: the scores for
        ``args.metric`` and the full metric dictionary it was taken from.
    """
    if logger is not None:
        debug, info = logger.debug, logger.info
    else:
        debug = info = print

    if args.gpu is not None and args.gpuUSE:
        torch.cuda.set_device(args.gpu)
        print(f'USE GPU ID={args.gpu}')

    debug(pformat(vars(args)))

    loss_func = get_loss_func(args)
    metric_func = get_metric_func(metric=args.metric)

    # One stacked prediction array per model that produced test predictions.
    sum_predicts = []
    for model_idx in range(args.ensemble_size):
        save_dir = os.path.join(fold_path, f'model_{model_idx}')
        makedirs(save_dir)
        # NOTE(review): a writer is created per model and never closed; the
        # last one is reused after the loop for the ensemble scalar.
        writer = SummaryWriter(log_dir=save_dir)
        if args.checkpoint_paths is not None:
            debug(
                f'Loading model {model_idx} from {args.checkpoint_paths[model_idx]}'
            )
            model = load_checkpoint(args.checkpoint_paths[model_idx],
                                    current_args=args,
                                    logger=logger)
        else:
            debug(f'Building model {model_idx}')
            model = build_model(args)

        debug(model)
        debug(
            f'mdoel:{model_idx}>>>>Number of parameters = {param_count(model):,}'
        )
        if args.gpuUSE:
            debug('Moving model to cuda')
            model = model.cuda()
        else:
            print('noGPU use')

        # NOTE(review): assumed to run for every model; it may originally have
        # been nested in the `else` branch above.
        for name, param in model.named_parameters():
            if param.requires_grad:
                print(name, param.data,
                      f'param.data is GPU{param.data.is_cuda}')

        # Save the untrained model first.
        save_checkpoint(os.path.join(save_dir, f'model{model_idx}.pt'), model,
                        scaler, args)

        optimizer = build_optimizer(model, args)
        scheduler = build_lr_scheduler(optimizer, args, epoch_steps)

        print(
            f'args.minimize_score={args.minimize_score},args.metric={args.metric}'
        )
        best_score = float('inf') if args.minimize_score else -float('inf')
        best_epoch, n_iter = 0, 0
        # Per-epoch histories; they are only appended to within this function.
        hold_loss, hold_avgVal = [], []
        for epoch in trange(1, args.epochs + 1):
            steps_eachEpoch, args.train_data_size, lastAvageloss, epoch_loss = train_batch(
                args,
                fold_i,
                model,
                train_dataloader,
                loss_func=loss_func,
                optimizer=optimizer,
                scheduler=scheduler,
                logger=logger,
                writer=writer)
            hold_loss.append(epoch_loss)
            # Only ExponentialLR is stepped here, once per epoch.
            if isinstance(scheduler, ExponentialLR):
                scheduler.step()
            _, _, val_scores = evaluate_batch(args,
                                              model=model,
                                              data=val_dataloader,
                                              num_tasks=args.num_tasks,
                                              metric_func=metric_func,
                                              dataset_type=args.dataset_type,
                                              scaler=scaler,
                                              Foldth=args.Foldth,
                                              predsLog=None,
                                              logger=logger)

            # Average validation score over tasks, ignoring NaNs.
            avg_val_score = np.nanmean(val_scores)
            print(f'val_scores___{val_scores}')
            hold_avgVal.append(avg_val_score)
            debug(f'Validation {args.metric} = {avg_val_score:.6f}')
            writer.add_scalar(f'validation_{args.metric}_epoch', avg_val_score,
                              epoch)
            writer.add_scalar(f'train_loss_epoch', lastAvageloss, epoch)

            if args.show_individual_scores:
                for task_name, val_score in zip(args.task_names, val_scores):
                    debug(
                        f'Validation {task_name} {args.metric} = {val_score:.6f}'
                    )
                    writer.add_scalar(
                        f'validation_{task_name}_{args.metric}_epoch',
                        val_score, epoch)

            # Checkpoint whenever the validation score improves in the
            # direction selected by args.minimize_score.
            if args.minimize_score and avg_val_score < best_score or \
                    not args.minimize_score and avg_val_score > best_score:
                print(
                    f'debug args.minimize_score:{args.minimize_score} and {avg_val_score} < {best_score} '
                )
                best_score, best_epoch = avg_val_score, epoch
                save_checkpoint(
                    os.path.join(save_dir, f'{args.data_filename}_model.pt'),
                    model, scaler, args)
                info(
                    f'Model {model_idx} the parametrs updated in model.pt as best validation {args.metric} = {best_score:.6f} on epoch {best_epoch}'
                )

        # NOTE(review): assumed to run once per model after the epoch loop;
        # confirm it was not inside the loop.
        train_valdation_curve(args,
                              fold_i,
                              model_idx,
                              cur_name='train-valdation curve')
        info(
            f'Model {model_idx} best validation {args.metric} = {best_score:.6f} on epoch {best_epoch}'
        )
        print(f'load model with args.cuda={args.cuda}')
        # Evaluate the best checkpoint rather than the last-epoch weights.
        model = load_checkpoint(os.path.join(save_dir,
                                             f'{args.data_filename}_model.pt'),
                                cuda=args.cuda,
                                logger=logger)

        test_targets, test_preds, test_scores = evaluate_batch(
            args,
            model=model,
            data=test_dataloader,
            num_tasks=args.num_tasks,
            metric_func=metric_func,
            dataset_type=args.dataset_type,
            scaler=scaler,
            logger=logger,
            Foldth=args.Foldth,
            predsLog=args.save_dir)

        if len(test_preds) != 0:
            sum_predicts.append(np.stack(test_preds, axis=0))
        # NOTE(review): this print may originally have been inside the `if`.
        print(f'sum_predicts ={sum_predicts}')

        avg_test_score = np.nanmean(test_scores)
        info(
            f'Model {model_idx} test >>> {args.metric} = {avg_test_score:.6f}'
        )
        writer.add_scalar(f'test_{args.metric}_modelID', avg_test_score,
                          model_idx)

        if args.show_individual_scores:
            for task_name, test_score in zip(args.task_names, test_scores):
                info(
                    f'Model {model_idx} test {task_name} {args.metric} = {test_score:.6f}'
                )
                writer.add_scalar(f'test_{task_name}_{args.metric}_ModelID',
                                  test_score, model_idx)

    # Average the per-model predictions. NOTE(review): this raises IndexError
    # when no model produced predictions, and it divides by
    # args.ensemble_size even if some models were skipped above.
    sum_predict = np.zeros(sum_predicts[0].shape)
    print(len(sum_predicts))
    for mode_pred in sum_predicts:
        sum_predict = sum_predict + mode_pred
    avg_test_preds = (sum_predict / args.ensemble_size).tolist()

    # NOTE(review): for any dataset type other than the two handled below,
    # `ensemble_scores` is never assigned and the code after them fails.
    if args.dataset_type == 'classification':
        all_classificationScores = evaluate_predictionsWithAllmetric(
            preds=avg_test_preds,
            targets=test_targets,
            num_tasks=args.num_tasks,
            metric_func={
                'auc': roc_auc_score,
                'acc': acc_score,
                'precision': prec_score,
                'recall': rec_score,
                'prec_auc': prec_rec_auc
            },
            dataset_type=args.dataset_type,
            logger=logger)
        print(f'use the metric {args.metric} for Hyperparameter Optimization')
        ensemble_scores = all_classificationScores[args.metric]
        all_metricsScore = all_classificationScores

    if args.dataset_type == 'regression':
        all_regressionScores = evaluate_predictionsWithAllmetric(
            preds=avg_test_preds,
            targets=test_targets,
            num_tasks=args.num_tasks,
            metric_func={
                'rmse': rmse,
                'mse': mean_squared_error,
                'mae': mean_absolute_error,
                'r2': r2_score,
                'PC': Pearson_cor
            },
            dataset_type=args.dataset_type,
            logger=logger)
        print(f'use the metric {args.metric} for Hyperparameter Optimization')
        ensemble_scores = all_regressionScores[args.metric]
        all_metricsScore = all_regressionScores

    avg_ensemble_test_score = np.nanmean(ensemble_scores)
    print(
        f'ensemble_scores={ensemble_scores} and test {args.metric} = {avg_ensemble_test_score:.6f}'
    )
    writer.add_scalar(f'ensemble_test_{args.metric}_fold',
                      avg_ensemble_test_score, fold_i)

    if args.show_individual_scores:
        for task_name, ensemble_score in zip(args.task_names, ensemble_scores):
            info(
                f'Ensemble test {task_name} {args.metric} = {ensemble_score:.6f}'
            )

    return ensemble_scores, all_metricsScore
def train_loop(
    run_id,
    dataset_dir,
    ckpt_run_dir,
    output_dir,
    validation_only=False,
    use_cuda=False,
    light_target=False,
):
    """Train a GNMT model on WMT16 (en-de), or evaluate saved checkpoints.

    Args:
        run_id: Identifier passed to the ``Tracker``.
        dataset_dir: Directory of the WMT16 data (the train split is created
            with ``download=True``).
        ckpt_run_dir: Directory used by the ``Checkpointer`` and, in
            validation-only mode, as the checkpoint source.
        output_dir: Directory receiving ``train_stats.json`` and
            ``val_stats.json`` in validation-only mode.
        validation_only: When true, skip training and evaluate the
            checkpoints instead.
        use_cuda: Move model, criterion and batches to the GPU.
        light_target: Use the BLEU goal built with 20 instead of 24.

    Returns:
        None. Training returns early once the tracker reports the goal as
        reached.
    """
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    rank = dist.get_rank()
    world_size = dist.get_world_size()

    train_epochs = 8
    train_min_len, train_max_len = 0, 75
    val_min_len, val_max_len = 0, 150
    math_mode = "fp16"  # One of `fp16`, `fp32`
    lang = ("en", "de")

    # Training
    train_global_batch_size = 2048  # Global batch size
    max_bs = 128  # Max batch size for used hardware
    # Number of batches whose gradients are accumulated before one optimizer
    # step, so that the global batch size is reached.
    update_freq = int(max(1, train_global_batch_size // (max_bs * world_size)))
    train_batch_size = int(train_global_batch_size // (world_size * update_freq))
    val_batch_size = 64

    # Model attributes
    model_args = {
        "hidden_size": 1024,
        "num_layers": 4,
        "dropout": 0.2,
        "share_embedding": True,
        "fusion": True,
    }

    # Criterion
    criterion_args = {"smoothing": 0.1, "fast_xentropy": True}

    # Loss scaling
    loss_scaling = {"init_scale": 1024, "upscale_interval": 128}

    # Optimizer
    optimizer_args = {
        "lr": 2e-3,
        "grad_clip": 5.0,
    }

    # Scheduler
    scheduler_args = {
        "warmup_steps": 200,
        "remain_steps": 0.4,
        "decay_interval": 0.05,
        "decay_steps": 4,
        "decay_factor": 0.5,
    }

    # Translator
    translator_args = {
        "beam_size": 5,
        "len_norm_factor": 0.6,
        "cov_penalty_factor": 0.1,
        "len_norm_const": 5.0,
        "max_seq_len": 150,
    }

    # Build train/val datasets
    train_set = WMT16Dataset(
        dataset_dir,
        math_precision=math_mode,
        lang=lang,
        train=True,
        download=True,
        preprocessed=True,
        min_len=train_min_len,
        max_len=train_max_len,
    )
    train_set.prepare()
    val_set = WMT16Dataset(
        dataset_dir,
        math_precision=math_mode,
        lang=lang,
        validation=True,
        download=False,
        min_len=val_min_len,
        max_len=val_max_len,
        sort=True,
    )
    tokenizer = train_set.tokenizer

    # Build model
    model = GNMT(vocab_size=train_set.vocab_size, **model_args)

    # Build loss function
    criterion = LabelSmoothing(padding_idx=wmt16_config.PAD, **criterion_args)

    # Bilingual Evaluation Understudy Score
    metrics = [BLEUScore()]

    # Partition data
    train_set = partition_dataset_by_rank(train_set, rank, world_size)
    val_set = partition_dataset_by_rank(val_set, rank, world_size)

    collate_fn = build_collate_fn(sort=True)
    train_loader = DataLoader(
        train_set,
        batch_size=train_batch_size,
        collate_fn=collate_fn,
        num_workers=2,
        pin_memory=True,
        drop_last=False,
        shuffle=True,
    )
    val_loader = DataLoader(
        val_set,
        batch_size=val_batch_size,
        collate_fn=collate_fn,
        num_workers=2,
        pin_memory=True,
        drop_last=False,
    )

    # Validate every 30% of an epoch, aligned to optimizer-step boundaries.
    # The max() keeps the interval positive for very short loaders, where the
    # rounding would otherwise give 0 and break the modulo below.
    validate_every = update_freq * max(
        1, round(len(train_loader) * 0.30 / update_freq)
    )

    # Build optimizer & scheduler
    total_train_iters = (len(train_loader) // update_freq) * train_epochs

    print("Number of batches per epoch {}".format(len(train_loader)))
    print("Train iterations per epoch {}".format(total_train_iters / train_epochs))

    if use_cuda:
        model = model.cuda()
        criterion = criterion.cuda()

    use_horovod = math_mode == "fp16" and dist.get_backend() == dist.Backend.MPI
    if use_horovod:
        hvd.init()
        logger.info("Using horovod rank={}".format(hvd.rank()))
        # Sanity check: summing a one from every worker must give world_size.
        tensor = torch.tensor([1])
        res = hvd.allreduce(tensor, op=hvd.Sum)
        assert res[0] == world_size

    fp_optimizer, optimizer, model = build_optimizer(
        model=model,
        math=math_mode,
        loss_scaling=loss_scaling,
        use_cuda=use_cuda,
        use_horovod=use_horovod,
        **optimizer_args
    )

    # Create a learning rate scheduler for an optimizer
    scheduler = ExponentialWarmupMultiStepLR(
        optimizer, total_train_iters, **scheduler_args
    )

    # Translator
    translator = Translator(model=model, trg_tokenizer=tokenizer, **translator_args)

    checkpointer = Checkpointer(
        ckpt_run_dir=ckpt_run_dir, rank=rank, freq=CheckpointFreq.BEST
    )

    if not validation_only:
        if light_target:
            goal = task4_time_to_bleu_goal(20)
        else:
            goal = task4_time_to_bleu_goal(24)

        num_batches_per_device_train = len(train_loader)
        tracker = Tracker(metrics, run_id, rank, goal=goal)

        dist.barrier()
        tracker.start()

        for epoch in range(0, train_epochs):
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

            model.train()
            tracker.train()

            for batch_idx, (data, target) in enumerate(train_loader):
                tracker.batch_start()
                data, target = prepare_batch(data, target, use_cuda=use_cuda)
                tracker.record_batch_load()

                # batch_idx is zero-based, so the final batch has index
                # len - 1; this flushes gradients accumulated in a trailing
                # partial update window.
                is_last = batch_idx == len(train_loader) - 1
                update = (batch_idx % update_freq) == update_freq - 1
                init = (batch_idx % update_freq) == 0

                # Clear gradients in the optimizer.
                if init:
                    fp_optimizer.zero_grad()
                    tracker.record_batch_init()

                # Compute the output
                output = compute_model_output(model, data, target)
                tracker.record_batch_fwd_pass()

                # Compute the loss
                loss, loss_per_token = compute_loss(
                    data, target, output, criterion, update_freq
                )
                tracker.record_batch_comp_loss()

                # Backprop
                fp_optimizer.backward_loss(loss)
                tracker.record_batch_backprop()

                # Opt step
                if update or is_last:
                    # For this task, simply sum all gradients
                    updated = fp_optimizer.step(tracker=tracker, denom=1)

                    # Learning rate scheduler
                    if updated:
                        scheduler.step()

                tracker.batch_end()

                record_train_batch_stats(
                    batch_idx=batch_idx,
                    loss=loss_per_token,
                    output=target[0],  # Use target just for the size
                    metric_results={},
                    tracker=tracker,
                    num_batches_per_device_train=num_batches_per_device_train,
                )

                # Validation during training
                if (batch_idx + 1) % validate_every == 0:
                    if torch.cuda.is_available():
                        torch.cuda.empty_cache()

                    metrics_values, loss = validation_round(
                        val_loader,
                        metrics,
                        model,
                        criterion,
                        update_freq,
                        translator,
                        tracker=tracker,
                        use_cuda=use_cuda,
                    )
                    record_validation_stats(metrics_values, loss, tracker, rank)
                    if tracker.goal_reached:
                        break

                    # Switch back to training mode after validating.
                    model.train()
                    tracker.train()

            if torch.cuda.is_available():
                torch.cuda.empty_cache()

            # End-of-epoch validation decides whether this checkpoint is the
            # best so far.
            metrics_values, loss = validation_round(
                val_loader,
                metrics,
                model,
                criterion,
                update_freq,
                translator,
                use_cuda=use_cuda,
            )
            is_best = record_validation_stats(metrics_values, loss, tracker, rank)

            checkpointer.save(
                tracker,
                model,
                fp_optimizer.optimizer,
                scheduler,
                tracker.current_epoch,
                is_best,
            )

            tracker.epoch_end()

            if tracker.goal_reached:
                print("Goal Reached!")
                dist.barrier()
                time.sleep(10)
                return
    else:
        cecf = CheckpointsEvaluationControlFlow(
            ckpt_dir=ckpt_run_dir,
            rank=rank,
            world_size=world_size,
            checkpointer=checkpointer,
            model=model,
            epochs=train_epochs,
            loss_function=criterion,
            metrics=metrics,
            use_cuda=use_cuda,
            dtype="fp32",
            max_batch_per_epoch=None,
        )

        train_stats = cecf.evaluate_by_epochs(train_loader)
        with open(os.path.join(output_dir, "train_stats.json"), "w") as f:
            json.dump(train_stats, f)

        val_stats = cecf.evaluate_by_epochs(val_loader)
        with open(os.path.join(output_dir, "val_stats.json"), "w") as f:
            json.dump(val_stats, f)
def model_fn(features, labels, mode, params=None):
    """Build the model graph and return an `EstimatorSpec` for `mode`.

    Hyperparameters are read through `context.get_hparam`; `params` is
    accepted for signature compatibility and is not used.

    Args:
      features: `Tensor` of batched images, or a dict holding it under the
        key "feature".
      labels: `Tensor` of one hot labels for the data samples
      mode: one of `tf.estimator.ModeKeys.{TRAIN,EVAL,PREDICT}`
      params: unused.

    Returns:
      A `tf.estimator.EstimatorSpec` with the loss, the train op (TRAIN
      only) and the eval metric ops (EVAL only).
    """
    if isinstance(features, dict):
        features = features["feature"]

    # In most cases, the default data format NCHW instead of NHWC should be
    # used for a significant performance boost on GPU. NHWC should be used
    # only if the network needs to be run on CPU since the pooling operations
    # are only supported on NHWC. TPU uses XLA compiler to figure out best layout.
    if context.get_hparam("data_format") == "channels_first":
        assert not context.get_hparam("transpose_input")  # channels_first only for GPU
        features = tf.transpose(features, [0, 3, 1, 2])
        stats_shape = [3, 1, 1]
    else:
        stats_shape = [1, 1, 3]

    is_training = mode == tf.estimator.ModeKeys.TRAIN
    has_moving_average_decay = context.get_hparam("moving_average_decay") > 0

    # This is essential, if using a keras-derived model.
    tf.keras.backend.set_learning_phase(is_training)
    logging.info("Using open-source implementation.")

    # No hyperparameter overrides are forwarded to the model builder.
    override_params = {}

    def normalize_features(features, mean_rgb, stddev_rgb):
        """Normalize the image given the means and stddevs."""
        features -= tf.constant(mean_rgb, shape=stats_shape, dtype=features.dtype)
        features /= tf.constant(stddev_rgb, shape=stats_shape, dtype=features.dtype)
        return features

    def build_model():
        """Build model using the model_name given through the hyperparameters."""
        model_builder = model_builder_factory.get_model_builder(
            context.get_hparam("model_name"),
        )
        normalized_features = normalize_features(
            features, model_builder.MEAN_RGB, model_builder.STDDEV_RGB
        )
        logits, _ = model_builder.build_model(
            normalized_features,
            model_name=context.get_hparam("model_name"),
            training=is_training,
            override_params=override_params,
        )
        return logits

    logits = build_model()

    # Calculate loss, which includes softmax cross entropy and L2 regularization.
    cross_entropy = tf.losses.softmax_cross_entropy(
        logits=logits,
        onehot_labels=labels,
        label_smoothing=context.get_hparam("label_smoothing"),
    )

    # Add weight decay to the loss for non-batch-normalization variables.
    loss = cross_entropy + context.get_hparam("weight_decay") * tf.add_n(
        [
            tf.nn.l2_loss(v)
            for v in tf.trainable_variables()
            if "batch_normalization" not in v.name
        ]
    )

    global_step = tf.train.get_global_step()
    if has_moving_average_decay:
        ema = tf.train.ExponentialMovingAverage(
            decay=context.get_hparam("moving_average_decay"), num_updates=global_step
        )
        ema_vars = utils.get_ema_vars()

    restore_vars_dict = None
    train_op = None
    if is_training:
        # Scale the base learning rate linearly with the batch size
        # (relative to a batch of 256).
        scaled_lr = context.get_hparam("base_learning_rate") * (
            context.get_hparam("train_batch_size") / 256.0
        )
        logging.info("base_learning_rate = %f", context.get_hparam("base_learning_rate"))
        learning_rate = utils.build_learning_rate(
            scaled_lr,
            global_step,
            context.get_hparam("steps_per_epoch"),
        )
        optimizer = utils.build_optimizer(context, learning_rate)

        # Batch normalization requires UPDATE_OPS to be added as a dependency to
        # the train operation.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            train_op = optimizer.minimize(loss, global_step)

        if has_moving_average_decay:
            # Update the moving averages only after the optimizer step.
            with tf.control_dependencies([train_op]):
                train_op = ema.apply(ema_vars)

    if has_moving_average_decay:
        # Load moving average variables for eval.
        # NOTE(review): nothing in this function consumes restore_vars_dict
        # (no Saver/Scaffold is built from it) - confirm whether the averaged
        # weights are restored elsewhere.
        restore_vars_dict = ema.variables_to_restore(ema_vars)

    eval_metrics = None
    if mode == tf.estimator.ModeKeys.EVAL:

        def metric_fn(labels, logits):
            """Evaluation metric function.

            Evaluates top-1 and top-5 accuracy.

            Args:
              labels: `Tensor` with shape `[batch, num_classes]`.
              logits: `Tensor` with shape `[batch, num_classes]`.

            Returns:
              A dict of the metrics to return from evaluation.
            """
            labels = tf.argmax(labels, axis=1)
            predictions = tf.argmax(logits, axis=1)
            top_1_accuracy = tf.metrics.accuracy(labels, predictions)
            in_top_5 = tf.cast(tf.nn.in_top_k(logits, labels, 5), tf.float32)
            top_5_accuracy = tf.metrics.mean(in_top_5)

            return {
                "top_1_accuracy": top_1_accuracy,
                "top_5_accuracy": top_5_accuracy,
            }

        # EstimatorSpec takes the metric ops themselves, so the function is
        # called here rather than passed along.
        eval_metrics = metric_fn(labels, logits)

    num_params = np.sum([np.prod(v.shape) for v in tf.trainable_variables()])
    logging.info("number of trainable parameters: %d", num_params)

    return tf.estimator.EstimatorSpec(
        mode=mode,
        loss=loss,
        train_op=train_op,
        eval_metric_ops=eval_metrics,
    )
def model_fn(features, labels, mode, params):
    """Build the model graph for `mode`; intended for use with TPUEstimator.

    Args:
        features: `Tensor` of batched images, or a `dict` holding that tensor
            under the key `'feature'`.
        labels: `Tensor` of one hot labels for the data samples.
        mode: one of `tf.estimator.ModeKeys.{TRAIN,EVAL,PREDICT}`.
        params: `dict` of parameters passed to the model from the
            TPUEstimator. This function reads `params['use_bfloat16']` and,
            when training, `params['steps_per_epoch']`. If a batch size is
            ever needed here, use `params['batch_size']` rather than the
            batch size flags (--train_batch_size or --eval_batch_size).

    Returns:
        A `tf.estimator.EstimatorSpec` carrying predictions and export
        outputs in PREDICT mode; otherwise a
        `tf.estimator.tpu.TPUEstimatorSpec`.

    Raises:
        ValueError: if `FLAGS.data_format` is `'channels_first'` while
            `FLAGS.transpose_input` is set.
    """
    if isinstance(features, dict):
        features = features['feature']

    # In most cases, the default data format NCHW instead of NHWC should be
    # used for a significant performance boost on GPU. NHWC should be used
    # only if the network needs to be run on CPU since the pooling operations
    # are only supported on NHWC. TPU uses XLA compiler to figure out best
    # layout.
    if FLAGS.data_format == 'channels_first':
        # channels_first only for GPU. This is raised explicitly (rather than
        # asserted) so the check survives `python -O`.
        if FLAGS.transpose_input:
            raise ValueError(
                'transpose_input is not supported together with '
                'data_format=channels_first.')
        features = tf.transpose(features, [0, 3, 1, 2])
        stats_shape = [3, 1, 1]
    else:
        stats_shape = [1, 1, 3]

    if FLAGS.transpose_input and mode != tf.estimator.ModeKeys.PREDICT:
        features = tf.transpose(features, [3, 0, 1, 2])  # HWCN to NHWC

    is_training = (mode == tf.estimator.ModeKeys.TRAIN)
    has_moving_average_decay = (FLAGS.moving_average_decay > 0)
    # This is essential, if using a keras-derived model.
    tf.keras.backend.set_learning_phase(is_training)
    logging.info('Using open-source implementation.')

    # Collect command-line overrides for the model builder. The first group is
    # forwarded whenever the flag is set (a value of 0 still counts); the
    # second group is forwarded only when the flag value is truthy.
    override_params = {}
    if FLAGS.batch_norm_momentum is not None:
        override_params['batch_norm_momentum'] = FLAGS.batch_norm_momentum
    if FLAGS.batch_norm_epsilon is not None:
        override_params['batch_norm_epsilon'] = FLAGS.batch_norm_epsilon
    if FLAGS.dropout_rate is not None:
        override_params['dropout_rate'] = FLAGS.dropout_rate
    if FLAGS.survival_prob is not None:
        override_params['survival_prob'] = FLAGS.survival_prob
    if FLAGS.data_format:
        override_params['data_format'] = FLAGS.data_format
    if FLAGS.num_label_classes:
        override_params['num_classes'] = FLAGS.num_label_classes
    if FLAGS.depth_coefficient:
        override_params['depth_coefficient'] = FLAGS.depth_coefficient
    if FLAGS.width_coefficient:
        override_params['width_coefficient'] = FLAGS.width_coefficient

    def normalize_features(features, mean_rgb, stddev_rgb):
        """Normalize the image given the means and stddevs."""
        features -= tf.constant(
            mean_rgb, shape=stats_shape, dtype=features.dtype)
        features /= tf.constant(
            stddev_rgb, shape=stats_shape, dtype=features.dtype)
        return features

    def build_model():
        """Build model using the model_name given through the command line."""
        model_builder = model_builder_factory.get_model_builder(
            FLAGS.model_name)
        normalized_features = normalize_features(
            features, model_builder.MEAN_RGB, model_builder.STDDEV_RGB)
        logits, _ = model_builder.build_model(
            normalized_features,
            model_name=FLAGS.model_name,
            training=is_training,
            override_params=override_params,
            model_dir=FLAGS.model_dir)
        return logits

    if params['use_bfloat16']:
        # Build under the bfloat16 scope, then cast the logits back to
        # float32 for everything downstream.
        with tf.tpu.bfloat16_scope():
            logits = tf.cast(build_model(), tf.float32)
    else:
        logits = build_model()

    if mode == tf.estimator.ModeKeys.PREDICT:
        predictions = {
            'classes': tf.argmax(logits, axis=1),
            'probabilities': tf.nn.softmax(logits, name='softmax_tensor')
        }
        return tf.estimator.EstimatorSpec(
            mode=mode,
            predictions=predictions,
            export_outputs={
                'classify': tf.estimator.export.PredictOutput(predictions)
            })

    # Calculate loss, which includes softmax cross entropy and L2
    # regularization.
    cross_entropy = tf.losses.softmax_cross_entropy(
        logits=logits,
        onehot_labels=labels,
        label_smoothing=FLAGS.label_smoothing)

    # Add weight decay to the loss for non-batch-normalization variables.
    loss = cross_entropy + FLAGS.weight_decay * tf.add_n([
        tf.nn.l2_loss(v)
        for v in tf.trainable_variables()
        if 'batch_normalization' not in v.name
    ])

    global_step = tf.train.get_global_step()
    if has_moving_average_decay:
        ema = tf.train.ExponentialMovingAverage(
            decay=FLAGS.moving_average_decay, num_updates=global_step)
        ema_vars = utils.get_ema_vars()

    host_call = None
    restore_vars_dict = None
    train_op = None
    if is_training:
        # Compute the current epoch and associated learning rate from
        # global_step.
        current_epoch = (
            tf.cast(global_step, tf.float32) / params['steps_per_epoch'])

        scaled_lr = FLAGS.base_learning_rate * (FLAGS.train_batch_size / 256.0)
        logging.info('base_learning_rate = %f', FLAGS.base_learning_rate)
        learning_rate = utils.build_learning_rate(
            scaled_lr,
            global_step,
            params['steps_per_epoch'],
            decay_epochs=FLAGS.lr_decay_epoch)
        optimizer = utils.build_optimizer(learning_rate)
        if FLAGS.use_tpu:
            # When using TPU, wrap the optimizer with CrossShardOptimizer
            # which handles synchronization details between different TPU
            # cores. To the user, this should look like regular synchronous
            # training.
            optimizer = tf.tpu.CrossShardOptimizer(optimizer)

        # Batch normalization requires UPDATE_OPS to be added as a dependency
        # to the train operation.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            train_op = optimizer.minimize(loss, global_step)

        if has_moving_average_decay:
            # Update the moving averages only after the optimizer step ran.
            with tf.control_dependencies([train_op]):
                train_op = ema.apply(ema_vars)

        if not FLAGS.skip_host_call:

            def host_call_fn(gs, lr, ce):
                """Training host call. Creates scalar summaries for training metrics.

                This function is executed on the CPU and should not directly
                reference any Tensors in the rest of the `model_fn`. To pass
                Tensors from the model to the `host_call_fn`, provide as part
                of the `host_call`. See
                https://www.tensorflow.org/api_docs/python/tf/estimator/tpu/TPUEstimatorSpec
                for more information.

                Arguments should match the list of `Tensor` objects passed as
                the second element in the tuple passed to `host_call`.

                Args:
                    gs: `Tensor` with shape `[batch]` for the global_step.
                    lr: `Tensor` with shape `[batch]` for the learning_rate.
                    ce: `Tensor` with shape `[batch]` for the current_epoch.

                Returns:
                    List of summary ops to run on the CPU host.
                """
                gs = gs[0]
                # Host call fns are executed FLAGS.iterations_per_loop times
                # after one TPU loop is finished, setting max_queue value to
                # the same as number of iterations will make the summary
                # writer only flush the data to storage once per loop.
                with tf2.summary.create_file_writer(
                        FLAGS.model_dir,
                        max_queue=FLAGS.iterations_per_loop).as_default():
                    with tf2.summary.record_if(True):
                        tf2.summary.scalar('learning_rate', lr[0], step=gs)
                        tf2.summary.scalar('current_epoch', ce[0], step=gs)

                        return tf.summary.all_v2_summary_ops()

            # To log the current learning rate and epoch for Tensorboard, the
            # summary op needs to be run on the host CPU via host_call.
            # host_call expects [batch_size, ...] Tensors, thus reshape to
            # introduce a batch dimension. These Tensors are implicitly
            # concatenated to [params['batch_size']].
            gs_t = tf.reshape(global_step, [1])
            lr_t = tf.reshape(learning_rate, [1])
            ce_t = tf.reshape(current_epoch, [1])

            host_call = (host_call_fn, [gs_t, lr_t, ce_t])

    if has_moving_average_decay:
        # Load moving average variables for eval.
        restore_vars_dict = ema.variables_to_restore(ema_vars)

    eval_metrics = None
    if mode == tf.estimator.ModeKeys.EVAL:

        def metric_fn(labels, logits):
            """Evaluation metric function. Evaluates accuracy.

            This function is executed on the CPU and should not directly
            reference any Tensors in the rest of the `model_fn`. To pass
            Tensors from the model to the `metric_fn`, provide as part of the
            `eval_metrics`. See
            https://www.tensorflow.org/api_docs/python/tf/estimator/tpu/TPUEstimatorSpec
            for more information.

            Arguments should match the list of `Tensor` objects passed as the
            second element in the tuple passed to `eval_metrics`.

            Args:
                labels: `Tensor` with shape `[batch, num_classes]`.
                logits: `Tensor` with shape `[batch, num_classes]`.

            Returns:
                A dict of the metrics to return from evaluation.
            """
            labels = tf.argmax(labels, axis=1)
            predictions = tf.argmax(logits, axis=1)
            top_1_accuracy = tf.metrics.accuracy(labels, predictions)
            in_top_5 = tf.cast(tf.nn.in_top_k(logits, labels, 5), tf.float32)
            top_5_accuracy = tf.metrics.mean(in_top_5)

            return {
                'top_1_accuracy': top_1_accuracy,
                'top_5_accuracy': top_5_accuracy,
            }

        # Passed as (fn, tensors) so the metric ops are built by the
        # estimator from the listed tensors rather than here.
        eval_metrics = (metric_fn, [labels, logits])

    num_params = np.sum([np.prod(v.shape) for v in tf.trainable_variables()])
    logging.info('number of trainable parameters: %d', num_params)

    def _scaffold_fn():
        """Return a Scaffold whose Saver restores `restore_vars_dict`."""
        saver = tf.train.Saver(restore_vars_dict)
        return tf.train.Scaffold(saver=saver)

    if has_moving_average_decay and not is_training:
        # Only apply scaffold for eval jobs.
        scaffold_fn = _scaffold_fn
    else:
        scaffold_fn = None

    return tf.estimator.tpu.TPUEstimatorSpec(
        mode=mode,
        loss=loss,
        train_op=train_op,
        host_call=host_call,
        eval_metrics=eval_metrics,
        scaffold_fn=scaffold_fn)
# Instantiate the multitask tagger via `from_pretrained`, using the checkpoint
# named by ARGS.bert_model and a cache directory under ARGS.working_dir.
# NOTE(review): cls_num_labels / tok_num_labels presumably size the
# classification and per-token heads -- confirm in tagging_model.
model = tagging_model.BertForMultitask.from_pretrained(
    ARGS.bert_model,
    cls_num_labels=ARGS.num_categories,
    tok_num_labels=ARGS.num_tok_labels,
    cache_dir=ARGS.working_dir + '/cache',
    tok2id=tok2id)
if CUDA:
    model = model.cuda()
    print("cuda available")

print('PREPPING RUN...')

# # # # # # # # ## # # # ## # # OPTIMIZER, LOSS # # # # # # # # ## # # # ## # #

# The second argument is (examples * epochs) / batch size, truncated to an int.
# NOTE(review): presumably the total number of optimization steps expected by
# build_optimizer -- confirm against tagging_utils.
optimizer = tagging_utils.build_optimizer(
    model,
    int((num_train_examples * ARGS.epochs) / ARGS.train_batch_size),
    ARGS.learning_rate)

loss_fn = tagging_utils.build_loss_fn()

# # # # # # # # ## # # # ## # # TRAIN # # # # # # # # ## # # # ## # #

# Summaries are written into ARGS.working_dir.
writer = SummaryWriter(ARGS.working_dir)

# Evaluate once before any training; the two means are recorded with a third
# argument of 0.
# NOTE(review): the trailing 0 is presumably the global step -- confirm against
# the SummaryWriter implementation imported by this file.
print('INITIAL EVAL...')
model.eval()
results = tagging_utils.run_inference(model, eval_dataloader, loss_fn, tokenizer)
writer.add_scalar('eval/tok_loss', np.mean(results['tok_loss']), 0)
writer.add_scalar('eval/tok_acc', np.mean(results['labeling_hits']), 0)