def main(argv):
  """Trains and evaluates an SNGP BERT model, or only writes predictions.

  Builds the train/test dataset builders, constructs the model through
  `ub.models.SngpBertBuilder` under a distribution strategy, restores the
  latest checkpoint when one exists (otherwise loads the initial BERT
  weights), and then either:

  * `FLAGS.prediction_mode` set: runs `final_eval_step` over every test
    dataset and saves texts, labels, logits (and identity labels for datasets
    whose name contains 'identity') under `FLAGS.output_dir`; or
  * otherwise: runs the train / eval loop, writing summaries and checkpoints
    and saving the final model under `FLAGS.output_dir`.

  Args:
    argv: Unused command-line arguments.

  Raises:
    ValueError: If `FLAGS.loss_type` is not one of 'cross_entropy',
      'focal_cross_entropy', 'mse' or 'mae' (raised when `train_step` runs).
  """
  del argv  # unused arg
  tf.io.gfile.makedirs(FLAGS.output_dir)
  logging.info('Saving checkpoints at %s', FLAGS.output_dir)
  tf.random.set_seed(FLAGS.seed)

  if FLAGS.use_gpu:
    logging.info('Use GPU')
    strategy = tf.distribute.MirroredStrategy()
  else:
    logging.info('Use TPU at %s',
                 FLAGS.tpu if FLAGS.tpu is not None else 'local')
    resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu=FLAGS.tpu)
    tf.config.experimental_connect_to_cluster(resolver)
    tf.tpu.experimental.initialize_tpu_system(resolver)
    strategy = tf.distribute.TPUStrategy(resolver)

  batch_size = FLAGS.per_core_batch_size * FLAGS.num_cores
  test_batch_size = batch_size
  data_buffer_size = batch_size * 10

  train_dataset_builder = ds.WikipediaToxicityDataset(
      split='train',
      data_dir=FLAGS.in_dataset_dir,
      shuffle_buffer_size=data_buffer_size)
  ind_dataset_builder = ds.WikipediaToxicityDataset(
      split='test',
      data_dir=FLAGS.in_dataset_dir,
      shuffle_buffer_size=data_buffer_size)
  ood_dataset_builder = ds.CivilCommentsDataset(
      split='test',
      data_dir=FLAGS.ood_dataset_dir,
      shuffle_buffer_size=data_buffer_size)
  ood_identity_dataset_builder = ds.CivilCommentsIdentitiesDataset(
      split='test',
      data_dir=FLAGS.identity_dataset_dir,
      shuffle_buffer_size=data_buffer_size)

  train_dataset_builders = {
      'wikipedia_toxicity_subtypes': train_dataset_builder
  }
  test_dataset_builders = {
      'ind': ind_dataset_builder,
      'ood': ood_dataset_builder,
      'ood_identity': ood_identity_dataset_builder,
  }
  if FLAGS.prediction_mode and FLAGS.identity_prediction:
    # Add one test dataset per identity label / identity type, but only for
    # those with more than 100 test examples.
    for dataset_name in utils.IDENTITY_LABELS:
      if utils.NUM_EXAMPLES[dataset_name]['test'] > 100:
        test_dataset_builders[dataset_name] = ds.CivilCommentsIdentitiesDataset(
            split='test',
            data_dir=os.path.join(
                FLAGS.identity_specific_dataset_dir, dataset_name),
            shuffle_buffer_size=data_buffer_size)
    for dataset_name in utils.IDENTITY_TYPES:
      if utils.NUM_EXAMPLES[dataset_name]['test'] > 100:
        test_dataset_builders[dataset_name] = ds.CivilCommentsIdentitiesDataset(
            split='test',
            data_dir=os.path.join(
                FLAGS.identity_type_dataset_dir, dataset_name),
            shuffle_buffer_size=data_buffer_size)

  class_weight = utils.create_class_weight(
      train_dataset_builders, test_dataset_builders)
  logging.info('class_weight: %s', str(class_weight))

  ds_info = train_dataset_builder.tfds_info
  # Positive and negative classes.
  num_classes = ds_info.metadata['num_classes']

  train_datasets = {}
  dataset_steps_per_epoch = {}
  total_steps_per_epoch = 0

  # TODO(jereliu): Apply strategy.experimental_distribute_dataset to the
  # dataset_builders.
  for dataset_name, dataset_builder in train_dataset_builders.items():
    train_datasets[dataset_name] = dataset_builder.load(
        batch_size=FLAGS.per_core_batch_size)
    dataset_steps_per_epoch[dataset_name] = (
        dataset_builder.num_examples // batch_size)
    total_steps_per_epoch += dataset_steps_per_epoch[dataset_name]

  test_datasets = {}
  steps_per_eval = {}
  for dataset_name, dataset_builder in test_dataset_builders.items():
    test_datasets[dataset_name] = dataset_builder.load(
        batch_size=test_batch_size)
    if dataset_name in ['ind', 'ood', 'ood_identity']:
      steps_per_eval[dataset_name] = (
          dataset_builder.num_examples // test_batch_size)
    else:
      # Identity-specific datasets take their size from utils.NUM_EXAMPLES
      # rather than from the builder.
      steps_per_eval[dataset_name] = (
          utils.NUM_EXAMPLES[dataset_name]['test'] // test_batch_size)

  if FLAGS.use_bfloat16:
    policy = tf.keras.mixed_precision.experimental.Policy('mixed_bfloat16')
    tf.keras.mixed_precision.experimental.set_policy(policy)

  summary_writer = tf.summary.create_file_writer(
      os.path.join(FLAGS.output_dir, 'summaries'))

  with strategy.scope():
    logging.info('Building BERT %s model', FLAGS.bert_model_type)
    logging.info('use_gp_layer=%s', FLAGS.use_gp_layer)
    logging.info('use_spec_norm_att=%s', FLAGS.use_spec_norm_att)
    logging.info('use_spec_norm_ffn=%s', FLAGS.use_spec_norm_ffn)
    logging.info('use_layer_norm_att=%s', FLAGS.use_layer_norm_att)
    logging.info('use_layer_norm_ffn=%s', FLAGS.use_layer_norm_ffn)

    bert_config_dir, bert_ckpt_dir = utils.resolve_bert_ckpt_and_config_dir(
        FLAGS.bert_model_type, FLAGS.bert_dir, FLAGS.bert_config_dir,
        FLAGS.bert_ckpt_dir)
    bert_config = utils.create_config(bert_config_dir)

    gp_layer_kwargs = dict(
        num_inducing=FLAGS.gp_hidden_dim,
        gp_kernel_scale=FLAGS.gp_scale,
        gp_output_bias=FLAGS.gp_bias,
        normalize_input=FLAGS.gp_input_normalization,
        gp_cov_momentum=FLAGS.gp_cov_discount_factor,
        gp_cov_ridge_penalty=FLAGS.gp_cov_ridge_penalty)
    spec_norm_kwargs = dict(
        iteration=FLAGS.spec_norm_iteration,
        norm_multiplier=FLAGS.spec_norm_bound)

    model, bert_encoder = ub.models.SngpBertBuilder(
        num_classes=num_classes,
        bert_config=bert_config,
        gp_layer_kwargs=gp_layer_kwargs,
        spec_norm_kwargs=spec_norm_kwargs,
        use_gp_layer=FLAGS.use_gp_layer,
        use_spec_norm_att=FLAGS.use_spec_norm_att,
        use_spec_norm_ffn=FLAGS.use_spec_norm_ffn,
        use_layer_norm_att=FLAGS.use_layer_norm_att,
        use_layer_norm_ffn=FLAGS.use_layer_norm_ffn,
        use_spec_norm_plr=FLAGS.use_spec_norm_plr)

    # Create an AdamW optimizer with beta_2=0.999, epsilon=1e-6.
    optimizer = utils.create_optimizer(
        FLAGS.base_learning_rate,
        steps_per_epoch=total_steps_per_epoch,
        epochs=FLAGS.train_epochs,
        warmup_proportion=FLAGS.warmup_proportion,
        beta_1=1.0 - FLAGS.one_minus_momentum)

    logging.info('Model input shape: %s', model.input_shape)
    logging.info('Model output shape: %s', model.output_shape)
    logging.info('Model number of weights: %s', model.count_params())

    metrics = {
        'train/negative_log_likelihood': tf.keras.metrics.Mean(),
        'train/accuracy': tf.keras.metrics.Accuracy(),
        'train/accuracy_weighted': tf.keras.metrics.Accuracy(),
        'train/auroc': tf.keras.metrics.AUC(),
        'train/loss': tf.keras.metrics.Mean(),
        'train/ece': rm.metrics.ExpectedCalibrationError(
            num_bins=FLAGS.num_bins),
        'train/precision': tf.keras.metrics.Precision(),
        'train/recall': tf.keras.metrics.Recall(),
        'train/f1': tfa_metrics.F1Score(
            num_classes=num_classes,
            average='micro',
            threshold=FLAGS.ece_label_threshold),
    }

    checkpoint = tf.train.Checkpoint(model=model, optimizer=optimizer)
    if FLAGS.prediction_mode:
      latest_checkpoint = tf.train.latest_checkpoint(FLAGS.eval_checkpoint_dir)
    else:
      latest_checkpoint = tf.train.latest_checkpoint(FLAGS.output_dir)
    initial_epoch = 0
    if latest_checkpoint:
      # checkpoint.restore must be within a strategy.scope() so that optimizer
      # slot variables are mirrored.
      checkpoint.restore(latest_checkpoint)
      logging.info('Loaded checkpoint %s', latest_checkpoint)
      # Resume from the epoch implied by the restored optimizer step count.
      initial_epoch = optimizer.iterations.numpy() // total_steps_per_epoch
    else:
      # load BERT from initial checkpoint
      bert_encoder, _, _ = utils.load_bert_weight_from_ckpt(
          bert_model=bert_encoder,
          bert_ckpt_dir=bert_ckpt_dir,
          repl_patterns=ub.models.bert_sngp.CHECKPOINT_REPL_PATTERNS)
      logging.info('Loaded BERT checkpoint %s', bert_ckpt_dir)

    # Metrics for the in-domain ('ind') test set use un-suffixed names.
    metrics.update({
        'test/negative_log_likelihood':
            tf.keras.metrics.Mean(),
        'test/auroc':
            tf.keras.metrics.AUC(curve='ROC'),
        'test/aupr':
            tf.keras.metrics.AUC(curve='PR'),
        'test/brier':
            tf.keras.metrics.MeanSquaredError(),
        'test/brier_weighted':
            tf.keras.metrics.MeanSquaredError(),
        'test/ece':
            rm.metrics.ExpectedCalibrationError(num_bins=FLAGS.num_bins),
        'test/acc':
            tf.keras.metrics.Accuracy(),
        'test/acc_weighted':
            tf.keras.metrics.Accuracy(),
        'test/eval_time':
            tf.keras.metrics.Mean(),
        'test/stddev':
            tf.keras.metrics.Mean(),
        'test/precision':
            tf.keras.metrics.Precision(),
        'test/recall':
            tf.keras.metrics.Recall(),
        'test/f1':
            tfa_metrics.F1Score(
                num_classes=num_classes,
                average='micro',
                threshold=FLAGS.ece_label_threshold),
        'test/calibration_auroc':
            tc_metrics.CalibrationAUC(curve='ROC'),
        'test/calibration_auprc':
            tc_metrics.CalibrationAUC(curve='PR')
    })

    for fraction in FLAGS.fractions:
      metrics.update({
          'test_collab_acc/collab_acc_{}'.format(fraction):
              rm.metrics.OracleCollaborativeAccuracy(
                  fraction=float(fraction), num_bins=FLAGS.num_bins)
      })
      metrics.update({
          'test_abstain_prec/abstain_prec_{}'.format(fraction):
              tc_metrics.AbstainPrecision(abstain_fraction=float(fraction))
      })
      metrics.update({
          'test_abstain_recall/abstain_recall_{}'.format(fraction):
              tc_metrics.AbstainRecall(abstain_fraction=float(fraction))
      })

    # Every other test set gets its own copy of the metrics, suffixed with the
    # dataset name. Only the names are needed here, so iterate over the keys.
    for dataset_name in test_datasets:
      if dataset_name != 'ind':
        metrics.update({
            'test/nll_{}'.format(dataset_name):
                tf.keras.metrics.Mean(),
            'test/auroc_{}'.format(dataset_name):
                tf.keras.metrics.AUC(curve='ROC'),
            'test/aupr_{}'.format(dataset_name):
                tf.keras.metrics.AUC(curve='PR'),
            'test/brier_{}'.format(dataset_name):
                tf.keras.metrics.MeanSquaredError(),
            'test/brier_weighted_{}'.format(dataset_name):
                tf.keras.metrics.MeanSquaredError(),
            'test/ece_{}'.format(dataset_name):
                rm.metrics.ExpectedCalibrationError(num_bins=FLAGS.num_bins),
            'test/acc_{}'.format(dataset_name):
                tf.keras.metrics.Accuracy(),
            'test/acc_weighted_{}'.format(dataset_name):
                tf.keras.metrics.Accuracy(),
            'test/eval_time_{}'.format(dataset_name):
                tf.keras.metrics.Mean(),
            'test/stddev_{}'.format(dataset_name):
                tf.keras.metrics.Mean(),
            'test/precision_{}'.format(dataset_name):
                tf.keras.metrics.Precision(),
            'test/recall_{}'.format(dataset_name):
                tf.keras.metrics.Recall(),
            'test/f1_{}'.format(dataset_name):
                tfa_metrics.F1Score(
                    num_classes=num_classes,
                    average='micro',
                    threshold=FLAGS.ece_label_threshold),
            'test/calibration_auroc_{}'.format(dataset_name):
                tc_metrics.CalibrationAUC(curve='ROC'),
            'test/calibration_auprc_{}'.format(dataset_name):
                tc_metrics.CalibrationAUC(curve='PR'),
        })

        for fraction in FLAGS.fractions:
          metrics.update({
              'test_collab_acc/collab_acc_{}_{}'.format(fraction, dataset_name):
                  rm.metrics.OracleCollaborativeAccuracy(
                      fraction=float(fraction), num_bins=FLAGS.num_bins)
          })
          metrics.update({
              'test_abstain_prec/abstain_prec_{}_{}'.format(
                  fraction, dataset_name):
                  tc_metrics.AbstainPrecision(abstain_fraction=float(fraction))
          })
          metrics.update({
              'test_abstain_recall/abstain_recall_{}_{}'.format(
                  fraction, dataset_name):
                  tc_metrics.AbstainRecall(abstain_fraction=float(fraction))
          })

  @tf.function
  def generate_sample_weight(labels, class_weight, label_threshold=0.7):
    """Generate sample weight for weighted accuracy calculation."""
    if label_threshold != 0.7:
      logging.warning('The class weight was based on `label_threshold` = 0.7, '
                      'and weighted accuracy/brier will be meaningless if '
                      '`label_threshold` is not equal to this value, which is '
                      'recommended by Jigsaw Conversation AI team.')
    # Binarize the labels and use them as indices into `class_weight`.
    labels_int = tf.cast(labels > label_threshold, tf.int32)
    sample_weight = tf.gather(class_weight, labels_int)
    return sample_weight

  @tf.function
  def train_step(iterator, dataset_name, num_steps):
    """Training StepFn."""

    def step_fn(inputs):
      """Per-Replica StepFn."""
      features, labels, _ = utils.create_feature_and_label(inputs)

      with tf.GradientTape() as tape:
        logits = model(features, training=True)

        if isinstance(logits, (list, tuple)):
          # If model returns a tuple of (logits, covmat), extract logits
          logits, _ = logits
        if FLAGS.use_bfloat16:
          logits = tf.cast(logits, tf.float32)

        loss_logits = tf.squeeze(logits, axis=1)
        if FLAGS.loss_type == 'cross_entropy':
          logging.info('Using cross entropy loss')
          negative_log_likelihood = tf.nn.sigmoid_cross_entropy_with_logits(
              labels, loss_logits)
        elif FLAGS.loss_type == 'focal_cross_entropy':
          logging.info('Using focal cross entropy loss')
          negative_log_likelihood = tfa_losses.sigmoid_focal_crossentropy(
              labels,
              loss_logits,
              alpha=FLAGS.focal_loss_alpha,
              gamma=FLAGS.focal_loss_gamma,
              from_logits=True)
        elif FLAGS.loss_type == 'mse':
          logging.info('Using mean squared error loss')
          loss_probs = tf.nn.sigmoid(loss_logits)
          negative_log_likelihood = tf.keras.losses.mean_squared_error(
              labels, loss_probs)
        elif FLAGS.loss_type == 'mae':
          logging.info('Using mean absolute error loss')
          loss_probs = tf.nn.sigmoid(loss_logits)
          negative_log_likelihood = tf.keras.losses.mean_absolute_error(
              labels, loss_probs)
        else:
          # Fail with a clear message instead of an UnboundLocalError on the
          # `negative_log_likelihood` use below.
          raise ValueError(
              'Unsupported loss_type: {}'.format(FLAGS.loss_type))

        negative_log_likelihood = tf.reduce_mean(negative_log_likelihood)

        l2_loss = sum(model.losses)
        loss = negative_log_likelihood + l2_loss
        # Scale the loss given the TPUStrategy will reduce sum all gradients.
        scaled_loss = loss / strategy.num_replicas_in_sync

      grads = tape.gradient(scaled_loss, model.trainable_variables)
      optimizer.apply_gradients(zip(grads, model.trainable_variables))

      probs = tf.nn.sigmoid(logits)
      # Cast labels to discrete for ECE computation.
      ece_labels = tf.cast(labels > FLAGS.ece_label_threshold, tf.float32)
      one_hot_labels = tf.one_hot(tf.cast(ece_labels, tf.int32),
                                  depth=num_classes)
      ece_probs = tf.concat([1. - probs, probs], axis=1)
      auc_probs = tf.squeeze(probs, axis=1)
      pred_labels = tf.math.argmax(ece_probs, axis=-1)

      sample_weight = generate_sample_weight(
          labels, class_weight['train/{}'.format(dataset_name)],
          FLAGS.ece_label_threshold)
      metrics['train/negative_log_likelihood'].update_state(
          negative_log_likelihood)
      # Compare predictions against the thresholded labels, as is done for
      # 'train/accuracy_weighted' and for the test accuracy metrics; the raw
      # `labels` are thresholded above, so they need not be exactly 0/1.
      metrics['train/accuracy'].update_state(ece_labels, pred_labels)
      metrics['train/accuracy_weighted'].update_state(
          ece_labels, pred_labels, sample_weight=sample_weight)
      metrics['train/auroc'].update_state(labels, auc_probs)
      metrics['train/loss'].update_state(loss)
      metrics['train/ece'].add_batch(ece_probs, label=ece_labels)
      metrics['train/precision'].update_state(ece_labels, pred_labels)
      metrics['train/recall'].update_state(ece_labels, pred_labels)
      metrics['train/f1'].update_state(one_hot_labels, ece_probs)

    for _ in tf.range(tf.cast(num_steps, tf.int32)):
      strategy.run(step_fn, args=(next(iterator),))

  @tf.function
  def test_step(iterator, dataset_name):
    """Evaluation StepFn."""

    def step_fn(inputs):
      """Per-Replica StepFn."""
      features, labels, _ = utils.create_feature_and_label(inputs)

      # NOTE(review): step_fn runs under @tf.function, so these time.time()
      # calls presumably execute only while the function is traced -- confirm
      # that 'test/eval_time' measures what is intended.
      eval_start_time = time.time()
      # Compute ensemble prediction over Monte Carlo forward-pass samples.
      logits_list = []
      stddev_list = []
      for _ in range(FLAGS.num_mc_samples):
        logits = model(features, training=False)

        if isinstance(logits, (list, tuple)):
          # If model returns a tuple of (logits, covmat), extract both.
          logits, covmat = logits
        else:
          covmat = tf.eye(test_batch_size)

        if FLAGS.use_bfloat16:
          logits = tf.cast(logits, tf.float32)
          covmat = tf.cast(covmat, tf.float32)

        logits = ed.layers.utils.mean_field_logits(
            logits, covmat, mean_field_factor=FLAGS.gp_mean_field_factor)
        stddev = tf.sqrt(tf.linalg.diag_part(covmat))

        logits_list.append(logits)
        stddev_list.append(stddev)

      eval_time = (time.time() - eval_start_time) / FLAGS.per_core_batch_size
      # Logits dimension is (num_samples, batch_size, num_classes).
      logits_list = tf.stack(logits_list, axis=0)
      stddev_list = tf.stack(stddev_list, axis=0)

      stddev = tf.reduce_mean(stddev_list, axis=0)
      probs_list = tf.nn.sigmoid(logits_list)
      probs = tf.reduce_mean(probs_list, axis=0)
      # Cast labels to discrete for ECE computation.
      ece_labels = tf.cast(labels > FLAGS.ece_label_threshold, tf.float32)
      one_hot_labels = tf.one_hot(tf.cast(ece_labels, tf.int32),
                                  depth=num_classes)
      ece_probs = tf.concat([1. - probs, probs], axis=1)
      pred_labels = tf.math.argmax(ece_probs, axis=-1)
      auc_probs = tf.squeeze(probs, axis=1)

      # Use normalized binary predictive variance as the confidence score.
      # Since the prediction variance p*(1-p) is within range (0, 0.25),
      # normalize it by maximum value so the confidence is between (0, 1).
      calib_confidence = 1. - probs * (1. - probs) / .25

      ce = tf.nn.sigmoid_cross_entropy_with_logits(
          labels=tf.broadcast_to(
              labels, [FLAGS.num_mc_samples, labels.shape[0]]),
          logits=tf.squeeze(logits_list, axis=-1)
      )
      # Negative log of the mean per-sample likelihood over the MC samples.
      negative_log_likelihood = -tf.reduce_logsumexp(
          -ce, axis=0) + tf.math.log(float(FLAGS.num_mc_samples))
      negative_log_likelihood = tf.reduce_mean(negative_log_likelihood)

      sample_weight = generate_sample_weight(
          labels, class_weight['test/{}'.format(dataset_name)],
          FLAGS.ece_label_threshold)
      if dataset_name == 'ind':
        metrics['test/negative_log_likelihood'].update_state(
            negative_log_likelihood)
        metrics['test/auroc'].update_state(labels, auc_probs)
        metrics['test/aupr'].update_state(labels, auc_probs)
        metrics['test/brier'].update_state(labels, auc_probs)
        metrics['test/brier_weighted'].update_state(
            tf.expand_dims(labels, -1), probs, sample_weight=sample_weight)
        metrics['test/ece'].add_batch(ece_probs, label=ece_labels)
        metrics['test/acc'].update_state(ece_labels, pred_labels)
        metrics['test/acc_weighted'].update_state(
            ece_labels, pred_labels, sample_weight=sample_weight)
        metrics['test/eval_time'].update_state(eval_time)
        metrics['test/stddev'].update_state(stddev)
        metrics['test/precision'].update_state(ece_labels, pred_labels)
        metrics['test/recall'].update_state(ece_labels, pred_labels)
        metrics['test/f1'].update_state(one_hot_labels, ece_probs)
        metrics['test/calibration_auroc'].update_state(ece_labels, pred_labels,
                                                       calib_confidence)
        metrics['test/calibration_auprc'].update_state(ece_labels, pred_labels,
                                                       calib_confidence)
        for fraction in FLAGS.fractions:
          metrics['test_collab_acc/collab_acc_{}'.format(
              fraction)].add_batch(ece_probs, label=ece_labels)
          metrics['test_abstain_prec/abstain_prec_{}'.format(
              fraction)].update_state(ece_labels, pred_labels,
                                      calib_confidence)
          metrics['test_abstain_recall/abstain_recall_{}'.format(
              fraction)].update_state(ece_labels, pred_labels,
                                      calib_confidence)
      else:
        metrics['test/nll_{}'.format(dataset_name)].update_state(
            negative_log_likelihood)
        metrics['test/auroc_{}'.format(dataset_name)].update_state(
            labels, auc_probs)
        metrics['test/aupr_{}'.format(dataset_name)].update_state(
            labels, auc_probs)
        metrics['test/brier_{}'.format(dataset_name)].update_state(
            labels, auc_probs)
        metrics['test/brier_weighted_{}'.format(dataset_name)].update_state(
            tf.expand_dims(labels, -1), probs, sample_weight=sample_weight)
        metrics['test/ece_{}'.format(dataset_name)].add_batch(
            ece_probs, label=ece_labels)
        metrics['test/acc_{}'.format(dataset_name)].update_state(
            ece_labels, pred_labels)
        metrics['test/acc_weighted_{}'.format(dataset_name)].update_state(
            ece_labels, pred_labels, sample_weight=sample_weight)
        metrics['test/eval_time_{}'.format(dataset_name)].update_state(
            eval_time)
        metrics['test/stddev_{}'.format(dataset_name)].update_state(stddev)
        metrics['test/precision_{}'.format(dataset_name)].update_state(
            ece_labels, pred_labels)
        metrics['test/recall_{}'.format(dataset_name)].update_state(
            ece_labels, pred_labels)
        metrics['test/f1_{}'.format(dataset_name)].update_state(
            one_hot_labels, ece_probs)
        metrics['test/calibration_auroc_{}'.format(dataset_name)].update_state(
            ece_labels, pred_labels, calib_confidence)
        metrics['test/calibration_auprc_{}'.format(dataset_name)].update_state(
            ece_labels, pred_labels, calib_confidence)
        for fraction in FLAGS.fractions:
          metrics['test_collab_acc/collab_acc_{}_{}'.format(
              fraction, dataset_name)].add_batch(ece_probs, label=ece_labels)
          metrics['test_abstain_prec/abstain_prec_{}_{}'.format(
              fraction, dataset_name)].update_state(ece_labels, pred_labels,
                                                    calib_confidence)
          metrics['test_abstain_recall/abstain_recall_{}_{}'.format(
              fraction, dataset_name)].update_state(ece_labels, pred_labels,
                                                    calib_confidence)

    strategy.run(step_fn, args=(next(iterator),))

  @tf.function
  def final_eval_step(iterator):
    """Final Evaluation StepFn to save prediction to directory."""

    def step_fn(inputs):
      bert_features, labels, additional_labels = utils.create_feature_and_label(
          inputs)
      logits = model(bert_features, training=False)
      if isinstance(logits, (list, tuple)):
        # If model returns a tuple of (logits, covmat), extract both.
        logits, covmat = logits
      else:
        covmat = tf.eye(test_batch_size)

      if FLAGS.use_bfloat16:
        logits = tf.cast(logits, tf.float32)
        covmat = tf.cast(covmat, tf.float32)

      logits = ed.layers.utils.mean_field_logits(
          logits, covmat, mean_field_factor=FLAGS.gp_mean_field_factor)
      features = inputs['input_ids']
      return features, logits, labels, additional_labels

    (per_replica_texts, per_replica_logits, per_replica_labels,
     per_replica_additional_labels) = (
         strategy.run(step_fn, args=(next(iterator),)))

    if strategy.num_replicas_in_sync > 1:
      # Merge the per-replica results along the batch axis.
      texts_list = tf.concat(per_replica_texts.values, axis=0)
      logits_list = tf.concat(per_replica_logits.values, axis=0)
      labels_list = tf.concat(per_replica_labels.values, axis=0)
      additional_labels_dict = {}
      for additional_label in utils.IDENTITY_LABELS:
        if additional_label in per_replica_additional_labels:
          # NOTE(review): unlike the tensors above, this concat is not given
          # `.values` -- confirm this is intended for per-replica results.
          additional_labels_dict[additional_label] = tf.concat(
              per_replica_additional_labels[additional_label], axis=0)
    else:
      texts_list = per_replica_texts
      logits_list = per_replica_logits
      labels_list = per_replica_labels
      additional_labels_dict = {}
      for additional_label in utils.IDENTITY_LABELS:
        if additional_label in per_replica_additional_labels:
          additional_labels_dict[
              additional_label] = per_replica_additional_labels[
                  additional_label]

    return texts_list, logits_list, labels_list, additional_labels_dict

  if FLAGS.prediction_mode:
    # Prediction and exit.
    for dataset_name, test_dataset in test_datasets.items():
      test_iterator = iter(test_dataset)  # pytype: disable=wrong-arg-types
      message = 'Final eval on dataset {}'.format(dataset_name)
      logging.info(message)

      texts_all = []
      logits_all = []
      labels_all = []
      additional_labels_all_dict = {}
      if 'identity' in dataset_name:
        for identity_label_name in utils.IDENTITY_LABELS:
          additional_labels_all_dict[identity_label_name] = []

      try:
        with tf.experimental.async_scope():
          for step in range(steps_per_eval[dataset_name]):
            if step % 20 == 0:
              message = 'Starting to run eval step {}/{} of dataset: {}'.format(
                  step, steps_per_eval[dataset_name], dataset_name)
              logging.info(message)

            (text_step, logits_step, labels_step,
             additional_labels_dict_step) = final_eval_step(test_iterator)

            texts_all.append(text_step)
            logits_all.append(logits_step)
            labels_all.append(labels_step)
            if 'identity' in dataset_name:
              for identity_label_name in utils.IDENTITY_LABELS:
                additional_labels_all_dict[identity_label_name].append(
                    additional_labels_dict_step[identity_label_name])

      except (StopIteration, tf.errors.OutOfRangeError):
        # The iterator ran out before `steps_per_eval` steps; keep what was
        # collected so far.
        tf.experimental.async_clear_error()
        logging.info('Done with eval on %s', dataset_name)

      texts_all = tf.concat(texts_all, axis=0)
      logits_all = tf.concat(logits_all, axis=0)
      labels_all = tf.concat(labels_all, axis=0)
      additional_labels_all = []
      if additional_labels_all_dict:
        for identity_label_name in utils.IDENTITY_LABELS:
          additional_labels_all.append(
              tf.concat(
                  additional_labels_all_dict[identity_label_name], axis=0))
      additional_labels_all = tf.convert_to_tensor(additional_labels_all)

      utils.save_prediction(
          texts_all.numpy(),
          path=os.path.join(FLAGS.output_dir, 'texts_{}'.format(dataset_name)))
      utils.save_prediction(
          labels_all.numpy(),
          path=os.path.join(FLAGS.output_dir,
                            'labels_{}'.format(dataset_name)))
      utils.save_prediction(
          logits_all.numpy(),
          path=os.path.join(FLAGS.output_dir,
                            'logits_{}'.format(dataset_name)))
      if 'identity' in dataset_name:
        utils.save_prediction(
            additional_labels_all.numpy(),
            path=os.path.join(FLAGS.output_dir,
                              'additional_labels_{}'.format(dataset_name)))
      logging.info('Done with testing on %s', dataset_name)

  else:
    # Execute train / eval loop.
    start_time = time.time()
    train_iterators = {}
    for dataset_name, train_dataset in train_datasets.items():
      train_iterators[dataset_name] = iter(train_dataset)

    for epoch in range(initial_epoch, FLAGS.train_epochs):
      logging.info('Starting to run epoch: %s', epoch)
      for dataset_name, train_iterator in train_iterators.items():
        try:
          with tf.experimental.async_scope():
            train_step(
                train_iterator,
                dataset_name,
                dataset_steps_per_epoch[dataset_name])

            current_step = (
                epoch * total_steps_per_epoch +
                dataset_steps_per_epoch[dataset_name])
            max_steps = total_steps_per_epoch * FLAGS.train_epochs
            time_elapsed = time.time() - start_time
            steps_per_sec = float(current_step) / time_elapsed
            eta_seconds = (max_steps - current_step) / steps_per_sec
            message = ('{:.1%} completion: epoch {:d}/{:d}. {:.1f} steps/s. '
                       'ETA: {:.0f} min. Time elapsed: {:.0f} min'.format(
                           current_step / max_steps, epoch + 1,
                           FLAGS.train_epochs, steps_per_sec,
                           eta_seconds / 60, time_elapsed / 60))
            logging.info(message)

        except (StopIteration, tf.errors.OutOfRangeError):
          tf.experimental.async_clear_error()
          # This handler belongs to the training loop, so report training
          # (the message previously said "testing").
          logging.info('Done with training on %s', dataset_name)

      if epoch % FLAGS.evaluation_interval == 0:
        for dataset_name, test_dataset in test_datasets.items():
          test_iterator = iter(test_dataset)
          logging.info('Testing on dataset %s', dataset_name)

          try:
            with tf.experimental.async_scope():
              for step in range(steps_per_eval[dataset_name]):
                if step % 20 == 0:
                  logging.info('Starting to run eval step %s/%s of epoch: %s',
                               step, steps_per_eval[dataset_name], epoch)
                test_step(test_iterator, dataset_name)
          except (StopIteration, tf.errors.OutOfRangeError):
            tf.experimental.async_clear_error()
            logging.info('Done with testing on %s', dataset_name)

        logging.info('Train Loss: %.4f, ECE: %.2f, Accuracy: %.2f',
                     metrics['train/loss'].result(),
                     metrics['train/ece'].result(),
                     metrics['train/accuracy'].result())

        total_results = {
            name: metric.result() for name, metric in metrics.items()
        }
        # Metrics from Robustness Metrics (like ECE) will return a dict with a
        # single key/value, instead of a scalar.
        total_results = {
            k: (list(v.values())[0] if isinstance(v, dict) else v)
            for k, v in total_results.items()
        }
        with summary_writer.as_default():
          for name, result in total_results.items():
            tf.summary.scalar(name, result, step=epoch + 1)

        for metric in metrics.values():
          metric.reset_states()

      checkpoint_interval = min(FLAGS.checkpoint_interval, FLAGS.train_epochs)
      if checkpoint_interval > 0 and (epoch + 1) % checkpoint_interval == 0:
        checkpoint_name = checkpoint.save(
            os.path.join(FLAGS.output_dir, 'checkpoint'))
        logging.info('Saved checkpoint to %s', checkpoint_name)

    # Save model in SavedModel format on exit.
    final_save_name = os.path.join(FLAGS.output_dir, 'model')
    model.save(final_save_name)
    logging.info('Saved model to %s', final_save_name)
  with summary_writer.as_default():
    hp.hparams({
        'base_learning_rate': FLAGS.base_learning_rate,
        'one_minus_momentum': FLAGS.one_minus_momentum,
        'gp_mean_field_factor': FLAGS.gp_mean_field_factor,
    })
def main(argv): del argv # unused arg if not FLAGS.use_gpu: raise ValueError('Only GPU is currently supported.') if FLAGS.num_cores > 1: raise ValueError('Only a single accelerator is currently supported.') tf.random.set_seed(FLAGS.seed) logging.info('Model checkpoint will be saved at %s', FLAGS.output_dir) tf.io.gfile.makedirs(FLAGS.output_dir) batch_size = FLAGS.per_core_batch_size * FLAGS.num_cores test_batch_size = batch_size data_buffer_size = batch_size * 10 ind_dataset_builder = ds.WikipediaToxicityDataset( split='test', data_dir=FLAGS.in_dataset_dir, shuffle_buffer_size=data_buffer_size) ood_dataset_builder = ds.CivilCommentsDataset( split='test', data_dir=FLAGS.ood_dataset_dir, shuffle_buffer_size=data_buffer_size) ood_identity_dataset_builder = ds.CivilCommentsIdentitiesDataset( split='test', data_dir=FLAGS.identity_dataset_dir, shuffle_buffer_size=data_buffer_size) test_dataset_builders = { 'ind': ind_dataset_builder, 'ood': ood_dataset_builder, 'ood_identity': ood_identity_dataset_builder, } class_weight = utils.create_class_weight( test_dataset_builders=test_dataset_builders) logging.info('class_weight: %s', str(class_weight)) ds_info = ind_dataset_builder.tfds_info # Positive and negative classes. 
num_classes = ds_info.metadata['num_classes'] test_datasets = {} steps_per_eval = {} for dataset_name, dataset_builder in test_dataset_builders.items(): test_datasets[dataset_name] = dataset_builder.load( batch_size=test_batch_size) steps_per_eval[dataset_name] = ( dataset_builder.num_examples // test_batch_size) logging.info('Building %s model', FLAGS.model_family) bert_config_dir, _ = utils.resolve_bert_ckpt_and_config_dir( FLAGS.bert_model_type, FLAGS.bert_dir, FLAGS.bert_config_dir, FLAGS.bert_ckpt_dir) bert_config = utils.create_config(bert_config_dir) gp_layer_kwargs = dict( num_inducing=FLAGS.gp_hidden_dim, gp_kernel_scale=FLAGS.gp_scale, gp_output_bias=FLAGS.gp_bias, normalize_input=FLAGS.gp_input_normalization, gp_cov_momentum=FLAGS.gp_cov_discount_factor, gp_cov_ridge_penalty=FLAGS.gp_cov_ridge_penalty) spec_norm_kwargs = dict( iteration=FLAGS.spec_norm_iteration, norm_multiplier=FLAGS.spec_norm_bound) model, _ = ub.models.SngpBertBuilder( num_classes=num_classes, bert_config=bert_config, gp_layer_kwargs=gp_layer_kwargs, spec_norm_kwargs=spec_norm_kwargs, use_gp_layer=FLAGS.use_gp_layer, use_spec_norm_att=FLAGS.use_spec_norm_att, use_spec_norm_ffn=FLAGS.use_spec_norm_ffn, use_layer_norm_att=FLAGS.use_layer_norm_att, use_layer_norm_ffn=FLAGS.use_layer_norm_ffn, use_spec_norm_plr=FLAGS.use_spec_norm_plr) logging.info('Model input shape: %s', model.input_shape) logging.info('Model output shape: %s', model.output_shape) logging.info('Model number of weights: %s', model.count_params()) # Search for checkpoints from their index file; then remove the index suffix. 
ensemble_filenames = tf.io.gfile.glob( os.path.join(FLAGS.checkpoint_dir, '**/*.index')) ensemble_filenames = [filename[:-6] for filename in ensemble_filenames] if FLAGS.num_models > len(ensemble_filenames): raise ValueError('Number of models to be included in the ensemble ' 'should be less than total number of models in ' 'the checkpoint_dir.') ensemble_filenames = ensemble_filenames[:FLAGS.num_models] ensemble_size = len(ensemble_filenames) logging.info('Ensemble size: %s', ensemble_size) logging.info('Ensemble number of weights: %s', ensemble_size * model.count_params()) logging.info('Ensemble filenames: %s', str(ensemble_filenames)) checkpoint = tf.train.Checkpoint(model=model) # Write model predictions to files. num_datasets = len(test_datasets) for m, ensemble_filename in enumerate(ensemble_filenames): checkpoint.restore(ensemble_filename).assert_existing_objects_matched() for n, (dataset_name, test_dataset) in enumerate(test_datasets.items()): filename = '{dataset}_{member}.npy'.format(dataset=dataset_name, member=m) filename = os.path.join(FLAGS.output_dir, filename) if not tf.io.gfile.exists(filename): logits_list = [] test_iterator = iter(test_dataset) for step in range(steps_per_eval[dataset_name]): try: inputs = next(test_iterator) except StopIteration: continue features, labels, _ = utils.create_feature_and_label(inputs) logits = model(features, training=False) if isinstance(logits, (list, tuple)): # If model returns a tuple of (logits, covmat), extract both. 
logits, covmat = logits else: covmat = tf.eye(test_batch_size) if FLAGS.use_bfloat16: logits = tf.cast(logits, tf.float32) covmat = tf.cast(covmat, tf.float32) logits = ed.layers.utils.mean_field_logits( logits, covmat, mean_field_factor=FLAGS.gp_mean_field_factor_ensemble) logits_list.append(logits) logits_all = tf.concat(logits_list, axis=0) with tf.io.gfile.GFile(filename, 'w') as f: np.save(f, logits_all.numpy()) percent = (m * num_datasets + (n + 1)) / (ensemble_size * num_datasets) message = ('{:.1%} completion for prediction: ensemble member {:d}/{:d}. ' 'Dataset {:d}/{:d}'.format(percent, m + 1, ensemble_size, n + 1, num_datasets)) logging.info(message) metrics = { 'test/negative_log_likelihood': tf.keras.metrics.Mean(), 'test/auroc': tf.keras.metrics.AUC(curve='ROC'), 'test/aupr': tf.keras.metrics.AUC(curve='PR'), 'test/brier': tf.keras.metrics.MeanSquaredError(), 'test/brier_weighted': tf.keras.metrics.MeanSquaredError(), 'test/ece': um.ExpectedCalibrationError(num_bins=FLAGS.num_bins), 'test/acc': tf.keras.metrics.Accuracy(), 'test/acc_weighted': tf.keras.metrics.Accuracy(), 'test/precision': tf.keras.metrics.Precision(), 'test/recall': tf.keras.metrics.Recall(), 'test/f1': tfa_metrics.F1Score( num_classes=num_classes, average='micro', threshold=FLAGS.ece_label_threshold) } for fraction in FLAGS.fractions: metrics.update({ 'test_collab_acc/collab_acc_{}'.format(fraction): um.OracleCollaborativeAccuracy( fraction=float(fraction), num_bins=FLAGS.num_bins) }) for dataset_name, test_dataset in test_datasets.items(): if dataset_name != 'ind': metrics.update({ 'test/nll_{}'.format(dataset_name): tf.keras.metrics.Mean(), 'test/auroc_{}'.format(dataset_name): tf.keras.metrics.AUC(curve='ROC'), 'test/aupr_{}'.format(dataset_name): tf.keras.metrics.AUC(curve='PR'), 'test/brier_{}'.format(dataset_name): tf.keras.metrics.MeanSquaredError(), 'test/brier_weighted_{}'.format(dataset_name): tf.keras.metrics.MeanSquaredError(), 'test/ece_{}'.format(dataset_name): 
um.ExpectedCalibrationError(num_bins=FLAGS.num_bins), 'test/acc_weighted_{}'.format(dataset_name): tf.keras.metrics.Accuracy(), 'test/acc_{}'.format(dataset_name): tf.keras.metrics.Accuracy(), 'test/precision_{}'.format(dataset_name): tf.keras.metrics.Precision(), 'test/recall_{}'.format(dataset_name): tf.keras.metrics.Recall(), 'test/f1_{}'.format(dataset_name): tfa_metrics.F1Score( num_classes=num_classes, average='micro', threshold=FLAGS.ece_label_threshold) }) for fraction in FLAGS.fractions: metrics.update({ 'test_collab_acc/collab_acc_{}_{}'.format(fraction, dataset_name): um.OracleCollaborativeAccuracy( fraction=float(fraction), num_bins=FLAGS.num_bins) }) @tf.function def generate_sample_weight(labels, class_weight, label_threshold=0.7): """Generate sample weight for weighted accuracy calculation.""" if label_threshold != 0.7: logging.warning('The class weight was based on `label_threshold` = 0.7, ' 'and weighted accuracy/brier will be meaningless if ' '`label_threshold` is not equal to this value, which is ' 'recommended by Jigsaw Conversation AI team.') labels_int = tf.cast(labels > label_threshold, tf.int32) sample_weight = tf.gather(class_weight, labels_int) return sample_weight # Evaluate model predictions. for n, (dataset_name, test_dataset) in enumerate(test_datasets.items()): logits_dataset = [] for m in range(ensemble_size): filename = '{dataset}_{member}.npy'.format(dataset=dataset_name, member=m) filename = os.path.join(FLAGS.output_dir, filename) with tf.io.gfile.GFile(filename, 'rb') as f: logits_dataset.append(np.load(f)) logits_dataset = tf.convert_to_tensor(logits_dataset) test_iterator = iter(test_dataset) texts_list = [] logits_list = [] labels_list = [] # Use dict to collect additional labels specified by additional label names. # Here we use `OrderedDict` to get consistent ordering for this dict so # we can retrieve the predictions for each identity labels in Colab. 
additional_labels_dict = collections.OrderedDict() for step in range(steps_per_eval[dataset_name]): try: inputs = next(test_iterator) # type: Mapping[Text, tf.Tensor] # pytype: disable=annotation-type-mismatch except StopIteration: continue features, labels, additional_labels = ( utils.create_feature_and_label(inputs)) logits = logits_dataset[:, (step * batch_size):((step + 1) * batch_size)] loss_logits = tf.squeeze(logits, axis=-1) negative_log_likelihood = um.ensemble_cross_entropy( labels, loss_logits, binary=True) per_probs = tf.nn.sigmoid(logits) probs = tf.reduce_mean(per_probs, axis=0) # Cast labels to discrete for ECE computation ece_labels = tf.cast(labels > FLAGS.ece_label_threshold, tf.float32) one_hot_labels = tf.one_hot(tf.cast(ece_labels, tf.int32), depth=num_classes) ece_probs = tf.concat([1. - probs, probs], axis=1) pred_labels = tf.math.argmax(ece_probs, axis=-1) auc_probs = tf.squeeze(probs, axis=1) texts_list.append(inputs['input_ids']) logits_list.append(logits) labels_list.append(labels) if 'identity' in dataset_name: for identity_label_name in utils.IDENTITY_LABELS: if identity_label_name not in additional_labels_dict: additional_labels_dict[identity_label_name] = [] additional_labels_dict[identity_label_name].append( additional_labels[identity_label_name].numpy()) sample_weight = generate_sample_weight( labels, class_weight['test/{}'.format(dataset_name)], FLAGS.ece_label_threshold) if dataset_name == 'ind': metrics['test/negative_log_likelihood'].update_state( negative_log_likelihood) metrics['test/auroc'].update_state(labels, auc_probs) metrics['test/aupr'].update_state(labels, auc_probs) metrics['test/brier'].update_state(labels, auc_probs) metrics['test/brier_weighted'].update_state( tf.expand_dims(labels, -1), probs, sample_weight=sample_weight) metrics['test/ece'].add_batch(ece_probs, label=ece_labels) metrics['test/acc'].update_state(ece_labels, pred_labels) metrics['test/acc_weighted'].update_state( ece_labels, pred_labels, 
sample_weight=sample_weight) metrics['test/precision'].update_state(ece_labels, pred_labels) metrics['test/recall'].update_state(ece_labels, pred_labels) metrics['test/f1'].update_state(one_hot_labels, ece_probs) for fraction in FLAGS.fractions: metrics['test_collab_acc/collab_acc_{}'.format( fraction)].update_state(ece_labels, ece_probs) else: metrics['test/nll_{}'.format(dataset_name)].update_state( negative_log_likelihood) metrics['test/auroc_{}'.format(dataset_name)].update_state( labels, auc_probs) metrics['test/aupr_{}'.format(dataset_name)].update_state( labels, auc_probs) metrics['test/brier_{}'.format(dataset_name)].update_state( labels, auc_probs) metrics['test/brier_weighted_{}'.format(dataset_name)].update_state( tf.expand_dims(labels, -1), probs, sample_weight=sample_weight) metrics['test/ece_{}'.format(dataset_name)].add_batch( ece_probs, label=ece_labels) metrics['test/acc_{}'.format(dataset_name)].update_state( ece_labels, pred_labels) metrics['test/acc_weighted_{}'.format(dataset_name)].update_state( ece_labels, pred_labels, sample_weight=sample_weight) metrics['test/precision_{}'.format(dataset_name)].update_state( ece_labels, pred_labels) metrics['test/recall_{}'.format(dataset_name)].update_state( ece_labels, pred_labels) metrics['test/f1_{}'.format(dataset_name)].update_state( one_hot_labels, ece_probs) for fraction in FLAGS.fractions: metrics['test_collab_acc/collab_acc_{}_{}'.format( fraction, dataset_name)].update_state(ece_labels, ece_probs) texts_all = tf.concat(texts_list, axis=0) logits_all = tf.concat(logits_list, axis=1) labels_all = tf.concat(labels_list, axis=0) additional_labels_all = [] if additional_labels_dict: additional_labels_all = list(additional_labels_dict.values()) utils.save_prediction( texts_all.numpy(), path=os.path.join(FLAGS.output_dir, 'texts_{}'.format(dataset_name))) utils.save_prediction( labels_all.numpy(), path=os.path.join(FLAGS.output_dir, 'labels_{}'.format(dataset_name))) utils.save_prediction( 
logits_all.numpy(), path=os.path.join(FLAGS.output_dir, 'logits_{}'.format(dataset_name))) if 'identity' in dataset_name: utils.save_prediction( np.array(additional_labels_all), path=os.path.join(FLAGS.output_dir, 'additional_labels_{}'.format(dataset_name))) message = ('{:.1%} completion for evaluation: dataset {:d}/{:d}'.format( (n + 1) / num_datasets, n + 1, num_datasets)) logging.info(message) total_results = {name: metric.result() for name, metric in metrics.items()} # Metrics from Robustness Metrics (like ECE) will return a dict with a # single key/value, instead of a scalar. total_results = { k: (list(v.values())[0] if isinstance(v, dict) else v) for k, v in total_results.items() } logging.info('Metrics: %s', total_results)
def main(argv):
  """Evaluate an ensemble of saved BERT checkpoints on the toxicity test sets.

  The function runs in two phases:

  1. Prediction: every checkpoint found under `FLAGS.checkpoint_dir` (up to
     `FLAGS.num_models`) is restored in turn, and its logits on each test set
     are written to `<output_dir>/<dataset>_<member>.npy`. A file that already
     exists is not recomputed.
  2. Evaluation: the per-member logits are loaded back, the sigmoid
     probabilities are averaged over ensemble members, the metrics are
     updated, and texts / labels / logits (plus identity labels for datasets
     whose name contains 'identity') are saved via
     `deterministic.save_prediction`.

  Args:
    argv: Command-line arguments; unused.

  Raises:
    ValueError: If `FLAGS.use_gpu` is false, if `FLAGS.num_cores > 1`, or if
      `FLAGS.num_models` exceeds the number of checkpoints found.
  """
  del argv  # unused arg
  if not FLAGS.use_gpu:
    raise ValueError('Only GPU is currently supported.')
  if FLAGS.num_cores > 1:
    raise ValueError('Only a single accelerator is currently supported.')
  tf.random.set_seed(FLAGS.seed)
  logging.info('Model checkpoint will be saved at %s', FLAGS.output_dir)
  tf.io.gfile.makedirs(FLAGS.output_dir)

  batch_size = FLAGS.per_core_batch_size * FLAGS.num_cores
  test_batch_size = batch_size
  data_buffer_size = batch_size * 10

  # One builder per evaluation set: in-domain ('ind') and two
  # out-of-domain sets ('ood', 'ood_identity').
  ind_dataset_builder = ds.WikipediaToxicityDataset(
      batch_size=batch_size,
      eval_batch_size=test_batch_size,
      data_dir=FLAGS.in_dataset_dir,
      shuffle_buffer_size=data_buffer_size)
  ood_dataset_builder = ds.CivilCommentsDataset(
      batch_size=batch_size,
      eval_batch_size=test_batch_size,
      data_dir=FLAGS.ood_dataset_dir,
      shuffle_buffer_size=data_buffer_size)
  ood_identity_dataset_builder = ds.CivilCommentsIdentitiesDataset(
      batch_size=batch_size,
      eval_batch_size=test_batch_size,
      data_dir=FLAGS.identity_dataset_dir,
      shuffle_buffer_size=data_buffer_size)

  dataset_builders = {
      'ind': ind_dataset_builder,
      'ood': ood_dataset_builder,
      'ood_identity': ood_identity_dataset_builder,
  }

  ds_info = ind_dataset_builder.info
  feature_size = _MAX_SEQ_LENGTH
  num_classes = ds_info['num_classes']  # Positive and negative classes.

  # Build the test split of every dataset; only full batches are evaluated
  # (floor division drops any remainder).
  test_datasets = {}
  steps_per_eval = {}
  for dataset_name, dataset_builder in dataset_builders.items():
    test_datasets[dataset_name] = dataset_builder.build(
        split=base.Split.TEST)
    steps_per_eval[dataset_name] = (
        dataset_builder.info['num_test_examples'] // test_batch_size)

  logging.info('Building %s model', FLAGS.model_family)
  # NOTE(review): `bert_ckpt_dir` and `bert_encoder` are not used again in
  # this function; the weights come from the ensemble checkpoints below.
  bert_config_dir, bert_ckpt_dir = deterministic.resolve_bert_ckpt_and_config_dir(
      FLAGS.bert_dir, FLAGS.bert_config_dir, FLAGS.bert_ckpt_dir)
  bert_config = bert_utils.create_config(bert_config_dir)
  model, bert_encoder = ub.models.BertBuilder(
      num_classes=num_classes,
      max_seq_length=feature_size,
      bert_config=bert_config)

  logging.info('Model input shape: %s', model.input_shape)
  logging.info('Model output shape: %s', model.output_shape)
  logging.info('Model number of weights: %s', model.count_params())

  # Search for checkpoints from their index file; then remove the index suffix.
  ensemble_filenames = tf.io.gfile.glob(
      os.path.join(FLAGS.checkpoint_dir, '**/*.index'))
  # '.index' is six characters long, hence the [:-6] slice.
  ensemble_filenames = [filename[:-6] for filename in ensemble_filenames]
  # NOTE(review): the message says "less than", but the check also accepts
  # num_models equal to the number of checkpoints found.
  if FLAGS.num_models > len(ensemble_filenames):
    raise ValueError('Number of models to be included in the ensemble '
                     'should be less than total number of models in '
                     'the checkpoint_dir.')
  ensemble_filenames = ensemble_filenames[:FLAGS.num_models]
  ensemble_size = len(ensemble_filenames)
  logging.info('Ensemble size: %s', ensemble_size)
  logging.info('Ensemble number of weights: %s',
               ensemble_size * model.count_params())
  logging.info('Ensemble filenames: %s', str(ensemble_filenames))
  checkpoint = tf.train.Checkpoint(model=model)

  # Write model predictions to files.
  num_datasets = len(test_datasets)
  for m, ensemble_filename in enumerate(ensemble_filenames):
    checkpoint.restore(ensemble_filename).assert_existing_objects_matched()
    for n, (dataset_name, test_dataset) in enumerate(test_datasets.items()):
      filename = '{dataset}_{member}.npy'.format(dataset=dataset_name, member=m)
      filename = os.path.join(FLAGS.output_dir, filename)
      # An existing file acts as a cache: this member/dataset pair is skipped.
      if not tf.io.gfile.exists(filename):
        logits = []
        test_iterator = iter(test_dataset)
        for step in range(steps_per_eval[dataset_name]):
          try:
            inputs = next(test_iterator)
          except StopIteration:
            # NOTE(review): once the iterator is exhausted every remaining
            # step will land here as well.
            continue
          features, labels, _ = deterministic.create_feature_and_label(inputs)
          logits.append(model(features, training=False))

        # Batches are concatenated along axis 0 before saving.
        logits = tf.concat(logits, axis=0)
        with tf.io.gfile.GFile(filename, 'w') as f:
          np.save(f, logits.numpy())
      percent = (m * num_datasets + (n + 1)) / (ensemble_size * num_datasets)
      message = ('{:.1%} completion for prediction: ensemble member {:d}/{:d}. '
                 'Dataset {:d}/{:d}'.format(percent, m + 1, ensemble_size, n + 1,
                                            num_datasets))
      logging.info(message)

  # Metrics for the in-domain set use un-suffixed names; every other dataset
  # gets its own set of metrics suffixed with the dataset name.
  metrics = {
      'test/negative_log_likelihood': tf.keras.metrics.Mean(),
      'test/auroc': tf.keras.metrics.AUC(curve='ROC'),
      'test/aupr': tf.keras.metrics.AUC(curve='PR'),
      'test/brier': tf.keras.metrics.MeanSquaredError(),
      'test/ece': um.ExpectedCalibrationError(num_bins=FLAGS.num_bins),
  }
  for fraction in FLAGS.fractions:
    metrics.update({
        'test_collab_acc/collab_acc_{}'.format(fraction):
            um.OracleCollaborativeAccuracy(
                fraction=float(fraction), num_bins=FLAGS.num_bins)
    })
  for dataset_name, test_dataset in test_datasets.items():
    if dataset_name != 'ind':
      metrics.update({
          'test/nll_{}'.format(dataset_name):
              tf.keras.metrics.Mean(),
          'test/auroc_{}'.format(dataset_name):
              tf.keras.metrics.AUC(curve='ROC'),
          'test/aupr_{}'.format(dataset_name):
              tf.keras.metrics.AUC(curve='PR'),
          'test/brier_{}'.format(dataset_name):
              tf.keras.metrics.MeanSquaredError(),
          'test/ece_{}'.format(dataset_name):
              um.ExpectedCalibrationError(num_bins=FLAGS.num_bins),
      })
      for fraction in FLAGS.fractions:
        metrics.update({
            'test_collab_acc/collab_acc_{}_{}'.format(fraction, dataset_name):
                um.OracleCollaborativeAccuracy(
                    fraction=float(fraction), num_bins=FLAGS.num_bins)
        })

  # Evaluate model predictions.
  for n, (dataset_name, test_dataset) in enumerate(test_datasets.items()):
    # Load the cached logits of every member; stacking the list puts the
    # ensemble member on axis 0 and the example on axis 1.
    logits_dataset = []
    for m in range(ensemble_size):
      filename = '{dataset}_{member}.npy'.format(dataset=dataset_name, member=m)
      filename = os.path.join(FLAGS.output_dir, filename)
      with tf.io.gfile.GFile(filename, 'rb') as f:
        logits_dataset.append(np.load(f))

    logits_dataset = tf.convert_to_tensor(logits_dataset)
    test_iterator = iter(test_dataset)
    texts_list = []
    logits_list = []
    labels_list = []
    # Use dict to collect additional labels specified by additional label names.
    # Here we use `OrderedDict` to get consistent ordering for this dict so
    # we can retrieve the predictions for each identity labels in Colab.
    additional_labels_dict = collections.OrderedDict()
    for step in range(steps_per_eval[dataset_name]):
      try:
        inputs = next(test_iterator)  # type: Mapping[Text, tf.Tensor]
      except StopIteration:
        continue
      # NOTE(review): `features` is unpacked but not used in this loop; the
      # predictions come from the cached logits.
      features, labels, additional_labels = (
          deterministic.create_feature_and_label(inputs))
      # Select this step's examples from the cached logits. The slice relies
      # on the test iterator yielding batches in the same order as in the
      # prediction phase (presumably the test split is not shuffled -- TODO
      # confirm).
      logits = logits_dataset[:, (step * batch_size):((step + 1) * batch_size)]
      loss_logits = tf.squeeze(logits, axis=-1)
      negative_log_likelihood = um.ensemble_cross_entropy(
          labels, loss_logits, binary=True)

      # Ensemble prediction: mean of per-member sigmoid probabilities.
      per_probs = tf.nn.sigmoid(logits)
      probs = tf.reduce_mean(per_probs, axis=0)
      # Cast labels to discrete for ECE computation
      ece_labels = tf.cast(labels > FLAGS.ece_label_threshold, tf.float32)
      # Two-column [P(negative), P(positive)] form used by the ECE and
      # collaborative-accuracy metrics.
      ece_probs = tf.concat([1. - probs, probs], axis=1)
      auc_probs = tf.squeeze(probs, axis=1)

      texts_list.append(inputs['input_ids'])
      logits_list.append(logits)
      labels_list.append(labels)
      if 'identity' in dataset_name:
        for identity_label_name in deterministic._IDENTITY_LABELS:  # pylint: disable=protected-access
          if identity_label_name not in additional_labels_dict:
            additional_labels_dict[identity_label_name] = []
          additional_labels_dict[identity_label_name].append(
              additional_labels[identity_label_name].numpy())

      if dataset_name == 'ind':
        metrics['test/negative_log_likelihood'].update_state(
            negative_log_likelihood)
        metrics['test/auroc'].update_state(labels, auc_probs)
        metrics['test/aupr'].update_state(labels, auc_probs)
        metrics['test/brier'].update_state(labels, auc_probs)
        metrics['test/ece'].update_state(ece_labels, ece_probs)
        for fraction in FLAGS.fractions:
          metrics['test_collab_acc/collab_acc_{}'.format(
              fraction)].update_state(ece_labels, ece_probs)
      else:
        metrics['test/nll_{}'.format(dataset_name)].update_state(
            negative_log_likelihood)
        metrics['test/auroc_{}'.format(dataset_name)].update_state(
            labels, auc_probs)
        metrics['test/aupr_{}'.format(dataset_name)].update_state(
            labels, auc_probs)
        metrics['test/brier_{}'.format(dataset_name)].update_state(
            labels, auc_probs)
        metrics['test/ece_{}'.format(dataset_name)].update_state(
            ece_labels, ece_probs)
        for fraction in FLAGS.fractions:
          metrics['test_collab_acc/collab_acc_{}_{}'.format(
              fraction, dataset_name)].update_state(ece_labels, ece_probs)

    # Logits keep the ensemble axis first, so batches are joined on axis 1;
    # texts and labels are joined on the example axis 0.
    texts_all = tf.concat(texts_list, axis=0)
    logits_all = tf.concat(logits_list, axis=1)
    labels_all = tf.concat(labels_list, axis=0)
    additional_labels_all = []
    if additional_labels_dict:
      additional_labels_all = list(additional_labels_dict.values())

    deterministic.save_prediction(
        texts_all.numpy(),
        path=os.path.join(FLAGS.output_dir, 'texts_{}'.format(dataset_name)))
    deterministic.save_prediction(
        labels_all.numpy(),
        path=os.path.join(FLAGS.output_dir, 'labels_{}'.format(dataset_name)))
    deterministic.save_prediction(
        logits_all.numpy(),
        path=os.path.join(FLAGS.output_dir, 'logits_{}'.format(dataset_name)))
    if 'identity' in dataset_name:
      deterministic.save_prediction(
          np.array(additional_labels_all),
          path=os.path.join(FLAGS.output_dir,
                            'additional_labels_{}'.format(dataset_name)))

    message = ('{:.1%} completion for evaluation: dataset {:d}/{:d}'.format(
        (n + 1) / num_datasets, n + 1, num_datasets))
    logging.info(message)

  # Final metric values are only logged; nothing is written to a summary.
  total_results = {name: metric.result() for name, metric in metrics.items()}
  logging.info('Metrics: %s', total_results)
def main(argv):
  """Train, evaluate, or run prediction with an MC-dropout BERT classifier.

  Builds the Wikipedia-toxicity training set and three test sets ('ind',
  'ood', 'ood_identity'), creates the model and optimizer under a
  distribution strategy, and restores the latest checkpoint if one exists.

  * With `FLAGS.prediction_mode` set, the model is run over every test set
    and texts, labels, logits (and identity labels for datasets whose name
    contains 'identity') are saved to `FLAGS.output_dir`.
  * Otherwise the train/eval loop runs from the restored epoch up to
    `FLAGS.train_epochs`, writing summaries and checkpoints, and finally
    saves the model under `<output_dir>/model`.

  Args:
    argv: Command-line arguments; unused.
  """
  del argv  # unused arg
  tf.io.gfile.makedirs(FLAGS.output_dir)
  logging.info('Model checkpoint will be saved at %s', FLAGS.output_dir)
  tf.random.set_seed(FLAGS.seed)

  # Pick the distribution strategy: mirrored GPUs or a TPU cluster.
  if FLAGS.use_gpu:
    logging.info('Use GPU')
    strategy = tf.distribute.MirroredStrategy()
  else:
    logging.info('Use TPU at %s',
                 FLAGS.tpu if FLAGS.tpu is not None else 'local')
    resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
        tpu=FLAGS.tpu)
    tf.config.experimental_connect_to_cluster(resolver)
    tf.tpu.experimental.initialize_tpu_system(resolver)
    strategy = tf.distribute.TPUStrategy(resolver)

  # Global batch size across all cores.
  batch_size = FLAGS.per_core_batch_size * FLAGS.num_cores
  test_batch_size = batch_size
  data_buffer_size = batch_size * 10

  train_dataset_builder = ds.WikipediaToxicityDataset(
      split='train',
      data_dir=FLAGS.in_dataset_dir,
      shuffle_buffer_size=data_buffer_size)
  ind_dataset_builder = ds.WikipediaToxicityDataset(
      split='test',
      data_dir=FLAGS.in_dataset_dir,
      shuffle_buffer_size=data_buffer_size)
  ood_dataset_builder = ds.CivilCommentsDataset(
      split='test',
      data_dir=FLAGS.ood_dataset_dir,
      shuffle_buffer_size=data_buffer_size)
  ood_identity_dataset_builder = ds.CivilCommentsIdentitiesDataset(
      split='test',
      data_dir=FLAGS.identity_dataset_dir,
      shuffle_buffer_size=data_buffer_size)

  train_dataset_builders = {
      'wikipedia_toxicity_subtypes': train_dataset_builder
  }
  test_dataset_builders = {
      'ind': ind_dataset_builder,
      'ood': ood_dataset_builder,
      'ood_identity': ood_identity_dataset_builder,
  }

  # `class_weight` is later indexed with 'train/<name>' and 'test/<name>'
  # keys to obtain per-class weights for the weighted metrics.
  class_weight = utils.create_class_weight(train_dataset_builders,
                                           test_dataset_builders)
  logging.info('class_weight: %s', str(class_weight))

  ds_info = train_dataset_builder.tfds_info
  # Positive and negative classes.
  num_classes = ds_info.metadata['num_classes']

  # Load training data and count the full batches per epoch.
  train_datasets = {}
  dataset_steps_per_epoch = {}
  total_steps_per_epoch = 0
  for dataset_name, dataset_builder in train_dataset_builders.items():
    train_datasets[dataset_name] = dataset_builder.load(
        batch_size=batch_size)
    dataset_steps_per_epoch[dataset_name] = (
        dataset_builder.num_examples // batch_size)
    total_steps_per_epoch += dataset_steps_per_epoch[dataset_name]

  test_datasets = {}
  steps_per_eval = {}
  for dataset_name, dataset_builder in test_dataset_builders.items():
    test_datasets[dataset_name] = dataset_builder.load(
        batch_size=test_batch_size)
    steps_per_eval[dataset_name] = (dataset_builder.num_examples //
                                    test_batch_size)

  if FLAGS.use_bfloat16:
    policy = tf.keras.mixed_precision.experimental.Policy('mixed_bfloat16')
    tf.keras.mixed_precision.experimental.set_policy(policy)

  summary_writer = tf.summary.create_file_writer(
      os.path.join(FLAGS.output_dir, 'summaries'))

  with strategy.scope():
    logging.info('Building %s model', FLAGS.model_family)

    bert_config_dir, bert_ckpt_dir = utils.resolve_bert_ckpt_and_config_dir(
        FLAGS.bert_model_type, FLAGS.bert_dir, FLAGS.bert_config_dir,
        FLAGS.bert_ckpt_dir)
    bert_config = utils.create_config(bert_config_dir)
    model, bert_encoder = ub.models.DropoutBertBuilder(
        num_classes=num_classes,
        bert_config=bert_config,
        use_mc_dropout_mha=FLAGS.use_mc_dropout_mha,
        use_mc_dropout_att=FLAGS.use_mc_dropout_att,
        use_mc_dropout_ffn=FLAGS.use_mc_dropout_ffn,
        use_mc_dropout_output=FLAGS.use_mc_dropout_output,
        channel_wise_dropout_mha=FLAGS.channel_wise_dropout_mha,
        channel_wise_dropout_att=FLAGS.channel_wise_dropout_att,
        channel_wise_dropout_ffn=FLAGS.channel_wise_dropout_ffn)

    optimizer = utils.create_optimizer(
        FLAGS.base_learning_rate,
        steps_per_epoch=total_steps_per_epoch,
        epochs=FLAGS.train_epochs,
        warmup_proportion=FLAGS.warmup_proportion)

    logging.info('Model input shape: %s', model.input_shape)
    logging.info('Model output shape: %s', model.output_shape)
    logging.info('Model number of weights: %s', model.count_params())

    metrics = {
        'train/negative_log_likelihood': tf.keras.metrics.Mean(),
        'train/accuracy': tf.keras.metrics.Accuracy(),
        'train/accuracy_weighted': tf.keras.metrics.Accuracy(),
        'train/auroc': tf.keras.metrics.AUC(),
        'train/loss': tf.keras.metrics.Mean(),
        'train/ece': um.ExpectedCalibrationError(num_bins=FLAGS.num_bins),
        'train/precision': tf.keras.metrics.Precision(),
        'train/recall': tf.keras.metrics.Recall(),
        'train/f1': tfa_metrics.F1Score(num_classes=num_classes,
                                        average='micro',
                                        threshold=FLAGS.ece_label_threshold),
    }

    # In prediction mode the weights come from `eval_checkpoint_dir`;
    # otherwise training resumes from the latest checkpoint in `output_dir`.
    checkpoint = tf.train.Checkpoint(model=model, optimizer=optimizer)
    if FLAGS.prediction_mode:
      latest_checkpoint = tf.train.latest_checkpoint(
          FLAGS.eval_checkpoint_dir)
    else:
      latest_checkpoint = tf.train.latest_checkpoint(FLAGS.output_dir)
    initial_epoch = 0
    if latest_checkpoint:
      # checkpoint.restore must be within a strategy.scope() so that optimizer
      # slot variables are mirrored.
      checkpoint.restore(latest_checkpoint)
      logging.info('Loaded checkpoint %s', latest_checkpoint)
      # Resume from the epoch implied by the restored optimizer step count.
      initial_epoch = optimizer.iterations.numpy(
      ) // total_steps_per_epoch
    elif FLAGS.model_family.lower() == 'bert':
      # load BERT from initial checkpoint
      bert_checkpoint = tf.train.Checkpoint(model=bert_encoder)
      bert_checkpoint.restore(
          bert_ckpt_dir).assert_existing_objects_matched()
      logging.info('Loaded BERT checkpoint %s', bert_ckpt_dir)

    # Test metrics: un-suffixed names for the in-domain set, dataset-name
    # suffixed copies for every other test set.
    # NOTE(review): this file's indentation was lost; these metrics are placed
    # inside strategy.scope() because test_step updates them from
    # strategy.run -- confirm against the original layout.
    metrics.update({
        'test/negative_log_likelihood': tf.keras.metrics.Mean(),
        'test/auroc': tf.keras.metrics.AUC(curve='ROC'),
        'test/aupr': tf.keras.metrics.AUC(curve='PR'),
        'test/brier': tf.keras.metrics.MeanSquaredError(),
        'test/brier_weighted': tf.keras.metrics.MeanSquaredError(),
        'test/ece': um.ExpectedCalibrationError(num_bins=FLAGS.num_bins),
        'test/acc': tf.keras.metrics.Accuracy(),
        'test/acc_weighted': tf.keras.metrics.Accuracy(),
        'test/eval_time': tf.keras.metrics.Mean(),
        'test/precision': tf.keras.metrics.Precision(),
        'test/recall': tf.keras.metrics.Recall(),
        'test/f1': tfa_metrics.F1Score(num_classes=num_classes,
                                       average='micro',
                                       threshold=FLAGS.ece_label_threshold),
    })
    for fraction in FLAGS.fractions:
      metrics.update({
          'test_collab_acc/collab_acc_{}'.format(fraction):
              um.OracleCollaborativeAccuracy(fraction=float(fraction),
                                             num_bins=FLAGS.num_bins)
      })
    for dataset_name, test_dataset in test_datasets.items():
      if dataset_name != 'ind':
        metrics.update({
            'test/nll_{}'.format(dataset_name):
                tf.keras.metrics.Mean(),
            'test/auroc_{}'.format(dataset_name):
                tf.keras.metrics.AUC(curve='ROC'),
            'test/aupr_{}'.format(dataset_name):
                tf.keras.metrics.AUC(curve='PR'),
            'test/brier_{}'.format(dataset_name):
                tf.keras.metrics.MeanSquaredError(),
            'test/brier_weighted_{}'.format(dataset_name):
                tf.keras.metrics.MeanSquaredError(),
            'test/ece_{}'.format(dataset_name):
                um.ExpectedCalibrationError(num_bins=FLAGS.num_bins),
            'test/acc_{}'.format(dataset_name):
                tf.keras.metrics.Accuracy(),
            'test/acc_weighted_{}'.format(dataset_name):
                tf.keras.metrics.Accuracy(),
            'test/eval_time_{}'.format(dataset_name):
                tf.keras.metrics.Mean(),
            'test/precision_{}'.format(dataset_name):
                tf.keras.metrics.Precision(),
            'test/recall_{}'.format(dataset_name):
                tf.keras.metrics.Recall(),
            'test/f1_{}'.format(dataset_name):
                tfa_metrics.F1Score(num_classes=num_classes,
                                    average='micro',
                                    threshold=FLAGS.ece_label_threshold),
        })
        for fraction in FLAGS.fractions:
          metrics.update({
              'test_collab_acc/collab_acc_{}_{}'.format(
                  fraction, dataset_name):
                  um.OracleCollaborativeAccuracy(
                      fraction=float(fraction), num_bins=FLAGS.num_bins)
          })

  @tf.function
  def generate_sample_weight(labels, class_weight, label_threshold=0.7):
    """Generate sample weight for weighted accuracy calculation.

    Labels above `label_threshold` are mapped to index 1 and all others to
    index 0; the weight for each example is then gathered from
    `class_weight` at that index.
    """
    if label_threshold != 0.7:
      logging.warning(
          'The class weight was based on `label_threshold` = 0.7, '
          'and weighted accuracy/brier will be meaningless if '
          '`label_threshold` is not equal to this value, which is '
          'recommended by Jigsaw Conversation AI team.')
    labels_int = tf.cast(labels > label_threshold, tf.int32)
    sample_weight = tf.gather(class_weight, labels_int)
    return sample_weight

  @tf.function
  def train_step(iterator, dataset_name):
    """Training StepFn: runs one optimizer step on the next batch."""

    def step_fn(inputs):
      """Per-Replica StepFn."""
      features, labels, _ = utils.create_feature_and_label(inputs)

      with tf.GradientTape() as tape:
        logits = model(features, training=True)

        if FLAGS.use_bfloat16:
          logits = tf.cast(logits, tf.float32)

        loss_logits = tf.squeeze(logits, axis=1)
        # NOTE(review): there is no fallback branch; an unrecognised
        # `loss_type` would leave `negative_log_likelihood` unbound --
        # presumably the flag definition restricts the values, TODO confirm.
        if FLAGS.loss_type == 'cross_entropy':
          logging.info('Using cross entropy loss')
          negative_log_likelihood = tf.nn.sigmoid_cross_entropy_with_logits(
              labels, loss_logits)
        elif FLAGS.loss_type == 'focal_cross_entropy':
          logging.info('Using focal cross entropy loss')
          negative_log_likelihood = tfa_losses.sigmoid_focal_crossentropy(
              labels,
              loss_logits,
              alpha=FLAGS.focal_loss_alpha,
              gamma=FLAGS.focal_loss_gamma,
              from_logits=True)
        elif FLAGS.loss_type == 'mse':
          logging.info('Using mean squared error loss')
          loss_probs = tf.nn.sigmoid(loss_logits)
          negative_log_likelihood = tf.keras.losses.mean_squared_error(
              labels, loss_probs)
        elif FLAGS.loss_type == 'mae':
          logging.info('Using mean absolute error loss')
          loss_probs = tf.nn.sigmoid(loss_logits)
          negative_log_likelihood = tf.keras.losses.mean_absolute_error(
              labels, loss_probs)

        negative_log_likelihood = tf.reduce_mean(
            negative_log_likelihood)

        # Add the model's own (regularization) losses to the data loss.
        l2_loss = sum(model.losses)
        loss = negative_log_likelihood + l2_loss
        # Scale the loss given the TPUStrategy will reduce sum all gradients.
        scaled_loss = loss / strategy.num_replicas_in_sync

      grads = tape.gradient(scaled_loss, model.trainable_variables)
      optimizer.apply_gradients(zip(grads, model.trainable_variables))

      probs = tf.nn.sigmoid(logits)
      # Cast labels to discrete for ECE computation.
      ece_labels = tf.cast(labels > FLAGS.ece_label_threshold, tf.float32)
      one_hot_labels = tf.one_hot(tf.cast(ece_labels, tf.int32),
                                  depth=num_classes)
      # Two-column [P(negative), P(positive)] form of the predictions.
      ece_probs = tf.concat([1. - probs, probs], axis=1)
      auc_probs = tf.squeeze(probs, axis=1)
      pred_labels = tf.math.argmax(ece_probs, axis=-1)

      sample_weight = generate_sample_weight(
          labels, class_weight['train/{}'.format(dataset_name)],
          FLAGS.ece_label_threshold)
      metrics['train/negative_log_likelihood'].update_state(
          negative_log_likelihood)
      # NOTE(review): this passes the raw `labels`, while the weighted
      # accuracy below uses the thresholded `ece_labels` -- confirm which is
      # intended.
      metrics['train/accuracy'].update_state(labels, pred_labels)
      metrics['train/accuracy_weighted'].update_state(
          ece_labels, pred_labels, sample_weight=sample_weight)
      metrics['train/auroc'].update_state(labels, auc_probs)
      metrics['train/loss'].update_state(loss)
      metrics['train/ece'].update_state(ece_labels, ece_probs)
      metrics['train/precision'].update_state(ece_labels, pred_labels)
      metrics['train/recall'].update_state(ece_labels, pred_labels)
      metrics['train/f1'].update_state(one_hot_labels, ece_probs)

    strategy.run(step_fn, args=(next(iterator), ))

  @tf.function
  def test_step(iterator, dataset_name):
    """Evaluation StepFn to log metrics."""

    def step_fn(inputs):
      """Per-Replica StepFn."""
      features, labels, _ = utils.create_feature_and_label(inputs)

      # NOTE(review): time.time() is a Python call inside a tf.function, so it
      # likely only runs while the function is traced -- confirm that
      # `eval_time` measures what is intended.
      eval_start_time = time.time()
      logits = model(features, training=False)
      eval_time = (time.time() -
                   eval_start_time) / FLAGS.per_core_batch_size

      if FLAGS.use_bfloat16:
        logits = tf.cast(logits, tf.float32)
      probs = tf.nn.sigmoid(logits)
      # Cast labels to discrete for ECE computation.
      ece_labels = tf.cast(labels > FLAGS.ece_label_threshold, tf.float32)
      one_hot_labels = tf.one_hot(tf.cast(ece_labels, tf.int32),
                                  depth=num_classes)
      ece_probs = tf.concat([1. - probs, probs], axis=1)
      pred_labels = tf.math.argmax(ece_probs, axis=-1)
      auc_probs = tf.squeeze(probs, axis=1)

      loss_logits = tf.squeeze(logits, axis=1)
      negative_log_likelihood = tf.reduce_mean(
          tf.nn.sigmoid_cross_entropy_with_logits(labels, loss_logits))

      sample_weight = generate_sample_weight(
          labels, class_weight['test/{}'.format(dataset_name)],
          FLAGS.ece_label_threshold)
      # In-domain results go to the un-suffixed metrics; all other datasets
      # use the metrics suffixed with their name.
      if dataset_name == 'ind':
        metrics['test/negative_log_likelihood'].update_state(
            negative_log_likelihood)
        metrics['test/auroc'].update_state(labels, auc_probs)
        metrics['test/aupr'].update_state(labels, auc_probs)
        metrics['test/brier'].update_state(labels, auc_probs)
        metrics['test/brier_weighted'].update_state(
            tf.expand_dims(labels, -1), probs, sample_weight=sample_weight)
        metrics['test/ece'].update_state(ece_labels, ece_probs)
        metrics['test/acc'].update_state(ece_labels, pred_labels)
        metrics['test/acc_weighted'].update_state(
            ece_labels, pred_labels, sample_weight=sample_weight)
        metrics['test/eval_time'].update_state(eval_time)
        metrics['test/precision'].update_state(ece_labels, pred_labels)
        metrics['test/recall'].update_state(ece_labels, pred_labels)
        metrics['test/f1'].update_state(one_hot_labels, ece_probs)
        for fraction in FLAGS.fractions:
          metrics['test_collab_acc/collab_acc_{}'.format(
              fraction)].update_state(ece_labels, ece_probs)
      else:
        metrics['test/nll_{}'.format(dataset_name)].update_state(
            negative_log_likelihood)
        metrics['test/auroc_{}'.format(dataset_name)].update_state(
            labels, auc_probs)
        metrics['test/aupr_{}'.format(dataset_name)].update_state(
            labels, auc_probs)
        metrics['test/brier_{}'.format(dataset_name)].update_state(
            labels, auc_probs)
        metrics['test/brier_weighted_{}'.format(
            dataset_name)].update_state(tf.expand_dims(labels, -1),
                                        probs,
                                        sample_weight=sample_weight)
        metrics['test/ece_{}'.format(dataset_name)].update_state(
            ece_labels, ece_probs)
        metrics['test/acc_{}'.format(dataset_name)].update_state(
            ece_labels, pred_labels)
        metrics['test/acc_weighted_{}'.format(
            dataset_name)].update_state(ece_labels,
                                        pred_labels,
                                        sample_weight=sample_weight)
        metrics['test/eval_time_{}'.format(dataset_name)].update_state(
            eval_time)
        metrics['test/precision_{}'.format(dataset_name)].update_state(
            ece_labels, pred_labels)
        metrics['test/recall_{}'.format(dataset_name)].update_state(
            ece_labels, pred_labels)
        metrics['test/f1_{}'.format(dataset_name)].update_state(
            one_hot_labels, ece_probs)
        for fraction in FLAGS.fractions:
          metrics['test_collab_acc/collab_acc_{}_{}'.format(
              fraction, dataset_name)].update_state(ece_labels, ece_probs)

    strategy.run(step_fn, args=(next(iterator), ))

  @tf.function
  def final_eval_step(iterator):
    """Final Evaluation StepFn to save prediction to directory.

    Returns a tuple (texts, logits, labels, additional_labels_dict) for the
    next batch, with per-replica results concatenated along axis 0 when more
    than one replica is in sync.
    """

    def step_fn(inputs):
      bert_features, labels, additional_labels = utils.create_feature_and_label(
          inputs)
      logits = model(bert_features, training=False)
      # The raw token ids are returned as the "text" of each example.
      features = inputs['input_ids']
      return features, logits, labels, additional_labels

    (per_replica_texts, per_replica_logits, per_replica_labels,
     per_replica_additional_labels) = (strategy.run(
         step_fn, args=(next(iterator), )))

    if strategy.num_replicas_in_sync > 1:
      texts_list = tf.concat(per_replica_texts.values, axis=0)
      logits_list = tf.concat(per_replica_logits.values, axis=0)
      labels_list = tf.concat(per_replica_labels.values, axis=0)
      additional_labels_dict = {}
      for additional_label in utils.IDENTITY_LABELS:
        if additional_label in per_replica_additional_labels:
          # NOTE(review): unlike the three tensors above, this concat is not
          # given `.values` -- confirm it handles per-replica results.
          additional_labels_dict[additional_label] = tf.concat(
              per_replica_additional_labels[additional_label], axis=0)
    else:
      texts_list = per_replica_texts
      logits_list = per_replica_logits
      labels_list = per_replica_labels
      additional_labels_dict = {}
      for additional_label in utils.IDENTITY_LABELS:
        if additional_label in per_replica_additional_labels:
          additional_labels_dict[
              additional_label] = per_replica_additional_labels[
                  additional_label]

    return texts_list, logits_list, labels_list, additional_labels_dict

  if FLAGS.prediction_mode:
    # Prediction and exit.
    for dataset_name, test_dataset in test_datasets.items():
      test_iterator = iter(test_dataset)  # pytype: disable=wrong-arg-types
      message = 'Final eval on dataset {}'.format(dataset_name)
      logging.info(message)

      texts_all = []
      logits_all = []
      labels_all = []
      additional_labels_all_dict = {}
      if 'identity' in dataset_name:
        for identity_label_name in utils.IDENTITY_LABELS:
          additional_labels_all_dict[identity_label_name] = []

      # An exhausted iterator ends the loop early through the except clause.
      try:
        with tf.experimental.async_scope():
          for step in range(steps_per_eval[dataset_name]):
            if step % 20 == 0:
              message = 'Starting to run eval step {}/{} of dataset: {}'.format(
                  step, steps_per_eval[dataset_name], dataset_name)
              logging.info(message)

            (text_step, logits_step, labels_step, additional_labels_dict_step
            ) = final_eval_step(test_iterator)

            texts_all.append(text_step)
            logits_all.append(logits_step)
            labels_all.append(labels_step)
            if 'identity' in dataset_name:
              for identity_label_name in utils.IDENTITY_LABELS:
                additional_labels_all_dict[
                    identity_label_name].append(
                        additional_labels_dict_step[
                            identity_label_name])

      except (StopIteration, tf.errors.OutOfRangeError):
        tf.experimental.async_clear_error()
        logging.info('Done with eval on %s', dataset_name)

      # Join the per-step results along the example axis.
      texts_all = tf.concat(texts_all, axis=0)
      logits_all = tf.concat(logits_all, axis=0)
      labels_all = tf.concat(labels_all, axis=0)
      additional_labels_all = []
      if additional_labels_all_dict:
        # One row per identity label, in `utils.IDENTITY_LABELS` order.
        for identity_label_name in utils.IDENTITY_LABELS:
          additional_labels_all.append(
              tf.concat(
                  additional_labels_all_dict[identity_label_name], axis=0))
        additional_labels_all = tf.convert_to_tensor(additional_labels_all)

      utils.save_prediction(texts_all.numpy(),
                            path=os.path.join(
                                FLAGS.output_dir,
                                'texts_{}'.format(dataset_name)))
      utils.save_prediction(labels_all.numpy(),
                            path=os.path.join(
                                FLAGS.output_dir,
                                'labels_{}'.format(dataset_name)))
      utils.save_prediction(logits_all.numpy(),
                            path=os.path.join(
                                FLAGS.output_dir,
                                'logits_{}'.format(dataset_name)))
      if 'identity' in dataset_name:
        utils.save_prediction(
            additional_labels_all.numpy(),
            path=os.path.join(
                FLAGS.output_dir,
                'additional_labels_{}'.format(dataset_name)))
      logging.info('Done with testing on %s', dataset_name)

  else:
    # Execute train / eval loop.
    start_time = time.time()
    train_iterators = {}
    for dataset_name, train_dataset in train_datasets.items():
      train_iterators[dataset_name] = iter(train_dataset)
    for epoch in range(initial_epoch, FLAGS.train_epochs):
      logging.info('Starting to run epoch: %s', epoch)
      current_step = epoch * total_steps_per_epoch
      for dataset_name, train_iterator in train_iterators.items():
        for step in range(dataset_steps_per_epoch[dataset_name]):
          train_step(train_iterator, dataset_name)

          # Progress report, logged every 20 steps.
          # NOTE(review): `current_step` counts from step 0 of training while
          # `time_elapsed` counts from this process start, so steps/s and ETA
          # look skewed after resuming from a checkpoint -- confirm.
          current_step += 1
          max_steps = total_steps_per_epoch * FLAGS.train_epochs
          time_elapsed = time.time() - start_time
          steps_per_sec = float(current_step) / time_elapsed
          eta_seconds = (max_steps - current_step) / steps_per_sec
          message = (
              '{:.1%} completion: epoch {:d}/{:d}. {:.1f} steps/s. '
              'ETA: {:.0f} min. Time elapsed: {:.0f} min'.format(
                  current_step / max_steps, epoch + 1, FLAGS.train_epochs,
                  steps_per_sec, eta_seconds / 60, time_elapsed / 60))
          if step % 20 == 0:
            logging.info(message)

      # Evaluate on every test set once per `evaluation_interval` epochs.
      if epoch % FLAGS.evaluation_interval == 0:
        for dataset_name, test_dataset in test_datasets.items():
          test_iterator = iter(test_dataset)  # pytype: disable=wrong-arg-types
          logging.info('Testing on dataset %s', dataset_name)

          try:
            with tf.experimental.async_scope():
              for step in range(steps_per_eval[dataset_name]):
                if step % 20 == 0:
                  logging.info(
                      'Starting to run eval step %s/%s of epoch: %s', step,
                      steps_per_eval[dataset_name], epoch)
                test_step(test_iterator, dataset_name)
          except (StopIteration, tf.errors.OutOfRangeError):
            tf.experimental.async_clear_error()
            logging.info('Done with testing on %s', dataset_name)

        logging.info('Train Loss: %.4f, AUROC: %.4f',
                     metrics['train/loss'].result(),
                     metrics['train/auroc'].result())
        logging.info('Test NLL: %.4f, AUROC: %.4f',
                     metrics['test/negative_log_likelihood'].result(),
                     metrics['test/auroc'].result())

        # record results
        total_results = {}
        for name, metric in metrics.items():
          total_results[name] = metric.result()

        with summary_writer.as_default():
          for name, result in total_results.items():
            tf.summary.scalar(name, result, step=epoch + 1)

      # Metrics are reset every epoch, whether or not evaluation ran.
      for name, metric in metrics.items():
        metric.reset_states()

      # Clamp the interval so at least the final epoch is checkpointed when
      # the configured interval exceeds the number of epochs.
      checkpoint_interval = min(FLAGS.checkpoint_interval, FLAGS.train_epochs)
      if checkpoint_interval > 0 and (epoch + 1) % checkpoint_interval == 0:
        checkpoint_name = checkpoint.save(
            os.path.join(FLAGS.output_dir, 'checkpoint'))
        logging.info('Saved checkpoint to %s', checkpoint_name)

    # Save model in SavedModel format on exit.
    final_save_name = os.path.join(FLAGS.output_dir, 'model')
    model.save(final_save_name)
    logging.info('Saved model to %s', final_save_name)
def main(argv):
  """Builds an SNGP BERT toxicity classifier, then trains it or predicts.

  The distribution strategy, datasets, model, optimizer and metrics are all
  configured from FLAGS.

  * If FLAGS.prediction_mode is set, the latest checkpoint found in
    FLAGS.eval_checkpoint_dir is restored (falling back to the initial BERT
    checkpoint when none exists) and, for every test dataset, the input ids,
    labels and logits are written to FLAGS.output_dir via save_prediction.
    Datasets whose name contains 'identity' additionally get their identity
    labels written.
  * Otherwise the train / eval loop runs from the epoch implied by the
    restored optimizer step, writing summaries and periodic checkpoints to
    FLAGS.output_dir and saving the final model on completion.

  Args:
    argv: Command-line arguments left over after flag parsing. Unused.

  Raises:
    ValueError: If FLAGS.loss_type is not 'cross_entropy', 'mse' or 'mae'
      (raised the first time the training step is run).
  """
  del argv  # unused arg
  tf.io.gfile.makedirs(FLAGS.output_dir)
  logging.info('Saving checkpoints at %s', FLAGS.output_dir)
  tf.random.set_seed(FLAGS.seed)

  if FLAGS.use_gpu:
    logging.info('Use GPU')
    strategy = tf.distribute.MirroredStrategy()
  else:
    logging.info('Use TPU at %s',
                 FLAGS.tpu if FLAGS.tpu is not None else 'local')
    resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
        tpu=FLAGS.tpu)
    tf.config.experimental_connect_to_cluster(resolver)
    tf.tpu.experimental.initialize_tpu_system(resolver)
    strategy = tf.distribute.experimental.TPUStrategy(resolver)

  # Batch size summed over all cores; used for step counts and buffer sizing.
  batch_size = FLAGS.per_core_batch_size * FLAGS.num_cores
  test_batch_size = batch_size
  data_buffer_size = batch_size * 10

  train_dataset_builder = ds.WikipediaToxicityDataset(
      batch_size=FLAGS.per_core_batch_size,
      eval_batch_size=FLAGS.per_core_batch_size,
      data_dir=FLAGS.in_dataset_dir,
      shuffle_buffer_size=data_buffer_size)
  ind_dataset_builder = ds.WikipediaToxicityDataset(
      batch_size=FLAGS.per_core_batch_size,
      eval_batch_size=FLAGS.per_core_batch_size,
      data_dir=FLAGS.in_dataset_dir,
      shuffle_buffer_size=data_buffer_size)
  ood_dataset_builder = ds.CivilCommentsDataset(
      batch_size=FLAGS.per_core_batch_size,
      eval_batch_size=FLAGS.per_core_batch_size,
      data_dir=FLAGS.ood_dataset_dir,
      shuffle_buffer_size=data_buffer_size)
  ood_identity_dataset_builder = ds.CivilCommentsIdentitiesDataset(
      batch_size=FLAGS.per_core_batch_size,
      eval_batch_size=FLAGS.per_core_batch_size,
      data_dir=FLAGS.identity_dataset_dir,
      shuffle_buffer_size=data_buffer_size)

  # Evaluation sets: 'ind' is the same dataset class / directory as training,
  # the other two are the Civil Comments variants.
  dataset_builders = {
      'ind': ind_dataset_builder,
      'ood': ood_dataset_builder,
      'ood_identity': ood_identity_dataset_builder,
  }

  train_dataset = train_dataset_builder.build(split=base.Split.TRAIN)

  ds_info = train_dataset_builder.info
  num_classes = ds_info['num_classes']  # Positive and negative classes.

  steps_per_epoch = ds_info['num_train_examples'] // batch_size

  test_datasets = {}
  steps_per_eval = {}
  for dataset_name, dataset_builder in dataset_builders.items():
    test_datasets[dataset_name] = dataset_builder.build(
        split=base.Split.TEST)
    steps_per_eval[dataset_name] = (
        dataset_builder.info['num_test_examples'] // test_batch_size)

  if FLAGS.use_bfloat16:
    policy = tf.keras.mixed_precision.experimental.Policy('mixed_bfloat16')
    tf.keras.mixed_precision.experimental.set_policy(policy)

  summary_writer = tf.summary.create_file_writer(
      os.path.join(FLAGS.output_dir, 'summaries'))

  with strategy.scope():
    logging.info('Building BERT %s model', FLAGS.bert_model_type)
    logging.info('use_gp_layer=%s', FLAGS.use_gp_layer)
    logging.info('use_spec_norm_att=%s', FLAGS.use_spec_norm_att)
    logging.info('use_spec_norm_ffn=%s', FLAGS.use_spec_norm_ffn)
    logging.info('use_layer_norm_att=%s', FLAGS.use_layer_norm_att)
    logging.info('use_layer_norm_ffn=%s', FLAGS.use_layer_norm_ffn)

    bert_config_dir, bert_ckpt_dir = resolve_bert_ckpt_and_config_dir(
        FLAGS.bert_dir, FLAGS.bert_config_dir, FLAGS.bert_ckpt_dir)
    bert_config = bert_utils.create_config(bert_config_dir)

    gp_layer_kwargs = dict(
        num_inducing=FLAGS.gp_hidden_dim,
        gp_kernel_scale=FLAGS.gp_scale,
        gp_output_bias=FLAGS.gp_bias,
        normalize_input=FLAGS.gp_input_normalization,
        gp_cov_momentum=FLAGS.gp_cov_discount_factor,
        gp_cov_ridge_penalty=FLAGS.gp_cov_ridge_penalty)
    spec_norm_kwargs = dict(
        iteration=FLAGS.spec_norm_iteration,
        norm_multiplier=FLAGS.spec_norm_bound)

    model, bert_encoder = ub.models.SngpBertBuilder(
        num_classes=num_classes,
        bert_config=bert_config,
        gp_layer_kwargs=gp_layer_kwargs,
        spec_norm_kwargs=spec_norm_kwargs,
        use_gp_layer=FLAGS.use_gp_layer,
        use_spec_norm_att=FLAGS.use_spec_norm_att,
        use_spec_norm_ffn=FLAGS.use_spec_norm_ffn,
        use_layer_norm_att=FLAGS.use_layer_norm_att,
        use_layer_norm_ffn=FLAGS.use_layer_norm_ffn,
        use_spec_norm_plr=FLAGS.use_spec_norm_plr)

    optimizer = bert_utils.create_optimizer(
        FLAGS.base_learning_rate,
        steps_per_epoch=steps_per_epoch,
        epochs=FLAGS.train_epochs,
        warmup_proportion=FLAGS.warmup_proportion)

    logging.info('Model input shape: %s', model.input_shape)
    logging.info('Model output shape: %s', model.output_shape)
    logging.info('Model number of weights: %s', model.count_params())

    metrics = {
        'train/negative_log_likelihood': tf.keras.metrics.Mean(),
        'train/accuracy': tf.keras.metrics.Accuracy(),
        'train/loss': tf.keras.metrics.Mean(),
        'train/ece': um.ExpectedCalibrationError(num_bins=FLAGS.num_bins),
    }

    checkpoint = tf.train.Checkpoint(model=model, optimizer=optimizer)
    if FLAGS.prediction_mode:
      latest_checkpoint = tf.train.latest_checkpoint(
          FLAGS.eval_checkpoint_dir)
    else:
      latest_checkpoint = tf.train.latest_checkpoint(FLAGS.output_dir)

    initial_epoch = 0
    if latest_checkpoint:
      # checkpoint.restore must be within a strategy.scope() so that optimizer
      # slot variables are mirrored.
      checkpoint.restore(latest_checkpoint)
      logging.info('Loaded checkpoint %s', latest_checkpoint)
      # Resume from the epoch implied by the restored optimizer step count.
      initial_epoch = optimizer.iterations.numpy() // steps_per_epoch
    else:
      # load BERT from initial checkpoint
      bert_encoder, _, _ = bert_utils.load_bert_weight_from_ckpt(
          bert_model=bert_encoder,
          bert_ckpt_dir=bert_ckpt_dir,
          repl_patterns=ub.models.bert_sngp.CHECKPOINT_REPL_PATTERNS)
      logging.info('Loaded BERT checkpoint %s', bert_ckpt_dir)

  # Finally, define test metrics outside the accelerator scope for CPU eval.
  # The in-distribution set uses un-suffixed metric names.
  metrics.update({
      'test/nll': tf.keras.metrics.Mean(),
      'test/auroc': tf.keras.metrics.AUC(curve='ROC'),
      'test/aupr': tf.keras.metrics.AUC(curve='PR'),
      'test/brier': tf.keras.metrics.MeanSquaredError(),
      'test/ece': um.ExpectedCalibrationError(num_bins=FLAGS.num_bins),
      'test/eval_time': tf.keras.metrics.Mean(),
      'test/stddev': tf.keras.metrics.Mean(),
      'test/acc': tf.keras.metrics.Accuracy(),
  })
  for fraction in FLAGS.fractions:
    metrics.update({
        'test_collab_acc/collab_acc_{}'.format(fraction):
            um.OracleCollaborativeAccuracy(
                fraction=float(fraction), num_bins=FLAGS.num_bins)
    })

  # Every other evaluation set gets the same metrics, suffixed by its name.
  for dataset_name in test_datasets:
    if dataset_name != 'ind':
      metrics.update({
          'test/nll_{}'.format(dataset_name):
              tf.keras.metrics.Mean(),
          'test/auroc_{}'.format(dataset_name):
              tf.keras.metrics.AUC(curve='ROC'),
          'test/aupr_{}'.format(dataset_name):
              tf.keras.metrics.AUC(curve='PR'),
          'test/brier_{}'.format(dataset_name):
              tf.keras.metrics.MeanSquaredError(),
          'test/ece_{}'.format(dataset_name):
              um.ExpectedCalibrationError(num_bins=FLAGS.num_bins),
          'test/eval_time_{}'.format(dataset_name):
              tf.keras.metrics.Mean(),
          'test/stddev_{}'.format(dataset_name):
              tf.keras.metrics.Mean(),
          'test/acc_{}'.format(dataset_name):
              tf.keras.metrics.Accuracy()
      })
      for fraction in FLAGS.fractions:
        metrics.update({
            'test_collab_acc/collab_acc_{}_{}'.format(
                fraction, dataset_name):
                um.OracleCollaborativeAccuracy(
                    fraction=float(fraction), num_bins=FLAGS.num_bins)
        })

  @tf.function
  def train_step(iterator):
    """Training StepFn."""

    def step_fn(inputs):
      """Per-Replica StepFn."""
      features, labels, _ = create_feature_and_label(inputs)

      with tf.GradientTape() as tape:
        logits = model(features, training=True)

        if isinstance(logits, tuple):
          # If model returns a tuple of (logits, covmat), extract logits
          logits, _ = logits
        if FLAGS.use_bfloat16:
          logits = tf.cast(logits, tf.float32)

        loss_logits = tf.squeeze(logits, axis=1)
        if FLAGS.loss_type == 'cross_entropy':
          logging.info('Using cross entropy loss')
          negative_log_likelihood = tf.nn.sigmoid_cross_entropy_with_logits(
              labels, loss_logits)
        elif FLAGS.loss_type == 'mse':
          logging.info('Using mean squared error loss')
          loss_probs = tf.nn.sigmoid(loss_logits)
          negative_log_likelihood = tf.keras.losses.mean_squared_error(
              labels, loss_probs)
        elif FLAGS.loss_type == 'mae':
          logging.info('Using mean absolute error loss')
          loss_probs = tf.nn.sigmoid(loss_logits)
          negative_log_likelihood = tf.keras.losses.mean_absolute_error(
              labels, loss_probs)
        else:
          # Fail loudly instead of hitting an unbound local further down.
          raise ValueError(
              'Unsupported loss_type: {}'.format(FLAGS.loss_type))

        negative_log_likelihood = tf.reduce_mean(negative_log_likelihood)

        l2_loss = sum(model.losses)
        loss = negative_log_likelihood + l2_loss
        # Scale the loss given the TPUStrategy will reduce sum all gradients.
        scaled_loss = loss / strategy.num_replicas_in_sync

      grads = tape.gradient(scaled_loss, model.trainable_variables)
      optimizer.apply_gradients(zip(grads, model.trainable_variables))

      probs = tf.nn.sigmoid(logits)
      # Cast labels to discrete for ECE computation.
      ece_labels = tf.cast(labels > FLAGS.ece_label_threshold, tf.float32)
      ece_probs = tf.concat([1. - probs, probs], axis=1)
      pred_labels = tf.math.argmax(ece_probs, axis=-1)

      # Accuracy compares for exact equality, so it must see the thresholded
      # labels (as the test metrics do), not the raw label values.
      metrics['train/accuracy'].update_state(ece_labels, pred_labels)
      metrics['train/ece'].update_state(ece_labels, ece_probs)
      metrics['train/loss'].update_state(loss)
      metrics['train/negative_log_likelihood'].update_state(
          negative_log_likelihood)

    strategy.run(step_fn, args=(next(iterator),))

  @tf.function
  def test_step(iterator, dataset_name):
    """Evaluation StepFn."""

    def step_fn(inputs):
      """Per-Replica StepFn."""
      features, labels, _ = create_feature_and_label(inputs)

      # Compute ensemble prediction over Monte Carlo forward-pass samples.
      logits_list = []
      stddev_list = []
      for _ in range(FLAGS.num_mc_samples):
        logits = model(features, training=False)

        if isinstance(logits, tuple):
          # If model returns a tuple of (logits, covmat), extract both.
          logits, covmat = logits
        else:
          # NOTE(review): test_batch_size is per_core_batch_size * num_cores;
          # confirm it matches the batch each replica actually receives.
          covmat = tf.eye(test_batch_size)

        if FLAGS.use_bfloat16:
          logits = tf.cast(logits, tf.float32)
          covmat = tf.cast(covmat, tf.float32)

        logits = ed.layers.utils.mean_field_logits(
            logits, covmat, mean_field_factor=FLAGS.gp_mean_field_factor)
        stddev = tf.sqrt(tf.linalg.diag_part(covmat))

        logits_list.append(logits)
        stddev_list.append(stddev)

      # Logits dimension is (num_samples, batch_size, num_classes).
      logits_list = tf.stack(logits_list, axis=0)
      stddev_list = tf.stack(stddev_list, axis=0)

      stddev = tf.reduce_mean(stddev_list, axis=0)
      probs_list = tf.nn.sigmoid(logits_list)
      probs = tf.reduce_mean(probs_list, axis=0)
      # Cast labels to discrete for ECE computation.
      ece_labels = tf.cast(labels > FLAGS.ece_label_threshold, tf.float32)
      ece_probs = tf.concat([1. - probs, probs], axis=1)
      pred_labels = tf.math.argmax(ece_probs, axis=-1)
      auc_probs = tf.squeeze(probs, axis=1)

      # Negative log of the mean per-sample likelihood: logsumexp over the
      # sample axis, then add log(num_mc_samples) to turn the sum into a mean.
      ce = tf.nn.sigmoid_cross_entropy_with_logits(
          labels=tf.broadcast_to(
              labels, [FLAGS.num_mc_samples, labels.shape[0]]),
          logits=tf.squeeze(logits_list, axis=-1))
      negative_log_likelihood = -tf.reduce_logsumexp(
          -ce, axis=0) + tf.math.log(float(FLAGS.num_mc_samples))
      negative_log_likelihood = tf.reduce_mean(negative_log_likelihood)

      if dataset_name == 'ind':
        metrics['test/nll'].update_state(negative_log_likelihood)
        metrics['test/auroc'].update_state(labels, auc_probs)
        metrics['test/aupr'].update_state(labels, auc_probs)
        metrics['test/brier'].update_state(labels, auc_probs)
        metrics['test/ece'].update_state(ece_labels, ece_probs)
        metrics['test/stddev'].update_state(stddev)
        metrics['test/acc'].update_state(ece_labels, pred_labels)
        for fraction in FLAGS.fractions:
          metrics['test_collab_acc/collab_acc_{}'.format(
              fraction)].update_state(ece_labels, ece_probs)
      else:
        metrics['test/nll_{}'.format(dataset_name)].update_state(
            negative_log_likelihood)
        metrics['test/auroc_{}'.format(dataset_name)].update_state(
            labels, auc_probs)
        metrics['test/aupr_{}'.format(dataset_name)].update_state(
            labels, auc_probs)
        metrics['test/brier_{}'.format(dataset_name)].update_state(
            labels, auc_probs)
        metrics['test/ece_{}'.format(dataset_name)].update_state(
            ece_labels, ece_probs)
        metrics['test/stddev_{}'.format(dataset_name)].update_state(
            stddev)
        metrics['test/acc_{}'.format(dataset_name)].update_state(
            ece_labels, pred_labels)
        for fraction in FLAGS.fractions:
          metrics['test_collab_acc/collab_acc_{}_{}'.format(
              fraction, dataset_name)].update_state(ece_labels, ece_probs)

    strategy.run(step_fn, args=(next(iterator),))

  @tf.function
  def final_eval_step(iterator):
    """Final Evaluation StepFn to save prediction to directory."""

    def step_fn(inputs):
      """Per-Replica StepFn returning (input ids, logits, labels, extras)."""
      bert_features, labels, additional_labels = create_feature_and_label(
          inputs)
      logits = model(bert_features, training=False)
      if isinstance(logits, tuple):
        # If model returns a tuple of (logits, covmat), extract both.
        logits, covmat = logits
      else:
        # NOTE(review): same batch-size assumption as in test_step; confirm.
        covmat = tf.eye(test_batch_size)

      if FLAGS.use_bfloat16:
        logits = tf.cast(logits, tf.float32)
        covmat = tf.cast(covmat, tf.float32)

      logits = ed.layers.utils.mean_field_logits(
          logits, covmat, mean_field_factor=FLAGS.gp_mean_field_factor)
      features = inputs['input_ids']
      return features, logits, labels, additional_labels

    (per_replica_texts, per_replica_logits, per_replica_labels,
     per_replica_additional_labels) = strategy.run(
         step_fn, args=(next(iterator),))

    if strategy.num_replicas_in_sync > 1:
      # Stitch the per-replica results back into one batch.
      texts_list = tf.concat(per_replica_texts.values, axis=0)
      logits_list = tf.concat(per_replica_logits.values, axis=0)
      labels_list = tf.concat(per_replica_labels.values, axis=0)
      additional_labels_dict = {}
      for additional_label in _IDENTITY_LABELS:
        if additional_label in per_replica_additional_labels:
          # NOTE(review): unlike the three concats above this does not use
          # `.values`; confirm that is intended for the per-replica entry.
          additional_labels_dict[additional_label] = tf.concat(
              per_replica_additional_labels[additional_label], axis=0)
    else:
      texts_list = per_replica_texts
      logits_list = per_replica_logits
      labels_list = per_replica_labels
      additional_labels_dict = {}
      for additional_label in _IDENTITY_LABELS:
        if additional_label in per_replica_additional_labels:
          additional_labels_dict[
              additional_label] = per_replica_additional_labels[
                  additional_label]

    return texts_list, logits_list, labels_list, additional_labels_dict

  if FLAGS.prediction_mode:
    # Prediction and exit.
    for dataset_name, test_dataset in test_datasets.items():
      test_iterator = iter(test_dataset)  # pytype: disable=wrong-arg-types
      message = 'Final eval on dataset {}'.format(dataset_name)
      logging.info(message)

      texts_all = []
      logits_all = []
      labels_all = []
      additional_labels_all_dict = {}
      # Identity labels are only collected for the identity datasets.
      if 'identity' in dataset_name:
        for identity_label_name in _IDENTITY_LABELS:
          additional_labels_all_dict[identity_label_name] = []

      for step in range(steps_per_eval[dataset_name]):
        if step % 20 == 0:
          message = 'Starting to run eval step {}/{} of dataset: {}'.format(
              step, steps_per_eval[dataset_name], dataset_name)
          logging.info(message)

        try:
          (text_step, logits_step, labels_step,
           additional_labels_dict_step) = final_eval_step(test_iterator)
        except tf.errors.OutOfRangeError:
          # Dataset ran out before the estimated step count; skip this step.
          continue

        texts_all.append(text_step)
        logits_all.append(logits_step)
        labels_all.append(labels_step)
        if 'identity' in dataset_name:
          for identity_label_name in _IDENTITY_LABELS:
            additional_labels_all_dict[identity_label_name].append(
                additional_labels_dict_step[identity_label_name])

      texts_all = tf.concat(texts_all, axis=0)
      logits_all = tf.concat(logits_all, axis=0)
      labels_all = tf.concat(labels_all, axis=0)
      additional_labels_all = []
      if additional_labels_all_dict:
        for identity_label_name in _IDENTITY_LABELS:
          additional_labels_all.append(
              tf.concat(
                  additional_labels_all_dict[identity_label_name], axis=0))
      additional_labels_all = tf.convert_to_tensor(additional_labels_all)

      save_prediction(
          texts_all.numpy(),
          path=os.path.join(FLAGS.output_dir,
                            'texts_{}'.format(dataset_name)))
      save_prediction(
          labels_all.numpy(),
          path=os.path.join(FLAGS.output_dir,
                            'labels_{}'.format(dataset_name)))
      save_prediction(
          logits_all.numpy(),
          path=os.path.join(FLAGS.output_dir,
                            'logits_{}'.format(dataset_name)))
      if 'identity' in dataset_name:
        save_prediction(
            additional_labels_all.numpy(),
            path=os.path.join(
                FLAGS.output_dir,
                'additional_labels_{}'.format(dataset_name)))
      logging.info('Done with testing on %s', dataset_name)

  else:
    train_iterator = iter(train_dataset)
    start_time = time.time()
    max_steps = steps_per_epoch * FLAGS.train_epochs
    # Steps already completed by an earlier run; start_time only covers this
    # run, so they must be excluded when estimating throughput.
    steps_before_resume = initial_epoch * steps_per_epoch

    for epoch in range(initial_epoch, FLAGS.train_epochs):
      logging.info('Starting to run epoch: %s', epoch)
      for step in range(steps_per_epoch):
        train_step(train_iterator)

        # Progress is only reported every 20 steps, so only compute it then.
        if step % 20 == 0:
          current_step = epoch * steps_per_epoch + (step + 1)
          time_elapsed = time.time() - start_time
          steps_per_sec = (
              float(current_step - steps_before_resume) / time_elapsed)
          eta_seconds = (max_steps - current_step) / steps_per_sec
          message = (
              '{:.1%} completion: epoch {:d}/{:d}. {:.1f} steps/s. '
              'ETA: {:.0f} min. Time elapsed: {:.0f} min'.format(
                  current_step / max_steps, epoch + 1, FLAGS.train_epochs,
                  steps_per_sec, eta_seconds / 60, time_elapsed / 60))
          logging.info(message)

      if epoch % FLAGS.evaluation_interval == 0:
        for dataset_name, test_dataset in test_datasets.items():
          test_iterator = iter(test_dataset)
          logging.info('Testing on dataset %s', dataset_name)
          for step in range(steps_per_eval[dataset_name]):
            if step % 20 == 0:
              logging.info(
                  'Starting to run eval step %s of epoch: %s', step, epoch)
            test_step(test_iterator, dataset_name)
          logging.info('Done with testing on %s', dataset_name)

        logging.info('Train Loss: %.4f, ECE: %.2f, Accuracy: %.2f',
                     metrics['train/loss'].result(),
                     metrics['train/ece'].result(),
                     metrics['train/accuracy'].result())

        total_results = {
            name: metric.result() for name, metric in metrics.items()
        }
        with summary_writer.as_default():
          for name, result in total_results.items():
            tf.summary.scalar(name, result, step=epoch + 1)

      # Start every epoch with fresh metric state.
      for metric in metrics.values():
        metric.reset_states()

      if (FLAGS.checkpoint_interval > 0 and
          (epoch + 1) % FLAGS.checkpoint_interval == 0):
        checkpoint_name = checkpoint.save(
            os.path.join(FLAGS.output_dir, 'checkpoint'))
        logging.info('Saved checkpoint to %s', checkpoint_name)

    # Save model in SavedModel format on exit.
    final_save_name = os.path.join(FLAGS.output_dir, 'model')
    model.save(final_save_name)
    logging.info('Saved model to %s', final_save_name)