def main(argv):
  """Trains and evaluates a Monte Carlo dropout BERT model on CLINC intents.

  Builds the training split and three test splits (`clean`, `ood`, `all`) of
  `ub.datasets.ClincIntentDetectionDataset`, constructs the model with
  `ub.models.DropoutBertBuilder`, restores the latest checkpoint found in
  `FLAGS.output_dir` (or the pre-trained BERT checkpoint when there is none),
  and then runs the epoch loop: train, periodically evaluate, write
  summaries, reset metrics and save checkpoints.

  Args:
    argv: Positional command-line arguments; unused.
  """
  del argv  # unused arg
  tf.io.gfile.makedirs(FLAGS.output_dir)
  logging.info('Saving checkpoints at %s', FLAGS.output_dir)
  tf.random.set_seed(FLAGS.seed)

  if FLAGS.use_gpu:
    logging.info('Use GPU')
    strategy = tf.distribute.MirroredStrategy()
  else:
    logging.info('Use TPU at %s',
                 FLAGS.tpu if FLAGS.tpu is not None else 'local')
    resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
        tpu=FLAGS.tpu)
    tf.config.experimental_connect_to_cluster(resolver)
    tf.tpu.experimental.initialize_tpu_system(resolver)
    strategy = tf.distribute.TPUStrategy(resolver)

  # Global batch size; used below to derive the number of steps per epoch.
  batch_size = FLAGS.per_core_batch_size * FLAGS.num_cores
  train_dataset_builder = ub.datasets.ClincIntentDetectionDataset(
      split='train',
      data_dir=FLAGS.data_dir,
      data_mode='ind')
  ind_dataset_builder = ub.datasets.ClincIntentDetectionDataset(
      split='test',
      data_dir=FLAGS.data_dir,
      data_mode='ind')
  ood_dataset_builder = ub.datasets.ClincIntentDetectionDataset(
      split='test',
      data_dir=FLAGS.data_dir,
      data_mode='ood')
  all_dataset_builder = ub.datasets.ClincIntentDetectionDataset(
      split='test',
      data_dir=FLAGS.data_dir,
      data_mode='all')

  # Evaluation sets keyed by the suffix used in metric names; 'clean' is the
  # in-domain test split and reports to the un-suffixed 'test/*' metrics.
  dataset_builders = {
      'clean': ind_dataset_builder,
      'ood': ood_dataset_builder,
      'all': all_dataset_builder
  }

  train_dataset = train_dataset_builder.load(
      batch_size=FLAGS.per_core_batch_size)

  ds_info = train_dataset_builder.tfds_info
  feature_size = ds_info.metadata['feature_size']
  # num_classes is number of valid intents plus out-of-scope intent
  num_classes = ds_info.features['intent_label'].num_classes + 1

  steps_per_epoch = train_dataset_builder.num_examples // batch_size

  test_datasets = {}
  steps_per_eval = {}
  for dataset_name, dataset_builder in dataset_builders.items():
    test_datasets[dataset_name] = dataset_builder.load(
        batch_size=FLAGS.eval_batch_size)
    steps_per_eval[dataset_name] = (dataset_builder.num_examples //
                                    FLAGS.eval_batch_size)

  if FLAGS.use_bfloat16:
    policy = tf.keras.mixed_precision.experimental.Policy('mixed_bfloat16')
    tf.keras.mixed_precision.experimental.set_policy(policy)

  summary_writer = tf.summary.create_file_writer(
      os.path.join(FLAGS.output_dir, 'summaries'))

  with strategy.scope():
    logging.info('Building BERT model')
    bert_config_dir, bert_ckpt_dir = resolve_bert_ckpt_and_config_dir(
        FLAGS.bert_dir, FLAGS.bert_config_dir, FLAGS.bert_ckpt_dir)
    bert_config = bert_utils.create_config(bert_config_dir)
    # The same dropout rate is applied to hidden and attention dropout.
    bert_config.hidden_dropout_prob = FLAGS.dropout_rate
    bert_config.attention_probs_dropout_prob = FLAGS.dropout_rate
    model, bert_encoder = ub.models.DropoutBertBuilder(
        num_classes=num_classes,
        bert_config=bert_config,
        use_mc_dropout_mha=FLAGS.use_mc_dropout_mha,
        use_mc_dropout_att=FLAGS.use_mc_dropout_att,
        use_mc_dropout_ffn=FLAGS.use_mc_dropout_ffn,
        use_mc_dropout_output=FLAGS.use_mc_dropout_output,
        channel_wise_dropout_mha=FLAGS.channel_wise_dropout_mha,
        channel_wise_dropout_att=FLAGS.channel_wise_dropout_att,
        channel_wise_dropout_ffn=FLAGS.channel_wise_dropout_ffn)

    # Create an AdamW optimizer with beta_2=0.999, epsilon=1e-6.
    optimizer = bert_utils.create_optimizer(
        FLAGS.base_learning_rate,
        steps_per_epoch=steps_per_epoch,
        epochs=FLAGS.train_epochs,
        warmup_proportion=FLAGS.warmup_proportion,
        beta_1=1.0 - FLAGS.one_minus_momentum)

    logging.info('Model input shape: %s', model.input_shape)
    logging.info('Model output shape: %s', model.output_shape)
    logging.info('Model number of weights: %s', model.count_params())

    metrics = {
        'train/negative_log_likelihood':
            tf.keras.metrics.Mean(),
        'train/accuracy':
            tf.keras.metrics.SparseCategoricalAccuracy(),
        'train/loss':
            tf.keras.metrics.Mean(),
        'train/ece':
            rm.metrics.ExpectedCalibrationError(num_bins=FLAGS.num_bins),
        'test/negative_log_likelihood':
            tf.keras.metrics.Mean(),
        'test/accuracy':
            tf.keras.metrics.SparseCategoricalAccuracy(),
        'test/ece':
            rm.metrics.ExpectedCalibrationError(num_bins=FLAGS.num_bins),
    }

    # Every non-'clean' test set gets its own name-suffixed metric triple.
    for dataset_name in test_datasets:
      if dataset_name != 'clean':
        metrics.update({
            'test/nll_{}'.format(dataset_name):
                tf.keras.metrics.Mean(),
            'test/accuracy_{}'.format(dataset_name):
                tf.keras.metrics.SparseCategoricalAccuracy(),
            'test/ece_{}'.format(dataset_name):
                rm.metrics.ExpectedCalibrationError(
                    num_bins=FLAGS.num_bins)
        })

    checkpoint = tf.train.Checkpoint(model=model, optimizer=optimizer)
    latest_checkpoint = tf.train.latest_checkpoint(FLAGS.output_dir)
    initial_epoch = 0
    if latest_checkpoint:
      # checkpoint.restore must be within a strategy.scope() so that optimizer
      # slot variables are mirrored.
      checkpoint.restore(latest_checkpoint)
      logging.info('Loaded checkpoint %s', latest_checkpoint)
      # Resume from the epoch implied by the restored optimizer step count.
      initial_epoch = optimizer.iterations.numpy() // steps_per_epoch
    else:
      # load BERT from initial checkpoint
      bert_checkpoint = tf.train.Checkpoint(model=bert_encoder)
      bert_checkpoint.restore(
          bert_ckpt_dir).assert_existing_objects_matched()
      logging.info('Loaded BERT checkpoint %s', bert_ckpt_dir)

  # Finally, define OOD metrics outside the accelerator scope for CPU eval.
  metrics.update({
      'test/auroc_all': tf.keras.metrics.AUC(curve='ROC'),
      'test/auprc_all': tf.keras.metrics.AUC(curve='PR')
  })

  @tf.function
  def train_step(iterator):
    """Training StepFn; runs `steps_per_epoch` replica steps per call."""

    def step_fn(inputs):
      """Per-Replica StepFn."""
      features, labels = bert_utils.create_feature_and_label(
          inputs, feature_size)

      with tf.GradientTape() as tape:
        # Set learning phase to enable dropout etc during training.
        logits = model(features, training=True)
        if FLAGS.use_bfloat16:
          logits = tf.cast(logits, tf.float32)
        negative_log_likelihood = tf.reduce_mean(
            tf.keras.losses.sparse_categorical_crossentropy(
                labels, logits, from_logits=True))
        l2_loss = sum(model.losses)
        loss = negative_log_likelihood + l2_loss
        # Scale the loss given the TPUStrategy will reduce sum all gradients.
        scaled_loss = loss / strategy.num_replicas_in_sync

      grads = tape.gradient(scaled_loss, model.trainable_variables)
      optimizer.apply_gradients(zip(grads, model.trainable_variables))
      probs = tf.nn.softmax(logits)
      metrics['train/ece'].add_batch(probs, label=labels)
      metrics['train/loss'].update_state(loss)
      metrics['train/negative_log_likelihood'].update_state(
          negative_log_likelihood)
      metrics['train/accuracy'].update_state(labels, logits)

    for _ in tf.range(tf.cast(steps_per_epoch, tf.int32)):
      strategy.run(step_fn, args=(next(iterator),))

  @tf.function
  def test_step(iterator, dataset_name, num_steps):
    """Evaluation StepFn; runs `num_steps` batches of `dataset_name`."""

    def step_fn(inputs):
      """Per-Replica StepFn."""
      features, labels = bert_utils.create_feature_and_label(
          inputs, feature_size)

      # Compute ensemble prediction over Monte Carlo dropout samples.
      logits_list = []
      for _ in range(FLAGS.num_dropout_samples):
        logits = model(features, training=False)
        if FLAGS.use_bfloat16:
          logits = tf.cast(logits, tf.float32)
        logits_list.append(logits)

      # Logits dimension is (num_samples, batch_size, num_classes).
      logits_list = tf.stack(logits_list, axis=0)
      probs_list = tf.nn.softmax(logits_list)
      probs = tf.reduce_mean(probs_list, axis=0)

      labels_broadcasted = tf.broadcast_to(
          labels, [FLAGS.num_dropout_samples, labels.shape[0]])
      log_likelihoods = -tf.keras.losses.sparse_categorical_crossentropy(
          labels_broadcasted, logits_list, from_logits=True)
      # NLL of the sample-averaged predictive distribution:
      # -log(mean_s p_s(y)) = -logsumexp_s(log p_s(y)) + log(num_samples).
      negative_log_likelihood = tf.reduce_mean(
          -tf.reduce_logsumexp(log_likelihoods, axis=[0]) +
          tf.math.log(float(FLAGS.num_dropout_samples)))

      if dataset_name == 'clean':
        metrics['test/negative_log_likelihood'].update_state(
            negative_log_likelihood)
        metrics['test/accuracy'].update_state(labels, probs)
        metrics['test/ece'].add_batch(probs, label=labels)
      else:
        metrics['test/nll_{}'.format(dataset_name)].update_state(
            negative_log_likelihood)
        metrics['test/accuracy_{}'.format(dataset_name)].update_state(
            labels, probs)
        metrics['test/ece_{}'.format(dataset_name)].add_batch(
            probs, label=labels)

      if dataset_name == 'all':
        # Label 150 is used as the positive (out-of-scope) class here;
        # presumably this equals num_classes - 1 -- TODO confirm.
        ood_labels = tf.cast(labels == 150, labels.dtype)
        # OOD score is one minus the maximum predicted class probability.
        ood_probs = 1. - tf.reduce_max(probs, axis=-1)
        metrics['test/auroc_{}'.format(dataset_name)].update_state(
            ood_labels, ood_probs)
        metrics['test/auprc_{}'.format(dataset_name)].update_state(
            ood_labels, ood_probs)

    # Evaluation steps are invoked directly rather than via strategy.run.
    for _ in tf.range(tf.cast(num_steps, tf.int32)):
      step_fn(next(iterator))

  train_iterator = iter(train_dataset)
  start_time = time.time()
  max_steps = steps_per_epoch * FLAGS.train_epochs
  for epoch in range(initial_epoch, FLAGS.train_epochs):
    logging.info('Starting to run epoch: %s', epoch)
    train_step(train_iterator)

    current_step = (epoch + 1) * steps_per_epoch
    time_elapsed = time.time() - start_time
    # Throughput must only count steps executed since `start_time`. When
    # training resumes from a checkpoint, `current_step` also includes steps
    # from earlier runs, which would inflate steps/s and understate the ETA.
    steps_this_run = (epoch + 1 - initial_epoch) * steps_per_epoch
    steps_per_sec = float(steps_this_run) / time_elapsed
    eta_seconds = (max_steps - current_step) / steps_per_sec
    message = ('{:.1%} completion: epoch {:d}/{:d}. {:.1f} steps/s. '
               'ETA: {:.0f} min. Time elapsed: {:.0f} min'.format(
                   current_step / max_steps, epoch + 1, FLAGS.train_epochs,
                   steps_per_sec, eta_seconds / 60, time_elapsed / 60))
    logging.info(message)

    if epoch % FLAGS.evaluation_interval == 0:
      for dataset_name, test_dataset in test_datasets.items():
        test_iterator = iter(test_dataset)
        logging.info('Testing on dataset %s', dataset_name)
        logging.info('Starting to run eval at epoch: %s', epoch)
        test_step(test_iterator, dataset_name, steps_per_eval[dataset_name])
        logging.info('Done with testing on %s', dataset_name)

      logging.info('Train Loss: %.4f, Accuracy: %.2f%%',
                   metrics['train/loss'].result(),
                   metrics['train/accuracy'].result() * 100)
      logging.info('Test NLL: %.4f, Accuracy: %.2f%%',
                   metrics['test/negative_log_likelihood'].result(),
                   metrics['test/accuracy'].result() * 100)
      total_results = {
          name: metric.result() for name, metric in metrics.items()
      }
      # Metrics from Robustness Metrics (like ECE) will return a dict with a
      # single key/value, instead of a scalar.
      total_results = {
          k: (list(v.values())[0] if isinstance(v, dict) else v)
          for k, v in total_results.items()
      }
      with summary_writer.as_default():
        for name, result in total_results.items():
          tf.summary.scalar(name, result, step=epoch + 1)

    # Start every epoch with fresh accumulators.
    for metric in metrics.values():
      metric.reset_states()

    if (FLAGS.checkpoint_interval > 0 and
        (epoch + 1) % FLAGS.checkpoint_interval == 0):
      checkpoint_name = checkpoint.save(
          os.path.join(FLAGS.output_dir, 'checkpoint'))
      logging.info('Saved checkpoint to %s', checkpoint_name)

  # Record the hyperparameters of this run alongside the scalar summaries.
  with summary_writer.as_default():
    hp.hparams({
        'base_learning_rate': FLAGS.base_learning_rate,
        'one_minus_momentum': FLAGS.one_minus_momentum,
        'dropout_rate': FLAGS.dropout_rate,
        'num_dropout_samples': FLAGS.num_dropout_samples,
    })
def main(argv):
  """Trains and evaluates an SNGP BERT model on CLINC intent detection.

  Builds the training set and three test sets (`clean`, `ood`, `all`) from
  `ub.datasets.ClincIntentDetectionDataset`, constructs the model with
  `ub.models.SngpBertBuilder`, restores the latest checkpoint found in
  `FLAGS.output_dir` (or loads pre-trained BERT weights when there is none),
  and then runs the epoch loop: train step by step, periodically evaluate,
  write summaries, reset metrics and save checkpoints.

  Args:
    argv: Positional command-line arguments; unused.
  """
  del argv  # unused arg
  tf.io.gfile.makedirs(FLAGS.output_dir)
  logging.info('Saving checkpoints at %s', FLAGS.output_dir)
  tf.random.set_seed(FLAGS.seed)

  if FLAGS.use_gpu:
    logging.info('Use GPU')
    strategy = tf.distribute.MirroredStrategy()
  else:
    logging.info('Use TPU at %s',
                 FLAGS.tpu if FLAGS.tpu is not None else 'local')
    resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
        tpu=FLAGS.tpu)
    tf.config.experimental_connect_to_cluster(resolver)
    tf.tpu.experimental.initialize_tpu_system(resolver)
    strategy = tf.distribute.TPUStrategy(resolver)

  # Global batch size; used below to derive the number of steps per epoch.
  batch_size = FLAGS.per_core_batch_size * FLAGS.num_cores
  train_dataset_builder = ub.datasets.ClincIntentDetectionDataset(
      batch_size=FLAGS.per_core_batch_size,
      eval_batch_size=FLAGS.per_core_batch_size,
      data_dir=FLAGS.data_dir,
      data_mode='ind')
  ind_dataset_builder = ub.datasets.ClincIntentDetectionDataset(
      batch_size=batch_size,
      eval_batch_size=FLAGS.eval_batch_size,
      data_dir=FLAGS.data_dir,
      data_mode='ind')
  ood_dataset_builder = ub.datasets.ClincIntentDetectionDataset(
      batch_size=batch_size,
      eval_batch_size=FLAGS.eval_batch_size,
      data_dir=FLAGS.data_dir,
      data_mode='ood')
  all_dataset_builder = ub.datasets.ClincIntentDetectionDataset(
      batch_size=batch_size,
      eval_batch_size=FLAGS.eval_batch_size,
      data_dir=FLAGS.data_dir,
      data_mode='all')

  # Evaluation sets keyed by the suffix used in metric names; 'clean' is the
  # in-domain test split and reports to the un-suffixed 'test/*' metrics.
  dataset_builders = {
      'clean': ind_dataset_builder,
      'ood': ood_dataset_builder,
      'all': all_dataset_builder
  }

  train_dataset = train_dataset_builder.build(
      split=ub.datasets.base.Split.TRAIN)

  ds_info = train_dataset_builder.info
  feature_size = ds_info['feature_size']
  # num_classes is number of valid intents plus out-of-scope intent
  num_classes = ds_info['num_classes'] + 1

  steps_per_epoch = ds_info['num_train_examples'] // batch_size

  test_datasets = {}
  steps_per_eval = {}
  for dataset_name, dataset_builder in dataset_builders.items():
    test_datasets[dataset_name] = dataset_builder.build(
        split=ub.datasets.base.Split.TEST)
    steps_per_eval[dataset_name] = (
        dataset_builder.info['num_test_examples'] // FLAGS.eval_batch_size)

  if FLAGS.use_bfloat16:
    policy = tf.keras.mixed_precision.experimental.Policy('mixed_bfloat16')
    tf.keras.mixed_precision.experimental.set_policy(policy)

  summary_writer = tf.summary.create_file_writer(
      os.path.join(FLAGS.output_dir, 'summaries'))

  with strategy.scope():
    logging.info('Building BERT model')
    logging.info('use_gp_layer=%s', FLAGS.use_gp_layer)
    logging.info('use_spec_norm_att=%s', FLAGS.use_spec_norm_att)
    logging.info('use_spec_norm_ffn=%s', FLAGS.use_spec_norm_ffn)
    logging.info('use_layer_norm_att=%s', FLAGS.use_layer_norm_att)
    logging.info('use_layer_norm_ffn=%s', FLAGS.use_layer_norm_ffn)

    bert_config_dir, bert_ckpt_dir = resolve_bert_ckpt_and_config_dir(
        FLAGS.bert_dir, FLAGS.bert_config_dir, FLAGS.bert_ckpt_dir)
    bert_config = bert_utils.create_config(bert_config_dir)

    # Keyword arguments forwarded to the builder for the GP output layer and
    # for spectral normalization, respectively.
    gp_layer_kwargs = dict(
        num_inducing=FLAGS.gp_hidden_dim,
        gp_kernel_scale=FLAGS.gp_scale,
        gp_output_bias=FLAGS.gp_bias,
        normalize_input=FLAGS.gp_input_normalization,
        gp_cov_momentum=FLAGS.gp_cov_discount_factor,
        gp_cov_ridge_penalty=FLAGS.gp_cov_ridge_penalty)
    spec_norm_kwargs = dict(
        iteration=FLAGS.spec_norm_iteration,
        norm_multiplier=FLAGS.spec_norm_bound)

    model, bert_encoder = ub.models.SngpBertBuilder(
        num_classes=num_classes,
        bert_config=bert_config,
        gp_layer_kwargs=gp_layer_kwargs,
        spec_norm_kwargs=spec_norm_kwargs,
        use_gp_layer=FLAGS.use_gp_layer,
        use_spec_norm_att=FLAGS.use_spec_norm_att,
        use_spec_norm_ffn=FLAGS.use_spec_norm_ffn,
        use_layer_norm_att=FLAGS.use_layer_norm_att,
        use_layer_norm_ffn=FLAGS.use_layer_norm_ffn,
        use_spec_norm_plr=FLAGS.use_spec_norm_plr)

    optimizer = bert_utils.create_optimizer(
        FLAGS.base_learning_rate,
        steps_per_epoch=steps_per_epoch,
        epochs=FLAGS.train_epochs,
        warmup_proportion=FLAGS.warmup_proportion)

    logging.info('Model input shape: %s', model.input_shape)
    logging.info('Model output shape: %s', model.output_shape)
    logging.info('Model number of weights: %s', model.count_params())

    metrics = {
        'train/negative_log_likelihood': tf.keras.metrics.Mean(),
        'train/accuracy': tf.keras.metrics.SparseCategoricalAccuracy(),
        'train/loss': tf.keras.metrics.Mean(),
        'train/ece': um.ExpectedCalibrationError(num_bins=FLAGS.num_bins),
    }

    checkpoint = tf.train.Checkpoint(model=model, optimizer=optimizer)
    latest_checkpoint = tf.train.latest_checkpoint(FLAGS.output_dir)
    initial_epoch = 0
    if latest_checkpoint:
      # checkpoint.restore must be within a strategy.scope() so that optimizer
      # slot variables are mirrored.
      checkpoint.restore(latest_checkpoint)
      logging.info('Loaded checkpoint %s', latest_checkpoint)
      # Resume from the epoch implied by the restored optimizer step count.
      initial_epoch = optimizer.iterations.numpy() // steps_per_epoch
    else:
      # load BERT from initial checkpoint
      bert_encoder, _, _ = bert_utils.load_bert_weight_from_ckpt(
          bert_model=bert_encoder,
          bert_ckpt_dir=bert_ckpt_dir,
          repl_patterns=ub.models.bert_sngp.CHECKPOINT_REPL_PATTERNS)
      logging.info('Loaded BERT checkpoint %s', bert_ckpt_dir)

  # Finally, define test metrics outside the accelerator scope for CPU eval.
  metrics.update({
      'test/negative_log_likelihood': tf.keras.metrics.Mean(),
      'test/accuracy': tf.keras.metrics.SparseCategoricalAccuracy(),
      'test/ece': um.ExpectedCalibrationError(num_bins=FLAGS.num_bins),
      'test/stddev': tf.keras.metrics.Mean(),
  })
  # Every non-'clean' test set gets its own name-suffixed metric group.
  for dataset_name in test_datasets:
    if dataset_name != 'clean':
      metrics.update({
          'test/nll_{}'.format(dataset_name):
              tf.keras.metrics.Mean(),
          'test/accuracy_{}'.format(dataset_name):
              tf.keras.metrics.SparseCategoricalAccuracy(),
          'test/ece_{}'.format(dataset_name):
              um.ExpectedCalibrationError(num_bins=FLAGS.num_bins),
          'test/stddev_{}'.format(dataset_name):
              tf.keras.metrics.Mean(),
      })
  metrics.update({
      'test/auroc_all': tf.keras.metrics.AUC(curve='ROC'),
      'test/auprc_all': tf.keras.metrics.AUC(curve='PR')
  })

  @tf.function
  def train_step(iterator):
    """Training StepFn; runs a single replica step per call."""

    def step_fn(inputs):
      """Per-Replica StepFn."""
      features, labels = bert_utils.create_feature_and_label(
          inputs, feature_size)

      with tf.GradientTape() as tape:
        # Set learning phase to enable dropout etc during training.
        logits = model(features, training=True)
        if isinstance(logits, tuple):
          # If model returns a tuple of (logits, covmat), extract logits
          logits, _ = logits
        if FLAGS.use_bfloat16:
          logits = tf.cast(logits, tf.float32)
        negative_log_likelihood = tf.reduce_mean(
            tf.keras.losses.sparse_categorical_crossentropy(
                labels, logits, from_logits=True))
        l2_loss = sum(model.losses)
        loss = negative_log_likelihood + l2_loss
        # Scale the loss given the TPUStrategy will reduce sum all gradients.
        scaled_loss = loss / strategy.num_replicas_in_sync

      grads = tape.gradient(scaled_loss, model.trainable_variables)
      optimizer.apply_gradients(zip(grads, model.trainable_variables))
      probs = tf.nn.softmax(logits)
      metrics['train/ece'].update_state(labels, probs)
      metrics['train/loss'].update_state(loss)
      metrics['train/negative_log_likelihood'].update_state(
          negative_log_likelihood)
      metrics['train/accuracy'].update_state(labels, logits)

    strategy.run(step_fn, args=(next(iterator),))

  @tf.function
  def test_step(iterator, dataset_name):
    """Evaluation StepFn; evaluates one batch of `dataset_name` per call."""

    def step_fn(inputs):
      """Per-Replica StepFn."""
      features, labels = bert_utils.create_feature_and_label(
          inputs, feature_size)

      # Compute ensemble prediction over Monte Carlo forward-pass samples.
      logits_list = []
      stddev_list = []
      for _ in range(FLAGS.num_mc_samples):
        logits = model(features, training=False)
        if isinstance(logits, tuple):
          # If model returns a tuple of (logits, covmat), extract both.
          logits, covmat = logits
        else:
          # No covariance returned: fall back to an identity matrix sized by
          # the evaluation batch size.
          covmat = tf.eye(FLAGS.eval_batch_size)
        if FLAGS.use_bfloat16:
          logits = tf.cast(logits, tf.float32)
          covmat = tf.cast(covmat, tf.float32)

        logits = ed.layers.utils.mean_field_logits(
            logits, covmat, mean_field_factor=FLAGS.gp_mean_field_factor)
        stddev = tf.sqrt(tf.linalg.diag_part(covmat))

        logits_list.append(logits)
        stddev_list.append(stddev)

      # Logits dimension is (num_samples, batch_size, num_classes).
      logits_list = tf.stack(logits_list, axis=0)
      stddev_list = tf.stack(stddev_list, axis=0)

      stddev = tf.reduce_mean(stddev_list, axis=0)
      probs_list = tf.nn.softmax(logits_list)
      probs = tf.reduce_mean(probs_list, axis=0)

      labels_broadcasted = tf.broadcast_to(
          labels, [FLAGS.num_mc_samples, labels.shape[0]])
      log_likelihoods = -tf.keras.losses.sparse_categorical_crossentropy(
          labels_broadcasted, logits_list, from_logits=True)
      # NLL of the sample-averaged predictive distribution:
      # -log(mean_s p_s(y)) = -logsumexp_s(log p_s(y)) + log(num_samples).
      negative_log_likelihood = tf.reduce_mean(
          -tf.reduce_logsumexp(log_likelihoods, axis=[0]) +
          tf.math.log(float(FLAGS.num_mc_samples)))

      if dataset_name == 'clean':
        metrics['test/negative_log_likelihood'].update_state(
            negative_log_likelihood)
        metrics['test/accuracy'].update_state(labels, probs)
        metrics['test/ece'].update_state(labels, probs)
        metrics['test/stddev'].update_state(stddev)
      else:
        metrics['test/nll_{}'.format(dataset_name)].update_state(
            negative_log_likelihood)
        metrics['test/accuracy_{}'.format(dataset_name)].update_state(
            labels, probs)
        metrics['test/ece_{}'.format(dataset_name)].update_state(
            labels, probs)
        metrics['test/stddev_{}'.format(dataset_name)].update_state(
            stddev)

      if dataset_name == 'all':
        # Label 150 is used as the positive (out-of-scope) class here;
        # presumably this equals num_classes - 1 -- TODO confirm.
        ood_labels = tf.cast(labels == 150, labels.dtype)
        # OOD score is one minus the maximum predicted class probability.
        ood_probs = 1. - tf.reduce_max(probs, axis=-1)
        metrics['test/auroc_{}'.format(dataset_name)].update_state(
            ood_labels, ood_probs)
        metrics['test/auprc_{}'.format(dataset_name)].update_state(
            ood_labels, ood_probs)

    # Evaluation steps are invoked directly rather than via strategy.run.
    step_fn(next(iterator))

  train_iterator = iter(train_dataset)
  start_time = time.time()
  max_steps = steps_per_epoch * FLAGS.train_epochs
  # Global step count at which this process started; non-zero when training
  # resumed from a checkpoint.
  resumed_steps = initial_epoch * steps_per_epoch
  for epoch in range(initial_epoch, FLAGS.train_epochs):
    logging.info('Starting to run epoch: %s', epoch)
    for step in range(steps_per_epoch):
      train_step(train_iterator)

      current_step = epoch * steps_per_epoch + (step + 1)
      time_elapsed = time.time() - start_time
      # Throughput must only count steps executed since `start_time`;
      # counting steps from earlier runs would inflate steps/s and
      # understate the ETA after a restart.
      steps_per_sec = float(current_step - resumed_steps) / time_elapsed
      eta_seconds = (max_steps - current_step) / steps_per_sec
      message = ('{:.1%} completion: epoch {:d}/{:d}. {:.1f} steps/s. '
                 'ETA: {:.0f} min. Time elapsed: {:.0f} min'.format(
                     current_step / max_steps, epoch + 1, FLAGS.train_epochs,
                     steps_per_sec, eta_seconds / 60, time_elapsed / 60))
      if step % 20 == 0:
        logging.info(message)

    if epoch % FLAGS.evaluation_interval == 0:
      for dataset_name, test_dataset in test_datasets.items():
        test_iterator = iter(test_dataset)
        logging.info('Testing on dataset %s', dataset_name)
        for step in range(steps_per_eval[dataset_name]):
          if step % 20 == 0:
            logging.info(
                'Starting to run eval step %s of epoch: %s', step, epoch)
          test_step(test_iterator, dataset_name)
        logging.info('Done with testing on %s', dataset_name)

      logging.info('Train Loss: %.4f, Accuracy: %.2f%%',
                   metrics['train/loss'].result(),
                   metrics['train/accuracy'].result() * 100)
      logging.info('Test NLL: %.4f, Accuracy: %.2f%%',
                   metrics['test/negative_log_likelihood'].result(),
                   metrics['test/accuracy'].result() * 100)
      total_results = {
          name: metric.result() for name, metric in metrics.items()
      }
      with summary_writer.as_default():
        for name, result in total_results.items():
          tf.summary.scalar(name, result, step=epoch + 1)

    # Start every epoch with fresh accumulators.
    for metric in metrics.values():
      metric.reset_states()

    if (FLAGS.checkpoint_interval > 0 and
        (epoch + 1) % FLAGS.checkpoint_interval == 0):
      checkpoint_name = checkpoint.save(
          os.path.join(FLAGS.output_dir, 'checkpoint'))
      logging.info('Saved checkpoint to %s', checkpoint_name)
def main(argv): del argv # unused arg tf.io.gfile.makedirs(FLAGS.output_dir) logging.info('Saving checkpoints at %s', FLAGS.output_dir) tf.random.set_seed(FLAGS.seed) if FLAGS.use_gpu: logging.info('Use GPU') strategy = tf.distribute.MirroredStrategy() else: logging.info('Use TPU at %s', FLAGS.tpu if FLAGS.tpu is not None else 'local') resolver = tf.distribute.cluster_resolver.TPUClusterResolver( tpu=FLAGS.tpu) tf.config.experimental_connect_to_cluster(resolver) tf.tpu.experimental.initialize_tpu_system(resolver) strategy = tf.distribute.experimental.TPUStrategy(resolver) batch_size = FLAGS.per_core_batch_size * FLAGS.num_cores test_batch_size = batch_size data_buffer_size = batch_size * 10 train_dataset_builder = ds.WikipediaToxicityDataset( batch_size=FLAGS.per_core_batch_size, eval_batch_size=FLAGS.per_core_batch_size, data_dir=FLAGS.in_dataset_dir, shuffle_buffer_size=data_buffer_size) ind_dataset_builder = ds.WikipediaToxicityDataset( batch_size=FLAGS.per_core_batch_size, eval_batch_size=FLAGS.per_core_batch_size, data_dir=FLAGS.in_dataset_dir, shuffle_buffer_size=data_buffer_size) ood_dataset_builder = ds.CivilCommentsDataset( batch_size=FLAGS.per_core_batch_size, eval_batch_size=FLAGS.per_core_batch_size, data_dir=FLAGS.ood_dataset_dir, shuffle_buffer_size=data_buffer_size) ood_identity_dataset_builder = ds.CivilCommentsIdentitiesDataset( batch_size=FLAGS.per_core_batch_size, eval_batch_size=FLAGS.per_core_batch_size, data_dir=FLAGS.identity_dataset_dir, shuffle_buffer_size=data_buffer_size) dataset_builders = { 'ind': ind_dataset_builder, 'ood': ood_dataset_builder, 'ood_identity': ood_identity_dataset_builder, } train_dataset = train_dataset_builder.build(split=base.Split.TRAIN) ds_info = train_dataset_builder.info num_classes = ds_info['num_classes'] # Positive and negative classes. 
steps_per_epoch = ds_info['num_train_examples'] // batch_size test_datasets = {} steps_per_eval = {} for dataset_name, dataset_builder in dataset_builders.items(): test_datasets[dataset_name] = dataset_builder.build( split=base.Split.TEST) steps_per_eval[dataset_name] = ( dataset_builder.info['num_test_examples'] // test_batch_size) if FLAGS.use_bfloat16: policy = tf.keras.mixed_precision.experimental.Policy('mixed_bfloat16') tf.keras.mixed_precision.experimental.set_policy(policy) summary_writer = tf.summary.create_file_writer( os.path.join(FLAGS.output_dir, 'summaries')) with strategy.scope(): logging.info('Building BERT %s model', FLAGS.bert_model_type) logging.info('use_gp_layer=%s', FLAGS.use_gp_layer) logging.info('use_spec_norm_att=%s', FLAGS.use_spec_norm_att) logging.info('use_spec_norm_ffn=%s', FLAGS.use_spec_norm_ffn) logging.info('use_layer_norm_att=%s', FLAGS.use_layer_norm_att) logging.info('use_layer_norm_ffn=%s', FLAGS.use_layer_norm_ffn) bert_config_dir, bert_ckpt_dir = resolve_bert_ckpt_and_config_dir( FLAGS.bert_dir, FLAGS.bert_config_dir, FLAGS.bert_ckpt_dir) bert_config = bert_utils.create_config(bert_config_dir) gp_layer_kwargs = dict(num_inducing=FLAGS.gp_hidden_dim, gp_kernel_scale=FLAGS.gp_scale, gp_output_bias=FLAGS.gp_bias, normalize_input=FLAGS.gp_input_normalization, gp_cov_momentum=FLAGS.gp_cov_discount_factor, gp_cov_ridge_penalty=FLAGS.gp_cov_ridge_penalty) spec_norm_kwargs = dict(iteration=FLAGS.spec_norm_iteration, norm_multiplier=FLAGS.spec_norm_bound) model, bert_encoder = ub.models.SngpBertBuilder( num_classes=num_classes, bert_config=bert_config, gp_layer_kwargs=gp_layer_kwargs, spec_norm_kwargs=spec_norm_kwargs, use_gp_layer=FLAGS.use_gp_layer, use_spec_norm_att=FLAGS.use_spec_norm_att, use_spec_norm_ffn=FLAGS.use_spec_norm_ffn, use_layer_norm_att=FLAGS.use_layer_norm_att, use_layer_norm_ffn=FLAGS.use_layer_norm_ffn, use_spec_norm_plr=FLAGS.use_spec_norm_plr) optimizer = bert_utils.create_optimizer( FLAGS.base_learning_rate, 
steps_per_epoch=steps_per_epoch, epochs=FLAGS.train_epochs, warmup_proportion=FLAGS.warmup_proportion) logging.info('Model input shape: %s', model.input_shape) logging.info('Model output shape: %s', model.output_shape) logging.info('Model number of weights: %s', model.count_params()) metrics = { 'train/negative_log_likelihood': tf.keras.metrics.Mean(), 'train/accuracy': tf.keras.metrics.Accuracy(), 'train/loss': tf.keras.metrics.Mean(), 'train/ece': um.ExpectedCalibrationError(num_bins=FLAGS.num_bins), } checkpoint = tf.train.Checkpoint(model=model, optimizer=optimizer) if FLAGS.prediction_mode: latest_checkpoint = tf.train.latest_checkpoint( FLAGS.eval_checkpoint_dir) else: latest_checkpoint = tf.train.latest_checkpoint(FLAGS.output_dir) initial_epoch = 0 if latest_checkpoint: # checkpoint.restore must be within a strategy.scope() so that optimizer # slot variables are mirrored. checkpoint.restore(latest_checkpoint) logging.info('Loaded checkpoint %s', latest_checkpoint) initial_epoch = optimizer.iterations.numpy() // steps_per_epoch else: # load BERT from initial checkpoint bert_encoder, _, _ = bert_utils.load_bert_weight_from_ckpt( bert_model=bert_encoder, bert_ckpt_dir=bert_ckpt_dir, repl_patterns=ub.models.bert_sngp.CHECKPOINT_REPL_PATTERNS) logging.info('Loaded BERT checkpoint %s', bert_ckpt_dir) # Finally, define test metrics outside the accelerator scope for CPU eval. 
metrics.update({ 'test/nll': tf.keras.metrics.Mean(), 'test/auroc': tf.keras.metrics.AUC(curve='ROC'), 'test/aupr': tf.keras.metrics.AUC(curve='PR'), 'test/brier': tf.keras.metrics.MeanSquaredError(), 'test/ece': um.ExpectedCalibrationError(num_bins=FLAGS.num_bins), 'test/eval_time': tf.keras.metrics.Mean(), 'test/stddev': tf.keras.metrics.Mean(), 'test/acc': tf.keras.metrics.Accuracy(), }) for fraction in FLAGS.fractions: metrics.update({ 'test_collab_acc/collab_acc_{}'.format(fraction): um.OracleCollaborativeAccuracy(fraction=float(fraction), num_bins=FLAGS.num_bins) }) for dataset_name, test_dataset in test_datasets.items(): if dataset_name != 'ind': metrics.update({ 'test/nll_{}'.format(dataset_name): tf.keras.metrics.Mean(), 'test/auroc_{}'.format(dataset_name): tf.keras.metrics.AUC(curve='ROC'), 'test/aupr_{}'.format(dataset_name): tf.keras.metrics.AUC(curve='PR'), 'test/brier_{}'.format(dataset_name): tf.keras.metrics.MeanSquaredError(), 'test/ece_{}'.format(dataset_name): um.ExpectedCalibrationError(num_bins=FLAGS.num_bins), 'test/eval_time_{}'.format(dataset_name): tf.keras.metrics.Mean(), 'test/stddev_{}'.format(dataset_name): tf.keras.metrics.Mean(), 'test/acc_{}'.format(dataset_name): tf.keras.metrics.Accuracy() }) for fraction in FLAGS.fractions: metrics.update({ 'test_collab_acc/collab_acc_{}_{}'.format( fraction, dataset_name): um.OracleCollaborativeAccuracy( fraction=float(fraction), num_bins=FLAGS.num_bins) }) @tf.function def train_step(iterator): """Training StepFn.""" def step_fn(inputs): """Per-Replica StepFn.""" features, labels, _ = create_feature_and_label(inputs) with tf.GradientTape() as tape: logits = model(features, training=True) if isinstance(logits, tuple): # If model returns a tuple of (logits, covmat), extract logits logits, _ = logits if FLAGS.use_bfloat16: logits = tf.cast(logits, tf.float32) loss_logits = tf.squeeze(logits, axis=1) if FLAGS.loss_type == 'cross_entropy': logging.info('Using cross entropy loss') 
negative_log_likelihood = tf.nn.sigmoid_cross_entropy_with_logits( labels, loss_logits) elif FLAGS.loss_type == 'mse': logging.info('Using mean squared error loss') loss_probs = tf.nn.sigmoid(loss_logits) negative_log_likelihood = tf.keras.losses.mean_squared_error( labels, loss_probs) elif FLAGS.loss_type == 'mae': logging.info('Using mean absolute error loss') loss_probs = tf.nn.sigmoid(loss_logits) negative_log_likelihood = tf.keras.losses.mean_absolute_error( labels, loss_probs) negative_log_likelihood = tf.reduce_mean( negative_log_likelihood) l2_loss = sum(model.losses) loss = negative_log_likelihood + l2_loss # Scale the loss given the TPUStrategy will reduce sum all gradients. scaled_loss = loss / strategy.num_replicas_in_sync grads = tape.gradient(scaled_loss, model.trainable_variables) optimizer.apply_gradients(zip(grads, model.trainable_variables)) probs = tf.nn.sigmoid(logits) # Cast labels to discrete for ECE computation. ece_labels = tf.cast(labels > FLAGS.ece_label_threshold, tf.float32) ece_probs = tf.concat([1. - probs, probs], axis=1) pred_labels = tf.math.argmax(ece_probs, axis=-1) metrics['train/accuracy'].update_state(labels, pred_labels) metrics['train/ece'].update_state(ece_labels, ece_probs) metrics['train/loss'].update_state(loss) metrics['train/negative_log_likelihood'].update_state( negative_log_likelihood) strategy.run(step_fn, args=(next(iterator), )) @tf.function def test_step(iterator, dataset_name): """Evaluation StepFn.""" def step_fn(inputs): """Per-Replica StepFn.""" features, labels, _ = create_feature_and_label(inputs) # Compute ensemble prediction over Monte Carlo forward-pass samples. logits_list = [] stddev_list = [] for _ in range(FLAGS.num_mc_samples): logits = model(features, training=False) if isinstance(logits, tuple): # If model returns a tuple of (logits, covmat), extract both. 
logits, covmat = logits else: covmat = tf.eye(test_batch_size) if FLAGS.use_bfloat16: logits = tf.cast(logits, tf.float32) covmat = tf.cast(covmat, tf.float32) logits = ed.layers.utils.mean_field_logits( logits, covmat, mean_field_factor=FLAGS.gp_mean_field_factor) stddev = tf.sqrt(tf.linalg.diag_part(covmat)) logits_list.append(logits) stddev_list.append(stddev) # Logits dimension is (num_samples, batch_size, num_classes). logits_list = tf.stack(logits_list, axis=0) stddev_list = tf.stack(stddev_list, axis=0) stddev = tf.reduce_mean(stddev_list, axis=0) probs_list = tf.nn.sigmoid(logits_list) probs = tf.reduce_mean(probs_list, axis=0) # Cast labels to discrete for ECE computation. ece_labels = tf.cast(labels > FLAGS.ece_label_threshold, tf.float32) ece_probs = tf.concat([1. - probs, probs], axis=1) pred_labels = tf.math.argmax(ece_probs, axis=-1) auc_probs = tf.squeeze(probs, axis=1) ce = tf.nn.sigmoid_cross_entropy_with_logits( labels=tf.broadcast_to( labels, [FLAGS.num_mc_samples, labels.shape[0]]), logits=tf.squeeze(logits_list, axis=-1)) negative_log_likelihood = -tf.reduce_logsumexp( -ce, axis=0) + tf.math.log(float(FLAGS.num_mc_samples)) negative_log_likelihood = tf.reduce_mean(negative_log_likelihood) if dataset_name == 'ind': metrics['test/nll'].update_state(negative_log_likelihood) metrics['test/auroc'].update_state(labels, auc_probs) metrics['test/aupr'].update_state(labels, auc_probs) metrics['test/brier'].update_state(labels, auc_probs) metrics['test/ece'].update_state(ece_labels, ece_probs) metrics['test/stddev'].update_state(stddev) metrics['test/acc'].update_state(ece_labels, pred_labels) for fraction in FLAGS.fractions: metrics['test_collab_acc/collab_acc_{}'.format( fraction)].update_state(ece_labels, ece_probs) else: metrics['test/nll_{}'.format(dataset_name)].update_state( negative_log_likelihood) metrics['test/auroc_{}'.format(dataset_name)].update_state( labels, auc_probs) metrics['test/aupr_{}'.format(dataset_name)].update_state( labels, 
auc_probs) metrics['test/brier_{}'.format(dataset_name)].update_state( labels, auc_probs) metrics['test/ece_{}'.format(dataset_name)].update_state( ece_labels, ece_probs) metrics['test/stddev_{}'.format(dataset_name)].update_state( stddev) metrics['test/acc_{}'.format(dataset_name)].update_state( ece_labels, pred_labels) for fraction in FLAGS.fractions: metrics['test_collab_acc/collab_acc_{}_{}'.format( fraction, dataset_name)].update_state(ece_labels, ece_probs) strategy.run(step_fn, args=(next(iterator), )) @tf.function def final_eval_step(iterator): """Final Evaluation StepFn to save prediction to directory.""" def step_fn(inputs): bert_features, labels, additional_labels = create_feature_and_label( inputs) logits = model(bert_features, training=False) if isinstance(logits, tuple): # If model returns a tuple of (logits, covmat), extract both. logits, covmat = logits else: covmat = tf.eye(test_batch_size) if FLAGS.use_bfloat16: logits = tf.cast(logits, tf.float32) covmat = tf.cast(covmat, tf.float32) logits = ed.layers.utils.mean_field_logits( logits, covmat, mean_field_factor=FLAGS.gp_mean_field_factor) features = inputs['input_ids'] return features, logits, labels, additional_labels (per_replica_texts, per_replica_logits, per_replica_labels, per_replica_additional_labels) = (strategy.run( step_fn, args=(next(iterator), ))) if strategy.num_replicas_in_sync > 1: texts_list = tf.concat(per_replica_texts.values, axis=0) logits_list = tf.concat(per_replica_logits.values, axis=0) labels_list = tf.concat(per_replica_labels.values, axis=0) additional_labels_dict = {} for additional_label in _IDENTITY_LABELS: if additional_label in per_replica_additional_labels: additional_labels_dict[additional_label] = tf.concat( per_replica_additional_labels[additional_label], axis=0) else: texts_list = per_replica_texts logits_list = per_replica_logits labels_list = per_replica_labels additional_labels_dict = {} for additional_label in _IDENTITY_LABELS: if additional_label in 
per_replica_additional_labels: additional_labels_dict[ additional_label] = per_replica_additional_labels[ additional_label] return texts_list, logits_list, labels_list, additional_labels_dict if FLAGS.prediction_mode: # Prediction and exit. for dataset_name, test_dataset in test_datasets.items(): test_iterator = iter(test_dataset) # pytype: disable=wrong-arg-types message = 'Final eval on dataset {}'.format(dataset_name) logging.info(message) texts_all = [] logits_all = [] labels_all = [] additional_labels_all_dict = {} if 'identity' in dataset_name: for identity_label_name in _IDENTITY_LABELS: additional_labels_all_dict[identity_label_name] = [] for step in range(steps_per_eval[dataset_name]): if step % 20 == 0: message = 'Starting to run eval step {}/{} of dataset: {}'.format( step, steps_per_eval[dataset_name], dataset_name) logging.info(message) try: (text_step, logits_step, labels_step, additional_labels_dict_step ) = final_eval_step(test_iterator) except tf.errors.OutOfRangeError: continue texts_all.append(text_step) logits_all.append(logits_step) labels_all.append(labels_step) if 'identity' in dataset_name: for identity_label_name in _IDENTITY_LABELS: additional_labels_all_dict[identity_label_name].append( additional_labels_dict_step[identity_label_name]) texts_all = tf.concat(texts_all, axis=0) logits_all = tf.concat(logits_all, axis=0) labels_all = tf.concat(labels_all, axis=0) additional_labels_all = [] if additional_labels_all_dict: for identity_label_name in _IDENTITY_LABELS: additional_labels_all.append( tf.concat( additional_labels_all_dict[identity_label_name], axis=0)) additional_labels_all = tf.convert_to_tensor(additional_labels_all) save_prediction(texts_all.numpy(), path=os.path.join(FLAGS.output_dir, 'texts_{}'.format(dataset_name))) save_prediction( labels_all.numpy(), path=os.path.join(FLAGS.output_dir, 'labels_{}'.format(dataset_name))) save_prediction( logits_all.numpy(), path=os.path.join(FLAGS.output_dir, 
'logits_{}'.format(dataset_name))) if 'identity' in dataset_name: save_prediction( additional_labels_all.numpy(), path=os.path.join( FLAGS.output_dir, 'additional_labels_{}'.format(dataset_name))) logging.info('Done with testing on %s', dataset_name) else: train_iterator = iter(train_dataset) start_time = time.time() for epoch in range(initial_epoch, FLAGS.train_epochs): logging.info('Starting to run epoch: %s', epoch) for step in range(steps_per_epoch): train_step(train_iterator) current_step = epoch * steps_per_epoch + (step + 1) max_steps = steps_per_epoch * FLAGS.train_epochs time_elapsed = time.time() - start_time steps_per_sec = float(current_step) / time_elapsed eta_seconds = (max_steps - current_step) / steps_per_sec message = ( '{:.1%} completion: epoch {:d}/{:d}. {:.1f} steps/s. ' 'ETA: {:.0f} min. Time elapsed: {:.0f} min'.format( current_step / max_steps, epoch + 1, FLAGS.train_epochs, steps_per_sec, eta_seconds / 60, time_elapsed / 60)) if step % 20 == 0: logging.info(message) if epoch % FLAGS.evaluation_interval == 0: for dataset_name, test_dataset in test_datasets.items(): test_iterator = iter(test_dataset) logging.info('Testing on dataset %s', dataset_name) for step in range(steps_per_eval[dataset_name]): if step % 20 == 0: logging.info( 'Starting to run eval step %s of epoch: %s', step, epoch) test_step(test_iterator, dataset_name) logging.info('Done with testing on %s', dataset_name) logging.info('Train Loss: %.4f, ECE: %.2f, Accuracy: %.2f', metrics['train/loss'].result(), metrics['train/ece'].result(), metrics['train/accuracy'].result()) total_results = { name: metric.result() for name, metric in metrics.items() } with summary_writer.as_default(): for name, result in total_results.items(): tf.summary.scalar(name, result, step=epoch + 1) for metric in metrics.values(): metric.reset_states() if (FLAGS.checkpoint_interval > 0 and (epoch + 1) % FLAGS.checkpoint_interval == 0): checkpoint_name = checkpoint.save( os.path.join(FLAGS.output_dir, 
'checkpoint')) logging.info('Saved checkpoint to %s', checkpoint_name) # Save model in SavedModel format on exit. final_save_name = os.path.join(FLAGS.output_dir, 'model') model.save(final_save_name) logging.info('Saved model to %s', final_save_name)
def main(argv):
    """Train and evaluate a TextCNN or BERT intent classifier on CLINC.

    Loads the training split and three test splits ('clean', 'ood', 'all'),
    builds the model selected by ``FLAGS.model_family``, resumes from the
    latest checkpoint found in ``FLAGS.output_dir`` (or, for BERT, from the
    pre-trained encoder checkpoint), and then runs the epoch loop with
    periodic evaluation, summary writing and checkpointing.

    Args:
        argv: Command-line arguments; unused.

    Raises:
        ValueError: If ``FLAGS.model_family`` is neither 'textcnn' nor 'bert'
            (compared case-insensitively).
    """
    del argv  # unused arg

    tf.io.gfile.makedirs(FLAGS.output_dir)
    logging.info('Saving checkpoints at %s', FLAGS.output_dir)
    tf.random.set_seed(FLAGS.seed)

    # Select the distribution strategy: mirrored GPUs or a TPU cluster.
    if FLAGS.use_gpu:
        logging.info('Use GPU')
        strategy = tf.distribute.MirroredStrategy()
    else:
        tpu_address = 'local' if FLAGS.tpu is None else FLAGS.tpu
        logging.info('Use TPU at %s', tpu_address)
        resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
            tpu=FLAGS.tpu)
        tf.config.experimental_connect_to_cluster(resolver)
        tf.tpu.experimental.initialize_tpu_system(resolver)
        strategy = tf.distribute.TPUStrategy(resolver)

    batch_size = FLAGS.per_core_batch_size * FLAGS.num_cores

    def _make_builder(split, data_mode):
        """Returns a CLINC dataset builder for the given split and mode."""
        return ub.datasets.ClincIntentDetectionDataset(
            split=split, data_dir=FLAGS.data_dir, data_mode=data_mode)

    train_dataset_builder = _make_builder('train', 'ind')
    ind_dataset_builder = _make_builder('test', 'ind')
    dataset_builders = {
        'clean': ind_dataset_builder,
        'ood': _make_builder('test', 'ood'),
        'all': _make_builder('test', 'all'),
    }

    train_dataset = train_dataset_builder.load(batch_size=batch_size)
    ds_info = train_dataset_builder.tfds_info
    feature_size = ds_info.metadata['feature_size']
    # One extra class on top of the valid intents for the out-of-scope intent.
    num_classes = ds_info.features['intent_label'].num_classes + 1
    # One extra entry on top of the valid tokens for out-of-vocabulary.
    vocab_size = ind_dataset_builder.tokenizer.num_words + 1
    steps_per_epoch = train_dataset_builder.num_examples // batch_size

    # Evaluation data and the number of full eval batches per split.
    test_datasets = {}
    steps_per_eval = {}
    for split_name, builder in dataset_builders.items():
        test_datasets[split_name] = builder.load(
            batch_size=FLAGS.eval_batch_size)
        steps_per_eval[split_name] = (
            builder.num_examples // FLAGS.eval_batch_size)

    if FLAGS.use_bfloat16:
        policy = tf.keras.mixed_precision.experimental.Policy(
            'mixed_bfloat16')
        tf.keras.mixed_precision.experimental.set_policy(policy)

    summary_dir = os.path.join(FLAGS.output_dir, 'summaries')
    summary_writer = tf.summary.create_file_writer(summary_dir)

    # Optional pre-made word embeddings, loaded as a NumPy array.
    premade_embedding_array = None
    if FLAGS.word_embedding_dir:
        with tf.io.gfile.GFile(FLAGS.word_embedding_dir,
                               'rb') as embedding_file:
            premade_embedding_array = np.load(embedding_file)

    with strategy.scope():
        logging.info('Building %s model', FLAGS.model_family)
        model_family = FLAGS.model_family.lower()
        if model_family == 'textcnn':
            model = cnn_model.textcnn(
                filter_sizes=[int(x) for x in FLAGS.filter_sizes],
                num_filters=FLAGS.num_filters,
                num_classes=num_classes,
                feature_size=feature_size,
                vocab_size=vocab_size,
                embed_size=FLAGS.embedding_size,
                dropout_rate=FLAGS.dropout_rate,
                l2=FLAGS.l2,
                premade_embedding_arr=premade_embedding_array)
            optimizer = tf.keras.optimizers.Adam(FLAGS.base_learning_rate)
        elif model_family == 'bert':
            bert_config_dir, bert_ckpt_dir = resolve_bert_ckpt_and_config_dir(
                FLAGS.bert_dir, FLAGS.bert_config_dir, FLAGS.bert_ckpt_dir)
            bert_config = bert_utils.create_config(bert_config_dir)
            model, bert_encoder = ub.models.BertBuilder(
                num_classes=num_classes,
                max_seq_length=feature_size,
                bert_config=bert_config)
            optimizer = bert_utils.create_optimizer(
                FLAGS.base_learning_rate,
                steps_per_epoch=steps_per_epoch,
                epochs=FLAGS.train_epochs,
                warmup_proportion=FLAGS.warmup_proportion)
        else:
            raise ValueError(
                'model_family ({}) can only be TextCNN or BERT.'.format(
                    FLAGS.model_family))

        logging.info('Model input shape: %s', model.input_shape)
        logging.info('Model output shape: %s', model.output_shape)
        logging.info('Model number of weights: %s', model.count_params())

        metrics = {
            'train/negative_log_likelihood': tf.keras.metrics.Mean(),
            'train/accuracy': tf.keras.metrics.SparseCategoricalAccuracy(),
            'train/loss': tf.keras.metrics.Mean(),
            'train/ece': um.ExpectedCalibrationError(num_bins=FLAGS.num_bins),
            'test/negative_log_likelihood': tf.keras.metrics.Mean(),
            'test/accuracy': tf.keras.metrics.SparseCategoricalAccuracy(),
            'test/ece': um.ExpectedCalibrationError(num_bins=FLAGS.num_bins),
        }

        # Every split other than 'clean' gets its own suffixed metric set.
        for split_name in test_datasets:
            if split_name == 'clean':
                continue
            metrics['test/nll_{}'.format(split_name)] = (
                tf.keras.metrics.Mean())
            metrics['test/accuracy_{}'.format(split_name)] = (
                tf.keras.metrics.SparseCategoricalAccuracy())
            metrics['test/ece_{}'.format(split_name)] = (
                um.ExpectedCalibrationError(num_bins=FLAGS.num_bins))

        checkpoint = tf.train.Checkpoint(model=model, optimizer=optimizer)
        latest_checkpoint = tf.train.latest_checkpoint(FLAGS.output_dir)
        initial_epoch = 0
        if latest_checkpoint:
            # checkpoint.restore must be within a strategy.scope() so that
            # optimizer slot variables are mirrored.
            checkpoint.restore(latest_checkpoint)
            logging.info('Loaded checkpoint %s', latest_checkpoint)
            initial_epoch = optimizer.iterations.numpy() // steps_per_epoch
        elif model_family == 'bert':
            # No training checkpoint yet: start from the initial BERT
            # encoder checkpoint.
            bert_checkpoint = tf.train.Checkpoint(model=bert_encoder)
            bert_checkpoint.restore(
                bert_ckpt_dir).assert_existing_objects_matched()
            logging.info('Loaded BERT checkpoint %s', bert_ckpt_dir)

    # Finally, define OOD metrics outside the accelerator scope for CPU eval.
    metrics['test/auroc_all'] = tf.keras.metrics.AUC(curve='ROC')
    metrics['test/auprc_all'] = tf.keras.metrics.AUC(curve='PR')

    @tf.function
    def train_step(iterator):
        """Training StepFn."""

        def step_fn(inputs):
            """Per-Replica StepFn."""
            features, labels = create_feature_and_label(
                inputs, feature_size, model_family=FLAGS.model_family)

            with tf.GradientTape() as tape:
                # training=True enables dropout etc. during training.
                logits = model(features, training=True)
                if FLAGS.use_bfloat16:
                    logits = tf.cast(logits, tf.float32)
                per_example_nll = (
                    tf.keras.losses.sparse_categorical_crossentropy(
                        labels, logits, from_logits=True))
                negative_log_likelihood = tf.reduce_mean(per_example_nll)
                l2_loss = sum(model.losses)
                loss = negative_log_likelihood + l2_loss
                # Scale the loss given the TPUStrategy will reduce sum all
                # gradients.
                scaled_loss = loss / strategy.num_replicas_in_sync

            trainable_vars = model.trainable_variables
            grads = tape.gradient(scaled_loss, trainable_vars)
            optimizer.apply_gradients(zip(grads, trainable_vars))

            probs = tf.nn.softmax(logits)
            metrics['train/ece'].update_state(labels, probs)
            metrics['train/loss'].update_state(loss)
            metrics['train/negative_log_likelihood'].update_state(
                negative_log_likelihood)
            metrics['train/accuracy'].update_state(labels, logits)

        strategy.run(step_fn, args=(next(iterator), ))

    @tf.function
    def test_step(iterator, dataset_name):
        """Evaluation StepFn."""

        def step_fn(inputs):
            """Per-Replica StepFn."""
            features, labels = create_feature_and_label(
                inputs, feature_size, model_family=FLAGS.model_family)

            # training=False disables dropout etc. during eval.
            logits = model(features, training=False)
            if FLAGS.use_bfloat16:
                logits = tf.cast(logits, tf.float32)
            probs = tf.nn.softmax(logits)
            negative_log_likelihood = tf.reduce_mean(
                tf.keras.losses.sparse_categorical_crossentropy(
                    labels, probs))

            # The 'clean' split uses the unsuffixed metric names.
            if dataset_name == 'clean':
                nll_key = 'test/negative_log_likelihood'
                accuracy_key = 'test/accuracy'
                ece_key = 'test/ece'
            else:
                nll_key = 'test/nll_{}'.format(dataset_name)
                accuracy_key = 'test/accuracy_{}'.format(dataset_name)
                ece_key = 'test/ece_{}'.format(dataset_name)

            metrics[nll_key].update_state(negative_log_likelihood)
            metrics[accuracy_key].update_state(labels, probs)
            metrics[ece_key].update_state(labels, probs)

            if dataset_name == 'all':
                # Label 150 is the positive class for OOD detection; the
                # score is one minus the maximum class probability.
                ood_labels = tf.cast(labels == 150, labels.dtype)
                ood_probs = 1. - tf.reduce_max(probs, axis=-1)
                metrics['test/auroc_{}'.format(dataset_name)].update_state(
                    ood_labels, ood_probs)
                metrics['test/auprc_{}'.format(dataset_name)].update_state(
                    ood_labels, ood_probs)

        # Called directly (not through strategy.run).
        step_fn(next(iterator))

    progress_template = (
        '{:.1%} completion: epoch {:d}/{:d}. {:.1f} steps/s. '
        'ETA: {:.0f} min. Time elapsed: {:.0f} min')

    train_iterator = iter(train_dataset)
    start_time = time.time()
    for epoch in range(initial_epoch, FLAGS.train_epochs):
        logging.info('Starting to run epoch: %s', epoch)
        for step in range(steps_per_epoch):
            train_step(train_iterator)

            current_step = epoch * steps_per_epoch + (step + 1)
            max_steps = steps_per_epoch * FLAGS.train_epochs
            time_elapsed = time.time() - start_time
            steps_per_sec = float(current_step) / time_elapsed
            eta_seconds = (max_steps - current_step) / steps_per_sec
            message = progress_template.format(
                current_step / max_steps, epoch + 1, FLAGS.train_epochs,
                steps_per_sec, eta_seconds / 60, time_elapsed / 60)
            if step % 20 == 0:
                logging.info(message)

        if epoch % FLAGS.evaluation_interval == 0:
            for eval_name, eval_dataset in test_datasets.items():
                test_iterator = iter(eval_dataset)
                logging.info('Testing on dataset %s', eval_name)
                for step in range(steps_per_eval[eval_name]):
                    if step % 20 == 0:
                        logging.info(
                            'Starting to run eval step %s of epoch: %s',
                            step, epoch)
                    test_step(test_iterator, eval_name)
                logging.info('Done with testing on %s', eval_name)

            logging.info('Train Loss: %.4f, Accuracy: %.2f%%',
                         metrics['train/loss'].result(),
                         metrics['train/accuracy'].result() * 100)
            logging.info('Test NLL: %.4f, Accuracy: %.2f%%',
                         metrics['test/negative_log_likelihood'].result(),
                         metrics['test/accuracy'].result() * 100)

            # Snapshot every metric first, then write the scalars.
            total_results = {}
            for name, metric in metrics.items():
                total_results[name] = metric.result()
            with summary_writer.as_default():
                for name, result in total_results.items():
                    tf.summary.scalar(name, result, step=epoch + 1)

        for metric in metrics.values():
            metric.reset_states()

        should_checkpoint = (
            FLAGS.checkpoint_interval > 0
            and (epoch + 1) % FLAGS.checkpoint_interval == 0)
        if should_checkpoint:
            checkpoint_name = checkpoint.save(
                os.path.join(FLAGS.output_dir, 'checkpoint'))
            logging.info('Saved checkpoint to %s', checkpoint_name)
def main(argv):
    """Train/evaluate a BERT toxicity classifier, or dump its predictions.

    Builds the Wikipedia Toxicity training data plus three test sets ('ind',
    'ood', 'ood_identity'), constructs a BERT model and optimizer under the
    selected distribution strategy, and restores the latest checkpoint when
    one exists.

    * When ``FLAGS.prediction_mode`` is set, the checkpoint is looked up in
      ``FLAGS.eval_checkpoint_dir`` and, for every test set, the input ids,
      labels, logits (and identity labels for datasets whose name contains
      'identity') are written under ``FLAGS.output_dir`` via
      ``save_prediction``.
    * Otherwise the train / eval loop runs, writing summaries and
      checkpoints, and finally exports the model to
      ``<FLAGS.output_dir>/model``.

    Args:
        argv: Command-line arguments; unused.
    """
    del argv  # unused arg
    tf.io.gfile.makedirs(FLAGS.output_dir)
    logging.info('Model checkpoint will be saved at %s', FLAGS.output_dir)
    tf.random.set_seed(FLAGS.seed)

    # Select the distribution strategy: mirrored GPUs or a TPU cluster.
    if FLAGS.use_gpu:
        logging.info('Use GPU')
        strategy = tf.distribute.MirroredStrategy()
    else:
        logging.info('Use TPU at %s',
                     FLAGS.tpu if FLAGS.tpu is not None else 'local')
        resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
            tpu=FLAGS.tpu)
        tf.config.experimental_connect_to_cluster(resolver)
        tf.tpu.experimental.initialize_tpu_system(resolver)
        strategy = tf.distribute.experimental.TPUStrategy(resolver)

    batch_size = FLAGS.per_core_batch_size * FLAGS.num_cores
    test_batch_size = batch_size
    data_buffer_size = batch_size * 10

    train_dataset_builder = ds.WikipediaToxicityDataset(
        batch_size=FLAGS.per_core_batch_size,
        eval_batch_size=FLAGS.per_core_batch_size,
        data_dir=FLAGS.in_dataset_dir,
        shuffle_buffer_size=data_buffer_size)
    ind_dataset_builder = ds.WikipediaToxicityDataset(
        batch_size=FLAGS.per_core_batch_size,
        eval_batch_size=FLAGS.per_core_batch_size,
        data_dir=FLAGS.in_dataset_dir,
        shuffle_buffer_size=data_buffer_size)
    ood_dataset_builder = ds.CivilCommentsDataset(
        batch_size=FLAGS.per_core_batch_size,
        eval_batch_size=FLAGS.per_core_batch_size,
        data_dir=FLAGS.ood_dataset_dir,
        shuffle_buffer_size=data_buffer_size)
    ood_identity_dataset_builder = ds.CivilCommentsIdentitiesDataset(
        batch_size=FLAGS.per_core_batch_size,
        eval_batch_size=FLAGS.per_core_batch_size,
        data_dir=FLAGS.identity_dataset_dir,
        shuffle_buffer_size=data_buffer_size)

    dataset_builders = {
        'ind': ind_dataset_builder,
        'ood': ood_dataset_builder,
        'ood_identity': ood_identity_dataset_builder,
    }

    train_dataset = train_dataset_builder.build(split=base.Split.TRAIN)

    ds_info = train_dataset_builder.info
    feature_size = _MAX_SEQ_LENGTH
    num_classes = ds_info['num_classes']  # Positive and negative classes.

    steps_per_epoch = ds_info['num_train_examples'] // batch_size

    # Evaluation data and the number of full eval batches per dataset.
    test_datasets = {}
    steps_per_eval = {}
    for dataset_name, dataset_builder in dataset_builders.items():
        test_datasets[dataset_name] = dataset_builder.build(
            split=base.Split.TEST)
        steps_per_eval[dataset_name] = (
            dataset_builder.info['num_test_examples'] // test_batch_size)

    if FLAGS.use_bfloat16:
        policy = tf.keras.mixed_precision.experimental.Policy(
            'mixed_bfloat16')
        tf.keras.mixed_precision.experimental.set_policy(policy)

    summary_writer = tf.summary.create_file_writer(
        os.path.join(FLAGS.output_dir, 'summaries'))

    with strategy.scope():
        logging.info('Building %s model', FLAGS.model_family)

        bert_config_dir, bert_ckpt_dir = resolve_bert_ckpt_and_config_dir(
            FLAGS.bert_dir, FLAGS.bert_config_dir, FLAGS.bert_ckpt_dir)
        bert_config = bert_utils.create_config(bert_config_dir)
        model, bert_encoder = ub.models.BertBuilder(
            num_classes=num_classes,
            max_seq_length=feature_size,
            bert_config=bert_config)

        optimizer = bert_utils.create_optimizer(
            FLAGS.base_learning_rate,
            steps_per_epoch=steps_per_epoch,
            epochs=FLAGS.train_epochs,
            warmup_proportion=FLAGS.warmup_proportion)

        logging.info('Model input shape: %s', model.input_shape)
        logging.info('Model output shape: %s', model.output_shape)
        logging.info('Model number of weights: %s', model.count_params())

        metrics = {
            'train/negative_log_likelihood': tf.keras.metrics.Mean(),
            'train/auroc': tf.keras.metrics.AUC(),
            'train/loss': tf.keras.metrics.Mean(),
            'train/ece': um.ExpectedCalibrationError(num_bins=FLAGS.num_bins),
        }

        checkpoint = tf.train.Checkpoint(model=model, optimizer=optimizer)
        # Prediction mode reads from a dedicated checkpoint directory;
        # training resumes from the output directory.
        if FLAGS.prediction_mode:
            latest_checkpoint = tf.train.latest_checkpoint(
                FLAGS.eval_checkpoint_dir)
        else:
            latest_checkpoint = tf.train.latest_checkpoint(FLAGS.output_dir)
        initial_epoch = 0
        if latest_checkpoint:
            # checkpoint.restore must be within a strategy.scope() so that
            # optimizer slot variables are mirrored.
            checkpoint.restore(latest_checkpoint)
            logging.info('Loaded checkpoint %s', latest_checkpoint)
            initial_epoch = optimizer.iterations.numpy() // steps_per_epoch
        elif FLAGS.model_family.lower() == 'bert':
            # load BERT from initial checkpoint
            bert_checkpoint = tf.train.Checkpoint(model=bert_encoder)
            bert_checkpoint.restore(
                bert_ckpt_dir).assert_existing_objects_matched()
            logging.info('Loaded BERT checkpoint %s', bert_ckpt_dir)

        # Test metrics are updated from within strategy.run below, so they
        # are created under the strategy scope as well.
        metrics.update({
            'test/negative_log_likelihood': tf.keras.metrics.Mean(),
            'test/auroc': tf.keras.metrics.AUC(curve='ROC'),
            'test/aupr': tf.keras.metrics.AUC(curve='PR'),
            'test/brier': tf.keras.metrics.MeanSquaredError(),
            'test/ece': um.ExpectedCalibrationError(num_bins=FLAGS.num_bins),
            'test/eval_time': tf.keras.metrics.Mean(),
        })
        for fraction in FLAGS.fractions:
            metrics.update({
                'test_collab_acc/collab_acc_{}'.format(fraction):
                    um.OracleCollaborativeAccuracy(
                        fraction=float(fraction), num_bins=FLAGS.num_bins)
            })
        # Every dataset other than 'ind' gets its own suffixed metric set.
        for dataset_name, test_dataset in test_datasets.items():
            if dataset_name != 'ind':
                metrics.update({
                    'test/nll_{}'.format(dataset_name):
                        tf.keras.metrics.Mean(),
                    'test/auroc_{}'.format(dataset_name):
                        tf.keras.metrics.AUC(curve='ROC'),
                    'test/aupr_{}'.format(dataset_name):
                        tf.keras.metrics.AUC(curve='PR'),
                    'test/brier_{}'.format(dataset_name):
                        tf.keras.metrics.MeanSquaredError(),
                    'test/ece_{}'.format(dataset_name):
                        um.ExpectedCalibrationError(num_bins=FLAGS.num_bins),
                    'test/eval_time_{}'.format(dataset_name):
                        tf.keras.metrics.Mean()
                })
                for fraction in FLAGS.fractions:
                    metrics.update({
                        'test_collab_acc/collab_acc_{}_{}'.format(
                            fraction, dataset_name):
                            um.OracleCollaborativeAccuracy(
                                fraction=float(fraction),
                                num_bins=FLAGS.num_bins)
                    })

    @tf.function
    def train_step(iterator):
        """Training StepFn."""

        def step_fn(inputs):
            """Per-Replica StepFn."""
            features, labels, _ = create_feature_and_label(inputs)

            with tf.GradientTape() as tape:
                logits = model(features, training=True)

                if FLAGS.use_bfloat16:
                    logits = tf.cast(logits, tf.float32)

                logging.info('labels shape %s', labels.shape)
                logging.info('logits shape %s', logits.shape)

                # Drop axis 1 of the logits before computing the loss
                # against `labels`.
                loss_logits = tf.squeeze(logits, axis=1)
                if FLAGS.loss_type == 'cross_entropy':
                    logging.info('Using cross entropy loss')
                    negative_log_likelihood = (
                        tf.nn.sigmoid_cross_entropy_with_logits(
                            labels, loss_logits))
                elif FLAGS.loss_type == 'mse':
                    logging.info('Using mean squared error loss')
                    loss_probs = tf.nn.sigmoid(loss_logits)
                    negative_log_likelihood = (
                        tf.keras.losses.mean_squared_error(
                            labels, loss_probs))
                elif FLAGS.loss_type == 'mae':
                    logging.info('Using mean absolute error loss')
                    loss_probs = tf.nn.sigmoid(loss_logits)
                    negative_log_likelihood = (
                        tf.keras.losses.mean_absolute_error(
                            labels, loss_probs))

                negative_log_likelihood = tf.reduce_mean(
                    negative_log_likelihood)

                l2_loss = sum(model.losses)
                loss = negative_log_likelihood + l2_loss
                # Scale the loss given the TPUStrategy will reduce sum all
                # gradients.
                scaled_loss = loss / strategy.num_replicas_in_sync

            grads = tape.gradient(scaled_loss, model.trainable_variables)
            optimizer.apply_gradients(zip(grads, model.trainable_variables))

            probs = tf.nn.sigmoid(logits)
            # Cast labels to discrete for ECE computation.
            ece_labels = tf.cast(labels > FLAGS.ece_label_threshold,
                                 tf.float32)
            # Two-column [1 - p, p] probabilities for the ECE metric.
            ece_probs = tf.concat([1. - probs, probs], axis=1)
            auc_probs = tf.squeeze(probs, axis=1)

            metrics['train/ece'].update_state(ece_labels, ece_probs)
            metrics['train/loss'].update_state(loss)
            metrics['train/negative_log_likelihood'].update_state(
                negative_log_likelihood)
            metrics['train/auroc'].update_state(labels, auc_probs)

        strategy.run(step_fn, args=(next(iterator), ))

    @tf.function
    def test_step(iterator, dataset_name):
        """Evaluation StepFn to log metrics."""

        def step_fn(inputs):
            """Per-Replica StepFn."""
            features, labels, _ = create_feature_and_label(inputs)

            # NOTE(review): time.time() is plain Python inside a
            # tf.function, so this presumably times graph tracing rather
            # than each executed step -- confirm before relying on
            # 'test/eval_time'.
            eval_start_time = time.time()
            logits = model(features, training=False)
            eval_time = (time.time() -
                         eval_start_time) / FLAGS.per_core_batch_size

            if FLAGS.use_bfloat16:
                logits = tf.cast(logits, tf.float32)
            probs = tf.nn.sigmoid(logits)
            # Cast labels to discrete for ECE computation.
            ece_labels = tf.cast(labels > FLAGS.ece_label_threshold,
                                 tf.float32)
            ece_probs = tf.concat([1. - probs, probs], axis=1)
            auc_probs = tf.squeeze(probs, axis=1)

            loss_logits = tf.squeeze(logits, axis=1)
            negative_log_likelihood = tf.reduce_mean(
                tf.nn.sigmoid_cross_entropy_with_logits(labels, loss_logits))

            # The 'ind' dataset uses the unsuffixed metric names.
            if dataset_name == 'ind':
                metrics['test/negative_log_likelihood'].update_state(
                    negative_log_likelihood)
                metrics['test/auroc'].update_state(labels, auc_probs)
                metrics['test/aupr'].update_state(labels, auc_probs)
                metrics['test/brier'].update_state(labels, auc_probs)
                metrics['test/ece'].update_state(ece_labels, ece_probs)
                metrics['test/eval_time'].update_state(eval_time)
                for fraction in FLAGS.fractions:
                    metrics['test_collab_acc/collab_acc_{}'.format(
                        fraction)].update_state(ece_labels, ece_probs)
            else:
                metrics['test/nll_{}'.format(dataset_name)].update_state(
                    negative_log_likelihood)
                metrics['test/auroc_{}'.format(dataset_name)].update_state(
                    labels, auc_probs)
                metrics['test/aupr_{}'.format(dataset_name)].update_state(
                    labels, auc_probs)
                metrics['test/brier_{}'.format(dataset_name)].update_state(
                    labels, auc_probs)
                metrics['test/ece_{}'.format(dataset_name)].update_state(
                    ece_labels, ece_probs)
                metrics['test/eval_time_{}'.format(
                    dataset_name)].update_state(eval_time)
                for fraction in FLAGS.fractions:
                    metrics['test_collab_acc/collab_acc_{}_{}'.format(
                        fraction,
                        dataset_name)].update_state(ece_labels, ece_probs)

        strategy.run(step_fn, args=(next(iterator), ))

    @tf.function
    def final_eval_step(iterator):
        """Final Evaluation StepFn to save prediction to directory."""

        def step_fn(inputs):
            """Per-Replica StepFn returning ids, logits and labels."""
            bert_features, labels, additional_labels = (
                create_feature_and_label(inputs))
            logits = model(bert_features, training=False)
            features = inputs['input_ids']
            return features, logits, labels, additional_labels

        (per_replica_texts, per_replica_logits, per_replica_labels,
         per_replica_additional_labels) = (strategy.run(
             step_fn, args=(next(iterator), )))

        if strategy.num_replicas_in_sync > 1:
            # Merge the per-replica results along the batch axis.
            texts_list = tf.concat(per_replica_texts.values, axis=0)
            logits_list = tf.concat(per_replica_logits.values, axis=0)
            labels_list = tf.concat(per_replica_labels.values, axis=0)
            additional_labels_dict = {}
            for additional_label in _IDENTITY_LABELS:
                if additional_label in per_replica_additional_labels:
                    # NOTE(review): unlike the three concats above this one
                    # does not go through `.values` -- confirm whether the
                    # per-replica identity labels need the same treatment.
                    additional_labels_dict[additional_label] = tf.concat(
                        per_replica_additional_labels[additional_label],
                        axis=0)
        else:
            texts_list = per_replica_texts
            logits_list = per_replica_logits
            labels_list = per_replica_labels
            additional_labels_dict = {}
            for additional_label in _IDENTITY_LABELS:
                if additional_label in per_replica_additional_labels:
                    additional_labels_dict[
                        additional_label] = per_replica_additional_labels[
                            additional_label]

        return texts_list, logits_list, labels_list, additional_labels_dict

    if FLAGS.prediction_mode:
        # Prediction and exit.
        for dataset_name, test_dataset in test_datasets.items():
            test_iterator = iter(test_dataset)  # pytype: disable=wrong-arg-types

            message = 'Final eval on dataset {}'.format(dataset_name)
            logging.info(message)

            texts_all = []
            logits_all = []
            labels_all = []
            additional_labels_all_dict = {}
            # Identity labels are only collected for 'identity' datasets.
            if 'identity' in dataset_name:
                for identity_label_name in _IDENTITY_LABELS:
                    additional_labels_all_dict[identity_label_name] = []

            for step in range(steps_per_eval[dataset_name]):
                if step % 20 == 0:
                    message = (
                        'Starting to run eval step {}/{} of dataset: {}'
                        .format(step, steps_per_eval[dataset_name],
                                dataset_name))
                    logging.info(message)

                try:
                    (text_step, logits_step, labels_step,
                     additional_labels_dict_step
                    ) = final_eval_step(test_iterator)
                except tf.errors.OutOfRangeError:
                    # Dataset exhausted before the expected step count.
                    continue

                texts_all.append(text_step)
                logits_all.append(logits_step)
                labels_all.append(labels_step)
                if 'identity' in dataset_name:
                    for identity_label_name in _IDENTITY_LABELS:
                        additional_labels_all_dict[
                            identity_label_name].append(
                                additional_labels_dict_step[
                                    identity_label_name])

            texts_all = tf.concat(texts_all, axis=0)
            logits_all = tf.concat(logits_all, axis=0)
            labels_all = tf.concat(labels_all, axis=0)
            additional_labels_all = []
            if additional_labels_all_dict:
                for identity_label_name in _IDENTITY_LABELS:
                    additional_labels_all.append(
                        tf.concat(
                            additional_labels_all_dict[identity_label_name],
                            axis=0))
            additional_labels_all = tf.convert_to_tensor(
                additional_labels_all)

            save_prediction(
                texts_all.numpy(),
                path=os.path.join(FLAGS.output_dir,
                                  'texts_{}'.format(dataset_name)))
            save_prediction(
                labels_all.numpy(),
                path=os.path.join(FLAGS.output_dir,
                                  'labels_{}'.format(dataset_name)))
            save_prediction(
                logits_all.numpy(),
                path=os.path.join(FLAGS.output_dir,
                                  'logits_{}'.format(dataset_name)))
            if 'identity' in dataset_name:
                save_prediction(
                    additional_labels_all.numpy(),
                    path=os.path.join(
                        FLAGS.output_dir,
                        'additional_labels_{}'.format(dataset_name)))
            logging.info('Done with testing on %s', dataset_name)

    else:
        # Execute train / eval loop.
        train_iterator = iter(train_dataset)  # pytype: disable=wrong-arg-types
        start_time = time.time()
        for epoch in range(initial_epoch, FLAGS.train_epochs):
            logging.info('Starting to run epoch: %s', epoch)
            for step in range(steps_per_epoch):
                train_step(train_iterator)

                current_step = epoch * steps_per_epoch + (step + 1)
                max_steps = steps_per_epoch * FLAGS.train_epochs
                time_elapsed = time.time() - start_time
                steps_per_sec = float(current_step) / time_elapsed
                eta_seconds = (max_steps - current_step) / steps_per_sec
                message = (
                    '{:.1%} completion: epoch {:d}/{:d}. {:.1f} steps/s. '
                    'ETA: {:.0f} min. Time elapsed: {:.0f} min'.format(
                        current_step / max_steps, epoch + 1,
                        FLAGS.train_epochs, steps_per_sec, eta_seconds / 60,
                        time_elapsed / 60))
                if step % 20 == 0:
                    logging.info(message)

            if epoch % FLAGS.evaluation_interval == 0:
                for dataset_name, test_dataset in test_datasets.items():
                    test_iterator = iter(test_dataset)  # pytype: disable=wrong-arg-types
                    logging.info('Testing on dataset %s', dataset_name)

                    for step in range(steps_per_eval[dataset_name]):
                        if step % 20 == 0:
                            logging.info(
                                'Starting to run eval step %s/%s of epoch: %s',
                                step, steps_per_eval[dataset_name], epoch)

                        try:
                            test_step(test_iterator, dataset_name)
                        except (StopIteration, tf.errors.OutOfRangeError):
                            # test_step is a tf.function, so an exhausted
                            # iterator surfaces as OutOfRangeError rather
                            # than StopIteration; handle both, matching the
                            # prediction loop above.
                            continue
                    logging.info('Done with testing on %s', dataset_name)

                logging.info('Train Loss: %.4f, AUROC: %.4f',
                             metrics['train/loss'].result(),
                             metrics['train/auroc'].result())
                logging.info('Test NLL: %.4f, AUROC: %.4f',
                             metrics['test/negative_log_likelihood'].result(),
                             metrics['test/auroc'].result())

                # record results
                total_results = {}
                for name, metric in metrics.items():
                    total_results[name] = metric.result()

                with summary_writer.as_default():
                    for name, result in total_results.items():
                        tf.summary.scalar(name, result, step=epoch + 1)

            for name, metric in metrics.items():
                metric.reset_states()

            if (FLAGS.checkpoint_interval > 0 and
                    (epoch + 1) % FLAGS.checkpoint_interval == 0):
                checkpoint_name = checkpoint.save(
                    os.path.join(FLAGS.output_dir, 'checkpoint'))
                logging.info('Saved checkpoint to %s', checkpoint_name)

        # Save model in SavedModel format on exit.
        final_save_name = os.path.join(FLAGS.output_dir, 'model')
        model.save(final_save_name)
        logging.info('Saved model to %s', final_save_name)