# NOTE: This import block is an assumption. It follows the layout of the
# upstream SimCLR TF2 code (data.py, model.py, objective.py, metrics.py);
# adjust the module names if this fork organizes them differently. Helpers
# such as set_seed, try_restore_from_checkpoint, save, json_serializable and
# style_preprocessing_decoder are assumed to be defined elsewhere in this
# module.
import json
import math
import os

from absl import app
from absl import flags
from absl import logging
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_hub as hub

import data as data_lib
import metrics
import model as model_lib
import objective as obj_lib

FLAGS = flags.FLAGS


def check(model, builder, eval_steps, ckpt, strategy, topology):
  """Counts the predicted class distribution over the eval split."""
  # Build input pipeline.
  ds = data_lib.build_distributed_dataset(builder, FLAGS.eval_batch_size,
                                          False, strategy, topology)

  with strategy.scope():
    # Restore checkpoint.
    logging.info('Restoring from %s', ckpt)
    checkpoint = tf.train.Checkpoint(
        model=model, global_step=tf.Variable(0, dtype=tf.int64))
    checkpoint.restore(ckpt).expect_partial()
    global_step = checkpoint.global_step
    logging.info('Performing eval at step %d', global_step.numpy())

  # One bucket per class (this dataset has 5 classes).
  preds_count = np.zeros(5)

  def single_step(features, labels):
    _, supervised_head_outputs = model(features, training=False)
    assert supervised_head_outputs is not None
    outputs = supervised_head_outputs.numpy()
    pred = np.argmax(outputs, axis=1)
    u, counts = np.unique(pred, return_counts=True)
    for i, j in zip(u, counts):
      preds_count[i] += j

  with strategy.scope():
    # Not wrapped in tf.function: single_step relies on .numpy() and a
    # Python-side counter, so it must run eagerly.
    def run_single_step(iterator):
      images, labels = next(iterator)
      features, labels = images, {'labels': labels}
      strategy.run(single_step, (features, labels))

    iterator = iter(ds)
    for i in range(eval_steps):
      run_single_step(iterator)
      logging.info('Completed eval for %d / %d steps', i + 1, eval_steps)
    logging.info('Finished eval for %s', ckpt)
    logging.info('pred counts %s', preds_count)

  # main() checks result['global_step'], so return a dict rather than the
  # bare counts array.
  return {'global_step': global_step.numpy(), 'preds_count': preds_count}

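# The dict returned by check() is consumed in main() via its 'global_step'
# key; the raw counts can also be inspected directly. `summarize_counts`
# below is a hypothetical helper, not part of the original script: a minimal
# sketch of turning the per-class counts into prediction fractions for a
# quick look at class balance.
def summarize_counts(result):
  """Returns per-class prediction fractions from a check() result."""
  counts = result['preds_count']
  total = counts.sum()
  # Guard against an empty eval run.
  return counts / total if total > 0 else counts
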
def main(argv):
  if len(argv) > 1:
    raise app.UsageError('Too many command-line arguments.')

  '''
  builder = tfds.builder(FLAGS.dataset, data_dir=FLAGS.data_dir)
  builder.download_and_prepare()
  num_train_examples = builder.info.splits[FLAGS.train_split].num_examples
  num_eval_examples = builder.info.splits[FLAGS.eval_split].num_examples
  num_classes = builder.info.features['label'].num_classes
  '''

  if tf.executing_eagerly():
    print('Eager mode now!')
  else:
    print('not Eager mode!')

  set_seed(FLAGS.seed)

  # build_distributed_dataset in data.py has been rewritten so that it can
  # take this DataFrame in place of a TFDS builder.
  # Inside build_distributed_dataset the data object is only used to build
  # the input pipeline (probably).
  builder = pd.read_csv(FLAGS.data_path + 'train.csv')
  # builder_s = builder.sample(frac=1, random_state=FLAGS.seed).reset_index(drop=True)
  # num_fold = len(builder) // 5
  # test_builder = builder[num_fold * FLAGS.fold: num_fold * (FLAGS.fold + 1)]
  # train_builder = builder.drop(builder.index[num_fold * FLAGS.fold: num_fold * (FLAGS.fold + 1)])
  kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=FLAGS.seed)
  i = 0
  for train_idx, test_idx in kf.split(builder['image_id'], builder['label']):
    if i == FLAGS.fold:
      train_builder = builder.iloc[train_idx]
      test_builder = builder.iloc[test_idx]
    i += 1
  # train_builder, test_builder = train_test_split(builder, test_size=FLAGS.test_ratio, stratify=builder['label'], random_state=1)
  num_train_examples = len(train_builder)
  num_eval_examples = len(test_builder)
  num_classes = 5

  train_steps = model_lib.get_train_steps(num_train_examples)
  eval_steps = FLAGS.eval_steps or int(
      math.ceil(num_eval_examples / FLAGS.eval_batch_size))

  if FLAGS.train_mode == 'finetune':
    # Extract the subset of the training fold to use as supervised data.
    if FLAGS.stratify:
      train_builder, _ = train_test_split(
          train_builder,
          train_size=FLAGS.supervised_ratio,
          random_state=FLAGS.seed,
          stratify=train_builder['label'])
      print(train_builder['label'].value_counts())
    else:
      train_builder, _ = train_test_split(
          train_builder,
          train_size=FLAGS.supervised_ratio,
          random_state=FLAGS.seed)
    num_train_examples = len(train_builder)
    train_steps = (FLAGS.pretrain_steps +
                   num_train_examples * FLAGS.train_epochs //
                   FLAGS.train_batch_size)

  # From here on, builder only appears as an argument to
  # build_distributed_dataset.
  epoch_steps = int(round(num_train_examples / FLAGS.train_batch_size))

  logging.info('# train examples: %d', num_train_examples)
  logging.info('# train_steps: %d', train_steps)
  logging.info('# eval examples: %d', num_eval_examples)
  logging.info('# eval steps: %d', eval_steps)

  checkpoint_steps = (
      FLAGS.checkpoint_steps or (FLAGS.checkpoint_epochs * epoch_steps))

  topology = None
  if FLAGS.use_tpu:
    if FLAGS.tpu_name:
      cluster = tf.distribute.cluster_resolver.TPUClusterResolver(
          FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
    else:
      cluster = tf.distribute.cluster_resolver.TPUClusterResolver(FLAGS.master)
    tf.config.experimental_connect_to_cluster(cluster)
    topology = tf.tpu.experimental.initialize_tpu_system(cluster)
    logging.info('Topology:')
    logging.info('num_tasks: %d', topology.num_tasks)
    logging.info('num_tpus_per_task: %d', topology.num_tpus_per_task)
    strategy = tf.distribute.experimental.TPUStrategy(cluster)
  else:
    # For (multiple) GPUs.
    strategy = tf.distribute.MirroredStrategy()
    logging.info('Running using MirroredStrategy on %d replicas',
                 strategy.num_replicas_in_sync)

  with strategy.scope():
    model = model_lib.Model(num_classes)

  if FLAGS.mode == 'check':
    for ckpt in tf.train.checkpoints_iterator(
        FLAGS.model_dir, min_interval_secs=15):
      result = check(model, test_builder, eval_steps, ckpt, strategy, topology)
      if result['global_step'] >= train_steps:
        logging.info('Eval complete. Exiting...')
        return
  elif FLAGS.mode == 'eval':
    for ckpt in tf.train.checkpoints_iterator(
        FLAGS.model_dir, min_interval_secs=15):
      result = perform_evaluation(model, test_builder, eval_steps, ckpt,
                                  strategy, topology)
      if result['global_step'] >= train_steps:
        logging.info('Eval complete. Exiting...')
        return
  else:
    summary_writer = tf.summary.create_file_writer(FLAGS.model_dir)
    with strategy.scope():
      # Build input pipeline.
      ds = data_lib.build_distributed_dataset(train_builder,
                                              FLAGS.train_batch_size, True,
                                              strategy, topology)

      # Build LR schedule and optimizer.
      learning_rate = model_lib.WarmUpAndCosineDecay(FLAGS.learning_rate,
                                                     num_train_examples)
      optimizer = model_lib.build_optimizer(learning_rate)

      # Build metrics.
      all_metrics = []  # For summaries.
      weight_decay_metric = tf.keras.metrics.Mean('train/weight_decay')
      total_loss_metric = tf.keras.metrics.Mean('train/total_loss')
      all_metrics.extend([weight_decay_metric, total_loss_metric])
      if FLAGS.train_mode == 'pretrain':
        contrast_loss_metric = tf.keras.metrics.Mean('train/contrast_loss')
        contrast_acc_metric = tf.keras.metrics.Mean('train/contrast_acc')
        contrast_entropy_metric = tf.keras.metrics.Mean(
            'train/contrast_entropy')
        all_metrics.extend([
            contrast_loss_metric, contrast_acc_metric, contrast_entropy_metric
        ])
      if FLAGS.train_mode == 'finetune' or FLAGS.lineareval_while_pretraining:
        supervised_loss_metric = tf.keras.metrics.Mean('train/supervised_loss')
        supervised_acc_metric = tf.keras.metrics.Mean('train/supervised_acc')
        all_metrics.extend([supervised_loss_metric, supervised_acc_metric])

      # Restore checkpoint if available.
      checkpoint_manager = try_restore_from_checkpoint(
          model, optimizer.iterations, optimizer)

    steps_per_loop = checkpoint_steps

    def single_step(features, labels):
      with tf.GradientTape() as tape:
        # Log summaries on the last step of the training loop to match
        # logging frequency of other scalar summaries.
        #
        # Notes:
        # 1. Summary ops on TPUs get outside compiled so they do not affect
        #    performance.
        # 2. Summaries are recorded only on replica 0. So effectively this
        #    summary would be written once per host when should_record == True.
        # 3. optimizer.iterations is incremented in the call to
        #    apply_gradients. So we use `iterations + 1` here so that the step
        #    number matches those of scalar summaries.
        # 4. We intentionally run the summary op before the actual model
        #    training so that it can run in parallel.
        should_record = tf.equal((optimizer.iterations + 1) % steps_per_loop,
                                 0)
        with tf.summary.record_if(should_record):
          # Only log augmented images for the first tower.
          tf.summary.image(
              'image', features[:, :, :, :3], step=optimizer.iterations + 1)

        projection_head_outputs, supervised_head_outputs = model(
            features, training=True)
        loss = None
        if projection_head_outputs is not None:
          outputs = projection_head_outputs
          con_loss, logits_con, labels_con = obj_lib.add_contrastive_loss(
              outputs,
              hidden_norm=FLAGS.hidden_norm,
              temperature=FLAGS.temperature,
              strategy=strategy)
          if loss is None:
            loss = con_loss
          else:
            loss += con_loss
          metrics.update_pretrain_metrics_train(contrast_loss_metric,
                                                contrast_acc_metric,
                                                contrast_entropy_metric,
                                                con_loss, logits_con,
                                                labels_con)
        if supervised_head_outputs is not None:
          outputs = supervised_head_outputs
          l = labels['labels']
          if FLAGS.train_mode == 'pretrain' and FLAGS.lineareval_while_pretraining:
            l = tf.concat([l, l], 0)
          sup_loss = obj_lib.add_supervised_loss(labels=l, logits=outputs)
          if loss is None:
            loss = sup_loss
          else:
            loss += sup_loss
          metrics.update_finetune_metrics_train(supervised_loss_metric,
                                                supervised_acc_metric,
                                                sup_loss, l, outputs)
        weight_decay = model_lib.add_weight_decay(
            model, adjust_per_optimizer=True)
        weight_decay_metric.update_state(weight_decay)
        loss += weight_decay
        total_loss_metric.update_state(loss)
        # The default behavior of `apply_gradients` is to sum gradients from
        # all replicas so we divide the loss by the number of replicas so that
        # the mean gradient is applied.
        loss = loss / strategy.num_replicas_in_sync
        logging.info('Trainable variables:')
        for var in model.trainable_variables:
          logging.info(var.name)
        grads = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(grads, model.trainable_variables))

    with strategy.scope():

      @tf.function
      def train_multiple_steps(iterator):
        # `tf.range` is needed so that this runs in a `tf.while_loop` and is
        # not unrolled.
        for _ in tf.range(steps_per_loop):
          # Drop the "while" prefix created by tf.while_loop which otherwise
          # gets prefixed to every variable name. This does not affect
          # training but does affect the checkpoint conversion script.
          # TODO(b/161712658): Remove this.
          with tf.name_scope(''):
            images, labels = next(iterator)
            features, labels = images, {'labels': labels}
            strategy.run(single_step, (features, labels))

      global_step = optimizer.iterations
      cur_step = global_step.numpy()
      iterator = iter(ds)
      while cur_step + 1 < train_steps:
        # Calls to tf.summary.xyz lookup the summary writer resource which is
        # set by the summary writer's context manager.
        with summary_writer.as_default():
          train_multiple_steps(iterator)
          cur_step = global_step.numpy()
          checkpoint_manager.save(cur_step)
          logging.info('Completed: %d / %d steps', cur_step, train_steps)
          metrics.log_and_write_metrics_to_summary(all_metrics, cur_step)
          tf.summary.scalar(
              'learning_rate',
              learning_rate(tf.cast(global_step, dtype=tf.float32)),
              global_step)
          summary_writer.flush()
        for metric in all_metrics:
          metric.reset_states()
      logging.info('Training complete...')

    if FLAGS.mode == 'train_then_eval':
      perform_evaluation(model, test_builder, eval_steps,
                         checkpoint_manager.latest_checkpoint, strategy,
                         topology)

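# The fold/supervised-subset logic in main() above depends on flags that are
# not declared in this section (presumably they live near the top of the real
# module alongside the standard SimCLR flags). The declarations below are a
# sketch only: the names come from their uses above, while the types,
# defaults, and help strings are assumptions. The `in FLAGS` guards avoid
# clashing with the real definitions if they already exist.
if 'data_path' not in FLAGS:
  flags.DEFINE_string('data_path', None, 'Directory containing train.csv.')
if 'fold' not in FLAGS:
  flags.DEFINE_integer('fold', 0, 'Stratified fold held out for evaluation.')
if 'seed' not in FLAGS:
  flags.DEFINE_integer('seed', 0, 'Random seed for splitting and sampling.')
if 'supervised_ratio' not in FLAGS:
  flags.DEFINE_float('supervised_ratio', 0.1,
                     'Fraction of the training fold kept as labeled data '
                     'when finetuning.')
if 'stratify' not in FLAGS:
  flags.DEFINE_boolean('stratify', True,
                       'Whether to stratify the supervised subset by label.')
if 'pretrain_steps' not in FLAGS:
  flags.DEFINE_integer('pretrain_steps', 0,
                       'Pretraining steps added when computing finetune '
                       'train_steps.')
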
def perform_evaluation(model, builder, eval_steps, ckpt, strategy, topology):
  """Perform evaluation."""
  if FLAGS.train_mode == 'pretrain' and not FLAGS.lineareval_while_pretraining:
    logging.info('Skipping eval during pretraining without linear eval.')
    return
  # Build input pipeline.
  ds = data_lib.build_distributed_dataset(builder, FLAGS.eval_batch_size,
                                          False, strategy, topology)
  summary_writer = tf.summary.create_file_writer(FLAGS.model_dir)

  # Build metrics.
  with strategy.scope():
    regularization_loss = tf.keras.metrics.Mean('eval/regularization_loss')
    label_top_1_accuracy = tf.keras.metrics.Accuracy(
        'eval/label_top_1_accuracy')
    label_top_5_accuracy = tf.keras.metrics.TopKCategoricalAccuracy(
        5, 'eval/label_top_5_accuracy')
    all_metrics = [
        regularization_loss, label_top_1_accuracy, label_top_5_accuracy
    ]

    # Restore checkpoint.
    logging.info('Restoring from %s', ckpt)
    checkpoint = tf.train.Checkpoint(
        model=model, global_step=tf.Variable(0, dtype=tf.int64))
    checkpoint.restore(ckpt).expect_partial()
    global_step = checkpoint.global_step
    logging.info('Performing eval at step %d', global_step.numpy())

  def single_step(features, labels):
    _, supervised_head_outputs = model(features, training=False)
    assert supervised_head_outputs is not None
    outputs = supervised_head_outputs
    l = labels['labels']
    metrics.update_finetune_metrics_eval(label_top_1_accuracy,
                                         label_top_5_accuracy, outputs, l)
    reg_loss = model_lib.add_weight_decay(model, adjust_per_optimizer=True)
    regularization_loss.update_state(reg_loss)

  with strategy.scope():

    @tf.function
    def run_single_step(iterator):
      images, labels = next(iterator)
      features, labels = images, {'labels': labels}
      strategy.run(single_step, (features, labels))

    iterator = iter(ds)
    for i in range(eval_steps):
      run_single_step(iterator)
      logging.info('Completed eval for %d / %d steps', i + 1, eval_steps)
    logging.info('Finished eval for %s', ckpt)

  # Write summaries.
  cur_step = global_step.numpy()
  logging.info('Writing summaries for step %d', cur_step)
  with summary_writer.as_default():
    metrics.log_and_write_metrics_to_summary(all_metrics, cur_step)
    summary_writer.flush()

  # Record results as JSON.
  result_json_path = os.path.join(FLAGS.model_dir, 'result.json')
  result = {metric.name: metric.result().numpy() for metric in all_metrics}
  result['global_step'] = global_step.numpy()
  logging.info(result)
  with tf.io.gfile.GFile(result_json_path, 'w') as f:
    json.dump({k: float(v) for k, v in result.items()}, f)
  result_json_path = os.path.join(
      FLAGS.model_dir, 'result_%d.json' % result['global_step'])
  with tf.io.gfile.GFile(result_json_path, 'w') as f:
    json.dump({k: float(v) for k, v in result.items()}, f)
  flag_json_path = os.path.join(FLAGS.model_dir, 'flags.json')
  with tf.io.gfile.GFile(flag_json_path, 'w') as f:
    serializable_flags = {}
    for key, val in FLAGS.flag_values_dict().items():
      # Some flag value types e.g. datetime.timedelta are not json
      # serializable, filter those out.
      if json_serializable(val):
        serializable_flags[key] = val
    json.dump(serializable_flags, f)

  # Export as SavedModel for finetuning and inference.
  if FLAGS.train_mode == 'finetune':
    save(model, global_step=result['global_step'])

  return result

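# perform_evaluation() calls json_serializable(), which is not shown in this
# section. The sketch below matches the helper used in the upstream SimCLR
# run.py and is assumed to be equivalent to the one defined elsewhere in this
# module.
def json_serializable(val):
  """Returns True if `val` survives a round trip through json.dumps."""
  try:
    json.dumps(val)
    return True
  except TypeError:
    return False
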
def main(argv):
  if len(argv) > 1:
    raise app.UsageError('Too many command-line arguments.')

  # hub_module = hub.load('style_transfer_content_weights_params')
  print("LOADING TF-HUB MODULE")
  hub_module = hub.load(
      'https://tfhub.dev/google/magenta/arbitrary-image-stylization-v1-256/2')
  style_dataset = tfds.load(
      'dtd',
      batch_size=50,
      split='train',
      decoders={'image': style_preprocessing_decoder()})

  builder = tfds.builder(FLAGS.dataset, data_dir=FLAGS.data_dir)
  builder.download_and_prepare()
  num_train_examples = builder.info.splits[FLAGS.train_split].num_examples
  num_eval_examples = builder.info.splits[FLAGS.eval_split].num_examples
  num_classes = builder.info.features['label'].num_classes

  train_steps = model_lib.get_train_steps(num_train_examples)
  eval_steps = FLAGS.eval_steps or int(
      math.ceil(num_eval_examples / FLAGS.eval_batch_size))
  epoch_steps = int(round(num_train_examples / FLAGS.train_batch_size))

  logging.info('# train examples: %d', num_train_examples)
  logging.info('# train_steps: %d', train_steps)
  logging.info('# eval examples: %d', num_eval_examples)
  logging.info('# eval steps: %d', eval_steps)

  checkpoint_steps = (
      FLAGS.checkpoint_steps or (FLAGS.checkpoint_epochs * epoch_steps))

  topology = None
  if FLAGS.use_tpu:
    if FLAGS.tpu_name:
      cluster = tf.distribute.cluster_resolver.TPUClusterResolver(
          FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
    else:
      cluster = tf.distribute.cluster_resolver.TPUClusterResolver(FLAGS.master)
    tf.config.experimental_connect_to_cluster(cluster)
    topology = tf.tpu.experimental.initialize_tpu_system(cluster)
    logging.info('Topology:')
    logging.info('num_tasks: %d', topology.num_tasks)
    logging.info('num_tpus_per_task: %d', topology.num_tpus_per_task)
    strategy = tf.distribute.experimental.TPUStrategy(cluster)
  else:
    # For (multiple) GPUs.
    strategy = tf.distribute.MirroredStrategy()
    logging.info('Running using MirroredStrategy on %d replicas',
                 strategy.num_replicas_in_sync)

  with strategy.scope():
    model = model_lib.Model(num_classes)
    print("GOT MODEL")

  if FLAGS.mode == 'eval':
    for ckpt in tf.train.checkpoints_iterator(
        FLAGS.model_dir, min_interval_secs=15):
      result = perform_evaluation(model, builder, eval_steps, ckpt, strategy,
                                  topology)
      if result['global_step'] >= train_steps:
        logging.info('Eval complete. Exiting...')
        return
  else:
    summary_writer = tf.summary.create_file_writer(FLAGS.model_dir)
    with strategy.scope():
      # Build input pipeline.
      ds = data_lib.build_distributed_dataset(builder, FLAGS.train_batch_size,
                                              True, strategy, topology,
                                              hub_module, style_dataset)

      # Build LR schedule and optimizer.
      learning_rate = model_lib.WarmUpAndCosineDecay(FLAGS.learning_rate,
                                                     num_train_examples)
      optimizer = model_lib.build_optimizer(learning_rate)

      # Build metrics.
      all_metrics = []  # For summaries.
      weight_decay_metric = tf.keras.metrics.Mean('train/weight_decay')
      total_loss_metric = tf.keras.metrics.Mean('train/total_loss')
      all_metrics.extend([weight_decay_metric, total_loss_metric])
      if FLAGS.train_mode == 'pretrain':
        contrast_loss_metric = tf.keras.metrics.Mean('train/contrast_loss')
        contrast_acc_metric = tf.keras.metrics.Mean('train/contrast_acc')
        contrast_entropy_metric = tf.keras.metrics.Mean(
            'train/contrast_entropy')
        all_metrics.extend([
            contrast_loss_metric, contrast_acc_metric, contrast_entropy_metric
        ])
      if FLAGS.train_mode == 'finetune' or FLAGS.lineareval_while_pretraining:
        supervised_loss_metric = tf.keras.metrics.Mean('train/supervised_loss')
        supervised_acc_metric = tf.keras.metrics.Mean('train/supervised_acc')
        all_metrics.extend([supervised_loss_metric, supervised_acc_metric])

      # Restore checkpoint if available.
      checkpoint_manager = try_restore_from_checkpoint(
          model, optimizer.iterations, optimizer)

    steps_per_loop = checkpoint_steps

    def single_step(features, labels):
      with tf.GradientTape() as tape:
        # Log summaries on the last step of the training loop to match
        # logging frequency of other scalar summaries.
        #
        # Notes:
        # 1. Summary ops on TPUs get outside compiled so they do not affect
        #    performance.
        # 2. Summaries are recorded only on replica 0. So effectively this
        #    summary would be written once per host when should_record == True.
        # 3. optimizer.iterations is incremented in the call to
        #    apply_gradients. So we use `iterations + 1` here so that the step
        #    number matches those of scalar summaries.
        # 4. We intentionally run the summary op before the actual model
        #    training so that it can run in parallel.
        should_record = tf.equal((optimizer.iterations + 1) % steps_per_loop,
                                 0)
        with tf.summary.record_if(should_record):
          # Only log augmented images for the first tower.
          tf.summary.image(
              'image', features[:, :, :, :3], step=optimizer.iterations + 1)

        projection_head_outputs, supervised_head_outputs = model(
            features, training=True)
        loss = None
        if projection_head_outputs is not None:
          outputs = projection_head_outputs
          con_loss, logits_con, labels_con = obj_lib.add_contrastive_loss(
              outputs,
              hidden_norm=FLAGS.hidden_norm,
              temperature=FLAGS.temperature,
              strategy=strategy)
          if loss is None:
            loss = con_loss
          else:
            loss += con_loss
          metrics.update_pretrain_metrics_train(contrast_loss_metric,
                                                contrast_acc_metric,
                                                contrast_entropy_metric,
                                                con_loss, logits_con,
                                                labels_con)
        if supervised_head_outputs is not None:
          outputs = supervised_head_outputs
          l = labels['labels']
          if FLAGS.train_mode == 'pretrain' and FLAGS.lineareval_while_pretraining:
            l = tf.concat([l, l], 0)
          sup_loss = obj_lib.add_supervised_loss(labels=l, logits=outputs)
          if loss is None:
            loss = sup_loss
          else:
            loss += sup_loss
          metrics.update_finetune_metrics_train(supervised_loss_metric,
                                                supervised_acc_metric,
                                                sup_loss, l, outputs)
        weight_decay = model_lib.add_weight_decay(
            model, adjust_per_optimizer=True)
        weight_decay_metric.update_state(weight_decay)
        loss += weight_decay
        total_loss_metric.update_state(loss)
        # The default behavior of `apply_gradients` is to sum gradients from
        # all replicas so we divide the loss by the number of replicas so that
        # the mean gradient is applied.
        loss = loss / strategy.num_replicas_in_sync
        logging.info('Trainable variables:')
        for var in model.trainable_variables:
          logging.info(var.name)
        grads = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(grads, model.trainable_variables))

    with strategy.scope():

      @tf.function
      def train_multiple_steps(iterator):
        # `tf.range` is needed so that this runs in a `tf.while_loop` and is
        # not unrolled.
        for _ in tf.range(steps_per_loop):
          # Drop the "while" prefix created by tf.while_loop which otherwise
          # gets prefixed to every variable name. This does not affect
          # training but does affect the checkpoint conversion script.
          # TODO(b/161712658): Remove this.
          with tf.name_scope(''):
            images, labels = next(iterator)
            features, labels = images, {'labels': labels}
            strategy.run(single_step, (features, labels))

      global_step = optimizer.iterations
      cur_step = global_step.numpy()
      iterator = iter(ds)
      while cur_step < train_steps:
        # Calls to tf.summary.xyz lookup the summary writer resource which is
        # set by the summary writer's context manager.
        with summary_writer.as_default():
          train_multiple_steps(iterator)
          cur_step = global_step.numpy()
          checkpoint_manager.save(cur_step)
          logging.info('Completed: %d / %d steps', cur_step, train_steps)
          metrics.log_and_write_metrics_to_summary(all_metrics, cur_step)
          tf.summary.scalar(
              'learning_rate',
              learning_rate(tf.cast(global_step, dtype=tf.float32)),
              global_step)
          summary_writer.flush()
        for metric in all_metrics:
          metric.reset_states()
      logging.info('Training complete...')

    if FLAGS.mode == 'train_then_eval':
      perform_evaluation(model, builder, eval_steps,
                         checkpoint_manager.latest_checkpoint, strategy,
                         topology)

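# Script entry point. Not present in this section; this follows the upstream
# SimCLR run.py convention and is an assumption about how the module is
# launched. With two `main` definitions in this file, Python dispatches to
# the last one defined above.
if __name__ == '__main__':
  # Allow summary ops to fall back to the CPU when running on TPU.
  tf.config.set_soft_device_placement(True)
  app.run(main)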