import csv
import sys

import tensorflow as tf

import inputs  # project-local input pipeline (train_input, get_class_map, ...)
import model   # project-local model definitions (define_model)


def train(model_name=None, hparams=None, train_csv_path=None,
          train_clip_dir=None, class_map_path=None, train_dir=None,
          sample_rate=None):
  """Runs the training loop."""
  print('\nTraining model:{} with hparams:{} and class map:{}'.format(
      model_name, hparams, class_map_path))
  print('Training data: clip dir {} and labels {}'.format(
      train_clip_dir, train_csv_path))
  print('Training dir {}\n'.format(train_dir))

  with tf.Graph().as_default():
    # Create the input pipeline.
    features, labels, num_classes, input_init = inputs.train_input(
        train_csv_path=train_csv_path, train_clip_dir=train_clip_dir,
        class_map_path=class_map_path, hparams=hparams, sample_rate=sample_rate)
    # Create the model in training mode.
    global_step, prediction, loss_tensor, train_op = model.define_model(
        model_name=model_name, features=features, labels=labels,
        num_classes=num_classes, hparams=hparams, training=True)

    # Define our own checkpoint saving hook, instead of using the built-in one,
    # so that we can specify additional checkpoint retention settings.
    saver = tf.train.Saver(max_to_keep=30, keep_checkpoint_every_n_hours=0.25)
    saver_hook = tf.train.CheckpointSaverHook(
        save_steps=250, checkpoint_dir=train_dir, saver=saver)

    summary_op = tf.summary.merge_all()
    summary_hook = tf.train.SummarySaverHook(
        save_steps=50, output_dir=train_dir, summary_op=summary_op)

    with tf.train.SingularMonitoredSession(
        hooks=[saver_hook, summary_hook], checkpoint_dir=train_dir) as sess:
      sess.raw_session().run(input_init)
      while not sess.should_stop():
        step, _, pred, loss = sess.run(
            [global_step, train_op, prediction, loss_tensor])
        print(step, loss)
        sys.stdout.flush()
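# --- Illustrative usage (not from the original script) ---
# A minimal sketch of how the train() above might be invoked. The hparams
# fields, file paths, and model name below are assumptions for illustration;
# the real values come from the project's flags and from inputs.py / model.py.
def _example_train_call():
  hparams = tf.contrib.training.HParams(  # assumed HParams fields
      batch_size=64, lr=1e-4, nepochs=10)
  train(model_name='mobilenet-v1',            # assumed model name
        hparams=hparams,
        train_csv_path='/data/train.csv',     # assumed paths
        train_clip_dir='/data/audio_train',
        class_map_path='/data/class_map.csv',
        train_dir='/tmp/train_dir',
        sample_rate=44100)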
def train(model_name=None, hparams=None, class_map_path=None,
          train_csv_path=None, train_clip_dir=None, train_dir=None,
          epoch_batches=None, warmstart_checkpoint=None,
          warmstart_include_scopes=None, warmstart_exclude_scopes=None):
  """Runs the training loop."""
  print('\nTraining model:{} with hparams:{} and class map:{}'.format(
      model_name, hparams, class_map_path))
  print('Training data: clip dir {} and labels {}'.format(
      train_clip_dir, train_csv_path))
  print('Training dir {}\n'.format(train_dir))

  with tf.Graph().as_default():
    # Create the input pipeline.
    features, labels, num_classes, input_init = inputs.train_input(
        train_csv_path=train_csv_path, train_clip_dir=train_clip_dir,
        class_map_path=class_map_path, hparams=hparams)
    # Create the model in training mode.
    global_step, prediction, loss_tensor, train_op = model.define_model(
        model_name=model_name, features=features, labels=labels,
        num_classes=num_classes, hparams=hparams, epoch_batches=epoch_batches,
        training=True)

    # Define our own checkpoint saving hook, instead of using the built-in one,
    # so that we can specify additional checkpoint retention settings.
    saver = tf.train.Saver(max_to_keep=10000,
                           keep_checkpoint_every_n_hours=0.25)
    saver_hook = tf.train.CheckpointSaverHook(
        save_steps=100, checkpoint_dir=train_dir, saver=saver)

    summary_op = tf.summary.merge_all()
    summary_hook = tf.train.SummarySaverHook(
        save_steps=10, output_dir=train_dir, summary_op=summary_op)

    if hparams.warmstart:
      var_include_scopes = warmstart_include_scopes
      if not var_include_scopes:
        var_include_scopes = None
      var_exclude_scopes = warmstart_exclude_scopes
      if not var_exclude_scopes:
        var_exclude_scopes = None
      restore_vars = tf.contrib.framework.get_variables_to_restore(
          include=var_include_scopes, exclude=var_exclude_scopes)

      # Only restore trainable variables; we don't want to restore
      # batch-norm or optimizer-specific local variables.
      trainable_vars = set(tf.contrib.framework.get_trainable_variables())
      restore_vars = [var for var in restore_vars if var in trainable_vars]

      print('Warm-start: restoring variables:\n%s\n' %
            '\n'.join([x.name for x in restore_vars]))
      print('Warm-start: restoring from ', warmstart_checkpoint)
      assert restore_vars, 'No warm-start variables to restore!'

      restore_op, feed_dict = tf.contrib.framework.assign_from_checkpoint(
          model_path=warmstart_checkpoint, var_list=restore_vars,
          ignore_missing_vars=True)
      scaffold = tf.train.Scaffold(
          init_fn=lambda scaffold, session: session.run(restore_op, feed_dict),
          summary_op=summary_op, saver=saver)
    else:
      scaffold = None

    with tf.train.SingularMonitoredSession(
        hooks=[saver_hook, summary_hook], checkpoint_dir=train_dir,
        scaffold=scaffold) as sess:
      sess.raw_session().run(input_init)
      while not sess.should_stop():
        step, _, pred, loss = sess.run(
            [global_step, train_op, prediction, loss_tensor])
        print(step, loss)
        sys.stdout.flush()
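# --- Illustrative sketch (not from the original script) ---
# A minimal example of what the include/exclude scope lists in the warm-start
# branch above do: tf.contrib.framework.get_variables_to_restore() keeps
# variables under an include scope and drops those under an exclude scope.
# The scope names 'towers/conv' and 'towers/logits' are made up for
# illustration.
def _example_scope_filtering():
  with tf.Graph().as_default():
    with tf.variable_scope('towers'):
      with tf.variable_scope('conv'):
        tf.get_variable('w', shape=[3, 3, 1, 8])
      with tf.variable_scope('logits'):
        tf.get_variable('w', shape=[8, 41])
    # Restore everything under 'towers' except the classifier head, mirroring
    # warmstart_include_scopes / warmstart_exclude_scopes in train() above.
    restore_vars = tf.contrib.framework.get_variables_to_restore(
        include=['towers'], exclude=['towers/logits'])
    print([v.name for v in restore_vars])  # -> ['towers/conv/w:0']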
def train_and_evaluate(model_name=None, hparams=None, class_map_path=None,
                       train_csv_path=None, train_clip_dir=None, train_dir=None,
                       epoch_batches=None, warmstart_checkpoint=None,
                       warmstart_include_scopes=None,
                       warmstart_exclude_scopes=None, eval_csv_path=None,
                       eval_clip_dir=None, eval_dir=None):
  """Runs the training loop, periodically evaluating on held-out data."""
  print('\nTraining model:{} with hparams:{} and class map:{}'.format(
      model_name, hparams, class_map_path))
  print('Training data: clip dir {} and labels {}'.format(
      train_clip_dir, train_csv_path))
  print('Training dir {}\n'.format(train_dir))

  class_map = {int(row[0]): row[1]
               for row in csv.reader(open(class_map_path))}

  with tf.Graph().as_default():
    # Create the input pipeline.
    features, labels, num_classes, input_init = inputs.train_input(
        train_csv_path=train_csv_path, train_clip_dir=train_clip_dir,
        class_map_path=class_map_path, hparams=hparams)
    # Create the model in training mode.
    global_step, prediction, loss_tensor, train_op = model.define_model(
        model_name=model_name, features=features, labels=labels,
        num_classes=num_classes, hparams=hparams, epoch_batches=epoch_batches,
        training=True)

    # Build the evaluation graph.
    label_class_index_table, num_classes = inputs.get_class_map(class_map_path)
    csv_record = tf.placeholder(tf.string, [])  # Fed during the evaluation loop.
    eval_features, eval_labels = inputs.record_to_labeled_log_mel_examples(
        csv_record, clip_dir=eval_clip_dir, hparams=hparams,
        label_class_index_table=label_class_index_table,
        num_classes=num_classes)
    # Create the model in prediction mode.
    global_step, eval_predictions, eval_loss_tensor, _ = model.define_model(
        model_name=model_name, features=eval_features, labels=eval_labels,
        num_classes=num_classes, hparams=hparams, training=False,
        evaluating=True)

    # Write the evaluation graph to the checkpoint directory.
    tf.train.write_graph(
        tf.get_default_graph().as_graph_def(add_shapes=True),
        eval_dir, 'eval.pbtxt')
    eval_writer = tf.summary.FileWriter(eval_dir, tf.get_default_graph())

    # Define our own checkpoint saving hook, instead of using the built-in one,
    # so that we can specify additional checkpoint retention settings.
    saver = tf.train.Saver(max_to_keep=10, keep_checkpoint_every_n_hours=0.25)
    saver_hook = tf.train.CheckpointSaverHook(
        save_steps=100, checkpoint_dir=train_dir, saver=saver)

    summary_op = tf.summary.merge_all()
    summary_hook = tf.train.SummarySaverHook(
        save_steps=10, output_dir=train_dir, summary_op=summary_op)

    if hparams.warmstart:
      var_include_scopes = warmstart_include_scopes
      if not var_include_scopes:
        var_include_scopes = None
      var_exclude_scopes = warmstart_exclude_scopes
      if not var_exclude_scopes:
        var_exclude_scopes = None
      restore_vars = tf.contrib.framework.get_variables_to_restore(
          include=var_include_scopes, exclude=var_exclude_scopes)

      # Only restore trainable variables; we don't want to restore
      # batch-norm or optimizer-specific local variables.
      trainable_vars = set(tf.contrib.framework.get_trainable_variables())
      restore_vars = [var for var in restore_vars if var in trainable_vars]

      print('Warm-start: restoring variables:\n%s\n' %
            '\n'.join([x.name for x in restore_vars]))
      print('Warm-start: restoring from ', warmstart_checkpoint)
      assert restore_vars, 'No warm-start variables to restore!'

      restore_op, feed_dict = tf.contrib.framework.assign_from_checkpoint(
          model_path=warmstart_checkpoint, var_list=restore_vars,
          ignore_missing_vars=True)
      scaffold = tf.train.Scaffold(
          init_fn=lambda scaffold, session: session.run(restore_op, feed_dict),
          summary_op=summary_op, saver=saver)
    else:
      scaffold = None

    with tf.train.SingularMonitoredSession(
        hooks=[saver_hook, summary_hook], checkpoint_dir=train_dir,
        scaffold=scaffold,
        config=tf.ConfigProto(log_device_placement=True)) as sess:
      sess.raw_session().run(input_init)
      while not sess.should_stop():
        # Train step.
        step, _, pred, loss = sess.run(
            [global_step, train_op, prediction, loss_tensor])
        print(step, loss)
        sys.stdout.flush()

        # Evaluate on the held-out set every 100 steps.
        if step > 0 and step % 100 == 0:
          # checkpoint_state records the checkpoints saved so far; it is
          # currently unused because eval_batch evaluates the live session.
          checkpoint_state = tf.train.get_checkpoint_state(train_dir)
          lwlrap = eval_batch(eval_csv_path, sess, eval_labels,
                              eval_predictions, csv_record, step, eval_writer,
                              class_map, eval_loss_tensor)
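# --- Illustrative sketch (not from the original script) ---
# eval_batch() above reports lwlrap (label-weighted label-ranking average
# precision), the usual metric for this kind of multi-label audio tagging task.
# One common way to compute it, assuming numpy arrays of binary truth and
# real-valued scores of shape (num_samples, num_classes), is sklearn's
# label_ranking_average_precision_score with per-sample weights equal to the
# number of positive labels; the function name below is made up.
import numpy as np
from sklearn.metrics import label_ranking_average_precision_score

def example_lwlrap(truth, scores):
  """Overall lwlrap over a batch of (truth, scores) rows."""
  sample_weight = np.sum(truth > 0, axis=1)
  nonzero = sample_weight > 0  # Rows with no positive labels contribute nothing.
  return label_ranking_average_precision_score(
      truth[nonzero, :] > 0, scores[nonzero, :],
      sample_weight=sample_weight[nonzero])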
# --- Separate snippet: TF2 Object Detection API setup ---
# config_util comes from object_detection.utils; model_build and
# inputs.train_input here appear to be locally modified Object Detection API
# helpers (the stock API does not take num_classes/min_dim/max_dim kwargs).
from object_detection.utils import config_util

pipeline_config_path = 'faster_rcnn_resnet50_v1_640x640_coco17_tpu-8.config'
configs = config_util.get_configs_from_pipeline_file(pipeline_config_path)
train_config = configs['train_config']
model_config = configs['model']
train_input_config = configs['train_input_config']

# Build the detection model from the pipeline's model config.
detection_model = model_build(model_config, True, num_classes=4,
                              min_dim=640, max_dim=640)
# print(train_config.add_regularization_loss)

# Build the training input pipeline (typically a tf.data.Dataset yielding
# (features, labels) dicts).
train_input = inputs.train_input(train_config=train_config,
                                 train_input_config=train_input_config,
                                 model_config=model_config,
                                 batch_size=2, num_classes=90,
                                 min_dim=640, max_dim=640)
print(train_input)

# detection_model._is_training = is_training  # pylint: disable=protected-access
# tf.keras.backend.set_learning_phase(is_training)
#
# # pdb.set_trace()
# losses_dict, _ = _compute_losses_and_predictions_dicts(
#     detection_model, features, labels, add_regularization_loss)
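# --- Illustrative sketch (not from the original snippet) ---
# The commented-out lines above point at the next step: feeding a batch from
# train_input to the model and computing detection losses. The sketch below is
# an assumption-laden outline of that step using stock TF2 Object Detection API
# conventions (standard_fields keys, provide_groundtruth/predict/loss); the
# snippet's locally modified helpers may differ, and the function name is made
# up.
import tensorflow as tf
from object_detection.core import standard_fields as fields

def example_compute_losses(detection_model, features, labels):
  # Hand the (padded) groundtruth boxes/classes to the model. Real training
  # code unpads them using labels['num_groundtruth_boxes']; omitted here.
  detection_model.provide_groundtruth(
      groundtruth_boxes_list=tf.unstack(
          labels[fields.InputDataFields.groundtruth_boxes]),
      groundtruth_classes_list=tf.unstack(
          labels[fields.InputDataFields.groundtruth_classes]))
  images = features[fields.InputDataFields.image]
  true_shapes = features[fields.InputDataFields.true_image_shape]
  prediction_dict = detection_model.predict(images, true_shapes)
  return detection_model.loss(prediction_dict, true_shapes)

# Example use:
#   for features, labels in train_input.take(1):
#     print(example_compute_losses(detection_model, features, labels))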