def main(model_config, train_config, track_config):
    """Build the inference-mode Siamese model and write it out as a checkpoint.

    The training directory is created when missing, the model is built with
    ``mode='inference'`` and the three configurations are stored through
    ``save_cfgs``. Variables are then either freshly initialized (optionally
    followed by ``model.init_fn`` to load a pretrained embedding) or restored
    from the latest checkpoint in the training directory, and finally saved
    to ``<train_dir>/model.ckpt`` tagged with the resulting step.

    Args:
        model_config: Model configuration; this function reads
            ``['embed_config']['embedding_checkpoint_file']``.
        train_config: Training configuration; this function reads
            ``'train_dir'``, ``'seed'`` and ``'max_checkpoints_to_keep'``.
        track_config: Tracking configuration; only forwarded to ``save_cfgs``.
    """
    # Create training directory
    train_dir = train_config['train_dir']
    if not tf.gfile.IsDirectory(train_dir):
        tf.logging.info('Creating training directory: %s', train_dir)
        tf.gfile.MakeDirs(train_dir)

    # Build the Tensorflow graph
    g = tf.Graph()
    with g.as_default():
        # Set fixed seed
        np.random.seed(train_config['seed'])
        tf.set_random_seed(train_config['seed'])

        # Build the model
        model = siamese_model.SiameseModel(model_config, train_config,
                                           mode='inference')
        model.build()

        # Save configurations for future reference
        save_cfgs(train_dir, model_config, train_config, track_config)

        saver = tf.train.Saver(
            tf.global_variables(),
            max_to_keep=train_config['max_checkpoints_to_keep'])

        # Dynamically allocate GPU memory
        gpu_options = tf.GPUOptions(allow_growth=True)
        sess_config = tf.ConfigProto(gpu_options=gpu_options)

        # The context manager closes the session (and frees its resources)
        # even when initialization, restoring or saving raises.
        with tf.Session(config=sess_config) as sess:
            model_path = tf.train.latest_checkpoint(train_config['train_dir'])

            if not model_path:
                # Initialize all variables
                sess.run(tf.global_variables_initializer())
                sess.run(tf.local_variables_initializer())
                start_step = 0

                # Load pretrained embedding model if needed
                if model_config['embed_config']['embedding_checkpoint_file']:
                    model.init_fn(sess)
            else:
                logging.info(
                    'Restore from last checkpoint: {}'.format(model_path))
                sess.run(tf.local_variables_initializer())
                saver.restore(sess, model_path)
                start_step = tf.train.global_step(
                    sess, model.global_step.name) + 1

            checkpoint_path = osp.join(train_config['train_dir'], 'model.ckpt')
            saver.save(sess, checkpoint_path, global_step=start_step)
def main(model_config, train_config):
    """Train BiseNet on ``configuration.num_gpus`` GPUs with averaged gradients.

    Builds a training and a validation ``BiseNet``, computes one set of
    gradients per GPU from ``model.total_loss[i]``, averages them with
    ``_average_gradients`` and applies them once. Training resumes from the
    latest checkpoint in ``train_config['train_dir']`` when one exists;
    otherwise variables are initialized and ``model.init_fn`` is invoked if a
    pretrained frontend directory is configured.

    Progress is logged and summaries are written every 10 steps; a checkpoint
    is written every ``train_config['save_model_every_n_step']`` steps and
    after the last step.

    Args:
        model_config: Model configuration; this function reads
            ``['frontend_config']['pretrained_dir']``.
        train_config: Training configuration; this function reads
            ``'train_dir'``, ``'seed'``, ``'num_classes'``,
            ``'max_checkpoints_to_keep'``, ``'save_model_every_n_step'`` and
            ``'train_data_config'`` (``'epoch'``, ``'num_examples_per_epoch'``,
            ``'batch_size'``).
    """
    # Create training directory which will be used to save: configurations,
    # model files, TensorBoard logs
    train_dir = train_config['train_dir']
    if not osp.isdir(train_dir):
        logging.info('Creating training directory: %s', train_dir)
        mkdir_p(train_dir)

    g = tf.Graph()
    with g.as_default():
        # Set fixed seed for reproducible experiments
        random.seed(train_config['seed'])
        np.random.seed(train_config['seed'])
        tf.set_random_seed(train_config['seed'])

        # Build the training and validation model
        model = BiseNet(model_config, train_config,
                        train_config['num_classes'], mode="train")
        model.build(num_gpus=configuration.num_gpus, reuse=tf.AUTO_REUSE)
        model_va = BiseNet(model_config, train_config,
                           train_config['num_classes'], mode="validation")
        model_va.build(reuse=True)

        # Save configurations for future reference
        save_cfgs(train_dir, model_config, train_config)

        learning_rate = _configure_learning_rate(train_config,
                                                 model.global_step)
        optimizer = _configure_optimizer(train_config, learning_rate)
        tf.summary.scalar('learning_rate', learning_rate)

        # Set up the training ops: one gradient computation per GPU tower.
        # NOTE(review): train_config['clip_gradients'] is not applied on this
        # multi-GPU path -- confirm that is intended.
        tower_grads = []
        for i in range(configuration.num_gpus):
            with tf.device('/gpu:%d' % i):
                # The first tower lives in the root name scope.
                name_scope = ('clone_%d' % i) if i else ''
                with tf.name_scope(name_scope):
                    grads = optimizer.compute_gradients(model.total_loss[i])
                    tower_grads.append(grads)

        with tf.device('/cpu:0'):
            # Run the collected UPDATE_OPS before the weights are updated.
            update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
            with tf.control_dependencies(update_ops):
                grads_n_vars = _average_gradients(tower_grads)
                grad_updates = optimizer.apply_gradients(
                    grads_n_vars, global_step=model.global_step)

        # Collapse the per-tower losses into the single scalar that is logged.
        # NOTE(review): the original nesting of this statement was lost in the
        # collapsed source; it is placed outside the device/control scopes.
        model.total_loss = tf.reduce_mean(model.total_loss)

        saver = tf.train.Saver(
            tf.global_variables(),
            max_to_keep=train_config['max_checkpoints_to_keep'])

        summary_writer = tf.summary.FileWriter(train_dir, g)
        summary_op = tf.summary.merge_all()

        global_variables_init_op = tf.global_variables_initializer()
        local_variables_init_op = tf.local_variables_initializer()
        g.finalize()  # Finalize graph to avoid adding ops by mistake

        # For multi gpu options 'allow_soft_placement' must be set true
        sess_config = tf.ConfigProto(allow_soft_placement=True,
                                     log_device_placement=False)
        sess_config.gpu_options.allow_growth = False

        sess = tf.Session(config=sess_config)
        try:
            model_path = tf.train.latest_checkpoint(train_config['train_dir'])

            if not model_path:
                sess.run(global_variables_init_op)
                sess.run(local_variables_init_op)
                start_step = 0

                if (model_config['frontend_config']['pretrained_dir']
                        and model.init_fn):
                    model.init_fn(sess)
            else:
                logging.info(
                    'Restore from last checkpoint: {}'.format(model_path))
                sess.run(local_variables_init_op)
                saver.restore(sess, model_path)
                start_step = tf.train.global_step(
                    sess, model.global_step.name) + 1

            # Training loop
            data_config = train_config['train_data_config']
            total_steps = int(data_config['epoch'] *
                              data_config['num_examples_per_epoch'] /
                              data_config['batch_size'])
            logging.info('Train for {} steps'.format(total_steps))
            checkpoint_path = osp.join(train_config['train_dir'], 'model.ckpt')

            for step in range(start_step, total_steps):
                start_time = time.time()
                _, loss = sess.run([grad_updates, model.total_loss])
                duration = time.time() - start_time

                if step % 10 == 0:
                    examples_per_sec = data_config['batch_size'] / float(duration)
                    time_remain = data_config['batch_size'] * (
                        total_steps - step) / examples_per_sec
                    m, s = divmod(time_remain, 60)
                    h, m = divmod(m, 60)
                    format_str = (
                        '%s: step %d, total loss = %.2f, (%.1f examples/sec; %.3f '
                        'sec/batch; %dh:%02dm:%02ds remains)')
                    logging.info(format_str % (datetime.now(), step, loss,
                                               examples_per_sec, duration,
                                               h, m, s))

                    summary_str = sess.run(summary_op)
                    summary_writer.add_summary(summary_str, step)

                if (step % train_config['save_model_every_n_step'] == 0
                        or (step + 1) == total_steps):
                    saver.save(sess, checkpoint_path, global_step=step)
        finally:
            # Flush buffered summaries and release the session even when
            # training is aborted by an exception.
            summary_writer.close()
            sess.close()
def main(model_config, train_config, track_config):
    """Train the Siamese model, resuming from the latest checkpoint if any.

    Selects a GPU through ``auto_select_gpu``, builds a training and a
    validation ``SiameseModel`` (the latter with ``reuse=True``) and creates
    the train op with ``tf.contrib.layers.optimize_loss``. When no checkpoint
    exists in the training directory the variables are initialized and, if an
    embedding checkpoint file is configured, ``model.init_fn`` is invoked.

    Progress is logged every 10 steps, summaries are written every 100 steps,
    and a checkpoint is written every
    ``train_config['save_model_every_n_step']`` steps and after the last step.

    Args:
        model_config: Model configuration; this function reads
            ``['embed_config']['embedding_checkpoint_file']``.
        train_config: Training configuration; this function reads
            ``'train_dir'``, ``'seed'``, ``'clip_gradients'``,
            ``'max_checkpoints_to_keep'``, ``'save_model_every_n_step'`` and
            ``'train_data_config'`` (``'epoch'``, ``'num_examples_per_epoch'``,
            ``'batch_size'``).
        track_config: Tracking configuration; only forwarded to ``save_cfgs``.
    """
    os.environ['CUDA_VISIBLE_DEVICES'] = auto_select_gpu()

    # Create training directory which will be used to save: configurations,
    # model files, TensorBoard logs
    train_dir = train_config['train_dir']
    if not osp.isdir(train_dir):
        logging.info('Creating training directory: %s', train_dir)
        mkdir_p(train_dir)

    g = tf.Graph()
    with g.as_default():
        # Set fixed seed for reproducible experiments
        random.seed(train_config['seed'])
        np.random.seed(train_config['seed'])
        tf.set_random_seed(train_config['seed'])

        # Build the training and validation model
        model = siamese_model.SiameseModel(model_config, train_config,
                                           mode='train')
        model.build()
        model_va = siamese_model.SiameseModel(model_config, train_config,
                                              mode='validation')
        model_va.build(reuse=True)

        # Save configurations for future reference
        save_cfgs(train_dir, model_config, train_config, track_config)

        learning_rate = _configure_learning_rate(train_config,
                                                 model.global_step)
        optimizer = _configure_optimizer(train_config, learning_rate)
        tf.summary.scalar('learning_rate', learning_rate)

        # Set up the training ops
        opt_op = tf.contrib.layers.optimize_loss(
            loss=model.total_loss,
            global_step=model.global_step,
            learning_rate=learning_rate,
            optimizer=optimizer,
            clip_gradients=train_config['clip_gradients'],
            learning_rate_decay_fn=None,
            summaries=['learning_rate'])

        with tf.control_dependencies([opt_op]):
            train_op = tf.no_op(name='train')

        saver = tf.train.Saver(
            tf.global_variables(),
            max_to_keep=train_config['max_checkpoints_to_keep'])

        summary_writer = tf.summary.FileWriter(train_dir, g)
        summary_op = tf.summary.merge_all()

        global_variables_init_op = tf.global_variables_initializer()
        local_variables_init_op = tf.local_variables_initializer()
        g.finalize()  # Finalize graph to avoid adding ops by mistake

        # Dynamically allocate GPU memory
        gpu_options = tf.GPUOptions(allow_growth=True)
        sess_config = tf.ConfigProto(gpu_options=gpu_options)

        sess = tf.Session(config=sess_config)
        try:
            model_path = tf.train.latest_checkpoint(train_config['train_dir'])

            if not model_path:
                sess.run(global_variables_init_op)
                sess.run(local_variables_init_op)
                start_step = 0

                if model_config['embed_config']['embedding_checkpoint_file']:
                    model.init_fn(sess)
            else:
                logging.info(
                    'Restore from last checkpoint: {}'.format(model_path))
                sess.run(local_variables_init_op)
                saver.restore(sess, model_path)
                start_step = tf.train.global_step(
                    sess, model.global_step.name) + 1

            # Training loop
            data_config = train_config['train_data_config']
            total_steps = int(data_config['epoch'] *
                              data_config['num_examples_per_epoch'] /
                              data_config['batch_size'])
            logging.info('Train for {} steps'.format(total_steps))
            checkpoint_path = osp.join(train_config['train_dir'], 'model.ckpt')

            for step in range(start_step, total_steps):
                start_time = time.time()
                _, loss, batch_loss = sess.run(
                    [train_op, model.total_loss, model.batch_loss])
                duration = time.time() - start_time

                if step % 10 == 0:
                    examples_per_sec = data_config['batch_size'] / float(duration)
                    time_remain = data_config['batch_size'] * (
                        total_steps - step) / examples_per_sec
                    m, s = divmod(time_remain, 60)
                    h, m = divmod(m, 60)
                    format_str = (
                        '%s: step %d, total loss = %.2f, batch loss = %.2f (%.1f examples/sec; %.3f '
                        'sec/batch; %dh:%02dm:%02ds remains)')
                    logging.info(format_str % (datetime.now(), step, loss,
                                               batch_loss, examples_per_sec,
                                               duration, h, m, s))

                if step % 100 == 0:
                    summary_str = sess.run(summary_op)
                    summary_writer.add_summary(summary_str, step)

                if (step % train_config['save_model_every_n_step'] == 0
                        or (step + 1) == total_steps):
                    saver.save(sess, checkpoint_path, global_step=step)
        finally:
            # Flush buffered summaries and release the session even when
            # training is aborted by an exception.
            summary_writer.close()
            sess.close()
def main(model_config, train_config, track_config):
    """Train the Siamese model using the configurations stored in the train dir.

    On the first run (no configuration files in the training directory) the
    given configurations are written there and the function returns so they
    can be edited. On later runs the stored configurations replace the
    arguments. Training resumes from the latest checkpoint when one exists.
    If ``train_config["export"]`` is truthy the function returns after the
    variables are initialized/restored (the export code itself is disabled).

    Progress is logged every 10 steps, summaries are written every 100 steps,
    and a checkpoint is written once per epoch's worth of steps and after the
    last step.

    Args:
        model_config: Model configuration (default used on the first run).
        train_config: Training configuration (default used on the first run).
        track_config: Tracking configuration (default used on the first run).
    """
    os.environ['CUDA_VISIBLE_DEVICES'] = auto_select_gpu()

    # Create training directory which will be used to save: configurations,
    # model files, TensorBoard logs
    train_dir = train_config['train_dir']
    if not osp.isdir(train_dir):
        logging.info('Creating training directory: %s', train_dir)
        mkdir_p(train_dir)

    if have_cfgs(train_dir):
        model_config, train_config, track_config = load_cfgs(train_dir)
        print("=================== load cfg ")
    else:
        save_cfgs(train_dir, model_config, train_config, track_config)
        print("=================== save default cfg, please modify files in {}".format(train_dir))
        return

    g = tf.Graph()
    with g.as_default():
        # Set fixed seed for reproducible experiments
        random.seed(train_config['seed'])
        np.random.seed(train_config['seed'])
        tf.set_random_seed(train_config['seed'])

        # Build the training and validation model
        model = siamese_model.SiameseModel(model_config, train_config,
                                           track_config, mode='train')
        model.build()
        model_va = siamese_model.SiameseModel(model_config, train_config,
                                              track_config, mode='validation')
        model_va.build(reuse=True)

        learning_rate = _configure_learning_rate(train_config,
                                                 model.global_step)
        optimizer = _configure_optimizer(train_config, learning_rate)
        tf.summary.scalar('learning_rate', learning_rate)

        # general way for run train: https://qiita.com/horiem/items/00ec6488b23895cc4fe2
        # tensorflow 2.1: https://www.tensorflow.org/tutorials/customization/custom_training_walkthrough
        # Set up the training ops
        opt_op = tensorflow.contrib.layers.optimize_loss(
            loss=model.total_loss,
            global_step=model.global_step,
            learning_rate=learning_rate,
            optimizer=optimizer,
            clip_gradients=train_config['clip_gradients'],
            learning_rate_decay_fn=None,
            summaries=['learning_rate'])

        with tf.control_dependencies([opt_op]):
            train_op = tf.no_op(name='train')

        saver = tf.train.Saver(
            tf.global_variables(),
            max_to_keep=train_config['max_checkpoints_to_keep'])

        summary_writer = tf.summary.FileWriter(train_dir, g)
        summary_op = tf.summary.merge_all()

        global_variables_init_op = tf.global_variables_initializer()
        local_variables_init_op = tf.local_variables_initializer()
        g.finalize()  # Finalize graph to avoid adding ops by mistake

        # Dynamically allocate GPU memory
        gpu_options = tf.GPUOptions(allow_growth=True)
        sess_config = tf.ConfigProto(gpu_options=gpu_options)

        sess = tf.Session(config=sess_config)
        try:
            model_path = tf.train.latest_checkpoint(train_config['train_dir'])

            if not model_path:
                sess.run(global_variables_init_op)
                sess.run(local_variables_init_op)
                start_step = 0

                if model_config['embed_config']['embedding_checkpoint_file']:
                    model.init_fn(sess)
            else:
                logging.info(
                    'Restore from last checkpoint: {}'.format(model_path))
                sess.run(local_variables_init_op)
                saver.restore(sess, model_path)
                start_step = tf.train.global_step(
                    sess, model.global_step.name) + 1

            # export
            if train_config["export"]:
                # still debugging: the export code below is disabled (kept as
                # a bare string), so this branch only skips training.
                '''
                frozen_graph_def = tf.graph_util.convert_variables_to_constants(sess, tf.get_default_graph().as_graph_def(), ["train/detection/add"])
                frozen_graph = tf.Graph()
                with frozen_graph.as_default():
                    tf.import_graph_def(frozen_graph_def)
                    save_model_dir = osp.join(train_config['train_dir'], 'models')
                    tf.train.write_graph(frozen_graph_def, save_model_dir, 'quantized_frozen_graph.pb', as_text=False)
                    tf.train.write_graph(frozen_graph_def, save_model_dir, 'quantized_frozen_graph.pbtxt', as_text=True)
                    output_op = sess.graph.get_tensor_by_name("validation/detection/add:0")
                    input1_op = sess.graph.get_tensor_by_name("validation/template_image:0")
                    input2_op = sess.graph.get_tensor_by_name("validation/input_image:0")
                    converter = tf.lite.TFLiteConverter.from_session(sess, [input1_op, input2_op], [output_op])
                    converter.inference_type = tf.lite.constants.QUANTIZED_UINT8
                    input_arrays = converter.get_input_arrays()
                    converter.quantized_input_stats = {input_arrays[0] : (0., 1.), input_arrays[1] : (0., 1.)}  # mean, std_dev
                    converter.default_ranges_stats = (0, 255)
                    tflite_model = converter.convert()
                    open(osp.join(save_model_dir, 'quantized_frozen_graph.tflite'), "wb").write(tflite_model)
                '''
                return

            # Training loop
            data_config = train_config['train_data_config']
            total_steps = int(data_config['epoch'] *
                              data_config['num_examples_per_epoch'] /
                              data_config['batch_size'])
            logging.info('Train for {} steps'.format(total_steps))

            # Steps per epoch; clamped to 1 so `step % save_step` cannot divide
            # by zero when an epoch holds fewer examples than one batch.
            save_step = max(1, int(data_config['num_examples_per_epoch'] /
                                   data_config['batch_size']))
            print("=========== save_step: {}".format(save_step))
            checkpoint_path = osp.join(train_config['train_dir'], 'model.ckpt')

            for step in range(start_step, total_steps):
                start_time = time.time()
                # no "feed_dict"
                # has "feed_dict" example (mnist): https://qiita.com/SwitchBlade/items/6677c283b2402d060cd0
                _, loss, batch_loss = sess.run(
                    [train_op, model.total_loss, model.batch_loss])
                duration = time.time() - start_time

                if step % 10 == 0:
                    examples_per_sec = data_config['batch_size'] / float(duration)
                    time_remain = data_config['batch_size'] * (
                        total_steps - step) / examples_per_sec
                    m, s = divmod(time_remain, 60)
                    h, m = divmod(m, 60)
                    format_str = (
                        '%s: step %d, total loss = %.2f, batch loss = %.2f (%.1f examples/sec; %.3f '
                        'sec/batch; %dh:%02dm:%02ds remains)')
                    logging.info(format_str % (datetime.now(), step, loss,
                                               batch_loss, examples_per_sec,
                                               duration, h, m, s))

                if step % 100 == 0:
                    summary_str = sess.run(summary_op)
                    summary_writer.add_summary(summary_str, step)

                if step % save_step == 0 or (step + 1) == total_steps:
                    saver.save(sess, checkpoint_path, global_step=step)
        finally:
            # Flush buffered summaries and release the session on every exit
            # path, including the early export return and exceptions.
            summary_writer.close()
            sess.close()
def main(model_config, train_config, track_config):
    """Run Siamese model training with periodic logging and checkpointing.

    A GPU is chosen through ``auto_select_gpu``; a training model and a
    weight-sharing validation model (``reuse=True``) are built in a fresh
    graph, and the train op comes from ``tf.contrib.layers.optimize_loss``.
    Training continues from the latest checkpoint in the training directory
    when there is one; otherwise variables are initialized and, if an
    embedding checkpoint file is configured, ``model.init_fn`` is invoked.

    Progress is logged every 10 steps, summaries are written every 100 steps,
    and a checkpoint is written every
    ``train_config['save_model_every_n_step']`` steps and after the last step.

    Args:
        model_config: Model configuration; this function reads
            ``['embed_config']['embedding_checkpoint_file']``.
        train_config: Training configuration; this function reads
            ``'train_dir'``, ``'seed'``, ``'clip_gradients'``,
            ``'max_checkpoints_to_keep'``, ``'save_model_every_n_step'`` and
            ``'train_data_config'`` (``'epoch'``, ``'num_examples_per_epoch'``,
            ``'batch_size'``).
        track_config: Tracking configuration; only forwarded to ``save_cfgs``.
    """
    os.environ['CUDA_VISIBLE_DEVICES'] = auto_select_gpu()

    # Create training directory which will be used to save: configurations,
    # model files, TensorBoard logs
    train_dir = train_config['train_dir']
    if not osp.isdir(train_dir):
        logging.info('Creating training directory: %s', train_dir)
        mkdir_p(train_dir)

    g = tf.Graph()
    with g.as_default():
        # Set fixed seed for reproducible experiments
        seed = train_config['seed']
        random.seed(seed)
        np.random.seed(seed)
        tf.set_random_seed(seed)

        # Build the training and validation model
        model = siamese_model.SiameseModel(model_config, train_config,
                                           mode='train')
        model.build()
        model_va = siamese_model.SiameseModel(model_config, train_config,
                                              mode='validation')
        model_va.build(reuse=True)

        # Save configurations for future reference
        save_cfgs(train_dir, model_config, train_config, track_config)

        learning_rate = _configure_learning_rate(train_config,
                                                 model.global_step)
        optimizer = _configure_optimizer(train_config, learning_rate)
        tf.summary.scalar('learning_rate', learning_rate)

        # Set up the training ops
        opt_op = tf.contrib.layers.optimize_loss(
            loss=model.total_loss,
            global_step=model.global_step,
            learning_rate=learning_rate,
            optimizer=optimizer,
            clip_gradients=train_config['clip_gradients'],
            learning_rate_decay_fn=None,
            summaries=['learning_rate'])

        with tf.control_dependencies([opt_op]):
            train_op = tf.no_op(name='train')

        saver = tf.train.Saver(
            tf.global_variables(),
            max_to_keep=train_config['max_checkpoints_to_keep'])

        summary_writer = tf.summary.FileWriter(train_dir, g)
        summary_op = tf.summary.merge_all()

        global_variables_init_op = tf.global_variables_initializer()
        local_variables_init_op = tf.local_variables_initializer()
        g.finalize()  # Finalize graph to avoid adding ops by mistake

        # Dynamically allocate GPU memory
        gpu_options = tf.GPUOptions(allow_growth=True)
        sess_config = tf.ConfigProto(gpu_options=gpu_options)

        sess = tf.Session(config=sess_config)
        try:
            model_path = tf.train.latest_checkpoint(train_config['train_dir'])

            if not model_path:
                sess.run(global_variables_init_op)
                sess.run(local_variables_init_op)
                start_step = 0

                if model_config['embed_config']['embedding_checkpoint_file']:
                    model.init_fn(sess)
            else:
                logging.info(
                    'Restore from last checkpoint: {}'.format(model_path))
                sess.run(local_variables_init_op)
                saver.restore(sess, model_path)
                start_step = tf.train.global_step(
                    sess, model.global_step.name) + 1

            # Training loop
            data_config = train_config['train_data_config']
            batch_size = data_config['batch_size']
            total_steps = int(data_config['epoch'] *
                              data_config['num_examples_per_epoch'] /
                              batch_size)
            logging.info('Train for {} steps'.format(total_steps))
            checkpoint_path = osp.join(train_config['train_dir'], 'model.ckpt')

            for step in range(start_step, total_steps):
                start_time = time.time()
                _, loss, batch_loss = sess.run(
                    [train_op, model.total_loss, model.batch_loss])
                duration = time.time() - start_time

                if step % 10 == 0:
                    examples_per_sec = batch_size / float(duration)
                    time_remain = batch_size * (
                        total_steps - step) / examples_per_sec
                    m, s = divmod(time_remain, 60)
                    h, m = divmod(m, 60)
                    format_str = (
                        '%s: step %d, total loss = %.2f, batch loss = %.2f (%.1f examples/sec; %.3f '
                        'sec/batch; %dh:%02dm:%02ds remains)')
                    logging.info(format_str % (datetime.now(), step, loss,
                                               batch_loss, examples_per_sec,
                                               duration, h, m, s))

                if step % 100 == 0:
                    summary_str = sess.run(summary_op)
                    summary_writer.add_summary(summary_str, step)

                if (step % train_config['save_model_every_n_step'] == 0
                        or (step + 1) == total_steps):
                    saver.save(sess, checkpoint_path, global_step=step)
        finally:
            # Flush buffered summaries and release the session even when
            # training is aborted by an exception.
            summary_writer.close()
            sess.close()
def main(model_config, train_config, track_config):
    """Train the configured model on one or more GPUs using gradient towers.

    The batch from the training ``DataLoader`` is split across ``num_gpus``
    towers; per-tower gradients are averaged with ``average_gradients``,
    clipped by global norm and applied once per step. An optional linear
    learning-rate warmup is used when ``lr_config['lr_warmup']`` is set.
    Variables come from, in order of preference: the latest checkpoint in the
    training directory, ``model.init_fn`` (embedding checkpoint), or a
    fine-tuned checkpoint directory (after which the global step is reset to
    0).

    Progress is logged every 10 steps, summaries are written every 200 steps,
    and a checkpoint is written every
    ``train_config['save_model_every_n_step']`` steps, after the last step,
    and on ``KeyboardInterrupt`` (which also stops training). Any other
    exception raised inside a step is printed and training continues.

    Args:
        model_config: Model configuration; this function reads ``'Model'``,
            ``['embed_config']['embedding_checkpoint_file']`` and
            ``'finetuned_checkpoint_file'``.
        train_config: Training configuration; this function reads
            ``'train_dir'``, ``'seed'``, ``'lr_config'``, ``'clip_gradients'``,
            ``'max_checkpoints_to_keep'``, ``'save_model_every_n_step'``,
            ``'train_data_config'`` and ``'validation_data_config'``.
        track_config: Tracking configuration; only forwarded to ``save_cfgs``.
    """
    # GPU Config
    gpu_list = train_config['train_data_config'].get('gpu_ids', '0')
    num_gpus = len(gpu_list.split(','))
    if num_gpus > 1:
        os.environ['CUDA_VISIBLE_DEVICES'] = gpu_list
    else:
        os.environ['CUDA_VISIBLE_DEVICES'] = auto_select_gpu()

    # Create training directory which will be used to save: configurations,
    # model files, TensorBoard logs
    train_dir = train_config['train_dir']
    if not osp.isdir(train_dir):
        logging.info('Creating training directory: %s', train_dir)
        mkdir_p(train_dir)

    g = tf.Graph()
    with g.as_default():
        # Set fixed seed for reproducible experiments
        random.seed(train_config['seed'])
        np.random.seed(train_config['seed'])
        tf.set_random_seed(train_config['seed'])

        # Build global step
        with tf.name_scope('train/'):
            global_step = tf.Variable(
                initial_value=0,
                name='global_step',
                trainable=False,
                collections=[
                    tf.GraphKeys.GLOBAL_STEP, tf.GraphKeys.GLOBAL_VARIABLES
                ])

        Model = get_model(model_config['Model'])

        # Build training dataloader and validation dataloader
        # ---train
        train_dataloader = DataLoader(train_config['train_data_config'],
                                      is_training=True)
        train_dataloader.build()
        train_inputs = train_dataloader.get_one_batch()

        # ---validation
        val_dataloader = DataLoader(train_config['validation_data_config'],
                                    is_training=False)
        val_dataloader.build()
        val_inputs = val_dataloader.get_one_batch()

        # Save configurations for future reference
        save_cfgs(train_dir, model_config, train_config, track_config)

        if train_config['lr_config'].get('lr_warmup', False):
            # Ramp linearly from init_lr_ratio * initial_lr up to initial_lr
            # over the first warmup_epoch_num epochs, then hand over to the
            # configured schedule (shifted by the warmup length).
            warmup_epoch_num = 10
            init_lr_ratio = 0.8
            warmup_steps = warmup_epoch_num * int(
                train_config['train_data_config']['num_examples_per_epoch']
            ) // train_config['train_data_config']['batch_size']
            inc_per_step = (
                1 - init_lr_ratio
            ) * train_config['lr_config']['initial_lr'] / warmup_steps
            warmup_lr = train_config['lr_config'][
                'initial_lr'] * init_lr_ratio + inc_per_step * tf.to_float(
                    global_step)
            learning_rate = tf.cond(
                tf.less(global_step, warmup_steps),
                lambda: tf.identity(warmup_lr),
                lambda: _configure_learning_rate(train_config,
                                                 global_step - warmup_steps))
        else:
            learning_rate = _configure_learning_rate(train_config, global_step)

        optimizer = _configure_optimizer(train_config, learning_rate)
        tf.summary.scalar('learning_rate', learning_rate)

        # Set up the training ops: split every input tensor across the towers.
        use_time_decay = train_config['train_data_config'].get(
            'time_decay', False)
        examplars = tf.split(train_inputs[0], num_gpus)
        instances = tf.split(train_inputs[1], num_gpus)
        gt_examplar_boxes = tf.split(train_inputs[2], num_gpus)
        gt_instance_boxes = tf.split(train_inputs[3], num_gpus)
        if use_time_decay:
            time_intervals = tf.split(train_inputs[4], num_gpus)

        tower_grads = []
        with tf.variable_scope(tf.get_variable_scope()):
            for i in range(num_gpus):
                with tf.device('/gpu:%d' % i):
                    inputs = [
                        examplars[i], instances[i], gt_examplar_boxes[i],
                        gt_instance_boxes[i]
                    ]
                    if use_time_decay:
                        inputs.append(time_intervals[i])
                    model = tower_model(Model,
                                        inputs,
                                        model_config,
                                        train_config,
                                        mode='train')
                    # Reuse variables for the next tower.
                    tf.get_variable_scope().reuse_variables()
                    grads = optimizer.compute_gradients(model.total_loss)
                    tower_grads.append(grads)
        grads = average_gradients(tower_grads)

        # Clip gradient
        gradients, tvars = zip(*grads)
        clip_gradients, _ = tf.clip_by_global_norm(
            gradients, train_config['clip_gradients'])
        train_op = optimizer.apply_gradients(zip(clip_gradients, tvars),
                                             global_step=global_step)

        # Build validation model
        with tf.device('/gpu:0'):
            model_va = Model(model_config,
                             train_config,
                             mode='validation',
                             inputs=val_inputs)
            model_va.build(reuse=True)

        # Save Model setup
        saver = tf.train.Saver(
            tf.global_variables(),
            max_to_keep=train_config['max_checkpoints_to_keep'])

        summary_writer = tf.summary.FileWriter(train_dir, g)
        summary_op = tf.summary.merge_all()

        global_variables_init_op = tf.global_variables_initializer()
        local_variables_init_op = tf.local_variables_initializer()

        # Dynamically allocate GPU memory
        gpu_options = tf.GPUOptions(allow_growth=True)
        sess_config = tf.ConfigProto(gpu_options=gpu_options,
                                     allow_soft_placement=True)

        # Debug timeline: trace every step so it can be dumped as a
        # Chrome trace.
        if Debug:
            from tensorflow.python.client import timeline
            run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
            run_metadata = tf.RunMetadata()

        sess = tf.Session(config=sess_config)
        try:
            model_path = tf.train.latest_checkpoint(train_config['train_dir'])

            if not model_path:
                sess.run(global_variables_init_op)
                sess.run(local_variables_init_op)
                start_step = 0

                if model_config['embed_config']['embedding_checkpoint_file']:
                    model.init_fn(sess)
                elif model_config['finetuned_checkpoint_file']:
                    finetuned_checkpoint_file = tf.train.latest_checkpoint(
                        model_config['finetuned_checkpoint_file'])
                    logging.info('Restore from last checkpoint: {}'.format(
                        finetuned_checkpoint_file))
                    sess.run(local_variables_init_op)
                    sess.run(global_variables_init_op)
                    restore_op = tf.contrib.slim.assign_from_checkpoint_fn(
                        finetuned_checkpoint_file,
                        tf.global_variables(),
                        ignore_missing_vars=True)
                    restore_op(sess)
                    # Reset global step saved in checkpoint
                    global_step_reset_op = global_step.assign(0)
                    sess.run(global_step_reset_op)
            else:
                logging.info(
                    'Restore from last checkpoint: {}'.format(model_path))
                sess.run(local_variables_init_op)
                sess.run(global_variables_init_op)
                # ignore_missing_vars tolerates variables absent from the
                # checkpoint, unlike saver.restore.
                restore_op = tf.contrib.slim.assign_from_checkpoint_fn(
                    model_path, tf.global_variables(),
                    ignore_missing_vars=True)
                restore_op(sess)
                start_step = tf.train.global_step(sess, global_step.name) + 1

            print_trainable(sess)  # helper function, can be disabled
            g.finalize()  # Finalize graph to avoid adding ops by mistake

            # Training loop
            data_config = train_config['train_data_config']
            total_steps = int(data_config['epoch'] *
                              data_config['num_examples_per_epoch'] /
                              data_config['batch_size'])
            logging.info('Train for {} steps'.format(total_steps))
            checkpoint_path = osp.join(train_config['train_dir'], 'model.ckpt')
            fetches = [
                train_op, model.total_loss, model.batch_loss, learning_rate
            ]

            for step in range(start_step, total_steps):
                try:
                    start_time = time.time()
                    if Debug:
                        _, loss, batch_loss, current_lr = sess.run(
                            fetches,
                            run_metadata=run_metadata,
                            options=run_options)
                        t1 = timeline.Timeline(run_metadata.step_stats)
                        ctf = t1.generate_chrome_trace_format()
                        with open('timeline.json', 'w') as f:
                            f.write(ctf)
                    else:
                        _, loss, batch_loss, current_lr = sess.run(fetches)
                    duration = time.time() - start_time

                    if step % 10 == 0:
                        examples_per_sec = data_config['batch_size'] / float(
                            duration)
                        time_remain = data_config['batch_size'] * (
                            total_steps - step) / examples_per_sec
                        current_epoch = (
                            step * data_config['batch_size']
                        ) // data_config['num_examples_per_epoch'] + 1
                        m, s = divmod(time_remain, 60)
                        h, m = divmod(m, 60)
                        format_str = (
                            '%s: epoch %d-step %d,lr = %f, total loss = %.3f, batch loss = %.3f (%.1f examples/sec; %.3f '
                            'sec/batch; %dh:%02dm:%02ds remains)')
                        logging.info(
                            format_str %
                            (datetime.now(), current_epoch, step, current_lr,
                             loss, batch_loss, examples_per_sec, duration, h,
                             m, s))

                    if step % 200 == 0:
                        summary_str = sess.run(summary_op)
                        summary_writer.add_summary(summary_str, step)

                    if (step % train_config['save_model_every_n_step'] == 0
                            or (step + 1) == total_steps):
                        saver.save(sess, checkpoint_path, global_step=step)
                except KeyboardInterrupt:
                    # Keep the progress made so far, then stop training.
                    saver.save(sess, checkpoint_path, global_step=step)
                    print("save model.ckpt-%d" % (step))
                    break
                except Exception:
                    # Best effort: a failing step is reported and skipped.
                    # Catching Exception (not a bare except) lets SystemExit
                    # still terminate the run.
                    print(traceback.format_exc())
                    print("Error found in current step, continue")
        finally:
            # Flush buffered summaries and release the session on every exit
            # path.
            summary_writer.close()
            sess.close()