def evaluate(dataset):
  """Eval CIFAR-10 for a number of steps."""
  with tf.Graph().as_default() as g:
    # Get images and labels for CIFAR-10.
    eval_data = FLAGS.eval_data == 'test'
    images, labels, filenames = image_processing.distorted_inputs(dataset)

    # Build a Graph that computes the logits predictions from the
    # inference model.
    logits = DAGResnet.inference(images)
    logits = tf.nn.softmax(logits)
    shape = logits.get_shape().as_list()
    label_predict = tf.argmax(logits, axis=len(shape) - 1)

    # Restore the moving average version of the learned variables for eval.
    variable_averages = tf.train.ExponentialMovingAverage(
        DAGResnet.MOVING_AVERAGE_DECAY)
    variables_to_restore = variable_averages.variables_to_restore()
    saver = tf.train.Saver(variables_to_restore)

    # Build the summary operation based on the TF collection of Summaries.
    summary_op = tf.summary.merge_all()
    summary_writer = tf.summary.FileWriter(FLAGS.eval_dir, g)

    while True:
      eval_once(saver, summary_writer, label_predict, summary_op,
                labels, images, filenames, logits)
      if FLAGS.run_once:
        break
      time.sleep(FLAGS.eval_interval_secs)
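# A hedged sketch of the eval_once() helper invoked above: it follows the
# standard CIFAR-10 evaluation pattern (restore a checkpoint, start the
# queue runners, pull predictions). The project's real eval_once may differ;
# FLAGS.checkpoint_dir is an assumed flag, not shown in the original.
def eval_once(saver, summary_writer, label_predict, summary_op,
              labels, images, filenames, logits):
  with tf.Session() as sess:
    ckpt = tf.train.get_checkpoint_state(FLAGS.checkpoint_dir)
    if not (ckpt and ckpt.model_checkpoint_path):
      print('No checkpoint file found')
      return
    saver.restore(sess, ckpt.model_checkpoint_path)
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    try:
      # One evaluation pass; real code would loop over the eval set,
      # aggregate a metric, and write a summary via summary_writer.
      predictions, label_values = sess.run([label_predict, labels])
    finally:
      coord.request_stop()
      coord.join(threads)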
def build_inputs(self):
  """Input prefetching, preprocessing and batching.

  Outputs:
    inputs: images with 4-D Tensor [batch_size, height, width, channels]
    labels: labels in each angle class
  """
  if self.mode == 'train':
    with tf.variable_scope('images_and_labels'):
      self.images, self.labels = image_processing.distorted_inputs(
          batch_size=self.batch_size,
          num_preprocess_threads=self.num_preprocess_threads)
  elif self.mode == 'validation':
    with tf.variable_scope('images_and_labels'):
      self.images, self.labels = image_processing.inputs(
          batch_size=self.batch_size_val,
          num_preprocess_threads=self.num_preprocess_threads)
  else:
    # In inference mode, a single image is fed via placeholder.
    with tf.variable_scope('images_and_labels'):
      self.images = tf.placeholder(
          dtype=tf.float32,
          shape=[1, FLAGS.image_size, FLAGS.image_size, 3])
  print('Completed building inputs.')
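# Illustrative usage of build_inputs() across the three modes. The Model
# class name and constructor are assumptions for this sketch, not part of
# the original code.
train_model = Model(mode='train')
train_model.build_inputs()   # queue-fed, distorted training batches
val_model = Model(mode='validation')
val_model.build_inputs()     # queue-fed evaluation batches
infer_model = Model(mode='inference')
infer_model.build_inputs()   # self.images becomes a fed placeholder
# At inference time a single HxWx3 float32 image would be fed as:
# sess.run(outputs, feed_dict={infer_model.images: image[None, ...]})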
def train_distributed(args):
  res, cluster_spec = build_cluster_spec(args)
  if not res:
    ata_log("build_cluster_spec error")
    return
  res = DistributedConfig(args, cluster_spec)
  is_chief = (args.task_id == 0)
  num_parameter_servers = len(cluster_spec.as_dict()['ps'])

  # Ops are assigned to worker by default.
  with tf.device('/job:worker/task:%d' % args.task_id):
    # Variables and their related init/assign ops are assigned to ps.
    with slim.scopes.arg_scope(
        [slim.variables.variable, slim.variables.global_step],
        device=slim.variables.VariableDeviceChooser(num_parameter_servers)):
      #####################
      # data fetch config #
      #####################
      images, labels = image_processing.distorted_inputs(
          dataset,
          batch_size=args.batch_size,
          num_preprocess_threads=args.num_preprocess_threads)

      #########################
      # LearningModule config #
      #########################
      LearningModuleConfig(args)

      ##################
      # Start Training #
      ##################
      StartTraining(args)
def run_training():
  data_files_ = data_files()
  images, labels = image_processing.distorted_inputs(
      data_files_, FLAGS.num_epochs, batch_size=FLAGS.batch_size)
  labels = tf.one_hot(labels, 1000)
  logits = inference(images)
  loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(
      logits=logits, labels=labels))
  tf.summary.scalar('loss', loss)

  correct_pred = tf.equal(tf.argmax(logits, 1), tf.argmax(labels, 1))
  accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
  tf.summary.scalar('accuracy', accuracy)

  merged_summary_op = tf.summary.merge_all()
  train_op = tf.train.AdamOptimizer(epsilon=0.1).minimize(loss)
  init_op = tf.group(tf.global_variables_initializer(),
                     tf.local_variables_initializer())

  sess = tf.Session()
  sess.run(init_op)
  summary_writer = tf.summary.FileWriter(FLAGS.log_dir)
  coord = tf.train.Coordinator()
  threads = tf.train.start_queue_runners(sess=sess, coord=coord)

  # Restore previously trained weights, looked up by variable name.
  d = {}
  l = ['w1', 'b1', 'w2', 'b2', 'w3', 'b3', 'w4', 'b4', 'w5', 'b5',
       'w_fc1', 'b_fc1', 'w_fc2', 'b_fc2', 'w_output', 'b_output']
  for i in l:
    d[i] = [v for v in tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
            if v.name == i + ':0'][0]
  saver = tf.train.Saver(d)
  saver.restore(sess, FLAGS.model_path)

  try:
    step = 0
    start_time = time.time()
    while not coord.should_stop():
      start_batch = time.time()
      # Train one batch.
      _, loss_value, pred, acc = sess.run(
          [train_op, loss, correct_pred, accuracy])
      duration = time.time() - start_batch
      if step % 10 == 0:
        print('Step %d | loss = %.2f | accuracy = %.2f (%.3f sec/batch)'
              % (step, loss_value, acc, duration))
      if step % 500 == 0:
        summary = sess.run(merged_summary_op)
        summary_writer.add_summary(summary, step * FLAGS.batch_size)
      if step % 5000 == 0:
        saver.save(sess, FLAGS.model_path)
      step += 1
  except tf.errors.OutOfRangeError:
    print('Done training for %d epochs, %d steps, %.1f min.' %
          (FLAGS.num_epochs, step, (time.time() - start_time) / 60))
  finally:
    coord.request_stop()
  coord.join(threads)
  sess.close()
def tower_loss(scope):
  """Calculate the total loss on a single tower running the baxNet model.

  Args:
    scope: unique prefix string identifying the CIFAR tower, e.g. 'tower_0'

  Returns:
    Tensor of shape [] containing the total loss for a batch of data
  """
  dataset = ImagenetData(subset='train')
  assert dataset.data_files()

  num_preprocess_threads = FLAGS.num_preprocess_threads * FLAGS.num_gpus
  images, labels = image_processing.distorted_inputs(
      dataset, num_preprocess_threads=num_preprocess_threads)

  # Build inference Graph.
  logits = baxNet.inference(images)

  # Build the portion of the Graph calculating the losses. Note that we will
  # assemble the total_loss using a custom function below.
  _ = baxNet.loss(logits, labels)

  # Assemble all of the losses for the current tower only.
  losses = tf.get_collection('losses', scope)

  # Calculate the total loss for the current tower.
  total_loss = tf.add_n(losses, name='total_loss')

  # Compute the moving average of all individual losses and the total loss.
  loss_averages = tf.train.ExponentialMovingAverage(0.9, name='avg')
  loss_averages_op = loss_averages.apply(losses + [total_loss])

  # Attach a scalar summary to all individual losses and the total loss; do
  # the same for the averaged version of the losses.
  for l in losses + [total_loss]:
    # Remove 'tower_[0-9]/' from the name in case this is a multi-GPU
    # training session. This helps the clarity of presentation on TensorBoard.
    loss_name = re.sub('%s_[0-9]*/' % baxNet.TOWER_NAME, '', l.op.name)
    # Name each loss as '(raw)' and name the moving average version of the
    # loss as the original loss name.
    tf.summary.scalar(loss_name + ' (raw)', l)
    tf.summary.scalar(loss_name, loss_averages.average(l))

  with tf.control_dependencies([loss_averages_op]):
    total_loss = tf.identity(total_loss)
  return total_loss
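# For context, tower_loss() is meant to be called once per GPU tower. A
# minimal sketch of that loop, mirroring the multi-tower pattern used by the
# other train() functions in this file; the optimizer `opt` is assumed to be
# created by the caller.
tower_grads = []
for i in range(FLAGS.num_gpus):
  with tf.device('/gpu:%d' % i):
    with tf.name_scope('%s_%d' % (baxNet.TOWER_NAME, i)) as scope:
      loss = tower_loss(scope)
      # Reuse variables for the next tower.
      tf.get_variable_scope().reuse_variables()
      tower_grads.append(opt.compute_gradients(loss))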
def build_train_graph(config, dataset):
  with tf.device('/cpu:0'):
    inputs, labels = image_processing.distorted_inputs(
        dataset,
        batch_size=config['parameters']['batch_size'],
        height=config['input']['height'],
        width=config['input']['width'],
        channels=config['input']['channels'],
        add_variations=config['parameters']['additional_variations'],
        num_preprocess_threads=8)

  with tf.device('/gpu:0'):
    logits, endpoints = cnn_architectures.create_model(
        config['model']['architecture'],
        inputs,
        is_training=True,
        num_classes=config['input']['classes'],
        reuse=None)

    if config['parameters']['loss'] == 'regression':
      # If needed, change to type int64.
      labels = tf.cast(labels - config['parameters']['label_mean'],
                       tf.float32)
      mean_squared_error = tf.losses.mean_squared_error(
          labels=labels, predictions=logits)
      loss = tf.add_n(
          [mean_squared_error] +
          tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES),
          name='total_loss')
      accuracy = tf.constant(0, shape=[], dtype=tf.float32)
    elif config['parameters']['loss'] == 'classification':
      labels = tf.cast(labels // 5, tf.int64)
      cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
          logits=logits, labels=labels)
      cross_entropy_mean = tf.reduce_mean(cross_entropy)
      loss = tf.add_n(
          [cross_entropy_mean] +
          tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES),
          name='total_loss')
      correct_prediction = tf.equal(tf.argmax(logits, 1), labels)
      accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

    tf.summary.scalar('loss', loss, collections=['train'])
    tf.summary.scalar('accuracy', accuracy, collections=['train'])

    if config['output']['trainable_variables_to_summary']:
      for var in tf.trainable_variables():
        tf.summary.histogram(var.op.name, var, collections=['train'])

    return loss, accuracy, tf.summary.merge_all(key='train')
def val(train_loss, dataset):
  with tf.name_scope("val_process"):
    with tf.device('/cpu:0'):
      val_images, val_labels = image_processing.distorted_inputs(
          dataset, num_preprocess_threads=FLAGS.num_preprocess_threads)
    val_logits = _logits(val_images)
    val_loss = _loss(val_logits, val_labels)
    val_acc = tf.nn.in_top_k(val_logits, val_labels, 1)
    val_acc_sum = tf.reduce_mean(tf.cast(val_acc, tf.float32))
  with tf.name_scope("loss"):
    tf.summary.scalar('train_loss', train_loss)
    tf.summary.scalar('val_loss', val_loss)
  return val_acc_sum
def train(train_dir, batch_size, num_batches, log_dir,
          dataset=FilmData('train')):
  # Calculate the learning rate schedule.
  num_batches_per_epoch = (dataset.num_examples_per_epoch() /
                           FLAGS.batch_size)

  images, labels = image_processing.distorted_inputs(dataset)
  predictions = simple(images[0])
  slim.losses.softmax_cross_entropy(predictions, labels[0])
  total_loss = slim.losses.get_total_loss()
  tf.summary.scalar('loss', total_loss)

  optimizer = tf.train.RMSPropOptimizer(0.001, 0.9)
  train_op = slim.learning.create_train_op(total_loss, optimizer,
                                           summarize_gradients=True)

  # The original snippet used `sess`, `summary_op`, `summary_writer` and
  # `saver` without defining them; they are created here so the loop runs.
  summary_op = tf.summary.merge_all()
  saver = tf.train.Saver()
  sess = tf.Session()
  sess.run(tf.global_variables_initializer())
  tf.train.start_queue_runners(sess=sess)
  summary_writer = tf.summary.FileWriter(log_dir, sess.graph)

  for step in range(FLAGS.max_steps):
    start_time = time.time()
    _, loss_value = sess.run([train_op, total_loss])
    duration = time.time() - start_time
    assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

    if step % 10 == 0:
      examples_per_sec = FLAGS.batch_size / float(duration)
      format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                    'sec/batch)')
      print(format_str % (datetime.now(), step, loss_value,
                          examples_per_sec, duration))

    if step % 100 == 0:
      summary_str = sess.run(summary_op)
      summary_writer.add_summary(summary_str, step)

    # Save the model checkpoint periodically.
    if step % 5000 == 0 or (step + 1) == FLAGS.max_steps:
      checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
      saver.save(sess, checkpoint_path, global_step=step)
def main(argv=None):
  ps_hosts = FLAGS.ps_hosts.split(',')
  worker_hosts = FLAGS.worker_hosts.split(',')
  tf.logging.info('PS hosts are: %s' % ps_hosts)
  tf.logging.info('Worker hosts are: %s' % worker_hosts)

  cluster_spec = tf.train.ClusterSpec({'ps': ps_hosts,
                                       'worker': worker_hosts})
  server = tf.train.Server({'ps': ps_hosts, 'worker': worker_hosts},
                           job_name=FLAGS.job_name,
                           task_index=FLAGS.task_id,
                           protocol=FLAGS.protocol)

  sspManager = SspManager(len(worker_hosts), 5)
  if FLAGS.job_name == 'ps':
    if FLAGS.task_id == 0:
      rpcServer = sspManager.create_rpc_server(ps_hosts[0].split(':')[0])
      rpcServer.serve()
    server.join()

  time.sleep(5)
  rpcClient = sspManager.create_rpc_client(ps_hosts[0].split(':')[0])

  dataset = ImagenetData(subset=FLAGS.subset)
  assert dataset.data_files()
  is_chief = (FLAGS.task_id == 0)
  if is_chief:
    if not tf.gfile.Exists(FLAGS.train_dir):
      tf.gfile.MakeDirs(FLAGS.train_dir)

  num_workers = len(cluster_spec.as_dict()['worker'])
  num_parameter_servers = len(cluster_spec.as_dict()['ps'])

  with tf.device('/job:worker/task:%d' % FLAGS.task_id):
    with slim.scopes.arg_scope(
        [slim.variables.variable, slim.variables.global_step],
        device=slim.variables.VariableDeviceChooser(num_parameter_servers)):
      # Prepare input.
      global_step = slim.variables.global_step()
      batch_size = tf.placeholder(dtype=tf.int32, shape=(),
                                  name='batch_size')
      images, labels = image_processing.distorted_inputs(
          dataset, batch_size,
          num_preprocess_threads=FLAGS.num_preprocess_threads)
      num_classes = dataset.num_classes() + 1

      # Inference.
      logits = inception.inference(images, num_classes, for_training=True)

      # Loss.
      inception.loss(logits, labels, batch_size)
      losses = tf.get_collection(slim.losses.LOSSES_COLLECTION)
      losses += tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
      total_loss = tf.add_n(losses, name='total_loss')

      if is_chief:
        loss_averages = tf.train.ExponentialMovingAverage(0.9, name='avg')
        loss_averages_op = loss_averages.apply(losses + [total_loss])
        with tf.control_dependencies([loss_averages_op]):
          total_loss = tf.identity(total_loss)

      # Optimizer.
      exp_moving_averager = tf.train.ExponentialMovingAverage(
          inception.MOVING_AVERAGE_DECAY, global_step)
      variables_to_average = (tf.trainable_variables() +
                              tf.moving_average_variables())

      num_batches_per_epoch = (dataset.num_examples_per_epoch() /
                               FLAGS.batch_size)
      decay_steps = int(num_batches_per_epoch *
                        FLAGS.num_epochs_per_decay / num_workers)
      lr = tf.train.exponential_decay(FLAGS.initial_learning_rate,
                                      global_step,
                                      decay_steps,
                                      FLAGS.learning_rate_decay_factor,
                                      staircase=True)
      opt = tf.train.RMSPropOptimizer(lr,
                                      RMSPROP_DECAY,
                                      momentum=RMSPROP_MOMENTUM,
                                      epsilon=RMSPROP_EPSILON)

      # Train operation.
      batchnorm_updates = tf.get_collection(slim.ops.UPDATE_OPS_COLLECTION)
      assert batchnorm_updates, 'Batchnorm updates are missing'
      batchnorm_updates_op = tf.group(*batchnorm_updates)
      with tf.control_dependencies([batchnorm_updates_op]):
        total_loss = tf.identity(total_loss)

      naive_grads = opt.compute_gradients(total_loss)
      # Rescale gradients when the fed batch size differs from
      # FLAGS.batch_size.
      grads = [(tf.scalar_mul(tf.cast(batch_size / FLAGS.batch_size,
                                      tf.float32), grad), var)
               for grad, var in naive_grads]
      apply_gradients_op = opt.apply_gradients(grads,
                                               global_step=global_step)
      with tf.control_dependencies([apply_gradients_op]):
        train_op = tf.identity(total_loss, name='train_op')

      # Supervisor and session.
      saver = tf.train.Saver()
      init_op = tf.global_variables_initializer()
      sv = tf.train.Supervisor(is_chief=is_chief,
                               logdir=FLAGS.train_dir,
                               init_op=init_op,
                               summary_op=None,
                               global_step=global_step,
                               recovery_wait_secs=1,
                               saver=saver,
                               save_model_secs=FLAGS.save_interval_secs)
      tf.logging.info('%s Supervisor' % datetime.now())
      sess_config = tf.ConfigProto(
          allow_soft_placement=True,
          log_device_placement=FLAGS.log_device_placement)
      sess = sv.prepare_or_wait_for_session(server.target,
                                            config=sess_config)
      queue_runners = tf.get_collection(tf.GraphKeys.QUEUE_RUNNERS)

      # Start training.
      sv.start_queue_runners(sess, queue_runners)
      tf.logging.info('Started %d queues for processing input data.',
                      len(queue_runners))

      batch_size_num = FLAGS.batch_size
      for step in range(FLAGS.max_steps):
        start_time = time.time()
        run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
        run_metadata = tf.RunMetadata()

        loss_value, gs = sess.run(
            [train_op, global_step],
            feed_dict={batch_size: batch_size_num},
            options=run_options,
            run_metadata=run_metadata)
        assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

        duration = time.time() - start_time
        examples_per_sec = batch_size_num / float(duration)
        sec_per_batch = float(duration)
        format_str = ('time: ' + str(time.time()) +
                      '; %s: step %d (gs %d), loss = %.2f '
                      '(%.1f samples/s; %.3f s/batch)')
        tf.logging.info(format_str % (datetime.now(), step, gs, loss_value,
                                      examples_per_sec, sec_per_batch))
        rpcClient.check_staleness(FLAGS.task_id, step)
def main(_):
  trainset = GoodsData('train')
  assert trainset.data_files()
  validationset = GoodsData('validation')
  assert validationset.data_files()

  num_batches_per_epoch = (trainset.num_examples_per_epoch() /
                           FLAGS.batch_size)
  num_preprocess_threads = FLAGS.num_preprocess_threads * FLAGS.num_gpus

  if FLAGS.from_official:
    train_batch_size = FLAGS.batch_size * 4
  else:
    train_batch_size = FLAGS.batch_size
  print('train_batch_size', train_batch_size)

  images_train, labels_train = image_processing.distorted_inputs(
      trainset, batch_size=train_batch_size,
      num_preprocess_threads=num_preprocess_threads)
  images_validation, labels_validation = image_processing.distorted_inputs(
      validationset, batch_size=64,
      num_preprocess_threads=num_preprocess_threads)

  input_summaries = copy.copy(tf.get_collection(tf.GraphKeys.SUMMARIES))

  # Number of classes in the Dataset label set plus 1.
  # Label 0 is reserved for an (unused) background class.
  num_classes = trainset.num_classes() + 1

  images = tf.placeholder(
      tf.float32, [None, images_train.shape[1], images_train.shape[2], 3],
      name="input_images")
  labels = tf.placeholder(tf.int64, [None], name="labels")

  with slim.arg_scope(inception_v3.inception_v3_arg_scope()):
    logits, _ = inception_v3.inception_v3(images, num_classes=num_classes)

  if FLAGS.from_official:
    tuned_variables = get_tuned_variables()
    trainable_variables = get_trainable_variables()
    checkpoint_path = FLAGS.official_checkpoint_path
  else:
    tuned_variables = get_all_variables()
    trainable_variables = get_all_variables()
    checkpoint_path = FLAGS.pretrained_model_checkpoint_path

  # Define the cross-entropy loss and optimize it over the trainable
  # variables only.
  loss = tf.losses.softmax_cross_entropy(
      tf.one_hot(labels, num_classes), logits, weights=1.0)
  tf.summary.scalar('loss', loss)
  optimizer = tf.train.AdamOptimizer()
  train_step = optimizer.minimize(loss, var_list=trainable_variables)

  # Compute accuracy.
  with tf.name_scope("evaluation"):
    correct_prediction = tf.equal(tf.argmax(logits, 1), labels)
    evaluation_step = tf.reduce_mean(
        tf.cast(correct_prediction, tf.float32))
    tf.summary.scalar('validation_accuracy', evaluation_step)

  # Load the pretrained weights.
  checkpoint_path = tf.train.latest_checkpoint(checkpoint_path)
  load_fn = slim.assign_from_checkpoint_fn(checkpoint_path,
                                           tuned_variables,
                                           ignore_missing_vars=True)

  # Saver for the fine-tuned weights.
  saver = tf.train.Saver()

  config = tf.ConfigProto(allow_soft_placement=True,
                          log_device_placement=FLAGS.log_device_placement)
  config.gpu_options.allow_growth = True
  sess = tf.Session(config=config)
  sess.run(tf.global_variables_initializer())

  merged = tf.summary.merge_all()
  writer = tf.summary.FileWriter("logs/", sess.graph)

  print("loading tuned variables from %s" % checkpoint_path)
  load_fn(sess)

  coord = tf.train.Coordinator()
  threads = tf.train.start_queue_runners(sess=sess, coord=coord)

  if tf.gfile.Exists(FLAGS.train_dir):
    tf.gfile.DeleteRecursively(FLAGS.train_dir)
  tf.gfile.MakeDirs(FLAGS.train_dir)
  checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
  saver.save(sess, checkpoint_path, global_step=0)

  # Build debug ops once, outside the loop, so the graph does not grow
  # with every training step.
  one_hot_labels = tf.one_hot(labels, num_classes)
  softmax_logits = tf.nn.softmax(logits)

  for step in range(FLAGS.max_steps):
    start_time = time.time()
    image_batch, label_batch = sess.run([images_train, labels_train])
    sess.run(train_step, feed_dict={images: image_batch,
                                    labels: label_batch})
    duration = time.time() - start_time

    if step % 5 == 0:
      examples_per_sec = FLAGS.batch_size / float(duration)
      format_str = '%s: step %d, (%.1f examples/sec; %.3f sec/batch)'
      print(format_str % (datetime.now(), step, examples_per_sec,
                          duration))
      feed = {images: image_batch, labels: label_batch}
      print(sess.run(labels, feed_dict=feed))
      print(sess.run(one_hot_labels, feed_dict=feed))
      print(sess.run(softmax_logits, feed_dict=feed))

    if step % 50 == 0:
      image_batch, label_batch = sess.run(
          [images_validation, labels_validation])
      feed = {images: image_batch, labels: label_batch}
      validation_accuracy = sess.run(evaluation_step, feed_dict=feed)
      result = sess.run(merged, feed_dict=feed)
      writer.add_summary(result, step)
      print('Step %d: Validation accuracy = %.1f%%' %
            (step, validation_accuracy * 100.0))

    # Save the model checkpoint periodically.
    if step % 1000 == 0 or (step + 1) == FLAGS.max_steps:
      checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
      saver.save(sess, checkpoint_path)
def train(dataset):
  """Train on dataset for a number of steps."""
  with tf.Graph().as_default(), tf.device('/cpu:0'):
    # Create a variable to count the number of train() calls. This equals
    # the number of batches processed * FLAGS.num_gpus.
    global_step = tf.get_variable('global_step', [],
                                  initializer=tf.constant_initializer(0),
                                  trainable=False)

    # Calculate the learning rate schedule.
    num_batches_per_epoch = (dataset.num_examples_per_epoch() /
                             FLAGS.batch_size)
    decay_steps = int(num_batches_per_epoch * FLAGS.num_epochs_per_decay)

    # Decay the learning rate exponentially based on the number of steps.
    lr = tf.train.exponential_decay(FLAGS.initial_learning_rate,
                                    global_step,
                                    decay_steps,
                                    FLAGS.learning_rate_decay_factor,
                                    staircase=True)

    # Create an optimizer that performs gradient descent.
    opt = tf.train.RMSPropOptimizer(lr, RMSPROP_DECAY,
                                    momentum=RMSPROP_MOMENTUM,
                                    epsilon=RMSPROP_EPSILON)

    # Get images and labels for ImageNet and split the batch across GPUs.
    assert FLAGS.batch_size % FLAGS.num_gpus == 0, (
        'Batch size must be divisible by number of GPUs')
    split_batch_size = int(FLAGS.batch_size / FLAGS.num_gpus)

    # Override the number of preprocessing threads to account for the
    # increased number of GPU towers.
    num_preprocess_threads = FLAGS.num_preprocess_threads * FLAGS.num_gpus
    images, labels = image_processing.distorted_inputs(
        dataset, num_preprocess_threads=num_preprocess_threads)

    input_summaries = copy.copy(tf.get_collection(tf.GraphKeys.SUMMARIES))

    # Number of classes in the Dataset label set plus 1.
    # Label 0 is reserved for an (unused) background class.
    num_classes = dataset.num_classes() + 1

    # Split the batch of images and labels for towers.
    images_splits = tf.split(axis=0, num_or_size_splits=FLAGS.num_gpus,
                             value=images)
    labels_splits = tf.split(axis=0, num_or_size_splits=FLAGS.num_gpus,
                             value=labels)

    # Calculate the gradients for each model tower.
    tower_grads = []
    reuse_variables = None
    for i in range(FLAGS.num_gpus):
      with tf.device('/gpu:%d' % i):
        with tf.name_scope('%s_%d' % (inception.TOWER_NAME, i)) as scope:
          # Force all Variables to reside on the CPU.
          with slim.arg_scope([slim.variables.variable], device='/cpu:0'):
            # Calculate the loss for one tower of the ImageNet model. This
            # function constructs the entire ImageNet model but shares the
            # variables across all towers.
            loss = _tower_loss(images_splits[i], labels_splits[i],
                               num_classes, scope, reuse_variables)

          # Reuse variables for the next tower.
          reuse_variables = True

          # Retain the summaries from the final tower.
          summaries = tf.get_collection(tf.GraphKeys.SUMMARIES, scope)

          # Retain the Batch Normalization updates operations only from the
          # final tower. Ideally, we should grab the updates from all towers
          # but these stats accumulate extremely fast so we can ignore the
          # other stats from the other towers without significant detriment.
          batchnorm_updates = tf.get_collection(
              slim.ops.UPDATE_OPS_COLLECTION, scope)

          # Calculate the gradients for the batch of data on this ImageNet
          # tower.
          grads = opt.compute_gradients(loss)

          # Keep track of the gradients across all towers.
          tower_grads.append(grads)

    # We must calculate the mean of each gradient. Note that this is the
    # synchronization point across all towers.
    grads = _average_gradients(tower_grads)

    # Add summaries for the input processing and global_step.
    summaries.extend(input_summaries)

    # Add a summary to track the learning rate.
    summaries.append(tf.summary.scalar('learning_rate', lr))

    # Add histograms for gradients.
    for grad, var in grads:
      if grad is not None:
        summaries.append(
            tf.summary.histogram(var.op.name + '/gradients', grad))

    # Apply the gradients to adjust the shared variables.
    apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)

    # Add histograms for trainable variables.
    for var in tf.trainable_variables():
      summaries.append(tf.summary.histogram(var.op.name, var))

    # Track the moving averages of all trainable variables.
    # Note that we maintain a "double-average" of the BatchNormalization
    # global statistics. This is more complicated than need be but we employ
    # this for backward-compatibility with our previous models.
    variable_averages = tf.train.ExponentialMovingAverage(
        inception.MOVING_AVERAGE_DECAY, global_step)

    # Another possibility is to use tf.slim.get_variables().
    variables_to_average = (tf.trainable_variables() +
                            tf.moving_average_variables())
    variables_averages_op = variable_averages.apply(variables_to_average)

    # Group all updates into a single train op.
    batchnorm_updates_op = tf.group(*batchnorm_updates)
    train_op = tf.group(apply_gradient_op, variables_averages_op,
                        batchnorm_updates_op)

    # Create a saver.
    saver = tf.train.Saver(tf.global_variables())

    # Build the summary operation from the last tower summaries.
    summary_op = tf.summary.merge(summaries)

    # Build an initialization operation to run below.
    init = tf.global_variables_initializer()

    # Start running operations on the Graph. allow_soft_placement must be
    # set to True to build towers on GPU, as some of the ops do not have GPU
    # implementations.
    sess = tf.Session(config=tf.ConfigProto(
        allow_soft_placement=True,
        log_device_placement=FLAGS.log_device_placement))
    sess.run(init)

    if FLAGS.pretrained_model_checkpoint_path:
      assert tf.gfile.Exists(FLAGS.pretrained_model_checkpoint_path)
      variables_to_restore = tf.get_collection(
          slim.variables.VARIABLES_TO_RESTORE)
      restorer = tf.train.Saver(variables_to_restore)
      restorer.restore(sess, FLAGS.pretrained_model_checkpoint_path)
      print('%s: Pre-trained model restored from %s' %
            (datetime.now(), FLAGS.pretrained_model_checkpoint_path))

    # Start the queue runners.
    tf.train.start_queue_runners(sess=sess)

    summary_writer = tf.summary.FileWriter(FLAGS.train_dir,
                                           graph=sess.graph)

    for step in range(FLAGS.max_steps):
      start_time = time.time()
      _, loss_value = sess.run([train_op, loss])
      duration = time.time() - start_time

      assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

      if step % 10 == 0:
        examples_per_sec = FLAGS.batch_size / float(duration)
        format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                      'sec/batch)')
        print(format_str % (datetime.now(), step, loss_value,
                            examples_per_sec, duration))

      if step % 100 == 0:
        summary_str = sess.run(summary_op)
        summary_writer.add_summary(summary_str, step)

      # Save the model checkpoint periodically.
      if step % 5000 == 0 or (step + 1) == FLAGS.max_steps:
        checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
        saver.save(sess, checkpoint_path, global_step=step)
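# The train() above relies on _average_gradients(); for reference, this is
# a sketch of the canonical implementation from the TensorFlow multi-GPU
# examples (the project's own helper may differ slightly).
def _average_gradients(tower_grads):
  """Average gradients across towers.

  tower_grads is a list (over towers) of lists of (gradient, variable)
  pairs; the result is a single list of averaged (gradient, variable) pairs.
  """
  average_grads = []
  for grad_and_vars in zip(*tower_grads):
    # grad_and_vars is ((grad0_gpu0, var0_gpu0), ..., (grad0_gpuN, var0_gpuN)).
    grads = [tf.expand_dims(g, 0) for g, _ in grad_and_vars]
    grad = tf.reduce_mean(tf.concat(grads, 0), 0)
    # Variables are shared across towers, so the first tower's var suffices.
    average_grads.append((grad, grad_and_vars[0][1]))
  return average_grads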
model_file = "retrained_graph.pb"
label_file = "retrained_labels.txt"

trainset = GoodsData('train')
assert trainset.data_files()
validationset = GoodsData('validation')
assert validationset.data_files()
labels_output = load_labels(FLAGS.labels_file)

num_batches_per_epoch = (trainset.num_examples_per_epoch() /
                         FLAGS.batch_size)
num_preprocess_threads = FLAGS.num_preprocess_threads * FLAGS.num_gpus

images_train, labels_train = image_processing.distorted_inputs(
    trainset, num_preprocess_threads=num_preprocess_threads)
images_validation, labels_validation = image_processing.distorted_inputs(
    validationset, num_preprocess_threads=num_preprocess_threads)

input_summaries = copy.copy(tf.get_collection(tf.GraphKeys.SUMMARIES))

# Number of classes in the Dataset label set plus 1.
num_classes = trainset.num_classes() + 1

print(images_train.shape)
print(labels_train.shape)

images = tf.placeholder(
    tf.float32, [None, images_train.shape[1], images_train.shape[2], 3],
    name="input_images")
labels = tf.placeholder(tf.int64, [None], name="labels")

with slim.arg_scope(inception_v3.inception_v3_arg_scope()):
  logits, endpoints = inception_v3.inception_v3(images,
                                                num_classes=num_classes)
def train(dataset):
  """Train on dataset for a number of steps."""
  with tf.Graph().as_default():
    global_step = tf.Variable(0, trainable=False)

    num_preprocess_threads = FLAGS.num_preprocess_threads * FLAGS.num_gpus
    with tf.device('/cpu:0'):
      images, pitchs, yaws, rolls, names = image_processing.distorted_inputs(
          dataset, num_preprocess_threads=num_preprocess_threads)

    # Stack the three pose angles into a [batch_size, 3] label tensor.
    p = tf.expand_dims(pitchs, 1)
    y = tf.expand_dims(yaws, 1)
    r = tf.expand_dims(rolls, 1)
    labels = tf.concat([p, y, r], 1)

    train_output = model.inference(images, FLAGS.is_training)
    train_loss = model.losses(train_output, labels)
    add_global = global_step.assign_add(1)
    train_op = model.trainning(train_loss, FLAGS.learning_rate, global_step)

    summary_op = tf.summary.merge_all()
    sess = tf.Session()
    train_writer = tf.summary.FileWriter(FLAGS.train_dir, sess.graph)
    saver = tf.train.Saver()
    # (Optional) restore from FLAGS.checkpoint_dir here when resuming.

    sess.run(tf.global_variables_initializer())

    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    try:
      for step in np.arange(FLAGS.max_steps):
        if coord.should_stop():
          break
        _, _, tra_loss = sess.run([add_global, train_op, train_loss])

        if step % 50 == 0:
          print('Step %d, train loss = %.2f' % (step, tra_loss))
          summary_str = sess.run(summary_op)
          train_writer.add_summary(summary_str, step)

        if step % 2000 == 0 or (step + 1) == FLAGS.max_steps:
          checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
          saver.save(sess, checkpoint_path, global_step=step)
    except tf.errors.OutOfRangeError:
      print('Done training -- epoch limit reached')
    finally:
      coord.request_stop()
    coord.join(threads)
    sess.close()
def main(_):
  with tf.Graph().as_default(), tf.device('/cpu:0'):
    dataset = ImagenetData(subset=FLAGS.subset)
    assert dataset.data_files()

    global_step = tf.get_variable('global_step', [],
                                  initializer=tf.constant_initializer(0),
                                  trainable=False)

    # Calculate the learning rate schedule.
    num_batches_per_epoch = (dataset.num_examples_per_epoch() /
                             FLAGS.batch_size)
    decay_steps = int(num_batches_per_epoch * FLAGS.num_epochs_per_decay)

    # Decay the learning rate exponentially based on the number of steps.
    learning_rate = tf.train.exponential_decay(
        FLAGS.learning_rate,
        global_step,
        decay_steps,
        FLAGS.learning_rate_decay_factor,
        staircase=True)
    tf.summary.scalar('lr', learning_rate)

    is_training = tf.placeholder(tf.bool)

    opt = tf.train.RMSPropOptimizer(learning_rate, RMSPROP_DECAY,
                                    momentum=RMSPROP_MOMENTUM,
                                    epsilon=RMSPROP_EPSILON)

    with tf.name_scope("create_inputs"):
      # Get images and labels for ImageNet and split the batch across GPUs.
      assert FLAGS.batch_size % FLAGS.gpu_nums == 0, (
          'Batch size must be divisible by number of GPUs')
      split_batch_size = int(FLAGS.batch_size / FLAGS.gpu_nums)

      # Override the number of preprocessing threads to account for the
      # increased number of GPU towers.
      num_preprocess_threads = FLAGS.num_preprocess_threads * FLAGS.gpu_nums
      images, labels = image_processing.distorted_inputs(
          dataset, num_preprocess_threads=num_preprocess_threads)

      images_splits = tf.split(axis=0, num_or_size_splits=FLAGS.gpu_nums,
                               value=images)
      labels_splits = tf.split(
          axis=0, num_or_size_splits=FLAGS.gpu_nums,
          value=tf.one_hot(indices=labels, depth=FLAGS.num_classes))

    multi_grads = []
    with tf.variable_scope(tf.get_variable_scope()):
      for i in range(FLAGS.gpu_nums):
        with tf.device('/gpu:%d' % i):
          with tf.name_scope('%s_%d' % ('ImageNet', i)) as scope:
            graph = Model_Graph(num_class=FLAGS.num_classes,
                                is_training=is_training)
            model = graph._build_defaut_graph(images=images_splits[i])

            # Top-1 accuracy
            top1acc = tf.reduce_mean(
                tf.cast(tf.nn.in_top_k(model.logits,
                                       tf.argmax(labels_splits[i], axis=1),
                                       1), tf.float32))
            # Top-n accuracy
            topnacc = tf.reduce_mean(
                tf.cast(tf.nn.in_top_k(model.logits,
                                       tf.argmax(labels_splits[i], axis=1),
                                       FLAGS.top_k), tf.float32))
            tf.summary.scalar('top1acc_{}'.format(i), top1acc)
            tf.summary.scalar('topkacc_{}'.format(i), topnacc)

            all_trainable = [v for v in tf.trainable_variables()]

            loss = tf.nn.softmax_cross_entropy_with_logits(
                logits=model.logits, labels=labels_splits[i])
            l2_losses = [FLAGS.weight_decay * tf.nn.l2_loss(v)
                         for v in tf.trainable_variables()
                         if 'weights' in v.name]
            reduced_loss = tf.reduce_mean(loss) + tf.add_n(l2_losses)
            tf.summary.scalar('loss_{}'.format(i), reduced_loss)

            tf.get_variable_scope().reuse_variables()

            batchnorm_updates = tf.get_collection(tf.GraphKeys.UPDATE_OPS)

            grads = opt.compute_gradients(reduced_loss, all_trainable)
            multi_grads.append(grads)

    grads = average_gradients(multi_grads)

    # Track the moving averages of all trainable variables.
    # Note that we maintain a "double-average" of the BatchNormalization
    # global statistics. This is more complicated than need be but we employ
    # this for backward-compatibility with our previous models.
    variable_averages = tf.train.ExponentialMovingAverage(
        FLAGS.MOVING_AVERAGE_DECAY, global_step)
    variables_to_average = (tf.trainable_variables() +
                            tf.moving_average_variables())
    variables_averages_op = variable_averages.apply(variables_to_average)

    # Group all updates into a single train op.
    batchnorm_updates_op = tf.group(*batchnorm_updates)
    train_op = tf.group(opt.apply_gradients(grads, global_step),
                        variables_averages_op, batchnorm_updates_op)

    summary_op = tf.summary.merge_all()

    # Set up tf session and initialize variables.
    config = tf.ConfigProto()
    config.allow_soft_placement = True
    sess = tf.Session(config=config)
    init = tf.global_variables_initializer()
    sess.run(init)

    # Saver for storing checkpoints of the model.
    saver = tf.train.Saver(var_list=tf.global_variables(), max_to_keep=2)
    restore_var = [v for v in tf.trainable_variables()] + \
                  [v for v in tf.global_variables()
                   if 'moving_mean' in v.name or
                      'moving_variance' in v.name or
                      'global_step' in v.name]

    ckpt = tf.train.get_checkpoint_state(FLAGS.SNAPSHOT_DIR)
    if ckpt and ckpt.model_checkpoint_path:
      loader = tf.train.Saver(var_list=restore_var)
      load(loader, sess, ckpt.model_checkpoint_path)
    else:
      print('No checkpoint file found.')
      load_step = 0

    summary_writer = tf.summary.FileWriter(FLAGS.SNAPSHOT_DIR,
                                           graph=sess.graph)

    # Iterate over training steps.
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(coord=coord, sess=sess)

    for step in range(FLAGS.num_steps):
      start_time = time.time()
      feed_dict = {is_training: True}
      if step % 50000 == 0 and step != 0:
        loss_value, _ = sess.run([reduced_loss, train_op],
                                 feed_dict=feed_dict)
        save(saver, sess, FLAGS.SNAPSHOT_DIR, step)
      elif step % 100 == 0:
        summary_str, loss_value, _ = sess.run(
            [summary_op, reduced_loss, train_op], feed_dict=feed_dict)
        duration = time.time() - start_time
        summary_writer.add_summary(summary_str, step)
        summary_writer.flush()
        print('step {:d} \t loss = {:.3f}, ({:.3f} sec/step)'.format(
            step, loss_value, duration))
      else:
        loss_value, _ = sess.run([reduced_loss, train_op],
                                 feed_dict=feed_dict)

    coord.request_stop()
    coord.join(threads)
def train(dataset, dataset_val=None):
  """Train CIFAR-10 for a number of steps."""
  with tf.Graph().as_default(), tf.device('/cpu:0'):
    # Create a variable to count the number of train() calls. This equals
    # the number of batches processed * FLAGS.num_gpus.
    global_step = tf.get_variable('global_step', [],
                                  initializer=tf.constant_initializer(0),
                                  trainable=False)

    # Calculate the learning rate schedule.
    num_batches_per_epoch = (dataset.NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN /
                             FLAGS.batch_size)
    decay_steps = int(num_batches_per_epoch *
                      DAGResnet.NUM_EPOCHS_PER_DECAY)

    # Decay the learning rate exponentially based on the number of steps.
    lr = tf.train.exponential_decay(DAGResnet.INITIAL_LEARNING_RATE,
                                    global_step,
                                    decay_steps,
                                    DAGResnet.LEARNING_RATE_DECAY_FACTOR,
                                    staircase=True)

    # Create an optimizer that performs gradient descent.
    opt = tf.train.GradientDescentOptimizer(lr)

    assert FLAGS.batch_size % FLAGS.num_gpus == 0, (
        'Batch size must be divisible by number of GPUs')
    split_batch_size = int(FLAGS.batch_size / FLAGS.num_gpus)

    # Override the number of preprocessing threads to account for the
    # increased number of GPU towers.
    num_preprocess_threads = FLAGS.num_preprocess_threads * FLAGS.num_gpus
    images, labels, filenames = image_processing.distorted_inputs(
        dataset, num_preprocess_threads=num_preprocess_threads)

    # Split the batch of images and labels for towers.
    # (tf.split argument order updated for TF >= 1.0, June 15, 2017.)
    images_splits = tf.split(images, FLAGS.num_gpus, 0)
    labels_splits = tf.split(labels, FLAGS.num_gpus, 0)

    if dataset_val is not None:
      images_val, labels_val, filenames_val = \
          image_processing.distorted_inputs(
              dataset_val, num_preprocess_threads=num_preprocess_threads)
      images_val_splits = tf.split(images_val, FLAGS.num_gpus, 0)
      labels_val_splits = tf.split(labels_val, FLAGS.num_gpus, 0)

    # Calculate the gradients for each model tower.
    tower_grads = []
    loss_val = []
    pixel_accuracy = []
    for i in range(FLAGS.num_gpus):
      gpu_idx = i + FLAGS.start_gpu_idx
      with tf.device('/gpu:%d' % gpu_idx):
        with tf.name_scope('%s_%d' % (DAGResnet.TOWER_NAME,
                                      gpu_idx)) as scope:
          # Calculate the loss for one tower of the CIFAR model. This
          # function constructs the entire CIFAR model but shares the
          # variables across all towers.
          loss = tower_loss(images_splits[i], labels_splits[i], scope)

          # Reuse variables for the next tower.
          tf.get_variable_scope().reuse_variables()

          # Retain the summaries from the final tower.
          summaries = tf.get_collection(tf.GraphKeys.SUMMARIES, scope)

          # Calculate the gradients for the batch of data on this tower.
          grads = opt.compute_gradients(loss)

          # Keep track of the gradients across all towers.
          tower_grads.append(grads)

          if dataset_val is not None:
            logits_val = DAGResnet.inference(images_val_splits[i])
            loss_val.append(
                DAGResnet.loss(logits_val, labels_val_splits[i]))
            label_val = labels_val_splits[i]
            shape = logits_val.get_shape().as_list()
            label_predict = tf.argmax(logits_val, axis=len(shape) - 1)
            # Pixel accuracy over the labeled (non-background) pixels.
            pixel_labeled = tf.reduce_sum(tf.to_float(label_val > 0))
            pixel_correct = tf.reduce_sum(
                tf.to_float(tf.equal(tf.cast(label_val, tf.int64),
                                     label_predict)) *
                tf.to_float(label_val > 0))
            pixel_accuracy.append(
                tf.div(tf.scalar_mul(1.0, pixel_correct), pixel_labeled))

    # We must calculate the mean of each gradient. Note that this is the
    # synchronization point across all towers.
    grads = average_gradients(tower_grads)
    loss_val = tf.reduce_mean(loss_val)
    pixel_accuracy = tf.reduce_mean(pixel_accuracy)

    # Add a summary to track the learning rate.
    summaries.append(tf.summary.scalar('learning_rate', lr))

    # Add histograms for gradients.
    for grad, var in grads:
      if grad is not None:
        summaries.append(
            tf.summary.histogram(var.op.name + '/gradients', grad))

    # Apply the gradients to adjust the shared variables.
    apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)

    # Add histograms for trainable variables.
    for var in tf.trainable_variables():
      summaries.append(tf.summary.histogram(var.op.name, var))

    with tf.variable_scope(tf.get_variable_scope(), reuse=None):
      # Track the moving averages of all trainable variables.
      variable_averages = tf.train.ExponentialMovingAverage(
          DAGResnet.MOVING_AVERAGE_DECAY, global_step)
      variables_averages_op = variable_averages.apply(
          tf.trainable_variables())

    # Group all updates into a single train op.
    train_op = tf.group(apply_gradient_op, variables_averages_op)

    # Create a saver.
    saver = tf.train.Saver(tf.global_variables())

    # Build the summary operation from the last tower summaries.
    summary_op = tf.summary.merge(summaries)

    # Build an initialization operation to run below.
    init = tf.global_variables_initializer()

    # Start running operations on the Graph. allow_soft_placement must be
    # set to True to build towers on GPU, as some of the ops do not have
    # GPU implementations.
    sess = tf.Session(config=tf.ConfigProto(
        allow_soft_placement=True,
        log_device_placement=FLAGS.log_device_placement))
    sess.run(init)

    # Restore the previous checkpoint, if any.
    if not tf.gfile.Exists(FLAGS.train_dir):
      tf.gfile.MakeDirs(FLAGS.train_dir)
    else:
      ckpt = tf.train.get_checkpoint_state(FLAGS.train_dir)
      if ckpt and ckpt.model_checkpoint_path:
        saver.restore(sess, ckpt.model_checkpoint_path)
        print('%s: Model restored from %s' %
              (datetime.now(), ckpt.model_checkpoint_path))

    # Load the pre-trained checkpoint model, if given.
    if FLAGS.pretrained_model_checkpoint_path:
      try:
        if tf.gfile.Exists(FLAGS.pretrained_model_checkpoint_path):
          t_vars = tf.trainable_variables()
          variables_to_restore = [
              var for var in t_vars
              if not ('FC_V' in var.name or 'upscore' in var.name)]
          restorer = tf.train.Saver(variables_to_restore)
          restorer.restore(sess,
                           FLAGS.pretrained_model_checkpoint_path)
          print('%s: Pre-trained model restored from %s' %
                (datetime.now(),
                 FLAGS.pretrained_model_checkpoint_path))
      except ValueError:
        print('No checkpoint is loaded')

    # Start the queue runners.
    tf.train.start_queue_runners(sess=sess)

    summary_writer = tf.summary.FileWriter(FLAGS.train_dir, sess.graph)

    f = open(FLAGS.train_dir + '/' + 'log.txt', 'w')
    for step in range(FLAGS.max_steps):
      start_time = time.time()
      _, loss_value = sess.run([train_op, loss])
      duration = time.time() - start_time

      assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

      if step % 10 == 0:
        num_examples_per_step = FLAGS.batch_size * FLAGS.num_gpus
        examples_per_sec = num_examples_per_step / duration
        sec_per_batch = duration / FLAGS.num_gpus
        format_str = ('%s: step %d, loss = %.6f (%.1f examples/sec; %.3f '
                      'sec/batch)')
        print(format_str % (datetime.now(), step, loss_value,
                            examples_per_sec, sec_per_batch))

      if dataset_val is not None and step > 0 and step % FLAGS.do_val == 0:
        format_str = ('%s: step %d, [VALIDATION] loss = %.6f '
                      'pixel acc = %.6f')
        loss_value_val, pixelAcc = sess.run([loss_val, pixel_accuracy])
        print(format_str % (datetime.now(), step, loss_value_val,
                            pixelAcc))
        f.write(format_str % (datetime.now(), step, loss_value_val,
                              pixelAcc))
        f.write('\n')

      if step % 100000 == 0 and step > 0:
        summary_str = sess.run(summary_op)
        summary_writer.add_summary(summary_str, step)

      # Save the model checkpoint periodically.
      if step % 1000 == 0 or (step + 1) == FLAGS.max_steps:
        checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
        saver.save(sess, checkpoint_path, global_step=step)
    f.close()
def train():
  print('[Dataset Configuration]')
  print('\tNumber of classes: %d' % FLAGS.num_classes)
  print('\tNumber of training images: %d' % FLAGS.num_train_instance)
  print('\tNumber of test images: %d' % FLAGS.num_test_instance)

  print('[Network Configuration]')
  print('\tResidual blocks per group: %d' % FLAGS.num_residual_units)
  print('\tNetwork width multiplier: %d' % FLAGS.k)

  print('[Optimization Configuration]')
  print('\tL2 loss weight: %f' % FLAGS.l2_weight)
  print('\tThe momentum optimizer: %f' % FLAGS.momentum)
  print('\tInitial learning rate: %f' % FLAGS.initial_lr)
  print('\tEpochs per lr step: %f' % FLAGS.lr_step_epoch)
  print('\tLearning rate decay: %f' % FLAGS.lr_decay)

  print('[Training Configuration]')
  print('\tTrain dir: %s' % FLAGS.train_dir)
  print('\tTraining max steps: %d' % FLAGS.max_steps)
  print('\tSteps per displaying info: %d' % FLAGS.display)
  print('\tSteps per testing: %d' % FLAGS.test_interval)
  print('\tSteps during testing: %d' % FLAGS.test_iter)
  print('\tSteps per saving checkpoints: %d' % FLAGS.checkpoint_interval)
  print('\tGPU memory fraction: %f' % FLAGS.gpu_fraction)
  print('\tLog device placement: %d' % FLAGS.log_device_placement)
  sys.stdout.flush()

  with tf.Graph().as_default():
    init_step = 0
    global_step = tf.Variable(0, trainable=False, name='global_step')

    # Get images and labels of ImageNet.
    with tf.variable_scope('train_image'):
      train_images, train_labels = image_processing.distorted_inputs(
          dataset.Dataset('imagenet', 'train'), num_preprocess_threads=4)
    with tf.variable_scope('test_image'):
      test_images, test_labels = image_processing.distorted_inputs(
          dataset.Dataset('imagenet', 'validation'),
          num_preprocess_threads=4)

    # Build a Graph that computes the predictions from the inference model.
    images = tf.placeholder(
        tf.float32,
        [FLAGS.batch_size, FLAGS.image_size, FLAGS.image_size, 3])
    labels = tf.placeholder(tf.int32, [FLAGS.batch_size])

    # Build model
    decay_step = (FLAGS.lr_step_epoch * FLAGS.num_train_instance /
                  FLAGS.batch_size)
    hp = resnet.HParams(batch_size=FLAGS.batch_size,
                        num_classes=FLAGS.num_classes,
                        num_residual_units=FLAGS.num_residual_units,
                        k=FLAGS.k,
                        weight_decay=FLAGS.l2_weight,
                        initial_lr=FLAGS.initial_lr,
                        decay_step=decay_step,
                        lr_decay=FLAGS.lr_decay,
                        momentum=FLAGS.momentum)
    network = resnet.ResNet(hp, images, labels, global_step)
    network.build_model()
    network.build_train_op()
    network.count_trainable_params()

    # Summaries (training)
    train_summary_op = tf.summary.merge_all()

    # Build an initialization operation to run below.
    init = tf.global_variables_initializer()

    # Start running operations on the Graph.
    sess = tf.Session(config=tf.ConfigProto(
        gpu_options=tf.GPUOptions(
            per_process_gpu_memory_fraction=FLAGS.gpu_fraction),
        log_device_placement=FLAGS.log_device_placement))
    sess.run(init)

    # Create a saver.
    saver = tf.train.Saver(tf.global_variables(), max_to_keep=10000)
    ckpt = tf.train.get_checkpoint_state(FLAGS.train_dir)
    if ckpt and ckpt.model_checkpoint_path:
      print('\tRestore from %s' % ckpt.model_checkpoint_path)
      # Restores from checkpoint
      saver.restore(sess, ckpt.model_checkpoint_path)
      init_step = int(
          ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1])
    else:
      print('No checkpoint file found. Start from scratch.')
    sys.stdout.flush()

    # Start queue runners & summary_writer
    tf.train.start_queue_runners(sess=sess)
    if not os.path.exists(FLAGS.train_dir):
      os.mkdir(FLAGS.train_dir)
    summary_writer = tf.summary.FileWriter(FLAGS.train_dir, sess.graph)

    # Training!
    test_best_acc = 0.0
    for step in range(init_step, FLAGS.max_steps):
      # Test
      if step % FLAGS.test_interval == 0:
        test_loss, test_acc = 0.0, 0.0
        for i in range(FLAGS.test_iter):
          test_images_val, test_labels_val = sess.run(
              [test_images, test_labels])
          # Shift ImageNet labels from [1, 1000] to [0, 999].
          test_labels_val -= 1
          loss_value, acc_value = sess.run(
              [network.loss, network.acc],
              feed_dict={network.is_train: False,
                         images: test_images_val,
                         labels: test_labels_val})
          test_loss += loss_value
          test_acc += acc_value
        test_loss /= FLAGS.test_iter
        test_acc /= FLAGS.test_iter
        test_best_acc = max(test_best_acc, test_acc)
        format_str = '%s: (Test)     step %d, loss=%.4f, acc=%.4f'
        print(format_str % (datetime.now(), step, test_loss, test_acc))
        sys.stdout.flush()

        test_summary = tf.Summary()
        test_summary.value.add(tag='test/loss', simple_value=test_loss)
        test_summary.value.add(tag='test/acc', simple_value=test_acc)
        test_summary.value.add(tag='test/best_acc',
                               simple_value=test_best_acc)
        summary_writer.add_summary(test_summary, step)
        summary_writer.flush()

      # Train
      start_time = time.time()
      train_images_val, train_labels_val = sess.run(
          [train_images, train_labels])
      train_labels_val -= 1
      _, lr_value, loss_value, acc_value, train_summary_str = sess.run(
          [network.train_op, network.lr, network.loss, network.acc,
           train_summary_op],
          feed_dict={network.is_train: True,
                     images: train_images_val,
                     labels: train_labels_val})
      duration = time.time() - start_time

      assert not np.isnan(loss_value)

      # Display & Summary (training)
      if step % FLAGS.display == 0:
        num_examples_per_step = FLAGS.batch_size
        examples_per_sec = num_examples_per_step / duration
        sec_per_batch = float(duration)
        format_str = ('%s: (Training) step %d, loss=%.4f, acc=%.4f, lr=%f '
                      '(%.1f examples/sec; %.3f sec/batch)')
        print(format_str % (datetime.now(), step, loss_value, acc_value,
                            lr_value, examples_per_sec, sec_per_batch))
        sys.stdout.flush()
        summary_writer.add_summary(train_summary_str, step)

      # Save the model checkpoint periodically.
      if (step > init_step and step % FLAGS.checkpoint_interval == 0) or \
         (step + 1) == FLAGS.max_steps:
        checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
        saver.save(sess, checkpoint_path, global_step=step)
def main():
  """Create the model and start the training."""
  args = get_arguments()

  h, w = map(int, args.input_size.split(','))
  input_size = (h, w)

  # Create queue coordinator.
  coord = tf.train.Coordinator()

  # Load reader.
  num_preprocess_threads = 4
  dataset = SensorFusionData("train")
  data_files = dataset.data_files()
  print("Found {} data files!".format(len(data_files)))
  image_batch, label_batch = image_processing.distorted_inputs(
      dataset, num_preprocess_threads=num_preprocess_threads)
  num_classes = dataset.num_classes() + 1

  # Create network.
  net = DeepLabResNetModel({'data': image_batch},
                           is_training=args.is_training)
  # For a small batch size, it is better to keep
  # the statistics of the BN layers (running means and variances)
  # frozen, and to not update the values provided by the pre-trained model.
  # If is_training=True, the statistics will be updated during the training.
  # Note that is_training=False still updates BN parameters gamma (scale)
  # and beta (offset) if they are present in var_list of the optimiser
  # definition.

  # Predictions.
  raw_output = net.layers['fc1_voc12']

  # Which variables to load. Running means and variances are not trainable,
  # thus all_variables() should be restored.
  restore_var = tf.global_variables()
  trainable = tf.trainable_variables()

  prediction = tf.reshape(raw_output, [-1, num_classes])
  label_proc = prepare_label(label_batch,
                             tf.stack(raw_output.get_shape()[1:3]))
  gt = tf.reshape(label_proc, [-1, num_classes])

  # Pixel-wise softmax loss.
  loss = tf.nn.softmax_cross_entropy_with_logits(logits=prediction,
                                                 labels=gt)
  reduced_loss = tf.reduce_mean(loss)

  # Processed predictions.
  raw_output_up = tf.image.resize_bilinear(raw_output,
                                           tf.shape(image_batch)[1:3])
  raw_output_up = tf.argmax(raw_output_up, axis=3)
  pred = tf.expand_dims(raw_output_up, axis=3)

  # Image summary.
  images_summary = tf.py_func(inv_preprocess,
                              [image_batch, args.save_num_images],
                              tf.uint8)
  labels_summary = tf.py_func(decode_labels,
                              [label_batch, args.save_num_images],
                              tf.uint8)
  preds_summary = tf.py_func(decode_labels,
                             [pred, args.save_num_images], tf.uint8)

  # Concatenate row-wise.
  total_summary = tf.summary.image(
      'images',
      tf.concat([images_summary, labels_summary, preds_summary], 2),
      max_outputs=args.save_num_images)
  summary_writer = tf.summary.FileWriter(args.snapshot_dir)

  # Define loss and optimisation parameters.
  optimiser = tf.train.AdamOptimizer(learning_rate=args.learning_rate)
  optim = optimiser.minimize(reduced_loss, var_list=trainable)

  # Set up tf session and initialize variables.
  config = tf.ConfigProto(allow_soft_placement=True)
  config.gpu_options.allow_growth = True
  sess = tf.Session(config=config)
  init = tf.global_variables_initializer()
  print("Running Session...")
  sess.run(init)

  # Saver for storing checkpoints of the model.
  saver = tf.train.Saver(var_list=restore_var, max_to_keep=40)

  # Load variables if the checkpoint is provided.
  if args.restore_from is not None:
    loader = tf.train.Saver(var_list=restore_var)
    load(loader, sess, args.restore_from)

  print("Starting queue runners...")
  # Start queue threads.
  threads = tf.train.start_queue_runners(sess=sess)

  # Iterate over training steps.
  for step in range(args.num_steps):
    start_time = time.time()

    if step % args.save_pred_every == 0:
      loss_value, images, labels, preds, summary, _ = sess.run(
          [reduced_loss, image_batch, label_batch, pred, total_summary,
           optim])
      summary_writer.add_summary(summary, step)
      save(saver, sess, args.snapshot_dir, step)
    else:
      loss_value, _ = sess.run([reduced_loss, optim])

    duration = time.time() - start_time
    print('step {:d} \t loss = {:.3f}, ({:.3f} sec/step)'.format(
        step, loss_value, duration))
  coord.request_stop()
  coord.join(threads)
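# main() above calls save() and load() helpers; this is a hedged sketch in
# the DeepLab-ResNet utility style (the project's own versions may differ).
def save(saver, sess, logdir, step):
  """Write a checkpoint under logdir, creating the directory if needed."""
  if not os.path.exists(logdir):
    os.makedirs(logdir)
  saver.save(sess, os.path.join(logdir, 'model.ckpt'), global_step=step)
  print('The checkpoint has been created.')

def load(saver, sess, ckpt_path):
  """Restore model parameters from a checkpoint path."""
  saver.restore(sess, ckpt_path)
  print('Restored model parameters from {}'.format(ckpt_path))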
def train(dataset):
  """Train on dataset for a number of steps."""
  with tf.Graph().as_default():
    global_step = tf.Variable(0, trainable=False)

    decay_steps = 7500
    LEARNING_RATE_DECAY_FACTOR = 0.1
    INITIAL_LEARNING_RATE = 0.000001
    lr = tf.train.exponential_decay(INITIAL_LEARNING_RATE,
                                    global_step,
                                    decay_steps,
                                    LEARNING_RATE_DECAY_FACTOR,
                                    staircase=True)
    opt = tf.train.MomentumOptimizer(learning_rate=lr, momentum=0.1)

    num_preprocess_threads = FLAGS.num_preprocess_threads * FLAGS.num_gpus
    with tf.device('/cpu:0'):
      images, pitchs, yaws, rolls, names = image_processing.distorted_inputs(
          dataset, num_preprocess_threads=num_preprocess_threads)

    # Stack the three pose angles into a [batch_size, 3] label tensor.
    p = tf.expand_dims(pitchs, 1)
    y = tf.expand_dims(yaws, 1)
    r = tf.expand_dims(rolls, 1)
    labels = tf.concat([p, y, r], 1)

    batch_queue = tf.contrib.slim.prefetch_queue.prefetch_queue(
        [images, labels], capacity=2 * FLAGS.num_gpus)

    # Calculate the gradients for each model tower.
    tower_grads = []
    with tf.variable_scope(tf.get_variable_scope()):
      for i in range(FLAGS.num_gpus):
        with tf.device('/gpu:%d' % i):
          with tf.name_scope('tower_%d' % i) as scope:
            image_batch, label_batch = batch_queue.dequeue()
            loss = tower_loss(scope, image_batch, label_batch)
            # Reuse variables for the next tower.
            tf.get_variable_scope().reuse_variables()
            grads = opt.compute_gradients(loss)
            tower_grads.append(grads)

    grads = average_gradients(tower_grads)
    apply_gradient_op = opt.apply_gradients(grads,
                                            global_step=global_step)

    variable_averages = tf.train.ExponentialMovingAverage(0.9999,
                                                          global_step)
    variable_averages_op = variable_averages.apply(
        tf.trainable_variables())
    train_op = tf.group(apply_gradient_op, variable_averages_op)

    saver = tf.train.Saver(tf.global_variables())
    init = tf.global_variables_initializer()
    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
    sess.run(init)
    tf.train.start_queue_runners(sess=sess)

    for step in np.arange(FLAGS.max_steps):
      _, loss_value = sess.run([train_op, loss])
      if step % 50 == 0:
        print('Step %d, train loss = %.2f' % (step, loss_value))
      if step % 2000 == 0 or (step + 1) == FLAGS.max_steps:
        checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
        saver.save(sess, checkpoint_path, global_step=step)
def train(self):
    """Train DCGAN"""
    with tf.Graph().as_default(), tf.device('/cpu:0'):
        # Override the number of preprocessing threads to account for the
        # increased number of GPU towers.
        num_preprocess_threads = FLAGS.num_preprocess_threads
        images, labels = image_processing.distorted_inputs(
            self.dataset, num_preprocess_threads=num_preprocess_threads)

        with tf.device('/gpu:0'):
            # Set weight_decay for weights in Conv and FC layers.
            self.build_model(FLAGS.batch_size, images, labels, 12, True, False)

            d_opt = tf.train.AdamOptimizer(FLAGS.learning_rate, beta1=FLAGS.beta1) \
                .minimize(self.d_loss, var_list=self.d_vars)
            g_opt = tf.train.AdamOptimizer(FLAGS.learning_rate, beta1=FLAGS.beta1) \
                .minimize(self.g_loss, var_list=self.g_vars)
            train_op = tf.group(d_opt, g_opt)

        batchnorm_updates = tf.get_collection(slim.ops.UPDATE_OPS_COLLECTION)

        # Add summaries for the input processing and global_step.
        summaries = tf.get_collection(tf.GraphKeys.SUMMARIES)

        # Group all updates into a single train op.
        batchnorm_updates_op = tf.group(*batchnorm_updates)
        train_op = tf.group(train_op, batchnorm_updates_op)

        # Create a saver.
        saver = tf.train.Saver(tf.all_variables())

        summary_op = tf.merge_summary(summaries)

        # Build an initialization operation to run below.
        init = tf.initialize_all_variables()

        # Start running operations on the Graph. allow_soft_placement must be
        # set to True to build towers on GPU, as some of the ops do not have
        # GPU implementations.
        sess = tf.Session(config=tf.ConfigProto(
            allow_soft_placement=True,
            log_device_placement=FLAGS.log_device_placement))
        sess.run(init)

        ckpt = tf.train.get_checkpoint_state(FLAGS.checkpoint_dir)
        if ckpt and ckpt.model_checkpoint_path:
            variables_to_restore = tf.get_collection(
                slim.variables.VARIABLES_TO_RESTORE)
            restorer = tf.train.Saver(variables_to_restore)
            restorer.restore(sess, ckpt.model_checkpoint_path)
            print('%s: Pre-trained model restored from %s' %
                  (datetime.now(), FLAGS.checkpoint_dir))

        # Start the queue runners.
        tf.train.start_queue_runners(sess=sess)

        summary_writer = tf.train.SummaryWriter(FLAGS.log_dir, graph=sess.graph)

        for step in xrange(FLAGS.max_steps):
            start_time = time.time()
            sess.run([train_op])
            duration = time.time() - start_time

            if step % 10 == 0:
                examples_per_sec = FLAGS.batch_size / float(duration)
                format_str = ('%s: step %d (%.1f examples/sec; %.3f sec/batch)')
                print(format_str % (datetime.now(), step, examples_per_sec,
                                    duration))

            if step % 100 == 0:
                summary_str = sess.run(summary_op)
                summary_writer.add_summary(summary_str, step)
                samples = sess.run(self.G)
                save_images(samples, './%s/%d' % (FLAGS.sample_dir, step))

            # Save the model checkpoint periodically.
            if step % 1000 == 0 or (step + 1) == FLAGS.max_steps:
                checkpoint_path = os.path.join(FLAGS.checkpoint_dir, 'model.ckpt')
                saver.save(sess, checkpoint_path, global_step=step)
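# --- Hedged sketch (not part of the original source): `save_images` is
# called above but not defined in this snippet. A minimal stand-in, assuming
# `samples` is a float array in [-1, 1] of shape [batch, height, width, 3];
# the grid layout, the use of Pillow, and the .png suffix are assumptions.
import numpy as np
from PIL import Image

def save_images(samples, path_prefix, grid_cols=8):
    """Tile generator samples into one grid image and save it as a PNG."""
    samples = ((samples + 1.0) * 127.5).astype(np.uint8)  # [-1,1] -> [0,255]
    n, h, w, c = samples.shape
    rows = int(np.ceil(n / float(grid_cols)))
    grid = np.zeros((rows * h, grid_cols * w, c), dtype=np.uint8)
    for idx in range(n):
        r, col = divmod(idx, grid_cols)
        grid[r * h:(r + 1) * h, col * w:(col + 1) * w] = samples[idx]
    Image.fromarray(grid).save(path_prefix + '.png')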
def train(dataset):
    print('START')
    if FLAGS.issync:
        raise ValueError("Please set 'issync' to False for non-distributed training.")
    global_step = tf.Variable(0, dtype=tf.int32, name='global_step', trainable=False)
    lr = _lr(global_step)
    with tf.name_scope("train_process"):
        with tf.device('/cpu:0'):
            images, labels = image_processing.distorted_inputs(
                dataset, num_preprocess_threads=FLAGS.num_preprocess_threads)
        logits = _logits(images)
        loss = _loss(logits, labels)
        train_op = _optimization(loss, global_step, lr, FLAGS.issync)
    # with tf.name_scope("global_step"):
    #     tf.summary.scalar('global_step', global_step)
    val_step = int(math.ceil(arg_parsing.NUM_EXAMPLES_PER_EPOCH_FOR_EVAL /
                             FLAGS.batch_size))
    val_acc_sum = val(loss, dataset)

    all_hooks = [tf.train.NanTensorHook(loss)]
    if FLAGS.debug:
        all_hooks.append(tfdbg.LocalCLIDebugHook(ui_type='curses'))
    if FLAGS.finetune:
        print('Finetune from %s' % FLAGS.finetune)
        saver = tf.train.Saver()

    config = tf.ConfigProto(log_device_placement=FLAGS.log_device_placement)
    config.gpu_options.allow_growth = True

    with tf.train.MonitoredTrainingSession(checkpoint_dir=FLAGS.model_dir,
                                           hooks=all_hooks,
                                           config=config,
                                           save_summaries_steps=100,
                                           save_summaries_secs=None,
                                           log_step_count_steps=None) as sess:
        if FLAGS.finetune:
            print('Load Pre-trained model...')
            ckpt = tf.train.get_checkpoint_state(FLAGS.finetune)
            if ckpt and ckpt.model_checkpoint_path:
                saver.restore(sess, ckpt.model_checkpoint_path)
            else:
                raise ValueError('Failed to load model.')
        print('-------------------------')
        total_loss = 0
        start_time = time.time()
        for i in range(1, FLAGS.max_steps + 1):
            _, loss_value = sess.run([train_op, loss])
            total_loss += loss_value
            if i % FLAGS.log_frequency == 0:
                current_time = time.time()
                duration = current_time - start_time
                eg_per_sec = FLAGS.log_frequency * FLAGS.batch_size / duration
                sec_per_batch = float(duration / FLAGS.log_frequency)
                avg_loss = total_loss / i
                print('%s: training step %d cur loss = %.4f avg loss = %.4f '
                      '(%.1f images/sec %.3f sec/batch)' %
                      (datetime.now(), i, loss_value, avg_loss,
                       eg_per_sec, sec_per_batch))
                start_time = time.time()
            if i % FLAGS.steps_to_val == 0:
                total_val_accu = 0
                for j in range(val_step):
                    total_val_accu += sess.run(val_acc_sum)
                print('%s: validation total accuracy = %.4f (%.3f sec %d batches)' %
                      (datetime.now(), total_val_accu / float(val_step),
                       float(time.time() - start_time), val_step))
                start_time = time.time()
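# --- Hedged sketch (not part of the original source): `_lr`, `_logits`,
# `_loss` and `_optimization` are project helpers referenced above but not
# defined in this snippet. A plausible sketch of `_lr` assuming exponential
# decay; the flag names `initial_learning_rate`, `decay_steps` and
# `decay_factor` are assumptions, not real flags from this project.
def _lr(global_step):
    """Exponentially decayed learning rate schedule (sketch)."""
    return tf.train.exponential_decay(FLAGS.initial_learning_rate,
                                      global_step,
                                      FLAGS.decay_steps,
                                      FLAGS.decay_factor,
                                      staircase=True)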
def train(target, dataset, cluster_spec):
    """Train Inception on a dataset for a number of steps."""
    # The numbers of workers and parameter servers are inferred from the worker
    # and ps hosts strings.
    num_workers = len(cluster_spec.as_dict()['worker'])
    num_parameter_servers = len(cluster_spec.as_dict()['ps'])
    # If no value is given, num_replicas_to_aggregate defaults to the number of
    # workers.
    if FLAGS.num_replicas_to_aggregate == -1:
        num_replicas_to_aggregate = num_workers
    else:
        num_replicas_to_aggregate = FLAGS.num_replicas_to_aggregate

    # Both should be greater than 0 in a distributed training.
    assert num_workers > 0 and num_parameter_servers > 0, (
        'num_workers and num_parameter_servers must be > 0.')

    # Choose worker 0 as the chief. Note that any worker could be the chief
    # but there should be only one chief.
    is_chief = (FLAGS.task_id == 0)

    # Ops are assigned to worker by default.
    with tf.device('/job:worker/task:%d' % FLAGS.task_id):
        # Variables and their related init/assign ops are assigned to ps.
        with slim.scopes.arg_scope(
                [slim.variables.variable, slim.variables.global_step],
                device=slim.variables.VariableDeviceChooser(num_parameter_servers)):
            # Create a variable to count the number of train() calls. This
            # equals the number of updates applied to the variables.
            global_step = slim.variables.global_step()

            # Calculate the learning rate schedule.
            num_batches_per_epoch = (dataset.num_examples_per_epoch() /
                                     FLAGS.batch_size)
            # Decay steps need to be divided by the number of replicas to
            # aggregate.
            decay_steps = int(num_batches_per_epoch * FLAGS.num_epochs_per_decay /
                              num_replicas_to_aggregate)

            # Decay the learning rate exponentially based on the number of steps.
            lr = tf.train.exponential_decay(FLAGS.initial_learning_rate,
                                            global_step,
                                            decay_steps,
                                            FLAGS.learning_rate_decay_factor,
                                            staircase=True)
            # Add a summary to track the learning rate.
            tf.summary.scalar('learning_rate', lr)

            # Create an optimizer that performs gradient descent.
            opt = tf.train.RMSPropOptimizer(lr,
                                            RMSPROP_DECAY,
                                            momentum=RMSPROP_MOMENTUM,
                                            epsilon=RMSPROP_EPSILON)

            images, labels = image_processing.distorted_inputs(
                dataset,
                batch_size=FLAGS.batch_size,
                num_preprocess_threads=FLAGS.num_preprocess_threads)

            # Number of classes in the Dataset label set plus 1.
            # Label 0 is reserved for an (unused) background class.
            num_classes = dataset.num_classes() + 1
            logits = inception.inference(images, num_classes, for_training=True)
            # Add classification loss.
            inception.loss(logits, labels)

            # Gather all of the losses including regularization losses.
            losses = tf.get_collection(slim.losses.LOSSES_COLLECTION)
            losses += tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
            total_loss = tf.add_n(losses, name='total_loss')

            if is_chief:
                # Compute the moving average of all individual losses and the
                # total loss.
                loss_averages = tf.train.ExponentialMovingAverage(0.9, name='avg')
                loss_averages_op = loss_averages.apply(losses + [total_loss])

                # Attach a scalar summary to all individual losses and the total
                # loss; do the same for the averaged version of the losses.
                for l in losses + [total_loss]:
                    loss_name = l.op.name
                    # Name each loss as '(raw)' and name the moving average
                    # version of the loss as the original loss name.
                    tf.summary.scalar(loss_name + ' (raw)', l)
                    tf.summary.scalar(loss_name, loss_averages.average(l))

                # Add dependency to compute loss_averages.
                with tf.control_dependencies([loss_averages_op]):
                    total_loss = tf.identity(total_loss)

            # Track the moving averages of all trainable variables.
            # Note that we maintain a 'double-average' of the BatchNormalization
            # global statistics.
            # This is not needed when the number of replicas is small but is
            # important for synchronous distributed training with tens of
            # workers/replicas.
            exp_moving_averager = tf.train.ExponentialMovingAverage(
                inception.MOVING_AVERAGE_DECAY, global_step)

            variables_to_average = (tf.trainable_variables() +
                                    tf.moving_average_variables())

            # Add histograms for model variables.
            for var in variables_to_average:
                tf.summary.histogram(var.op.name, var)

            # Create synchronous replica optimizer.
            opt = tf.train.SyncReplicasOptimizer(
                opt,
                replicas_to_aggregate=num_replicas_to_aggregate,
                replica_id=FLAGS.task_id,
                total_num_replicas=num_workers,
                variable_averages=exp_moving_averager,
                variables_to_average=variables_to_average)

            batchnorm_updates = tf.get_collection(slim.ops.UPDATE_OPS_COLLECTION)
            assert batchnorm_updates, 'Batchnorm updates are missing'
            batchnorm_updates_op = tf.group(*batchnorm_updates)
            # Add dependency to compute batchnorm_updates.
            with tf.control_dependencies([batchnorm_updates_op]):
                total_loss = tf.identity(total_loss)

            # Compute gradients with respect to the loss.
            grads = opt.compute_gradients(total_loss)

            # Add histograms for gradients.
            for grad, var in grads:
                if grad is not None:
                    tf.summary.histogram(var.op.name + '/gradients', grad)

            apply_gradients_op = opt.apply_gradients(grads,
                                                     global_step=global_step)

            with tf.control_dependencies([apply_gradients_op]):
                train_op = tf.identity(total_loss, name='train_op')

            # Get chief queue_runners, init_tokens and clean_up_op, which are
            # used to synchronize replicas.
            # More details can be found in sync_replicas_optimizer.
            chief_queue_runners = [opt.get_chief_queue_runner()]
            init_tokens_op = opt.get_init_tokens_op()
            clean_up_op = opt.get_clean_up_op()

            # Create a saver.
            saver = tf.train.Saver()

            # Build the summary operation based on the TF collection of
            # Summaries.
            summary_op = tf.merge_all_summaries()

            # Build an initialization operation to run below.
            init_op = tf.initialize_all_variables()

            # We run the summaries in the same thread as the training operations
            # by passing in None for summary_op to avoid a summary_thread being
            # started. Running summaries and training operations in parallel
            # could run out of GPU memory.
            sv = tf.train.Supervisor(is_chief=is_chief,
                                     logdir=FLAGS.train_dir,
                                     init_op=init_op,
                                     summary_op=None,
                                     global_step=global_step,
                                     saver=saver,
                                     save_model_secs=FLAGS.save_interval_secs)
            tf.logging.info('%s Supervisor' % datetime.now())

            sess_config = tf.ConfigProto(
                allow_soft_placement=True,
                log_device_placement=FLAGS.log_device_placement)

            # Get a session.
            sess = sv.prepare_or_wait_for_session(target, config=sess_config)

            # Start the queue runners.
            queue_runners = tf.get_collection(tf.GraphKeys.QUEUE_RUNNERS)
            sv.start_queue_runners(sess, queue_runners)
            tf.logging.info('Started %d queues for processing input data.',
                            len(queue_runners))

            if is_chief:
                sv.start_queue_runners(sess, chief_queue_runners)
                sess.run(init_tokens_op)

            # Train, checking for NaNs. Concurrently run the summary operation
            # at a specified interval. Note that the summary_op and train_op
            # never run simultaneously in order to prevent running out of GPU
            # memory.
next_summary_time = time.time() + FLAGS.save_summaries_secs while not sv.should_stop(): try: start_time = time.time() loss_value, step = sess.run([train_op, global_step]) assert not np.isnan( loss_value), 'Model diverged with loss = NaN' if step > FLAGS.max_steps: break duration = time.time() - start_time if step % 30 == 0: examples_per_sec = FLAGS.batch_size / float(duration) format_str = ('Worker %d: %s: step %d, loss = %.2f' '(%.1f examples/sec; %.3f sec/batch)') tf.logging.info( format_str % (FLAGS.task_id, datetime.now(), step, loss_value, examples_per_sec, duration)) # Determine if the summary_op should be run on the chief worker. if is_chief and next_summary_time < time.time(): tf.logging.info( 'Running Summary operation on the chief.') summary_str = sess.run(summary_op) sv.summary_computed(sess, summary_str) tf.logging.info('Finished running Summary operation.') # Determine the next time for running the summary. next_summary_time += FLAGS.save_summaries_secs except: if is_chief: tf.logging.info('About to execute sync_clean_up_op!') sess.run(clean_up_op) raise # Stop the supervisor. This also waits for service threads to finish. sv.stop() # Save after the training ends. if is_chief: saver.save(sess, os.path.join(FLAGS.train_dir, 'model.ckpt'), global_step=global_step)
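# --- Hedged sketch (not part of the original source): the `target` and
# `cluster_spec` arguments of train() above are typically produced by a
# launcher like the following. The host lists are illustrative only, and the
# flag names `job_name` / `task_id` are assumed to match the flags used above.
def make_server():
    """Build the ClusterSpec and Server that feed train() (sketch)."""
    cluster_spec = tf.train.ClusterSpec({
        'ps': ['ps0.example.com:2222'],
        'worker': ['worker0.example.com:2222', 'worker1.example.com:2222'],
    })
    server = tf.train.Server(cluster_spec,
                             job_name=FLAGS.job_name,
                             task_index=FLAGS.task_id)
    # server.target is the `target` passed to train(); parameter-server
    # processes would instead block in server.join() and never train.
    return server.target, cluster_spec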
def train(dataset):
    # sys.stdout = os.fdopen(sys.stdout.fileno(), 'w', 0)
    """Train on dataset for a number of steps."""
    with tf.Graph().as_default(), tf.device('/cpu:0'):
        # Create a variable to count the number of train() calls. This equals
        # the number of batches processed * FLAGS.num_gpus.
        # tf.set_random_seed(time.time())  # overridden by the fixed seed below
        tf.set_random_seed(198918)
        global_step = tf.get_variable(
            'global_step', [],
            initializer=tf.constant_initializer(0),
            trainable=False)

        bits_ph = []
        for i in range(18):
            bits_ph.append(tf.placeholder(tf.int32))
        nm = norm_monitor.norm_monitor(FLAGS.digits, len(bits_ph), FLAGS.rel_res,
                                       FLAGS.interval, FLAGS.stride)
        if FLAGS.layerinfo_file:
            assert tf.gfile.Exists(FLAGS.layerinfo_file)
            tmp = pickle.load(open(FLAGS.layerinfo_file, 'rb'))
            nm.set_layerinfo(tmp[-1])
            print("Restore layerinfo")
            print(nm.get_layerinfo())

        # Calculate the learning rate schedule.
        num_batches_per_epoch = (dataset.num_examples_per_epoch() /
                                 FLAGS.batch_size)
        decay_steps = int(num_batches_per_epoch * FLAGS.num_epochs_per_decay)

        print("num_batches_per_epoch: {}".format(num_batches_per_epoch))
        print("use bitpack: {}".format(FLAGS.use_bitpack))
        print("learning rate: {}".format(FLAGS.initial_learning_rate))
        print("produce trace: {}".format(FLAGS.profile))
        print("digits: {}".format(FLAGS.digits))
        print("rel_res: {}".format(FLAGS.rel_res))
        print("interval: {}".format(FLAGS.interval))
        print("stride: {}".format(FLAGS.stride))

        # Decay the learning rate exponentially based on the number of steps.
        lr = tf.train.exponential_decay(FLAGS.initial_learning_rate,
                                        global_step,
                                        decay_steps,
                                        FLAGS.learning_rate_decay_factor,
                                        staircase=True)

        # Create an optimizer that performs gradient descent.
        opt = tf.train.RMSPropOptimizer(lr, RMSPROP_DECAY,
                                        momentum=RMSPROP_MOMENTUM,
                                        epsilon=RMSPROP_EPSILON)

        # Get images and labels for ImageNet and split the batch across GPUs.
        assert FLAGS.batch_size % FLAGS.num_gpus == 0, (
            'Batch size must be divisible by number of GPUs')
        split_batch_size = int(FLAGS.batch_size / FLAGS.num_gpus)

        # Override the number of preprocessing threads to account for the
        # increased number of GPU towers.
        num_preprocess_threads = FLAGS.num_preprocess_threads * FLAGS.num_gpus
        images, labels = image_processing.distorted_inputs(
            dataset, num_preprocess_threads=num_preprocess_threads)

        input_summaries = copy.copy(tf.get_collection(tf.GraphKeys.SUMMARIES))

        # Number of classes in the Dataset label set plus 1.
        # Label 0 is reserved for an (unused) background class.
        num_classes = dataset.num_classes() + 1

        # Split the batch of images and labels for towers.
        images_splits = tf.split(axis=0, num_or_size_splits=FLAGS.num_gpus,
                                 value=images)
        labels_splits = tf.split(axis=0, num_or_size_splits=FLAGS.num_gpus,
                                 value=labels)

        # Calculate the gradients for each model tower.
        tower_norms = []
        tower_grads = []
        tower_preds_1 = []
        tower_preds_5 = []
        tower_losses = []
        reuse_variables = None
        for i in range(FLAGS.num_gpus):
            with tf.device('/gpu:%d' % i):
                with tf.name_scope('%s_%d' % (inception.TOWER_NAME, i)) as scope:
                    # Force all Variables to reside on the CPU.
                    # Calculate the loss for one tower of the ImageNet model.
                    # This function constructs the entire ImageNet model but
                    # shares the variables across all towers.
                    # print(images_splits[i])
                    # print(labels_splits[i])
                    loss, norms, logits_split = _tower_loss(
                        images_splits[i], labels_splits[i], num_classes,
                        scope, reuse_variables, bits_ph)
                    top_1_correct = tf.nn.in_top_k(logits_split,
                                                   labels_splits[i], 1)
                    top_5_correct = tf.nn.in_top_k(logits_split,
                                                   labels_splits[i], 5)

                    # Reuse variables for the next tower.
                    reuse_variables = True

                    # Retain the summaries from the final tower.
                    summaries = tf.get_collection(tf.GraphKeys.SUMMARIES, scope)

                    # Retain the Batch Normalization update operations only from
                    # the final tower. Ideally, we should grab the updates from
                    # all towers, but these stats accumulate extremely fast so
                    # we can ignore the stats from the other towers without
                    # significant detriment.
                    # batchnorm_updates = tf.get_collection(
                    #     slim.ops.UPDATE_OPS_COLLECTION, scope)
                    batchnorm_updates = tf.get_collection(tf.GraphKeys.UPDATE_OPS)

                    # Calculate the gradients for the batch of data on this
                    # ImageNet tower.
                    grads = opt.compute_gradients(loss)

                    # Keep track of the gradients across all towers.
                    tower_grads.append(grads)
                    tower_norms.append(norms)
                    tower_preds_1.append(
                        tf.reduce_sum(tf.cast(top_1_correct, tf.int32)))
                    tower_preds_5.append(
                        tf.reduce_sum(tf.cast(top_5_correct, tf.int32)))
                    tower_losses.append(loss)

        # We must calculate the mean of each gradient. Note that this is the
        # synchronization point across all towers.
        grads = _average_gradients(tower_grads)
        top_1_sum = tf.add_n(tower_preds_1)
        top_5_sum = tf.add_n(tower_preds_5)
        losses_sum = tf.add_n(tower_losses)

        # Add summaries for the input processing and global_step.
        summaries.extend(input_summaries)

        # Add a summary to track the learning rate.
        summaries.append(tf.summary.scalar('learning_rate', lr))

        # Add histograms for gradients.
        for grad, var in grads:
            if grad is not None:
                summaries.append(
                    tf.summary.histogram(var.op.name + '/gradients', grad))

        # Apply the gradients to adjust the shared variables.
        apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)

        # Add histograms for trainable variables.
        for var in tf.trainable_variables():
            summaries.append(tf.summary.histogram(var.op.name, var))

        # Track the moving averages of all trainable variables.
        # Note that we maintain a "double-average" of the BatchNormalization
        # global statistics. This is more complicated than it needs to be but
        # we employ it for backward-compatibility with our previous models.
        variable_averages = tf.train.ExponentialMovingAverage(
            inception.MOVING_AVERAGE_DECAY, global_step)

        # Another possibility is to use tf.slim.get_variables().
        variables_to_average = (tf.trainable_variables() +
                                tf.moving_average_variables())
        variables_averages_op = variable_averages.apply(variables_to_average)

        # Group all updates into a single train op.
        batchnorm_updates_op = tf.group(*batchnorm_updates)
        train_op = tf.group(apply_gradient_op, variables_averages_op,
                            batchnorm_updates_op)

        # Create a saver.
        saver = tf.train.Saver(tf.global_variables(), max_to_keep=100)

        # Build the summary operation from the last tower summaries.
        summary_op = tf.summary.merge(summaries)

        # Build an initialization operation to run below.
        init = tf.global_variables_initializer()

        # Start running operations on the Graph. allow_soft_placement must be
        # set to True to build towers on GPU, as some of the ops do not have
        # GPU implementations.
sess = tf.Session(config=tf.ConfigProto( allow_soft_placement=True, log_device_placement=FLAGS.log_device_placement)) sess.run(init) if FLAGS.pretrained_model_checkpoint_path: assert tf.gfile.Exists(FLAGS.pretrained_model_checkpoint_path) #variables_to_restore = tf.get_collection(slim.variables.VARIABLES_TO_RESTORE) restorer = tf.train.Saver(tf.global_variables(), max_to_keep=100) restorer.restore(sess, FLAGS.pretrained_model_checkpoint_path) print('%s: Pre-trained model restored from %s' % (datetime.now(), FLAGS.pretrained_model_checkpoint_path)) #for v in tf.all_variables(): # print("%s %s %s %s" % (v.name, v.get_shape(), v.dtype, v.device)) # Start the queue runners. tf.train.start_queue_runners(sess=sess) summary_writer = tf.summary.FileWriter( FLAGS.train_dir, graph=sess.graph) bits_dict = dict() #run_metadata = tf.RunMetadata() elapse = [] #gweights = [] glayerinfo = [] #wnp_name = 'weights_norm_{}_{}_{}_{}_{}_{}_{}.dat'.format(9, 2048, 0, FLAGS.digits, FLAGS.stride, FLAGS.interval, FLAGS.use_bitpack) lip_name = 'layerinfo_{}_{}_{}_{}_{}_{}_{}.dat'.format(9, 4096, 0, FLAGS.digits, FLAGS.stride, FLAGS.interval, FLAGS.use_bitpack) for step in range(FLAGS.max_steps): run_metadata = tf.RunMetadata() start_time = time.time() info = nm.get_layerinfo() for i, bits in enumerate(bits_ph): bits_dict[bits] = info[i][0] if FLAGS.profile is False: _, loss_value, norms, top_1, top_5 = sess.run([train_op, losses_sum, tower_norms, top_1_sum, top_5_sum], feed_dict=bits_dict) else: _, loss_value, norms = sess.run([train_op, loss, tower_norms], feed_dict=bits_dict, options=tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE), run_metadata=run_metadata) top_1 = 5 top_5 = 25 nm.adjust_digits(norms) duration = time.time() - start_time #gweights.append(norms) #glayerinfo.append(copy.deepcopy(nm.get_layerinfo())) elapse.append(duration) assert not np.isnan(loss_value), 'Model diverged with loss = NaN' if step % 10 == 0: glayerinfo.append(copy.deepcopy(nm.get_layerinfo())) # Print layerinfo print(info) examples_per_sec = FLAGS.batch_size / float(duration) format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f ' 'sec/batch) elapse %.5f s top_1 %.5f top_5 %.5f') pred_1 = top_1 / (FLAGS.batch_size*FLAGS.num_gpus) pred_5 = top_5 / (FLAGS.batch_size*FLAGS.num_gpus) print(format_str % (datetime.now(), step, loss_value, examples_per_sec, duration, sum(elapse), pred_1, pred_5)) sys.stdout.flush() tl = timeline.Timeline(run_metadata.step_stats) if FLAGS.profile is True: if FLAGS.use_bitpack is False: trace_file = tf.gfile.Open(name='timeline%03d.json' % step, mode='w') else: trace_file = tf.gfile.Open(name='bitpack_timeline%03d.json' % step, mode='w') trace_file.write(tl.generate_chrome_trace_format(show_memory=True)) if step % 100 == 0: summary_str = sess.run(summary_op, feed_dict=bits_dict) summary_writer.add_summary(summary_str, step) # Save the model checkpoint periodically. if step % 4000 == 0 or (step + 1) == FLAGS.max_steps: checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt') saver.save(sess, checkpoint_path, global_step=step) glayerinfo.append(copy.deepcopy(nm.get_layerinfo())) #pickle.dump(gweights, open(wnp_name,'wb')) pickle.dump(glayerinfo, open(lip_name,'wb'))
def train(): with tf.Graph().as_default(), tf.device('/cpu:0'): # Get images and labels for CIFAR-10. #dataset = CIFARData(subset='train') dataset = ImagenetData(subset='train') assert dataset.data_files() #test_set = CIFARData(subset='validation') test_set = ImagenetData(subset='validation') assert test_set.data_files() epoch1 = .5 * helper.MAX_EPOCHS epoch2 = .75 * helper.MAX_EPOCHS step1 = dataset.num_examples_per_epoch() * epoch1 // ( helper.BATCH_SIZE) step2 = dataset.num_examples_per_epoch() * epoch2 // ( helper.BATCH_SIZE) print('Reducing learning rate at step ' + str(step1) + ' and step ' + str(step2) + ' and ending at ' + str(helper.MAX_STEPS)) # Create a variable to count the number of train() calls. This equals the # number of batches processed * FLAGS.num_gpus. global_step = tf.get_variable('global_step', [], initializer=tf.constant_initializer(0), trainable=False) # Learning rate lr = .1 #learning_rate = tf.placeholder(tf.float32, shape=[], name='learning_rate') dropout = tf.placeholder(tf.float32, shape=[], name='dropout') is_training = tf.placeholder(tf.bool, shape=[], name='is_training') boundaries = [step1, step2] values = [lr, lr / 10, lr / 100] learning_rate = tf.train.piecewise_constant(global_step, boundaries, values, name=None) decayed_lr = tf.train.polynomial_decay(lr, global_step, helper.MAX_STEPS, end_learning_rate=0.0001, power=4.0, cycle=False, name=None) # Create an optimizer that performs gradient descent. with tf.name_scope('Optimizer'): opt = tf.train.MomentumOptimizer(learning_rate=decayed_lr, momentum=0.9, use_nesterov=True) #opt = tf.train.MomentumOptimizer(learning_rate=learning_rate, momentum=0.9, use_nesterov=True) tf.summary.scalar('decayed_learning_rate', decayed_lr) tf.summary.scalar('learning_rate', learning_rate) # Override the number of preprocessing threads to account for the increased # number of GPU towers. num_preprocess_threads = helper.NUM_THREADS * helper.N_GPUS distorted_images, distorted_labels = image_processing.distorted_inputs( dataset, batch_size=helper.SPLIT_BATCH_SIZE, num_preprocess_threads=num_preprocess_threads) #images, labels = image_processing.inputs(dataset, batch_size=helper.BATCH_SIZE, num_preprocess_threads=num_preprocess_threads) test_images, test_labels = image_processing.inputs( test_set, batch_size=helper.SPLIT_BATCH_SIZE, num_preprocess_threads=num_preprocess_threads) input_summaries = copy.copy(tf.get_collection(tf.GraphKeys.SUMMARIES)) # Split the batch of images and labels for towers. #images_splits = tf.split(axis=0, num_or_size_splits=helper.N_GPUS, value=distorted_images) #labels_splits = tf.split(axis=0, num_or_size_splits=helper.N_GPUS, value=distorted_labels) batch_queue = tf.contrib.slim.prefetch_queue.prefetch_queue( [distorted_images, distorted_labels], capacity=2 * helper.N_GPUS) # Calculate the gradients for each model tower. tower_grads = [] with tf.variable_scope(tf.get_variable_scope()): for i in range(helper.N_GPUS): with tf.device('/gpu:%d' % i): with tf.name_scope('%s_%d' % (helper.TOWER_NAME, i)) as scope: # Calculate the loss for one tower of the CIFAR model. This function # constructs the entire CIFAR model but shares the variables across # all towers. image_batch, label_batch = batch_queue.dequeue() loss = tower_loss(scope, image_batch, label_batch, dropout=dropout, is_training=is_training) #loss = tower_loss(scope, images_splits[i], labels_splits[i], dropout=dropout, is_training=is_training) # Retain the summaries from the final tower. 
summaries = tf.get_collection(tf.GraphKeys.SUMMARIES, scope) tf.get_variable_scope().reuse_variables() grads = opt.compute_gradients(loss) tower_grads.append(grads) # We must calculate the mean of each gradient. Note that this is the # synchronization point across all towers. grads = average_gradients(tower_grads) # Add a summaries for the input processing and global_step. summaries.extend(input_summaries) # Apply the gradients to adjust the shared variables. update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) with tf.control_dependencies(update_ops): apply_gradient_op = opt.apply_gradients(grads, global_step=global_step) # Track the moving averages of all trainable variables. variable_averages = tf.train.ExponentialMovingAverage( helper.MOVING_AVERAGE_DECAY, global_step) variables_averages_op = variable_averages.apply( tf.trainable_variables()) # Group all updates to into a single train op. #train_op = apply_gradient_op train_op = tf.group(apply_gradient_op, variables_averages_op) # Add histograms for trainable variables. #for var in tf.trainable_variables(): # summaries.append(tf.summary.histogram(var.op.name, var)) for grad, var in grads: summaries.append(tf.summary.histogram(var.op.name, var)) #summaries.append(tf.summary.histogram(var.op.name + '_gradient', grad)) # Create a saver. saver = tf.train.Saver(tf.global_variables()) cross_entropy_op = tf.reduce_mean(tf.get_collection('cross_entropies'), name='cross_entropy') accuracy_op = tf.reduce_mean(tf.get_collection('accuracy'), name='accuracies') summaries.append(tf.summary.scalar('cross_entropy', cross_entropy_op)) summaries.append(tf.summary.scalar('accuracy', accuracy_op)) # Build the summary operation from the last tower summaries. summary_op = tf.summary.merge(summaries) # Build an initialization operation to run below. init = tf.global_variables_initializer() # Start running operations on the Graph. allow_soft_placement must be set to # True to build towers on GPU, as some of the ops do not have GPU # implementations. sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)) #run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE) #run_metadata = tf.RunMetadata() sess.run(init) tf.train.start_queue_runners(sess=sess) if RESTORE == True: ckpt = tf.train.get_checkpoint_state(SAVE_POINT) saver.restore(sess, ckpt.model_checkpoint_path) # Assuming model_checkpoint_path looks something like: # /my-favorite-path/imagenet_train/model.ckpt-0, # extract global_step from it. restored_step = ckpt.model_checkpoint_path.split('/')[-1].split( '-')[-1] print('Successfully loaded model from %s at step=%s.' 
                  % (ckpt.model_checkpoint_path, restored_step))
            step = int(restored_step)
            range_step = range(step, helper.MAX_STEPS)
            tf.get_variable_scope().reuse_variables()
            global_step = tf.get_variable('global_step', trainable=False)
        else:
            range_step = range(helper.MAX_STEPS)

        summary_writer = tf.summary.FileWriter('summary', graph=sess.graph)

        num_params = helper.count_params() / 1e6
        print('Total number of params = %.2fM' % num_params)

        print("training")
        top1_error = [-1.0, -1.0]
        top1_step = 0
        top5_error = [-1.0, -1.0]
        top5_step = 0
        for step in range_step:
            start_time = time.time()
            _, loss_value, cross_entropy_value, accuracy_value = sess.run(
                [train_op, loss, cross_entropy_op, accuracy_op],
                feed_dict={dropout: 0.8, is_training: True}
            )  # , options=run_options, run_metadata=run_metadata)  # , learning_rate: lr})
            duration = time.time() - start_time

            if step == step1 or step == step2:
                print('Decreasing Learning Rate')
                # Note: this Python-level update has no effect on the graph; the
                # schedule is already handled by tf.train.piecewise_constant and
                # tf.train.polynomial_decay above.
                lr /= 10

            if step % 10 == 0:
                num_examples_per_step = helper.BATCH_SIZE
                examples_per_sec = num_examples_per_step / duration
                sec_per_batch = duration

                format_str = ('step %d, loss = %.2f, cross entropy = %.2f, '
                              'accuracy = %.2f, %.3f sec/batch')
                print(format_str % (step, loss_value, cross_entropy_value,
                                    accuracy_value, sec_per_batch))
                """
                # Create the Timeline object, and write it to a json
                tl = timeline.Timeline(run_metadata.step_stats)
                ctf = tl.generate_chrome_trace_format()
                with open('timeline.json', 'w') as f:
                    f.write(ctf)
                """

            if step % 100 == 0:
                summary_str = sess.run(summary_op, feed_dict={
                    dropout: 0.8,
                    is_training: False
                })  # , learning_rate: lr})
                summary_writer.add_summary(summary_str, step)

            # Save the model checkpoint periodically.
            if step % 5000 == 0 or (step + 1) == helper.MAX_STEPS:
                if step != 0:
                    checkpoint_path = SAVE_POINT + 'model.ckpt'
                    saver.save(sess, checkpoint_path, global_step=step)
                    print('Model saved')
                    # evaluate(distorted_images, distorted_labels, sess,
                    #          dropout=dropout, is_training=is_training, train=True)
                    top1, top5 = evaluate(test_images, test_labels, sess,
                                          dropout=dropout,
                                          is_training=is_training, train=False)
                    if top1 > top1_error[0]:
                        top1_error[0] = top1
                        top1_error[1] = top5
                        top1_step = step
                    if top5 > top5_error[1]:
                        top5_error[0] = top1
                        top5_error[1] = top5
                        top5_step = step
                    print("Best top1 model achieved top1: %.4f, top5: %.4f at step %d"
                          % (top1_error[0], top1_error[1], top1_step))
                    print("Best top5 model achieved top1: %.4f, top5: %.4f at step %d"
                          % (top5_error[0], top5_error[1], top5_step))
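# --- Hedged sketch (not part of the original source): `evaluate` is called
# in train() above but not defined in this snippet. A minimal stand-in that
# reuses the tower variables to build an inference graph and averages top-1 /
# top-5 accuracy over a few batches. `helper.inference` and `num_batches` are
# assumptions; building the eval ops once outside the training loop would
# avoid growing the graph on every call.
def evaluate(images, labels, sess, dropout, is_training, train=False,
             num_batches=50):
    """Average top-1/top-5 accuracy over `num_batches` batches (sketch)."""
    with tf.variable_scope(tf.get_variable_scope(), reuse=True):
        logits = helper.inference(images, dropout=dropout,
                                  is_training=is_training)
    top1_op = tf.reduce_mean(tf.cast(tf.nn.in_top_k(logits, labels, 1),
                                     tf.float32))
    top5_op = tf.reduce_mean(tf.cast(tf.nn.in_top_k(logits, labels, 5),
                                     tf.float32))
    top1, top5 = 0.0, 0.0
    for _ in range(num_batches):
        t1, t5 = sess.run([top1_op, top5_op],
                          feed_dict={dropout: 1.0, is_training: False})
        top1 += t1
        top5 += t5
    return top1 / num_batches, top5 / num_batches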
def train(target, dataset, cluster_spec):
    """Train Inception on a dataset for a number of steps."""
    # The numbers of workers and parameter servers are inferred from the worker
    # and ps hosts strings.
    num_workers = len(cluster_spec.as_dict()['worker'])
    num_parameter_servers = len(cluster_spec.as_dict()['ps'])
    # If no value is given, num_replicas_to_aggregate defaults to the number of
    # workers.
    if FLAGS.num_replicas_to_aggregate == -1:
        num_replicas_to_aggregate = num_workers
    else:
        num_replicas_to_aggregate = FLAGS.num_replicas_to_aggregate

    # Both should be greater than 0 in a distributed training.
    assert num_workers > 0 and num_parameter_servers > 0, (
        'num_workers and num_parameter_servers must be > 0.')

    # Choose worker 0 as the chief. Note that any worker could be the chief
    # but there should be only one chief.
    is_chief = (FLAGS.task_id == 0)
    # batchSizeManager = BatchSizeManager(32, 4)

    # Ops are assigned to worker by default.
    tf.logging.info('num_parameter_servers: %d' % num_parameter_servers)
    partitioner = tf.fixed_size_partitioner(num_parameter_servers, 0)

    device_setter = tf.train.replica_device_setter(
        ps_tasks=num_parameter_servers)
    slim = tf.contrib.slim
    with tf.device('/job:worker/task:%d' % FLAGS.task_id):
        with tf.variable_scope('root', partitioner=partitioner):
            # Variables and their related init/assign ops are assigned to ps.
            # with slim.arg_scope(
            #         [slim.variables.variable, slim.variables.global_step],
            #         device=slim.variables.VariableDeviceChooser(num_parameter_servers)):
            with tf.device(device_setter):  # partitioner=partitioner):
                # Create a variable to count the number of train() calls. This
                # equals the number of updates applied to the variables.
                # global_step = slim.variables.global_step()
                global_step = tf.Variable(0, trainable=False)

                # Calculate the learning rate schedule.
                batch_size = tf.placeholder(dtype=tf.int32, shape=(),
                                            name='batch_size')
                num_batches_per_epoch = (dataset.num_examples_per_epoch() /
                                         FLAGS.batch_size)
                # Decay steps need to be divided by the number of replicas to
                # aggregate.
                decay_steps = int(num_batches_per_epoch *
                                  FLAGS.num_epochs_per_decay /
                                  num_replicas_to_aggregate)

                # Decay the learning rate exponentially based on the number of
                # steps.
                lr = tf.train.exponential_decay(
                    FLAGS.initial_learning_rate,
                    global_step,
                    decay_steps,
                    FLAGS.learning_rate_decay_factor,
                    staircase=True)
                # Add a summary to track the learning rate.
                # tf.summary.scalar('learning_rate', lr)

                # Create an optimizer that performs gradient descent.
                images, labels = image_processing.distorted_inputs(
                    dataset,
                    batch_size,
                    num_preprocess_threads=FLAGS.num_preprocess_threads)
                print(images.get_shape())
                print(labels.get_shape())

                # Number of classes in the Dataset label set plus 1.
                # Label 0 is reserved for an (unused) background class.
                # num_classes = dataset.num_classes() + 1
                num_classes = dataset.num_classes()
                print(num_classes)
                # logits = inception.inference(images, num_classes,
                #                              for_training=True)
                network_fn = nets_factory.get_network_fn(
                    'inception_v3', num_classes=num_classes)
                (logits, _) = network_fn(images)
                print(logits.get_shape())

                # Add classification loss.
                # inception.loss(logits, labels, batch_size)

                # Gather all of the losses including regularization losses.
labels = tf.one_hot(labels, 1000, 1, 0) cross_entropy = tf.losses.softmax_cross_entropy( logits=logits, onehot_labels=labels) # losses = tf.get_collection(slim.losses.LOSSES_COLLECTION) # losses += tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES) losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES) total_loss = cross_entropy + _WEIGHT_DECAY * tf.add_n( [tf.nn.l2_loss(v) for v in tf.trainable_variables()]) # total_loss = tf.add_n(losses, name='total_loss') loss_averages = tf.train.ExponentialMovingAverage(0.9, name='avg') loss_averages_op = loss_averages.apply(losses + [total_loss]) with tf.control_dependencies([loss_averages_op]): opt = tf.train.RMSPropOptimizer(lr, RMSPROP_DECAY, momentum=RMSPROP_MOMENTUM, epsilon=RMSPROP_EPSILON) grads0 = opt.compute_gradients(total_loss) grads = [(tf.scalar_mul(tf.cast(batch_size/FLAGS.batch_size, tf.float32), grad), var) for grad, var in grads0] total_loss = tf.identity(total_loss) exp_moving_averager = tf.train.ExponentialMovingAverage( MOVING_AVERAGE_DECAY, global_step) variables_averages_op = exp_moving_averager.apply(tf.trainable_variables()) apply_gradients_op = opt.apply_gradients(grads, global_step=global_step) with tf.control_dependencies([apply_gradients_op, variables_averages_op]): train_op = tf.identity(total_loss, name='train_op') # Get chief queue_runners and init_tokens, which is used to synchronize # replicas. More details can be found in SyncReplicasOptimizer. # chief_queue_runners = [opt.get_chief_queue_runner()] # init_tokens_op = opt.get_init_tokens_op() # Create a saver. saver = tf.train.Saver() # Build the summary operation based on the TF collection of Summaries. # summary_op = tf.summary.merge_all() # Build an initialization operation to run below. init_op = tf.global_variables_initializer() # We run the summaries in the same thread as the training operations by # passing in None for summary_op to avoid a summary_thread being started. # Running summaries and training operations in parallel could run out of # GPU memory. sv = tf.train.Supervisor(is_chief=is_chief, logdir=FLAGS.train_dir, init_op=init_op, summary_op=None, global_step=global_step, recovery_wait_secs=1, saver=None, save_model_secs=FLAGS.save_interval_secs) tf.logging.info('%s Supervisor' % datetime.now()) sess_config = tf.ConfigProto( allow_soft_placement=True, log_device_placement=FLAGS.log_device_placement) # Get a session. sess = sv.prepare_or_wait_for_session(target, config=sess_config) # Start the queue runners. queue_runners = tf.get_collection(tf.GraphKeys.QUEUE_RUNNERS) sv.start_queue_runners(sess, queue_runners) tf.logging.info('Started %d queues for processing input data.', len(queue_runners)) # if is_chief: # sv.start_queue_runners(sess, chief_queue_runners) # sess.run(init_tokens_op) # Train, checking for Nans. Concurrently run the summary operation at a # specified interval. Note that the summary_op and train_op never run # simultaneously in order to prevent running out of GPU memory. 
# next_summary_time = time.time() + FLAGS.save_summaries_secs step = 0 time0 = time.time() batch_size_num = 1 while not sv.should_stop(): try: start_time = time.time() batch_size_num = 32 # batch_size_num = int((int(step)/3*10)) % 100000 + 1 # if step < 5: # batch_size_num = 32 # batch_size_num = (batch_size_num ) % 64 + 1 # else: # batch_size_num = 80 run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE) run_metadata = tf.RunMetadata() my_images, loss_value, step = sess.run([images, train_op, global_step], feed_dict={batch_size: batch_size_num}, options=run_options, run_metadata=run_metadata) b = time.time() # assert not np.isnan(loss_value), 'Model diverged with loss = NaN' if step > FLAGS.max_steps: break duration = time.time() - start_time # thread = threading2.Thread(target=get_computation_time, name="get_computation_time",args=(run_metadata.step_stats,step,)) # thread.start() # tl = timeline.Timeline(run_metadata.step_stats) # last_batch_time = tl.get_local_step_duration('sync_token_q_Dequeue') c0 = time.time() # batch_size_num = batchSizeManager.dictate_new_batch_size(FLAGS.task_id, last_batch_time) # batch_size_num = rpcClient.update_batch_size(FLAGS.task_id, last_batch_time, available_cpu, available_memory, step, batch_size_num) # ctf = tl.generate_chrome_trace_format() # with open("timeline.json", 'a') as f: # f.write(ctf) if step % 1 == 0: examples_per_sec = FLAGS.batch_size / float(duration) c = time.time() tf.logging.info("time statistics" + " - train_time: " + str(b-start_time) + " - get_batch_time: " + str(c0-b) + " - get_bs_time: " + str(c-c0) + " - accum_time: " + str(c-time0) + " - batch_size: " + str(batch_size_num)) format_str = ('Worker %d: %s: step %d, loss = %.2f' '(%.1f examples/sec; %.3f sec/batch)') tf.logging.info(format_str % (FLAGS.task_id, datetime.now(), step, loss_value, examples_per_sec, duration)) # Determine if the summary_op should be run on the chief worker. # if is_chief and next_summary_time < time.time(): # tf.logging.info('Running Summary operation on the chief.') # summary_str = sess.run(summary_op) # sv.summary_computed(sess, summary_str) # tf.logging.info('Finished running Summary operation.') # Determine the next time for running the summary. # next_summary_time += FLAGS.save_summaries_secs except: if is_chief: tf.logging.info('Chief got exception while running!') raise # Stop the supervisor. This also waits for service threads to finish. sv.stop()
def train():
    with tf.Graph().as_default(), tf.device('/cpu:0'):
        # Create a variable to count the number of train() calls. This equals
        # the number of batches processed * FLAGS.num_gpus.
        global_step = tf.get_variable('global_step', [],
                                      initializer=tf.constant_initializer(0),
                                      trainable=False)

        # Learning rate
        lr = .001

        # Create an optimizer that performs gradient descent.
        opt = tf.train.AdamOptimizer(lr)

        # Get images and labels for CIFAR-10.
        dataset = ImagenetData(subset='train')
        assert dataset.data_files()
        assert helper.BATCH_SIZE % helper.N_GPUS == 0, (
            'Batch size must be divisible by number of GPUs')
        split_batch_size = int(helper.BATCH_SIZE / helper.N_GPUS)

        # Override the number of preprocessing threads to account for the
        # increased number of GPU towers.
        num_preprocess_threads = helper.NUM_THREADS * helper.N_GPUS
        images, labels = image_processing.distorted_inputs(
            dataset, batch_size=helper.BATCH_SIZE,
            num_preprocess_threads=num_preprocess_threads)

        # Split the batch of images and labels for towers.
        images_splits = tf.split(axis=0, num_or_size_splits=helper.N_GPUS,
                                 value=images)
        labels_splits = tf.split(axis=0, num_or_size_splits=helper.N_GPUS,
                                 value=labels)

        # Calculate the gradients for each model tower.
        tower_grads = []
        with tf.variable_scope(tf.get_variable_scope()):
            for i in range(helper.N_GPUS):
                with tf.device('/gpu:%d' % i):
                    with tf.name_scope('%s_%d' % (helper.TOWER_NAME, i)) as scope:
                        # Calculate the loss for one tower of the CIFAR model.
                        # This function constructs the entire CIFAR model but
                        # shares the variables across all towers.
                        loss = tower_loss(scope, images_splits[i],
                                          labels_splits[i])
                        tf.get_variable_scope().reuse_variables()
                        grads = opt.compute_gradients(loss)
                        tower_grads.append(grads)

        # We must calculate the mean of each gradient. Note that this is the
        # synchronization point across all towers.
        grads = average_gradients(tower_grads)

        # Apply the gradients to adjust the shared variables.
        apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)

        # Track the moving averages of all trainable variables.
        variable_averages = tf.train.ExponentialMovingAverage(
            helper.MOVING_AVERAGE_DECAY, global_step)
        variables_averages_op = variable_averages.apply(
            tf.trainable_variables())

        # Group all updates into a single train op.
        train_op = tf.group(apply_gradient_op, variables_averages_op)

        # Build an initialization operation to run below.
        init = tf.global_variables_initializer()

        # Start running operations on the Graph. allow_soft_placement must be
        # set to True to build towers on GPU, as some of the ops do not have
        # GPU implementations.
        sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True,
                                                log_device_placement=False))
        sess.run(init)

        tf.train.start_queue_runners(sess=sess)

        print("training")
        # for epoch in range(helper.MAX_EPOCH):
        for epoch in range(helper.MAX_STEPS):
            start_time = time.time()
            _, loss_value = sess.run([train_op, loss])
            duration = time.time() - start_time

            num_examples_per_step = helper.BATCH_SIZE * helper.N_GPUS
            examples_per_sec = num_examples_per_step / duration
            sec_per_batch = duration / helper.N_GPUS

            format_str = ('%s: step %d, loss = %.2f '
                          '(%.1f examples/sec; %.3f sec/batch)')
            print(format_str % (datetime.now(), epoch, loss_value,
                                examples_per_sec, sec_per_batch))
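# --- Hedged sketch (not part of the original source): `tower_loss` is called
# by the multi-GPU train() functions above but not defined in this snippet.
# A minimal version following the standard tower pattern; `helper.inference`
# and the exact loss construction are assumptions.
def tower_loss(scope, images, labels):
    """Build the model on one tower and return its total loss (sketch)."""
    logits = helper.inference(images)
    cross_entropy = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits,
                                                       labels=labels),
        name='cross_entropy')
    tf.add_to_collection('losses', cross_entropy)
    # Total loss = data loss + any regularization terms added to 'losses'
    # within this tower's scope.
    return tf.add_n(tf.get_collection('losses', scope), name='total_loss')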
def train_dis_(dataset): ps_hosts = arg_parsing.PS_HOSTS.split(",") worker_hosts = arg_parsing.WORKER_HOSTS.split(",") cluster = tf.train.ClusterSpec({"ps": ps_hosts, "worker": worker_hosts}) server = tf.train.Server(cluster, job_name=FLAGS.job_name, task_index=FLAGS.task_index) if FLAGS.job_name == "ps": server.join() if FLAGS.job_name == "worker": print('START') with tf.device( tf.train.replica_device_setter( worker_device="/job:worker/task:%d" % FLAGS.task_index, cluster=cluster)): global_step = tf.Variable(0, dtype=tf.int32, name='global_step', trainable=False) lr = _lr(global_step) with tf.name_scope("train_process"): with tf.device('/cpu:0'): images, labels = image_processing.distorted_inputs( dataset, num_preprocess_threads=FLAGS.num_preprocess_threads) logits = _logits(images) loss = _loss(logits, labels) train_op = _optimization(loss, global_step, lr, FLAGS.issync, len(worker_hosts)) # with tf.name_scope("global_step"): # tf.summary.scalar('global_step', global_step) val_acc_sum = val(loss, dataset) class _LoggerHook(tf.train.SessionRunHook): def begin(self): self._local_step = 0 self._start_time = time.time() self._total_loss = 0 def before_run(self, run_context): self._local_step += 1 return tf.train.SessionRunArgs(loss) def after_run(self, run_context, run_values): self._step = run_context.session.run(global_step) loss_value = run_values.results self._total_loss += loss_value if self._step % FLAGS.log_frequency == 0: current_time = time.time() duration = current_time - self._start_time self._start_time = current_time avg_loss = self._total_loss / self._local_step eg_per_sec = FLAGS.log_frequency * FLAGS.batch_size / duration sec_per_batch = float(duration / FLAGS.log_frequency) print( '%s: training step %d cur loss = %.4f avg loss = %.4f (%.1f images/sec %.3f sec/batch)' % (datetime.now(), self._step, loss_value, avg_loss, eg_per_sec, sec_per_batch)) class _ValHook(tf.train.SessionRunHook): def begin(self): # self._step = 0 self._val_step = int( math.ceil(arg_parsing.NUM_EXAMPLES_PER_EPOCH_FOR_EVAL / FLAGS.batch_size)) def before_run(self, run_context): # self._step += 1 self._total_val_accu = 0 self._start_time = time.time() def after_run(self, run_context, run_values): # if FLAGS.issync: self._step = run_context.session.run(global_step) if self._step % FLAGS.steps_to_val == 0: if (FLAGS.task_index == 0 and FLAGS.issync) or not FLAGS.issync: for j in range(self._val_step): self._total_val_accu += run_context.session.run( val_acc_sum) print( '%s: step %d validation accuracy = %.4f (%.3f sec %d batches)' % (datetime.now(), self._step, self._total_val_accu / float(self._val_step), float(time.time() - self._start_time), self._val_step)) class _ExitHook(tf.train.SessionRunHook): # same as StopAtStepHook def begin(self): self._val_step = int( math.ceil(arg_parsing.NUM_EXAMPLES_PER_EPOCH_FOR_EVAL / FLAGS.batch_size)) def before_run(self, run_context): self._total_val_accu = 0 self._start_time = time.time() def after_run(self, run_context, run_values): self._step = run_context.session.run(global_step) if self._step >= FLAGS.max_steps: if FLAGS.task_index == 0 and not FLAGS.issync: for j in range(self._val_step * 2): self._total_val_accu += run_context.session.run( val_acc_sum) print( '%s: last step %d validation final accuracy = %.4f (%.3f sec(2 times) %d batches)' % (datetime.now(), self._step, self._total_val_accu / float(self._val_step * 2), float(time.time() - self._start_time), self._val_step)) run_context.request_stop() # all_hooks=[tf.train.NanTensorHook(loss), 
        #             tf.train.StopAtStepHook(last_step=FLAGS.max_steps),
        #             _LoggerHook(), _ValHook()]
        all_hooks = [
            tf.train.NanTensorHook(loss),
            _LoggerHook(),
            _ValHook(),
            _ExitHook()
        ]
        if FLAGS.issync:
            # NOTE: sync_replicas_hook is assumed to be created where
            # _optimization() wraps its optimizer in a SyncReplicasOptimizer;
            # it is not defined in this snippet (see the sketch below).
            all_hooks.append(sync_replicas_hook)
        if FLAGS.debug:
            all_hooks.append(tfdbg.LocalCLIDebugHook(ui_type='curses'))
        if FLAGS.finetune:
            print('Finetune from %s' % FLAGS.finetune)
            saver = tf.train.Saver()

        config = tf.ConfigProto(
            log_device_placement=FLAGS.log_device_placement)
        config.gpu_options.allow_growth = True

        with tf.train.MonitoredTrainingSession(
                master=server.target,
                is_chief=(FLAGS.task_index == 0),
                checkpoint_dir=FLAGS.model_dir,
                hooks=all_hooks,
                config=config,
                save_summaries_steps=100,
                save_summaries_secs=None,
                log_step_count_steps=None) as sess:
            if FLAGS.finetune:
                print('Load Pretrained model')
                ckpt = tf.train.get_checkpoint_state(FLAGS.finetune)
                if ckpt and ckpt.model_checkpoint_path:
                    saver.restore(sess, ckpt.model_checkpoint_path)
            print('-------------------------')
            while not sess.should_stop():
                sess.run(train_op)
    print('DONE')
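# --- Hedged sketch (not part of the original source): the
# `sync_replicas_hook` appended above is not defined in this snippet. With
# tf.train.SyncReplicasOptimizer it would typically be produced as follows;
# the helper name, the inner optimizer, and the arguments are illustrative.
def _make_sync_optimization(loss, global_step, lr, num_workers, is_chief):
    """Wrap a plain optimizer for synchronous replica training (sketch)."""
    opt = tf.train.SyncReplicasOptimizer(
        tf.train.GradientDescentOptimizer(lr),
        replicas_to_aggregate=num_workers,
        total_num_replicas=num_workers)
    train_op = opt.minimize(loss, global_step=global_step)
    # The hook handles the queue/token bookkeeping needed for synchronous
    # training inside MonitoredTrainingSession.
    sync_replicas_hook = opt.make_session_run_hook(is_chief)
    return train_op, sync_replicas_hook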
def train(dataset):
    """Train on dataset for a number of steps."""
    # with tf.Graph().as_default(), tf.device('/cpu:0'):
    with tf.Graph().as_default():
        global_step = tf.Variable(0, trainable=False)
        num_preprocess_threads = FLAGS.num_preprocess_threads * FLAGS.num_gpus
        with tf.device('/cpu:0'):
            images, pitchs, yaws, rolls, names = image_processing.distorted_inputs(
                dataset, num_preprocess_threads=num_preprocess_threads)
        p = tf.expand_dims(pitchs, 1)
        y = tf.expand_dims(yaws, 1)
        r = tf.expand_dims(rolls, 1)
        labels = tf.concat([p, y, r], 1)

        train_output = model.inference(images)
        train_loss = model.losses(train_output, labels)
        add_global = global_step.assign_add(1)
        train_op = model.trainning(train_loss, FLAGS.learning_rate, global_step)
        summary_op = tf.summary.merge_all()

        sess = tf.Session()
        train_writer = tf.summary.FileWriter(FLAGS.train_dir, sess.graph)
        saver = tf.train.Saver()
        sess.run(tf.global_variables_initializer())

        """
        This code inspects the variables in conv1:
        print(sess.run(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)))
        w = tf.contrib.framework.get_variables('conv1')
        t = tf.nn.l2_loss(w[0])
        print(sess.run(t))
        """

        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)
        try:
            for step in np.arange(FLAGS.max_steps):
                if coord.should_stop():
                    break
                _, _, tra_loss = sess.run([add_global, train_op, train_loss])
                if step % 50 == 0:
                    gs = sess.run(global_step)
                    print('Step %d, train loss = %.2f, global_step = %d'
                          % (step, tra_loss, gs))
                    summary_str = sess.run(summary_op)
                    train_writer.add_summary(summary_str, step)
                if step % 2000 == 0 or (step + 1) == FLAGS.max_steps:
                    checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
                    saver.save(sess, checkpoint_path, global_step=step)
        except tf.errors.OutOfRangeError:
            print('Done training -- epoch limit reached')
        finally:
            coord.request_stop()
        coord.join(threads)
        sess.close()
        # coord = tf.train.Coordinator()
        # threads = tf.train.start_queue_runners(sess=sess, coord=coord)
        # try:
        #     print(sess.run(pitchs))
        # except Exception as e:
        #     coord.request_stop(e)
        # coord.request_stop()
        # coord.join(threads)
        # sess.close()
        # sv = tf.train.Supervisor()
        # with sv.managed_session() as sess:
        #     print(sess.run(pitchs))
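# --- Hedged sketch (not part of the original source): the `model` module
# (inference / losses / trainning, spelling as referenced above) is not
# included in this snippet. A plausible stand-in for model.trainning; note
# that it deliberately does not pass global_step to minimize(), because
# train() above already increments global_step explicitly via assign_add,
# and passing it here as well would advance the counter twice per step.
def trainning(loss, learning_rate, global_step):
    """Build the training op for the head-pose model (sketch)."""
    with tf.name_scope('optimizer'):
        optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
        train_op = optimizer.minimize(loss)
    return train_op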
def main(_):
    print(FLAGS.num_preprocess_threads)
    trainset = GoodsData('train')
    # assert trainset.data_files()
    validationset = GoodsData('validation')
    assert validationset.data_files()
    # get_tuned_variables()
    # get_trainable_variables()
    # num_batches_per_epoch = (trainset.num_examples_per_epoch() /
    #                          FLAGS.batch_size)
    num_preprocess_threads = FLAGS.num_preprocess_threads * FLAGS.num_gpus
    images_train, labels_train = image_processing.distorted_inputs(
        trainset, num_preprocess_threads=num_preprocess_threads)
    images_validation, labels_validation = image_processing.distorted_inputs(
        validationset, batch_size=64,
        num_preprocess_threads=num_preprocess_threads)
    # images_splits = tf.split(axis=0, num_or_size_splits=FLAGS.num_gpus, value=images)
    # labels_splits = tf.split(axis=0, num_or_size_splits=FLAGS.num_gpus, value=labels)
    input_summaries = copy.copy(tf.get_collection(tf.GraphKeys.SUMMARIES))

    # Number of classes in the Dataset label set plus 1.
    # Label 0 is reserved for an (unused) background class.
    num_classes = trainset.num_classes() + 1
    # print(images_train.shape)
    # print(labels_train.shape)
    images = tf.placeholder(tf.float32,
                            [None, images_train.shape[1], images_train.shape[2], 3],
                            name="input_images")
    labels = tf.placeholder(tf.int64, [None], name="labels")

    with slim.arg_scope(inception_v3.inception_v3_arg_scope()):
        logits, _ = inception_v3.inception_v3(images, num_classes=num_classes)

    tuned_variables = get_all_variables()
    trainable_variables = get_all_variables()
    checkpoint_path = FLAGS.pretrained_model_checkpoint_path

    # Compute the accuracy.
    with tf.name_scope("evaluation"):
        prediction = tf.argmax(logits, 1)
        correct_prediction = tf.equal(prediction, labels)
        evaluation_step = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

    # Load the pre-trained weights.
    checkpoint_path = tf.train.latest_checkpoint(checkpoint_path)
    load_fn = slim.assign_from_checkpoint_fn(checkpoint_path,
                                             tuned_variables,
                                             ignore_missing_vars=True)
    # Used to store the fine-tuned weights.
    # print(get_tuned_variables())
    # saver = tf.train.Saver()

    config = tf.ConfigProto(
        allow_soft_placement=True,
        log_device_placement=FLAGS.log_device_placement
    )
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    # with tf.Session(config=config) as sess:
    #     sess.as_default()
    init = tf.global_variables_initializer()
    sess.run(init)
    print("loading tuned variables from %s" % checkpoint_path)
    load_fn(sess)
    # sess.run(load_fn)

    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    # tf.train.batch
    # start = 0
    # end = FLAGS.batch_size
    # if tf.gfile.Exists(FLAGS.train_dir):
    #     tf.gfile.DeleteRecursively(FLAGS.train_dir)
    # tf.gfile.MakeDirs(FLAGS.train_dir)
    for step in range(FLAGS.max_steps):
        start_time = time.time()
        image_batch, label_batch = sess.run([images_validation,
                                             labels_validation])
        validation_accuracy = sess.run(evaluation_step,
                                       feed_dict={images: image_batch,
                                                  labels: label_batch})
        label_prediction = sess.run(prediction,
                                    feed_dict={images: image_batch,
                                               labels: label_batch})
        print(label_prediction)
        print('Step %d: Validation accuracy = %.1f%%' %
              (step, validation_accuracy * 100.0))
        duration = time.time() - start_time

    # Shut down the input queue threads cleanly.
    coord.request_stop()
    coord.join(threads)
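# --- Hedged sketch (not part of the original source): main() above reports
# validation accuracy one batch at a time; averaging over several batches
# gives a stabler number. This reuses the ops defined in main(); the
# `num_val_batches` default is an assumption.
def validate(sess, evaluation_step, images, labels,
             images_validation, labels_validation, num_val_batches=100):
    """Average validation accuracy over multiple batches (sketch)."""
    total = 0.0
    for _ in range(num_val_batches):
        image_batch, label_batch = sess.run([images_validation,
                                             labels_validation])
        total += sess.run(evaluation_step,
                          feed_dict={images: image_batch,
                                     labels: label_batch})
    return total / num_val_batches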