def __build_layer_ops(self):
    """Build layer-wise fine-tuning operations.

    For each maskable variable, builds three ops: a 'prune' op that takes one
    SGD step on the layer's regression loss and then soft-thresholds channel
    norms to zero out weak channels, a 'finetune' op that trains the surviving
    channels with Adam, and an 'init_opt' op that resets Adam's slot variables.

    Returns:
    * layer_ops: list of training and initialization operations for each layer
    * lrn_rates_pgd: list of layer-wise learning rate
    * prune_perctls: list of layer-wise pruning percentiles
    """
    layer_ops = []
    lrn_rates_pgd = []  # list of layer-wise learning rate
    prune_perctls = []  # list of layer-wise pruning percentiles
    for idx, var_prnd in enumerate(self.vars_prnd['maskable']):
        # create placeholders (fed per-layer at run time)
        lrn_rate_pgd = tf.placeholder(tf.float32, shape=[], name='lrn_rate_pgd_%d' % idx)
        prune_perctl = tf.placeholder(tf.float32, shape=[], name='prune_perctl_%d' % idx)

        # select channels for the current convolutional layer
        optimizer = tf.train.GradientDescentOptimizer(lrn_rate_pgd)
        if FLAGS.enbl_multi_gpu:
            optimizer = mgw.DistributedOptimizer(optimizer)
        grads = optimizer.compute_gradients(self.reg_losses[idx], [var_prnd])
        with tf.control_dependencies(self.update_ops_all):
            # one manual gradient-descent step on the layer's regression loss
            var_prnd_new = var_prnd - lrn_rate_pgd * grads[0][0]
            # per-channel L2 norm: reduce over all axes except axis 2
            var_norm = tf.sqrt(
                tf.reduce_sum(tf.square(var_prnd_new), axis=[0, 1, 3], keepdims=True))
            threshold = tf.contrib.distributions.percentile(
                var_norm, prune_perctl)
            # soft-shrinkage: channels with norm <= threshold are scaled to zero
            # NOTE(review): divides by var_norm — assumes no channel norm is
            # exactly zero at this point; TODO confirm
            shrk_vec = tf.maximum(1.0 - threshold / var_norm, 0.0)
            prune_op = var_prnd.assign(var_prnd_new * shrk_vec)

        # fine-tune with selected channels only
        optimizer_base = tf.train.AdamOptimizer(FLAGS.cpg_lrn_rate_adam)
        if not FLAGS.enbl_multi_gpu:
            optimizer = optimizer_base
        else:
            optimizer = mgw.DistributedOptimizer(optimizer_base)
        grads_origin = optimizer.compute_gradients(self.reg_losses[idx], [var_prnd])
        grads_pruned = self.__calc_grads_pruned(grads_origin)
        with tf.control_dependencies(self.update_ops_all):
            finetune_op = optimizer.apply_gradients(grads_pruned)
        # resets Adam's slot variables before each layer is tuned
        init_opt_op = tf.variables_initializer(optimizer_base.variables())

        # append layer-wise operations & variables
        layer_ops += [{
            'prune': prune_op,
            'finetune': finetune_op,
            'init_opt': init_opt_op
        }]
        lrn_rates_pgd += [lrn_rate_pgd]
        prune_perctls += [prune_perctl]

    return layer_ops, lrn_rates_pgd, prune_perctls
def __init__(self, sm_writer, model_helper): """Constructor function. Args: * sm_writer: TensorFlow's summary writer * model_helper: model helper with definitions of model & dataset """ # initialize attributes self.sm_writer = sm_writer self.data_scope = 'data' self.model_scope = 'model' # initialize Horovod / TF-Plus for multi-gpu training if FLAGS.enbl_multi_gpu: mgw.init() from mpi4py import MPI self.mpi_comm = MPI.COMM_WORLD else: self.mpi_comm = None # obtain the function interface provided by the model helper self.build_dataset_train = model_helper.build_dataset_train self.build_dataset_eval = model_helper.build_dataset_eval self.forward_train = model_helper.forward_train self.forward_eval = model_helper.forward_eval self.calc_loss = model_helper.calc_loss self.model_name = model_helper.model_name self.dataset_name = model_helper.dataset_name # checkpoint path determined by model's & dataset's names self.ckpt_file = 'models_%s_at_%s.tar.gz' % (self.model_name, self.dataset_name)
def is_primary_worker(scope='global'):
    """Check whether this is the primary worker of all nodes (global) or the current node (local).

    Args:
    * scope: check scope ('global' OR 'local')

    Returns:
    * flag: whether this is the primary worker

    Raises:
    * ValueError: if the scope is neither 'global' nor 'local'
    """
    # single-GPU runs have exactly one worker, which is always primary
    if scope == 'global':
        return (not FLAGS.enbl_multi_gpu) or mgw.rank() == 0
    if scope == 'local':
        return (not FLAGS.enbl_multi_gpu) or mgw.local_rank() == 0
    raise ValueError('unrecognized worker scope: ' + scope)
def setup_lrn_rate(self, global_step):
    """Setup the learning rate (and number of training iterations).

    Args:
    * global_step: training iteration counter

    Returns:
    * lrn_rate: learning rate tensor
    * nb_iters: total number of training iterations

    Raises:
    * ValueError: if FLAGS.mobilenet_version is neither 1 nor 2
    """
    # overall batch size across all workers
    batch_size = FLAGS.batch_size * (1 if not FLAGS.enbl_multi_gpu else mgw.size())
    if FLAGS.mobilenet_version == 1:
        # FIX: removed dead 'nb_epochs = 100' (immediately overwritten by
        # 'nb_epochs = 412') and the unused 'nb_epochs' local itself —
        # nb_iters is hard-coded below and never derived from it.
        # piecewise-constant epoch step & decay rate feed an exponential-decay schedule
        idxs_epoch = [12000, 20000]
        step_rate = [200, 200, 4000]
        epoch_step = setup_lrn_rate_piecewise_constant(
            global_step, batch_size, idxs_epoch, step_rate)
        decay_rates = [0.985, 0.980, 0.505]
        decay_rate = setup_lrn_rate_piecewise_constant(
            global_step, batch_size, idxs_epoch, decay_rates)
        lrn_rate = setup_lrn_rate_exponential_decay(
            global_step, batch_size, epoch_step, decay_rate)
        nb_iters = int(30000)
    elif FLAGS.mobilenet_version == 2:
        # exponential decay with a fixed epoch step
        epoch_step = 500
        decay_rate = 0.9  # which is better, 0.98 OR (0.98 ** epoch_step)?
        lrn_rate = setup_lrn_rate_exponential_decay(
            global_step, batch_size, epoch_step, decay_rate)
        nb_iters = int(15000)
    else:
        raise ValueError('invalid MobileNet version: {}'.format(
            FLAGS.mobilenet_version))
    return lrn_rate, nb_iters
def __monitor_progress(self, summary, log_rslt, time_prev, idx_iter):
    """Monitor the training progress.

    Args:
    * summary: summary protocol buffer
    * log_rslt: logging operations' results
    * time_prev: timestamp of the previous summary step
    * idx_iter: index of the training iteration

    Returns:
    * timestamp of the current summary step, or None for non-primary workers
    """
    # early break for non-primary workers
    if not self.is_primary_worker():
        return None

    # write summaries for TensorBoard visualization
    self.sm_writer.add_summary(summary, idx_iter)

    # display monitored statistics
    speed = FLAGS.batch_size * FLAGS.summ_step / (timer() - time_prev)
    if FLAGS.enbl_multi_gpu:
        speed *= mgw.size()  # aggregate throughput across all workers
    if FLAGS.enbl_dst:
        # with distillation enabled, log_rslt additionally carries dst_loss
        lrn_rate, dst_loss, model_loss, loss, acc_top1, acc_top5 = log_rslt[0], \
            log_rslt[1], log_rslt[2], log_rslt[3], log_rslt[4], log_rslt[5]
        tf.logging.info('iter #%d: lr = %e | dst_loss = %.4f | model_loss = %.4f | loss = %.4f | acc_top1 = %.4f | acc_top5 = %.4f | speed = %.2f pics / sec' \
            % (idx_iter + 1, lrn_rate, dst_loss, model_loss, loss, acc_top1, acc_top5, speed))
    else:
        lrn_rate, model_loss, loss, acc_top1, acc_top5 = log_rslt[0], \
            log_rslt[1], log_rslt[2], log_rslt[3], log_rslt[4]
        tf.logging.info('iter #%d: lr = %e | model_loss = %.4f | loss = %.4f | acc_top1 = %.4f | acc_top5 = %.4f | speed = %.2f pics / sec' \
            % (idx_iter + 1, lrn_rate, model_loss, loss, acc_top1, acc_top5, speed))

    return timer()
def __monitor_progress(self, summary, log_rslt, idx_iter, time_step):
    """Monitor the training progress.

    Args:
    * summary: summary protocol buffer
    * log_rslt: logging operations' results
    * idx_iter: index of the training iteration
    * time_step: time step between two summary operations
    """
    # write summaries for TensorBoard visualization
    self.sm_writer.add_summary(summary, idx_iter)

    # compute the training speed (pictures per second)
    nb_pics = FLAGS.batch_size * FLAGS.summ_step
    speed = nb_pics / time_step
    if FLAGS.enbl_multi_gpu:
        speed *= mgw.size()

    # display monitored statistics
    stats = ['%s = %.4e' % pair for pair in zip(self.log_op_names, log_rslt)]
    tf.logging.info('iter #%d: %s | speed = %.2f pics / sec'
                    % (idx_iter + 1, ' | '.join(stats), speed))
def __build_eval(self, model_helper):
    """Build the evaluation graph for the 'optimal' protocol.

    Args:
    * model_helper: model helper with definitions of model & dataset
    """
    with tf.Graph().as_default():
        # create a TF session for the current graph
        config = tf.ConfigProto()
        # pin each worker to its own GPU under multi-GPU training
        config.gpu_options.visible_device_list = str(
            mgw.local_rank() if FLAGS.enbl_multi_gpu else 0)  # pylint: disable=no-member
        self.sess_eval = tf.Session(config=config)

        # data input pipeline
        # NOTE(review): the first returned iterator is discarded — presumably
        # (train, val) with evaluation done on the validation split; confirm
        # against build_dataset_train's return order
        with tf.variable_scope(self.data_scope):
            __, iterator = model_helper.build_dataset_train(
                enbl_trn_val_split=True)
            images, labels = iterator.get_next()

        # model definition - weight sparsified network
        with tf.variable_scope(self.model_scope_prnd):
            logits = model_helper.forward_eval(images)
            vars_prnd = get_vars_by_scope(self.model_scope_prnd)
            self.loss_eval, self.metrics_eval = \
                model_helper.calc_loss(labels, logits, vars_prnd['trainable'])
            self.saver_prnd_eval = tf.train.Saver(vars_prnd['all'])
def __build_layer_ops(self):
    """Build layer-wise fine-tuning operations.

    Returns:
    * layer_train_ops: list of training operations for each layer
    * layer_init_opt_ops: list of initialization operations for each layer's optimizer
    * layer_grad_norms: list of gradient norm vectors for each layer
    """
    layer_train_ops = []
    layer_init_opt_ops = []
    grad_norms = []
    for idx, var_prnd in enumerate(self.vars_prnd['maskable']):
        # one dedicated Adam optimizer per layer so its slot variables can be
        # re-initialized independently via layer_init_opt_ops
        optimizer_base = tf.train.AdamOptimizer(FLAGS.dcp_lrn_rate_adam)
        if not FLAGS.enbl_multi_gpu:
            optimizer = optimizer_base
        else:
            optimizer = mgw.DistributedOptimizer(optimizer_base)
        # layer's regression loss + discrimination loss of the block it belongs to
        loss_all = self.reg_losses[idx] + self.dis_losses[self.idxs_layer_to_block[idx]]
        grads_origin = optimizer.compute_gradients(loss_all, [var_prnd])
        grads_pruned = self.__calc_grads_pruned(grads_origin)
        with tf.control_dependencies(self.update_ops_all):
            layer_train_ops += [optimizer.apply_gradients(grads_pruned)]
        layer_init_opt_ops += [tf.variables_initializer(optimizer_base.variables())]
        # squared-gradient sum per channel: reduce over all axes except axis 2
        grad_norms += [tf.reduce_sum(grads_origin[0][0] ** 2, axis=[0, 1, 3])]
    return layer_train_ops, layer_init_opt_ops, grad_norms
def __build_block_ops(self):
    """Build block-wise fine-tuning operations.

    Returns:
    * block_train_ops: list of training operations for each block
    * block_init_opt_ops: list of initialization operations for each block's optimizer
    """
    block_train_ops = []
    block_init_opt_ops = []
    for dis_loss in self.dis_losses:
        # one dedicated Adam optimizer per block so its slot variables can be
        # re-initialized independently via block_init_opt_ops
        optimizer_base = tf.train.AdamOptimizer(FLAGS.dcp_lrn_rate_adam)
        if not FLAGS.enbl_multi_gpu:
            optimizer = optimizer_base
        else:
            optimizer = mgw.DistributedOptimizer(optimizer_base)
        loss_all = dis_loss + self.dis_losses[-1]  # current stage + final loss
        grads_origin = optimizer.compute_gradients(loss_all, self.trainable_vars_all)
        grads_pruned = self.__calc_grads_pruned(grads_origin)
        with tf.control_dependencies(self.update_ops_all):
            block_train_ops += [optimizer.apply_gradients(grads_pruned)]
        block_init_opt_ops += [
            tf.variables_initializer(optimizer_base.variables())
        ]
    return block_train_ops, block_init_opt_ops
def setup_bnds_decay_rates(model_name, dataset_name):
    """Set up learning-rate boundaries & decay rates for quantized fine-tuning.

    NOTE: The bnd_decay_rates here is mgw_size invariant.

    Args:
    * model_name: model's name (e.g. 'resnet_20', 'lenet', 'mobilenet_v1')
    * dataset_name: dataset's name ('cifar_10' OR 'ilsvrc_12')

    Returns:
    * init_lr: initial learning rate
    * bnds: global-step boundaries for piecewise-constant decay
    * decay_rates: decay rate for each boundary segment
    * finetune_steps: number of fine-tuning steps

    Raises:
    * ValueError: if the model / dataset combination is not supported
    """
    # overall batch size across all workers
    batch_size = FLAGS.batch_size if not FLAGS.enbl_multi_gpu else FLAGS.batch_size * mgw.size()
    nb_batches_per_epoch = int(FLAGS.nb_smpls_train / batch_size)
    mgw_size = int(mgw.size()) if FLAGS.enbl_multi_gpu else 1
    # linearly scale the initial learning rate with the overall batch size
    init_lr = FLAGS.lrn_rate_init * FLAGS.batch_size * mgw_size / FLAGS.batch_size_norm if FLAGS.enbl_multi_gpu else FLAGS.lrn_rate_init
    if dataset_name == 'cifar_10':
        if model_name.startswith('resnet'):
            bnds = [nb_batches_per_epoch * 15, nb_batches_per_epoch * 40]
            decay_rates = [1e-3, 1e-4, 1e-5]
        elif model_name.startswith('lenet'):
            bnds = [nb_batches_per_epoch * 5, nb_batches_per_epoch * 30]
            decay_rates = [1e-4, 1e-5, 1e-6]
        else:
            # FIX: previously fell through with 'bnds' / 'decay_rates' unbound,
            # crashing later with UnboundLocalError instead of a clear message
            raise ValueError('unsupported model for cifar_10: ' + model_name)
    elif dataset_name == 'ilsvrc_12':
        if model_name.startswith('resnet'):
            bnds = [nb_batches_per_epoch * 5, nb_batches_per_epoch * 20]
            decay_rates = [1e-4, 1e-5, 1e-6]
        elif model_name.startswith('mobilenet'):
            bnds = [nb_batches_per_epoch * 5, nb_batches_per_epoch * 30]
            decay_rates = [1e-4, 1e-5, 1e-6]
        else:
            raise ValueError('unsupported model for ilsvrc_12: ' + model_name)
    else:
        raise ValueError('unrecognized dataset name: ' + dataset_name)
    finetune_steps = nb_batches_per_epoch * FLAGS.uql_quant_epochs
    # w/o warm start, fall back to the unscaled initial learning rate
    init_lr = init_lr if FLAGS.enbl_warm_start else FLAGS.lrn_rate_init
    return init_lr, bnds, decay_rates, finetune_steps
def __build_eval(self):
    """Build the evaluation graph."""
    with tf.Graph().as_default() as graph:
        # create a TF session for the current graph
        config = tf.ConfigProto()
        config.gpu_options.visible_device_list = str(
            mgw.local_rank() if FLAGS.enbl_multi_gpu else 0)  # pylint: disable=no-member
        config.gpu_options.allow_growth = True  # pylint: disable=no-member
        self.sess_eval = tf.Session(config=config)

        # data input pipeline
        with tf.variable_scope(self.data_scope):
            iterator = self.build_dataset_eval()
            images, labels = iterator.get_next()

        # model definition - uniform quantized model - part 1
        with tf.variable_scope(self.model_scope_quan):
            logits = self.forward_eval(images)
            # dict-valued logits: pick the classification head for softmax
            # NOTE(review): 'outputs' is not referenced below — presumably kept
            # so the softmax node exists in the rewritten graph; confirm
            if not isinstance(logits, dict):
                outputs = tf.nn.softmax(logits)
            else:
                outputs = tf.nn.softmax(logits['cls_pred'])
            # rewrite the graph with fake-quantization nodes for evaluation
            tf.contrib.quantize.experimental_create_eval_graph(
                weight_bits=FLAGS.uqtf_weight_bits,
                activation_bits=FLAGS.uqtf_activation_bits,
                scope=self.model_scope_quan)
            # insert quantization ops for nodes the rewriter does not cover
            for node_name in self.unquant_node_names:
                insert_quant_op(graph, node_name, is_train=False)
            vars_quan = get_vars_by_scope(self.model_scope_quan)

        # model definition - distilled model
        if FLAGS.enbl_dst:
            logits_dst = self.helper_dst.calc_logits(
                self.sess_eval, images)

        # model definition - uniform quantized model -part 2
        with tf.variable_scope(self.model_scope_quan):
            # loss & extra evaluation metrics
            loss, metrics = self.calc_loss(labels, logits, vars_quan['trainable'])
            if FLAGS.enbl_dst:
                loss += self.helper_dst.calc_loss(logits, logits_dst)

            # TF operations for evaluation
            # (vars_quan is re-fetched here in case the scope gained variables)
            vars_quan = get_vars_by_scope(self.model_scope_quan)
            self.eval_op = [loss] + list(metrics.values())
            self.eval_op_names = ['loss'] + list(metrics.keys())
            self.outputs_eval = logits
            self.saver_quan_eval = tf.train.Saver(vars_quan['all'])

        # add input & output tensors to certain collections
        if not isinstance(images, dict):
            tf.add_to_collection('images_final', images)
        else:
            tf.add_to_collection('images_final', images['image'])
        if not isinstance(logits, dict):
            tf.add_to_collection('logits_final', logits)
        else:
            tf.add_to_collection('logits_final', logits['cls_pred'])
def __monitor_progress(self, idx_iter, log_rslt, time_prev):
    """Monitor the training progress.

    Args:
    * idx_iter: index of the training iteration
    * log_rslt: logging operations' results
    * time_prev: timestamp of the previous display step

    Returns:
    * timestamp of the current display step, or None for non-primary workers
    """
    # early break for non-primary workers
    if not self.__is_primary_worker():
        return None

    # display monitored statistics
    speed = FLAGS.batch_size * self.tune_global_disp_steps / (timer() - time_prev)
    if FLAGS.enbl_multi_gpu:
        speed *= mgw.size()  # aggregate throughput across all workers
    if self.dataset_name == 'coco2017-pose':
        if FLAGS.enbl_dst:
            lrn_rate, dst_loss, model_loss, loss, total_loss, total_loss_ll_paf, total_loss_ll_heat, total_loss_ll = log_rslt[:8]
            tf.logging.info(
                'iter #%d: lr = %e | dst_loss = %.4f | model_loss = %.4f | loss = %.4f | ll_paf = %.4f | ll_heat = %.4f | ll = %.4f | speed = %.2f pics / sec'
                % (idx_iter + 1, lrn_rate, dst_loss, model_loss, loss,
                   total_loss_ll_paf, total_loss_ll_heat, total_loss_ll, speed))
        else:
            lrn_rate, model_loss, loss, total_loss, total_loss_ll_paf, total_loss_ll_heat, total_loss_ll = log_rslt[:7]
            tf.logging.info(
                'iter #%d: lr = %e | model_loss = %.4f | loss = %.4f | ll_paf = %.4f | ll_heat = %.4f | ll = %.4f | speed = %.2f pics / sec'
                % (idx_iter + 1, lrn_rate, model_loss, loss,
                   total_loss_ll_paf, total_loss_ll_heat, total_loss_ll, speed))
    else:
        # BUG FIX: this classification branch previously also ran for
        # 'coco2017-pose' (the 'else' was missing), mis-unpacking log_rslt and
        # logging bogus accuracy values right after the pose logs.
        if FLAGS.enbl_dst:
            lrn_rate, dst_loss, model_loss, loss, acc_top1, acc_top5 = log_rslt[0], log_rslt[1], log_rslt[2], log_rslt[3], log_rslt[4], log_rslt[5]
            tf.logging.info(
                'iter #%d: lr = %e | dst_loss = %e | model_loss = %e | loss = %e | acc_top1 = %e | acc_top5 = %e | speed = %.2f pics / sec '
                % (idx_iter + 1, lrn_rate, dst_loss, model_loss, loss, acc_top1, acc_top5, speed))
        else:
            lrn_rate, model_loss, loss, acc_top1, acc_top5 = log_rslt[0], log_rslt[1], log_rslt[2], log_rslt[3], log_rslt[4]
            tf.logging.info(
                'iter #%d: lr = %e | model_loss = %e | loss = %e | acc_top1 = %e | acc_top5 = %e| speed = %.2f pics / sec'
                % (idx_iter + 1, lrn_rate, model_loss, loss, acc_top1, acc_top5, speed))
    return timer()
def setup_lrn_rate(self, global_step):
    """Setup the learning rate (and number of training iterations).

    Args:
    * global_step: training iteration counter

    Returns:
    * lrn_rate: learning rate tensor
    * nb_iters: total number of training iterations

    Raises:
    * ValueError: if FLAGS.mobilenet_version is neither 1 nor 2
    """
    # overall batch size across all workers
    batch_size = FLAGS.batch_size * (1 if not FLAGS.enbl_multi_gpu else mgw.size())
    if FLAGS.mobilenet_version == 1:
        # MobileNet-v1: piecewise-constant decay at fixed epoch boundaries
        nb_epochs = 100
        idxs_epoch = [30, 60, 80, 90]
        decay_rates = [1.0, 0.1, 0.01, 0.001, 0.0001]
        lrn_rate = setup_lrn_rate_piecewise_constant(
            global_step, batch_size, idxs_epoch, decay_rates)
        nb_iters = int(FLAGS.nb_smpls_train * nb_epochs * FLAGS.nb_epochs_rat / batch_size)
    elif FLAGS.mobilenet_version == 2:
        # MobileNet-v2: exponential decay every 2.5 epochs
        nb_epochs = 412
        epoch_step = 2.5
        decay_rate = 0.98**epoch_step  # which is better, 0.98 OR (0.98 ** epoch_step)?
        lrn_rate = setup_lrn_rate_exponential_decay(
            global_step, batch_size, epoch_step, decay_rate)
        nb_iters = int(FLAGS.nb_smpls_train * nb_epochs * FLAGS.nb_epochs_rat / batch_size)
    else:
        raise ValueError('invalid MobileNet version: {}'.format(
            FLAGS.mobilenet_version))
    return lrn_rate, nb_iters
def __build_eval(self):
    """Build the evaluation graph."""
    with tf.Graph().as_default():
        # TensorFlow session
        # create a TF session for the current graph
        config = tf.ConfigProto()
        config.gpu_options.visible_device_list = str(
            mgw.local_rank() if FLAGS.enbl_multi_gpu else 0)
        self.sess_eval = tf.Session(config=config)

        # data input pipeline
        with tf.variable_scope(self.data_scope):
            iterator = self.build_dataset_eval()
            images, labels = iterator.get_next()
            # pin a static batch dimension so downstream shape inference works
            # images.set_shape((FLAGS.batch_size, images.shape[1], images.shape[2], images.shape[3]))
            images.set_shape((FLAGS.batch_size_eval, images.shape[1],
                              images.shape[2], images.shape[3]))
            self.images_eval = images

        # model definition - distilled model
        if FLAGS.enbl_dst:
            logits_dst = self.helper_dst.calc_logits(self.sess_eval, images)

        # model definition
        with tf.variable_scope(self.model_scope, reuse=tf.AUTO_REUSE):
            # forward pass
            logits = self.forward_eval(images)
            self.__quantize_eval_graph()

            # loss & accuracy — metric keys differ per dataset
            loss, metrics = self.calc_loss(labels, logits, self.trainable_vars)
            if self.dataset_name == 'cifar_10':
                acc_top1, acc_top5 = metrics['accuracy'], tf.constant(0.)  # no top-5 for CIFAR-10
            elif self.dataset_name == 'ilsvrc_12':
                acc_top1, acc_top5 = metrics['acc_top1'], metrics['acc_top5']
            elif self.dataset_name == 'coco2017-pose':
                total_loss = metrics['total_loss_all_layers']
                total_loss_ll_paf = metrics['total_loss_last_layer_paf']
                total_loss_ll_heat = metrics['total_loss_last_layer_heat']
                total_loss_ll = metrics['total_loss_last_layer']
            else:
                raise ValueError("Unrecognized dataset name")
            if FLAGS.enbl_dst:
                dst_loss = self.helper_dst.calc_loss(logits, logits_dst)
                loss += dst_loss

            # TF operations & model saver
            if self.dataset_name == 'coco2017-pose':
                self.ops['eval'] = [
                    loss, total_loss, total_loss_ll_paf, total_loss_ll_heat,
                    total_loss_ll
                ]
            else:
                self.ops['eval'] = [loss, acc_top1, acc_top5]
            self.saver_eval = tf.train.Saver(self.vars)
def build(self, enbl_trn_val_split=False): '''Build iterator(s) for tf.data.Dataset() object. Args: * enbl_trn_val_split: whether to split into training & validation subsets Returns: * iterator_trn: iterator for the training subset * iterator_val: iterator for the validation subset OR * iterator: iterator for the chosen subset (training OR testing) Example: # build iterator(s) dataset = xxxxDataset(is_train=True) # TF operations are not created iterator = dataset.build() # TF operations are created OR iterator_trn, iterator_val = dataset.build(enbl_trn_val_split=True) # for dataset-train only # use the iterator to obtain a mini-batch of images & labels images, labels = iterator.get_next() ''' # obtain list of data files' names filenames = tf.data.Dataset.list_files(self.file_pattern, shuffle=True) if self.enbl_shard: filenames = filenames.shard(mgw.size(), mgw.rank()) # create a tf.data.Dataset from list of files dataset = filenames.apply( tf.contrib.data.parallel_interleave( self.dataset_fn, cycle_length=FLAGS.cycle_length)) dataset = dataset.map(self.parse_fn, num_parallel_calls=FLAGS.nb_threads) # create iterators for training & validation subsets separately if self.is_train and enbl_trn_val_split: iterator_val = self.__make_iterator( dataset.take(FLAGS.nb_smpls_val)) iterator_trn = self.__make_iterator( dataset.skip(FLAGS.nb_smpls_val)) return iterator_trn, iterator_val return self.__make_iterator(dataset)
def setup_lrn_rate(self, global_step):
    """Setup the learning rate (and number of training iterations)."""
    # overall batch size across all workers
    if FLAGS.enbl_multi_gpu:
        batch_size = FLAGS.batch_size * mgw.size()
    else:
        batch_size = FLAGS.batch_size

    # piecewise-constant decay at fixed epoch boundaries
    nb_epochs = 100
    idxs_epoch = [30, 60, 80, 90]
    decay_rates = [1.0, 0.1, 0.01, 0.001, 0.0001]
    lrn_rate = setup_lrn_rate_piecewise_constant(
        global_step, batch_size, idxs_epoch, decay_rates)

    # total number of iterations, scaled by the epoch ratio
    nb_iters = int(FLAGS.nb_smpls_train * nb_epochs * FLAGS.nb_epochs_rat / batch_size)
    return lrn_rate, nb_iters
def main(unused_argv):
    """Main entry.

    Args:
    * unused_argv: unused arguments (after FLAGS is parsed)
    """
    tf.logging.set_verbosity(tf.logging.INFO)
    if FLAGS.enbl_multi_gpu:
        mgw.init()

    # build both the training & evaluation graphs up front
    trainer = Trainer(data_path=FLAGS.data_path, netcfg=FLAGS.net_cfg)
    for is_train in (True, False):
        trainer.build_graph(is_train=is_train)

    # run evaluation only, or full training
    if FLAGS.eval_only:
        trainer.eval()
    else:
        trainer.train()
def setup_lrn_rate(self, global_step):
    """Setup the learning rate (and number of training iterations)."""
    # overall batch size across all workers
    multiplier = mgw.size() if FLAGS.enbl_multi_gpu else 1
    batch_size = FLAGS.batch_size * multiplier

    # piecewise-constant decay at (fractional) epoch boundaries
    nb_epochs = 100
    idxs_epoch = [0.4, 0.8]
    decay_rates = [0.001, 0.0005, 0.0001]
    lrn_rate = setup_lrn_rate_piecewise_constant(
        global_step, batch_size, idxs_epoch, decay_rates)

    # fixed iteration budget
    return lrn_rate, int(12000)
def __build_eval(self):
    """Build the evaluation graph."""
    with tf.Graph().as_default():
        # create a TF session for the current graph
        config = tf.ConfigProto()
        config.gpu_options.visible_device_list = str(
            mgw.local_rank() if FLAGS.enbl_multi_gpu else 0)  # pylint: disable=no-member
        self.sess_eval = tf.Session(config=config)

        # data input pipeline
        with tf.variable_scope(self.data_scope):
            iterator = self.build_dataset_eval()
            images, labels = iterator.get_next()

        # model definition - distilled model
        if FLAGS.enbl_dst:
            logits_dst = self.helper_dst.calc_logits(
                self.sess_eval, images)

        # model definition - channel-pruned model
        with tf.variable_scope(self.model_scope_prnd):
            # loss & extra evaluation metrics
            logits = self.forward_eval(images)
            vars_prnd = get_vars_by_scope(self.model_scope_prnd)
            loss, metrics = self.calc_loss(labels, logits, vars_prnd['trainable'])
            if FLAGS.enbl_dst:
                loss += self.helper_dst.calc_loss(logits, logits_dst)

            # overall pruning ratios of trainable & maskable variables
            pr_trainable = calc_prune_ratio(vars_prnd['trainable'])
            pr_maskable = calc_prune_ratio(vars_prnd['maskable'])

            # TF operations for evaluation / export
            # NOTE(review): the uint8 casts below assume images / logits /
            # labels are exportable as 8-bit tensors — confirm against the
            # consumer of factory_op / out_op
            self.factory_op = [tf.cast(logits, tf.uint8)]
            self.time_op = [logits]
            self.out_op = [
                tf.cast(images, tf.uint8),
                tf.cast(logits, tf.uint8),
                tf.cast(labels, tf.uint8)
            ]
            self.eval_op = [loss, pr_trainable, pr_maskable] + list(
                metrics.values())
            self.eval_op_names = ['loss', 'pr_trn', 'pr_msk'] + list(
                metrics.keys())
            self.saver_prnd_eval = tf.train.Saver(vars_prnd['all'])

        # add input & output tensors to certain collections
        tf.add_to_collection('images_final', images)
        tf.add_to_collection('logits_final', logits)
def create_session():
    """Create a TensorFlow session.

    Return:
    * sess: TensorFlow session
    """
    # pin each worker to its own GPU and grow GPU memory on demand
    config = tf.ConfigProto()
    device_id = mgw.local_rank() if FLAGS.enbl_multi_gpu else 0
    config.gpu_options.visible_device_list = str(device_id)  # pylint: disable=no-member
    config.gpu_options.allow_growth = True  # pylint: disable=no-member
    return tf.Session(config=config)
def setup_lrn_rate(global_step, model_name, dataset_name): """Setup the learning rate for the given dataset. Args: * global_step: training iteration counter * model_name: model's name; must be one of ['lenet', 'resnet_*', 'mobilenet_v1', 'mobilenet_v2'] * dataset_name: dataset's name; must be one of ['cifar_10', 'ilsvrc_12'] Returns: * lrn_rate: learning rate * nb_batches: number of training mini-batches """ # obtain the overall batch size across all GPUs if not FLAGS.enbl_multi_gpu: batch_size = FLAGS.batch_size else: batch_size = FLAGS.batch_size * mgw.size() # choose a learning rate protocol according to the model & dataset combination global_step = tf.cast(global_step, tf.int32) if dataset_name == 'cifar_10': if model_name == 'lenet': lrn_rate, nb_batches = setup_lrn_rate_lenet_cifar10( global_step, batch_size) elif model_name.startswith('resnet'): lrn_rate, nb_batches = setup_lrn_rate_resnet_cifar10( global_step, batch_size) else: raise NotImplementedError('model: {} / dataset: {}'.format( model_name, dataset_name)) elif dataset_name == 'ilsvrc_12': if model_name.startswith('resnet'): lrn_rate, nb_batches = setup_lrn_rate_resnet_ilsvrc12( global_step, batch_size) elif model_name.startswith('mobilenet_v1'): lrn_rate, nb_batches = setup_lrn_rate_mobilenet_v1_ilsvrc12( global_step, batch_size) elif model_name.startswith('mobilenet_v2'): lrn_rate, nb_batches = setup_lrn_rate_mobilenet_v2_ilsvrc12( global_step, batch_size) else: raise NotImplementedError('model: {} / dataset: {}'.format( model_name, dataset_name)) else: raise NotImplementedError('dataset: ' + dataset_name) return lrn_rate, nb_batches
def __monitor_progress(self, summary, log_rslt, time_prev, idx_iter):
    """Monitor the training progress.

    Args:
    * summary: summary protocol buffer
    * log_rslt: logging operations' results
    * time_prev: timestamp of the previous summary step
    * idx_iter: index of the training iteration

    Returns:
    * timestamp of the current summary step, or None for non-primary workers
    """
    # early break for non-primary workers
    if not self.is_primary_worker():
        return None

    # write summaries for TensorBoard visualization
    self.sm_writer.add_summary(summary, idx_iter)

    # display monitored statistics
    speed = FLAGS.batch_size * FLAGS.summ_step / (timer() - time_prev)
    if FLAGS.enbl_multi_gpu:
        speed *= mgw.size()  # aggregate throughput across all workers

    # NOTE: for cifar-10, acc_top5 is 0.
    if self.dataset_name == 'coco2017-pose':
        if FLAGS.enbl_dst:
            lrn_rate, dst_loss, model_loss, loss, total_loss, total_loss_ll_paf, total_loss_ll_heat, total_loss_ll = log_rslt[:8]
            tf.logging.info(
                'iter #%d: lr = %e | dst_loss = %.4f | model_loss = %.4f | loss = %.4f | ll_paf = %.4f | ll_heat = %.4f | ll = %.4f | speed = %.2f pics / sec'
                % (idx_iter + 1, lrn_rate, dst_loss, model_loss, loss,
                   total_loss_ll_paf, total_loss_ll_heat, total_loss_ll, speed))
        else:
            lrn_rate, model_loss, loss, total_loss, total_loss_ll_paf, total_loss_ll_heat, total_loss_ll = log_rslt[:7]
            tf.logging.info(
                'iter #%d: lr = %e | model_loss = %.4f | loss = %.4f | ll_paf = %.4f | ll_heat = %.4f | ll = %.4f | speed = %.2f pics / sec'
                % (idx_iter + 1, lrn_rate, model_loss, loss,
                   total_loss_ll_paf, total_loss_ll_heat, total_loss_ll, speed))
    else:
        if FLAGS.enbl_dst:
            lrn_rate, dst_loss, model_loss, loss, acc_top1, acc_top5 = log_rslt[:6]
            tf.logging.info(
                'iter #%d: lr = %e | dst_loss = %.4f | model_loss = %.4f | loss = %.4f | acc_top1 = %.4f | acc_top5 = %.4f | speed = %.2f pics / sec'
                % (idx_iter + 1, lrn_rate, dst_loss, model_loss, loss,
                   acc_top1, acc_top5, speed))
        else:
            lrn_rate, model_loss, loss, acc_top1, acc_top5 = log_rslt[:5]
            tf.logging.info(
                'iter #%d: lr = %e | model_loss = %.4f | loss = %.4f | acc_top1 = %.4f | acc_top5 = %.4f | speed = %.2f pics / sec'
                % (idx_iter + 1, lrn_rate, model_loss, loss, acc_top1,
                   acc_top5, speed))

    return timer()
def __monitor_progress(self, summary, log_rslt):
    """Monitor the training progress (primary worker only)."""
    # early break for non-primary workers
    if not self.__is_primary_worker():
        return

    # write summaries for TensorBoard visualization
    self.sm_writer.add_summary(summary, self.idx_iter)

    # compute the training speed (pictures per second, all workers combined)
    speed = FLAGS.batch_size * FLAGS.summ_step / (timer() - self.time_prev)
    if FLAGS.enbl_multi_gpu:
        speed *= mgw.size()

    # display monitored statistics
    lrn_rate, loss, accuracy = log_rslt[0], log_rslt[1], log_rslt[2]
    tf.logging.info('iter #%d: lr = %e | loss = %e | speed = %.2f pics / sec'
                    % (self.idx_iter + 1, lrn_rate, loss, speed))
    for idx, key in enumerate(self.accuracy_keys):
        tf.logging.info('{} = {}'.format(key, accuracy[idx]))

    self.time_prev = timer()
def __build_network_ops(self, loss, lrn_rate):
    """Build network training operations.

    Args:
    * loss: loss function's value
    * lrn_rate: learning rate tensor

    Returns:
    * train_op: training operation of the whole network
    * init_opt_op: initialization operation of the whole network's optimizer
    """
    optimizer_base = tf.train.MomentumOptimizer(lrn_rate, FLAGS.momentum)
    if not FLAGS.enbl_multi_gpu:
        optimizer = optimizer_base
    else:
        # wrap with the distributed optimizer to aggregate gradients across workers
        optimizer = mgw.DistributedOptimizer(optimizer_base)
    grads_origin = optimizer.compute_gradients(loss, self.trainable_vars_all)
    # presumably masks out gradients of pruned weights — see __calc_grads_pruned
    grads_pruned = self.__calc_grads_pruned(grads_origin)
    # run update_ops_all before each training step (presumably BN statistics
    # updates — confirm)
    with tf.control_dependencies(self.update_ops_all):
        train_op = optimizer.apply_gradients(grads_pruned, global_step=self.global_step)
    # initializer for the base optimizer's slot variables only
    init_opt_op = tf.variables_initializer(optimizer_base.variables())
    return train_op, init_opt_op
def __build_eval(self):
    """Build the evaluation graph."""
    with tf.Graph().as_default():
        # create a TF session for the current graph
        config = tf.ConfigProto()
        if FLAGS.enbl_multi_gpu:
            # pin each worker to its own GPU
            config.gpu_options.visible_device_list = str(mgw.local_rank())  # pylint: disable=no-member
        else:
            config.gpu_options.visible_device_list = '0'  # pylint: disable=no-member
        self.sess_eval = tf.Session(config=config)

        # data input pipeline
        with tf.variable_scope(self.data_scope):
            iterator = self.build_dataset_eval()
            images, labels = iterator.get_next()

        # model definition - distilled model
        if FLAGS.enbl_dst:
            logits_dst = self.helper_dst.calc_logits(
                self.sess_eval, images)

        # model definition - weight-sparsified model
        with tf.variable_scope(self.model_scope):
            # loss & extra evaluation metrics
            logits = self.forward_eval(images)
            loss, metrics = self.calc_loss(labels, logits, self.trainable_vars)
            if FLAGS.enbl_dst:
                loss += self.helper_dst.calc_loss(logits, logits_dst)

            # overall pruning ratios of trainable & maskable variables
            pr_trainable = calc_prune_ratio(self.trainable_vars)
            pr_maskable = calc_prune_ratio(self.maskable_vars)

            # TF operations for evaluation
            self.eval_op = [loss, pr_trainable, pr_maskable] + list(
                metrics.values())
            self.eval_op_names = ['loss', 'pr_trn', 'pr_msk'] + list(
                metrics.keys())
            self.saver_eval = tf.train.Saver(self.vars)
def __build_pruned_evaluate_model(self, path=None): ''' build a evaluation model from pruned model ''' # early break for non-primary workers if not self.__is_primary_worker(): return if path is None: path = FLAGS.save_path if not tf.train.checkpoint_exists(path): return with tf.Graph().as_default(): config = tf.ConfigProto() config.gpu_options.visible_device_list = str( # pylint: disable=no-member mgw.local_rank() if FLAGS.enbl_multi_gpu else 0) self.sess_eval = tf.Session(config=config) self.saver_eval = tf.train.import_meta_graph(path + '.meta') self.saver_eval.restore(self.sess_eval, path) eval_logits = tf.get_collection('logits')[0] tf.add_to_collection('logits_final', eval_logits) eval_images = tf.get_collection('eval_images')[0] tf.add_to_collection('images_final', eval_images) eval_labels = tf.get_collection('eval_labels')[0] mem_images = tf.get_collection('mem_images')[0] mem_labels = tf.get_collection('mem_labels')[0] self.sess_eval.close() graph_editor.reroute_ts(eval_images, mem_images) graph_editor.reroute_ts(eval_labels, mem_labels) self.sess_eval = tf.Session(config=config) self.saver_eval.restore(self.sess_eval, path) trainable_vars = self.trainable_vars loss, accuracy = self.calc_loss(eval_labels, eval_logits, trainable_vars) self.eval_op = [loss] + list(accuracy.values()) self.sm_writer.add_graph(self.sess_eval.graph)
def __build_network_ft_ops(self, loss):
    """Build operations for network fine-tuning.

    Args:
    * loss: loss function's value

    Returns:
    * init_op: initialization operation
    * train_op: training operation
    """
    optimizer_base = tf.train.AdamOptimizer(FLAGS.ws_lrn_rate_ft)
    if FLAGS.enbl_multi_gpu:
        optimizer = mgw.DistributedOptimizer(optimizer_base)
    else:
        optimizer = optimizer_base
    grads_origin = optimizer.compute_gradients(loss, self.vars_prnd['trainable'])
    # presumably masks out gradients of pruned weights — see __calc_grads_pruned
    grads_pruned = self.__calc_grads_pruned(grads_origin)
    # NOTE(review): unlike sibling builders, apply_gradients is not wrapped in
    # control_dependencies(update_ops) here — confirm whether update ops are
    # handled elsewhere
    train_op = optimizer.apply_gradients(grads_pruned)
    # initializer for Adam's slot variables only
    init_op = tf.variables_initializer(optimizer_base.variables())
    return init_op, train_op
def __build_layer_rg_ops(self): """Build operations for layerwise regression. Returns: * init_op: initialization operation * train_ops: list of training operations, one per layer """ # obtain lists of core operations in both networks if self.model_name.startswith('mobilenet'): patterns = ['pointwise/Conv2D', 'Conv2d_1c_1x1/Conv2D'] else: patterns = ['Conv2D', 'MatMul'] core_ops_full = get_ops_by_scope_n_patterns(self.model_scope_full, patterns) core_ops_prnd = get_ops_by_scope_n_patterns(self.model_scope_prnd, patterns) # construct initialization & training operations init_ops, train_ops = [], [] for idx, (core_op_full, core_op_prnd) in enumerate(zip(core_ops_full, core_ops_prnd)): loss = tf.nn.l2_loss(core_op_prnd.outputs[0] - core_op_full.outputs[0]) optimizer_base = tf.train.AdamOptimizer(FLAGS.ws_lrn_rate_rg) if FLAGS.enbl_multi_gpu: optimizer = mgw.DistributedOptimizer(optimizer_base) else: optimizer = optimizer_base grads_origin = optimizer.compute_gradients( loss, [self.vars_prnd['maskable'][idx]]) grads_pruned = self.__calc_grads_pruned(grads_origin) train_ops += [optimizer.apply_gradients(grads_pruned)] init_ops += [tf.variables_initializer(optimizer_base.variables())] return tf.group(init_ops), train_ops
def __build_minimal(self, model_helper):
    """Build the minimal graph for 'uniform' & 'heurist' protocols.

    Args:
    * model_helper: model helper with definitions of model & dataset
    """
    with tf.Graph().as_default():
        # create a TF session for the current graph
        config = tf.ConfigProto()
        config.gpu_options.visible_device_list = str(
            mgw.local_rank() if FLAGS.enbl_multi_gpu else 0)  # pylint: disable=no-member
        self.sess = tf.Session(config=config)

        # data input pipeline (labels are not needed here)
        with tf.variable_scope(self.data_scope):
            iterator = model_helper.build_dataset_train()
            images, __ = iterator.get_next()

        # model definition - full-precision network; only the variables are
        # kept, the forward pass output is discarded
        with tf.variable_scope(self.model_scope_full):
            __ = model_helper.forward_eval(
                images)  # DO NOT USE forward_train() HERE!!!
            self.vars_full = get_vars_by_scope(self.model_scope_full)
def __retrain_network(self):
    """Retrain the network with layerwise regression & network fine-tuning."""
    # split the iteration budgets evenly across all workers
    nb_workers = mgw.size() if FLAGS.enbl_multi_gpu else 1
    nb_iters_rg = int(math.ceil(FLAGS.ws_nb_iters_rg / nb_workers))
    nb_iters_ft = int(math.ceil(FLAGS.ws_nb_iters_ft / nb_workers))

    # re-train the network with layerwise regression
    time_start = timer()
    for rg_train_op in self.rg_train_ops:
        for __ in range(nb_iters_rg):
            self.sess_train.run(rg_train_op)
    time_rg = timer() - time_start

    # re-train the network with global fine-tuning
    time_start = timer()
    for __ in range(nb_iters_ft):
        self.sess_train.run(self.ft_train_op)
    time_ft = timer() - time_start

    # display the time consumption
    tf.logging.info('time consumption: %.4f (s) - RG | %.4f (s) - FT'
                    % (time_rg, time_ft))