def main(_):
    """Build the ResNet hyper-parameters and run training or testing.

    ``FLAGS.mode`` selects the run: ``'train'`` uses a batch size of 25 and
    calls ``train``; ``'eval'`` uses a batch size of 1 and calls ``runTest``.
    The selected routine runs inside a ``tf.device('gpu:0')`` scope.

    Args:
        _: Unused.

    Raises:
        ValueError: If ``FLAGS.mode`` is neither ``'train'`` nor ``'eval'``.
    """
    if FLAGS.mode == 'train':
        batch_size = 25
    elif FLAGS.mode == 'eval':
        batch_size = 1
    else:
        # Without this branch an unknown mode left ``batch_size`` unbound and
        # the HParams call below failed with an unrelated NameError.
        raise ValueError('Only support two modes: train or eval')

    hps = resnet_model.HParams(batch_size=batch_size,
                               num_classes=3006,
                               min_lrn_rate=0.0001,
                               lrn_rate=0.01,
                               num_residual_units=5,
                               use_bottleneck=False,
                               weight_decay_rate=0.0002,
                               relu_leakiness=0.1,
                               optimizer='adam')

    with tf.device('gpu:0'):
        if FLAGS.mode == 'train':
            train(hps)
        else:
            runTest(hps)
def main(_):
    """Resolve batch size and class count from flags, then train or evaluate.

    Args:
        _: Unused.

    Raises:
        ValueError: If ``FLAGS.mode`` is not ``'train'``/``'eval'`` or
            ``FLAGS.dataset`` is not ``'cifar10'``/``'cifar100'``/``'fdc'``.
    """
    mode = FLAGS.mode
    if mode not in ('train', 'eval'):
        raise ValueError('Only support two modes: train or eval')
    training = mode == 'train'
    batch_size = (FLAGS.train_batch_size if training
                  else FLAGS.eval_batch_size)

    dataset = FLAGS.dataset
    if dataset == 'fdc':
        # The 'fdc' dataset takes its class count from a flag.
        num_classes = FLAGS.target_classes
    elif dataset == 'cifar10':
        num_classes = 10
    elif dataset == 'cifar100':
        num_classes = 100
    else:
        raise ValueError(
            'Only support three datasets: cifar10, cifar100 or fdc')

    hps = resnet_model.HParams(
        dataset_name=FLAGS.dataset,
        batch_size=batch_size,
        num_classes=num_classes,
        min_lrn_rate=0.0001,
        lrn_rate=0.1,
        num_residual_units=5,
        use_bottleneck=False,
        weight_decay_rate=0.0002,
        relu_leakiness=0.1,
        optimizer='mom',
    )

    if training:
        train(hps)
    else:
        evaluate(hps)
def __init__(self, data, eval_batch_count):
    """Build the evaluation ResNet on the CPU and initialise its session.

    Args:
        data: Handle resolved with ``ray.get``; elements 0-2 of the result
            are concatenated into one image array and element 3 is passed
            alongside it to ``cifar_input.build_input``.
        eval_batch_count: Stored on the instance as ``eval_batch_count``.
    """
    self.best_precision = 0.0
    self.eval_batch_count = eval_batch_count

    hparams = resnet_model.HParams(
        batch_size=100,
        num_classes=10,
        min_lrn_rate=0.0001,
        lrn_rate=0.1,
        num_residual_units=5,
        use_bottleneck=False,
        weight_decay_rate=0.0002,
        relu_leakiness=0.1,
        optimizer='mom',
        num_gpus=0,
    )

    fetched = ray.get(data)
    all_images = np.concatenate([fetched[0], fetched[1], fetched[2]])

    # Everything below is created in a private graph pinned to the CPU.
    with tf.Graph().as_default(), tf.device('/cpu:0'):
        images, labels = cifar_input.build_input(
            [all_images, fetched[3]], hparams.batch_size, False)
        self.model = resnet_model.ResNet(hparams, images, labels, 'eval')
        self.model.build_graph()

        session = tf.Session(
            config=tf.ConfigProto(allow_soft_placement=True))
        self.model.variables.set_session(session)

        # Start the input queue threads before running the initializer.
        self.coord = tf.train.Coordinator()
        tf.train.start_queue_runners(session, coord=self.coord)
        session.run(tf.global_variables_initializer())
def main(_):
    """Download the data if needed, then train or test the wide ResNet.

    Args:
        _: Unused.

    Raises:
        ValueError: If ``FLAGS.num_gpus`` is not 0 or 1, or if
            ``FLAGS.dataset`` is not ``'cifar10'``/``'cifar100'``.
    """
    resnet_model.maybe_download_and_extract()

    if FLAGS.num_gpus == 0:
        dev = '/CPU:0'
    elif FLAGS.num_gpus == 1:
        dev = '/GPU:0'
    else:
        raise ValueError('Only support 0 or 1 gpu')

    if FLAGS.dataset == 'cifar10':
        num_class = 10
    elif FLAGS.dataset == 'cifar100':
        num_class = 100
    else:
        # Previously an unknown dataset left ``num_class`` unbound and the
        # HParams call below failed with a NameError.
        raise ValueError('Only support two datasets: cifar10 or cifar100')

    hps = resnet_model.HParams(num_class=num_class,
                               lrn_rate=0.1,
                               num_residual_units=6,
                               use_bottleneck=False,
                               weight_decay_rate=0.0005,
                               dropout_rate=0.3,
                               relu_leakiness=0.1,
                               optimizer='mom',
                               width=10,
                               data_dir=FLAGS.train_data_path)

    with tf.device(dev):
        if FLAGS.mode == 'train':
            train(hps)
        elif FLAGS.mode == 'eval':
            test(hps)
def __init__(self, data, dataset, eval_batch_count, eval_dir):
    """Build the CPU-only evaluation network and its summary writer.

    Args:
        data: Input passed straight to ``cifar_input.build_input``.
        dataset: Dataset name; ``"cifar100"`` selects 100 classes, anything
            else selects 10.
        eval_batch_count: Stored on the instance as ``eval_batch_count``.
        eval_dir: Directory handed to ``tf.summary.FileWriter``.
    """
    # Hide every GPU from this process.
    os.environ["CUDA_VISIBLE_DEVICES"] = ""

    class_count = 100 if dataset == "cifar100" else 10
    hparams = resnet_model.HParams(
        batch_size=100,
        num_classes=class_count,
        min_lrn_rate=0.0001,
        lrn_rate=0.1,
        num_residual_units=5,
        use_bottleneck=False,
        weight_decay_rate=0.0002,
        relu_leakiness=0.1,
        optimizer="mom",
        num_gpus=0)

    with tf.device("/cpu:0"):
        # Testing network.
        images, labels = cifar_input.build_input(
            data, hparams.batch_size, dataset, False)
        self.model = resnet_model.ResNet(hparams, images, labels, "eval")
        self.model.build_graph()

        session_config = tf.ConfigProto(allow_soft_placement=True)
        session_config.gpu_options.allow_growth = True
        session = tf.Session(config=session_config)
        self.model.variables.set_session(session)
        session.run(tf.global_variables_initializer())

        # State used for TensorBoard reporting.
        self.best_precision = 0.0
        self.eval_batch_count = eval_batch_count
        self.summary_writer = tf.summary.FileWriter(eval_dir, session.graph)
        # IP address of the node that holds the TensorBoard logs.
        self.ip_addr = ray.services.get_node_ip_address()
def main(_):
    """Pick the device and class count from flags, then train or evaluate.

    Args:
        _: Unused.

    Raises:
        ValueError: If ``FLAGS.num_gpus`` is not 0 or 1, or if
            ``FLAGS.dataset`` is not ``'cifar10'``/``'cifar100'``.
    """
    if FLAGS.num_gpus == 0:
        dev = '/cpu:0'
    elif FLAGS.num_gpus == 1:
        dev = '/gpu:0'
    else:
        raise ValueError('Only support 0 or 1 gpu.')

    # NOTE(review): the original computed a per-mode batch size (128 for
    # train, 100 for eval) but never passed it to HParams; the dead local was
    # removed. Confirm HParams' default batch size is the intended one.

    if FLAGS.dataset == 'cifar10':
        num_classes = 10
    elif FLAGS.dataset == 'cifar100':
        num_classes = 100
    else:
        # Previously an unknown dataset left ``num_classes`` unbound and the
        # HParams call below failed with a NameError.
        raise ValueError('Only support two datasets: cifar10 or cifar100')

    hps = resnet_model.HParams(num_classes=num_classes,
                               lrn_rate=0.1,
                               weight_decay_rate=0.0002,
                               optimizer='mom')

    with tf.device(dev):
        if FLAGS.mode == 'train':
            train(hps)
        elif FLAGS.mode == 'eval':
            evaluate(hps)
def main(_):
    """Resolve device, batch size and class count, then train or evaluate.

    Args:
        _: Unused.

    Raises:
        ValueError: If ``FLAGS.num_gpus`` is not 0 or 1, ``FLAGS.mode`` is
            not ``'train'``/``'eval'``, or ``FLAGS.dataset`` is not
            ``'cifar10'``/``'cifar100'``.
    """
    if FLAGS.num_gpus == 0:
        dev = '/cpu:0'
    elif FLAGS.num_gpus == 1:
        dev = '/gpu:0'
    else:
        raise ValueError('Only support 0 or 1 gpu.')

    # Unknown modes/datasets used to leave these locals unbound and surface
    # later as a NameError; fail early with a descriptive error instead.
    if FLAGS.mode == 'train':
        batch_size = 128
    elif FLAGS.mode == 'eval':
        batch_size = 100
    else:
        raise ValueError('Only support two modes: train or eval')

    if FLAGS.dataset == 'cifar10':
        num_classes = 10
    elif FLAGS.dataset == 'cifar100':
        num_classes = 100
    else:
        raise ValueError('Only support two datasets: cifar10 or cifar100')

    hps = resnet_model.HParams(batch_size=batch_size,
                               num_classes=num_classes,
                               min_lrn_rate=0.0001,
                               lrn_rate=0.1,
                               num_residual_units=5,
                               use_bottleneck=False,
                               weight_decay_rate=0.0002,
                               relu_leakiness=0.1,
                               optimizer='mom')

    with tf.device(dev):
        if FLAGS.mode == 'train':
            train(hps)
        else:
            evaluate(hps)
def main(_):
    """Select the device, build the hyper-parameters and run the chosen mode.

    Args:
        _: Unused.

    Raises:
        ValueError: If ``FLAGS.num_gpus`` is neither 0 nor 1.
    """
    # Device selection.
    gpu_count = FLAGS.num_gpus
    if gpu_count != 0 and gpu_count != 1:
        raise ValueError('Only support 0 or 1 gpu.')
    dev = '/cpu:0' if gpu_count == 0 else '/gpu:0'

    # Hyper-parameter setup.
    hps = resnet_model.HParams(
        batch_size=FLAGS.batch_size,
        num_classes=FLAGS.num_classes,
        min_lrn_rate=0.0001,
        lrn_rate=0.1,
        num_residual_units=5,
        use_bottleneck=False,
        weight_decay_rate=0.0002,
        relu_leakiness=0.1,
        optimizer='mom',
    )

    # Train or evaluate the model; any other mode is a no-op.
    with tf.device(dev):
        mode = FLAGS.mode
        if mode == 'train':
            train(hps)
        elif mode == 'eval':
            evaluate(hps)
def main(_):
    """Derive the LR decay schedule from flags and start training.

    The decay interval (in steps) is the number of batches per epoch times
    ``FLAGS.lr_decay_epoches``, truncated to an int. ``FLAGS.train_dir`` is
    created when it does not exist yet.

    Args:
        _: Unused.
    """
    batches_per_epoch = FLAGS.num_examples_train / FLAGS.batch_size
    decay_steps = int(batches_per_epoch * FLAGS.lr_decay_epoches)

    hps = resnet_model.HParams(
        batch_size=FLAGS.batch_size,
        num_classes=FLAGS.num_classes,
        num_gpus=FLAGS.num_gpus,
        initial_learning_rate=FLAGS.initial_learning_rate,
        lr_decay_steps=decay_steps,
        lr_decay_factor=FLAGS.lr_decay_factor,
        optimizer=FLAGS.optimizer,
        num_layers=FLAGS.num_layers,
        prob_depth=0.5,
        use_bottleneck=True,
        weight_decay_rate=0.0001,
        relu_leakiness=0)

    train_dir_missing = not tf.gfile.Exists(FLAGS.train_dir)
    if train_dir_missing:
        tf.gfile.MakeDirs(FLAGS.train_dir)

    # Only training is wired up here; no device scope or eval dispatch.
    train(hps)
def __init__(self, data, num_gpus):
    """Build the training ResNet in a private graph and start its session.

    Args:
        data: Handle resolved with ``ray.get``; elements 0-2 of the result
            are concatenated into one image array and element 3 is passed
            alongside it to ``cifar_input.build_input``.
        num_gpus: When positive, the GPUs reported by ``ray.get_gpu_ids()``
            are exposed via ``CUDA_VISIBLE_DEVICES`` and the model is placed
            on ``/gpu:0``; otherwise it is placed on ``/cpu:0``.
    """
    use_gpu = num_gpus > 0
    if use_gpu:
        visible = ','.join(str(gpu_id) for gpu_id in ray.get_gpu_ids())
        os.environ['CUDA_VISIBLE_DEVICES'] = visible

    hparams = resnet_model.HParams(
        batch_size=128,
        num_classes=10,
        min_lrn_rate=0.0001,
        lrn_rate=0.1,
        num_residual_units=5,
        use_bottleneck=False,
        weight_decay_rate=0.0002,
        relu_leakiness=0.1,
        optimizer='mom',
        num_gpus=num_gpus,
    )

    fetched = ray.get(data)
    all_images = np.concatenate([fetched[0], fetched[1], fetched[2]])

    with tf.Graph().as_default():
        # Seed from the first assigned GPU id so actors differ; fixed seed
        # on CPU.
        seed = ray.get_gpu_ids()[0] + 1 if use_gpu else 1
        tf.set_random_seed(seed)

        with tf.device('/gpu:0' if use_gpu else '/cpu:0'):
            images, labels = cifar_input.build_input(
                [all_images, fetched[3]], hparams.batch_size, True)
            self.model = resnet_model.ResNet(hparams, images, labels,
                                             'train')
            self.model.build_graph()

            session = tf.Session(
                config=tf.ConfigProto(allow_soft_placement=True))
            self.model.variables.set_session(session)

            self.coord = tf.train.Coordinator()
            tf.train.start_queue_runners(session, coord=self.coord)
            session.run(tf.global_variables_initializer())
def main(_):
    """Set up the distributed cluster for this task and start training.

    Builds the hyper-parameters, validates the job flags, creates the
    ``ClusterSpec`` from ``FLAGS.ps_hosts``/``FLAGS.worker_hosts`` and,
    unless ``FLAGS.existing_servers`` is set, starts an in-process server.
    A ``"ps"`` job blocks in ``server.join()``; a worker builds its device
    string and calls ``train`` under a replica device setter.

    Args:
        _: Unused.

    Raises:
        ValueError: If ``FLAGS.dataset`` is not ``'cifar10'``/``'cifar100'``,
            ``FLAGS.job_name`` or ``FLAGS.task_index`` is missing, or
            ``FLAGS.num_gpus`` is negative.
    """
    if FLAGS.dataset == 'cifar10':
        num_classes = 10
    elif FLAGS.dataset == 'cifar100':
        num_classes = 100
    else:
        # Previously an unknown dataset left ``num_classes`` unbound.
        raise ValueError('Only support two datasets: cifar10 or cifar100')

    hps = resnet_model.HParams(num_classes=num_classes,
                               lrn_rate=0.1,
                               weight_decay_rate=0.002,
                               optimizer='mom')

    # Cluster information must be given explicitly.
    if FLAGS.job_name is None or FLAGS.job_name == "":
        raise ValueError("Must specify an explicit `job_name`")
    if FLAGS.task_index is None or FLAGS.task_index == "":
        raise ValueError("Must specify an explicit `task_index`")

    print("job name = %s" % FLAGS.job_name)
    print("task index = %d" % FLAGS.task_index)

    # Construct the cluster and start the server.
    ps_spec = FLAGS.ps_hosts.split(",")
    worker_spec = FLAGS.worker_hosts.split(",")

    # Aggregate over every worker replica.
    num_workers = len(worker_spec)
    FLAGS.replicas_to_aggregate = num_workers

    cluster = tf.train.ClusterSpec({"ps": ps_spec, "worker": worker_spec})

    # NOTE(review): with ``existing_servers`` set no server is created here.
    # The original then hit a NameError at ``train(hps, server)``; ``None`` is
    # passed instead -- confirm ``train`` can handle a missing server.
    server = None
    if not FLAGS.existing_servers:
        # Not using existing servers. Create an in-process server.
        server = tf.train.Server(cluster,
                                 job_name=FLAGS.job_name,
                                 task_index=FLAGS.task_index)
        if FLAGS.job_name == "ps":
            server.join()

    if FLAGS.num_gpus > 0:
        # Avoid GPU allocation conflicts: spread tasks over the GPUs of the
        # machine by task index.
        gpu = FLAGS.task_index % FLAGS.num_gpus
        worker_device = "/job:worker/task:%d/gpu:%d" % (FLAGS.task_index, gpu)
    elif FLAGS.num_gpus == 0:
        # Just allocate the CPU to the worker.
        cpu = 0
        worker_device = "/job:worker/task:%d/cpu:%d" % (FLAGS.task_index, cpu)
    else:
        # A negative count used to leave ``worker_device`` unbound.
        raise ValueError('num_gpus must be non-negative')

    with tf.device(
            tf.train.replica_device_setter(worker_device=worker_device,
                                           cluster=cluster)):
        if FLAGS.mode == 'train':
            train(hps, server)
def main(_):
    """Resolve model depth, device, batch size and dataset, then run.

    Args:
        _: Unused.

    Raises:
        Exception: If ``FLAGS.model`` is empty or not one of ``resnet20``,
            ``resnet56``, ``resnet164``.
        ValueError: If ``FLAGS.num_gpus`` is not 0 or 1, if ``FLAGS.mode`` is
            not ``'train'``/``'eval'`` while ``FLAGS.batch_size`` is -1, or
            if ``FLAGS.dataset`` is not ``'cifar10'``/``'cifar100'``.
    """
    if FLAGS.model == '':
        raise Exception('--model must be specified.')

    if FLAGS.num_gpus == 0:
        dev = '/cpu:0'
    elif FLAGS.num_gpus == 1:
        dev = '/gpu:0'
    else:
        raise ValueError('Only support 0 or 1 gpu.')

    if FLAGS.batch_size == -1:
        if FLAGS.mode == 'train':
            batch_size = 128
        elif FLAGS.mode == 'eval':
            # SimonChange: default batch_size from 100 to FLAGS.batch_size
            # NOTE(review): in this branch FLAGS.batch_size is -1, so eval
            # runs with batch_size == -1 -- confirm that is intended.
            batch_size = FLAGS.batch_size
        else:
            # Previously ``batch_size`` stayed unbound for unknown modes.
            raise ValueError('Only support two modes: train or eval')
    else:
        batch_size = FLAGS.batch_size

    if FLAGS.dataset == 'cifar10':
        num_classes = 10
    elif FLAGS.dataset == 'cifar100':
        num_classes = 100
    else:
        # Previously ``num_classes`` stayed unbound for unknown datasets.
        raise ValueError('Only support two datasets: cifar10 or cifar100')

    if FLAGS.model == 'resnet20':
        num_residual_units = 3
    elif FLAGS.model == 'resnet56':
        num_residual_units = 9
    elif FLAGS.model == 'resnet164' and FLAGS.use_bottleneck:
        num_residual_units = 18
    elif FLAGS.model == 'resnet164' and not FLAGS.use_bottleneck:
        num_residual_units = 27
    else:
        raise Exception(
            "Invalid model -- only resnet20, resnet56 and resnet164 supported")

    data_format = FLAGS.data_format
    hps = resnet_model.HParams(batch_size=batch_size,
                               num_classes=num_classes,
                               min_lrn_rate=0.0001,
                               lrn_rate=0.1,
                               num_residual_units=num_residual_units,
                               use_bottleneck=FLAGS.use_bottleneck,
                               weight_decay_rate=0.0005,
                               relu_leakiness=0.1,
                               optimizer='mom',
                               data_format=data_format)

    with tf.device(dev):
        if FLAGS.mode == 'train':
            train(hps)
        elif FLAGS.mode == 'eval':
            evaluate(hps)
def main(_):
    """Build the fixed ResNet hyper-parameters and start training.

    NOTE(review): ``batch_size`` is read from ``FLAGS.epoch`` and
    ``num_classes`` is not defined inside this function, so it must come
    from module scope -- confirm both are intentional.

    Args:
        _: Unused.
    """
    train(
        resnet_model.HParams(
            batch_size=FLAGS.epoch,
            num_classes=num_classes,
            min_lrn_rate=0.0001,
            lrn_rate=0.1,
            num_residual_units=5,
            use_bottleneck=False,
            weight_decay_rate=0.0002,
            relu_leakiness=0.1,
            optimizer='sgd',
        )
    )
def main(_):
    """Evaluate the model with a fixed set of hyper-parameters.

    Uses a batch size of 100 and 10 classes; no optimizer is specified.

    Args:
        _: Unused.
    """
    evaluate(
        resnet_model.HParams(
            batch_size=100,
            num_classes=10,
            min_lrn_rate=0.0001,
            lrn_rate=0.1,
            num_residual_units=5,
            use_bottleneck=False,
            weight_decay_rate=0.0002,
            relu_leakiness=0.1,
        )
    )
def getQfeature(filepath):
    """Compute the L2-normalised logits of the image stored at ``filepath``.

    Builds the ResNet graph for a single image, restores the latest
    checkpoint found in ``FLAGS.checkpoint_dir`` and evaluates the tensor
    ``logit/xw_plus_b:0`` after L2 normalisation along axis 1.

    Args:
        filepath: Path handed to ``read`` to load the image.

    Returns:
        The list populated by ``put_2darray`` from the evaluated logits.

    Raises:
        ValueError: If no checkpoint can be found in
            ``FLAGS.checkpoint_dir``.
    """
    image = read(filepath)
    labels = [3]
    hps = resnet_model.HParams(batch_size=1,
                               num_classes=4,
                               min_lrn_rate=0.0001,
                               lrn_rate=0.1,
                               num_residual_units=5,
                               use_bottleneck=False,
                               weight_decay_rate=0.0002,
                               relu_leakiness=0.1,
                               optimizer='mom')
    model = resnet_model.ResNet(hps, image, labels, FLAGS.mode)
    model.build_graph()

    logits = tf.get_default_graph().get_tensor_by_name("logit/xw_plus_b:0")
    print(logits)
    logits_norm = tf.nn.l2_normalize(logits, 1)

    saver = tf.train.Saver()
    with tf.Session() as sess:
        # Launch the input queues under a coordinator so they can be shut
        # down cleanly instead of being cancelled when the session closes.
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)
        try:
            # ``ckpt`` corresponds to the 'checkpoint' file; its
            # model_checkpoint_path looks like /path/to/model.ckpt-1000
            # (a prefix, not a single file).
            ckpt = tf.train.get_checkpoint_state(FLAGS.checkpoint_dir)
            if not (ckpt and ckpt.model_checkpoint_path):
                # The original printed ckpt.model_checkpoint_path before this
                # check and crashed with AttributeError when ckpt was None.
                raise ValueError(
                    'No checkpoint found in %s' % FLAGS.checkpoint_dir)
            print(ckpt.model_checkpoint_path)
            saver.restore(sess, ckpt.model_checkpoint_path)

            logits_list = []
            _logits = sess.run([logits_norm])  # list holding one nd-array
            print('................')
            print(_logits)
            print('................')
            put_2darray(_logits, logits_list)
        finally:
            coord.request_stop()
            coord.join(threads)
    return logits_list
def main(_):
    """Resolve devices, batch size and class count, then train or evaluate.

    With more than one GPU the chosen routine is invoked once per GPU inside
    a ``/gpu:<i>`` device scope and a ``gpu_<i>`` name scope; otherwise it is
    invoked once on the CPU or on ``/gpu:0``.

    Args:
        _: Unused.

    Raises:
        ValueError: If ``FLAGS.num_gpus`` is negative, ``FLAGS.mode`` is not
            ``'train'``/``'eval'``, or ``FLAGS.dataset`` is not one of
            ``cifar10``, ``cifar100``, ``amazon``, ``naver``.
    """
    if FLAGS.num_gpus == 0:
        dev = '/cpu:0'
    elif FLAGS.num_gpus == 1:
        dev = '/gpu:0'
    elif FLAGS.num_gpus > 1:
        # Multi-GPU runs build their own per-GPU device scopes below.
        dev = None
    else:
        # The old message ('Only support 0 or 1 gpu.') was wrong: more than
        # one GPU is accepted, only negative counts are rejected.
        raise ValueError('num_gpus must be non-negative.')

    # Unknown modes/datasets used to leave these locals unbound.
    if FLAGS.mode == 'train':
        batch_size = 128
    elif FLAGS.mode == 'eval':
        batch_size = 100
    else:
        raise ValueError('Only support two modes: train or eval')

    if FLAGS.dataset == 'cifar10':
        num_classes = 10
    elif FLAGS.dataset == 'cifar100':
        num_classes = 100
    elif FLAGS.dataset == 'amazon':
        num_classes = 33
    elif FLAGS.dataset == 'naver':
        num_classes = 27
    else:
        raise ValueError(
            'Only support four datasets: cifar10, cifar100, amazon or naver')

    hps = resnet_model.HParams(batch_size=batch_size,
                               num_classes=num_classes,
                               min_lrn_rate=0.0001,
                               lrn_rate=0.1,
                               num_residual_units=5,
                               use_bottleneck=False,
                               weight_decay_rate=0.0002,
                               relu_leakiness=0.1,
                               optimizer='mom')

    if FLAGS.num_gpus > 1:
        # NOTE(review): train/evaluate are called sequentially, once per GPU;
        # confirm this is the intended multi-GPU behaviour.
        for i in range(FLAGS.num_gpus):
            with tf.device('/gpu:%d' % i):
                with tf.name_scope('%s_%d' % ("gpu", i)):
                    if FLAGS.mode == 'train':
                        train(hps)
                    else:
                        evaluate(hps)
    else:
        with tf.device(dev):
            if FLAGS.mode == 'train':
                train(hps)
            else:
                evaluate(hps)
def main(_):
    """Load the data for the selected mode and train, evaluate or infer.

    Labels loaded for training and evaluation are shifted down by one
    before use. Inference passes an all-ones placeholder label vector with
    one entry per input row.

    Args:
        _: Unused.

    Raises:
        ValueError: If ``FLAGS.num_gpus`` is not 0 or 1, or ``FLAGS.mode`` is
            not one of ``'train'``, ``'eval'``, ``'infer'``.
    """
    if FLAGS.num_gpus == 0:
        dev = '/cpu:0'
    elif FLAGS.num_gpus == 1:
        dev = '/gpu:0'
    else:
        raise ValueError('Only support 0 or 1 gpu.')

    if FLAGS.mode == 'train':
        batch_size = 128
    elif FLAGS.mode == 'eval':
        batch_size = 100
    elif FLAGS.mode == 'infer':
        batch_size = 100
    else:
        # Previously an unknown mode left ``batch_size`` unbound and the
        # HParams call below failed with a NameError.
        raise ValueError('Only support three modes: train, eval or infer')

    # Change the values below based on your own setting.
    hps = resnet_model.HParams(batch_size=batch_size,
                               image_size=32,
                               depth=3,
                               num_classes=10,
                               min_lrn_rate=0.0001,
                               lrn_rate=0.01,
                               num_residual_units=5,
                               use_bottleneck=False,
                               weight_decay_rate=0.004,
                               relu_leakiness=0.1,
                               optimizer='mom',
                               fine_tune=False)

    with tf.device(dev):
        if FLAGS.mode == 'train':
            X_train = helper.load_data(FLAGS.train_data_path)
            y_train = helper.load_data(FLAGS.train_labels_path)
            y_train = y_train - 1
            train(hps, X_train, y_train)
        elif FLAGS.mode == 'eval':
            X_val = helper.load_data(FLAGS.eval_data_path)
            y_val = helper.load_data(FLAGS.eval_labels_path)
            y_val = y_val - 1
            evaluate(hps, X_val, y_val)
        else:
            X_infer = helper.load_data(FLAGS.infer_data_path)
            y_infer = np.ones((X_infer.shape[0], ))
            infer(hps, X_infer, y_infer)
def __init__(self, data, dataset, num_gpus):
    """Build the training ResNet for one actor and start its session.

    Args:
        data: Indexable input; element 0 and element 1 are passed together
            to ``cifar_input.build_input``.
        dataset: Dataset name; ``"cifar100"`` selects 100 classes, anything
            else selects 10.
        num_gpus: When positive, the GPUs reported by ``ray.get_gpu_ids()``
            are exposed via ``CUDA_VISIBLE_DEVICES`` and the model is placed
            on ``/gpu:0``; otherwise it is placed on ``/cpu:0``.
    """
    on_gpu = num_gpus > 0
    if on_gpu:
        os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(
            str(gpu_id) for gpu_id in ray.get_gpu_ids())

    class_count = 100 if dataset == "cifar100" else 10
    hparams = resnet_model.HParams(
        batch_size=128,
        num_classes=class_count,
        min_lrn_rate=0.0001,
        lrn_rate=0.1,
        num_residual_units=5,
        use_bottleneck=False,
        weight_decay_rate=0.0002,
        relu_leakiness=0.1,
        optimizer="mom",
        num_gpus=num_gpus)

    # Each actor gets its own seed so that actors operate on different
    # subsets of data; without GPUs there is only a single actor.
    seed = ray.get_gpu_ids()[0] + 1 if on_gpu else 1
    tf.set_random_seed(seed)

    input_images, input_labels = data[0], data[1]

    with tf.device("/gpu:0" if on_gpu else "/cpu:0"):
        # Model construction.
        images, labels = cifar_input.build_input(
            [input_images, input_labels], hparams.batch_size, dataset, False)
        self.model = resnet_model.ResNet(hparams, images, labels, "train")
        self.model.build_graph()

        session = tf.Session(
            config=tf.ConfigProto(allow_soft_placement=True))
        self.model.variables.set_session(session)

        self.coord = tf.train.Coordinator()
        tf.train.start_queue_runners(session, coord=self.coord)
        session.run(tf.global_variables_initializer())
        self.steps = 10
def main(_):
    """Train (with wall-clock timing) or evaluate the ResNet.

    In ``'train'`` mode the elapsed training time is printed in hours once
    ``train`` returns.

    Args:
        _: Unused.

    Raises:
        ValueError: If ``FLAGS.num_gpus`` is not 0 or 1, ``FLAGS.mode`` is
            not ``'train'``/``'eval'``, or ``FLAGS.dataset`` is not
            ``'cifar10'``/``'cifar100'``.
    """
    if FLAGS.num_gpus == 0:
        dev = '/cpu:0'
    elif FLAGS.num_gpus == 1:
        dev = '/gpu:0'
    else:
        raise ValueError('Only support 0 or 1 gpu.')

    # Unknown modes/datasets used to leave these locals unbound.
    if FLAGS.mode == 'train':
        batch_size = 128
    elif FLAGS.mode == 'eval':
        batch_size = 100
    else:
        raise ValueError('Only support two modes: train or eval')

    if FLAGS.dataset == 'cifar10':
        num_classes = 10
    elif FLAGS.dataset == 'cifar100':
        num_classes = 100
    else:
        raise ValueError('Only support two datasets: cifar10 or cifar100')

    hps = resnet_model.HParams(
        batch_size=batch_size,
        num_classes=num_classes,
        min_lrn_rate=0.0001,
        lrn_rate=0.1,
        num_residual_units=3,
        use_bottleneck=False,
        weight_decay_rate=0.0002,
        relu_leakiness=0.1,
        optimizer='mom')

    with tf.device(dev):
        if FLAGS.mode == 'train':
            train_start = time.time()
            train(hps)
            train_duration = time.time() - train_start
            # Report only here: ``train_duration`` does not exist in eval
            # mode, where the original trailing print raised a NameError.
            print('train=%.4fh' % (train_duration / 3600))
        else:
            evaluate(hps)
def _my_model_fn(features, labels, mode, params):
    """Model function that wraps the pre-built ``resnet_model`` for TPU use.

    Only the training graph is built; the returned spec carries the model's
    cost, its train op and an ``eval_metrics`` pair made of ``metric_fn``
    and the model's labels/predictions tensors.

    Args:
        features: Input tensor handed to the ResNet.
        labels: Label tensor handed to the ResNet.
        mode: Forwarded unchanged to ``TPUEstimatorSpec``.
        params: Unused, but needed for TPU training.

    Returns:
        A ``tpu_estimator.TPUEstimatorSpec``.
    """
    del params  # unused, but needed for TPU training

    # The global batch is divided by the TPU replicas.
    per_replica_batch_size = int(batch_size / FLAGS.num_replica)

    model_params = resnet_model.HParams(
        batch_size=per_replica_batch_size,
        num_classes=10,
        min_lrn_rate=0.0001,
        lrn_rate=0.1,
        num_residual_units=5,  # 5 x (3 x sub 2) + 2 = 32 layers
        use_bottleneck=False,
        weight_decay_rate=0.0002,
        relu_leakiness=0.1,
        optimizer='mom',
    )

    train_model = resnet_model.ResNet(model_params, features, labels, 'train')
    train_model.build_graph(tpu_opt=True)

    # Only the training operation is assumed here; no precision/accuracy
    # summaries or prediction outputs are produced.
    return tpu_estimator.TPUEstimatorSpec(
        mode=mode,
        loss=train_model.cost,
        train_op=train_model.train_op,
        eval_metrics=(metric_fn,
                      [train_model.labels, train_model.predictions]),
    )
def main(_):
    """Select the device, build the hyper-parameters and run the chosen mode.

    The batch size comes from ``FLAGS.batch_size`` and the class count is
    fixed at 10.

    Args:
        _: Unused.

    Raises:
        ValueError: If ``FLAGS.num_gpus`` is neither 0 nor 1.
    """
    gpu_count = FLAGS.num_gpus
    if gpu_count != 0 and gpu_count != 1:
        raise ValueError('Only support 0 or 1 gpu.')
    dev = '/cpu:0' if gpu_count == 0 else '/gpu:0'

    hps = resnet_model.HParams(
        batch_size=FLAGS.batch_size,
        num_classes=10,
        min_lrn_rate=0.0001,
        lrn_rate=0.1,
        num_residual_units=5,
        use_bottleneck=False,
        weight_decay_rate=0.0002,
        relu_leakiness=0.1,
        optimizer='mom',
    )

    # Disabled debugging snippet that fetched one input batch and printed
    # its shapes:
    #   sess = tf.Session()
    #   tf.train.start_queue_runners(sess=sess)
    #   custom_runner.start_threads(sess)
    #   images_batch, labels_batch = sess.run([images, labels])
    #   print(images_batch.shape)
    #   print(labels_batch.shape)

    with tf.device(dev):
        mode = FLAGS.mode
        if mode == 'train':
            train(hps)
        elif mode == 'eval':
            evaluate(hps)
def main(_):
    """Build the hyper-parameters, dump them to disk and start training.

    The hyper-parameters are serialised as JSON into the file ``config``
    inside ``FLAGS.result_path`` (created when missing) before ``train`` is
    called.

    Args:
        _: Unused.

    Raises:
        ValueError: If ``FLAGS.num_gpus`` is not 0 or 1, or ``FLAGS.dataset``
            is not ``'cifar10'``/``'cifar100'``.
    """
    if FLAGS.num_gpus == 0:
        dev = '/cpu:0'
    elif FLAGS.num_gpus == 1:
        dev = '/gpu:0'
    else:
        raise ValueError('Only support 0 or 1 gpu.')

    batch_size = 128

    if FLAGS.dataset == 'cifar10':
        num_classes = 10
    elif FLAGS.dataset == 'cifar100':
        num_classes = 100
    else:
        # Previously an unknown dataset left ``num_classes`` unbound and the
        # HParams call below failed with a NameError.
        raise ValueError('Only support two datasets: cifar10 or cifar100')

    weight_decay_rate = FLAGS.weight_decay
    pool_type = FLAGS.pool_type
    hps = resnet_model.HParams(batch_size=batch_size,
                               num_classes=num_classes,
                               min_lrn_rate=0.0001,
                               lrn_rate=0.1,
                               num_residual_units=5,
                               use_bottleneck=False,
                               weight_decay_rate=weight_decay_rate,
                               relu_leakiness=0.1,
                               optimizer='mom',
                               pool_type=pool_type)

    if not os.path.exists(FLAGS.result_path):
        os.makedirs(FLAGS.result_path)

    # Serialise first so a failure does not leave an empty config file.
    config_str = json.dumps(hps._asdict())
    config_file = os.path.join(FLAGS.result_path, 'config')
    # The context manager closes the file even when the write fails.
    with open(config_file, 'w') as config_file_object:
        config_file_object.write(config_str)

    with tf.device(dev):
        train(hps)
def train_resnet_baseline(max_step_run):
    """Trains the resnet baseline model.

    Creates ``FLAGS.train_log_dir`` when missing, builds the input pipeline,
    the ResNet and a cross-entropy + weight-decay loss in a fresh graph, and
    runs ``slim.learning.train`` with a momentum optimizer.

    Args:
        max_step_run: The maximum number of gradient steps.
    """
    if not os.path.exists(FLAGS.train_log_dir):
        os.makedirs(FLAGS.train_log_dir)
    g = tf.Graph()
    with g.as_default():
        with tf.device(tf.train.replica_device_setter(FLAGS.ps_tasks)):
            tf_global_step = tf.train.get_or_create_global_step()

            # pylint: disable=line-too-long
            images, one_hot_labels, num_samples_per_epoch, num_of_classes = cifar_data_provider.provide_resnet_data(
                FLAGS.dataset_name,
                'train',
                FLAGS.batch_size,
                dataset_dir=FLAGS.data_dir)

            hps = resnet_model.HParams(batch_size=FLAGS.batch_size,
                                       num_classes=num_of_classes,
                                       min_lrn_rate=0.0001,
                                       lrn_rate=FLAGS.learning_rate,
                                       num_residual_units=9,
                                       use_bottleneck=False,
                                       weight_decay_rate=0.0002,
                                       relu_leakiness=0.1,
                                       optimizer='mom')

            # Pin the static batch shape on the image tensor.
            images.set_shape([FLAGS.batch_size, 32, 32, 3])
            tf.logging.info('num_of_example={}'.format(num_samples_per_epoch))

            # Define the model:
            resnet = resnet_model.ResNet(hps,
                                         images,
                                         one_hot_labels,
                                         mode='train')
            logits = resnet.build_model()

            # Specify the loss function: mean cross-entropy plus the model's
            # decay term.
            total_loss = tf.nn.softmax_cross_entropy_with_logits(
                labels=one_hot_labels, logits=logits)
            total_loss = tf.reduce_mean(total_loss, name='xent')
            total_loss += resnet.decay()  # decay
            tf.add_to_collection('total_loss', total_loss)

            decay_steps = int(num_samples_per_epoch / FLAGS.batch_size *
                              FLAGS.num_epochs_per_decay)

            # NOTE(review): the piecewise-constant schedule below is what gets
            # written to the 'learning_rate' summary, but ``lr`` is then
            # overwritten by the exponential-decay schedule, which is the one
            # the optimizer actually uses. Confirm which schedule is intended.
            boundaries = [19531, 25000, 30000]
            values = [FLAGS.learning_rate * t for t in [1, 0.1, 0.01, 0.001]]
            lr = tf.train.piecewise_constant(tf_global_step, boundaries,
                                             values)
            slim.summaries.add_scalar_summary(lr,
                                              'learning_rate',
                                              print_summary=True)
            lr = tf.train.exponential_decay(FLAGS.learning_rate,
                                            tf_global_step,
                                            decay_steps,
                                            FLAGS.learning_rate_decay_factor,
                                            staircase=True)
            slim.summaries.add_scalar_summary(total_loss,
                                              'total_loss',
                                              print_summary=True)

            # Set up training.
            trainable_variables = tf.trainable_variables()
            grads = tf.gradients(total_loss, trainable_variables)
            optimizer = tf.train.MomentumOptimizer(lr, momentum=0.9)
            apply_op = optimizer.apply_gradients(zip(grads,
                                                     trainable_variables),
                                                 global_step=tf_global_step,
                                                 name='train_step')
            # Run the model's extra train ops together with the gradient step.
            train_ops = [apply_op] + resnet.extra_train_ops
            train_op = tf.group(*train_ops)

            saver = tf.train.Saver(max_to_keep=10,
                                   keep_checkpoint_every_n_hours=24)

            # Run training.
            slim.learning.train(train_op=train_op,
                                train_step_fn=resnet_train_step,
                                logdir=FLAGS.train_log_dir,
                                master=FLAGS.master,
                                saver=saver,
                                is_chief=FLAGS.task == 0,
                                number_of_steps=max_step_run,
                                save_summaries_secs=FLAGS.save_summaries_secs,
                                save_interval_secs=FLAGS.save_interval_secs)
def train_resnet_mentormix(max_step_run):
    """Trains the mentornet with the student resnet model.

    The student ResNet is built twice under the shared ``ResNet32`` variable
    scope: once on the raw batch to obtain per-example losses for MentorNet,
    and once (with ``reuse=True``) on the MentorMix-ed batch, whose loss is
    the one that is optimised. MentorNet variables are excluded from
    training and optionally restored from ``FLAGS.trained_mentornet_dir``.

    Args:
        max_step_run: The maximum number of gradient steps.
    """
    if not os.path.exists(FLAGS.train_log_dir):
        os.makedirs(FLAGS.train_log_dir)
    g = tf.Graph()
    with g.as_default():
        with tf.device(tf.train.replica_device_setter(FLAGS.ps_tasks)):
            tf_global_step = tf.train.get_or_create_global_step()

            (images, one_hot_labels, num_samples_per_epoch,
             num_of_classes) = cifar_data_provider.provide_resnet_data(
                 FLAGS.dataset_name,
                 'train',
                 FLAGS.batch_size,
                 dataset_dir=FLAGS.data_dir)

            hps = resnet_model.HParams(batch_size=FLAGS.batch_size,
                                       num_classes=num_of_classes,
                                       min_lrn_rate=0.0001,
                                       lrn_rate=FLAGS.learning_rate,
                                       num_residual_units=5,
                                       use_bottleneck=False,
                                       weight_decay_rate=0.0002,
                                       relu_leakiness=0.1,
                                       optimizer='mom')

            # Pin the static batch shape on the image tensor.
            images.set_shape([FLAGS.batch_size, 32, 32, 3])

            # Define the model:
            resnet = resnet_model.ResNet(hps,
                                         images,
                                         one_hot_labels,
                                         mode='train')
            with tf.variable_scope('ResNet32'):
                logits = resnet.build_model()

            # Specify the loss function:
            loss = tf.nn.softmax_cross_entropy_with_logits(
                labels=one_hot_labels, logits=logits)

            dropout_rates = utils.parse_dropout_rate_list(
                FLAGS.example_dropout_rates)
            example_dropout_rates = tf.convert_to_tensor(
                dropout_rates, np.float32, name='example_dropout_rates')

            # The same percentile value repeated 100 times.
            loss_p_percentile = tf.convert_to_tensor(np.array(
                [FLAGS.loss_p_percentile] * 100),
                                                     np.float32,
                                                     name='loss_p_percentile')

            # Per-example losses as a column vector.
            loss = tf.reshape(loss, [-1, 1])

            # Training progress mapped onto an integer 0..100 scale.
            epoch_step = tf.to_int32(
                tf.floor(tf.divide(tf_global_step, max_step_run) * 100))

            zero_labels = tf.zeros([tf.shape(loss)[0], 1], tf.float32)

            mentornet_net_hparams = utils.get_mentornet_network_hyperparameter(
                FLAGS.trained_mentornet_dir)

            # In the simplest case, this function can be replaced with a
            # thresholding function. See loss_thresholding_function in
            # utils.py.
            v = utils.mentornet(epoch_step,
                                loss,
                                zero_labels,
                                loss_p_percentile,
                                example_dropout_rates,
                                burn_in_epoch=FLAGS.burn_in_epoch,
                                mentornet_net_hparams=mentornet_net_hparams,
                                avg_name='individual')

            # No gradient flows through the weights or the first forward pass.
            v = tf.stop_gradient(v)
            loss = tf.stop_gradient(tf.identity(loss))
            logits = tf.stop_gradient(tf.identity(logits))

            # Perform MentorMix
            images_mix, labels_mix = utils.mentor_mix_up(
                images, one_hot_labels, v, FLAGS.mixup_alpha)
            resnet = resnet_model.ResNet(hps,
                                         images_mix,
                                         labels_mix,
                                         mode='train')
            # Second pass reuses the variables created by the first one.
            with tf.variable_scope('ResNet32', reuse=True):
                logits_mix = resnet.build_model()

            loss = tf.nn.softmax_cross_entropy_with_logits(labels=labels_mix,
                                                           logits=logits_mix)
            decay_loss = resnet.decay()

            # second weighting
            if FLAGS.second_reweight:
                loss = tf.reshape(loss, [-1, 1])
                v = utils.mentornet(
                    epoch_step,
                    loss,
                    zero_labels,
                    loss_p_percentile,
                    example_dropout_rates,
                    burn_in_epoch=FLAGS.burn_in_epoch,
                    mentornet_net_hparams=mentornet_net_hparams,
                    avg_name='mixed')
                v = tf.stop_gradient(v)
                weighted_loss_vector = tf.multiply(loss, v)
                loss = tf.reduce_mean(weighted_loss_vector)
                # reproduced with the following decay loss which should be 0.
                # This replaces the ``resnet.decay()`` value computed above.
                decay_loss = tf.losses.get_regularization_loss()
                decay_loss = decay_loss * (tf.reduce_sum(v) /
                                           FLAGS.batch_size)

            # Log data utilization
            data_util = utils.summarize_data_utilization(
                v, tf_global_step, FLAGS.batch_size)

            loss = tf.reduce_mean(loss)
            slim.summaries.add_scalar_summary(tf.reduce_mean(loss),
                                              'mentormix/mix_loss')

            weighted_total_loss = loss + decay_loss

            slim.summaries.add_scalar_summary(weighted_total_loss,
                                              'total_loss')
            tf.add_to_collection('total_loss', weighted_total_loss)

            # Set up the moving averages (MentorNet variables excluded):
            moving_average_variables = tf.trainable_variables()
            moving_average_variables = tf.contrib.framework.filter_variables(
                moving_average_variables, exclude_patterns=['mentornet'])

            variable_averages = tf.train.ExponentialMovingAverage(
                0.9999, tf_global_step)
            tf.add_to_collection(
                tf.GraphKeys.UPDATE_OPS,
                variable_averages.apply(moving_average_variables))

            decay_steps = FLAGS.num_epochs_per_decay * num_samples_per_epoch / FLAGS.batch_size
            lr = tf.train.exponential_decay(FLAGS.learning_rate,
                                            tf_global_step,
                                            decay_steps,
                                            FLAGS.learning_rate_decay_factor,
                                            staircase=True)
            lr = tf.squeeze(lr)
            slim.summaries.add_scalar_summary(lr, 'learning_rate')

            # Specify the optimization scheme:
            with tf.control_dependencies([weighted_total_loss, data_util]):
                # Set up training. Only non-MentorNet variables are updated.
                trainable_variables = tf.trainable_variables()
                trainable_variables = tf.contrib.framework.filter_variables(
                    trainable_variables, exclude_patterns=['mentornet'])

                grads = tf.gradients(weighted_total_loss, trainable_variables)
                optimizer = tf.train.MomentumOptimizer(lr, momentum=0.9)

                apply_op = optimizer.apply_gradients(
                    zip(grads, trainable_variables),
                    global_step=tf_global_step,
                    name='train_step')

                # Gradient step + model extra ops + everything registered in
                # UPDATE_OPS (including the moving-average update above).
                train_ops = [apply_op
                             ] + resnet.extra_train_ops + tf.get_collection(
                                 tf.GraphKeys.UPDATE_OPS)
                train_op = tf.group(*train_ops)

            # Parameter restore setup
            if FLAGS.trained_mentornet_dir is not None:
                ckpt_model = FLAGS.trained_mentornet_dir
                # A directory is resolved to its latest checkpoint.
                if os.path.isdir(FLAGS.trained_mentornet_dir):
                    ckpt_model = tf.train.latest_checkpoint(ckpt_model)

                # Fix the mentornet parameters
                variables_to_restore = slim.get_variables_to_restore(
                    include=['mentornet', 'mentornet_inputs'])
                iassign_op1, ifeed_dict1 = tf.contrib.framework.assign_from_checkpoint(
                    ckpt_model, variables_to_restore)

                # Create an initial assignment function.
                def init_assign_fn(sess):
                    tf.logging.info('Restore using customer initializer %s',
                                    '.' * 10)
                    sess.run(iassign_op1, ifeed_dict1)
            else:
                init_assign_fn = None

            tf.logging.info('-' * 20 + 'MentorMix' + '-' * 20)
            # NOTE(review): '%3f' is a width-3 float format (possibly meant
            # '%.3f'), and '%d' truncates mixup_alpha if the flag is a float
            # -- confirm the flag types and intended formats.
            tf.logging.info('loss_p_percentile=%3f', FLAGS.loss_p_percentile)
            tf.logging.info('mixup_alpha=%d', FLAGS.mixup_alpha)
            tf.logging.info('-' * 20)

            saver = tf.train.Saver(max_to_keep=10,
                                   keep_checkpoint_every_n_hours=24)

            # Run training.
            slim.learning.train(train_op=train_op,
                                train_step_fn=resnet_train_step,
                                logdir=FLAGS.train_log_dir,
                                master=FLAGS.master,
                                is_chief=FLAGS.task == 0,
                                saver=saver,
                                number_of_steps=max_step_run,
                                init_fn=init_assign_fn,
                                save_summaries_secs=FLAGS.save_summaries_secs,
                                save_interval_secs=FLAGS.save_interval_secs)
for batch in tl.iterate.minibatches(inputs=self.x_test, targets=self.y_test, batch_size=50, shuffle=False): feed_dict_eval = {img: batch[0]} preds = sess.run(model.predict, feed_dict=feed_dict_eval) for pred in preds: test_predicted.append(pred) csv_content = [["ID", "Label"]] for ind, data in enumerate(test_predicted): csv_content.append([ind + 1, data + 1]) with open("cifar_prediction.csv", "w") as f: writer = csv.writer(f) writer.writerows(csv_content) run = CNNEnv() hps = resnet_model.HParams(batch_size=run.batch_num, num_classes=run.nb_classes, min_lrn_rate=0.0001, lrn_rate=args.lr, num_residual_units=args.n_resid_units, use_bottleneck=False, weight_decay_rate=0.0002, relu_leakiness=0.1, optimizer='mom') run.train(hps)
def _my_model_fn(features, labels, mode):
    """Estimator model function built on the pre-built ``resnet_model``.

    The device is automatically detected and assigned. The training graph is
    always built; TRAIN mode returns a spec with the train op and a chief
    logging hook, EVAL mode returns a spec with the loss and an empty metric
    dict, and any other mode returns ``None``.

    Args:
        features: Input tensor handed to the ResNet.
        labels: Label tensor handed to the ResNet.
        mode: A ``tf.estimator.ModeKeys`` value.

    Returns:
        A ``tf.estimator.EstimatorSpec`` for TRAIN and EVAL, else ``None``.
    """
    hparams = resnet_model.HParams(
        batch_size=batch_size,
        num_classes=10,
        min_lrn_rate=0.0001,
        lrn_rate=0.1,
        num_residual_units=5,  # 5 x (3 x sub 2) + 2 = 32 layers
        use_bottleneck=False,
        weight_decay_rate=0.0002,
        relu_leakiness=0.1,
        optimizer='mom',
    )
    train_model = resnet_model.ResNet(hparams, features, labels, 'train')
    train_model.build_graph()

    # Precision/accuracy metrics are intentionally not created here; add
    # them when they should be reported to TensorBoard or when INFER is used.

    if mode == tf.estimator.ModeKeys.TRAIN:

        # tf.train.LoggingTensorHook is not used because it does not work
        # with distributed TensorFlow; this hook prints step and loss instead.
        class _CustomLogHook(tf.train.SessionRunHook):

            def before_run(self, run_context):
                fetches = [train_model.global_step, train_model.cost]
                return tf.train.SessionRunArgs(fetches=fetches)

            def after_run(self, run_context, run_values):
                step = run_values.results[0]
                if step % 10 != 0:  # log output every 10 steps
                    return
                print('step:%d loss:%.2f' % (step, run_values.results[1]))

        return tf.estimator.EstimatorSpec(
            mode,
            loss=train_model.cost,
            train_op=train_model.train_op,
            training_chief_hooks=[_CustomLogHook()],
        )

    if mode == tf.estimator.ModeKeys.EVAL:
        return tf.estimator.EstimatorSpec(
            mode,
            loss=train_model.cost,
            eval_metric_ops={},
        )

    # INFER is not handled; extend here when prediction is needed.
def fit(self, X_train, Y_train):
    """Build the ResNet training graph and train it on the given data.

    Creates the input placeholders, the ResNet model, the loss (cross entropy
    plus the model's weight decay term), a momentum optimizer and an accuracy
    op. It then optionally restores ``self.restore_checkpoint`` and runs
    ``self.num_epochs`` epochs of mini-batch training, printing the loss and
    accuracy of the current batch every ``self.display_step`` steps.

    Args:
        X_train: Training inputs, handed to ``ImageDataGenerator``.
        Y_train: Training labels, handed to ``ImageDataGenerator``.
    """
    # Graph inputs: single-channel square images and per-class label columns.
    x = tf.placeholder(
        tf.float32, [None, self.image_size, self.image_size, 1], name='input')
    y = tf.placeholder(tf.float32, [None, self.num_classes])

    hps = resnet_model.HParams(
        batch_size=self.batch_size,
        num_classes=self.num_classes,
        num_residual_units=self.num_residual_units,
        use_bottleneck=self.is_bottlneck,
        relu_leakiness=self.relu_leakiness,
        weight_decay_rate=self.weight_decay)
    model = resnet_model.ResNet(hps, x, y)

    logits = model.out
    output = tf.nn.softmax(logits, name='output')

    with tf.name_scope("cross_ent"):
        # softmax_cross_entropy_with_logits applies softmax itself, so it
        # must receive the raw logits; feeding it `output` would apply
        # softmax twice and distort the loss and its gradients.
        cost = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=y))
        cost += model._decay()

    var_list = list(tf.trainable_variables())

    with tf.name_scope("train"):
        gradients = tf.gradients(cost, var_list)
        gradients = list(zip(gradients, var_list))
        optimizer = tf.train.MomentumOptimizer(self.learning_rate, 0.9)
        train_op = optimizer.apply_gradients(grads_and_vars=gradients)

    with tf.name_scope("accuracy"):
        prediction = tf.equal(tf.argmax(output, 1), tf.argmax(y, 1))
        accuracy = tf.reduce_mean(tf.cast(prediction, tf.float32))

    saver = tf.train.Saver()

    # Initialize the data generator for the training set only; no validation
    # generator is created here.
    train_generator = ImageDataGenerator(
        X_train, Y_train,
        shuffle=True,
        scale_size=(self.image_size, self.image_size),
        nb_classes=self.num_classes)

    # Number of training steps per epoch. A plain Python int is used because
    # a 16-bit integer would wrap around above 32767 batches.
    train_batches_per_epoch = int(
        np.floor(self.data_size / self.batch_size))

    # Start the TensorFlow session.
    session_config = tf.ConfigProto(
        allow_soft_placement=True, log_device_placement=True)
    with tf.Session(config=session_config) as sess:
        sess.run(tf.global_variables_initializer())
        # writer.add_graph(sess.graph)

        if self.restore_checkpoint != '':
            saver.restore(sess, self.restore_checkpoint)

        print("{} Start training...".format(datetime.now()))
        # print("{} Open Tensorboard :tensorboard --logdir {} --host localhost --port 6006".format(datetime.now(),self.filewriter_path))

        for epoch in range(self.num_epochs):
            # Steps run from 1 up to (but not including) the number of
            # batches per epoch, exactly as the original counter loop did.
            for step in range(1, train_batches_per_epoch):
                # Get a batch of images and labels.
                batch_xs, batch_ys = train_generator.next_batch(
                    self.batch_size)

                # And run the training op.
                feed_dict = {x: batch_xs, y: batch_ys}
                sess.run(train_op, feed_dict=feed_dict)

                # Report loss and accuracy on the current batch.
                if step % self.display_step == 0:
                    # loss, acc, s = sess.run([cost, accuracy, merged_summary], feed_dict=feed_dict)
                    loss, acc = sess.run([cost, accuracy],
                                         feed_dict=feed_dict)
                    # writer.add_summary(s, epoch * train_batches_per_epoch + step)
                    print(
                        "Iter {}/{}, training mini-batch loss = {:.5f}, training accuracy = {:.5f}"
                        .format(step * self.batch_size,
                                train_batches_per_epoch * self.batch_size,
                                loss, acc))

            # Rewind the generator before the next epoch.
            train_generator.reset_pointer()

    # NOTE(review): the visible source ends with an opening ''' whose
    # contents are not visible here; that unterminated block is omitted.
# Run configuration: `mode` is 'train' or 'eval'.
mode = 'train'
dataset = 'cifar10'

# Training runs many iterations on larger batches; any other mode is treated
# as evaluation and uses a short run with batches of 100.
num_iterations, batch_size = (56000, 128) if mode == 'train' else (100, 100)

# Every dataset other than 'cifar10' is treated as having 100 classes.
num_classes = 10 if dataset == 'cifar10' else 100

# set hyperparameters of resnet
hps = resnet_model.HParams(
    batch_size=batch_size,
    num_classes=num_classes,
    init_lr=0.1,
    num_residual_units=5,
    use_bottleneck=False,
    weight_decay_rate=0.0001,
    relu_leakiness=0.0,
    optimizer='momentum')

# Dispatch to the entry point that matches the selected mode.
if mode != 'train':
    evaluate(hps, num_iterations, dataset)
else:
    train(hps, num_iterations, dataset)
def train_resnet_mentornet(max_step_run):
    """Trains the mentornet with the student resnet model.

    Builds the student ResNet, weights its per-example loss by the MentorNet
    output ``v``, and runs ``slim.learning.train``. When
    ``FLAGS.trained_mentornet_dir`` is set, the MentorNet variables are
    restored from that checkpoint (or the latest checkpoint in that
    directory) before training starts.

    Args:
        max_step_run: The maximum number of gradient steps.
    """
    if not os.path.exists(FLAGS.train_log_dir):
        os.makedirs(FLAGS.train_log_dir)
    g = tf.Graph()

    with g.as_default():
        with tf.device(tf.train.replica_device_setter(FLAGS.ps_tasks)):
            tf_global_step = tf.train.get_or_create_global_step()

            (images, one_hot_labels, clean_images, clean_one_hot_labels,
             num_samples_per_epoch, num_of_classes) = (
                 cifar_data_provider.my_provide_resnet_data(
                     FLAGS.dataset_name,
                     'train',
                     FLAGS.batch_size,
                     dataset_dir=FLAGS.data_dir))

            hps = resnet_model.HParams(
                batch_size=FLAGS.batch_size,
                num_classes=num_of_classes,
                min_lrn_rate=0.0001,
                lrn_rate=FLAGS.learning_rate,
                num_residual_units=9,
                use_bottleneck=False,
                weight_decay_rate=0.0002,
                relu_leakiness=0.1,
                optimizer='mom')

            images.set_shape([FLAGS.batch_size, 32, 32, 3])
            tf.logging.info('num_of_example=%s', num_samples_per_epoch)

            # Define the model:
            resnet = resnet_model.ResNet(
                hps, images, one_hot_labels, mode='train')
            logits = resnet.build_model()

            # Specify the loss function:
            loss = tf.nn.softmax_cross_entropy_with_logits(
                labels=one_hot_labels, logits=logits)

            dropout_rates = utils.parse_dropout_rate_list(
                FLAGS.example_dropout_rates)
            example_dropout_rates = tf.convert_to_tensor(
                dropout_rates, np.float32, name='example_dropout_rates')

            loss_p_percentile = tf.convert_to_tensor(
                np.array([FLAGS.loss_p_percentile] * 100),
                np.float32,
                name='loss_p_percentile')

            loss = tf.reshape(loss, [-1, 1])

            # Training progress expressed on a 0..100 scale.
            epoch_step = tf.to_int32(
                tf.floor(tf.divide(tf_global_step, max_step_run) * 100))

            zero_labels = tf.zeros([tf.shape(loss)[0], 1], tf.float32)

            v = utils.mentornet(
                epoch_step,
                loss,
                zero_labels,
                loss_p_percentile,
                example_dropout_rates,
                burn_in_epoch=FLAGS.burn_in_epoch,
                fixed_epoch_after_burn_in=FLAGS.fixed_epoch_after_burn_in,
                loss_moving_average_decay=FLAGS.loss_moving_average_decay)

            # tf.stop_gradient returns a new tensor; the result has to be
            # kept, otherwise the call has no effect on the graph.
            v = tf.stop_gradient(v)

            # Split v into clean data & noise data part
            is_clean = tf.reshape(
                tf.reduce_all(
                    tf.equal(one_hot_labels, clean_one_hot_labels), axis=1),
                [-1, 1])
            clean_v = tf.boolean_mask(v, is_clean)
            noise_v = tf.boolean_mask(v, ~is_clean)
            tf.add_to_collection('v', v)
            tf.add_to_collection('v', clean_v)
            tf.add_to_collection('v', noise_v)
            slim.summaries.add_histogram_summary(clean_v, 'clean_v')
            slim.summaries.add_histogram_summary(noise_v, 'noisy_v')

            # Log data utilization
            data_util = utils.summarize_data_utilization(
                v, tf_global_step, FLAGS.batch_size)

            decay_loss = resnet.decay()
            weighted_loss_vector = tf.multiply(loss, v)
            weighted_loss = tf.reduce_mean(weighted_loss_vector)

            slim.summaries.add_scalar_summary(
                tf.reduce_mean(loss), 'mentornet/orig_loss')
            slim.summaries.add_scalar_summary(
                weighted_loss, 'mentornet/weighted_loss')

            # Normalize the decay loss based on v
            weighed_decay_loss = decay_loss * (
                tf.reduce_sum(v) / FLAGS.batch_size)

            weighted_total_loss = weighted_loss + weighed_decay_loss

            slim.summaries.add_scalar_summary(
                weighted_total_loss, 'mentornet/total_loss')
            slim.summaries.add_scalar_summary(
                weighted_total_loss, 'total_loss')
            tf.add_to_collection('total_loss', weighted_total_loss)

            # Piecewise-constant schedule: the base rate is scaled by 1, 0.1,
            # 0.01 and 0.001 across the three step boundaries.
            boundaries = [19531, 25000, 30000]
            values = [
                FLAGS.learning_rate * t for t in [1, 0.1, 0.01, 0.001]
            ]
            lr = tf.train.piecewise_constant(
                tf_global_step, boundaries, values)
            slim.summaries.add_scalar_summary(lr, 'learning_rate')

            # Specify the optimization scheme:
            with tf.control_dependencies([weighted_total_loss, data_util]):
                # Set up training. Variables matching 'mentornet' are
                # excluded, so only the remaining variables are updated.
                trainable_variables = tf.trainable_variables()
                trainable_variables = tf.contrib.framework.filter_variables(
                    trainable_variables, exclude_patterns=['mentornet'])

                grads = tf.gradients(weighted_total_loss, trainable_variables)
                optimizer = tf.train.MomentumOptimizer(lr, momentum=0.9)

                apply_op = optimizer.apply_gradients(
                    zip(grads, trainable_variables),
                    global_step=tf_global_step,
                    name='train_step')

                train_ops = [apply_op] + resnet.extra_train_ops
                train_op = tf.group(*train_ops)

            # Parameter restore setup. `ckpt_model` is initialised up front
            # because it is logged below even when no checkpoint is given.
            ckpt_model = None
            if FLAGS.trained_mentornet_dir is not None:
                ckpt_model = FLAGS.trained_mentornet_dir
                if os.path.isdir(FLAGS.trained_mentornet_dir):
                    ckpt_model = tf.train.latest_checkpoint(ckpt_model)

                # Fix the mentornet parameters
                variables_to_restore = slim.get_variables_to_restore(
                    # TODO(lujiang): mentornet_inputs or mentor_inputs?
                    include=['mentornet', 'mentornet_inputs'])
                iassign_op1, ifeed_dict1 = (
                    tf.contrib.framework.assign_from_checkpoint(
                        ckpt_model, variables_to_restore))

                # Create an initial assignment function.
                def init_assign_fn(sess):
                    tf.logging.info(
                        'Restore using customer initializer %s', '.' * 10)
                    sess.run(iassign_op1, ifeed_dict1)
            else:
                init_assign_fn = None

            tf.logging.info('-' * 20 + 'MentorNet' + '-' * 20)
            tf.logging.info(
                'loaded pretrained mentornet from %s', ckpt_model)
            tf.logging.info('loss_p_percentile=%3f', FLAGS.loss_p_percentile)
            tf.logging.info('burn_in_epoch=%d', FLAGS.burn_in_epoch)
            tf.logging.info('fixed_epoch_after_burn_in=%s',
                            FLAGS.fixed_epoch_after_burn_in)
            tf.logging.info('loss_moving_average_decay=%3f',
                            FLAGS.loss_moving_average_decay)
            tf.logging.info('example_dropout_rates %s', ','.join(
                str(t) for t in dropout_rates))
            tf.logging.info('-' * 20)

            saver = tf.train.Saver(
                max_to_keep=10, keep_checkpoint_every_n_hours=24)

            # Run training.
            slim.learning.train(
                train_op=train_op,
                train_step_fn=resnet_train_step,
                logdir=FLAGS.train_log_dir,
                master=FLAGS.master,
                is_chief=FLAGS.task == 0,
                saver=saver,
                number_of_steps=max_step_run,
                init_fn=init_assign_fn,
                save_summaries_secs=FLAGS.save_summaries_secs,
                save_interval_secs=FLAGS.save_interval_secs)
# Run-level settings.
MODE = 'train'
DATASET = 'cifar100'
LOG_ROOT = '../results/resnet_model'
DEV = '/gpu:0'

# Batch sizes for the train and test configurations.
batch_size_train = 128
batch_size_test = 100

# specify how much memory to use on each GPU
gpu_mem_portion = 0.5
n_core = 16

# Start from an empty default graph.
tf.reset_default_graph()

# construct train model and session
hps_train = resnet_model.HParams(
    batch_size=batch_size_train,
    num_classes=NUM_CLASSES,
    min_lrn_rate=0.0001,
    lrn_rate=0.1,
    mom=0.9,
    clip_norm_base=10.0,
    num_residual_units=5,
    use_bottleneck=True,
    weight_decay_rate=0.0002,
    relu_leakiness=0.1,
    optimizer='YF',
    model_scope='train')

# The model is created under the "train" variable scope on device DEV.
with tf.variable_scope("train"):
    with tf.device(DEV):
        model_train = get_model(
            hps_train, DATASET, TRAIN_DATA_PATH, mode='train')

init_op = tf.global_variables_initializer()
sess = GetTrainingSession(model_train, gpu_mem_portion=gpu_mem_portion)

# run steps