def _tower_fn(is_training, weight_decay, feature, label, tower_losses,
              tower_gradvars, tower_preds, is_cpu):
    """Construct one ResNet tower and record its outputs in the given lists.

    Nothing is returned; the three list arguments are extended in place.

    Args:
        is_training: true if this is the training graph.
        weight_decay: float coefficient applied to the summed L2 penalty.
        feature: input Tensor, handed to the model as `channels_last` data.
        label: label Tensor for the sparse softmax cross-entropy.
        tower_losses: list that receives this tower's total loss.
        tower_gradvars: list that receives this tower's zipped
            (gradient, variable) pairs.
        tower_preds: list that receives this tower's prediction dict.
        is_cpu: true if the tower is built on CPU; selects the data format
            the model uses internally.
    """
    if is_cpu:
        data_format = 'channels_last'
    else:
        data_format = 'channels_first'

    resnet = cifar10_model.ResNetCifar10(
        FLAGS.num_layers, is_training=is_training, data_format=data_format)
    logits = resnet.forward_pass(feature, input_data_format='channels_last')

    predictions = {
        'classes': tf.argmax(input=logits, axis=1),
        'probabilities': tf.nn.softmax(logits),
    }
    tower_preds.append(predictions)

    data_loss = tf.losses.sparse_softmax_cross_entropy(
        logits=logits, labels=label)
    data_loss = tf.reduce_mean(data_loss)

    # The L2 penalty covers every trainable variable in the graph.
    params = tf.trainable_variables()
    l2_penalty = tf.add_n([tf.nn.l2_loss(param) for param in params])
    total_loss = data_loss + weight_decay * l2_penalty
    tower_losses.append(total_loss)

    gradients = tf.gradients(total_loss, params)
    tower_gradvars.append(zip(gradients, params))
def _tower_fn(is_training, weight_decay, feature, label, data_format,
              num_layers, batch_norm_decay, batch_norm_epsilon, optimizer,
              gradient_scale):
    """Build a ResNet tower and return its loss, scaled gradients and predictions.

    Args:
        is_training: true if this is the training graph.
        weight_decay: float coefficient applied to the summed L2 penalty.
        feature: input Tensor, handed to the model as `channels_last` data.
        label: label Tensor for the sparse softmax cross-entropy.
        data_format: channels_last (NHWC) or channels_first (NCHW).
        num_layers: number of layers, an int.
        batch_norm_decay: decay for batch normalization, a float.
        batch_norm_epsilon: epsilon for batch normalization, a float.
        optimizer: object whose `compute_gradients(loss, var_list)` yields
            (gradient, variable) pairs.
        gradient_scale: factor every gradient is multiplied by.

    Returns:
        A 4-tuple `(tower_loss, gradvars, tower_pred, compgrad_tower)`:
        the regularised loss, a `zip` of (scaled gradient, variable) pairs,
        the prediction dict, and a one-element list holding the unscaled
        result of `optimizer.compute_gradients`.
    """
    resnet = cifar10_model.ResNetCifar10(
        num_layers,
        batch_norm_decay=batch_norm_decay,
        batch_norm_epsilon=batch_norm_epsilon,
        is_training=is_training,
        data_format=data_format)
    logits = resnet.forward_pass(feature, input_data_format='channels_last')
    tower_pred = {
        'classes': tf.argmax(input=logits, axis=1),
        'probabilities': tf.nn.softmax(logits),
    }

    data_loss = tf.losses.sparse_softmax_cross_entropy(
        logits=logits, labels=label)
    data_loss = tf.reduce_mean(data_loss)

    # The L2 penalty covers every trainable variable in the graph.
    model_params = tf.trainable_variables()
    l2_penalty = tf.add_n([tf.nn.l2_loss(param) for param in model_params])
    tower_loss = data_loss + weight_decay * l2_penalty

    grads_and_vars = optimizer.compute_gradients(tower_loss, model_params)
    compgrad_tower = [grads_and_vars]

    # Summaries and shape logging for the unscaled gradients.
    for raw_grad, var in grads_and_vars:
        tf.summary.histogram(var.name + '_ORG', raw_grad)
        tf.logging.info('@sahiltyagi4 shape of new_grads ' + str(raw_grad.shape))
        tf.logging.info('@sahiltyagi4 shape of vars ' + str(var.shape))

    # Scale every gradient, then summarise the scaled values as well.
    new_grads = [raw_grad * gradient_scale for raw_grad, _ in grads_and_vars]
    for scaled_grad, param in zip(new_grads, model_params):
        tf.summary.histogram(param.name + '_MODIFIED', scaled_grad)

    return tower_loss, zip(new_grads, model_params), tower_pred, compgrad_tower
def _tower_fn(
    is_training,
    weight_decay,
    feature,
    label,
    data_format,
    num_layers,
    batch_norm_decay,
    batch_norm_epsilon,
):
    """Build a ResNet tower and return its loss, gradients and predictions.

    Args:
        is_training: true if this is the training graph.
        weight_decay: float coefficient applied to the summed L2 penalty.
        feature: input Tensor, handed to the model as `channels_last` data.
        label: label Tensor for the sparse softmax cross-entropy.
        data_format: channels_last (NHWC) or channels_first (NCHW).
        num_layers: number of layers, an int.
        batch_norm_decay: decay for batch normalization, a float.
        batch_norm_epsilon: epsilon for batch normalization, a float.

    Returns:
        A 3-tuple `(tower_loss, gradvars, tower_pred)`: the regularised
        loss, a `zip` of (gradient, variable) pairs over all trainable
        variables, and the prediction dict.
    """
    resnet = cifar10_model.ResNetCifar10(
        num_layers,
        batch_norm_decay=batch_norm_decay,
        batch_norm_epsilon=batch_norm_epsilon,
        is_training=is_training,
        data_format=data_format,
    )
    logits = resnet.forward_pass(feature, input_data_format="channels_last")

    predicted_classes = tf.argmax(input=logits, axis=1)
    class_probabilities = tf.nn.softmax(logits)
    tower_pred = {
        "classes": predicted_classes,
        "probabilities": class_probabilities,
    }

    data_loss = tf.losses.sparse_softmax_cross_entropy(
        logits=logits, labels=label
    )
    data_loss = tf.reduce_mean(data_loss)

    # The L2 penalty covers every trainable variable in the graph.
    params = tf.trainable_variables()
    l2_penalty = tf.add_n([tf.nn.l2_loss(param) for param in params])
    total_loss = data_loss + weight_decay * l2_penalty

    gradients = tf.gradients(total_loss, params)
    return total_loss, zip(gradients, params), tower_pred
def __init__(self, dim):
    """Build a ResNet-CIFAR10 graph on synthetic inputs and open a session.

    A random image batch and an all-ones label batch are pushed through
    `cifar10_model.ResNetCifar10`. The logits, predictions, regularised
    loss, trainable variables and flattened gradient are stored as
    attributes, together with a zero-filled NumPy weight buffer, a
    global-variables initializer op and a new `tf.Session`.

    Args:
        dim: stored unchanged on `self.dim`; not otherwise used here.
    """
    self.dim = dim

    # Hyper-parameter values from cifar10_main.py.
    if tf.test.is_gpu_available():
        data_format = 'channels_first'
    else:
        data_format = 'channels_last'
    is_training = True
    # Must be a plain float: writing `2e-4,` (trailing comma) would make
    # this a one-element tuple and change the shape of the loss below.
    weight_decay = 2e-4
    num_layers = 8
    batch_size = 32
    batch_norm_decay = 0.997
    batch_norm_epsilon = 1e-5

    # Synthetic inputs: uniform-random images and labels that are all 1.
    image_batch = tf.random_uniform((batch_size, 32, 32, 3))
    label_batch = tf.ones((batch_size,), dtype=tf.int32)

    self.model = cifar10_model.ResNetCifar10(
        num_layers,
        batch_norm_decay=batch_norm_decay,
        batch_norm_epsilon=batch_norm_epsilon,
        is_training=is_training,
        data_format=data_format)
    self.logits = self.model.forward_pass(
        image_batch, input_data_format='channels_last')

    # Created only for its effect on the parameter count:
    # make size of parameters multiple of 8 (75360).
    dummy_var = tf.Variable(tf.ones((5,)))  # noqa: F841

    self.pred = {
        'classes': tf.argmax(input=self.logits, axis=1),
        'probabilities': tf.nn.softmax(self.logits),
    }

    self.loss = tf.losses.sparse_softmax_cross_entropy(
        logits=self.logits, labels=label_batch)
    self.model_params = tf.trainable_variables()
    self.loss += weight_decay * tf.add_n(
        [tf.nn.l2_loss(v) for v in self.model_params])

    # Flatten each per-variable gradient and join them into one 1-D tensor.
    grads = tf.gradients(self.loss, self.model_params)
    self.grad = tf.concat([tf.reshape(g, [-1]) for g in grads], axis=0)
    self.weights = np.zeros(self.grad.shape, dtype=np.float32)

    # TODO: make this into an op that accepts actual values
    self.set_weights_op = tf.global_variables_initializer()

    # TODO(y): pad things so that it's divisible by num_ps?
    self.sess = tf.Session()
def main(_):
    """Build the CIFAR-10 ResNet train/valid/test graphs and start training.

    Creates a training model, a validation model and a prediction model,
    wires up the evaluation and inference callbacks, and hands everything
    to `melt.apps.train_flow`.

    Args:
        _: ignored.
    """
    # NOTE(review): this function was recovered from whitespace-flattened
    # text. The nesting of the two `with tf.variable_scope(...)` blocks, and
    # in particular which statements sit inside the 'main' scope, is a best
    # guess - confirm against the original file.
    num_train_examples = 45000
    melt.apps.train.init()

    batch_size = melt.batch_size()
    num_gpus = melt.num_gpus()
    # Original author's note: batch size not changed but FLAGS.batch_size
    # will change to batch_size / num_gpus.
    batch_size_per_gpu = FLAGS.batch_size
    #print('--------------batch_size, FLAGS.batch_size, num_steps_per_epoch', batch_size, FLAGS.batch_size, num_train_examples // batch_size)

    global_scope = FLAGS.algo
    with tf.variable_scope(global_scope) as global_scope:
        data_format = 'channels_first'
        num_layers = 44
        batch_norm_decay = 0.997
        batch_norm_epsilon = 1e-05
        data_dir = './mount/data/cifar10/'
        with tf.variable_scope('main') as scope:
            # Training model.
            model = cifar10_model.ResNetCifar10(
                num_layers,
                batch_norm_decay=batch_norm_decay,
                batch_norm_epsilon=batch_norm_epsilon,
                is_training=True,
                data_format=data_format)

            dataset = cifar10.Cifar10DataSet(data_dir, subset='train', use_distortion=True)

            ## Original author's note: this is wrong, it will cause all GPUs
            ## to read the same data, so slow convergence but better test result.
            #_, image_batch, label_batch = dataset.make_batch(FLAGS.batch_size)

            def loss_function():
                # Original author's note: doing this, 2 GPUs give a result
                # similar to 1 GPU; a bit better valid result and a bit worse
                # test result, which might be due to randomness.
                _, image_batch, label_batch = dataset.make_batch(
                    batch_size_per_gpu)
                return tower_loss(model, image_batch, label_batch)

            #loss_function = lambda: tower_loss(model, image_batch, label_batch)
            loss = melt.tower_losses(loss_function, num_gpus)
            pred = model.predict()
            pred = pred['classes']
            label_batch = dataset.label_batch
            # Fraction of predictions equal to the labels.
            acc = tf.reduce_mean(tf.to_float(tf.equal(pred, label_batch)))

            #tf.summary.image('train/image', dataset.image_batch)
            # # Compute confusion matrix
            # matrix = tf.confusion_matrix(label_batch, pred, num_classes=10)
            # # Get a image tensor for summary usage
            # image_tensor = draw_confusion_matrix(matrix)
            # tf.summary.image('train/confusion_matrix', image_tensor)

            # Switch the scope to reuse mode before the validation and
            # prediction models are built below.
            scope.reuse_variables()
            ops = [loss, acc]

            # TODO multiple gpu validation and inference
            # Validation model (is_training=False).
            validator = cifar10_model.ResNetCifar10(
                num_layers,
                batch_norm_decay=batch_norm_decay,
                batch_norm_epsilon=batch_norm_epsilon,
                is_training=False,
                data_format=data_format)

            valid_dataset = cifar10.Cifar10DataSet(data_dir, subset='valid', use_distortion=False)
            valid_iterator = valid_dataset.make_batch(batch_size)
            valid_id_batch, valid_image_batch, valid_label_batch = valid_iterator.get_next(
            )

            valid_loss = tower_loss(validator, valid_image_batch, valid_label_batch)
            valid_pred = validator.predict()
            valid_pred = valid_pred['classes']

            ## Original author's note: seems not to work with non-repeat mode.
            #tf.summary.image('valid/image', valid_image_batch)
            ## Compute confusion matrix
            #matrix = tf.confusion_matrix(valid_label_batch, valid_pred, num_classes=10)
            ## Get a image tensor for summary usage
            #image_tensor = draw_confusion_matrix(matrix)
            #tf.summary.image('valid/confusion_matrix', image_tensor)

            #loss_function = lambda: tower_loss(validator, val_image_batch, val_label_batch)
            #val_loss = melt.tower_losses(loss_function, FLAGS.num_gpus, is_training=False)
            #eval_ops = [val_loss]

            # Callback that evaluates the validation tensors, optionally for
            # a given model path.
            metric_eval_fn = lambda model_path=None: \
                evaluator.evaluate([valid_id_batch, valid_loss, valid_pred, valid_label_batch, valid_image_batch],
                                   valid_iterator,
                                   model_path=model_path)

            # Prediction model for the test subset (is_training=False).
            predictor = cifar10_model.ResNetCifar10(
                num_layers,
                batch_norm_decay=batch_norm_decay,
                batch_norm_epsilon=batch_norm_epsilon,
                is_training=False,
                data_format=data_format)

            predictor.init_predict()

            test_dataset = cifar10.Cifar10DataSet(data_dir, subset='test', use_distortion=False)
            test_iterator = test_dataset.make_batch(batch_size)
            test_id_batch, test_image_batch, test_label_batch = test_iterator.get_next(
            )

            test_pred = predictor.predict(test_image_batch, input_data_format='channels_last')
            test_pred = test_pred['classes']

            # Callback that runs inference over the test tensors.
            inference_fn = lambda model_path=None: \
                evaluator.inference([test_id_batch, test_pred],
                                    test_iterator,
                                    model_path=model_path)

            # `eval_names` is declared global but is neither read nor
            # assigned in this function.
            global eval_names
            names = ['loss', 'acc']

        # NOTE(review): placed outside the 'main' scope on the assumption
        # that the training flow must not run under reuse mode - confirm.
        melt.apps.train_flow(ops,
                             names=names,
                             metric_eval_fn=metric_eval_fn,
                             inference_fn=inference_fn,
                             model_dir=FLAGS.model_dir,
                             num_steps_per_epoch=num_train_examples // batch_size)
def _tower_fn(is_training, weight_decay, feature, label, data_format,
              num_layers, batch_norm_decay, batch_norm_epsilon, optimizer,
              gradient_scale, w_name):
    """Build computation tower (Resnet).

    Args:
        is_training: true if is training graph.
        weight_decay: weight regularization strength, a float.
        feature: a Tensor.
        label: a Tensor.
        data_format: accepted for interface compatibility; the value is
            overridden and the model is always built with 'channels_first'.
        num_layers: number of layers, an int.
        batch_norm_decay: decay for batch normalization, a float.
        batch_norm_epsilon: epsilon for batch normalization, a float.
        optimizer: object whose `compute_gradients(loss, var_list)` yields
            (gradient, variable) pairs.
        gradient_scale: factor every gradient is multiplied by.
        w_name: not used by this function.

    Returns:
        A 4-tuple `(tower_loss, gradvars, tower_pred, compgrad_tower)`:
        the regularised loss, a `zip` of (scaled gradient, variable) pairs,
        the prediction dict, and a one-element list holding the unscaled
        result of `optimizer.compute_gradients`.
    """
    # The TF_CONFIG environment variable used to be parsed here into locals
    # that nothing read. That made the function raise whenever TF_CONFIG was
    # unset or malformed, so the dead parsing has been removed.

    # NOTE(review): this deliberately-looking override ignores the caller's
    # `data_format` argument - confirm that forcing NCHW is still intended.
    data_format = 'channels_first'

    model = cifar10_model.ResNetCifar10(
        num_layers,
        batch_norm_decay=batch_norm_decay,
        batch_norm_epsilon=batch_norm_epsilon,
        is_training=is_training,
        data_format=data_format)
    logits = model.forward_pass(feature, input_data_format='channels_last')
    tower_pred = {
        'classes': tf.argmax(input=logits, axis=1),
        'probabilities': tf.nn.softmax(logits),
    }

    tower_loss = tf.losses.sparse_softmax_cross_entropy(
        logits=logits, labels=label)
    tower_loss = tf.reduce_mean(tower_loss)

    # The L2 penalty covers every trainable variable in the graph.
    model_params = tf.trainable_variables()
    tower_loss += weight_decay * tf.add_n(
        [tf.nn.l2_loss(v) for v in model_params])

    # Unscaled (gradient, variable) pairs, returned to the caller wrapped in
    # a one-element list.
    grads_and_vars = optimizer.compute_gradients(tower_loss, model_params)
    compgrad_tower = [grads_and_vars]
    for grad, var in grads_and_vars:
        tf.summary.histogram(var.name + '_ORG', grad)

    # Scale every gradient, then summarise the scaled values as well.
    new_grads = [grad * gradient_scale for grad, _ in grads_and_vars]
    for new_grad, var in zip(new_grads, model_params):
        tf.summary.histogram(var.name + '_MODIFIED', new_grad)

    return tower_loss, zip(new_grads, model_params), tower_pred, compgrad_tower
def main(_):
    """Build the multi-tower CIFAR-10 ResNet graphs and start training.

    Splits each train/valid/test batch with `melt.split_batch`, builds the
    per-tower ops with `melt.tower`, wires up the evaluation and inference
    callbacks, and hands everything to `melt.apps.train_flow`.

    Args:
        _: ignored.
    """
    # NOTE(review): this function was recovered from whitespace-flattened
    # text. The nesting of the two `with tf.variable_scope(...)` blocks, and
    # in particular which statements sit inside the 'main' scope, is a best
    # guess - confirm against the original file.
    num_train_examples = 45000
    melt.apps.init()

    batch_size = melt.batch_size()
    num_gpus = melt.num_gpus()
    # Original author's note: batch size not changed but FLAGS.batch_size
    # will change to batch_size / num_gpus.
    # `batch_size_per_gpu` is not used again in this function.
    batch_size_per_gpu = FLAGS.batch_size
    #print('--------------batch_size, FLAGS.batch_size, num_steps_per_epoch', batch_size, FLAGS.batch_size, num_train_examples // batch_size)

    global_scope = FLAGS.algo
    with tf.variable_scope(global_scope) as global_scope:
        data_format = 'channels_first'
        num_layers = 44
        batch_norm_decay = 0.997
        batch_norm_epsilon = 1e-05
        data_dir = './mount/data/cifar10/'
        with tf.variable_scope('main') as scope:
            # Training model.
            model = cifar10_model.ResNetCifar10(
                num_layers,
                batch_norm_decay=batch_norm_decay,
                batch_norm_epsilon=batch_norm_epsilon,
                training=True,
                data_format=data_format)

            dataset = cifar10.Cifar10DataSet(data_dir, subset='train', use_distortion=True)

            # Original author's note: this is faster than the above method.
            iterator = dataset.make_batch(batch_size)
            batch = iterator.get_next()

            ## Original author's note: the variant below is also OK.
            # x = {'id': batch[0], 'image': batch[1]}
            # y = batch[2]
            # batch = (x, y)
            # x, y = melt.split_batch(batch, batch_size, num_gpus)
            # image_batches, label_batches = [item['image'] for item in x], y

            _, image_batches, label_batches = melt.split_batch(
                batch, batch_size, num_gpus)

            def loss_function(i):
                # Loss for the i-th split of the batch.
                return tower_loss(model, image_batches[i], label_batches[i])

            # Accuracy below is computed against the last split's labels.
            label_batch = label_batches[-1]

            #loss_function = lambda: tower_loss(model, image_batch, label_batch)
            loss = melt.tower(loss_function, num_gpus)
            pred = model.predict()
            pred = pred['classes']
            #label_batch = dataset.label_batch
            # Fraction of predictions equal to the labels.
            acc = tf.reduce_mean(tf.to_float(tf.equal(pred, label_batch)))

            #tf.summary.image('train/image', dataset.image_batch)
            # # Compute confusion matrix
            # matrix = tf.confusion_matrix(label_batch, pred, num_classes=10)
            # # Get a image tensor for summary usage
            # image_tensor = draw_confusion_matrix(matrix)
            # tf.summary.image('train/confusion_matrix', image_tensor)

            # Switch the scope to reuse mode before the validation and
            # prediction models are built below.
            scope.reuse_variables()
            ops = [loss, acc]

            # Validation model (training=False).
            validator = cifar10_model.ResNetCifar10(
                num_layers,
                batch_norm_decay=batch_norm_decay,
                batch_norm_epsilon=batch_norm_epsilon,
                training=False,
                data_format=data_format)

            valid_dataset = cifar10.Cifar10DataSet(data_dir, subset='valid', use_distortion=False)
            valid_iterator = valid_dataset.make_batch(batch_size)
            valid_batch = valid_iterator.get_next()
            valid_id_batches, valid_image_batches, valid_label_batches = melt.split_batch(
                valid_batch, batch_size, num_gpus, training=False)

            def valid_loss_fn(i):
                # Returns (ids, loss, predicted classes, labels, images) for
                # the i-th split of the validation batch.
                valid_loss = tower_loss(validator, valid_image_batches[i], valid_label_batches[i])
                valid_pred = validator.predict()
                return valid_id_batches[i], valid_loss, valid_pred[
                    'classes'], valid_label_batches[i], valid_image_batches[i]

            # Note: queried through the training `dataset` object with
            # subset='valid', not through `valid_dataset`.
            num_valid_examples = dataset.num_examples_per_epoch(subset='valid')
            valid_ops = melt.tower(valid_loss_fn, num_gpus, training=False)

            ## Original author's note: seems not to work with non-repeat mode.
            #tf.summary.image('valid/image', valid_image_batch)
            ## Compute confusion matrix
            #matrix = tf.confusion_matrix(valid_label_batch, valid_pred, num_classes=10)
            ## Get a image tensor for summary usage
            #image_tensor = draw_confusion_matrix(matrix)
            #tf.summary.image('valid/confusion_matrix', image_tensor)

            #loss_function = lambda: tower_loss(validator, val_image_batch, val_label_batch)
            #val_loss = melt.tower_losses(loss_function, FLAGS.num_gpus, training=False)
            #eval_ops = [val_loss]

            # Evaluation callback; `-(-a // b)` is ceiling division, so the
            # step count covers a final partial batch.
            metric_eval_fn = lambda model_path=None: \
                evaluator.evaluate(valid_ops,
                                   valid_iterator,
                                   num_steps=-(-num_valid_examples // batch_size),
                                   num_examples=num_valid_examples,
                                   model_path=model_path,
                                   num_gpus=num_gpus)

            # Prediction model for the test subset (training=False).
            predictor = cifar10_model.ResNetCifar10(
                num_layers,
                batch_norm_decay=batch_norm_decay,
                batch_norm_epsilon=batch_norm_epsilon,
                training=False,
                data_format=data_format)

            predictor.init_predict()

            test_dataset = cifar10.Cifar10DataSet(data_dir, subset='test', use_distortion=False)
            test_iterator = test_dataset.make_batch(batch_size)
            test_batch = test_iterator.get_next()
            test_id_batches, test_image_batches, test_label_batches = melt.split_batch(
                test_batch, batch_size, num_gpus, training=False)

            def test_fn(i):
                # Returns (ids, predicted classes) for the i-th split of the
                # test batch.
                test_pred = predictor.predict(test_image_batches[i])
                test_pred = test_pred['classes']
                return test_id_batches[i], test_pred

            # Note: queried through the training `dataset` object with
            # subset='test', not through `test_dataset`.
            num_test_examples = dataset.num_examples_per_epoch(subset='test')
            test_ops = melt.tower(test_fn, num_gpus, training=False)

            # Inference callback; step count uses the same ceiling division.
            inference_fn = lambda model_path=None: \
                evaluator.inference(test_ops,
                                    test_iterator,
                                    num_steps=-(-num_test_examples // batch_size),
                                    num_examples=num_test_examples,
                                    model_path=model_path,
                                    num_gpus=num_gpus)

            # `eval_names` is declared global but is neither read nor
            # assigned in this function.
            global eval_names
            names = ['loss', 'acc']

        # NOTE(review): placed outside the 'main' scope on the assumption
        # that the training flow must not run under reuse mode - confirm.
        melt.apps.train_flow(ops,
                             names=names,
                             metric_eval_fn=metric_eval_fn,
                             inference_fn=inference_fn,
                             model_dir=FLAGS.model_dir,
                             num_steps_per_epoch=num_train_examples // batch_size)