def gen_train_graph(input_app, input_results, trainer):
  """main flow, key graph"""
  #--- if you don't want to use multi gpu, this branch is just for safety (same code as the old single gpu path)
  if FLAGS.num_gpus == 0:
    loss = tower_loss(trainer, input_app, input_results)
  else:
    loss_function = lambda: tower_loss(trainer)
    #here loss is a list of losses
    loss = melt.tower_losses(loss_function, FLAGS.num_gpus)
    print('num tower losses:', len(loss))

  ops = [loss]

  #--------mark train graph finished, every graph built after this must share variables with the train graph
  #melt.reuse_variables()
  trainer.is_training = False

  deal_debug_results = None
  #FLAGS.debug = True
  if FLAGS.debug:
    #ops += [tf.get_collection('scores')[-1], tf.get_collection('encode_feature')[-1], tf.get_collection('encode_state')[-1]]
    #ops += [tf.get_collection('debug_seqeuence')[-1], tf.get_collection('debug_length')[-1]]
    #print('-----', tf.get_collection('sequence'))
    #ops += [tf.get_collection('fixed_text')[-1], tf.get_collection('eval_text')[-1], tf.get_collection('fixed_input_text')[-1],
    #        tf.get_collection('sequence')[1], tf.get_collection('sequence_length')[1],
    #        tf.get_collection('outputs')[1]]

    def _deal_debug_results(results):
      print(results)
      print([x.shape for x in results])

    deal_debug_results = _deal_debug_results

  return ops, deal_debug_results
def gen_train_graph(input_app, input_results, trainer):
  """main flow, key graph"""
  #--- if you don't want to use multi gpu, this branch is just for safety (same code as the old single gpu path)
  if FLAGS.num_gpus == 0:
    loss = tower_loss(trainer, input_app, input_results)
  else:
    loss_function = lambda: tower_loss(trainer)
    #here loss is a list of losses
    loss = melt.tower_losses(loss_function, FLAGS.num_gpus)
    print('num tower losses:', len(loss))

  ops = [loss]

  #--------mark train graph finished, every graph built after this must share variables with the train graph
  #melt.reuse_variables()
  trainer.is_training = False

  deal_debug_results = None
  if FLAGS.debug:
    ops += [tf.get_collection('scores')[-1]]

    def _deal_debug_results(results):
      print(results)

    deal_debug_results = _deal_debug_results

  return ops, deal_debug_results
def gen_train_graph(input_app, input_results, trainer):
  """main flow, key graph"""
  #--- if you don't want to use multi gpu, this branch is just for safety (same code as the old single gpu path)
  if FLAGS.num_gpus == 0:
    loss = tower_loss(trainer, input_app, input_results)
  else:
    loss_function = lambda: tower_loss(trainer)
    #here loss is a list of losses
    loss = melt.tower_losses(loss_function, FLAGS.num_gpus)
    print('num tower losses:', len(loss))

  ops = [loss]

  #--------mark train graph finished, every graph built after this must share variables with the train graph
  #melt.reuse_variables()
  trainer.is_training = False

  deal_debug_results = None
  if FLAGS.debug:
    ops += [tf.get_collection('scores')[-1]]

    def _deal_debug_results(results):
      _, scores = results
      print('scores', scores)

    # if not FLAGS.feed_dict:
    #   ops += [text, text_str, neg_text, neg_text_str]
    # def _deal_debug_results(results):
    #   if FLAGS.feed_dict:
    #     _, scores = results
    #   else:
    #     _, scores, text, text_str, neg_text, neg_text_str = results
    #   print(scores)
    #   if not FLAGS.feed_dict:
    #     print(text_str[0], text[0], text2ids.ids2text(text[0]))
    #     print(neg_text_str[0][0][0], neg_text[0][0], text2ids.ids2text(neg_text[0][0]))
    #     # global step
    #     # if step == 42:
    #     #   print(neg_text_str[8][3][0], neg_text[8][3], text2ids.ids2text(neg_text[8][3]))
    #     # step += 1

    deal_debug_results = _deal_debug_results

    ###----------show how to debug
    #debug_ops = [text, neg_text, trainer.emb, trainer.scores]
    #debug_ops += trainer.gradients
    #print(trainer.gradients)
    #ops += debug_ops
    #def _deal_debug_results(results):
    #  for result in results[-len(debug_ops):]:
    #    #print(result.shape)
    #    print(result)
    #deal_debug_results = _deal_debug_results

  return ops, deal_debug_results
def gen_train_graph(input_app, input_results, trainer):
  """main flow, key graph"""
  #--- if you don't want to use multi gpu, this branch is just for safety (same code as the old single gpu path)
  if FLAGS.num_gpus == 0:
    loss = tower_loss(trainer, input_app, input_results)
  else:
    loss_function = lambda: tower_loss(trainer)
    #here loss is a list of losses
    loss = melt.tower_losses(loss_function, FLAGS.num_gpus)
    print('num tower losses:', len(loss))

  ops = [loss]

  #--------mark train graph finished, every graph built after this must share variables with the train graph
  #melt.reuse_variables()
  trainer.is_training = False

  deal_debug_results = None
  #FLAGS.debug = True
  if FLAGS.debug:
    #ops += [tf.get_collection('scores')[-1], tf.get_collection('encode_feature')[-1], tf.get_collection('encode_state')[-1]]
    #ops += [tf.get_collection('debug_seqeuence')[-1], tf.get_collection('debug_length')[-1]]
    ops += [tf.get_collection('logits')[-1]]

    def _deal_debug_results(results):
      print(results)
      #_, seq, len = results
      #for item in seq[0]:
      #  print(item)
      #print('len:', len[0])
      #_, scores, encode_feature, encode_state = results
      #print('scores', scores)
      #print('encode_feature', encode_feature)
      #print('encode_state', encode_state)

    deal_debug_results = _deal_debug_results

  return ops, deal_debug_results
def gen_train_graph(input_app, input_results, trainer):
  """main flow, key graph"""
  #--- if you don't want to use multi gpu, the else branch is just for safety (same code as the old single gpu path)
  if FLAGS.num_gpus > 1 and FLAGS.use_tower_loss:
    loss_function = lambda: tower_loss(trainer)
    #here loss is a list of losses
    loss = melt.tower_losses(loss_function, FLAGS.num_gpus)
  else:
    loss = tower_loss(trainer, input_app, input_results)

  ops = [loss]

  deal_debug_results = None
  #FLAGS.debug = True
  if FLAGS.debug:
    #ops += [tf.get_collection('scores')[-1], tf.get_collection('encode_feature')[-1], tf.get_collection('encode_state')[-1]]
    #ops += [tf.get_collection('debug_seqeuence')[-1], tf.get_collection('debug_length')[-1]]
    ops += [tf.get_collection('logits')[-1]]

    def _deal_debug_results(results):
      print(results)
      #_, seq, len = results
      #for item in seq[0]:
      #  print(item)
      #print('len:', len[0])
      #_, scores, encode_feature, encode_state = results
      #print('scores', scores)
      #print('encode_feature', encode_feature)
      #print('encode_state', encode_state)

    deal_debug_results = _deal_debug_results

  return ops, deal_debug_results
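# --- Not part of the original file: a minimal sketch of how the (ops, deal_debug_results)
# pair returned by gen_train_graph above is typically consumed. Run the ops in a session,
# then hand the fetched values to the debug callback when it is set. The session handle and
# this wrapper's name are assumptions for illustration only; the real training loop lives
# inside melt.apps.train_flow.
def _example_consume_train_graph(sess, input_app, input_results, trainer):
  ops, deal_debug_results = gen_train_graph(input_app, input_results, trainer)
  results = sess.run(ops)           # results[0] is the loss (or the list of tower losses)
  if deal_debug_results is not None:
    deal_debug_results(results)     # inspect any extra debug tensors appended to ops
  return results[0]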
def main(_):
  num_train_examples = 45000
  melt.apps.train.init()

  batch_size = melt.batch_size()
  num_gpus = melt.num_gpus()
  # the total batch size is unchanged, but FLAGS.batch_size will have been changed to batch_size / num_gpus
  batch_size_per_gpu = FLAGS.batch_size
  #print('--------------batch_size, FLAGS.batch_size, num_steps_per_epoch', batch_size, FLAGS.batch_size, num_train_examples // batch_size)

  global_scope = FLAGS.algo
  with tf.variable_scope(global_scope) as global_scope:
    data_format = 'channels_first'
    num_layers = 44
    batch_norm_decay = 0.997
    batch_norm_epsilon = 1e-05
    data_dir = './mount/data/cifar10/'

    with tf.variable_scope('main') as scope:
      model = cifar10_model.ResNetCifar10(
          num_layers,
          batch_norm_decay=batch_norm_decay,
          batch_norm_epsilon=batch_norm_epsilon,
          is_training=True,
          data_format=data_format)

      dataset = cifar10.Cifar10DataSet(data_dir, subset='train', use_distortion=True)

      ## This is wrong: it makes every gpu read the same data, so convergence is slower (though it happens to give a better test result)
      #_, image_batch, label_batch = dataset.make_batch(FLAGS.batch_size)

      def loss_function():
        # with per-gpu batches, 2 gpus give results similar to 1 gpu: a slightly better valid
        # result and a slightly worse test result, which might be due to randomness
        _, image_batch, label_batch = dataset.make_batch(batch_size_per_gpu)
        return tower_loss(model, image_batch, label_batch)

      #loss_function = lambda: tower_loss(model, image_batch, label_batch)
      loss = melt.tower_losses(loss_function, num_gpus)

      pred = model.predict()
      pred = pred['classes']
      label_batch = dataset.label_batch
      acc = tf.reduce_mean(tf.to_float(tf.equal(pred, label_batch)))

      #tf.summary.image('train/image', dataset.image_batch)
      # # Compute confusion matrix
      # matrix = tf.confusion_matrix(label_batch, pred, num_classes=10)
      # # Get an image tensor for summary usage
      # image_tensor = draw_confusion_matrix(matrix)
      # tf.summary.image('train/confusion_matrix', image_tensor)

      scope.reuse_variables()

      ops = [loss, acc]

      # TODO multiple gpu validation and inference
      validator = cifar10_model.ResNetCifar10(
          num_layers,
          batch_norm_decay=batch_norm_decay,
          batch_norm_epsilon=batch_norm_epsilon,
          is_training=False,
          data_format=data_format)

      valid_dataset = cifar10.Cifar10DataSet(data_dir, subset='valid', use_distortion=False)
      valid_iterator = valid_dataset.make_batch(batch_size)
      valid_id_batch, valid_image_batch, valid_label_batch = valid_iterator.get_next()
      valid_loss = tower_loss(validator, valid_image_batch, valid_label_batch)
      valid_pred = validator.predict()
      valid_pred = valid_pred['classes']

      ## seems not to work in non-repeat mode..
      #tf.summary.image('valid/image', valid_image_batch)
      ## Compute confusion matrix
      #matrix = tf.confusion_matrix(valid_label_batch, valid_pred, num_classes=10)
      ## Get an image tensor for summary usage
      #image_tensor = draw_confusion_matrix(matrix)
      #tf.summary.image('valid/confusion_matrix', image_tensor)

      #loss_function = lambda: tower_loss(validator, val_image_batch, val_label_batch)
      #val_loss = melt.tower_losses(loss_function, FLAGS.num_gpus, is_training=False)
      #eval_ops = [val_loss]

      metric_eval_fn = lambda model_path=None: \
          evaluator.evaluate([valid_id_batch, valid_loss, valid_pred, valid_label_batch, valid_image_batch],
                             valid_iterator,
                             model_path=model_path)

      predictor = cifar10_model.ResNetCifar10(
          num_layers,
          batch_norm_decay=batch_norm_decay,
          batch_norm_epsilon=batch_norm_epsilon,
          is_training=False,
          data_format=data_format)
      predictor.init_predict()

      test_dataset = cifar10.Cifar10DataSet(data_dir, subset='test', use_distortion=False)
      test_iterator = test_dataset.make_batch(batch_size)
      test_id_batch, test_image_batch, test_label_batch = test_iterator.get_next()
      test_pred = predictor.predict(test_image_batch, input_data_format='channels_last')
      test_pred = test_pred['classes']

      inference_fn = lambda model_path=None: \
          evaluator.inference([test_id_batch, test_pred],
                              test_iterator,
                              model_path=model_path)

  global eval_names
  names = ['loss', 'acc']

  melt.apps.train_flow(ops,
                       names=names,
                       metric_eval_fn=metric_eval_fn,
                       inference_fn=inference_fn,
                       model_dir=FLAGS.model_dir,
                       num_steps_per_epoch=num_train_examples // batch_size)
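# --- Not part of the original file: tower_loss(model, image_batch, label_batch) is called in
# main() above but not defined in this section. A minimal sketch of what such a helper could
# look like for the cifar10 model, assuming the model exposes a forward_pass method that
# returns per-class logits (the method name and signature are assumptions; the real helper in
# this repo may differ).
def _example_tower_loss(model, image_batch, label_batch):
  logits = model.forward_pass(image_batch)  # assumed API of cifar10_model.ResNetCifar10
  loss = tf.reduce_mean(
      tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=label_batch))
  return loss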