def main(_):
    """Evaluate per-epoch MNIST checkpoints, emitting an accuracy summary per test batch."""
    with tf.Graph().as_default():
        # Hard-coded MNIST split sizes: 60k train (drives checkpoint-step math), 10k test.
        train_batches_per_epoch = int(60000 / cfg.batch_size)
        test_batches = int(10000 / cfg.batch_size)

        images, labels = create_inputs(is_train=False)
        logits = net.build_arch(images, is_train=False)
        accuracy = net.test_accuracy(logits, labels)

        saver = tf.train.Saver()
        summary_op = tf.summary.merge([tf.summary.scalar('accuracy', accuracy)])

        step = 0
        with tf.Session() as sess:
            tf.train.start_queue_runners(sess=sess)
            writer = tf.summary.FileWriter(cfg.test_logdir, graph=sess.graph)
            for epoch in range(cfg.epoch):
                # Checkpoints are assumed to be written once per training epoch,
                # named by the global step at the epoch boundary — TODO confirm
                # against the training script's save cadence.
                ckpt = os.path.join(
                    cfg.logdir,
                    'model.ckpt-%d' % (train_batches_per_epoch * epoch))
                saver.restore(sess, ckpt)
                for _ in range(test_batches):
                    summary_str = sess.run(summary_op)
                    print('%d batches are tested.' % step)
                    writer.add_summary(summary_str, step)
                    step += 1
def main(_):
    """Train the network on MNIST with cross-entropy loss.

    Builds the model on /gpu:0 with variables pinned to /cpu:0, runs
    ``cfg.epoch`` epochs of ``cfg.batch_size`` batches, writes a loss
    summary every 10 steps, and saves a checkpoint at each epoch boundary.
    """
    with tf.Graph().as_default(), tf.device('/cpu:0'):
        global_step = tf.get_variable('global_step', [],
                                      initializer=tf.constant_initializer(0),
                                      trainable=False)
        # MNIST train split is 60k images.
        num_batches_per_epoch = int(60000 / cfg.batch_size)
        opt = tf.train.AdamOptimizer()

        batch_x, batch_labels = create_inputs(is_train=True)

        with tf.device('/gpu:0'):
            with slim.arg_scope([slim.variable], device='/cpu:0'):
                output = net.build_arch(batch_x, is_train=True)
                loss = net.cross_ent_loss(output, batch_labels)
        grad = opt.compute_gradients(loss)

        loss_name = 'cross_ent_loss'
        summaries = [tf.summary.scalar(loss_name, loss)]
        train_op = opt.apply_gradients(grad, global_step=global_step)

        sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True,
                                                log_device_placement=False))
        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver(tf.global_variables(), max_to_keep=cfg.epoch)
        # To resume from a snapshot:
        # saver.restore(sess, os.path.join(cfg.logdir, 'model.ckpt-4680'))

        summary_op = tf.summary.merge(summaries)
        tf.train.start_queue_runners(sess=sess)
        summary_writer = tf.summary.FileWriter(cfg.logdir, graph=sess.graph)

        for step in range(cfg.epoch * num_batches_per_epoch):
            tic = time.time()
            _, loss_value = sess.run([train_op, loss])
            print('%d iteration is finished in ' % step +
                  '%f second' % (time.time() - tic))
            # Fail loudly on divergence. A bare `assert` would be stripped
            # under `python -O`, so raise explicitly instead.
            if np.isnan(loss_value):
                raise ValueError('loss is nan')
            if step % 10 == 0:
                summary_str = sess.run(summary_op)
                summary_writer.add_summary(summary_str, step)
            if (step % num_batches_per_epoch) == 0:
                ckpt_path = os.path.join(cfg.logdir, 'model.ckpt')
                saver.save(sess, ckpt_path, global_step=step)
def main(args):
    """Evaluate per-epoch checkpoints for the given dataset.

    args: [prog_name, dataset_name]. Computes and prints the average test
    accuracy for every saved epoch checkpoint in cfg.logdir.
    """
    assert len(args) == 2 and isinstance(args[1], str)
    dataset_name = args[1]
    coord_add = get_coord_add(dataset_name)
    dataset_size_train = get_dataset_size_train(dataset_name)
    dataset_size_test = get_dataset_size_test(dataset_name)
    num_classes = get_num_classes(dataset_name)
    create_inputs = get_create_inputs(dataset_name, is_train=False, epochs=cfg.epoch)

    """Set reproduciable random seed"""
    tf.set_random_seed(1234)

    with tf.Graph().as_default():
        num_batches_per_epoch_train = int(dataset_size_train / cfg.batch_size)
        num_batches_test = int(dataset_size_test / cfg.batch_size)

        batch_x, batch_labels = create_inputs()
        output = net.build_arch(batch_x, coord_add,
                                is_train=False, num_classes=num_classes)
        batch_acc = net.test_accuracy(output, batch_labels)
        saver = tf.train.Saver()
        step = 0

        summaries = []
        summaries.append(tf.summary.scalar('accuracy', batch_acc))
        summary_op = tf.summary.merge(summaries)

        with tf.Session() as sess:
            tf.train.start_queue_runners(sess=sess)
            summary_writer = tf.summary.FileWriter(
                cfg.test_logdir, graph=None)  # graph=sess.graph, huge!
            for epoch in range(cfg.epoch):
                # FIX: the original called `re.compile()` with no arguments
                # here, which raises TypeError on the first iteration; the
                # result was never used, so the call is removed.
                # TODO: a regex is still needed if checkpoint file names
                # embed the loss value (see the file-scan variant of this
                # script); the plain step-based name is used for now.
                ckpt = os.path.join(
                    cfg.logdir,
                    'model.ckpt-%d' % (num_batches_per_epoch_train * epoch))
                saver.restore(sess, ckpt)

                accuracy_sum = 0
                for i in range(num_batches_test):
                    batch_acc_v, summary_str = sess.run(
                        [batch_acc, summary_op])
                    print('%d batches are tested.' % step)
                    summary_writer.add_summary(summary_str, step)
                    accuracy_sum += batch_acc_v
                    step += 1
                ave_acc = accuracy_sum / num_batches_test
                print('the average accuracy is %f' % ave_acc)
def main(args):
    # 1. Set GPU mode.
    session_config = cfg.set_gpu()
    with tf.Graph().as_default():
        # 2. Set random seed, read data batch and number of classes.
        # NOTE(review): dataset_name, recognize_data_dir,
        # recognize_labels_txt_keywords, ckpt and num_batches_test are
        # module-level globals not visible here — confirm they are defined
        # before this runs.
        tf.set_random_seed(1234)
        coord_add = cfg.get_coord_add(dataset_name)
        num_classes = cfg.get_num_classes(dataset_name)
        labels_txt = cfg.search_keyword_files(recognize_data_dir,
                                              recognize_labels_txt_keywords)
        labels_maps = cfg.read_label_txt_to_dict(labels_txt[0])
        with tf.Session(config=session_config) as sess:
            create_inputs = cfg.get_create_inputs(dataset_name,
                                                  is_train=False,
                                                  epochs=cfg.epoch)
            batch_x, batch_labels = create_inputs()
            # 3. Initialize the network.
            output, pose_out = net.build_arch(batch_x, coord_add,
                                              is_train=False,
                                              num_classes=num_classes)
            tf.logging.debug(pose_out.get_shape())
            results, labels = net.batch_results_and_labels(output, batch_labels)
            # 4. Global variable init and start the data threads
            #    (must come after the network is built).
            coord, threads = cfg.init_variables_and_start_thread(sess)
            # 5. Restore the model from checkpoint.
            cfg.restore_model(sess, ckpt)
            # 6. Collect every prediction and label into Python lists.
            np_predicts_list = []
            np_lables_list = []
            for i in range(num_batches_test):
                np_results, np_labels = sess.run(
                    [results, labels])
                print(np_results)
                print(np_labels)
                np_predicts_list.extend(np_results)
                np_lables_list.extend(np_labels)
            # Dump both lists as text (CRLF-separated sections).
            np_predicts_list_str = str(np_predicts_list)
            np_lables_list_str = str(np_lables_list)
            with open('predicts_and_labels.txt','w') as f:
                f.write('predicts\r\n')
                f.write(np_predicts_list_str + '\r\n')
                f.write('labels\r\n')
                f.write(np_lables_list_str + '\r\n')
            cfg.stop_threads(coord,threads)
def main(args):
    # Evaluate the latest checkpoint on (half of) the test split and print
    # the average accuracy.
    tf.set_random_seed(1234)
    coord_add = get_coord_add(dataset_name)
    dataset_size_train = get_dataset_size_train(dataset_name)
    dataset_size_test = get_dataset_size_test(dataset_name)
    num_classes = get_num_classes(dataset_name)
    create_inputs = get_create_inputs(dataset_name, is_train=False,
                                      epochs=cfg.epoch)
    with tf.Graph().as_default():
        # Only half of the test set is evaluated (the 0.5 factor).
        num_batches_test = int(dataset_size_test / cfg.batch_size * 0.5)
        batch_x, batch_labels = create_inputs()
        output, pose_out = net.build_arch(batch_x, coord_add,
                                          is_train=False,
                                          num_classes=num_classes)
        tf.logging.debug(pose_out.get_shape())
        batch_acc = net.test_accuracy(output, batch_labels)
        saver = tf.train.Saver()
        # NOTE(review): device_count={'GPU': 0} disables GPUs entirely,
        # which makes the gpu_options below inert — looks like CPU-only
        # evaluation is intended; confirm.
        session_config = tf.ConfigProto(
            device_count={'GPU': 0},
            gpu_options={
                'allow_growth': 1,
                # 'per_process_gpu_memory_fraction': 0.1,
                'visible_device_list': '0'
            },
            allow_soft_placement=True)
        with tf.Session(config=session_config) as sess:
            sess.run(tf.local_variables_initializer())
            sess.run(tf.global_variables_initializer())
            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(sess=sess, coord=coord)
            # `ckpt` is a module-level checkpoint directory (not visible
            # here) — presumably set by the caller; verify.
            mode_file = tf.train.latest_checkpoint(ckpt)
            saver.restore(sess, mode_file)
            accuracy_sum = 0
            for i in range(num_batches_test):
                # sess.run with a list returns a list; take element 0.
                batch_acc_v = sess.run([batch_acc])
                accuracy_sum += batch_acc_v[0]
                print(accuracy_sum)
            ave_acc = accuracy_sum / num_batches_test
            print('the average accuracy is %f' % ave_acc)
def main(_):
    """Run inference over the prediction dataset and export indicator data.

    Restores the latest checkpoint from cfg.logdir, predicts a class for
    every batch, and hands the concatenated prediction vector to
    trade_data.out_indi_data.
    """
    # 3x3 grid of receptive-field coordinates fed to the capsule network.
    coord_add = [[[8., 8.], [12., 8.], [16., 8.]],
                 [[8., 12.], [12., 12.], [16., 12.]],
                 [[8., 16.], [12., 16.], [16., 16.]]]
    with tf.Graph().as_default():
        batch_x, dt, datanum = utils.get_pred_data()
        # ceil: the final partial batch still has to be run.
        num_batches_test = math.ceil(datanum / cfg.batch_size)
        print("total data:", datanum, ", run count:", num_batches_test,
              ", dt:", dt)

        output = net.build_arch(batch_x, coord_add, is_train=False)
        predict = tf.argmax(output, axis=1)

        saver = tf.train.Saver()
        sess = tf.Session()
        tf.train.start_queue_runners(sess=sess)

        ckpt = tf.train.get_checkpoint_state(cfg.logdir)
        if ckpt and ckpt.model_checkpoint_path:
            saver.restore(sess, ckpt.model_checkpoint_path)
            print(ckpt.model_checkpoint_path)

        # PERF FIX: collect per-batch predictions in a list and concatenate
        # once at the end — np.concatenate inside the loop re-copies the
        # whole array every iteration (O(n^2) total copying).
        pred_batches = []
        for i in range(num_batches_test):
            y_pred, output1 = sess.run([predict, output])
            if i % 10 == 0:
                print("step:", i, "/", num_batches_test)
            pred_batches.append(y_pred)
        y_pred1 = np.concatenate(pred_batches, axis=0)

        print(np.shape(y_pred1), ",", datanum)
        print(y_pred1)
        trade_data.out_indi_data(cfg.test_dataset, y_pred1,
                                 datalen=cfg.image_size)
def main(args):
    """Train a capsule (conv) or fully-connected network with validation.

    args: [prog_name, dataset_name]. Selects architecture and loss via
    cfg.network / cfg.loss_fn, trains for cfg.epoch epochs, checkpoints at
    every epoch boundary, and runs the test queue at each epoch's end.
    """
    assert len(args) == 2 and isinstance(args[1], str)
    dataset_name = args[1]
    logger.info('Using dataset: {}'.format(dataset_name))

    coord_add = get_coord_add(dataset_name)
    num_classes = get_num_classes(dataset_name)
    dataset_size = get_dataset_size_train(dataset_name)
    dataset_size_test = get_dataset_size_test(dataset_name)
    create_inputs = get_create_inputs(dataset_name, is_train=True, epochs=cfg.epoch)
    test_inputs = get_create_inputs(dataset_name, is_train=False, epochs=cfg.epoch)

    """Set reproduciable random seed"""
    tf.set_random_seed(1234)

    with tf.Graph().as_default(), tf.device('/cpu:0'):
        """Get global_step."""
        global_step = tf.get_variable('global_step', [],
                                      initializer=tf.constant_initializer(0),
                                      trainable=False)

        """Get batches per epoch."""
        num_batches_per_epoch = int(dataset_size / cfg.batch_size)
        num_batches_test = int(dataset_size_test / cfg.batch_size)

        """Set tf summaries."""
        summaries = []
        valid_sum = []

        """Use exponential decay leanring rate?"""
        # lrn_rate = tf.maximum(tf.train.exponential_decay(
        #     1e-2, global_step, num_batches_per_epoch, 0.8), 1e-5)
        # summaries.append(tf.summary.scalar('learning_rate', lrn_rate))
        opt = tf.train.AdamOptimizer(learning_rate=0.001)

        """Get batch from data queue."""
        # One placeholder switches between the train and test input queues.
        train_q = create_inputs()
        test_q = test_inputs()
        use_train_data = tf.placeholder(dtype=tf.bool, shape=())
        batch_x, batch_labels = tf.cond(use_train_data,
                                        true_fn=lambda: train_q,
                                        false_fn=lambda: test_q)

        """Define the dataflow graph."""
        m_op = tf.placeholder(dtype=tf.float32, shape=())  # spread-loss margin
        with tf.device('/gpu:0'):
            with slim.arg_scope([slim.variable], device='/cpu:0'):
                norm_batch_x = tf.contrib.layers.batch_norm(batch_x,
                                                            is_training=True)
                # Select network architecture.
                if cfg.network == 'conv':
                    import capsnet_em as net
                    output = net.build_arch(norm_batch_x, coord_add,
                                            is_train=True,
                                            num_classes=num_classes)
                elif cfg.network == 'fc':
                    import capsnet_fc as net
                    output = net.build_arch(norm_batch_x, is_train=True,
                                            num_classes=num_classes)
                else:
                    # FIX: was "'Invalid network architecture: ' % cfg.network"
                    # — a broken format string that raised TypeError instead
                    # of the intended ValueError.
                    raise ValueError('Invalid network architecture: %s'
                                     % cfg.network)

                # Select loss function.
                if cfg.loss_fn == 'spread':
                    loss = net.spread_loss(output, batch_labels, m_op)
                elif cfg.loss_fn == 'margin':
                    loss = net.margin_loss(output, batch_labels)
                elif cfg.loss_fn == 'cross_en':
                    loss = net.cross_entropy_loss(output, batch_labels)
                else:
                    # FIX: same broken format string as above.
                    raise ValueError('Invalid loss function: %s' % cfg.loss_fn)
                acc = net.accuracy(output, batch_labels)

                """Compute gradient."""
                grad = opt.compute_gradients(loss)
                # See: https://stackoverflow.com/questions/40701712/how-to-check-nan-in-gradients-in-tensorflow-when-updating
                grad_check = [tf.check_numerics(g, message='Gradient NaN Found!')
                              for g, _ in grad] + \
                             [tf.check_numerics(loss, message='Loss NaN Found')]

                """Add to summary."""
                summaries.append(tf.summary.scalar('loss', loss))
                summaries.append(tf.summary.scalar('acc', acc))
                valid_sum.append(tf.summary.scalar('val_loss', loss))
                valid_sum.append(tf.summary.scalar('val_acc', acc))

                """Apply graident."""
                with tf.control_dependencies(grad_check):
                    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
                    with tf.control_dependencies(update_ops):
                        train_op = opt.apply_gradients(grad,
                                                       global_step=global_step)

        """Set Session settings."""
        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=cfg.gpu_frac)
        sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True,
                                                log_device_placement=False,
                                                gpu_options=gpu_options))
        sess.run(tf.local_variables_initializer())
        sess.run(tf.global_variables_initializer())

        """Set Saver."""
        var_to_save = [
            v for v in tf.global_variables() if 'Adam' not in v.name
        ]  # Don't save redundant Adam beta/gamma
        saver = tf.train.Saver(var_list=var_to_save, max_to_keep=cfg.epoch)

        """Display parameters"""
        total_p = np.sum([
            np.prod(v.get_shape().as_list()) for v in var_to_save
        ]).astype(np.int32)
        train_p = np.sum([
            np.prod(v.get_shape().as_list()) for v in tf.trainable_variables()
        ]).astype(np.int32)
        logger.info('Total Parameters: {}'.format(total_p))
        logger.info('Trainable Parameters: {}'.format(train_p))

        # read snapshot
        # latest = os.path.join(cfg.logdir, 'model.ckpt-4680')
        # saver.restore(sess, latest)
        """Set summary op."""
        summary_op = tf.summary.merge(summaries)
        valid_sum_op = tf.summary.merge(valid_sum)

        """Start coord & queue."""
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)

        """Set summary writer"""
        summary_writer = tf.summary.FileWriter(
            cfg.logdir, graph=None)  # graph = sess.graph, huge!

        """Main loop."""
        # Spread-loss margin m is annealed linearly from 0.2 to 0.9.
        m_min = 0.2
        m_max = 0.9
        m = m_min
        for step in range(cfg.epoch * num_batches_per_epoch):
            if (step % num_batches_per_epoch) == 0:
                tic = time.time()
                progbar = tf.keras.utils.Progbar(
                    num_batches_per_epoch,
                    verbose=(1 if cfg.progbar else 0))
            """"TF queue would pop batch until no file"""
            try:
                _, loss_value, acc_value = sess.run(
                    [train_op, loss, acc],
                    feed_dict={use_train_data: True, m_op: m})
                progbar.update((step % num_batches_per_epoch),
                               values=[('loss', loss_value),
                                       ('acc', acc_value)])
            except KeyboardInterrupt:
                sess.close()
                sys.exit()
            except tf.errors.InvalidArgumentError:
                # grad_check tripped: skip this batch.
                logger.warning(
                    '%d iteration contains NaN gradients. Discard.' % step)
                continue

            """Write to summary."""
            if step % 10 == 0:
                summary_str = sess.run(summary_op,
                                       feed_dict={use_train_data: True,
                                                  m_op: m})
                summary_writer.add_summary(summary_str, step)

            """Epoch wise linear annealling."""
            if (step % num_batches_per_epoch) == 0:
                if step > 0:
                    m += (m_max - m_min) / (cfg.epoch * 0.6)
                    if m > m_max:
                        m = m_max
                """Save model periodically"""
                ckpt_path = os.path.join(
                    cfg.logdir, 'model-{0:.4f}.ckpt'.format(loss_value))
                saver.save(sess, ckpt_path, global_step=step)

            # End of epoch: run validation over the test queue.
            if ((step + 1) % num_batches_per_epoch) == 0:
                toc = time.time()
                val_loss_value, val_acc_value = (0.0, 0.0)
                for i in range(num_batches_test):
                    val_batch = sess.run([loss, acc],
                                         feed_dict={use_train_data: False,
                                                    m_op: m})
                    val_loss_batch, val_acc_batch = val_batch
                    val_loss_value += val_loss_batch / num_batches_test
                    val_acc_value += val_acc_batch / num_batches_test
                valid_sum_str = sess.run(valid_sum_op,
                                         feed_dict={use_train_data: False,
                                                    m_op: m})
                summary_writer.add_summary(valid_sum_str, step)
                print('\nEpoch %d/%d in '
                      % (step // num_batches_per_epoch + 1, cfg.epoch)
                      + '%.1fs' % (toc - tic)
                      + ' - loss: %f' % val_loss_value
                      + ' - acc: %f' % val_acc_value)

        """Join threads"""
        coord.join(threads)
def main(args):
    """Get dataset hyperparameters."""
    # Evaluates every per-epoch checkpoint found in cfg.logdir on a 10%
    # slice of the test set, logging per-batch accuracy summaries.
    assert len(args) == 2 and isinstance(args[1], str)
    dataset_name = args[1]
    coord_add = get_coord_add(dataset_name)
    dataset_size_train = get_dataset_size_train(dataset_name)
    dataset_size_test = get_dataset_size_test(dataset_name)
    num_classes = get_num_classes(dataset_name)
    create_inputs = get_create_inputs(dataset_name, is_train=False,
                                      epochs=cfg.epoch)

    """Set reproduciable random seed"""
    tf.set_random_seed(1234)

    with tf.Graph().as_default():
        num_batches_per_epoch_train = int(dataset_size_train / cfg.batch_size)
        # Only 10% of the test split is evaluated per checkpoint.
        num_batches_test = int(dataset_size_test / cfg.batch_size * 0.1)
        batch_x, batch_labels = create_inputs()
        # Inference-mode batch norm on the raw input.
        batch_x = slim.batch_norm(batch_x, center=False,
                                  is_training=False, trainable=False)
        output, _ = net.build_arch(batch_x, coord_add,
                                   is_train=False, num_classes=num_classes)
        batch_acc = net.test_accuracy(output, batch_labels)
        saver = tf.train.Saver()
        step = 0

        summaries = []
        summaries.append(tf.summary.scalar('accuracy', batch_acc))
        summary_op = tf.summary.merge(summaries)

        with tf.Session(
                config=tf.ConfigProto(allow_soft_placement=True,
                                      log_device_placement=False)) as sess:
            sess.run(tf.local_variables_initializer())
            sess.run(tf.global_variables_initializer())
            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(sess=sess, coord=coord)
            summary_writer = tf.summary.FileWriter(
                cfg.test_logdir, graph=sess.graph)  # graph=sess.graph, huge!
            files = os.listdir(cfg.logdir)
            for epoch in range(1, cfg.epoch):
                # requires a regex to adapt the loss value in the file name here
                # Scan the log dir for the checkpoint whose global step matches
                # this epoch boundary; checkpoint names embed a loss value, so
                # only the ".ckpt-<step>" suffix is matched.
                ckpt_re = ".ckpt-%d" % (num_batches_per_epoch_train * epoch)
                for __file in files:
                    if __file.endswith(ckpt_re + ".index"):
                        # Strip the ".index" suffix (6 chars) to get the
                        # checkpoint prefix Saver.restore expects.
                        ckpt = os.path.join(cfg.logdir, __file[:-6])
                # ckpt = os.path.join(cfg.logdir, "model.ckpt-%d" % (num_batches_per_epoch_train * epoch))
                saver.restore(sess, ckpt)

                accuracy_sum = 0
                for i in range(num_batches_test):
                    batch_acc_v, summary_str = sess.run(
                        [batch_acc, summary_op])
                    print('%d batches are tested.' % step)
                    summary_writer.add_summary(summary_str, step)
                    print('%d batch accuracy.' % batch_acc_v)
                    accuracy_sum += batch_acc_v
                    step += 1
                ave_acc = accuracy_sum / num_batches_test
                print('the average accuracy is %f' % ave_acc)
            coord.join(threads)
def main(_):
    # Resumable spread-loss trainer: restores the latest checkpoint if one
    # exists and continues the step count and margin schedule from there.
    # 3x3 grid of receptive-field coordinates, normalized by image size 28.
    coord_add = [[[8., 8.], [12., 8.], [16., 8.]],
                 [[8., 12.], [12., 12.], [16., 12.]],
                 [[8., 16.], [12., 16.], [16., 16.]]]
    coord_add = np.array(coord_add, dtype=np.float32) / 28.

    """Set reproduciable random seed"""
    tf.set_random_seed(1234)

    with tf.Graph().as_default(), tf.device('/cpu:0'):
        global_step = tf.get_variable('global_step', [],
                                      initializer=tf.constant_initializer(0),
                                      trainable=False)
        batch_x, batch_labels = utils.get_shuffle_tfrecord(is_training=True)
        # Hard-coded training-set size for this tfrecord.
        datanum = 272965
        num_batches_per_epoch = int(datanum / cfg.batch_size)
        print(datanum, num_batches_per_epoch)
        # batch_y = tf.one_hot(batch_labels, depth=10, axis=1, dtype=tf.float32)

        """Use exponential decay leanring rate?"""
        # Decay by 0.8 every epoch, floored at 1e-5.
        lrn_rate = tf.maximum(
            tf.train.exponential_decay(1e-3, global_step,
                                       num_batches_per_epoch, 0.8), 1e-5)
        tf.summary.scalar('learning_rate', lrn_rate)
        opt = tf.train.AdamOptimizer(learning_rate=lrn_rate)
        m_op = tf.placeholder(dtype=tf.float32, shape=())  # spread-loss margin

        with tf.device('/gpu:0'):
            with slim.arg_scope([slim.variable], device='/cpu:0'):
                output = net.build_arch(batch_x, coord_add, is_train=True)
                # loss = net.cross_ent_loss(output, batch_labels)
                loss = net.spread_loss(output, batch_labels, m_op)
                accuracy = net.test_accuracy(output, batch_labels)
                tf.summary.scalar("spread_loss", loss)
                tf.summary.scalar("accuracy", accuracy)

        """Compute gradient."""
        grad = opt.compute_gradients(loss)
        # See: https://stackoverflow.com/questions/40701712/how-to-check-nan-in-gradients-in-tensorflow-when-updating
        grad_check = [
            tf.check_numerics(g, message='Gradient NaN Found!')
            for g, _ in grad if g is not None
        ] + [tf.check_numerics(loss, message='Loss NaN Found')]

        """Apply graident."""
        with tf.control_dependencies(grad_check):
            update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
            with tf.control_dependencies(update_ops):
                train_op = opt.apply_gradients(grad, global_step=global_step)

        # Print trainable variable parameter statistics to stdout.
        # By default, statistics are associated with each graph node.
        param_stats = tf.contrib.tfprof.model_analyzer.print_model_analysis(
            tf.get_default_graph(),
            tfprof_options=tf.contrib.tfprof.model_analyzer.
            TRAINABLE_VARS_PARAMS_STAT_OPTIONS)
        # param_stats is tensorflow.tfprof.TFGraphNodeProto proto.
        print('total_params: %d\n' % param_stats.total_parameters)

        sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True,
                                                log_device_placement=False))
        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver(tf.global_variables(),
                               max_to_keep=30)  # cfg.epoch

        # Restore from the latest checkpoint if available; the step count is
        # parsed from the checkpoint file name and the margin jumps straight
        # to its final value (0.9) when resuming.
        ckpt = tf.train.get_checkpoint_state(cfg.logdir)
        if ckpt and ckpt.model_checkpoint_path:
            saver.restore(sess, ckpt.model_checkpoint_path)
            initial_step = int(ckpt.model_checkpoint_path.split('-')[1])
            print(ckpt, ckpt.model_checkpoint_path, initial_step)
            m = 0.9
        else:
            initial_step = 0
            m = 0.2

        # read snapshot
        # latest = os.path.join(cfg.logdir, 'model.ckpt-4680')
        # saver.restore(sess, latest)
        """Set summary op."""
        summary_op = tf.summary.merge_all()

        """Start coord & queue."""
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)

        """Set summary writer"""
        summary_writer = tf.summary.FileWriter(
            cfg.logdir, graph=sess.graph)  # graph = sess.graph, huge!

        # cal_num counts successful (non-NaN) iterations across all epochs
        # and drives summary/checkpoint step numbering.
        cal_num = 0
        for step in range(cfg.epoch):
            for i in range(num_batches_per_epoch):
                tic = time.time()
                """"TF queue would pop batch until no file"""
                try:
                    _, loss_value, accuracy_val = sess.run(
                        [train_op, loss, accuracy], feed_dict={m_op: m})
                    print(
                        '%d/%d, %d/%d iteration is finished in '
                        % (step, cfg.epoch, i, num_batches_per_epoch)
                        + '%f second' % (time.time() - tic) + ',m:', m,
                        ',loss: %f' % loss_value, ",accuracy:", accuracy_val)
                    cal_num += 1
                except tf.errors.InvalidArgumentError:
                    # grad_check tripped: skip this batch.
                    print('%d iteration contains NaN gradients. Discard.'
                          % cal_num)
                    continue
                else:
                    """Write to summary."""
                    if i % 30 == 0:
                        summary_str = sess.run(summary_op,
                                               feed_dict={m_op: m})
                        summary_writer.add_summary(summary_str,
                                                   initial_step + cal_num)
                    if cal_num % cfg.saveperiod == 0:
                        ckpt_path = os.path.join(cfg.logdir, 'model.ckpt')
                        saver.save(sess, ckpt_path,
                                   global_step=initial_step + cal_num)
                    # Per-iteration linear margin annealing from 0.2 to 0.9
                    # over roughly one epoch.
                    if m < 0.9:
                        m += round((0.9 - 0.2) / num_batches_per_epoch, 5)
                    else:
                        m = 0.9
        # Final checkpoint after the last epoch.
        ckpt_path = os.path.join(cfg.logdir, 'model.ckpt')
        saver.save(sess, ckpt_path, global_step=initial_step + cal_num)

        """Join threads"""
        coord.join(threads)
def main(args):
    """Get dataset hyperparameters."""
    # Evaluates a single checkpoint (epoch 45) of a model with a
    # reconstruction head, saving original/reconstructed image plots.
    # args: [prog_name, dataset_name, model_name].
    assert len(args) == 3 and isinstance(args[1], str) and isinstance(
        args[2], str)
    dataset_name = args[1]
    model_name = args[2]

    """Set reproduciable random seed"""
    tf.set_random_seed(1234)

    coord_add = get_coord_add(dataset_name)
    dataset_size_train = get_dataset_size_train(dataset_name)
    dataset_size_test = get_dataset_size_test(dataset_name)
    num_classes = get_num_classes(dataset_name)
    create_inputs = get_create_inputs(dataset_name, is_train=False,
                                      epochs=cfg.epoch)

    with tf.Graph().as_default():
        num_batches_per_epoch_train = int(dataset_size_train / cfg.batch_size)
        # Deliberately tiny run — only 2 batches are evaluated.
        num_batches_test = 2  # int(dataset_size_test / cfg.batch_size * 0.1)

        batch_x, batch_labels = create_inputs()
        # batch_squash: inputs scaled to [0, 1] for the reconstruction loss.
        batch_squash = tf.divide(batch_x, 255.)
        batch_x_norm = slim.batch_norm(batch_x, center=False,
                                       is_training=False, trainable=False)
        output, pose_out = net.build_arch(batch_x_norm, coord_add,
                                          is_train=False,
                                          num_classes=num_classes)
        tf.logging.debug(pose_out.get_shape())

        batch_acc = net.test_accuracy(output, batch_labels)
        m_op = tf.constant(0.9)  # final spread-loss margin at eval time
        loss, spread_loss, mse, recon_img_squash = net.spread_loss(
            output, pose_out, batch_squash, batch_labels, m_op)
        tf.summary.scalar('spread_loss', spread_loss)
        tf.summary.scalar('reconstruction_loss', mse)
        tf.summary.scalar('all_loss', loss)

        # Rescale the [0, 1] reconstruction back to pixel range for display.
        data_size = int(batch_x.get_shape()[1])
        recon_img = tf.multiply(
            tf.reshape(recon_img_squash,
                       shape=[cfg.batch_size, data_size, data_size, 1]),
            255.)
        orig_img = tf.reshape(batch_x,
                              shape=[cfg.batch_size, data_size, data_size, 1])
        tf.summary.image('orig_image', orig_img)
        tf.summary.image('recon_image', recon_img)

        saver = tf.train.Saver()
        step = 0
        tf.summary.scalar('accuracy', batch_acc)
        summary_op = tf.summary.merge_all()

        with tf.Session(
                config=tf.ConfigProto(allow_soft_placement=True,
                                      log_device_placement=False)) as sess:
            sess.run(tf.local_variables_initializer())
            sess.run(tf.global_variables_initializer())
            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(sess=sess, coord=coord)

            # Per-model/per-dataset log directory layout.
            if not os.path.exists(cfg.test_logdir +
                                  '/{}/{}/'.format(model_name, dataset_name)):
                os.makedirs(cfg.test_logdir +
                            '/{}/{}/'.format(model_name, dataset_name))
            summary_writer = tf.summary.FileWriter(
                cfg.test_logdir + '/{}/{}/'.format(model_name, dataset_name),
                graph=sess.graph)  # graph=sess.graph, huge!
            files = os.listdir(cfg.logdir +
                               '/{}/{}/'.format(model_name, dataset_name))
            # Only checkpoint 45 is evaluated (range(45, 46)).
            for epoch in range(45, 46):
                # requires a regex to adapt the loss value in the file name here
                # Match by the ".ckpt-<step>" suffix since file names embed a
                # loss value; strip ".index" (6 chars) for the restore prefix.
                ckpt_re = ".ckpt-%d" % (num_batches_per_epoch_train * epoch)
                for __file in files:
                    if __file.endswith(ckpt_re + ".index"):
                        ckpt = os.path.join(
                            cfg.logdir +
                            '/{}/{}/'.format(model_name, dataset_name),
                            __file[:-6])
                # ckpt = os.path.join(cfg.logdir, "model.ckpt-%d" % (num_batches_per_epoch_train * epoch))
                saver.restore(sess, ckpt)

                accuracy_sum = 0
                for i in range(num_batches_test):
                    batch_acc_v, summary_str, orig_image, recon_image = sess.run(
                        [batch_acc, summary_op, orig_img, recon_img])
                    print('%d batches are tested.' % step)
                    summary_writer.add_summary(summary_str, step)
                    accuracy_sum += batch_acc_v
                    step += 1
                    # display original/reconstructed images in matplotlib
                    plot_imgs(orig_image, i, 'ori')
                    plot_imgs(recon_image, i, 'rec')
                ave_acc = accuracy_sum / num_batches_test
                print('the average accuracy is %f' % ave_acc)
def main(args):
    """Get dataset hyperparameters."""
    # Trains a capsule network with spread + reconstruction loss; saves a
    # checkpoint (named with the current loss) at every epoch boundary.
    # args: [prog_name, dataset_name].
    assert len(args) == 2 and isinstance(args[1], str)
    dataset_name = args[1]
    logger.info('Using dataset: {}'.format(dataset_name))

    coord_add = get_coord_add(dataset_name)
    dataset_size = get_dataset_size_train(dataset_name)
    num_classes = get_num_classes(dataset_name)
    create_inputs = get_create_inputs(dataset_name, is_train=True,
                                      epochs=cfg.epoch)

    """Set reproduciable random seed"""
    tf.set_random_seed(1234)

    with tf.Graph().as_default(), tf.device('/cpu:0'):
        """Get global_step."""
        global_step = tf.get_variable(
            'global_step', [],
            initializer=tf.constant_initializer(0), trainable=False)

        """Get batches per epoch."""
        num_batches_per_epoch = int(dataset_size / cfg.batch_size)

        """Use exponential decay leanring rate?"""
        # lrn_rate is defined and logged but NOT passed to the optimizer —
        # Adam runs with its default learning rate here.
        lrn_rate = tf.maximum(tf.train.exponential_decay(
            1e-3, global_step, num_batches_per_epoch, 0.8), 1e-5)
        tf.summary.scalar('learning_rate', lrn_rate)
        opt = tf.train.AdamOptimizer()  # lrn_rate

        """Get batch from data queue."""
        batch_x, batch_labels = create_inputs()
        # batch_y = tf.one_hot(batch_labels, depth=10, axis=1, dtype=tf.float32)

        """Define the dataflow graph."""
        m_op = tf.placeholder(dtype=tf.float32, shape=())  # spread-loss margin
        with tf.device('/gpu:0'):
            with slim.arg_scope([slim.variable], device='/cpu:0'):
                # batch_squash keeps the raw input scaled to [0, 1] for the
                # reconstruction target, while batch_x is batch-normalized
                # before entering the network.
                batch_squash = tf.divide(batch_x, 255.)
                batch_x = slim.batch_norm(batch_x, center=False,
                                          is_training=True, trainable=True)
                output, pose_out = net.build_arch(batch_x, coord_add,
                                                  is_train=True,
                                                  num_classes=num_classes)
                # loss = net.cross_ent_loss(output, batch_labels)
                loss, spread_loss, mse = net.spread_loss(
                    output, pose_out, batch_squash, batch_labels, m_op)
                tf.summary.scalar('spread_loss', spread_loss)
                tf.summary.scalar('reconstruction_loss', mse)
                tf.summary.scalar('all_loss', loss)

            """Compute gradient."""
            grad = opt.compute_gradients(loss)
            # See: https://stackoverflow.com/questions/40701712/how-to-check-nan-in-gradients-in-tensorflow-when-updating
            grad_check = [tf.check_numerics(g, message='Gradient NaN Found!')
                          for g, _ in grad if g is not None] \
                + [tf.check_numerics(loss, message='Loss NaN Found')]

            """Apply graident."""
            with tf.control_dependencies(grad_check):
                update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
                with tf.control_dependencies(update_ops):
                    train_op = opt.apply_gradients(grad,
                                                   global_step=global_step)

        """Set Session settings."""
        sess = tf.Session(config=tf.ConfigProto(
            allow_soft_placement=True, log_device_placement=False))
        sess.run(tf.local_variables_initializer())
        sess.run(tf.global_variables_initializer())

        """Set Saver."""
        var_to_save = [v for v in tf.global_variables(
        ) if 'Adam' not in v.name]  # Don't save redundant Adam beta/gamma
        saver = tf.train.Saver(var_list=var_to_save, max_to_keep=cfg.epoch)

        """Display parameters"""
        total_p = np.sum([np.prod(v.get_shape().as_list())
                          for v in var_to_save]).astype(np.int32)
        train_p = np.sum([np.prod(v.get_shape().as_list())
                          for v in tf.trainable_variables()]).astype(np.int32)
        logger.info('Total Parameters: {}'.format(total_p))
        logger.info('Trainable Parameters: {}'.format(train_p))

        # read snapshot
        # latest = os.path.join(cfg.logdir, 'model.ckpt-4680')
        # saver.restore(sess, latest)
        """Set summary op."""
        summary_op = tf.summary.merge_all()

        """Start coord & queue."""
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)

        """Set summary writer"""
        if not os.path.exists(cfg.logdir + '/train_log/'):
            os.makedirs(cfg.logdir + '/train_log/')
        summary_writer = tf.summary.FileWriter(
            cfg.logdir + '/train_log/',
            graph=sess.graph)  # graph = sess.graph, huge!

        """Main loop."""
        # Spread-loss margin m is annealed per epoch from 0.2 toward 0.9
        # at a rate controlled by cfg.m_schedule.
        m_min = 0.2
        m_max = 0.9
        m = m_min
        for step in range(cfg.epoch * num_batches_per_epoch + 1):
            tic = time.time()
            """"TF queue would pop batch until no file"""
            try:
                _, loss_value, summary_str = sess.run(
                    [train_op, loss, summary_op], feed_dict={m_op: m})
                logger.info('%d iteration finishs in ' % step
                            + '%f second' % (time.time() - tic)
                            + ' loss=%f' % loss_value)
            except KeyboardInterrupt:
                sess.close()
                sys.exit()
            except tf.errors.InvalidArgumentError:
                # grad_check tripped: skip this batch.
                logger.warning('%d iteration contains NaN gradients. Discard.'
                               % step)
                continue
            else:
                """Write to summary."""
                if step % 5 == 0:
                    summary_writer.add_summary(summary_str, step)

                """Epoch wise linear annealling."""
                if (step % num_batches_per_epoch) == 0:
                    if step > 0:
                        m += (m_max - m_min) / (cfg.epoch * cfg.m_schedule)
                        if m > m_max:
                            m = m_max

                    """Save model periodically"""
                    ckpt_path = os.path.join(
                        cfg.logdir,
                        'model-{}.ckpt'.format(round(loss_value, 4)))
                    saver.save(sess, ckpt_path, global_step=step)

        """Join threads"""
        coord.join(threads)
def main(args):
    """Get dataset hyperparameters."""
    # Near-duplicate of the epoch-45 evaluator above, but targets the
    # epoch-14 checkpoint. args: [prog_name, dataset_name, model_name].
    assert len(args) == 3 and isinstance(args[1], str) and isinstance(args[2], str)
    dataset_name = args[1]
    model_name = args[2]

    """Set reproduciable random seed"""
    tf.set_random_seed(1234)

    coord_add = get_coord_add(dataset_name)
    dataset_size_train = get_dataset_size_train(dataset_name)
    dataset_size_test = get_dataset_size_test(dataset_name)
    num_classes = get_num_classes(dataset_name)
    create_inputs = get_create_inputs(
        dataset_name, is_train=False, epochs=cfg.epoch)

    with tf.Graph().as_default():
        num_batches_per_epoch_train = int(dataset_size_train / cfg.batch_size)
        # Deliberately tiny run — only 2 batches are evaluated.
        num_batches_test = 2  # int(dataset_size_test / cfg.batch_size * 0.1)

        batch_x, batch_labels = create_inputs()
        # batch_squash: inputs scaled to [0, 1] for the reconstruction loss.
        batch_squash = tf.divide(batch_x, 255.)
        batch_x_norm = slim.batch_norm(batch_x, center=False,
                                       is_training=False, trainable=False)
        output, pose_out = net.build_arch(batch_x_norm, coord_add,
                                          is_train=False,
                                          num_classes=num_classes)
        tf.logging.debug(pose_out.get_shape())

        batch_acc = net.test_accuracy(output, batch_labels)
        m_op = tf.constant(0.9)  # final spread-loss margin at eval time
        loss, spread_loss, mse, recon_img_squash = net.spread_loss(
            output, pose_out, batch_squash, batch_labels, m_op)
        tf.summary.scalar('spread_loss', spread_loss)
        tf.summary.scalar('reconstruction_loss', mse)
        tf.summary.scalar('all_loss', loss)

        # Rescale the [0, 1] reconstruction back to pixel range for display.
        data_size = int(batch_x.get_shape()[1])
        recon_img = tf.multiply(tf.reshape(recon_img_squash, shape=[
            cfg.batch_size, data_size, data_size, 1]), 255.)
        orig_img = tf.reshape(batch_x, shape=[
            cfg.batch_size, data_size, data_size, 1])
        tf.summary.image('orig_image', orig_img)
        tf.summary.image('recon_image', recon_img)

        saver = tf.train.Saver()
        step = 0
        tf.summary.scalar('accuracy', batch_acc)
        summary_op = tf.summary.merge_all()

        with tf.Session(config=tf.ConfigProto(
                allow_soft_placement=True,
                log_device_placement=False)) as sess:
            sess.run(tf.local_variables_initializer())
            sess.run(tf.global_variables_initializer())
            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(sess=sess, coord=coord)

            # Per-model/per-dataset log directory layout.
            if not os.path.exists(cfg.test_logdir +
                                  '/{}/{}/'.format(model_name, dataset_name)):
                os.makedirs(cfg.test_logdir +
                            '/{}/{}/'.format(model_name, dataset_name))
            summary_writer = tf.summary.FileWriter(
                cfg.test_logdir + '/{}/{}/'.format(model_name, dataset_name),
                graph=sess.graph)  # graph=sess.graph, huge!
            files = os.listdir(cfg.logdir +
                               '/{}/{}/'.format(model_name, dataset_name))
            # Only checkpoint 14 is evaluated (range(14, 15)).
            for epoch in range(14, 15):
                # requires a regex to adapt the loss value in the file name here
                # Match by the ".ckpt-<step>" suffix since file names embed a
                # loss value; strip ".index" (6 chars) for the restore prefix.
                ckpt_re = ".ckpt-%d" % (num_batches_per_epoch_train * epoch)
                for __file in files:
                    if __file.endswith(ckpt_re + ".index"):
                        ckpt = os.path.join(
                            cfg.logdir +
                            '/{}/{}/'.format(model_name, dataset_name),
                            __file[:-6])
                # ckpt = os.path.join(cfg.logdir, "model.ckpt-%d" % (num_batches_per_epoch_train * epoch))
                saver.restore(sess, ckpt)

                accuracy_sum = 0
                for i in range(num_batches_test):
                    batch_acc_v, summary_str, orig_image, recon_image = sess.run(
                        [batch_acc, summary_op, orig_img, recon_img])
                    print('%d batches are tested.' % step)
                    summary_writer.add_summary(summary_str, step)
                    accuracy_sum += batch_acc_v
                    step += 1
                    # display original/reconstructed images in matplotlib
                    plot_imgs(orig_image, i, 'ori')
                    plot_imgs(recon_image, i, 'rec')
                ave_acc = accuracy_sum / num_batches_test
                print('the average accuracy is %f' % ave_acc)
def test_model(n_tests, x_test, y_test, ang_min, ang_max):
    """Run n_tests evaluation passes over rotated MNIST test data.

    Each pass rotates every test image by a random angle outside
    [ang_min, ang_max] (via utils.create_inputs_mnist_rot_excl_range),
    feeds full batches plus one padded remainder batch, and reports the
    mean and variance of accuracy across the n_tests runs.
    """
    # Placeholders for input data and the targets
    x_input = tf.placeholder(tf.float32, (None, *IMG_DIM), name='Input')
    y_target = tf.placeholder(tf.int32, [None, ], name='Target')
    coord_add = get_coord_add(dataset_name)
    sample_batch = tf.identity(x_input)
    batch_labels = tf.identity(y_target)
    batch_x = slim.batch_norm(sample_batch, center=False, is_training=False, trainable=False)
    # NOTE(review): is_train=True at test time — presumably to match the
    # training graph's variable layout for restore; confirm intent.
    output, pose_out = net.build_arch(batch_x, coord_add, is_train=True,
                                      num_classes=NCLASSES)
    batch_acc_sum = net.test_accuracy_sum(output, batch_labels)
    batch_pred = net.test_predict(output, batch_labels)
    saver = tf.train.Saver()
    sess = tf.Session()
    sess.run(tf.local_variables_initializer())
    sess.run(tf.global_variables_initializer())
    model_path = cfg.logdir + '/caps/mnist'
    saver.restore(sess, tf.train.latest_checkpoint(model_path))
    nImg = x_test.shape[0]
    batch_size = int(cfg.batch_size)
    nBatches = int(nImg / batch_size)
    accuraces = []
    mean_acc = 0
    for n in range(n_tests):
        print('\nTest %d/%d' % (n + 1, n_tests))
        print('-' * 30 + 'Begin: testing' + '-' * 30)
        acc = 0  # running sum of correct predictions
        k = 0    # number of images consumed so far
        xi = np.empty([1, sy, sx, 1])
        x_init = np.empty([1, sy, sx, 1])
        for i in range(nBatches):
            x = x_test[i * batch_size: (i + 1) * batch_size, :, :, :]
            y = y_test[i * batch_size: (i + 1) * batch_size]
            xr = np.empty(x.shape)
            for j in range(x.shape[0]):
                # Rotate each sample by an angle excluded from [ang_min, ang_max].
                xr[j, :, :, :] = utils.create_inputs_mnist_rot_excl_range(
                    x[j, :, :, :], y[j], ang_min, ang_max)
                k += 1
            batch_acc_v = sess.run(batch_acc_sum, feed_dict={x_input: xr, y_target: y})
            acc += batch_acc_v
            # Just checking what images we are feeding to the network
            if i == 0 and n == 0:
                for j in range(batch_size):
                    if j == 0:
                        xi[0, :, :, :] = xr[0, :, :, :]
                        x_init[0, :, :, :] = x[0, :, :, :]
                    else:
                        xi = np.concatenate([xi, np.expand_dims(xr[j, :, :, :], 0)])
                        x_init = np.concatenate([x_init, np.expand_dims(x[j, :, :, :], 0)])
                        # xr = np.concatenate([xr, x_recon])
                    if j == (batch_size - 1):
                        # Save a tiled snapshot of the rotated and original batches.
                        images = utils.combine_images(xi)
                        image = images
                        Image.fromarray(image.astype(np.uint8)).save(cfg.logdir + "/batch_rot.png")
                        images = utils.combine_images(x_init)
                        image = images
                        Image.fromarray(image.astype(np.uint8)).save(cfg.logdir + "/batch_init.png")
            # Single-line progress indicator.
            sys.stdout.write(ERASE_LINE)
            sys.stdout.write("\r \r {0}%".format(int(100 * k / nImg)))
            sys.stdout.flush()
            time.sleep(0.001)
        # Remainder: fewer than batch_size images left over after full batches.
        x = x_test[k:, :, :, :]
        y = y_test[k:]
        # duplicate the last sample to adjust the batch size
        n_left = nImg - k
        n_tile = BATCH_SIZE - n_left
        x_tile = np.tile(np.expand_dims(x_test[nImg - 1, :, :, :], 0), [n_tile, 1, 1, 1])
        y_tile = np.tile(y_test[nImg - 1], n_tile)
        x = np.concatenate((x, x_tile))
        y = np.concatenate((y, y_tile))
        xr = np.empty(x.shape)
        for j in range(x.shape[0]):
            xr[j, :, :, :] = utils.create_inputs_mnist_rot_excl_range(
                x[j, :, :, :], y[j], ang_min, ang_max)
        batch_pred_v = sess.run(batch_pred, feed_dict={x_input: xr, y_target: y})
        # Only the first n_left predictions are real samples; the rest is padding.
        left_pred = np.asarray(batch_pred_v[:n_left], dtype=np.float32)
        acc += np.sum(left_pred)
        k += n_left
        sys.stdout.write(ERASE_LINE)
        sys.stdout.write("\r \r {0}%".format(str(100)))
        sys.stdout.flush()
        time.sleep(0.001)
        print('\n')
        print('-' * 30 + 'End: testing' + '-' * 30)
        acc_aver = acc / float(y_test.shape[0])
        print('Number of images: {}, Accuracy: {}'.format(k, acc_aver))
        mean_acc += acc_aver
        accuraces.append(acc_aver)
    mean_acc = mean_acc / float(n_tests)
    # Population variance of per-run accuracies.
    var_acc = 0
    accuraces = np.array(accuraces)
    for i in range(accuraces.shape[0]):
        var_acc += (accuraces[i] - mean_acc) * (accuraces[i] - mean_acc)
    var_acc /= float(n_tests)
    print('\nTesting is finished!')
    print('Testing options:\nAngles range from {} to {}\tIs only 3 and 4: {}'.format(ang_min, ang_max, is_only_3_and_4))
    print('\nMean testing accuracy for {} runs: {}'.format(n_tests, mean_acc))
    print('Variance of testing accuracy for {} runs: {}'.format(n_tests, var_acc))
def main(args):
    """Train the capsule network with spread + reconstruction loss.

    args[1] selects the dataset. Batches with NaN gradients are detected via
    tf.check_numerics and discarded rather than aborting the run. The spread
    loss margin m is annealed linearly from 0.2 to 0.9 once per epoch, and a
    checkpoint is written at each epoch boundary.
    """
    assert len(args) == 2 and isinstance(args[1], str)
    # Get dataset name
    dataset_name = args[1]  # mnist
    logger.info(f'Using dataset: {dataset_name}')
    # Set reproducible random seed
    tf.set_random_seed(1234)
    coord_add = get_coord_add(dataset_name)  # (3, 3, 2)
    dataset_size = get_dataset_size_train(dataset_name)  # 55,000
    num_classes = get_num_classes(dataset_name)  # 10
    create_inputs = get_create_inputs(dataset_name, is_train=True, epochs=cfg.epoch)

    with tf.Graph().as_default(), tf.device('/cpu:0'):
        # Get global_step
        global_step = tf.get_variable('global_step', [],
                                      initializer=tf.constant_initializer(0),
                                      trainable=False)
        num_batches_per_epoch = dataset_size // cfg.batch_size  # 1100
        opt = tf.train.AdamOptimizer()
        # Get batch from data queue
        batch_x, batch_labels = create_inputs()  # (50 28, 28, 1), (50,)
        # Spread-loss margin, fed each step so it can be annealed.
        m_op = tf.placeholder(dtype=tf.float32, shape=())
        with tf.device('/gpu:0'):
            with slim.arg_scope([slim.variable], device='/cpu:0'):
                # [0, 1]-scaled copy of the input used as reconstruction target.
                batch_squash = tf.divide(batch_x, 255.)
                batch_x = slim.batch_norm(batch_x, center=False,
                                          is_training=True, trainable=True)
                output, pose_out = net.build_arch(
                    batch_x, coord_add, is_train=True,
                    num_classes=num_classes)  # (50, 10), (50, 10, 18)
                tf.logging.debug(pose_out.get_shape())
                # Define loss = spread_loss + reconstruction loss
                loss, spread_loss, mse, _ = net.spread_loss(
                    output, pose_out, batch_squash, batch_labels, m_op)
                acc = net.test_accuracy(output, batch_labels)
                tf.summary.scalar('spread_loss', spread_loss)
                tf.summary.scalar('reconstruction_loss', mse)
                tf.summary.scalar('all_loss', loss)
                tf.summary.scalar('train_acc', acc)
            grad = opt.compute_gradients(loss)
            # See: https://stackoverflow.com/questions/40701712/how-to-check-nan-in-gradients-in-tensorflow-when-updating
            # check_numerics raises InvalidArgumentError on NaN/Inf, which the
            # training loop below catches to skip the offending batch.
            grad_check = [
                tf.check_numerics(g, message='Gradient NaN Found!')
                for g, _ in grad if g is not None
            ] + [tf.check_numerics(loss, message='Loss NaN Found')]
            # Apply graident
            # Ordering matters: numeric checks, then batch-norm update ops,
            # then the actual gradient application.
            with tf.control_dependencies(grad_check):
                update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
                with tf.control_dependencies(update_ops):
                    train_op = opt.apply_gradients(grad, global_step=global_step)

        # Set Session settings
        sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True,
                                                log_device_placement=False))
        sess.run(tf.local_variables_initializer())
        sess.run(tf.global_variables_initializer())

        # Set Saver
        var_to_save = [v for v in tf.global_variables()
                       if 'Adam' not in v.name]  # Don't save redundant Adam beta/gamma
        saver = tf.train.Saver(var_list=var_to_save, max_to_keep=cfg.epoch)

        # Display parameters
        total_p = np.sum([np.prod(v.get_shape().as_list())
                          for v in var_to_save]).astype(np.int32)
        train_p = np.sum([np.prod(v.get_shape().as_list())
                          for v in tf.trainable_variables()]).astype(np.int32)
        logger.info('Total Parameters: {}'.format(total_p))
        logger.info('Trainable Parameters: {}'.format(train_p))

        # read snapshot
        # latest = os.path.join(cfg.logdir, 'model.ckpt-4680')
        # saver.restore(sess, latest)
        # Set summary op
        summary_op = tf.summary.merge_all()

        # Start coord & queue
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)

        # Set summary writer
        if not os.path.exists(cfg.logdir + '/caps/{}/train_log/'.format(dataset_name)):
            os.makedirs(cfg.logdir + '/caps/{}/train_log/'.format(dataset_name))
        summary_writer = tf.summary.FileWriter(
            cfg.logdir + f"/caps/{dataset_name}/train_log/",
            graph=sess.graph)  # graph = sess.graph, huge!

        # Main loop
        m_min = 0.2
        m_max = 0.9
        m = m_min
        for step in range(cfg.epoch * num_batches_per_epoch + 1):
            tic = time.time()
            # TF queue would pop batch until no file
            try:
                _, loss_value, summary_str = sess.run(
                    [train_op, loss, summary_op], feed_dict={m_op: m})
                logger.info('%d iteration finishs in ' % step
                            + '%f second' % (time.time() - tic)
                            + ' loss=%f' % loss_value)
            except KeyboardInterrupt:
                sess.close()
                sys.exit()
            except tf.errors.InvalidArgumentError:
                # Raised by the check_numerics ops: skip this batch entirely.
                logger.warning('%d iteration contains NaN gradients. Discard.' % step)
                continue
            else:
                if step % 5 == 0:
                    summary_writer.add_summary(summary_str, step)

                """Epoch wise linear annealling."""
                if (step % num_batches_per_epoch) == 0:
                    if step > 0:
                        m += (m_max - m_min) / (cfg.epoch * cfg.m_schedule)
                        if m > m_max:
                            m = m_max

                    # Save model periodically
                    ckpt_path = os.path.join(
                        cfg.logdir + '/caps/{}/'.format(dataset_name),
                        'model-{:.4f}.ckpt'.format(loss_value))
                    saver.save(sess, ckpt_path, global_step=step)
def main(args):
    """Train the capsule network with spread + reconstruction loss.

    NOTE(review): this function appears to be an exact duplicate of another
    main() in this file — consider deduplicating.

    args[1] selects the dataset. NaN gradients are caught via tf.check_numerics
    and the offending batch is discarded. The spread-loss margin m anneals
    linearly from 0.2 to 0.9 at epoch boundaries, where checkpoints are saved.
    """
    assert len(args) == 2 and isinstance(args[1], str)
    # Get dataset name
    dataset_name = args[1]  # mnist
    logger.info(f'Using dataset: {dataset_name}')
    # Set reproducible random seed
    tf.set_random_seed(1234)
    coord_add = get_coord_add(dataset_name)  # (3, 3, 2)
    dataset_size = get_dataset_size_train(dataset_name)  # 55,000
    num_classes = get_num_classes(dataset_name)  # 10
    create_inputs = get_create_inputs(dataset_name, is_train=True, epochs=cfg.epoch)

    with tf.Graph().as_default(), tf.device('/cpu:0'):
        # Get global_step
        global_step = tf.get_variable('global_step', [],
                                      initializer=tf.constant_initializer(0),
                                      trainable=False)
        num_batches_per_epoch = dataset_size // cfg.batch_size  # 1100
        opt = tf.train.AdamOptimizer()
        # Get batch from data queue
        batch_x, batch_labels = create_inputs()  # (50 28, 28, 1), (50,)
        # Spread-loss margin, fed each step so it can be annealed.
        m_op = tf.placeholder(dtype=tf.float32, shape=())
        with tf.device('/gpu:0'):
            with slim.arg_scope([slim.variable], device='/cpu:0'):
                # [0, 1]-scaled copy of the input used as reconstruction target.
                batch_squash = tf.divide(batch_x, 255.)
                batch_x = slim.batch_norm(batch_x, center=False,
                                          is_training=True, trainable=True)
                output, pose_out = net.build_arch(
                    batch_x, coord_add, is_train=True,
                    num_classes=num_classes)  # (50, 10), (50, 10, 18)
                tf.logging.debug(pose_out.get_shape())
                # Define loss = spread_loss + reconstruction loss
                loss, spread_loss, mse, _ = net.spread_loss(
                    output, pose_out, batch_squash, batch_labels, m_op)
                acc = net.test_accuracy(output, batch_labels)
                tf.summary.scalar('spread_loss', spread_loss)
                tf.summary.scalar('reconstruction_loss', mse)
                tf.summary.scalar('all_loss', loss)
                tf.summary.scalar('train_acc', acc)
            grad = opt.compute_gradients(loss)
            # See: https://stackoverflow.com/questions/40701712/how-to-check-nan-in-gradients-in-tensorflow-when-updating
            # check_numerics raises InvalidArgumentError on NaN/Inf, which the
            # training loop below catches to skip the offending batch.
            grad_check = [tf.check_numerics(g, message='Gradient NaN Found!')
                          for g, _ in grad if g is not None] \
                + [tf.check_numerics(loss, message='Loss NaN Found')]
            # Apply graident
            # Ordering matters: numeric checks, then batch-norm update ops,
            # then the actual gradient application.
            with tf.control_dependencies(grad_check):
                update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
                with tf.control_dependencies(update_ops):
                    train_op = opt.apply_gradients(grad, global_step=global_step)

        # Set Session settings
        sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True,
                                                log_device_placement=False))
        sess.run(tf.local_variables_initializer())
        sess.run(tf.global_variables_initializer())

        # Set Saver
        var_to_save = [v for v in tf.global_variables()
                       if 'Adam' not in v.name]  # Don't save redundant Adam beta/gamma
        saver = tf.train.Saver(var_list=var_to_save, max_to_keep=cfg.epoch)

        # Display parameters
        total_p = np.sum([np.prod(v.get_shape().as_list())
                          for v in var_to_save]).astype(np.int32)
        train_p = np.sum([np.prod(v.get_shape().as_list())
                          for v in tf.trainable_variables()]).astype(np.int32)
        logger.info('Total Parameters: {}'.format(total_p))
        logger.info('Trainable Parameters: {}'.format(train_p))

        # read snapshot
        # latest = os.path.join(cfg.logdir, 'model.ckpt-4680')
        # saver.restore(sess, latest)
        # Set summary op
        summary_op = tf.summary.merge_all()

        # Start coord & queue
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)

        # Set summary writer
        if not os.path.exists(cfg.logdir + '/caps/{}/train_log/'.format(dataset_name)):
            os.makedirs(cfg.logdir + '/caps/{}/train_log/'.format(dataset_name))
        summary_writer = tf.summary.FileWriter(
            cfg.logdir + f"/caps/{dataset_name}/train_log/",
            graph=sess.graph)  # graph = sess.graph, huge!

        # Main loop
        m_min = 0.2
        m_max = 0.9
        m = m_min
        for step in range(cfg.epoch * num_batches_per_epoch + 1):
            tic = time.time()
            # TF queue would pop batch until no file
            try:
                _, loss_value, summary_str = sess.run(
                    [train_op, loss, summary_op], feed_dict={m_op: m})
                logger.info('%d iteration finishs in ' % step
                            + '%f second' % (time.time() - tic)
                            + ' loss=%f' % loss_value)
            except KeyboardInterrupt:
                sess.close()
                sys.exit()
            except tf.errors.InvalidArgumentError:
                # Raised by the check_numerics ops: skip this batch entirely.
                logger.warning('%d iteration contains NaN gradients. Discard.' % step)
                continue
            else:
                if step % 5 == 0:
                    summary_writer.add_summary(summary_str, step)

                """Epoch wise linear annealling."""
                if (step % num_batches_per_epoch) == 0:
                    if step > 0:
                        m += (m_max - m_min) / (cfg.epoch * cfg.m_schedule)
                        if m > m_max:
                            m = m_max

                    # Save model periodically
                    ckpt_path = os.path.join(
                        cfg.logdir + '/caps/{}/'.format(dataset_name),
                        'model-{:.4f}.ckpt'.format(loss_value))
                    saver.save(sess, ckpt_path, global_step=step)
def main(args):
    """Evaluate saved checkpoints of a model on the test input queue.

    Args:
        args: argv-style list; args[1] is the dataset name and args[2]
            is the model name, one of 'caps' or 'cnn_baseline'.

    Raises:
        ValueError: if args[2] is not a recognized model name.

    For every training epoch, the matching checkpoint file is located by its
    global-step suffix, restored, and evaluated for num_batches_test batches;
    per-batch accuracy goes to TensorBoard and the epoch average is printed.
    """
    assert len(args) == 3 and isinstance(args[1], str) and isinstance(args[2], str)
    dataset_name = args[1]
    model_name = args[2]
    coord_add = get_coord_add(dataset_name)
    dataset_size_train = get_dataset_size_train(dataset_name)
    dataset_size_test = get_dataset_size_test(dataset_name)
    num_classes = get_num_classes(dataset_name)
    create_inputs = get_create_inputs(
        dataset_name, is_train=False, epochs=cfg.epoch)

    """Set reproduciable random seed"""
    tf.set_random_seed(1234)

    with tf.Graph().as_default():
        # Train-batch count maps epoch number -> checkpoint global step.
        num_batches_per_epoch_train = int(dataset_size_train / cfg.batch_size)
        # Evaluate on 10% of the test set.
        num_batches_test = int(dataset_size_test / cfg.batch_size * 0.1)

        batch_x, batch_labels = create_inputs()
        batch_x = slim.batch_norm(batch_x, center=False, is_training=False, trainable=False)
        if model_name == "caps":
            output, _ = net.build_arch(batch_x, coord_add, is_train=False,
                                       num_classes=num_classes)
        elif model_name == "cnn_baseline":
            output = net.build_arch_baseline(batch_x, is_train=False,
                                             num_classes=num_classes)
        else:
            # BUGFIX: raising a plain string is a TypeError in Python 3;
            # raise a proper exception instead.
            raise ValueError(
                "Please select model from 'caps' or 'cnn_baseline' as the secondary argument of eval.py!")
        batch_acc = net.test_accuracy(output, batch_labels)
        saver = tf.train.Saver()

        step = 0
        summaries = []
        summaries.append(tf.summary.scalar('accuracy', batch_acc))
        summary_op = tf.summary.merge(summaries)

        with tf.Session(config=tf.ConfigProto(
                allow_soft_placement=True, log_device_placement=False)) as sess:
            sess.run(tf.local_variables_initializer())
            sess.run(tf.global_variables_initializer())

            # Input pipeline uses TF queue runners; start them before sess.run.
            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(sess=sess, coord=coord)
            if not os.path.exists(cfg.test_logdir + '/{}/{}/'.format(model_name, dataset_name)):
                os.makedirs(cfg.test_logdir + '/{}/{}/'.format(model_name, dataset_name))
            summary_writer = tf.summary.FileWriter(
                cfg.test_logdir + '/{}/{}/'.format(model_name, dataset_name),
                graph=sess.graph)  # graph=sess.graph, huge!

            files = os.listdir(cfg.logdir + '/{}/{}/'.format(model_name, dataset_name))
            for epoch in range(1, cfg.epoch):
                # requires a regex to adapt the loss value in the file name here
                ckpt_re = ".ckpt-%d" % (num_batches_per_epoch_train * epoch)
                # BUGFIX: reset per epoch so a missing checkpoint does not
                # silently re-evaluate the previous epoch's checkpoint (or
                # crash with NameError on the first epoch).
                ckpt = None
                for __file in files:
                    if __file.endswith(ckpt_re + ".index"):
                        # Strip ".index" to obtain the checkpoint prefix.
                        ckpt = os.path.join(
                            cfg.logdir + '/{}/{}/'.format(model_name, dataset_name),
                            __file[:-6])
                if ckpt is None:
                    print('no checkpoint found for epoch %d, skipping.' % epoch)
                    continue
                saver.restore(sess, ckpt)

                accuracy_sum = 0
                for i in range(num_batches_test):
                    batch_acc_v, summary_str = sess.run([batch_acc, summary_op])
                    print('%d batches are tested.' % step)
                    summary_writer.add_summary(summary_str, step)
                    accuracy_sum += batch_acc_v
                    step += 1
                ave_acc = accuracy_sum / num_batches_test
                print('the average accuracy is %f' % ave_acc)
            coord.join(threads)
def main(_): coord_add = [[[8., 8.], [12., 8.], [16., 8.]], [[8., 12.], [12., 12.], [16., 12.]], [[8., 16.], [12., 16.], [16., 16.]]] with tf.Graph().as_default(): batch_x, batch_labels, datanum = utils.get_batch_data( is_training=False) num_batches_test = math.ceil(datanum / cfg.batch_size) #get the ceiling int output = net.build_arch(batch_x, coord_add, is_train=False) predict = tf.argmax(output, axis=1) batch_acc = net.test_accuracy(output, batch_labels) saver = tf.train.Saver() step = 0 summaries = [] summaries.append(tf.summary.scalar('accuracy', batch_acc)) summary_op = tf.summary.merge(summaries) sess = tf.Session() tf.train.start_queue_runners(sess=sess) ckpt = tf.train.get_checkpoint_state(cfg.logdir) if ckpt and ckpt.model_checkpoint_path: saver.restore(sess, ckpt.model_checkpoint_path) print(ckpt.model_checkpoint_path) summary_writer = tf.summary.FileWriter(cfg.test_logdir, graph=sess.graph) for epoch in range(cfg.test_epoch): accuracy_sum = 0 for i in range(num_batches_test): y_pred, y, batch_acc_v, summary_str = sess.run( [predict, batch_labels, batch_acc, summary_op]) if i % 10 == 0: print('%d/%d batches are tested.' % (step, num_batches_test)) #print("labels:\n",batch_labels) print("Y:\n", y) print("Y_prediction:", batch_acc_v, "\n", y_pred) summary_writer.add_summary(summary_str, step) accuracy_sum += batch_acc_v step += 1 if i == 0: y_pred1 = y_pred label1 = y else: y_pred1 = np.concatenate((y_pred1, y_pred), axis=0) label1 = np.concatenate((label1, y), axis=0) #print("Label:",np.shape(label1),"\n", label1) ave_acc = accuracy_sum / num_batches_test # print("The last batch----Y:",np.shape(y),"\n", y) # print("Y_prediction:", batch_acc_v, "\n", y_pred) print(epoch, 'epoch: average accuracy is %f' % ave_acc) print(np.shape(y_pred1), ",", datanum) label1 = label1[:datanum] y_pred1 = y_pred1[:datanum] print("label:", np.shape(label1)) trade_data.out_indi_data(cfg.test_dataset, y_pred1, datalen=cfg.image_size)
def main(args):
    """Train the capsule model with spread loss and exponential-decay Adam.

    args[1] selects the dataset. The spread-loss margin m anneals linearly
    from 0.2 to 0.9 at epoch boundaries; a checkpoint (named with the current
    loss value) is saved at each epoch boundary.
    """
    assert len(args) == 2 and isinstance(args[1], str)
    dataset_name = args[1]
    logger.info('Using dataset: {}'.format(dataset_name))
    coord_add = get_coord_add(dataset_name)
    dataset_size = get_dataset_size_train(dataset_name)
    num_classes = get_num_classes(dataset_name)
    create_inputs = get_create_inputs(dataset_name, is_train=True, epochs=cfg.epoch)

    """Set reproduciable random seed"""
    tf.set_random_seed(1234)

    with tf.Graph().as_default(), tf.device('/cpu:0'):
        """Get global_step."""
        global_step = tf.get_variable('global_step', [],
                                      initializer=tf.constant_initializer(0),
                                      trainable=False)

        """Get batches per epoch."""
        num_batches_per_epoch = int(dataset_size / cfg.batch_size)

        """Set tf summaries."""
        summaries = []

        """Use exponential decay leanring rate?"""
        # Decay 1e-3 by 0.66 every 200 steps, floored at 1e-5.
        lrn_rate = tf.maximum(
            tf.train.exponential_decay(1e-3, global_step, 2e2, 0.66), 1e-5)
        summaries.append(tf.summary.scalar('learning_rate', lrn_rate))
        opt = tf.train.AdamOptimizer(lrn_rate)

        """Get batch from data queue."""
        batch_x, batch_labels = create_inputs()
        # batch_y = tf.one_hot(batch_labels, depth=10, axis=1, dtype=tf.float32)

        """Define the dataflow graph."""
        # Spread-loss margin, fed each step so it can be annealed.
        m_op = tf.placeholder(dtype=tf.float32, shape=())
        with tf.device('/gpu:0'):
            with slim.arg_scope([slim.variable], device='/cpu:0'):
                output = net.build_arch(batch_x, coord_add, is_train=True,
                                        num_classes=num_classes)
                # loss = net.cross_ent_loss(output, batch_labels)
                loss = net.spread_loss(output, batch_labels, m_op)

            """Compute gradient."""
            grad = opt.compute_gradients(loss)

        """Add loss to summary."""
        summaries.append(tf.summary.scalar('spread_loss', loss))

        """Apply graident."""
        train_op = opt.apply_gradients(grad, global_step=global_step)

        """Set Session settings."""
        sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True,
                                                log_device_placement=False))
        sess.run(tf.local_variables_initializer())
        sess.run(tf.global_variables_initializer())

        """Set Saver."""
        var_to_save = [v for v in tf.global_variables()
                       if 'Adam' not in v.name]  # Don't save redundant Adam beta/gamma
        saver = tf.train.Saver(var_list=var_to_save, max_to_keep=cfg.epoch)

        """Display parameters"""
        total_p = np.sum([np.prod(v.get_shape().as_list())
                          for v in var_to_save]).astype(np.int32)
        train_p = np.sum([np.prod(v.get_shape().as_list())
                          for v in tf.trainable_variables()]).astype(np.int32)
        logger.info('Total Parameters: {}'.format(total_p))
        logger.info('Trainable Parameters: {}'.format(train_p))

        # read snapshot
        # latest = os.path.join(cfg.logdir, 'model.ckpt-4680')
        # saver.restore(sess, latest)
        """Set summary op."""
        summary_op = tf.summary.merge(summaries)

        """Start coord & queue."""
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)

        """Set summary writer"""
        summary_writer = tf.summary.FileWriter(
            cfg.logdir, graph=None)  # graph = sess.graph, huge!

        """Main loop."""
        m_min = 0.2
        m_max = 0.9
        m = m_min
        for step in range(cfg.epoch * num_batches_per_epoch):
            tic = time.time()
            """"TF queue would pop batch until no file"""
            _, loss_value = sess.run([train_op, loss], feed_dict={m_op: m})
            logger.info('%d iteration finishs in ' % step
                        + '%f second' % (time.time() - tic)
                        + ' loss=%f' % loss_value)

            """Check NaN"""
            assert not np.isnan(loss_value), 'loss is nan'

            """Write to summary."""
            if step % 10 == 0:
                # NOTE(review): this runs an extra forward pass (fresh batch)
                # just for the summary — confirm that is intended.
                summary_str = sess.run(summary_op, feed_dict={m_op: m})
                summary_writer.add_summary(summary_str, step)

            """Epoch wise linear annealling."""
            if (step % num_batches_per_epoch) == 0:
                if step > 0:
                    m += (m_max - m_min) / (cfg.epoch * 0.6)
                    if m > m_max:
                        m = m_max

                """Save model periodically"""
                ckpt_path = os.path.join(
                    cfg.logdir, 'model-{}.ckpt'.format(round(loss_value, 4)))
                saver.save(sess, ckpt_path, global_step=step)

        """Join threads"""
        coord.join(threads)
def main(_):
    """Train on MNIST with spread + regularization loss via feed_dict batches.

    Optionally pads 28x28 inputs to 40x40. The spread-loss margin starts at
    0.5 and grows toward 0.9 as training accuracy improves. The model is
    saved every ~5 minutes, at the final epoch, or when training accuracy
    reaches 1.0 (which also stops training).
    """
    height = width = 28
    if isPadding:
        height = width = 40
    train_x, train_y, test_x, test_y = get_mnist()
    X = tf.placeholder(tf.float32, [cfg.batch_size, height, width, 1])
    Y = tf.placeholder(tf.int32, [cfg.batch_size])
    M = tf.placeholder(tf.float32, ())  # spread-loss margin
    predict, check = net.build_arch(X)
    predict_class = tf.cast(tf.argmax(predict, -1), tf.int32)
    accuracy = tf.reduce_mean(tf.cast(tf.equal(predict_class, Y), tf.float32))
    loss_spread = net.spread_loss(predict, Y, M)
    loss_regular = tf.add_n(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
    loss = loss_spread + loss_regular
    train_op = tf.train.AdamOptimizer(0.001, 0.5).minimize(loss)
    sess = tf.Session()
    saver = tf.train.Saver()
    if isNewTrain:
        sess.run(tf.global_variables_initializer())
        print('Initialized!')
    else:
        saver.restore(sess, modelName)
        print("Model restored")
    margin = 0.5
    start_sec = time.time()
    # Cap the per-epoch work at cfg.max_count samples.
    iteration_train = int(np.minimum(cfg.max_count, len(train_x)) / cfg.batch_size)
    iteration_test = int(np.minimum(cfg.max_count, len(test_x)) / cfg.batch_size)
    num_batches = cfg.batch_size
    for step in range(cfg.epoch + 1):
        acc_sum_train = 0.0
        acc_sum_test = 0.0
        for i in range(iteration_train):
            tic = time.time()
            start = i * num_batches
            end = start + num_batches
            batch_x = train_x[start:end]
            if isPadding:
                batch_x = padding(batch_x, 40)
            dic = {X: batch_x, Y: train_y[start:end], M: margin}
            _, loss_, acc, check_ = sess.run([train_op, loss, accuracy, check], dic)
            # Accumulate the running mean of batch accuracies.
            acc_sum_train += acc / iteration_train
            now = strftime("%H:%M:%S", localtime())
            # Log twice per epoch.
            # NOTE(review): int(iteration_train / 2) is 0 when iteration_train < 2
            # (ZeroDivisionError) — confirm cfg guarantees larger datasets.
            if i % int(iteration_train / 2) == 0:
                print('%d/%d %d/%d batch acc:%.3f, loss:%.5f margin:%.2f sec:%.2f check:%.3f'
                      % (step, cfg.epoch, i, iteration_train, acc, loss_,
                         margin, time.time() - tic, np.std(check_)))
            assert not np.isnan(loss_)
        for i in range(iteration_test):
            start = i * num_batches
            end = start + num_batches
            batch_x = test_x[start:end]
            if isPadding:
                batch_x = padding(batch_x, 40)
            acc_te = sess.run(accuracy, {X: batch_x, Y: test_y[start:end], M: margin})
            acc_sum_test += acc_te / iteration_test
        print('%d/%d train:%.3f test:%.3f' % (step, cfg.epoch, acc_sum_train, acc_sum_test))
        this_sec = time.time()
        # Grow the margin only once training accuracy overtakes it.
        if margin < 0.9 and margin < acc:
            margin = np.minimum(margin + 0.01, 0.9)
        # Save on: perfect train accuracy, final epoch, or every ~5 minutes.
        if acc_sum_train >= 1 or step == cfg.epoch or this_sec - start_sec > 60 * 5:
            start_sec = this_sec
            save_path = saver.save(sess, modelName)
            # NOTE(review): `now` is set inside the train loop — unbound here
            # if iteration_train == 0; verify.
            print("Model Saved, time:%s, %s" % (now, save_path))
            if acc_sum_train >= 1:
                break
    print('training finish', acc_sum_train, acc_sum_test)
def main(args):
    """Evaluate saved checkpoints on the test queue using a CPU-only session.

    Args:
        args: argv-style list; args[1] is the dataset name and args[2]
            is the model name, one of 'caps' or 'cnn_baseline'.

    Raises:
        ValueError: if args[2] is not a recognized model name.

    For every training epoch, the matching checkpoint (found by global-step
    suffix) is restored and evaluated for num_batches_test batches; per-batch
    accuracy is written to TensorBoard and the epoch average is printed.
    """
    assert len(args) == 3 and isinstance(args[1], str) and isinstance(
        args[2], str)
    dataset_name = args[1]
    model_name = args[2]
    coord_add = get_coord_add(dataset_name)
    dataset_size_train = get_dataset_size_train(dataset_name)
    dataset_size_test = get_dataset_size_test(dataset_name)
    num_classes = get_num_classes(dataset_name)
    create_inputs = get_create_inputs(dataset_name, is_train=False,
                                      epochs=cfg.epoch)

    """Set reproduciable random seed"""
    tf.set_random_seed(1234)

    with tf.Graph().as_default():
        # Train-batch count maps epoch number -> checkpoint global step.
        num_batches_per_epoch_train = int(dataset_size_train / cfg.batch_size)
        # Evaluate on 10% of the test set.
        num_batches_test = int(dataset_size_test / cfg.batch_size * 0.1)

        batch_x, batch_labels = create_inputs()
        batch_x = slim.batch_norm(batch_x, center=False, is_training=False, trainable=False)
        if model_name == "caps":
            output, _ = net.build_arch(batch_x, coord_add, is_train=False,
                                       num_classes=num_classes)
        elif model_name == "cnn_baseline":
            output = net.build_arch_baseline(batch_x, is_train=False,
                                             num_classes=num_classes)
        else:
            # BUGFIX: raising a plain string is a TypeError in Python 3;
            # raise a proper exception instead.
            raise ValueError(
                "Please select model from 'caps' or 'cnn_baseline' as the secondary argument of eval.py!")
        batch_acc = net.test_accuracy(output, batch_labels)
        saver = tf.train.Saver()

        step = 0
        summaries = []
        summaries.append(tf.summary.scalar('accuracy', batch_acc))
        summary_op = tf.summary.merge(summaries)

        # device_count={'GPU': 0} pins evaluation to the CPU.
        session_config = tf.ConfigProto(
            device_count={'GPU': 0},
            gpu_options={
                'allow_growth': 1,
                # 'per_process_gpu_memory_fraction': 0.1,
                'visible_device_list': '0'
            },
            allow_soft_placement=True)
        with tf.Session(config=session_config) as sess:
            sess.run(tf.local_variables_initializer())
            sess.run(tf.global_variables_initializer())

            # Input pipeline uses TF queue runners; start them before sess.run.
            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(sess=sess, coord=coord)
            if not os.path.exists(cfg.test_logdir + '/{}/{}/'.format(model_name, dataset_name)):
                os.makedirs(cfg.test_logdir + '/{}/{}/'.format(model_name, dataset_name))
            summary_writer = tf.summary.FileWriter(
                cfg.test_logdir + '/{}/{}/'.format(model_name, dataset_name),
                graph=sess.graph)  # graph=sess.graph, huge!

            files = os.listdir(cfg.logdir + '/{}/{}/'.format(model_name, dataset_name))
            for epoch in range(1, cfg.epoch):
                # requires a regex to adapt the loss value in the file name here
                ckpt_re = ".ckpt-%d" % (num_batches_per_epoch_train * epoch)
                # BUGFIX: reset per epoch so a missing checkpoint does not
                # silently re-evaluate the previous epoch's checkpoint (or
                # crash with NameError on the first epoch).
                ckpt = None
                for __file in files:
                    if __file.endswith(ckpt_re + ".index"):
                        # Strip ".index" to obtain the checkpoint prefix.
                        ckpt = os.path.join(
                            cfg.logdir + '/{}/{}/'.format(model_name, dataset_name),
                            __file[:-6])
                if ckpt is None:
                    print('no checkpoint found for epoch %d, skipping.' % epoch)
                    continue
                saver.restore(sess, ckpt)

                accuracy_sum = 0
                for i in range(num_batches_test):
                    batch_acc_v, summary_str = sess.run(
                        [batch_acc, summary_op])
                    print('%d batches are tested.' % step)
                    summary_writer.add_summary(summary_str, step)
                    accuracy_sum += batch_acc_v
                    step += 1
                ave_acc = accuracy_sum / num_batches_test
                print('the average accuracy is %f' % ave_acc)
            coord.join(threads)
def main(_):
    """Train the capsule model on trade data fed via feed_dict.

    Resumes from the latest checkpoint in cfg.logdir when present (resuming
    the global step from the checkpoint filename). Prints tfprof parameter
    statistics at graph-build time, traces session metadata every 30 batches,
    and saves every cfg.saveperiod iterations plus once at the end.
    """
    # Fixed 3x3 grid of receptive-field centers, normalized by image size 28.
    coord_add = [[[8., 8.], [12., 8.], [16., 8.]],
                 [[8., 12.], [12., 12.], [16., 12.]],
                 [[8., 16.], [12., 16.], [16., 16.]]]
    coord_add = np.array(coord_add, dtype=np.float32) / 28.
    data = utils.load_trade(is_training=True)
    datanum = data.num_examples
    with tf.Graph().as_default(), tf.device('/cpu:0'):
        batch_x = tf.placeholder(tf.float32, [cfg.batch_size, cfg.image_size, cfg.image_size, 3])
        batch_labels = tf.placeholder(tf.int32, [cfg.batch_size])
        global_step = tf.get_variable('global_step', [],
                                      initializer=tf.constant_initializer(0),
                                      trainable=False)
        opt = tf.train.AdamOptimizer()
        # batch_x, batch_labels,datanum = utils.get_shuffle_batch_data(is_training=True)
        num_batches_per_epoch = int(datanum / cfg.batch_size)
        print(datanum, num_batches_per_epoch)
        # batch_y = tf.one_hot(batch_labels, depth=10, axis=1, dtype=tf.float32)
        # Spread-loss margin, fed each step so it can be annealed.
        m_op = tf.placeholder(dtype=tf.float32, shape=())
        with tf.device('/gpu:0'):
            with slim.arg_scope([slim.variable], device='/cpu:0'):
                output = net.build_arch(batch_x, coord_add, is_train=True)
                # loss = net.cross_ent_loss(output, batch_labels)
                loss = net.spread_loss(output, batch_labels, m_op)
                accuracy = net.test_accuracy(output, batch_labels)
            grad = opt.compute_gradients(loss)
        loss_name = 'spread_loss'

        # Print trainable variable parameter statistics to stdout.
        # By default, statistics are associated with each graph node.
        param_stats = tf.contrib.tfprof.model_analyzer.print_model_analysis(
            tf.get_default_graph(),
            tfprof_options=tf.contrib.tfprof.model_analyzer.
            TRAINABLE_VARS_PARAMS_STAT_OPTIONS)
        # param_stats is tensorflow.tfprof.TFGraphNodeProto proto.
        # Let's print the root below.
        sys.stdout.write('total_params: %d\n' % param_stats.total_parameters)

        summaries = []
        summaries.append(tf.summary.scalar(loss_name, loss))
        summaries.append(tf.summary.scalar("accuracy", accuracy))
        train_op = opt.apply_gradients(grad, global_step=global_step)

        sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True,
                                                log_device_placement=False))
        sess.run(tf.global_variables_initializer())
        # add addition options to trace the session execution
        options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
        run_metadata = tf.RunMetadata()
        saver = tf.train.Saver(tf.global_variables(), max_to_keep=10)  # cfg.epoch)

        # restore from the check point
        ckpt = tf.train.get_checkpoint_state(cfg.logdir)
        if ckpt and ckpt.model_checkpoint_path:
            saver.restore(sess, ckpt.model_checkpoint_path)
            # Resume step count from the "-NNN" suffix of the checkpoint name.
            initial_step = int(ckpt.model_checkpoint_path.split('-')[1])
            print(ckpt, ckpt.model_checkpoint_path, initial_step)
        else:
            initial_step = 0
        m = 0.2  # initial spread-loss margin
        summary_op = tf.summary.merge(summaries)
        tf.train.start_queue_runners(sess=sess)
        summary_writer = tf.summary.FileWriter(cfg.logdir, graph=sess.graph)
        cal_num = 0  # batches processed in this run (excludes restored steps)
        for step in range(cfg.epoch):
            for i in range(num_batches_per_epoch):
                tic = time.time()
                x, y = data.next_batch(cfg.batch_size)
                _, loss_value, accuracy_val = sess.run(
                    [train_op, loss, accuracy],
                    feed_dict={batch_x: x, batch_labels: y, m_op: m})
                print('%d/%d, %d/%d iteration is finished in '
                      % (step, cfg.epoch, i, num_batches_per_epoch)
                      + '%f second' % (time.time() - tic)
                      + ',m:', m, ',loss: %f' % loss_value,
                      ",accuracy:", accuracy_val)
                assert not np.isnan(loss_value), 'loss is nan'
                cal_num += 1
                if i % 30 == 0:
                    # Traced run: collects full run metadata for TensorBoard.
                    summary_str = sess.run(
                        summary_op,
                        feed_dict={batch_x: x, batch_labels: y, m_op: m},
                        options=options,
                        run_metadata=run_metadata)
                    summary_writer.add_run_metadata(run_metadata, 'step%d' % cal_num)
                    summary_writer.add_summary(summary_str, initial_step + cal_num)
                    # Print to stdout an analysis of the memory usage and the timing information
                    # broken down by operations.
                    # tf.contrib.tfprof.model_analyzer.print_model_analysis(
                    #     tf.get_default_graph(),
                    #     run_meta=run_metadata,
                    #     tfprof_options=tf.contrib.tfprof.model_analyzer.PRINT_ALL_TIMING_MEMORY)
                    # fetched_timeline = timeline.Timeline(run_metadata.step_stats)
                    # chrome_trace = fetched_timeline.generate_chrome_trace_format()
                    # with open('./time_line/timeline_02_step_%d.json' % i, 'w') as f:
                    #     f.write(chrome_trace)
                if cal_num % cfg.saveperiod == 0:
                    ckpt_path = os.path.join(cfg.logdir, 'model.ckpt')
                    saver.save(sess, ckpt_path, global_step=initial_step + cal_num)
                # Anneal the margin per batch so it reaches 0.9 within one epoch.
                # NOTE(review): original indentation is lost — placement inside
                # the batch loop inferred from the per-batch step size; confirm.
                if m < 0.9:
                    m += round((0.9 - 0.2) / num_batches_per_epoch, 5)
                else:
                    m = 0.9
        # Final checkpoint after all epochs.
        ckpt_path = os.path.join(cfg.logdir, 'model.ckpt')
        saver.save(sess, ckpt_path, global_step=initial_step + cal_num)
def main(args):
    """Train a matrix-capsule (EM-routing) network on the dataset named in argv.

    Args:
        args: command-line argument list. args[1] must be the dataset name
            string; it selects coord_add, training-set size and class count.

    Side effects: builds a TF1 graph, runs the training loop, and writes
    summaries, reconstruction images and checkpoints under cfg.logdir.
    No return value.
    """
    """Get dataset hyperparameters."""
    # NOTE(review): assert is stripped under `python -O`; an explicit check
    # would be more robust, left unchanged here.
    assert len(args) == 2 and isinstance(args[1], str)
    dataset_name = args[1]
    logger.info('Using dataset: {}'.format(dataset_name))

    """Set reproduciable random seed"""
    tf.set_random_seed(1234)

    # Dataset-specific lookups (helpers defined elsewhere in the project).
    coord_add = get_coord_add(dataset_name)
    dataset_size = get_dataset_size_train(dataset_name)
    num_classes = get_num_classes(dataset_name)

    # Prepare Training Data
    # x_test / y_test are loaded but never used in this function.
    (x_train, y_train), (x_test, y_test) = utils.load_mnist_excluded()

    with tf.Graph().as_default():  # , tf.device('/cpu:0'):
        # Placeholders for input data and the targets.
        # IMG_DIM is a module-level constant; presumably (H, W, C) — confirm.
        x_input = tf.placeholder(tf.float32, (None, *IMG_DIM), name='Input')
        y_target = tf.placeholder(tf.int32, [None, ], name='Target')

        """Get global_step."""
        global_step = tf.get_variable('global_step', [],
                                      initializer=tf.constant_initializer(0),
                                      trainable=False)

        """Get batches per epoch."""
        num_batches_per_epoch = int(dataset_size / cfg.batch_size)

        """Use exponential decay leanring rate?"""
        # Decays by 0.8 every epoch, floored at 1e-5.
        lrn_rate = tf.maximum(
            tf.train.exponential_decay(1e-3, global_step,
                                       num_batches_per_epoch, 0.8), 1e-5)
        tf.summary.scalar('learning_rate', lrn_rate)
        # NOTE(review): the decayed rate is computed and logged but NOT passed
        # to the optimizer — Adam runs with its default learning rate.
        opt = tf.train.AdamOptimizer()  # lrn_rate

        """Define the dataflow graph."""
        # m_op: spread-loss margin, fed per step and annealed in the main loop.
        m_op = tf.placeholder(dtype=tf.float32, shape=())
        with tf.device('/gpu:0'):
            with slim.arg_scope([slim.variable]):  # , device='/cpu:0'):
                sample_batch = tf.identity(x_input)
                batch_labels = tf.identity(y_target)
                # Pixel values scaled to [0, 1] as the reconstruction target.
                batch_squash = tf.divide(sample_batch, 255.)
                batch_x = slim.batch_norm(sample_batch, center=False,
                                          is_training=True, trainable=True)
                output, pose_out = net.build_arch(batch_x, coord_add,
                                                  is_train=True,
                                                  num_classes=num_classes)
                tf.logging.debug(pose_out.get_shape())

                # loss = spread loss + reconstruction MSE (combined by net).
                loss, spread_loss, mse, reconstruction = net.spread_loss(
                    output, pose_out, batch_squash, batch_labels, m_op)
                sample_batch = tf.squeeze(sample_batch)
                # Stack originals above their 0-255 reconstructions for the
                # periodic image dump. IMAGE_SIZE is a module-level constant.
                decode_res_op = tf.concat([
                    sample_batch,
                    255 * tf.reshape(reconstruction,
                                     [cfg.batch_size, IMAGE_SIZE, IMAGE_SIZE])
                ], axis=0)
                acc = net.test_accuracy(output, batch_labels)
                tf.summary.scalar('spread_loss', spread_loss)
                tf.summary.scalar('reconstruction_loss', mse)
                tf.summary.scalar('all_loss', loss)
                tf.summary.scalar('train__batch_acc', acc)

            """Compute gradient."""
            grad = opt.compute_gradients(loss)
            # See: https://stackoverflow.com/questions/40701712/how-to-check-nan-in-gradients-in-tensorflow-when-updating
            # Guard ops that raise InvalidArgumentError on any NaN gradient or
            # NaN loss; the training loop catches that error and skips the step.
            grad_check = [
                tf.check_numerics(g, message='Gradient NaN Found!')
                for g, _ in grad if g is not None
            ] + [tf.check_numerics(loss, message='Loss NaN Found')]

        """Apply graident."""
        # UPDATE_OPS (batch-norm moving averages) must run before the weight
        # update; both are gated on the NaN checks above.
        with tf.control_dependencies(grad_check):
            update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
            with tf.control_dependencies(update_ops):
                train_op = opt.apply_gradients(grad, global_step=global_step)

        """Set Session settings."""
        sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True,
                                                log_device_placement=False))
        sess.run(tf.local_variables_initializer())
        sess.run(tf.global_variables_initializer())

        """Set Saver."""
        var_to_save = [
            v for v in tf.global_variables() if 'Adam' not in v.name
        ]  # Don't save redundant Adam beta/gamma
        saver = tf.train.Saver(var_list=var_to_save, max_to_keep=cfg.epoch)

        """Display parameters"""
        total_p = np.sum([
            np.prod(v.get_shape().as_list()) for v in var_to_save
        ]).astype(np.int32)
        train_p = np.sum([
            np.prod(v.get_shape().as_list()) for v in tf.trainable_variables()
        ]).astype(np.int32)
        logger.info('Total Parameters: {}'.format(total_p))
        logger.info('Trainable Parameters: {}'.format(train_p))

        # read snapshot
        # latest = os.path.join(cfg.logdir, 'model.ckpt-4680')
        # saver.restore(sess, latest)
        """Set summary op."""
        summary_op = tf.summary.merge_all()

        """Start coord & queue."""
        # NOTE(review): the coordinator is never asked to stop and threads are
        # never joined; the process relies on exit for cleanup.
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)

        """Set summary writer"""
        if not os.path.exists(cfg.logdir +
                              '/caps/{}/train_log/'.format(dataset_name)):
            os.makedirs(cfg.logdir +
                        '/caps/{}/train_log/'.format(dataset_name))
        summary_writer = tf.summary.FileWriter(
            cfg.logdir + '/caps/{}/train_log/'.format(dataset_name),
            graph=sess.graph)  # graph = sess.graph, huge!

        if not os.path.exists(cfg.logdir +
                              '/caps/{}/images/'.format(dataset_name)):
            os.makedirs(cfg.logdir + '/caps/{}/images/'.format(dataset_name))

        """Main loop."""
        # Spread-loss margin annealed linearly from m_min toward m_max.
        m_min = 0.2
        m_max = 0.9
        m = m_min
        max_iter = cfg.epoch * num_batches_per_epoch + 1
        for step in range(max_iter):
            tic = time.time()
            """"TF queue would pop batch until no file"""
            batch_x, batch_y = utils.get_random_mnist_batch(
                x_train, y_train, cfg.batch_size)
            try:
                _, loss_value, train_acc_val, summary_str, mse_value = sess.run(
                    [train_op, loss, acc, summary_op, mse],
                    feed_dict={
                        m_op: m,
                        x_input: batch_x,
                        y_target: batch_y
                    })
                # Single-line progress display, rewritten in place.
                sys.stdout.write(ERASE_LINE)
                sys.stdout.write('\r\r%d/%d iteration finishes in '
                                 % (step, max_iter)
                                 + '%f second' % (time.time() - tic)
                                 + ' training accuracy = %f' % train_acc_val
                                 + ' loss=%f' % loss_value
                                 + '\treconstruction_loss=%f' % mse_value)
                sys.stdout.flush()
                time.sleep(0.001)
            except KeyboardInterrupt:
                sess.close()
                sys.exit()
            except tf.errors.InvalidArgumentError:
                # Raised by the tf.check_numerics guards; skip this batch.
                logger.warning(
                    '%d iteration contains NaN gradients. Discard.' % step)
                continue
            else:
                # Runs only when sess.run succeeded (no NaN, no interrupt).
                """Write to summary."""
                if step % 10 == 0:
                    summary_writer.add_summary(summary_str, step)
                if step % 200 == 0:
                    # Dump a grid of inputs vs. reconstructions.
                    images = sess.run(decode_res_op,
                                      feed_dict={
                                          m_op: m,
                                          x_input: batch_x,
                                          y_target: batch_y
                                      })
                    image = combine_images(images)
                    img_name = cfg.logdir + '/caps/{}/images/'.format(
                        dataset_name) + "/step_{}.png".format(str(step))
                    Image.fromarray(image.astype(np.uint8)).save(img_name)

                """Epoch wise linear annealling."""
                if (step % num_batches_per_epoch) == 0:
                    if step > 0:
                        # Full anneal takes cfg.m_schedule * cfg.epoch epochs,
                        # clamped at m_max.
                        m += (m_max - m_min) / (cfg.epoch * cfg.m_schedule)
                        if m > m_max:
                            m = m_max

                    """Save model periodically """
                    # Checkpoint name embeds the latest batch loss.
                    ckpt_path = os.path.join(
                        cfg.logdir + '/caps/{}/'.format(dataset_name),
                        'model-{:.4f}.ckpt'.format(loss_value))
                    saver.save(sess, ckpt_path, global_step=step)

        # Final checkpoint after the loop completes.
        ckpt_path = os.path.join(cfg.logdir + '/caps/{}/'.format(dataset_name),
                                 'finall-model-{:.4f}.ckpt'.format(loss_value))
        saver.save(sess, ckpt_path, global_step=step)
    print('Training is finished!')