Example #1
def main(_):
    with tf.Graph().as_default():
        num_batches_per_epoch_train = int(60000 / cfg.batch_size)
        num_batches_test = int(10000 / cfg.batch_size)

        batch_x, batch_labels = create_inputs(is_train=False)
        output = net.build_arch(batch_x, is_train=False)
        batch_acc = net.test_accuracy(output, batch_labels)
        saver = tf.train.Saver()

        step = 0

        summaries = []
        summaries.append(tf.summary.scalar('accuracy', batch_acc))
        summary_op = tf.summary.merge(summaries)

        with tf.Session() as sess:
            tf.train.start_queue_runners(sess=sess)
            summary_writer = tf.summary.FileWriter(cfg.test_logdir,
                                                   graph=sess.graph)

            for epoch in range(cfg.epoch):
                ckpt = os.path.join(
                    cfg.logdir,
                    'model.ckpt-%d' % (num_batches_per_epoch_train * epoch))
                saver.restore(sess, ckpt)

                for i in range(num_batches_test):
                    summary_str = sess.run(summary_op)
                    print('%d batches are tested.' % step)
                    summary_writer.add_summary(summary_str, step)

                    step += 1
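
Note: Example #1 starts the queue runners without a tf.train.Coordinator, so the input threads are never stopped or joined. Later examples (#5, #7) pair the two; a minimal sketch of that pattern, assuming the same sess as above:

coord = tf.train.Coordinator()
threads = tf.train.start_queue_runners(sess=sess, coord=coord)
# ... run the evaluation loop ...
coord.request_stop()
coord.join(threads)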
Example #2
def main(_):
    with tf.Graph().as_default(), tf.device('/cpu:0'):
        global_step = tf.get_variable('global_step', [],
                                      initializer=tf.constant_initializer(0),
                                      trainable=False)

        num_batches_per_epoch = int(60000 / cfg.batch_size)
        opt = tf.train.AdamOptimizer()

        batch_x, batch_labels = create_inputs(is_train=True)
        # batch_y = tf.one_hot(batch_labels, depth=10, axis=1, dtype=tf.float32)
        with tf.device('/gpu:0'):
            with slim.arg_scope([slim.variable], device='/cpu:0'):
                output = net.build_arch(batch_x, is_train=True)
                loss = net.cross_ent_loss(output, batch_labels)

            grad = opt.compute_gradients(loss)

        loss_name = 'cross_ent_loss'

        summaries = []
        summaries.append(tf.summary.scalar(loss_name, loss))

        train_op = opt.apply_gradients(grad, global_step=global_step)

        sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True,
                                                log_device_placement=False))
        sess.run(tf.global_variables_initializer())

        saver = tf.train.Saver(tf.global_variables(), max_to_keep=cfg.epoch)

        # read snapshot
        # latest = os.path.join(cfg.logdir, 'model.ckpt-4680')
        # saver.restore(sess, latest)

        summary_op = tf.summary.merge(summaries)
        tf.train.start_queue_runners(sess=sess)

        summary_writer = tf.summary.FileWriter(cfg.logdir, graph=sess.graph)

        for step in range(cfg.epoch * num_batches_per_epoch):
            tic = time.time()
            _, loss_value = sess.run([train_op, loss])
            print('%d iteration is finished in ' % step + '%f second' %
                  (time.time() - tic))
            # Guard against divergence: abort if the loss becomes NaN.
            assert not np.isnan(loss_value), 'loss is nan'

            if step % 10 == 0:
                summary_str = sess.run(summary_op)
                summary_writer.add_summary(summary_str, step)

            if (step % num_batches_per_epoch) == 0:
                ckpt_path = os.path.join(cfg.logdir, 'model.ckpt')
                saver.save(sess, ckpt_path, global_step=step)
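
The commented-out "read snapshot" block above hard-codes one checkpoint file. A minimal sketch of the same idea, assuming the cfg, saver, and sess objects defined in Example #2, resumes from whatever checkpoint tf.train.latest_checkpoint finds in cfg.logdir:

# Resume training from the most recent checkpoint, if any (sketch).
latest = tf.train.latest_checkpoint(cfg.logdir)
if latest is not None:
    saver.restore(sess, latest)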
Example #3
def main(args):
    """Get dataset hyperparameters."""
    assert len(args) == 2 and isinstance(args[1], str)
    dataset_name = args[1]
    coord_add = get_coord_add(dataset_name)
    dataset_size_train = get_dataset_size_train(dataset_name)
    dataset_size_test = get_dataset_size_test(dataset_name)
    num_classes = get_num_classes(dataset_name)
    create_inputs = get_create_inputs(dataset_name,
                                      is_train=False,
                                      epochs=cfg.epoch)
    """Set reproduciable random seed"""
    tf.set_random_seed(1234)

    with tf.Graph().as_default():
        num_batches_per_epoch_train = int(dataset_size_train / cfg.batch_size)
        num_batches_test = int(dataset_size_test / cfg.batch_size)

        batch_x, batch_labels = create_inputs()
        output = net.build_arch(batch_x,
                                coord_add,
                                is_train=False,
                                num_classes=num_classes)
        batch_acc = net.test_accuracy(output, batch_labels)
        saver = tf.train.Saver()

        step = 0

        summaries = []
        summaries.append(tf.summary.scalar('accuracy', batch_acc))
        summary_op = tf.summary.merge(summaries)

        with tf.Session() as sess:
            tf.train.start_queue_runners(sess=sess)
            summary_writer = tf.summary.FileWriter(
                cfg.test_logdir, graph=None)  # graph=sess.graph, huge!

            for epoch in range(cfg.epoch):
                # If the checkpoint filenames embed the loss value, a regex match
                # over the files in cfg.logdir is needed here instead of the
                # fixed name built below.
                ckpt = os.path.join(
                    cfg.logdir,
                    'model.ckpt-%d' % (num_batches_per_epoch_train * epoch))
                saver.restore(sess, ckpt)

                accuracy_sum = 0
                for i in range(num_batches_test):
                    batch_acc_v, summary_str = sess.run(
                        [batch_acc, summary_op])
                    print('%d batches are tested.' % step)
                    summary_writer.add_summary(summary_str, step)

                    accuracy_sum += batch_acc_v

                    step += 1

                ave_acc = accuracy_sum / num_batches_test
                print('the average accuracy is %f' % ave_acc)
Example #4
def main(args):
    # 1. Set GPU mode
    session_config = cfg.set_gpu()

    with tf.Graph().as_default():

        # 2. Set the random seed, create the data batch, and get the number of classes
        tf.set_random_seed(1234)
        coord_add = cfg.get_coord_add(dataset_name)
        num_classes = cfg.get_num_classes(dataset_name)
        labels_txt = cfg.search_keyword_files(recognize_data_dir, recognize_labels_txt_keywords)
        labels_maps = cfg.read_label_txt_to_dict(labels_txt[0])


        with tf.Session(config=session_config) as sess:

            create_inputs = cfg.get_create_inputs(dataset_name, is_train=False, epochs=cfg.epoch)
            batch_x, batch_labels = create_inputs()


            # 3. Build the network
            output, pose_out = net.build_arch(batch_x, coord_add, is_train=False, num_classes=num_classes)
            tf.logging.debug(pose_out.get_shape())
            results, labels = net.batch_results_and_labels(output, batch_labels)

            # 4. Initialize variables and start the data threads (must come after building the network)
            coord, threads = cfg.init_variables_and_start_thread(sess)

            # 5. Restore the model checkpoint
            cfg.restore_model(sess, ckpt)

            # 6. Collect all predictions and labels into lists
            np_predicts_list = []
            np_labels_list = []
            for i in range(num_batches_test):
                np_results, np_labels = sess.run(
                    [results, labels])
                print(np_results)
                print(np_labels)
                np_predicts_list.extend(np_results)
                np_labels_list.extend(np_labels)

            np_predicts_list_str = str(np_predicts_list)
            np_labels_list_str = str(np_labels_list)
            with open('predicts_and_labels.txt', 'w') as f:
                f.write('predicts\r\n')
                f.write(np_predicts_list_str + '\r\n')
                f.write('labels\r\n')
                f.write(np_labels_list_str + '\r\n')

            cfg.stop_threads(coord, threads)
Example #5
def main(args):
    tf.set_random_seed(1234)
    coord_add = get_coord_add(dataset_name)
    dataset_size_train = get_dataset_size_train(dataset_name)
    dataset_size_test = get_dataset_size_test(dataset_name)
    num_classes = get_num_classes(dataset_name)
    create_inputs = get_create_inputs(dataset_name,
                                      is_train=False,
                                      epochs=cfg.epoch)

    with tf.Graph().as_default():
        num_batches_test = int(dataset_size_test / cfg.batch_size * 0.5)
        batch_x, batch_labels = create_inputs()
        output, pose_out = net.build_arch(batch_x,
                                          coord_add,
                                          is_train=False,
                                          num_classes=num_classes)
        tf.logging.debug(pose_out.get_shape())

        batch_acc = net.test_accuracy(output, batch_labels)
        saver = tf.train.Saver()
        session_config = tf.ConfigProto(
            device_count={'GPU': 0},
            gpu_options={
                'allow_growth': 1,
                # 'per_process_gpu_memory_fraction': 0.1,
                'visible_device_list': '0'
            },
            allow_soft_placement=True)
        with tf.Session(config=session_config) as sess:
            sess.run(tf.local_variables_initializer())
            sess.run(tf.global_variables_initializer())
            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(sess=sess, coord=coord)

            model_file = tf.train.latest_checkpoint(ckpt)
            saver.restore(sess, model_file)

            accuracy_sum = 0
            for i in range(num_batches_test):
                batch_acc_v = sess.run([batch_acc])
                accuracy_sum += batch_acc_v[0]
                print(accuracy_sum)

            ave_acc = accuracy_sum / num_batches_test
            print('the average accuracy is %f' % ave_acc)
Example #6
def main(_):
    coord_add = [[[8., 8.], [12., 8.], [16., 8.]],
                 [[8., 12.], [12., 12.], [16., 12.]],
                 [[8., 16.], [12., 16.], [16., 16.]]]

    with tf.Graph().as_default():
        batch_x, dt, datanum = utils.get_pred_data()
        num_batches_test = math.ceil(datanum / cfg.batch_size)
        print("total data:", datanum, ", run count:", num_batches_test,
              ",  dt:", dt)
        #print(batch_x)

        output = net.build_arch(batch_x, coord_add, is_train=False)
        predict = tf.argmax(output, axis=1)

        saver = tf.train.Saver()

        sess = tf.Session()
        tf.train.start_queue_runners(sess=sess)
        ckpt = tf.train.get_checkpoint_state(cfg.logdir)
        if ckpt and ckpt.model_checkpoint_path:
            saver.restore(sess, ckpt.model_checkpoint_path)
            print(ckpt.model_checkpoint_path)

        for i in range(num_batches_test):
            y_pred, output1 = sess.run([predict, output])

            if i % 10 == 0:
                print("step:", i, "/", num_batches_test
                      )  #,",",np.shape(y_pred),np.shape(output1))
            if i == 0:
                y_pred1 = y_pred
            else:
                y_pred1 = np.concatenate((y_pred1, y_pred), axis=0)

        print(np.shape(y_pred1), ",", datanum)
        print(y_pred1)
        trade_data.out_indi_data(cfg.test_dataset,
                                 y_pred1,
                                 datalen=cfg.image_size)
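
Calling np.concatenate inside the loop above copies the growing y_pred1 array on every iteration. An equivalent sketch, assuming the predict/output tensors and num_batches_test defined in Example #6, collects the per-batch predictions in a list and concatenates once:

preds = []
for i in range(num_batches_test):
    y_pred, _ = sess.run([predict, output])
    preds.append(y_pred)
y_pred1 = np.concatenate(preds, axis=0)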
Example #7
def main(args):
    """Get dataset hyperparameters."""
    assert len(args) == 2 and isinstance(args[1], str)
    dataset_name = args[1]
    logger.info('Using dataset: {}'.format(dataset_name))
    coord_add = get_coord_add(dataset_name)
    num_classes = get_num_classes(dataset_name)

    dataset_size = get_dataset_size_train(dataset_name)
    dataset_size_test = get_dataset_size_test(dataset_name)
    create_inputs = get_create_inputs(dataset_name,
                                      is_train=True,
                                      epochs=cfg.epoch)
    test_inputs = get_create_inputs(dataset_name,
                                    is_train=False,
                                    epochs=cfg.epoch)
    """Set reproduciable random seed"""
    tf.set_random_seed(1234)

    with tf.Graph().as_default(), tf.device('/cpu:0'):
        """Get global_step."""
        global_step = tf.get_variable('global_step', [],
                                      initializer=tf.constant_initializer(0),
                                      trainable=False)
        """Get batches per epoch."""
        num_batches_per_epoch = int(dataset_size / cfg.batch_size)
        num_batches_test = int(dataset_size_test / cfg.batch_size)
        """Set tf summaries."""
        summaries = []
        valid_sum = []
        """Use exponential decay leanring rate?"""
        # lrn_rate = tf.maximum(tf.train.exponential_decay(
        #     1e-2, global_step, num_batches_per_epoch, 0.8), 1e-5)
        # summaries.append(tf.summary.scalar('learning_rate', lrn_rate))
        opt = tf.train.AdamOptimizer(learning_rate=0.001)
        """Get batch from data queue."""
        train_q = create_inputs()
        test_q = test_inputs()
        use_train_data = tf.placeholder(dtype=tf.bool, shape=())
        batch_x, batch_labels = tf.cond(use_train_data,
                                        true_fn=lambda: train_q,
                                        false_fn=lambda: test_q)
        # batch_y = tf.one_hot(batch_labels, depth=10, axis=1, dtype=tf.float32)
        """Define the dataflow graph."""
        m_op = tf.placeholder(dtype=tf.float32, shape=())
        with tf.device('/gpu:0'):
            with slim.arg_scope([slim.variable], device='/cpu:0'):
                norm_batch_x = tf.contrib.layers.batch_norm(batch_x,
                                                            is_training=True)

                # Select network architecture.
                if cfg.network == 'conv':
                    import capsnet_em as net
                    output = net.build_arch(norm_batch_x,
                                            coord_add,
                                            is_train=True,
                                            num_classes=num_classes)
                elif cfg.network == 'fc':
                    import capsnet_fc as net
                    output = net.build_arch(norm_batch_x,
                                            is_train=True,
                                            num_classes=num_classes)
                else:
                    raise ValueError('Invalid network architecture: %s' %
                                     cfg.network)

                # Select loss function.
                if cfg.loss_fn == 'spread':
                    loss = net.spread_loss(output, batch_labels, m_op)
                elif cfg.loss_fn == 'margin':
                    loss = net.margin_loss(output, batch_labels)
                elif cfg.loss_fn == 'cross_en':
                    loss = net.cross_entropy_loss(output, batch_labels)
                else:
                    raise ValueError('Invalid loss function: %s' % cfg.loss_fn)

                acc = net.accuracy(output, batch_labels)
            """Compute gradient."""
            grad = opt.compute_gradients(loss)
            # See: https://stackoverflow.com/questions/40701712/how-to-check-nan-in-gradients-in-tensorflow-when-updating
            grad_check = [tf.check_numerics(g, message='Gradient NaN Found!')
                          for g, _ in grad if g is not None] + \
                         [tf.check_numerics(loss, message='Loss NaN Found')]
        """Add to summary."""
        summaries.append(tf.summary.scalar('loss', loss))
        summaries.append(tf.summary.scalar('acc', acc))
        valid_sum.append(tf.summary.scalar('val_loss', loss))
        valid_sum.append(tf.summary.scalar('val_acc', acc))
        """Apply graident."""
        with tf.control_dependencies(grad_check):
            update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
            with tf.control_dependencies(update_ops):
                train_op = opt.apply_gradients(grad, global_step=global_step)
        """Set Session settings."""
        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=cfg.gpu_frac)
        sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True,
                                                log_device_placement=False,
                                                gpu_options=gpu_options))
        sess.run(tf.local_variables_initializer())
        sess.run(tf.global_variables_initializer())
        """Set Saver."""
        var_to_save = [
            v for v in tf.global_variables() if 'Adam' not in v.name
        ]  # Don't save redundant Adam beta/gamma
        saver = tf.train.Saver(var_list=var_to_save, max_to_keep=cfg.epoch)
        """Display parameters"""
        total_p = np.sum([
            np.prod(v.get_shape().as_list()) for v in var_to_save
        ]).astype(np.int32)
        train_p = np.sum([
            np.prod(v.get_shape().as_list()) for v in tf.trainable_variables()
        ]).astype(np.int32)
        logger.info('Total Parameters: {}'.format(total_p))
        logger.info('Trainable Parameters: {}'.format(train_p))

        # read snapshot
        # latest = os.path.join(cfg.logdir, 'model.ckpt-4680')
        # saver.restore(sess, latest)
        """Set summary op."""
        summary_op = tf.summary.merge(summaries)
        valid_sum_op = tf.summary.merge(valid_sum)
        """Start coord & queue."""
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)
        """Set summary writer"""
        summary_writer = tf.summary.FileWriter(
            cfg.logdir, graph=None)  # graph = sess.graph, huge!
        """Main loop."""
        m_min = 0.2
        m_max = 0.9
        m = m_min
        for step in range(cfg.epoch * num_batches_per_epoch):
            if (step % num_batches_per_epoch) == 0:
                tic = time.time()
                progbar = tf.keras.utils.Progbar(
                    num_batches_per_epoch, verbose=(1 if cfg.progbar else 0))
            """"TF queue would pop batch until no file"""
            try:
                _, loss_value, acc_value = sess.run([train_op, loss, acc],
                                                    feed_dict={
                                                        use_train_data: True,
                                                        m_op: m
                                                    })
                progbar.update((step % num_batches_per_epoch),
                               values=[('loss', loss_value),
                                       ('acc', acc_value)])
            except KeyboardInterrupt:
                sess.close()
                sys.exit()
            except tf.errors.InvalidArgumentError:
                logger.warning(
                    '%d iteration contains NaN gradients. Discard.' % step)
                continue
            """Write to summary."""
            if step % 10 == 0:
                summary_str = sess.run(summary_op,
                                       feed_dict={
                                           use_train_data: True,
                                           m_op: m
                                       })
                summary_writer.add_summary(summary_str, step)
            """Epoch wise linear annealling."""
            if (step % num_batches_per_epoch) == 0:
                if step > 0:
                    m += (m_max - m_min) / (cfg.epoch * 0.6)
                    if m > m_max:
                        m = m_max
                """Save model periodically"""
                ckpt_path = os.path.join(
                    cfg.logdir, 'model-{0:.4f}.ckpt'.format(loss_value))
                saver.save(sess, ckpt_path, global_step=step)

            # Add a new progress bar
            if ((step + 1) % num_batches_per_epoch) == 0:
                toc = time.time()
                val_loss_value, val_acc_value = (0.0, 0.0)
                for i in range(num_batches_test):
                    val_batch = sess.run([loss, acc],
                                         feed_dict={
                                             use_train_data: False,
                                             m_op: m
                                         })
                    val_loss_batch, val_acc_batch = val_batch
                    val_loss_value += val_loss_batch / num_batches_test
                    val_acc_value += val_acc_batch / num_batches_test
                valid_sum_str = sess.run(valid_sum_op,
                                         feed_dict={
                                             use_train_data: False,
                                             m_op: m
                                         })
                summary_writer.add_summary(valid_sum_str, step)
                print('\nEpoch %d/%d in ' %
                      (step // num_batches_per_epoch + 1, cfg.epoch) +
                      '%.1fs' % (toc - tic) + ' - loss: %f' % val_loss_value +
                      ' - acc: %f' % val_acc_value)
        """Join threads"""
        coord.join(threads)
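
For reference, the spread loss that net.spread_loss is expected to compute in Example #7 (following the EM-routing capsules formulation) penalizes every wrong-class activation that comes within margin m of the target activation. A minimal NumPy sketch, assuming a is a 1-D array of class activations and t is the target class index:

import numpy as np

def spread_loss_np(a, t, m):
    # Sum over wrong classes i of max(0, m - (a_t - a_i))^2.
    a_t = a[t]
    others = np.delete(a, t)
    return float(np.sum(np.maximum(0.0, m - (a_t - others)) ** 2))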
Example #8
def main(args):
    """Get dataset hyperparameters."""
    assert len(args) == 2 and isinstance(args[1], str)
    dataset_name = args[1]
    coord_add = get_coord_add(dataset_name)
    dataset_size_train = get_dataset_size_train(dataset_name)
    dataset_size_test = get_dataset_size_test(dataset_name)
    num_classes = get_num_classes(dataset_name)
    create_inputs = get_create_inputs(dataset_name,
                                      is_train=False,
                                      epochs=cfg.epoch)
    """Set reproduciable random seed"""
    tf.set_random_seed(1234)

    with tf.Graph().as_default():
        num_batches_per_epoch_train = int(dataset_size_train / cfg.batch_size)
        num_batches_test = int(dataset_size_test / cfg.batch_size * 0.1)

        batch_x, batch_labels = create_inputs()
        batch_x = slim.batch_norm(batch_x,
                                  center=False,
                                  is_training=False,
                                  trainable=False)
        output, _ = net.build_arch(batch_x,
                                   coord_add,
                                   is_train=False,
                                   num_classes=num_classes)
        batch_acc = net.test_accuracy(output, batch_labels)
        saver = tf.train.Saver()

        step = 0

        summaries = []
        summaries.append(tf.summary.scalar('accuracy', batch_acc))
        summary_op = tf.summary.merge(summaries)

        with tf.Session(
                config=tf.ConfigProto(allow_soft_placement=True,
                                      log_device_placement=False)) as sess:
            sess.run(tf.local_variables_initializer())
            sess.run(tf.global_variables_initializer())

            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(sess=sess, coord=coord)
            summary_writer = tf.summary.FileWriter(
                cfg.test_logdir, graph=sess.graph)  # graph=sess.graph, huge!

            files = os.listdir(cfg.logdir)
            for epoch in range(1, cfg.epoch):
                # requires a regex to adapt the loss value in the file name here
                ckpt_re = ".ckpt-%d" % (num_batches_per_epoch_train * epoch)
                for __file in files:
                    if __file.endswith(ckpt_re + ".index"):
                        ckpt = os.path.join(cfg.logdir, __file[:-6])
                # ckpt = os.path.join(cfg.logdir, "model.ckpt-%d" % (num_batches_per_epoch_train * epoch))
                saver.restore(sess, ckpt)

                accuracy_sum = 0
                for i in range(num_batches_test):
                    batch_acc_v, summary_str = sess.run(
                        [batch_acc, summary_op])
                    print('%d batches are tested.' % step)
                    summary_writer.add_summary(summary_str, step)
                    print('batch accuracy: %f' % batch_acc_v)

                    accuracy_sum += batch_acc_v

                    step += 1

                ave_acc = accuracy_sum / num_batches_test
                print('the average accuracy is %f' % ave_acc)

            coord.join(threads)
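
The inner loop in Example #8 recovers the checkpoint prefix by matching the ".ckpt-<step>.index" suffix and slicing off ".index". A small standalone sketch of the same lookup, assuming the checkpoint layout this example uses:

import os

def find_ckpt_for_epoch(logdir, steps_per_epoch, epoch):
    # Return the checkpoint prefix whose global step matches the epoch, or None.
    suffix = '.ckpt-%d.index' % (steps_per_epoch * epoch)
    for fname in os.listdir(logdir):
        if fname.endswith(suffix):
            return os.path.join(logdir, fname[:-len('.index')])
    return None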
Example #9
def main(_):
    coord_add = [[[8., 8.], [12., 8.], [16., 8.]],
                 [[8., 12.], [12., 12.], [16., 12.]],
                 [[8., 16.], [12., 16.], [16., 16.]]]

    coord_add = np.array(coord_add, dtype=np.float32) / 28.
    """Set reproduciable random seed"""
    tf.set_random_seed(1234)

    with tf.Graph().as_default(), tf.device('/cpu:0'):
        global_step = tf.get_variable('global_step', [],
                                      initializer=tf.constant_initializer(0),
                                      trainable=False)

        batch_x, batch_labels = utils.get_shuffle_tfrecord(is_training=True)
        datanum = 272965
        num_batches_per_epoch = int(datanum / cfg.batch_size)
        print(datanum, num_batches_per_epoch)
        # batch_y = tf.one_hot(batch_labels, depth=10, axis=1, dtype=tf.float32)
        """Use exponential decay leanring rate?"""
        lrn_rate = tf.maximum(
            tf.train.exponential_decay(1e-3, global_step,
                                       num_batches_per_epoch, 0.8), 1e-5)
        tf.summary.scalar('learning_rate', lrn_rate)
        opt = tf.train.AdamOptimizer(learning_rate=lrn_rate)

        m_op = tf.placeholder(dtype=tf.float32, shape=())
        with tf.device('/gpu:0'):
            with slim.arg_scope([slim.variable], device='/cpu:0'):
                output = net.build_arch(batch_x, coord_add, is_train=True)
                # loss = net.cross_ent_loss(output, batch_labels)
                loss = net.spread_loss(output, batch_labels, m_op)
                accuracy = net.test_accuracy(output, batch_labels)
                tf.summary.scalar("spread_loss", loss)
                tf.summary.scalar("accuracy", accuracy)
            """Compute gradient."""
            grad = opt.compute_gradients(loss)
            # See: https://stackoverflow.com/questions/40701712/how-to-check-nan-in-gradients-in-tensorflow-when-updating
            grad_check = [
                tf.check_numerics(g, message='Gradient NaN Found!')
                for g, _ in grad if g is not None
            ] + [tf.check_numerics(loss, message='Loss NaN Found')]
        """Apply graident."""
        with tf.control_dependencies(grad_check):
            update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
            with tf.control_dependencies(update_ops):
                train_op = opt.apply_gradients(grad, global_step=global_step)

            # Print trainable variable parameter statistics to stdout.
            # By default, statistics are associated with each graph node.
        param_stats = tf.contrib.tfprof.model_analyzer.print_model_analysis(
            tf.get_default_graph(),
            tfprof_options=tf.contrib.tfprof.model_analyzer.
            TRAINABLE_VARS_PARAMS_STAT_OPTIONS)

        # param_stats is tensorflow.tfprof.TFGraphNodeProto proto.
        # Let's print the root below.
        print('total_params: %d\n' % param_stats.total_parameters)

        sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True,
                                                log_device_placement=False))
        sess.run(tf.global_variables_initializer())

        saver = tf.train.Saver(tf.global_variables(),
                               max_to_keep=30)  #cfg.epoch)

        # restore from the check point
        ckpt = tf.train.get_checkpoint_state(cfg.logdir)
        if ckpt and ckpt.model_checkpoint_path:
            saver.restore(sess, ckpt.model_checkpoint_path)
            initial_step = int(ckpt.model_checkpoint_path.split('-')[1])
            print(ckpt, ckpt.model_checkpoint_path, initial_step)
            m = 0.9
        else:
            initial_step = 0
            m = 0.2

        # read snapshot
        # latest = os.path.join(cfg.logdir, 'model.ckpt-4680')
        # saver.restore(sess, latest)
        """Set summary op."""
        summary_op = tf.summary.merge_all()
        """Start coord & queue."""
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)
        """Set summary writer"""
        # if not os.path.exists(cfg.logdir):
        #     os.makedirs(cfg.logdir)
        summary_writer = tf.summary.FileWriter(
            cfg.logdir, graph=sess.graph)  # graph = sess.graph, huge!

        cal_num = 0

        for step in range(cfg.epoch):
            for i in range(num_batches_per_epoch):
                tic = time.time()
                """"TF queue would pop batch until no file"""
                try:
                    _, loss_value, accuracy_val = sess.run(
                        [train_op, loss, accuracy], feed_dict={m_op: m})
                    print(
                        '%d/%d, %d/%d iteration is finished in ' %
                        (step, cfg.epoch, i, num_batches_per_epoch) +
                        '%f second' % (time.time() - tic) + ',m:', m,
                        ',loss: %f' % loss_value, ",accuracy:", accuracy_val)

                    cal_num += 1
                except tf.errors.InvalidArgumentError:
                    print('%d iteration contains NaN gradients. Discard.' %
                          cal_num)
                    continue
                else:
                    """Write to summary."""
                    if i % 30 == 0:
                        summary_str = sess.run(summary_op, feed_dict={m_op: m})
                        summary_writer.add_summary(summary_str,
                                                   initial_step + cal_num)

                    if cal_num % cfg.saveperiod == 0:
                        ckpt_path = os.path.join(cfg.logdir, 'model.ckpt')
                        saver.save(sess,
                                   ckpt_path,
                                   global_step=initial_step + cal_num)

                    if m < 0.9:
                        m += round((0.9 - 0.2) / num_batches_per_epoch, 5)
                    else:
                        m = 0.9

            ckpt_path = os.path.join(cfg.logdir, 'model.ckpt')
            saver.save(sess, ckpt_path, global_step=initial_step + cal_num)
        """Join threads"""
        coord.join(threads)
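
Unlike Example #7, which raises the spread-loss margin once per epoch, Example #9 increments m on every successful iteration, so it ramps from 0.2 to 0.9 over roughly one epoch and then stays at 0.9. A minimal sketch of that schedule, assuming num_batches_per_epoch as computed above:

def margin_at(iteration, num_batches_per_epoch, m_min=0.2, m_max=0.9):
    # Per-iteration linear warm-up used in Example #9; clamps at m_max.
    step_size = round((m_max - m_min) / num_batches_per_epoch, 5)
    return min(m_max, m_min + iteration * step_size)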
Example #10
def main(args):
    """Get dataset hyperparameters."""
    assert len(args) == 3 and isinstance(args[1], str) and isinstance(
        args[2], str)
    dataset_name = args[1]
    model_name = args[2]
    """Set reproduciable random seed"""
    tf.set_random_seed(1234)

    coord_add = get_coord_add(dataset_name)
    dataset_size_train = get_dataset_size_train(dataset_name)
    dataset_size_test = get_dataset_size_test(dataset_name)
    num_classes = get_num_classes(dataset_name)
    create_inputs = get_create_inputs(dataset_name,
                                      is_train=False,
                                      epochs=cfg.epoch)

    with tf.Graph().as_default():
        num_batches_per_epoch_train = int(dataset_size_train / cfg.batch_size)
        num_batches_test = 2  # int(dataset_size_test / cfg.batch_size * 0.1)

        batch_x, batch_labels = create_inputs()
        batch_squash = tf.divide(batch_x, 255.)
        batch_x_norm = slim.batch_norm(batch_x,
                                       center=False,
                                       is_training=False,
                                       trainable=False)
        output, pose_out = net.build_arch(batch_x_norm,
                                          coord_add,
                                          is_train=False,
                                          num_classes=num_classes)
        tf.logging.debug(pose_out.get_shape())

        batch_acc = net.test_accuracy(output, batch_labels)
        m_op = tf.constant(0.9)
        loss, spread_loss, mse, recon_img_squash = net.spread_loss(
            output, pose_out, batch_squash, batch_labels, m_op)
        tf.summary.scalar('spread_loss', spread_loss)
        tf.summary.scalar('reconstruction_loss', mse)
        tf.summary.scalar('all_loss', loss)
        data_size = int(batch_x.get_shape()[1])
        recon_img = tf.multiply(
            tf.reshape(recon_img_squash,
                       shape=[cfg.batch_size, data_size, data_size, 1]), 255.)
        orig_img = tf.reshape(batch_x,
                              shape=[cfg.batch_size, data_size, data_size, 1])
        tf.summary.image('orig_image', orig_img)
        tf.summary.image('recon_image', recon_img)
        saver = tf.train.Saver()

        step = 0

        tf.summary.scalar('accuracy', batch_acc)
        summary_op = tf.summary.merge_all()

        with tf.Session(
                config=tf.ConfigProto(allow_soft_placement=True,
                                      log_device_placement=False)) as sess:
            sess.run(tf.local_variables_initializer())
            sess.run(tf.global_variables_initializer())

            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(sess=sess, coord=coord)
            if not os.path.exists(cfg.test_logdir +
                                  '/{}/{}/'.format(model_name, dataset_name)):
                os.makedirs(cfg.test_logdir +
                            '/{}/{}/'.format(model_name, dataset_name))
            summary_writer = tf.summary.FileWriter(
                cfg.test_logdir + '/{}/{}/'.format(model_name, dataset_name),
                graph=sess.graph)  # graph=sess.graph, huge!

            files = os.listdir(cfg.logdir +
                               '/{}/{}/'.format(model_name, dataset_name))
            for epoch in range(45, 46):
                # requires a regex to adapt the loss value in the file name here
                ckpt_re = ".ckpt-%d" % (num_batches_per_epoch_train * epoch)
                for __file in files:
                    if __file.endswith(ckpt_re + ".index"):
                        ckpt = os.path.join(
                            cfg.logdir +
                            '/{}/{}/'.format(model_name, dataset_name),
                            __file[:-6])
                #ckpt = os.path.join(cfg.logdir, "model.ckpt-%d" % (num_batches_per_epoch_train * epoch))
                ############ Comment out the line below
                #ckpt = os.path.join(cfg.logdir, "caps/mnist/model-0.3764.ckpt-1718")
                saver.restore(sess, ckpt)

                accuracy_sum = 0
                for i in range(num_batches_test):
                    batch_acc_v, summary_str, orig_image, recon_image = sess.run(
                        [batch_acc, summary_op, orig_img, recon_img])
                    print('%d batches are tested.' % step)
                    summary_writer.add_summary(summary_str, step)

                    accuracy_sum += batch_acc_v

                    step += 1
                    # display original/reconstructed images in matplotlib
                    plot_imgs(orig_image, i, 'ori')
                    plot_imgs(recon_image, i, 'rec')

                ave_acc = accuracy_sum / num_batches_test
                print('the average accuracy is %f' % ave_acc)
Example #11
def main(args):
    """Get dataset hyperparameters."""
    assert len(args) == 2 and isinstance(args[1], str)
    dataset_name = args[1]
    logger.info('Using dataset: {}'.format(dataset_name))
    coord_add = get_coord_add(dataset_name)
    dataset_size = get_dataset_size_train(dataset_name)
    num_classes = get_num_classes(dataset_name)
    create_inputs = get_create_inputs(dataset_name, is_train=True, epochs=cfg.epoch)

    """Set reproduciable random seed"""
    tf.set_random_seed(1234)

    with tf.Graph().as_default(), tf.device('/cpu:0'):
        """Get global_step."""
        global_step = tf.get_variable(
            'global_step', [], initializer=tf.constant_initializer(0), trainable=False)

        """Get batches per epoch."""
        num_batches_per_epoch = int(dataset_size / cfg.batch_size)

        """Use exponential decay leanring rate?"""
        lrn_rate = tf.maximum(tf.train.exponential_decay(
            1e-3, global_step, num_batches_per_epoch, 0.8), 1e-5)
        tf.summary.scalar('learning_rate', lrn_rate)
        opt = tf.train.AdamOptimizer()  # lrn_rate

        """Get batch from data queue."""
        batch_x, batch_labels = create_inputs()
        # batch_y = tf.one_hot(batch_labels, depth=10, axis=1, dtype=tf.float32)

        """Define the dataflow graph."""
        m_op = tf.placeholder(dtype=tf.float32, shape=())
        with tf.device('/gpu:0'):
            with slim.arg_scope([slim.variable], device='/cpu:0'):
                batch_squash = tf.divide(batch_x, 255.)
                batch_x = slim.batch_norm(batch_x, center=False, is_training=True, trainable=True)
                output, pose_out = net.build_arch(batch_x, coord_add, is_train=True,
                                                  num_classes=num_classes)
                # loss = net.cross_ent_loss(output, batch_labels)
                loss, spread_loss, mse = net.spread_loss(
                    output, pose_out, batch_squash, batch_labels, m_op)
                tf.summary.scalar('spread_loss', spread_loss)
                tf.summary.scalar('reconstruction_loss', mse)
                tf.summary.scalar('all_loss', loss)

            """Compute gradient."""
            grad = opt.compute_gradients(loss)
            # See: https://stackoverflow.com/questions/40701712/how-to-check-nan-in-gradients-in-tensorflow-when-updating
            grad_check = [tf.check_numerics(g, message='Gradient NaN Found!')
                          for g, _ in grad if g is not None] + [tf.check_numerics(loss, message='Loss NaN Found')]

        """Apply graident."""
        with tf.control_dependencies(grad_check):
            update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
            with tf.control_dependencies(update_ops):
                train_op = opt.apply_gradients(grad, global_step=global_step)

        """Set Session settings."""
        sess = tf.Session(config=tf.ConfigProto(
            allow_soft_placement=True, log_device_placement=False))
        sess.run(tf.local_variables_initializer())
        sess.run(tf.global_variables_initializer())

        """Set Saver."""
        var_to_save = [v for v in tf.global_variables(
        ) if 'Adam' not in v.name]  # Don't save redundant Adam beta/gamma
        saver = tf.train.Saver(var_list=var_to_save, max_to_keep=cfg.epoch)

        """Display parameters"""
        total_p = np.sum([np.prod(v.get_shape().as_list()) for v in var_to_save]).astype(np.int32)
        train_p = np.sum([np.prod(v.get_shape().as_list())
                          for v in tf.trainable_variables()]).astype(np.int32)
        logger.info('Total Parameters: {}'.format(total_p))
        logger.info('Trainable Parameters: {}'.format(train_p))

        # read snapshot
        # latest = os.path.join(cfg.logdir, 'model.ckpt-4680')
        # saver.restore(sess, latest)
        """Set summary op."""
        summary_op = tf.summary.merge_all()

        """Start coord & queue."""
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)

        """Set summary writer"""
        if not os.path.exists(cfg.logdir + '/train_log/'):
            os.makedirs(cfg.logdir + '/train_log/')
        summary_writer = tf.summary.FileWriter(
            cfg.logdir + '/train_log/', graph=sess.graph)  # graph = sess.graph, huge!

        """Main loop."""
        m_min = 0.2
        m_max = 0.9
        m = m_min
        for step in range(cfg.epoch * num_batches_per_epoch + 1):
            tic = time.time()
            """"TF queue would pop batch until no file"""
            try:
                _, loss_value, summary_str = sess.run(
                    [train_op, loss, summary_op], feed_dict={m_op: m})
                logger.info('%d iteration finishes in ' % step + '%f second' %
                            (time.time() - tic) + ' loss=%f' % loss_value)
            except KeyboardInterrupt:
                sess.close()
                sys.exit()
            except tf.errors.InvalidArgumentError:
                logger.warning('%d iteration contains NaN gradients. Discard.' % step)
                continue
            else:
                """Write to summary."""
                if step % 5 == 0:
                    summary_writer.add_summary(summary_str, step)

                """Epoch wise linear annealling."""
                if (step % num_batches_per_epoch) == 0:
                    if step > 0:
                        m += (m_max - m_min) / (cfg.epoch * cfg.m_schedule)
                        if m > m_max:
                            m = m_max

                    """Save model periodically"""
                    ckpt_path = os.path.join(
                        cfg.logdir, 'model-{}.ckpt'.format(round(loss_value, 4)))
                    saver.save(sess, ckpt_path, global_step=step)

        """Join threads"""
        coord.join(threads)
Example #12
def main(args):
    """Get dataset hyperparameters."""
    assert len(args) == 3 and isinstance(args[1], str) and isinstance(args[2], str)
    dataset_name = args[1]
    model_name = args[2]

    """Set reproduciable random seed"""
    tf.set_random_seed(1234)

    coord_add = get_coord_add(dataset_name)
    dataset_size_train = get_dataset_size_train(dataset_name)
    dataset_size_test = get_dataset_size_test(dataset_name)
    num_classes = get_num_classes(dataset_name)
    create_inputs = get_create_inputs(
        dataset_name, is_train=False, epochs=cfg.epoch)

    with tf.Graph().as_default():
        num_batches_per_epoch_train = int(dataset_size_train / cfg.batch_size)
        num_batches_test = 2  # int(dataset_size_test / cfg.batch_size * 0.1)

        batch_x, batch_labels = create_inputs()
        batch_squash = tf.divide(batch_x, 255.)
        batch_x_norm = slim.batch_norm(batch_x, center=False, is_training=False, trainable=False)
        output, pose_out = net.build_arch(batch_x_norm, coord_add,
                                          is_train=False, num_classes=num_classes)
        tf.logging.debug(pose_out.get_shape())

        batch_acc = net.test_accuracy(output, batch_labels)
        m_op = tf.constant(0.9)
        loss, spread_loss, mse, recon_img_squash = net.spread_loss(
            output, pose_out, batch_squash, batch_labels, m_op)
        tf.summary.scalar('spread_loss', spread_loss)
        tf.summary.scalar('reconstruction_loss', mse)
        tf.summary.scalar('all_loss', loss)
        data_size = int(batch_x.get_shape()[1])
        recon_img = tf.multiply(tf.reshape(recon_img_squash, shape=[
                                cfg.batch_size, data_size, data_size, 1]), 255.)
        orig_img = tf.reshape(batch_x, shape=[
            cfg.batch_size, data_size, data_size, 1])
        tf.summary.image('orig_image', orig_img)
        tf.summary.image('recon_image', recon_img)
        saver = tf.train.Saver()

        step = 0

        tf.summary.scalar('accuracy', batch_acc)
        summary_op = tf.summary.merge_all()

        with tf.Session(config=tf.ConfigProto(
                allow_soft_placement=True, log_device_placement=False)) as sess:
            sess.run(tf.local_variables_initializer())
            sess.run(tf.global_variables_initializer())

            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(sess=sess, coord=coord)
            if not os.path.exists(cfg.test_logdir + '/{}/{}/'.format(model_name, dataset_name)):
                os.makedirs(cfg.test_logdir + '/{}/{}/'.format(model_name, dataset_name))
            summary_writer = tf.summary.FileWriter(
                cfg.test_logdir + '/{}/{}/'.format(model_name, dataset_name), graph=sess.graph)  # graph=sess.graph, huge!

            files = os.listdir(cfg.logdir + '/{}/{}/'.format(model_name, dataset_name))
            for epoch in range(14, 15):
                # requires a regex to adapt the loss value in the file name here
                ckpt_re = ".ckpt-%d" % (num_batches_per_epoch_train * epoch)
                for __file in files:
                    if __file.endswith(ckpt_re + ".index"):
                        ckpt = os.path.join(
                            cfg.logdir + '/{}/{}/'.format(model_name, dataset_name), __file[:-6])
                # ckpt = os.path.join(cfg.logdir, "model.ckpt-%d" % (num_batches_per_epoch_train * epoch))
                saver.restore(sess, ckpt)

                accuracy_sum = 0
                for i in range(num_batches_test):
                    batch_acc_v, summary_str, orig_image, recon_image = sess.run(
                        [batch_acc, summary_op, orig_img, recon_img])
                    print('%d batches are tested.' % step)
                    summary_writer.add_summary(summary_str, step)

                    accuracy_sum += batch_acc_v

                    step += 1
                    # display original/reconstructed images in matplotlib
                    plot_imgs(orig_image, i, 'ori')
                    plot_imgs(recon_image, i, 'rec')

                ave_acc = accuracy_sum / num_batches_test
                print('the average accuracy is %f' % ave_acc)
Example #13
def test_model(n_tests, x_test, y_test, ang_min, ang_max):

    # Placeholders for input data and the targets
    x_input = tf.placeholder(tf.float32, (None, *IMG_DIM), name='Input')
    y_target = tf.placeholder(tf.int32, [None, ], name='Target')

    coord_add = get_coord_add(dataset_name)
    sample_batch = tf.identity(x_input)
    batch_labels = tf.identity(y_target)
    batch_x = slim.batch_norm(sample_batch, center=False, is_training=False, trainable=False)
    output, pose_out = net.build_arch(batch_x, coord_add, is_train=True,
                                      num_classes=NCLASSES)
    batch_acc_sum = net.test_accuracy_sum(output, batch_labels)
    batch_pred = net.test_predict(output, batch_labels)

    saver = tf.train.Saver()

    sess = tf.Session()
    sess.run(tf.local_variables_initializer())
    sess.run(tf.global_variables_initializer())

    model_path = cfg.logdir + '/caps/mnist'
    saver.restore(sess, tf.train.latest_checkpoint(model_path))

    nImg = x_test.shape[0]
    batch_size = int(cfg.batch_size)
    nBatches = int(nImg / batch_size)

    accuracies = []

    mean_acc = 0
    for n in range(n_tests):
        print('\nTest %d/%d' % (n + 1, n_tests))

        print('-' * 30 + 'Begin: testing' + '-' * 30)
        acc = 0
        k = 0
        xi = np.empty([1, sy, sx, 1])
        x_init = np.empty([1, sy, sx, 1])

        for i in range(nBatches):
            x = x_test[i * batch_size: (i + 1) * batch_size, :, :, :]
            y = y_test[i * batch_size: (i + 1) * batch_size]
            xr = np.empty(x.shape)
            for j in range(x.shape[0]):
                xr[j, :, :, :] = utils.create_inputs_mnist_rot_excl_range(x[j, :, :, :], y[j],
                                                                          ang_min, ang_max)

                k += 1

            batch_acc_v = sess.run(batch_acc_sum, feed_dict={x_input: xr, y_target: y})
            acc += batch_acc_v

            # Just checking what images we are feeding to the network
            if i == 0 and n == 0:
                for j in range(batch_size):
                    if j == 0:
                        xi[0, :, :, :] = xr[0, :, :, :]
                        x_init[0, :, :, :] = x[0, :, :, :]
                    else:
                        xi = np.concatenate([xi, np.expand_dims(xr[j, :, :, :], 0)])
                        x_init = np.concatenate([x_init, np.expand_dims(x[j, :, :, :],0)])
                    # xr = np.concatenate([xr, x_recon])
                    if j == (batch_size - 1):
                        images = utils.combine_images(xi)
                        image = images
                        Image.fromarray(image.astype(np.uint8)).save(cfg.logdir + "/batch_rot.png")

                        images = utils.combine_images(x_init)
                        image = images
                        Image.fromarray(image.astype(np.uint8)).save(cfg.logdir + "/batch_init.png")

            sys.stdout.write(ERASE_LINE)
            sys.stdout.write("\r \r {0}%".format(int(100 * k / nImg)))
            sys.stdout.flush()
            time.sleep(0.001)


        x = x_test[k:, :, :, :]
        y = y_test[k:]

        # duplicate the last sample to adjust the batch size
        n_left = nImg-k
        n_tile = BATCH_SIZE - n_left

        x_tile = np.tile(np.expand_dims(x_test[nImg-1, :, :, :],0), [n_tile, 1, 1, 1])
        y_tile = np.tile(y_test[nImg-1], n_tile)

        x = np.concatenate( (x, x_tile) )
        y = np.concatenate((y, y_tile))

        xr = np.empty(x.shape)
        for j in range(x.shape[0]):
            xr[j, :, :, :] = utils.create_inputs_mnist_rot_excl_range(x[j, :, :, :], y[j],
                                                                      ang_min, ang_max)

        batch_pred_v = sess.run(batch_pred, feed_dict={x_input: xr, y_target: y})
        left_pred = np.asarray(batch_pred_v[:n_left], dtype=np.float32)

        acc += np.sum(left_pred)

        k += n_left

        sys.stdout.write(ERASE_LINE)
        sys.stdout.write("\r \r {0}%".format(str(100)))
        sys.stdout.flush()
        time.sleep(0.001)

        print('\n')
        print('-' * 30 + 'End: testing' + '-' * 30)

        acc_aver = acc / float(y_test.shape[0])

        print('Number of images: {}, Accuracy: {}'.format(k, acc_aver))

        mean_acc += acc_aver
        accuracies.append(acc_aver)

    mean_acc = mean_acc / float(n_tests)

    var_acc = 0
    accuracies = np.array(accuracies)
    for i in range(accuracies.shape[0]):
        var_acc += (accuracies[i] - mean_acc) * (accuracies[i] - mean_acc)

    var_acc /= float(n_tests)

    print('\nTesting is finished!')
    print('Testing options:\nAngles range from {} to {}\tIs only 3 and 4: {}'.format(ang_min, ang_max, is_only_3_and_4))
    print('\nMean testing accuracy for {} runs: {}'.format(n_tests, mean_acc))
    print('Variance of testing accuracy for {} runs: {}'.format(n_tests, var_acc))
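
The tail of the test loop in Example #13 handles the final partial batch by tiling the last sample until the batch is full and then counting only the n_left real predictions. A self-contained sketch of that padding trick, assuming 4-D image batches as in this example:

import numpy as np

def pad_batch_by_tiling(x, y, batch_size):
    # Fill a short batch by repeating the last sample; callers must ignore
    # predictions beyond the first n_left entries.
    n_left = x.shape[0]
    n_tile = batch_size - n_left
    x_pad = np.concatenate((x, np.tile(x[-1:], [n_tile, 1, 1, 1])))
    y_pad = np.concatenate((y, np.tile(y[-1], n_tile)))
    return x_pad, y_pad, n_left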
Example #14
def main(args):
    assert len(args) == 2 and isinstance(args[1], str)

    # Get dataset name
    dataset_name = args[1]  # mnist
    logger.info(f'Using dataset: {dataset_name}')

    # Set reproducible random seed
    tf.set_random_seed(1234)

    coord_add = get_coord_add(dataset_name)  # (3, 3, 2)
    dataset_size = get_dataset_size_train(dataset_name)  # 55,000
    num_classes = get_num_classes(dataset_name)  # 10
    create_inputs = get_create_inputs(dataset_name,
                                      is_train=True,
                                      epochs=cfg.epoch)

    with tf.Graph().as_default(), tf.device('/cpu:0'):
        # Get global_step
        global_step = tf.get_variable('global_step', [],
                                      initializer=tf.constant_initializer(0),
                                      trainable=False)

        num_batches_per_epoch = dataset_size // cfg.batch_size  # 1100

        opt = tf.train.AdamOptimizer()

        # Get batch from data queue
        batch_x, batch_labels = create_inputs()  # (50, 28, 28, 1), (50,)

        m_op = tf.placeholder(dtype=tf.float32, shape=())
        with tf.device('/gpu:0'):
            with slim.arg_scope([slim.variable], device='/cpu:0'):
                batch_squash = tf.divide(batch_x, 255.)
                batch_x = slim.batch_norm(batch_x,
                                          center=False,
                                          is_training=True,
                                          trainable=True)

                output, pose_out = net.build_arch(
                    batch_x, coord_add, is_train=True,
                    num_classes=num_classes)  # (50, 10), (50, 10, 18)
                tf.logging.debug(pose_out.get_shape())

                # Define loss = spread_loss + reconstruction loss
                loss, spread_loss, mse, _ = net.spread_loss(
                    output, pose_out, batch_squash, batch_labels, m_op)

                acc = net.test_accuracy(output, batch_labels)
                tf.summary.scalar('spread_loss', spread_loss)
                tf.summary.scalar('reconstruction_loss', mse)
                tf.summary.scalar('all_loss', loss)
                tf.summary.scalar('train_acc', acc)

            grad = opt.compute_gradients(loss)
            # See: https://stackoverflow.com/questions/40701712/how-to-check-nan-in-gradients-in-tensorflow-when-updating
            grad_check = [
                tf.check_numerics(g, message='Gradient NaN Found!')
                for g, _ in grad if g is not None
            ] + [tf.check_numerics(loss, message='Loss NaN Found')]

        # Apply gradient
        with tf.control_dependencies(grad_check):
            update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
            with tf.control_dependencies(update_ops):
                train_op = opt.apply_gradients(grad, global_step=global_step)

        # Set Session settings
        sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True,
                                                log_device_placement=False))
        sess.run(tf.local_variables_initializer())
        sess.run(tf.global_variables_initializer())

        # Set Saver
        var_to_save = [
            v for v in tf.global_variables() if 'Adam' not in v.name
        ]  # Don't save redundant Adam beta/gamma
        saver = tf.train.Saver(var_list=var_to_save, max_to_keep=cfg.epoch)

        # Display parameters
        total_p = np.sum([
            np.prod(v.get_shape().as_list()) for v in var_to_save
        ]).astype(np.int32)
        train_p = np.sum([
            np.prod(v.get_shape().as_list()) for v in tf.trainable_variables()
        ]).astype(np.int32)
        logger.info('Total Parameters: {}'.format(total_p))
        logger.info('Trainable Parameters: {}'.format(train_p))

        # read snapshot
        # latest = os.path.join(cfg.logdir, 'model.ckpt-4680')
        # saver.restore(sess, latest)

        # Set summary op
        summary_op = tf.summary.merge_all()

        # Start coord & queue
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)

        # Set summary writer
        if not os.path.exists(cfg.logdir +
                              '/caps/{}/train_log/'.format(dataset_name)):
            os.makedirs(cfg.logdir +
                        '/caps/{}/train_log/'.format(dataset_name))
        summary_writer = tf.summary.FileWriter(
            cfg.logdir + f"/caps/{dataset_name}/train_log/",
            graph=sess.graph)  # graph = sess.graph, huge!

        # Main loop
        m_min = 0.2
        m_max = 0.9
        m = m_min
        for step in range(cfg.epoch * num_batches_per_epoch + 1):
            tic = time.time()
            # TF queue would pop batch until no file
            try:
                _, loss_value, summary_str = sess.run(
                    [train_op, loss, summary_op], feed_dict={m_op: m})
                logger.info('%d iteration finishes in ' % step + '%f second' %
                            (time.time() - tic) + ' loss=%f' % loss_value)
            except KeyboardInterrupt:
                sess.close()
                sys.exit()
            except tf.errors.InvalidArgumentError:
                logger.warning(
                    '%d iteration contains NaN gradients. Discard.' % step)
                continue
            else:
                if step % 5 == 0:
                    summary_writer.add_summary(summary_str, step)
                """Epoch wise linear annealling."""
                if (step % num_batches_per_epoch) == 0:
                    if step > 0:
                        m += (m_max - m_min) / (cfg.epoch * cfg.m_schedule)
                        if m > m_max:
                            m = m_max

                    # Save model periodically
                    ckpt_path = os.path.join(
                        cfg.logdir + '/caps/{}/'.format(dataset_name),
                        'model-{:.4f}.ckpt'.format(loss_value))
                    saver.save(sess, ckpt_path, global_step=step)
Example #15
def main(args):
    assert len(args) == 2 and isinstance(args[1], str)

    # Get dataset name
    dataset_name = args[1]   # mnist
    logger.info(f'Using dataset: {dataset_name}')

    # Set reproducible random seed
    tf.set_random_seed(1234)

    coord_add = get_coord_add(dataset_name)             # (3, 3, 2)
    dataset_size = get_dataset_size_train(dataset_name) # 55,000
    num_classes = get_num_classes(dataset_name)         # 10
    create_inputs = get_create_inputs(dataset_name, is_train=True, epochs=cfg.epoch)

    with tf.Graph().as_default(), tf.device('/cpu:0'):
        # Get global_step
        global_step = tf.get_variable('global_step', [], initializer=tf.constant_initializer(0), trainable=False)

        num_batches_per_epoch = dataset_size // cfg.batch_size # 1100

        opt = tf.train.AdamOptimizer()

        # Get batch from data queue
        batch_x, batch_labels = create_inputs() # (50, 28, 28, 1), (50,)

        m_op = tf.placeholder(dtype=tf.float32, shape=())
        with tf.device('/gpu:0'):
            with slim.arg_scope([slim.variable], device='/cpu:0'):
                batch_squash = tf.divide(batch_x, 255.)
                batch_x = slim.batch_norm(batch_x, center=False, is_training=True, trainable=True)

                output, pose_out = net.build_arch(batch_x, coord_add, is_train=True, num_classes=num_classes) # (50, 10), (50, 10, 18)
                tf.logging.debug(pose_out.get_shape())

                # Define loss = spread_loss + reconstruction loss
                loss, spread_loss, mse, _ = net.spread_loss(output, pose_out, batch_squash, batch_labels, m_op)

                acc = net.test_accuracy(output, batch_labels)
                tf.summary.scalar('spread_loss', spread_loss)
                tf.summary.scalar('reconstruction_loss', mse)
                tf.summary.scalar('all_loss', loss)
                tf.summary.scalar('train_acc', acc)

            grad = opt.compute_gradients(loss)
            # See: https://stackoverflow.com/questions/40701712/how-to-check-nan-in-gradients-in-tensorflow-when-updating
            grad_check = [tf.check_numerics(g, message='Gradient NaN Found!')
                          for g, _ in grad if g is not None] + [tf.check_numerics(loss, message='Loss NaN Found')]

        # Apply gradients
        with tf.control_dependencies(grad_check):
            update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
            with tf.control_dependencies(update_ops):
                train_op = opt.apply_gradients(grad, global_step=global_step)

        # Set Session settings
        sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True, log_device_placement=False))
        sess.run(tf.local_variables_initializer())
        sess.run(tf.global_variables_initializer())

        # Set Saver
        var_to_save = [v for v in tf.global_variables() if 'Adam' not in v.name]  # Don't save redundant Adam beta/gamma
        saver = tf.train.Saver(var_list=var_to_save, max_to_keep=cfg.epoch)

        # Display parameters
        total_p = np.sum([np.prod(v.get_shape().as_list()) for v in var_to_save]).astype(np.int32)
        train_p = np.sum([np.prod(v.get_shape().as_list())
                          for v in tf.trainable_variables()]).astype(np.int32)
        logger.info('Total Parameters: {}'.format(total_p))
        logger.info('Trainable Parameters: {}'.format(train_p))

        # read snapshot
        # latest = os.path.join(cfg.logdir, 'model.ckpt-4680')
        # saver.restore(sess, latest)

        # Set summary op
        summary_op = tf.summary.merge_all()

        # Start coord & queue
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)

        # Set summary writer
        if not os.path.exists(cfg.logdir + '/caps/{}/train_log/'.format(dataset_name)):
            os.makedirs(cfg.logdir + '/caps/{}/train_log/'.format(dataset_name))
        summary_writer = tf.summary.FileWriter(
            cfg.logdir + f"/caps/{dataset_name}/train_log/", graph=sess.graph)  # graph = sess.graph, huge!

        # Main loop
        m_min = 0.2
        m_max = 0.9
        m = m_min
        for step in range(cfg.epoch * num_batches_per_epoch + 1):
            tic = time.time()
            # TF queue would pop batch until no file
            try:
                _, loss_value, summary_str = sess.run([train_op, loss, summary_op], feed_dict={m_op: m})
                logger.info('%d iteration finishes in ' % step + '%f second' %
                            (time.time() - tic) + ' loss=%f' % loss_value)
            except KeyboardInterrupt:
                sess.close()
                sys.exit()
            except tf.errors.InvalidArgumentError:
                logger.warning('%d iteration contains NaN gradients. Discard.' % step)
                continue
            else:
                if step % 5 == 0:
                    summary_writer.add_summary(summary_str, step)

                """Epoch wise linear annealling."""
                if (step % num_batches_per_epoch) == 0:
                    if step > 0:
                        m += (m_max - m_min) / (cfg.epoch * cfg.m_schedule)
                        if m > m_max:
                            m = m_max

                    # Save model periodically
                    ckpt_path = os.path.join(cfg.logdir + '/caps/{}/'.format(dataset_name), 'model-{:.4f}.ckpt'.format(loss_value))
                    saver.save(sess, ckpt_path, global_step=step)
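The grad_check / InvalidArgumentError pattern above skips a batch whenever any gradient goes NaN instead of crashing the run. A minimal sketch of the same mechanism, assuming TensorFlow 1.x (the API used throughout these snippets; tf.compat.v1 in graph mode on newer installs). The toy loss is chosen so that a negative input produces a NaN gradient:

# Minimal sketch of the NaN-gradient guard above (TensorFlow 1.x assumed).
import tensorflow as tf

x = tf.placeholder(tf.float32, shape=())
w = tf.get_variable('w', [], initializer=tf.ones_initializer())
loss = w * tf.log(x)                      # d(loss)/dw = log(x): NaN for x < 0

opt = tf.train.GradientDescentOptimizer(0.1)
grads = opt.compute_gradients(loss)
grad_check = [tf.check_numerics(g, message='Gradient NaN Found!')
              for g, _ in grads if g is not None]
with tf.control_dependencies(grad_check):
    train_op = opt.apply_gradients(grads)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for value in (2.0, -1.0, 3.0):
        try:
            sess.run(train_op, feed_dict={x: value})
        except tf.errors.InvalidArgumentError:
            print('NaN gradient for x=%.1f, batch discarded' % value)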
Ejemplo n.º 16
0
def main(args):
    """Get dataset hyperparameters."""
    assert len(args) == 3 and isinstance(args[1], str) and isinstance(args[2], str)
    dataset_name = args[1]
    model_name = args[2]
    coord_add = get_coord_add(dataset_name)
    dataset_size_train = get_dataset_size_train(dataset_name)
    dataset_size_test = get_dataset_size_test(dataset_name)
    num_classes = get_num_classes(dataset_name)
    create_inputs = get_create_inputs(
        dataset_name, is_train=False, epochs=cfg.epoch)

    """Set reproduciable random seed"""
    tf.set_random_seed(1234)

    with tf.Graph().as_default():
        num_batches_per_epoch_train = int(dataset_size_train / cfg.batch_size)
        num_batches_test = int(dataset_size_test / cfg.batch_size * 0.1)

        batch_x, batch_labels = create_inputs()
        batch_x = slim.batch_norm(batch_x, center=False, is_training=False, trainable=False)
        if model_name == "caps":
            output, _ = net.build_arch(batch_x, coord_add,
                                       is_train=False, num_classes=num_classes)
        elif model_name == "cnn_baseline":
            output = net.build_arch_baseline(batch_x,
                                             is_train=False, num_classes=num_classes)
        else:
            raise "Please select model from 'caps' or 'cnn_baseline' as the secondary argument of eval.py!"
        batch_acc = net.test_accuracy(output, batch_labels)
        saver = tf.train.Saver()

        step = 0

        summaries = []
        summaries.append(tf.summary.scalar('accuracy', batch_acc))
        summary_op = tf.summary.merge(summaries)

        with tf.Session(config=tf.ConfigProto(
                allow_soft_placement=True, log_device_placement=False)) as sess:
            sess.run(tf.local_variables_initializer())
            sess.run(tf.global_variables_initializer())

            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(sess=sess, coord=coord)
            if not os.path.exists(cfg.test_logdir + '/{}/{}/'.format(model_name, dataset_name)):
                os.makedirs(cfg.test_logdir + '/{}/{}/'.format(model_name, dataset_name))
            summary_writer = tf.summary.FileWriter(
                cfg.test_logdir + '/{}/{}/'.format(model_name, dataset_name), graph=sess.graph)  # graph=sess.graph, huge!

            files = os.listdir(cfg.logdir + '/{}/{}/'.format(model_name, dataset_name))
            for epoch in range(1, cfg.epoch):
                # the checkpoint filename also embeds the loss value, so match it by its '.ckpt-<step>' suffix
                ckpt_re = ".ckpt-%d" % (num_batches_per_epoch_train * epoch)
                for __file in files:
                    if __file.endswith(ckpt_re + ".index"):
                        ckpt = os.path.join(cfg.logdir + '/{}/{}/'.format(model_name, dataset_name), __file[:-6])
                # ckpt = os.path.join(cfg.logdir, "model.ckpt-%d" % (num_batches_per_epoch_train * epoch))
                saver.restore(sess, ckpt)

                accuracy_sum = 0
                for i in range(num_batches_test):
                    batch_acc_v, summary_str = sess.run([batch_acc, summary_op])
                    print('%d batches are tested.' % step)
                    summary_writer.add_summary(summary_str, step)

                    accuracy_sum += batch_acc_v

                    step += 1

                ave_acc = accuracy_sum / num_batches_test
                print('the average accuracy is %f' % ave_acc)

            coord.join(threads)
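Because the training script names checkpoints after the loss value (model-<loss>.ckpt-<step>), the eval loop above cannot build the path directly and instead scans the log directory for the '.ckpt-<step>' suffix. A small standalone sketch of that lookup (directory and file names are illustrative assumptions):

# Sketch of the checkpoint lookup above: find the checkpoint whose filename
# ends with '.ckpt-<step>' and strip the '.index' extension before restoring.
import os

def find_checkpoint(ckpt_dir, step):
    suffix = '.ckpt-%d.index' % step
    for name in os.listdir(ckpt_dir):
        if name.endswith(suffix):
            return os.path.join(ckpt_dir, name[:-len('.index')])
    return None

# e.g. find_checkpoint('logdir/caps/mnist/', 1100)
# -> 'logdir/caps/mnist/model-0.1234.ckpt-1100'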
Ejemplo n.º 17
0
def main(_):
    coord_add = [[[8., 8.], [12., 8.], [16., 8.]],
                 [[8., 12.], [12., 12.], [16., 12.]],
                 [[8., 16.], [12., 16.], [16., 16.]]]

    with tf.Graph().as_default():
        batch_x, batch_labels, datanum = utils.get_batch_data(
            is_training=False)
        num_batches_test = math.ceil(datanum /
                                     cfg.batch_size)  #get the ceiling int

        output = net.build_arch(batch_x, coord_add, is_train=False)
        predict = tf.argmax(output, axis=1)
        batch_acc = net.test_accuracy(output, batch_labels)
        saver = tf.train.Saver()

        step = 0

        summaries = []
        summaries.append(tf.summary.scalar('accuracy', batch_acc))
        summary_op = tf.summary.merge(summaries)

        sess = tf.Session()
        tf.train.start_queue_runners(sess=sess)
        ckpt = tf.train.get_checkpoint_state(cfg.logdir)
        if ckpt and ckpt.model_checkpoint_path:
            saver.restore(sess, ckpt.model_checkpoint_path)
            print(ckpt.model_checkpoint_path)
        summary_writer = tf.summary.FileWriter(cfg.test_logdir,
                                               graph=sess.graph)

        for epoch in range(cfg.test_epoch):
            accuracy_sum = 0
            for i in range(num_batches_test):
                y_pred, y, batch_acc_v, summary_str = sess.run(
                    [predict, batch_labels, batch_acc, summary_op])

                if i % 10 == 0:
                    print('%d/%d batches are tested.' %
                          (step, num_batches_test))
                    #print("labels:\n",batch_labels)
                    print("Y:\n", y)
                    print("Y_prediction:", batch_acc_v, "\n", y_pred)
                summary_writer.add_summary(summary_str, step)
                accuracy_sum += batch_acc_v
                step += 1
                if i == 0:
                    y_pred1 = y_pred
                    label1 = y
                else:
                    y_pred1 = np.concatenate((y_pred1, y_pred), axis=0)
                    label1 = np.concatenate((label1, y), axis=0)
                #print("Label:",np.shape(label1),"\n", label1)

            ave_acc = accuracy_sum / num_batches_test
            # print("The last batch----Y:",np.shape(y),"\n", y)
            # print("Y_prediction:", batch_acc_v, "\n", y_pred)
            print(epoch, 'epoch: average accuracy is %f' % ave_acc)

            print(np.shape(y_pred1), ",", datanum)
            label1 = label1[:datanum]
            y_pred1 = y_pred1[:datanum]
            print("label:", np.shape(label1))
            trade_data.out_indi_data(cfg.test_dataset,
                                     y_pred1,
                                     datalen=cfg.image_size)
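The last queue batch may repeat examples to fill cfg.batch_size, so the loop above concatenates every batch's predictions and then slices back to datanum before exporting. The same bookkeeping as a small NumPy sketch (sizes and values are illustrative):

# Sketch of the prediction accumulation above: concatenate per-batch
# predictions, then trim to the true dataset size because the final batch
# may be padded.
import numpy as np

batch_size, datanum = 50, 120
num_batches = -(-datanum // batch_size)   # ceiling division -> 3 batches

batches = [np.random.randint(0, 10, batch_size) for _ in range(num_batches)]
y_pred_all = np.concatenate(batches, axis=0)[:datanum]
print(y_pred_all.shape)                   # (120,)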
Ejemplo n.º 18
0
def main(args):
    """Get dataset hyperparameters."""
    assert len(args) == 2 and isinstance(args[1], str)
    dataset_name = args[1]
    logger.info('Using dataset: {}'.format(dataset_name))
    coord_add = get_coord_add(dataset_name)
    dataset_size = get_dataset_size_train(dataset_name)
    num_classes = get_num_classes(dataset_name)
    create_inputs = get_create_inputs(dataset_name,
                                      is_train=True,
                                      epochs=cfg.epoch)
    """Set reproduciable random seed"""
    tf.set_random_seed(1234)

    with tf.Graph().as_default(), tf.device('/cpu:0'):
        """Get global_step."""
        global_step = tf.get_variable('global_step', [],
                                      initializer=tf.constant_initializer(0),
                                      trainable=False)
        """Get batches per epoch."""
        num_batches_per_epoch = int(dataset_size / cfg.batch_size)
        """Set tf summaries."""
        summaries = []
        """Use exponential decay leanring rate?"""
        lrn_rate = tf.maximum(
            tf.train.exponential_decay(1e-3, global_step, 2e2, 0.66), 1e-5)
        summaries.append(tf.summary.scalar('learning_rate', lrn_rate))
        opt = tf.train.AdamOptimizer(lrn_rate)
        """Get batch from data queue."""
        batch_x, batch_labels = create_inputs()
        # batch_y = tf.one_hot(batch_labels, depth=10, axis=1, dtype=tf.float32)
        """Define the dataflow graph."""
        m_op = tf.placeholder(dtype=tf.float32, shape=())
        with tf.device('/gpu:0'):
            with slim.arg_scope([slim.variable], device='/cpu:0'):
                output = net.build_arch(batch_x,
                                        coord_add,
                                        is_train=True,
                                        num_classes=num_classes)
                # loss = net.cross_ent_loss(output, batch_labels)
                loss = net.spread_loss(output, batch_labels, m_op)
            """Compute gradient."""
            grad = opt.compute_gradients(loss)
        """Add loss to summary."""
        summaries.append(tf.summary.scalar('spread_loss', loss))
        """Apply graident."""
        train_op = opt.apply_gradients(grad, global_step=global_step)
        """Set Session settings."""
        sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True,
                                                log_device_placement=False))
        sess.run(tf.local_variables_initializer())
        sess.run(tf.global_variables_initializer())
        """Set Saver."""
        var_to_save = [
            v for v in tf.global_variables() if 'Adam' not in v.name
        ]  # Don't save redundant Adam beta/gamma
        saver = tf.train.Saver(var_list=var_to_save, max_to_keep=cfg.epoch)
        """Display parameters"""
        total_p = np.sum([
            np.prod(v.get_shape().as_list()) for v in var_to_save
        ]).astype(np.int32)
        train_p = np.sum([
            np.prod(v.get_shape().as_list()) for v in tf.trainable_variables()
        ]).astype(np.int32)
        logger.info('Total Parameters: {}'.format(total_p))
        logger.info('Trainable Parameters: {}'.format(train_p))

        # read snapshot
        # latest = os.path.join(cfg.logdir, 'model.ckpt-4680')
        # saver.restore(sess, latest)
        """Set summary op."""
        summary_op = tf.summary.merge(summaries)
        """Start coord & queue."""
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)
        """Set summary writer"""
        summary_writer = tf.summary.FileWriter(
            cfg.logdir, graph=None)  # graph = sess.graph, huge!
        """Main loop."""
        m_min = 0.2
        m_max = 0.9
        m = m_min
        for step in range(cfg.epoch * num_batches_per_epoch):
            tic = time.time()
            """"TF queue would pop batch until no file"""
            _, loss_value = sess.run([train_op, loss], feed_dict={m_op: m})
            logger.info('%d iteration finishes in ' % step + '%f second' %
                        (time.time() - tic) + ' loss=%f' % loss_value)
            """Check NaN"""
            assert not np.isnan(loss_value), 'loss is nan'
            """Write to summary."""
            if step % 10 == 0:
                summary_str = sess.run(summary_op, feed_dict={m_op: m})
                summary_writer.add_summary(summary_str, step)
            """Epoch wise linear annealling."""
            if (step % num_batches_per_epoch) == 0:
                if step > 0:
                    m += (m_max - m_min) / (cfg.epoch * 0.6)
                    if m > m_max:
                        m = m_max
                """Save model periodically"""
                ckpt_path = os.path.join(
                    cfg.logdir, 'model-{}.ckpt'.format(round(loss_value, 4)))
                saver.save(sess, ckpt_path, global_step=step)
        """Join threads"""
        coord.join(threads)
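The lrn_rate op above decays the Adam learning rate from 1e-3 by a factor of 0.66 every 200 global steps (non-staircase), clamped at 1e-5. The equivalent arithmetic as a plain-Python sketch:

# Sketch of the learning-rate schedule above: non-staircase exponential decay
# from 1e-3 with factor 0.66 every 200 steps, floored at 1e-5.
def lrn_rate(step, base=1e-3, decay=0.66, decay_steps=200, floor=1e-5):
    return max(base * decay ** (step / decay_steps), floor)

for s in (0, 200, 1000, 5000):
    print(s, lrn_rate(s))
# 0 -> 0.001, 200 -> 0.00066, 1000 -> ~0.000125, 5000 -> floored at 1e-5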
Ejemplo n.º 19
0
def main(_):      
    height = width = 28
    if isPadding: height = width = 40
    
    train_x, train_y, test_x,test_y = get_mnist()
    
    X = tf.placeholder(tf.float32, [cfg.batch_size, height, width,1])
    Y = tf.placeholder(tf.int32, [cfg.batch_size])
    M = tf.placeholder(tf.float32, ())      

    predict,check = net.build_arch(X)
    
    predict_class = tf.cast(tf.argmax(predict,-1), tf.int32)
    accuracy = tf.reduce_mean(tf.cast(tf.equal(predict_class,Y),tf.float32))
    
    loss_spread = net.spread_loss(predict, Y, M)
    loss_regular = tf.add_n(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
    loss = loss_spread + loss_regular        
    train_op = tf.train.AdamOptimizer(0.001,0.5).minimize(loss)
    
    sess = tf.Session()        
    saver = tf.train.Saver()
    if isNewTrain: 
        sess.run(tf.global_variables_initializer())
        print('Initialized!')
    else :        
        saver.restore(sess, modelName)
        print("Model restored")
    
    margin = 0.5
    start_sec = time.time()
    iteration_train = int(np.minimum(cfg.max_count,len(train_x))/cfg.batch_size)
    iteration_test = int(np.minimum(cfg.max_count,len(test_x))/cfg.batch_size) 
    num_batches = cfg.batch_size
    for step in range(cfg.epoch+1):
        acc_sum_train = 0.0
        acc_sum_test = 0.0
        for i in range(iteration_train):
            tic = time.time()
            start = i*num_batches
            end = start+num_batches
            batch_x = train_x[start:end]
            if isPadding:batch_x = padding(batch_x, 40)                
            dic = {X:batch_x,Y:train_y[start:end],M:margin}
            _, loss_,acc,check_ = sess.run([train_op, loss,accuracy,check],dic)
            
            acc_sum_train+=acc/iteration_train
            now = strftime("%H:%M:%S", localtime())
            if i%int(iteration_train/2)==0:         
                print('%d/%d %d/%d batch acc:%.3f, loss:%.5f margin:%.2f sec:%.2f check:%.3f' 
                      % (step,cfg.epoch,i,iteration_train, acc,loss_, margin,time.time()-tic,np.std(check_)))

            assert not np.isnan(loss_) 
        
        for i in range(iteration_test):
            start = i * num_batches
            end = start + num_batches   
            batch_x = test_x[start:end]             
            if isPadding:batch_x = padding(batch_x, 40)
            acc_te = sess.run(accuracy, {X:batch_x,Y:test_y[start:end],M:margin})
            acc_sum_test+=acc_te/iteration_test
            
        print ('%d/%d train:%.3f  test:%.3f' %(step, cfg.epoch, acc_sum_train, acc_sum_test))
                    
        this_sec = time.time()
        if margin < 0.9 and margin < acc: margin = np.minimum(margin+0.01, 0.9)
        if acc_sum_train>=1 or step==cfg.epoch or this_sec - start_sec > 60 * 5 :
            start_sec = this_sec
            save_path = saver.save(sess, modelName)            
            print("Model Saved, time:%s, %s" %(now, save_path))                
            if acc_sum_train>=1: break   
        
    print('training finished', acc_sum_train, acc_sum_test)
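The padding() helper used above (to grow the 28x28 MNIST digits to 40x40 when isPadding is set) is not shown in this snippet; a plausible NumPy stand-in that zero-pads and centers a batch would look like this:

# Hypothetical stand-in for the padding() helper above: zero-pad a
# (N, 28, 28, 1) batch to (N, 40, 40, 1) with the digit centered.
import numpy as np

def pad_batch(batch, target=40):
    n, h, w, c = batch.shape
    top, left = (target - h) // 2, (target - w) // 2
    out = np.zeros((n, target, target, c), dtype=batch.dtype)
    out[:, top:top + h, left:left + w, :] = batch
    return out

print(pad_batch(np.ones((2, 28, 28, 1), np.float32)).shape)  # (2, 40, 40, 1)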
Ejemplo n.º 20
0
def main(args):
    """Get dataset hyperparameters."""
    assert len(args) == 3 and isinstance(args[1], str) and isinstance(
        args[2], str)
    dataset_name = args[1]
    model_name = args[2]
    coord_add = get_coord_add(dataset_name)
    dataset_size_train = get_dataset_size_train(dataset_name)
    dataset_size_test = get_dataset_size_test(dataset_name)
    num_classes = get_num_classes(dataset_name)
    create_inputs = get_create_inputs(dataset_name,
                                      is_train=False,
                                      epochs=cfg.epoch)
    """Set reproduciable random seed"""
    tf.set_random_seed(1234)

    with tf.Graph().as_default():
        num_batches_per_epoch_train = int(dataset_size_train / cfg.batch_size)
        num_batches_test = int(dataset_size_test / cfg.batch_size * 0.1)

        batch_x, batch_labels = create_inputs()
        batch_x = slim.batch_norm(batch_x,
                                  center=False,
                                  is_training=False,
                                  trainable=False)
        if model_name == "caps":
            output, _ = net.build_arch(batch_x,
                                       coord_add,
                                       is_train=False,
                                       num_classes=num_classes)
        elif model_name == "cnn_baseline":
            output = net.build_arch_baseline(batch_x,
                                             is_train=False,
                                             num_classes=num_classes)
        else:
            raise "Please select model from 'caps' or 'cnn_baseline' as the secondary argument of eval.py!"
        batch_acc = net.test_accuracy(output, batch_labels)
        saver = tf.train.Saver()

        step = 0

        summaries = []
        summaries.append(tf.summary.scalar('accuracy', batch_acc))
        summary_op = tf.summary.merge(summaries)

        session_config = tf.ConfigProto(
            device_count={'GPU': 0},
            gpu_options={
                'allow_growth': 1,
                # 'per_process_gpu_memory_fraction': 0.1,
                'visible_device_list': '0'
            },
            allow_soft_placement=True)
        with tf.Session(config=session_config) as sess:
            sess.run(tf.local_variables_initializer())
            sess.run(tf.global_variables_initializer())

            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(sess=sess, coord=coord)
            if not os.path.exists(cfg.test_logdir +
                                  '/{}/{}/'.format(model_name, dataset_name)):
                os.makedirs(cfg.test_logdir +
                            '/{}/{}/'.format(model_name, dataset_name))
            summary_writer = tf.summary.FileWriter(
                cfg.test_logdir + '/{}/{}/'.format(model_name, dataset_name),
                graph=sess.graph)  # graph=sess.graph, huge!

            files = os.listdir(cfg.logdir +
                               '/{}/{}/'.format(model_name, dataset_name))
            for epoch in range(1, cfg.epoch):
                # the checkpoint filename also embeds the loss value, so match it by its '.ckpt-<step>' suffix
                ckpt_re = ".ckpt-%d" % (num_batches_per_epoch_train * epoch)
                for __file in files:
                    if __file.endswith(ckpt_re + ".index"):
                        ckpt = os.path.join(
                            cfg.logdir +
                            '/{}/{}/'.format(model_name, dataset_name),
                            __file[:-6])
                # ckpt = os.path.join(cfg.logdir, "model.ckpt-%d" % (num_batches_per_epoch_train * epoch))
                saver.restore(sess, ckpt)

                accuracy_sum = 0
                for i in range(num_batches_test):
                    batch_acc_v, summary_str = sess.run(
                        [batch_acc, summary_op])
                    print('%d batches are tested.' % step)
                    summary_writer.add_summary(summary_str, step)

                    accuracy_sum += batch_acc_v

                    step += 1

                ave_acc = accuracy_sum / num_batches_test
                print('the average accuracy is %f' % ave_acc)

            coord.join(threads)
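The session_config above combines allow_growth with an explicit visible_device_list; note it also sets device_count={'GPU': 0}, which caps the number of GPU devices at zero, so this evaluation effectively runs on CPU. A minimal sketch of building such a config, assuming TensorFlow 1.x:

# Sketch of a TF 1.x session config like the one above: allow_growth avoids
# reserving all GPU memory up front, visible_device_list picks which physical
# GPUs the process may use, and allow_soft_placement falls back to CPU when an
# op has no GPU kernel.
import tensorflow as tf

config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
config.gpu_options.allow_growth = True
config.gpu_options.visible_device_list = '0'
# sess = tf.Session(config=config)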
Ejemplo n.º 21
0
def main(_):
    coord_add = [[[8., 8.], [12., 8.], [16., 8.]],
                 [[8., 12.], [12., 12.], [16., 12.]],
                 [[8., 16.], [12., 16.], [16., 16.]]]

    coord_add = np.array(coord_add, dtype=np.float32)/28.
    data = utils.load_trade(is_training=True)
    datanum = data.num_examples
    with tf.Graph().as_default(), tf.device('/cpu:0'):
        batch_x =tf.placeholder(tf.float32,[cfg.batch_size,cfg.image_size,cfg.image_size,3])
        batch_labels = tf.placeholder(tf.int32,[cfg.batch_size])
        global_step = tf.get_variable('global_step', [], initializer=tf.constant_initializer(0), trainable=False)
        opt = tf.train.AdamOptimizer()

        #batch_x, batch_labels,datanum = utils.get_shuffle_batch_data(is_training=True)
        num_batches_per_epoch = int(datanum / cfg.batch_size)
        print(datanum,num_batches_per_epoch)
        # batch_y = tf.one_hot(batch_labels, depth=10, axis=1, dtype=tf.float32)

        m_op = tf.placeholder(dtype=tf.float32, shape=())
        with tf.device('/gpu:0'):
            with slim.arg_scope([slim.variable], device='/cpu:0'):
                output = net.build_arch(batch_x, coord_add, is_train=True)
                # loss = net.cross_ent_loss(output, batch_labels)
                loss = net.spread_loss(output, batch_labels, m_op)
                accuracy = net.test_accuracy(output,batch_labels)
            grad = opt.compute_gradients(loss)

        loss_name = 'spread_loss'

        # Print trainable variable parameter statistics to stdout.
        # By default, statistics are associated with each graph node.
        param_stats = tf.contrib.tfprof.model_analyzer.print_model_analysis(
            tf.get_default_graph(),
            tfprof_options=tf.contrib.tfprof.model_analyzer.
                TRAINABLE_VARS_PARAMS_STAT_OPTIONS)

        # param_stats is tensorflow.tfprof.TFGraphNodeProto proto.
        # Let's print the root below.
        sys.stdout.write('total_params: %d\n' % param_stats.total_parameters)

        summaries = []
        summaries.append(tf.summary.scalar(loss_name, loss))
        summaries.append(tf.summary.scalar("accuracy",accuracy))

        train_op = opt.apply_gradients(grad, global_step=global_step)

        sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True, log_device_placement=False))
        sess.run(tf.global_variables_initializer())

        # add addition options to trace the session execution
        options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
        run_metadata = tf.RunMetadata()

        saver = tf.train.Saver(tf.global_variables(), max_to_keep=10)  #cfg.epoch)

        # restore from the check point
        ckpt = tf.train.get_checkpoint_state(cfg.logdir)
        if ckpt and ckpt.model_checkpoint_path:
            saver.restore(sess, ckpt.model_checkpoint_path)
            initial_step = int(ckpt.model_checkpoint_path.split('-')[1])
            print(ckpt, ckpt.model_checkpoint_path, initial_step)
        else:
            initial_step =0
        m = 0.2

        summary_op = tf.summary.merge(summaries)
        tf.train.start_queue_runners(sess=sess)

        summary_writer = tf.summary.FileWriter(cfg.logdir, graph=sess.graph)


        cal_num=0
        for step in range(cfg.epoch):
            for i in range(num_batches_per_epoch):
                tic = time.time()
                x,y = data.next_batch(cfg.batch_size)
                _, loss_value, accuracy_val = sess.run(
                    [train_op, loss, accuracy],
                    feed_dict={batch_x: x, batch_labels: y, m_op: m})
                print('%d/%d, %d/%d iteration is finished in ' %
                      (step, cfg.epoch, i, num_batches_per_epoch) +
                      '%f second' % (time.time() - tic) + ',m:', m,
                      ',loss: %f' % loss_value, ",accuracy:", accuracy_val)

                assert not np.isnan(loss_value), 'loss is nan'
                cal_num+=1
                if i % 30 == 0:

                    summary_str = sess.run(summary_op, feed_dict={batch_x:x,batch_labels:y,m_op: m},
                                           options=options,
                                           run_metadata=run_metadata
                                           )
                    summary_writer.add_run_metadata(run_metadata,'step%d'% cal_num)
                    summary_writer.add_summary(summary_str, initial_step+cal_num)

                    # Print to stdout an analysis of the memory usage and the timing information
                    # broken down by operations.
                    # tf.contrib.tfprof.model_analyzer.print_model_analysis(
                    #     tf.get_default_graph(),
                    #     run_meta=run_metadata,
                    #     tfprof_options=tf.contrib.tfprof.model_analyzer.PRINT_ALL_TIMING_MEMORY)

                    # fetched_timeline = timeline.Timeline(run_metadata.step_stats)
                    # chrome_trace = fetched_timeline.generate_chrome_trace_format()
                    # with open('./time_line/timeline_02_step_%d.json' % i, 'w') as f:
                    #     f.write(chrome_trace)

                if cal_num % cfg.saveperiod == 0:
                    ckpt_path = os.path.join(cfg.logdir, 'model.ckpt')
                    saver.save(sess, ckpt_path, global_step=initial_step + cal_num)

                if m<0.9:
                    m += round((0.9-0.2) / num_batches_per_epoch,5)
                else:
                    m = 0.9

            ckpt_path = os.path.join(cfg.logdir, 'model.ckpt')
            saver.save(sess, ckpt_path, global_step=initial_step+cal_num)
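When resuming, the script above recovers the step counter by splitting the checkpoint path on '-' and taking the numeric field, so later summaries and checkpoints continue from initial_step. A small sketch of that parsing; the path is an illustrative assumption, and splitting on the last '-' is slightly more robust if the log directory itself contains dashes:

# Sketch of recovering the resume step from a checkpoint path, as above.
ckpt_path = '/tmp/logdir/model.ckpt-4680'   # illustrative path
initial_step = int(ckpt_path.split('-')[-1])
print(initial_step)   # 4680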
Ejemplo n.º 22
0
def main(args):
    """Get dataset hyperparameters."""
    assert len(args) == 2 and isinstance(args[1], str)
    dataset_name = args[1]
    logger.info('Using dataset: {}'.format(dataset_name))
    """Set reproduciable random seed"""
    tf.set_random_seed(1234)

    coord_add = get_coord_add(dataset_name)
    dataset_size = get_dataset_size_train(dataset_name)
    num_classes = get_num_classes(dataset_name)

    # Prepare Training Data
    (x_train, y_train), (x_test, y_test) = utils.load_mnist_excluded()

    with tf.Graph().as_default():  #, tf.device('/cpu:0'):

        # Placeholders for input data and the targets
        x_input = tf.placeholder(tf.float32, (None, *IMG_DIM), name='Input')
        y_target = tf.placeholder(tf.int32, [
            None,
        ], name='Target')
        """Get global_step."""
        global_step = tf.get_variable('global_step', [],
                                      initializer=tf.constant_initializer(0),
                                      trainable=False)
        """Get batches per epoch."""
        num_batches_per_epoch = int(dataset_size / cfg.batch_size)
        """Use exponential decay leanring rate?"""
        lrn_rate = tf.maximum(
            tf.train.exponential_decay(1e-3, global_step,
                                       num_batches_per_epoch, 0.8), 1e-5)
        tf.summary.scalar('learning_rate', lrn_rate)
        opt = tf.train.AdamOptimizer()  # lrn_rate
        """Define the dataflow graph."""
        m_op = tf.placeholder(dtype=tf.float32, shape=())
        with tf.device('/gpu:0'):
            with slim.arg_scope([slim.variable]):  #, device='/cpu:0'):
                sample_batch = tf.identity(x_input)
                batch_labels = tf.identity(y_target)
                batch_squash = tf.divide(sample_batch, 255.)
                batch_x = slim.batch_norm(sample_batch,
                                          center=False,
                                          is_training=True,
                                          trainable=True)
                output, pose_out = net.build_arch(batch_x,
                                                  coord_add,
                                                  is_train=True,
                                                  num_classes=num_classes)

                tf.logging.debug(pose_out.get_shape())
                loss, spread_loss, mse, reconstruction = net.spread_loss(
                    output, pose_out, batch_squash, batch_labels, m_op)
                sample_batch = tf.squeeze(sample_batch)
                decode_res_op = tf.concat([
                    sample_batch,
                    255 * tf.reshape(reconstruction,
                                     [cfg.batch_size, IMAGE_SIZE, IMAGE_SIZE])
                ],
                                          axis=0)
                acc = net.test_accuracy(output, batch_labels)
                tf.summary.scalar('spread_loss', spread_loss)
                tf.summary.scalar('reconstruction_loss', mse)
                tf.summary.scalar('all_loss', loss)
                tf.summary.scalar('train__batch_acc', acc)
            """Compute gradient."""
            grad = opt.compute_gradients(loss)
            # See: https://stackoverflow.com/questions/40701712/how-to-check-nan-in-gradients-in-tensorflow-when-updating
            grad_check = [
                tf.check_numerics(g, message='Gradient NaN Found!')
                for g, _ in grad if g is not None
            ] + [tf.check_numerics(loss, message='Loss NaN Found')]
        """Apply graident."""
        with tf.control_dependencies(grad_check):
            update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
            with tf.control_dependencies(update_ops):
                train_op = opt.apply_gradients(grad, global_step=global_step)
        """Set Session settings."""
        sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True,
                                                log_device_placement=False))
        sess.run(tf.local_variables_initializer())
        sess.run(tf.global_variables_initializer())
        """Set Saver."""
        var_to_save = [
            v for v in tf.global_variables() if 'Adam' not in v.name
        ]  # Don't save redundant Adam beta/gamma
        saver = tf.train.Saver(var_list=var_to_save, max_to_keep=cfg.epoch)
        """Display parameters"""
        total_p = np.sum([
            np.prod(v.get_shape().as_list()) for v in var_to_save
        ]).astype(np.int32)
        train_p = np.sum([
            np.prod(v.get_shape().as_list()) for v in tf.trainable_variables()
        ]).astype(np.int32)
        logger.info('Total Parameters: {}'.format(total_p))
        logger.info('Trainable Parameters: {}'.format(train_p))

        # read snapshot
        # latest = os.path.join(cfg.logdir, 'model.ckpt-4680')
        # saver.restore(sess, latest)
        """Set summary op."""
        summary_op = tf.summary.merge_all()
        """Start coord & queue."""
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)
        """Set summary writer"""
        if not os.path.exists(cfg.logdir +
                              '/caps/{}/train_log/'.format(dataset_name)):
            os.makedirs(cfg.logdir +
                        '/caps/{}/train_log/'.format(dataset_name))
        summary_writer = tf.summary.FileWriter(
            cfg.logdir + '/caps/{}/train_log/'.format(dataset_name),
            graph=sess.graph)  # graph = sess.graph, huge!

        if not os.path.exists(cfg.logdir +
                              '/caps/{}/images/'.format(dataset_name)):
            os.makedirs(cfg.logdir + '/caps/{}/images/'.format(dataset_name))
        """Main loop."""
        m_min = 0.2
        m_max = 0.9
        m = m_min
        max_iter = cfg.epoch * num_batches_per_epoch + 1

        for step in range(max_iter):
            tic = time.time()
            """"TF queue would pop batch until no file"""

            batch_x, batch_y = utils.get_random_mnist_batch(
                x_train, y_train, cfg.batch_size)

            try:
                _, loss_value, train_acc_val, summary_str, mse_value = sess.run(
                    [train_op, loss, acc, summary_op, mse],
                    feed_dict={
                        m_op: m,
                        x_input: batch_x,
                        y_target: batch_y
                    })

                sys.stdout.write(ERASE_LINE)
                sys.stdout.write('\r\r%d/%d iteration finishes in ' %
                                 (step, max_iter) + '%f second' %
                                 (time.time() - tic) +
                                 ' training accuracy = %f' % train_acc_val +
                                 ' loss=%f' % loss_value +
                                 '\treconstruction_loss=%f' % mse_value)
                sys.stdout.flush()
                time.sleep(0.001)

            except KeyboardInterrupt:
                sess.close()
                sys.exit()
            except tf.errors.InvalidArgumentError:
                logger.warning(
                    '%d iteration contains NaN gradients. Discard.' % step)
                continue
            else:
                """Write to summary."""
                if step % 10 == 0:
                    summary_writer.add_summary(summary_str, step)

                if step % 200 == 0:
                    images = sess.run(decode_res_op,
                                      feed_dict={
                                          m_op: m,
                                          x_input: batch_x,
                                          y_target: batch_y
                                      })
                    image = combine_images(images)
                    img_name = cfg.logdir + '/caps/{}/images/'.format(
                        dataset_name) + "/step_{}.png".format(str(step))
                    Image.fromarray(image.astype(np.uint8)).save(img_name)
                """Epoch wise linear annealling."""
                if (step % num_batches_per_epoch) == 0:
                    if step > 0:
                        m += (m_max - m_min) / (cfg.epoch * cfg.m_schedule)
                        if m > m_max:
                            m = m_max
                    """Save model periodically """
                    ckpt_path = os.path.join(
                        cfg.logdir + '/caps/{}/'.format(dataset_name),
                        'model-{:.4f}.ckpt'.format(loss_value))
                    saver.save(sess, ckpt_path, global_step=step)

        ckpt_path = os.path.join(cfg.logdir + '/caps/{}/'.format(dataset_name),
                                 'final-model-{:.4f}.ckpt'.format(loss_value))
        saver.save(sess, ckpt_path, global_step=step)

        print('Training is finished!')
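The combine_images() helper that builds the reconstruction grid above is not shown here; a plausible NumPy stand-in that tiles a batch of grayscale images into one image would be:

# Hypothetical stand-in for the combine_images() helper above: tile a batch of
# (N, H, W) grayscale images into a roughly square (rows*H, cols*W) grid.
import numpy as np

def tile_images(images):
    n, h, w = images.shape
    cols = int(np.ceil(np.sqrt(n)))
    rows = int(np.ceil(n / cols))
    grid = np.zeros((rows * h, cols * w), dtype=images.dtype)
    for idx, img in enumerate(images):
        r, c = divmod(idx, cols)
        grid[r * h:(r + 1) * h, c * w:(c + 1) * w] = img
    return grid

print(tile_images(np.random.rand(10, 28, 28)).shape)  # (84, 112)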