Example #1: run_predict()
def run_predict():
    data_sets = input_data.read_data_sets(FLAGS.input_data_dir,
                                          train=False,
                                          fill_labels=False)
    # Tell TensorFlow that the model will be built into the default Graph.
    with tf.Graph().as_default():
        # Generate placeholders for the images, labels and seqlens.
        #num_features
        images_placeholder, labels_placeholder, seqlen_placeholder = placeholder_inputs(
            input_data.NUM_FEATURES)
        # Build a Graph that computes predictions from the inference model.
        #images_lp, seqlen_lp, num_features, num_layers, hidden_units
        logits = lstm_ctc_ocr.inference(images_placeholder, seqlen_placeholder,
                                        FLAGS.num_layers, FLAGS.hidden_units)

        # Add to the Graph the Ops for loss calculation.
        #logits, labels_lp, seqlen_lp
        loss = lstm_ctc_ocr.loss(logits, labels_placeholder,
                                 seqlen_placeholder)
        # global counter
        global_step = tf.Variable(0, name='global_step', trainable=False)
        # Add to the Graph the Ops that calculate and apply gradients.
        #loss, initial_learning_rate, decay_steps, decay_rate, momentum
        train_op, learning_rate = lstm_ctc_ocr.training(
            loss, global_step, FLAGS.initial_learning_rate, FLAGS.decay_steps,
            FLAGS.decay_rate, FLAGS.momentum)
        # Add the Op to compare the logits to the labels during evaluation.
        #logits, labels_lp, seqlen_lp
        dense_decoded, lerr = lstm_ctc_ocr.evaluation(logits,
                                                      labels_placeholder,
                                                      seqlen_placeholder)
        # Add the variable initializer Op.
        init = tf.global_variables_initializer()
        # Create a saver for writing training checkpoints.
        saver = tf.train.Saver()
        # Create a session for running Ops on the Graph.
        sess = tf.Session()
        # Run the Op to initialize the variables.
        sess.run(init)

        ckpt = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
        if ckpt:
            saver.restore(sess, ckpt)
            print('restored from checkpoint {}'.format(ckpt))
        else:
            print('cannot restore: no checkpoint found in {}'.format(FLAGS.checkpoint_dir))

        # Evaluate against the test set.
        print(
            '-------------------------- Test Data Eval: --------------------------'
        )
        do_eval(sess, dense_decoded, lerr, learning_rate, images_placeholder,
                labels_placeholder, seqlen_placeholder, data_sets.test)
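
Example #1 (and Example #2 below) call module-level placeholder_inputs, fill_feed_dict and do_eval helpers that this listing does not show; Example #3 defines equivalent versions inside map_fun. For reference, here is a minimal sketch of the placeholder_inputs helper these examples assume, modeled on the Example #3 version (treating it as a module-level function is an assumption):

import tensorflow as tf

# Sketch of the placeholder_inputs helper assumed by Examples #1 and #2,
# mirroring the version defined inside map_fun in Example #3.
def placeholder_inputs(num_features):
    # Images arrive as [batch, time, features] for the dynamic LSTM.
    images_placeholder = tf.placeholder(tf.float32, [None, None, num_features])
    # CTC loss expects the labels as a SparseTensor.
    labels_placeholder = tf.sparse_placeholder(tf.int32)
    # One sequence length per example in the batch.
    seqlen_placeholder = tf.placeholder(tf.int32, [None])
    return images_placeholder, labels_placeholder, seqlen_placeholder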
Example #2: run_training()
def run_training():
  # Get the sets of images and labels for training, validation, and
  # test.
  data_sets = input_data.read_data_sets(FLAGS.input_data_dir, fill_labels=False)
  # Tell TensorFlow that the model will be built into the default Graph.
  with tf.Graph().as_default():
    # Generate placeholders for the images, labels and seqlens.
    #num_features
    images_placeholder, labels_placeholder, seqlen_placeholder = placeholder_inputs(input_data.NUM_FEATURES)

    # Build a Graph that computes predictions from the inference model.
    #images_lp, seqlen_lp, num_features, num_layers, hidden_units
    logits = lstm_ctc_ocr.inference(images_placeholder, 
                                    seqlen_placeholder,
                                    FLAGS.num_layers,
                                    FLAGS.hidden_units)
  
    # Add to the Graph the Ops for loss calculation.
    #logits, labels_lp, seqlen_lp
    loss = lstm_ctc_ocr.loss(logits, labels_placeholder, seqlen_placeholder)
    # global counter
    global_step = tf.Variable(0, name='global_step', trainable=False)
    # Add to the Graph the Ops that calculate and apply gradients.
    #loss, initial_learning_rate, decay_steps, decay_rate, momentum
    train_op, learning_rate = lstm_ctc_ocr.training(loss, global_step, 
                                                    FLAGS.initial_learning_rate, 
                                                    FLAGS.decay_steps, 
                                                    FLAGS.decay_rate, 
                                                    FLAGS.momentum)

    # Add the Op to compare the logits to the labels during evaluation.
    #logits, labels_lp, seqlen_lp
    dense_decoded, lerr = lstm_ctc_ocr.evaluation(logits, labels_placeholder, seqlen_placeholder)

    # Build the summary Tensor based on the TF collection of Summaries.
    summary = tf.summary.merge_all()

    # Add the variable initializer Op.
    init = tf.global_variables_initializer()

    # Create a saver for writing training checkpoints.
    saver = tf.train.Saver()

    # Create a session for running Ops on the Graph.
    sess = tf.Session()

    # Instantiate a SummaryWriter to output summaries and the Graph.
    summary_writer = tf.summary.FileWriter(FLAGS.log_dir, sess.graph)

    # And then after everything is built:

    # Run the Op to initialize the variables.
    sess.run(init)

    # Start the training loop.
    # NOTE: FLAGS.max_steps is used here as the number of training epochs.
    for cur_epoch in range(FLAGS.max_steps):
      start_time = time.time()

      steps_per_epoch = data_sets.train.steps_per_epoch(FLAGS.batch_size)

      for step_per_epoch in range(steps_per_epoch):
        # Fill a feed dictionary with the actual set of images and labels
        # for this particular training step.
        #data_set, images_pl, labels_pl, seqlen_pl
        feed_dict = fill_feed_dict(data_sets.train,
                                   images_placeholder,
                                   labels_placeholder, 
                                   seqlen_placeholder,
                                   )

        # Run one step of the model.  The return values are the activations
        # from the `train_op` (which is discarded) and the `loss` Op.  To
        # inspect the values of your Ops or variables, you may include them
        # in the list passed to sess.run() and the value tensors will be
        # returned in the tuple from the call.
        _, loss_value, g_step = sess.run([train_op, loss, global_step], feed_dict=feed_dict)
      
        duration = time.time() - start_time

        # Write the summaries and print an overview fairly often.
        # Print status to stdout.
        print('[global:%d epoch:%d/%d step:%d/%d] loss = %.2f (%.3f sec)' % (g_step, 
                                                                            cur_epoch, 
                                                                            FLAGS.max_steps,
                                                                            step_per_epoch,
                                                                            steps_per_epoch, 
                                                                            loss_value, 
                                                                            duration))
        if g_step % 100 == 0:
          # Update the events file.
          summary_str = sess.run(summary, feed_dict=feed_dict)
          summary_writer.add_summary(summary_str, g_step)
          summary_writer.flush()

        # Save a checkpoint and evaluate the model periodically.
        if (g_step + 1) % 500 == 0 or (g_step + 1) == FLAGS.max_steps:
          checkpoint_file = os.path.join(FLAGS.log_dir, 'model.ckpt')
          saver.save(sess, checkpoint_file, global_step=g_step)
          # Evaluate against the validation set.
          print('-------------------------- Validation Data Eval: --------------------------')
          do_eval(sess,
                  dense_decoded,
                  lerr,
                  learning_rate,
                  images_placeholder,
                  labels_placeholder,
                  seqlen_placeholder,
                  data_sets.validation)
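
run_training() and run_predict() read their hyperparameters from FLAGS, whose definitions fall outside this listing. Below is a hedged sketch, assuming TF 1.x tf.app.flags, of how the flags referenced above and the entry point might be declared; every default value is a placeholder, not taken from the source:

import tensorflow as tf

flags = tf.app.flags
flags.DEFINE_string('input_data_dir', '/tmp/data', 'Directory with the input data.')
flags.DEFINE_string('log_dir', '/tmp/log', 'Directory for checkpoints and summaries.')
flags.DEFINE_string('checkpoint_dir', '/tmp/log', 'Directory to restore checkpoints from.')
flags.DEFINE_integer('num_layers', 2, 'Number of LSTM layers.')
flags.DEFINE_integer('hidden_units', 128, 'LSTM hidden units per layer.')
flags.DEFINE_integer('batch_size', 64, 'Batch size.')
flags.DEFINE_integer('max_steps', 100, 'Number of training epochs (see run_training).')
flags.DEFINE_float('initial_learning_rate', 1e-2, 'Initial learning rate.')
flags.DEFINE_integer('decay_steps', 1000, 'Learning-rate decay steps.')
flags.DEFINE_float('decay_rate', 0.9, 'Learning-rate decay rate.')
flags.DEFINE_float('momentum', 0.9, 'Momentum for the optimizer.')
FLAGS = flags.FLAGS

def main(_):
    run_training()

if __name__ == '__main__':
    tf.app.run()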
Example #3: map_fun(args, ctx) for TensorFlowOnSpark
def map_fun(args, ctx):
    from tensorflowonspark import TFNode
    from datetime import datetime
    import math
    import numpy
    import tensorflow as tf
    import time
    import logging
    import lstm_ctc_ocr
    #import redis_logger_handler
    #redis_logger_handler.logging_setup(args.redis)

    worker_num = ctx.worker_num
    job_name = ctx.job_name
    task_index = ctx.task_index
    cluster_spec = ctx.cluster_spec
    worker_name = '(worker:%s tf:%s idx:%s)' % (worker_num, job_name,
                                                task_index)

    logging.info(
        '{0} batch_size:{1} initial_learning_rate:{2} decay_steps:{3} decay_rate:{4} momentum:{5}'
        .format(worker_name, args.batch_size, args.initial_learning_rate,
                args.decay_steps, args.decay_rate, args.momentum))
    # Delay PS nodes a bit, since workers seem to reserve GPUs more quickly/reliably (w/o conflict)
    if job_name == "ps":
        time.sleep((worker_num + 1) * 5)

    # Parameters
    CHANNELS = 1
    IMAGE_WIDTH = 120
    IMAGE_HEIGHT = 45
    NUM_FEATURES = IMAGE_HEIGHT * CHANNELS

    # Get TF cluster and server instances
    cluster, server = TFNode.start_cluster_server(ctx, 1, args.rdma)

    def sparse_tuple_from_label(sequences, dtype=numpy.int32):
        indices = []
        values = []
        for n, seq in enumerate(sequences):
            indices.extend(zip([n] * len(seq), range(len(seq))))
            values.extend(seq)
        indices = numpy.asarray(indices, dtype=numpy.int64)
        values = numpy.asarray(values, dtype=dtype)
        shape = numpy.asarray(
            [len(sequences),
             numpy.asarray(indices).max(0)[1] + 1],
            dtype=numpy.int64)
        return indices, values, shape
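        # For example, sparse_tuple_from_label([[1, 2], [3]]) returns
        # indices [[0, 0], [0, 1], [1, 0]], values [1, 2, 3] and shape [2, 2]:
        # the (indices, values, dense_shape) triple that tf.sparse_placeholder expects.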

    def get_input_lens(sequences):
        # Every image is treated as IMAGE_WIDTH (120) time steps.
        lengths = numpy.asarray([IMAGE_WIDTH for s in sequences], dtype=numpy.int64)
        return sequences, lengths

    def placeholder_inputs(num_features):
        images_placeholder = tf.placeholder(tf.float32,
                                            [None, None, num_features])
        labels_placeholder = tf.sparse_placeholder(tf.int32)
        seqlen_placeholder = tf.placeholder(tf.int32, [None])
        return images_placeholder, labels_placeholder, seqlen_placeholder

    def format_batch(data_set, batch_size, image_height, image_width):
        batch = data_set.next_batch(batch_size)
        images = []
        labels = []
        for item in batch:
            images.append(item[0])
            labels.append(item[1])
        xs = numpy.array(images)
        # [batch_size, height * width] => [batch_size, width, height]
        xs = xs.reshape(batch_size, image_width, image_height)
        xs = xs.astype(numpy.float32)
        xs = xs / 255.
        ys = labels
        return xs, ys

    def fill_feed_dict(xs, ys, images_pl, labels_pl, seqlen_pl):
        images_feed, seqlen_feed = get_input_lens(xs)
        labels_feed = sparse_tuple_from_label(ys)
        feed_dict = {
            images_pl: images_feed,
            labels_pl: labels_feed,
            seqlen_pl: seqlen_feed,
        }
        return feed_dict

    def do_eval(sess, dense_decoded, lastbatch_err, learning_rate,
                images_placeholder, labels_placeholder, seqlen_placeholder, xs,
                ys):
        true_count = 0  # Counts the number of correct predictions.
        feed_dict = fill_feed_dict(xs, ys, images_placeholder,
                                   labels_placeholder, seqlen_placeholder)
        dd, lerr, lr = sess.run([dense_decoded, lastbatch_err, learning_rate],
                                feed_dict=feed_dict)
        #accuracy calculation
        for i, origin_label in enumerate(ys):
            decoded_label = [j for j in dd[i] if j != -1]
            if i < 10:
                logging.info('{0} seq {1} => origin:{2} decoded:{3}'.format(
                    worker_name, i, origin_label, decoded_label))
            if origin_label == decoded_label:
                true_count += 1
        #accuracy
        acc = true_count * 1.0 / len(ys)
        #print subsummary
        logging.info(
            "%s accuracy = %.3f, lastbatch_err = %.3f, learning_rate = %.8f" %
            (worker_name, acc, lerr, lr))

    if job_name == "ps":
        server.join()
    elif job_name == "worker":
        # Assigns ops to the local worker by default.
        with tf.device(
                tf.train.replica_device_setter(
                    worker_device="/job:worker/task:%d" % task_index,
                    cluster=cluster)):
            # Generate placeholders for the images, labels and seqlens.
            images_placeholder, labels_placeholder, seqlen_placeholder = placeholder_inputs(
                NUM_FEATURES)
            # Build a Graph that computes predictions from the inference model.
            #images_lp, seqlen_lp, num_features, num_layers, hidden_units
            logits = lstm_ctc_ocr.inference(images_placeholder,
                                            seqlen_placeholder,
                                            args.num_layers, args.hidden_units)
            # Add to the Graph the Ops for loss calculation.
            #logits, labels_lp, seqlen_lp
            loss = lstm_ctc_ocr.loss(logits, labels_placeholder,
                                     seqlen_placeholder)
            tf.summary.scalar('loss', loss)
            # global counter
            global_step = tf.Variable(0, name='global_step', trainable=False)
            # Add to the Graph the Ops that calculate and apply gradients.
            #loss, initial_learning_rate, decay_steps, decay_rate, momentum
            train_op, learning_rate = lstm_ctc_ocr.training(
                loss, global_step, args.initial_learning_rate,
                args.decay_steps, args.decay_rate, args.momentum)
            # Add the Op to compare the logits to the labels during evaluation.
            dense_decoded, lerr = lstm_ctc_ocr.evaluation(
                logits, labels_placeholder, seqlen_placeholder)
            tf.summary.scalar('lerr', lerr)

            summary_op = tf.summary.merge_all()
            # Add the variable initializer Op.
            init_op = tf.global_variables_initializer()
            # Create a saver for writing training checkpoints.
            saver = tf.train.Saver()

        # Create a "supervisor", which oversees the training process and stores model state into HDFS
        logdir = TFNode.hdfs_path(ctx, args.model)
        logging.info("{0} tensorflow model path: {1}".format(
            worker_name, logdir))
        summary_writer = tf.summary.FileWriter("tensorboard_%d" % (worker_num),
                                               graph=tf.get_default_graph())

        if args.mode == "train":
            sv = tf.train.Supervisor(is_chief=(task_index == 0),
                                     logdir=logdir,
                                     init_op=init_op,
                                     summary_op=None,
                                     saver=saver,
                                     global_step=global_step,
                                     stop_grace_secs=300,
                                     save_model_secs=60)
        else:
            sv = tf.train.Supervisor(is_chief=(task_index == 0),
                                     logdir=logdir,
                                     summary_op=None,
                                     saver=saver,
                                     global_step=global_step,
                                     stop_grace_secs=300,
                                     save_model_secs=0)

        # The supervisor takes care of session initialization, restoring from
        # a checkpoint, and closing when done or an error occurs.
        validation_xs = None
        validation_ys = None
        validation_batchs = 10
        with sv.managed_session(server.target) as sess:
            logging.info("{0} session ready".format(worker_name))
            start_time = time.time()
            # Loop until the supervisor shuts down or 1000000 steps have completed.
            g_step = 0
            tf_feed = TFNode.DataFeed(ctx.mgr, args.mode == "train")
            # for do_eval samples
            if validation_xs is None or validation_ys is None:
                validation_xs, validation_ys = format_batch(
                    tf_feed, args.batch_size * validation_batchs, IMAGE_HEIGHT,
                    IMAGE_WIDTH)

            while (not sv.should_stop() and not tf_feed.should_stop()
                   and g_step < (args.steps * args.epochs - validation_batchs)):
                # Run a training step asynchronously.
                # See `tf.train.SyncReplicasOptimizer` for additional details on how to
                # perform *synchronous* training.

                # using feed_dict
                xs, ys = format_batch(tf_feed, args.batch_size, IMAGE_HEIGHT,
                                      IMAGE_WIDTH)
                feed_dict = fill_feed_dict(xs, ys, images_placeholder,
                                           labels_placeholder,
                                           seqlen_placeholder)
                # Run one step of the model.  The return values are the activations
                # from the `train_op` (which is discarded) and the `loss` Op.  To
                # inspect the values of your Ops or variables, you may include them
                # in the list passed to sess.run() and the value tensors will be
                # returned in the tuple from the call.
                _, loss_value, g_step = sess.run([train_op, loss, global_step],
                                                 feed_dict=feed_dict)

                duration = time.time() - start_time

                # Write the summaries and print an overview fairly often.
                if (g_step + 1) % 100 == 0:
                    # Print status to stdout.
                    logging.info(
                        '%s [g_step:%d epoch:%d/%d step:%d/%d] loss = %.2f (%.3f sec)'
                        % (worker_name, g_step, g_step / args.steps,
                           args.epochs, g_step % args.steps, args.steps,
                           loss_value, duration))
                    # Update the events file.
                    if sv.is_chief:
                        summary = sess.run(summary_op, feed_dict=feed_dict)
                        summary_writer.add_summary(summary, g_step)
                        summary_writer.flush()

                # Save a checkpoint and evaluate the model periodically.
                if (g_step + 1) % 500 == 0 or (g_step + 1) == args.steps:
                    # Evaluate against the validation set.
                    logging.info('{0} ---- Validation Data Eval: ----'.format(
                        worker_name))
                    do_eval(sess, dense_decoded, lerr, learning_rate,
                            images_placeholder, labels_placeholder,
                            seqlen_placeholder, validation_xs, validation_ys)

            if sv.should_stop() or g_step >= (args.steps * args.epochs -
                                              validation_batchs):
                logging.info("{0} terminating tf_feed".format(worker_name))
                tf_feed.terminate()

        # Ask for all the services to stop.
        logging.info("{0} stopping supervisor".format(worker_name))
        sv.stop()
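
For context, map_fun is intended to be launched from a Spark driver through TensorFlowOnSpark. The following is a hedged sketch of such a driver, not part of the source: the argument defaults and the dummy RDD are placeholders, while TFCluster.run, cluster.train and cluster.shutdown are the standard TensorFlowOnSpark calls.

from pyspark import SparkConf, SparkContext
from tensorflowonspark import TFCluster
import argparse

if __name__ == '__main__':
    sc = SparkContext(conf=SparkConf().setAppName('lstm_ctc_ocr'))

    # Flags consumed by map_fun above; every default here is a placeholder.
    parser = argparse.ArgumentParser()
    parser.add_argument('--batch_size', type=int, default=64)
    parser.add_argument('--initial_learning_rate', type=float, default=1e-2)
    parser.add_argument('--decay_steps', type=int, default=1000)
    parser.add_argument('--decay_rate', type=float, default=0.9)
    parser.add_argument('--momentum', type=float, default=0.9)
    parser.add_argument('--num_layers', type=int, default=2)
    parser.add_argument('--hidden_units', type=int, default=128)
    parser.add_argument('--steps', type=int, default=1000)
    parser.add_argument('--epochs', type=int, default=1)
    parser.add_argument('--mode', default='train')
    parser.add_argument('--model', default='lstm_ctc_ocr_model')
    parser.add_argument('--rdma', action='store_true')
    args = parser.parse_args()

    num_executors = int(sc._conf.get('spark.executor.instances', '4'))
    num_ps = 1

    # Placeholder input: an RDD of (flattened 120x45 image, label sequence)
    # pairs, matching what format_batch in map_fun unpacks.
    dummy = [([0.0] * (120 * 45), [1, 2, 3]) for _ in range(1024)]
    images_labels_rdd = sc.parallelize(dummy)

    cluster = TFCluster.run(sc, map_fun, args, num_executors, num_ps,
                            tensorboard=False, input_mode=TFCluster.InputMode.SPARK)
    cluster.train(images_labels_rdd, args.epochs)
    cluster.shutdown()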