def calculate_error_rate_variance_from_table(
    tables=['../data/tfidf-pairs.npy'],
    split=0.0,
    deviation=0.025,
    verbose=True,
    normalize=(0.0, 1.0)):
    """Sample the error rate at 100 split points within +/- `deviation` of
    `split`, returning the standard deviation and maximum of those errors.

    Note: despite the name, the first return value is a standard deviation,
    not a variance. `metrics.error_rate` is fed the first two loaded tables,
    so `tables` must name at least two files.
    """
    data = []
    for t in tables:
        with open(t, 'rb') as f:  # close the handle once the table is loaded
            data.append(np.load(f))
    # Rescale each table onto [0, 1] using the (min, max) `normalize` bounds.
    for i, d in enumerate(data):
        data[i] = (d - normalize[0]) / (normalize[1] - normalize[0])

    if verbose:
        print 'Calculating error rate variance with a deviation of %.5f...' % deviation
    splits = np.linspace(split - deviation, split + deviation, 100)
    errors = []
    for s in splits:
        errors.append(metrics.error_rate(data[0], data[1], s))
    errors = np.asarray(errors)
    err_std = np.std(errors)
    err_max = np.max(errors)
    if verbose:
        print 'Stdev: %.5f' % err_std
        print 'Max value: %.5f' % err_max
        print 'Used split point: %.5f' % split
    return err_std, err_max
def calculate_error_rate_from_table(tables=['../data/tfidf-pairs.npy'],
                                    split=0.0,
                                    verbose=True,
                                    normalize=(0.0, 1.0)):
    """Compute the error rate for a single split point.

    `metrics.error_rate` is fed the first two loaded tables, so `tables`
    must name at least two files.
    """
    data = []
    for t in tables:
        with open(t, 'rb') as f:  # close the handle once the table is loaded
            data.append(np.load(f))
    # Rescale each table onto [0, 1] using the (min, max) `normalize` bounds.
    for i, d in enumerate(data):
        data[i] = (d - normalize[0]) / (normalize[1] - normalize[0])

    if verbose:
        print 'Calculating error rate...'
    error = metrics.error_rate(data[0], data[1], split)
    if verbose:
        print 'Error: %.5f' % error
        print 'Used split point: %.5f' % split
    return error
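
A minimal usage sketch for the two helpers above. The pair of file names and the project-local metrics module are assumptions here: metrics.error_rate(a, b, split) is expected to score how well the threshold `split` separates the two tables.

# Hypothetical usage; both .npy paths are placeholders.
import numpy as np
import metrics  # project-local module providing error_rate()

pair_tables = ['../data/tfidf-pos-pairs.npy', '../data/tfidf-neg-pairs.npy']
error = calculate_error_rate_from_table(tables=pair_tables, split=0.5)
err_std, err_max = calculate_error_rate_variance_from_table(
    tables=pair_tables, split=0.5, deviation=0.025)
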
Example #5
def main(argv = None):  # pylint: disable=unused-argument
  # load imageset
  train_set_folder = os.path.join(module_dir, os.path.pardir, os.path.pardir, 'data/ocr/train')
  test_set_folder = os.path.join(module_dir, os.path.pardir, os.path.pardir, 'data/ocr/test')

  # Extract it into numpy arrays.
  train_data, train_labels = load_imageset(train_set_folder, to_img_size = (28, 28, 1), ext = 'png')
  test_data, test_labels = load_imageset(test_set_folder, to_img_size = (28, 28, 1), ext = 'png')

  height = train_data.shape[1]
  width = train_data.shape[2]
  channel = (train_data.shape[3] if train_data.ndim > 3 else 1)

  label_max = np.amax(train_labels)
  label_min = np.amin(train_labels)
  num_labels = label_max - label_min + 1
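  # NOTE: this assumes the labels are contiguous integers in
  # [label_min, label_max]; any gaps would inflate num_labels.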

  # Generate a validation set.
  train_data, train_labels, validation_data, validation_labels = split_cv(train_data, train_labels, 0.1)

  num_epochs = NUM_EPOCHS
  train_size = train_labels.shape[0]

  # This is where training samples and labels are fed to the graph.
  # These placeholder nodes will be fed a batch of training data at each
  # training step using the {feed_dict} argument to the Run() call below.
  train_data_node = tf.placeholder(
      tf.float32,
      shape = (BATCH_SIZE, height, width, channel))
  train_labels_node = tf.placeholder(tf.int64, shape = (BATCH_SIZE,))

  eval_data = tf.placeholder(
      tf.float32,
      shape=(EVAL_BATCH_SIZE, height, width, channel))

  # The variables below hold all the trainable weights. They are passed an
  # initial value which will be assigned when we call:
  # {tf.initialize_all_variables().run()}
  conv1_weights = tf.Variable(
      tf.truncated_normal([5, 5, channel, 32],  # 5x5 filter, depth 32.
                          stddev = 0.1,
                          seed = SEED),
      name="conv1_weights")
  conv1_biases = tf.Variable(tf.zeros([32]), name = "conv1_biases")
  
  conv2_weights = tf.Variable(
      tf.truncated_normal([5, 5, 32, 64],
                          stddev = 0.1,
                          seed = SEED),
      name="conv2_weights")
  conv2_biases = tf.Variable(tf.constant(0.1, shape = [64]), name = "conv2_biases")
  
  fc1_weights = tf.Variable(  # fully connected, depth 512.
      tf.truncated_normal(
          [height // 4 * width // 4 * 64, 512],
          stddev = 0.1,
          seed = SEED),
      name = "fc1_weights")
  fc1_biases = tf.Variable(tf.constant(0.1, shape = [512]), name = "fc1_biases")
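  # fc1's input size above is height // 4 * width // 4 * 64 because the two
  # 2x2/stride-2 max-pooling layers in lenet2 below each halve the spatial
  # dimensions.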
  
  fc2_weights = tf.Variable(
      tf.truncated_normal([512, num_labels],
                          stddev = 0.1,
                          seed = SEED),
      name = "fc2_weights")
  fc2_biases = tf.Variable(tf.constant(0.1, shape = [num_labels]), name = "fc2_biases")

  # We will replicate the model structure for the training subgraph, as well
  # as the evaluation subgraphs, while sharing the trainable parameters.
  def lenet2(data, train = False):
    """LeNet2 definition."""
    # 2D convolution, with 'SAME' padding (i.e. the output feature map has
    # the same size as the input). Note that {strides} is a 4D array whose
    # shape matches the data layout: [n, h, w, c].
    conv1 = tf.nn.conv2d(data,
                         conv1_weights,
                         strides = [1, 1, 1, 1],
                         padding = 'SAME')
    # Bias and rectified linear non-linearity.
    relu1 = tf.nn.relu(tf.nn.bias_add(conv1, conv1_biases))
    # Max pooling. The kernel size spec {ksize} also follows the layout of
    # the data. Here we have a pooling window of 2, and a stride of 2.
    pool1 = tf.nn.max_pool(relu1,
                           ksize = [1, 2, 2, 1],
                           strides = [1, 2, 2, 1],
                           padding = 'SAME')
    conv2 = tf.nn.conv2d(pool1,
                         conv2_weights,
                         strides = [1, 1, 1, 1],
                         padding = 'SAME')
    relu2 = tf.nn.relu(tf.nn.bias_add(conv2, conv2_biases))
    pool2 = tf.nn.max_pool(relu2,
                           ksize = [1, 2, 2, 1],
                           strides = [1, 2, 2, 1],
                           padding = 'SAME')
    # Reshape the feature map cuboid into a 2D matrix to feed it to the
    # fully connected layers.
    pool_shape = pool2.get_shape().as_list()
    reshape = tf.reshape(pool2,
                         [pool_shape[0], pool_shape[1] * pool_shape[2] * pool_shape[3]])
    # Fully connected layer. Note that the '+' operation automatically
    # broadcasts the biases.
    fc1 = tf.nn.relu(tf.matmul(reshape, fc1_weights) + fc1_biases)
    # Add a 50% dropout during training only. Dropout also scales
    # activations such that no rescaling is needed at evaluation time.
    if train:
      fc1 = tf.nn.dropout(fc1, 0.5, seed = SEED)
    return tf.matmul(fc1, fc2_weights) + fc2_biases
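
  # Shape walk-through for a 28x28x1 input: conv1 -> 28x28x32, pool1 ->
  # 14x14x32, conv2 -> 14x14x64, pool2 -> 7x7x64, flatten -> 3136,
  # fc1 -> 512, fc2 -> num_labels.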

  # Training computation: logits + cross-entropy loss.
  logits = lenet2(train_data_node, True)
  loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(
      logits, train_labels_node))

  # L2 regularization for the fully connected parameters.
  regularizers = (tf.nn.l2_loss(fc1_weights) + tf.nn.l2_loss(fc1_biases) +
                  tf.nn.l2_loss(fc2_weights) + tf.nn.l2_loss(fc2_biases))
  # Add the regularization term to the loss.
  loss += 5e-4 * regularizers

  # Optimizer: set up a variable that's incremented once per batch and
  # controls the learning rate decay.
  batch = tf.Variable(0)
  # Decay once per epoch, using an exponential schedule starting at 0.01.
  learning_rate = tf.train.exponential_decay(
      0.01,                # Base learning rate.
      batch * BATCH_SIZE,  # Current index into the dataset.
      train_size,          # Decay step.
      0.95,                # Decay rate.
      staircase=True)
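  # With staircase=True this evaluates to
  #   lr = 0.01 * 0.95 ** floor(step * BATCH_SIZE / train_size),
  # i.e. the learning rate drops by 5% once per epoch.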
  # Use simple momentum for the optimization.
  optimizer = tf.train.MomentumOptimizer(learning_rate,
                                         0.9).minimize(loss,
                                                       global_step = batch)

  # Predictions for the current training minibatch.
  train_prediction = tf.nn.softmax(logits)

  # Predictions for the test and validation, which we'll compute less often.
  eval_prediction = tf.nn.softmax(lenet2(eval_data))

  # Small utility function to evaluate a dataset by feeding batches of data to
  # {eval_data} and pulling the results from {eval_predictions}.
  # Saves memory and enables this to run on smaller GPUs.
  def eval_in_batches(data, sess):
    """Get all predictions for a dataset by running it in small batches."""
    size = data.shape[0]
    if size < EVAL_BATCH_SIZE:
      raise ValueError("eval batch size %d is larger than the dataset size %d"
                       % (EVAL_BATCH_SIZE, size))
    predictions = np.ndarray(shape = (size, num_labels), dtype = np.float32)
    for begin in xrange(0, size, EVAL_BATCH_SIZE):
      end = begin + EVAL_BATCH_SIZE
      if end <= size:
        predictions[begin:end, :] = sess.run(
            eval_prediction,
            feed_dict={eval_data: data[begin:end, ...]})
      else:
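        # The final slice is shorter than EVAL_BATCH_SIZE, so rerun the last
        # full window of the data and keep only the rows not already filled.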
        batch_predictions = sess.run(
            eval_prediction,
            feed_dict={eval_data: data[-EVAL_BATCH_SIZE:, ...]})
        predictions[begin:, :] = batch_predictions[begin - size:, :]
    return predictions

  # Create a local session to run the training.
  start_time = time.time()
  model_dir = os.path.join(module_dir, os.path.pardir, os.path.pardir, 'models') 
  with tf.Session() as sess:
    # Run all the initializers to prepare the trainable parameters.
    tf.initialize_all_variables().run()
    # Restore the base model weights (saved to models/base by Example #6).
    saver = tf.train.Saver([conv1_weights, conv1_biases, conv2_weights, conv2_biases, fc1_weights, fc1_biases])
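    # fc2_weights/fc2_biases are deliberately absent from the restore list,
    # so the output layer is re-initialized and trained from scratch for the
    # new label set.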
    ckpt = tf.train.get_checkpoint_state(os.path.join(model_dir, 'base'))
    if ckpt and ckpt.model_checkpoint_path:
      logger.info("Continue training from the model {}".format(ckpt.model_checkpoint_path))
      saver.restore(sess, ckpt.model_checkpoint_path)

    logger.info('Initialized!')
    # Loop through training steps.
    for step in xrange(int(num_epochs * train_size) // BATCH_SIZE):
      # Compute the offset of the current minibatch in the data.
      # Note that we could use better randomization across epochs.
      offset = (step * BATCH_SIZE) % (train_size - BATCH_SIZE)
      batch_data = train_data[offset:(offset + BATCH_SIZE), ...]
      batch_labels = train_labels[offset:(offset + BATCH_SIZE)]
      # This dictionary maps the batch data (as a numpy array) to the
      # node in the graph it should be fed to.
      feed_dict = {train_data_node: batch_data,
                   train_labels_node: batch_labels}
      # Run the graph and fetch some of the nodes.
      _, l, lr, predictions = sess.run(
          [optimizer, loss, learning_rate, train_prediction],
          feed_dict=feed_dict)
      if step % EVAL_FREQUENCY == 0:
        elapsed_time = time.time() - start_time
        start_time = time.time()
        logger.info('Step %d (epoch %.2f), %.1f ms' %
              (step, float(step) * BATCH_SIZE / train_size,
               1000 * elapsed_time / EVAL_FREQUENCY))
        logger.info('Minibatch loss: %.3f, learning rate: %.6f' % (l, lr))
        logger.info('Minibatch error: %.1f%%' % error_rate(predictions, batch_labels))
        logger.info('Validation error: %.1f%%' % error_rate(
            eval_in_batches(validation_data, sess), validation_labels))
        sys.stdout.flush()
    # Finally print the result!
    test_precision = precision(eval_in_batches(test_data, sess), test_labels)
    logger.info('Test precision: %.1f%%' % test_precision)

    # Model persistence
    saver = tf.train.Saver([conv1_weights, conv1_biases, conv2_weights, conv2_biases, fc1_weights, fc1_biases, fc2_weights, fc2_biases])
    model_path = os.path.join(model_dir, "finetuned", "lenet_finetuned.ckpt")
    save_path = saver.save(sess, model_path)
    logger.info("Model saved in file: %s" % save_path)
Example #6
def main(argv = None):

  # Load MNIST into numpy arrays.
  train_data, train_labels = load_minst_data(t = 'train')
  test_data, test_labels = load_minst_data(t = 'test')

  height = train_data.shape[1]
  width = train_data.shape[2]
  channel = (train_data.shape[3] if train_data.ndim > 3 else 1)

  label_max = np.amax(train_labels)
  label_min = np.amin(train_labels)
  num_labels = label_max - label_min + 1

  # Generate a validation set.
  validation_data = train_data[:VALIDATION_SIZE, ...]
  validation_labels = train_labels[:VALIDATION_SIZE]
  train_data = train_data[VALIDATION_SIZE:, ...]
  train_labels = train_labels[VALIDATION_SIZE:]
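  # NOTE: this takes the first VALIDATION_SIZE samples verbatim; the split is
  # only unbiased if the training data arrives pre-shuffled.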
  num_epochs = NUM_EPOCHS
  train_size = train_labels.shape[0]

  # This is where training samples and labels are fed to the graph.
  # These placeholder nodes will be fed a batch of training data at each
  # training step using the {feed_dict} argument to the Run() call below.
  train_data_node = tf.placeholder(
      tf.float32,
      shape = (BATCH_SIZE, height, width, channel))
  train_labels_node = tf.placeholder(tf.int64, shape = (BATCH_SIZE,))

  eval_data = tf.placeholder(
      tf.float32,
      shape = (EVAL_BATCH_SIZE, height, width, channel))

  # The variables below hold all the trainable weights. They are passed an
  # initial value which will be assigned when we call:
  # {tf.initialize_all_variables().run()}
  conv1_weights = tf.Variable(
      tf.truncated_normal([5, 5, channel, 32],  # 5x5 filter, depth 32.
                          stddev = 0.1,
                          seed = SEED),
      name = "conv1_weights")
  conv1_biases = tf.Variable(tf.zeros([32]), name = "conv1_biases")
  
  conv2_weights = tf.Variable(
      tf.truncated_normal([5, 5, 32, 64],
                          stddev = 0.1,
                          seed = SEED),
      name = "conv2_weights")
  conv2_biases = tf.Variable(tf.constant(0.1, shape = [64]), name = "conv2_biases")
  
  fc1_weights = tf.Variable(  # fully connected, depth 512.
      tf.truncated_normal(
          [height // 4 * width // 4 * 64, 512],
          stddev = 0.1,
          seed = SEED),
      name = "fc1_weights")
  fc1_biases = tf.Variable(tf.constant(0.1, shape = [512]), name = "fc1_biases")
  
  fc2_weights = tf.Variable(
      tf.truncated_normal([512, num_labels],
                          stddev = 0.1,
                          seed = SEED),
      name = "fc2_weights")
  fc2_biases = tf.Variable(tf.constant(0.1, shape = [num_labels]), name = "fc2_biases")

  # We will replicate the model structure for the training subgraph, as well
  # as the evaluation subgraphs, while sharing the trainable parameters.
  def lenet(data, train = False):
    """LeNet definition."""
    # 2D convolution, with 'SAME' padding (i.e. the output feature map has
    # the same size as the input). Note that {strides} is a 4D array whose
    # shape matches the data layout: [n, h, w, c].
    conv1 = tf.nn.conv2d(data,
                         conv1_weights,
                         strides = [1, 1, 1, 1],
                         padding = 'SAME')
    # Bias and rectified linear non-linearity.
    relu1 = tf.nn.relu(tf.nn.bias_add(conv1, conv1_biases))
    # Max pooling. The kernel size spec {ksize} also follows the layout of
    # the data. Here we have a pooling window of 2, and a stride of 2.
    pool1 = tf.nn.max_pool(relu1,
                           ksize = [1, 2, 2, 1],
                           strides = [1, 2, 2, 1],
                           padding = 'SAME')
    conv2 = tf.nn.conv2d(pool1,
                         conv2_weights,
                         strides = [1, 1, 1, 1],
                         padding = 'SAME')
    relu2 = tf.nn.relu(tf.nn.bias_add(conv2, conv2_biases))
    pool2 = tf.nn.max_pool(relu2,
                           ksize = [1, 2, 2, 1],
                           strides = [1, 2, 2, 1],
                           padding = 'SAME')
    # Reshape the feature map cuboid into a 2D matrix to feed it to the
    # fully connected layers.
    pool_shape = pool2.get_shape().as_list()
    reshape = tf.reshape(pool2,
                         [pool_shape[0], pool_shape[1] * pool_shape[2] * pool_shape[3]])
    # Fully connected layer. Note that the '+' operation automatically
    # broadcasts the biases.
    fc1 = tf.nn.relu(tf.matmul(reshape, fc1_weights) + fc1_biases)
    # Add a 50% dropout during training only. Dropout also scales
    # activations such that no rescaling is needed at evaluation time.
    if train:
      fc1 = tf.nn.dropout(fc1, 0.5, seed = SEED)
    return tf.matmul(fc1, fc2_weights) + fc2_biases

  # Training computation: logits + cross-entropy loss.
  logits = lenet(train_data_node, True)
  loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits, train_labels_node))

  # L2 regularization for the fully connected parameters.
  regularizers = (tf.nn.l2_loss(fc1_weights) + tf.nn.l2_loss(fc1_biases) +
                  tf.nn.l2_loss(fc2_weights) + tf.nn.l2_loss(fc2_biases))
  # Add the regularization term to the loss.
  loss += 5e-4 * regularizers

  # Optimizer: set up a variable that's incremented once per batch and
  # controls the learning rate decay.
  batch = tf.Variable(0)
  # Decay once per epoch, using an exponential schedule starting at 0.01.
  learning_rate = tf.train.exponential_decay(
      0.01,                # Base learning rate.
      batch * BATCH_SIZE,  # Current index into the dataset.
      train_size,          # Decay step.
      0.95,                # Decay rate.
      staircase = True)
  # Use simple momentum for the optimization.
  optimizer = tf.train.MomentumOptimizer(learning_rate,
                                         0.9).minimize(loss,
                                                       global_step = batch)

  # Predictions for the current training minibatch.
  train_prediction = tf.nn.softmax(logits)

  # Predictions for the test and validation, which we'll compute less often.
  eval_prediction = tf.nn.softmax(lenet(eval_data))

  # Small utility function to evaluate a dataset by feeding batches of data to
  # {eval_data} and pulling the results from {eval_predictions}.
  # Saves memory and enables this to run on smaller GPUs.
  def eval_in_batches(data, sess):
    """Get all predictions for a dataset by running it in small batches."""
    size = data.shape[0]
    if size < EVAL_BATCH_SIZE:
      raise ValueError("eval batch size %d is larger than the dataset size %d"
                       % (EVAL_BATCH_SIZE, size))
    predictions = np.ndarray(shape = (size, num_labels), dtype = np.float32)
    for begin in xrange(0, size, EVAL_BATCH_SIZE):
      end = begin + EVAL_BATCH_SIZE
      if end <= size:
        predictions[begin:end, :] = sess.run(
            eval_prediction,
            feed_dict={eval_data: data[begin:end, ...]})
      else:
        batch_predictions = sess.run(
            eval_prediction,
            feed_dict={eval_data: data[-EVAL_BATCH_SIZE:, ...]})
        predictions[begin:, :] = batch_predictions[begin - size:, :]
    return predictions

  # Create a local session to run the training.
  start_time = time.time()
  model_dir = os.path.join(module_dir, os.path.pardir, os.path.pardir, 'models') 
  with tf.Session() as sess:
    # Run all the initializers to prepare the trainable parameters.
    tf.initialize_all_variables().run()
    logger.info('Initialized!')
    # Loop through training steps.
    for step in xrange(int(num_epochs * train_size) // BATCH_SIZE):
      # Compute the offset of the current minibatch in the data.
      # Note that we could use better randomization across epochs.
      offset = (step * BATCH_SIZE) % (train_size - BATCH_SIZE)
      batch_data = train_data[offset:(offset + BATCH_SIZE), ...]
      batch_labels = train_labels[offset:(offset + BATCH_SIZE)]
      # This dictionary maps the batch data (as a numpy array) to the
      # node in the graph it should be fed to.
      feed_dict = {train_data_node: batch_data,
                   train_labels_node: batch_labels}
      # Run the graph and fetch some of the nodes.
      _, l, lr, predictions = sess.run(
          [optimizer, loss, learning_rate, train_prediction],
          feed_dict=feed_dict)
      if step % EVAL_FREQUENCY == 0:
        elapsed_time = time.time() - start_time
        start_time = time.time()
        logger.info('Step %d (epoch %.2f), %.1f ms' %
              (step, float(step) * BATCH_SIZE / train_size,
               1000 * elapsed_time / EVAL_FREQUENCY))
        logger.info('Minibatch loss: %.3f, learning rate: %.6f' % (l, lr))
        logger.info('Minibatch training error: %.1f%%' % error_rate(predictions, batch_labels))
        logger.info('Validation error: %.1f%%' % error_rate(eval_in_batches(validation_data, sess), validation_labels))
        sys.stdout.flush()

    # Finally print the result!
    test_precision = precision(eval_in_batches(test_data, sess), test_labels)
    logger.info('Test precision: %.1f%%' % test_precision)

    # Model persistence
    saver = tf.train.Saver([conv1_weights, conv1_biases, conv2_weights, conv2_biases, fc1_weights, fc1_biases])
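    # Only the conv and fc1 variables are saved; Example #5 above restores
    # exactly this subset from models/base before fine-tuning.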
    model_path = os.path.join(model_dir, "base", "lenet_base.ckpt")
    save_path = saver.save(sess, model_path)
    logger.info("Model saved in file: %s" % save_path)