Example #1
def main(argv=None):  # pylint: disable=unused-argument
    cifar10.maybe_download_and_extract()
    if tf.gfile.Exists(FLAGS.train_dir):
        tf.gfile.DeleteRecursively(FLAGS.train_dir)
    tf.gfile.MakeDirs(FLAGS.train_dir)
    train()
    cifar10_eval.evaluate()
Example #2
def train():
    """Train CIFAR-10 for a number of steps."""
    with tf.Graph().as_default():
        global_step = tf.train.get_or_create_global_step()

        # Get images and labels for CIFAR-10.
        # Force input pipeline to CPU:0 to avoid operations sometimes ending up on
        # GPU and resulting in a slow down.
        with tf.device('/cpu:0'):
            images, labels = cifar10.distorted_inputs()

        # Build a Graph that computes the logits predictions from the
        # inference model.
        logits = cifar10.inference(images)

        # Calculate loss.
        loss = cifar10.loss(logits, labels)

        # Build a Graph that trains the model with one batch of examples and
        # updates the model parameters.
        train_op = cifar10.train(loss, global_step)

        class _LoggerHook(tf.train.SessionRunHook):
            """Logs loss and runtime."""
            def begin(self):
                self._step = -1
                self._start_time = time.time()

            def before_run(self, run_context):
                self._step += 1
                return tf.train.SessionRunArgs(loss)  # Asks for loss value.

            def after_run(self, run_context, run_values):
                if self._step % FLAGS.log_frequency == 0:
                    current_time = time.time()
                    duration = current_time - self._start_time
                    self._start_time = current_time

                    loss_value = run_values.results
                    examples_per_sec = FLAGS.log_frequency * \
                        FLAGS.batch_size / duration
                    sec_per_batch = float(duration / FLAGS.log_frequency)

                    format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                                  'sec/batch)')
                    print(format_str % (datetime.now(), self._step, loss_value,
                                        examples_per_sec, sec_per_batch))
                if self._step % 1000 == 0:
                    cifar10_eval.evaluate()

        for step in range(0, FLAGS.max_steps + 1, FLAGS.log_frequency):
            print(str(step))
            with tf.train.MonitoredTrainingSession(
                    checkpoint_dir=FLAGS.train_dir,
                    hooks=[
                        tf.train.StopAtStepHook(last_step=step),
                        tf.train.NanTensorHook(loss),
                        _LoggerHook()
                    ],
                    config=tf.ConfigProto(
                        log_device_placement=FLAGS.log_device_placement)) as mon_sess:
                while not mon_sess.should_stop():
                    mon_sess.run(train_op)
            # evaluate test data
            cifar10_eval.evaluate()
            # evaluate train data
            evaluate()
def main(argv=None):  # pylint: disable=unused-argument
    total_time = time.time()
    data_load_time = time.time()
    cifar10.maybe_download_and_extract()
    if tf.gfile.Exists(FLAGS.train_dir):
        tf.gfile.DeleteRecursively(FLAGS.train_dir)
    tf.gfile.MakeDirs(FLAGS.train_dir)
    data_load_time = time.time() - data_load_time
    train_time = time.time()
    train()
    train_time = time.time() - train_time
    test_time = time.time()
    cifar10_eval.evaluate()
    test_time = time.time() - test_time
    total_time = time.time() - total_time

    print_time('Data load', data_load_time)
    print_time('Train', train_time)
    print_time('Test', test_time)
    print_time('Total', total_time)
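The timing example above calls a print_time helper that is not included in the snippet; a minimal sketch of what it might look like (the name and arguments come from the call sites, the body is an assumption):

def print_time(label, seconds):
    # Hypothetical helper; only the call sites appear in the example above.
    print('%s time: %.3f s' % (label, seconds))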
def main():
    #	get all images in format of [r*1024,g*1024,b*1024]
    #	store them in dataset[0~9999]
    save = open("check.csv", "w")
    a = csv.writer(save)
    f = open("test_batch.bin", "rb")
    data = f.read()
    for i in range(len(data) / 3073):
        label.append(struct.unpack("B", data[i * 3073])[0])
        datatmp = data[i * 3073 + 1:i * 3073 + 3073]

        dataset.append(list(struct.unpack("B" * (len(datatmp)), datatmp[:])))

    f.close()
    #	add error
    for err_rate in range(1, 300, 10):
        print(err_rate)

        #for k in range(len(dataset)):
        for k in range(100):
            #show(dataset[k])
            err.append(addError(dataset[k], err_rate, Gradient(dataset[k])))
            #show(err[k])


        # show image

        # save back
        out = open("/tmp/cifar10_data/cifar-10-batches-bin/random_batch_1.bin",
                   "wb")
        #for j in range(len(dataset)):
        for j in range(100):
            back = [label[j]] + list(err[j])
            back = np.array(back, np.uint8)
            out.write(back)
        out.close()
        a.writerows([[str(err_rate), str(cifar10_eval.evaluate())]])
        del err[:]
    save.close()
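The snippet above relies on globals (label, dataset, err) and helpers (addError, Gradient, show) defined elsewhere in its module. As one illustration, a minimal sketch of the commented-out show(...) call, assuming the [r*1024, g*1024, b*1024] channel-planar layout described at the top of main() (the matplotlib display is an assumption):

import numpy as np
import matplotlib.pyplot as plt

def show(image_row):
    # Hypothetical helper: interpret 3072 bytes as three 32x32 channel planes
    # (R, G, B), stack them into an HxWxC image, and display it.
    img = np.array(image_row, np.uint8).reshape(3, 32, 32).transpose(1, 2, 0)
    plt.imshow(img)
    plt.show()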
Example #7
def train():
    """Train CIFAR-10 for a number of steps."""
    with tf.Graph().as_default():
        global_step = tf.Variable(0, trainable=False)

        # Get images and labels for CIFAR-10.
        images, labels = cifar10.distorted_inputs()

        # Build a Graph that computes the logits predictions from the
        # inference model.
        logits = cifar10.inference(images)

        # Calculate loss.
        loss = cifar10.loss(logits, labels)

        # Build a Graph that trains the model with one batch of examples and
        # updates the model parameters.
        train_op = cifar10.train(loss, global_step)

        # Create a saver.
        saver = tf.train.Saver(tf.all_variables())

        # Build the summary operation based on the TF collection of Summaries.
        summary_op = tf.contrib.deprecated.merge_all_summaries()

        # Build an initialization operation to run below.
        init = tf.initialize_all_variables()

        # Start running operations on the Graph.
        sess = tf.Session(config=tf.ConfigProto(
            log_device_placement=FLAGS.log_device_placement))
        sess.run(init)

        # Start the queue runners.
        tf.train.start_queue_runners(sess=sess)

        # summary_writer = tf.train.SummaryWriter(FLAGS.train_dir,
        #                                          graph_def=sess.graph_def)

        for step in xrange(FLAGS.max_steps):
            start_time = time.time()
            _, loss_value = sess.run([train_op, loss])
            duration = time.time() - start_time

            assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

            if step % 10 == 0:
                num_examples_per_step = FLAGS.batch_size
                examples_per_sec = num_examples_per_step / duration
                sec_per_batch = float(duration)

                format_str = (
                    '%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                    'sec/batch)')
                print(format_str % (datetime.now(), step, loss_value,
                                    examples_per_sec, sec_per_batch))
                printfile(loss_value)

            # Save the model checkpoint periodically.
            if step % 1000 == 0 or (step + 1) == FLAGS.max_steps:
                checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
                saver.save(sess, checkpoint_path, global_step=step)

            if step % 1000 == 0:
                cifar10_eval.evaluate()
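printfile(...) is not defined in this snippet either; a minimal sketch under the assumption that it simply appends the loss value to a log file:

def printfile(loss_value, path='loss_log.txt'):
    # Hypothetical helper: append the current loss value to a text file.
    with open(path, 'a') as f:
        f.write('%f\n' % loss_value)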
def train():
  """Train CIFAR-10 for a number of steps."""
  with tf.Graph().as_default(), tf.device('/cpu:0'):
    # Create a variable to count the number of train() calls. This equals the
    # number of batches processed * FLAGS.num_gpus.
    global_step = tf.get_variable(
        'global_step', [],
        initializer=tf.constant_initializer(0), trainable=False)

    # Calculate the learning rate schedule.
    num_batches_per_epoch = (cifar10.NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN /
                             (FLAGS.batch_size * FLAGS.num_gpus))
    decay_steps = int(num_batches_per_epoch * FLAGS.lr_decay_epochs)

    # Decay the learning rate exponentially based on the number of steps.
    lr = tf.train.exponential_decay(FLAGS.learning_rate,
                                    global_step,
                                    decay_steps,
                                    cifar10.LEARNING_RATE_DECAY_FACTOR,
                                    staircase=True)

    # Create an optimizer that performs gradient descent.
    opt = tf.train.AdamOptimizer(lr)

    # Get images and labels for CIFAR-10.
    images, labels = cifar10.distorted_inputs()
    batch_queue = tf.contrib.slim.prefetch_queue.prefetch_queue(
          [images, labels], capacity=2 * FLAGS.num_gpus)
    # Calculate the gradients for each model tower.
    tower_grads = []
    with tf.variable_scope(tf.get_variable_scope()):
      for i in xrange(FLAGS.num_gpus):
        with tf.device('/gpu:%d' % i):
          with tf.name_scope('%s_%d' % (cifar10.TOWER_NAME, i)) as scope:
            # Dequeues one batch for the GPU
            image_batch, label_batch = batch_queue.dequeue()
            # Calculate the loss for one tower of the CIFAR model. This function
            # constructs the entire CIFAR model but shares the variables across
            # all towers.
            loss = tower_loss(scope, image_batch, label_batch)

            # Reuse variables for the next tower.
            tf.get_variable_scope().reuse_variables()

            # Retain the summaries from the final tower.
            summaries = tf.get_collection(tf.GraphKeys.SUMMARIES, scope)

            #Added for BN - 25.7.17 Oran
            update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
            with tf.control_dependencies(update_ops):
              grads = opt.compute_gradients(loss)
              tower_grads.append(grads)

    # We must calculate the mean of each gradient. Note that this is the
    # synchronization point across all towers.
    grads = average_gradients(tower_grads)

    # Add a summary to track the learning rate.
    summaries.append(tf.summary.scalar('learning_rate', lr))

    # Add histograms for gradients.
    for grad, var in grads:
      if grad is not None:
        summaries.append(tf.summary.histogram(var.op.name + '/gradients', grad))

    # Apply the gradients to adjust the shared variables.
    apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)

    # Add histograms for trainable variables.
    for var in tf.trainable_variables():
      summaries.append(tf.summary.histogram(var.op.name, var))

    # Track the moving averages of all trainable variables.
    variable_averages = tf.train.ExponentialMovingAverage(
        cifar10.MOVING_AVERAGE_DECAY, global_step)
    variables_averages_op = variable_averages.apply(tf.trainable_variables())

    # Group all updates to into a single train op.
    train_op = tf.group(apply_gradient_op, variables_averages_op)
#    train_op = tf.group(variables_averages_op)

    # Create a saver.
    saver = tf.train.Saver(tf.global_variables())

    # Build the summary operation from the last tower summaries.
    summary_op = tf.summary.merge(summaries)

    # Build an initialization operation to run below.
    init = tf.global_variables_initializer()

    # Start running operations on the Graph. allow_soft_placement must be set to
    # True to build towers on GPU, as some of the ops do not have GPU
    # implementations.
    sess = tf.Session(config=tf.ConfigProto(
        allow_soft_placement=True,
        log_device_placement=FLAGS.log_device_placement))
    sess.run(init)

    # Start the queue runners.
    tf.train.start_queue_runners(sess=sess)

    summary_writer = tf.summary.FileWriter(FLAGS.train_dir, sess.graph)
    
    step_reached = -1
    max_steps = int(FLAGS.epochs * cifar10.NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN / (FLAGS.batch_size * FLAGS.num_gpus))
    best_precision = 0
    f = open(FLAGS.train_dir + '/summary.txt', 'a')
    # Load model if not a new run
    if FLAGS.new_run==False:
        ckpt = tf.train.get_checkpoint_state(FLAGS.train_dir)
        if ckpt and ckpt.model_checkpoint_path:
          saver.restore(sess, ckpt.model_checkpoint_path)
          step_reached = ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1]

    for step in xrange(int(step_reached)+1,max_steps):
      start_time = time.time()
      _, loss_value = sess.run([train_op, loss])
      duration = time.time() - start_time

      assert not np.isnan(loss_value), 'Model diverged with loss = NaN'
      
      if step % 1000 == 0:
        num_examples_per_step = FLAGS.batch_size * FLAGS.num_gpus
        examples_per_sec = num_examples_per_step / duration
        sec_per_batch = duration / FLAGS.num_gpus

        format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                      'sec/batch)')
        print (format_str % (datetime.now(), step, loss_value,
                             examples_per_sec, sec_per_batch))

      if step % 1000 == 0:
        summary_str = sess.run(summary_op)
        summary_writer.add_summary(summary_str, step)

      # Save the model checkpoint periodically.
      if step % 1000 == 0 or (step + 1) == max_steps:
        checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
        saver.save(sess, checkpoint_path, global_step=step)
        
      if (step % 4000 == 0) and (step > 28000):
        precision = []
        for i in xrange(6):
          cifar10.draw_weights(sess)
          precision += [cifar10_eval.evaluate()]
          if precision[i]>best_precision:
              best_precision = precision[i]
              os.system('cp '+FLAGS.train_dir+'/weights/* '+FLAGS.train_dir+'/best_weights/')
        print('Average precision: '+str(round(np.mean(precision),3)))
        f.write('step: '+str(step)+', average precision = '+str(round(np.mean(precision),3))+'\n')
        
#      if step % 1 == 0: # Check CLT assumption
#        a1_, b1_, image_batch_, conv3_1_ = sess.run([a, b, image_batch, conv3_1])
#        # LAYER 1
#        x = conv3_1_[0,2:5,2:5,:]
#        activations = []
#        for i in range(700):
#            W = cifar10.draw_ternary_weight(a1_[:,:,:,10:12], b1_[:,:,:,10:12])
#            W = W[:,:,:,0]
#            activations += [np.sum(x*W)]
#        activations = np.squeeze(np.array(activations))
#        
#        mu_ = a1_[:,:,:,10] - b1_[:,:,:,10]
#        mu_bar = np.sum(x*mu_)
#        sigma = a1_[:,:,:,10] + b1_[:,:,:,10] - mu_*mu_ + 0.001
#        sigma_bar = np.sum(x*x*sigma)
#        samples = np.random.normal(mu_bar, np.sqrt(sigma_bar), 700)
#
#        max_=max(max(activations),max(samples))            
#        min_=min(min(activations),min(samples))  
#        plt.figure(1)
#        plt.subplot(211)
#        p_activations,_,_ = plt.hist(activations, bins=70, range=[min_,max_])
#        plt.title('Step ' +str(step)+';Layer 1 Activations: mu='+str(round(np.mean(activations),3))+'; Sigma='+str(round(np.var(activations),3)))
#        plt.subplot(212)
#        g_activations,_,_ = plt.hist(samples, bins=70, range=[min_,max_])
#        m = 0.5*(p_activations + g_activations)
#        js = 0.5*scipy.stats.entropy(p_activations, m) + 0.5*scipy.stats.entropy(g_activations, m)
##        jsd1+=[js]
##        steps1+=[step]
#        plt.title('Gaussian; mu='+str(round(mu_bar,3))+'; Sigma='+str(round(sigma_bar,3))+'; JS Divergence='+str(round(js,3)))
#        plt.show()
#        plt.savefig(FLAGS.train_dir + '/activations/' + str(step)+'.png')
#        plt.close(1)
    f.write('best precision = '+str(round(best_precision, 3))+'\n')
    f.close()
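This multi-tower example (and the one that follows) calls average_gradients(tower_grads) without defining it; the sketch below follows the standard TensorFlow CIFAR-10 multi-GPU tutorial helper and is shown only for reference:

def average_gradients(tower_grads):
    """Average gradients across all towers (the synchronization point)."""
    average_grads = []
    for grad_and_vars in zip(*tower_grads):
        # grad_and_vars is ((grad_gpu0, var), ..., (grad_gpuN, var)) for one
        # variable; assumes every tower produced a gradient for it.
        grads = [tf.expand_dims(g, 0) for g, _ in grad_and_vars]
        grad = tf.reduce_mean(tf.concat(grads, 0), 0)
        # Variables are shared across towers, so the first tower's pointer suffices.
        average_grads.append((grad, grad_and_vars[0][1]))
    return average_grads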
def train():
    """Train CIFAR-10 for a number of steps."""
    with tf.Graph().as_default(), tf.device('/cpu:0'):
        # Create a variable to count the number of train() calls. This equals the
        # number of batches processed * FLAGS.num_gpus.
        global_step = tf.get_variable('global_step', [],
                                      initializer=tf.constant_initializer(0),
                                      trainable=False)

        # Calculate the learning rate schedule.
        num_batches_per_epoch = (cifar10.NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN /
                                 FLAGS.batch_size)
        decay_steps = int(num_batches_per_epoch * cifar10.NUM_EPOCHS_PER_DECAY)

        # Decay the learning rate exponentially based on the number of steps.
        lr = tf.train.exponential_decay(cifar10.INITIAL_LEARNING_RATE,
                                        global_step,
                                        decay_steps,
                                        cifar10.LEARNING_RATE_DECAY_FACTOR,
                                        staircase=True)

        # Create an optimizer that performs gradient descent.
        opt = tf.train.MomentumOptimizer(lr, momentum=0.9, use_nesterov=False)
        # opt = tf.train.AdamOptimizer()

        # Calculate the gradients for each model tower.
        tower_grads = []
        with tf.variable_scope(tf.get_variable_scope()):
            for i in xrange(FLAGS.num_gpus):
                with tf.device('/gpu:%d' % i):
                    with tf.name_scope('%s_%d' %
                                       (cifar10.TOWER_NAME, i)) as scope:
                        # Calculate the loss for one tower of the CIFAR model. This function
                        # constructs the entire CIFAR model but shares the variables across
                        # all towers.
                        loss = tower_loss(scope)

                        # Reuse variables for the next tower.
                        tf.get_variable_scope().reuse_variables()

                        # Retain the summaries from the final tower.
                        summaries = tf.get_collection(tf.GraphKeys.SUMMARIES,
                                                      scope)

                        # Calculate the gradients for the batch of data on this CIFAR tower.
                        grads = opt.compute_gradients(loss)

                        # Keep track of the gradients across all towers.
                        tower_grads.append(grads)

        # We must calculate the mean of each gradient. Note that this is the
        # synchronization point across all towers.
        grads = average_gradients(tower_grads)

        # Add a summary to track the learning rate.
        summaries.append(tf.summary.scalar('learning_rate', lr))

        # Add histograms for gradients.
        for grad, var in grads:
            if grad is not None:
                summaries.append(
                    tf.summary.histogram(var.op.name + '/gradients', grad))

        # Apply the gradients to adjust the shared variables.
        # pdb.set_trace()
        apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)

        # Add histograms for trainable variables.
        for var in tf.trainable_variables():
            summaries.append(tf.summary.histogram(var.op.name, var))

        # Track the moving averages of all trainable variables.
        variable_averages = tf.train.ExponentialMovingAverage(
            cifar10.MOVING_AVERAGE_DECAY, global_step)
        variables_averages_op = variable_averages.apply(
            tf.trainable_variables())

        # Group all updates to into a single train op.
        train_op = tf.group(apply_gradient_op, variables_averages_op)

        # Create a saver.
        saver = tf.train.Saver(tf.global_variables())

        # Build the summary operation from the last tower summaries.
        summary_op = tf.summary.merge(summaries)

        # Prune the network iteratively.
        #pruning_ops = pruning(k = 1.25, scale=0.02)

        # Build an initialization operation to run below.
        init = tf.global_variables_initializer()

        # Start running operations on the Graph. allow_soft_placement must be set to
        # True to build towers on GPU, as some of the ops do not have GPU
        # implementations.
        sess = tf.Session(config=tf.ConfigProto(
            allow_soft_placement=True,
            log_device_placement=FLAGS.log_device_placement))
        sess.run(init)

        # Start the queue runners.
        tf.train.start_queue_runners(sess=sess)

        if FLAGS.reload:
            ckpt = tf.train.get_checkpoint_state(FLAGS.train_dir)
            if ckpt and ckpt.model_checkpoint_path:
                # Restores from checkpoint
                saver.restore(sess, ckpt.model_checkpoint_path)
                # Assuming model_checkpoint_path looks something like:
                #   /my-favorite-path/cifar10_train/model.ckpt-0,
                # extract global_step from it.
                step = int(
                    ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1])
            else:
                step = 1
        else:
            tf.gfile.DeleteRecursively(FLAGS.train_dir)
            tf.gfile.MakeDirs(FLAGS.train_dir)
            step = 1

        summary_writer = tf.summary.FileWriter(FLAGS.train_dir, sess.graph)

        while step <= FLAGS.max_steps:
            start_time = time.time()
            _, loss_value = sess.run([train_op, loss])
            duration = time.time() - start_time

            assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

            if step % 10 == 0:
                num_examples_per_step = FLAGS.batch_size * FLAGS.num_gpus
                examples_per_sec = num_examples_per_step / duration
                sec_per_batch = duration / FLAGS.num_gpus

                format_str = (
                    '%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                    'sec/batch)')
                print(format_str % (datetime.now(), step, loss_value,
                                    examples_per_sec, sec_per_batch))

            if step % 100 == 0:
                summary_str = sess.run(summary_op)
                summary_writer.add_summary(summary_str, step)

            # Pruning
            if step % 3000 == 0:
                # Initial precision and compression rate before pruning
                checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
                saver.save(sess, checkpoint_path, global_step=step)
                # Evaluate precision and compression rate
                initPrecision, initCompress = cifar10_eval.evaluate()
                print("initPrecision: %.4f , initCompress: %.4f" %
                      (initPrecision, initCompress))

                scaleFounder = float(0.01)
                while True:
                    # Prune the network iteratively.
                    pruning_ops = pruning(k=1.5, scale=scaleFounder)
                    sess.run(pruning_ops)
                    # Save the model for eval
                    # tip: no backup is needed because the pruning scale is monotonically increasing
                    checkpoint_path = os.path.join(FLAGS.train_dir,
                                                   'model.ckpt')
                    saver.save(sess, checkpoint_path, global_step=step)
                    # Evaluate precision and compression rate
                    precision, compress = cifar10_eval.evaluate()
                    print("precision: %.4f , compress: %.4f" %
                          (precision, compress))

                    scaleFounder = scaleFounder + 0.01

                    # Stop when the precision loss outweighs the compression gain, or when
                    # the scale reaches its cap (>= rather than == guards against the 0.01
                    # float accumulation never hitting exactly 1.0).
                    if ((FLAGS.precision_fact * (initPrecision - precision) -
                         FLAGS.compress_fact * (initCompress - compress) > 0) or
                            scaleFounder >= 1.0):
                        break

            # Save the model checkpoint periodically.
            if step % 500 == 0 or step == FLAGS.max_steps:
                checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
                saver.save(sess, checkpoint_path, global_step=step)
                #cifar10_eval.evaluate()
            step += 1
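tower_loss(...) is also assumed rather than shown; in the TensorFlow CIFAR-10 multi-GPU tutorial it builds one tower's model and sums that tower's losses, roughly as below (the earlier multi-GPU example passes image/label batches explicitly, so its signature differs):

def tower_loss(scope):
    # Sketch based on the TensorFlow CIFAR-10 multi-GPU tutorial, not on this page.
    images, labels = cifar10.distorted_inputs()
    logits = cifar10.inference(images)
    _ = cifar10.loss(logits, labels)  # adds per-tower losses to the 'losses' collection
    losses = tf.get_collection('losses', scope)
    total_loss = tf.add_n(losses, name='total_loss')
    return total_loss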