Example 1
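# Note: these snippets assume TensorFlow 1.x and module-level context that the
# page does not show, roughly:
#   import os, time, operator
#   from datetime import datetime
#   import numpy as np
#   import tensorflow as tf
#   import models   # provides feature_dim, label_dim, model_fcn5/model_fcn8, loss
#   plus FLAGS, EPOCH_SIZE, mnist, get_real_batch_data and average_gradients,
#   defined elsewhere in the original training scripts.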
def train(model='fcn5'):
    config = tf.ConfigProto(allow_soft_placement=False, log_device_placement=FLAGS.log_device_placement)
    device_id = FLAGS.device_id
    device_str = ''
    if int(device_id) >= 0:
        device_str = '/gpu:%d'%int(device_id)
    else:
        device_str = '/cpu:0'
    with tf.Graph().as_default(), tf.device(device_str), tf.Session(config=config) as sess:
        feature_dim = models.feature_dim
        label_dim = models.label_dim
        images = tf.placeholder(tf.float32, [None, feature_dim])
        labels = tf.placeholder(tf.float32, [None, label_dim])

        logits = None
        if model == 'fcn5':
            logits = models.model_fcn5(images)
        else:
            logits = models.model_fcn8(images)
        loss = models.loss(logits, labels)

        predictionCorrectness = tf.equal(tf.argmax(logits, 1), tf.argmax(labels, 1))
        accuracy = tf.reduce_mean(tf.cast(predictionCorrectness, "float"))

        lr = 0.05
        #optimizer = tf.train.GradientDescentOptimizer(lr).minimize(loss)
        optimizer = tf.train.MomentumOptimizer(lr, 0.9).minimize(loss)

        init = tf.global_variables_initializer()
        sess.run(init)
        tf.train.start_queue_runners(sess=sess)
        num_batches_per_epoch = int((EPOCH_SIZE + FLAGS.batch_size - 1) / FLAGS.batch_size)
        iterations = FLAGS.epochs * num_batches_per_epoch
        average_batch_time = 0.0
        epochs_info = []
        average_loss = 0.0
        for step in range(iterations):
            start_time = time.time()
            imgs, labs = get_real_batch_data(FLAGS.batch_size, 10)
            _, loss_value = sess.run([optimizer, loss], feed_dict={images:imgs,labels:labs})
            average_loss += loss_value
            duration = time.time() - start_time
            average_batch_time += float(duration)
            assert not np.isnan(loss_value), 'Model diverged with loss = NaN'
            if step % FLAGS.log_step == 0:
                examples_per_sec = FLAGS.batch_size / duration
                sec_per_batch = float(duration)
                format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f sec/batch)')
                print (format_str % (datetime.now(), step, loss_value, examples_per_sec, sec_per_batch))
            if step > 0 and step % (FLAGS.eval_step * num_batches_per_epoch) == 0:
                average_loss /= FLAGS.eval_step * num_batches_per_epoch
                accuracy_value = accuracy.eval(feed_dict={images: mnist.test.images, labels: mnist.test.labels})
                print("test accuracy %g" % accuracy_value)
                epochs_info.append('%d:%g:%s' % (step // (FLAGS.eval_step * num_batches_per_epoch), accuracy_value, average_loss))
                average_loss = 0.0
        average_batch_time /= iterations
        print('average_batch_time: ', average_batch_time)
        print ('epoch_info: %s' % ','.join(epochs_info))
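All four examples call a get_real_batch_data helper that is not reproduced on this page. Below is a minimal sketch of what it plausibly does, assuming `mnist` is the object returned by input_data.read_data_sets(..., one_hot=True) from tensorflow.examples.tutorials.mnist; the second argument looks like the label dimension and is unused in this sketch.

# Hypothetical reconstruction of get_real_batch_data, not the original helper.
def get_real_batch_data(batch_size, label_dim):
    # next_batch returns (images, one-hot labels) as numpy arrays;
    # label_dim is kept only to match the call sites above.
    batch_xs, batch_ys = mnist.train.next_batch(batch_size)
    return batch_xs, batch_ys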
Example 2
def train(model='fcn5'):
    if FLAGS.num_gpus < 2:
        print("The number of GPU should be 2 or more, if you use one GPU, please use fcn5_mnist.py to train")
        return

    config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=FLAGS.log_device_placement)

    with tf.Graph().as_default(), tf.device("/cpu:0"):
        global_step = tf.get_variable('global_step', [], initializer=tf.constant_initializer(0), trainable=False)

        device_ids = FLAGS.device_ids.split(',')
        if len(device_ids) != FLAGS.num_gpus:
            print('device_ids must list exactly num_gpus GPU ids')
            return

        lr = 0.05
        #optimizer = tf.train.GradientDescentOptimizer(lr)
        optimizer = tf.train.MomentumOptimizer(lr, 0.9)

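        # Build one model replica ("tower") per GPU; each tower gets its own
        # input placeholders and contributes one set of gradients.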
        tower_grads = []
        feed_vars = []
        average_loss_tensor = []
        for i in range(FLAGS.num_gpus):
            with tf.device('/gpu:%s'%device_ids[i]):
                with tf.name_scope('%s_%s' % ('TOWER', device_ids[i])) as scope:
                    feature_dim = models.feature_dim
                    label_dim = models.label_dim
                    images = tf.placeholder(tf.float32, [None, feature_dim], name='images')
                    labels = tf.placeholder(tf.float32, [None, label_dim], name='labels')
                    feed_vars.append((images, labels))

                    logits = models.model_fcn5(images)
                    loss = models.loss(logits, labels)
                    tf.add_to_collection('losses', loss)

                    #tf.add_n(tf.get_collection('losses'), name='total_loss')
                    losses = tf.get_collection('losses', scope)
                    total_loss = tf.add_n(losses, name='total_loss')
                    average_loss_tensor.append(total_loss)

                    tf.get_variable_scope().reuse_variables()
                    grads = optimizer.compute_gradients(total_loss)
                    tower_grads.append(grads)

        print('tower_grads: ', tower_grads, '\nlen: ', len(tower_grads))
        print ('total_loss: ', total_loss)

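        # Synchronous data parallelism: average the per-tower gradients and
        # apply a single combined update to the shared variables.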
        grads = average_gradients(tower_grads)
        apply_gradient_op = optimizer.apply_gradients(grads, global_step=global_step)

        train_op = apply_gradient_op
        average_op = tf.reduce_mean(average_loss_tensor, 0)
        saver = tf.train.Saver(tf.global_variables())

        init = tf.global_variables_initializer()
        sess = tf.Session(config=config)
        sess.run(init)

        tf.train.start_queue_runners(sess=sess)

        real_batch_size = FLAGS.batch_size * FLAGS.num_gpus
        num_batches_per_epoch = int((EPOCH_SIZE + real_batch_size - 1)/ real_batch_size)
        iterations = FLAGS.epochs * num_batches_per_epoch 
        average_batch_time = 0.0
        epochs_info = []

        step = 0
        average_loss = 0.0
        for step in range(iterations):
            start_time = time.time()
            imgs, labs = get_real_batch_data(real_batch_size, 10)
            feed_dict = {}
            for i in range(FLAGS.num_gpus):
                feed_dict[feed_vars[i][0]] = imgs[i*FLAGS.batch_size:(i+1)*FLAGS.batch_size]
                feed_dict[feed_vars[i][1]] = labs[i*FLAGS.batch_size:(i+1)*FLAGS.batch_size] 
            # _, loss_value = sess.run([train_op, total_loss], feed_dict=feed_dict)
            _, loss_value = sess.run([train_op, average_op], feed_dict=feed_dict)
            duration = time.time() - start_time
            average_batch_time += float(duration)
            average_loss += loss_value

            assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

            if step % FLAGS.log_step == 0:
                examples_per_sec = (FLAGS.batch_size * FLAGS.num_gpus) / duration
                sec_per_batch = float(duration)
                format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f sec/batch)')
                print (format_str % (datetime.now(), step, loss_value, examples_per_sec, sec_per_batch))

            if step > 0 and step % (FLAGS.eval_step * num_batches_per_epoch) == 0:
                average_loss /= num_batches_per_epoch * FLAGS.eval_step
                print('epoch: %d, loss: %.2f' % (step // (FLAGS.eval_step*num_batches_per_epoch), average_loss))
                epochs_info.append('%d:-:%s' % (step // (FLAGS.eval_step*num_batches_per_epoch), average_loss))
                average_loss = 0.0

        checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
        saver.save(sess, checkpoint_path, global_step=step)

        average_batch_time /= iterations
        print('average_batch_time: ', average_batch_time)
        print ('epoch_info: %s' % ','.join(epochs_info))
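Examples 2 and 3 also depend on an average_gradients helper that is not shown. Below is a sketch of the conventional implementation from TensorFlow's multi-GPU CIFAR-10 tutorial, which these scripts most likely mirror; treat the details as an assumption.

# Assumed implementation of average_gradients, following the standard
# cifar10_multi_gpu_train pattern.
def average_gradients(tower_grads):
    # tower_grads: one list of (gradient, variable) pairs per GPU,
    # as returned by optimizer.compute_gradients().
    average_grads = []
    for grad_and_vars in zip(*tower_grads):
        # grad_and_vars pairs up one variable's gradients across all towers.
        grads = [tf.expand_dims(g, 0) for g, _ in grad_and_vars]
        grad = tf.reduce_mean(tf.concat(grads, 0), 0)
        # Variables are shared across towers, so the first tower's pointer works.
        _, var = grad_and_vars[0]
        average_grads.append((grad, var))
    return average_grads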
Example 3
def train(model='fcn5'):

    config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=FLAGS.log_device_placement)

    if FLAGS.xla:
        # Turns on XLA.  XLA is not included in the standard build.  For single GPU this shows ~5% improvement
        config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1

    with tf.Graph().as_default(), tf.device("/" + FLAGS.local_ps_device + ":0"):
        global_step = tf.get_variable('global_step', [], initializer=tf.constant_initializer(0), trainable=False)

        device_ids = FLAGS.device_ids
        if not device_ids:
            device_ids = [str(i) for i in range(FLAGS.num_gpus)]
        else:
            device_ids = device_ids.split(',')

        lr = 0.05
        #optimizer = tf.train.GradientDescentOptimizer(lr)
        optimizer = tf.train.MomentumOptimizer(lr, 0.9)

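        # Greedy variable placement: each variable is pinned to the
        # parameter-server device that currently holds the fewest parameters;
        # every other op stays on the worker device passed in as `device`.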
        def assign_to_device(device, ps_device=FLAGS.local_ps_device):
            worker_device = device
            ps_sizes = [0]
            if FLAGS.local_ps_device.lower() == 'gpu':
                ps_sizes = [0] * FLAGS.num_gpus
            def _assign(op):
                if op.device:
                    return op.device
                if op.type not in ['Variable', 'VariableV2']:
                    return worker_device
                device_index, _ = min(enumerate(ps_sizes),
                                      key=operator.itemgetter(1))
                device_name = '/' + FLAGS.local_ps_device + ':' + str(device_index)
                var_size = op.outputs[0].get_shape().num_elements()
                ps_sizes[device_index] += var_size
                return device_name
            return _assign

        images = None
        labels = None
        if FLAGS.use_dataset:
            with tf.device('/CPU:0'):
                d_features = mnist.train.images
                d_labels = mnist.train.labels
                dataset = tf.contrib.data.Dataset.from_tensor_slices((d_features, d_labels))
                dataset = dataset.shuffle(buffer_size=60000)
                dataset = dataset.repeat()
                dataset = dataset.batch(FLAGS.batch_size)
                # Trick to get Datasets to buffer the next epoch. This is needed
                # because the data loading occurs outside Datasets in Python; with
                # preprocessing inside Datasets this odd-looking line would not be needed.
                dataset = dataset.map(lambda x, y: (x, y),
                                      num_threads=FLAGS.num_gpus,
                                      output_buffer_size=FLAGS.num_gpus)
                iterator = dataset.make_initializable_iterator()
                images,labels = iterator.get_next()

        tower_grads = []
        feed_vars = []
        average_loss_tensor = []
        reuse_variables = False
        accuracy = None
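        # All towers share one set of weights: the first tower creates the
        # variables and later towers reuse them (reuse_variables flips to True
        # after the first iteration).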
        for i in range(FLAGS.num_gpus):
            with tf.device(assign_to_device('/gpu:%s'%device_ids[i])):
                with tf.name_scope('%s_%s' % ('TOWER', device_ids[i])) as scope:
                    if not FLAGS.use_dataset:
                        feature_dim = models.feature_dim
                        label_dim = models.label_dim
                        images = tf.placeholder(tf.float32, [None, feature_dim], name='images')
                        labels = tf.placeholder(tf.int64, [None, label_dim], name='labels')
                        feed_vars.append((images, labels))
                    with tf.variable_scope(tf.get_variable_scope(), reuse=reuse_variables): 
                        logits = models.model_fcn5(images)
                    if i == 0:
                        # Prediction only on GPU:0
                        predictionCorrectness = tf.equal(tf.argmax(logits, 1), tf.argmax(labels, 1))
                        accuracy = tf.reduce_mean(tf.cast(predictionCorrectness, "float"))
                    loss = models.loss(logits, labels)
                    reuse_variables = True
                    average_loss_tensor.append(loss)
                    grads = optimizer.compute_gradients(loss)
                    tower_grads.append(grads)

        grads = average_gradients(tower_grads)
        apply_gradient_op = optimizer.apply_gradients(grads, global_step=global_step)

        train_op = apply_gradient_op
        average_op = tf.reduce_mean(average_loss_tensor)
        saver = tf.train.Saver(tf.global_variables())

        init = tf.global_variables_initializer()
        sess = tf.Session(config=config)
        sess.run(init)
        if FLAGS.use_dataset:
            sess.run(iterator.initializer)
            
        real_batch_size = FLAGS.batch_size * FLAGS.num_gpus
        num_batches_per_epoch = int((EPOCH_SIZE + real_batch_size - 1)/ real_batch_size)
        iterations = FLAGS.epochs * num_batches_per_epoch 
        average_batch_time = 0.0
        epochs_info = []

        step = 0
        average_loss = 0.0
        for step in range(iterations):
            start_time = time.time()
            feed_dict = {}
            if not FLAGS.use_dataset:
                imgs, labs = get_real_batch_data(real_batch_size, 10)
                for i in range(FLAGS.num_gpus):
                    feed_dict[feed_vars[i][0]] = imgs[i*FLAGS.batch_size:(i+1)*FLAGS.batch_size]
                    feed_dict[feed_vars[i][1]] = labs[i*FLAGS.batch_size:(i+1)*FLAGS.batch_size] 
            _, loss_value = sess.run([train_op, average_op], feed_dict=feed_dict)
            duration = time.time() - start_time
            average_batch_time += float(duration)
            average_loss += loss_value

            assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

            if step % FLAGS.log_step == 0:
                examples_per_sec = (FLAGS.batch_size * FLAGS.num_gpus) / duration
                sec_per_batch = float(duration)
                format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f sec/batch)')
                print (format_str % (datetime.now(), step, loss_value, examples_per_sec, sec_per_batch))

            if step > 0 and step % (FLAGS.eval_step * num_batches_per_epoch) == 0:
                average_loss /= num_batches_per_epoch * FLAGS.eval_step
                print('epoch: %d, loss: %.2f' % (step // (FLAGS.eval_step*num_batches_per_epoch), average_loss))
                epochs_info.append('%d:-:%s' % (step // (FLAGS.eval_step*num_batches_per_epoch), average_loss))
                average_loss = 0.0
                feed_dict = {images: mnist.test.images, labels: mnist.test.labels}
                if not FLAGS.use_dataset:
                    feed_dict = {}
                    feed_dict[feed_vars[0][0]] = mnist.test.images
                    feed_dict[feed_vars[0][1]] = mnist.test.labels
                accuracy_value = accuracy.eval(session=sess, feed_dict=feed_dict)
                print("test accuracy %g"%accuracy_value)

        checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
        saver.save(sess, checkpoint_path, global_step=step)

        average_batch_time /= iterations
        print('average_batch_time: ', average_batch_time)
        print ('epoch_info: %s' % ','.join(epochs_info))
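The FLAGS object that all examples read is defined elsewhere. Below is a hypothetical definition via tf.app.flags covering every flag used above; the names come from the code, the defaults are guesses.

# Hypothetical flag definitions; only the flag names are taken from the examples.
import tensorflow as tf

flags = tf.app.flags
flags.DEFINE_integer('batch_size', 1024, 'mini-batch size per device')
flags.DEFINE_integer('epochs', 40, 'number of passes over EPOCH_SIZE samples')
flags.DEFINE_integer('log_step', 10, 'print a progress line every N steps')
flags.DEFINE_integer('eval_step', 1, 'evaluate every N epochs')
flags.DEFINE_integer('device_id', 0, 'GPU id, or -1 for CPU (Examples 1 and 4)')
flags.DEFINE_string('device_ids', '', 'comma-separated GPU ids (Examples 2 and 3)')
flags.DEFINE_integer('num_gpus', 2, 'number of GPUs to use')
flags.DEFINE_boolean('log_device_placement', False, 'log op placement')
flags.DEFINE_boolean('xla', False, 'enable XLA JIT compilation')
flags.DEFINE_boolean('use_dataset', False, 'use the tf.contrib.data input pipeline')
flags.DEFINE_string('local_ps_device', 'CPU', 'device type that holds the variables')
flags.DEFINE_string('train_dir', '/tmp/fcn5_train', 'checkpoint directory')
FLAGS = flags.FLAGS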
Example 4
def train(model='fcn5'):
    config = tf.ConfigProto(log_device_placement=FLAGS.log_device_placement)
    device_id = FLAGS.device_id
    device_str = ''
    if int(device_id) >= 0:
        device_str = '/gpu:%d' % int(device_id)
        config.allow_soft_placement = True
        config.intra_op_parallelism_threads = 1
        config.inter_op_parallelism_threads = 0
    else:
        device_str = '/cpu:0'
        num_threads = os.getenv('OMP_NUM_THREADS', 1)
        config = tf.ConfigProto(allow_soft_placement=True,
                                intra_op_parallelism_threads=int(num_threads))

    if FLAGS.xla:
        # Turns on XLA.  XLA is not included in the standard build.  For single GPU this shows ~5% improvement
        config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1

    with tf.Graph().as_default(), tf.device(device_str), tf.Session(
            config=config) as sess:
        feature_dim = models.feature_dim
        label_dim = models.label_dim
        images = None
        labels = None
        iterator = None
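        # Two input paths: an in-graph tf.contrib.data pipeline when
        # FLAGS.use_dataset is set, plain feed_dict placeholders otherwise.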
        if FLAGS.use_dataset:
            with tf.device('/CPU:0'):
                d_features = mnist.train.images
                d_labels = mnist.train.labels
                dataset = tf.contrib.data.Dataset.from_tensor_slices(
                    (d_features, d_labels))
                dataset = dataset.repeat()
                dataset = dataset.shuffle(buffer_size=60000)
                dataset = dataset.batch(FLAGS.batch_size)
                # Trick to get Datasets to buffer the next epoch. This is needed
                # because the data loading occurs outside Datasets in Python; with
                # preprocessing inside Datasets this odd-looking line would not be needed.
                dataset = dataset.map(lambda x, y: (x, y),
                                      num_threads=1,
                                      output_buffer_size=1)
                iterator = dataset.make_initializable_iterator()
                images, labels = iterator.get_next()

        else:
            images = tf.placeholder(tf.float32, [None, feature_dim],
                                    name="images_placeholder")
            labels = tf.placeholder(tf.int64, [None, label_dim],
                                    name="labels_placeholder")

        logits = None
        loss = None
        if model == 'fcn5':
            logits = models.model_fcn5(images)
        else:
            logits = models.model_fcn8(images)
        loss = models.loss(logits, labels)

        predictionCorrectness = tf.equal(tf.argmax(logits, 1),
                                         tf.argmax(labels, 1))
        accuracy = tf.reduce_mean(tf.cast(predictionCorrectness, "float"))

        lr = 0.05
        optimizer = tf.train.MomentumOptimizer(lr, 0.9).minimize(loss)

        init = tf.global_variables_initializer()

        sess.run(init)
        if FLAGS.use_dataset:
            sess.run(iterator.initializer)
        num_batches_per_epoch = int(
            (EPOCH_SIZE + FLAGS.batch_size - 1) / FLAGS.batch_size)
        iterations = FLAGS.epochs * num_batches_per_epoch
        average_batch_time = 0.0
        epochs_info = []
        average_loss = 0.0
        for step in range(iterations):
            start_time = time.time()
            imgs = None
            labs = None
            if FLAGS.use_dataset:
                _, loss_value = sess.run([optimizer, loss])
            else:
                imgs, labs = get_real_batch_data(FLAGS.batch_size, 10)
                _, loss_value = sess.run([optimizer, loss],
                                         feed_dict={
                                             images: imgs,
                                             labels: labs
                                         })
            duration = time.time() - start_time
            average_loss += loss_value
            average_batch_time += float(duration)
            assert not np.isnan(loss_value), 'Model diverged with loss = NaN'
            if step % FLAGS.log_step == 0:
                examples_per_sec = FLAGS.batch_size / duration
                sec_per_batch = float(duration)
                format_str = (
                    '%s: step %d, loss = %.2f (%.1f examples/sec; %.3f sec/batch)'
                )
                print(format_str % (datetime.now(), step, loss_value,
                                    examples_per_sec, sec_per_batch))
            if step > 0 and step % (FLAGS.eval_step *
                                    num_batches_per_epoch) == 0:
                average_loss /= FLAGS.eval_step * num_batches_per_epoch
                accuracy_value = accuracy.eval(feed_dict={
                    images: mnist.test.images,
                    labels: mnist.test.labels
                })
                print("test accuracy %g" % accuracy_value)
                epochs_info.append('%d:%g:%s' %
                                   (step //
                                    (FLAGS.eval_step * num_batches_per_epoch),
                                    accuracy_value, average_loss))
                average_loss = 0.0
        average_batch_time /= iterations
        print('average_batch_time: ', average_batch_time)
        print('epoch_info: %s' % ','.join(epochs_info))
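Finally, the models module itself is not shown. Below is a hedged sketch of what it plausibly provides for the fcn5 path: a five-layer fully connected MNIST classifier. The hidden-layer widths are invented for illustration; only the module's names and the 784/10 dimensions follow from the examples.

# Hypothetical sketch of the models module (fcn5 path only).
import tensorflow as tf

feature_dim = 784  # flattened 28x28 MNIST images
label_dim = 10     # one-hot digit labels

def _fc_layer(x, n_out, name):
    # One fully connected layer with its own variable scope.
    n_in = x.get_shape()[-1].value
    with tf.variable_scope(name):
        w = tf.get_variable('weights', [n_in, n_out],
                            initializer=tf.truncated_normal_initializer(stddev=0.01))
        b = tf.get_variable('biases', [n_out],
                            initializer=tf.zeros_initializer())
        return tf.matmul(x, w) + b

def model_fcn5(images):
    # Three hidden ReLU layers plus the input and output layers make five.
    h = images
    for i, width in enumerate([2048, 4096, 1024]):
        h = tf.nn.relu(_fc_layer(h, width, 'fc%d' % (i + 1)))
    return _fc_layer(h, label_dim, 'logits')

def loss(logits, labels):
    # Labels are one-hot; cast in case the caller feeds integer tensors.
    return tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(
            logits=logits, labels=tf.cast(labels, tf.float32)))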