Example No. 1
def main(argv=None):
    with tf.Graph().as_default():

        model_fn = select_model(FLAGS.model_type)
        # Open the metadata file and figure out nlabels, and size of epoch
        input_file = os.path.join(FLAGS.train_dir, 'md.json')
        print(input_file)
        with open(input_file, 'r') as f:
            md = json.load(f)

        images, labels, _ = distorted_inputs(FLAGS.train_dir, FLAGS.batch_size,
                                             FLAGS.image_size,
                                             FLAGS.num_preprocess_threads)
        logits = model_fn(md['nlabels'], images, 1 - FLAGS.pdrop, True)
        total_loss = loss(logits, labels)
        ini_global_step = 0

        train_op = optimizer(FLAGS.optim, FLAGS.eta, total_loss,
                             FLAGS.steps_per_decay, FLAGS.eta_decay_rate)
        saver = tf.train.Saver(tf.global_variables())
        summary_op = tf.summary.merge_all()

        sess = tf.Session(config=tf.ConfigProto(
            log_device_placement=FLAGS.log_device_placement))

        tf.global_variables_initializer().run(session=sess)

        # This is total hackland, it only works to fine-tune iv3
        if FLAGS.pre_model:
            inception_variables = tf.get_collection(tf.GraphKeys.VARIABLES,
                                                    scope="InceptionV3")
            restorer = tf.train.Saver(inception_variables)
            restorer.restore(sess, FLAGS.pre_model)

        if FLAGS.pre_checkpoint_path:
            if tf.gfile.Exists(FLAGS.pre_checkpoint_path):
                print('Trying to restore checkpoint from %s' %
                      FLAGS.pre_checkpoint_path)
                restorer = tf.train.Saver()
                restorer.restore(
                    sess,
                    tf.train.latest_checkpoint(FLAGS.pre_checkpoint_path))
                print('%s: Pre-trained model restored from %s' %
                      (datetime.now(), FLAGS.pre_checkpoint_path))
                ini_global_step = get_restored_step(FLAGS.pre_checkpoint_path)
                print('Initial Global Step is {}'.format(ini_global_step))

        run_dir = '%s/run-%d' % (FLAGS.train_dir, os.getpid())

        checkpoint_path = '%s/%s' % (run_dir, FLAGS.checkpoint)
        if not tf.gfile.Exists(run_dir):
            print('Creating %s' % run_dir)
            tf.gfile.MakeDirs(run_dir)

        tf.train.write_graph(sess.graph_def, run_dir, 'model.pb', as_text=True)

        tf.train.start_queue_runners(sess=sess)

        summary_writer = tf.summary.FileWriter(run_dir, sess.graph)
        steps_per_train_epoch = int(md['train_counts'] / FLAGS.batch_size)
        num_steps = FLAGS.max_steps if FLAGS.epochs < 1 else FLAGS.epochs * steps_per_train_epoch
        print('Requested number of steps [%d]' % num_steps)

        for step in xrange(num_steps):
            step += ini_global_step
            start_time = time.time()
            _, loss_value = sess.run([train_op, total_loss])
            duration = time.time() - start_time

            assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

            if step % 10 == 0:
                num_examples_per_step = FLAGS.batch_size
                examples_per_sec = num_examples_per_step / duration
                sec_per_batch = float(duration)

                format_str = (
                    '%s: step %d, loss = %.3f (%.1f examples/sec; %.3f '
                    'sec/batch)')
                print(format_str % (datetime.now(), step, loss_value,
                                    examples_per_sec, sec_per_batch))

            # Summaries are only evaluated and written every 100 steps.
            if step % 100 == 0:
                summary_str = sess.run(summary_op)
                summary_writer.add_summary(summary_str, step)

            if step % 1000 == 0 or (step + 1) == num_steps:
                saver.save(sess, checkpoint_path, global_step=step)
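
Example No. 1 calls a get_restored_step() helper that is not shown here. A minimal sketch of what it could look like, assuming the checkpoint was written by tf.train.Saver with global_step so the filename ends in "-<step>" (the helper name and this parsing strategy are assumptions, not the original implementation):

import os
import tensorflow as tf

def get_restored_step(checkpoint_dir):
    # Hypothetical helper: recover the global step from the newest checkpoint
    # by parsing the "-<step>" suffix that Saver.save(..., global_step=step)
    # appends to the checkpoint filename.
    ckpt = tf.train.latest_checkpoint(checkpoint_dir)
    if ckpt is None:
        return 0
    return int(os.path.basename(ckpt).rsplit('-', 1)[-1])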
Example No. 2
def main(argv=None):
    with tf.Graph().as_default():

        global_step = tf.Variable(0, trainable=False)

        model_fn = select_model(FLAGS.model_type)
        # Open the metadata file and figure out nlabels, and size of epoch
        input_file = os.path.join(FLAGS.train_dir, 'md.json')
        print(input_file)
        with open(input_file, 'r') as f:
            md = json.load(f)

        images, labels, _ = distorted_inputs(FLAGS.train_dir, FLAGS.batch_size,
                                             FLAGS.image_size,
                                             FLAGS.num_preprocess_threads)
        if not FLAGS.dual:
            logits = model_fn(md['nlabels'], images, 1 - FLAGS.pdrop, True)
            total_loss, accuracy = loss(logits, labels, global_step)
        else:
            with tf.variable_scope("net1") as scope:
                logits1 = model_fn(md['nlabels'], images, 1 - FLAGS.pdrop,
                                   True)
            with tf.variable_scope("net2") as scope:
                logits2 = model_fn(md['nlabels'], images, 1 - FLAGS.pdrop,
                                   True)

            pred1 = tf.argmax(logits1, 1)
            pred2 = tf.argmax(logits2, 1)

            # Per-sample update mask: 1.0 where the two networks disagree (or
            # during the first init_iter warm-up steps), 0.0 otherwise.
            # stop_gradient keeps the mask out of backpropagation.
            update_step = tf.stop_gradient(
                tf.to_float(
                    tf.logical_or(tf.not_equal(pred1, pred2),
                                  global_step < FLAGS.init_iter)))

            with tf.variable_scope("net1") as scope:
                if FLAGS.min_batch_size == -1:
                    total_loss1, accuracy1 = loss(logits1, labels, global_step,
                                                  None, scope.name)
                else:
                    total_loss1, accuracy1 = loss(logits1, labels, global_step,
                                                  update_step, scope.name)
            with tf.variable_scope("net2") as scope:
                if FLAGS.min_batch_size == -1:
                    total_loss2, accuracy2 = loss(logits2, labels, global_step,
                                                  None, scope.name)
                else:
                    total_loss2, accuracy2 = loss(logits2, labels, global_step,
                                                  update_step, scope.name)

            disagree_rate = tf.reduce_mean(
                tf.to_float(tf.not_equal(pred1, pred2)))

        if not FLAGS.dual:
            train_op = optimizer(FLAGS.optim, FLAGS.eta, total_loss)
        else:
            with tf.variable_scope("net1") as scope:
                var_net1 = [
                    var for var in tf.all_variables()
                    if var.name.startswith("net1")
                ]
                train_op1 = optimizer(FLAGS.optim,
                                      FLAGS.eta,
                                      total_loss1,
                                      variables=var_net1,
                                      name=scope.name)
            with tf.variable_scope("net2") as scope:
                var_net2 = [
                    var for var in tf.all_variables()
                    if var.name.startswith("net2")
                ]
                train_op2 = optimizer(FLAGS.optim,
                                      FLAGS.eta,
                                      total_loss2,
                                      variables=var_net2,
                                      name=scope.name)
        saver = tf.train.Saver(tf.all_variables(), max_to_keep=151)
        summary_op = tf.merge_all_summaries()
        init = tf.initialize_all_variables()
        sess = tf.Session(config=tf.ConfigProto(
            log_device_placement=FLAGS.log_device_placement))

        sess.run(init)

        # This is total hackland, it only works to fine-tune iv3
        if FLAGS.pre_model:
            inception_variables = tf.get_collection(tf.GraphKeys.VARIABLES,
                                                    scope="InceptionV3")
            restorer = tf.train.Saver(inception_variables)
            restorer.restore(sess, FLAGS.pre_model)

        if FLAGS.pre_checkpoint_path:
            if tf.gfile.Exists(FLAGS.pre_checkpoint_path):
                print('Trying to restore checkpoint from %s' %
                      FLAGS.pre_checkpoint_path)
                restorer = tf.train.Saver()
                restorer.restore(
                    sess,
                    tf.train.latest_checkpoint(FLAGS.pre_checkpoint_path))
                print('%s: Pre-trained model restored from %s' %
                      (datetime.now(), FLAGS.pre_checkpoint_path))

        run_dir = '%s/run-%d' % (FLAGS.train_dir, os.getpid())

        checkpoint_path = '%s/%s' % (run_dir, FLAGS.checkpoint)
        if not tf.gfile.Exists(run_dir):
            print('Creating %s' % run_dir)
            tf.gfile.MakeDirs(run_dir)

        tf.train.write_graph(sess.graph_def, run_dir, 'model.pb', as_text=True)

        tf.train.start_queue_runners(sess=sess)

        summary_writer = tf.train.SummaryWriter(run_dir, sess.graph)
        steps_per_train_epoch = int(md['train_counts'] / FLAGS.batch_size)
        num_steps = FLAGS.max_steps if FLAGS.epochs < 1 else FLAGS.epochs * steps_per_train_epoch
        print('Requested number of steps [%d]' % num_steps)

        # Buffers that accumulate the samples on which the two networks
        # disagree; the dual-mode branch below trains once a full batch of
        # such samples has been collected.
        trainable_buffer_img = None
        trainable_buffer_lbl = None
        for step in range(num_steps):
            start_time = time.time()
            if FLAGS.Qloss:
                _, loss_value, acc_value, q_val = sess.run(
                    [train_op, total_loss, accuracy, Q_GLOBAL],
                    feed_dict={global_step: step})
                print(q_val)
            elif not FLAGS.dual:
                _, loss_value, acc_value = sess.run(
                    [train_op, total_loss, accuracy],
                    feed_dict={global_step: step})
            elif FLAGS.dual and (step < FLAGS.init_iter
                                 or FLAGS.min_batch_size != -1):
                _, _, loss_value, acc_value1, acc_value2, drate = sess.run(
                    [
                        train_op1, train_op2, total_loss1, accuracy1,
                        accuracy2, disagree_rate
                    ],
                    feed_dict={global_step: step})
            else:
                # Dual mode past the init_iter warm-up: run a forward pass
                # only, then buffer the samples on which the two networks
                # disagree and train once a full batch has accumulated.
                #loss_value, acc_value1, acc_value2, drate = (0,0,0,0)
                img, lbl, us, loss_value, acc_value1, acc_value2, drate = sess.run(
                    [
                        images, labels, update_step, total_loss1, accuracy1,
                        accuracy2, disagree_rate
                    ],
                    feed_dict={global_step: step})
                rel_img = img[us == 1]
                rel_lbl = lbl[us == 1]
                if trainable_buffer_img is None:
                    trainable_buffer_img = rel_img
                    trainable_buffer_lbl = rel_lbl
                else:
                    print(np.shape(trainable_buffer_lbl), np.shape(rel_lbl))
                    trainable_buffer_img = np.vstack(
                        (trainable_buffer_img, rel_img))
                    trainable_buffer_lbl = np.hstack(
                        (trainable_buffer_lbl, rel_lbl))

                if trainable_buffer_img.shape[0] >= FLAGS.batch_size:
                    batch_img = trainable_buffer_img[:FLAGS.batch_size]
                    batch_lbl = trainable_buffer_lbl[:FLAGS.batch_size]
                    _, _, loss_value, acc_value1, acc_value2, drate = sess.run(
                        [
                            train_op1, train_op2, total_loss1, accuracy1,
                            accuracy2, disagree_rate
                        ],
                        feed_dict={
                            global_step: step,
                            images: batch_img,
                            labels: batch_lbl
                        })
                    trainable_buffer_img = trainable_buffer_img[FLAGS.batch_size:]
                    trainable_buffer_lbl = trainable_buffer_lbl[FLAGS.batch_size:]
                #_, loss_value, acc_value2, drate = sess.run([train_op2, total_loss2, accuracy2, disagree_rate], feed_dict={global_step: step})
            duration = time.time() - start_time

            assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

            if step % 1 == 0:
                num_examples_per_step = FLAGS.batch_size
                examples_per_sec = num_examples_per_step / duration
                sec_per_batch = float(duration)

                if not FLAGS.dual:
                    format_str = (
                        '%s: step %d, loss = %.3f, acc = %.3f (%.1f examples/sec; %.3f '
                        'sec/batch)')
                    print(format_str %
                          (datetime.now(), step, loss_value, acc_value,
                           examples_per_sec, sec_per_batch))
                else:
                    format_str = (
                        '%s: step %d, loss = %.3f, acc1 = %.3f, acc2 = %.3f, disagree_rate = %.3f (%.1f examples/sec; %.3f '
                        'sec/batch)')
                    print(format_str %
                          (datetime.now(), step, loss_value, acc_value1,
                           acc_value2, drate, examples_per_sec, sec_per_batch))

            # Summaries are only evaluated and written every 100 steps.
            if step % 100 == 0:
                summary_str = sess.run(summary_op)
                summary_writer.add_summary(summary_str, step)

            if step % 200 == 0 or (step + 1) == num_steps:
                saver.save(sess, checkpoint_path, global_step=step)
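
In Example No. 2 the loss() helper (not shown) receives the per-sample update_step mask when min_batch_size is set. A minimal sketch of what a disagreement-masked loss could look like under that assumption, written against the same TF1-style API as the example; the name masked_loss and its exact signature are hypothetical, not the original code:

import tensorflow as tf

def masked_loss(logits, labels, update_mask=None):
    # Per-sample cross-entropy; when update_mask is given, only the samples
    # flagged by the mask (e.g. those the two networks disagree on) contribute.
    xent = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=logits, labels=labels)
    if update_mask is not None:
        xent = xent * update_mask
    total_loss = tf.reduce_mean(xent)
    accuracy = tf.reduce_mean(
        tf.to_float(tf.equal(tf.argmax(logits, 1), tf.cast(labels, tf.int64))))
    return total_loss, accuracy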
Example No. 3
def main(argv=None):
    with tf.Graph().as_default():

        model_fn = select_model(FLAGS.model_type)
        # Open the metadata file and figure out nlabels, and size of epoch
        input_file = os.path.join(FLAGS.train_dir, 'md.json')
        print(input_file)
        with open(input_file, 'r') as f:
            md = json.load(f)

        images, labels, _ = distorted_inputs(FLAGS.train_dir, FLAGS.batch_size, FLAGS.image_size, FLAGS.num_preprocess_threads)
        logits = model_fn(md['nlabels'], images, 1-FLAGS.pdrop, True)
        total_loss = loss(logits, labels)

        train_op = optimizer(FLAGS.optim, FLAGS.eta, total_loss)
        saver = tf.train.Saver(tf.global_variables())
        summary_op = tf.summary.merge_all()

        sess = tf.Session(config=tf.ConfigProto(
            log_device_placement=FLAGS.log_device_placement))

        tf.global_variables_initializer().run(session=sess)

        # This is total hackland, it only works to fine-tune iv3
        if FLAGS.pre_model:
            inception_variables = tf.get_collection(
                tf.GraphKeys.VARIABLES, scope="InceptionV3")
            restorer = tf.train.Saver(inception_variables)
            restorer.restore(sess, FLAGS.pre_model)

        if FLAGS.pre_checkpoint_path:
            if tf.gfile.Exists(FLAGS.pre_checkpoint_path):
                print('Trying to restore checkpoint from %s' % FLAGS.pre_checkpoint_path)
                restorer = tf.train.Saver()
                restorer.restore(
                    sess,
                    tf.train.latest_checkpoint(FLAGS.pre_checkpoint_path))
                print('%s: Pre-trained model restored from %s' %
                      (datetime.now(), FLAGS.pre_checkpoint_path))


        run_dir = '%s/run-%d' % (FLAGS.train_dir, os.getpid())

        checkpoint_path = '%s/%s' % (run_dir, FLAGS.checkpoint)
        if not tf.gfile.Exists(run_dir):
            print('Creating %s' % run_dir)
            tf.gfile.MakeDirs(run_dir)

        tf.train.write_graph(sess.graph_def, run_dir, 'model.pb', as_text=True)

        tf.train.start_queue_runners(sess=sess)


        summary_writer = tf.summary.FileWriter(run_dir, sess.graph)
        steps_per_train_epoch = int(md['train_counts'] / FLAGS.batch_size)
        num_steps = FLAGS.max_steps if FLAGS.epochs < 1 else FLAGS.epochs * steps_per_train_epoch
        print('Requested number of steps [%d]' % num_steps)

        
        for step in xrange(num_steps):
            start_time = time.time()
            _, loss_value = sess.run([train_op, total_loss])
            duration = time.time() - start_time

            assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

            if step % 10 == 0:
                num_examples_per_step = FLAGS.batch_size
                examples_per_sec = num_examples_per_step / duration
                sec_per_batch = float(duration)
                
                format_str = ('%s: step %d, loss = %.3f (%.1f examples/sec; %.3f ' 'sec/batch)')
                print(format_str % (datetime.now(), step, loss_value,
                                    examples_per_sec, sec_per_batch))

            # Summaries are only evaluated and written every 100 steps.
            if step % 100 == 0:
                summary_str = sess.run(summary_op)
                summary_writer.add_summary(summary_str, step)
                
            if step % 1000 == 0 or (step + 1) == num_steps:
                saver.save(sess, checkpoint_path, global_step=step)
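
All three examples assume module-level flag definitions and a tf.app.run() entry point that are not shown. A minimal sketch of that scaffolding for some of the flags used above; the flag names come from the code, but the default values and help strings here are placeholders, not the originals:

import tensorflow as tf

tf.app.flags.DEFINE_string('train_dir', '/tmp/train', 'Directory with md.json and training data')
tf.app.flags.DEFINE_string('model_type', 'default', 'Model name passed to select_model()')
tf.app.flags.DEFINE_integer('batch_size', 128, 'Batch size')
tf.app.flags.DEFINE_integer('image_size', 227, 'Input image size')
tf.app.flags.DEFINE_integer('num_preprocess_threads', 4, 'Preprocessing threads')
tf.app.flags.DEFINE_float('pdrop', 0.0, 'Dropout probability')
tf.app.flags.DEFINE_float('eta', 0.01, 'Learning rate')
tf.app.flags.DEFINE_string('optim', 'Momentum', 'Optimizer name')
tf.app.flags.DEFINE_integer('max_steps', 40000, 'Maximum number of training steps')
tf.app.flags.DEFINE_integer('epochs', -1, 'Number of epochs (< 1 means use max_steps)')
tf.app.flags.DEFINE_string('checkpoint', 'checkpoint', 'Checkpoint file basename')
tf.app.flags.DEFINE_string('pre_checkpoint_path', '', 'Checkpoint directory to warm-start from')
tf.app.flags.DEFINE_string('pre_model', '', 'Inception v3 model file to fine-tune from')
tf.app.flags.DEFINE_boolean('log_device_placement', False, 'Log device placement')
FLAGS = tf.app.flags.FLAGS

if __name__ == '__main__':
    tf.app.run()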