Beispiel #1
0
def evaluate():
    """Evaluate model on Dataset for a number of steps.

    Builds the inference graph for ``FLAGS.subset`` of the dataset, creates a
    saver that restores the exponential-moving-average versions of the
    variables, and then calls ``_eval_once`` in a loop.  The loop ends after
    the first pass when ``FLAGS.run_once`` is set; otherwise it sleeps
    ``FLAGS.eval_interval_secs`` seconds between passes and never returns.

    Side effects:
        Overwrites ``FLAGS.num_examples`` with the dataset's examples per
        epoch divided by ``FLAGS.subsample_factor``, and writes the graph
        definition to ``FLAGS.eval_dir`` through the summary writer.

    Raises:
        AssertionError: if ``dataset.data_files()`` is empty/falsy.
    """
    print("in model evaluation")
    dataset = dataset_module.MyDataset(subset=FLAGS.subset)
    assert dataset.data_files(), (
        'dataset.data_files() returned nothing for subset %s' % FLAGS.subset)
    FLAGS.num_examples = (dataset.num_examples_per_epoch() /
                          FLAGS.subsample_factor)
    with tf.Graph().as_default():
        # Get images and labels from the dataset.
        tensors_in, tensors_out = batching.inputs(dataset)

        # Number of classes in the Dataset label set plus 1.
        # Label 0 is reserved for an (unused) background class.
        # NOTE(review): the "+ 1" is unconditional here -- confirm it matches
        # the number of classes the restored checkpoint was trained with.
        num_classes = dataset.num_classes() + 1

        # Build a Graph that computes the logits predictions from the
        # inference model.
        logits_all = model.inference(tensors_in,
                                     num_classes,
                                     for_training=False)
        # The return value of model.loss is discarded; the loss tensors are
        # fetched from slim's loss collection instead.
        model.loss(logits_all, tensors_out, batch_size=FLAGS.batch_size)
        loss_op = slim.losses.get_losses()

        # Restore the moving average version of the learned variables for eval.
        variable_averages = tf.train.ExponentialMovingAverage(
            model.MOVING_AVERAGE_DECAY)
        variables_to_restore = variable_averages.variables_to_restore()
        saver = tf.train.Saver(variables_to_restore)

        # Build the summary operation based on the TF collection of Summaries.
        summary_op = tf.merge_all_summaries()

        graph_def = tf.get_default_graph().as_graph_def(add_shapes=True)
        summary_writer = tf.train.SummaryWriter(FLAGS.eval_dir,
                                                graph_def=graph_def)

        while True:
            _eval_once(saver, summary_writer, logits_all, tensors_out, loss_op,
                       summary_op, tensors_in)
            if FLAGS.run_once:
                break
            time.sleep(FLAGS.eval_interval_secs)
Beispiel #2
0
def do_training(args):
    """Run the complete train/test loop configured by ``args``.

    Builds the data loaders and a ``DataParallel`` CUDA model, reports the
    parameter count through ``track.metric``, then for every epoch in
    ``range(args.epochs)``: updates ``args.lr`` via ``adjust_learning_rate``,
    trains, tests, logs the metrics, and saves the model to
    ``model<epoch>.ckpt`` inside ``track.trial_dir()``.  Whenever the test
    accuracy is strictly greater than the best seen so far, the model is also
    saved as ``best.ckpt``.

    Args:
        args: namespace providing ``dataset``, ``dataroot``, ``batch_size``,
            ``eval_batch_size``, ``arch``, ``lr``, ``epochs``, ``schedule``
            and ``gamma``.  ``args.lr`` is mutated in place.
    """
    loaders = build_dataset(
        args.dataset,
        dataroot=args.dataroot,
        batch_size=args.batch_size,
        eval_batch_size=args.eval_batch_size,
        num_workers=2)
    trainloader, testloader = loaders

    net = build_model(args.arch, num_classes=num_classes(args.dataset))
    net = torch.nn.DataParallel(net).cuda()

    # Total number of scalar entries over all model parameters.
    param_total = 0
    for tensor in net.parameters():
        param_total += tensor.numel()
    track.metric(iteration=0, num_params=param_total)

    sgd = build_optimizer('SGD', params=net.parameters(), lr=args.lr)
    loss_fn = torch.nn.CrossEntropyLoss()

    epoch_report = (
        'Finished epoch %d... | train loss %.3f | train acc %.3f | test loss %.3f | test acc %.3f'
    )
    top_acc = 0.0
    for epoch in range(args.epochs):
        track.debug("Starting epoch %d" % epoch)
        args.lr = adjust_learning_rate(epoch, sgd, args.lr, args.schedule,
                                       args.gamma)
        train_loss, train_acc = train(trainloader, net, loss_fn, sgd, epoch)
        test_loss, test_acc = test(testloader, net, loss_fn, epoch)
        track.debug(epoch_report %
                    (epoch, train_loss, train_acc, test_loss, test_acc))

        # Per-epoch checkpoint of the whole model object.
        epoch_ckpt = os.path.join(track.trial_dir(),
                                  "model{}.ckpt".format(epoch))
        torch.save(net, epoch_ckpt)

        # Nothing more to do unless this epoch beat the best test accuracy.
        if not test_acc > top_acc:
            continue
        top_acc = test_acc
        best_ckpt = os.path.join(track.trial_dir(), "best.ckpt")
        track.debug("New best score! Saving model")
        torch.save(net, best_ckpt)
0
def train():
    dataset = dataset_module.MyDataset(subset=FLAGS.subset)
    #assert dataset.data_files()
    """Train on dataset for a number of steps."""
    # use gpu:0 instead of cpu0, to avoid RNN GPU variable uninitialized problem
    with tf.Graph().as_default(), tf.device('/gpu:0'):
        # Create a variable to count the number of train() calls. This equals the
        # number of batches processed * FLAGS.num_gpus.
        global_step = tf.get_variable('global_step', [],
                                      initializer=tf.constant_initializer(0),
                                      trainable=False)

        # Calculate the learning rate schedule.
        num_batches_per_epoch = (dataset.num_examples_per_epoch() /
                                 FLAGS.batch_size)
        decay_steps = int(num_batches_per_epoch * FLAGS.num_epochs_per_decay)

        lr = tf.train.exponential_decay(FLAGS.initial_learning_rate,
                                        global_step -
                                        FLAGS.training_step_offset,
                                        decay_steps,
                                        FLAGS.learning_rate_decay_factor,
                                        staircase=True)

        # Create an optimizer that performs gradient descent.
        if FLAGS.optimizer == "rmsprop":
            opt = tf.train.RMSPropOptimizer(lr,
                                            decay=RMSPROP_DECAY,
                                            momentum=FLAGS.momentum,
                                            epsilon=RMSPROP_EPSILON)
        elif FLAGS.optimizer == "sgd":
            opt = tf.train.MomentumOptimizer(lr,
                                             FLAGS.momentum,
                                             use_nesterov=False)
        elif FLAGS.optimizer == "adadelta":
            opt = tf.train.AdadeltaOptimizer()
        elif FLAGS.optimizer == "adam":
            opt = tf.train.AdamOptimizer()
        else:
            print("optimizer invalid: %s" % FLAGS.optimizer)
            return

        # Get images and labels for ImageNet and split the batch across GPUs.
        assert FLAGS.batch_size % FLAGS.num_gpus == 0, (
            'Batch size must be divisible by number of GPUs')
        split_batch_size = int(FLAGS.batch_size / FLAGS.num_gpus)

        # Override the number of preprocessing threads to account for the increased
        # number of GPU towers.
        #num_preprocess_threads = FLAGS.num_preprocess_threads * FLAGS.num_gpus
        # choose not to overide, to have a finer control of how many threads to use
        num_preprocess_threads = FLAGS.num_preprocess_threads

        net_inputs, net_outputs = batching.distorted_inputs(
            dataset, num_preprocess_threads=num_preprocess_threads)

        input_summaries = copy.copy(tf.get_collection(tf.GraphKeys.SUMMARIES))

        init_op = tf.initialize_all_variables()
        #初始化所有变量
        # Number of classes in the Dataset label set plus 1.
        # Label 0 is reserved for an (unused) background class.
        if FLAGS.background_class:
            num_classes = dataset.num_classes() + 1
        else:
            num_classes = dataset.num_classes()

        # Split the batch of images and labels for towers.
        # TODO: this might become invalid if we are doing detection
        input_splits = _tensor_list_splits(net_inputs, FLAGS.num_gpus)
        output_splits = _tensor_list_splits(net_outputs, FLAGS.num_gpus)

        # Calculate the gradients for each model tower.
        tower_grads = []
        for i in xrange(FLAGS.num_gpus):
            with tf.device('/gpu:%s' % i):
                with tf.name_scope('%s_%d' % (model.TOWER_NAME, i)) as scope:
                    if True:
                        # I don't see any improvements by pinning all variables on CPU, so I disabled this
                        # Force all Variables to reside on the CPU.
                        #with slim.arg_scope([slim.variable], device='/cpu:0'):

                        # do not use this line, as it will assign all operations to cpu
                        #with tf.device('/cpu:0'):

                        # Calculate the loss for one tower of the CNN model. This
                        # function constructs the entire CNN model but shares the
                        # variables across all towers.
                        loss = _tower_loss(input_splits[i], output_splits[i],
                                           num_classes, scope)

                        if i == 0:
                            # set different learning rates for different variables
                            if hasattr(model, 'learning_rate_multipliers'):
                                # this function returns a dictionary of [varname]=multiplier
                                # learning rate multiplier that equals to one is set by default
                                multiplier = model.learning_rate_multipliers()

                                # computing the vars that needs gradient
                                grad_var_list = []
                                for t in tf.trainable_variables():
                                    v = t.op.name
                                    if (v in multiplier) and (abs(
                                            multiplier[v]) < 1e-6):
                                        pass
                                    else:
                                        grad_var_list.append(t)
                                print("-" * 40 +
                                      "\n gradient will be computed for vars:")
                                for x in grad_var_list:
                                    print(x.op.name)
                            else:
                                multiplier = None
                                grad_var_list = None

                    # Reuse variables for the next tower.
                    tf.get_variable_scope().reuse_variables()

                    # Retain the summaries from the final tower.
                    summaries = tf.get_collection(tf.GraphKeys.SUMMARIES,
                                                  scope)

                    # Retain the Batch Normalization updates operations only from the
                    # final tower. Ideally, we should grab the updates from all towers
                    # but these stats accumulate extremely fast so we can ignore the
                    # other stats from the other towers without significant detriment.
                    batchnorm_updates = tf.get_collection(
                        ops.GraphKeys.UPDATE_OPS, scope)
                    #batchnorm_updates = tf.get_collection(slim.ops.UPDATE_OPS_COLLECTION,
                    #                                      scope)

                    # Calculate the gradients for the batch of data on this CNN
                    # tower.
                    grads = opt.compute_gradients(loss, var_list=grad_var_list)

                    # Keep track of the gradients across all towers.
                    tower_grads.append(grads)

        # We must calculate the mean of each gradient. Note that this is the
        # synchronization point across all towers.
        if FLAGS.EWC == "stat":
            grads, grads2 = _average_gradients(tower_grads, True)
            # merge grads2 into a dict of variable
            out = {}
            vard = {}
            for g2, v in grads2:
                out[v.op.name] = g2
                vard[v.op.name] = v
            grads2 = out
        else:
            grads = _average_gradients(tower_grads)

        # Add a summaries for the input processing and global_step.
        summaries.extend(input_summaries)

        # Add a summary to track the learning rate.
        summaries.append(tf.scalar_summary('learning_rate', lr))

        # Add histograms for gradients.
        for grad, var in grads:
            if grad is not None:
                summaries.append(
                    tf.histogram_summary(var.op.name + '/gradients', grad))

        if multiplier:
            print("-" * 40 + "\nusing learning rate multipliers")
            grads_out = []
            for g, v in grads:
                v_name = v.op.name
                if v_name in multiplier:
                    g_out = tf.mul(multiplier[v_name], g)
                    print(v_name, " * ", multiplier[v_name])
                else:
                    g_out = g
                    print(v_name, " * 1.00")
                grads_out.append((g_out, v))
            grads = grads_out

        # gradient clipping
        if FLAGS.clip_gradient_threshold > 0:
            print("-" * 40 + "\n Gradient Clipping On")
            t_list = [x[0] for x in grads]
            t_list, gnorm = tf.clip_by_global_norm(
                t_list,
                FLAGS.clip_gradient_threshold,
                name='gradient_clipping')
            grads = [(t_list[i], grads[i][1]) for i in range(len(t_list))]

        # Apply the gradients to adjust the shared variables.
        apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)

        # Add histograms for trainable variables.
        for var in tf.trainable_variables():
            summaries.append(tf.histogram_summary(var.op.name, var))

        # Track the moving averages of all trainable variables.
        # Note that we maintain a "double-average" of the BatchNormalization
        # global statistics. This is more complicated then need be but we employ
        # this for backward-compatibility with our previous models.
        variable_averages = tf.train.ExponentialMovingAverage(
            model.MOVING_AVERAGE_DECAY, global_step)

        # Another possiblility is to use tf.slim.get_variables().
        variables_to_average = (tf.trainable_variables() +
                                tf.moving_average_variables())
        variables_averages_op = variable_averages.apply(variables_to_average)

        # Group all updates to into a single train op.
        batchnorm_updates_op = tf.group(*batchnorm_updates)
        train_op = tf.group(apply_gradient_op, variables_averages_op,
                            batchnorm_updates_op)

        # Create a saver.
        saver = tf.train.Saver(tf.all_variables())

        # Build the summary operation from the last tower summaries.
        summary_op = tf.merge_summary(summaries)

        # Build an initialization operation to run below.
        init = tf.initialize_all_variables()

        # Start running operations on the Graph. allow_soft_placement must be set to
        # True to build towers on GPU, as some of the ops do not have GPU
        # implementations.
        config = tf.ConfigProto(
            allow_soft_placement=True,
            log_device_placement=FLAGS.log_device_placement,
            intra_op_parallelism_threads=1)
        config.gpu_options.allow_growth = True

        sess = tf.Session(config=config)
        sess.run(init)

        #训练开始

        # TODO: not supported to load from different number of towers now
        if FLAGS.pretrained_model_checkpoint_path:
            assert tf.gfile.Exists(FLAGS.pretrained_model_checkpoint_path)
            #variables_to_restore = tf.get_collection(slim.variables.VARIABLES_TO_RESTORE)
            variables_to_restore = slim.get_variables_to_restore()

            # only restore those that are in the checkpoint
            existing_vars = util.tensors_in_checkpoint_file(
                FLAGS.pretrained_model_checkpoint_path)
            restore_new = []
            ignore_vars = []
            for x in variables_to_restore:
                if x.op.name in existing_vars:
                    restore_new.append(x)
                else:
                    ignore_vars.append(x.op.name)
            if len(ignore_vars) > 0:
                print(
                    "-" * 40 +
                    "\nWarning: Some variables does not exists in the checkpoint, ignore them: "
                )
                for x in ignore_vars:
                    print(x)
            variables_to_restore = restore_new

            restorer = tf.train.Saver(variables_to_restore)
            restorer.restore(sess, FLAGS.pretrained_model_checkpoint_path)
            print('%s: Pre-trained model restored from %s' %
                  (datetime.now(), FLAGS.pretrained_model_checkpoint_path))

        # Start the queue runners.
        tf.train.start_queue_runners(sess=sess)

        summary_writer = tf.train.SummaryWriter(
            FLAGS.train_dir,
            graph_def=sess.graph.as_graph_def(add_shapes=True))

        start_time = time.time()
        duration_compute = 0
        grads2_accu = None
        grads2_count = 0

        step_start = int(sess.run(global_step))
        try:
            for step in xrange(step_start, FLAGS.max_steps):
                # call a function in the model definition to do some extra work
                if hasattr(model, 'update_each_step'):
                    model.update_each_step(sess, step)

                if FLAGS.EWC == "stat":
                    # then run a stat mode
                    grads2_v = sess.run(grads2)

                    if grads2_count == 0:
                        grads2_accu = grads2_v
                    else:
                        for key in grads2_v.keys():
                            grads2_accu[key] += grads2_v[key]
                    grads2_count += 1

                    if step == (FLAGS.max_steps - 1):
                        # save the fisher infomation matirx
                        for key in grads2_accu.keys():
                            grads2_accu[key] /= grads2_count
                        fname = os.path.join(FLAGS.train_dir, "EWC_stat.pkl")
                        pickle.dump(grads2_accu, open(fname, "wb"))

                        # save the MAP file
                        vard_v = sess.run(vard)
                        fname = os.path.join(FLAGS.train_dir, "EWC_map.pkl")
                        pickle.dump(vard_v, open(fname, "wb"))

                    if (step + 1) % FLAGS.display_loss == 0:
                        print("processed ", step - step_start, " examples")
                    continue

                has_run_meta = False
                if FLAGS.profile:
                    run_options = tf.RunOptions(
                        trace_level=tf.RunOptions.FULL_TRACE)
                    run_metadata = tf.RunMetadata()
                    start_time_compute = time.time()
                    _, loss_value = sess.run([train_op, loss],
                                             options=run_options,
                                             run_metadata=run_metadata)
                    duration_compute = duration_compute + time.time(
                    ) - start_time_compute

                    # Create the Timeline object, and write it to a json
                    tl = timeline.Timeline(run_metadata.step_stats)
                    ctf = tl.generate_chrome_trace_format()
                    with open(os.path.join(FLAGS.train_dir, 'timeline.json'),
                              'w') as f:
                        f.write(ctf)
                    print("generated a time line profile for one session")
                else:
                    start_time_compute = time.time()
                    if (step + 1) % (FLAGS.display_summary * 10) == 0:
                        has_run_meta = True
                        # profile in a longer interval
                        run_options = tf.RunOptions(
                            trace_level=tf.RunOptions.FULL_TRACE)
                        run_metadata = tf.RunMetadata()
                        _, loss_value, summary_str = \
                                              sess.run([train_op, loss, summary_op],
                                              options=run_options,
                                              run_metadata=run_metadata)
                        summary_writer.add_run_metadata(
                            run_metadata, 'step%d' % step)
                        summary_writer.add_summary(summary_str, step)
                        print('Adding run metadata for', step)

                        # Create the Timeline object, and write it to a json
                        tl = timeline.Timeline(run_metadata.step_stats)
                        ctf = tl.generate_chrome_trace_format()
                        with open(
                                os.path.join(FLAGS.train_dir, 'timeline.json'),
                                'w') as f:
                            f.write(ctf)
                        print("generated a time line profile for one session")
                    else:
                        _, loss_value = sess.run([train_op, loss])
                    duration_compute = duration_compute + time.time(
                    ) - start_time_compute

                assert not np.isnan(
                    loss_value), 'Model diverged with loss = NaN'

                if (step + 1) % FLAGS.display_loss == 0:
                    duration = (time.time() - start_time) / FLAGS.display_loss
                    duration_compute = duration_compute / FLAGS.display_loss

                    examples_per_sec = FLAGS.batch_size / float(duration)
                    format_str = (
                        '%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                        'sec/batch; compute %.1f examples/sec)')
                    print(format_str %
                          (datetime.now(), step, loss_value, examples_per_sec,
                           duration, FLAGS.batch_size / duration_compute))
                    duration_compute = 0
                    start_time = time.time()

                if (step +
                        1) % FLAGS.display_summary == 0 and not has_run_meta:
                    summary_str = sess.run(summary_op)
                    summary_writer.add_summary(summary_str, step)

                # Save the model checkpoint periodically.
                if step % FLAGS.checkpoint_interval == 0 or (
                        step + 1) == FLAGS.max_steps:
                    checkpoint_path = os.path.join(FLAGS.train_dir,
                                                   'model.ckpt')
                    saver.save(sess, checkpoint_path, global_step=global_step)

        except KeyboardInterrupt:
            print("Control C pressed. Saving model before exit. ")
            checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
            saver.save(sess, checkpoint_path, global_step=global_step)
            sys.exit()