Example #1
def train(args):
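    """Train a single-output VggNetModel as a score regressor with an MSE-style loss,
    log summaries, periodically evaluate on the held-out split, and save checkpoints
    until args.iter_max steps have run."""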

    graph = tf.Graph()
    with graph.as_default():
        global_step = tf.train.create_global_step()

        # Placeholders for training data
        imgs = tf.placeholder(tf.float32,
                              [None, args.crop_height, args.crop_width, 3])
        scores = tf.placeholder(tf.float32, [None])
        dropout_keep_prob = tf.placeholder(tf.float32, [])
        lr = tf.placeholder(tf.float32, [])

        with tf.name_scope("create_models"):
            model = VggNetModel(num_classes=1,
                                dropout_keep_prob=dropout_keep_prob)
            y_hat = model.inference(imgs, True)
            # Flatten predictions to shape [batch_size] so they align with the scores placeholder.
            y_hat = tf.reshape(y_hat, [-1])

        with tf.name_scope("create_loss"):
            # Mean-squared-error style regression loss between predictions and
            # ground-truth scores (helper defined elsewhere in the repository).
            reg_loss = mes(y_hat, scores)

        with tf.name_scope("create_optimize"):
            # tf.train.GradientDescentOptimizer(learning_rate=lr) did not converge here, so Adam is used.
            var_list = tf.trainable_variables()
            optimizer = tf.train.AdamOptimizer(learning_rate=lr).minimize(
                reg_loss, var_list=var_list)

        saver = tf.train.Saver(var_list=tf.global_variables(), max_to_keep=10)

        tf.summary.scalar('learning_rate', lr)
        tf.summary.scalar('reg_loss', reg_loss)
        # Build the summary Tensor based on the TF collection of Summaries.
        summary_op = tf.summary.merge_all()

        # Instantiate a SummaryWriter to output summaries and the Graph.
        timestamp = datetime.fromtimestamp(
            time.time()).strftime('%Y%m%d-%H:%M')
        summary_writer = tf.summary.FileWriter(os.path.join(
            args.logs_dir, 'train/{}-{}'.format(args.exp_name, timestamp)),
                                               filename_suffix=args.exp_name)
        summary_test = tf.summary.FileWriter(os.path.join(
            args.logs_dir, 'test/{}-{}'.format(args.exp_name, timestamp)),
                                             filename_suffix=args.exp_name)

        train_image_paths, train_scores, test_image_paths, test_scores = get_image_list(
            args)
        train_loader = train_generator(train_image_paths, train_scores)
        train_num_batchs = len(train_image_paths) // args.batch_size + 1
        test_loader = val_generator(test_image_paths, test_scores,
                                    args.batch_size)
        test_num_batchs = len(test_image_paths) // args.batch_size + 1

    with tf.Session(graph=graph) as sess:

        sess.run(tf.global_variables_initializer())

        ckpt = tf.train.get_checkpoint_state(args.ckpt_dir)
        counter = 0
        if ckpt and ckpt.model_checkpoint_path:
            counter = __load__(saver, sess, args.ckpt_dir)
        else:
            load(saver, sess, args.pretrain_models_path)

        start_time = time.time()
        start_step = counter  # if counter is not None else 0

        base_lr = args.learning_rate
        for step, (images, targets) in enumerate(train_loader, start_step):

            # Linear warmup from args.start_lr to args.learning_rate over the first 500 steps,
            # then divide the learning rate by 5 when (step + 1) hits 50% and 80% of args.iter_max.
            if step <= 500:
                base_lr = args.start_lr + (args.learning_rate -
                                           args.start_lr) * step / 500.0
            else:
                if (step + 1) % (0.5 * args.iter_max) == 0:
                    base_lr = base_lr / 5
                if (step + 1) % (0.8 * args.iter_max) == 0:
                    base_lr = base_lr / 5
            # Other decay schedules (e.g. linear decay over args.iter_max) could be substituted here.

            loss_, y_hat_, _ = sess.run(
                [reg_loss, y_hat, optimizer],
                feed_dict={
                    imgs: images,
                    scores: targets,
                    lr: base_lr,
                    dropout_keep_prob: args.dropout_keep_prob
                })

            if (step + 1) % args.summary_step == 0:
                # logger.info("targets labels is : {}".format(targets))
                # logger.info("predict lables is : {}".format(y_hat_))

                logger.info(
                    "step %d/%d, reg loss: %f, time: %f s, learning rate: %.8f" %
                    (step, args.iter_max, loss_,
                     (time.time() - start_time), base_lr))
                summary_str = sess.run(summary_op,
                                       feed_dict={
                                           imgs: images,
                                           scores: targets,
                                           lr: base_lr,
                                           dropout_keep_prob: args.dropout_keep_prob
                                       })
                summary_writer.add_summary(summary_str, step)
                # summary_writer.flush()

            if (step + 1) % args.test_step == 0:
                if args.save_ckpt_file:
                    # saver.save(sess, args.checkpoint_dir + 'iteration_' + str(step) + '.ckpt',write_meta_graph=False)
                    save(saver, sess, args.ckpt_dir, step)
                test_loss = 0
                scores_set = np.array([])  # accumulated predictions
                labels_set = np.array([])  # accumulated ground-truth scores
                for i in range(test_num_batchs):
                    images, targets = next(test_loader)
                    loss_, y_hat_ = sess.run(
                        [reg_loss, y_hat],
                        feed_dict={
                            imgs: images,
                            scores: targets,
                            dropout_keep_prob: 1.0  # disable dropout for evaluation
                        })
                    test_loss += loss_
                    scores_set = np.append(scores_set, y_hat_)
                    labels_set = np.append(labels_set, targets)
                    logger.info('test batch {}/{}'.format(i + 1, test_num_batchs))

                srocc, krocc, plcc, rmse, mse = evaluate_metric(
                    labels_set, scores_set)
                test_loss /= test_num_batchs
                logger.info(
                    "SROCC_v: %.3f\t KROCC: %.3f\t PLCC_v: %.3f\t RMSE_v: %.3f\t mse: %.3f\t test loss: %.3f\n"
                    % (srocc, krocc, plcc, rmse, mse, test_loss))
                s1 = tf.Summary(value=[
                    tf.Summary.Value(tag='test_loss', simple_value=test_loss)
                ])
                s2 = tf.Summary(value=[
                    tf.Summary.Value(tag='test_srocc', simple_value=srocc)
                ])
                summary_test.add_summary(s1, step)
                summary_test.add_summary(s2, step)

            if step == args.iter_max:
                saver.save(sess,
                           args.ckpt_dir + '/final_model_' + timestamp +
                           '.ckpt',
                           write_meta_graph=False)
                logger.info(
                    'save train_iqa final models max_iter: {}...'.format(
                        args.iter_max))
                break

        logger.info("Optimization finish!")
Example #2
def train(args):
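    """Train VggNetModel with a pairwise ranking loss on the LIVE train/test splits,
    logging summaries and periodically evaluating and checkpointing."""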

    graph = tf.Graph()
    with graph.as_default():
        global_step = tf.train.create_global_step()

        # Placeholders for training data
        imgs = tf.placeholder(tf.float32,
                              [None, args.crop_height, args.crop_width, 3])
        dropout_keep_prob = tf.placeholder(tf.float32, [])
        lr = tf.placeholder(tf.float32, [])

        with tf.name_scope("create_models"):
            model = VggNetModel(num_classes=1,
                                dropout_keep_prob=dropout_keep_prob)
            y_hat = model.inference(imgs, True)

        with tf.name_scope("create_loss"):
            rank_loss = Rank_loss()
            loss = rank_loss.get_rankloss(y_hat, args.batch_size)

        with tf.name_scope("create_optimize"):
            # optimizer = tf.train.GradientDescentOptimizer(learning_rate=lr).minimize(loss)
            var_list = tf.trainable_variables()
            optimizer = tf.train.AdamOptimizer(learning_rate=lr).minimize(
                loss, var_list=var_list)

        saver = tf.train.Saver(var_list=tf.global_variables(), max_to_keep=10)

        tf.summary.scalar('learning_rate', lr)
        tf.summary.scalar('rank_loss', loss)
        # Build the summary Tensor based on the TF collection of Summaries.
        summary_op = tf.summary.merge_all()

        # Instantiate a SummaryWriter to output summaries and the Graph.
        timestamp = datetime.fromtimestamp(
            time.time()).strftime('%Y%m%d-%H:%M')
        summary_writer = tf.summary.FileWriter(os.path.join(
            args.logs_dir, 'train/{}-{}'.format(args.exp_name, timestamp)),
                                               filename_suffix=args.exp_name)
        summary_test = tf.summary.FileWriter(os.path.join(
            args.logs_dir, 'test/{}-{}'.format(args.exp_name, timestamp)),
                                             filename_suffix=args.exp_name)

        train_data = Dataset({
            'root_dir': os.path.abspath('..'),
            'data_root': 'data',
            'split': 'live_train',
            'im_shape': [224, 224],
            'batch_size': args.batch_size
        })
        test_data = Dataset({
            'root_dir': os.path.abspath('..'),
            'data_root': 'data',
            'split': 'live_test',
            'im_shape': [224, 224],
            'batch_size': args.batch_size
        })

    with tf.Session(graph=graph) as sess:

        sess.run(tf.global_variables_initializer())
        model.load_original_weights(sess, args.vgg_models_path)

        # global_var = tf.global_variables()
        # var_list = sess.run(global_var)
        start_time = time.time()
        base_lr = args.learning_rate
        for step in range(args.iter_max):

            # Divide the learning rate by 5 when (step + 1) hits 50% and 80% of args.iter_max.
            if (step + 1) % (0.5 * args.iter_max) == 0:
                base_lr = base_lr / 5
            if (step + 1) % (0.8 * args.iter_max) == 0:
                base_lr = base_lr / 5
            # Other decay schedules (e.g. linear decay over args.iter_max) could be substituted here.

            image_batch, label_batch = train_data.next_batch()
            loss_, _ = sess.run(
                [loss, optimizer],
                feed_dict={
                    imgs: image_batch,
                    lr: base_lr,
                    dropout_keep_prob: args.dropout_keep_prob
                })

            if (step + 1) % args.summary_step == 0:

                logger.info(
                    "step %d/%d, rank loss: %f, time: %f s, learning rate: %.8f" %
                    (step, args.iter_max, loss_,
                     (time.time() - start_time), base_lr))
                summary_str = sess.run(summary_op,
                                       feed_dict={
                                           imgs: image_batch,
                                           lr: base_lr,
                                           dropout_keep_prob: args.dropout_keep_prob
                                       })
                summary_writer.add_summary(summary_str, step)
                # summary_writer.flush()

            if (step + 1) % args.test_step == 0:
                if args.save_ckpt_file:
                    # saver.save(sess, args.checkpoint_dir + 'iteration_' + str(step) + '.ckpt',write_meta_graph=False)
                    save(saver, sess, args.ckpt_dir, step)

                test_epoch_step = len(
                    test_data.scores) // test_data.batch_size + 1
                test_loss = 0
                for _ in range(test_epoch_step):
                    image_batch, label_batch = test_data.next_batch()
                    # Evaluate the loss only; do not run the optimizer on test data.
                    loss_ = sess.run(
                        loss,
                        feed_dict={
                            imgs: image_batch,
                            dropout_keep_prob: 1.0  # disable dropout for evaluation
                        })
                    test_loss += loss_
                test_loss /= test_epoch_step
                s = tf.Summary(value=[
                    tf.Summary.Value(tag='test_loss', simple_value=test_loss)
                ])
                summary_test.add_summary(s, step)
            if (step + 1) == args.iter_max:
                saver.save(sess,
                           os.path.join(args.ckpt_dir, 'rank_model_final.ckpt'),
                           write_meta_graph=False)

        logger.info("Optimization finish!")
Example #3
    def train(self, net, num_epochs, optimizer, train_loader, test_loader):
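        """Run the full training loop: iterate over train_loader for num_epochs epochs,
        compute the weighted sum of the configured loss functions, step the optimizer
        (and scheduler, if any), evaluate the metric functions on test_loader after
        each epoch, and save checkpoints as configured."""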
        print("Starting training...")
        # Run training for some number of epochs.
        for epoch in range(num_epochs):
            print(f"\nEpoch {epoch + 1} / {num_epochs}\n")
            torch.cuda.empty_cache()

            # Save checkpoint periodically.
            is_checkpoint_epoch = (
                self.checkpoint_epochs and epoch % self.checkpoint_epochs == 0
            )
            if self.checkpoint_name and is_checkpoint_epoch:
                checkpoint.save(net, self.checkpoint_name, name=self.wandb_name)

            # Run training loop
            net.train()
            for inputs, targets in tqdm(train_loader):
                batch_size = inputs.shape[0]
                inputs = inputs.cuda() if self.use_cuda else inputs.cpu()
                targets = targets.cuda() if self.use_cuda else targets.cpu()

                # Sanity check training data shape sizes
                if self.input_shape:
                    expected_shape = tuple([batch_size] + self.input_shape)
                    assert (
                        inputs.shape == expected_shape
                    ), f"Bad shape: expected {expected_shape} got {inputs.shape}"
                if self.target_shape:
                    expected_shape = tuple([batch_size] + self.target_shape)
                    assert (
                        targets.shape == expected_shape
                    ), f"Bad shape: expected {expected_shape} got {target.shape}"

                # Get a prediction from the model
                optimizer.zero_grad()
                outputs = net(inputs)
                if self.output_shape:
                    expected_shape = tuple([batch_size] + self.output_shape)
                    assert (
                        outputs.shape == expected_shape
                    ), f"Bad shape: expected {expected_shape} got {outputs.shape}"

                # Run the loss functions over the model's prediction
                loss = torch.tensor([0.0], requires_grad=True)
                loss = loss.cuda() if self.use_cuda else loss
                for loss_fn, weight in self.loss_fns:
                    loss = loss + weight * loss_fn(inputs, outputs, targets)

                # Calculate model weight gradients from the loss and update model.
                loss.backward()
                optimizer.step()
                if self.scheduler:
                    # Update the learning rate, according to the scheduler.
                    try:
                        self.scheduler.step()
                    except ValueError:
                        pass

                # Track metric information
                with torch.no_grad():
                    for metric_fn, _, train_tracker, _ in self.metric_fns:
                        metric_val = metric_fn(inputs, outputs, targets)
                        train_tracker.update(metric_val)

            # Check performance (loss) on validation set.
            net.eval()
            with torch.no_grad():
                for inputs, targets in tqdm(test_loader):
                    inputs = inputs.cuda() if self.use_cuda else inputs.cpu()
                    targets = targets.cuda() if self.use_cuda else targets.cpu()
                    outputs = net(inputs)
                    # Track metric information
                    for metric_fn, _, _, test_tracker in self.metric_fns:
                        metric_val = metric_fn(inputs, outputs, targets)
                        test_tracker.update(metric_val)

            # Log epoch metrics
            training_info = {}
            for _, name, train_tracker, test_tracker in self.metric_fns:
                training_info[f"Training {name}"] = train_tracker.value
                training_info[f"Validation {name}"] = test_tracker.value

            if self.scheduler:
                try:
                    training_info["Learning rate"] = self.scheduler.get_lr()[0]
                except ValueError:
                    pass  # Scheduler could not report a learning rate; skip logging it.

            log_training_info(training_info, use_wandb=self.use_wandb)

        # Save final model checkpoint
        if self.checkpoint_name:
            checkpoint.save(
                net, self.checkpoint_name, name=self.wandb_name, use_wandb=self.use_wandb
            )
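
The metric trackers referenced in Example #3 only need an update() method and a value attribute. A minimal, hypothetical running-average tracker that would satisfy that interface (a stand-in for the project's own implementation):

class AverageTracker:
    # Running average exposing the update()/value interface used by the training loop above.
    def __init__(self):
        self.total = 0.0
        self.count = 0

    def update(self, metric_val):
        # Accept Python floats or single-element tensors.
        self.total += float(metric_val)
        self.count += 1

    @property
    def value(self):
        return self.total / self.count if self.count else 0.0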