Esempio n. 1
0
class ImageInputDataLayer(caffe.Layer):
    """Caffe Python data layer feeding image batches and 10-dim labels.

    Layer parameters are passed through ``param_str`` as a Python dict
    literal, e.g. ``{'batch_size': 32, 'img_size': 224, 'need_reader': True}``.
    Tops: data (N x 3 x img_size x img_size) and label (N x 10).
    """

    def setup(self, bottom, top):
        """Parse layer parameters, build the batch reader, set top shapes."""
        self.top_names = ['data', 'label']

        # === Read input parameters ===

        # params is a python dictionary with layer parameters.
        # SECURITY NOTE(review): eval() executes arbitrary code from the
        # prototxt; ast.literal_eval would be safer if param_str is always
        # a plain dict literal -- confirm against existing prototxt files.
        self.params = eval(self.param_str)

        # store input as class variables
        self.batch_size = self.params['batch_size']

        # Create a batch loader to load the images.
        # The reader can be disabled (e.g. at deploy/test time); in that
        # case forward() must never be called on this layer.
        if self.params['need_reader']:
            self.batch_reader = BatchReader(**self.params)
            self.batch_generator = self.batch_reader.batch_generator()

        # === reshape tops ===
        top[0].reshape(
            self.batch_size, 3, self.params['img_size'], self.params['img_size'])
        top[1].reshape(
            self.batch_size, 10)

    def forward(self, bottom, top):
        """Fetch the next batch and copy it into the top blobs."""
        # next() is portable across Python 2/3, unlike the
        # Python-2-only generator .next() method.
        images, labels = next(self.batch_generator)
        top[0].data[...] = images
        top[1].data[...] = labels

    def reshape(self, bottom, top):
        """Re-assert the fixed top shapes chosen in setup()."""
        # === reshape tops ===
        top[0].reshape(
            self.batch_size, 3, self.params['img_size'], self.params['img_size'])
        top[1].reshape(
            self.batch_size, 10)

    def backward(self, top, propagate_down, bottom):
        """Data layers do not back-propagate."""
        pass
Esempio n. 2
0
def train(prefix, **arg_dict):
    """Train a single-GPU landmark regressor with TensorFlow 1.x.

    Args:
        prefix: directory where checkpoints and the graph definition are saved.
        **arg_dict: configuration (batch_size, landmark_type, img_size,
            model, learning_rate, restore_ckpt, ...); also forwarded
            verbatim to BatchReader.
    """
    batch_size = arg_dict['batch_size']
    # Each landmark point contributes an (x, y) pair.
    num_labels = arg_dict['landmark_type'] * 2
    img_size = arg_dict['img_size']
    # batch generator
    _batch_reader = BatchReader(**arg_dict)
    _batch_generator = _batch_reader.batch_generator()

    with tf.Graph().as_default():
        images = tf.placeholder(tf.float32,
                                shape=[batch_size, img_size, img_size, 3])
        point_labels = tf.placeholder(tf.float32,
                                      shape=[batch_size, num_labels])

        logits = models.init(arg_dict['model'],
                             images,
                             num_labels,
                             is_training=True)

        loss = models.get_l2_loss(logits, point_labels, batch_size)

        # Track optimizer steps; halve the lr every 30k steps (staircase).
        global_step = tf.Variable(0, name='global_step', trainable=False)
        learning_rate = tf.train.exponential_decay(arg_dict['learning_rate'],
                                                   global_step,
                                                   30000,
                                                   0.5,
                                                   staircase=True)
        # Use the optimizer to apply the gradients that minimize the loss
        # (and also increment the global step counter) as a single training step.
        optimizer = tf.train.AdamOptimizer(learning_rate)
        train_op = optimizer.minimize(loss, global_step=global_step)

        sess = tf.Session(config=tf.ConfigProto(gpu_options=tf.GPUOptions(
            allow_growth=True)))
        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver(tf.global_variables())

        if arg_dict['restore_ckpt']:
            # Restore only trainable variables so optimizer slot variables
            # and global_step keep their freshly initialized values.
            variables_to_restore = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES)
            restorer = tf.train.Saver(variables_to_restore)
            restorer.restore(sess, arg_dict['restore_ckpt'])
            print('Resume-trained model restored from: %s' %
                  arg_dict['restore_ckpt'])

        tf.train.write_graph(sess.graph.as_graph_def(), '.',
                             os.path.join(prefix, 'graph_struct.txt'))

        print("Start to training...")
        start_time = time.time()
        while not _batch_reader.should_stop():
            with tf.device('/gpu:0'):
                # next() is portable across Python 2/3, unlike gen.next().
                batch = next(_batch_generator)
                _, ploss, step, lr = sess.run(
                    [train_op, loss, global_step, learning_rate],
                    feed_dict={
                        images: batch[0],
                        point_labels: batch[1]
                    })
                if step % 10 == 0:
                    end_time = time.time()
                    cost_time, start_time = end_time - start_time, end_time
                    sample_per_sec = int(10 * batch_size / cost_time)
                    sec_per_step = cost_time / 10.0
                    print(
                        '[%s] epochs: %d, step: %d, lr: %f, landmark_loss: %.4f, sample/s: %d, sec/step: %.3f'
                        % (datetime.datetime.now().strftime("%Y%m%d_%H%M%S"),
                           _batch_reader.get_epoch(), step, lr, ploss,
                           sample_per_sec, sec_per_step))
            if step % 1024 == 0:
                checkpoint_path = os.path.join(prefix, 'model.ckpt')
                saver.save(sess, checkpoint_path)
                print('Saved checkpoint to %s' % checkpoint_path)
        # Final checkpoint after the reader signals completion.
        checkpoint_path = os.path.join(prefix, 'model.ckpt')
        saver.save(sess, checkpoint_path)
        print('\nReview training parameter:\n%s\n' % (str(arg_dict)))
        print('Saved checkpoint to %s' % checkpoint_path)
        print('Bye Bye!')
Esempio n. 3
0
def train(prefix, **arg_dict):
    """Train a feature/classification net with PyTorch.

    Supports two parallel modes: "DataParallel" (net replicated across
    GPUs via nn.DataParallel) and "ModelParallel" (the model manages its
    own device placement; requires >= 2 GPUs).

    Args:
        prefix: directory for checkpoints and the tensorboard summary.
        **arg_dict: configuration (img_size, gpu_device, batch_size,
            parallel_mode, model, model_params, feature_dim, label_num,
            restore_ckpt, learning_rate, ...); also forwarded to BatchReader.
    """
    img_size = arg_dict['img_size']
    gpu_num = len(arg_dict["gpu_device"].split(','))
    batch_size = arg_dict["batch_size"]
    # Shared mutable state so the model / helpers can read the global step.
    common_dict = {"global_step": 1}
    print("batch_size = %d for gpu_num = %d" % (batch_size, gpu_num))
    if arg_dict["parallel_mode"] == "ModelParallel":
        print("Working on model parallel.")
        if gpu_num <= 1:
            raise Exception("Model parallel only support more than 2 gpu number")
    elif arg_dict["parallel_mode"] == "DataParallel":
        print("Working on data parallel")
    else:
        raise Exception("Unsupport parallel mode. see --help")
    # Create tf_summary writer; tensorboardX is an optional dependency,
    # so training proceeds (without summaries) when it is missing.
    try:
        from tensorboardX import SummaryWriter
        summary_dir = os.path.join(prefix, "tf_summary")
        if os.path.exists(summary_dir):
            print("Delete old summary in first.")
            os.system("rm -rf {}".format(summary_dir))
        common_dict["tensorboard_writer"] = SummaryWriter(summary_dir)
        print("Enable tensorboard summary.")
        print("Please using 'python -m tensorboard.main --logdir={}'".format(summary_dir))
    except Exception:
        common_dict["tensorboard_writer"] = None
        print("Disable tensorboard summary. please install tensorboardX in first.")
        print("Easy to install by 'pip install tensorboardX --user'")
    # batch generator
    _batch_reader = BatchReader(**arg_dict)
    _batch_generator = _batch_reader.batch_generator()
    # net
    model_params = json.loads(arg_dict["model_params"])
    model_params["image_size"] = arg_dict["img_size"]
    model_params["feature_dim"] = arg_dict["feature_dim"]
    model_params["class_num"] = arg_dict["label_num"]
    net = models.init(arg_dict["model"], gpu_num=gpu_num, model_params=model_params,
                      parallel_mode=arg_dict["parallel_mode"], common_dict=common_dict)
    if arg_dict["parallel_mode"] == "DataParallel":
        net = nn.DataParallel(net)
        net.cuda()
    if arg_dict["restore_ckpt"]:
        print("Restore ckpt from {}".format(arg_dict["restore_ckpt"]))
        net.load_state_dict(torch.load(arg_dict["restore_ckpt"]))
    # optimizer
    optimizer = optim.SGD(net.parameters(), lr=arg_dict['learning_rate'],
                          momentum=0.9, weight_decay=5e-4)
    lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=20000, gamma=0.95)
    # start loop
    print("Start to training...")
    start_time = time.time()
    display = 100
    loss_list = []
    while not _batch_reader.should_stop():
        #  prepare data
        batch_st = time.time()
        # next() is portable across Python 2/3, unlike gen.next().
        batch = next(_batch_generator)
        datas = batch[0].cuda()
        labels = batch[1].cuda()
        batch_et = time.time()
        #  forward and backward
        loss = net(datas, labels)
        # DataParallel returns one loss per replica; reduce to a scalar.
        loss = loss.mean()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # .item() extracts the Python scalar; the old loss.data[0] idiom
        # raises on 0-dim tensors in PyTorch >= 0.4.
        lossd = loss.item()
        #  display
        loss_list.append(lossd)
        if common_dict["global_step"] % display == 0:
            end_time = time.time()
            cost_time, start_time = end_time - start_time, end_time
            sample_per_sec = int(display * batch_size / cost_time)
            sec_per_step = cost_time / float(display)
            loss_display = np.mean(loss_list)
            lr = optimizer.param_groups[0]['lr']
            print('[%s] epochs: %d, step: %d, lr: %.5f, loss: %.5f, '
                  'sample/s: %d, sec/step: %.3f, batch time: %.3fs' % (
                  datetime.datetime.now().strftime("%Y%m%d_%H%M%S"),
                  _batch_reader.get_epoch(), common_dict["global_step"], lr, loss_display,
                  sample_per_sec, sec_per_step, batch_et - batch_st))
            loss_list = []
            if common_dict["tensorboard_writer"] is not None:
                common_dict["tensorboard_writer"].add_scalar("loss", loss_display,
                                                             common_dict["global_step"])
                common_dict["tensorboard_writer"].add_scalar("sample_per_sec", sample_per_sec,
                                                             common_dict["global_step"])
                common_dict["tensorboard_writer"].add_scalar("lr", lr,
                                                             common_dict["global_step"])
        if common_dict["global_step"] % 10000 == 0:
            # save checkpoint
            checkpoint_path = os.path.join(prefix, 'model.ckpt')
            torch.save(net.state_dict(), checkpoint_path)
            print("save checkpoint to %s" % checkpoint_path)
        # Decay lr every 20000 scheduler steps (one step per iteration).
        lr_scheduler.step()
        common_dict["global_step"] += 1
Esempio n. 4
0
def train(prefix, **arg_dict):
    """Train a landmark (and optional head-angle) regressor with MXNet Gluon.

    Args:
        prefix: directory where model parameters are checkpointed.
        **arg_dict: configuration (landmark_type, img_size, train_angle,
            gpu_device, batch_size, learning_rate, restore_ckpt, ...);
            also forwarded verbatim to BatchReader. Note: batch_size is
            rewritten in-place to the global (per-step) batch size.
    """
    # Each landmark point contributes an (x, y) pair.
    num_labels = arg_dict['landmark_type'] * 2
    img_size = arg_dict['img_size']
    train_angle = arg_dict['train_angle']
    gpu_num = len(arg_dict["gpu_device"].split(','))
    # Scale the per-GPU batch size up to the effective global batch size.
    batch_size = arg_dict['batch_size'] * gpu_num
    arg_dict['batch_size'] = batch_size
    print("real batch_size = %d for gpu_num = %d" % (batch_size, gpu_num))
    # batch generator
    _batch_reader = BatchReader(**arg_dict)
    _batch_generator = _batch_reader.batch_generator()
    # net
    ctx = [mx.gpu(i) for i in range(gpu_num)]
    net = models.init(num_label=num_labels, **arg_dict)
    if arg_dict["restore_ckpt"]:
        # print() call form keeps this valid on both Python 2 and 3;
        # also fixes the "resotre" typo in the message.
        print("restore checkpoint from %s" % (arg_dict["restore_ckpt"]))
        net.load_params(arg_dict['restore_ckpt'], ctx=ctx)
    else:
        net.initialize(init=mx.init.Xavier(), ctx=ctx)
    print(net)
    # loss: one L2 head for landmarks, optionally a second one for angles.
    losses_func = []
    if train_angle:
        losses_func.append(gluon.loss.L2Loss(weight=0.5))  # landmark
        losses_func.append(gluon.loss.L2Loss(weight=0.5))  # angle
    else:
        losses_func.append(gluon.loss.L2Loss())  # landmark
    # trainer
    trainer = gluon.Trainer(net.collect_params(), "adam",
                            {"learning_rate": arg_dict['learning_rate']})
    # start loop
    print("Start to training...")
    start_time = time.time()
    step = 0
    display = 10
    loss_list = []
    while not _batch_reader.should_stop():
        # next() is portable across Python 2/3, unlike gen.next().
        batch = next(_batch_generator)
        # NHWC -> NCHW, rescaled from [0, 255] to [-1, 1].
        image = nd.array(batch[0])
        image = nd.transpose(image.astype('float32'),
                             (0, 3, 1, 2)) / 127.5 - 1.0
        image_list = gluon.utils.split_and_load(image, ctx)
        landmark = nd.array(batch[1])
        landmark_list = gluon.utils.split_and_load(landmark, ctx)
        if train_angle:
            angle = nd.array(batch[2])
            angle_list = gluon.utils.split_and_load(angle, ctx)
        with autograd.record():
            losses = []
            if train_angle:
                # Two-headed net: predicts = (landmark_preds, angle_preds).
                for _i, _l, _a in zip(image_list, landmark_list, angle_list):
                    predicts = net(_i)
                    landmark_loss = losses_func[0](predicts[0], _l)
                    angle_loss = losses_func[1](predicts[1], _a)
                    losses.append(landmark_loss + angle_loss)
            else:
                for _i, _l in zip(image_list, landmark_list):
                    predicts = net(_i)
                    landmark_loss = losses_func[0](predicts, _l)
                    losses.append(landmark_loss)
        for loss in losses:
            loss.backward()
        trainer.step(batch_size)
        loss_list.append(np.mean([nd.mean(l).asscalar() for l in losses]))
        # Block until async GPU work finishes so the timing is meaningful.
        nd.waitall()
        if step % display == 0:
            end_time = time.time()
            cost_time, start_time = end_time - start_time, end_time
            sample_per_sec = int(display * batch_size / cost_time)
            sec_per_step = cost_time / float(display)
            loss_display = "[landmark: %.5f]" % (np.mean(loss_list))
            print('[%s] epochs: %d, step: %d, lr: %.5f, loss: %s,'
                  'sample/s: %d, sec/step: %.3f' % (
                  datetime.datetime.now().strftime("%Y%m%d_%H%M%S"),
                  _batch_reader.get_epoch(), step, trainer.learning_rate, loss_display,
                  sample_per_sec, sec_per_step))
            loss_list = []
        if step % 1024 == 0:
            # change lr (multiplicative decay) and checkpoint every 1024 steps
            trainer.set_learning_rate(trainer.learning_rate * 0.95)
            # save checkpoint
            checkpoint_path = os.path.join(prefix, 'model.params')
            net.save_params(checkpoint_path)
            print("save checkpoint to %s" % checkpoint_path)
        step += 1
Esempio n. 5
0
class ImageInputDataLayer(caffe.Layer):
    """Caffe Python data layer producing image batches and landmark labels.

    Layer parameters arrive through ``param_str`` as a Python dict literal
    (batch_size, landmark_type, img_format, img_size, need_reader, ...).
    Tops: data (N x C x img_size x img_size) and label (N x 2*num_points).
    """

    def setup(self, bottom, top):
        self.top_names = ['data', 'label']

        # === Read input parameters ===

        # params is a python dictionary with layer parameters.
        # NOTE(review): eval() executes arbitrary code from the prototxt;
        # ast.literal_eval would be safer if param_str is always a plain
        # dict literal -- confirm against existing prototxt files.
        self.params = eval(self.param_str)

        # store input as class variables
        self.batch_size = self.params['batch_size']

        # store landmark_type: number of landmark points; each point is an
        # (x, y) pair, hence the "* 2" in the label top shape below.
        self.num_points = self.params['landmark_type']

        # store data channels derived from the pixel format
        if self.params['img_format'] == 'RGB':
            self.num_channels = 3
        elif self.params['img_format'] == 'GRAY':
            self.num_channels = 1
        else:
            raise Exception("Unsupport img_format ...")

        # Create a batch loader to load the images.
        # we can disable reader when test; in that case forward() must
        # never be called on this layer.
        if self.params['need_reader']:
            self.batch_reader = BatchReader(**self.params)
            self.batch_generator = self.batch_reader.batch_generator()

        # === reshape tops ===
        top[0].reshape(self.batch_size, self.num_channels,
                       self.params['img_size'], self.params['img_size'])
        top[1].reshape(self.batch_size, self.num_points * 2)

    def preProcessImage(self, imgs):
        """
        Standardize each image to zero mean / unit std, in place.

        imgs: N x 1 x W x H (per original comment; channels may be 3 for
        RGB -- verify against callers)

        NOTE(review): this helper is never invoked inside this class;
        presumably callers apply it before/after fetching batches -- confirm.
        """
        imgs = imgs.astype(np.float32)
        for i, img in enumerate(imgs):
            m = img.mean()
            s = img.std()
            imgs[i] = (img - m) / s
        return imgs

    def forward(self, bottom, top):
        """
        Load data: fetch the next batch and copy it into the top blobs.
        """
        # .next() is the Python-2 generator protocol (next(gen) in Python 3).
        images, labels = self.batch_generator.next()
        top[0].data[...] = images
        top[1].data[...] = labels

    def reshape(self, bottom, top):
        # === reshape tops ===
        # Shapes are constant, so this just re-asserts the setup() shapes.
        top[0].reshape(self.batch_size, self.num_channels,
                       self.params['img_size'], self.params['img_size'])
        top[1].reshape(self.batch_size, self.num_points * 2)

    def backward(self, top, propagate_down, bottom):
        """
        Data layers do not back-propagate.
        """
        pass
def train(prefix, **arg_dict):
    """Train a feature/classification net with MXNet Gluon (multi-GPU).

    Args:
        prefix: directory where model parameters are checkpointed.
        **arg_dict: configuration (img_size, gpu_device, batch_size,
            model, model_params, feature_dim, label_num, restore_ckpt,
            learning_rate, ...); also forwarded verbatim to BatchReader.
            Note: batch_size is rewritten in-place to the global size.
    """
    img_size = arg_dict['img_size']
    gpu_num = len(arg_dict["gpu_device"].split(','))
    # Scale the per-GPU batch size up to the effective global batch size.
    batch_size = arg_dict['batch_size'] * gpu_num
    arg_dict['batch_size'] = batch_size
    print("real batch_size = %d for gpu_num = %d" % (batch_size, gpu_num))
    # batch generator
    _batch_reader = BatchReader(**arg_dict)
    _batch_generator = _batch_reader.batch_generator()
    # net
    ctx = [mx.gpu(i) for i in range(gpu_num)]
    model_params = json.loads(arg_dict["model_params"])
    model_params["feature_dim"] = arg_dict["feature_dim"]
    model_params["label_num"] = arg_dict["label_num"]
    net = models.init(arg_dict["model"], model_params=model_params)
    if arg_dict["restore_ckpt"]:
        # print() call form keeps this valid on both Python 2 and 3;
        # also fixes the "resotre" typo in the message.
        print("restore checkpoint from %s" % (arg_dict["restore_ckpt"]))
        # Initialize first so params absent from the checkpoint still
        # exist; then overwrite what the checkpoint provides.
        net.initialize(init=mx.init.Xavier(), ctx=ctx)
        net.load_params(arg_dict['restore_ckpt'], ctx=ctx, allow_missing=True, ignore_extra=True)
    else:
        net.initialize(init=mx.init.Xavier(), ctx=ctx)
    print(net)
    # trainer
    trainer = gluon.Trainer(net.collect_params(), "sgd", # adam
                            {"learning_rate": arg_dict['learning_rate']})
    # start loop
    print("Start to training...")
    start_time = time.time()
    step = 1
    display = 100
    loss_list = []
    while not _batch_reader.should_stop():
        # next() is portable across Python 2/3, unlike gen.next().
        batch = next(_batch_generator)
        data = nd.array(batch[0], dtype='float32')
        # NHWC -> NCHW
        data = nd.transpose(data, (0,3,1,2))
        label = nd.array(batch[1], dtype='float32')
        data_list = gluon.utils.split_and_load(data, ctx)
        label_list = gluon.utils.split_and_load(label, ctx)
        #  normalization to roughly [-1, 1], in-place operation
        for i in range(gpu_num):
            data_list[i] -= 127.5
            data_list[i] *= 0.0078125
        # forward; the net returns the loss directly
        with autograd.record():
            losses = [net(x, y) for x, y in zip(data_list, label_list)]
        for l in losses:
            l.backward()
        trainer.step(batch_size)
        loss = np.mean([nd.mean(l).asscalar() for l in losses])
        loss_list.append(loss)
        # Block until async GPU work finishes so the timing is meaningful.
        nd.waitall()
        if step % display == 0:
            end_time = time.time()
            cost_time, start_time = end_time - start_time, end_time
            sample_per_sec = int(display * batch_size / cost_time)
            sec_per_step = cost_time / float(display)
            loss_display = "[loss: %.5f]" % (np.mean(loss_list))
            print('[%s] epochs: %d, step: %d, lr: %.5f, loss: %s,'
                  'sample/s: %d, sec/step: %.3f' % (
                  datetime.datetime.now().strftime("%Y%m%d_%H%M%S"),
                  _batch_reader.get_epoch(), step, trainer.learning_rate, loss_display,
                  sample_per_sec, sec_per_step))
            loss_list = []
        if step % 500000 == 0:
            # change lr (multiplicative decay)
            trainer.set_learning_rate(trainer.learning_rate * 0.95)
            print("change lr to %f" % trainer.learning_rate)
        if step % 100000 == 0:
            # save checkpoint
            checkpoint_path = os.path.join(prefix, 'model.params')
            net.save_params(checkpoint_path)
            print("save checkpoint to %s" % checkpoint_path)
        step += 1
Esempio n. 7
0
def train(prefix, **arg_dict):
    """Train a multi-GPU landmark regressor with TensorFlow 1.x towers.

    One model replica ("tower") is built per GPU listed in gpu_device;
    gradients are averaged on the CPU and applied once per step.

    Args:
        prefix: directory where checkpoints are saved.
        **arg_dict: configuration (batch_size, landmark_type, img_size,
            gpu_device, model, learning_rate, restore_ckpt, ...); also
            forwarded verbatim to BatchReader.
    """
    batch_size = arg_dict['batch_size']
    # Each landmark point contributes an (x, y) pair.
    num_labels = arg_dict['landmark_type'] * 2
    img_size = arg_dict['img_size']
    # list() is required: in Python 3 map() returns an iterator that has
    # no len() and is exhausted on first use.
    gpu_list = list(map(int, arg_dict['gpu_device'].split(',')))
    assert (batch_size % len(gpu_list) == 0), "Batch size must exact division by gpu nums"

    with tf.Graph().as_default(), tf.device('/cpu:0'):
        # data input, split evenly across the towers
        images = tf.placeholder(tf.float32, shape=[batch_size, img_size, img_size, 3])
        labels = tf.placeholder(tf.float32, shape=[batch_size, num_labels])
        images_split = tf.split(images, len(gpu_list), axis=0)
        labels_split = tf.split(labels, len(gpu_list), axis=0)
        # Create a variable to count the number of train() calls.
        global_step = tf.get_variable(
            'global_step', [],
            initializer=tf.constant_initializer(0), trainable=False)
        # Decay the learning rate exponentially based on the number of steps.
        lr = tf.train.exponential_decay(arg_dict['learning_rate'],
                                        global_step,
                                        30000,
                                        0.8,
                                        staircase=True)
        # Create an optimizer that performs gradient descent.
        optimizer = tf.train.AdamOptimizer(lr)
        # Calculate the gradients for each model tower.
        tower_grads = []
        with tf.variable_scope(tf.get_variable_scope()):
            # range() here is Python 2/3 portable (xrange is Python-2 only).
            for i in range(len(gpu_list)):
                with tf.device('/gpu:%d' % i):
                    with tf.name_scope('%s_%d' % ("landmarks", i)) as scope:
                        loss = tower_loss(scope, images_split[i], labels_split[i], arg_dict['model'], num_labels)
                        # Share variables between towers.
                        tf.get_variable_scope().reuse_variables()
                        # Calculate the gradients for the batch of data on this tower.
                        grads = optimizer.compute_gradients(loss)
                        # Keep track of the gradients across all towers.
                        tower_grads.append(grads)
        # We must calculate the mean of each gradient. Note that this is the
        # synchronization point across all towers.
        grads = average_gradients(tower_grads)
        # Apply the gradients to adjust the shared variables.
        apply_gradient_op = optimizer.apply_gradients(grads, global_step=global_step)
        # Track the moving averages of all trainable variables.
        variable_averages = tf.train.ExponentialMovingAverage(0.9999, global_step)
        variables_averages_op = variable_averages.apply(tf.trainable_variables())
        # Group all updates to into a single train op.
        train_op = tf.group(apply_gradient_op, variables_averages_op)
        # Build an initialization operation to run below.
        init = tf.global_variables_initializer()
        # allow_soft_placement must be True to build towers on GPU, as some
        # of the ops do not have GPU implementations.
        sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True,
                          gpu_options=tf.GPUOptions(allow_growth=True)))
        sess.run(init)
        # Create a saver.
        saver = tf.train.Saver(tf.global_variables())
        if arg_dict['restore_ckpt']:
            # Restore only trainable variables so optimizer slot variables
            # and global_step keep their freshly initialized values.
            variables_to_restore = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
            restorer = tf.train.Saver(variables_to_restore)
            restorer.restore(sess, arg_dict['restore_ckpt'])
            print('Resume-trained model restored from: %s' % arg_dict['restore_ckpt'])

        print("Start to training...")
        # batch generator
        _batch_reader = BatchReader(**arg_dict)
        _batch_generator = _batch_reader.batch_generator()
        start_time = time.time()
        while not _batch_reader.should_stop():
            # next() is portable across Python 2/3, unlike gen.next().
            batch = next(_batch_generator)
            _, _loss, _step, _lr = sess.run([train_op, loss, global_step, lr],
                                            feed_dict={images: batch[0], labels: batch[1]})
            if _step % 10 == 0:
                end_time = time.time()
                cost_time, start_time = end_time - start_time, end_time
                sample_per_sec = int(10 * batch_size / cost_time)
                sec_per_step = cost_time / 10.0
                print('[%s] epochs: %d, step: %d, lr: %f, landmark_loss: %.6f, sample/s: %d, sec/step: %.3f' % (
                      datetime.datetime.now().strftime("%Y%m%d_%H%M%S"),
                      _batch_reader.get_epoch(), _step, _lr, _loss, sample_per_sec, sec_per_step))
            if _step % 1024 == 0:
                checkpoint_path = os.path.join(prefix, 'model.ckpt')
                saver.save(sess, checkpoint_path)
                print('Saved checkpoint to %s' % checkpoint_path)
        # Final checkpoint after the reader signals completion.
        checkpoint_path = os.path.join(prefix, 'model.ckpt')
        saver.save(sess, checkpoint_path)
        print('\nReview training parameter:\n%s\n' % (str(arg_dict)))
        print('Saved checkpoint to %s' % checkpoint_path)
        print('Bye Bye!')