Esempio n. 1
0
    def setup(self, bottom, top):
        self.top_names = ['data', 'label']

        # === Read input parameters ===

        # params is a python dictionary with layer parameters.
        self.params = eval(self.param_str)

        # store input as class variables
        self.batch_size = self.params['batch_size']

        # store landmark_type
        self.num_points = self.params['landmark_type']

        # store data channels
        if self.params['img_format'] == 'RGB':
            self.num_channels = 3
        elif self.params['img_format'] == 'GRAY':
            self.num_channels = 1
        else:
            raise Exception("Unsupport img_format ...")

        # Create a batch loader to load the images.
        # we can disable reader when test
        if self.params['need_reader']:
            self.batch_reader = BatchReader(**self.params)
            self.batch_generator = self.batch_reader.batch_generator()

        # === reshape tops ===
        top[0].reshape(self.batch_size, self.num_channels,
                       self.params['img_size'], self.params['img_size'])
        top[1].reshape(self.batch_size, self.num_points * 2)
Esempio n. 2
0
    def __init__(self, model_config, data_config, data_loader):
        """Keep the configs and loader, then build the batch reader and model.

        The batch reader is built from both configs plus the data loader; the
        model is built from the model config and the loader's
        ``word2vec_vectors``.
        """
        self.model_config, self.data_config = model_config, data_config
        self.data_loader = data_loader

        self.batch_reader = BatchReader(model_config, data_config, data_loader)

        embeddings = data_loader.word2vec_vectors
        self.model = Seq2SeqAttentionModel(model_config, embeddings)
    def setup(self, bottom, top):
        self.top_names = ['data', 'label']

        # === Read input parameters ===

        # params is a python dictionary with layer parameters.
        self.params = eval(self.param_str)

        # store input as class variables
        self.batch_size = self.params['batch_size']

        # Create a batch loader to load the images.
        # we can disable reader when test
        if self.params['need_reader']:
            self.batch_reader = BatchReader(**self.params)
            self.batch_generator = self.batch_reader.batch_generator()

        # === reshape tops ===
        top[0].reshape(self.batch_size, 3, self.params['img_size'],
                       self.params['img_size'])
        top[1].reshape(self.batch_size, 136, 1, 1)
Esempio n. 4
0
    def test__partition_into_data_sets(self):
        """A 0.8 partition of 13 samples gives 10/1/2 splits and keeps all data."""
        n_samples, n_features = 13, 7
        all_X = np.random.rand(n_samples, n_features)
        all_y = np.random.rand(n_samples)

        # Call under test.
        data = BatchReader()._partition_into_data_sets(all_X, all_y, 0.8)

        # Every split must have the expected number of rows.
        expected_rows = {'train': 10, 'validation': 1, 'test': 2}
        for split_name in ('train', 'validation', 'test'):
            split_X, split_y = data[split_name]
            n_rows = expected_rows[split_name]
            np.testing.assert_array_equal(split_X.shape, (n_rows, n_features))
            np.testing.assert_array_equal(split_y.shape, n_rows)

        # Samples may be reordered and partitioned, but none lost or added.
        merged_X = np.concatenate([part_X for part_X, part_y in data.values()])
        merged_y = np.concatenate([part_y for part_X, part_y in data.values()])
        for merged, original in ((merged_X, all_X), (merged_y, all_y)):
            np.testing.assert_array_equal(np.sort(merged, axis=0),
                                          np.sort(original, axis=0))
Esempio n. 5
0
class ImageInputDataLayer(caffe.Layer):
    """Caffe Python layer that feeds image/label batches from a BatchReader.

    Tops:
        data:  ``batch_size x 3 x img_size x img_size``
        label: ``batch_size x 10``
    """

    # Channel count of the data top and width of the label top.
    _NUM_CHANNELS = 3
    _LABEL_DIM = 10

    def setup(self, bottom, top):
        """Parse ``param_str``, optionally start the reader, and size the tops.

        ``param_str`` is evaluated into a dict that must provide
        ``batch_size``, ``need_reader`` and ``img_size``.
        """
        self.top_names = ['data', 'label']

        # NOTE(review): eval() executes arbitrary code, so param_str has to
        # come from a trusted network definition.
        self.params = eval(self.param_str)

        self.batch_size = self.params['batch_size']

        # The reader is optional so that it can be switched off for testing.
        # Without it ``forward`` has no generator to pull batches from.
        if self.params['need_reader']:
            self.batch_reader = BatchReader(**self.params)
            self.batch_generator = self.batch_reader.batch_generator()

        self._reshape_tops(top)

    def _reshape_tops(self, top):
        """Give the data and label tops their fixed shapes."""
        side = self.params['img_size']
        top[0].reshape(self.batch_size, self._NUM_CHANNELS, side, side)
        top[1].reshape(self.batch_size, self._LABEL_DIM)

    def forward(self, bottom, top):
        """Copy the next (images, labels) batch into the tops."""
        # Built-in next() works on both Python 2 and Python 3 iterators,
        # whereas the ``.next()`` method only exists on Python 2.
        images, labels = next(self.batch_generator)
        top[0].data[...] = images
        top[1].data[...] = labels

    def reshape(self, bottom, top):
        """Re-apply the fixed top shapes."""
        self._reshape_tops(top)

    def backward(self, top, propagate_down, bottom):
        """Data layers do not back-propagate."""
        pass
Esempio n. 6
0
def train(prefix, **arg_dict):
    """Train a landmark regression model with TensorFlow and checkpoint it.

    Args:
        prefix: Directory that receives ``graph_struct.txt`` and
            ``model.ckpt``.
        **arg_dict: Training options.  This function reads ``batch_size``,
            ``landmark_type``, ``img_size``, ``model``, ``learning_rate`` and
            ``restore_ckpt``; the whole dict is also forwarded to
            ``BatchReader``.
    """
    batch_size = arg_dict['batch_size']
    num_labels = arg_dict['landmark_type'] * 2
    img_size = arg_dict['img_size']
    log_every = 10        # steps between progress lines
    save_every = 1024     # steps between intermediate checkpoints
    checkpoint_path = os.path.join(prefix, 'model.ckpt')

    # batch generator
    _batch_reader = BatchReader(**arg_dict)
    _batch_generator = _batch_reader.batch_generator()

    with tf.Graph().as_default():
        images = tf.placeholder(tf.float32,
                                shape=[batch_size, img_size, img_size, 3])
        point_labels = tf.placeholder(tf.float32,
                                      shape=[batch_size, num_labels])

        logits = models.init(arg_dict['model'],
                             images,
                             num_labels,
                             is_training=True)

        loss = models.get_l2_loss(logits, point_labels, batch_size)

        # Create a variable to track the global step.
        global_step = tf.Variable(0, name='global_step', trainable=False)
        # Halve the learning rate every 30000 steps.
        learning_rate = tf.train.exponential_decay(arg_dict['learning_rate'],
                                                   global_step,
                                                   30000,
                                                   0.5,
                                                   staircase=True)
        # Use the optimizer to apply the gradients that minimize the loss
        # (and also increment the global step counter) as a single training step.
        optimizer = tf.train.AdamOptimizer(learning_rate)
        train_op = optimizer.minimize(loss, global_step=global_step)

        sess = tf.Session(config=tf.ConfigProto(gpu_options=tf.GPUOptions(
            allow_growth=True)))
        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver(tf.global_variables())

        if arg_dict['restore_ckpt']:
            # Only the trainable variables are restored; everything else
            # keeps its freshly initialised value.
            variables_to_restore = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES)
            restorer = tf.train.Saver(variables_to_restore)
            restorer.restore(sess, arg_dict['restore_ckpt'])
            print('Resume-trained model restored from: %s' %
                  arg_dict['restore_ckpt'])

        tf.train.write_graph(sess.graph.as_graph_def(), '.',
                             os.path.join(prefix, 'graph_struct.txt'))

        print("Start to training...")
        start_time = time.time()
        while not _batch_reader.should_stop():
            with tf.device('/gpu:0'):
                # Built-in next() works on both Python 2 and Python 3
                # iterators; the ``.next()`` method is Python 2 only.
                batch = next(_batch_generator)
                _, ploss, step, lr = sess.run(
                    [train_op, loss, global_step, learning_rate],
                    feed_dict={
                        images: batch[0],
                        point_labels: batch[1]
                    })
                if step % log_every == 0:
                    end_time = time.time()
                    cost_time, start_time = end_time - start_time, end_time
                    sample_per_sec = int(log_every * batch_size / cost_time)
                    sec_per_step = cost_time / float(log_every)
                    print(
                        '[%s] epochs: %d, step: %d, lr: %f, landmark_loss: %.4f, sample/s: %d, sec/step: %.3f'
                        % (datetime.datetime.now().strftime("%Y%m%d_%H%M%S"),
                           _batch_reader.get_epoch(), step, lr, ploss,
                           sample_per_sec, sec_per_step))
            if step % save_every == 0:
                saver.save(sess, checkpoint_path)
                print('Saved checkpoint to %s' % checkpoint_path)

        # Final checkpoint once the reader signals the end of training.
        saver.save(sess, checkpoint_path)
        print('\nReview training parameter:\n%s\n' % (str(arg_dict)))
        print('Saved checkpoint to %s' % checkpoint_path)
        print('Bye Bye!')
Esempio n. 7
0
def train(prefix, **arg_dict):
    """Train a PyTorch model in model-parallel or data-parallel mode.

    Args:
        prefix: Directory that receives the ``tf_summary`` folder and
            ``model.ckpt``.
        **arg_dict: Training options.  This function reads ``gpu_device``,
            ``batch_size``, ``parallel_mode``, ``model_params`` (JSON string),
            ``img_size``, ``feature_dim``, ``label_num``, ``model``,
            ``restore_ckpt`` and ``learning_rate``; the whole dict is also
            forwarded to ``BatchReader``.

    Raises:
        Exception: for an unknown ``parallel_mode``, or for model-parallel
            mode with a single GPU.
    """
    import shutil

    gpu_num = len(arg_dict["gpu_device"].split(','))
    batch_size = arg_dict["batch_size"]
    common_dict = {"global_step": 1}
    print("batch_size = %d for gpu_num = %d" % (batch_size, gpu_num))
    if arg_dict["parallel_mode"] == "ModelParallel":
        print("Working on model parallel.")
        if gpu_num <= 1:
            raise Exception("Model parallel only support more than 2 gpu number")
    elif arg_dict["parallel_mode"] == "DataParallel":
        print("Working on data parallel")
    else:
        raise Exception("Unsupport parallel mode. see --help")

    # Create the summary writer; tensorboard logging is best-effort and is
    # simply disabled when it cannot be set up.
    try:
        from tensorboardX import SummaryWriter
        summary_dir = os.path.join(prefix, "tf_summary")
        if os.path.exists(summary_dir):
            print("Delete old summary in first.")
            # rmtree avoids building a shell command from a path; errors are
            # ignored just as ``rm -rf`` ignored them.
            shutil.rmtree(summary_dir, ignore_errors=True)
        common_dict["tensorboard_writer"] = SummaryWriter(summary_dir)
        print("Enable tensorboard summary.")
        print("Please using 'python -m tensorboard.main --logdir={}'".format(summary_dir))
    except Exception:
        common_dict["tensorboard_writer"] = None
        print("Disable tensorboard summary. please install tensorboardX in first.")
        print("Easy to install by 'pip install tensorboardX --user'")

    # batch generator
    _batch_reader = BatchReader(**arg_dict)
    _batch_generator = _batch_reader.batch_generator()

    # net
    model_params = json.loads(arg_dict["model_params"])
    model_params["image_size"] = arg_dict["img_size"]
    model_params["feature_dim"] = arg_dict["feature_dim"]
    model_params["class_num"] = arg_dict["label_num"]
    net = models.init(arg_dict["model"], gpu_num=gpu_num, model_params=model_params,
                      parallel_mode=arg_dict["parallel_mode"], common_dict=common_dict)
    if arg_dict["parallel_mode"] == "DataParallel":
        net = nn.DataParallel(net)
        net.cuda()
    if arg_dict["restore_ckpt"]:
        print("Resotre ckpt from {}".format(arg_dict["restore_ckpt"]))
        net.load_state_dict(torch.load(arg_dict["restore_ckpt"]))

    # optimizer
    optimizer = optim.SGD(net.parameters(), lr=arg_dict['learning_rate'],
                          momentum=0.9, weight_decay=5e-4)
    lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=20000, gamma=0.95)

    # start loop
    print("Start to training...")
    start_time = time.time()
    display = 100           # steps between progress lines
    save_every = 10000      # steps between checkpoints
    loss_list = []
    writer = common_dict["tensorboard_writer"]
    while not _batch_reader.should_stop():
        #  prepare data
        batch_st = time.time()
        # Built-in next() works on both Python 2 and Python 3 iterators;
        # the ``.next()`` method is Python 2 only.
        batch = next(_batch_generator)
        datas = batch[0].cuda()
        labels = batch[1].cuda()
        batch_et = time.time()
        #  forward and backward (the net returns the loss itself)
        loss = net(datas, labels)
        loss = loss.mean()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # Indexing a 0-dim tensor with ``[0]`` is rejected by newer PyTorch
        # releases; use item() when the installed version provides it.
        lossd = loss.item() if hasattr(loss, "item") else loss.data[0]
        #  display
        loss_list.append(lossd)
        if common_dict["global_step"] % display == 0:
            end_time = time.time()
            cost_time, start_time = end_time - start_time, end_time
            sample_per_sec = int(display * batch_size / cost_time)
            sec_per_step = cost_time / float(display)
            loss_display = np.mean(loss_list)
            lr = optimizer.param_groups[0]['lr']
            print('[%s] epochs: %d, step: %d, lr: %.5f, loss: %.5f, '
                  'sample/s: %d, sec/step: %.3f, batch time: %.3fs' % (
                      datetime.datetime.now().strftime("%Y%m%d_%H%M%S"),
                      _batch_reader.get_epoch(), common_dict["global_step"], lr, loss_display,
                      sample_per_sec, sec_per_step, batch_et - batch_st))
            loss_list = []
            if writer is not None:
                writer.add_scalar("loss", loss_display, common_dict["global_step"])
                writer.add_scalar("sample_per_sec", sample_per_sec,
                                  common_dict["global_step"])
                writer.add_scalar("lr", lr, common_dict["global_step"])
        if common_dict["global_step"] % save_every == 0:
            # save checkpoint
            checkpoint_path = os.path.join(prefix, 'model.ckpt')
            torch.save(net.state_dict(), checkpoint_path)
            print("save checkpoint to %s" % checkpoint_path)
        lr_scheduler.step()
        common_dict["global_step"] += 1
Esempio n. 8
0
def main(argv=None):
    """Train a model on one well's data, run cascade inference, and save results.

    Pipeline: load the well dictionary, preprocess and scale the target
    well, build train/validation batches, fit the model epoch by epoch,
    plot the loss history, predict forward from ``FLAGS.OBSERVATION_DAY``
    on the ``FLAGS.TRUE_MODEL`` data, then plot the prediction and write the
    MAPE to ``FLAGS.RESULT_FILENAME``.

    Args:
        argv: Not used in the body; all configuration is read from ``FLAGS``.
    """
    random.seed(2)
    print("Num GPUs Available: ",
          len(tf.config.experimental.list_physical_devices('GPU')))

    # Load data and preprocess data
    print("Loading data...")
    data_reader = DataReader(FLAGS.DATA_PATH, FLAGS.DATA_FILENAME,
                             FLAGS.NUM_MODEL)
    well_dic = data_reader.create_well_dictionary()

    print("Preprocessing data...")
    # The dictionaries are indexed by the string form of the flag values.
    target_well = well_dic[str(FLAGS.WELL_TO_LEARN)]
    test_model_data = target_well[str(FLAGS.TRUE_MODEL)]

    preprocessor = Preprocessor(FLAGS.NUM_MODEL, FLAGS.TRUE_MODEL)
    well_data_zero_removed = preprocessor.remove_zero_wopr(target_well)
    serialized_data, end_indice = preprocessor.serialize_well_dataframe(
        well_data_zero_removed)
    scaled_data, scaler = preprocessor.scale_serialzed_data(serialized_data)

    # Split dataset and prepare batch
    batch_reader = BatchReader(scaled_data=scaled_data,
                               end_indice=end_indice,
                               train_split=FLAGS.TRAIN_SPLIT,
                               true_model=FLAGS.TRUE_MODEL,
                               buffer_size=FLAGS.BUFFER_SIZE,
                               batch_size=FLAGS.BATCH_SIZE)

    train_data = batch_reader.get_train_batch()
    val_data = batch_reader.get_val_batch()
    train_total_seq_length = batch_reader.get_seq_length()

    # Define Model
    print("Defining model...")
    model_builder = ModelBuilder(FLAGS.BATCH_SIZE)
    model = model_builder.contruct_model()
    model.summary()

    # Set Training callbacks
    history_logger = HistoryLogger()

    # Train the model
    print("Begin training the model...")
    # One fit() call per epoch so that the model state can be reset between
    # epochs.
    for epoch_idx in range(FLAGS.EPOCHS):
        print('epochs : ' + str(epoch_idx + 1))
        # NOTE(review): ``/`` yields a float under Python 3 — confirm that
        # the Keras version in use accepts a non-integer steps_per_epoch.
        model.fit(train_data,
                  epochs=1,
                  steps_per_epoch=train_total_seq_length / FLAGS.BATCH_SIZE,
                  verbose=2,
                  validation_data=val_data,
                  validation_steps=100,
                  use_multiprocessing=True,
                  callbacks=[history_logger])
        model.reset_states()

    # Save fig of loss history
    print("Saving loss history")
    plotter = Plotter(FLAGS.EPOCHS, FLAGS.WELL_TO_LEARN, FLAGS.TRUE_MODEL)
    plotter.plot_loss_history(history_logger.losses, history_logger.val_losses)

    # Inference (Cascade)
    print("Starting inference...")
    # Scale the held-out model's data with the scaler returned by the
    # preprocessor above.
    test_data = scaler.transform(test_model_data.values)
    total_timestep = test_data.shape[0]
    test_x, test_y = batch_reader.get_test_input_and_label(test_data)

    # First input: the BATCH_SIZE entries of test_x ending just before
    # OBSERVATION_DAY.
    seq_in = test_x[FLAGS.OBSERVATION_DAY -
                    FLAGS.BATCH_SIZE:FLAGS.OBSERVATION_DAY, :, :]
    # Output sequence starts with values taken from the first INPUT_SEQUENCE
    # entries of test_x followed by the labels up to and including
    # OBSERVATION_DAY; predictions are appended to it below.
    seq_out = test_x[:FLAGS.INPUT_SEQUENCE, :1, :].flatten().tolist(
    ) + test_y[:FLAGS.OBSERVATION_DAY + 1].tolist()

    pred_count = test_x.shape[0] - FLAGS.OBSERVATION_DAY

    # Do Inference from Observationday
    # The loop starts at 1, so it performs pred_count - 1 predictions.
    for i in range(1, pred_count):
        sample_in = seq_in
        pred_out = model.predict(sample_in)
        # Only the last row of the prediction is kept.
        seq_out.append(pred_out[-1, :].item())
        # Slide the input window forward by one entry of test_x.
        seq_in = test_x[FLAGS.OBSERVATION_DAY - FLAGS.BATCH_SIZE +
                        i:FLAGS.OBSERVATION_DAY + i, :, :]

    model.reset_states()

    # Evaluate
    print("Start evaluating the model...")
    seq_out_array = np.asarray(seq_out)
    # Map the outputs back using the scaler's first-column ``min_`` and
    # ``scale_`` — presumably the inverse of the scaling applied above;
    # TODO confirm against Preprocessor.
    prediction_val = (seq_out_array - scaler.min_[0]) / scaler.scale_[0]
    true_val = test_model_data['WOPR'].to_numpy()

    # Plot prediction result
    print("Saving prediction result...")
    plotter.plot_prediction(total_timestep, true_val, prediction_val)

    # Calculate error and save into file
    print("Calculate MAPE and save it to result file...")
    result_handler = ResultHandler(true_val=true_val,
                                   pred_val=prediction_val,
                                   well_to_learn=FLAGS.WELL_TO_LEARN,
                                   true_model=FLAGS.TRUE_MODEL)
    result_handler.save_mape_to_csv(FLAGS.RESULT_FILENAME)

    # Clear Session
    tf.keras.backend.clear_session()
    print("Done")
Esempio n. 9
0
def train(prefix, **arg_dict):
    """Train a landmark (and optionally angle) model with MXNet Gluon.

    Args:
        prefix: Directory that receives ``model.params``.
        **arg_dict: Training options.  This function reads
            ``landmark_type``, ``train_angle``, ``gpu_device``,
            ``batch_size`` (per GPU; overwritten in place with the total
            batch size), ``restore_ckpt`` and ``learning_rate``; the whole
            dict is also forwarded to ``BatchReader`` and ``models.init``.
    """
    num_labels = arg_dict['landmark_type'] * 2
    train_angle = arg_dict['train_angle']
    gpu_num = len(arg_dict["gpu_device"].split(','))
    batch_size = arg_dict['batch_size'] * gpu_num
    arg_dict['batch_size'] = batch_size
    print("real batch_size = %d for gpu_num = %d" % (batch_size, gpu_num))
    # batch generator
    _batch_reader = BatchReader(**arg_dict)
    _batch_generator = _batch_reader.batch_generator()
    # net
    ctx = [mx.gpu(i) for i in range(gpu_num)]
    net = models.init(num_label=num_labels, **arg_dict)
    if arg_dict["restore_ckpt"]:
        # print() with one argument behaves the same on Python 2 and 3; the
        # bare print statement is a syntax error on Python 3.
        print("resotre checkpoint from %s" % (arg_dict["restore_ckpt"]))
        net.load_params(arg_dict['restore_ckpt'], ctx=ctx)
    else:
        net.initialize(init=mx.init.Xavier(), ctx=ctx)
    print(net)
    # loss
    losses_func = []
    if train_angle:
        losses_func.append(gluon.loss.L2Loss(weight=0.5))  # landmark
        losses_func.append(gluon.loss.L2Loss(weight=0.5))  # angle
    else:
        losses_func.append(gluon.loss.L2Loss())  # landmark
    # trainer
    trainer = gluon.Trainer(net.collect_params(), "adam",
                            {"learning_rate": arg_dict['learning_rate']})
    # start loop
    print("Start to training...")
    start_time = time.time()
    # NOTE(review): step starts at 0, so the first iteration already logs,
    # decays the learning rate and writes a checkpoint — confirm intended.
    step = 0
    display = 10
    loss_list = []
    while not _batch_reader.should_stop():
        # Built-in next() works on both Python 2 and Python 3 iterators;
        # the ``.next()`` method is Python 2 only.
        batch = next(_batch_generator)
        image = nd.array(batch[0])
        # Move channels to axis 1 and rescale pixel values by 1/127.5 - 1.
        image = nd.transpose(image.astype('float32'),
                             (0, 3, 1, 2)) / 127.5 - 1.0
        image_list = gluon.utils.split_and_load(image, ctx)
        landmark = nd.array(batch[1])
        landmark_list = gluon.utils.split_and_load(landmark, ctx)
        if train_angle:
            angle = nd.array(batch[2])
            angle_list = gluon.utils.split_and_load(angle, ctx)
        with autograd.record():
            losses = []
            if train_angle:
                for _i, _l, _a in zip(image_list, landmark_list, angle_list):
                    predicts = net(_i)
                    landmark_loss = losses_func[0](predicts[0], _l)
                    angle_loss = losses_func[1](predicts[1], _a)
                    losses.append(landmark_loss + angle_loss)
            else:
                for _i, _l in zip(image_list, landmark_list):
                    predicts = net(_i)
                    landmark_loss = losses_func[0](predicts, _l)
                    losses.append(landmark_loss)
        for loss in losses:
            loss.backward()
        trainer.step(batch_size)
        loss_list.append(np.mean([nd.mean(l).asscalar() for l in losses]))
        nd.waitall()
        if step % display == 0:
            end_time = time.time()
            cost_time, start_time = end_time - start_time, end_time
            sample_per_sec = int(display * batch_size / cost_time)
            sec_per_step = cost_time / float(display)
            loss_display = "[landmark: %.5f]" % (np.mean(loss_list))
            print('[%s] epochs: %d, step: %d, lr: %.5f, loss: %s,'
                  'sample/s: %d, sec/step: %.3f' % (
                      datetime.datetime.now().strftime("%Y%m%d_%H%M%S"),
                      _batch_reader.get_epoch(), step, trainer.learning_rate, loss_display,
                      sample_per_sec, sec_per_step))
            loss_list = []
        if step % 1024 == 0:
            # change lr
            trainer.set_learning_rate(trainer.learning_rate * 0.95)
            # save checkpoint
            checkpoint_path = os.path.join(prefix, 'model.params')
            net.save_params(checkpoint_path)
            print("save checkpoint to %s" % checkpoint_path)
        step += 1
Esempio n. 10
0
class ImageInputDataLayer(caffe.Layer):
    """Caffe Python layer that feeds image/landmark batches from a BatchReader.

    Tops:
        data:  ``batch_size x num_channels x img_size x img_size``
        label: ``batch_size x (2 * num_points)``
    """

    def setup(self, bottom, top):
        """Parse ``param_str``, optionally start the reader, and size the tops.

        ``param_str`` is evaluated into a dict that must provide
        ``batch_size``, ``landmark_type``, ``img_format`` ('RGB' or 'GRAY'),
        ``need_reader`` and ``img_size``.

        Raises:
            Exception: if ``img_format`` is neither 'RGB' nor 'GRAY'.
        """
        self.top_names = ['data', 'label']

        # NOTE(review): eval() executes arbitrary code, so param_str has to
        # come from a trusted network definition.
        self.params = eval(self.param_str)

        self.batch_size = self.params['batch_size']

        # Number of landmark points; the label top holds two values per point.
        self.num_points = self.params['landmark_type']

        # Channel count follows from the image format.
        if self.params['img_format'] == 'RGB':
            self.num_channels = 3
        elif self.params['img_format'] == 'GRAY':
            self.num_channels = 1
        else:
            raise Exception("Unsupport img_format ...")

        # The reader is optional so that it can be switched off for testing.
        # Without it ``forward`` has no generator to pull batches from.
        if self.params['need_reader']:
            self.batch_reader = BatchReader(**self.params)
            self.batch_generator = self.batch_reader.batch_generator()

        self._reshape_tops(top)

    def _reshape_tops(self, top):
        """Give the data and label tops their fixed shapes."""
        side = self.params['img_size']
        top[0].reshape(self.batch_size, self.num_channels, side, side)
        top[1].reshape(self.batch_size, self.num_points * 2)

    def preProcessImage(self, imgs):
        """Standardise every image to zero mean and unit standard deviation.

        Args:
            imgs: batch of images, N x 1 x W x H.

        Returns:
            A new float32 array; the input array is not modified.  An image
            whose pixels are all equal has zero standard deviation, so its
            result is not finite.
        """
        imgs = imgs.astype(np.float32)
        for i, img in enumerate(imgs):
            m = img.mean()
            s = img.std()
            imgs[i] = (img - m) / s
        return imgs

    def forward(self, bottom, top):
        """Copy the next (images, labels) batch into the tops."""
        # Built-in next() works on both Python 2 and Python 3 iterators,
        # whereas the ``.next()`` method only exists on Python 2.
        images, labels = next(self.batch_generator)
        top[0].data[...] = images
        top[1].data[...] = labels

    def reshape(self, bottom, top):
        """Re-apply the fixed top shapes."""
        self._reshape_tops(top)

    def backward(self, top, propagate_down, bottom):
        """Data layers do not back-propagate."""
        pass
Esempio n. 11
0
class Seq2SeqAttentionTrain(object):
    """Training and evaluation driver for the seq2seq attention model."""

    def __init__(self, model_config, data_config, data_loader):
        """Store the configs and loader, then build the batch reader and model."""
        self.model_config = model_config
        self.data_config = data_config

        self.data_loader = data_loader
        self.batch_reader = BatchReader(self.model_config, self.data_config, self.data_loader)

        self.model = Seq2SeqAttentionModel(self.model_config, self.data_loader.word2vec_vectors)

    @staticmethod
    def _smooth_loss(loss, running_avg_loss, decay=0.999):
        """Return the exponentially decayed average of the loss, capped at 12.

        A previous average of 0 means "no history yet", in which case the
        current loss is used as the starting value.
        """
        if running_avg_loss == 0:
            smoothed = loss
        else:
            # Blend the history with the NEW loss value.
            smoothed = running_avg_loss * decay + (1 - decay) * loss
        return min(smoothed, 12)

    def _next_feed_dict(self):
        """Fetch the next batch and map it onto the model's placeholders."""
        (article_batch, abstract_batch, target_batch, article_batch_lens,
         _dec_output_lens, loss_weights, _, _) = self.batch_reader.next_batch()
        return {
            self.model.article: article_batch,
            self.model.abstract: abstract_batch,
            self.model.targets: target_batch,
            self.model.article_length: article_batch_lens,
            self.model.loss_weights: loss_weights}

    def running_avg_loss(self, loss, running_avg_loss, summary_writer, step, decay=0.999):
        """
        calculate the running average of losses and log it as a summary.
        :param loss: current runtime loss
        :param running_avg_loss: running average so far (0 when there is none)
        :param summary_writer: tensorflow summary writer
        :param step: running step
        :param decay: weight kept by the previous running average
        :return: updated running average loss (capped at 12)
        """
        running_avg_loss = self._smooth_loss(loss, running_avg_loss, decay)

        loss_sum = tf.Summary()
        loss_sum.value.add(tag='running_avg_loss', simple_value=running_avg_loss)
        summary_writer.add_summary(loss_sum, step)

        return running_avg_loss

    def train(self):
        """
        train model until the supervisor stops or ``max_step`` is reached.

        The summary writer uses ``train_dir`` while the supervisor uses
        ``log_path``, so summaries and checkpoints are configured separately.
        :return:
        """
        with tf.device('/cpu:0'):
            saver = tf.train.Saver()
            summary_writer = tf.summary.FileWriter(self.model_config.train_dir)

            sv = tf.train.Supervisor(logdir=self.model_config.log_path,
                                     is_chief=True,
                                     saver=saver,
                                     summary_op=None,
                                     save_model_secs=self.model_config.save_model_secs,
                                     global_step=self.model.global_step)
            # managed_session() returns a context manager rather than a
            # session; prepare_or_wait_for_session() returns the session.
            session = sv.prepare_or_wait_for_session(
                config=tf.ConfigProto(allow_soft_placement=True))

            running_avg_loss = 0
            step = 0

            to_return = [self.model.optim, self.model.summarise,
                         self.model.loss, self.model.global_step]
            while not sv.should_stop() and step < self.model_config.max_step:
                result = session.run(to_return, feed_dict=self._next_feed_dict())

                # result[2] is the current loss, result[3] the global step.
                running_avg_loss = self.running_avg_loss(
                    result[2], running_avg_loss, summary_writer, step)

                summary_writer.add_summary(result[1], result[3])
                step += 1

                if step % 100 == 0:
                    summary_writer.flush()

                print('{0} step, loss is {1}'.format(str(step), str(running_avg_loss)))

            sv.stop()

    def eval(self):
        """
        evaluate model: once a minute restore the latest checkpoint and score
        one batch. Runs until interrupted.
        :return:
        """
        saver = tf.train.Saver()
        summary_writer = tf.summary.FileWriter(self.model_config.eval_dir)
        sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
        running_avg_loss = 0
        step = 0

        to_return = [self.model.summarise, self.model.loss, self.model.global_step]
        while True:
            time.sleep(60)

            # NOTE(review): checkpoints are looked up under ``log_root`` while
            # train() passes ``log_path`` to the supervisor and the message
            # below mentions ``train_dir`` — confirm these agree.
            try:
                ckpt_state = tf.train.get_checkpoint_state(self.model_config.log_root)
            except tf.errors.OutOfRangeError as e:
                tf.logging.error('Cannot restore checkpoint: %s', e)
                continue

            if not (ckpt_state and ckpt_state.model_checkpoint_path):
                tf.logging.info('No model to eval yet at %s', self.model_config.train_dir)
                continue

            tf.logging.info('Loading checkpoint %s', ckpt_state.model_checkpoint_path)
            saver.restore(sess, ckpt_state.model_checkpoint_path)

            result = sess.run(to_return, feed_dict=self._next_feed_dict())

            # result[1] is the current loss, result[2] the global step.
            summary_writer.add_summary(result[0], result[2])
            running_avg_loss = self.running_avg_loss(
                result[1], running_avg_loss, summary_writer, result[2])

            # Count evaluations so that the flush really happens every 100.
            step += 1
            if step % 100 == 0:
                summary_writer.flush()

            print('{0} step, loss is {1}'.format(str(result[2]), str(running_avg_loss)))
Esempio n. 12
0
def train(prefix, **arg_dict):
    """Train a model with MXNet Gluon on one or more GPUs.

    Args:
        prefix: Directory that receives ``model.params``.
        **arg_dict: Training options.  This function reads ``gpu_device``,
            ``batch_size`` (per GPU; overwritten in place with the total
            batch size), ``model_params`` (JSON string), ``feature_dim``,
            ``label_num``, ``model``, ``restore_ckpt`` and
            ``learning_rate``; the whole dict is also forwarded to
            ``BatchReader``.
    """
    gpu_num = len(arg_dict["gpu_device"].split(','))
    batch_size = arg_dict['batch_size'] * gpu_num
    arg_dict['batch_size'] = batch_size
    print("real batch_size = %d for gpu_num = %d" % (batch_size, gpu_num))
    # batch generator
    _batch_reader = BatchReader(**arg_dict)
    _batch_generator = _batch_reader.batch_generator()
    # net
    ctx = [mx.gpu(i) for i in range(gpu_num)]
    model_params = json.loads(arg_dict["model_params"])
    model_params["feature_dim"] = arg_dict["feature_dim"]
    model_params["label_num"] = arg_dict["label_num"]
    net = models.init(arg_dict["model"], model_params=model_params)
    # Parameters are always initialised first; a checkpoint, if given, then
    # overwrites whatever it contains (missing/extra entries are tolerated).
    net.initialize(init=mx.init.Xavier(), ctx=ctx)
    if arg_dict["restore_ckpt"]:
        # print() with one argument behaves the same on Python 2 and 3; the
        # bare print statement is a syntax error on Python 3.
        print("resotre checkpoint from %s" % (arg_dict["restore_ckpt"]))
        net.load_params(arg_dict['restore_ckpt'], ctx=ctx, allow_missing=True, ignore_extra=True)
    print(net)
    # trainer
    trainer = gluon.Trainer(net.collect_params(), "sgd",  # adam
                            {"learning_rate": arg_dict['learning_rate']})
    # start loop
    print("Start to training...")
    start_time = time.time()
    step = 1
    display = 100
    loss_list = []
    while not _batch_reader.should_stop():
        # Built-in next() works on both Python 2 and Python 3 iterators;
        # the ``.next()`` method is Python 2 only.
        batch = next(_batch_generator)
        data = nd.array(batch[0], dtype='float32')
        data = nd.transpose(data, (0, 3, 1, 2))
        label = nd.array(batch[1], dtype='float32')
        data_list = gluon.utils.split_and_load(data, ctx)
        label_list = gluon.utils.split_and_load(label, ctx)
        #  normalization, in-place operation: (x - 127.5) / 128
        for i in range(gpu_num):
            data_list[i] -= 127.5
            data_list[i] *= 0.0078125
        # forward (the net returns the loss itself)
        with autograd.record():
            losses = [net(x, y) for x, y in zip(data_list, label_list)]
        for l in losses:
            l.backward()
        trainer.step(batch_size)
        loss = np.mean([nd.mean(l).asscalar() for l in losses])
        loss_list.append(loss)
        nd.waitall()
        if step % display == 0:
            end_time = time.time()
            cost_time, start_time = end_time - start_time, end_time
            sample_per_sec = int(display * batch_size / cost_time)
            sec_per_step = cost_time / float(display)
            loss_display = "[loss: %.5f]" % (np.mean(loss_list))
            print('[%s] epochs: %d, step: %d, lr: %.5f, loss: %s,'
                  'sample/s: %d, sec/step: %.3f' % (
                      datetime.datetime.now().strftime("%Y%m%d_%H%M%S"),
                      _batch_reader.get_epoch(), step, trainer.learning_rate, loss_display,
                      sample_per_sec, sec_per_step))
            loss_list = []
        if step % 500000 == 0:
            # change lr
            trainer.set_learning_rate(trainer.learning_rate * 0.95)
            print("change lr to %f" % trainer.learning_rate)
        if step % 100000 == 0:
            # save checkpoint
            checkpoint_path = os.path.join(prefix, 'model.params')
            net.save_params(checkpoint_path)
            print("save checkpoint to %s" % checkpoint_path)
        step += 1
Esempio n. 13
0
def train(prefix, **arg_dict):
    """Train the landmark model with synchronous multi-GPU towers.

    One tower is built per entry of ``arg_dict['gpu_device']``. Every step the
    fed batch is split evenly across the towers, the per-tower gradients are
    averaged with ``average_gradients`` and applied once to the shared
    variables. Progress is printed every 10 global steps and a checkpoint is
    written every 1024 global steps, plus once more when the reader reports
    that training should stop.

    Args:
        prefix: Directory in which ``model.ckpt`` is saved.
        **arg_dict: Training configuration. Keys read here:
            ``batch_size`` (total batch size across all GPUs),
            ``landmark_type`` (the label width is twice this value),
            ``img_size`` (input height and width),
            ``gpu_device`` (comma separated integers, e.g. ``"0,1"``),
            ``learning_rate`` (initial rate for the exponential decay),
            ``model`` (forwarded to ``tower_loss``),
            ``restore_ckpt`` (checkpoint path to resume from; falsy to skip).
            The whole dictionary is also forwarded to ``BatchReader``.

    Raises:
        AssertionError: If ``batch_size`` is not divisible by the GPU count.
    """
    batch_size = arg_dict['batch_size']
    num_labels = arg_dict['landmark_type'] * 2
    img_size = arg_dict['img_size']
    # Build a real list: on Python 3 ``map`` returns a lazy iterator that has
    # no ``len``, which would break the divisibility check and the splits.
    gpu_list = [int(gpu_id) for gpu_id in arg_dict['gpu_device'].split(',')]
    num_gpus = len(gpu_list)
    assert (batch_size % num_gpus == 0), "Batch size must exact division by gpu nums"

    with tf.Graph().as_default(), tf.device('/cpu:0'):
        # data input
        # NOTE(review): the channel count is fixed at 3 here regardless of any
        # image format option in arg_dict -- confirm grayscale is not expected.
        images = tf.placeholder(tf.float32, shape=[batch_size, img_size, img_size, 3])
        labels = tf.placeholder(tf.float32, shape=[batch_size, num_labels])
        images_split = tf.split(images, num_gpus, axis=0)
        labels_split = tf.split(labels, num_gpus, axis=0)
        # Create a variable to count the number of train() calls.
        global_step = tf.get_variable(
            'global_step', [],
            initializer=tf.constant_initializer(0), trainable=False)
        # Decay the learning rate exponentially based on the number of steps.
        lr = tf.train.exponential_decay(arg_dict['learning_rate'],
                                        global_step,
                                        30000,
                                        0.8,
                                        staircase=True)
        # Create an optimizer that performs gradient descent.
        optimizer = tf.train.AdamOptimizer(lr)
        # Calculate the gradients for each model tower.
        tower_grads = []
        with tf.variable_scope(tf.get_variable_scope()):
            # NOTE(review): towers are placed on '/gpu:0'..'/gpu:N-1' by
            # position, not on the ids listed in gpu_device -- presumably the
            # caller restricts visible devices; confirm.
            for i in range(num_gpus):
                with tf.device('/gpu:%d' % i):
                    with tf.name_scope('%s_%d' % ("landmarks", i)) as scope:
                        loss = tower_loss(scope, images_split[i], labels_split[i], arg_dict['model'], num_labels)
                        # Later towers reuse the variables created by the first.
                        tf.get_variable_scope().reuse_variables()
                        # Calculate the gradients for the batch of data on this tower.
                        grads = optimizer.compute_gradients(loss)
                        # Keep track of the gradients across all towers.
                        tower_grads.append(grads)
        # We must calculate the mean of each gradient. Note that this is the
        # synchronization point across all towers.
        grads = average_gradients(tower_grads)
        # Apply the gradients to adjust the shared variables.
        apply_gradient_op = optimizer.apply_gradients(grads, global_step=global_step)
        # Track the moving averages of all trainable variables.
        variable_averages = tf.train.ExponentialMovingAverage(0.9999, global_step)
        variables_averages_op = variable_averages.apply(tf.trainable_variables())
        # Group all updates into a single train op.
        train_op = tf.group(apply_gradient_op, variables_averages_op)
        # Build an initialization operation to run below.
        init = tf.global_variables_initializer()
        # Start running operations on the Graph. allow_soft_placement must be set to
        # True to build towers on GPU, as some of the ops do not have GPU
        # implementations.
        sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True,
                          gpu_options=tf.GPUOptions(allow_growth=True)))
        # Release the session (and the device memory it holds) even when
        # training is interrupted by an exception.
        try:
            sess.run(init)
            # Create a saver.
            saver = tf.train.Saver(tf.global_variables())
            if arg_dict['restore_ckpt']:
                # Only trainable variables are restored; everything else
                # (e.g. global_step) keeps its freshly initialized value.
                variables_to_restore = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
                restorer = tf.train.Saver(variables_to_restore)
                restorer.restore(sess, arg_dict['restore_ckpt'])
                print('Resume-trained model restored from: %s' % arg_dict['restore_ckpt'])

            print("Start to training...")
            # batch generator
            _batch_reader = BatchReader(**arg_dict)
            _batch_generator = _batch_reader.batch_generator()
            checkpoint_path = os.path.join(prefix, 'model.ckpt')
            start_time = time.time()
            while not _batch_reader.should_stop():
                # Builtin next() works on both Python 2 and Python 3 iterators.
                batch = next(_batch_generator)
                # ``loss`` is the loss tensor of the last tower built above.
                _, _loss, _step, _lr = sess.run([train_op, loss, global_step, lr],
                                                feed_dict={images: batch[0], labels: batch[1]})
                if _step % 10 == 0:
                    end_time = time.time()
                    cost_time, start_time = end_time - start_time, end_time
                    sample_per_sec = int(10 * batch_size / cost_time)
                    sec_per_step = cost_time / 10.0
                    print('[%s] epochs: %d, step: %d, lr: %f, landmark_loss: %.6f, sample/s: %d, sec/step: %.3f' % (
                          datetime.datetime.now().strftime("%Y%m%d_%H%M%S"),
                          _batch_reader.get_epoch(), _step, _lr, _loss, sample_per_sec, sec_per_step))
                if _step % 1024 == 0:
                    saver.save(sess, checkpoint_path)
                    print('Saved checkpoint to %s' % checkpoint_path)
            # Final checkpoint once the reader signals the end of training.
            saver.save(sess, checkpoint_path)
            print('\nReview training parameter:\n%s\n' % (str(arg_dict)))
            print('Saved checkpoint to %s' % checkpoint_path)
            print('Bye Bye!')
        finally:
            sess.close()