def setup(self, bottom, top): self.top_names = ['data', 'label'] # === Read input parameters === # params is a python dictionary with layer parameters. self.params = eval(self.param_str) # store input as class variables self.batch_size = self.params['batch_size'] # store landmark_type self.num_points = self.params['landmark_type'] # store data channels if self.params['img_format'] == 'RGB': self.num_channels = 3 elif self.params['img_format'] == 'GRAY': self.num_channels = 1 else: raise Exception("Unsupport img_format ...") # Create a batch loader to load the images. # we can disable reader when test if self.params['need_reader']: self.batch_reader = BatchReader(**self.params) self.batch_generator = self.batch_reader.batch_generator() # === reshape tops === top[0].reshape(self.batch_size, self.num_channels, self.params['img_size'], self.params['img_size']) top[1].reshape(self.batch_size, self.num_points * 2)
def __init__(self, model_config, data_config, data_loader):
    """Keep the configuration objects and build the batch reader and model.

    Args:
        model_config: model configuration, passed to both the reader and the model.
        data_config: data configuration, passed to the reader.
        data_loader: loader object; its ``word2vec_vectors`` attribute is
            handed to the model.
    """
    # Remember the three collaborators first.
    self.model_config, self.data_config, self.data_loader = (
        model_config, data_config, data_loader)

    # Both helpers are built from the objects stored above.
    self.batch_reader = BatchReader(model_config, data_config, data_loader)
    embeddings = data_loader.word2vec_vectors
    self.model = Seq2SeqAttentionModel(model_config, embeddings)
def setup(self, bottom, top): self.top_names = ['data', 'label'] # === Read input parameters === # params is a python dictionary with layer parameters. self.params = eval(self.param_str) # store input as class variables self.batch_size = self.params['batch_size'] # Create a batch loader to load the images. # we can disable reader when test if self.params['need_reader']: self.batch_reader = BatchReader(**self.params) self.batch_generator = self.batch_reader.batch_generator() # === reshape tops === top[0].reshape(self.batch_size, 3, self.params['img_size'], self.params['img_size']) top[1].reshape(self.batch_size, 136, 1, 1)
def test__partition_into_data_sets(self):
    """Partitioning must yield the expected split sizes and keep every sample."""
    all_X = np.random.rand(13, 7)
    all_y = np.random.rand(13)

    # make the call
    data = BatchReader()._partition_into_data_sets(all_X, all_y, 0.8)

    # check that each data set has expected size
    exp_data_set_sizes = {'train': 10, 'validation': 1, 'test': 2}
    for name in ['train', 'validation', 'test']:
        expected_rows = exp_data_set_sizes[name]
        features, targets = data[name]
        np.testing.assert_array_equal(features.shape, (expected_rows, 7))
        np.testing.assert_array_equal(targets.shape, expected_rows)

    # check that no data is lost or added but only reordered and partitioned
    merged_X = np.concatenate([features for features, _ in data.values()])
    merged_y = np.concatenate([targets for _, targets in data.values()])
    for merged, original in ((merged_X, all_X), (merged_y, all_y)):
        np.testing.assert_array_equal(np.sort(merged, axis=0),
                                      np.sort(original, axis=0))
class ImageInputDataLayer(caffe.Layer):
    """Caffe Python layer producing a 3-channel image top and a 10-value label top.

    Batches come from a ``BatchReader`` built from the layer's ``param_str``
    when its ``need_reader`` entry is truthy.
    """

    def setup(self, bottom, top):
        """Parse ``param_str``, optionally create the reader, and shape the tops."""
        self.top_names = ['data', 'label']

        # === Read input parameters ===
        # NOTE(review): eval() executes arbitrary code found in param_str;
        # only use this layer with trusted network definitions.
        self.params = eval(self.param_str)
        self.batch_size = self.params['batch_size']

        # Always define the reader attributes so forward() can report a clear
        # error when the reader was disabled.
        self.batch_reader = None
        self.batch_generator = None
        if self.params['need_reader']:
            self.batch_reader = BatchReader(**self.params)
            self.batch_generator = self.batch_reader.batch_generator()

        self._reshape_tops(top)

    def _reshape_tops(self, top):
        """Apply the fixed output shapes shared by setup() and reshape()."""
        img_size = self.params['img_size']
        top[0].reshape(self.batch_size, 3, img_size, img_size)
        top[1].reshape(self.batch_size, 10)

    def forward(self, bottom, top):
        """Load the next batch into the tops.

        Raises:
            RuntimeError: if the layer was set up with ``need_reader`` falsy.
        """
        if self.batch_generator is None:
            raise RuntimeError(
                "ImageInputDataLayer has no batch reader (need_reader was false)")
        # next() works on both Python 2 and 3; generator.next() is Python-2-only.
        images, labels = next(self.batch_generator)
        top[0].data[...] = images
        top[1].data[...] = labels

    def reshape(self, bottom, top):
        """Re-apply the fixed output shapes."""
        self._reshape_tops(top)

    def backward(self, top, propagate_down, bottom):
        """This layer does not back-propagate."""
        pass
def train(prefix, **arg_dict):
    """Train a landmark regression model with TensorFlow 1.x on a single GPU.

    Args:
        prefix: output directory for ``graph_struct.txt`` and ``model.ckpt``.
        **arg_dict: training options.  Keys read here: ``batch_size``,
            ``landmark_type``, ``img_size``, ``model``, ``learning_rate`` and
            ``restore_ckpt``; the whole dict is also forwarded to
            ``BatchReader``.

    The loop runs until ``BatchReader.should_stop()`` returns true, logging
    every 10 steps and checkpointing every 1024 steps and once at the end.
    """
    batch_size = arg_dict['batch_size']
    num_labels = arg_dict['landmark_type'] * 2  # two label values per landmark
    img_size = arg_dict['img_size']

    # batch generator
    _batch_reader = BatchReader(**arg_dict)
    _batch_generator = _batch_reader.batch_generator()

    with tf.Graph().as_default():
        images = tf.placeholder(tf.float32,
                                shape=[batch_size, img_size, img_size, 3])
        point_labels = tf.placeholder(tf.float32,
                                      shape=[batch_size, num_labels])

        logits = models.init(arg_dict['model'], images, num_labels,
                             is_training=True)
        loss = models.get_l2_loss(logits, point_labels, batch_size)

        # Create a variable to track the global step.
        global_step = tf.Variable(0, name='global_step', trainable=False)
        # Halve the learning rate every 30000 steps.
        learning_rate = tf.train.exponential_decay(
            arg_dict['learning_rate'], global_step, 30000, 0.5, staircase=True)

        # Use the optimizer to apply the gradients that minimize the loss
        # (and also increment the global step counter) as a single training step.
        optimizer = tf.train.AdamOptimizer(learning_rate)
        train_op = optimizer.minimize(loss, global_step=global_step)

        sess = tf.Session(config=tf.ConfigProto(
            gpu_options=tf.GPUOptions(allow_growth=True)))
        init = tf.global_variables_initializer()
        sess.run(init)
        saver = tf.train.Saver(tf.global_variables())

        if arg_dict['restore_ckpt']:
            # Only trainable variables are restored, so global_step and the
            # optimizer slots start fresh.
            variables_to_restore = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES)
            restorer = tf.train.Saver(variables_to_restore)
            restorer.restore(sess, arg_dict['restore_ckpt'])
            print('Resume-trained model restored from: %s' %
                  arg_dict['restore_ckpt'])

        tf.train.write_graph(sess.graph.as_graph_def(), '.',
                             os.path.join(prefix, 'graph_struct.txt'))

        print("Start to training...")
        start_time = time.time()
        while not _batch_reader.should_stop():
            with tf.device('/gpu:0'):
                # next() works on both Python 2 and 3; generator.next() is
                # Python-2-only.
                batch = next(_batch_generator)
                _, ploss, step, lr = sess.run(
                    [train_op, loss, global_step, learning_rate],
                    feed_dict={
                        images: batch[0],
                        point_labels: batch[1]
                    })
            if step % 10 == 0:
                end_time = time.time()
                cost_time, start_time = end_time - start_time, end_time
                sample_per_sec = int(10 * batch_size / cost_time)
                sec_per_step = cost_time / 10.0
                print(
                    '[%s] epochs: %d, step: %d, lr: %f, landmark_loss: %.4f, sample/s: %d, sec/step: %.3f'
                    % (datetime.datetime.now().strftime("%Y%m%d_%H%M%S"),
                       _batch_reader.get_epoch(), step, lr, ploss,
                       sample_per_sec, sec_per_step))
            if step % 1024 == 0:
                checkpoint_path = os.path.join(prefix, 'model.ckpt')
                saver.save(sess, checkpoint_path)
                print('Saved checkpoint to %s' % checkpoint_path)

        # Final checkpoint once the reader signals the end of training.
        checkpoint_path = os.path.join(prefix, 'model.ckpt')
        saver.save(sess, checkpoint_path)
        print('\nReview training parameter:\n%s\n' % (str(arg_dict)))
        print('Saved checkpoint to %s' % checkpoint_path)
        print('Bye Bye!')
def train(prefix, **arg_dict):
    """Train a PyTorch model in model-parallel or data-parallel mode.

    Args:
        prefix: output directory for the ``tf_summary`` folder and
            ``model.ckpt``.
        **arg_dict: training options.  Keys read here: ``img_size``,
            ``gpu_device`` (comma-separated), ``batch_size``,
            ``parallel_mode`` (``"ModelParallel"`` or ``"DataParallel"``),
            ``model``, ``model_params`` (JSON string), ``feature_dim``,
            ``label_num``, ``restore_ckpt`` and ``learning_rate``; the whole
            dict is also forwarded to ``BatchReader``.

    Raises:
        Exception: for an unknown ``parallel_mode``, or for model-parallel
            mode with fewer than two GPUs.
    """
    import shutil  # local import: only needed to clear the old summary dir

    img_size = arg_dict['img_size']
    gpu_num = len(arg_dict["gpu_device"].split(','))
    batch_size = arg_dict["batch_size"]
    # Shared mutable state handed to the model; global_step starts at 1.
    common_dict = {"global_step": 1}
    print("batch_size = %d for gpu_num = %d" % (batch_size, gpu_num))

    if arg_dict["parallel_mode"] == "ModelParallel":
        print("Working on model parallel.")
        if gpu_num <= 1:
            raise Exception("Model parallel only support more than 2 gpu number")
    elif arg_dict["parallel_mode"] == "DataParallel":
        print("Working on data parallel")
    else:
        raise Exception("Unsupport parallel mode. see --help")

    # Create the tensorboard summary writer (best effort: any failure simply
    # disables summaries).
    try:
        from tensorboardX import SummaryWriter
        summary_dir = os.path.join(prefix, "tf_summary")
        if os.path.exists(summary_dir):
            print("Delete old summary in first.")
            # rmtree avoids building a shell command from a path, which broke
            # on paths containing spaces or shell metacharacters.
            shutil.rmtree(summary_dir, ignore_errors=True)
        common_dict["tensorboard_writer"] = SummaryWriter(summary_dir)
        print("Enable tensorboard summary.")
        print("Please using 'python -m tensorboard.main --logdir={}'".format(summary_dir))
    except Exception:
        common_dict["tensorboard_writer"] = None
        print("Disable tensorboard summary. please install tensorboardX in first.")
        print("Easy to install by 'pip install tensorboardX --user'")

    # batch generator
    _batch_reader = BatchReader(**arg_dict)
    _batch_generator = _batch_reader.batch_generator()

    # net
    model_params = json.loads(arg_dict["model_params"])
    model_params["image_size"] = img_size
    model_params["feature_dim"] = arg_dict["feature_dim"]
    model_params["class_num"] = arg_dict["label_num"]
    net = models.init(arg_dict["model"], gpu_num=gpu_num,
                      model_params=model_params,
                      parallel_mode=arg_dict["parallel_mode"],
                      common_dict=common_dict)
    if arg_dict["parallel_mode"] == "DataParallel":
        net = nn.DataParallel(net)
        net.cuda()
    # NOTE(review): the collapsed source does not show whether net.cuda() is
    # restricted to the data-parallel branch — confirm.

    if arg_dict["restore_ckpt"]:
        print("Resotre ckpt from {}".format(arg_dict["restore_ckpt"]))
        net.load_state_dict(torch.load(arg_dict["restore_ckpt"]))

    # optimizer
    optimizer = optim.SGD(net.parameters(), lr=arg_dict['learning_rate'],
                          momentum=0.9, weight_decay=5e-4)
    lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=20000, gamma=0.95)

    # start loop
    print("Start to training...")
    start_time = time.time()
    display = 100
    loss_list = []
    log_format = ('[%s] epochs: %d, step: %d, lr: %.5f, loss: %.5f, '
                  'sample/s: %d, sec/step: %.3f, batch time: %.3fs')
    while not _batch_reader.should_stop():
        # prepare data
        batch_st = time.time()
        # next() works on both Python 2 and 3; generator.next() is Python-2-only.
        batch = next(_batch_generator)
        datas = batch[0].cuda()
        labels = batch[1].cuda()
        batch_et = time.time()

        # forward and backward; the network itself returns the loss
        loss = net(datas, labels)
        loss = loss.mean()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # .item() replaces loss.data[0], which raises on 0-dim tensors in
        # PyTorch >= 0.5.
        lossd = loss.item()

        # display
        loss_list.append(lossd)
        step = common_dict["global_step"]
        if step % display == 0:
            end_time = time.time()
            cost_time, start_time = end_time - start_time, end_time
            sample_per_sec = int(display * batch_size / cost_time)
            sec_per_step = cost_time / float(display)
            loss_display = np.mean(loss_list)
            lr = optimizer.param_groups[0]['lr']
            print(log_format % (
                datetime.datetime.now().strftime("%Y%m%d_%H%M%S"),
                _batch_reader.get_epoch(), step, lr, loss_display,
                sample_per_sec, sec_per_step, batch_et - batch_st))
            loss_list = []
            writer = common_dict["tensorboard_writer"]
            if writer is not None:
                writer.add_scalar("loss", loss_display, step)
                writer.add_scalar("sample_per_sec", sample_per_sec, step)
                writer.add_scalar("lr", lr, step)

        if step % 10000 == 0:
            # save checkpoint
            checkpoint_path = os.path.join(prefix, 'model.ckpt')
            torch.save(net.state_dict(), checkpoint_path)
            print("save checkpoint to %s" % checkpoint_path)

        lr_scheduler.step()
        common_dict["global_step"] += 1
def main(argv=None):
    """Train a model on one well's data, run cascade inference, and save results.

    All settings come from the module-level ``FLAGS`` object.  The steps are:
    load and preprocess the data, train for ``FLAGS.EPOCHS`` single-epoch
    ``fit`` calls, plot the loss history, predict step by step from
    ``FLAGS.OBSERVATION_DAY`` onward, then plot the prediction and write the
    MAPE to ``FLAGS.RESULT_FILENAME``.

    Args:
        argv: unused.
    """
    random.seed(2)
    print("Num GPUs Available: ",
          len(tf.config.experimental.list_physical_devices('GPU')))

    # Load data and preprocess data
    print("Loading data...")
    data_reader = DataReader(FLAGS.DATA_PATH, FLAGS.DATA_FILENAME,
                             FLAGS.NUM_MODEL)
    well_dic = data_reader.create_well_dictionary()

    print("Preprocessing data...")
    target_well = well_dic[str(FLAGS.WELL_TO_LEARN)]
    test_model_data = target_well[str(FLAGS.TRUE_MODEL)]
    preprocessor = Preprocessor(FLAGS.NUM_MODEL, FLAGS.TRUE_MODEL)
    well_data_zero_removed = preprocessor.remove_zero_wopr(target_well)
    serialized_data, end_indice = preprocessor.serialize_well_dataframe(
        well_data_zero_removed)
    scaled_data, scaler = preprocessor.scale_serialzed_data(serialized_data)

    # Split dataset and prepare batch
    batch_reader = BatchReader(scaled_data=scaled_data,
                               end_indice=end_indice,
                               train_split=FLAGS.TRAIN_SPLIT,
                               true_model=FLAGS.TRUE_MODEL,
                               buffer_size=FLAGS.BUFFER_SIZE,
                               batch_size=FLAGS.BATCH_SIZE)
    train_data = batch_reader.get_train_batch()
    val_data = batch_reader.get_val_batch()
    train_total_seq_length = batch_reader.get_seq_length()

    # Define Model
    print("Defining model...")
    model_builder = ModelBuilder(FLAGS.BATCH_SIZE)
    model = model_builder.contruct_model()
    model.summary()

    # Set Training callbacks
    history_logger = HistoryLogger()

    # Keras documents steps_per_epoch as an integer; true division produced a
    # float on Python 3.  Use floor division, with at least one step.
    steps_per_epoch = max(1, train_total_seq_length // FLAGS.BATCH_SIZE)

    # Train the model, one epoch per fit() call so that the model states can
    # be reset between epochs.
    print("Begin training the model...")
    for epoch_idx in range(FLAGS.EPOCHS):
        print('epochs : ' + str(epoch_idx + 1))
        model.fit(train_data,
                  epochs=1,
                  steps_per_epoch=steps_per_epoch,
                  verbose=2,
                  validation_data=val_data,
                  validation_steps=100,
                  use_multiprocessing=True,
                  callbacks=[history_logger])
        model.reset_states()

    # Save fig of loss history
    print("Saving loss history")
    plotter = Plotter(FLAGS.EPOCHS, FLAGS.WELL_TO_LEARN, FLAGS.TRUE_MODEL)
    plotter.plot_loss_history(history_logger.losses, history_logger.val_losses)

    # Inference (Cascade)
    print("Starting inference...")
    test_data = scaler.transform(test_model_data.values)
    total_timestep = test_data.shape[0]
    test_x, test_y = batch_reader.get_test_input_and_label(test_data)

    # First input window: the BATCH_SIZE samples ending at the observation day.
    seq_in = test_x[FLAGS.OBSERVATION_DAY - FLAGS.BATCH_SIZE:
                    FLAGS.OBSERVATION_DAY, :, :]
    # Output starts with the first input sequence plus the labels up to and
    # including the observation day; predictions are appended afterwards.
    seq_out = test_x[:FLAGS.INPUT_SEQUENCE, :1, :].flatten().tolist(
    ) + test_y[:FLAGS.OBSERVATION_DAY + 1].tolist()
    pred_count = test_x.shape[0] - FLAGS.OBSERVATION_DAY

    # Do Inference from Observationday
    for i in range(1, pred_count):
        sample_in = seq_in
        pred_out = model.predict(sample_in)
        # Only the prediction for the last sample of the window is kept.
        seq_out.append(pred_out[-1, :].item())
        # Slide the window forward by one sample.
        seq_in = test_x[FLAGS.OBSERVATION_DAY - FLAGS.BATCH_SIZE + i:
                        FLAGS.OBSERVATION_DAY + i, :, :]
        # NOTE(review): the collapsed source does not show whether this reset
        # sits inside the loop (as in training) or after it — confirm.
        model.reset_states()

    # Evaluate
    print("Start evaluating the model...")
    seq_out_array = np.asarray(seq_out)
    # Undo the scaler's transform for the first feature column.
    prediction_val = (seq_out_array - scaler.min_[0]) / scaler.scale_[0]
    true_val = test_model_data['WOPR'].to_numpy()

    # Plot prediction result
    print("Saving prediction result...")
    plotter.plot_prediction(total_timestep, true_val, prediction_val)

    # Calculate error and save into file
    print("Calculate MAPE and save it to result file...")
    result_handler = ResultHandler(true_val=true_val,
                                   pred_val=prediction_val,
                                   well_to_learn=FLAGS.WELL_TO_LEARN,
                                   true_model=FLAGS.TRUE_MODEL)
    result_handler.save_mape_to_csv(FLAGS.RESULT_FILENAME)

    # Clear Session
    tf.keras.backend.clear_session()
    print("Done")
def train(prefix, **arg_dict):
    """Train a landmark (and optionally angle) regressor with MXNet Gluon.

    Args:
        prefix: output directory for ``model.params``.
        **arg_dict: training options.  Keys read here: ``landmark_type``,
            ``img_size``, ``train_angle``, ``gpu_device`` (comma-separated),
            ``batch_size`` (per GPU; overwritten in place with the total),
            ``restore_ckpt`` and ``learning_rate``; the whole dict is also
            forwarded to ``BatchReader`` and ``models.init``.

    The loop runs until ``BatchReader.should_stop()`` returns true.  Every
    1024 steps the learning rate is multiplied by 0.95 and a checkpoint is
    written.
    """
    num_labels = arg_dict['landmark_type'] * 2  # two label values per landmark
    img_size = arg_dict['img_size']
    train_angle = arg_dict['train_angle']
    gpu_num = len(arg_dict["gpu_device"].split(','))
    # The configured batch size is per GPU; the reader gets the total.
    batch_size = arg_dict['batch_size'] * gpu_num
    arg_dict['batch_size'] = batch_size
    print("real batch_size = %d for gpu_num = %d" % (batch_size, gpu_num))

    # batch generator
    _batch_reader = BatchReader(**arg_dict)
    _batch_generator = _batch_reader.batch_generator()

    # net
    ctx = [mx.gpu(i) for i in range(gpu_num)]
    net = models.init(num_label=num_labels, **arg_dict)
    if arg_dict["restore_ckpt"]:
        # print() with a single argument behaves the same on Python 2 and 3.
        print("resotre checkpoint from %s" % (arg_dict["restore_ckpt"]))
        net.load_params(arg_dict['restore_ckpt'], ctx=ctx)
    else:
        net.initialize(init=mx.init.Xavier(), ctx=ctx)
    print(net)

    # loss
    losses_func = []
    if train_angle:
        losses_func.append(gluon.loss.L2Loss(weight=0.5))  # landmark
        losses_func.append(gluon.loss.L2Loss(weight=0.5))  # angle
    else:
        losses_func.append(gluon.loss.L2Loss())  # landmark

    # trainer
    trainer = gluon.Trainer(net.collect_params(), "adam",
                            {"learning_rate": arg_dict['learning_rate']})

    # start loop
    print("Start to training...")
    start_time = time.time()
    # NOTE(review): step starts at 0, so the first iteration already logs,
    # decays the learning rate and writes a checkpoint — confirm intended.
    step = 0
    display = 10
    loss_list = []
    log_format = ('[%s] epochs: %d, step: %d, lr: %.5f, loss: %s,'
                  'sample/s: %d, sec/step: %.3f')
    while not _batch_reader.should_stop():
        # next() works on both Python 2 and 3; generator.next() is Python-2-only.
        batch = next(_batch_generator)

        # Reorder axes (0, 3, 1, 2) and map pixel values through x/127.5 - 1.
        image = nd.array(batch[0])
        image = nd.transpose(image.astype('float32'), (0, 3, 1, 2)) / 127.5 - 1.0
        image_list = gluon.utils.split_and_load(image, ctx)
        landmark = nd.array(batch[1])
        landmark_list = gluon.utils.split_and_load(landmark, ctx)
        if train_angle:
            angle = nd.array(batch[2])
            angle_list = gluon.utils.split_and_load(angle, ctx)

        with autograd.record():
            losses = []
            if train_angle:
                for _i, _l, _a in zip(image_list, landmark_list, angle_list):
                    predicts = net(_i)
                    landmark_loss = losses_func[0](predicts[0], _l)
                    angle_loss = losses_func[1](predicts[1], _a)
                    losses.append(landmark_loss + angle_loss)
            else:
                for _i, _l in zip(image_list, landmark_list):
                    predicts = net(_i)
                    landmark_loss = losses_func[0](predicts, _l)
                    losses.append(landmark_loss)
        for loss in losses:
            loss.backward()
        trainer.step(batch_size)
        loss_list.append(np.mean([nd.mean(l).asscalar() for l in losses]))
        nd.waitall()

        if step % display == 0:
            end_time = time.time()
            cost_time, start_time = end_time - start_time, end_time
            sample_per_sec = int(display * batch_size / cost_time)
            sec_per_step = cost_time / float(display)
            loss_display = "[landmark: %.5f]" % (np.mean(loss_list))
            print(log_format % (
                datetime.datetime.now().strftime("%Y%m%d_%H%M%S"),
                _batch_reader.get_epoch(), step, trainer.learning_rate,
                loss_display, sample_per_sec, sec_per_step))
            loss_list = []

        if step % 1024 == 0:
            # change lr
            trainer.set_learning_rate(trainer.learning_rate * 0.95)
            # save checkpoint
            checkpoint_path = os.path.join(prefix, 'model.params')
            net.save_params(checkpoint_path)
            print("save checkpoint to %s" % checkpoint_path)
        step += 1
class ImageInputDataLayer(caffe.Layer):
    """Caffe Python layer producing an image top and a landmark-label top.

    The channel count follows the ``img_format`` parameter and the label
    width is twice ``landmark_type``.  Batches come from a ``BatchReader``
    built from the layer's ``param_str`` when ``need_reader`` is truthy.
    """

    def setup(self, bottom, top):
        """Parse ``param_str``, optionally create the reader, and shape the tops.

        Raises:
            ValueError: if ``img_format`` is neither ``'RGB'`` nor ``'GRAY'``.
        """
        self.top_names = ['data', 'label']

        # === Read input parameters ===
        # NOTE(review): eval() executes arbitrary code found in param_str;
        # only use this layer with trusted network definitions.
        self.params = eval(self.param_str)
        self.batch_size = self.params['batch_size']
        self.num_points = self.params['landmark_type']

        img_format = self.params['img_format']
        if img_format == 'RGB':
            self.num_channels = 3
        elif img_format == 'GRAY':
            self.num_channels = 1
        else:
            # ValueError is still an Exception, so existing handlers keep working.
            raise ValueError("Unsupport img_format: %r" % (img_format,))

        # Always define the reader attributes so forward() can report a clear
        # error when the reader was disabled.
        self.batch_reader = None
        self.batch_generator = None
        if self.params['need_reader']:
            self.batch_reader = BatchReader(**self.params)
            self.batch_generator = self.batch_reader.batch_generator()

        self._reshape_tops(top)

    def _reshape_tops(self, top):
        """Apply the output shapes shared by setup() and reshape()."""
        img_size = self.params['img_size']
        top[0].reshape(self.batch_size, self.num_channels, img_size, img_size)
        # Each landmark contributes two label values.
        top[1].reshape(self.batch_size, self.num_points * 2)

    def preProcessImage(self, imgs):
        """Standardise every image to zero mean and unit standard deviation.

        Args:
            imgs: array of images, iterated along its first axis
                (documented by the original author as N x 1 x W x H).

        Returns:
            A float32 copy of ``imgs`` with each image normalised on its own.
            A constant image (zero std) becomes all zeros, not NaN.
        """
        imgs = imgs.astype(np.float32)
        for i, img in enumerate(imgs):
            m = img.mean()
            s = img.std()
            if s == 0:
                # Avoid dividing by zero for constant images.
                s = 1.0
            imgs[i] = (img - m) / s
        return imgs

    def forward(self, bottom, top):
        """Load the next batch into the tops.

        Raises:
            RuntimeError: if the layer was set up with ``need_reader`` falsy.
        """
        if self.batch_generator is None:
            raise RuntimeError(
                "ImageInputDataLayer has no batch reader (need_reader was false)")
        # next() works on both Python 2 and 3; generator.next() is Python-2-only.
        images, labels = next(self.batch_generator)
        top[0].data[...] = images
        top[1].data[...] = labels

    def reshape(self, bottom, top):
        """Re-apply the output shapes."""
        self._reshape_tops(top)

    def backward(self, top, propagate_down, bottom):
        """This layer does not back-propagate."""
        pass
class Seq2SeqAttentionTrain(object):
    """Training and evaluation driver for ``Seq2SeqAttentionModel`` (TF 1.x)."""

    def __init__(self, model_config, data_config, data_loader):
        """Store the configuration and build the batch reader and the model."""
        self.model_config = model_config
        self.data_config = data_config
        self.data_loader = data_loader
        self.batch_reader = BatchReader(self.model_config, self.data_config,
                                        self.data_loader)
        self.model = Seq2SeqAttentionModel(self.model_config,
                                           self.data_loader.word2vec_vectors)

    @staticmethod
    def _smoothed_loss(loss, running_avg_loss, decay):
        """Return the exponentially decayed average of the loss, capped at 12."""
        if running_avg_loss == 0:
            # A zero average means no history yet: start from the current loss.
            smoothed = loss
        else:
            # The new sample must be the current loss; the original mixed the
            # average with itself, so it never moved.
            smoothed = running_avg_loss * decay + (1 - decay) * loss
        return min(smoothed, 12)

    def running_avg_loss(self, loss, running_avg_loss, summary_writer, step,
                         decay=0.999):
        """
        Update the running average of the loss and log it as a summary.

        :param loss: loss of the current step
        :param running_avg_loss: running average so far (0 means "no history")
        :param summary_writer: tensorflow summary writer
        :param step: step under which the summary is recorded
        :param decay: weight kept from the previous average
        :return: updated running average, capped at 12
        """
        running_avg_loss = self._smoothed_loss(loss, running_avg_loss, decay)
        loss_sum = tf.Summary()
        loss_sum.value.add(tag='running_avg_loss',
                           simple_value=running_avg_loss)
        summary_writer.add_summary(loss_sum, step)
        return running_avg_loss

    def train(self):
        """
        Train the model until the supervisor stops or ``max_step`` is reached.

        Summaries go to ``model_config.train_dir``; the supervisor's logdir is
        ``model_config.log_path``.
        :return:
        """
        # NOTE(review): eval() reads checkpoints from model_config.log_root
        # while the supervisor here writes to model_config.log_path — confirm
        # both point at the same directory.
        with tf.device('/cpu:0'):
            saver = tf.train.Saver()
            summary_writer = tf.summary.FileWriter(self.model_config.train_dir)
            # The keyword is ``is_chief``; the misspelt ``is_cheif`` raised a
            # TypeError.
            sv = tf.train.Supervisor(
                logdir=self.model_config.log_path,
                is_chief=True,
                saver=saver,
                summary_op=None,
                save_model_secs=self.model_config.save_model_secs,
                global_step=self.model.global_step)
            # managed_session() returns a context manager, not a session;
            # prepare_or_wait_for_session() returns a usable session.
            session = sv.prepare_or_wait_for_session(
                config=tf.ConfigProto(allow_soft_placement=True))

            running_avg_loss = 0
            step = 0
            to_return = [self.model.optim, self.model.summarise,
                         self.model.loss, self.model.global_step]
            while not sv.should_stop() and step < self.model_config.max_step:
                (article_batch, abstract_batch, target_batch,
                 article_batch_lens, dec_output_lens, loss_weights,
                 _, _) = self.batch_reader.next_batch()
                result = session.run(to_return, feed_dict={
                    self.model.article: article_batch,
                    self.model.abstract: abstract_batch,
                    self.model.targets: target_batch,
                    self.model.article_length: article_batch_lens,
                    self.model.loss_weights: loss_weights})

                # result = [optim, summaries, loss, global_step].  The current
                # loss goes first, matching running_avg_loss's signature.
                running_avg_loss = self.running_avg_loss(
                    result[2], running_avg_loss, summary_writer, step)
                summary_writer.add_summary(result[1], result[3])
                step += 1
                if step % 100 == 0:
                    summary_writer.flush()
                    print('{0} step, loss is {1}'.format(str(step),
                                                         str(running_avg_loss)))
            sv.stop()

    def eval(self):
        """
        Evaluate the latest checkpoint once a minute, forever.

        Each pass restores the newest checkpoint under
        ``model_config.log_root``, runs one batch, and writes summaries to
        ``model_config.eval_dir``.
        :return:
        """
        saver = tf.train.Saver()
        summary_writer = tf.summary.FileWriter(self.model_config.eval_dir)
        sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
        running_avg_loss = 0
        # step is never incremented, so the flush/print below runs on every
        # pass (one pass per 60-second sleep).
        step = 0
        to_return = [self.model.summarise, self.model.loss,
                     self.model.global_step]
        while True:
            time.sleep(60)
            try:
                ckpt_state = tf.train.get_checkpoint_state(
                    self.model_config.log_root)
            except tf.errors.OutOfRangeError as e:
                tf.logging.error('Cannot restore checkpoint: %s', e)
                continue

            if not (ckpt_state and ckpt_state.model_checkpoint_path):
                tf.logging.info('No model to eval yet at %s',
                                self.model_config.train_dir)
                continue

            tf.logging.info('Loading checkpoint %s',
                            ckpt_state.model_checkpoint_path)
            saver.restore(sess, ckpt_state.model_checkpoint_path)

            (article_batch, abstract_batch, target_batch, article_batch_lens,
             dec_output_lens, loss_weights, _, _) = self.batch_reader.next_batch()
            result = sess.run(to_return, feed_dict={
                self.model.article: article_batch,
                self.model.abstract: abstract_batch,
                self.model.targets: target_batch,
                self.model.article_length: article_batch_lens,
                self.model.loss_weights: loss_weights})

            # result = [summaries, loss, global_step].
            summary_writer.add_summary(result[0], result[2])
            running_avg_loss = self.running_avg_loss(
                result[1], running_avg_loss, summary_writer, result[2])
            if step % 100 == 0:
                summary_writer.flush()
                print('{0} step, loss is {1}'.format(str(result[2]),
                                                     str(running_avg_loss)))
def train(prefix, **arg_dict):
    """Train a model with MXNet Gluon on one or more GPUs.

    Args:
        prefix: output directory for ``model.params``.
        **arg_dict: training options.  Keys read here: ``img_size``,
            ``gpu_device`` (comma-separated), ``batch_size`` (per GPU;
            overwritten in place with the total), ``model``,
            ``model_params`` (JSON string), ``feature_dim``, ``label_num``,
            ``restore_ckpt`` and ``learning_rate``; the whole dict is also
            forwarded to ``BatchReader``.

    The loop runs until ``BatchReader.should_stop()`` returns true, logging
    every 100 steps, decaying the learning rate by 0.95 every 500000 steps
    and checkpointing every 100000 steps.
    """
    img_size = arg_dict['img_size']
    gpu_num = len(arg_dict["gpu_device"].split(','))
    # The configured batch size is per GPU; the reader gets the total.
    batch_size = arg_dict['batch_size'] * gpu_num
    arg_dict['batch_size'] = batch_size
    print("real batch_size = %d for gpu_num = %d" % (batch_size, gpu_num))

    # batch generator
    _batch_reader = BatchReader(**arg_dict)
    _batch_generator = _batch_reader.batch_generator()

    # net
    ctx = [mx.gpu(i) for i in range(gpu_num)]
    model_params = json.loads(arg_dict["model_params"])
    model_params["feature_dim"] = arg_dict["feature_dim"]
    model_params["label_num"] = arg_dict["label_num"]
    net = models.init(arg_dict["model"], model_params=model_params)
    if arg_dict["restore_ckpt"]:
        # print() with a single argument behaves the same on Python 2 and 3;
        # the original used a Python-2-only print statement here.
        print("resotre checkpoint from %s" % (arg_dict["restore_ckpt"]))
        # Initialise first so parameters absent from the checkpoint
        # (allow_missing=True) still have values.
        net.initialize(init=mx.init.Xavier(), ctx=ctx)
        net.load_params(arg_dict['restore_ckpt'], ctx=ctx,
                        allow_missing=True, ignore_extra=True)
    else:
        net.initialize(init=mx.init.Xavier(), ctx=ctx)
    print(net)

    # trainer
    trainer = gluon.Trainer(net.collect_params(), "sgd",  # adam
                            {"learning_rate": arg_dict['learning_rate']})

    # start loop
    print("Start to training...")
    start_time = time.time()
    step = 1
    display = 100
    loss_list = []
    log_format = ('[%s] epochs: %d, step: %d, lr: %.5f, loss: %s,'
                  'sample/s: %d, sec/step: %.3f')
    while not _batch_reader.should_stop():
        # next() works on both Python 2 and 3; generator.next() is Python-2-only.
        batch = next(_batch_generator)
        data = nd.array(batch[0], dtype='float32')
        data = nd.transpose(data, (0, 3, 1, 2))
        label = nd.array(batch[1], dtype='float32')
        data_list = gluon.utils.split_and_load(data, ctx)
        label_list = gluon.utils.split_and_load(label, ctx)

        # normalization, in-place operation: (x - 127.5) / 128
        for i in range(gpu_num):
            data_list[i] -= 127.5
            data_list[i] *= 0.0078125

        # forward; the network itself returns the loss
        with autograd.record():
            losses = [net(x, y) for x, y in zip(data_list, label_list)]
        for l in losses:
            l.backward()
        trainer.step(batch_size)
        loss = np.mean([nd.mean(l).asscalar() for l in losses])
        loss_list.append(loss)
        nd.waitall()

        if step % display == 0:
            end_time = time.time()
            cost_time, start_time = end_time - start_time, end_time
            sample_per_sec = int(display * batch_size / cost_time)
            sec_per_step = cost_time / float(display)
            loss_display = "[loss: %.5f]" % (np.mean(loss_list))
            print(log_format % (
                datetime.datetime.now().strftime("%Y%m%d_%H%M%S"),
                _batch_reader.get_epoch(), step, trainer.learning_rate,
                loss_display, sample_per_sec, sec_per_step))
            loss_list = []

        if step % 500000 == 0:
            # change lr
            trainer.set_learning_rate(trainer.learning_rate * 0.95)
            print("change lr to %f" % trainer.learning_rate)

        if step % 100000 == 0:
            # save checkpoint
            checkpoint_path = os.path.join(prefix, 'model.params')
            net.save_params(checkpoint_path)
            print("save checkpoint to %s" % checkpoint_path)
        step += 1
def train(prefix, **arg_dict):
    """Train a landmark regression model with TensorFlow 1.x on several GPUs.

    One tower is built per entry of ``gpu_device``; the per-tower gradients
    are averaged and applied once per step.

    Args:
        prefix: output directory for ``model.ckpt``.
        **arg_dict: training options.  Keys read here: ``batch_size`` (total
            across GPUs), ``landmark_type``, ``img_size``, ``gpu_device``
            (comma-separated integers), ``learning_rate``, ``model`` and
            ``restore_ckpt``; the whole dict is also forwarded to
            ``BatchReader``.

    Raises:
        AssertionError: if ``batch_size`` is not divisible by the GPU count.
    """
    batch_size = arg_dict['batch_size']
    num_labels = arg_dict['landmark_type'] * 2  # two label values per landmark
    img_size = arg_dict['img_size']
    # A real list is needed: on Python 3 map() returns an iterator with no len().
    gpu_list = [int(gpu) for gpu in arg_dict['gpu_device'].split(',')]
    num_gpus = len(gpu_list)
    assert (batch_size % num_gpus == 0), "Batch size must exact division by gpu nums"

    with tf.Graph().as_default(), tf.device('/cpu:0'):
        # data input
        images = tf.placeholder(tf.float32,
                                shape=[batch_size, img_size, img_size, 3])
        labels = tf.placeholder(tf.float32, shape=[batch_size, num_labels])
        images_split = tf.split(images, num_gpus, axis=0)
        labels_split = tf.split(labels, num_gpus, axis=0)

        # Create a variable to count the number of train() calls.
        global_step = tf.get_variable(
            'global_step', [], initializer=tf.constant_initializer(0),
            trainable=False)
        # Decay the learning rate exponentially based on the number of steps.
        lr = tf.train.exponential_decay(arg_dict['learning_rate'], global_step,
                                        30000, 0.8, staircase=True)
        # Create an optimizer that performs gradient descent.
        optimizer = tf.train.AdamOptimizer(lr)

        # Calculate the gradients for each model tower.
        tower_grads = []
        with tf.variable_scope(tf.get_variable_scope()):
            # range() replaces the Python-2-only xrange().
            for i in range(num_gpus):
                # NOTE(review): towers are placed on /gpu:<index>, not on the
                # ids listed in gpu_device — presumably device visibility is
                # restricted elsewhere; confirm.
                with tf.device('/gpu:%d' % i):
                    with tf.name_scope('%s_%d' % ("landmarks", i)) as scope:
                        loss = tower_loss(scope, images_split[i],
                                          labels_split[i], arg_dict['model'],
                                          num_labels)
                        # Later towers reuse the first tower's variables.
                        tf.get_variable_scope().reuse_variables()
                        # Calculate the gradients for the batch of data on this tower.
                        grads = optimizer.compute_gradients(loss)
                        # Keep track of the gradients across all towers.
                        tower_grads.append(grads)

        # We must calculate the mean of each gradient. Note that this is the
        # synchronization point across all towers.
        grads = average_gradients(tower_grads)
        # Apply the gradients to adjust the shared variables.
        apply_gradient_op = optimizer.apply_gradients(grads,
                                                      global_step=global_step)
        # Track the moving averages of all trainable variables.
        variable_averages = tf.train.ExponentialMovingAverage(0.9999, global_step)
        variables_averages_op = variable_averages.apply(tf.trainable_variables())
        # Group all updates to into a single train op.
        train_op = tf.group(apply_gradient_op, variables_averages_op)

        # Build an initialization operation to run below.
        init = tf.global_variables_initializer()
        # Start running operations on the Graph. allow_soft_placement must be
        # set to True to build towers on GPU, as some of the ops do not have
        # GPU implementations.
        sess = tf.Session(config=tf.ConfigProto(
            allow_soft_placement=True,
            gpu_options=tf.GPUOptions(allow_growth=True)))
        sess.run(init)

        # Create a saver.
        saver = tf.train.Saver(tf.global_variables())
        if arg_dict['restore_ckpt']:
            variables_to_restore = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES)
            restorer = tf.train.Saver(variables_to_restore)
            restorer.restore(sess, arg_dict['restore_ckpt'])
            print('Resume-trained model restored from: %s' %
                  arg_dict['restore_ckpt'])

        print("Start to training...")
        # batch generator
        _batch_reader = BatchReader(**arg_dict)
        _batch_generator = _batch_reader.batch_generator()
        start_time = time.time()
        while not _batch_reader.should_stop():
            # next() works on both Python 2 and 3; generator.next() is
            # Python-2-only.
            batch = next(_batch_generator)
            # ``loss`` here is the loss tensor of the last tower built above.
            _, _loss, _step, _lr = sess.run(
                [train_op, loss, global_step, lr],
                feed_dict={images: batch[0], labels: batch[1]})
            if _step % 10 == 0:
                end_time = time.time()
                cost_time, start_time = end_time - start_time, end_time
                sample_per_sec = int(10 * batch_size / cost_time)
                sec_per_step = cost_time / 10.0
                print('[%s] epochs: %d, step: %d, lr: %f, landmark_loss: %.6f, sample/s: %d, sec/step: %.3f' % (
                    datetime.datetime.now().strftime("%Y%m%d_%H%M%S"),
                    _batch_reader.get_epoch(), _step, _lr, _loss,
                    sample_per_sec, sec_per_step))
            if _step % 1024 == 0:
                checkpoint_path = os.path.join(prefix, 'model.ckpt')
                saver.save(sess, checkpoint_path)
                print('Saved checkpoint to %s' % checkpoint_path)

        # Final checkpoint once the reader signals the end of training.
        checkpoint_path = os.path.join(prefix, 'model.ckpt')
        saver.save(sess, checkpoint_path)
        print('\nReview training parameter:\n%s\n' % (str(arg_dict)))
        print('Saved checkpoint to %s' % checkpoint_path)
        print('Bye Bye!')