import os
import time

import numpy as np
import paddle
import paddle.distributed as dist
import paddle.fluid as fluid

# NOTE: TrainingStats, save_model, build_dataloader, eval, eval_det_run,
# eval_rec_run, eval_cls_run, eval_run, cal_predicts_accuracy,
# cal_predicts_accuracy_srn, CharacterOps, create_module,
# create_multi_devices_program, load_pretrain, load_config, merge_config,
# check_gpu, reader, logger and FLAGS are project-internal helpers assumed
# to be importable from the surrounding package.


def train(config,
          train_dataloader,
          valid_dataloader,
          device,
          model,
          loss_class,
          optimizer,
          lr_scheduler,
          post_process_class,
          eval_class,
          pre_best_model_dict,
          logger,
          vdl_writer=None):
    cal_metric_during_train = config['Global'].get('cal_metric_during_train', False)
    log_smooth_window = config['Global']['log_smooth_window']
    epoch_num = config['Global']['epoch_num']
    print_batch_step = config['Global']['print_batch_step']
    eval_batch_step = config['Global']['eval_batch_step']

    global_step = 0
    start_eval_step = 0
    if isinstance(eval_batch_step, list) and len(eval_batch_step) >= 2:
        start_eval_step = eval_batch_step[0]
        eval_batch_step = eval_batch_step[1]
        if len(valid_dataloader) == 0:
            logger.info(
                'No images in the eval dataset, evaluation during training will be disabled'
            )
            start_eval_step = 1e111  # effectively never reached, so evaluation is skipped
        logger.info(
            "During the training process, after the {}th iteration, an evaluation is run every {} iterations"
            .format(start_eval_step, eval_batch_step))
    save_epoch_step = config['Global']['save_epoch_step']
    save_model_dir = config['Global']['save_model_dir']
    if not os.path.exists(save_model_dir):
        os.makedirs(save_model_dir)
    main_indicator = eval_class.main_indicator
    best_model_dict = {main_indicator: 0}
    best_model_dict.update(pre_best_model_dict)
    train_stats = TrainingStats(log_smooth_window, ['lr'])
    model_average = False
    model.train()

    use_srn = config['Architecture']['algorithm'] == "SRN"

    if 'start_epoch' in best_model_dict:
        start_epoch = best_model_dict['start_epoch']
    else:
        start_epoch = 1

    for epoch in range(start_epoch, epoch_num + 1):
        # rebuild the dataloader each epoch so shuffling is reseeded per epoch
        train_dataloader = build_dataloader(config, 'Train', device, logger, seed=epoch)
        train_batch_cost = 0.0
        train_reader_cost = 0.0
        batch_sum = 0
        batch_start = time.time()
        for idx, batch in enumerate(train_dataloader):
            train_reader_cost += time.time() - batch_start
            if idx >= len(train_dataloader):
                break
            lr = optimizer.get_lr()
            images = batch[0]
            if use_srn:
                others = batch[-4:]
                preds = model(images, others)
                model_average = True
            else:
                preds = model(images)
            loss = loss_class(preds, batch)
            avg_loss = loss['loss']
            avg_loss.backward()
            optimizer.step()
            optimizer.clear_grad()

            train_batch_cost += time.time() - batch_start
            batch_sum += len(images)

            if not isinstance(lr_scheduler, float):
                lr_scheduler.step()

            # logger and visualdl
            stats = {k: v.numpy().mean() for k, v in loss.items()}
            stats['lr'] = lr
            train_stats.update(stats)

            if cal_metric_during_train:  # only rec and cls need this
                batch = [item.numpy() for item in batch]
                post_result = post_process_class(preds, batch[1])
                eval_class(post_result, batch)
                metric = eval_class.get_metric()
                train_stats.update(metric)

            if vdl_writer is not None and dist.get_rank() == 0:
                for k, v in train_stats.get().items():
                    vdl_writer.add_scalar('TRAIN/{}'.format(k), v, global_step)
                vdl_writer.add_scalar('TRAIN/lr', lr, global_step)

            if dist.get_rank() == 0 and global_step > 0 and \
                    global_step % print_batch_step == 0:
                logs = train_stats.log()
                strs = 'epoch: [{}/{}], iter: {}, {}, reader_cost: {:.5f} s, batch_cost: {:.5f} s, samples: {}, ips: {:.5f}'.format(
                    epoch, epoch_num, global_step, logs,
                    train_reader_cost / print_batch_step,
                    train_batch_cost / print_batch_step, batch_sum,
                    batch_sum / train_batch_cost)
                logger.info(strs)
                train_batch_cost = 0.0
                train_reader_cost = 0.0
                batch_sum = 0
            # eval
            if global_step > start_eval_step and \
                    (global_step - start_eval_step) % eval_batch_step == 0 and \
                    dist.get_rank() == 0:
                if model_average:
                    Model_Average = paddle.incubate.optimizer.ModelAverage(
                        0.15,
                        parameters=model.parameters(),
                        min_average_window=10000,
                        max_average_window=15625)
                    Model_Average.apply()
                # 'eval' here is the project's evaluation routine, not the builtin
                cur_metric = eval(model, valid_dataloader, post_process_class,
                                  eval_class, use_srn=use_srn)
                cur_metric_str = 'cur metric, {}'.format(', '.join(
                    ['{}: {}'.format(k, v) for k, v in cur_metric.items()]))
                logger.info(cur_metric_str)

                # log eval metrics
                if vdl_writer is not None:
                    for k, v in cur_metric.items():
                        if isinstance(v, (float, int)):
                            vdl_writer.add_scalar('EVAL/{}'.format(k),
                                                  cur_metric[k], global_step)
                if cur_metric[main_indicator] >= best_model_dict[main_indicator]:
                    best_model_dict.update(cur_metric)
                    best_model_dict['best_epoch'] = epoch
                    save_model(
                        model,
                        optimizer,
                        save_model_dir,
                        logger,
                        is_best=True,
                        prefix='best_accuracy',
                        best_model_dict=best_model_dict,
                        epoch=epoch)
                best_str = 'best metric, {}'.format(', '.join([
                    '{}: {}'.format(k, v) for k, v in best_model_dict.items()
                ]))
                logger.info(best_str)
                # log best metric
                if vdl_writer is not None:
                    vdl_writer.add_scalar('EVAL/best_{}'.format(main_indicator),
                                          best_model_dict[main_indicator],
                                          global_step)
            global_step += 1
            optimizer.clear_grad()
            batch_start = time.time()
        if dist.get_rank() == 0:
            save_model(
                model,
                optimizer,
                save_model_dir,
                logger,
                is_best=False,
                prefix='latest',
                best_model_dict=best_model_dict,
                epoch=epoch)
        if dist.get_rank() == 0 and epoch > 0 and epoch % save_epoch_step == 0:
            save_model(
                model,
                optimizer,
                save_model_dir,
                logger,
                is_best=False,
                prefix='iter_epoch_{}'.format(epoch),
                best_model_dict=best_model_dict,
                epoch=epoch)
    best_str = 'best metric, {}'.format(', '.join(
        ['{}: {}'.format(k, v) for k, v in best_model_dict.items()]))
    logger.info(best_str)
    if dist.get_rank() == 0 and vdl_writer is not None:
        vdl_writer.close()
    return
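
# --- Illustration (not part of the original training code) ----------------
# A minimal sketch of the config sections that train() above reads. All
# values here are hypothetical; 'eval_batch_step' accepts either a single
# interval or a [start_step, interval] pair, which is exactly how the
# parsing at the top of train() interprets it.
_EXAMPLE_TRAIN_CONFIG = {
    'Global': {
        'cal_metric_during_train': False,  # optional; defaults to False
        'log_smooth_window': 20,           # window for smoothed training stats
        'epoch_num': 500,
        'print_batch_step': 10,            # log every 10 iterations
        'eval_batch_step': [4000, 500],    # first eval at step 4000, then every 500 steps
        'save_epoch_step': 3,              # checkpoint every 3 epochs
        'save_model_dir': './output/',
    },
    'Architecture': {
        'algorithm': 'CRNN',               # any value other than "SRN" skips the SRN branch
    },
}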
def train_eval_det_run(config, exe, train_info_dict, eval_info_dict, is_slim=None):
    """
    Feed data to the model and fetch the measures and loss for detection.
    Args:
        config: config dict
        exe: executor that runs the train/eval programs
        train_info_dict: information dict for training
        eval_info_dict: information dict for evaluation
        is_slim: None, "prune" or "quant"; selects how checkpoints are saved
    """
    train_batch_id = 0
    log_smooth_window = config['Global']['log_smooth_window']
    epoch_num = config['Global']['epoch_num']
    print_batch_step = config['Global']['print_batch_step']
    eval_batch_step = config['Global']['eval_batch_step']
    start_eval_step = 0
    if isinstance(eval_batch_step, list) and len(eval_batch_step) >= 2:
        start_eval_step = eval_batch_step[0]
        eval_batch_step = eval_batch_step[1]
    logger.info(
        "During the training process, after the {}th iteration, an evaluation is run every {} iterations"
        .format(start_eval_step, eval_batch_step))
    save_epoch_step = config['Global']['save_epoch_step']
    save_model_dir = config['Global']['save_model_dir']
    if not os.path.exists(save_model_dir):
        os.makedirs(save_model_dir)
    train_stats = TrainingStats(log_smooth_window,
                                train_info_dict['fetch_name_list'])
    best_eval_hmean = -1
    best_batch_id = 0
    best_epoch = 0
    train_loader = train_info_dict['reader']
    for epoch in range(epoch_num):
        train_loader.start()
        try:
            while True:
                t1 = time.time()
                train_outs = exe.run(
                    program=train_info_dict['compile_program'],
                    fetch_list=train_info_dict['fetch_varname_list'],
                    return_numpy=False)
                stats = {}
                for tno in range(len(train_outs)):
                    fetch_name = train_info_dict['fetch_name_list'][tno]
                    fetch_value = np.mean(np.array(train_outs[tno]))
                    stats[fetch_name] = fetch_value
                t2 = time.time()
                train_batch_elapse = t2 - t1
                train_stats.update(stats)
                if train_batch_id > 0 and train_batch_id % print_batch_step == 0:
                    logs = train_stats.log()
                    strs = 'epoch: {}, iter: {}, {}, time: {:.3f}'.format(
                        epoch, train_batch_id, logs, train_batch_elapse)
                    logger.info(strs)
                if train_batch_id > start_eval_step and \
                        (train_batch_id - start_eval_step) % eval_batch_step == 0:
                    metrics = eval_det_run(exe, config, eval_info_dict, "eval")
                    hmean = metrics['hmean']
                    if hmean >= best_eval_hmean:
                        best_eval_hmean = hmean
                        best_batch_id = train_batch_id
                        best_epoch = epoch
                        save_path = save_model_dir + "/best_accuracy"
                        if is_slim is None:
                            save_model(train_info_dict['train_program'], save_path)
                        else:
                            import paddleslim as slim
                            if is_slim == "prune":
                                slim.prune.save_model(
                                    exe, train_info_dict['train_program'],
                                    save_path)
                            elif is_slim == "quant":
                                save_model(eval_info_dict['program'], save_path)
                            else:
                                raise ValueError(
                                    "Only quant and prune are supported currently. "
                                    "But received {}".format(is_slim))
                    strs = 'Test iter: {}, metrics:{}, best_hmean:{:.6f}, best_epoch:{}, best_batch_id:{}'.format(
                        train_batch_id, metrics, best_eval_hmean, best_epoch,
                        best_batch_id)
                    logger.info(strs)
                train_batch_id += 1
        except fluid.core.EOFException:
            train_loader.reset()
        if epoch == 0 and save_epoch_step == 1:
            save_path = save_model_dir + "/iter_epoch_0"
            if is_slim is None:
                save_model(train_info_dict['train_program'], save_path)
            else:
                import paddleslim as slim
                if is_slim == "prune":
                    slim.prune.save_model(exe, train_info_dict['train_program'],
                                          save_path)
                elif is_slim == "quant":
                    save_model(eval_info_dict['program'], save_path)
                else:
                    raise ValueError(
                        "Only quant and prune are supported currently. "
                        "But received {}".format(is_slim))
        if epoch > 0 and epoch % save_epoch_step == 0:
            save_path = save_model_dir + "/iter_epoch_%d" % epoch
            if is_slim is None:
                save_model(train_info_dict['train_program'], save_path)
            else:
                import paddleslim as slim
                if is_slim == "prune":
                    slim.prune.save_model(exe, train_info_dict['train_program'],
                                          save_path)
                elif is_slim == "quant":
                    save_model(eval_info_dict['program'], save_path)
                else:
                    raise ValueError(
                        "Only quant and prune are supported currently. "
                        "But received {}".format(is_slim))
    return
def train_eval_cls_run(config, exe, train_info_dict, eval_info_dict, is_slim=None):
    train_batch_id = 0
    log_smooth_window = config['Global']['log_smooth_window']
    epoch_num = config['Global']['epoch_num']
    print_batch_step = config['Global']['print_batch_step']
    eval_batch_step = config['Global']['eval_batch_step']
    start_eval_step = 0
    if isinstance(eval_batch_step, list) and len(eval_batch_step) >= 2:
        start_eval_step = eval_batch_step[0]
        eval_batch_step = eval_batch_step[1]
    logger.info(
        "During the training process, after the {}th iteration, an evaluation is run every {} iterations"
        .format(start_eval_step, eval_batch_step))
    save_epoch_step = config['Global']['save_epoch_step']
    save_model_dir = config['Global']['save_model_dir']
    if not os.path.exists(save_model_dir):
        os.makedirs(save_model_dir)
    train_stats = TrainingStats(log_smooth_window, ['loss', 'acc'])
    best_eval_acc = -1
    best_batch_id = 0
    best_epoch = 0
    train_loader = train_info_dict['reader']
    for epoch in range(epoch_num):
        train_loader.start()
        try:
            while True:
                t1 = time.time()
                train_outs = exe.run(
                    program=train_info_dict['compile_program'],
                    fetch_list=train_info_dict['fetch_varname_list'],
                    return_numpy=False)
                fetch_map = dict(
                    zip(train_info_dict['fetch_name_list'],
                        range(len(train_outs))))
                loss = np.mean(np.array(train_outs[fetch_map['total_loss']]))
                lr = np.mean(np.array(train_outs[fetch_map['lr']]))
                acc = np.mean(np.array(train_outs[fetch_map['acc']]))
                t2 = time.time()
                train_batch_elapse = t2 - t1
                stats = {'loss': loss, 'acc': acc}
                train_stats.update(stats)
                if train_batch_id > 0 and train_batch_id % print_batch_step == 0:
                    logs = train_stats.log()
                    strs = 'epoch: {}, iter: {}, lr: {:.6f}, {}, time: {:.3f}'.format(
                        epoch, train_batch_id, lr, logs, train_batch_elapse)
                    logger.info(strs)
                # evaluate on the [start_eval_step, eval_batch_step] schedule
                # announced in the log message above
                if train_batch_id > start_eval_step and \
                        (train_batch_id - start_eval_step) % eval_batch_step == 0:
                    model_average = train_info_dict['model_average']
                    if model_average is not None:
                        model_average.apply(exe)
                    metrics = eval_cls_run(exe, eval_info_dict)
                    eval_acc = metrics['avg_acc']
                    eval_sample_num = metrics['total_sample_num']
                    if eval_acc > best_eval_acc:
                        best_eval_acc = eval_acc
                        best_batch_id = train_batch_id
                        best_epoch = epoch
                        save_path = save_model_dir + "/best_accuracy"
                        if is_slim is None:
                            save_model(train_info_dict['train_program'], save_path)
                        else:
                            import paddleslim as slim
                            if is_slim == "prune":
                                slim.prune.save_model(
                                    exe, train_info_dict['train_program'],
                                    save_path)
                            elif is_slim == "quant":
                                save_model(eval_info_dict['program'], save_path)
                            else:
                                raise ValueError(
                                    "Only quant and prune are supported currently. "
                                    "But received {}".format(is_slim))
                    strs = 'Test iter: {}, acc:{:.6f}, best_acc:{:.6f}, best_epoch:{}, best_batch_id:{}, eval_sample_num:{}'.format(
                        train_batch_id, eval_acc, best_eval_acc, best_epoch,
                        best_batch_id, eval_sample_num)
                    logger.info(strs)
                train_batch_id += 1
        except fluid.core.EOFException:
            train_loader.reset()
        if epoch == 0 and save_epoch_step == 1:
            save_path = save_model_dir + "/iter_epoch_0"
            if is_slim is None:
                save_model(train_info_dict['train_program'], save_path)
            else:
                import paddleslim as slim
                if is_slim == "prune":
                    slim.prune.save_model(exe, train_info_dict['train_program'],
                                          save_path)
                elif is_slim == "quant":
                    save_model(eval_info_dict['program'], save_path)
                else:
                    raise ValueError(
                        "Only quant and prune are supported currently. "
                        "But received {}".format(is_slim))
        if epoch > 0 and epoch % save_epoch_step == 0:
            save_path = save_model_dir + "/iter_epoch_%d" % epoch
            if is_slim is None:
                save_model(train_info_dict['train_program'], save_path)
            else:
                import paddleslim as slim
                if is_slim == "prune":
                    slim.prune.save_model(exe, train_info_dict['train_program'],
                                          save_path)
                elif is_slim == "quant":
                    save_model(eval_info_dict['program'], save_path)
                else:
                    raise ValueError(
                        "Only quant and prune are supported currently. "
                        "But received {}".format(is_slim))
    return
def train_eval_rec_run(config, exe, train_info_dict, eval_info_dict):
    train_batch_id = 0
    log_smooth_window = config['Global']['log_smooth_window']
    epoch_num = config['Global']['epoch_num']
    print_batch_step = config['Global']['print_batch_step']
    eval_batch_step = config['Global']['eval_batch_step']
    save_epoch_step = config['Global']['save_epoch_step']
    save_model_dir = config['Global']['save_model_dir']
    train_stats = TrainingStats(log_smooth_window, ['loss', 'acc'])
    best_eval_acc = -1
    best_batch_id = 0
    best_epoch = 0
    train_loader = train_info_dict['reader']
    for epoch in range(epoch_num):
        train_loader.start()
        try:
            while True:
                t1 = time.time()
                train_outs = exe.run(
                    program=train_info_dict['compile_program'],
                    fetch_list=train_info_dict['fetch_varname_list'],
                    return_numpy=False)
                fetch_map = dict(
                    zip(train_info_dict['fetch_name_list'],
                        range(len(train_outs))))
                loss = np.mean(np.array(train_outs[fetch_map['total_loss']]))
                lr = np.mean(np.array(train_outs[fetch_map['lr']]))
                preds_idx = fetch_map['decoded_out']
                preds = np.array(train_outs[preds_idx])
                preds_lod = train_outs[preds_idx].lod()[0]
                labels_idx = fetch_map['label']
                labels = np.array(train_outs[labels_idx])
                labels_lod = train_outs[labels_idx].lod()[0]
                acc, acc_num, img_num = cal_predicts_accuracy(
                    config['Global']['char_ops'], preds, preds_lod, labels,
                    labels_lod)
                t2 = time.time()
                train_batch_elapse = t2 - t1
                stats = {'loss': loss, 'acc': acc}
                train_stats.update(stats)
                if train_batch_id > 0 and train_batch_id % print_batch_step == 0:
                    logs = train_stats.log()
                    strs = 'epoch: {}, iter: {}, lr: {:.6f}, {}, time: {:.3f}'.format(
                        epoch, train_batch_id, lr, logs, train_batch_elapse)
                    logger.info(strs)
                if train_batch_id > 0 and train_batch_id % eval_batch_step == 0:
                    metrics = eval_rec_run(exe, config, eval_info_dict, "eval")
                    eval_acc = metrics['avg_acc']
                    eval_sample_num = metrics['total_sample_num']
                    if eval_acc > best_eval_acc:
                        best_eval_acc = eval_acc
                        best_batch_id = train_batch_id
                        best_epoch = epoch
                        save_path = save_model_dir + "/best_accuracy"
                        save_model(train_info_dict['train_program'], save_path)
                    strs = 'Test iter: {}, acc:{:.6f}, best_acc:{:.6f}, best_epoch:{}, best_batch_id:{}, eval_sample_num:{}'.format(
                        train_batch_id, eval_acc, best_eval_acc, best_epoch,
                        best_batch_id, eval_sample_num)
                    logger.info(strs)
                train_batch_id += 1
        except fluid.core.EOFException:
            train_loader.reset()
        if epoch > 0 and epoch % save_epoch_step == 0:
            save_path = save_model_dir + "/iter_epoch_%d" % epoch
            save_model(train_info_dict['train_program'], save_path)
    return
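
# --- Illustration (not part of the original file) --------------------------
# How the LoD offsets fetched above are used: a level-0 LoD such as
# [0, 4, 9] (hypothetical values) means the fetched tensor packs two
# variable-length sequences, rows 0:4 and 4:9. cal_predicts_accuracy splits
# the flat predictions/labels with these offsets; a minimal sketch of that
# splitting step ('_split_by_lod' is a hypothetical helper name):
def _split_by_lod(flat, lod):
    # flat: np.ndarray of shape [total_len, ...]; lod: offsets, e.g. [0, 4, 9]
    return [flat[lod[i]:lod[i + 1]] for i in range(len(lod) - 1)]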
def train_eval_det_run(config, exe, train_info_dict, eval_info_dict):
    train_batch_id = 0
    log_smooth_window = config['Global']['log_smooth_window']
    epoch_num = config['Global']['epoch_num']
    print_batch_step = config['Global']['print_batch_step']
    eval_batch_step = config['Global']['eval_batch_step']
    save_epoch_step = config['Global']['save_epoch_step']
    save_model_dir = config['Global']['save_model_dir']
    train_stats = TrainingStats(log_smooth_window,
                                train_info_dict['fetch_name_list'])
    best_eval_hmean = -1
    best_batch_id = 0
    best_epoch = 0
    train_loader = train_info_dict['reader']
    for epoch in range(epoch_num):
        train_loader.start()
        try:
            while True:
                t1 = time.time()
                train_outs = exe.run(
                    program=train_info_dict['compile_program'],
                    fetch_list=train_info_dict['fetch_varname_list'],
                    return_numpy=False)
                stats = {}
                for tno in range(len(train_outs)):
                    fetch_name = train_info_dict['fetch_name_list'][tno]
                    fetch_value = np.mean(np.array(train_outs[tno]))
                    stats[fetch_name] = fetch_value
                t2 = time.time()
                train_batch_elapse = t2 - t1
                train_stats.update(stats)
                if train_batch_id > 0 and train_batch_id % print_batch_step == 0:
                    logs = train_stats.log()
                    strs = 'epoch: {}, iter: {}, {}, time: {:.3f}'.format(
                        epoch, train_batch_id, logs, train_batch_elapse)
                    logger.info(strs)
                if train_batch_id > 0 and train_batch_id % eval_batch_step == 0:
                    metrics = eval_det_run(exe, config, eval_info_dict, "eval")
                    hmean = metrics['hmean']
                    if hmean >= best_eval_hmean:
                        best_eval_hmean = hmean
                        best_batch_id = train_batch_id
                        best_epoch = epoch
                        save_path = save_model_dir + "/best_accuracy"
                        save_model(train_info_dict['train_program'], save_path)
                    strs = 'Test iter: {}, metrics:{}, best_hmean:{:.6f}, best_epoch:{}, best_batch_id:{}'.format(
                        train_batch_id, metrics, best_eval_hmean, best_epoch,
                        best_batch_id)
                    logger.info(strs)
                train_batch_id += 1
        except fluid.core.EOFException:
            train_loader.reset()
        if epoch > 0 and epoch % save_epoch_step == 0:
            save_path = save_model_dir + "/iter_epoch_%d" % epoch
            save_model(train_info_dict['train_program'], save_path)
    return
def train_eval_rec_run(config, exe, train_info_dict, eval_info_dict):
    train_batch_id = 0
    log_smooth_window = config['Global']['log_smooth_window']
    epoch_num = config['Global']['epoch_num']
    print_batch_step = config['Global']['print_batch_step']
    eval_batch_step = config['Global']['eval_batch_step']
    start_eval_step = 0
    if isinstance(eval_batch_step, list) and len(eval_batch_step) >= 2:
        start_eval_step = eval_batch_step[0]
        eval_batch_step = eval_batch_step[1]
    logger.info(
        "During the training process, after the {}th iteration, an evaluation is run every {} iterations"
        .format(start_eval_step, eval_batch_step))
    save_epoch_step = config['Global']['save_epoch_step']
    save_model_dir = config['Global']['save_model_dir']
    if not os.path.exists(save_model_dir):
        os.makedirs(save_model_dir)
    train_stats = TrainingStats(log_smooth_window, ['loss', 'acc'])
    best_eval_acc = -1
    best_batch_id = 0
    best_epoch = 0
    train_loader = train_info_dict['reader']
    for epoch in range(epoch_num):
        train_loader.start()
        try:
            while True:
                t1 = time.time()
                train_outs = exe.run(
                    program=train_info_dict['compile_program'],
                    fetch_list=train_info_dict['fetch_varname_list'],
                    return_numpy=False)
                fetch_map = dict(
                    zip(train_info_dict['fetch_name_list'],
                        range(len(train_outs))))
                loss = np.mean(np.array(train_outs[fetch_map['total_loss']]))
                lr = np.mean(np.array(train_outs[fetch_map['lr']]))
                preds_idx = fetch_map['decoded_out']
                preds = np.array(train_outs[preds_idx])
                labels_idx = fetch_map['label']
                labels = np.array(train_outs[labels_idx])

                if config['Global']['loss_type'] != 'srn':
                    # CTC-style outputs are variable-length LoD tensors
                    preds_lod = train_outs[preds_idx].lod()[0]
                    labels_lod = train_outs[labels_idx].lod()[0]
                    acc, acc_num, img_num = cal_predicts_accuracy(
                        config['Global']['char_ops'], preds, preds_lod, labels,
                        labels_lod)
                else:
                    # SRN predicts fixed-length sequences, so no LoD is needed
                    acc, acc_num, img_num = cal_predicts_accuracy_srn(
                        config['Global']['char_ops'], preds, labels,
                        config['Global']['max_text_length'])
                t2 = time.time()
                train_batch_elapse = t2 - t1
                stats = {'loss': loss, 'acc': acc}
                train_stats.update(stats)
                if train_batch_id > 0 and train_batch_id % print_batch_step == 0:
                    logs = train_stats.log()
                    strs = 'epoch: {}, iter: {}, lr: {:.6f}, {}, time: {:.3f}'.format(
                        epoch, train_batch_id, lr, logs, train_batch_elapse)
                    logger.info(strs)
                # evaluate on the [start_eval_step, eval_batch_step] schedule
                # announced in the log message above
                if train_batch_id > start_eval_step and \
                        (train_batch_id - start_eval_step) % eval_batch_step == 0:
                    model_average = train_info_dict['model_average']
                    if model_average is not None:
                        model_average.apply(exe)
                    metrics = eval_rec_run(exe, config, eval_info_dict, "eval")
                    eval_acc = metrics['avg_acc']
                    eval_sample_num = metrics['total_sample_num']
                    if eval_acc > best_eval_acc:
                        best_eval_acc = eval_acc
                        best_batch_id = train_batch_id
                        best_epoch = epoch
                        save_path = save_model_dir + "/best_accuracy"
                        save_model(train_info_dict['train_program'], save_path)
                    strs = 'Test iter: {}, acc:{:.6f}, best_acc:{:.6f}, best_epoch:{}, best_batch_id:{}, eval_sample_num:{}'.format(
                        train_batch_id, eval_acc, best_eval_acc, best_epoch,
                        best_batch_id, eval_sample_num)
                    logger.info(strs)
                train_batch_id += 1
        except fluid.core.EOFException:
            train_loader.reset()
        if epoch == 0 and save_epoch_step == 1:
            save_path = save_model_dir + "/iter_epoch_0"
            save_model(train_info_dict['train_program'], save_path)
        if epoch > 0 and epoch % save_epoch_step == 0:
            save_path = save_model_dir + "/iter_epoch_%d" % epoch
            save_model(train_info_dict['train_program'], save_path)
    return
def main():
    config = load_config(FLAGS.config)
    merge_config(FLAGS.opt)
    print(config)
    alg = config['Global']['algorithm']
    assert alg in ['EAST', 'DB']

    # check if use_gpu=True was set with a CPU-only paddlepaddle build
    use_gpu = config['Global']['use_gpu']
    check_gpu(use_gpu)

    place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)

    det_model = create_module(config['Architecture']['function'])(params=config)

    startup_prog = fluid.Program()
    train_prog = fluid.Program()
    with fluid.program_guard(train_prog, startup_prog):
        with fluid.unique_name.guard():
            train_loader, train_outputs = det_model(mode="train")
            train_fetch_list = [v.name for v in train_outputs]
            train_loss = train_outputs[0]
            opt_params = config['Optimizer']
            optimizer = create_module(opt_params['function'])(opt_params)
            optimizer.minimize(train_loss)
            global_lr = optimizer._global_learning_rate()
            global_lr.persistable = True
            train_fetch_list.append(global_lr.name)

    eval_prog = fluid.Program()
    with fluid.program_guard(eval_prog, startup_prog):
        with fluid.unique_name.guard():
            eval_loader, eval_outputs = det_model(mode="eval")
            eval_fetch_list = [v.name for v in eval_outputs]
    eval_prog = eval_prog.clone(for_test=True)

    train_reader = reader.train_reader(config=config)
    train_loader.set_sample_list_generator(train_reader, places=place)

    exe.run(startup_prog)

    # compile program for multi-devices
    train_compile_program = create_multi_devices_program(train_prog,
                                                         train_loss.name)

    pretrain_weights = config['Global']['pretrain_weights']
    if pretrain_weights is not None:
        load_pretrain(exe, train_prog, pretrain_weights)
        print("pretrain weights loaded!")

    train_batch_id = 0
    if alg == 'EAST':
        train_log_keys = ['loss_total', 'loss_cls', 'loss_offset']
    elif alg == 'DB':
        train_log_keys = [
            'loss_total', 'loss_shrink', 'loss_threshold', 'loss_binary'
        ]
    log_smooth_window = config['Global']['log_smooth_window']
    epoch_num = config['Global']['epoch_num']
    print_step = config['Global']['print_step']
    eval_step = config['Global']['eval_step']
    save_epoch_step = config['Global']['save_epoch_step']
    save_dir = config['Global']['save_dir']
    train_stats = TrainingStats(log_smooth_window, train_log_keys)
    best_eval_hmean = -1
    best_batch_id = 0
    best_epoch = 0
    for epoch in range(epoch_num):
        train_loader.start()
        try:
            while True:
                t1 = time.time()
                train_outs = exe.run(
                    program=train_compile_program,
                    fetch_list=train_fetch_list,
                    return_numpy=False)
                loss_total = np.mean(np.array(train_outs[0]))
                if alg == 'EAST':
                    loss_cls = np.mean(np.array(train_outs[1]))
                    loss_offset = np.mean(np.array(train_outs[2]))
                    stats = {
                        'loss_total': loss_total,
                        'loss_cls': loss_cls,
                        'loss_offset': loss_offset
                    }
                elif alg == 'DB':
                    loss_shrink_maps = np.mean(np.array(train_outs[1]))
                    loss_threshold_maps = np.mean(np.array(train_outs[2]))
                    loss_binary_maps = np.mean(np.array(train_outs[3]))
                    stats = {
                        'loss_total': loss_total,
                        'loss_shrink': loss_shrink_maps,
                        'loss_threshold': loss_threshold_maps,
                        'loss_binary': loss_binary_maps
                    }
                lr = np.mean(np.array(train_outs[-1]))
                t2 = time.time()
                train_batch_elapse = t2 - t1
                train_stats.update(stats)
                if train_batch_id > 0 and train_batch_id % print_step == 0:
                    logs = train_stats.log()
                    strs = 'epoch: {}, iter: {}, lr: {:.6f}, {}, time: {:.3f}'.format(
                        epoch, train_batch_id, lr, logs, train_batch_elapse)
                    logger.info(strs)
                if train_batch_id > 0 and train_batch_id % eval_step == 0:
                    metrics = eval_det_run(exe, eval_prog, eval_fetch_list,
                                           config, "eval")
                    hmean = metrics['hmean']
                    if hmean >= best_eval_hmean:
                        best_eval_hmean = hmean
                        best_batch_id = train_batch_id
                        best_epoch = epoch
                        save_path = save_dir + "/best_accuracy"
                        save_model(train_prog, save_path)
                    strs = 'Test iter: {}, metrics:{}, best_hmean:{:.6f}, best_epoch:{}, best_batch_id:{}'.format(
                        train_batch_id, metrics, best_eval_hmean, best_epoch,
                        best_batch_id)
                    logger.info(strs)
                train_batch_id += 1
        except fluid.core.EOFException:
            train_loader.reset()
        if epoch > 0 and epoch % save_epoch_step == 0:
            save_path = save_dir + "/iter_epoch_%d" % epoch
            save_model(train_prog, save_path)
def main():
    config = load_config(FLAGS.config)
    merge_config(FLAGS.opt)
    char_ops = CharacterOps(config['Global'])
    config['Global']['char_num'] = char_ops.get_char_num()
    print(config)

    # check if use_gpu=True was set with a CPU-only paddlepaddle build
    use_gpu = config['Global']['use_gpu']
    check_gpu(use_gpu)

    place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)

    rec_model = create_module(config['Architecture']['function'])(params=config)

    startup_prog = fluid.Program()
    train_prog = fluid.Program()
    with fluid.program_guard(train_prog, startup_prog):
        with fluid.unique_name.guard():
            train_loader, train_outputs = rec_model(mode="train")
            save_var = train_outputs[1]

            if "gradient_clip" in config['Global']:
                gradient_clip = config['Global']['gradient_clip']
                clip = fluid.clip.GradientClipByGlobalNorm(gradient_clip)
                fluid.clip.set_gradient_clip(clip, program=train_prog)

            train_fetch_list = [v.name for v in train_outputs]
            train_loss = train_outputs[0]
            opt_params = config['Optimizer']
            optimizer = create_module(opt_params['function'])(opt_params)
            optimizer.minimize(train_loss)
            global_lr = optimizer._global_learning_rate()
            global_lr.persistable = True
            train_fetch_list.append(global_lr.name)

    train_reader = reader.train_eval_reader(
        config=config, char_ops=char_ops, mode="train")
    train_loader.set_sample_list_generator(train_reader, places=place)

    eval_prog = fluid.Program()
    with fluid.program_guard(eval_prog, startup_prog):
        with fluid.unique_name.guard():
            eval_loader, eval_outputs = rec_model(mode="eval")
            eval_fetch_list = [v.name for v in eval_outputs]
    eval_prog = eval_prog.clone(for_test=True)
    exe.run(startup_prog)

    eval_reader = reader.train_eval_reader(
        config=config, char_ops=char_ops, mode="eval")
    eval_loader.set_sample_list_generator(eval_reader, places=place)

    # compile program for multi-devices
    train_compile_program = create_multi_devices_program(train_prog,
                                                         train_loss.name)

    pretrain_weights = config['Global']['pretrain_weights']
    if pretrain_weights is not None:
        load_pretrain(exe, train_prog, pretrain_weights)

    train_batch_id = 0
    train_log_keys = ['loss', 'acc']
    log_smooth_window = config['Global']['log_smooth_window']
    epoch_num = config['Global']['epoch_num']
    loss_type = config['Global']['loss_type']
    print_step = config['Global']['print_step']
    eval_step = config['Global']['eval_step']
    save_epoch_step = config['Global']['save_epoch_step']
    save_dir = config['Global']['save_dir']
    train_stats = TrainingStats(log_smooth_window, train_log_keys)
    best_eval_acc = -1
    best_batch_id = 0
    best_epoch = 0
    for epoch in range(epoch_num):
        train_loader.start()
        try:
            while True:
                t1 = time.time()
                train_outs = exe.run(
                    program=train_compile_program,
                    fetch_list=train_fetch_list,
                    return_numpy=False)
                loss = np.mean(np.array(train_outs[0]))
                lr = np.mean(np.array(train_outs[-1]))

                preds = np.array(train_outs[1])
                preds_lod = train_outs[1].lod()[0]
                labels = np.array(train_outs[2])
                labels_lod = train_outs[2].lod()[0]

                acc, acc_num, img_num = cal_predicts_accuracy(
                    char_ops, preds, preds_lod, labels, labels_lod)
                t2 = time.time()
                train_batch_elapse = t2 - t1
                stats = {'loss': loss, 'acc': acc}
                train_stats.update(stats)
                if train_batch_id > 0 and train_batch_id % print_step == 0:
                    logs = train_stats.log()
                    strs = 'epoch: {}, iter: {}, lr: {:.6f}, {}, time: {:.3f}'.format(
                        epoch, train_batch_id, lr, logs, train_batch_elapse)
                    logger.info(strs)
                if train_batch_id > 0 and train_batch_id % eval_step == 0:
                    outs = eval_run(exe, eval_prog, eval_loader, eval_fetch_list,
                                    char_ops, train_batch_id, "eval")
                    eval_acc, acc_num, sample_num = outs
                    if eval_acc > best_eval_acc:
                        best_eval_acc = eval_acc
                        best_batch_id = train_batch_id
                        best_epoch = epoch
                        save_path = save_dir + "/best_accuracy"
                        save_model(train_prog, save_path)
                    strs = 'Test iter: {}, acc:{:.6f}, best_acc:{:.6f}, best_epoch:{}, best_batch_id:{}, sample_num:{}'.format(
                        train_batch_id, eval_acc, best_eval_acc, best_epoch,
                        best_batch_id, sample_num)
                    logger.info(strs)
                train_batch_id += 1
        except fluid.core.EOFException:
            train_loader.reset()
        if epoch > 0 and epoch % save_epoch_step == 0:
            save_path = save_dir + "/iter_epoch_%d" % epoch
            save_model(train_prog, save_path)
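
# --- Illustration (not part of the original file) --------------------------
# These entry points are normally launched from a CLI wrapper that parses
# --config and --opt into the module-level FLAGS used above. A minimal
# sketch, assuming a project-provided ArgsParser (the exact parser class is
# an assumption, not confirmed by this file):
if __name__ == '__main__':
    parser = ArgsParser()        # hypothetical: yields FLAGS.config and FLAGS.opt
    FLAGS = parser.parse_args()
    main()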