def run(dataloader, exe, program, fetchs, epoch=0, mode='train'):
    """
    Run one pass over the dataloader, updating and logging the meters.

    Args:
        dataloader(fluid dataloader): callable returning an iterable of batches
        exe(): executor used to run ``program``
        program(): program that is executed for every batch
        fetchs(dict): maps a name to ``(variable, meter)``; the variables are
            fetched and the meters are updated with the fetched values
        epoch(int): epoch of training or validation (log only)
        mode(str): log only
    Returns:
        None
    """
    pairs = fetchs.values()
    fetch_list = [pair[0] for pair in pairs]
    meters = [pair[1] for pair in pairs]
    for meter in meters:
        meter.reset()

    step_template = "[epoch:{:3d}][{:s}][step:{:4d}]{:s}"
    end_template = "END [epoch:{:3d}][{:s}]{:s}"

    batch_time = AverageMeter('cost', '.3f')
    tic = time.time()
    for idx, batch in enumerate(dataloader()):
        metrics = exe.run(program=program, feed=batch, fetch_list=fetch_list)
        # Time the step before doing any bookkeeping.
        batch_time.update(time.time() - tic)
        tic = time.time()
        for position, fetched in enumerate(metrics):
            meters[position].update(fetched[0], len(batch[0]))

        pieces = [meter.value for meter in meters]
        pieces.append(batch_time.value)
        fetchs_str = ''.join(pieces)
        logger.info(step_template.format(epoch, mode, idx, fetchs_str))

    summary = [meter.mean for meter in meters]
    summary.append(batch_time.total)
    end_str = ''.join(summary)
    logger.info(end_template.format(epoch, mode, end_str))
def run(dataloader, exe, program, fetchs, epoch=0, mode='train'):
    """
    Feed data to the model, fetch the measures and loss, and log them.

    Only the trainer whose ``PADDLE_TRAINER_ID`` environment variable is 0
    (the default when the variable is unset) writes log lines.

    Args:
        dataloader(fluid dataloader): callable returning an iterable of batches
        exe(): executor used to run ``program``
        program(): program that is executed for every batch
        fetchs(dict): maps a name to ``(variable, meter)``
        epoch(int): epoch of training or validation (log only)
        mode(str): log only
    Returns:
        None
    """
    fetch_list = [f[0] for f in fetchs.values()]
    metric_list = [f[1] for f in fetchs.values()]
    for m in metric_list:
        m.reset()
    batch_time = AverageMeter('cost', ':6.3f')

    tic = time.time()
    trainer_id = int(os.getenv("PADDLE_TRAINER_ID", 0))
    is_logging_trainer = trainer_id == 0
    for idx, batch in enumerate(dataloader()):
        metrics = exe.run(program=program, feed=batch, fetch_list=fetch_list)
        batch_time.update(time.time() - tic)
        tic = time.time()
        for i, m in enumerate(metrics):
            metric_list[i].update(m[0], len(batch[0]))
        if is_logging_trainer:
            fetchs_str = ''.join([str(m) for m in metric_list] +
                                 [str(batch_time)])
            logger.info("[epoch:%3d][%s][step:%4d]%s" %
                        (epoch, mode, idx, fetchs_str))

    if is_logging_trainer:
        # Rebuild the summary from the meters instead of reusing the string
        # from the last step: the meters are unchanged since that step, so the
        # text is the same, and an empty dataloader no longer hits an unbound
        # local variable here.
        end_str = ''.join([str(m) for m in metric_list] + [str(batch_time)])
        logger.info("END [epoch:%3d][%s]%s" % (epoch, mode, end_str))
def create_metric(out, feeds, topk=5, classes_num=1000,
                  use_distillation=False):
    """
    Build the top-1 and top-k accuracy measures for the model output.

    Args:
        out(variable): model output variable; when ``use_distillation`` is
            set, element 1 of ``out`` is used instead
        feeds(dict): dict of model input variables (must contain 'label')
        topk(int): usually 5; capped at ``classes_num``
        classes_num(int): num of classes
        use_distillation(bool): whether ``out`` holds several outputs
    Returns:
        fetchs(dict): maps 'top1' and 'top<k>' to ``(variable, meter)``
    """
    # just need student label to get metrics
    logits = out[1] if use_distillation else out

    measures = OrderedDict()
    label = feeds['label']
    probs = fluid.layers.softmax(logits, use_cudnn=False)

    top1_acc = fluid.layers.accuracy(probs, label=label, k=1)
    measures['top1'] = (top1_acc, AverageMeter('top1', ':2.4f', True))

    k = min(topk, classes_num)
    topk_acc = fluid.layers.accuracy(probs, label=label, k=k)
    topk_key = 'top{}'.format(k)
    measures[topk_key] = (topk_acc, AverageMeter(topk_key, ':2.4f', True))

    return measures
def run(dataloader,
        exe,
        program,
        fetchs,
        epoch=0,
        mode='train',
        vdl_writer=None):
    """
    Feed data to the model, fetch the measures and loss, and log them.

    Args:
        dataloader(fluid dataloader): callable returning an iterable of batches
        exe(): executor used to run ``program``
        program(): program that is executed for every batch
        fetchs(dict): maps a name to ``(variable, meter)``
        epoch(int): epoch of training or validation (log only)
        mode(str): 'eval' selects the plain log format, 'valid' additionally
            makes the function return the top-1 average
        vdl_writer(): optional writer; when truthy the first fetched value is
            written as 'loss' at the module-level ``total_step``
    Returns:
        ``fetchs["top1"][1].avg`` when ``mode == 'valid'``, otherwise None
    """
    # Declared once for the whole function; it is only touched when a
    # vdl_writer is supplied.
    global total_step

    fetch_list = [f[0] for f in fetchs.values()]
    metric_list = [f[1] for f in fetchs.values()]
    for m in metric_list:
        m.reset()
    batch_time = AverageMeter('elapse', '.3f')

    tic = time.time()
    for idx, batch in enumerate(dataloader()):
        metrics = exe.run(program=program, feed=batch, fetch_list=fetch_list)
        batch_time.update(time.time() - tic)
        tic = time.time()
        for i, m in enumerate(metrics):
            metric_list[i].update(m[0], len(batch[0]))

        # The trailing 's' is the unit of the elapsed time.
        fetchs_str = ''.join([str(m.value) + ' ' for m in metric_list] +
                             [batch_time.value]) + 's'

        if vdl_writer:
            logger.scaler('loss', metrics[0][0], total_step, vdl_writer)
            total_step += 1

        if mode == 'eval':
            # fetchs_str already carries the unit suffix; do not append a
            # second 's' in the format string.
            logger.info("{:s} step:{:<4d} {:s}".format(mode, idx, fetchs_str))
        else:
            epoch_str = "epoch:{:<3d}".format(epoch)
            step_str = "{:s} step:{:<4d}".format(mode, idx)
            logger.info("{:s} {:s} {:s}".format(
                logger.coloring(epoch_str, "HEADER")
                if idx == 0 else epoch_str,
                logger.coloring(step_str, "PURPLE"),
                logger.coloring(fetchs_str, 'OKGREEN')))

    end_str = ''.join([str(m.mean) + ' ' for m in metric_list] +
                      [batch_time.total]) + 's'
    if mode == 'eval':
        # end_str already ends with the unit suffix.
        logger.info("END {:s} {:s}".format(mode, end_str))
    else:
        end_epoch_str = "END epoch:{:<3d}".format(epoch)
        logger.info("{:s} {:s} {:s}".format(
            logger.coloring(end_epoch_str, "RED"),
            logger.coloring(mode, "PURPLE"),
            logger.coloring(end_str, "OKGREEN")))

    # return top1_acc in order to save the best model
    if mode == 'valid':
        return fetchs["top1"][1].avg
def create_fetchs(out,
                  feeds,
                  architecture,
                  topk=5,
                  classes_num=1000,
                  epsilon=None,
                  use_mix=False,
                  use_distillation=False):
    """
    Assemble the fetch dict: the loss, plus accuracy measures unless mix is
    enabled.

    Args:
        out(variable): model output variable
        feeds(dict): dict of model input variables (included label)
        architecture(dict): architecture information, forwarded to create_loss
        topk(int): usually top5
        classes_num(int): num of classes
        epsilon(float): parameter for label smoothing, 0.0 <= epsilon <= 1.0
        use_mix(bool): whether to use mix (include mixup, cutmix, fmix);
            when true no accuracy measures are created
        use_distillation(bool): forwarded to create_loss / create_metric
    Returns:
        fetchs(dict): maps names to ``(variable, meter)`` pairs
    """
    loss_var = create_loss(out, feeds, architecture, classes_num, epsilon,
                           use_mix, use_distillation)

    fetchs = OrderedDict()
    fetchs['loss'] = (loss_var, AverageMeter('loss', ':2.4f', True))
    if use_mix:
        return fetchs

    fetchs.update(
        create_metric(out, feeds, topk, classes_num, use_distillation))
    return fetchs
def update_metric(trainer, out, batch, batch_size):
    """Compute the training metrics for one batch and fold them into
    ``trainer.output_info``.

    Does nothing when the trainer has no ``train_metric_func``. The metric
    function is called with ``out`` and the last element of ``batch``; each
    returned entry updates (creating it on first use) the meter of the same
    name, weighted by ``batch_size``.
    """
    metric_func = trainer.train_metric_func
    if metric_func is None:
        return

    # calc metric
    metric_dict = metric_func(out, batch[-1])
    for name in metric_dict:
        if name not in trainer.output_info:
            trainer.output_info[name] = AverageMeter(name, '7.5f')
        meter = trainer.output_info[name]
        meter.update(metric_dict[name].numpy()[0], batch_size)
def run(dataloader, exe, program, fetchs, epoch=0, mode='train'):
    """
    Run one pass over the dataloader, updating and logging the meters.

    Args:
        dataloader(fluid dataloader): callable returning an iterable of batches
        exe(): executor used to run ``program``
        program(): program that is executed for every batch
        fetchs(dict): maps a name to ``(variable, meter)``
        epoch(int): epoch of training or validation (log only)
        mode(str): 'eval' selects the log format without the epoch;
            'valid' makes the function return the top-1 average
    Returns:
        ``fetchs["top1"][1].avg`` when ``mode == 'valid'``, otherwise None
    """
    pairs = fetchs.values()
    fetch_list = [pair[0] for pair in pairs]
    meters = [pair[1] for pair in pairs]
    for meter in meters:
        meter.reset()

    is_eval = mode == 'eval'
    batch_time = AverageMeter('elapse', '.3f')
    tic = time.time()
    for idx, batch in enumerate(dataloader()):
        metrics = exe.run(program=program, feed=batch, fetch_list=fetch_list)
        batch_time.update(time.time() - tic)
        tic = time.time()
        for position, fetched in enumerate(metrics):
            meters[position].update(fetched[0], len(batch[0]))

        parts = [str(meter.value) + ' ' for meter in meters]
        parts.append(batch_time.value)
        fetchs_str = ''.join(parts)

        if is_eval:
            message = "{:s} step:{:<4d} {:s}s".format(mode, idx, fetchs_str)
        else:
            message = "epoch:{:<3d} {:s} step:{:<4d} {:s}s".format(
                epoch, mode, idx, fetchs_str)
        logger.info(message)

    summary = [str(meter.mean) + ' ' for meter in meters]
    summary.append(batch_time.total)
    end_str = ''.join(summary)
    if is_eval:
        logger.info("END {:s} {:s}s".format(mode, end_str))
    else:
        logger.info("END epoch:{:<3d} {:s} {:s}s".format(epoch, mode,
                                                         end_str))

    # return top1_acc in order to save the best model
    if mode == 'valid':
        return fetchs["top1"][1].avg
def build(config, main_prog, startup_prog, is_train=True,
          is_distributed=True):
    """
    Build a program using a model and an optimizer
        1. create feeds
        2. create a dataloader
        3. create a model
        4. create fetchs
        5. create an optimizer (training only)

    Args:
        config(dict): config
        main_prog(): main program
        startup_prog(): startup program
        is_train(bool): train or valid
        is_distributed(bool): whether to wrap the optimizer for distributed
            training
    Returns:
        dataloader(): a bridge between the model and the data
        fetchs(dict): dict of model outputs (included loss and measures)
        ema(): returned as a third element only when training with
            ``use_ema`` enabled in the config
    """
    with fluid.program_guard(main_prog,
                             startup_prog), fluid.unique_name.guard():
        mix_enabled = config.get('use_mix') and is_train
        distill_enabled = config.get('use_distillation')

        feeds = create_feeds(config.image_shape, use_mix=mix_enabled)
        dataloader = create_dataloader(feeds.values())
        out = create_model(config.ARCHITECTURE, feeds['image'],
                           config.classes_num, is_train)
        fetchs = create_fetchs(
            out,
            feeds,
            config.ARCHITECTURE,
            config.topk,
            config.classes_num,
            epsilon=config.get('ls_epsilon'),
            use_mix=mix_enabled,
            use_distillation=distill_enabled)

        if is_train:
            optimizer = create_optimizer(config)
            # The learning rate is fetched from the raw optimizer, before it
            # is wrapped below.
            learning_rate = optimizer._global_learning_rate()
            fetchs['lr'] = (learning_rate,
                            AverageMeter('lr', 'f', need_avg=False))

            optimizer = mixed_precision_optimizer(config, optimizer)
            if is_distributed:
                optimizer = dist_optimizer(config, optimizer)
            optimizer.minimize(fetchs['loss'][0])

            if config.get('use_ema'):
                step_counter = fluid.layers.learning_rate_scheduler._decay_step_counter(
                )
                ema = ExponentialMovingAverage(
                    config.get('ema_decay'), thres_steps=step_counter)
                ema.update()
                return dataloader, fetchs, ema

    return dataloader, fetchs
def create_metric(out,
                  feeds,
                  architecture,
                  topk=5,
                  classes_num=1000,
                  config=None,
                  use_distillation=False):
    """
    Build the top-1 and top-k accuracy measures for the model output.

    Args:
        out(variable): model output variable; for GoogLeNet it must hold
            three outputs, of which the first is used
        feeds(dict): dict of model input variables (included label)
        architecture(dict): architecture information, ``name`` is read
        topk(int): usually top5; capped at ``classes_num``
        classes_num(int): num of classes
        config(dict): model config (not read by this function)
        use_distillation(bool): when set (and not GoogLeNet), element 1 of
            ``out`` is used
    Returns:
        fetchs(dict): maps 'top1' and 'top<k>' to ``(variable, meter)``
    """
    label = paddle.reshape(feeds['label'], [-1, 1])

    if architecture["name"] == "GoogLeNet":
        assert len(out) == 3, "GoogLeNet should have 3 outputs"
        out = out[0]
    elif use_distillation:
        # just need student label to get metrics
        out = out[1]

    probs = F.softmax(out)
    measures = OrderedDict()

    # set top1 to fetchs
    top1_acc = paddle.metric.accuracy(probs, label=label, k=1)
    measures['top1'] = (top1_acc, AverageMeter('top1', '.4f', need_avg=True))

    # set topk to fetchs
    k = min(topk, classes_num)
    topk_acc = paddle.metric.accuracy(probs, label=label, k=k)
    topk_key = 'top{}'.format(k)
    measures[topk_key] = (topk_acc,
                          AverageMeter(topk_key, '.4f', need_avg=True))

    return measures
def build(config, main_prog, startup_prog, is_train=True):
    """
    Build a program using a model and an optimizer
        1. create feeds
        2. create a dataloader
        3. create a model
        4. create fetchs
        5. create an optimizer (training only)

    Args:
        config(dict): config
        main_prog(): main program
        startup_prog(): startup program
        is_train(bool): train or valid
    Returns:
        dataloader(): a bridge between the model and the data
        fetchs(dict): dict of model outputs (included loss and measures)
    """
    with fluid.program_guard(main_prog,
                             startup_prog), fluid.unique_name.guard():
        mix_enabled = config.get('use_mix') and is_train
        distill_enabled = config.get('use_distillation')

        feeds = create_feeds(config.image_shape, use_mix=mix_enabled)
        dataloader = create_dataloader(feeds.values())
        out = create_model(config.ARCHITECTURE, feeds['image'],
                           config.classes_num)
        fetchs = create_fetchs(
            out,
            feeds,
            config.ARCHITECTURE,
            config.topk,
            config.classes_num,
            epsilon=config.get('ls_epsilon'),
            use_mix=mix_enabled,
            use_distillation=distill_enabled)

        if is_train:
            optimizer = create_optimizer(config)
            # The learning rate is fetched from the raw optimizer, before it
            # is wrapped for distributed training.
            learning_rate = optimizer._global_learning_rate()
            fetchs['lr'] = (learning_rate,
                            AverageMeter('lr', 'f', need_avg=False))
            optimizer = dist_optimizer(config, optimizer)
            optimizer.minimize(fetchs['loss'][0])

    return dataloader, fetchs
def run(dataloader,
        config,
        net,
        optimizer=None,
        lr_scheduler=None,
        epoch=0,
        mode='train',
        vdl_writer=None):
    """
    Run the network over every batch of the dataloader, tracking and logging
    loss, accuracy, learning rate and timing.

    Args:
        dataloader(paddle dataloader): callable returning an iterable of
            batches; ``len(dataloader)`` is also used for the ETA estimate
        config(dict): read for print_interval, use_mix, multilabel,
            classes_num, topk and epochs
        net(): model, forwarded to create_fetchs
        optimizer(): stepped once per batch when mode is 'train'
        lr_scheduler(): optional; stepped per batch when mode is 'train'
        epoch(int): epoch of training or validation
        mode(str): 'train' enables the backward pass and optimizer step;
            'valid' makes the function return an average metric
        vdl_writer(): optional writer for per-step scalars (train mode only)
    Returns:
        When mode is 'valid': the average of 'multilabel_accuracy'
        (multilabel) or of 'top1'. Otherwise None.
    """
    print_interval = config.get("print_interval", 10)
    # Mix augmentation only applies to training.
    use_mix = config.get("use_mix", False) and mode == "train"
    multilabel = config.get("multilabel", False)
    classes_num = config.get("classes_num")

    metric_list = [
        ("loss", AverageMeter(
            'loss', '7.5f', postfix=",")),
        ("lr", AverageMeter(
            'lr', 'f', postfix=",", need_avg=False)),
        ("batch_time", AverageMeter(
            'batch_cost', '.5f', postfix=" s,")),
        ("reader_time", AverageMeter(
            'reader_cost', '.5f', postfix=" s,")),
    ]
    # Accuracy meters are inserted at the front so that they are listed first
    # in the log line; none are created when mix is in use.
    if not use_mix:
        if not multilabel:
            topk_name = 'top{}'.format(config.topk)
            metric_list.insert(
                0, (topk_name, AverageMeter(
                    topk_name, '.5f', postfix=",")))
            metric_list.insert(
                0, ("top1", AverageMeter(
                    "top1", '.5f', postfix=",")))
        else:
            metric_list.insert(
                0, ("multilabel_accuracy", AverageMeter(
                    "multilabel_accuracy", '.5f', postfix=",")))
            metric_list.insert(
                0, ("hamming_distance", AverageMeter(
                    "hamming_distance", '.5f', postfix=",")))

    metric_list = OrderedDict(metric_list)

    tic = time.time()
    for idx, batch in enumerate(dataloader()):
        # avoid statistics from warmup time
        if idx == 10:
            metric_list["batch_time"].reset()
            metric_list["reader_time"].reset()

        # Time spent waiting for the batch since the previous step finished.
        metric_list['reader_time'].update(time.time() - tic)
        batch_size = len(batch[0])
        feeds = create_feeds(batch, use_mix, classes_num, multilabel)
        fetchs = create_fetchs(feeds, net, config, mode)
        if mode == 'train':
            avg_loss = fetchs['loss']
            avg_loss.backward()

            optimizer.step()
            optimizer.clear_grad()
            lr_value = optimizer._global_learning_rate().numpy()[0]
            metric_list['lr'].update(lr_value, batch_size)

            if lr_scheduler is not None:
                if lr_scheduler.update_specified:
                    # Step the scheduler only every update_step_interval
                    # global steps, counted from update_start_step.
                    curr_global_counter = lr_scheduler.step_each_epoch * epoch + idx
                    update = max(
                        0, curr_global_counter - lr_scheduler.update_start_step
                    ) % lr_scheduler.update_step_interval == 0
                    if update:
                        lr_scheduler.step()
                else:
                    lr_scheduler.step()

        # NOTE(review): every key of fetchs must already exist in metric_list,
        # otherwise this raises KeyError -- confirm against create_fetchs.
        for name, fetch in fetchs.items():
            metric_list[name].update(fetch.numpy()[0], batch_size)
        metric_list["batch_time"].update(time.time() - tic)
        tic = time.time()

        if vdl_writer and mode == "train":
            # total_step is a module-level counter shared across calls.
            global total_step
            logger.scaler(
                name="lr", value=lr_value, step=total_step, writer=vdl_writer)
            for name, fetch in fetchs.items():
                logger.scaler(
                    name="train_{}".format(name),
                    value=fetch.numpy()[0],
                    step=total_step,
                    writer=vdl_writer)
            total_step += 1

        # Timing meters are reported through .mean, all others through .value.
        fetchs_str = ' '.join([
            str(metric_list[key].mean)
            if "time" in key else str(metric_list[key].value)
            for key in metric_list
        ])

        if idx % print_interval == 0:
            ips_info = "ips: {:.5f} images/sec".format(
                batch_size / metric_list["batch_time"].avg)

            if mode == "train":
                epoch_str = "epoch:{:<3d}".format(epoch)
                step_str = "{:s} step:{:<4d}".format(mode, idx)
                # Remaining steps (this and later epochs) times the average
                # batch time.
                eta_sec = ((config["epochs"] - epoch) * len(dataloader) - idx
                           ) * metric_list["batch_time"].avg
                eta_str = "eta: {:s}".format(
                    str(datetime.timedelta(seconds=int(eta_sec))))
                logger.info("{:s}, {:s}, {:s} {:s}, {:s}".format(
                    epoch_str, step_str, fetchs_str, ips_info, eta_str))
            else:
                logger.info("{:s} step:{:<4d}, {:s} {:s}".format(
                    mode, idx, fetchs_str, ips_info))

    end_str = ' '.join([str(m.mean) for m in metric_list.values()] +
                       [metric_list['batch_time'].total])
    # NOTE(review): batch_size is only assigned inside the loop, so this line
    # raises if the dataloader yielded no batches; it also uses the size of
    # the last batch for the whole-epoch throughput.
    ips_info = "ips: {:.5f} images/sec.".format(
        batch_size * metric_list["batch_time"].count /
        metric_list["batch_time"].sum)

    if mode == 'eval':
        logger.info("END {:s} {:s} {:s}".format(mode, end_str, ips_info))
    else:
        end_epoch_str = "END epoch:{:<3d}".format(epoch)
        logger.info("{:s} {:s} {:s} {:s}".format(end_epoch_str, mode, end_str,
                                                 ips_info))

    # return top1_acc in order to save the best model
    if mode == 'valid':
        if multilabel:
            return metric_list['multilabel_accuracy'].avg
        else:
            return metric_list['top1'].avg
def run(dataloader,
        exe,
        program,
        feeds,
        fetchs,
        epoch=0,
        mode='train',
        config=None,
        vdl_writer=None,
        lr_scheduler=None):
    """
    Feed data to the model and fetch the measures and loss

    Args:
        dataloader(paddle io dataloader): iterated directly when config
            'use_dali' is set, otherwise called first to obtain the iterator
        exe(): executor used to run ``program``
        program(): program that is executed for every batch
        feeds(dict): input variables; their ``name`` keys the feed dict in
            the non-DALI path
        fetchs(dict): dict of measures and the loss, name -> (variable, meter)
        epoch(int): epoch of training or validation
        mode(str): 'train' records the learning rate; 'valid' selects the
            plain log format and makes the function return the top-1 average
        config(dict): read for topk, use_mix, use_dali and print_interval;
            although it defaults to None it is dereferenced unconditionally
        vdl_writer(): optional writer for the per-step loss scalar
        lr_scheduler(): optional; stepped once per batch when given
    Returns:
        ``fetchs["top1"][1].avg`` when ``mode == 'valid'``, otherwise None
    """
    fetch_list = [f[0] for f in fetchs.values()]
    metric_list = [
        ("lr", AverageMeter('lr', 'f', postfix=",", need_avg=False)),
        ("batch_time", AverageMeter('batch_cost', '.5f', postfix=" s,")),
        ("reader_time", AverageMeter('reader_cost', '.5f', postfix=" s,")),
    ]
    topk_name = 'top{}'.format(config.topk)
    # Reuse the meters stored in fetchs; they are inserted at the front so
    # they are listed first in the log line.
    metric_list.insert(0, ("loss", fetchs["loss"][1]))
    use_mix = config.get("use_mix", False) and mode == "train"
    if not use_mix:
        metric_list.insert(0, (topk_name, fetchs[topk_name][1]))
        metric_list.insert(0, ("top1", fetchs["top1"][1]))

    metric_list = OrderedDict(metric_list)

    for m in metric_list.values():
        m.reset()

    use_dali = config.get('use_dali', False)
    dataloader = dataloader if use_dali else dataloader()
    tic = time.time()

    idx = 0
    batch_size = None
    while True:
        # The DALI maybe raise RuntimeError for some particular images, such as ImageNet1k/n04418357_26036.JPEG
        try:
            batch = next(dataloader)
        except StopIteration:
            break
        except RuntimeError:
            # NOTE(review): there is no retry limit, so a loader that keeps
            # raising RuntimeError makes this loop spin forever.
            logger.warning(
                "Except RuntimeError when reading data from dataloader, try to read once again..."
            )
            continue
        # idx is 1-based from here on: it is incremented before first use.
        idx += 1
        # ignore the warmup iters
        if idx == 5:
            metric_list["batch_time"].reset()
            metric_list["reader_time"].reset()

        metric_list['reader_time'].update(time.time() - tic)

        if use_dali:
            batch_size = batch[0]["feed_image"].shape()[0]
            feed_dict = batch[0]
        else:
            batch_size = batch[0].shape()[0]
            # The comprehension's own `idx` is scoped to the comprehension and
            # does not overwrite the step counter above.
            feed_dict = {
                key.name: batch[idx]
                for idx, key in enumerate(feeds.values())
            }
        metrics = exe.run(program=program,
                          feed=feed_dict,
                          fetch_list=fetch_list)

        for name, m in zip(fetchs.keys(), metrics):
            metric_list[name].update(np.mean(m), batch_size)
        metric_list["batch_time"].update(time.time() - tic)
        if mode == "train":
            # NOTE(review): fails if mode is 'train' and lr_scheduler is None.
            metric_list['lr'].update(lr_scheduler.get_lr())

        # Timing meters are reported through .mean, all others through .value.
        fetchs_str = ' '.join([
            str(metric_list[key].mean)
            if "time" in key else str(metric_list[key].value)
            for key in metric_list
        ])
        ips_info = " ips: {:.5f} images/sec.".format(
            batch_size / metric_list["batch_time"].avg)
        fetchs_str += ips_info

        if lr_scheduler is not None:
            if lr_scheduler.update_specified:
                # Step the scheduler only every update_step_interval global
                # steps, counted from update_start_step.
                curr_global_counter = lr_scheduler.step_each_epoch * epoch + idx
                update = max(
                    0, curr_global_counter - lr_scheduler.update_start_step
                ) % lr_scheduler.update_step_interval == 0
                if update:
                    lr_scheduler.step()
            else:
                lr_scheduler.step()

        if vdl_writer:
            # total_step is a module-level counter shared across calls.
            global total_step
            logger.scaler('loss', metrics[0][0], total_step, vdl_writer)
            total_step += 1
        if mode == 'valid':
            if idx % config.get('print_interval', 10) == 0:
                logger.info("{:s} step:{:<4d} {:s}".format(
                    mode, idx, fetchs_str))
        else:
            epoch_str = "epoch:{:<3d}".format(epoch)
            step_str = "{:s} step:{:<4d}".format(mode, idx)

            if idx % config.get('print_interval', 10) == 0:
                # NOTE(review): idx starts at 1 in this loop, so the
                # `idx == 0` HEADER colouring branch is never taken.
                logger.info("{:s} {:s} {:s}".format(
                    logger.coloring(epoch_str, "HEADER")
                    if idx == 0 else epoch_str,
                    logger.coloring(step_str, "PURPLE"),
                    logger.coloring(fetchs_str, 'OKGREEN')))

        tic = time.time()

    end_str = ' '.join([str(m.mean) for m in metric_list.values()] +
                       [metric_list["batch_time"].total])
    # NOTE(review): batch_size is still None here if no batch was read, which
    # makes the multiplication below raise.
    ips_info = "ips: {:.5f} images/sec.".format(
        batch_size * metric_list["batch_time"].count /
        metric_list["batch_time"].sum)
    if mode == 'valid':
        logger.info("END {:s} {:s} {:s}".format(mode, end_str, ips_info))
    else:
        end_epoch_str = "END epoch:{:<3d}".format(epoch)
        logger.info("{:s} {:s} {:s} {:s}".format(end_epoch_str, mode, end_str,
                                                 ips_info))
    if use_dali:
        dataloader.reset()

    # return top1_acc in order to save the best model
    if mode == 'valid':
        return fetchs["top1"][1].avg
def create_fetchs(out,
                  feeds,
                  architecture,
                  topk=5,
                  epsilon=None,
                  class_num=None,
                  use_mix=False,
                  config=None,
                  mode="Train"):
    """
    Assemble the fetch dict: the loss, plus the configured metrics unless mix
    is enabled.

    Args:
        out(variable): model output variable
        feeds(dict): dict of model input variables; 'target' is read when
            ``use_mix`` is set, 'label' otherwise
        architecture(dict): architecture information (not read here)
        topk(int): usually top5 (not read here)
        epsilon(float): parameter for label smoothing (not read here)
        class_num(int): the class number of network, required if use_mix
        use_mix(bool): whether to use mix (include mixup, cutmix, fmix)
        config(dict): model config; ``config["Loss"][mode]`` and
            ``config["Metric"][mode]`` select the loss and metrics
        mode(str): config section to use, e.g. "Train"
    Returns:
        fetchs(dict): maps names to ``(variable, meter)`` pairs
    Raises:
        Exception: if ``use_mix`` is set but ``class_num`` is None
    """
    # build loss
    if not use_mix:
        target = paddle.reshape(feeds['label'], [-1, 1])
    else:
        if class_num is None:
            msg = "When use MixUp, CutMix and so on, you must set class_num."
            logger.error(msg)
            raise Exception(msg)
        target = paddle.reshape(feeds['target'], [-1, class_num])

    compute_loss = build_loss(config["Loss"][mode])
    losses = compute_loss(out, target)

    fetchs = OrderedDict()
    fetchs['loss'] = (losses["loss"],
                      AverageMeter('loss', '7.4f', need_avg=True))
    if use_mix:
        return fetchs

    # build metric
    compute_metrics = build_metrics(config["Metric"][mode])
    metric_dict = compute_metrics(out, target)
    for name in metric_dict:
        # Outside of training, average each metric across all workers.
        if mode != "Train" and paddle.distributed.get_world_size() > 1:
            paddle.distributed.all_reduce(
                metric_dict[name], op=paddle.distributed.ReduceOp.SUM)
            metric_dict[name] = (
                metric_dict[name] / paddle.distributed.get_world_size())
        fetchs[name] = (metric_dict[name],
                        AverageMeter(name, '7.4f', need_avg=True))
    return fetchs
def run(dataloader,
        exe,
        program,
        feeds,
        fetchs,
        epoch=0,
        mode='train',
        config=None,
        vdl_writer=None,
        lr_scheduler=None,
        profiler_options=None):
    """
    Feed data to the model and fetch the measures and loss

    Args:
        dataloader(paddle io dataloader): iterated directly when
            ``config["Global"]["use_dali"]`` is set, otherwise called first
            to obtain the iterator
        exe(): executor used to run ``program``
        program(): program that is executed for every batch
        feeds(dict): input variables; their ``name`` keys the feed dict in
            the non-DALI path
        fetchs(dict): dict of measures and the loss, name -> (variable, meter)
        epoch(int): epoch of training or evaluation
        mode(str): 'train' records the learning rate; 'eval' selects the
            plain log format and makes the function return the top-1 average
        config(dict): although it defaults to None it is dereferenced
            unconditionally
        vdl_writer(): optional writer for the per-step loss scalar
        lr_scheduler(): optional; stepped once per batch when given
        profiler_options(): forwarded to profiler.add_profiler_step each step
    Returns:
        ``fetchs["top1"][1].avg`` when ``mode == 'eval'``, otherwise None
    """
    fetch_list = [f[0] for f in fetchs.values()]
    # Log order: lr first, then every fetched measure, then the timers.
    metric_dict = OrderedDict([("lr", AverageMeter('lr',
                                                   'f',
                                                   postfix=",",
                                                   need_avg=False))])

    for k in fetchs:
        metric_dict[k] = fetchs[k][1]

    metric_dict["batch_time"] = AverageMeter('batch_cost',
                                             '.5f',
                                             postfix=" s,")
    metric_dict["reader_time"] = AverageMeter('reader_cost',
                                              '.5f',
                                              postfix=" s,")

    for m in metric_dict.values():
        m.reset()

    use_dali = config["Global"].get('use_dali', False)
    tic = time.time()

    if not use_dali:
        dataloader = dataloader()

    idx = 0
    batch_size = None
    while True:
        # The DALI maybe raise RuntimeError for some particular images, such as ImageNet1k/n04418357_26036.JPEG
        try:
            batch = next(dataloader)
        except StopIteration:
            break
        except RuntimeError:
            # NOTE(review): there is no retry limit, so a loader that keeps
            # raising RuntimeError makes this loop spin forever.
            logger.warning(
                "Except RuntimeError when reading data from dataloader, try to read once again..."
            )
            continue
        # idx is 1-based from here on: it is incremented before first use.
        idx += 1
        # ignore the warmup iters
        if idx == 5:
            metric_dict["batch_time"].reset()
            metric_dict["reader_time"].reset()

        metric_dict['reader_time'].update(time.time() - tic)

        profiler.add_profiler_step(profiler_options)

        if use_dali:
            batch_size = batch[0]["data"].shape()[0]
            feed_dict = batch[0]
        else:
            batch_size = batch[0].shape()[0]
            # The comprehension's own `idx` is scoped to the comprehension and
            # does not overwrite the step counter above.
            feed_dict = {
                key.name: batch[idx]
                for idx, key in enumerate(feeds.values())
            }

        metrics = exe.run(program=program,
                          feed=feed_dict,
                          fetch_list=fetch_list)

        for name, m in zip(fetchs.keys(), metrics):
            metric_dict[name].update(np.mean(m), batch_size)
        metric_dict["batch_time"].update(time.time() - tic)
        if mode == "train":
            # NOTE(review): fails if mode is 'train' and lr_scheduler is None.
            metric_dict['lr'].update(lr_scheduler.get_lr())

        # Timing meters are reported through .mean, all others through .value.
        fetchs_str = ' '.join([
            str(metric_dict[key].mean)
            if "time" in key else str(metric_dict[key].value)
            for key in metric_dict
        ])
        ips_info = " ips: {:.5f} images/sec.".format(
            batch_size / metric_dict["batch_time"].avg)
        fetchs_str += ips_info

        if lr_scheduler is not None:
            lr_scheduler.step()

        if vdl_writer:
            # total_step is a module-level counter shared across calls.
            global total_step
            logger.scaler('loss', metrics[0][0], total_step, vdl_writer)
            total_step += 1
        # NOTE(review): 'use_dali' is read from config["Global"] above, but
        # 'print_interval' is read from the top level of config here --
        # confirm which level actually holds it.
        if mode == 'eval':
            if idx % config.get('print_interval', 10) == 0:
                logger.info("{:s} step:{:<4d} {:s}".format(
                    mode, idx, fetchs_str))
        else:
            epoch_str = "epoch:{:<3d}".format(epoch)
            step_str = "{:s} step:{:<4d}".format(mode, idx)

            if idx % config.get('print_interval', 10) == 0:
                logger.info("{:s} {:s} {:s}".format(epoch_str, step_str,
                                                    fetchs_str))

        tic = time.time()

    end_str = ' '.join([str(m.mean) for m in metric_dict.values()] +
                       [metric_dict["batch_time"].total])
    # NOTE(review): batch_size is still None here if no batch was read, which
    # makes the division below raise.
    ips_info = "ips: {:.5f} images/sec.".format(batch_size /
                                                metric_dict["batch_time"].avg)
    if mode == 'eval':
        logger.info("END {:s} {:s} {:s}".format(mode, end_str, ips_info))
    else:
        end_epoch_str = "END epoch:{:<3d}".format(epoch)
        logger.info("{:s} {:s} {:s} {:s}".format(end_epoch_str, mode, end_str,
                                                 ips_info))
    if use_dali:
        dataloader.reset()

    # return top1_acc in order to save the best model
    if mode == 'eval':
        return fetchs["top1"][1].avg
def update_loss(trainer, loss_dict, batch_size):
    """Fold the losses of one batch into ``trainer.output_info``.

    Each entry of ``loss_dict`` updates the meter of the same name (created
    on first use) with the first element of its ``numpy()`` value, weighted
    by ``batch_size``.
    """
    # update_output_info
    for name, loss in loss_dict.items():
        meters = trainer.output_info
        if name not in meters:
            meters[name] = AverageMeter(name, '7.5f')
        meters[name].update(loss.numpy()[0], batch_size)
def run(dataloader,
        config,
        net,
        optimizer=None,
        lr_scheduler=None,
        epoch=0,
        mode='train'):
    """
    Run the network over every batch of the dataloader, tracking and logging
    loss, accuracy, learning rate and timing.

    Args:
        dataloader(paddle dataloader): callable returning an iterable of
            batches
        config(dict): read for print_interval, use_mix and topk
        net(): model, forwarded to create_fetchs
        optimizer(): stepped once per batch when mode is 'train'
        lr_scheduler(): optional; stepped per batch when mode is 'train'
        epoch(int): epoch of training or validation
        mode(str): 'train' enables the backward pass and optimizer step;
            'eval' selects the uncoloured log format; 'valid' makes the
            function return the top-1 average
    Returns:
        The average of the 'top1' meter when ``mode == 'valid'``, else None
    """
    print_interval = config.get("print_interval", 10)
    use_mix = config.get("use_mix", False) and mode == "train"

    # Accuracy meters (when present) come first in the log line.
    meter_specs = []
    if not use_mix:
        topk_name = 'top{}'.format(config.topk)
        meter_specs.append(("top1", AverageMeter("top1", '.5f',
                                                 postfix=",")))
        meter_specs.append((topk_name,
                            AverageMeter(topk_name, '.5f', postfix=",")))
    meter_specs.extend([
        ("loss", AverageMeter('loss', '7.5f', postfix=",")),
        ("lr", AverageMeter('lr', 'f', postfix=",", need_avg=False)),
        ("batch_time", AverageMeter('batch_cost', '.5f', postfix=" s,")),
        ("reader_time", AverageMeter('reader_cost', '.5f', postfix=" s,")),
    ])
    metric_list = OrderedDict(meter_specs)
    batch_meter = metric_list["batch_time"]
    reader_meter = metric_list["reader_time"]

    tic = time.time()
    for idx, batch in enumerate(dataloader()):
        # avoid statistics from warmup time
        if idx == 10:
            batch_meter.reset()
            reader_meter.reset()

        reader_meter.update(time.time() - tic)
        batch_size = len(batch[0])
        feeds = create_feeds(batch, use_mix)
        fetchs = create_fetchs(feeds, net, config, mode)

        if mode == 'train':
            fetchs['loss'].backward()
            optimizer.step()
            optimizer.clear_grad()
            metric_list['lr'].update(
                optimizer._global_learning_rate().numpy()[0], batch_size)

            if lr_scheduler is not None:
                if not lr_scheduler.update_specified:
                    lr_scheduler.step()
                else:
                    # Step only every update_step_interval global steps,
                    # counted from update_start_step.
                    global_counter = (
                        lr_scheduler.step_each_epoch * epoch + idx)
                    offset = max(
                        0, global_counter - lr_scheduler.update_start_step)
                    if offset % lr_scheduler.update_step_interval == 0:
                        lr_scheduler.step()

        for name, fetch in fetchs.items():
            metric_list[name].update(fetch.numpy()[0], batch_size)
        batch_meter.update(time.time() - tic)
        tic = time.time()

        # Timing meters are reported through .mean, the rest through .value.
        rendered = []
        for key in metric_list:
            meter = metric_list[key]
            rendered.append(
                str(meter.mean) if "time" in key else str(meter.value))
        fetchs_str = ' '.join(rendered)

        if idx % print_interval != 0:
            continue

        ips_info = "ips: {:.5f} images/sec.".format(batch_size /
                                                    batch_meter.avg)
        if mode == 'eval':
            logger.info("{:s} step:{:<4d}, {:s} {:s}".format(
                mode, idx, fetchs_str, ips_info))
            continue

        epoch_str = "epoch:{:<3d}".format(epoch)
        step_str = "{:s} step:{:<4d}".format(mode, idx)
        logger.info("{:s}, {:s}, {:s} {:s}".format(
            logger.coloring(epoch_str, "HEADER") if idx == 0 else epoch_str,
            logger.coloring(step_str, "PURPLE"),
            logger.coloring(fetchs_str, 'OKGREEN'),
            logger.coloring(ips_info, 'OKGREEN')))

    summary = [str(meter.mean) for meter in metric_list.values()]
    summary.append(metric_list['batch_time'].total)
    end_str = ' '.join(summary)
    ips_info = "ips: {:.5f} images/sec.".format(
        batch_size * metric_list["batch_time"].count /
        metric_list["batch_time"].sum)

    if mode == 'eval':
        logger.info("END {:s} {:s} {:s}".format(mode, end_str, ips_info))
    else:
        end_epoch_str = "END epoch:{:<3d}".format(epoch)
        logger.info("{:s} {:s} {:s} {:s}".format(
            logger.coloring(end_epoch_str, "RED"),
            logger.coloring(mode, "PURPLE"),
            logger.coloring(end_str, "OKGREEN"),
            logger.coloring(ips_info, "OKGREEN"), ))

    # return top1_acc in order to save the best model
    if mode == 'valid':
        return metric_list['top1'].avg
def run(dataloader,
        exe,
        program,
        fetchs,
        epoch=0,
        mode='train',
        config=None,
        vdl_writer=None):
    """
    Run one pass over the dataloader, updating and logging the meters.

    Args:
        dataloader(fluid dataloader): iterated directly when config
            'use_dali' is truthy, otherwise ``dataloader()()`` is iterated
        exe(): executor used to run ``program``
        program(): program that is executed for every batch
        fetchs(dict): maps a name to ``(variable, meter)``
        epoch(int): epoch of training or validation (log only)
        mode(str): 'eval' selects the log format without the epoch;
            'valid' makes the function return the top-1 average
        config(dict): read for use_dali and print_interval; although it
            defaults to None it is dereferenced unconditionally
        vdl_writer(): optional writer for the per-step loss scalar
    Returns:
        ``fetchs["top1"][1].avg`` when ``mode == 'valid'``, otherwise None
    """
    # Only touched when a vdl_writer is supplied.
    global total_step

    pairs = fetchs.values()
    fetch_list = [pair[0] for pair in pairs]
    meters = [pair[1] for pair in pairs]
    for meter in meters:
        meter.reset()
    batch_time = AverageMeter('elapse', '.5f', need_avg=True)

    tic = time.time()
    if not config.get('use_dali'):
        dataloader = dataloader()()

    for idx, batch in enumerate(dataloader):
        # Drop the statistics gathered during the warmup steps.
        if idx == 10:
            for meter in meters:
                meter.reset()
            batch_time.reset()

        batch_size = batch[0]["feed_image"].shape()[0]
        metrics = exe.run(program=program, feed=batch, fetch_list=fetch_list)
        batch_time.update(time.time() - tic)
        for position, fetched in enumerate(metrics):
            meters[position].update(np.mean(fetched), batch_size)

        parts = [str(meter.value) + ' ' for meter in meters]
        parts.append(batch_time.mean)
        fetchs_str = ''.join(parts) + 's'
        fetchs_str += " ips: {:.5f} images/sec.".format(batch_size /
                                                        batch_time.avg)

        if vdl_writer:
            logger.scaler('loss', metrics[0][0], total_step, vdl_writer)
            total_step += 1

        if mode == 'eval':
            if idx % config.get('print_interval', 10) == 0:
                logger.info("{:s} step:{:<4d} {:s}".format(
                    mode, idx, fetchs_str))
        else:
            epoch_str = "epoch:{:<3d}".format(epoch)
            step_str = "{:s} step:{:<4d}".format(mode, idx)
            if idx % config.get('print_interval', 10) == 0:
                logger.info("{:s} {:s} {:s}".format(epoch_str, step_str,
                                                    fetchs_str))

        tic = time.time()

    if config.get('use_dali'):
        dataloader.reset()

    summary = [str(meter.mean) + ' ' for meter in meters]
    summary.append(batch_time.total)
    end_str = ''.join(summary) + 's'
    ips_info = "ips: {:.5f} images/sec.".format(
        batch_size * batch_time.count / batch_time.sum)

    if mode == 'eval':
        logger.info("END {:s} {:s}s {:s}".format(mode, end_str, ips_info))
    else:
        end_epoch_str = "END epoch:{:<3d}".format(epoch)
        logger.info("{:s} {:s} {:s} {:s}".format(end_epoch_str, mode, end_str,
                                                 ips_info))

    # return top1_acc in order to save the best model
    if mode == 'valid':
        return fetchs["top1"][1].avg
def train(self):
    """Run the full training loop.

    Resumes from ``Global.checkpoints`` when configured, trains one epoch at
    a time through ``self.train_epoch_func``, optionally evaluates every
    ``Global.eval_interval`` epochs (saving the best model so far), saves a
    numbered snapshot every ``Global.save_interval`` epochs and always
    refreshes the "latest" snapshot. The VisualDL writer, if any, is closed
    at the end.
    """
    assert self.mode == "train"
    global_cfg = self.config["Global"]
    print_batch_step = global_cfg['print_batch_step']
    save_interval = global_cfg["save_interval"]
    best_metric = {
        "metric": 0.0,
        "epoch": 0,
    }
    # key:
    # val: metrics list word
    self.output_info = dict()
    self.time_info = {
        "batch_cost": AverageMeter("batch_cost", '.5f', postfix=" s,"),
        "reader_cost": AverageMeter("reader_cost", ".5f", postfix=" s,"),
    }
    # global iter counter
    self.global_step = 0

    if global_cfg["checkpoints"] is not None:
        metric_info = init_model(global_cfg, self.model, self.optimizer)
        if metric_info is not None:
            best_metric.update(metric_info)

    # One iteration fewer is used on Windows.
    if platform.system() == "Windows":
        self.max_iter = len(self.train_dataloader) - 1
    else:
        self.max_iter = len(self.train_dataloader)

    first_epoch = best_metric["epoch"] + 1
    for epoch_id in range(first_epoch, global_cfg["epochs"] + 1):
        acc = 0.0
        # for one epoch train
        self.train_epoch_func(self, epoch_id, print_batch_step)

        if self.use_dali:
            self.train_dataloader.reset()

        metric_msg = ", ".join(
            "{}: {:.5f}".format(name, self.output_info[name].avg)
            for name in self.output_info)
        logger.info("[Train][Epoch {}/{}][Avg]{}".format(
            epoch_id, global_cfg["epochs"], metric_msg))
        self.output_info.clear()

        # eval model and save model if possible
        should_eval = (global_cfg["eval_during_train"] and
                       epoch_id % global_cfg["eval_interval"] == 0)
        if should_eval:
            acc = self.eval(epoch_id)
            if acc > best_metric["metric"]:
                best_metric["metric"] = acc
                best_metric["epoch"] = epoch_id
                save_load.save_model(
                    self.model,
                    self.optimizer,
                    best_metric,
                    self.output_dir,
                    model_name=self.config["Arch"]["name"],
                    prefix="best_model")
            logger.info("[Eval][Epoch {}][best metric: {}]".format(
                epoch_id, best_metric["metric"]))
            logger.scaler(
                name="eval_acc",
                value=acc,
                step=epoch_id,
                writer=self.vdl_writer)

            # Switch the model back to training mode after evaluation.
            self.model.train()

        # save model
        if epoch_id % save_interval == 0:
            save_load.save_model(
                self.model,
                self.optimizer, {"metric": acc,
                                 "epoch": epoch_id},
                self.output_dir,
                model_name=self.config["Arch"]["name"],
                prefix="epoch_{}".format(epoch_id))
        # save the latest model
        save_load.save_model(
            self.model,
            self.optimizer, {"metric": acc,
                             "epoch": epoch_id},
            self.output_dir,
            model_name=self.config["Arch"]["name"],
            prefix="latest")

    if self.vdl_writer is not None:
        self.vdl_writer.close()
def classification_eval(engine, epoch_id=0):
    """Evaluate ``engine.model`` on ``engine.eval_dataloader`` and log results.

    Every batch is passed through the model (inside ``paddle.amp.auto_cast``
    when ``engine.amp`` is set). When more than one distributed worker is
    running, labels and predictions are all-gathered across workers, and the
    batch that pushes the accumulated sample count past the dataset size is
    trimmed so that repeated samples are not counted. Loss and metric values
    are accumulated in ``AverageMeter`` objects, each updated with the batch
    value and the number of samples it covers.

    Args:
        engine: Object providing ``config``, ``model``, ``eval_dataloader``,
            ``use_dali``, ``amp``, ``eval_loss_func`` and
            ``eval_metric_func``.
        epoch_id (int): Epoch number; used only in log messages.

    Returns:
        ``-1`` when ``engine.eval_metric_func`` is ``None``; otherwise the
        ``avg`` of the meter for the first key of the metric dict.

    Raises:
        Exception: In distributed mode, when the model output is a dict that
            contains neither a "Student" nor a "logits" key.
    """
    output_info = dict()
    time_info = {
        "batch_cost": AverageMeter(
            "batch_cost", '.5f', postfix=" s,"),
        "reader_cost": AverageMeter(
            "reader_cost", ".5f", postfix=" s,"),
    }
    print_batch_step = engine.config["Global"]["print_batch_step"]

    metric_key = None
    tic = time.time()
    accum_samples = 0
    total_samples = len(
        engine.eval_dataloader.
        dataset) if not engine.use_dali else engine.eval_dataloader.size
    # One iteration fewer is consumed on Windows.
    max_iter = len(engine.eval_dataloader) - 1 if platform.system(
    ) == "Windows" else len(engine.eval_dataloader)
    for iter_id, batch in enumerate(engine.eval_dataloader):
        if iter_id >= max_iter:
            break
        # Discard the timings of the first five iterations.
        if iter_id == 5:
            for key in time_info:
                time_info[key].reset()
        if engine.use_dali:
            batch = [
                paddle.to_tensor(batch[0]['data']),
                paddle.to_tensor(batch[0]['label'])
            ]
        time_info["reader_cost"].update(time.time() - tic)
        batch_size = batch[0].shape[0]
        batch[0] = paddle.to_tensor(batch[0]).astype("float32")
        if not engine.config["Global"].get("use_multilabel", False):
            batch[1] = batch[1].reshape([-1, 1]).astype("int64")

        # image input
        if engine.amp:
            amp_level = engine.config['AMP'].get("level", "O1").upper()
            with paddle.amp.auto_cast(
                    custom_black_list={
                        "flatten_contiguous_range", "greater_than"
                    },
                    level=amp_level):
                out = engine.model(batch[0])
        else:
            out = engine.model(batch[0])

        # just for DistributedBatchSampler issue: repeat sampling
        current_samples = batch_size * paddle.distributed.get_world_size()
        accum_samples += current_samples

        # gather Tensor when distributed
        if paddle.distributed.get_world_size() > 1:
            label_list = []
            paddle.distributed.all_gather(label_list, batch[1])
            labels = paddle.concat(label_list, 0)

            # Unwrap dict outputs down to the logits before gathering.
            if isinstance(out, dict):
                if "Student" in out:
                    out = out["Student"]
                    if isinstance(out, dict):
                        out = out["logits"]
                elif "logits" in out:
                    out = out["logits"]
                else:
                    msg = "Error: Wrong key in out!"
                    raise Exception(msg)
            if isinstance(out, list):
                preds = []
                for x in out:
                    pred_list = []
                    paddle.distributed.all_gather(pred_list, x)
                    pred_x = paddle.concat(pred_list, 0)
                    preds.append(pred_x)
            else:
                pred_list = []
                paddle.distributed.all_gather(pred_list, out)
                preds = paddle.concat(pred_list, 0)

            if accum_samples > total_samples and not engine.use_dali:
                # Samples of this batch that are still within the dataset
                # size; the remainder are repeats.
                valid_samples = total_samples + current_samples - accum_samples
                if isinstance(preds, list):
                    # Trim every gathered output tensor. Slicing the list
                    # itself would leave the repeats in place (or drop whole
                    # outputs) while the labels are trimmed.
                    preds = [pred[:valid_samples] for pred in preds]
                else:
                    preds = preds[:valid_samples]
                labels = labels[:valid_samples]
                current_samples = valid_samples
        else:
            labels = batch[1]
            preds = out

        # calc loss
        if engine.eval_loss_func is not None:
            if engine.amp and engine.config["AMP"].get("use_fp16_test", False):
                amp_level = engine.config['AMP'].get("level", "O1").upper()
                with paddle.amp.auto_cast(
                        custom_black_list={
                            "flatten_contiguous_range", "greater_than"
                        },
                        level=amp_level):
                    loss_dict = engine.eval_loss_func(preds, labels)
            else:
                loss_dict = engine.eval_loss_func(preds, labels)

            for key in loss_dict:
                if key not in output_info:
                    output_info[key] = AverageMeter(key, '7.5f')
                output_info[key].update(loss_dict[key].numpy()[0],
                                        current_samples)
        # calc metric
        if engine.eval_metric_func is not None:
            metric_dict = engine.eval_metric_func(preds, labels)
            for key in metric_dict:
                # The first metric key seen becomes the returned metric.
                if metric_key is None:
                    metric_key = key
                if key not in output_info:
                    output_info[key] = AverageMeter(key, '7.5f')

                output_info[key].update(metric_dict[key].numpy()[0],
                                        current_samples)

        time_info["batch_cost"].update(time.time() - tic)

        if iter_id % print_batch_step == 0:
            time_msg = "s, ".join([
                "{}: {:.5f}".format(key, time_info[key].avg)
                for key in time_info
            ])

            ips_msg = "ips: {:.5f} images/sec".format(
                batch_size / time_info["batch_cost"].avg)

            metric_msg = ", ".join([
                "{}: {:.5f}".format(key, output_info[key].val)
                for key in output_info
            ])
            logger.info("[Eval][Epoch {}][Iter: {}/{}]{}, {}, {}".format(
                epoch_id, iter_id,
                len(engine.eval_dataloader), metric_msg, time_msg, ips_msg))

        tic = time.time()
    if engine.use_dali:
        engine.eval_dataloader.reset()
    metric_msg = ", ".join([
        "{}: {:.5f}".format(key, output_info[key].avg) for key in output_info
    ])
    logger.info("[Eval][Epoch {}][Avg]{}".format(epoch_id, metric_msg))

    # do not try to save best eval.model
    if engine.eval_metric_func is None:
        return -1
    # return 1st metric in the dict
    return output_info[metric_key].avg
def run(dataloader,
        exe,
        program,
        feeds,
        fetchs,
        epoch=0,
        mode='train',
        config=None,
        vdl_writer=None,
        lr_scheduler=None):
    """
    Feed data to the model and fetch the measures and loss

    Args:
        dataloader(paddle io dataloader): called to obtain the batch iterable,
            unless config['use_dali'] is truthy, in which case it is iterated
            directly and reset() after the loop
        exe(): executor whose run(program=..., feed=..., fetch_list=...)
            is called once per batch
        program(): program handed to exe.run
        feeds(dict): values expose `.name`; the i-th value names the i-th
            element of each batch (unused when DALI is enabled)
        fetchs(dict): dict of measures and the loss; each value is a pair of
            (fetch target, meter). Must contain "top1" when mode is 'valid'
        epoch(int): epoch of training or validation
        mode(str): 'train' or 'valid'; selects logging format, lr reporting
            and the return value
        config(dict): optional settings; 'use_dali' (default False) and
            'print_interval' (default 10) are read. None is treated as empty
        vdl_writer(): when truthy, the first fetched value is logged as
            'loss' at the module-level `total_step`, which is then incremented
        lr_scheduler(): when given, it is stepped after every batch (or on
            its own schedule if `update_specified` is set) and, in train
            mode, its current lr is reported

    Returns:
        The `avg` of the "top1" meter when mode is 'valid', otherwise None
    """
    # Step counter for VisualDL, kept at module level across calls.
    global total_step

    # `config` defaults to None; fall back to an empty mapping so the
    # `.get` lookups below use their defaults.
    if config is None:
        config = {}

    fetch_list = [f[0] for f in fetchs.values()]
    metric_list = [f[1] for f in fetchs.values()]
    # The learning rate can only be reported when a scheduler was supplied.
    track_lr = mode == "train" and lr_scheduler is not None
    if track_lr:
        metric_list.append(AverageMeter('lr', 'f', need_avg=False))
    for m in metric_list:
        m.reset()
    batch_time = AverageMeter('elapse', '.3f')
    use_dali = config.get('use_dali', False)
    print_interval = config.get('print_interval', 10)
    dataloader = dataloader if use_dali else dataloader()

    # Stays 0 if the dataloader yields nothing, so the summary below is
    # still well defined.
    batch_size = 0
    tic = time.time()
    for idx, batch in enumerate(dataloader):
        # ignore the warmup iters
        if idx == 5:
            batch_time.reset()
        if use_dali:
            batch_size = batch[0]["feed_image"].shape()[0]
            feed_dict = batch[0]
        else:
            batch_size = batch[0].shape()[0]
            feed_dict = {
                var.name: batch[pos]
                for pos, var in enumerate(feeds.values())
            }
        metrics = exe.run(program=program,
                          feed=feed_dict,
                          fetch_list=fetch_list)
        batch_time.update(time.time() - tic)
        for i, m in enumerate(metrics):
            metric_list[i].update(np.mean(m), batch_size)

        if track_lr:
            metric_list[-1].update(lr_scheduler.get_lr())

        fetchs_str = ''.join([str(m.value) + ' '
                              for m in metric_list] + [batch_time.mean]) + 's'
        ips_info = " ips: {:.5f} images/sec.".format(batch_size /
                                                     batch_time.avg)
        fetchs_str += ips_info

        if lr_scheduler is not None:
            if lr_scheduler.update_specified:
                # Step only on the scheduler's own interval, counted from
                # its configured start step.
                curr_global_counter = lr_scheduler.step_each_epoch * epoch + idx
                update = max(
                    0, curr_global_counter - lr_scheduler.update_start_step
                ) % lr_scheduler.update_step_interval == 0
                if update:
                    lr_scheduler.step()
            else:
                lr_scheduler.step()

        if vdl_writer:
            logger.scaler('loss', metrics[0][0], total_step, vdl_writer)
            total_step += 1
        if mode == 'valid':
            if idx % print_interval == 0:
                logger.info("{:s} step:{:<4d} {:s}".format(mode, idx,
                                                           fetchs_str))
        else:
            epoch_str = "epoch:{:<3d}".format(epoch)
            step_str = "{:s} step:{:<4d}".format(mode, idx)

            if idx % print_interval == 0:
                logger.info("{:s} {:s} {:s}".format(
                    logger.coloring(epoch_str, "HEADER")
                    if idx == 0 else epoch_str,
                    logger.coloring(step_str, "PURPLE"),
                    logger.coloring(fetchs_str, 'OKGREEN')))

        # Restart the timer after logging so the next measurement covers
        # data loading plus the executor call.
        tic = time.time()

    end_str = ''.join([str(m.mean) + ' '
                       for m in metric_list] + [batch_time.total]) + 's'
    # Guard against an empty run, where no time was accumulated.
    elapsed = batch_time.sum
    ips = batch_size * batch_time.count / elapsed if elapsed else 0.0
    ips_info = "ips: {:.5f} images/sec.".format(ips)
    if mode == 'valid':
        # `end_str` already ends with the unit suffix; do not append it again.
        logger.info("END {:s} {:s} {:s}".format(mode, end_str, ips_info))
    else:
        end_epoch_str = "END epoch:{:<3d}".format(epoch)
        logger.info("{:s} {:s} {:s} {:s}".format(end_epoch_str, mode, end_str,
                                                 ips_info))
    if use_dali:
        dataloader.reset()

    # return top1_acc in order to save the best model
    if mode == 'valid':
        return fetchs["top1"][1].avg